nfs_bio.c revision 158906
/*-
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/nfsclient/nfs_bio.c 158906 2006-05-25 01:00:35Z ups $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <rpc/rpcclnt.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfsclient/nfs.h>
#include <nfsclient/nfsmount.h>
#include <nfsclient/nfsnode.h>

#include <nfs4client/nfs4.h>

static struct buf *nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size,
		    struct thread *td);
static int nfs_directio_write(struct vnode *vp, struct uio *uiop,
			      struct ucred *cred, int ioflag);

extern int nfs_directio_enable;
extern int nfs_directio_allow_mmap;

/*
 * Vnode op for VM getpages.
 */
int
nfs_getpages(struct vop_getpages_args *ap)
{
	int i, error, nextoff, size, toff, count, npages;
	struct uio uio;
	struct iovec iov;
	vm_offset_t kva;
	struct buf *bp;
	struct vnode *vp;
	struct thread *td;
	struct ucred *cred;
	struct nfsmount *nmp;
	vm_object_t object;
	vm_page_t *pages;
	struct nfsnode *np;

	vp = ap->a_vp;
	np = VTONFS(vp);
	td = curthread;				/* XXX */
	cred = curthread->td_ucred;		/* XXX */
	nmp = VFSTONFS(vp->v_mount);
	pages = ap->a_m;
	count = ap->a_count;

	if ((object = vp->v_object) == NULL) {
		nfs_printf("nfs_getpages: called with non-merged cache vnode??\n");
		return VM_PAGER_ERROR;
	}

	if (nfs_directio_enable && !nfs_directio_allow_mmap) {
		mtx_lock(&np->n_mtx);
		if ((np->n_flag & NNONCACHE) && (vp->v_type == VREG)) {
			mtx_unlock(&np->n_mtx);
			nfs_printf("nfs_getpages: called on non-cacheable vnode??\n");
			return VM_PAGER_ERROR;
		} else
			mtx_unlock(&np->n_mtx);
	}

	mtx_lock(&nmp->nm_mtx);
	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
		mtx_unlock(&nmp->nm_mtx);
		/* We'll never get here for v4, because we always have fsinfo */
		(void)nfs_fsinfo(nmp, vp, cred, td);
	} else
		mtx_unlock(&nmp->nm_mtx);

	npages = btoc(count);

	/*
	 * If the requested page is partially valid, just return it and
	 * allow the pager to zero-out the blanks.  Partially valid pages
	 * can only occur at the file EOF.
	 */

	{
		vm_page_t m = pages[ap->a_reqpage];

		VM_OBJECT_LOCK(object);
		vm_page_lock_queues();
		if (m->valid != 0) {
			/* handled by vm_fault now	  */
			/* vm_page_zero_invalid(m, TRUE); */
			for (i = 0; i < npages; ++i) {
				if (i != ap->a_reqpage)
					vm_page_free(pages[i]);
			}
			vm_page_unlock_queues();
			VM_OBJECT_UNLOCK(object);
			return(0);
		}
		vm_page_unlock_queues();
		VM_OBJECT_UNLOCK(object);
	}

	/*
	 * We use only the kva address for the buffer, but this is extremely
	 * convenient and fast.
	 */
	bp = getpbuf(&nfs_pbuf_freecnt);

	kva = (vm_offset_t) bp->b_data;
	pmap_qenter(kva, pages, npages);
	cnt.v_vnodein++;
	cnt.v_vnodepgsin += npages;

	iov.iov_base = (caddr_t) kva;
	iov.iov_len = count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
	uio.uio_resid = count;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_READ;
	uio.uio_td = td;

	error = (nmp->nm_rpcops->nr_readrpc)(vp, &uio, cred);
	pmap_qremove(kva, npages);

	relpbuf(bp, &nfs_pbuf_freecnt);

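	/*
	 * Treat the read RPC as failed only if it returned no data at all;
	 * a short read may still have filled some of the pages.
	 */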
	if (error && (uio.uio_resid == count)) {
		nfs_printf("nfs_getpages: error %d\n", error);
		VM_OBJECT_LOCK(object);
		vm_page_lock_queues();
		for (i = 0; i < npages; ++i) {
			if (i != ap->a_reqpage)
				vm_page_free(pages[i]);
		}
		vm_page_unlock_queues();
		VM_OBJECT_UNLOCK(object);
		return VM_PAGER_ERROR;
	}

	/*
	 * Calculate the number of bytes read and validate only that number
	 * of bytes.  Note that due to pending writes, size may be 0.  This
	 * does not mean that the remaining data is invalid!
	 */

	size = count - uio.uio_resid;
	VM_OBJECT_LOCK(object);
	vm_page_lock_queues();
	for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
		vm_page_t m;
		nextoff = toff + PAGE_SIZE;
		m = pages[i];

		if (nextoff <= size) {
			/*
			 * Read operation filled an entire page
			 */
			m->valid = VM_PAGE_BITS_ALL;
			vm_page_undirty(m);
		} else if (size > toff) {
			/*
			 * Read operation filled a partial page.
			 */
			m->valid = 0;
			vm_page_set_validclean(m, 0, size - toff);
			/* handled by vm_fault now	  */
			/* vm_page_zero_invalid(m, TRUE); */
		} else {
			/*
			 * Read operation was short.  If no error occurred
			 * we may have hit a zero-fill section.   We simply
			 * leave valid set to 0.
			 */
			;
		}
		if (i != ap->a_reqpage) {
			/*
			 * Whether or not to leave the page activated is up in
			 * the air, but we should put the page on a page queue
			 * somewhere (it already is in the object).  Result:
			 * It appears that empirical results show that
			 * deactivating pages is best.
			 */

			/*
			 * Just in case someone was asking for this page we
			 * now tell them that it is ok to use.
			 */
			if (!error) {
				if (m->flags & PG_WANTED)
					vm_page_activate(m);
				else
					vm_page_deactivate(m);
				vm_page_wakeup(m);
			} else {
				vm_page_free(m);
			}
		}
	}
	vm_page_unlock_queues();
	VM_OBJECT_UNLOCK(object);
	return 0;
}

/*
 * Vnode op for VM putpages.
 */
int
nfs_putpages(struct vop_putpages_args *ap)
{
	struct uio uio;
	struct iovec iov;
	vm_offset_t kva;
	struct buf *bp;
	int iomode, must_commit, i, error, npages, count;
	off_t offset;
	int *rtvals;
	struct vnode *vp;
	struct thread *td;
	struct ucred *cred;
	struct nfsmount *nmp;
	struct nfsnode *np;
	vm_page_t *pages;

	vp = ap->a_vp;
	np = VTONFS(vp);
	td = curthread;				/* XXX */
	cred = curthread->td_ucred;		/* XXX */
	nmp = VFSTONFS(vp->v_mount);
	pages = ap->a_m;
	count = ap->a_count;
	rtvals = ap->a_rtvals;
	npages = btoc(count);
	offset = IDX_TO_OFF(pages[0]->pindex);

	mtx_lock(&nmp->nm_mtx);
	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
		mtx_unlock(&nmp->nm_mtx);
		(void)nfs_fsinfo(nmp, vp, cred, td);
	} else
		mtx_unlock(&nmp->nm_mtx);

	mtx_lock(&np->n_mtx);
	if (nfs_directio_enable && !nfs_directio_allow_mmap &&
	    (np->n_flag & NNONCACHE) && (vp->v_type == VREG)) {
		mtx_unlock(&np->n_mtx);
		nfs_printf("nfs_putpages: called on non-cacheable vnode??\n");
		mtx_lock(&np->n_mtx);
	}

	for (i = 0; i < npages; i++)
		rtvals[i] = VM_PAGER_AGAIN;

	/*
	 * When putting pages, do not extend file past EOF.
	 */
	if (offset + count > np->n_size) {
		count = np->n_size - offset;
		if (count < 0)
			count = 0;
	}
	mtx_unlock(&np->n_mtx);

	/*
	 * We use only the kva address for the buffer, but this is extremely
	 * convenient and fast.
	 */
	bp = getpbuf(&nfs_pbuf_freecnt);

	kva = (vm_offset_t) bp->b_data;
	pmap_qenter(kva, pages, npages);
	cnt.v_vnodeout++;
	cnt.v_vnodepgsout += count;

	iov.iov_base = (caddr_t) kva;
	iov.iov_len = count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = offset;
	uio.uio_resid = count;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_WRITE;
	uio.uio_td = td;

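	/*
	 * Use an unstable write unless the pager asked for a synchronous
	 * write; unstable data must be committed to stable storage by a
	 * later commit RPC.
	 */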
	if ((ap->a_sync & VM_PAGER_PUT_SYNC) == 0)
	    iomode = NFSV3WRITE_UNSTABLE;
	else
	    iomode = NFSV3WRITE_FILESYNC;

	error = (nmp->nm_rpcops->nr_writerpc)(vp, &uio, cred, &iomode, &must_commit);

	pmap_qremove(kva, npages);
	relpbuf(bp, &nfs_pbuf_freecnt);

	if (!error) {
		int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE;
		for (i = 0; i < nwritten; i++) {
			rtvals[i] = VM_PAGER_OK;
			vm_page_undirty(pages[i]);
		}
		if (must_commit) {
			nfs_clearcommit(vp->v_mount);
		}
	}
	return rtvals[0];
}

/*
 * For nfs, cache consistency can only be maintained approximately.
 * Although RFC1094 does not specify the criteria, the following is
 * believed to be compatible with the reference port.
 * For nfs:
 * If the file's modify time on the server has changed since the
 * last read rpc or you have written to the file,
 * you may have lost data cache consistency with the
 * server, so flush all of the file's data out of the cache.
 * Then force a getattr rpc to ensure that you have up to date
 * attributes.
 * NB: This implies that cache data can be read when up to
 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
 * attributes this could be forced by setting n_attrstamp to 0 before
 * the VOP_GETATTR() call.
 */
static inline int
nfs_bioread_check_cons(struct vnode *vp, struct thread *td, struct ucred *cred)
{
	int error = 0;
	struct vattr vattr;
	struct nfsnode *np = VTONFS(vp);
	int old_lock;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);

	/*
	 * Grab the exclusive lock before checking whether the cache is
	 * consistent.
	 * XXX - We can make this cheaper later (by acquiring cheaper locks).
	 * But for now, this suffices.
	 */
	old_lock = nfs_upgrade_vnlock(vp, td);
	mtx_lock(&np->n_mtx);
	if (np->n_flag & NMODIFIED) {
		mtx_unlock(&np->n_mtx);
		if (vp->v_type != VREG) {
			if (vp->v_type != VDIR)
				panic("nfs: bioread, not dir");
			(nmp->nm_rpcops->nr_invaldir)(vp);
			error = nfs_vinvalbuf(vp, V_SAVE, td, 1);
			if (error)
				goto out;
		}
		np->n_attrstamp = 0;
		error = VOP_GETATTR(vp, &vattr, cred, td);
		if (error)
			goto out;
		mtx_lock(&np->n_mtx);
		np->n_mtime = vattr.va_mtime;
		mtx_unlock(&np->n_mtx);
	} else {
		mtx_unlock(&np->n_mtx);
		error = VOP_GETATTR(vp, &vattr, cred, td);
		if (error)
			return (error);
		mtx_lock(&np->n_mtx);
		if ((np->n_flag & NSIZECHANGED)
		    || (NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime))) {
			mtx_unlock(&np->n_mtx);
			if (vp->v_type == VDIR)
				(nmp->nm_rpcops->nr_invaldir)(vp);
			error = nfs_vinvalbuf(vp, V_SAVE, td, 1);
			if (error)
				goto out;
			mtx_lock(&np->n_mtx);
			np->n_mtime = vattr.va_mtime;
			np->n_flag &= ~NSIZECHANGED;
		}
		mtx_unlock(&np->n_mtx);
	}
out:
	nfs_downgrade_vnlock(vp, td, old_lock);
	return error;
}

/*
 * Vnode op for read using bio
 */
int
nfs_bioread(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred)
{
	struct nfsnode *np = VTONFS(vp);
	int biosize, i;
	struct buf *bp, *rabp;
	struct thread *td;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn, rabn;
	int bcount;
	int seqcount;
	int nra, error = 0, n = 0, on = 0;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_offset < 0)	/* XXX VDIR cookies can be negative */
		return (EINVAL);
	td = uio->uio_td;

	mtx_lock(&nmp->nm_mtx);
	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
		mtx_unlock(&nmp->nm_mtx);
		(void)nfs_fsinfo(nmp, vp, cred, td);
	} else
		mtx_unlock(&nmp->nm_mtx);

	if (vp->v_type != VDIR &&
	    (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);

	if (nfs_directio_enable && (ioflag & IO_DIRECT) && (vp->v_type == VREG))
		/* No caching/ no readaheads. Just read data into the user buffer */
		return nfs_readrpc(vp, uio, cred);

	biosize = vp->v_mount->mnt_stat.f_iosize;
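	/*
	 * The upper layers encode their sequential-access heuristic in the
	 * high bits of ioflag (IO_SEQSHIFT); scale it into a count of
	 * buffers worth of read ahead.
	 */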
	seqcount = (int)((off_t)(ioflag >> IO_SEQSHIFT) * biosize / BKVASIZE);

	error = nfs_bioread_check_cons(vp, td, cred);
	if (error)
		return error;

	do {
	    u_quad_t nsize;

	    mtx_lock(&np->n_mtx);
	    nsize = np->n_size;
	    mtx_unlock(&np->n_mtx);

	    switch (vp->v_type) {
	    case VREG:
		nfsstats.biocache_reads++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);

		/*
		 * Start the read ahead(s), as required.
		 */
		if (nmp->nm_readahead > 0) {
		    for (nra = 0; nra < nmp->nm_readahead && nra < seqcount &&
			(off_t)(lbn + 1 + nra) * biosize < nsize; nra++) {
			rabn = lbn + 1 + nra;
			if (incore(&vp->v_bufobj, rabn) == NULL) {
			    rabp = nfs_getcacheblk(vp, rabn, biosize, td);
			    if (!rabp) {
				error = nfs_sigintr(nmp, NULL, td);
				return (error ? error : EINTR);
			    }
			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
				rabp->b_flags |= B_ASYNC;
				rabp->b_iocmd = BIO_READ;
				vfs_busy_pages(rabp, 0);
				if (nfs_asyncio(nmp, rabp, cred, td)) {
				    rabp->b_flags |= B_INVAL;
				    rabp->b_ioflags |= BIO_ERROR;
				    vfs_unbusy_pages(rabp);
				    brelse(rabp);
				    break;
				}
			    } else {
				brelse(rabp);
			    }
			}
		    }
		}

		/* Note that bcount is *not* DEV_BSIZE aligned. */
		bcount = biosize;
		if ((off_t)lbn * biosize >= nsize) {
			bcount = 0;
		} else if ((off_t)(lbn + 1) * biosize > nsize) {
			bcount = nsize - (off_t)lbn * biosize;
		}
		bp = nfs_getcacheblk(vp, lbn, bcount, td);

		if (!bp) {
			error = nfs_sigintr(nmp, NULL, td);
			return (error ? error : EINTR);
		}

		/*
		 * If B_CACHE is not set, we must issue the read.  If this
		 * fails, we return an error.
		 */

		if ((bp->b_flags & B_CACHE) == 0) {
		    bp->b_iocmd = BIO_READ;
		    vfs_busy_pages(bp, 0);
		    error = nfs_doio(vp, bp, cred, td);
		    if (error) {
			brelse(bp);
			return (error);
		    }
		}

		/*
		 * on is the offset into the current bp.  Figure out how many
		 * bytes we can copy out of the bp.  Note that bcount is
		 * NOT DEV_BSIZE aligned.
		 *
		 * Then figure out how many bytes we can copy into the uio.
		 */

		n = 0;
		if (on < bcount)
			n = min((unsigned)(bcount - on), uio->uio_resid);
		break;
	    case VLNK:
		nfsstats.biocache_readlinks++;
		bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, td);
		if (!bp) {
			error = nfs_sigintr(nmp, NULL, td);
			return (error ? error : EINTR);
		}
		if ((bp->b_flags & B_CACHE) == 0) {
		    bp->b_iocmd = BIO_READ;
		    vfs_busy_pages(bp, 0);
		    error = nfs_doio(vp, bp, cred, td);
		    if (error) {
			bp->b_ioflags |= BIO_ERROR;
			brelse(bp);
			return (error);
		    }
		}
		n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
		on = 0;
		break;
	    case VDIR:
		nfsstats.biocache_readdirs++;
		if (np->n_direofoffset
		    && uio->uio_offset >= np->n_direofoffset) {
		    return (0);
		}
		lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ;
		on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
		bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, td);
		if (!bp) {
		    error = nfs_sigintr(nmp, NULL, td);
		    return (error ? error : EINTR);
		}
		if ((bp->b_flags & B_CACHE) == 0) {
		    bp->b_iocmd = BIO_READ;
		    vfs_busy_pages(bp, 0);
		    error = nfs_doio(vp, bp, cred, td);
		    if (error) {
			    brelse(bp);
		    }
		    while (error == NFSERR_BAD_COOKIE) {
			(nmp->nm_rpcops->nr_invaldir)(vp);
			error = nfs_vinvalbuf(vp, 0, td, 1);
			/*
			 * Yuck! The directory has been modified on the
			 * server. The only way to get the block is by
			 * reading from the beginning to get all the
			 * offset cookies.
			 *
			 * Leave the last bp intact unless there is an error.
			 * Loop back up to the while if the error is another
			 * NFSERR_BAD_COOKIE (double yuch!).
			 */
			for (i = 0; i <= lbn && !error; i++) {
			    if (np->n_direofoffset
				&& (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
				    return (0);
			    bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, td);
			    if (!bp) {
				error = nfs_sigintr(nmp, NULL, td);
				return (error ? error : EINTR);
			    }
			    if ((bp->b_flags & B_CACHE) == 0) {
				    bp->b_iocmd = BIO_READ;
				    vfs_busy_pages(bp, 0);
				    error = nfs_doio(vp, bp, cred, td);
				    /*
				     * no error + B_INVAL == directory EOF,
				     * use the block.
				     */
				    if (error == 0 && (bp->b_flags & B_INVAL))
					    break;
			    }
			    /*
			     * An error will throw away the block and the
			     * for loop will break out.  If no error and this
			     * is not the block we want, we throw away the
			     * block and go for the next one via the for loop.
			     */
			    if (error || i < lbn)
				    brelse(bp);
			}
		    }
		    /*
		     * The above while is repeated if we hit another cookie
		     * error.  If we hit an error and it wasn't a cookie error,
		     * we give up.
		     */
		    if (error)
			    return (error);
		}

		/*
		 * If not eof and read aheads are enabled, start one.
		 * (You need the current block first, so that you have the
		 *  directory offset cookie of the next block.)
		 */
		if (nmp->nm_readahead > 0 &&
		    (bp->b_flags & B_INVAL) == 0 &&
		    (np->n_direofoffset == 0 ||
		    (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
		    incore(&vp->v_bufobj, lbn + 1) == NULL) {
			rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, td);
			if (rabp) {
			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
				rabp->b_flags |= B_ASYNC;
				rabp->b_iocmd = BIO_READ;
				vfs_busy_pages(rabp, 0);
				if (nfs_asyncio(nmp, rabp, cred, td)) {
				    rabp->b_flags |= B_INVAL;
				    rabp->b_ioflags |= BIO_ERROR;
				    vfs_unbusy_pages(rabp);
				    brelse(rabp);
				}
			    } else {
				brelse(rabp);
			    }
			}
		}
		/*
		 * Unlike VREG files, whose buffer size ( bp->b_bcount ) is
		 * chopped for the EOF condition, we cannot tell how large
		 * NFS directories are going to be until we hit EOF.  So
		 * an NFS directory buffer is *not* chopped to its EOF.  Now,
		 * it just so happens that b_resid will effectively chop it
		 * to EOF.  *BUT* this information is lost if the buffer goes
		 * away and is reconstituted into a B_CACHE state ( due to
		 * being VMIO ) later.  So we keep track of the directory eof
		 * in np->n_direofoffset and chop it off as an extra step
		 * right here.
		 */
		n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
		if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset)
			n = np->n_direofoffset - uio->uio_offset;
		break;
	    default:
		nfs_printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
		bp = NULL;
		break;
	    };

	    if (n > 0) {
		    error = uiomove(bp->b_data + on, (int)n, uio);
	    }
	    if (vp->v_type == VLNK)
		n = 0;
	    if (bp != NULL)
		brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}

/*
 * The NFS write path cannot handle iovecs with len > 1. So we need to
 * break up iovecs accordingly (restricting them to wsize).
 * For the SYNC case, we can do this with 1 copy (user buffer -> mbuf).
 * For the ASYNC case, 2 copies are needed. The first a copy from the
 * user buffer to a staging buffer and then a second copy from the staging
 * buffer to mbufs. This can be optimized by copying from the user buffer
 * directly into mbufs and passing the chain down, but that requires a
 * fair amount of re-working of the relevant codepaths (and can be done
 * later).
 */
static int
nfs_directio_write(struct vnode *vp, struct uio *uiop, struct ucred *cred,
    int ioflag)
{
	int error;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	struct thread *td = uiop->uio_td;
	int size;
	int wsize;

	mtx_lock(&nmp->nm_mtx);
	wsize = nmp->nm_wsize;
	mtx_unlock(&nmp->nm_mtx);
	if (ioflag & IO_SYNC) {
		int iomode, must_commit;
		struct uio uio;
		struct iovec iov;
do_sync:
		while (uiop->uio_resid > 0) {
			size = min(uiop->uio_resid, wsize);
			size = min(uiop->uio_iov->iov_len, size);
			iov.iov_base = uiop->uio_iov->iov_base;
			iov.iov_len = size;
			uio.uio_iov = &iov;
			uio.uio_iovcnt = 1;
			uio.uio_offset = uiop->uio_offset;
			uio.uio_resid = size;
			uio.uio_segflg = UIO_USERSPACE;
			uio.uio_rw = UIO_WRITE;
			uio.uio_td = td;
			iomode = NFSV3WRITE_FILESYNC;
			error = (nmp->nm_rpcops->nr_writerpc)(vp, &uio, cred,
						      &iomode, &must_commit);
			KASSERT((must_commit == 0),
				("nfs_directio_write: Did not commit write"));
			if (error)
				return (error);
			uiop->uio_offset += size;
			uiop->uio_resid -= size;
			if (uiop->uio_iov->iov_len <= size) {
				uiop->uio_iovcnt--;
				uiop->uio_iov++;
			} else {
				uiop->uio_iov->iov_base =
					(char *)uiop->uio_iov->iov_base + size;
				uiop->uio_iov->iov_len -= size;
			}
		}
	} else {
		struct uio *t_uio;
		struct iovec *t_iov;
		struct buf *bp;

		/*
		 * Break up the write into blocksize chunks and hand these
		 * over to nfsiod's for write back.
		 * Unfortunately, this incurs a copy of the data, since
		 * the user could modify the buffer before the write is
		 * initiated.
		 *
		 * The obvious optimization here is that one of the 2 copies
		 * in the async write path can be eliminated by copying the
		 * data here directly into mbufs and passing the mbuf chain
		 * down. But that will require a fair amount of re-working
		 * of the code and can be done if there's enough interest
		 * in NFS directio access.
		 */
		while (uiop->uio_resid > 0) {
			size = min(uiop->uio_resid, wsize);
			size = min(uiop->uio_iov->iov_len, size);
			bp = getpbuf(&nfs_pbuf_freecnt);
			t_uio = malloc(sizeof(struct uio), M_NFSDIRECTIO, M_WAITOK);
			t_iov = malloc(sizeof(struct iovec), M_NFSDIRECTIO, M_WAITOK);
			t_iov->iov_base = malloc(size, M_NFSDIRECTIO, M_WAITOK);
			t_iov->iov_len = size;
			t_uio->uio_iov = t_iov;
			t_uio->uio_iovcnt = 1;
			t_uio->uio_offset = uiop->uio_offset;
			t_uio->uio_resid = size;
			t_uio->uio_segflg = UIO_SYSSPACE;
			t_uio->uio_rw = UIO_WRITE;
			t_uio->uio_td = td;
			bcopy(uiop->uio_iov->iov_base, t_iov->iov_base, size);
			bp->b_flags |= B_DIRECT;
			bp->b_iocmd = BIO_WRITE;
			if (cred != NOCRED) {
				crhold(cred);
				bp->b_wcred = cred;
			} else
				bp->b_wcred = NOCRED;
			bp->b_caller1 = (void *)t_uio;
			bp->b_vp = vp;
			vhold(vp);
			error = nfs_asyncio(nmp, bp, NOCRED, td);
			if (error) {
				free(t_iov->iov_base, M_NFSDIRECTIO);
				free(t_iov, M_NFSDIRECTIO);
				free(t_uio, M_NFSDIRECTIO);
				vdrop(bp->b_vp);
				bp->b_vp = NULL;
				relpbuf(bp, &nfs_pbuf_freecnt);
				if (error == EINTR)
					return (error);
				goto do_sync;
			}
			uiop->uio_offset += size;
			uiop->uio_resid -= size;
			if (uiop->uio_iov->iov_len <= size) {
				uiop->uio_iovcnt--;
				uiop->uio_iov++;
			} else {
				uiop->uio_iov->iov_base =
					(char *)uiop->uio_iov->iov_base + size;
				uiop->uio_iov->iov_len -= size;
			}
		}
	}
	return (0);
}

/*
 * Vnode op for write using bio
 */
int
nfs_write(struct vop_write_args *ap)
{
	int biosize;
	struct uio *uio = ap->a_uio;
	struct thread *td = uio->uio_td;
	struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn;
	int bcount;
	int n, on, error = 0;
	struct proc *p = td?td->td_proc:NULL;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_td != curthread)
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	mtx_lock(&np->n_mtx);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		mtx_unlock(&np->n_mtx);
		return (np->n_error);
	} else
		mtx_unlock(&np->n_mtx);
	mtx_lock(&nmp->nm_mtx);
	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
		mtx_unlock(&nmp->nm_mtx);
		(void)nfs_fsinfo(nmp, vp, cred, td);
	} else
		mtx_unlock(&nmp->nm_mtx);

	/*
	 * Synchronously flush pending buffers if we are in synchronous
	 * mode or if we are appending.
	 */
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		mtx_lock(&np->n_mtx);
		if (np->n_flag & NMODIFIED) {
			mtx_unlock(&np->n_mtx);
#ifdef notyet /* Needs matching nonblock semantics elsewhere, too. */
			/*
			 * Require non-blocking, synchronous writes to
			 * dirty files to inform the program it needs
			 * to fsync(2) explicitly.
			 */
			if (ioflag & IO_NDELAY)
				return (EAGAIN);
#endif
flush_and_restart:
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, td, 1);
			if (error)
				return (error);
		} else
			mtx_unlock(&np->n_mtx);
	}

	/*
	 * If IO_APPEND then load uio_offset.  We restart here if we cannot
	 * get the append lock.
	 */
	if (ioflag & IO_APPEND) {
		np->n_attrstamp = 0;
		error = VOP_GETATTR(vp, &vattr, cred, td);
		if (error)
			return (error);
		mtx_lock(&np->n_mtx);
		uio->uio_offset = np->n_size;
		mtx_unlock(&np->n_mtx);
	}

	if (uio->uio_offset < 0)
		return (EINVAL);
	if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	if (uio->uio_resid == 0)
		return (0);

	if (nfs_directio_enable && (ioflag & IO_DIRECT) && vp->v_type == VREG)
		return nfs_directio_write(vp, uio, cred, ioflag);

	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters
	 */
	if (p != NULL) {
		PROC_LOCK(p);
		if (uio->uio_offset + uio->uio_resid >
		    lim_cur(p, RLIMIT_FSIZE)) {
			psignal(p, SIGXFSZ);
			PROC_UNLOCK(p);
			return (EFBIG);
		}
		PROC_UNLOCK(p);
	}

	biosize = vp->v_mount->mnt_stat.f_iosize;
	/*
	 * Find all of this file's B_NEEDCOMMIT buffers.  If our writes
	 * would exceed the local maximum per-file write commit size when
	 * combined with those, we must decide whether to flush,
	 * go synchronous, or return error.  We don't bother checking
	 * IO_UNIT -- we just make all writes atomic anyway, as there's
	 * no point optimizing for something that really won't ever happen.
	 */
	if (!(ioflag & IO_SYNC)) {
		int nflag;

		mtx_lock(&np->n_mtx);
		nflag = np->n_flag;
		mtx_unlock(&np->n_mtx);
		int needrestart = 0;
		if (nmp->nm_wcommitsize < uio->uio_resid) {
			/*
			 * If this request could not possibly be completed
			 * without exceeding the maximum outstanding write
			 * commit size, see if we can convert it into a
			 * synchronous write operation.
			 */
			if (ioflag & IO_NDELAY)
				return (EAGAIN);
			ioflag |= IO_SYNC;
			if (nflag & NMODIFIED)
				needrestart = 1;
		} else if (nflag & NMODIFIED) {
			int wouldcommit = 0;
			BO_LOCK(&vp->v_bufobj);
			if (vp->v_bufobj.bo_dirty.bv_cnt != 0) {
				TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd,
				    b_bobufs) {
					if (bp->b_flags & B_NEEDCOMMIT)
						wouldcommit += bp->b_bcount;
				}
			}
			BO_UNLOCK(&vp->v_bufobj);
			/*
			 * Since we're not operating synchronously and
			 * bypassing the buffer cache, we are in a commit
			 * and holding all of these buffers whether
			 * transmitted or not.  If not limited, this
			 * will lead to the buffer cache deadlocking,
			 * as no one else can flush our uncommitted buffers.
			 */
			wouldcommit += uio->uio_resid;
			/*
			 * If we would initially exceed the maximum
			 * outstanding write commit size, flush and restart.
			 */
			if (wouldcommit > nmp->nm_wcommitsize)
				needrestart = 1;
		}
		if (needrestart)
			goto flush_and_restart;
	}

	do {
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize-1);
		n = min((unsigned)(biosize - on), uio->uio_resid);
again:
		/*
		 * Handle direct append and file extension cases, calculate
		 * unaligned buffer size.
		 */
		mtx_lock(&np->n_mtx);
		if (uio->uio_offset == np->n_size && n) {
			mtx_unlock(&np->n_mtx);
			/*
			 * Get the buffer (in its pre-append state to maintain
			 * B_CACHE if it was previously set).  Resize the
			 * nfsnode after we have locked the buffer to prevent
			 * readers from reading garbage.
			 */
			bcount = on;
			bp = nfs_getcacheblk(vp, lbn, bcount, td);

			if (bp != NULL) {
				long save;

				mtx_lock(&np->n_mtx);
				np->n_size = uio->uio_offset + n;
				np->n_flag |= NMODIFIED;
				vnode_pager_setsize(vp, np->n_size);
				mtx_unlock(&np->n_mtx);

				save = bp->b_flags & B_CACHE;
				bcount += n;
				allocbuf(bp, bcount);
				bp->b_flags |= save;
			}
		} else {
			/*
			 * Obtain the locked cache block first, and then
			 * adjust the file's size as appropriate.
			 */
			bcount = on + n;
			if ((off_t)lbn * biosize + bcount < np->n_size) {
				if ((off_t)(lbn + 1) * biosize < np->n_size)
					bcount = biosize;
				else
					bcount = np->n_size - (off_t)lbn * biosize;
			}
			mtx_unlock(&np->n_mtx);
			bp = nfs_getcacheblk(vp, lbn, bcount, td);
			mtx_lock(&np->n_mtx);
			if (uio->uio_offset + n > np->n_size) {
				np->n_size = uio->uio_offset + n;
				np->n_flag |= NMODIFIED;
				vnode_pager_setsize(vp, np->n_size);
			}
			mtx_unlock(&np->n_mtx);
		}

		if (!bp) {
			error = nfs_sigintr(nmp, NULL, td);
			if (!error)
				error = EINTR;
			break;
		}

		/*
		 * Issue a READ if B_CACHE is not set.  In special-append
		 * mode, B_CACHE is based on the buffer prior to the write
		 * op and is typically set, avoiding the read.  If a read
		 * is required in special append mode, the server will
		 * probably send us a short-read since we extended the file
		 * on our end, resulting in b_resid == 0 and, thusly,
		 * B_CACHE getting set.
		 *
		 * We can also avoid issuing the read if the write covers
		 * the entire buffer.  We have to make sure the buffer state
		 * is reasonable in this case since we will not be initiating
		 * I/O.  See the comments in kern/vfs_bio.c's getblk() for
		 * more information.
		 *
		 * B_CACHE may also be set due to the buffer being cached
		 * normally.
		 */

		if (on == 0 && n == bcount) {
			bp->b_flags |= B_CACHE;
			bp->b_flags &= ~B_INVAL;
			bp->b_ioflags &= ~BIO_ERROR;
		}

		if ((bp->b_flags & B_CACHE) == 0) {
			bp->b_iocmd = BIO_READ;
			vfs_busy_pages(bp, 0);
			error = nfs_doio(vp, bp, cred, td);
			if (error) {
				brelse(bp);
				break;
			}
		}
		if (bp->b_wcred == NOCRED)
			bp->b_wcred = crhold(cred);
		mtx_lock(&np->n_mtx);
		np->n_flag |= NMODIFIED;
		mtx_unlock(&np->n_mtx);

		/*
		 * If dirtyend exceeds file size, chop it down.  This should
		 * not normally occur but there is an append race where it
		 * might occur XXX, so we log it.
		 *
		 * If the chopping creates a reverse-indexed or degenerate
		 * situation with dirtyoff/end, we 0 both of them.
		 */

		if (bp->b_dirtyend > bcount) {
			nfs_printf("NFS append race @%lx:%d\n",
			    (long)bp->b_blkno * DEV_BSIZE,
			    bp->b_dirtyend - bcount);
			bp->b_dirtyend = bcount;
		}

		if (bp->b_dirtyoff >= bp->b_dirtyend)
			bp->b_dirtyoff = bp->b_dirtyend = 0;

		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise force a write rpc of the old dirty area.
		 *
		 * While it is possible to merge discontiguous writes due to
		 * our having a B_CACHE buffer ( and thus valid read data
		 * for the hole), we don't because it could lead to
		 * significant cache coherency problems with multiple clients,
		 * especially if locking is implemented later on.
		 *
		 * As an optimization we could theoretically maintain
		 * a linked list of discontinuous areas, but we would still
		 * have to commit them separately so there isn't much
		 * advantage to it except perhaps a bit of asynchronization.
		 */

		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			if (bwrite(bp) == EINTR) {
				error = EINTR;
				break;
			}
			goto again;
		}

		error = uiomove((char *)bp->b_data + on, n, uio);

		/*
		 * Since this block is being modified, it must be written
		 * again and not just committed.  Since write clustering does
		 * not work for the stage 1 data write, only the stage 2
		 * commit rpc, we have to clear B_CLUSTEROK as well.
		 */
		bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);

		if (error) {
			bp->b_ioflags |= BIO_ERROR;
			brelse(bp);
			break;
		}

		/*
		 * Only update dirtyoff/dirtyend if not a degenerate
		 * condition.
		 */
		if (n) {
			if (bp->b_dirtyend > 0) {
				bp->b_dirtyoff = min(on, bp->b_dirtyoff);
				bp->b_dirtyend = max((on + n), bp->b_dirtyend);
			} else {
				bp->b_dirtyoff = on;
				bp->b_dirtyend = on + n;
			}
			vfs_bio_set_validclean(bp, on, n);
		}

		/*
		 * If IO_SYNC do bwrite().
		 *
		 * IO_INVAL appears to be unused.  The idea appears to be
		 * to turn off caching in this case.  Very odd.  XXX
		 */
		if ((ioflag & IO_SYNC)) {
			if (ioflag & IO_INVAL)
				bp->b_flags |= B_NOCACHE;
			error = bwrite(bp);
			if (error)
				break;
		} else if ((n + on) == biosize) {
			bp->b_flags |= B_ASYNC;
			(void) (nmp->nm_rpcops->nr_writebp)(bp, 0, NULL);
		} else {
			bdwrite(bp);
		}
	} while (uio->uio_resid > 0 && n > 0);

	return (error);
}

/*
 * Get an nfs cache block.
 *
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 *
 * The caller must carefully deal with the possible B_INVAL state of
 * the buffer.  nfs_doio() clears B_INVAL (and nfs_asyncio() clears it
 * indirectly), so synchronous reads can be issued without worrying about
 * the B_INVAL state.  We have to be a little more careful when dealing
 * with writes (see comments in nfs_write()) when extending a file past
 * its EOF.
 */
static struct buf *
nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size, struct thread *td)
{
	struct buf *bp;
	struct mount *mp;
	struct nfsmount *nmp;

	mp = vp->v_mount;
	nmp = VFSTONFS(mp);

	if (nmp->nm_flag & NFSMNT_INT) {
		sigset_t oldset;

		nfs_set_sigmask(td, &oldset);
		bp = getblk(vp, bn, size, PCATCH, 0, 0);
		nfs_restore_sigmask(td, &oldset);
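		/*
		 * getblk() may return NULL when its sleep is interrupted by
		 * a signal (PCATCH); give up if the mount was interrupted,
		 * otherwise keep retrying with a timeout.
		 */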
1261204076Spjd		while (bp == NULL) {
1262204076Spjd			if (nfs_sigintr(nmp, NULL, td))
1263204076Spjd				return (NULL);
1264204076Spjd			bp = getblk(vp, bn, size, 0, 2 * hz, 0);
1265204076Spjd		}
1266204076Spjd	} else {
1267204076Spjd		bp = getblk(vp, bn, size, 0, 0, 0);
1268204076Spjd	}
1269216479Spjd
1270204076Spjd	if (vp->v_type == VREG) {
1271216479Spjd		int biosize;
1272216479Spjd
1273216479Spjd		biosize = mp->mnt_stat.f_iosize;
1274216479Spjd		bp->b_blkno = bn * (biosize / DEV_BSIZE);
1275204076Spjd	}
1276216479Spjd	return (bp);
1277216479Spjd}
1278216494Spjd
1279216479Spjd/*
1280204076Spjd * Flush and invalidate all dirty buffers. If another process is already
1281216479Spjd * doing the flush, just wait for completion.
1282204076Spjd */
1283204076Spjdint
1284204076Spjdnfs_vinvalbuf(struct vnode *vp, int flags, struct thread *td, int intrflg)
1285204076Spjd{
1286204076Spjd	struct nfsnode *np = VTONFS(vp);
1287216479Spjd	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
1288204076Spjd	int error = 0, slpflag, slptimeo;
1289216479Spjd 	int old_lock = 0;
1290216479Spjd
1291216479Spjd	ASSERT_VOP_LOCKED(vp, "nfs_vinvalbuf");
1292216479Spjd
1293204076Spjd	/*
1294216479Spjd	 * XXX This check stops us from needlessly doing a vinvalbuf when
1295204076Spjd	 * being called through vclean().  It is not clear that this is
1296204076Spjd	 * unsafe.
1297204076Spjd	 */
1298216479Spjd	if (vp->v_iflag & VI_DOOMED)
1299204076Spjd		return (0);
1300216479Spjd
1301216479Spjd	if ((nmp->nm_flag & NFSMNT_INT) == 0)
1302216479Spjd		intrflg = 0;
1303216479Spjd	if (intrflg) {
1304204076Spjd		slpflag = PCATCH;
1305216479Spjd		slptimeo = 2 * hz;
1306204076Spjd	} else {
1307204076Spjd		slpflag = 0;
1308204076Spjd		slptimeo = 0;
1309204076Spjd	}
1310204076Spjd
1311204076Spjd	old_lock = nfs_upgrade_vnlock(vp, td);
1312204076Spjd	/*
1313204076Spjd	 * Now, flush as required.
1314204076Spjd	 */
1315204076Spjd	if ((flags & V_SAVE) && (vp->v_bufobj.bo_object != NULL)) {
1316204076Spjd		vm_object_page_clean(vp->v_bufobj.bo_object, 0, 0, OBJPC_SYNC);
1317204076Spjd		/*
1318204076Spjd		 * If the page clean was interrupted, fail the invalidation.
1319204076Spjd		 * Not doing so, we run the risk of losing dirty pages in the
1320204076Spjd		 * vinvalbuf() call below.
1321204076Spjd		 */
1322204076Spjd		if (intrflg && (error = nfs_sigintr(nmp, NULL, td)))
1323204076Spjd			goto out;
1324204076Spjd	}
1325204076Spjd
1326214692Spjd	error = vinvalbuf(vp, flags, td, slpflag, 0);
1327214692Spjd	while (error) {
1328214692Spjd		if (intrflg && (error = nfs_sigintr(nmp, NULL, td)))
1329214692Spjd			goto out;
1330214692Spjd		error = vinvalbuf(vp, flags, td, 0, slptimeo);
1331218217Spjd	}
1332218217Spjd	mtx_lock(&np->n_mtx);
1333218217Spjd	if (np->n_directio_asyncwr == 0)
1334218217Spjd		np->n_flag &= ~NMODIFIED;
1335214692Spjd	mtx_unlock(&np->n_mtx);
1336218217Spjdout:
1337219864Spjd	nfs_downgrade_vnlock(vp, td, old_lock);
1338218138Spjd	return error;
1339218138Spjd}
1340214692Spjd
1341214692Spjd/*
1342214692Spjd * Initiate asynchronous I/O. Return an error if no nfsiods are available.
1343214692Spjd * This is mainly to avoid queueing async I/O requests when the nfsiods
1344218217Spjd * are all hung on a dead server.
1345214692Spjd *
1346214692Spjd * Note: nfs_asyncio() does not clear (BIO_ERROR|B_INVAL) but when the bp
1347214692Spjd * is eventually dequeued by the async daemon, nfs_doio() *will*.
1348214692Spjd */
1349214692Spjdint
1350214692Spjdnfs_asyncio(struct nfsmount *nmp, struct buf *bp, struct ucred *cred, struct thread *td)
1351218217Spjd{
1352214692Spjd	int iod;
1353214692Spjd	int gotiod;
1354214692Spjd	int slpflag = 0;
1355214692Spjd	int slptimeo = 0;
1356214692Spjd	int error, error2;
1357214692Spjd
1358218217Spjd	/*
1359218217Spjd	 * Commits are usually short and sweet so lets save some cpu and
1360214692Spjd	 * leave the async daemons for more important rpc's (such as reads
1361214692Spjd	 * and writes).
1362214692Spjd	 */
1363214692Spjd	mtx_lock(&nfs_iod_mtx);
1364204076Spjd	if (bp->b_iocmd == BIO_WRITE && (bp->b_flags & B_NEEDCOMMIT) &&
1365204076Spjd	    (nmp->nm_bufqiods > nfs_numasync / 2)) {
1366204076Spjd		mtx_unlock(&nfs_iod_mtx);
1367204076Spjd		return(EIO);
1368204076Spjd	}
1369204076Spjdagain:
1370204076Spjd	if (nmp->nm_flag & NFSMNT_INT)
1371204076Spjd		slpflag = PCATCH;
1372214692Spjd	gotiod = FALSE;
1373204076Spjd
1374204076Spjd	/*
1375204076Spjd	 * Find a free iod to process this request.
1376204076Spjd	 */
1377204076Spjd	for (iod = 0; iod < nfs_numasync; iod++)
1378204076Spjd		if (nfs_iodwant[iod]) {
1379204076Spjd			gotiod = TRUE;
1380204076Spjd			break;
1381204076Spjd		}
1382204076Spjd
1383219864Spjd	/*
1384204076Spjd	 * Try to create one if none are free.
1385204076Spjd	 */
1386204076Spjd	if (!gotiod) {
1387219721Strociny		iod = nfs_nfsiodnew();
1388214692Spjd		if (iod != -1)
1389214692Spjd			gotiod = TRUE;
1390219721Strociny	}
1391214692Spjd
1392214692Spjd	if (gotiod) {
1393214692Spjd		/*
1394214692Spjd		 * Found one, so wake it up and tell it which
1395214692Spjd		 * mount to process.
1396204076Spjd		 */
1397204076Spjd		NFS_DPF(ASYNCIO, ("nfs_asyncio: waking iod %d for mount %p\n",
1398204076Spjd		    iod, nmp));
1399204076Spjd		nfs_iodwant[iod] = NULL;
1400204076Spjd		nfs_iodmount[iod] = nmp;
1401204076Spjd		nmp->nm_bufqiods++;
1402204076Spjd		wakeup(&nfs_iodwant[iod]);
1403204076Spjd	}
1404204076Spjd
1405204076Spjd	/*
1406204076Spjd	 * If none are free, we may already have an iod working on this mount
1407204076Spjd	 * point.  If so, it will process our request.
1408204076Spjd	 */
1409204076Spjd	if (!gotiod) {
1410204076Spjd		if (nmp->nm_bufqiods > 0) {
1411204076Spjd			NFS_DPF(ASYNCIO,
1412204076Spjd				("nfs_asyncio: %d iods are already processing mount %p\n",
1413204076Spjd				 nmp->nm_bufqiods, nmp));
1414204076Spjd			gotiod = TRUE;
1415204076Spjd		}
1416204076Spjd	}
1417204076Spjd
1418204076Spjd	/*
1419204076Spjd	 * If we have an iod which can process the request, then queue
1420204076Spjd	 * the buffer.
1421204076Spjd	 */
1422204076Spjd	if (gotiod) {
1423204076Spjd		/*
1424218138Spjd		 * Ensure that the queue never grows too large.  We still want
1425204076Spjd		 * the I/O to remain asynchronous, so we block rather than return EIO.
1426204076Spjd		 */
1427204076Spjd		while (nmp->nm_bufqlen >= 2*nfs_numasync) {
1428204076Spjd			NFS_DPF(ASYNCIO,
1429204076Spjd				("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
1430204076Spjd			nmp->nm_bufqwant = TRUE;
1431204076Spjd 			error = nfs_msleep(td, &nmp->nm_bufq, &nfs_iod_mtx,
1432204076Spjd					   slpflag | PRIBIO,
1433204076Spjd 					   "nfsaio", slptimeo);
1434204076Spjd			if (error) {
1435204076Spjd				error2 = nfs_sigintr(nmp, NULL, td);
1436204076Spjd				if (error2) {
1437204076Spjd					mtx_unlock(&nfs_iod_mtx);
1438204076Spjd					return (error2);
1439204076Spjd				}
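				/*
				 * The sleep was interrupted, but nfs_sigintr()
				 * says the signal is not fatal for this mount;
				 * stop catching signals and retry with a short
				 * timeout instead.
				 */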
1440204076Spjd				if (slpflag == PCATCH) {
1441204076Spjd					slpflag = 0;
1442204076Spjd					slptimeo = 2 * hz;
1443204076Spjd				}
1444204076Spjd			}
1445204076Spjd			/*
1446204076Spjd			 * We might have lost our iod while sleeping,
1447204076Spjd			 * so check and loop if necessary.
1448204076Spjd			 */
1449204076Spjd			if (nmp->nm_bufqiods == 0) {
1450204076Spjd				NFS_DPF(ASYNCIO,
1451204076Spjd					("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
1452204076Spjd				goto again;
1453204076Spjd			}
1454204076Spjd		}
1455204076Spjd
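		/*
		 * Make sure the buffer carries a referenced credential of the
		 * appropriate flavor so the nfsiod can issue the RPC after the
		 * caller has returned.
		 */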
1456204076Spjd		if (bp->b_iocmd == BIO_READ) {
1457204076Spjd			if (bp->b_rcred == NOCRED && cred != NOCRED)
1458204076Spjd				bp->b_rcred = crhold(cred);
1459204076Spjd		} else {
1460204076Spjd			if (bp->b_wcred == NOCRED && cred != NOCRED)
1461204076Spjd				bp->b_wcred = crhold(cred);
1462204076Spjd		}
1463204076Spjd
1464204076Spjd		if (bp->b_flags & B_REMFREE)
1465204076Spjd			bremfreef(bp);
1466204076Spjd		BUF_KERNPROC(bp);
1467204076Spjd		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
1468204076Spjd		nmp->nm_bufqlen++;
1469204076Spjd		if ((bp->b_flags & B_DIRECT) && bp->b_iocmd == BIO_WRITE) {
1470204076Spjd			mtx_lock(&(VTONFS(bp->b_vp))->n_mtx);
1471204076Spjd			VTONFS(bp->b_vp)->n_directio_asyncwr++;
1472204076Spjd			mtx_unlock(&(VTONFS(bp->b_vp))->n_mtx);
1473211979Spjd		}
1474204076Spjd		mtx_unlock(&nfs_iod_mtx);
1475204076Spjd		return (0);
1476204076Spjd	}
1477204076Spjd
1478204076Spjd	mtx_unlock(&nfs_iod_mtx);
1479204076Spjd
1480204076Spjd	/*
1481204076Spjd	 * All the iods are busy on other mounts, so return EIO to
1482204076Spjd	 * force the caller to process the i/o synchronously.
1483204076Spjd	 */
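	/*
	 * Illustrative fallback (a sketch, not code taken from this file):
	 * a caller that sees EIO here is expected to perform the I/O itself,
	 * roughly:
	 *
	 *	if (nfs_asyncio(nmp, bp, cred, td) != 0)
	 *		(void) nfs_doio(bp->b_vp, bp, cred, td);
	 */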
1484204076Spjd	NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
1485204076Spjd	return (EIO);
1486204076Spjd}
1487204076Spjd
1488204076Spjdvoid
1489204076Spjdnfs_doio_directwrite(struct buf *bp)
1490204076Spjd{
1491204076Spjd	int iomode, must_commit;
1492204076Spjd	struct uio *uiop = (struct uio *)bp->b_caller1;
1493204076Spjd	char *iov_base = uiop->uio_iov->iov_base;
1494204076Spjd	struct nfsmount *nmp = VFSTONFS(bp->b_vp->v_mount);
1495204076Spjd
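	/*
	 * The uio, its iovec and the copied data were allocated with
	 * M_NFSDIRECTIO by the code that queued this asynchronous direct
	 * write; they are freed below once the write RPC has completed.
	 */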
1496204076Spjd	iomode = NFSV3WRITE_FILESYNC;
1497204076Spjd	uiop->uio_td = NULL; /* NULL since we're in nfsiod */
1498204076Spjd	(nmp->nm_rpcops->nr_writerpc)(bp->b_vp, uiop, bp->b_wcred, &iomode, &must_commit);
1499204076Spjd	KASSERT((must_commit == 0), ("nfs_doio_directwrite: Did not commit write"));
1500204076Spjd	free(iov_base, M_NFSDIRECTIO);
1501204076Spjd	free(uiop->uio_iov, M_NFSDIRECTIO);
1502204076Spjd	free(uiop, M_NFSDIRECTIO);
1503204076Spjd	if ((bp->b_flags & B_DIRECT) && bp->b_iocmd == BIO_WRITE) {
1504204076Spjd		struct nfsnode *np = VTONFS(bp->b_vp);
1505204076Spjd		mtx_lock(&np->n_mtx);
1506204076Spjd		np->n_directio_asyncwr--;
1507204076Spjd		if ((np->n_flag & NFSYNCWAIT) && np->n_directio_asyncwr == 0) {
1508204076Spjd			np->n_flag &= ~NFSYNCWAIT;
1509204076Spjd			wakeup((caddr_t)&np->n_directio_asyncwr);
1510204076Spjd		}
1511204076Spjd		mtx_unlock(&np->n_mtx);
1512204076Spjd	}
1513204076Spjd	vdrop(bp->b_vp);
1514204076Spjd	bp->b_vp = NULL;
1515204076Spjd	relpbuf(bp, &nfs_pbuf_freecnt);
1516204076Spjd}
1517204076Spjd
1518204076Spjd/*
1519204076Spjd * Do an I/O operation to/from a cache block. This may be called
1520204076Spjd * synchronously or from an nfsiod.
1521204076Spjd */
1522204076Spjdint
1523204076Spjdnfs_doio(struct vnode *vp, struct buf *bp, struct ucred *cr, struct thread *td)
1524204076Spjd{
1525204076Spjd	struct uio *uiop;
1526204076Spjd	struct nfsnode *np;
1527204076Spjd	struct nfsmount *nmp;
1528204076Spjd	int error = 0, iomode, must_commit = 0;
1529204076Spjd	struct uio uio;
1530204076Spjd	struct iovec io;
1531204076Spjd	struct proc *p = td ? td->td_proc : NULL;
1532204076Spjd	uint8_t	iocmd;
1533204076Spjd
1534204076Spjd	np = VTONFS(vp);
1535204076Spjd	nmp = VFSTONFS(vp->v_mount);
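	/*
	 * Describe the buffer's data area with a single-segment kernel-space
	 * uio so the RPC routines below can move data directly to and from
	 * bp->b_data.
	 */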
1536204076Spjd	uiop = &uio;
1537204076Spjd	uiop->uio_iov = &io;
1538204076Spjd	uiop->uio_iovcnt = 1;
1539204076Spjd	uiop->uio_segflg = UIO_SYSSPACE;
1540204076Spjd	uiop->uio_td = td;
1541204076Spjd
1542204076Spjd	/*
1543204076Spjd	 * clear BIO_ERROR and B_INVAL state prior to initiating the I/O.  We
1544204076Spjd	 * do this here so we do not have to do it in all the code that
1545204076Spjd	 * calls us.
1546204076Spjd	 */
1547204076Spjd	bp->b_flags &= ~B_INVAL;
1548204076Spjd	bp->b_ioflags &= ~BIO_ERROR;
1549204076Spjd
1550204076Spjd	KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp));
1551204076Spjd	iocmd = bp->b_iocmd;
1552204076Spjd	if (iocmd == BIO_READ) {
1553204076Spjd	    io.iov_len = uiop->uio_resid = bp->b_bcount;
1554218138Spjd	    io.iov_base = bp->b_data;
1555204076Spjd	    uiop->uio_rw = UIO_READ;
1556204076Spjd
1557204076Spjd	    switch (vp->v_type) {
1558204076Spjd	    case VREG:
1559204076Spjd		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
1560204076Spjd		nfsstats.read_bios++;
1561204076Spjd		error = (nmp->nm_rpcops->nr_readrpc)(vp, uiop, cr);
1562204076Spjd
1563204076Spjd		if (!error) {
1564204076Spjd		    if (uiop->uio_resid) {
1565204076Spjd			/*
1566204076Spjd			 * If we had a short read with no error, we must have
1567204076Spjd			 * hit a file hole.  We should zero-fill the remainder.
1568204076Spjd			 * This can also occur if the server hits the file EOF.
1569204076Spjd			 *
1570204076Spjd			 * Holes used to be able to occur due to pending
1571204076Spjd			 * writes, but that is not possible any longer.
1572204076Spjd			 */
1573204076Spjd			int nread = bp->b_bcount - uiop->uio_resid;
1574204076Spjd			int left  = uiop->uio_resid;
1575204076Spjd
1576204076Spjd			if (left > 0)
1577204076Spjd				bzero((char *)bp->b_data + nread, left);
1578204076Spjd			uiop->uio_resid = 0;
1579204076Spjd		    }
1580204076Spjd		}
1581204076Spjd		/* ASSERT_VOP_LOCKED(vp, "nfs_doio"); */
1582204076Spjd		if (p && (vp->v_vflag & VV_TEXT)) {
1583204076Spjd			mtx_lock(&np->n_mtx);
1584204076Spjd			if (NFS_TIMESPEC_COMPARE(&np->n_mtime, &np->n_vattr.va_mtime)) {
1585204076Spjd				mtx_unlock(&np->n_mtx);
1586204076Spjd				PROC_LOCK(p);
1587204076Spjd				killproc(p, "text file modification");
1588204076Spjd				PROC_UNLOCK(p);
1589204076Spjd			} else
1590204076Spjd				mtx_unlock(&np->n_mtx);
1591204076Spjd		}
1592216478Spjd		break;
1593216479Spjd	    case VLNK:
1594216479Spjd		uiop->uio_offset = (off_t)0;
1595204076Spjd		nfsstats.readlink_bios++;
1596204076Spjd		error = (nmp->nm_rpcops->nr_readlinkrpc)(vp, uiop, cr);
1597204076Spjd		break;
1598204076Spjd	    case VDIR:
1599204076Spjd		nfsstats.readdir_bios++;
1600204076Spjd		uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
1601204076Spjd		if ((nmp->nm_flag & NFSMNT_NFSV4) != 0)
1602204076Spjd			error = nfs4_readdirrpc(vp, uiop, cr);
1603204076Spjd		else {
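			/*
			 * Try READDIRPLUS first if it is enabled; if the
			 * server does not support it, clear the flag and
			 * fall back to plain READDIR below.
			 */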
1604204076Spjd			if ((nmp->nm_flag & NFSMNT_RDIRPLUS) != 0) {
1605204076Spjd				error = nfs_readdirplusrpc(vp, uiop, cr);
1606204076Spjd				if (error == NFSERR_NOTSUPP)
1607204076Spjd					nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
1608204076Spjd			}
1609204076Spjd			if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
1610204076Spjd				error = nfs_readdirrpc(vp, uiop, cr);
1611204076Spjd		}
1612204076Spjd		/*
1613204076Spjd		 * end-of-directory sets B_INVAL but does not generate an
1614204076Spjd		 * error.
1615204076Spjd		 */
1616204076Spjd		if (error == 0 && uiop->uio_resid == bp->b_bcount)
1617204076Spjd			bp->b_flags |= B_INVAL;
1618204076Spjd		break;
1619204076Spjd	    default:
1620204076Spjd		nfs_printf("nfs_doio:  type %x unexpected\n", vp->v_type);
1621204076Spjd		break;
1622204076Spjd	    }
1623204076Spjd	    if (error) {
1624218138Spjd		bp->b_ioflags |= BIO_ERROR;
1625204076Spjd		bp->b_error = error;
1626204076Spjd	    }
1627204076Spjd	} else {
1628204076Spjd	    /*
1629204076Spjd	     * If we only need to commit, try to commit
1630204076Spjd	     */
1631204076Spjd	    if (bp->b_flags & B_NEEDCOMMIT) {
1632204076Spjd		    int retv;
1633204076Spjd		    off_t off;
1634204076Spjd
1635204076Spjd		    off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff;
1636204076Spjd		    retv = (nmp->nm_rpcops->nr_commit)(
1637204076Spjd				vp, off, bp->b_dirtyend-bp->b_dirtyoff,
1638204076Spjd				bp->b_wcred, td);
1639204076Spjd		    if (retv == 0) {
1640204076Spjd			    bp->b_dirtyoff = bp->b_dirtyend = 0;
1641204076Spjd			    bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1642204076Spjd			    bp->b_resid = 0;
1643204076Spjd			    bufdone(bp);
1644204076Spjd			    return (0);
1645204076Spjd		    }
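		    /*
		     * A stale write verifier means the server has lost the
		     * unstably written data (e.g. it rebooted), so clear the
		     * commit state for the whole mount; the affected buffers
		     * must be written again rather than merely committed.
		     */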
1646204076Spjd		    if (retv == NFSERR_STALEWRITEVERF) {
1647204076Spjd			    nfs_clearcommit(vp->v_mount);
1648204076Spjd		    }
1649204076Spjd	    }
1650204076Spjd
1651204076Spjd	    /*
1652204076Spjd	     * Set up for the actual write.  Clamp the dirty region to the
	     * current file size so we never write past EOF after a truncation.
1653204076Spjd	     */
1654204076Spjd	    mtx_lock(&np->n_mtx);
1655204076Spjd	    if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
1656204076Spjd		bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE;
1657204076Spjd	    mtx_unlock(&np->n_mtx);
1658204076Spjd
1659204076Spjd	    if (bp->b_dirtyend > bp->b_dirtyoff) {
1660204076Spjd		io.iov_len = uiop->uio_resid = bp->b_dirtyend
1661204076Spjd		    - bp->b_dirtyoff;
1662204076Spjd		uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE
1663204076Spjd		    + bp->b_dirtyoff;
1664204076Spjd		io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
1665204076Spjd		uiop->uio_rw = UIO_WRITE;
1666204076Spjd		nfsstats.write_bios++;
1667204076Spjd
1668204076Spjd		if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC)
1669204076Spjd		    iomode = NFSV3WRITE_UNSTABLE;
1670204076Spjd		else
1671204076Spjd		    iomode = NFSV3WRITE_FILESYNC;
1672204076Spjd
1673204076Spjd		error = (nmp->nm_rpcops->nr_writerpc)(vp, uiop, cr, &iomode, &must_commit);
1674204076Spjd
1675204076Spjd		/*
1676204076Spjd		 * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try
1677204076Spjd		 * to cluster the buffers needing commit.  This will allow
1678204076Spjd		 * the system to submit a single commit rpc for the whole
1679219879Strociny		 * cluster.  We can do this even if the buffer is not 100%
1680219879Strociny		 * dirty (relative to the NFS blocksize), so we optimize the
1681204076Spjd		 * append-to-file case.
1682219879Strociny		 *
1683219879Strociny		 * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be
1684219879Strociny		 * cleared because write clustering only works for commit
1685219879Strociny		 * rpc's, not for the data portion of the write).
1686219879Strociny		 */
1687204076Spjd
1688204076Spjd		if (!error && iomode == NFSV3WRITE_UNSTABLE) {
1689204076Spjd		    bp->b_flags |= B_NEEDCOMMIT;
1690223655Strociny		    if (bp->b_dirtyoff == 0
1691223655Strociny			&& bp->b_dirtyend == bp->b_bcount)
1692223655Strociny			bp->b_flags |= B_CLUSTEROK;
1693223655Strociny		} else {
1694223655Strociny		    bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1695204076Spjd		}
1696204076Spjd
1697204076Spjd		/*
1698204076Spjd		 * For an interrupted write, the buffer is still valid
1699204076Spjd		 * and the write hasn't been pushed to the server yet,
1700204076Spjd		 * so we can't set BIO_ERROR and report the interruption
1701204076Spjd		 * by setting B_EINTR. For the B_ASYNC case, B_EINTR
1702204076Spjd		 * is not relevant, so the rpc attempt is essentially
1703204076Spjd		 * a noop.  For the case of a V3 write rpc not being
1704204076Spjd		 * committed to stable storage, the block is still
1705204076Spjd		 * dirty and requires either a commit rpc or another
1706204076Spjd		 * write rpc with iomode == NFSV3WRITE_FILESYNC before
1707204076Spjd		 * the block is reused. This is indicated by setting
1708204076Spjd		 * the B_DELWRI and B_NEEDCOMMIT flags.
1709204076Spjd		 *
1710204076Spjd		 * If the buffer is marked B_PAGING, it does not reside on
1711204076Spjd		 * the vp's paging queues so we cannot call bdirty().  The
1712204076Spjd		 * bp in this case is not an NFS cache block so we should
1713204076Spjd		 * be safe. XXX
1714204076Spjd		 */
1715204076Spjd    		if (error == EINTR || error == EIO || error == ETIMEDOUT
1716204076Spjd		    || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
1717204076Spjd			int s;
1718204076Spjd
1719204076Spjd			s = splbio();
1720204076Spjd			bp->b_flags &= ~(B_INVAL|B_NOCACHE);
1721204076Spjd			if ((bp->b_flags & B_PAGING) == 0) {
1722204076Spjd			    bdirty(bp);
1723204076Spjd			    bp->b_flags &= ~B_DONE;
1724204076Spjd			}
1725204076Spjd			if (error && (bp->b_flags & B_ASYNC) == 0)
1726204076Spjd			    bp->b_flags |= B_EINTR;
1727204076Spjd			splx(s);
1728204076Spjd	    	} else {
1729204076Spjd		    if (error) {
1730204076Spjd			bp->b_ioflags |= BIO_ERROR;
1731204076Spjd			bp->b_error = np->n_error = error;
1732204076Spjd			mtx_lock(&np->n_mtx);
1733204076Spjd			np->n_flag |= NWRITEERR;
1734204076Spjd			mtx_unlock(&np->n_mtx);
1735204076Spjd		    }
1736204076Spjd		    bp->b_dirtyoff = bp->b_dirtyend = 0;
1737204076Spjd		}
1738204076Spjd	    } else {
1739204076Spjd		bp->b_resid = 0;
1740204076Spjd		bufdone(bp);
1741204076Spjd		return (0);
1742204076Spjd	    }
1743204076Spjd	}
1744204076Spjd	bp->b_resid = uiop->uio_resid;
1745204076Spjd	if (must_commit)
1746219372Spjd	    nfs_clearcommit(vp->v_mount);
1747204076Spjd	bufdone(bp);
1748204076Spjd	return (error);
1749204076Spjd}
1750204076Spjd
1751204076Spjd/*
1752204076Spjd * Used to aid in handling ftruncate() operations on the NFS client side.
1753204076Spjd * Truncation creates a number of special problems for NFS.  We have to
1754211897Spjd * throw away VM pages and buffer cache buffers that are beyond EOF, and
1755211897Spjd * we have to properly handle VM pages or (potentially dirty) buffers
1756204076Spjd * that straddle the truncation point.
1757204076Spjd */
1758204076Spjd
1759211897Spjdint
1760219372Spjdnfs_meta_setsize(struct vnode *vp, struct ucred *cred, struct thread *td, u_quad_t nsize)
1761219372Spjd{
1762219372Spjd	struct nfsnode *np = VTONFS(vp);
1763219372Spjd	u_quad_t tsize;
1764211879Spjd	int biosize = vp->v_mount->mnt_stat.f_iosize;
1765212038Spjd	int error = 0;
1766211879Spjd
1767204076Spjd	mtx_lock(&np->n_mtx);
1768204076Spjd	tsize = np->n_size;
1769204076Spjd	np->n_size = nsize;
1770204076Spjd	mtx_unlock(&np->n_mtx);
1771204076Spjd
1772204076Spjd	if (nsize < tsize) {
1773204076Spjd		struct buf *bp;
1774204076Spjd		daddr_t lbn;
1775204076Spjd		int bufsize;
1776204076Spjd
1777204076Spjd		/*
1778204076Spjd		 * vtruncbuf() doesn't get the buffer overlapping the
1779204076Spjd		 * truncation point.  We may have a B_DELWRI and/or B_CACHE
1780204076Spjd		 * buffer that now needs to be truncated.
1781204076Spjd		 */
1782204076Spjd		error = vtruncbuf(vp, cred, td, nsize, biosize);
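		/*
		 * Locate the block that straddles the new EOF.  For example,
		 * with a biosize of 8192 a new size of 10000 gives lbn 1 and
		 * bufsize 1808; the mask below assumes biosize is a power of
		 * two.
		 */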
1783204076Spjd		lbn = nsize / biosize;
1784204076Spjd		bufsize = nsize & (biosize - 1);
1785204076Spjd		bp = nfs_getcacheblk(vp, lbn, bufsize, td);
1786204076Spjd 		if (!bp)
1787204076Spjd 			return EINTR;
1788204076Spjd		if (bp->b_dirtyoff > bp->b_bcount)
1789204076Spjd			bp->b_dirtyoff = bp->b_bcount;
1790204076Spjd		if (bp->b_dirtyend > bp->b_bcount)
1791204076Spjd			bp->b_dirtyend = bp->b_bcount;
1792204076Spjd		bp->b_flags |= B_RELBUF;  /* don't leave garbage around */
1793204076Spjd		brelse(bp);
1794204076Spjd	} else {
1795219372Spjd		vnode_pager_setsize(vp, nsize);
1796219372Spjd	}
1797204076Spjd	return(error);
1798212038Spjd}
1799219372Spjd
1800204076Spjd