nfs_bio.c revision 193187
1/*-
2 * Copyright (c) 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * Rick Macklem at The University of Guelph.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 4. Neither the name of the University nor the names of its contributors
17 *    may be used to endorse or promote products derived from this software
18 *    without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
33 */
34
35#include <sys/cdefs.h>
36__FBSDID("$FreeBSD: head/sys/nfsclient/nfs_bio.c 193187 2009-05-31 20:18:02Z alc $");
37
38#include "opt_kdtrace.h"
39
40#include <sys/param.h>
41#include <sys/systm.h>
42#include <sys/bio.h>
43#include <sys/buf.h>
44#include <sys/kernel.h>
45#include <sys/mbuf.h>
46#include <sys/mount.h>
47#include <sys/proc.h>
48#include <sys/resourcevar.h>
49#include <sys/signalvar.h>
50#include <sys/vmmeter.h>
51#include <sys/vnode.h>
52
53#include <vm/vm.h>
54#include <vm/vm_extern.h>
55#include <vm/vm_page.h>
56#include <vm/vm_object.h>
57#include <vm/vm_pager.h>
58#include <vm/vnode_pager.h>
59
60#include <nfs/rpcv2.h>
61#include <nfs/nfsproto.h>
62#include <nfsclient/nfs.h>
63#include <nfsclient/nfsmount.h>
64#include <nfsclient/nfsnode.h>
65#include <nfsclient/nfs_kdtrace.h>
66
67static struct buf *nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size,
68		    struct thread *td);
69static int nfs_directio_write(struct vnode *vp, struct uio *uiop,
70			      struct ucred *cred, int ioflag);
71
72extern int nfs_directio_enable;
73extern int nfs_directio_allow_mmap;
74
75/*
76 * Vnode op for VM getpages.
77 */
78int
79nfs_getpages(struct vop_getpages_args *ap)
80{
81	int i, error, nextoff, size, toff, count, npages;
82	struct uio uio;
83	struct iovec iov;
84	vm_offset_t kva;
85	struct buf *bp;
86	struct vnode *vp;
87	struct thread *td;
88	struct ucred *cred;
89	struct nfsmount *nmp;
90	vm_object_t object;
91	vm_page_t *pages;
92	struct nfsnode *np;
93
94	vp = ap->a_vp;
95	np = VTONFS(vp);
96	td = curthread;				/* XXX */
97	cred = curthread->td_ucred;		/* XXX */
98	nmp = VFSTONFS(vp->v_mount);
99	pages = ap->a_m;
100	count = ap->a_count;
101
102	if ((object = vp->v_object) == NULL) {
103		nfs_printf("nfs_getpages: called with non-merged cache vnode??\n");
104		return VM_PAGER_ERROR;
105	}
106
107	if (nfs_directio_enable && !nfs_directio_allow_mmap) {
108		mtx_lock(&np->n_mtx);
109		if ((np->n_flag & NNONCACHE) && (vp->v_type == VREG)) {
110			mtx_unlock(&np->n_mtx);
111			nfs_printf("nfs_getpages: called on non-cacheable vnode??\n");
112			return VM_PAGER_ERROR;
113		} else
114			mtx_unlock(&np->n_mtx);
115	}
116
117	mtx_lock(&nmp->nm_mtx);
118	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
119	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
120		mtx_unlock(&nmp->nm_mtx);
121		/* We'll never get here for v4, because we always have fsinfo */
122		(void)nfs_fsinfo(nmp, vp, cred, td);
123	} else
124		mtx_unlock(&nmp->nm_mtx);
125
126	npages = btoc(count);
127
128	/*
129	 * If the requested page is partially valid, just return it and
130	 * allow the pager to zero-out the blanks.  Partially valid pages
131	 * can only occur at the file EOF.
132	 */
133
134	{
135		vm_page_t m = pages[ap->a_reqpage];
136
137		VM_OBJECT_LOCK(object);
138		if (m->valid != 0) {
139			vm_page_lock_queues();
140			for (i = 0; i < npages; ++i) {
141				if (i != ap->a_reqpage)
142					vm_page_free(pages[i]);
143			}
144			vm_page_unlock_queues();
145			VM_OBJECT_UNLOCK(object);
146			return(0);
147		}
148		VM_OBJECT_UNLOCK(object);
149	}
150
151	/*
152	 * We use only the kva address for the buffer, but this is extremely
153	 * convenient and fast.
154	 */
155	bp = getpbuf(&nfs_pbuf_freecnt);
156
157	kva = (vm_offset_t) bp->b_data;
158	pmap_qenter(kva, pages, npages);
159	PCPU_INC(cnt.v_vnodein);
160	PCPU_ADD(cnt.v_vnodepgsin, npages);
161
162	iov.iov_base = (caddr_t) kva;
163	iov.iov_len = count;
164	uio.uio_iov = &iov;
165	uio.uio_iovcnt = 1;
166	uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
167	uio.uio_resid = count;
168	uio.uio_segflg = UIO_SYSSPACE;
169	uio.uio_rw = UIO_READ;
170	uio.uio_td = td;
171
172	error = (nmp->nm_rpcops->nr_readrpc)(vp, &uio, cred);
173	pmap_qremove(kva, npages);
174
175	relpbuf(bp, &nfs_pbuf_freecnt);
176
177	if (error && (uio.uio_resid == count)) {
178		nfs_printf("nfs_getpages: error %d\n", error);
179		VM_OBJECT_LOCK(object);
180		vm_page_lock_queues();
181		for (i = 0; i < npages; ++i) {
182			if (i != ap->a_reqpage)
183				vm_page_free(pages[i]);
184		}
185		vm_page_unlock_queues();
186		VM_OBJECT_UNLOCK(object);
187		return VM_PAGER_ERROR;
188	}
189
190	/*
191	 * Calculate the number of bytes read and validate only that number
192	 * of bytes.  Note that due to pending writes, size may be 0.  This
193	 * does not mean that the remaining data is invalid!
194	 */
195
196	size = count - uio.uio_resid;
197	VM_OBJECT_LOCK(object);
198	vm_page_lock_queues();
199	for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
200		vm_page_t m;
201		nextoff = toff + PAGE_SIZE;
202		m = pages[i];
203
204		if (nextoff <= size) {
205			/*
206			 * Read operation filled an entire page
207			 */
208			m->valid = VM_PAGE_BITS_ALL;
209			KASSERT(m->dirty == 0,
210			    ("nfs_getpages: page %p is dirty", m));
211		} else if (size > toff) {
212			/*
213			 * Read operation filled a partial page.
214			 */
215			m->valid = 0;
216			vm_page_set_valid(m, 0, size - toff);
217			KASSERT(m->dirty == 0,
218			    ("nfs_getpages: page %p is dirty", m));
219		} else {
220			/*
221			 * Read operation was short.  If no error occurred
222			 * we may have hit a zero-fill section.   We simply
223			 * leave valid set to 0.
224			 */
225			;
226		}
227		if (i != ap->a_reqpage) {
228			/*
229			 * Whether or not to leave the page activated is up in
230			 * the air, but we should put the page on a page queue
231			 * somewhere (it already is in the object).
232			 * Empirical results show that deactivating
233			 * pages is best.
234			 */
235
236			/*
237			 * Just in case someone was asking for this page we
238			 * now tell them that it is ok to use.
239			 */
240			if (!error) {
241				if (m->oflags & VPO_WANTED)
242					vm_page_activate(m);
243				else
244					vm_page_deactivate(m);
245				vm_page_wakeup(m);
246			} else {
247				vm_page_free(m);
248			}
249		}
250	}
251	vm_page_unlock_queues();
252	VM_OBJECT_UNLOCK(object);
253	return 0;
254}
255
256/*
257 * Vnode op for VM putpages.
258 */
259int
260nfs_putpages(struct vop_putpages_args *ap)
261{
262	struct uio uio;
263	struct iovec iov;
264	vm_offset_t kva;
265	struct buf *bp;
266	int iomode, must_commit, i, error, npages, count;
267	off_t offset;
268	int *rtvals;
269	struct vnode *vp;
270	struct thread *td;
271	struct ucred *cred;
272	struct nfsmount *nmp;
273	struct nfsnode *np;
274	vm_page_t *pages;
275
276	vp = ap->a_vp;
277	np = VTONFS(vp);
278	td = curthread;				/* XXX */
279	cred = curthread->td_ucred;		/* XXX */
280	nmp = VFSTONFS(vp->v_mount);
281	pages = ap->a_m;
282	count = ap->a_count;
283	rtvals = ap->a_rtvals;
284	npages = btoc(count);
285	offset = IDX_TO_OFF(pages[0]->pindex);
286
287	mtx_lock(&nmp->nm_mtx);
288	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
289	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
290		mtx_unlock(&nmp->nm_mtx);
291		(void)nfs_fsinfo(nmp, vp, cred, td);
292	} else
293		mtx_unlock(&nmp->nm_mtx);
294
295	mtx_lock(&np->n_mtx);
296	if (nfs_directio_enable && !nfs_directio_allow_mmap &&
297	    (np->n_flag & NNONCACHE) && (vp->v_type == VREG)) {
298		mtx_unlock(&np->n_mtx);
299		nfs_printf("nfs_putpages: called on non-cacheable vnode??\n");
300		mtx_lock(&np->n_mtx);
301	}
302
303	for (i = 0; i < npages; i++)
304		rtvals[i] = VM_PAGER_AGAIN;
305
306	/*
307	 * When putting pages, do not extend file past EOF.
308	 */
309	if (offset + count > np->n_size) {
310		count = np->n_size - offset;
311		if (count < 0)
312			count = 0;
313	}
314	mtx_unlock(&np->n_mtx);
315
316	/*
317	 * We use only the kva address for the buffer, but this is extremely
318	 * convenient and fast.
319	 */
320	bp = getpbuf(&nfs_pbuf_freecnt);
321
322	kva = (vm_offset_t) bp->b_data;
323	pmap_qenter(kva, pages, npages);
324	PCPU_INC(cnt.v_vnodeout);
325	PCPU_ADD(cnt.v_vnodepgsout, count);
326
327	iov.iov_base = (caddr_t) kva;
328	iov.iov_len = count;
329	uio.uio_iov = &iov;
330	uio.uio_iovcnt = 1;
331	uio.uio_offset = offset;
332	uio.uio_resid = count;
333	uio.uio_segflg = UIO_SYSSPACE;
334	uio.uio_rw = UIO_WRITE;
335	uio.uio_td = td;
336
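	/*
	 * Use an UNSTABLE write unless the pager asked for synchronous
	 * completion; unstable data must be committed to stable storage
	 * by a later commit RPC.
	 */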
337	if ((ap->a_sync & VM_PAGER_PUT_SYNC) == 0)
338	    iomode = NFSV3WRITE_UNSTABLE;
339	else
340	    iomode = NFSV3WRITE_FILESYNC;
341
342	error = (nmp->nm_rpcops->nr_writerpc)(vp, &uio, cred, &iomode, &must_commit);
343
344	pmap_qremove(kva, npages);
345	relpbuf(bp, &nfs_pbuf_freecnt);
346
347	if (!error) {
348		int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE;
349		for (i = 0; i < nwritten; i++) {
350			rtvals[i] = VM_PAGER_OK;
351			vm_page_undirty(pages[i]);
352		}
353		if (must_commit) {
354			nfs_clearcommit(vp->v_mount);
355		}
356	}
357	return rtvals[0];
358}
359
360/*
361 * For nfs, cache consistency can only be maintained approximately.
362 * Although RFC1094 does not specify the criteria, the following is
363 * believed to be compatible with the reference port.
364 * For nfs:
365 * If the file's modify time on the server has changed since the
366 * last read rpc or you have written to the file,
367 * you may have lost data cache consistency with the
368 * server, so flush all of the file's data out of the cache.
369 * Then force a getattr rpc to ensure that you have up to date
370 * attributes.
371 * NB: This implies that cache data can be read when up to
372 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
373 * attributes, this could be forced by setting n_attrstamp to 0 before
374 * the VOP_GETATTR() call.
375 */
376static inline int
377nfs_bioread_check_cons(struct vnode *vp, struct thread *td, struct ucred *cred)
378{
379	int error = 0;
380	struct vattr vattr;
381	struct nfsnode *np = VTONFS(vp);
382	int old_lock;
383	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
384
385	/*
386	 * Grab the exclusive lock before checking whether the cache is
387	 * consistent.
388	 * XXX - We can make this cheaper later (by acquiring cheaper locks).
389	 * But for now, this suffices.
390	 */
391	old_lock = nfs_upgrade_vnlock(vp);
392	mtx_lock(&np->n_mtx);
393	if (np->n_flag & NMODIFIED) {
394		mtx_unlock(&np->n_mtx);
395		if (vp->v_type != VREG) {
396			if (vp->v_type != VDIR)
397				panic("nfs: bioread, not dir");
398			(nmp->nm_rpcops->nr_invaldir)(vp);
399			error = nfs_vinvalbuf(vp, V_SAVE, td, 1);
400			if (error)
401				goto out;
402		}
403		np->n_attrstamp = 0;
404		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
405		error = VOP_GETATTR(vp, &vattr, cred);
406		if (error)
407			goto out;
408		mtx_lock(&np->n_mtx);
409		np->n_mtime = vattr.va_mtime;
410		mtx_unlock(&np->n_mtx);
411	} else {
412		mtx_unlock(&np->n_mtx);
413		error = VOP_GETATTR(vp, &vattr, cred);
414		if (error)
415			return (error);
416		mtx_lock(&np->n_mtx);
417		if ((np->n_flag & NSIZECHANGED)
418		    || (NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime))) {
419			mtx_unlock(&np->n_mtx);
420			if (vp->v_type == VDIR)
421				(nmp->nm_rpcops->nr_invaldir)(vp);
422			error = nfs_vinvalbuf(vp, V_SAVE, td, 1);
423			if (error)
424				goto out;
425			mtx_lock(&np->n_mtx);
426			np->n_mtime = vattr.va_mtime;
427			np->n_flag &= ~NSIZECHANGED;
428		}
429		mtx_unlock(&np->n_mtx);
430	}
431out:
432	nfs_downgrade_vnlock(vp, old_lock);
433	return error;
434}
435
436/*
437 * Vnode op for read using bio
438 */
439int
440nfs_bioread(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred)
441{
442	struct nfsnode *np = VTONFS(vp);
443	int biosize, i;
444	struct buf *bp, *rabp;
445	struct thread *td;
446	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
447	daddr_t lbn, rabn;
448	int bcount;
449	int seqcount;
450	int nra, error = 0, n = 0, on = 0;
451
452#ifdef DIAGNOSTIC
453	if (uio->uio_rw != UIO_READ)
454		panic("nfs_read mode");
455#endif
456	if (uio->uio_resid == 0)
457		return (0);
458	if (uio->uio_offset < 0)	/* XXX VDIR cookies can be negative */
459		return (EINVAL);
460	td = uio->uio_td;
461
462	mtx_lock(&nmp->nm_mtx);
463	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
464	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
465		mtx_unlock(&nmp->nm_mtx);
466		(void)nfs_fsinfo(nmp, vp, cred, td);
467	} else
468		mtx_unlock(&nmp->nm_mtx);
469
470	if (vp->v_type != VDIR &&
471	    (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
472		return (EFBIG);
473
474	if (nfs_directio_enable && (ioflag & IO_DIRECT) && (vp->v_type == VREG))
475		/* No caching/ no readaheads. Just read data into the user buffer */
476		return nfs_readrpc(vp, uio, cred);
477
478	biosize = vp->v_mount->mnt_stat.f_iosize;
479	seqcount = (int)((off_t)(ioflag >> IO_SEQSHIFT) * biosize / BKVASIZE);
480
481	error = nfs_bioread_check_cons(vp, td, cred);
482	if (error)
483		return error;
484
485	do {
486	    u_quad_t nsize;
487
488	    mtx_lock(&np->n_mtx);
489	    nsize = np->n_size;
490	    mtx_unlock(&np->n_mtx);
491
492	    switch (vp->v_type) {
493	    case VREG:
494		nfsstats.biocache_reads++;
495		lbn = uio->uio_offset / biosize;
496		on = uio->uio_offset & (biosize - 1);
497
498		/*
499		 * Start the read ahead(s), as required.
500		 */
501		if (nmp->nm_readahead > 0) {
502		    for (nra = 0; nra < nmp->nm_readahead && nra < seqcount &&
503			(off_t)(lbn + 1 + nra) * biosize < nsize; nra++) {
504			rabn = lbn + 1 + nra;
505			if (incore(&vp->v_bufobj, rabn) == NULL) {
506			    rabp = nfs_getcacheblk(vp, rabn, biosize, td);
507			    if (!rabp) {
508				error = nfs_sigintr(nmp, NULL, td);
509				return (error ? error : EINTR);
510			    }
511			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
512				rabp->b_flags |= B_ASYNC;
513				rabp->b_iocmd = BIO_READ;
514				vfs_busy_pages(rabp, 0);
515				if (nfs_asyncio(nmp, rabp, cred, td)) {
516				    rabp->b_flags |= B_INVAL;
517				    rabp->b_ioflags |= BIO_ERROR;
518				    vfs_unbusy_pages(rabp);
519				    brelse(rabp);
520				    break;
521				}
522			    } else {
523				brelse(rabp);
524			    }
525			}
526		    }
527		}
528
529		/* Note that bcount is *not* DEV_BSIZE aligned. */
530		bcount = biosize;
531		if ((off_t)lbn * biosize >= nsize) {
532			bcount = 0;
533		} else if ((off_t)(lbn + 1) * biosize > nsize) {
534			bcount = nsize - (off_t)lbn * biosize;
535		}
536		bp = nfs_getcacheblk(vp, lbn, bcount, td);
537
538		if (!bp) {
539			error = nfs_sigintr(nmp, NULL, td);
540			return (error ? error : EINTR);
541		}
542
543		/*
544		 * If B_CACHE is not set, we must issue the read.  If this
545		 * fails, we return an error.
546		 */
547
548		if ((bp->b_flags & B_CACHE) == 0) {
549		    bp->b_iocmd = BIO_READ;
550		    vfs_busy_pages(bp, 0);
551		    error = nfs_doio(vp, bp, cred, td);
552		    if (error) {
553			brelse(bp);
554			return (error);
555		    }
556		}
557
558		/*
559		 * on is the offset into the current bp.  Figure out how many
560		 * bytes we can copy out of the bp.  Note that bcount is
561		 * NOT DEV_BSIZE aligned.
562		 *
563		 * Then figure out how many bytes we can copy into the uio.
564		 */
565
566		n = 0;
567		if (on < bcount)
568			n = min((unsigned)(bcount - on), uio->uio_resid);
569		break;
570	    case VLNK:
571		nfsstats.biocache_readlinks++;
572		bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, td);
573		if (!bp) {
574			error = nfs_sigintr(nmp, NULL, td);
575			return (error ? error : EINTR);
576		}
577		if ((bp->b_flags & B_CACHE) == 0) {
578		    bp->b_iocmd = BIO_READ;
579		    vfs_busy_pages(bp, 0);
580		    error = nfs_doio(vp, bp, cred, td);
581		    if (error) {
582			bp->b_ioflags |= BIO_ERROR;
583			brelse(bp);
584			return (error);
585		    }
586		}
587		n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
588		on = 0;
589		break;
590	    case VDIR:
591		nfsstats.biocache_readdirs++;
592		if (np->n_direofoffset
593		    && uio->uio_offset >= np->n_direofoffset) {
594		    return (0);
595		}
596		lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ;
597		on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
598		bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, td);
599		if (!bp) {
600		    error = nfs_sigintr(nmp, NULL, td);
601		    return (error ? error : EINTR);
602		}
603		if ((bp->b_flags & B_CACHE) == 0) {
604		    bp->b_iocmd = BIO_READ;
605		    vfs_busy_pages(bp, 0);
606		    error = nfs_doio(vp, bp, cred, td);
607		    if (error) {
608			    brelse(bp);
609		    }
610		    while (error == NFSERR_BAD_COOKIE) {
611			(nmp->nm_rpcops->nr_invaldir)(vp);
612			error = nfs_vinvalbuf(vp, 0, td, 1);
613			/*
614			 * Yuck! The directory has been modified on the
615			 * server. The only way to get the block is by
616			 * reading from the beginning to get all the
617			 * offset cookies.
618			 *
619			 * Leave the last bp intact unless there is an error.
620			 * Loop back up to the while if the error is another
621			 * NFSERR_BAD_COOKIE (double yuch!).
622			 */
623			for (i = 0; i <= lbn && !error; i++) {
624			    if (np->n_direofoffset
625				&& (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
626				    return (0);
627			    bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, td);
628			    if (!bp) {
629				error = nfs_sigintr(nmp, NULL, td);
630				return (error ? error : EINTR);
631			    }
632			    if ((bp->b_flags & B_CACHE) == 0) {
633				    bp->b_iocmd = BIO_READ;
634				    vfs_busy_pages(bp, 0);
635				    error = nfs_doio(vp, bp, cred, td);
636				    /*
637				     * no error + B_INVAL == directory EOF,
638				     * use the block.
639				     */
640				    if (error == 0 && (bp->b_flags & B_INVAL))
641					    break;
642			    }
643			    /*
644			     * An error will throw away the block and the
645			     * for loop will break out.  If no error and this
646			     * is not the block we want, we throw away the
647			     * block and go for the next one via the for loop.
648			     */
649			    if (error || i < lbn)
650				    brelse(bp);
651			}
652		    }
653		    /*
654		     * The above while is repeated if we hit another cookie
655		     * error.  If we hit an error and it wasn't a cookie error,
656		     * we give up.
657		     */
658		    if (error)
659			    return (error);
660		}
661
662		/*
663		 * If not eof and read aheads are enabled, start one.
664		 * (You need the current block first, so that you have the
665		 *  directory offset cookie of the next block.)
666		 */
667		if (nmp->nm_readahead > 0 &&
668		    (bp->b_flags & B_INVAL) == 0 &&
669		    (np->n_direofoffset == 0 ||
670		    (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
671		    incore(&vp->v_bufobj, lbn + 1) == NULL) {
672			rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, td);
673			if (rabp) {
674			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
675				rabp->b_flags |= B_ASYNC;
676				rabp->b_iocmd = BIO_READ;
677				vfs_busy_pages(rabp, 0);
678				if (nfs_asyncio(nmp, rabp, cred, td)) {
679				    rabp->b_flags |= B_INVAL;
680				    rabp->b_ioflags |= BIO_ERROR;
681				    vfs_unbusy_pages(rabp);
682				    brelse(rabp);
683				}
684			    } else {
685				brelse(rabp);
686			    }
687			}
688		}
689		/*
690		 * Unlike VREG files, whose buffer size ( bp->b_bcount ) is
691		 * chopped for the EOF condition, we cannot tell how large
692		 * NFS directories are going to be until we hit EOF.  So
693		 * an NFS directory buffer is *not* chopped to its EOF.  Now,
694		 * it just so happens that b_resid will effectively chop it
695		 * to EOF.  *BUT* this information is lost if the buffer goes
696		 * away and is reconstituted into a B_CACHE state ( due to
697		 * being VMIO ) later.  So we keep track of the directory eof
698		 * in np->n_direofoffset and chop it off as an extra step
699		 * right here.
700		 */
701		n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
702		if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset)
703			n = np->n_direofoffset - uio->uio_offset;
704		break;
705	    default:
706		nfs_printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
707		bp = NULL;
708		break;
709	    };
710
711	    if (n > 0) {
712		    error = uiomove(bp->b_data + on, (int)n, uio);
713	    }
714	    if (vp->v_type == VLNK)
715		n = 0;
716	    if (bp != NULL)
717		brelse(bp);
718	} while (error == 0 && uio->uio_resid > 0 && n > 0);
719	return (error);
720}
721
722/*
723 * The NFS write path cannot handle iovecs with len > 1. So we need to
724 * break up iovecs accordingly (restricting them to wsize).
725 * For the SYNC case, we can do this with 1 copy (user buffer -> mbuf).
726 * For the ASYNC case, 2 copies are needed. The first is a copy from the
727 * user buffer to a staging buffer and then a second copy from the staging
728 * buffer to mbufs. This can be optimized by copying from the user buffer
729 * directly into mbufs and passing the chain down, but that requires a
730 * fair amount of re-working of the relevant codepaths (and can be done
731 * later).
732 */
733static int
734nfs_directio_write(vp, uiop, cred, ioflag)
735	struct vnode *vp;
736	struct uio *uiop;
737	struct ucred *cred;
738	int ioflag;
739{
740	int error;
741	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
742	struct thread *td = uiop->uio_td;
743	int size;
744	int wsize;
745
746	mtx_lock(&nmp->nm_mtx);
747	wsize = nmp->nm_wsize;
748	mtx_unlock(&nmp->nm_mtx);
749	if (ioflag & IO_SYNC) {
750		int iomode, must_commit;
751		struct uio uio;
752		struct iovec iov;
753do_sync:
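		/*
		 * Synchronous path: issue FILESYNC write RPCs of at most
		 * wsize bytes (and a single iovec) each, advancing the
		 * caller's uio as each chunk completes.  The async path
		 * below falls back here if it cannot hand a request to
		 * an nfsiod.
		 */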
754		while (uiop->uio_resid > 0) {
755			size = min(uiop->uio_resid, wsize);
756			size = min(uiop->uio_iov->iov_len, size);
757			iov.iov_base = uiop->uio_iov->iov_base;
758			iov.iov_len = size;
759			uio.uio_iov = &iov;
760			uio.uio_iovcnt = 1;
761			uio.uio_offset = uiop->uio_offset;
762			uio.uio_resid = size;
763			uio.uio_segflg = UIO_USERSPACE;
764			uio.uio_rw = UIO_WRITE;
765			uio.uio_td = td;
766			iomode = NFSV3WRITE_FILESYNC;
767			error = (nmp->nm_rpcops->nr_writerpc)(vp, &uio, cred,
768						      &iomode, &must_commit);
769			KASSERT((must_commit == 0),
770				("nfs_directio_write: Did not commit write"));
771			if (error)
772				return (error);
773			uiop->uio_offset += size;
774			uiop->uio_resid -= size;
775			if (uiop->uio_iov->iov_len <= size) {
776				uiop->uio_iovcnt--;
777				uiop->uio_iov++;
778			} else {
779				uiop->uio_iov->iov_base =
780					(char *)uiop->uio_iov->iov_base + size;
781				uiop->uio_iov->iov_len -= size;
782			}
783		}
784	} else {
785		struct uio *t_uio;
786		struct iovec *t_iov;
787		struct buf *bp;
788
789		/*
790		 * Break up the write into blocksize chunks and hand these
791		 * over to nfsiod's for write back.
792		 * Unfortunately, this incurs a copy of the data, since
793		 * the user could modify the buffer before the write is
794		 * initiated.
795		 *
796		 * The obvious optimization here is that one of the 2 copies
797		 * in the async write path can be eliminated by copying the
798		 * data here directly into mbufs and passing the mbuf chain
799		 * down. But that will require a fair amount of re-working
800		 * of the code and can be done if there's enough interest
801		 * in NFS directio access.
802		 */
803		while (uiop->uio_resid > 0) {
804			size = min(uiop->uio_resid, wsize);
805			size = min(uiop->uio_iov->iov_len, size);
806			bp = getpbuf(&nfs_pbuf_freecnt);
807			t_uio = malloc(sizeof(struct uio), M_NFSDIRECTIO, M_WAITOK);
808			t_iov = malloc(sizeof(struct iovec), M_NFSDIRECTIO, M_WAITOK);
809			t_iov->iov_base = malloc(size, M_NFSDIRECTIO, M_WAITOK);
810			t_iov->iov_len = size;
811			t_uio->uio_iov = t_iov;
812			t_uio->uio_iovcnt = 1;
813			t_uio->uio_offset = uiop->uio_offset;
814			t_uio->uio_resid = size;
815			t_uio->uio_segflg = UIO_SYSSPACE;
816			t_uio->uio_rw = UIO_WRITE;
817			t_uio->uio_td = td;
818			bcopy(uiop->uio_iov->iov_base, t_iov->iov_base, size);
819			bp->b_flags |= B_DIRECT;
820			bp->b_iocmd = BIO_WRITE;
821			if (cred != NOCRED) {
822				crhold(cred);
823				bp->b_wcred = cred;
824			} else
825				bp->b_wcred = NOCRED;
826			bp->b_caller1 = (void *)t_uio;
827			bp->b_vp = vp;
828			error = nfs_asyncio(nmp, bp, NOCRED, td);
829			if (error) {
830				free(t_iov->iov_base, M_NFSDIRECTIO);
831				free(t_iov, M_NFSDIRECTIO);
832				free(t_uio, M_NFSDIRECTIO);
833				bp->b_vp = NULL;
834				relpbuf(bp, &nfs_pbuf_freecnt);
835				if (error == EINTR)
836					return (error);
837				goto do_sync;
838			}
839			uiop->uio_offset += size;
840			uiop->uio_resid -= size;
841			if (uiop->uio_iov->iov_len <= size) {
842				uiop->uio_iovcnt--;
843				uiop->uio_iov++;
844			} else {
845				uiop->uio_iov->iov_base =
846					(char *)uiop->uio_iov->iov_base + size;
847				uiop->uio_iov->iov_len -= size;
848			}
849		}
850	}
851	return (0);
852}
853
854/*
855 * Vnode op for write using bio
856 */
857int
858nfs_write(struct vop_write_args *ap)
859{
860	int biosize;
861	struct uio *uio = ap->a_uio;
862	struct thread *td = uio->uio_td;
863	struct vnode *vp = ap->a_vp;
864	struct nfsnode *np = VTONFS(vp);
865	struct ucred *cred = ap->a_cred;
866	int ioflag = ap->a_ioflag;
867	struct buf *bp;
868	struct vattr vattr;
869	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
870	daddr_t lbn;
871	int bcount;
872	int n, on, error = 0;
873	struct proc *p = td?td->td_proc:NULL;
874
875#ifdef DIAGNOSTIC
876	if (uio->uio_rw != UIO_WRITE)
877		panic("nfs_write mode");
878	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_td != curthread)
879		panic("nfs_write proc");
880#endif
881	if (vp->v_type != VREG)
882		return (EIO);
883	mtx_lock(&np->n_mtx);
884	if (np->n_flag & NWRITEERR) {
885		np->n_flag &= ~NWRITEERR;
886		mtx_unlock(&np->n_mtx);
887		return (np->n_error);
888	} else
889		mtx_unlock(&np->n_mtx);
890	mtx_lock(&nmp->nm_mtx);
891	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
892	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
893		mtx_unlock(&nmp->nm_mtx);
894		(void)nfs_fsinfo(nmp, vp, cred, td);
895	} else
896		mtx_unlock(&nmp->nm_mtx);
897
898	/*
899	 * Synchronously flush pending buffers if we are in synchronous
900	 * mode or if we are appending.
901	 */
902	if (ioflag & (IO_APPEND | IO_SYNC)) {
903		mtx_lock(&np->n_mtx);
904		if (np->n_flag & NMODIFIED) {
905			mtx_unlock(&np->n_mtx);
906#ifdef notyet /* Needs matching nonblock semantics elsewhere, too. */
907			/*
908			 * Require non-blocking, synchronous writes to
909			 * dirty files to inform the program it needs
910			 * to fsync(2) explicitly.
911			 */
912			if (ioflag & IO_NDELAY)
913				return (EAGAIN);
914#endif
915flush_and_restart:
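			/*
			 * Zap the attribute cache and flush the file's
			 * buffers (V_SAVE writes dirty data back first).
			 * Also reached via the goto below when pending
			 * uncommitted writes would exceed nm_wcommitsize.
			 */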
916			np->n_attrstamp = 0;
917			KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
918			error = nfs_vinvalbuf(vp, V_SAVE, td, 1);
919			if (error)
920				return (error);
921		} else
922			mtx_unlock(&np->n_mtx);
923	}
924
925	/*
926	 * If IO_APPEND then load uio_offset.  We restart here if we cannot
927	 * get the append lock.
928	 */
929	if (ioflag & IO_APPEND) {
930		np->n_attrstamp = 0;
931		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
932		error = VOP_GETATTR(vp, &vattr, cred);
933		if (error)
934			return (error);
935		mtx_lock(&np->n_mtx);
936		uio->uio_offset = np->n_size;
937		mtx_unlock(&np->n_mtx);
938	}
939
940	if (uio->uio_offset < 0)
941		return (EINVAL);
942	if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
943		return (EFBIG);
944	if (uio->uio_resid == 0)
945		return (0);
946
947	if (nfs_directio_enable && (ioflag & IO_DIRECT) && vp->v_type == VREG)
948		return nfs_directio_write(vp, uio, cred, ioflag);
949
950	/*
951	 * Maybe this should be above the vnode op call, but so long as
952	 * file servers have no limits, I don't think it matters
953	 */
954	if (p != NULL) {
955		PROC_LOCK(p);
956		if (uio->uio_offset + uio->uio_resid >
957		    lim_cur(p, RLIMIT_FSIZE)) {
958			psignal(p, SIGXFSZ);
959			PROC_UNLOCK(p);
960			return (EFBIG);
961		}
962		PROC_UNLOCK(p);
963	}
964
965	biosize = vp->v_mount->mnt_stat.f_iosize;
966	/*
967	 * Find all of this file's B_NEEDCOMMIT buffers.  If our writes
968	 * would exceed the local maximum per-file write commit size when
969	 * combined with those, we must decide whether to flush,
970	 * go synchronous, or return error.  We don't bother checking
971	 * IO_UNIT -- we just make all writes atomic anyway, as there's
972	 * no point optimizing for something that really won't ever happen.
973	 */
974	if (!(ioflag & IO_SYNC)) {
975		int nflag;
976
977		mtx_lock(&np->n_mtx);
978		nflag = np->n_flag;
979		mtx_unlock(&np->n_mtx);
980		int needrestart = 0;
981		if (nmp->nm_wcommitsize < uio->uio_resid) {
982			/*
983			 * If this request could not possibly be completed
984			 * without exceeding the maximum outstanding write
985			 * commit size, see if we can convert it into a
986			 * synchronous write operation.
987			 */
988			if (ioflag & IO_NDELAY)
989				return (EAGAIN);
990			ioflag |= IO_SYNC;
991			if (nflag & NMODIFIED)
992				needrestart = 1;
993		} else if (nflag & NMODIFIED) {
994			int wouldcommit = 0;
995			BO_LOCK(&vp->v_bufobj);
996			if (vp->v_bufobj.bo_dirty.bv_cnt != 0) {
997				TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd,
998				    b_bobufs) {
999					if (bp->b_flags & B_NEEDCOMMIT)
1000						wouldcommit += bp->b_bcount;
1001				}
1002			}
1003			BO_UNLOCK(&vp->v_bufobj);
1004			/*
1005			 * Since we're not operating synchronously and
1006			 * bypassing the buffer cache, we are in a commit
1007			 * and holding all of these buffers whether
1008			 * transmitted or not.  If not limited, this
1009			 * will lead to the buffer cache deadlocking,
1010			 * as no one else can flush our uncommitted buffers.
1011			 */
1012			wouldcommit += uio->uio_resid;
1013			/*
1014			 * If we would initially exceed the maximum
1015			 * outstanding write commit size, flush and restart.
1016			 */
1017			if (wouldcommit > nmp->nm_wcommitsize)
1018				needrestart = 1;
1019		}
1020		if (needrestart)
1021			goto flush_and_restart;
1022	}
1023
1024	do {
1025		nfsstats.biocache_writes++;
1026		lbn = uio->uio_offset / biosize;
1027		on = uio->uio_offset & (biosize-1);
1028		n = min((unsigned)(biosize - on), uio->uio_resid);
1029again:
1030		/*
1031		 * Handle direct append and file extension cases, calculate
1032		 * unaligned buffer size.
1033		 */
1034		mtx_lock(&np->n_mtx);
1035		if (uio->uio_offset == np->n_size && n) {
1036			mtx_unlock(&np->n_mtx);
1037			/*
1038			 * Get the buffer (in its pre-append state to maintain
1039			 * B_CACHE if it was previously set).  Resize the
1040			 * nfsnode after we have locked the buffer to prevent
1041			 * readers from reading garbage.
1042			 */
1043			bcount = on;
1044			bp = nfs_getcacheblk(vp, lbn, bcount, td);
1045
1046			if (bp != NULL) {
1047				long save;
1048
1049				mtx_lock(&np->n_mtx);
1050				np->n_size = uio->uio_offset + n;
1051				np->n_flag |= NMODIFIED;
1052				vnode_pager_setsize(vp, np->n_size);
1053				mtx_unlock(&np->n_mtx);
1054
1055				save = bp->b_flags & B_CACHE;
1056				bcount += n;
1057				allocbuf(bp, bcount);
1058				bp->b_flags |= save;
1059			}
1060		} else {
1061			/*
1062			 * Obtain the locked cache block first, and then
1063			 * adjust the file's size as appropriate.
1064			 */
1065			bcount = on + n;
1066			if ((off_t)lbn * biosize + bcount < np->n_size) {
1067				if ((off_t)(lbn + 1) * biosize < np->n_size)
1068					bcount = biosize;
1069				else
1070					bcount = np->n_size - (off_t)lbn * biosize;
1071			}
1072			mtx_unlock(&np->n_mtx);
1073			bp = nfs_getcacheblk(vp, lbn, bcount, td);
1074			mtx_lock(&np->n_mtx);
1075			if (uio->uio_offset + n > np->n_size) {
1076				np->n_size = uio->uio_offset + n;
1077				np->n_flag |= NMODIFIED;
1078				vnode_pager_setsize(vp, np->n_size);
1079			}
1080			mtx_unlock(&np->n_mtx);
1081		}
1082
1083		if (!bp) {
1084			error = nfs_sigintr(nmp, NULL, td);
1085			if (!error)
1086				error = EINTR;
1087			break;
1088		}
1089
1090		/*
1091		 * Issue a READ if B_CACHE is not set.  In special-append
1092		 * mode, B_CACHE is based on the buffer prior to the write
1093		 * op and is typically set, avoiding the read.  If a read
1094		 * is required in special append mode, the server will
1095		 * probably send us a short-read since we extended the file
1096		 * on our end, resulting in b_resid == 0 and, thusly,
1097		 * B_CACHE getting set.
1098		 *
1099		 * We can also avoid issuing the read if the write covers
1100		 * the entire buffer.  We have to make sure the buffer state
1101		 * is reasonable in this case since we will not be initiating
1102		 * I/O.  See the comments in kern/vfs_bio.c's getblk() for
1103		 * more information.
1104		 *
1105		 * B_CACHE may also be set due to the buffer being cached
1106		 * normally.
1107		 */
1108
1109		if (on == 0 && n == bcount) {
1110			bp->b_flags |= B_CACHE;
1111			bp->b_flags &= ~B_INVAL;
1112			bp->b_ioflags &= ~BIO_ERROR;
1113		}
1114
1115		if ((bp->b_flags & B_CACHE) == 0) {
1116			bp->b_iocmd = BIO_READ;
1117			vfs_busy_pages(bp, 0);
1118			error = nfs_doio(vp, bp, cred, td);
1119			if (error) {
1120				brelse(bp);
1121				break;
1122			}
1123		}
1124		if (bp->b_wcred == NOCRED)
1125			bp->b_wcred = crhold(cred);
1126		mtx_lock(&np->n_mtx);
1127		np->n_flag |= NMODIFIED;
1128		mtx_unlock(&np->n_mtx);
1129
1130		/*
1131		 * If dirtyend exceeds file size, chop it down.  This should
1132		 * not normally occur but there is an append race where it
1133		 * might occur XXX, so we log it.
1134		 *
1135		 * If the chopping creates a reverse-indexed or degenerate
1136		 * situation with dirtyoff/end, we 0 both of them.
1137		 */
1138
1139		if (bp->b_dirtyend > bcount) {
1140			nfs_printf("NFS append race @%lx:%d\n",
1141			    (long)bp->b_blkno * DEV_BSIZE,
1142			    bp->b_dirtyend - bcount);
1143			bp->b_dirtyend = bcount;
1144		}
1145
1146		if (bp->b_dirtyoff >= bp->b_dirtyend)
1147			bp->b_dirtyoff = bp->b_dirtyend = 0;
1148
1149		/*
1150		 * If the new write will leave a contiguous dirty
1151		 * area, just update the b_dirtyoff and b_dirtyend,
1152		 * otherwise force a write rpc of the old dirty area.
1153		 *
1154		 * While it is possible to merge discontiguous writes due to
1155		 * our having a B_CACHE buffer ( and thus valid read data
1156		 * for the hole), we don't because it could lead to
1157		 * significant cache coherency problems with multiple clients,
1158		 * especially if locking is implemented later on.
1159		 *
1160		 * As an optimization we could theoretically maintain
1161		 * a linked list of discontinuous areas, but we would still
1162		 * have to commit them separately so there isn't much
1163		 * advantage to it except perhaps a bit of asynchronization.
1164		 */
1165
1166		if (bp->b_dirtyend > 0 &&
1167		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
1168			if (bwrite(bp) == EINTR) {
1169				error = EINTR;
1170				break;
1171			}
1172			goto again;
1173		}
1174
1175		error = uiomove((char *)bp->b_data + on, n, uio);
1176
1177		/*
1178		 * Since this block is being modified, it must be written
1179		 * again and not just committed.  Since write clustering does
1180		 * not work for the stage 1 data write, only the stage 2
1181		 * commit rpc, we have to clear B_CLUSTEROK as well.
1182		 */
1183		bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1184
1185		if (error) {
1186			bp->b_ioflags |= BIO_ERROR;
1187			brelse(bp);
1188			break;
1189		}
1190
1191		/*
1192		 * Only update dirtyoff/dirtyend if not a degenerate
1193		 * condition.
1194		 */
1195		if (n) {
1196			if (bp->b_dirtyend > 0) {
1197				bp->b_dirtyoff = min(on, bp->b_dirtyoff);
1198				bp->b_dirtyend = max((on + n), bp->b_dirtyend);
1199			} else {
1200				bp->b_dirtyoff = on;
1201				bp->b_dirtyend = on + n;
1202			}
1203			vfs_bio_set_valid(bp, on, n);
1204		}
1205
1206		/*
1207		 * If IO_SYNC do bwrite().
1208		 *
1209		 * IO_INVAL appears to be unused.  The idea appears to be
1210		 * to turn off caching in this case.  Very odd.  XXX
1211		 */
1212		if ((ioflag & IO_SYNC)) {
1213			if (ioflag & IO_INVAL)
1214				bp->b_flags |= B_NOCACHE;
1215			error = bwrite(bp);
1216			if (error)
1217				break;
1218		} else if ((n + on) == biosize) {
1219			bp->b_flags |= B_ASYNC;
1220			(void) (nmp->nm_rpcops->nr_writebp)(bp, 0, NULL);
1221		} else {
1222			bdwrite(bp);
1223		}
1224	} while (uio->uio_resid > 0 && n > 0);
1225
1226	return (error);
1227}
1228
1229/*
1230 * Get an nfs cache block.
1231 *
1232 * Allocate a new one if the block isn't currently in the cache
1233 * and return the block marked busy. If the calling process is
1234 * interrupted by a signal for an interruptible mount point, return
1235 * NULL.
1236 *
1237 * The caller must carefully deal with the possible B_INVAL state of
1238 * the buffer.  nfs_doio() clears B_INVAL (and nfs_asyncio() clears it
1239 * indirectly), so synchronous reads can be issued without worrying about
1240 * the B_INVAL state.  We have to be a little more careful when dealing
1241 * with writes (see comments in nfs_write()) when extending a file past
1242 * its EOF.
1243 */
1244static struct buf *
1245nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size, struct thread *td)
1246{
1247	struct buf *bp;
1248	struct mount *mp;
1249	struct nfsmount *nmp;
1250
1251	mp = vp->v_mount;
1252	nmp = VFSTONFS(mp);
1253
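	/*
	 * On interruptible mounts let the getblk() sleep be broken by a
	 * signal, and keep retrying (checking for pending signals) while
	 * no buffer is available.
	 */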
1254	if (nmp->nm_flag & NFSMNT_INT) {
1255 		sigset_t oldset;
1256
1257 		nfs_set_sigmask(td, &oldset);
1258		bp = getblk(vp, bn, size, PCATCH, 0, 0);
1259 		nfs_restore_sigmask(td, &oldset);
1260		while (bp == NULL) {
1261			if (nfs_sigintr(nmp, NULL, td))
1262				return (NULL);
1263			bp = getblk(vp, bn, size, 0, 2 * hz, 0);
1264		}
1265	} else {
1266		bp = getblk(vp, bn, size, 0, 0, 0);
1267	}
1268
1269	if (vp->v_type == VREG) {
1270		int biosize;
1271
1272		biosize = mp->mnt_stat.f_iosize;
1273		bp->b_blkno = bn * (biosize / DEV_BSIZE);
1274	}
1275	return (bp);
1276}
1277
1278/*
1279 * Flush and invalidate all dirty buffers. If another process is already
1280 * doing the flush, just wait for completion.
1281 */
1282int
1283nfs_vinvalbuf(struct vnode *vp, int flags, struct thread *td, int intrflg)
1284{
1285	struct nfsnode *np = VTONFS(vp);
1286	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
1287	int error = 0, slpflag, slptimeo;
1288 	int old_lock = 0;
1289
1290	ASSERT_VOP_LOCKED(vp, "nfs_vinvalbuf");
1291
1292	/*
1293	 * XXX This check stops us from needlessly doing a vinvalbuf when
1294	 * being called through vclean().  It is not clear that this is
1295	 * unsafe.
1296	 */
1297	if (vp->v_iflag & VI_DOOMED)
1298		return (0);
1299
1300	if ((nmp->nm_flag & NFSMNT_INT) == 0)
1301		intrflg = 0;
1302	if (intrflg) {
1303		slpflag = PCATCH;
1304		slptimeo = 2 * hz;
1305	} else {
1306		slpflag = 0;
1307		slptimeo = 0;
1308	}
1309
1310	old_lock = nfs_upgrade_vnlock(vp);
1311	/*
1312	 * Now, flush as required.
1313	 */
1314	if ((flags & V_SAVE) && (vp->v_bufobj.bo_object != NULL)) {
1315		VM_OBJECT_LOCK(vp->v_bufobj.bo_object);
1316		vm_object_page_clean(vp->v_bufobj.bo_object, 0, 0, OBJPC_SYNC);
1317		VM_OBJECT_UNLOCK(vp->v_bufobj.bo_object);
1318		/*
1319		 * If the page clean was interrupted, fail the invalidation.
1320		 * Not doing so, we run the risk of losing dirty pages in the
1321		 * vinvalbuf() call below.
1322		 */
1323		if (intrflg && (error = nfs_sigintr(nmp, NULL, td)))
1324			goto out;
1325	}
1326
1327	error = vinvalbuf(vp, flags, slpflag, 0);
1328	while (error) {
1329		if (intrflg && (error = nfs_sigintr(nmp, NULL, td)))
1330			goto out;
1331		error = vinvalbuf(vp, flags, 0, slptimeo);
1332	}
1333	mtx_lock(&np->n_mtx);
1334	if (np->n_directio_asyncwr == 0)
1335		np->n_flag &= ~NMODIFIED;
1336	mtx_unlock(&np->n_mtx);
1337out:
1338	nfs_downgrade_vnlock(vp, old_lock);
1339	return error;
1340}
1341
1342/*
1343 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
1344 * This is mainly to avoid queueing async I/O requests when the nfsiods
1345 * are all hung on a dead server.
1346 *
1347 * Note: nfs_asyncio() does not clear (BIO_ERROR|B_INVAL) but when the bp
1348 * is eventually dequeued by the async daemon, nfs_doio() *will*.
1349 */
1350int
1351nfs_asyncio(struct nfsmount *nmp, struct buf *bp, struct ucred *cred, struct thread *td)
1352{
1353	int iod;
1354	int gotiod;
1355	int slpflag = 0;
1356	int slptimeo = 0;
1357	int error, error2;
1358
1359	/*
1360	 * Commits are usually short and sweet, so let's save some cpu and
1361	 * leave the async daemons for more important rpc's (such as reads
1362	 * and writes).
1363	 */
1364	mtx_lock(&nfs_iod_mtx);
1365	if (bp->b_iocmd == BIO_WRITE && (bp->b_flags & B_NEEDCOMMIT) &&
1366	    (nmp->nm_bufqiods > nfs_numasync / 2)) {
1367		mtx_unlock(&nfs_iod_mtx);
1368		return(EIO);
1369	}
1370again:
1371	if (nmp->nm_flag & NFSMNT_INT)
1372		slpflag = PCATCH;
1373	gotiod = FALSE;
1374
1375	/*
1376	 * Find a free iod to process this request.
1377	 */
1378	for (iod = 0; iod < nfs_numasync; iod++)
1379		if (nfs_iodwant[iod]) {
1380			gotiod = TRUE;
1381			break;
1382		}
1383
1384	/*
1385	 * Try to create one if none are free.
1386	 */
1387	if (!gotiod) {
1388		iod = nfs_nfsiodnew();
1389		if (iod != -1)
1390			gotiod = TRUE;
1391	}
1392
1393	if (gotiod) {
1394		/*
1395		 * Found one, so wake it up and tell it which
1396		 * mount to process.
1397		 */
1398		NFS_DPF(ASYNCIO, ("nfs_asyncio: waking iod %d for mount %p\n",
1399		    iod, nmp));
1400		nfs_iodwant[iod] = NULL;
1401		nfs_iodmount[iod] = nmp;
1402		nmp->nm_bufqiods++;
1403		wakeup(&nfs_iodwant[iod]);
1404	}
1405
1406	/*
1407	 * If none are free, we may already have an iod working on this mount
1408	 * point.  If so, it will process our request.
1409	 */
1410	if (!gotiod) {
1411		if (nmp->nm_bufqiods > 0) {
1412			NFS_DPF(ASYNCIO,
1413				("nfs_asyncio: %d iods are already processing mount %p\n",
1414				 nmp->nm_bufqiods, nmp));
1415			gotiod = TRUE;
1416		}
1417	}
1418
1419	/*
1420	 * If we have an iod which can process the request, then queue
1421	 * the buffer.
1422	 */
1423	if (gotiod) {
1424		/*
1425		 * Ensure that the queue never grows too large.  We still want
1426		 * to asynchronize so we block rather than return EIO.
1427		 */
1428		while (nmp->nm_bufqlen >= 2*nfs_numasync) {
1429			NFS_DPF(ASYNCIO,
1430				("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
1431			nmp->nm_bufqwant = TRUE;
1432 			error = nfs_msleep(td, &nmp->nm_bufq, &nfs_iod_mtx,
1433					   slpflag | PRIBIO,
1434 					   "nfsaio", slptimeo);
1435			if (error) {
1436				error2 = nfs_sigintr(nmp, NULL, td);
1437				if (error2) {
1438					mtx_unlock(&nfs_iod_mtx);
1439					return (error2);
1440				}
1441				if (slpflag == PCATCH) {
1442					slpflag = 0;
1443					slptimeo = 2 * hz;
1444				}
1445			}
1446			/*
1447			 * We might have lost our iod while sleeping,
1448			 * so check and loop if necessary.
1449			 */
1450			if (nmp->nm_bufqiods == 0) {
1451				NFS_DPF(ASYNCIO,
1452					("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
1453				goto again;
1454			}
1455		}
1456
1457		/* We might have lost our nfsiod */
1458		if (nmp->nm_bufqiods == 0) {
1459			NFS_DPF(ASYNCIO,
1460				("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
1461			goto again;
1462		}
1463
1464		if (bp->b_iocmd == BIO_READ) {
1465			if (bp->b_rcred == NOCRED && cred != NOCRED)
1466				bp->b_rcred = crhold(cred);
1467		} else {
1468			if (bp->b_wcred == NOCRED && cred != NOCRED)
1469				bp->b_wcred = crhold(cred);
1470		}
1471
1472		if (bp->b_flags & B_REMFREE)
1473			bremfreef(bp);
1474		BUF_KERNPROC(bp);
1475		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
1476		nmp->nm_bufqlen++;
1477		if ((bp->b_flags & B_DIRECT) && bp->b_iocmd == BIO_WRITE) {
1478			mtx_lock(&(VTONFS(bp->b_vp))->n_mtx);
1479			VTONFS(bp->b_vp)->n_flag |= NMODIFIED;
1480			VTONFS(bp->b_vp)->n_directio_asyncwr++;
1481			mtx_unlock(&(VTONFS(bp->b_vp))->n_mtx);
1482		}
1483		mtx_unlock(&nfs_iod_mtx);
1484		return (0);
1485	}
1486
1487	mtx_unlock(&nfs_iod_mtx);
1488
1489	/*
1490	 * All the iods are busy on other mounts, so return EIO to
1491	 * force the caller to process the i/o synchronously.
1492	 */
1493	NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
1494	return (EIO);
1495}
1496
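/*
 * Completion routine for asynchronous direct writes queued by
 * nfs_directio_write().  Perform the FILESYNC write RPC described by the
 * uio hung off b_caller1, free the staging buffers, and decrement
 * n_directio_asyncwr, waking any thread sleeping on NFSYNCWAIT once the
 * last outstanding direct write has drained.
 */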
1497void
1498nfs_doio_directwrite(struct buf *bp)
1499{
1500	int iomode, must_commit;
1501	struct uio *uiop = (struct uio *)bp->b_caller1;
1502	char *iov_base = uiop->uio_iov->iov_base;
1503	struct nfsmount *nmp = VFSTONFS(bp->b_vp->v_mount);
1504
1505	iomode = NFSV3WRITE_FILESYNC;
1506	uiop->uio_td = NULL; /* NULL since we're in nfsiod */
1507	(nmp->nm_rpcops->nr_writerpc)(bp->b_vp, uiop, bp->b_wcred, &iomode, &must_commit);
1508	KASSERT((must_commit == 0), ("nfs_doio_directwrite: Did not commit write"));
1509	free(iov_base, M_NFSDIRECTIO);
1510	free(uiop->uio_iov, M_NFSDIRECTIO);
1511	free(uiop, M_NFSDIRECTIO);
1512	if ((bp->b_flags & B_DIRECT) && bp->b_iocmd == BIO_WRITE) {
1513		struct nfsnode *np = VTONFS(bp->b_vp);
1514		mtx_lock(&np->n_mtx);
1515		np->n_directio_asyncwr--;
1516		if (np->n_directio_asyncwr == 0) {
1517			VTONFS(bp->b_vp)->n_flag &= ~NMODIFIED;
1518			if ((np->n_flag & NFSYNCWAIT)) {
1519				np->n_flag &= ~NFSYNCWAIT;
1520				wakeup((caddr_t)&np->n_directio_asyncwr);
1521			}
1522		}
1523		mtx_unlock(&np->n_mtx);
1524	}
1525	bp->b_vp = NULL;
1526	relpbuf(bp, &nfs_pbuf_freecnt);
1527}
1528
1529/*
1530 * Do an I/O operation to/from a cache block. This may be called
1531 * synchronously or from an nfsiod.
1532 */
1533int
1534nfs_doio(struct vnode *vp, struct buf *bp, struct ucred *cr, struct thread *td)
1535{
1536	struct uio *uiop;
1537	struct nfsnode *np;
1538	struct nfsmount *nmp;
1539	int error = 0, iomode, must_commit = 0;
1540	struct uio uio;
1541	struct iovec io;
1542	struct proc *p = td ? td->td_proc : NULL;
1543	uint8_t	iocmd;
1544
1545	np = VTONFS(vp);
1546	nmp = VFSTONFS(vp->v_mount);
1547	uiop = &uio;
1548	uiop->uio_iov = &io;
1549	uiop->uio_iovcnt = 1;
1550	uiop->uio_segflg = UIO_SYSSPACE;
1551	uiop->uio_td = td;
1552
1553	/*
1554	 * clear BIO_ERROR and B_INVAL state prior to initiating the I/O.  We
1555	 * do this here so we do not have to do it in all the code that
1556	 * calls us.
1557	 */
1558	bp->b_flags &= ~B_INVAL;
1559	bp->b_ioflags &= ~BIO_ERROR;
1560
1561	KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp));
1562	iocmd = bp->b_iocmd;
1563	if (iocmd == BIO_READ) {
1564	    io.iov_len = uiop->uio_resid = bp->b_bcount;
1565	    io.iov_base = bp->b_data;
1566	    uiop->uio_rw = UIO_READ;
1567
1568	    switch (vp->v_type) {
1569	    case VREG:
1570		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
1571		nfsstats.read_bios++;
1572		error = (nmp->nm_rpcops->nr_readrpc)(vp, uiop, cr);
1573
1574		if (!error) {
1575		    if (uiop->uio_resid) {
1576			/*
1577			 * If we had a short read with no error, we must have
1578			 * hit a file hole.  We should zero-fill the remainder.
1579			 * This can also occur if the server hits the file EOF.
1580			 *
1581			 * Holes used to be able to occur due to pending
1582			 * writes, but that is not possible any longer.
1583			 */
1584			int nread = bp->b_bcount - uiop->uio_resid;
1585			int left  = uiop->uio_resid;
1586
1587			if (left > 0)
1588				bzero((char *)bp->b_data + nread, left);
1589			uiop->uio_resid = 0;
1590		    }
1591		}
1592		/* ASSERT_VOP_LOCKED(vp, "nfs_doio"); */
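		/*
		 * If this vnode backs a running executable (VV_TEXT) and its
		 * modification time has changed on the server, kill the
		 * process rather than let it execute stale text pages.
		 */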
1593		if (p && (vp->v_vflag & VV_TEXT)) {
1594			mtx_lock(&np->n_mtx);
1595			if (NFS_TIMESPEC_COMPARE(&np->n_mtime, &np->n_vattr.va_mtime)) {
1596				mtx_unlock(&np->n_mtx);
1597				PROC_LOCK(p);
1598				killproc(p, "text file modification");
1599				PROC_UNLOCK(p);
1600			} else
1601				mtx_unlock(&np->n_mtx);
1602		}
1603		break;
1604	    case VLNK:
1605		uiop->uio_offset = (off_t)0;
1606		nfsstats.readlink_bios++;
1607		error = (nmp->nm_rpcops->nr_readlinkrpc)(vp, uiop, cr);
1608		break;
1609	    case VDIR:
1610		nfsstats.readdir_bios++;
1611		uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
1612		if ((nmp->nm_flag & NFSMNT_RDIRPLUS) != 0) {
1613			error = nfs_readdirplusrpc(vp, uiop, cr);
1614			if (error == NFSERR_NOTSUPP)
1615				nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
1616		}
1617		if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
1618			error = nfs_readdirrpc(vp, uiop, cr);
1619		/*
1620		 * end-of-directory sets B_INVAL but does not generate an
1621		 * error.
1622		 */
1623		if (error == 0 && uiop->uio_resid == bp->b_bcount)
1624			bp->b_flags |= B_INVAL;
1625		break;
1626	    default:
1627		nfs_printf("nfs_doio:  type %x unexpected\n", vp->v_type);
1628		break;
1629	    };
1630	    if (error) {
1631		bp->b_ioflags |= BIO_ERROR;
1632		bp->b_error = error;
1633	    }
1634	} else {
1635	    /*
1636	     * If we only need to commit, try to commit
1637	     */
1638	    if (bp->b_flags & B_NEEDCOMMIT) {
1639		    int retv;
1640		    off_t off;
1641
1642		    off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff;
1643		    retv = (nmp->nm_rpcops->nr_commit)(
1644				vp, off, bp->b_dirtyend-bp->b_dirtyoff,
1645				bp->b_wcred, td);
1646		    if (retv == 0) {
1647			    bp->b_dirtyoff = bp->b_dirtyend = 0;
1648			    bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1649			    bp->b_resid = 0;
1650			    bufdone(bp);
1651			    return (0);
1652		    }
1653		    if (retv == NFSERR_STALEWRITEVERF) {
1654			    nfs_clearcommit(vp->v_mount);
1655		    }
1656	    }
1657
1658	    /*
1659	     * Setup for actual write
1660	     */
1661	    mtx_lock(&np->n_mtx);
1662	    if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
1663		bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE;
1664	    mtx_unlock(&np->n_mtx);
1665
1666	    if (bp->b_dirtyend > bp->b_dirtyoff) {
1667		io.iov_len = uiop->uio_resid = bp->b_dirtyend
1668		    - bp->b_dirtyoff;
1669		uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE
1670		    + bp->b_dirtyoff;
1671		io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
1672		uiop->uio_rw = UIO_WRITE;
1673		nfsstats.write_bios++;
1674
1675		if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC)
1676		    iomode = NFSV3WRITE_UNSTABLE;
1677		else
1678		    iomode = NFSV3WRITE_FILESYNC;
1679
1680		error = (nmp->nm_rpcops->nr_writerpc)(vp, uiop, cr, &iomode, &must_commit);
1681
1682		/*
1683		 * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try
1684		 * to cluster the buffers needing commit.  This will allow
1685		 * the system to submit a single commit rpc for the whole
1686		 * cluster.  We can do this even if the buffer is not 100%
1687		 * dirty (relative to the NFS blocksize), so we optimize the
1688		 * append-to-file-case.
1689		 *
1690		 * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be
1691		 * cleared because write clustering only works for commit
1692		 * rpc's, not for the data portion of the write).
1693		 */
1694
1695		if (!error && iomode == NFSV3WRITE_UNSTABLE) {
1696		    bp->b_flags |= B_NEEDCOMMIT;
1697		    if (bp->b_dirtyoff == 0
1698			&& bp->b_dirtyend == bp->b_bcount)
1699			bp->b_flags |= B_CLUSTEROK;
1700		} else {
1701		    bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1702		}
1703
1704		/*
1705		 * For an interrupted write, the buffer is still valid
1706		 * and the write hasn't been pushed to the server yet,
1707		 * so we can't set BIO_ERROR and report the interruption
1708		 * by setting B_EINTR. For the B_ASYNC case, B_EINTR
1709		 * is not relevant, so the rpc attempt is essentially
1710		 * a noop.  For the case of a V3 write rpc not being
1711		 * committed to stable storage, the block is still
1712		 * dirty and requires either a commit rpc or another
1713		 * write rpc with iomode == NFSV3WRITE_FILESYNC before
1714		 * the block is reused. This is indicated by setting
1715		 * the B_DELWRI and B_NEEDCOMMIT flags.
1716		 *
1717		 * If the buffer is marked B_PAGING, it does not reside on
1718		 * the vp's paging queues so we cannot call bdirty().  The
1719		 * bp in this case is not an NFS cache block so we should
1720		 * be safe. XXX
1721		 *
1722		 * The logic below breaks up errors into recoverable and
1723		 * unrecoverable. For the former, we clear B_INVAL|B_NOCACHE
1724		 * and keep the buffer around for potential write retries.
1725		 * For the latter (eg ESTALE), we toss the buffer away (B_INVAL)
1726		 * and save the error in the nfsnode. This is less than ideal
1727		 * but necessary. Keeping such buffers around could potentially
1728		 * cause buffer exhaustion eventually (they can never be written
1729		 * out, so they will constantly be re-dirtied). It also causes
1730		 * all sorts of vfs panics. For non-recoverable write errors,
1731		 * also invalidate the attrcache, so we'll be forced to go over
1732		 * the wire for this object, returning an error to user on next
1733		 * call (most of the time).
1734		 */
1735    		if (error == EINTR || error == EIO || error == ETIMEDOUT
1736		    || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
1737			int s;
1738
1739			s = splbio();
1740			bp->b_flags &= ~(B_INVAL|B_NOCACHE);
1741			if ((bp->b_flags & B_PAGING) == 0) {
1742			    bdirty(bp);
1743			    bp->b_flags &= ~B_DONE;
1744			}
1745			if (error && (bp->b_flags & B_ASYNC) == 0)
1746			    bp->b_flags |= B_EINTR;
1747			splx(s);
1748	    	} else {
1749		    if (error) {
1750			bp->b_ioflags |= BIO_ERROR;
1751			bp->b_flags |= B_INVAL;
1752			bp->b_error = np->n_error = error;
1753			mtx_lock(&np->n_mtx);
1754			np->n_flag |= NWRITEERR;
1755			np->n_attrstamp = 0;
1756			KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
1757			mtx_unlock(&np->n_mtx);
1758		    }
1759		    bp->b_dirtyoff = bp->b_dirtyend = 0;
1760		}
1761	    } else {
1762		bp->b_resid = 0;
1763		bufdone(bp);
1764		return (0);
1765	    }
1766	}
1767	bp->b_resid = uiop->uio_resid;
1768	if (must_commit)
1769	    nfs_clearcommit(vp->v_mount);
1770	bufdone(bp);
1771	return (error);
1772}
1773
1774/*
1775 * Used to aid in handling ftruncate() operations on the NFS client side.
1776 * Truncation creates a number of special problems for NFS.  We have to
1777 * throw away VM pages and buffer cache buffers that are beyond EOF, and
1778 * we have to properly handle VM pages or (potentially dirty) buffers
1779 * that straddle the truncation point.
1780 */
1781
1782int
1783nfs_meta_setsize(struct vnode *vp, struct ucred *cred, struct thread *td, u_quad_t nsize)
1784{
1785	struct nfsnode *np = VTONFS(vp);
1786	u_quad_t tsize;
1787	int biosize = vp->v_mount->mnt_stat.f_iosize;
1788	int error = 0;
1789
1790	mtx_lock(&np->n_mtx);
1791	tsize = np->n_size;
1792	np->n_size = nsize;
1793	mtx_unlock(&np->n_mtx);
1794
1795	if (nsize < tsize) {
1796		struct buf *bp;
1797		daddr_t lbn;
1798		int bufsize;
1799
1800		/*
1801		 * vtruncbuf() doesn't get the buffer overlapping the
1802		 * truncation point.  We may have a B_DELWRI and/or B_CACHE
1803		 * buffer that now needs to be truncated.
1804		 */
1805		error = vtruncbuf(vp, cred, td, nsize, biosize);
1806		lbn = nsize / biosize;
1807		bufsize = nsize & (biosize - 1);
1808		bp = nfs_getcacheblk(vp, lbn, bufsize, td);
1809 		if (!bp)
1810 			return EINTR;
1811		if (bp->b_dirtyoff > bp->b_bcount)
1812			bp->b_dirtyoff = bp->b_bcount;
1813		if (bp->b_dirtyend > bp->b_bcount)
1814			bp->b_dirtyend = bp->b_bcount;
1815		bp->b_flags |= B_RELBUF;  /* don't leave garbage around */
1816		brelse(bp);
1817	} else {
1818		vnode_pager_setsize(vp, nsize);
1819	}
1820	return(error);
1821}
1822
1823