nfs_bio.c revision 45347
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 * $Id: nfs_bio.c,v 1.67 1999/03/12 02:24:58 julian Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_prot.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nqnfs.h>
#include <nfs/nfsnode.h>

static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
					struct proc *p));
static void nfs_prot_buf __P((struct buf *bp, int off, int n));

extern int nfs_numasync;
extern int nfs_pbuf_freecnt;
extern struct nfsstats nfsstats;

/*
 * Vnode op for VM getpages.
 */
int
nfs_getpages(ap)
	struct vop_getpages_args /* {
		struct vnode *a_vp;
		vm_page_t *a_m;
		int a_count;
		int a_reqpage;
		vm_ooffset_t a_offset;
	} */ *ap;
{
	int i, error, nextoff, size, toff, npages, count;
	struct uio uio;
	struct iovec iov;
	vm_offset_t kva;
	struct buf *bp;
	struct vnode *vp;
	struct proc *p;
	struct ucred *cred;
	struct nfsmount *nmp;
	vm_page_t *pages;

	vp = ap->a_vp;
	p = curproc;				/* XXX */
	cred = curproc->p_ucred;		/* XXX */
	nmp = VFSTONFS(vp->v_mount);
	pages = ap->a_m;
	count = ap->a_count;

	if (vp->v_object == NULL) {
		printf("nfs_getpages: called with non-merged cache vnode??\n");
		return VM_PAGER_ERROR;
	}

	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	/*
	 * We use only the kva address for the buffer, but this is extremely
	 * convenient and fast.
	 */
	bp = getpbuf(&nfs_pbuf_freecnt);

	npages = btoc(count);
	kva = (vm_offset_t) bp->b_data;
	pmap_qenter(kva, pages, npages);

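	/*
	 * Describe the mapped run of pages with a single iovec/uio so the
	 * read RPC below can fill the pages directly through the pbuf's
	 * kernel virtual address.
	 */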
	iov.iov_base = (caddr_t) kva;
	iov.iov_len = count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
	uio.uio_resid = count;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_READ;
	uio.uio_procp = p;

	error = nfs_readrpc(vp, &uio, cred);
	pmap_qremove(kva, npages);

	relpbuf(bp, &nfs_pbuf_freecnt);

	if (error && (uio.uio_resid == count)) {
		printf("nfs_getpages: error %d\n", error);
		for (i = 0; i < npages; ++i) {
			if (i != ap->a_reqpage)
				vnode_pager_freepage(pages[i]);
		}
		return VM_PAGER_ERROR;
	}

	/*
	 * Calculate the number of bytes read and validate only that number
	 * of bytes.  Note that due to pending writes, size may be 0.  This
	 * does not mean that the remaining data is invalid!
	 */

	size = count - uio.uio_resid;

	for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
		vm_page_t m;
		nextoff = toff + PAGE_SIZE;
		m = pages[i];

		m->flags &= ~PG_ZERO;

		if (nextoff <= size) {
			/*
			 * Read operation filled an entire page
			 */
			m->valid = VM_PAGE_BITS_ALL;
			m->dirty = 0;
		} else if (size > toff) {
			/*
			 * Read operation filled a partial page, set valid
			 * bits properly.  validclean will zero out
			 * any cruft in the buffer when setting a valid bit,
			 * if the size is not DEV_BSIZE aligned.
			 */
			vm_page_set_validclean(m, 0, size - toff);
		}

		if (i != ap->a_reqpage) {
			/*
			 * Whether or not to leave the page activated is up in
			 * the air, but we should put the page on a page queue
			 * somewhere (it already is in the object).  Empirical
			 * results show that deactivating pages is best.
			 */

			/*
			 * Just in case someone was asking for this page we
			 * now tell them that it is ok to use.
			 */
			if (!error) {
				if (m->flags & PG_WANTED)
					vm_page_activate(m);
				else
					vm_page_deactivate(m);
				vm_page_wakeup(m);
			} else {
				vnode_pager_freepage(m);
			}
		} else {
			/*
			 * This page is being mapped, clear out any other
			 * cruft in the invalid areas of the page.
			 */
			if (m->valid && m->valid != VM_PAGE_BITS_ALL)
				vm_page_zero_invalid(m, FALSE);
		}
	}
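	/*
	 * Note that the requested page (a_reqpage) is left busy throughout;
	 * disposing of it is the caller's responsibility.
	 */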
	return 0;
}

/*
 * Vnode op for VM putpages.
 */
int
nfs_putpages(ap)
	struct vop_putpages_args /* {
		struct vnode *a_vp;
		vm_page_t *a_m;
		int a_count;
		int a_sync;
		int *a_rtvals;
		vm_ooffset_t a_offset;
	} */ *ap;
{
	struct uio uio;
	struct iovec iov;
	vm_offset_t kva;
	struct buf *bp;
	int iomode, must_commit, i, error, npages, count;
	int *rtvals;
	struct vnode *vp;
	struct proc *p;
	struct ucred *cred;
	struct nfsmount *nmp;
	vm_page_t *pages;

	vp = ap->a_vp;
	p = curproc;				/* XXX */
	cred = curproc->p_ucred;		/* XXX */
	nmp = VFSTONFS(vp->v_mount);
	pages = ap->a_m;
	count = ap->a_count;
	rtvals = ap->a_rtvals;
	npages = btoc(count);

	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
		(void)nfs_fsinfo(nmp, vp, cred, p);

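	/*
	 * Every page starts out as VM_PAGER_AGAIN; only the pages actually
	 * covered by the write RPC are upgraded to VM_PAGER_OK below.
	 */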
	for (i = 0; i < npages; i++) {
		rtvals[i] = VM_PAGER_AGAIN;
	}

	/*
	 * We use only the kva address for the buffer, but this is extremely
	 * convenient and fast.
	 */
	bp = getpbuf(&nfs_pbuf_freecnt);

	kva = (vm_offset_t) bp->b_data;
	pmap_qenter(kva, pages, npages);

	iov.iov_base = (caddr_t) kva;
	iov.iov_len = count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
	uio.uio_resid = count;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_WRITE;
	uio.uio_procp = p;

	if ((ap->a_sync & VM_PAGER_PUT_SYNC) == 0)
	    iomode = NFSV3WRITE_UNSTABLE;
	else
	    iomode = NFSV3WRITE_FILESYNC;

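	/*
	 * An unsynchronized put may go out as an UNSTABLE v3 write, letting
	 * the server reply before the data reaches stable storage.
	 * nfs_writerpc() reports a changed server write verifier through
	 * must_commit, in which case any cached commit state for the mount
	 * is stale and has to be cleared (nfs_clearcommit() below).
	 */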
	error = nfs_writerpc(vp, &uio, cred, &iomode, &must_commit);

	pmap_qremove(kva, npages);
	relpbuf(bp, &nfs_pbuf_freecnt);

	if (!error) {
		int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE;
		for (i = 0; i < nwritten; i++) {
			rtvals[i] = VM_PAGER_OK;
			pages[i]->dirty = 0;
		}
		if (must_commit)
			nfs_clearcommit(vp->v_mount);
	}
	return rtvals[0];
}

/*
 * Vnode op for read using bio
 */
int
nfs_bioread(vp, uio, ioflag, cred, getpages)
	register struct vnode *vp;
	register struct uio *uio;
	int ioflag;
	struct ucred *cred;
	int getpages;
{
	register struct nfsnode *np = VTONFS(vp);
	register int biosize, i;
	off_t diff;
	struct buf *bp = 0, *rabp;
	struct vattr vattr;
	struct proc *p;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn, rabn;
	int bufsize;
	int nra, error = 0, n = 0, on = 0, not_readin;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_offset < 0)	/* XXX VDIR cookies can be negative */
		return (EINVAL);
	p = uio->uio_procp;
	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	if (vp->v_type != VDIR &&
	    (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	biosize = vp->v_mount->mnt_stat.f_iosize;
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nqnfs, full cache consistency is maintained within the loop.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * NB: This implies that cache data can be read when up to
	 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
	 * attributes this could be forced by setting n_attrstamp to 0 before
	 * the VOP_GETATTR() call.
	 */
	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
		if (np->n_flag & NMODIFIED) {
			if (vp->v_type != VREG) {
				if (vp->v_type != VDIR)
					panic("nfs: bioread, not dir");
				nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			np->n_mtime = vattr.va_mtime.tv_sec;
		} else {
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			if (np->n_mtime != vattr.va_mtime.tv_sec) {
				if (vp->v_type == VDIR)
					nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_mtime = vattr.va_mtime.tv_sec;
			}
		}
	}
	do {

	    /*
	     * Get a valid lease. If cached data is stale, flush it.
	     */
	    if (nmp->nm_flag & NFSMNT_NQNFS) {
		if (NQNFS_CKINVALID(vp, np, ND_READ)) {
		    do {
			error = nqnfs_getlease(vp, ND_READ, cred, p);
		    } while (error == NQNFS_EXPIRED);
		    if (error)
			return (error);
		    if (np->n_lrev != np->n_brev ||
			(np->n_flag & NQNFSNONCACHE) ||
			((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
			if (vp->v_type == VDIR)
			    nfs_invaldir(vp);
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
			    return (error);
			np->n_brev = np->n_lrev;
		    }
		} else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
		    nfs_invaldir(vp);
		    error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
		    if (error)
			return (error);
		}
	    }
	    if (np->n_flag & NQNFSNONCACHE) {
		switch (vp->v_type) {
		case VREG:
			return (nfs_readrpc(vp, uio, cred));
		case VLNK:
			return (nfs_readlinkrpc(vp, uio, cred));
		case VDIR:
			break;
		default:
			printf(" NQNFSNONCACHE: type %x unexpected\n",
				vp->v_type);
		}
	    }
	    switch (vp->v_type) {
	    case VREG:
		nfsstats.biocache_reads++;
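		/*
		 * Split the offset into a logical block number (lbn) and an
		 * offset within the block (on); e.g. with an 8K biosize and
		 * uio_offset 20480, lbn is 2 and on is 4096.
		 */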
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);
		not_readin = 1;

		/*
		 * Start the read ahead(s), as required.
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
		    for (nra = 0; nra < nmp->nm_readahead &&
			(off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
			rabn = lbn + 1 + nra;
			if (!incore(vp, rabn)) {
			    rabp = nfs_getcacheblk(vp, rabn, biosize, p);
			    if (!rabp)
				return (EINTR);
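			    /*
			     * Fire the read ahead only if the buffer is
			     * neither already valid (B_CACHE) nor dirty
			     * (B_DELWRI); an async read into a delayed-write
			     * buffer would overwrite data not yet pushed to
			     * the server.
			     */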
			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
				rabp->b_flags |= (B_READ | B_ASYNC);
				rabp->b_flags &= ~B_DONE;
				vfs_busy_pages(rabp, 0);
				if (nfs_asyncio(rabp, cred)) {
				    rabp->b_flags |= B_INVAL|B_ERROR;
				    vfs_unbusy_pages(rabp);
				    brelse(rabp);
				}
			    } else
				brelse(rabp);
			}
		    }
		}

		/*
		 * If the block is in the cache and has the required data
		 * in a valid region, just copy it out.
		 * Otherwise, get the block and write back/read in,
		 * as required.
		 */
again:
		bufsize = biosize;
		if ((off_t)(lbn + 1) * biosize > np->n_size &&
		    (off_t)(lbn + 1) * biosize - np->n_size < biosize) {
			bufsize = np->n_size - (off_t)lbn * biosize;
			bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		}
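		/*
		 * The last block of the file is trimmed to a DEV_BSIZE
		 * multiple; e.g. with an 8K biosize and n_size 10000, block
		 * 1 gets bufsize 2048 (1808 rounded up) rather than 8192.
		 */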
		bp = nfs_getcacheblk(vp, lbn, bufsize, p);
		if (!bp)
			return (EINTR);

		/*
		 * If we are being called from nfs_getpages, we must
		 * make sure the buffer is a vmio buffer.  The vp will
		 * already be set up for vmio but there may be some old
		 * non-vmio buffers attached to it.
		 */
		if (getpages && !(bp->b_flags & B_VMIO)) {
#ifdef DIAGNOSTIC
			printf("nfs_bioread: non vmio buf found, discarding\n");
#endif
			bp->b_flags |= B_NOCACHE;
			bp->b_flags |= B_INVAFTERWRITE;
			if (bp->b_dirtyend > 0) {
				if ((bp->b_flags & B_DELWRI) == 0)
					panic("nfsbioread");
				if (VOP_BWRITE(bp) == EINTR)
					return (EINTR);
			} else
				brelse(bp);
			goto again;
		}
		if ((bp->b_flags & B_CACHE) == 0) {
		    bp->b_flags |= B_READ;
		    bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
		    not_readin = 0;
		    vfs_busy_pages(bp, 0);
		    error = nfs_doio(bp, cred, p);
		    if (error) {
			brelse(bp);
			return (error);
		    }
		}
		if (bufsize > on) {
			n = min((unsigned)(bufsize - on), uio->uio_resid);
		} else {
			n = 0;
		}
		diff = np->n_size - uio->uio_offset;
		if (diff < n)
			n = diff;
		if (not_readin && n > 0) {
			if (on < bp->b_validoff || (on + n) > bp->b_validend) {
				bp->b_flags |= B_NOCACHE;
				bp->b_flags |= B_INVAFTERWRITE;
				if (bp->b_dirtyend > 0) {
				    if ((bp->b_flags & B_DELWRI) == 0)
					panic("nfsbioread");
				    if (VOP_BWRITE(bp) == EINTR)
					return (EINTR);
				} else
				    brelse(bp);
				goto again;
			}
		}
		vp->v_lastr = lbn;
		diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
		if (diff < n)
			n = diff;
		break;
	    case VLNK:
		nfsstats.biocache_readlinks++;
		bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p);
		if (!bp)
			return (EINTR);
		if ((bp->b_flags & B_CACHE) == 0) {
		    bp->b_flags |= B_READ;
		    bp->b_flags &= ~B_DONE;
		    vfs_busy_pages(bp, 0);
		    error = nfs_doio(bp, cred, p);
		    if (error) {
			bp->b_flags |= B_ERROR;
			brelse(bp);
			return (error);
		    }
		}
		n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
		on = 0;
		break;
	    case VDIR:
		nfsstats.biocache_readdirs++;
		if (np->n_direofoffset
		    && uio->uio_offset >= np->n_direofoffset) {
		    return (0);
		}
		lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ;
		on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
		bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p);
		if (!bp)
		    return (EINTR);
		if ((bp->b_flags & B_CACHE) == 0) {
		    bp->b_flags |= B_READ;
		    bp->b_flags &= ~B_DONE;
		    vfs_busy_pages(bp, 0);
		    error = nfs_doio(bp, cred, p);
		    if (error) {
			    brelse(bp);
		    }
		    while (error == NFSERR_BAD_COOKIE) {
			nfs_invaldir(vp);
			error = nfs_vinvalbuf(vp, 0, cred, p, 1);
			/*
			 * Yuck! The directory has been modified on the
			 * server. The only way to get the block is by
			 * reading from the beginning to get all the
			 * offset cookies.
			 */
			for (i = 0; i <= lbn && !error; i++) {
			    if (np->n_direofoffset
				&& (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
				    return (0);
			    bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p);
			    if (!bp)
				return (EINTR);
			    if ((bp->b_flags & B_DONE) == 0) {
				bp->b_flags |= B_READ;
				bp->b_flags &= ~B_DONE;
				vfs_busy_pages(bp, 0);
				error = nfs_doio(bp, cred, p);
				if (error == 0 && (bp->b_flags & B_INVAL))
					break;
				if (error) {
				    brelse(bp);
				} else if (i < lbn) {
				    brelse(bp);
				}
			    }
			}
		    }
		    if (error)
			    return (error);
		}

		/*
		 * If not eof and read aheads are enabled, start one.
		 * (You need the current block first, so that you have the
		 *  directory offset cookie of the next block.)
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
		    (bp->b_flags & B_INVAL) == 0 &&
		    (np->n_direofoffset == 0 ||
		    (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
		    !(np->n_flag & NQNFSNONCACHE) &&
		    !incore(vp, lbn + 1)) {
			rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p);
			if (rabp) {
			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
				rabp->b_flags |= (B_READ | B_ASYNC);
				rabp->b_flags &= ~B_DONE;
				vfs_busy_pages(rabp, 0);
				if (nfs_asyncio(rabp, cred)) {
				    rabp->b_flags |= B_INVAL|B_ERROR;
				    vfs_unbusy_pages(rabp);
				    brelse(rabp);
				}
			    } else {
				brelse(rabp);
			    }
			}
		}
		/*
		 * Make sure we use a signed variant of min() since
		 * the second term may be negative.
		 */
		n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
		break;
	    default:
		printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
		break;
	    }

	    if (n > 0) {
		    error = uiomove(bp->b_data + on, (int)n, uio);
	    }
	    switch (vp->v_type) {
	    case VREG:
		break;
	    case VLNK:
		n = 0;
		break;
	    case VDIR:
		if (np->n_flag & NQNFSNONCACHE)
			bp->b_flags |= B_INVAL;
		break;
	    default:
		printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
	    }
	    brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}

static void
nfs_prot_buf(bp, off, n)
	struct buf *bp;
	int off;
	int n;
{
	int pindex, boff, end;

	if ((bp->b_flags & B_VMIO) == 0)
		return;

	end = round_page(off + n);
	for (boff = trunc_page(off); boff < end; boff += PAGE_SIZE) {
		pindex = boff >> PAGE_SHIFT;
		vm_page_protect(bp->b_pages[pindex], VM_PROT_NONE);
	}
}

/*
 * Vnode op for write using bio
 */
int
nfs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int  a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	register int biosize;
	register struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	register struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	register struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn;
	int bufsize;
	int n, on, error = 0, iomode, must_commit;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
				return (error);
		}
		if (ioflag & IO_APPEND) {
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			uio->uio_offset = np->n_size;
		}
	}
	if (uio->uio_offset < 0)
		return (EINVAL);
	if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	if (uio->uio_resid == 0)
		return (0);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	if (p && uio->uio_offset + uio->uio_resid >
	      p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		psignal(p, SIGXFSZ);
		return (EFBIG);
	}
	/*
	 * I use nm_rsize, not nm_wsize so that all buffer cache blocks
	 * will be the same size within a filesystem. nfs_writerpc will
	 * still use nm_wsize when sizing the rpc's.
	 */
	biosize = vp->v_mount->mnt_stat.f_iosize;
	do {
		/*
		 * Check for a valid write lease.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error)
				return (error);
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
			}
		}
		if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
		    iomode = NFSV3WRITE_FILESYNC;
		    error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
		    if (must_commit)
			nfs_clearcommit(vp->v_mount);
		    return (error);
		}
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize-1);
		n = min((unsigned)(biosize - on), uio->uio_resid);
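		/*
		 * n is capped at the end of the current block, so a write
		 * that straddles a block boundary takes another trip around
		 * the loop; e.g. an 8K biosize with uio_offset 4096 and
		 * resid 16384 gives lbn 0, on 4096, n 4096 on the first pass.
		 */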
again:
		if (uio->uio_offset + n > np->n_size) {
			np->n_size = uio->uio_offset + n;
			np->n_flag |= NMODIFIED;
			vnode_pager_setsize(vp, np->n_size);
		}
		bufsize = biosize;
		if ((off_t)(lbn + 1) * biosize > np->n_size) {
			bufsize = np->n_size - (off_t)lbn * biosize;
			bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		}
		bp = nfs_getcacheblk(vp, lbn, bufsize, p);
		if (!bp)
			return (EINTR);
		if (bp->b_wcred == NOCRED) {
			crhold(cred);
			bp->b_wcred = cred;
		}
		np->n_flag |= NMODIFIED;

		/*
		 * If dirtyend exceeds file size, chop it down.  If this
		 * creates a reverse-indexed or degenerate situation with
		 * dirtyoff/end, 0 them.
		 */

		if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
			bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE;
		if (bp->b_dirtyoff >= bp->b_dirtyend)
			bp->b_dirtyoff = bp->b_dirtyend = 0;

		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise force a write rpc of the old dirty area.
		 */

		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			bp->b_proc = p;
			if (VOP_BWRITE(bp) == EINTR)
				return (EINTR);
			goto again;
		}

		/*
		 * Check for valid write lease and get one as required.
		 * In case getblk() and/or bwrite() delayed us.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error) {
				brelse(bp);
				return (error);
			}
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				brelse(bp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
				goto again;
			}
		}

		error = uiomove((char *)bp->b_data + on, n, uio);
		bp->b_flags &= ~B_NEEDCOMMIT;
		if (error) {
			bp->b_flags |= B_ERROR;
			brelse(bp);
			return (error);
		}

		/*
		 * This will keep the buffer and mmaped regions more coherent.
		 */
		nfs_prot_buf(bp, on, n);

		/*
		 * Only update dirtyoff/dirtyend if not a degenerate
		 * condition.
		 */
		if (n) {
			if (bp->b_dirtyend > 0) {
				bp->b_dirtyoff = min(on, bp->b_dirtyoff);
				bp->b_dirtyend = max((on + n), bp->b_dirtyend);
			} else {
				bp->b_dirtyoff = on;
				bp->b_dirtyend = on + n;
			}
		}

		/*
		 * To avoid code complexity, we may have to throw away
		 * previously valid ranges when merging the new dirty range
		 * into the valid range.  As long as we do not *ADD* an
		 * invalid region to the valid range, we are ok.
		 */
		if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
		    bp->b_validoff > bp->b_dirtyend) {
			bp->b_validoff = bp->b_dirtyoff;
			bp->b_validend = bp->b_dirtyend;
		} else {
			bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
			bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
		}

		/*
		 * Since this block is being modified, it must be written
		 * again and not just committed.
		 */
		bp->b_flags &= ~B_NEEDCOMMIT;

		/*
		 * If the lease is non-cacheable or IO_SYNC do bwrite().
		 */
		if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
			bp->b_proc = p;
			if (ioflag & IO_INVAL)
				bp->b_flags |= B_INVAL;
			error = VOP_BWRITE(bp);
			if (error)
				return (error);
			if (np->n_flag & NQNFSNONCACHE) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
		} else if ((n + on) == biosize &&
			(nmp->nm_flag & NFSMNT_NQNFS) == 0) {
			bp->b_proc = (struct proc *)0;
			bp->b_flags |= B_ASYNC;
			(void)nfs_writebp(bp, 0);
		} else
			bdwrite(bp);
	} while (uio->uio_resid > 0 && n > 0);
	return (0);
}

/*
 * Get an nfs cache block.
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 */
static struct buf *
nfs_getcacheblk(vp, bn, size, p)
	struct vnode *vp;
	daddr_t bn;
	int size;
	struct proc *p;
{
	register struct buf *bp;
	struct mount *mp;
	struct nfsmount *nmp;

	mp = vp->v_mount;
	nmp = VFSTONFS(mp);

	if (nmp->nm_flag & NFSMNT_INT) {
		bp = getblk(vp, bn, size, PCATCH, 0);
		while (bp == (struct buf *)0) {
			if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
				return ((struct buf *)0);
			bp = getblk(vp, bn, size, 0, 2 * hz);
		}
	} else
		bp = getblk(vp, bn, size, 0, 0);

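	/*
	 * For regular files, translate the logical block number into
	 * DEV_BSIZE units so byte offsets can be computed directly from
	 * b_blkno elsewhere (e.g. in nfs_doio()).
	 */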
	if (vp->v_type == VREG) {
		int biosize;
		biosize = mp->mnt_stat.f_iosize;
		bp->b_blkno = bn * (biosize / DEV_BSIZE);
	}

	return (bp);
}

/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int intrflg;
{
	register struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;

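	/*
	 * A VXLOCK'd vnode is presumably in the middle of being reclaimed,
	 * and its buffers are being torn down anyway, so there is nothing
	 * for us to flush here.
	 */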
	if (vp->v_flag & VXLOCK) {
		return (0);
	}

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}
	/*
	 * First wait for any other process doing a flush to complete.
	 */
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
			slptimeo);
		if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
			return (EINTR);
	}

	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
	while (error) {
		if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) {
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup((caddr_t)&np->n_flag);
			}
			return (EINTR);
		}
		error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
	}
	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
	return (0);
}

/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 */
int
nfs_asyncio(bp, cred)
	register struct buf *bp;
	struct ucred *cred;
{
	struct nfsmount *nmp;
	int i;
	int gotiod;
	int slpflag = 0;
	int slptimeo = 0;
	int error;

	if (nfs_numasync == 0)
		return (EIO);

	nmp = VFSTONFS(bp->b_vp->v_mount);
again:
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	gotiod = FALSE;

	/*
	 * Find a free iod to process this request.
	 */
	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
		if (nfs_iodwant[i]) {
			/*
			 * Found one, so wake it up and tell it which
			 * mount to process.
			 */
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waking iod %d for mount %p\n",
				 i, nmp));
			nfs_iodwant[i] = (struct proc *)0;
			nfs_iodmount[i] = nmp;
			nmp->nm_bufqiods++;
			wakeup((caddr_t)&nfs_iodwant[i]);
			gotiod = TRUE;
			break;
		}

	/*
	 * If none are free, we may already have an iod working on this mount
	 * point.  If so, it will process our request.
	 */
	if (!gotiod) {
		if (nmp->nm_bufqiods > 0) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: %d iods are already processing mount %p\n",
				 nmp->nm_bufqiods, nmp));
			gotiod = TRUE;
		}
	}

	/*
	 * If we have an iod which can process the request, then queue
	 * the buffer.
	 */
	if (gotiod) {
		/*
		 * Ensure that the queue never grows too large.
		 */
		while (nmp->nm_bufqlen >= 2*nfs_numasync) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
			nmp->nm_bufqwant = TRUE;
			error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
				       "nfsaio", slptimeo);
			if (error) {
				if (nfs_sigintr(nmp, NULL, bp->b_proc))
					return (EINTR);
				if (slpflag == PCATCH) {
					slpflag = 0;
					slptimeo = 2 * hz;
				}
			}
			/*
			 * We might have lost our iod while sleeping,
			 * so check and loop if necessary.
			 */
			if (nmp->nm_bufqiods == 0) {
				NFS_DPF(ASYNCIO,
					("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
				goto again;
			}
		}

		if (bp->b_flags & B_READ) {
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
		} else {
			bp->b_flags |= B_WRITEINPROG;
			if (bp->b_wcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_wcred = cred;
			}
		}

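		/*
		 * Hand the buffer to the iods servicing this mount; an idle
		 * nfsiod dequeues it from nm_bufq and performs the actual
		 * I/O via nfs_doio().
		 */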
		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
		nmp->nm_bufqlen++;
		return (0);
	}

	/*
	 * All the iods are busy on other mounts, so return EIO to
	 * force the caller to process the i/o synchronously.
	 */
	NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
	return (EIO);
}

/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(bp, cr, p)
	struct buf *bp;
	struct ucred *cr;
	struct proc *p;
{
	struct uio *uiop;
	struct vnode *vp;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, diff, len, iomode, must_commit = 0;
	struct uio uio;
	struct iovec io;

	vp = bp->b_vp;
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_procp = p;

	KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp));

	/*
	 * Historically, paging was done with physio, but no more.
	 */
	if (bp->b_flags & B_PHYS) {
	    /*
	     * ...though reading /dev/drum still gets us here.
	     */
	    io.iov_len = uiop->uio_resid = bp->b_bcount;
	    /* mapping was done by vmapbuf() */
	    io.iov_base = bp->b_data;
	    uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
	    if (bp->b_flags & B_READ) {
		uiop->uio_rw = UIO_READ;
		nfsstats.read_physios++;
		error = nfs_readrpc(vp, uiop, cr);
	    } else {
		int com;

		iomode = NFSV3WRITE_DATASYNC;
		uiop->uio_rw = UIO_WRITE;
		nfsstats.write_physios++;
		error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
	    }
	    if (error) {
		bp->b_flags |= B_ERROR;
		bp->b_error = error;
	    }
	} else if (bp->b_flags & B_READ) {
	    io.iov_len = uiop->uio_resid = bp->b_bcount;
	    io.iov_base = bp->b_data;
	    uiop->uio_rw = UIO_READ;
	    switch (vp->v_type) {
	    case VREG:
		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
		nfsstats.read_bios++;
		error = nfs_readrpc(vp, uiop, cr);
		if (!error) {
		    bp->b_validoff = 0;
		    if (uiop->uio_resid) {
			/*
			 * If len > 0, there is a hole in the file and
			 * no writes after the hole have been pushed to
			 * the server yet.
			 * Just zero fill the rest of the valid area.
			 */
			diff = bp->b_bcount - uiop->uio_resid;
			len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE
				+ diff);
			if (len > 0) {
			    len = min(len, uiop->uio_resid);
			    bzero((char *)bp->b_data + diff, len);
			    bp->b_validend = diff + len;
			} else
			    bp->b_validend = diff;
		    } else
			bp->b_validend = bp->b_bcount;
		}
		if (p && (vp->v_flag & VTEXT) &&
			(((nmp->nm_flag & NFSMNT_NQNFS) &&
			  NQNFS_CKINVALID(vp, np, ND_READ) &&
			  np->n_lrev != np->n_brev) ||
			 (!(nmp->nm_flag & NFSMNT_NQNFS) &&
			  np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
			uprintf("Process killed due to text file modification\n");
			psignal(p, SIGKILL);
			p->p_flag |= P_NOSWAP;
		}
		break;
	    case VLNK:
		uiop->uio_offset = (off_t)0;
		nfsstats.readlink_bios++;
		error = nfs_readlinkrpc(vp, uiop, cr);
		break;
	    case VDIR:
		nfsstats.readdir_bios++;
		uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
		if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
			error = nfs_readdirplusrpc(vp, uiop, cr);
			if (error == NFSERR_NOTSUPP)
				nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
		}
		if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
			error = nfs_readdirrpc(vp, uiop, cr);
		if (error == 0 && uiop->uio_resid == bp->b_bcount)
			bp->b_flags |= B_INVAL;
		break;
	    default:
		printf("nfs_doio: type %x unexpected\n", vp->v_type);
		break;
	    }
	    if (error) {
		bp->b_flags |= B_ERROR;
		bp->b_error = error;
	    }
	} else {
	    if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
		bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE;

	    if (bp->b_dirtyend > bp->b_dirtyoff) {
		io.iov_len = uiop->uio_resid = bp->b_dirtyend
		    - bp->b_dirtyoff;
		uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE
		    + bp->b_dirtyoff;
		io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
		uiop->uio_rw = UIO_WRITE;
		nfsstats.write_bios++;

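		/*
		 * Only a plain async write goes out UNSTABLE: if the buffer
		 * already needs a commit, is being invalidated, or belongs
		 * to a cluster, it is pushed FILESYNC instead, presumably so
		 * it cannot end up requiring a second commit pass.
		 */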
		if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC)
		    iomode = NFSV3WRITE_UNSTABLE;
		else
		    iomode = NFSV3WRITE_FILESYNC;

		bp->b_flags |= B_WRITEINPROG;
		error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
		if (!error && iomode == NFSV3WRITE_UNSTABLE) {
		    bp->b_flags |= B_NEEDCOMMIT;
		    if (bp->b_dirtyoff == 0
			&& bp->b_dirtyend == bp->b_bufsize)
			bp->b_flags |= B_CLUSTEROK;
		} else {
		    bp->b_flags &= ~B_NEEDCOMMIT;
		}
		bp->b_flags &= ~B_WRITEINPROG;

		/*
		 * For an interrupted write, the buffer is still valid
		 * and the write hasn't been pushed to the server yet,
		 * so we can't set B_ERROR; instead we report the
		 * interruption by setting B_EINTR. For the B_ASYNC case,
		 * B_EINTR is not relevant, so the rpc attempt is essentially
		 * a noop.  For the case of a V3 write rpc not being
		 * committed to stable storage, the block is still
		 * dirty and requires either a commit rpc or another
		 * write rpc with iomode == NFSV3WRITE_FILESYNC before
		 * the block is reused. This is indicated by setting
		 * the B_DELWRI and B_NEEDCOMMIT flags.
		 *
		 * If the buffer is marked B_PAGING, it does not reside on
		 * the vp's paging queues so we cannot call bdirty().  The
		 * bp in this case is not an NFS cache block so we should
		 * be safe. XXX
		 */
		if (error == EINTR
		    || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
			int s;

			s = splbio();
			bp->b_flags &= ~(B_INVAL|B_NOCACHE);
			if ((bp->b_flags & B_PAGING) == 0) {
			    bdirty(bp);
			    bp->b_flags &= ~B_DONE;
			}
			if ((bp->b_flags & B_ASYNC) == 0)
			    bp->b_flags |= B_EINTR;
			splx(s);
		} else {
		    if (error) {
			bp->b_flags |= B_ERROR;
			bp->b_error = np->n_error = error;
			np->n_flag |= NWRITEERR;
		    }
		    bp->b_dirtyoff = bp->b_dirtyend = 0;
		}
	    } else {
		bp->b_resid = 0;
		biodone(bp);
		return (0);
	    }
	}
	bp->b_resid = uiop->uio_resid;
	if (must_commit)
	    nfs_clearcommit(vp->v_mount);
	biodone(bp);
	return (error);
}
1357