/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 * $Id: nfs_bio.c,v 1.39 1997/06/03 09:42:36 dfr Exp $
 */


#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_extern.h>
#include <vm/vm_prot.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nqnfs.h>
#include <nfs/nfsnode.h>

static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
					struct proc *p));

extern int nfs_numasync;
extern struct nfsstats nfsstats;
/*
 * Vnode op for VM getpages.
 */
int
nfs_getpages(ap)
	struct vop_getpages_args *ap;
{
	int i, bsize;
	vm_object_t obj;
	int pcount;
	struct uio auio;
	struct iovec aiov;
	int error;
	vm_page_t m;

	if (!(ap->a_vp->v_flag & VVMIO)) {
		printf("nfs_getpages: called with non-VMIO vnode??\n");
		return EOPNOTSUPP;
	}

	pcount = round_page(ap->a_count) / PAGE_SIZE;

	obj = ap->a_m[ap->a_reqpage]->object;
	bsize = ap->a_vp->v_mount->mnt_stat.f_iosize;

	/*
	 * Free every page except the one actually being requested;
	 * only that one is read here.
	 */
	for (i = 0; i < pcount; i++) {
		if (i != ap->a_reqpage) {
			vnode_pager_freepage(ap->a_m[i]);
		}
	}
	m = ap->a_m[ap->a_reqpage];

	/*
	 * Soft-busy the page so that nfs_bioread() can copy into it
	 * (via UIO_NOCOPY) while the hard-busy state is dropped.
	 */
	m->busy++;
	m->flags &= ~PG_BUSY;

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = 0;
	aiov.iov_len = PAGE_SIZE;
	auio.uio_resid = PAGE_SIZE;
	auio.uio_offset = IDX_TO_OFF(m->pindex);
	auio.uio_segflg = UIO_NOCOPY;
	auio.uio_rw = UIO_READ;
	auio.uio_procp = curproc;
	error = nfs_bioread(ap->a_vp, &auio, IO_NODELOCKED, curproc->p_ucred, 1);

	m->flags |= PG_BUSY;
	m->busy--;

	/*
	 * Report failure only if nothing at all was transferred.
	 */
	if (error && (auio.uio_resid == PAGE_SIZE))
		return VM_PAGER_ERROR;
	return 0;
}
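
/*
 * Note that this is a deliberately minimal getpages implementation:
 * rather than filling the whole run of pages passed in by the VM
 * system, it discards all but the requested page and satisfies that
 * one through the buffer cache via nfs_bioread(), reusing the normal
 * read-ahead and cache-consistency machinery.  A sketch of the calling
 * pattern (hypothetical values, for illustration only):
 *
 *	vm_page_t ma[4];			(pages backing a 16k run)
 *	struct vop_getpages_args a;
 *	a.a_vp = vp; a.a_m = ma;
 *	a.a_count = 4 * PAGE_SIZE;		(a_count is in bytes)
 *	a.a_reqpage = 2;			(only ma[2] is kept and read)
 *	error = nfs_getpages(&a);
 */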

/*
 * Vnode op for read using bio
 * Any similarity to readip() is purely coincidental
 */
int
nfs_bioread(vp, uio, ioflag, cred, getpages)
	register struct vnode *vp;
	register struct uio *uio;
	int ioflag;
	struct ucred *cred;
	int getpages;
{
	register struct nfsnode *np = VTONFS(vp);
	register int biosize, diff, i;
	struct buf *bp = 0, *rabp;
	struct vattr vattr;
	struct proc *p;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn, rabn;
	int bufsize;
	int nra, error = 0, n = 0, on = 0, not_readin;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_offset < 0)
		return (EINVAL);
	p = uio->uio_procp;
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	biosize = vp->v_mount->mnt_stat.f_iosize;
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nqnfs, full cache consistency is maintained within the loop.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * NB: This implies that cache data can be read when up to
	 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
	 * attributes this could be forced by setting n_attrstamp to 0 before
	 * the VOP_GETATTR() call.
	 */
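	/*
	 * A concrete example of the check below: client A caches blocks of
	 * a file whose server mtime was T at read time.  Client B then
	 * writes the file, moving the server mtime to T'.  On client A's
	 * next entry here, VOP_GETATTR() (served from attributes at most
	 * NFS_ATTRTIMEO seconds old) returns T' != n_mtime, so the cached
	 * data is flushed with nfs_vinvalbuf() and re-read from the server.
	 */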
	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
		if (np->n_flag & NMODIFIED) {
			if (vp->v_type != VREG) {
				if (vp->v_type != VDIR)
					panic("nfs: bioread, not dir");
				nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			np->n_mtime = vattr.va_mtime.tv_sec;
		} else {
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			if (np->n_mtime != vattr.va_mtime.tv_sec) {
				if (vp->v_type == VDIR)
					nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_mtime = vattr.va_mtime.tv_sec;
			}
		}
	}
	do {

	    /*
	     * Get a valid lease. If cached data is stale, flush it.
	     */
	    if (nmp->nm_flag & NFSMNT_NQNFS) {
		if (NQNFS_CKINVALID(vp, np, ND_READ)) {
		    do {
			error = nqnfs_getlease(vp, ND_READ, cred, p);
		    } while (error == NQNFS_EXPIRED);
		    if (error)
			return (error);
		    if (np->n_lrev != np->n_brev ||
			(np->n_flag & NQNFSNONCACHE) ||
			((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
			if (vp->v_type == VDIR)
			    nfs_invaldir(vp);
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
			    return (error);
			np->n_brev = np->n_lrev;
		    }
		} else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
		    nfs_invaldir(vp);
		    error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
		    if (error)
			return (error);
		}
	    }
	    if (np->n_flag & NQNFSNONCACHE) {
		switch (vp->v_type) {
		case VREG:
			return (nfs_readrpc(vp, uio, cred));
		case VLNK:
			return (nfs_readlinkrpc(vp, uio, cred));
		case VDIR:
			break;
		default:
			printf(" NQNFSNONCACHE: type %x unexpected\n",
				vp->v_type);
		}
	    }
	    switch (vp->v_type) {
	    case VREG:
		nfsstats.biocache_reads++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);
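		/*
		 * For example, with the common biosize of 8192 and
		 * uio_offset 20000: lbn = 20000 / 8192 = 2 and
		 * on = 20000 & 8191 = 3616, i.e. the transfer starts
		 * 3616 bytes into logical block 2.  (The mask form
		 * works because f_iosize is a power of 2.)
		 */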
		not_readin = 1;

		/*
		 * Start the read ahead(s), as required.
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
		    for (nra = 0; nra < nmp->nm_readahead &&
			(off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
			rabn = lbn + 1 + nra;
			if (!incore(vp, rabn)) {
			    rabp = nfs_getcacheblk(vp, rabn, biosize, p);
			    if (!rabp)
				return (EINTR);
			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
				rabp->b_flags |= (B_READ | B_ASYNC);
				vfs_busy_pages(rabp, 0);
				if (nfs_asyncio(rabp, cred)) {
				    rabp->b_flags |= B_INVAL|B_ERROR;
				    vfs_unbusy_pages(rabp);
				    brelse(rabp);
				}
			    } else
				brelse(rabp);
			}
		    }
		}

		/*
		 * If the block is in the cache and has the required data
		 * in a valid region, just copy it out.
		 * Otherwise, get the block and write back/read in,
		 * as required.
		 */
again:
		bufsize = biosize;
		if ((off_t)(lbn + 1) * biosize > np->n_size &&
		    (off_t)(lbn + 1) * biosize - np->n_size < biosize) {
			bufsize = np->n_size - lbn * biosize;
			bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		}
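		/*
		 * Example of the final-block sizing above: with biosize
		 * 8192 and n_size 10000, block 1 holds only
		 * 10000 - 8192 = 1808 valid bytes, rounded up to the
		 * next DEV_BSIZE (512) boundary, 2048, so the tail of
		 * the file does not require a full-sized buffer.
		 */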
		bp = nfs_getcacheblk(vp, lbn, bufsize, p);
		if (!bp)
			return (EINTR);
		/*
		 * If we are being called from nfs_getpages, we must
		 * make sure the buffer is a vmio buffer.  The vp will
		 * already be setup for vmio but there may be some old
		 * non-vmio buffers attached to it.
		 */
		if (getpages && !(bp->b_flags & B_VMIO)) {
#ifdef DIAGNOSTIC
			printf("nfs_bioread: non vmio buf found, discarding\n");
#endif
			bp->b_flags |= B_NOCACHE;
			bp->b_flags |= B_INVAFTERWRITE;
			if (bp->b_dirtyend > 0) {
				if ((bp->b_flags & B_DELWRI) == 0)
					panic("nfsbioread");
				if (VOP_BWRITE(bp) == EINTR)
					return (EINTR);
			} else
				brelse(bp);
			goto again;
		}
		if ((bp->b_flags & B_CACHE) == 0) {
			bp->b_flags |= B_READ;
			bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
			not_readin = 0;
			vfs_busy_pages(bp, 0);
			error = nfs_doio(bp, cred, p);
			if (error) {
			    brelse(bp);
			    return (error);
			}
		}
		if (bufsize > on) {
			n = min((unsigned)(bufsize - on), uio->uio_resid);
		} else {
			n = 0;
		}
		diff = np->n_size - uio->uio_offset;
		if (diff < n)
			n = diff;
		if (not_readin && n > 0) {
			if (on < bp->b_validoff || (on + n) > bp->b_validend) {
				bp->b_flags |= B_NOCACHE;
				bp->b_flags |= B_INVAFTERWRITE;
				if (bp->b_dirtyend > 0) {
				    if ((bp->b_flags & B_DELWRI) == 0)
					panic("nfsbioread");
				    if (VOP_BWRITE(bp) == EINTR)
					return (EINTR);
				} else
				    brelse(bp);
				goto again;
			}
		}
		vp->v_lastr = lbn;
		diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
		if (diff < n)
			n = diff;
		break;
	    case VLNK:
		nfsstats.biocache_readlinks++;
		bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p);
		if (!bp)
			return (EINTR);
		if ((bp->b_flags & B_CACHE) == 0) {
			bp->b_flags |= B_READ;
			vfs_busy_pages(bp, 0);
			error = nfs_doio(bp, cred, p);
			if (error) {
				bp->b_flags |= B_ERROR;
				brelse(bp);
				return (error);
			}
		}
		n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
		on = 0;
		break;
	    case VDIR:
		nfsstats.biocache_readdirs++;
		if (np->n_direofoffset
		    && uio->uio_offset >= np->n_direofoffset) {
		    return (0);
		}
		lbn = uio->uio_offset / NFS_DIRBLKSIZ;
		on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
		bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p);
		if (!bp)
		    return (EINTR);
		if ((bp->b_flags & B_CACHE) == 0) {
		    bp->b_flags |= B_READ;
		    vfs_busy_pages(bp, 0);
		    error = nfs_doio(bp, cred, p);
		    if (error) {
			vfs_unbusy_pages(bp);
			brelse(bp);
			while (error == NFSERR_BAD_COOKIE) {
			    nfs_invaldir(vp);
			    error = nfs_vinvalbuf(vp, 0, cred, p, 1);
			    /*
			     * Yuck! The directory has been modified on the
			     * server. The only way to get the block is by
			     * reading from the beginning to get all the
			     * offset cookies.
			     */
			    for (i = 0; i <= lbn && !error; i++) {
				if (np->n_direofoffset
				    && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
				    return (0);
				bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p);
				if (!bp)
				    return (EINTR);
				if ((bp->b_flags & B_DONE) == 0) {
				    bp->b_flags |= B_READ;
				    vfs_busy_pages(bp, 0);
				    error = nfs_doio(bp, cred, p);
				    if (error) {
					vfs_unbusy_pages(bp);
					brelse(bp);
				    } else if (i < lbn)
					brelse(bp);
				}
			    }
			}
			if (error)
			    return (error);
		    }
		}

		/*
		 * If not eof and read aheads are enabled, start one.
		 * (You need the current block first, so that you have the
		 *  directory offset cookie of the next block.)
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
		    (np->n_direofoffset == 0 ||
		    (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
		    !(np->n_flag & NQNFSNONCACHE) &&
		    !incore(vp, lbn + 1)) {
			rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p);
			if (rabp) {
			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
				rabp->b_flags |= (B_READ | B_ASYNC);
				vfs_busy_pages(rabp, 0);
				if (nfs_asyncio(rabp, cred)) {
				    rabp->b_flags |= B_INVAL|B_ERROR;
				    vfs_unbusy_pages(rabp);
				    brelse(rabp);
				}
			    } else {
				brelse(rabp);
			    }
			}
		}
		/*
		 * Make sure we use a signed variant of min() since
		 * the second term may be negative.
		 */
		n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
		break;
	    default:
		printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
		break;
	    }

	    if (n > 0) {
		error = uiomove(bp->b_data + on, (int)n, uio);
	    }
	    switch (vp->v_type) {
	    case VREG:
		break;
	    case VLNK:
		n = 0;
		break;
	    case VDIR:
		if (np->n_flag & NQNFSNONCACHE)
			bp->b_flags |= B_INVAL;
		break;
	    default:
		printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
	    }
	    brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}

/*
 * Vnode op for write using bio
 */
int
nfs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int  a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	register int biosize;
	register struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	register struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	register struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn;
	int bufsize;
	int n, on, error = 0, iomode, must_commit;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
				return (error);
		}
		if (ioflag & IO_APPEND) {
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			uio->uio_offset = np->n_size;
		}
	}
	if (uio->uio_offset < 0)
		return (EINVAL);
	if (uio->uio_resid == 0)
		return (0);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	if (p && uio->uio_offset + uio->uio_resid >
	      p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		psignal(p, SIGXFSZ);
		return (EFBIG);
	}
	/*
	 * I use nm_rsize, not nm_wsize so that all buffer cache blocks
	 * will be the same size within a filesystem. nfs_writerpc will
	 * still use nm_wsize when sizing the rpc's.
	 */
	biosize = vp->v_mount->mnt_stat.f_iosize;
	do {
		/*
		 * Check for a valid write lease.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error)
				return (error);
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
			}
		}
		if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
		    iomode = NFSV3WRITE_FILESYNC;
		    error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
		    if (must_commit)
			nfs_clearcommit(vp->v_mount);
		    return (error);
		}
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);
		n = min((unsigned)(biosize - on), uio->uio_resid);
again:
		if (uio->uio_offset + n > np->n_size) {
			np->n_size = uio->uio_offset + n;
			np->n_flag |= NMODIFIED;
			vnode_pager_setsize(vp, (u_long)np->n_size);
		}
		bufsize = biosize;
		if ((lbn + 1) * biosize > np->n_size) {
			bufsize = np->n_size - lbn * biosize;
			bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		}
		bp = nfs_getcacheblk(vp, lbn, bufsize, p);
		if (!bp)
			return (EINTR);
		if (bp->b_wcred == NOCRED) {
			crhold(cred);
			bp->b_wcred = cred;
		}
		np->n_flag |= NMODIFIED;

		if ((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend > np->n_size) {
			bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);
		}

		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise force a write rpc of the old dirty area.
		 */
		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			bp->b_proc = p;
			if (VOP_BWRITE(bp) == EINTR)
				return (EINTR);
			goto again;
		}
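		/*
		 * Example of the single-region rule above: if the buffer
		 * already has dirty bytes [0, 512) and this write covers
		 * [1024, 1536), accepting it would leave the clean gap
		 * [512, 1024) between two dirty extents, which the single
		 * (b_dirtyoff, b_dirtyend) pair cannot describe, so the
		 * old region is written to the server first.  A write of
		 * [512, 1024) instead simply extends b_dirtyend below.
		 */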

		/*
		 * Check for valid write lease and get one as required.
		 * In case getblk() and/or bwrite() delayed us.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error) {
				brelse(bp);
				return (error);
			}
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				brelse(bp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
				goto again;
			}
		}
		error = uiomove((char *)bp->b_data + on, n, uio);
		if (error) {
			bp->b_flags |= B_ERROR;
			brelse(bp);
			return (error);
		}
		if (bp->b_dirtyend > 0) {
			bp->b_dirtyoff = min(on, bp->b_dirtyoff);
			bp->b_dirtyend = max((on + n), bp->b_dirtyend);
		} else {
			bp->b_dirtyoff = on;
			bp->b_dirtyend = on + n;
		}
		if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
		    bp->b_validoff > bp->b_dirtyend) {
			bp->b_validoff = bp->b_dirtyoff;
			bp->b_validend = bp->b_dirtyend;
		} else {
			bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
			bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
		}
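		/*
		 * Example of the valid-range update above: a buffer with
		 * valid bytes [0, 2048) that takes a write dirtying
		 * [1024, 3072) ends up with b_validoff = 0 and
		 * b_validend = 3072, since the two ranges overlap and
		 * are merged.  A dirty range disjoint from the valid one
		 * would instead reset the valid range to exactly the
		 * dirty range.
		 */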

		/*
		 * Since this block is being modified, it must be written
		 * again and not just committed.
		 */
		bp->b_flags &= ~B_NEEDCOMMIT;

		/*
		 * If the lease is non-cacheable or IO_SYNC, do bwrite().
		 */
		if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
			bp->b_proc = p;
			error = VOP_BWRITE(bp);
			if (error)
				return (error);
			if (np->n_flag & NQNFSNONCACHE) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
		} else if ((n + on) == biosize &&
			(nmp->nm_flag & NFSMNT_NQNFS) == 0) {
			bp->b_proc = (struct proc *)0;
			bp->b_flags |= B_ASYNC;
			(void)nfs_writebp(bp, 0);
		} else
			bdwrite(bp);
	} while (uio->uio_resid > 0 && n > 0);
	return (0);
}

/*
 * Get an nfs cache block.
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 */
static struct buf *
nfs_getcacheblk(vp, bn, size, p)
	struct vnode *vp;
	daddr_t bn;
	int size;
	struct proc *p;
{
	register struct buf *bp;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int biosize = vp->v_mount->mnt_stat.f_iosize;

	if (nmp->nm_flag & NFSMNT_INT) {
		bp = getblk(vp, bn, size, PCATCH, 0);
		while (bp == (struct buf *)0) {
			if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
				return ((struct buf *)0);
			bp = getblk(vp, bn, size, 0, 2 * hz);
		}
	} else
		bp = getblk(vp, bn, size, 0, 0);

	if (vp->v_type == VREG)
		bp->b_blkno = (bn * biosize) / DEV_BSIZE;

	return (bp);
}
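
/*
 * Example of the b_blkno mapping above: for a regular file on a mount
 * with biosize 8192, logical block 3 begins at byte 24576, so
 * b_blkno = 24576 / DEV_BSIZE(512) = 48.  This keeps the buffer's
 * device-block address consistent with the byte-offset arithmetic
 * done on bp->b_blkno in nfs_write() and nfs_doio().
 */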

/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int intrflg;
{
	register struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}
	/*
	 * First wait for any other process doing a flush to complete.
	 */
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
			slptimeo);
		if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
			return (EINTR);
	}

	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
	while (error) {
		if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) {
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup((caddr_t)&np->n_flag);
			}
			return (EINTR);
		}
		error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
	}
	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
	return (0);
}
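
/*
 * Note on the serialization above: if two processes call nfs_vinvalbuf()
 * on the same vnode, the second sets NFLUSHWANT and sleeps in
 * "nfsvinval" until the first clears NFLUSHINPROG and issues the
 * wakeup, so at most one flush runs per nfsnode at a time.
 */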

/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 */
int
nfs_asyncio(bp, cred)
	register struct buf *bp;
	struct ucred *cred;
{
	struct nfsmount *nmp;
	int i;
	int gotiod;
	int slpflag = 0;
	int slptimeo = 0;
	int error;

	if (nfs_numasync == 0)
		return (EIO);

	nmp = VFSTONFS(bp->b_vp->v_mount);
again:
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	gotiod = FALSE;

	/*
	 * Find a free iod to process this request.
	 */
	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
		if (nfs_iodwant[i]) {
			/*
			 * Found one, so wake it up and tell it which
			 * mount to process.
			 */
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waking iod %d for mount %p\n",
				 i, nmp));
			nfs_iodwant[i] = (struct proc *)0;
			nfs_iodmount[i] = nmp;
			nmp->nm_bufqiods++;
			wakeup((caddr_t)&nfs_iodwant[i]);
			gotiod = TRUE;
			break;
		}

	/*
	 * If none are free, we may already have an iod working on this mount
	 * point.  If so, it will process our request.
	 */
	if (!gotiod) {
		if (nmp->nm_bufqiods > 0) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: %d iods are already processing mount %p\n",
				 nmp->nm_bufqiods, nmp));
			gotiod = TRUE;
		}
	}

	/*
	 * If we have an iod which can process the request, then queue
	 * the buffer.
	 */
	if (gotiod) {
		/*
		 * Ensure that the queue never grows too large.
		 */
		while (nmp->nm_bufqlen >= 2 * nfs_numasync) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
			nmp->nm_bufqwant = TRUE;
			error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
				       "nfsaio", slptimeo);
			if (error) {
				if (nfs_sigintr(nmp, NULL, bp->b_proc))
					return (EINTR);
				if (slpflag == PCATCH) {
					slpflag = 0;
					slptimeo = 2 * hz;
				}
			}
			/*
			 * We might have lost our iod while sleeping,
			 * so check and loop if necessary.
			 */
			if (nmp->nm_bufqiods == 0) {
				NFS_DPF(ASYNCIO,
					("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
				goto again;
			}
		}

		if (bp->b_flags & B_READ) {
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
		} else {
			bp->b_flags |= B_WRITEINPROG;
			if (bp->b_wcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_wcred = cred;
			}
		}

		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
		nmp->nm_bufqlen++;
		return (0);
	}

	/*
	 * All the iods are busy on other mounts, so return EIO to
	 * force the caller to process the i/o synchronously.
	 */
	NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
	return (EIO);
}
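
/*
 * The 2 * nfs_numasync bound above keeps a single mount from queueing
 * an unbounded amount of async work: with, say, 4 nfsiods running, at
 * most 8 buffers may sit on a mount's nm_bufq, and further callers
 * sleep in "nfsaio" until an iod drains the queue.  Callers that find
 * no iod at all get EIO and must perform the I/O synchronously.
 */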

/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(bp, cr, p)
	register struct buf *bp;
	struct ucred *cr;
	struct proc *p;
{
	register struct uio *uiop;
	register struct vnode *vp;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, diff, len, iomode, must_commit = 0;
	struct uio uio;
	struct iovec io;

	vp = bp->b_vp;
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_procp = p;

	/*
	 * Historically, paging was done with physio, but no more.
	 */
	if (bp->b_flags & B_PHYS) {
	    /*
	     * ...though reading /dev/drum still gets us here.
	     */
	    io.iov_len = uiop->uio_resid = bp->b_bcount;
	    /* mapping was done by vmapbuf() */
	    io.iov_base = bp->b_data;
	    uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
	    if (bp->b_flags & B_READ) {
		uiop->uio_rw = UIO_READ;
		nfsstats.read_physios++;
		error = nfs_readrpc(vp, uiop, cr);
	    } else {
		int com;

		iomode = NFSV3WRITE_DATASYNC;
		uiop->uio_rw = UIO_WRITE;
		nfsstats.write_physios++;
		error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
	    }
	    if (error) {
		bp->b_flags |= B_ERROR;
		bp->b_error = error;
	    }
	} else if (bp->b_flags & B_READ) {
	    io.iov_len = uiop->uio_resid = bp->b_bcount;
	    io.iov_base = bp->b_data;
	    uiop->uio_rw = UIO_READ;
	    switch (vp->v_type) {
	    case VREG:
		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
		nfsstats.read_bios++;
		error = nfs_readrpc(vp, uiop, cr);
		if (!error) {
		    bp->b_validoff = 0;
		    if (uiop->uio_resid) {
			/*
			 * If len > 0, there is a hole in the file and
			 * no writes after the hole have been pushed to
			 * the server yet.
			 * Just zero fill the rest of the valid area.
			 */
			diff = bp->b_bcount - uiop->uio_resid;
			len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE
				+ diff);
			if (len > 0) {
			    len = min(len, uiop->uio_resid);
			    bzero((char *)bp->b_data + diff, len);
			    bp->b_validend = diff + len;
			} else
			    bp->b_validend = diff;
		    } else
			bp->b_validend = bp->b_bcount;
		}
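		/*
		 * Worked example of the short-read handling above: for an
		 * 8192-byte buffer where the server returned only 4096
		 * bytes (uio_resid = 4096), diff = 4096.  If the file
		 * extends 2048 bytes past the data returned (len = 2048),
		 * those bytes lie in a hole and are zeroed, giving
		 * b_validend = 6144; otherwise the buffer is valid only
		 * up to diff.
		 */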
		if (p && (vp->v_flag & VTEXT) &&
			(((nmp->nm_flag & NFSMNT_NQNFS) &&
			  NQNFS_CKINVALID(vp, np, ND_READ) &&
			  np->n_lrev != np->n_brev) ||
			 (!(nmp->nm_flag & NFSMNT_NQNFS) &&
			  np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
			uprintf("Process killed due to text file modification\n");
			psignal(p, SIGKILL);
#ifdef __NetBSD__
			p->p_holdcnt++;
#else
			p->p_flag |= P_NOSWAP;
#endif
		}
		break;
	    case VLNK:
		uiop->uio_offset = (off_t)0;
		nfsstats.readlink_bios++;
		error = nfs_readlinkrpc(vp, uiop, cr);
		break;
	    case VDIR:
		nfsstats.readdir_bios++;
		uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
		if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
			error = nfs_readdirplusrpc(vp, uiop, cr);
			if (error == NFSERR_NOTSUPP)
				nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
		}
		if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
			error = nfs_readdirrpc(vp, uiop, cr);
		break;
	    default:
		printf("nfs_doio: type %x unexpected\n", vp->v_type);
		break;
	    }
	    if (error) {
		bp->b_flags |= B_ERROR;
		bp->b_error = error;
	    }
	} else {
	    if (((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend) > np->n_size)
		bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);

	    if (bp->b_dirtyend > bp->b_dirtyoff) {
		io.iov_len = uiop->uio_resid = bp->b_dirtyend
		    - bp->b_dirtyoff;
		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE
		    + bp->b_dirtyoff;
		io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
		uiop->uio_rw = UIO_WRITE;
		nfsstats.write_bios++;
		if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC)
		    iomode = NFSV3WRITE_UNSTABLE;
		else
		    iomode = NFSV3WRITE_FILESYNC;
		bp->b_flags |= B_WRITEINPROG;
		error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
		if (!error && iomode == NFSV3WRITE_UNSTABLE) {
		    bp->b_flags |= B_NEEDCOMMIT;
		    if (bp->b_dirtyoff == 0
			&& bp->b_dirtyend == bp->b_bufsize)
			bp->b_flags |= B_CLUSTEROK;
		} else
		    bp->b_flags &= ~B_NEEDCOMMIT;
		bp->b_flags &= ~B_WRITEINPROG;

		/*
		 * For an interrupted write, the buffer is still valid
		 * and the write hasn't been pushed to the server yet,
		 * so we can't set B_ERROR and report the interruption
		 * by setting B_EINTR. For the B_ASYNC case, B_EINTR
		 * is not relevant, so the rpc attempt is essentially
		 * a noop.  For the case of a V3 write rpc not being
		 * committed to stable storage, the block is still
		 * dirty and requires either a commit rpc or another
		 * write rpc with iomode == NFSV3WRITE_FILESYNC before
		 * the block is reused. This is indicated by setting
		 * the B_DELWRI and B_NEEDCOMMIT flags.
		 */
		if (error == EINTR
		    || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
			bp->b_flags &= ~(B_INVAL|B_NOCACHE);
			bp->b_flags |= B_DELWRI;

			/*
			 * Since for the B_ASYNC case, nfs_bwrite() has
			 * reassigned the buffer to the clean list, we have
			 * to reassign it back to the dirty one. Ugh.
			 */
			if (bp->b_flags & B_ASYNC)
				reassignbuf(bp, vp);
			else
				bp->b_flags |= B_EINTR;
		} else {
			if (error) {
				bp->b_flags |= B_ERROR;
				bp->b_error = np->n_error = error;
				np->n_flag |= NWRITEERR;
			}
			bp->b_dirtyoff = bp->b_dirtyend = 0;
		}
	    } else {
		bp->b_resid = 0;
		biodone(bp);
		return (0);
	    }
	}
	bp->b_resid = uiop->uio_resid;
	if (must_commit)
		nfs_clearcommit(vp->v_mount);
	biodone(bp);
	return (error);
}