/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 * $Id: nfs_bio.c,v 1.49 1998/02/04 22:33:13 eivind Exp $
 */


#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_prot.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nqnfs.h>
#include <nfs/nfsnode.h>

static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
					struct proc *p));

extern int nfs_numasync;
extern struct nfsstats nfsstats;

/*
 * Vnode op for VM getpages.
 */
int
nfs_getpages(ap)
	struct vop_getpages_args *ap;
{
	int i, pcount, error;
	struct uio uio;
	struct iovec iov;
	vm_page_t m;
	vm_offset_t kva;

	if ((ap->a_vp->v_object) == NULL) {
		printf("nfs_getpages: called with non-merged cache vnode??\n");
		return EOPNOTSUPP;
	}

	m = ap->a_m[ap->a_reqpage];
	kva = vm_pager_map_page(m);

	iov.iov_base = (caddr_t) kva;
	iov.iov_len = PAGE_SIZE;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = IDX_TO_OFF(m->pindex);
	uio.uio_resid = PAGE_SIZE;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_READ;
	uio.uio_procp = curproc;

	error = nfs_readrpc(ap->a_vp, &uio, curproc->p_ucred);
	vm_pager_unmap_page(kva);

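	/*
	 * Only the page actually requested was read above; release the
	 * other pages handed in by the VM system so that they can be
	 * faulted in (and read) individually later if they are needed.
	 */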
	pcount = round_page(ap->a_count) / PAGE_SIZE;
	for (i = 0; i < pcount; i++) {
		if (i != ap->a_reqpage) {
			vnode_pager_freepage(ap->a_m[i]);
		}
	}

	if (error && (uio.uio_resid == PAGE_SIZE))
		return VM_PAGER_ERROR;
	return 0;
}

/*
 * Vnode op for read using bio
 * Any similarity to readip() is purely coincidental
 */
int
nfs_bioread(vp, uio, ioflag, cred, getpages)
	register struct vnode *vp;
	register struct uio *uio;
	int ioflag;
	struct ucred *cred;
	int getpages;
{
	register struct nfsnode *np = VTONFS(vp);
	register int biosize, diff, i;
	struct buf *bp = 0, *rabp;
	struct vattr vattr;
	struct proc *p;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn, rabn;
	int bufsize;
	int nra, error = 0, n = 0, on = 0, not_readin;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_offset < 0)
		return (EINVAL);
	p = uio->uio_procp;
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	biosize = vp->v_mount->mnt_stat.f_iosize;
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nqnfs, full cache consistency is maintained within the loop.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * NB: This implies that cache data can be read when up to
	 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
	 * attributes this could be forced by setting n_attrstamp to 0 before
	 * the VOP_GETATTR() call.
	 */
	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
		if (np->n_flag & NMODIFIED) {
			if (vp->v_type != VREG) {
				if (vp->v_type != VDIR)
					panic("nfs: bioread, not dir");
				nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			np->n_mtime = vattr.va_mtime.tv_sec;
		} else {
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			if (np->n_mtime != vattr.va_mtime.tv_sec) {
				if (vp->v_type == VDIR)
					nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_mtime = vattr.va_mtime.tv_sec;
			}
		}
	}
	do {

	    /*
	     * Get a valid lease. If cached data is stale, flush it.
	     */
	    if (nmp->nm_flag & NFSMNT_NQNFS) {
		if (NQNFS_CKINVALID(vp, np, ND_READ)) {
		    do {
			error = nqnfs_getlease(vp, ND_READ, cred, p);
		    } while (error == NQNFS_EXPIRED);
		    if (error)
			return (error);
		    if (np->n_lrev != np->n_brev ||
			(np->n_flag & NQNFSNONCACHE) ||
			((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
			if (vp->v_type == VDIR)
			    nfs_invaldir(vp);
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
			    return (error);
			np->n_brev = np->n_lrev;
		    }
		} else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
		    nfs_invaldir(vp);
		    error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
		    if (error)
			return (error);
		}
	    }
	    if (np->n_flag & NQNFSNONCACHE) {
		switch (vp->v_type) {
		case VREG:
			return (nfs_readrpc(vp, uio, cred));
		case VLNK:
			return (nfs_readlinkrpc(vp, uio, cred));
		case VDIR:
			break;
		default:
			printf(" NQNFSNONCACHE: type %x unexpected\n",
				vp->v_type);
		}
	    }
	    switch (vp->v_type) {
	    case VREG:
		nfsstats.biocache_reads++;
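		/*
		 * Split the offset into a logical block number and an
		 * offset within that block; the mask form assumes that
		 * biosize is a power of two.
		 */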
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);
		not_readin = 1;

		/*
		 * Start the read ahead(s), as required.
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
		    for (nra = 0; nra < nmp->nm_readahead &&
			(off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
			rabn = lbn + 1 + nra;
			if (!incore(vp, rabn)) {
			    rabp = nfs_getcacheblk(vp, rabn, biosize, p);
			    if (!rabp)
				return (EINTR);
			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
				rabp->b_flags |= (B_READ | B_ASYNC);
				vfs_busy_pages(rabp, 0);
				if (nfs_asyncio(rabp, cred)) {
				    rabp->b_flags |= B_INVAL|B_ERROR;
				    vfs_unbusy_pages(rabp);
				    brelse(rabp);
				}
			    } else
				brelse(rabp);
			}
		    }
		}

		/*
		 * If the block is in the cache and has the required data
		 * in a valid region, just copy it out.
		 * Otherwise, get the block and write back/read in,
		 * as required.
		 */
again:
		bufsize = biosize;
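		/*
		 * For the last block of the file, trim the buffer so it
		 * only covers the data up to EOF, rounded up to a
		 * DEV_BSIZE boundary by the usual round-up-and-mask idiom.
		 */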
		if ((off_t)(lbn + 1) * biosize > np->n_size &&
		    (off_t)(lbn + 1) * biosize - np->n_size < biosize) {
			bufsize = np->n_size - lbn * biosize;
			bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		}
		bp = nfs_getcacheblk(vp, lbn, bufsize, p);
		if (!bp)
			return (EINTR);
		/*
		 * If we are being called from nfs_getpages, we must
		 * make sure the buffer is a vmio buffer.  The vp will
		 * already be set up for vmio but there may be some old
		 * non-vmio buffers attached to it.
		 */
		if (getpages && !(bp->b_flags & B_VMIO)) {
#ifdef DIAGNOSTIC
			printf("nfs_bioread: non vmio buf found, discarding\n");
#endif
			bp->b_flags |= B_NOCACHE;
			bp->b_flags |= B_INVAFTERWRITE;
			if (bp->b_dirtyend > 0) {
				if ((bp->b_flags & B_DELWRI) == 0)
					panic("nfsbioread");
				if (VOP_BWRITE(bp) == EINTR)
					return (EINTR);
			} else
				brelse(bp);
			goto again;
		}
		if ((bp->b_flags & B_CACHE) == 0) {
		    bp->b_flags |= B_READ;
		    bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
		    not_readin = 0;
		    vfs_busy_pages(bp, 0);
		    error = nfs_doio(bp, cred, p);
		    if (error) {
			brelse(bp);
			return (error);
		    }
		}
		if (bufsize > on) {
			n = min((unsigned)(bufsize - on), uio->uio_resid);
		} else {
			n = 0;
		}
		diff = np->n_size - uio->uio_offset;
		if (diff < n)
			n = diff;
		if (not_readin && n > 0) {
			if (on < bp->b_validoff || (on + n) > bp->b_validend) {
				bp->b_flags |= B_NOCACHE;
				bp->b_flags |= B_INVAFTERWRITE;
				if (bp->b_dirtyend > 0) {
				    if ((bp->b_flags & B_DELWRI) == 0)
					panic("nfsbioread");
				    if (VOP_BWRITE(bp) == EINTR)
					return (EINTR);
				} else
				    brelse(bp);
				goto again;
			}
		}
		vp->v_lastr = lbn;
		diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
		if (diff < n)
			n = diff;
		break;
	    case VLNK:
		nfsstats.biocache_readlinks++;
		bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p);
		if (!bp)
			return (EINTR);
		if ((bp->b_flags & B_CACHE) == 0) {
		    bp->b_flags |= B_READ;
		    vfs_busy_pages(bp, 0);
		    error = nfs_doio(bp, cred, p);
		    if (error) {
			bp->b_flags |= B_ERROR;
			brelse(bp);
			return (error);
		    }
		}
		n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
		on = 0;
		break;
	    case VDIR:
		nfsstats.biocache_readdirs++;
		if (np->n_direofoffset
		    && uio->uio_offset >= np->n_direofoffset) {
		    return (0);
		}
		lbn = uio->uio_offset / NFS_DIRBLKSIZ;
		on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
		bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p);
		if (!bp)
		    return (EINTR);
		if ((bp->b_flags & B_CACHE) == 0) {
		    bp->b_flags |= B_READ;
		    vfs_busy_pages(bp, 0);
		    error = nfs_doio(bp, cred, p);
		    if (error) {
			    brelse(bp);
		    }
		    while (error == NFSERR_BAD_COOKIE) {
			nfs_invaldir(vp);
			error = nfs_vinvalbuf(vp, 0, cred, p, 1);
			/*
			 * Yuck! The directory has been modified on the
			 * server. The only way to get the block is by
			 * reading from the beginning to get all the
			 * offset cookies.
			 */
			for (i = 0; i <= lbn && !error; i++) {
			    if (np->n_direofoffset
				&& (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
				    return (0);
			    bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p);
			    if (!bp)
				return (EINTR);
			    if ((bp->b_flags & B_DONE) == 0) {
				bp->b_flags |= B_READ;
				vfs_busy_pages(bp, 0);
				error = nfs_doio(bp, cred, p);
				if (error) {
				    brelse(bp);
				} else if (i < lbn) {
				    brelse(bp);
				}
			    }
			}
		    }
		    if (error)
			    return (error);
		}

		/*
		 * If not eof and read aheads are enabled, start one.
		 * (You need the current block first, so that you have the
		 *  directory offset cookie of the next block.)
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
		    (np->n_direofoffset == 0 ||
		    (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
		    !(np->n_flag & NQNFSNONCACHE) &&
		    !incore(vp, lbn + 1)) {
			rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p);
			if (rabp) {
			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
				rabp->b_flags |= (B_READ | B_ASYNC);
				vfs_busy_pages(rabp, 0);
				if (nfs_asyncio(rabp, cred)) {
				    rabp->b_flags |= B_INVAL|B_ERROR;
				    vfs_unbusy_pages(rabp);
				    brelse(rabp);
				}
			    } else {
				brelse(rabp);
			    }
			}
		}
		/*
		 * Make sure we use a signed variant of min() since
		 * the second term may be negative.
		 */
		n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
		break;
	    default:
		printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
		break;
	    }

	    if (n > 0) {
		error = uiomove(bp->b_data + on, (int)n, uio);
	    }
	    switch (vp->v_type) {
	    case VREG:
		break;
	    case VLNK:
		n = 0;
		break;
	    case VDIR:
		if (np->n_flag & NQNFSNONCACHE)
			bp->b_flags |= B_INVAL;
		break;
	    default:
		printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
	    }
	    brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}

/*
 * Vnode op for write using bio
 */
int
nfs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int  a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	register int biosize;
	register struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	register struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	register struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn;
	int bufsize;
	int n, on, error = 0, iomode, must_commit;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
				return (error);
		}
		if (ioflag & IO_APPEND) {
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			uio->uio_offset = np->n_size;
		}
	}
	if (uio->uio_offset < 0)
		return (EINVAL);
	if (uio->uio_resid == 0)
		return (0);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	if (p && uio->uio_offset + uio->uio_resid >
	      p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		psignal(p, SIGXFSZ);
		return (EFBIG);
	}
	/*
	 * I use nm_rsize, not nm_wsize so that all buffer cache blocks
	 * will be the same size within a filesystem. nfs_writerpc will
	 * still use nm_wsize when sizing the rpcs.
	 */
	biosize = vp->v_mount->mnt_stat.f_iosize;
	do {
		/*
		 * Check for a valid write lease.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error)
				return (error);
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
			}
		}
		if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
		    iomode = NFSV3WRITE_FILESYNC;
		    error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
		    if (must_commit)
			nfs_clearcommit(vp->v_mount);
		    return (error);
		}
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize-1);
		n = min((unsigned)(biosize - on), uio->uio_resid);
again:
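		/*
		 * If the write extends the file, grow np->n_size now and
		 * inform the VM system, so that the buffer sizing below
		 * is consistent with the new EOF.
		 */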
		if (uio->uio_offset + n > np->n_size) {
			np->n_size = uio->uio_offset + n;
			np->n_flag |= NMODIFIED;
			vnode_pager_setsize(vp, (u_long)np->n_size);
		}
		bufsize = biosize;
		if ((lbn + 1) * biosize > np->n_size) {
			bufsize = np->n_size - lbn * biosize;
			bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		}
		bp = nfs_getcacheblk(vp, lbn, bufsize, p);
		if (!bp)
			return (EINTR);
		if (bp->b_wcred == NOCRED) {
			crhold(cred);
			bp->b_wcred = cred;
		}
		np->n_flag |= NMODIFIED;

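		/*
		 * Make sure the recorded dirty region does not extend
		 * past the current end of the file.
		 */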
		if ((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend > np->n_size) {
			bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);
		}

		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise force a write rpc of the old dirty area.
		 */
		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			bp->b_proc = p;
			if (VOP_BWRITE(bp) == EINTR)
				return (EINTR);
			goto again;
		}

		/*
		 * Check for a valid write lease and get one as required,
		 * in case getblk() and/or bwrite() delayed us.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error) {
				brelse(bp);
				return (error);
			}
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				brelse(bp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
				goto again;
			}
		}
		error = uiomove((char *)bp->b_data + on, n, uio);
		if (error) {
			bp->b_flags |= B_ERROR;
			brelse(bp);
			return (error);
		}
		if (bp->b_dirtyend > 0) {
			bp->b_dirtyoff = min(on, bp->b_dirtyoff);
			bp->b_dirtyend = max((on + n), bp->b_dirtyend);
		} else {
			bp->b_dirtyoff = on;
			bp->b_dirtyend = on + n;
		}
		if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
		    bp->b_validoff > bp->b_dirtyend) {
			bp->b_validoff = bp->b_dirtyoff;
			bp->b_validend = bp->b_dirtyend;
		} else {
			bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
			bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
		}

		/*
		 * Since this block is being modified, it must be written
		 * again and not just committed.
		 */
		bp->b_flags &= ~B_NEEDCOMMIT;

		/*
		 * If the lease is non-cacheable or IO_SYNC, do bwrite().
		 */
		if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
			bp->b_proc = p;
			error = VOP_BWRITE(bp);
			if (error)
				return (error);
			if (np->n_flag & NQNFSNONCACHE) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
		} else if ((n + on) == biosize &&
			(nmp->nm_flag & NFSMNT_NQNFS) == 0) {
			bp->b_proc = (struct proc *)0;
			bp->b_flags |= B_ASYNC;
			(void)nfs_writebp(bp, 0);
		} else
			bdwrite(bp);
	} while (uio->uio_resid > 0 && n > 0);
	return (0);
}

/*
 * Get an nfs cache block.
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 */
static struct buf *
nfs_getcacheblk(vp, bn, size, p)
	struct vnode *vp;
	daddr_t bn;
	int size;
	struct proc *p;
{
	register struct buf *bp;
	struct mount *mp;
	struct nfsmount *nmp;

	mp = vp->v_mount;
	nmp = VFSTONFS(mp);

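	/*
	 * On interruptible mounts, sleep in getblk() with PCATCH and
	 * then poll with a 2 second timeout, checking between attempts
	 * whether a signal should abort the wait.
	 */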
	if (nmp->nm_flag & NFSMNT_INT) {
		bp = getblk(vp, bn, size, PCATCH, 0);
		while (bp == (struct buf *)0) {
			if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
				return ((struct buf *)0);
			bp = getblk(vp, bn, size, 0, 2 * hz);
		}
	} else
		bp = getblk(vp, bn, size, 0, 0);

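	/*
	 * For regular files, express b_blkno in the DEV_BSIZE units
	 * that the rest of the buffer code expects.
	 */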
	if (vp->v_type == VREG) {
		int biosize;
		biosize = mp->mnt_stat.f_iosize;
		bp->b_blkno = (bn * biosize) / DEV_BSIZE;
	}

	return (bp);
}

/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int intrflg;
{
	register struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;

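	/*
	 * The vnode is presumably being reclaimed (VXLOCK), in which
	 * case the vnode layer cleans up its buffers itself; there is
	 * nothing for us to flush.
	 */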
	if (vp->v_flag & VXLOCK) {
		return (0);
	}

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}
	/*
	 * First wait for any other process doing a flush to complete.
	 */
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
			slptimeo);
		if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
			return (EINTR);
	}

	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
	while (error) {
		if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) {
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup((caddr_t)&np->n_flag);
			}
			return (EINTR);
		}
		error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
	}
	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
	return (0);
}

/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 */
int
nfs_asyncio(bp, cred)
	register struct buf *bp;
	struct ucred *cred;
{
	struct nfsmount *nmp;
	int i;
	int gotiod;
	int slpflag = 0;
	int slptimeo = 0;
	int error;

	if (nfs_numasync == 0)
		return (EIO);

	nmp = VFSTONFS(bp->b_vp->v_mount);
again:
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	gotiod = FALSE;

	/*
	 * Find a free iod to process this request.
	 */
	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
		if (nfs_iodwant[i]) {
			/*
			 * Found one, so wake it up and tell it which
			 * mount to process.
			 */
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waking iod %d for mount %p\n",
				 i, nmp));
			nfs_iodwant[i] = (struct proc *)0;
			nfs_iodmount[i] = nmp;
			nmp->nm_bufqiods++;
			wakeup((caddr_t)&nfs_iodwant[i]);
			gotiod = TRUE;
			break;
		}

	/*
	 * If none are free, we may already have an iod working on this mount
	 * point.  If so, it will process our request.
	 */
	if (!gotiod) {
		if (nmp->nm_bufqiods > 0) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: %d iods are already processing mount %p\n",
				 nmp->nm_bufqiods, nmp));
			gotiod = TRUE;
		}
	}

	/*
	 * If we have an iod which can process the request, then queue
	 * the buffer.
	 */
	if (gotiod) {
		/*
		 * Ensure that the queue never grows too large.
		 */
		while (nmp->nm_bufqlen >= 2*nfs_numasync) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
			nmp->nm_bufqwant = TRUE;
			error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
				       "nfsaio", slptimeo);
			if (error) {
				if (nfs_sigintr(nmp, NULL, bp->b_proc))
					return (EINTR);
				if (slpflag == PCATCH) {
					slpflag = 0;
					slptimeo = 2 * hz;
				}
			}
			/*
			 * We might have lost our iod while sleeping,
			 * so check and loop if necessary.
			 */
			if (nmp->nm_bufqiods == 0) {
				NFS_DPF(ASYNCIO,
					("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
				goto again;
			}
		}

		if (bp->b_flags & B_READ) {
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
		} else {
			bp->b_flags |= B_WRITEINPROG;
			if (bp->b_wcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_wcred = cred;
			}
		}

		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
		nmp->nm_bufqlen++;
		return (0);
	}

	/*
	 * All the iods are busy on other mounts, so return EIO to
	 * force the caller to process the i/o synchronously.
	 */
	NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
	return (EIO);
}

/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(bp, cr, p)
	register struct buf *bp;
	struct ucred *cr;
	struct proc *p;
{
	register struct uio *uiop;
	register struct vnode *vp;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, diff, len, iomode, must_commit = 0;
	struct uio uio;
	struct iovec io;

	vp = bp->b_vp;
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_procp = p;

	/*
	 * Historically, paging was done with physio, but no more.
	 */
	if (bp->b_flags & B_PHYS) {
	    /*
	     * ...though reading /dev/drum still gets us here.
	     */
	    io.iov_len = uiop->uio_resid = bp->b_bcount;
	    /* mapping was done by vmapbuf() */
	    io.iov_base = bp->b_data;
	    uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
	    if (bp->b_flags & B_READ) {
		uiop->uio_rw = UIO_READ;
		nfsstats.read_physios++;
		error = nfs_readrpc(vp, uiop, cr);
	    } else {
		int com;

		iomode = NFSV3WRITE_DATASYNC;
		uiop->uio_rw = UIO_WRITE;
		nfsstats.write_physios++;
		error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
	    }
	    if (error) {
		bp->b_flags |= B_ERROR;
		bp->b_error = error;
	    }
	} else if (bp->b_flags & B_READ) {
	    io.iov_len = uiop->uio_resid = bp->b_bcount;
	    io.iov_base = bp->b_data;
	    uiop->uio_rw = UIO_READ;
	    switch (vp->v_type) {
	    case VREG:
		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
		nfsstats.read_bios++;
		error = nfs_readrpc(vp, uiop, cr);
		if (!error) {
		    bp->b_validoff = 0;
		    if (uiop->uio_resid) {
			/*
			 * If len > 0, there is a hole in the file and
			 * no writes after the hole have been pushed to
			 * the server yet.
			 * Just zero fill the rest of the valid area.
			 */
			diff = bp->b_bcount - uiop->uio_resid;
			len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE
				+ diff);
			if (len > 0) {
			    len = min(len, uiop->uio_resid);
			    bzero((char *)bp->b_data + diff, len);
			    bp->b_validend = diff + len;
			} else
			    bp->b_validend = diff;
		    } else
			bp->b_validend = bp->b_bcount;
		}
		if (p && (vp->v_flag & VTEXT) &&
			(((nmp->nm_flag & NFSMNT_NQNFS) &&
			  NQNFS_CKINVALID(vp, np, ND_READ) &&
			  np->n_lrev != np->n_brev) ||
			 (!(nmp->nm_flag & NFSMNT_NQNFS) &&
			  np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
			uprintf("Process killed due to text file modification\n");
			psignal(p, SIGKILL);
			p->p_flag |= P_NOSWAP;
		}
		break;
	    case VLNK:
		uiop->uio_offset = (off_t)0;
		nfsstats.readlink_bios++;
		error = nfs_readlinkrpc(vp, uiop, cr);
		break;
	    case VDIR:
		nfsstats.readdir_bios++;
		uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
		if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
			error = nfs_readdirplusrpc(vp, uiop, cr);
			if (error == NFSERR_NOTSUPP)
				nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
		}
		if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
			error = nfs_readdirrpc(vp, uiop, cr);
		break;
	    default:
		printf("nfs_doio:  type %x unexpected\n",vp->v_type);
		break;
	    }
	    if (error) {
		bp->b_flags |= B_ERROR;
		bp->b_error = error;
	    }
	} else {
	    if (((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend) > np->n_size)
		bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);

	    if (bp->b_dirtyend > bp->b_dirtyoff) {
		io.iov_len = uiop->uio_resid = bp->b_dirtyend
		    - bp->b_dirtyoff;
		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE
		    + bp->b_dirtyoff;
		io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
		uiop->uio_rw = UIO_WRITE;
		nfsstats.write_bios++;
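		/*
		 * Only a plain async write (none of B_NEEDCOMMIT,
		 * B_NOCACHE or B_CLUSTER set) may go out as UNSTABLE and
		 * be committed later; everything else is written
		 * FILESYNC so the server puts it on stable storage now.
		 */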
		if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC)
		    iomode = NFSV3WRITE_UNSTABLE;
		else
		    iomode = NFSV3WRITE_FILESYNC;
		bp->b_flags |= B_WRITEINPROG;
		error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
		if (!error && iomode == NFSV3WRITE_UNSTABLE) {
		    bp->b_flags |= B_NEEDCOMMIT;
		    if (bp->b_dirtyoff == 0
			&& bp->b_dirtyend == bp->b_bufsize)
			bp->b_flags |= B_CLUSTEROK;
		} else
		    bp->b_flags &= ~B_NEEDCOMMIT;
		bp->b_flags &= ~B_WRITEINPROG;

		/*
		 * For an interrupted write, the buffer is still valid
		 * and the write hasn't been pushed to the server yet,
		 * so we can't set B_ERROR; instead we report the
		 * interruption by setting B_EINTR. For the B_ASYNC case,
		 * B_EINTR is not relevant, so the rpc attempt is
		 * essentially a noop.  For the case of a V3 write rpc not
		 * being committed to stable storage, the block is still
		 * dirty and requires either a commit rpc or another
		 * write rpc with iomode == NFSV3WRITE_FILESYNC before
		 * the block is reused. This is indicated by setting
		 * the B_DELWRI and B_NEEDCOMMIT flags.
		 */
		if (error == EINTR
		    || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
			bp->b_flags &= ~(B_INVAL|B_NOCACHE);
			++numdirtybuffers;
			bp->b_flags |= B_DELWRI;
			reassignbuf(bp, vp);
			if ((bp->b_flags & B_ASYNC) == 0)
			    bp->b_flags |= B_EINTR;
		} else {
			if (error) {
				bp->b_flags |= B_ERROR;
				bp->b_error = np->n_error = error;
				np->n_flag |= NWRITEERR;
			}
			bp->b_dirtyoff = bp->b_dirtyend = 0;
		}
	    } else {
		bp->b_resid = 0;
		biodone(bp);
		return (0);
	    }
	}
	bp->b_resid = uiop->uio_resid;
	if (must_commit)
		nfs_clearcommit(vp->v_mount);
	biodone(bp);
	return (error);
}