nfs_bio.c revision 33108
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 * $Id: nfs_bio.c,v 1.48 1998/01/31 01:27:18 tegge Exp $
 */

#include "opt_diagnostic.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_prot.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nqnfs.h>
#include <nfs/nfsnode.h>

static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
					struct proc *p));

extern int nfs_numasync;
extern struct nfsstats nfsstats;

/*
 * Vnode op for VM getpages.
 */
int
nfs_getpages(ap)
	struct vop_getpages_args *ap;
{
	int i, pcount, error;
	struct uio uio;
	struct iovec iov;
	vm_page_t m;
	vm_offset_t kva;

	if ((ap->a_vp->v_object) == NULL) {
		printf("nfs_getpages: called with non-merged cache vnode??\n");
		return EOPNOTSUPP;
	}
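	/*
	 * The strategy here is simple: fetch just the requested page
	 * with a synchronous read rpc and hand the remaining pages
	 * back to the VM system; no read ahead is attempted.
	 */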
	m = ap->a_m[ap->a_reqpage];
	kva = vm_pager_map_page(m);

	iov.iov_base = (caddr_t) kva;
	iov.iov_len = PAGE_SIZE;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = IDX_TO_OFF(m->pindex);
	uio.uio_resid = PAGE_SIZE;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_READ;
	uio.uio_procp = curproc;

	error = nfs_readrpc(ap->a_vp, &uio, curproc->p_ucred);
	vm_pager_unmap_page(kva);

	pcount = round_page(ap->a_count) / PAGE_SIZE;
	for (i = 0; i < pcount; i++) {
		if (i != ap->a_reqpage) {
			vnode_pager_freepage(ap->a_m[i]);
		}
	}

	if (error && (uio.uio_resid == PAGE_SIZE))
		return VM_PAGER_ERROR;
	return 0;
}

/*
 * Vnode op for read using bio
 * Any similarity to readip() is purely coincidental
 */
int
nfs_bioread(vp, uio, ioflag, cred, getpages)
	register struct vnode *vp;
	register struct uio *uio;
	int ioflag;
	struct ucred *cred;
	int getpages;
{
	register struct nfsnode *np = VTONFS(vp);
	register int biosize, diff, i;
	struct buf *bp = 0, *rabp;
	struct vattr vattr;
	struct proc *p;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn, rabn;
	int bufsize;
	int nra, error = 0, n = 0, on = 0, not_readin;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_offset < 0)
		return (EINVAL);
	p = uio->uio_procp;
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	biosize = vp->v_mount->mnt_stat.f_iosize;
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nqnfs, full cache consistency is maintained within the loop.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * NB: This implies that cache data can be read when up to
	 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
	 * attributes this could be forced by setting n_attrstamp to 0 before
	 * the VOP_GETATTR() call.
	 */
	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
		if (np->n_flag & NMODIFIED) {
			if (vp->v_type != VREG) {
				if (vp->v_type != VDIR)
					panic("nfs: bioread, not dir");
				nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			np->n_mtime = vattr.va_mtime.tv_sec;
		} else {
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			if (np->n_mtime != vattr.va_mtime.tv_sec) {
				if (vp->v_type == VDIR)
					nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_mtime = vattr.va_mtime.tv_sec;
			}
		}
	}
	do {

	    /*
	     * Get a valid lease. If cached data is stale, flush it.
	     */
	    if (nmp->nm_flag & NFSMNT_NQNFS) {
		if (NQNFS_CKINVALID(vp, np, ND_READ)) {
		    do {
			error = nqnfs_getlease(vp, ND_READ, cred, p);
		    } while (error == NQNFS_EXPIRED);
		    if (error)
			return (error);
		    if (np->n_lrev != np->n_brev ||
			(np->n_flag & NQNFSNONCACHE) ||
			((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
			if (vp->v_type == VDIR)
			    nfs_invaldir(vp);
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
			    return (error);
			np->n_brev = np->n_lrev;
		    }
		} else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
		    nfs_invaldir(vp);
		    error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
		    if (error)
			return (error);
		}
	    }
	    if (np->n_flag & NQNFSNONCACHE) {
		switch (vp->v_type) {
		case VREG:
			return (nfs_readrpc(vp, uio, cred));
		case VLNK:
			return (nfs_readlinkrpc(vp, uio, cred));
		case VDIR:
			break;
		default:
			printf(" NQNFSNONCACHE: type %x unexpected\n",
				vp->v_type);
		}
	    }
	    switch (vp->v_type) {
	    case VREG:
		nfsstats.biocache_reads++;
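		/*
		 * Compute the logical block number and the offset within
		 * the block.  Note that the masking arithmetic assumes
		 * f_iosize is a power of 2.
		 */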
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);
		not_readin = 1;

		/*
		 * Start the read ahead(s), as required.
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
		    for (nra = 0; nra < nmp->nm_readahead &&
			(off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
			rabn = lbn + 1 + nra;
			if (!incore(vp, rabn)) {
			    rabp = nfs_getcacheblk(vp, rabn, biosize, p);
			    if (!rabp)
				return (EINTR);
			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
				rabp->b_flags |= (B_READ | B_ASYNC);
				vfs_busy_pages(rabp, 0);
				if (nfs_asyncio(rabp, cred)) {
				    rabp->b_flags |= B_INVAL|B_ERROR;
				    vfs_unbusy_pages(rabp);
				    brelse(rabp);
				}
			    } else
				brelse(rabp);
			}
		    }
		}

		/*
		 * If the block is in the cache and has the required data
		 * in a valid region, just copy it out.
		 * Otherwise, get the block and write back/read in,
		 * as required.
		 */
again:
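		/*
		 * Use a smaller buffer for the final, partial block of
		 * the file, rounded up to a DEV_BSIZE boundary.
		 */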
		bufsize = biosize;
		if ((off_t)(lbn + 1) * biosize > np->n_size &&
		    (off_t)(lbn + 1) * biosize - np->n_size < biosize) {
			bufsize = np->n_size - lbn * biosize;
			bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		}
		bp = nfs_getcacheblk(vp, lbn, bufsize, p);
		if (!bp)
			return (EINTR);
		/*
		 * If we are being called from nfs_getpages, we must
		 * make sure the buffer is a vmio buffer.  The vp will
		 * already be setup for vmio but there may be some old
		 * non-vmio buffers attached to it.
		 */
		if (getpages && !(bp->b_flags & B_VMIO)) {
#ifdef DIAGNOSTIC
			printf("nfs_bioread: non vmio buf found, discarding\n");
#endif
			bp->b_flags |= B_NOCACHE;
			bp->b_flags |= B_INVAFTERWRITE;
			if (bp->b_dirtyend > 0) {
				if ((bp->b_flags & B_DELWRI) == 0)
					panic("nfsbioread");
				if (VOP_BWRITE(bp) == EINTR)
					return (EINTR);
			} else
				brelse(bp);
			goto again;
		}
		if ((bp->b_flags & B_CACHE) == 0) {
		    bp->b_flags |= B_READ;
		    bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
		    not_readin = 0;
		    vfs_busy_pages(bp, 0);
		    error = nfs_doio(bp, cred, p);
		    if (error) {
			brelse(bp);
			return (error);
		    }
		}
		if (bufsize > on) {
			n = min((unsigned)(bufsize - on), uio->uio_resid);
		} else {
			n = 0;
		}
		diff = np->n_size - uio->uio_offset;
		if (diff < n)
			n = diff;
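		/*
		 * If the buffer came from the cache (we did not do the
		 * read ourselves), its valid region may not cover the
		 * range we need; if so, toss it and re-read the block.
		 */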
		if (not_readin && n > 0) {
			if (on < bp->b_validoff || (on + n) > bp->b_validend) {
				bp->b_flags |= B_NOCACHE;
				bp->b_flags |= B_INVAFTERWRITE;
				if (bp->b_dirtyend > 0) {
				    if ((bp->b_flags & B_DELWRI) == 0)
					panic("nfsbioread");
				    if (VOP_BWRITE(bp) == EINTR)
					return (EINTR);
				} else
				    brelse(bp);
				goto again;
			}
		}
		vp->v_lastr = lbn;
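		/*
		 * Clamp n once more so that we never copy out bytes
		 * beyond the buffer's valid region.
		 */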
		diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
		if (diff < n)
			n = diff;
		break;
	    case VLNK:
		nfsstats.biocache_readlinks++;
		bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p);
		if (!bp)
			return (EINTR);
		if ((bp->b_flags & B_CACHE) == 0) {
		    bp->b_flags |= B_READ;
		    vfs_busy_pages(bp, 0);
		    error = nfs_doio(bp, cred, p);
		    if (error) {
			bp->b_flags |= B_ERROR;
			brelse(bp);
			return (error);
		    }
		}
		n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
		on = 0;
		break;
	    case VDIR:
		nfsstats.biocache_readdirs++;
		if (np->n_direofoffset
		    && uio->uio_offset >= np->n_direofoffset) {
		    return (0);
		}
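		/*
		 * Directories are cached in NFS_DIRBLKSIZ blocks keyed
		 * by logical offset; each block's directory offset
		 * cookie comes from reading the block before it.
		 */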
		lbn = uio->uio_offset / NFS_DIRBLKSIZ;
		on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
		bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p);
		if (!bp)
		    return (EINTR);
		if ((bp->b_flags & B_CACHE) == 0) {
		    bp->b_flags |= B_READ;
		    vfs_busy_pages(bp, 0);
		    error = nfs_doio(bp, cred, p);
		    if (error) {
			    brelse(bp);
		    }
		    while (error == NFSERR_BAD_COOKIE) {
			nfs_invaldir(vp);
			error = nfs_vinvalbuf(vp, 0, cred, p, 1);
			/*
			 * Yuck! The directory has been modified on the
			 * server. The only way to get the block is by
			 * reading from the beginning to get all the
			 * offset cookies.
			 */
			for (i = 0; i <= lbn && !error; i++) {
			    if (np->n_direofoffset
				&& (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
				    return (0);
			    bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p);
			    if (!bp)
				return (EINTR);
			    if ((bp->b_flags & B_DONE) == 0) {
				bp->b_flags |= B_READ;
				vfs_busy_pages(bp, 0);
				error = nfs_doio(bp, cred, p);
				if (error) {
				    brelse(bp);
				} else if (i < lbn) {
				    brelse(bp);
				}
			    }
			}
		    }
		    if (error)
			    return (error);
		}

		/*
		 * If not eof and read aheads are enabled, start one.
		 * (You need the current block first, so that you have the
		 *  directory offset cookie of the next block.)
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
		    (np->n_direofoffset == 0 ||
		    (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
		    !(np->n_flag & NQNFSNONCACHE) &&
		    !incore(vp, lbn + 1)) {
			rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p);
			if (rabp) {
			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
				rabp->b_flags |= (B_READ | B_ASYNC);
				vfs_busy_pages(rabp, 0);
				if (nfs_asyncio(rabp, cred)) {
				    rabp->b_flags |= B_INVAL|B_ERROR;
				    vfs_unbusy_pages(rabp);
				    brelse(rabp);
				}
			    } else {
				brelse(rabp);
			    }
			}
		}
		/*
		 * Make sure we use a signed variant of min() since
		 * the second term may be negative.
		 */
		n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
		break;
	    default:
		printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
		break;
	    }

	    if (n > 0) {
		error = uiomove(bp->b_data + on, (int)n, uio);
	    }
	    switch (vp->v_type) {
	    case VREG:
		break;
	    case VLNK:
		n = 0;
		break;
	    case VDIR:
		if (np->n_flag & NQNFSNONCACHE)
			bp->b_flags |= B_INVAL;
		break;
	    default:
		printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
	    }
	    brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}

/*
 * Vnode op for write using bio
 */
int
nfs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int  a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	register int biosize;
	register struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	register struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	register struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn;
	int bufsize;
	int n, on, error = 0, iomode, must_commit;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
				return (error);
		}
		if (ioflag & IO_APPEND) {
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			uio->uio_offset = np->n_size;
		}
	}
	if (uio->uio_offset < 0)
		return (EINVAL);
	if (uio->uio_resid == 0)
		return (0);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	if (p && uio->uio_offset + uio->uio_resid >
	      p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		psignal(p, SIGXFSZ);
		return (EFBIG);
	}
	/*
	 * I use nm_rsize, not nm_wsize so that all buffer cache blocks
	 * will be the same size within a filesystem. nfs_writerpc will
	 * still use nm_wsize when sizing the rpc's.
	 */
	biosize = vp->v_mount->mnt_stat.f_iosize;
	do {
		/*
		 * Check for a valid write lease.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error)
				return (error);
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
			}
		}
		if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
		    iomode = NFSV3WRITE_FILESYNC;
		    error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
		    if (must_commit)
			nfs_clearcommit(vp->v_mount);
		    return (error);
		}
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize-1);
		n = min((unsigned)(biosize - on), uio->uio_resid);
again:
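		/*
		 * If this write extends the file, bump n_size now and
		 * tell the VM pager so that pages past the old EOF are
		 * sized correctly.
		 */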
		if (uio->uio_offset + n > np->n_size) {
			np->n_size = uio->uio_offset + n;
			np->n_flag |= NMODIFIED;
			vnode_pager_setsize(vp, (u_long)np->n_size);
		}
		bufsize = biosize;
		if ((lbn + 1) * biosize > np->n_size) {
			bufsize = np->n_size - lbn * biosize;
			bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		}
		bp = nfs_getcacheblk(vp, lbn, bufsize, p);
		if (!bp)
			return (EINTR);
		if (bp->b_wcred == NOCRED) {
			crhold(cred);
			bp->b_wcred = cred;
		}
		np->n_flag |= NMODIFIED;

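		/*
		 * Clamp the buffer's dirty region so that it never
		 * extends past the end of the file.
		 */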
		if ((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend > np->n_size) {
			bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);
		}

		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise force a write rpc of the old dirty area.
		 */
		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			bp->b_proc = p;
			if (VOP_BWRITE(bp) == EINTR)
				return (EINTR);
			goto again;
		}

		/*
		 * Check for valid write lease and get one as required.
		 * In case getblk() and/or bwrite() delayed us.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error) {
				brelse(bp);
				return (error);
			}
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				brelse(bp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
				goto again;
			}
		}
		error = uiomove((char *)bp->b_data + on, n, uio);
		if (error) {
			bp->b_flags |= B_ERROR;
			brelse(bp);
			return (error);
		}
		if (bp->b_dirtyend > 0) {
			bp->b_dirtyoff = min(on, bp->b_dirtyoff);
			bp->b_dirtyend = max((on + n), bp->b_dirtyend);
		} else {
			bp->b_dirtyoff = on;
			bp->b_dirtyend = on + n;
		}
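		/*
		 * Extend the valid region to cover the bytes we just
		 * dirtied.  If the old valid region does not abut the
		 * new dirty region, just reset it; otherwise merge the
		 * two.
		 */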
		if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
		    bp->b_validoff > bp->b_dirtyend) {
			bp->b_validoff = bp->b_dirtyoff;
			bp->b_validend = bp->b_dirtyend;
		} else {
			bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
			bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
		}

		/*
		 * Since this block is being modified, it must be written
		 * again and not just committed.
		 */
		bp->b_flags &= ~B_NEEDCOMMIT;

		/*
		 * If the lease is non-cachable or IO_SYNC do bwrite().
		 */
		if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
			bp->b_proc = p;
			error = VOP_BWRITE(bp);
			if (error)
				return (error);
			if (np->n_flag & NQNFSNONCACHE) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
		} else if ((n + on) == biosize &&
			(nmp->nm_flag & NFSMNT_NQNFS) == 0) {
			bp->b_proc = (struct proc *)0;
			bp->b_flags |= B_ASYNC;
			(void)nfs_writebp(bp, 0);
		} else
			bdwrite(bp);
	} while (uio->uio_resid > 0 && n > 0);
	return (0);
}

/*
 * Get an nfs cache block.
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 */
static struct buf *
nfs_getcacheblk(vp, bn, size, p)
	struct vnode *vp;
	daddr_t bn;
	int size;
	struct proc *p;
{
	register struct buf *bp;
	struct mount *mp;
	struct nfsmount *nmp;

	mp = vp->v_mount;
	nmp = VFSTONFS(mp);

	if (nmp->nm_flag & NFSMNT_INT) {
		bp = getblk(vp, bn, size, PCATCH, 0);
		while (bp == (struct buf *)0) {
			if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
				return ((struct buf *)0);
			bp = getblk(vp, bn, size, 0, 2 * hz);
		}
	} else
		bp = getblk(vp, bn, size, 0, 0);

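	/*
	 * For regular files, map the logical block number to its
	 * device block address (b_blkno is kept in DEV_BSIZE units),
	 * so the bio and VM code can relate the buffer to a file
	 * offset.
	 */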
	if (vp->v_type == VREG) {
		int biosize;
		biosize = mp->mnt_stat.f_iosize;
		bp->b_blkno = (bn * biosize) / DEV_BSIZE;
	}

	return (bp);
}

/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int intrflg;
{
	register struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;

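	/*
	 * If the vnode is being cleaned (VXLOCK is set), its buffers
	 * are presumably being reclaimed by vclean(), so there is
	 * nothing useful for us to do here.
	 */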
	if (vp->v_flag & VXLOCK) {
		return (0);
	}

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}
	/*
	 * First wait for any other process doing a flush to complete.
	 */
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
			slptimeo);
		if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
			return (EINTR);
	}

	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
	while (error) {
		if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) {
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup((caddr_t)&np->n_flag);
			}
			return (EINTR);
		}
		error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
	}
	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
	return (0);
}

/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 */
int
nfs_asyncio(bp, cred)
	register struct buf *bp;
	struct ucred *cred;
{
	struct nfsmount *nmp;
	int i;
	int gotiod;
	int slpflag = 0;
	int slptimeo = 0;
	int error;

	if (nfs_numasync == 0)
		return (EIO);

	nmp = VFSTONFS(bp->b_vp->v_mount);
again:
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	gotiod = FALSE;

	/*
	 * Find a free iod to process this request.
	 */
	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
		if (nfs_iodwant[i]) {
			/*
			 * Found one, so wake it up and tell it which
			 * mount to process.
			 */
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waking iod %d for mount %p\n",
				 i, nmp));
			nfs_iodwant[i] = (struct proc *)0;
			nfs_iodmount[i] = nmp;
			nmp->nm_bufqiods++;
			wakeup((caddr_t)&nfs_iodwant[i]);
			gotiod = TRUE;
			break;
		}

	/*
	 * If none are free, we may already have an iod working on this mount
	 * point.  If so, it will process our request.
	 */
	if (!gotiod) {
		if (nmp->nm_bufqiods > 0) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: %d iods are already processing mount %p\n",
				 nmp->nm_bufqiods, nmp));
			gotiod = TRUE;
		}
	}

	/*
	 * If we have an iod which can process the request, then queue
	 * the buffer.
	 */
	if (gotiod) {
		/*
		 * Ensure that the queue never grows too large.
		 */
		while (nmp->nm_bufqlen >= 2 * nfs_numasync) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
			nmp->nm_bufqwant = TRUE;
			error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
				       "nfsaio", slptimeo);
			if (error) {
				if (nfs_sigintr(nmp, NULL, bp->b_proc))
					return (EINTR);
				if (slpflag == PCATCH) {
					slpflag = 0;
					slptimeo = 2 * hz;
				}
			}
			/*
			 * We might have lost our iod while sleeping,
			 * so check and loop if necessary.
			 */
			if (nmp->nm_bufqiods == 0) {
				NFS_DPF(ASYNCIO,
					("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
				goto again;
			}
		}

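		/*
		 * The iod will perform the rpc in its own context, so
		 * hang the appropriate credential on the buffer for it
		 * to use.
		 */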
		if (bp->b_flags & B_READ) {
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
		} else {
			bp->b_flags |= B_WRITEINPROG;
			if (bp->b_wcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_wcred = cred;
			}
		}

		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
		nmp->nm_bufqlen++;
		return (0);
	}

	/*
	 * All the iods are busy on other mounts, so return EIO to
	 * force the caller to process the i/o synchronously.
	 */
	NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
	return (EIO);
}

/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(bp, cr, p)
	register struct buf *bp;
	struct ucred *cr;
	struct proc *p;
{
	register struct uio *uiop;
	register struct vnode *vp;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, diff, len, iomode, must_commit = 0;
	struct uio uio;
	struct iovec io;

	vp = bp->b_vp;
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_procp = p;

	/*
	 * Historically, paging was done with physio, but no more.
	 */
	if (bp->b_flags & B_PHYS) {
	    /*
	     * ...though reading /dev/drum still gets us here.
	     */
	    io.iov_len = uiop->uio_resid = bp->b_bcount;
	    /* mapping was done by vmapbuf() */
	    io.iov_base = bp->b_data;
	    uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
	    if (bp->b_flags & B_READ) {
		uiop->uio_rw = UIO_READ;
		nfsstats.read_physios++;
		error = nfs_readrpc(vp, uiop, cr);
	    } else {
		int com;

		iomode = NFSV3WRITE_DATASYNC;
		uiop->uio_rw = UIO_WRITE;
		nfsstats.write_physios++;
		error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
	    }
	    if (error) {
		bp->b_flags |= B_ERROR;
		bp->b_error = error;
	    }
	} else if (bp->b_flags & B_READ) {
	    io.iov_len = uiop->uio_resid = bp->b_bcount;
	    io.iov_base = bp->b_data;
	    uiop->uio_rw = UIO_READ;
	    switch (vp->v_type) {
	    case VREG:
		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
		nfsstats.read_bios++;
		error = nfs_readrpc(vp, uiop, cr);
		if (!error) {
		    bp->b_validoff = 0;
		    if (uiop->uio_resid) {
			/*
			 * If len > 0, there is a hole in the file and
			 * no writes after the hole have been pushed to
			 * the server yet.
			 * Just zero fill the rest of the valid area.
			 */
			diff = bp->b_bcount - uiop->uio_resid;
			len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE
				+ diff);
			if (len > 0) {
			    len = min(len, uiop->uio_resid);
			    bzero((char *)bp->b_data + diff, len);
			    bp->b_validend = diff + len;
			} else
			    bp->b_validend = diff;
		    } else
			bp->b_validend = bp->b_bcount;
		}
		if (p && (vp->v_flag & VTEXT) &&
			(((nmp->nm_flag & NFSMNT_NQNFS) &&
			  NQNFS_CKINVALID(vp, np, ND_READ) &&
			  np->n_lrev != np->n_brev) ||
			 (!(nmp->nm_flag & NFSMNT_NQNFS) &&
			  np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
			uprintf("Process killed due to text file modification\n");
			psignal(p, SIGKILL);
			p->p_flag |= P_NOSWAP;
		}
		break;
	    case VLNK:
		uiop->uio_offset = (off_t)0;
		nfsstats.readlink_bios++;
		error = nfs_readlinkrpc(vp, uiop, cr);
		break;
	    case VDIR:
		nfsstats.readdir_bios++;
		uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
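		/*
		 * Try READDIRPLUS first when the mount allows it; if
		 * the server does not support it, clear the flag and
		 * fall back to plain READDIR from then on.
		 */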
		if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
			error = nfs_readdirplusrpc(vp, uiop, cr);
			if (error == NFSERR_NOTSUPP)
				nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
		}
		if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
			error = nfs_readdirrpc(vp, uiop, cr);
		break;
	    default:
		printf("nfs_doio:  type %x unexpected\n", vp->v_type);
		break;
	    }
	    if (error) {
		bp->b_flags |= B_ERROR;
		bp->b_error = error;
	    }
	} else {
	    if (((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend) > np->n_size)
		bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);

	    if (bp->b_dirtyend > bp->b_dirtyoff) {
		io.iov_len = uiop->uio_resid = bp->b_dirtyend
		    - bp->b_dirtyoff;
		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE
		    + bp->b_dirtyoff;
		io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
		uiop->uio_rw = UIO_WRITE;
		nfsstats.write_bios++;
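		/*
		 * A plain async write may go out UNSTABLE and be
		 * committed later; anything synchronous, already a
		 * commit candidate, being discarded, or part of a
		 * cluster is pushed to stable storage with FILESYNC.
		 */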
		if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC)
		    iomode = NFSV3WRITE_UNSTABLE;
		else
		    iomode = NFSV3WRITE_FILESYNC;
		bp->b_flags |= B_WRITEINPROG;
		error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
		if (!error && iomode == NFSV3WRITE_UNSTABLE) {
		    bp->b_flags |= B_NEEDCOMMIT;
		    if (bp->b_dirtyoff == 0
			&& bp->b_dirtyend == bp->b_bufsize)
			bp->b_flags |= B_CLUSTEROK;
		} else
		    bp->b_flags &= ~B_NEEDCOMMIT;
		bp->b_flags &= ~B_WRITEINPROG;

		/*
		 * For an interrupted write, the buffer is still valid
		 * and the write hasn't been pushed to the server yet,
		 * so we can't set B_ERROR; instead we report the
		 * interruption by setting B_EINTR. For the B_ASYNC
		 * case, B_EINTR is not relevant, so the rpc attempt
		 * is essentially a noop.  For the case of a V3 write
		 * rpc not being committed to stable storage, the block
		 * is still dirty and requires either a commit rpc or
		 * another write rpc with iomode == NFSV3WRITE_FILESYNC
		 * before the block is reused. This is indicated by
		 * setting the B_DELWRI and B_NEEDCOMMIT flags.
		 */
		if (error == EINTR
		    || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
			bp->b_flags &= ~(B_INVAL|B_NOCACHE);
			++numdirtybuffers;
			bp->b_flags |= B_DELWRI;
			reassignbuf(bp, vp);
			if ((bp->b_flags & B_ASYNC) == 0)
			    bp->b_flags |= B_EINTR;
		} else {
			if (error) {
				bp->b_flags |= B_ERROR;
				bp->b_error = np->n_error = error;
				np->n_flag |= NWRITEERR;
			}
			bp->b_dirtyoff = bp->b_dirtyend = 0;
		}
	    } else {
		bp->b_resid = 0;
		biodone(bp);
		return (0);
	    }
	}
	bp->b_resid = uiop->uio_resid;
	if (must_commit)
		nfs_clearcommit(vp->v_mount);
	biodone(bp);
	return (error);
}