nfs_bio.c revision 26409
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 * $Id: nfs_bio.c,v 1.38 1997/05/19 14:36:47 dfr Exp $
 */


#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_extern.h>
#include <vm/vm_prot.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nqnfs.h>
#include <nfs/nfsnode.h>

static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
					struct proc *p));

extern int nfs_numasync;
extern struct nfsstats nfsstats;

/*
 * Vnode op for VM getpages.
 */
int
nfs_getpages(ap)
	struct vop_getpages_args *ap;
{
	int i, bsize;
	vm_object_t obj;
	int pcount;
	struct uio auio;
	struct iovec aiov;
	int error;
	vm_page_t m;

	if (!(ap->a_vp->v_flag & VVMIO)) {
		printf("nfs_getpages: called with non-VMIO vnode??\n");
		return EOPNOTSUPP;
	}

	pcount = round_page(ap->a_count) / PAGE_SIZE;

	obj = ap->a_m[ap->a_reqpage]->object;
	bsize = ap->a_vp->v_mount->mnt_stat.f_iosize;

	for (i = 0; i < pcount; i++) {
		if (i != ap->a_reqpage) {
			vnode_pager_freepage(ap->a_m[i]);
		}
	}
	m = ap->a_m[ap->a_reqpage];

	m->busy++;
	m->flags &= ~PG_BUSY;

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = 0;
	aiov.iov_len = PAGE_SIZE;
	auio.uio_resid = PAGE_SIZE;
	auio.uio_offset = IDX_TO_OFF(m->pindex);
	auio.uio_segflg = UIO_NOCOPY;
	auio.uio_rw = UIO_READ;
	auio.uio_procp = curproc;
	error = nfs_bioread(ap->a_vp, &auio, IO_NODELOCKED, curproc->p_ucred, 1);

	m->flags |= PG_BUSY;
	m->busy--;

	if (error && (auio.uio_resid == PAGE_SIZE))
		return VM_PAGER_ERROR;
	return 0;
}
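
/*
 * Illustrative sketch (not part of the original file): how a single-page,
 * no-copy transfer like the one built above is described by a uio.  The
 * helper name example_page_uio() and its parameters are hypothetical and
 * exist only to highlight the UIO_NOCOPY convention used in nfs_getpages():
 * iov_base is unused (the data lands in the VM page itself) and the file
 * offset is derived from the page index.
 */
#if 0
static void
example_page_uio(vm_page_t pg, struct uio *uiop, struct iovec *iovp)
{
	iovp->iov_base = 0;			/* ignored for UIO_NOCOPY */
	iovp->iov_len = PAGE_SIZE;
	uiop->uio_iov = iovp;
	uiop->uio_iovcnt = 1;
	uiop->uio_offset = IDX_TO_OFF(pg->pindex);	/* byte offset of the page */
	uiop->uio_resid = PAGE_SIZE;
	uiop->uio_segflg = UIO_NOCOPY;		/* no user/kernel copy is done */
	uiop->uio_rw = UIO_READ;
	uiop->uio_procp = curproc;
}
#endif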

/*
 * Vnode op for read using bio
 * Any similarity to readip() is purely coincidental
 */
int
nfs_bioread(vp, uio, ioflag, cred, getpages)
	register struct vnode *vp;
	register struct uio *uio;
	int ioflag;
	struct ucred *cred;
	int getpages;
{
	register struct nfsnode *np = VTONFS(vp);
	register int biosize, diff, i;
	struct buf *bp = 0, *rabp;
	struct vattr vattr;
	struct proc *p;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn, rabn;
	int bufsize;
	int nra, error = 0, n = 0, on = 0, not_readin;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_offset < 0)
		return (EINVAL);
	p = uio->uio_procp;
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	biosize = vp->v_mount->mnt_stat.f_iosize;
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nqnfs, full cache consistency is maintained within the loop.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * NB: This implies that cache data can be read when up to
	 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
	 * attributes this could be forced by setting n_attrstamp to 0 before
	 * the VOP_GETATTR() call.
	 */
	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
		if (np->n_flag & NMODIFIED) {
			if (vp->v_type != VREG) {
				if (vp->v_type != VDIR)
					panic("nfs: bioread, not dir");
				nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			np->n_mtime = vattr.va_mtime.tv_sec;
		} else {
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			if (np->n_mtime != vattr.va_mtime.tv_sec) {
				if (vp->v_type == VDIR)
					nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_mtime = vattr.va_mtime.tv_sec;
			}
		}
	}
	do {

	    /*
	     * Get a valid lease. If cached data is stale, flush it.
	     */
	    if (nmp->nm_flag & NFSMNT_NQNFS) {
		if (NQNFS_CKINVALID(vp, np, ND_READ)) {
		    do {
			error = nqnfs_getlease(vp, ND_READ, cred, p);
		    } while (error == NQNFS_EXPIRED);
		    if (error)
			return (error);
		    if (np->n_lrev != np->n_brev ||
			(np->n_flag & NQNFSNONCACHE) ||
			((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
			if (vp->v_type == VDIR)
			    nfs_invaldir(vp);
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
			    return (error);
			np->n_brev = np->n_lrev;
		    }
		} else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
		    nfs_invaldir(vp);
		    error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
		    if (error)
			return (error);
		}
	    }
	    if (np->n_flag & NQNFSNONCACHE) {
		switch (vp->v_type) {
		case VREG:
			return (nfs_readrpc(vp, uio, cred));
		case VLNK:
			return (nfs_readlinkrpc(vp, uio, cred));
		case VDIR:
			break;
		default:
			printf(" NQNFSNONCACHE: type %x unexpected\n",
				vp->v_type);
		}
	    }
	    switch (vp->v_type) {
	    case VREG:
		nfsstats.biocache_reads++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);
		not_readin = 1;

		/*
		 * Start the read ahead(s), as required.
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
		    for (nra = 0; nra < nmp->nm_readahead &&
			(off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
			rabn = lbn + 1 + nra;
			if (!incore(vp, rabn)) {
			    rabp = nfs_getcacheblk(vp, rabn, biosize, p);
			    if (!rabp)
				return (EINTR);
			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
				rabp->b_flags |= (B_READ | B_ASYNC);
				vfs_busy_pages(rabp, 0);
				if (nfs_asyncio(rabp, cred)) {
				    rabp->b_flags |= B_INVAL|B_ERROR;
				    vfs_unbusy_pages(rabp);
				    brelse(rabp);
				}
			    } else
				brelse(rabp);
			}
		    }
		}

		/*
		 * If the block is in the cache and has the required data
		 * in a valid region, just copy it out.
		 * Otherwise, get the block and write back/read in,
		 * as required.
		 */
again:
		bufsize = biosize;
		if ((off_t)(lbn + 1) * biosize > np->n_size &&
		    (off_t)(lbn + 1) * biosize - np->n_size < biosize) {
			bufsize = np->n_size - lbn * biosize;
			bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		}
		bp = nfs_getcacheblk(vp, lbn, bufsize, p);
		if (!bp)
			return (EINTR);
		/*
		 * If we are being called from nfs_getpages, we must
		 * make sure the buffer is a vmio buffer.  The vp will
		 * already be set up for vmio but there may be some old
		 * non-vmio buffers attached to it.
		 */
		if (getpages && !(bp->b_flags & B_VMIO)) {
#ifdef DIAGNOSTIC
			printf("nfs_bioread: non vmio buf found, discarding\n");
#endif
			bp->b_flags |= B_NOCACHE;
			bp->b_flags |= B_INVAFTERWRITE;
			if (bp->b_dirtyend > 0) {
				if ((bp->b_flags & B_DELWRI) == 0)
					panic("nfsbioread");
				if (VOP_BWRITE(bp) == EINTR)
					return (EINTR);
			} else
				brelse(bp);
			goto again;
		}
		if ((bp->b_flags & B_CACHE) == 0) {
			bp->b_flags |= B_READ;
			bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
			not_readin = 0;
			vfs_busy_pages(bp, 0);
			error = nfs_doio(bp, cred, p);
			if (error) {
			    brelse(bp);
			    return (error);
			}
		}
		if (bufsize > on) {
			n = min((unsigned)(bufsize - on), uio->uio_resid);
		} else {
			n = 0;
		}
		diff = np->n_size - uio->uio_offset;
		if (diff < n)
			n = diff;
		if (not_readin && n > 0) {
			if (on < bp->b_validoff || (on + n) > bp->b_validend) {
				bp->b_flags |= B_NOCACHE;
				bp->b_flags |= B_INVAFTERWRITE;
				if (bp->b_dirtyend > 0) {
				    if ((bp->b_flags & B_DELWRI) == 0)
					panic("nfsbioread");
				    if (VOP_BWRITE(bp) == EINTR)
					return (EINTR);
				} else
				    brelse(bp);
				goto again;
			}
		}
		vp->v_lastr = lbn;
		diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
		if (diff < n)
			n = diff;
		break;
	    case VLNK:
		nfsstats.biocache_readlinks++;
		bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p);
		if (!bp)
			return (EINTR);
		if ((bp->b_flags & B_CACHE) == 0) {
			bp->b_flags |= B_READ;
			vfs_busy_pages(bp, 0);
			error = nfs_doio(bp, cred, p);
			if (error) {
				bp->b_flags |= B_ERROR;
				brelse(bp);
				return (error);
			}
		}
		n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
		on = 0;
		break;
	    case VDIR:
		nfsstats.biocache_readdirs++;
		if (np->n_direofoffset
		    && uio->uio_offset >= np->n_direofoffset) {
		    return (0);
		}
		lbn = uio->uio_offset / NFS_DIRBLKSIZ;
		on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
		bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p);
		if (!bp)
		    return (EINTR);
		if ((bp->b_flags & B_CACHE) == 0) {
		    bp->b_flags |= B_READ;
		    vfs_busy_pages(bp, 0);
		    error = nfs_doio(bp, cred, p);
		    if (error) {
			vfs_unbusy_pages(bp);
			brelse(bp);
			while (error == NFSERR_BAD_COOKIE) {
			    nfs_invaldir(vp);
			    error = nfs_vinvalbuf(vp, 0, cred, p, 1);
			    /*
			     * Yuck! The directory has been modified on the
			     * server. The only way to get the block is by
			     * reading from the beginning to get all the
			     * offset cookies.
			     */
			    for (i = 0; i <= lbn && !error; i++) {
				if (np->n_direofoffset
				    && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
				    return (0);
				bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p);
				if (!bp)
				    return (EINTR);
				if ((bp->b_flags & B_DONE) == 0) {
				    bp->b_flags |= B_READ;
				    vfs_busy_pages(bp, 0);
				    error = nfs_doio(bp, cred, p);
				    if (error) {
					vfs_unbusy_pages(bp);
					brelse(bp);
				    } else if (i < lbn)
					brelse(bp);
				}
			    }
			}
			if (error)
			    return (error);
		    }
		}

		/*
		 * If not eof and read aheads are enabled, start one.
		 * (You need the current block first, so that you have the
		 *  directory offset cookie of the next block.)
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
		    (np->n_direofoffset == 0 ||
		    (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
		    !(np->n_flag & NQNFSNONCACHE) &&
		    !incore(vp, lbn + 1)) {
			rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p);
			if (rabp) {
			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
				rabp->b_flags |= (B_READ | B_ASYNC);
				vfs_busy_pages(rabp, 0);
				if (nfs_asyncio(rabp, cred)) {
				    rabp->b_flags |= B_INVAL|B_ERROR;
				    vfs_unbusy_pages(rabp);
				    brelse(rabp);
				}
			    } else {
				brelse(rabp);
			    }
			}
		}
		n = min(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
		break;
	    default:
		printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
		break;
	    }

	    if (n > 0) {
		error = uiomove(bp->b_data + on, (int)n, uio);
	    }
	    switch (vp->v_type) {
	    case VREG:
		break;
	    case VLNK:
		n = 0;
		break;
	    case VDIR:
		if (np->n_flag & NQNFSNONCACHE)
			bp->b_flags |= B_INVAL;
		break;
	    default:
		printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
	    }
	    brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}
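
/*
 * Illustrative sketch (not part of the original file): the block/offset
 * arithmetic nfs_bioread() uses for VREG reads.  A byte offset is split
 * into a logical block number and an offset within that block; the mask
 * form of the modulus is valid because f_iosize is a power of two.  The
 * helper name and its output parameters are hypothetical.
 */
#if 0
static void
example_split_offset(off_t offset, int biosize, daddr_t *lbnp, int *onp)
{
	*lbnp = offset / biosize;	/* which logical block */
	*onp = offset & (biosize - 1);	/* byte offset within that block; */
					/* equivalent to offset % biosize */
}
#endif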

/*
 * Vnode op for write using bio
 */
int
nfs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int  a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	register int biosize;
	register struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	register struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	register struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn;
	int bufsize;
	int n, on, error = 0, iomode, must_commit;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
				return (error);
		}
		if (ioflag & IO_APPEND) {
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			uio->uio_offset = np->n_size;
		}
	}
	if (uio->uio_offset < 0)
		return (EINVAL);
	if (uio->uio_resid == 0)
		return (0);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	if (p && uio->uio_offset + uio->uio_resid >
	      p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		psignal(p, SIGXFSZ);
		return (EFBIG);
	}
	/*
	 * I use nm_rsize, not nm_wsize so that all buffer cache blocks
	 * will be the same size within a filesystem. nfs_writerpc will
	 * still use nm_wsize when sizing the rpc's.
	 */
	biosize = vp->v_mount->mnt_stat.f_iosize;
	do {
		/*
		 * Check for a valid write lease.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error)
				return (error);
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
			}
		}
		if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
		    iomode = NFSV3WRITE_FILESYNC;
		    error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
		    if (must_commit)
			nfs_clearcommit(vp->v_mount);
		    return (error);
		}
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);
		n = min((unsigned)(biosize - on), uio->uio_resid);
again:
		if (uio->uio_offset + n > np->n_size) {
			np->n_size = uio->uio_offset + n;
			np->n_flag |= NMODIFIED;
			vnode_pager_setsize(vp, (u_long)np->n_size);
		}
		bufsize = biosize;
		if ((lbn + 1) * biosize > np->n_size) {
			bufsize = np->n_size - lbn * biosize;
			bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		}
		bp = nfs_getcacheblk(vp, lbn, bufsize, p);
		if (!bp)
			return (EINTR);
		if (bp->b_wcred == NOCRED) {
			crhold(cred);
			bp->b_wcred = cred;
		}
		np->n_flag |= NMODIFIED;

		if ((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend > np->n_size) {
			bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);
		}

		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise force a write rpc of the old dirty area.
		 */
		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			bp->b_proc = p;
			if (VOP_BWRITE(bp) == EINTR)
				return (EINTR);
			goto again;
		}

		/*
		 * Check for a valid write lease and get one as required,
		 * in case getblk() and/or bwrite() delayed us.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error) {
				brelse(bp);
				return (error);
			}
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				brelse(bp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
				goto again;
			}
		}
		error = uiomove((char *)bp->b_data + on, n, uio);
		if (error) {
			bp->b_flags |= B_ERROR;
			brelse(bp);
			return (error);
		}
		if (bp->b_dirtyend > 0) {
			bp->b_dirtyoff = min(on, bp->b_dirtyoff);
			bp->b_dirtyend = max((on + n), bp->b_dirtyend);
		} else {
			bp->b_dirtyoff = on;
			bp->b_dirtyend = on + n;
		}
		if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
		    bp->b_validoff > bp->b_dirtyend) {
			bp->b_validoff = bp->b_dirtyoff;
			bp->b_validend = bp->b_dirtyend;
		} else {
			bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
			bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
		}

		/*
		 * Since this block is being modified, it must be written
		 * again and not just committed.
		 */
		bp->b_flags &= ~B_NEEDCOMMIT;

		/*
		 * If the lease is non-cacheable or IO_SYNC, do bwrite().
		 */
		if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
			bp->b_proc = p;
			error = VOP_BWRITE(bp);
			if (error)
				return (error);
			if (np->n_flag & NQNFSNONCACHE) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
		} else if ((n + on) == biosize &&
			(nmp->nm_flag & NFSMNT_NQNFS) == 0) {
			bp->b_proc = (struct proc *)0;
			bp->b_flags |= B_ASYNC;
			(void)nfs_writebp(bp, 0);
		} else
			bdwrite(bp);
	} while (uio->uio_resid > 0 && n > 0);
	return (0);
}
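
/*
 * Illustrative sketch (not part of the original file): the contiguity test
 * nfs_write() applies before extending a buffer's dirty region.  A new
 * write at [on, on + n) may be merged only if it touches or overlaps the
 * existing dirty range [b_dirtyoff, b_dirtyend); otherwise the old dirty
 * area must be pushed first, since a single (offset, length) pair cannot
 * describe two disjoint dirty extents.  The helper name is hypothetical.
 */
#if 0
static int
example_dirty_is_contiguous(struct buf *bp, int on, int n)
{
	if (bp->b_dirtyend <= 0)
		return (1);	/* nothing dirty yet, any write merges */
	/* disjoint if the new range starts past the end or ends before the start */
	if (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)
		return (0);
	return (1);
}
#endif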

/*
 * Get an nfs cache block.
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 */
static struct buf *
nfs_getcacheblk(vp, bn, size, p)
	struct vnode *vp;
	daddr_t bn;
	int size;
	struct proc *p;
{
	register struct buf *bp;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int biosize = vp->v_mount->mnt_stat.f_iosize;

	if (nmp->nm_flag & NFSMNT_INT) {
		bp = getblk(vp, bn, size, PCATCH, 0);
		while (bp == (struct buf *)0) {
			if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
				return ((struct buf *)0);
			bp = getblk(vp, bn, size, 0, 2 * hz);
		}
	} else
		bp = getblk(vp, bn, size, 0, 0);

	if (vp->v_type == VREG)
		bp->b_blkno = (bn * biosize) / DEV_BSIZE;

	return (bp);
}
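
/*
 * Illustrative sketch (not part of the original file): the calling
 * convention every user of nfs_getcacheblk() in this file follows.  On an
 * interruptible mount a NULL return means a signal arrived while waiting
 * for the buffer, and the caller backs out with EINTR.  The surrounding
 * helper function is hypothetical.
 */
#if 0
static int
example_read_block(struct vnode *vp, daddr_t lbn, int size, struct proc *p)
{
	struct buf *bp;

	bp = nfs_getcacheblk(vp, lbn, size, p);
	if (!bp)
		return (EINTR);	/* interrupted on an intr mount */
	/* ... use bp->b_data ... */
	brelse(bp);
	return (0);
}
#endif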

/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int intrflg;
{
	register struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}
	/*
	 * First wait for any other process doing a flush to complete.
	 */
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
			slptimeo);
		if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
			return (EINTR);
	}

	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
	while (error) {
		if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) {
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup((caddr_t)&np->n_flag);
			}
			return (EINTR);
		}
		error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
	}
	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
	return (0);
}
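
/*
 * Illustrative sketch (not part of the original file): the flag-based
 * sleep/wakeup gate nfs_vinvalbuf() builds around the flush.  One process
 * sets NFLUSHINPROG; any other arrival sets NFLUSHWANT and sleeps on the
 * flag word until the owner clears the gate and calls wakeup().  The
 * helper names are hypothetical, and the error/signal handling of the
 * real function is omitted for brevity.
 */
#if 0
static void
example_flush_enter(struct nfsnode *np)
{
	while (np->n_flag & NFLUSHINPROG) {	/* someone else is flushing */
		np->n_flag |= NFLUSHWANT;	/* ask to be woken */
		(void)tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval", 0);
	}
	np->n_flag |= NFLUSHINPROG;		/* we own the gate now */
}

static void
example_flush_exit(struct nfsnode *np)
{
	np->n_flag &= ~NFLUSHINPROG;
	if (np->n_flag & NFLUSHWANT) {		/* wake any waiters */
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
}
#endif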

/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 */
int
nfs_asyncio(bp, cred)
	register struct buf *bp;
	struct ucred *cred;
{
	struct nfsmount *nmp;
	int i;
	int gotiod;
	int slpflag = 0;
	int slptimeo = 0;
	int error;

	if (nfs_numasync == 0)
		return (EIO);

	nmp = VFSTONFS(bp->b_vp->v_mount);
again:
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	gotiod = FALSE;

	/*
	 * Find a free iod to process this request.
	 */
	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
		if (nfs_iodwant[i]) {
			/*
			 * Found one, so wake it up and tell it which
			 * mount to process.
			 */
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waking iod %d for mount %p\n",
				 i, nmp));
			nfs_iodwant[i] = (struct proc *)0;
			nfs_iodmount[i] = nmp;
			nmp->nm_bufqiods++;
			wakeup((caddr_t)&nfs_iodwant[i]);
			gotiod = TRUE;
			break;
		}

	/*
	 * If none are free, we may already have an iod working on this mount
	 * point.  If so, it will process our request.
	 */
	if (!gotiod) {
		if (nmp->nm_bufqiods > 0) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: %d iods are already processing mount %p\n",
				 nmp->nm_bufqiods, nmp));
			gotiod = TRUE;
		}
	}

	/*
	 * If we have an iod which can process the request, then queue
	 * the buffer.
	 */
	if (gotiod) {
		/*
		 * Ensure that the queue never grows too large.
		 */
		while (nmp->nm_bufqlen >= 2 * nfs_numasync) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
			nmp->nm_bufqwant = TRUE;
			error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
				       "nfsaio", slptimeo);
			if (error) {
				if (nfs_sigintr(nmp, NULL, bp->b_proc))
					return (EINTR);
				if (slpflag == PCATCH) {
					slpflag = 0;
					slptimeo = 2 * hz;
				}
			}
			/*
			 * We might have lost our iod while sleeping,
			 * so check and loop if necessary.
			 */
			if (nmp->nm_bufqiods == 0) {
				NFS_DPF(ASYNCIO,
					("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
				goto again;
			}
		}

		if (bp->b_flags & B_READ) {
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
		} else {
			bp->b_flags |= B_WRITEINPROG;
			if (bp->b_wcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_wcred = cred;
			}
		}

		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
		nmp->nm_bufqlen++;
		return (0);
	}

	/*
	 * All the iods are busy on other mounts, so return EIO to
	 * force the caller to process the i/o synchronously.
	 */
	NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
	return (EIO);
}
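
/*
 * Illustrative sketch (not part of the original file): one way a caller
 * can react to nfs_asyncio() failing.  In this file the read-ahead paths
 * simply discard the buffer; a caller that must complete the transfer
 * could instead fall back to performing the i/o synchronously with
 * nfs_doio(), which is the point of returning EIO above.  The helper
 * name and the fallback policy shown here are hypothetical.
 */
#if 0
static int
example_start_io(struct buf *bp, struct ucred *cred, struct proc *p)
{
	/* Try to hand the buffer to an nfsiod first. */
	if (nfs_asyncio(bp, cred) == 0)
		return (0);		/* an iod will run nfs_doio() for us */
	/* No iods available: do the i/o synchronously ourselves. */
	return (nfs_doio(bp, cred, p));
}
#endif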

/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(bp, cr, p)
	register struct buf *bp;
	struct ucred *cr;
	struct proc *p;
{
	register struct uio *uiop;
	register struct vnode *vp;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, diff, len, iomode, must_commit = 0;
	struct uio uio;
	struct iovec io;

	vp = bp->b_vp;
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_procp = p;

	/*
	 * Historically, paging was done with physio, but no more.
	 */
	if (bp->b_flags & B_PHYS) {
	    /*
	     * ...though reading /dev/drum still gets us here.
	     */
	    io.iov_len = uiop->uio_resid = bp->b_bcount;
	    /* mapping was done by vmapbuf() */
	    io.iov_base = bp->b_data;
	    uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
	    if (bp->b_flags & B_READ) {
		uiop->uio_rw = UIO_READ;
		nfsstats.read_physios++;
		error = nfs_readrpc(vp, uiop, cr);
	    } else {
		int com;

		iomode = NFSV3WRITE_DATASYNC;
		uiop->uio_rw = UIO_WRITE;
		nfsstats.write_physios++;
		error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
	    }
	    if (error) {
		bp->b_flags |= B_ERROR;
		bp->b_error = error;
	    }
	} else if (bp->b_flags & B_READ) {
	    io.iov_len = uiop->uio_resid = bp->b_bcount;
	    io.iov_base = bp->b_data;
	    uiop->uio_rw = UIO_READ;
	    switch (vp->v_type) {
	    case VREG:
		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
		nfsstats.read_bios++;
		error = nfs_readrpc(vp, uiop, cr);
		if (!error) {
		    bp->b_validoff = 0;
		    if (uiop->uio_resid) {
			/*
			 * If len > 0, there is a hole in the file and
			 * no writes after the hole have been pushed to
			 * the server yet.
			 * Just zero fill the rest of the valid area.
			 */
			diff = bp->b_bcount - uiop->uio_resid;
			len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE
				+ diff);
			if (len > 0) {
			    len = min(len, uiop->uio_resid);
			    bzero((char *)bp->b_data + diff, len);
			    bp->b_validend = diff + len;
			} else
			    bp->b_validend = diff;
		    } else
			bp->b_validend = bp->b_bcount;
		}
		if (p && (vp->v_flag & VTEXT) &&
			(((nmp->nm_flag & NFSMNT_NQNFS) &&
			  NQNFS_CKINVALID(vp, np, ND_READ) &&
			  np->n_lrev != np->n_brev) ||
			 (!(nmp->nm_flag & NFSMNT_NQNFS) &&
			  np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
			uprintf("Process killed due to text file modification\n");
			psignal(p, SIGKILL);
#ifdef __NetBSD__
			p->p_holdcnt++;
#else
			p->p_flag |= P_NOSWAP;
#endif
		}
		break;
	    case VLNK:
		uiop->uio_offset = (off_t)0;
		nfsstats.readlink_bios++;
		error = nfs_readlinkrpc(vp, uiop, cr);
		break;
	    case VDIR:
		nfsstats.readdir_bios++;
		uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
		if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
			error = nfs_readdirplusrpc(vp, uiop, cr);
			if (error == NFSERR_NOTSUPP)
				nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
		}
		if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
			error = nfs_readdirrpc(vp, uiop, cr);
		break;
	    default:
		printf("nfs_doio: type %x unexpected\n", vp->v_type);
		break;
	    }
	    if (error) {
		bp->b_flags |= B_ERROR;
		bp->b_error = error;
	    }
	} else {
	    if (((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend) > np->n_size)
		bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);

	    if (bp->b_dirtyend > bp->b_dirtyoff) {
		io.iov_len = uiop->uio_resid = bp->b_dirtyend
		    - bp->b_dirtyoff;
		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE
		    + bp->b_dirtyoff;
		io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
		uiop->uio_rw = UIO_WRITE;
		nfsstats.write_bios++;
		if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC)
		    iomode = NFSV3WRITE_UNSTABLE;
		else
		    iomode = NFSV3WRITE_FILESYNC;
		bp->b_flags |= B_WRITEINPROG;
		error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
		if (!error && iomode == NFSV3WRITE_UNSTABLE) {
		    bp->b_flags |= B_NEEDCOMMIT;
		    if (bp->b_dirtyoff == 0
			&& bp->b_dirtyend == bp->b_bufsize)
			bp->b_flags |= B_CLUSTEROK;
		} else
		    bp->b_flags &= ~B_NEEDCOMMIT;
		bp->b_flags &= ~B_WRITEINPROG;

		/*
		 * For an interrupted write, the buffer is still valid
		 * and the write hasn't been pushed to the server yet,
		 * so we can't set B_ERROR and report the interruption
		 * by setting B_EINTR. For the B_ASYNC case, B_EINTR
		 * is not relevant, so the rpc attempt is essentially
		 * a noop.  For the case of a V3 write rpc not being
		 * committed to stable storage, the block is still
		 * dirty and requires either a commit rpc or another
		 * write rpc with iomode == NFSV3WRITE_FILESYNC before
		 * the block is reused. This is indicated by setting
		 * the B_DELWRI and B_NEEDCOMMIT flags.
		 */
		if (error == EINTR
		    || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
			bp->b_flags &= ~(B_INVAL|B_NOCACHE);
			bp->b_flags |= B_DELWRI;

			/*
			 * Since for the B_ASYNC case, nfs_bwrite() has
			 * reassigned the buffer to the clean list, we have
			 * to reassign it back to the dirty one. Ugh.
			 */
			if (bp->b_flags & B_ASYNC)
				reassignbuf(bp, vp);
			else
				bp->b_flags |= B_EINTR;
		} else {
			if (error) {
				bp->b_flags |= B_ERROR;
				bp->b_error = np->n_error = error;
				np->n_flag |= NWRITEERR;
			}
			bp->b_dirtyoff = bp->b_dirtyend = 0;
		}
	    } else {
		bp->b_resid = 0;
		biodone(bp);
		return (0);
	    }
	}
	bp->b_resid = uiop->uio_resid;
	if (must_commit)
		nfs_clearcommit(vp->v_mount);
	biodone(bp);
	return (error);
}
1104