/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.5 (Berkeley) 1/4/94
 * $Id: nfs_bio.c,v 1.20 1995/12/07 12:47:23 davidg Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_extern.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nqnfs.h>
#include <nfs/nfsnode.h>

static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
					struct proc *p));

extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON];
extern int nfs_numasync;
extern struct nfsstats nfsstats;

/*
 * Ifdefs for FreeBSD-current's merged VM/buffer cache. It is unfortunate
 * that this isn't done inside getblk() and brelse() so these calls
 * wouldn't need to be here.
 */
#ifdef B_VMIO
#define vnode_pager_uncache(vp)
#else
#define vfs_busy_pages(bp, f)
#define vfs_unbusy_pages(bp)
#define vfs_dirty_pages(bp)
#endif

/*
 * Vnode op for read using bio
 * Any similarity to readip() is purely coincidental
 */
int
nfs_bioread(vp, uio, ioflag, cred)
	register struct vnode *vp;
	register struct uio *uio;
	int ioflag;
	struct ucred *cred;
{
	register struct nfsnode *np = VTONFS(vp);
	register int biosize, diff, i;
	struct buf *bp = 0, *rabp;
	struct vattr vattr;
	struct proc *p;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn, rabn;
	int bufsize;
	int nra, error = 0, n = 0, on = 0, not_readin;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_offset < 0)
		return (EINVAL);
	p = uio->uio_procp;
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	biosize = vp->v_mount->mnt_stat.f_iosize;
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nqnfs, full cache consistency is maintained within the loop.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * NB: This implies that cache data can be read when up to
	 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
	 * attributes this could be forced by setting n_attrstamp to 0 before
	 * the VOP_GETATTR() call.
	 */
	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
		if (np->n_flag & NMODIFIED) {
			if (vp->v_type != VREG) {
				if (vp->v_type != VDIR)
					panic("nfs: bioread, not dir");
				nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			np->n_mtime = vattr.va_mtime.ts_sec;
		} else {
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			if (np->n_mtime != vattr.va_mtime.ts_sec) {
				if (vp->v_type == VDIR)
					nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_mtime = vattr.va_mtime.ts_sec;
			}
		}
	}
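	/*
	 * Illustrative timeline for the model above: if another client
	 * writes the file, a later VOP_GETATTR() (at most NFS_ATTRTIMEO
	 * seconds after the change) returns a new va_mtime, the n_mtime
	 * comparison fails, the cached buffers are flushed, and
	 * subsequent reads fetch fresh data from the server.
	 */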
	do {

	    /*
	     * Get a valid lease. If cached data is stale, flush it.
	     */
	    if (nmp->nm_flag & NFSMNT_NQNFS) {
		if (NQNFS_CKINVALID(vp, np, ND_READ)) {
		    do {
			error = nqnfs_getlease(vp, ND_READ, cred, p);
		    } while (error == NQNFS_EXPIRED);
		    if (error)
			return (error);
		    if (np->n_lrev != np->n_brev ||
			(np->n_flag & NQNFSNONCACHE) ||
			((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
			if (vp->v_type == VDIR)
			    nfs_invaldir(vp);
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
			    return (error);
			np->n_brev = np->n_lrev;
		    }
		} else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
		    nfs_invaldir(vp);
		    error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
		    if (error)
			return (error);
		}
	    }
	    if (np->n_flag & NQNFSNONCACHE) {
		switch (vp->v_type) {
		case VREG:
			return (nfs_readrpc(vp, uio, cred));
		case VLNK:
			return (nfs_readlinkrpc(vp, uio, cred));
		case VDIR:
			break;
		default:
			printf(" NQNFSNONCACHE: type %x unexpected\n",
				vp->v_type);
		}
	    }
	    switch (vp->v_type) {
	    case VREG:
		nfsstats.biocache_reads++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);
		not_readin = 1;
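		/*
		 * lbn is the logical block containing uio_offset and on
		 * is the offset within it; e.g., with an 8K biosize, an
		 * offset of 12288 yields lbn 1, on 4096.
		 */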

		/*
		 * Start the read ahead(s), as required.
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
		    for (nra = 0; nra < nmp->nm_readahead &&
			(lbn + 1 + nra) * biosize < np->n_size; nra++) {
			rabn = lbn + 1 + nra;
			if (!incore(vp, rabn)) {
			    rabp = nfs_getcacheblk(vp, rabn, biosize, p);
			    if (!rabp)
				return (EINTR);
			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
				rabp->b_flags |= (B_READ | B_ASYNC);
				vfs_busy_pages(rabp, 0);
				if (nfs_asyncio(rabp, cred)) {
				    rabp->b_flags |= B_INVAL|B_ERROR;
				    vfs_unbusy_pages(rabp);
				    brelse(rabp);
				}
			    } else {
				brelse(rabp);
			    }
			}
		    }
		}

		/*
		 * If the block is in the cache and has the required data
		 * in a valid region, just copy it out.
		 * Otherwise, get the block and write back/read in,
		 * as required.
		 */
again:
		bufsize = biosize;
		if ((lbn + 1) * biosize > np->n_size) {
			bufsize = np->n_size - lbn * biosize;
			bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		}
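		/*
		 * Example: with an 8K biosize and n_size 10000, the last
		 * block (lbn 1) gets bufsize 10000 - 8192 = 1808, rounded
		 * up to the next DEV_BSIZE multiple, 2048.
		 */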
		bp = nfs_getcacheblk(vp, lbn, bufsize, p);
		if (!bp)
			return (EINTR);
		if ((bp->b_flags & B_CACHE) == 0) {
			bp->b_flags |= B_READ;
			not_readin = 0;
			vfs_busy_pages(bp, 0);
			error = nfs_doio(bp, cred, p);
			if (error) {
			    brelse(bp);
			    return (error);
			}
		}
		if (bufsize > on) {
			n = min((unsigned)(bufsize - on), uio->uio_resid);
		} else {
			n = 0;
		}
		diff = np->n_size - uio->uio_offset;
		if (diff < n)
			n = diff;
		if (not_readin && n > 0) {
			if (on < bp->b_validoff || (on + n) > bp->b_validend) {
				bp->b_flags |= B_NOCACHE;
				if (bp->b_dirtyend > 0) {
				    if ((bp->b_flags & B_DELWRI) == 0)
					panic("nfsbioread");
				    if (VOP_BWRITE(bp) == EINTR)
					return (EINTR);
				} else
				    brelse(bp);
				goto again;
			}
		}
		vp->v_lastr = lbn;
		diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
		if (diff < n)
			n = diff;
		break;
	    case VLNK:
		nfsstats.biocache_readlinks++;
		bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p);
		if (!bp)
			return (EINTR);
		if ((bp->b_flags & B_CACHE) == 0) {
			bp->b_flags |= B_READ;
			vfs_busy_pages(bp, 0);
			error = nfs_doio(bp, cred, p);
			if (error) {
				bp->b_flags |= B_ERROR;
				brelse(bp);
				return (error);
			}
		}
		n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
		on = 0;
		break;
	    case VDIR:
		nfsstats.biocache_readdirs++;
		lbn = uio->uio_offset / NFS_DIRBLKSIZ;
		on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
		bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p);
		if (!bp)
		    return (EINTR);
		if ((bp->b_flags & B_CACHE) == 0) {
		    bp->b_flags |= B_READ;
		    vfs_busy_pages(bp, 0);
		    error = nfs_doio(bp, cred, p);
		    if (error) {
			brelse(bp);
			while (error == NFSERR_BAD_COOKIE) {
			    nfs_invaldir(vp);
			    error = nfs_vinvalbuf(vp, 0, cred, p, 1);
			    /*
			     * Yuck! The directory has been modified on the
			     * server. The only way to get the block is by
			     * reading from the beginning to get all the
			     * offset cookies.
			     */
			    for (i = 0; i <= lbn && !error; i++) {
				bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p);
				if (!bp)
				    return (EINTR);
				if ((bp->b_flags & B_DONE) == 0) {
				    bp->b_flags |= B_READ;
				    vfs_busy_pages(bp, 0);
				    error = nfs_doio(bp, cred, p);
				    if (error)
					brelse(bp);
				}
			    }
			}
			if (error)
			    return (error);
		    }
		}

		/*
		 * If not eof and read aheads are enabled, start one.
		 * (You need the current block first, so that you have the
		 *  directory offset cookie of the next block.)
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
		    (np->n_direofoffset == 0 ||
		    (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
		    !(np->n_flag & NQNFSNONCACHE) &&
		    !incore(vp, lbn + 1)) {
			rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p);
			if (rabp) {
			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
				rabp->b_flags |= (B_READ | B_ASYNC);
				vfs_busy_pages(rabp, 0);
				if (nfs_asyncio(rabp, cred)) {
				    rabp->b_flags |= B_INVAL|B_ERROR;
				    vfs_unbusy_pages(rabp);
				    brelse(rabp);
				}
			    } else {
				brelse(rabp);
			    }
			}
		}
		n = min(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
		break;
	    default:
		printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
		break;
	    }

	    if (n > 0) {
		error = uiomove(bp->b_data + on, (int)n, uio);
	    }
	    switch (vp->v_type) {
	    case VREG:
		break;
	    case VLNK:
		n = 0;
		break;
	    case VDIR:
		if (np->n_flag & NQNFSNONCACHE)
			bp->b_flags |= B_INVAL;
		break;
	    default:
		printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
	    }
	    brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}

/*
 * Vnode op for write using bio
 */
int
nfs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int  a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	register int biosize;
	register struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	register struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	register struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn;
	int bufsize;
	int n, on, error = 0, iomode, must_commit;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
				return (error);
		}
		if (ioflag & IO_APPEND) {
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			uio->uio_offset = np->n_size;
		}
	}
	if (uio->uio_offset < 0)
		return (EINVAL);
	if (uio->uio_resid == 0)
		return (0);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	if (p && uio->uio_offset + uio->uio_resid >
	      p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		psignal(p, SIGXFSZ);
		return (EFBIG);
	}
	/*
	 * We use the mount point's f_iosize (rather than nm_wsize) so
	 * that all buffer cache blocks will be the same size within a
	 * filesystem. nfs_writerpc() will still use nm_wsize when
	 * sizing the rpc's.
	 */
	biosize = vp->v_mount->mnt_stat.f_iosize;
	do {

		/*
		 * XXX make sure we aren't cached in the VM page cache
		 */
		/*
		 * Check for a valid write lease.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error)
				return (error);
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
			}
		}
		if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
		    iomode = NFSV3WRITE_FILESYNC;
		    error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
		    if (must_commit)
			nfs_clearcommit(vp->v_mount);
		    return (error);
		}
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize-1);
		n = min((unsigned)(biosize - on), uio->uio_resid);
again:
		if (uio->uio_offset + n > np->n_size) {
			np->n_size = uio->uio_offset + n;
			vnode_pager_setsize(vp, (u_long)np->n_size);
		}
		bufsize = biosize;
		if ((lbn + 1) * biosize > np->n_size) {
			bufsize = np->n_size - lbn * biosize;
			bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		}
		bp = nfs_getcacheblk(vp, lbn, bufsize, p);
		if (!bp)
			return (EINTR);
		if (bp->b_wcred == NOCRED) {
			crhold(cred);
			bp->b_wcred = cred;
		}
		np->n_flag |= NMODIFIED;

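		/*
		 * Clamp the dirty region at the end of file; bytes beyond
		 * n_size must never be pushed to the server.
		 */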
		if ((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend > np->n_size) {
			bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);
		}

		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise force a write rpc of the old dirty area.
		 */
		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			bp->b_proc = p;
			if (VOP_BWRITE(bp) == EINTR)
				return (EINTR);
			goto again;
		}
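		/*
		 * Example: with bytes [0, 512) already dirty, a write at
		 * on == 1024 would leave a gap of indeterminate data at
		 * [512, 1024); since only one dirty range is tracked per
		 * buffer, the old dirty area is pushed first.
		 */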

		/*
		 * Check for a valid write lease and get one as required,
		 * in case getblk() and/or bwrite() delayed us.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error) {
				brelse(bp);
				return (error);
			}
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				brelse(bp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
				goto again;
			}
		}
		error = uiomove((char *)bp->b_data + on, n, uio);
		if (error) {
			bp->b_flags |= B_ERROR;
			brelse(bp);
			return (error);
		}
		if (bp->b_dirtyend > 0) {
			bp->b_dirtyoff = min(on, bp->b_dirtyoff);
			bp->b_dirtyend = max((on + n), bp->b_dirtyend);
		} else {
			bp->b_dirtyoff = on;
			bp->b_dirtyend = on + n;
		}
		if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
		    bp->b_validoff > bp->b_dirtyend) {
			bp->b_validoff = bp->b_dirtyoff;
			bp->b_validend = bp->b_dirtyend;
		} else {
			bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
			bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
		}
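		/*
		 * Note that b_validoff/b_validend can describe only one
		 * contiguous range, so a valid region disjoint from the
		 * new dirty bytes is discarded above rather than merged.
		 */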
		/*
		 * If the lease is non-cachable or IO_SYNC is set, do bwrite().
		 */
		if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
			bp->b_proc = p;
			error = VOP_BWRITE(bp);
			if (error)
				return (error);
			if (np->n_flag & NQNFSNONCACHE) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
		} else if ((n + on) == biosize &&
			(nmp->nm_flag & NFSMNT_NQNFS) == 0) {
			bp->b_proc = (struct proc *)0;
			bp->b_flags |= B_ASYNC;
			(void)nfs_writebp(bp, 0);
		} else
			bdwrite(bp);
	} while (uio->uio_resid > 0 && n > 0);
	return (0);
}

/*
 * Get an nfs cache block.
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 */
static struct buf *
nfs_getcacheblk(vp, bn, size, p)
	struct vnode *vp;
	daddr_t bn;
	int size;
	struct proc *p;
{
	register struct buf *bp;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int biosize = vp->v_mount->mnt_stat.f_iosize;

	if (nmp->nm_flag & NFSMNT_INT) {
		bp = getblk(vp, bn, size, PCATCH, 0);
		while (bp == (struct buf *)0) {
			if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
				return ((struct buf *)0);
			bp = getblk(vp, bn, size, 0, 2 * hz);
		}
	} else
		bp = getblk(vp, bn, size, 0, 0);

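	/*
	 * For regular files, record the buffer's position in DEV_BSIZE
	 * units so that nfs_doio() can compute the file offset from
	 * b_blkno; e.g., lbn 3 with an 8K biosize maps to b_blkno 48.
	 */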
	if (vp->v_type == VREG)
		bp->b_blkno = (bn * biosize) / DEV_BSIZE;

	return (bp);
}

/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int intrflg;
{
	register struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}
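	/*
	 * On interruptible mounts the sleeps below use PCATCH and a two
	 * second timeout, so a signal can break a process out of a flush
	 * that is hung on a dead server.
	 */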
	/*
	 * First wait for any other process doing a flush to complete.
	 */
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
			slptimeo);
		if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
			return (EINTR);
	}

	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
	while (error) {
		if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) {
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup((caddr_t)&np->n_flag);
			}
			return (EINTR);
		}
		error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
	}
	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
	return (0);
}

/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 */
int
nfs_asyncio(bp, cred)
	register struct buf *bp;
	struct ucred *cred;
{
	register int i;

	if (nfs_numasync == 0)
		return (EIO);
	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
	    if (nfs_iodwant[i]) {
		if (bp->b_flags & B_READ) {
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
		} else {
			bp->b_flags |= B_WRITEINPROG;
			if (bp->b_wcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_wcred = cred;
			}
		}

		TAILQ_INSERT_TAIL(&nfs_bufq, bp, b_freelist);
		nfs_iodwant[i] = (struct proc *)0;
		wakeup((caddr_t)&nfs_iodwant[i]);
		return (0);
	    }
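	/*
	 * The hand-off above queues the buffer on nfs_bufq, clears the
	 * idle nfsiod's wait slot so it cannot be picked twice, and
	 * wakes the daemon, which dequeues the buffer and performs
	 * nfs_doio() on the caller's behalf.
	 */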

	/*
	 * If it is a read, or a write already marked B_WRITEINPROG or
	 * B_NOCACHE, return EIO so the caller will fall back to
	 * nfs_doio() and do the I/O synchronously.
	 */
	if (bp->b_flags & (B_READ | B_WRITEINPROG | B_NOCACHE))
		return (EIO);

	/*
	 * Just turn the async write into a delayed write, instead of
	 * doing it synchronously. Hopefully, at least one of the nfsiods
	 * is currently doing a write for this file and will pick up the
	 * delayed writes before going back to sleep.
	 */
	bp->b_flags |= B_DELWRI;
	reassignbuf(bp, bp->b_vp);
	biodone(bp);
	return (0);
}

/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(bp, cr, p)
	register struct buf *bp;
	struct ucred *cr;
	struct proc *p;
{
	register struct uio *uiop;
	register struct vnode *vp;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, diff, len, iomode, must_commit = 0;
	struct uio uio;
	struct iovec io;

	vp = bp->b_vp;
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_procp = p;

	/*
	 * Historically, paging was done with physio, but no more.
	 */
	if (bp->b_flags & B_PHYS) {
	    /*
	     * ...though reading /dev/drum still gets us here.
	     */
	    io.iov_len = uiop->uio_resid = bp->b_bcount;
	    /* mapping was done by vmapbuf() */
	    io.iov_base = bp->b_data;
	    uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
	    if (bp->b_flags & B_READ) {
		uiop->uio_rw = UIO_READ;
		nfsstats.read_physios++;
		error = nfs_readrpc(vp, uiop, cr);
	    } else {
		int com;

		iomode = NFSV3WRITE_DATASYNC;
		uiop->uio_rw = UIO_WRITE;
		nfsstats.write_physios++;
		error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
	    }
	    if (error) {
		bp->b_flags |= B_ERROR;
		bp->b_error = error;
	    }
	} else if (bp->b_flags & B_READ) {
	    io.iov_len = uiop->uio_resid = bp->b_bcount;
	    io.iov_base = bp->b_data;
	    uiop->uio_rw = UIO_READ;
	    switch (vp->v_type) {
	    case VREG:
		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
		nfsstats.read_bios++;
		error = nfs_readrpc(vp, uiop, cr);
		if (!error) {
		    bp->b_validoff = 0;
		    if (uiop->uio_resid) {
			/*
			 * If len > 0, there is a hole in the file and
			 * no writes after the hole have been pushed to
			 * the server yet.
			 * Just zero fill the rest of the valid area.
			 */
			diff = bp->b_bcount - uiop->uio_resid;
			len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE
				+ diff);
			if (len > 0) {
			    len = min(len, uiop->uio_resid);
			    bzero((char *)bp->b_data + diff, len);
			    bp->b_validend = diff + len;
			} else
			    bp->b_validend = diff;
		    } else
			bp->b_validend = bp->b_bcount;
		}
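		/*
		 * Worked example of the short read case above: with
		 * b_bcount 8192 and 4096 bytes returned, diff is 4096;
		 * if the file ends 6144 bytes into this block, len is
		 * 2048, so bytes [4096, 6144) are zeroed and b_validend
		 * becomes 6144.
		 */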
		if (p && (vp->v_flag & VTEXT) &&
			(((nmp->nm_flag & NFSMNT_NQNFS) &&
			  NQNFS_CKINVALID(vp, np, ND_READ) &&
			  np->n_lrev != np->n_brev) ||
			 (!(nmp->nm_flag & NFSMNT_NQNFS) &&
			  np->n_mtime != np->n_vattr.va_mtime.ts_sec))) {
			uprintf("Process killed due to text file modification\n");
			psignal(p, SIGKILL);
#ifdef __NetBSD__
			p->p_holdcnt++;
#else
			p->p_flag |= P_NOSWAP;
#endif
		}
		break;
	    case VLNK:
		uiop->uio_offset = (off_t)0;
		nfsstats.readlink_bios++;
		error = nfs_readlinkrpc(vp, uiop, cr);
		break;
	    case VDIR:
		nfsstats.readdir_bios++;
		uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
		if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
			error = nfs_readdirplusrpc(vp, uiop, cr);
			if (error == NFSERR_NOTSUPP)
				nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
		}
		if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
			error = nfs_readdirrpc(vp, uiop, cr);
		break;
	    default:
		printf("nfs_doio: type %x unexpected\n", vp->v_type);
		break;
	    }
	    if (error) {
		bp->b_flags |= B_ERROR;
		bp->b_error = error;
	    }
	} else {
	    if (((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend) > np->n_size)
		bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);

	    if (bp->b_dirtyend > bp->b_dirtyoff) {
		io.iov_len = uiop->uio_resid = bp->b_dirtyend
		    - bp->b_dirtyoff;
		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE
		    + bp->b_dirtyoff;
		io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
		uiop->uio_rw = UIO_WRITE;
		nfsstats.write_bios++;
		if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) == B_ASYNC)
		    iomode = NFSV3WRITE_UNSTABLE;
		else
		    iomode = NFSV3WRITE_FILESYNC;
		bp->b_flags |= B_WRITEINPROG;
		error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
		if (!error && iomode == NFSV3WRITE_UNSTABLE)
		    bp->b_flags |= B_NEEDCOMMIT;
		else
		    bp->b_flags &= ~B_NEEDCOMMIT;
		bp->b_flags &= ~B_WRITEINPROG;

		/*
		 * For an interrupted write, the buffer is still valid
		 * and the write hasn't been pushed to the server yet,
		 * so we can't set B_ERROR; instead, the interruption
		 * is reported by setting B_EINTR. For the B_ASYNC case,
		 * B_EINTR is not relevant, so the rpc attempt is
		 * essentially a noop.  For the case of a V3 write rpc
		 * not being committed to stable storage, the block is
		 * still dirty and requires either a commit rpc or
		 * another write rpc with iomode == NFSV3WRITE_FILESYNC
		 * before the block is reused. This is indicated by
		 * setting the B_DELWRI and B_NEEDCOMMIT flags.
		 */
		if (error == EINTR
		    || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
			bp->b_flags &= ~(B_INVAL|B_NOCACHE);
			bp->b_flags |= B_DELWRI;

			/*
			 * Since for the B_ASYNC case, nfs_bwrite() has
			 * reassigned the buffer to the clean list, we have
			 * to reassign it back to the dirty one. Ugh.
			 */
			if (bp->b_flags & B_ASYNC)
				reassignbuf(bp, vp);
			else
				bp->b_flags |= B_EINTR;
		} else {
			if (error) {
				bp->b_flags |= B_ERROR;
				bp->b_error = np->n_error = error;
				np->n_flag |= NWRITEERR;
			}
			bp->b_dirtyoff = bp->b_dirtyend = 0;
		}
	    } else {
		bp->b_resid = 0;
		biodone(bp);
		return (0);
	    }
	}
	bp->b_resid = uiop->uio_resid;
	if (must_commit)
		nfs_clearcommit(vp->v_mount);
	biodone(bp);
	return (error);
}