nfs_bio.c revision 11921
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.5 (Berkeley) 1/4/94
 * $Id: nfs_bio.c,v 1.17 1995/08/24 10:17:32 dfr Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>

#include <vm/vm.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nqnfs.h>
#include <nfs/nfsnode.h>

struct buf *nfs_getcacheblk();
extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON];
extern int nfs_numasync;
extern struct nfsstats nfsstats;

/*
 * Ifdefs for FreeBSD-current's merged VM/buffer cache. It is unfortunate
 * that this isn't done inside getblk() and brelse() so these calls
 * wouldn't need to be here.
 */
#ifdef B_VMIO
#define vnode_pager_uncache(vp)
#else
#define vfs_busy_pages(bp, f)
#define vfs_unbusy_pages(bp)
#define vfs_dirty_pages(bp)
#endif

/*
 * Vnode op for read using bio
 * Any similarity to readip() is purely coincidental
 */
int
nfs_bioread(vp, uio, ioflag, cred)
	register struct vnode *vp;
	register struct uio *uio;
	int ioflag;
	struct ucred *cred;
{
	register struct nfsnode *np = VTONFS(vp);
	register int biosize, diff, i;
	struct buf *bp = 0, *rabp;
	struct vattr vattr;
	struct proc *p;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn, rabn;
	int bufsize;
	int nra, error = 0, n = 0, on = 0, not_readin;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_offset < 0)
		return (EINVAL);
	p = uio->uio_procp;
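	/*
	 * For an NFSv3 mount that has not yet fetched the server's
	 * limits, do so now so that sensible transfer sizes (and the
	 * f_iosize used below) are in place before the I/O is sized.
	 */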
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	biosize = vp->v_mount->mnt_stat.f_iosize;
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nqnfs, full cache consistency is maintained within the loop.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * NB: This implies that cache data can be read when up to
	 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
	 * attributes this could be forced by setting n_attrstamp to 0 before
	 * the VOP_GETATTR() call.
	 */
	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
		if (np->n_flag & NMODIFIED) {
			if (vp->v_type != VREG) {
				if (vp->v_type != VDIR)
					panic("nfs: bioread, not dir");
				nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			np->n_mtime = vattr.va_mtime.ts_sec;
		} else {
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			if (np->n_mtime != vattr.va_mtime.ts_sec) {
				if (vp->v_type == VDIR)
					nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_mtime = vattr.va_mtime.ts_sec;
			}
		}
	}
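	/*
	 * Main loop: each pass maps uio_offset to a logical cache block,
	 * brings that block up to date (reading it in if necessary) and
	 * copies out as much of it as the request wants, until the
	 * request is satisfied or an error occurs.
	 */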
	do {

	    /*
	     * Get a valid lease. If cached data is stale, flush it.
	     */
	    if (nmp->nm_flag & NFSMNT_NQNFS) {
		if (NQNFS_CKINVALID(vp, np, ND_READ)) {
		    do {
			error = nqnfs_getlease(vp, ND_READ, cred, p);
		    } while (error == NQNFS_EXPIRED);
		    if (error)
			return (error);
		    if (np->n_lrev != np->n_brev ||
			(np->n_flag & NQNFSNONCACHE) ||
			((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
			if (vp->v_type == VDIR)
			    nfs_invaldir(vp);
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
			    return (error);
			np->n_brev = np->n_lrev;
		    }
		} else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
		    nfs_invaldir(vp);
		    error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
		    if (error)
			return (error);
		}
	    }
	    if (np->n_flag & NQNFSNONCACHE) {
		switch (vp->v_type) {
		case VREG:
			return (nfs_readrpc(vp, uio, cred));
		case VLNK:
			return (nfs_readlinkrpc(vp, uio, cred));
		case VDIR:
			break;
		default:
			printf(" NQNFSNONCACHE: type %x unexpected\n",
				vp->v_type);
		}
	    }
	    switch (vp->v_type) {
	    case VREG:
		nfsstats.biocache_reads++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);
		not_readin = 1;

		/*
		 * Start the read ahead(s), as required.
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
		    for (nra = 0; nra < nmp->nm_readahead &&
			(lbn + 1 + nra) * biosize < np->n_size; nra++) {
			rabn = lbn + 1 + nra;
			if (!incore(vp, rabn)) {
			    rabp = nfs_getcacheblk(vp, rabn, biosize, p);
			    if (!rabp)
				return (EINTR);
			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
				rabp->b_flags |= (B_READ | B_ASYNC);
				vfs_busy_pages(rabp, 0);
				if (nfs_asyncio(rabp, cred)) {
				    rabp->b_flags |= B_INVAL|B_ERROR;
				    vfs_unbusy_pages(rabp);
				    brelse(rabp);
				}
			    } else {
				brelse(rabp);
			    }
			}
		    }
		}

		/*
		 * If the block is in the cache and has the required data
		 * in a valid region, just copy it out.
		 * Otherwise, get the block and write back/read in,
		 * as required.
		 */
again:
		bufsize = biosize;
		if ((lbn + 1) * biosize > np->n_size) {
			bufsize = np->n_size - lbn * biosize;
			bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		}
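		/*
		 * The sizing above buffers only the fragment of the last
		 * file block that actually exists, rounded up to the
		 * DEV_BSIZE boundary the buffer cache requires.
		 */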
		bp = nfs_getcacheblk(vp, lbn, bufsize, p);
		if (!bp)
			return (EINTR);
		if ((bp->b_flags & B_CACHE) == 0) {
			bp->b_flags |= B_READ;
			not_readin = 0;
			vfs_busy_pages(bp, 0);
			error = nfs_doio(bp, cred, p);
			if (error) {
			    brelse(bp);
			    return (error);
			}
		}
		if (bufsize > on) {
			n = min((unsigned)(bufsize - on), uio->uio_resid);
		} else {
			n = 0;
		}
		diff = np->n_size - uio->uio_offset;
		if (diff < n)
			n = diff;
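		/*
		 * If the block came from the cache without a fresh read
		 * rpc (not_readin is still set), check that the bytes we
		 * want lie inside its valid region; if not, mark the
		 * buffer B_NOCACHE, push any dirty data, and refetch.
		 */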
		if (not_readin && n > 0) {
			if (on < bp->b_validoff || (on + n) > bp->b_validend) {
				bp->b_flags |= B_NOCACHE;
				if (bp->b_dirtyend > 0) {
				    if ((bp->b_flags & B_DELWRI) == 0)
					panic("nfsbioread");
				    if (VOP_BWRITE(bp) == EINTR)
					return (EINTR);
				} else
				    brelse(bp);
				goto again;
			}
		}
		vp->v_lastr = lbn;
		diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
		if (diff < n)
			n = diff;
		break;
	    case VLNK:
		nfsstats.biocache_readlinks++;
		bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p);
		if (!bp)
			return (EINTR);
		if ((bp->b_flags & B_CACHE) == 0) {
			bp->b_flags |= B_READ;
			vfs_busy_pages(bp, 0);
			error = nfs_doio(bp, cred, p);
			if (error) {
				bp->b_flags |= B_ERROR;
				brelse(bp);
				return (error);
			}
		}
		n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
		on = 0;
		break;
	    case VDIR:
		nfsstats.biocache_readdirs++;
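		/*
		 * Directories are cached in fixed NFS_DIRBLKSIZ chunks;
		 * uio_offset is a logical byte offset into that stream,
		 * so split it into a block number and an offset within
		 * the block, just as for regular files.
		 */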
		lbn = uio->uio_offset / NFS_DIRBLKSIZ;
		on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
		bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p);
		if (!bp)
		    return (EINTR);
		if ((bp->b_flags & B_CACHE) == 0) {
		    bp->b_flags |= B_READ;
		    vfs_busy_pages(bp, 0);
		    error = nfs_doio(bp, cred, p);
		    if (error) {
			brelse(bp);
			while (error == NFSERR_BAD_COOKIE) {
			    nfs_invaldir(vp);
			    error = nfs_vinvalbuf(vp, 0, cred, p, 1);
			    /*
			     * Yuck! The directory has been modified on the
			     * server. The only way to get the block is by
			     * reading from the beginning to get all the
			     * offset cookies.
			     */
			    for (i = 0; i <= lbn && !error; i++) {
				bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p);
				if (!bp)
				    return (EINTR);
				if ((bp->b_flags & B_DONE) == 0) {
				    bp->b_flags |= B_READ;
				    vfs_busy_pages(bp, 0);
				    error = nfs_doio(bp, cred, p);
				    if (error)
					brelse(bp);
				}
			    }
			}
			if (error)
			    return (error);
		    }
		}

		/*
		 * If not eof and read aheads are enabled, start one.
		 * (You need the current block first, so that you have the
		 *  directory offset cookie of the next block.)
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
		    (np->n_direofoffset == 0 ||
		    (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
		    !(np->n_flag & NQNFSNONCACHE) &&
		    !incore(vp, lbn + 1)) {
			rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p);
			if (rabp) {
			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
				rabp->b_flags |= (B_READ | B_ASYNC);
				vfs_busy_pages(rabp, 0);
				if (nfs_asyncio(rabp, cred)) {
				    rabp->b_flags |= B_INVAL|B_ERROR;
				    vfs_unbusy_pages(rabp);
				    brelse(rabp);
				}
			    } else {
				brelse(rabp);
			    }
			}
		}
		n = min(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
		break;
	    default:
		printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
		break;
	    }

	    if (n > 0) {
		error = uiomove(bp->b_data + on, (int)n, uio);
	    }
	    switch (vp->v_type) {
	    case VREG:
		break;
	    case VLNK:
		n = 0;
		break;
	    case VDIR:
		if (np->n_flag & NQNFSNONCACHE)
			bp->b_flags |= B_INVAL;
		break;
	    default:
		printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
	    }
	    brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}

/*
 * Vnode op for write using bio
 */
int
nfs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int  a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	register int biosize;
	register struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	register struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	register struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn;
	int bufsize;
	int n, on, error = 0, iomode, must_commit;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
				return (error);
		}
		if (ioflag & IO_APPEND) {
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			uio->uio_offset = np->n_size;
		}
	}
	if (uio->uio_offset < 0)
		return (EINVAL);
	if (uio->uio_resid == 0)
		return (0);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	if (p && uio->uio_offset + uio->uio_resid >
	      p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		psignal(p, SIGXFSZ);
		return (EFBIG);
	}
	/*
	 * The mount point's f_iosize is used for both reads and writes
	 * so that all buffer cache blocks will be the same size within a
	 * filesystem. nfs_writerpc will still use nm_wsize when sizing
	 * the rpc's.
	 */
	biosize = vp->v_mount->mnt_stat.f_iosize;
	do {

		/*
		 * XXX make sure we aren't cached in the VM page cache
		 */
		/*
		 * Check for a valid write lease.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error)
				return (error);
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
			}
		}
		if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
		    iomode = NFSV3WRITE_FILESYNC;
		    error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
		    if (must_commit)
			nfs_clearcommit(vp->v_mount);
		    return (error);
		}
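		/*
		 * Split the write into per-block pieces: lbn is the
		 * logical cache block, on the offset within it, and n
		 * the number of bytes this pass will copy into it.
		 */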
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize-1);
		n = min((unsigned)(biosize - on), uio->uio_resid);
again:
		if (uio->uio_offset + n > np->n_size) {
			np->n_size = uio->uio_offset + n;
			vnode_pager_setsize(vp, (u_long)np->n_size);
		}
		bufsize = biosize;
		if ((lbn + 1) * biosize > np->n_size) {
			bufsize = np->n_size - lbn * biosize;
			bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		}
		bp = nfs_getcacheblk(vp, lbn, bufsize, p);
		if (!bp)
			return (EINTR);
		if (bp->b_wcred == NOCRED) {
			crhold(cred);
			bp->b_wcred = cred;
		}
		np->n_flag |= NMODIFIED;

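		/*
		 * Don't let a stale dirty region extend past the current
		 * end of the file.
		 */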
		if ((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend > np->n_size) {
			bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);
		}

		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise force a write rpc of the old dirty area.
		 */
		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			bp->b_proc = p;
			if (VOP_BWRITE(bp) == EINTR)
				return (EINTR);
			goto again;
		}

		/*
		 * Check for valid write lease and get one as required.
		 * In case getblk() and/or bwrite() delayed us.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error) {
				brelse(bp);
				return (error);
			}
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				brelse(bp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
				goto again;
			}
		}
		error = uiomove((char *)bp->b_data + on, n, uio);
		if (error) {
			bp->b_flags |= B_ERROR;
			brelse(bp);
			return (error);
		}
		if (bp->b_dirtyend > 0) {
			bp->b_dirtyoff = min(on, bp->b_dirtyoff);
			bp->b_dirtyend = max((on + n), bp->b_dirtyend);
		} else {
			bp->b_dirtyoff = on;
			bp->b_dirtyend = on + n;
		}
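		/*
		 * Extend the valid region to cover the bytes just
		 * written. If the old valid region is disjoint from the
		 * new dirty one, it can't be trusted to be contiguous
		 * with it, so it is simply replaced.
		 */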
		if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
		    bp->b_validoff > bp->b_dirtyend) {
			bp->b_validoff = bp->b_dirtyoff;
			bp->b_validend = bp->b_dirtyend;
		} else {
			bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
			bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
		}
		/*
		 * If the lease is non-cachable or IO_SYNC, do bwrite(). If
		 * the write fills the block to its end, push it
		 * asynchronously; otherwise leave it as a delayed write in
		 * the hope that more of the block will be dirtied soon.
		 */
		if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
			bp->b_proc = p;
			error = VOP_BWRITE(bp);
			if (error)
				return (error);
			if (np->n_flag & NQNFSNONCACHE) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
		} else if ((n + on) == biosize &&
			(nmp->nm_flag & NFSMNT_NQNFS) == 0) {
			bp->b_proc = (struct proc *)0;
			bp->b_flags |= B_ASYNC;
			(void)nfs_writebp(bp, 0);
		} else
			bdwrite(bp);
	} while (uio->uio_resid > 0 && n > 0);
	return (0);
}

/*
 * Get an nfs cache block.
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 */
struct buf *
nfs_getcacheblk(vp, bn, size, p)
	struct vnode *vp;
	daddr_t bn;
	int size;
	struct proc *p;
{
	register struct buf *bp;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int biosize = vp->v_mount->mnt_stat.f_iosize;

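	/*
	 * On interruptible mounts, sleep in getblk() with PCATCH so a
	 * signal can break the wait, then retry with a 2 second timeout,
	 * checking nfs_sigintr() between attempts, until a buffer is
	 * obtained.
	 */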
	if (nmp->nm_flag & NFSMNT_INT) {
		bp = getblk(vp, bn, size, PCATCH, 0);
		while (bp == (struct buf *)0) {
			if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
				return ((struct buf *)0);
			bp = getblk(vp, bn, size, 0, 2 * hz);
		}
	} else
		bp = getblk(vp, bn, size, 0, 0);

	if (vp->v_type == VREG)
		bp->b_blkno = (bn * biosize) / DEV_BSIZE;

	return (bp);
}

/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int intrflg;
{
	register struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}
	/*
	 * First wait for any other process doing a flush to complete.
	 */
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
			slptimeo);
		if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
			return (EINTR);
	}

	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
	while (error) {
		if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) {
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup((caddr_t)&np->n_flag);
			}
			return (EINTR);
		}
		error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
	}
	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
	return (0);
}

/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 */
int
nfs_asyncio(bp, cred)
	register struct buf *bp;
	struct ucred *cred;
{
	register int i;

	if (nfs_numasync == 0)
		return (EIO);
	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
	    if (nfs_iodwant[i]) {
		if (bp->b_flags & B_READ) {
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
		} else {
			bp->b_flags |= B_WRITEINPROG;
			if (bp->b_wcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_wcred = cred;
			}
		}

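		/*
		 * Hand the buffer to the idle nfsiod: queue it on
		 * nfs_bufq and wake the daemon sleeping on its
		 * nfs_iodwant slot.
		 */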
		TAILQ_INSERT_TAIL(&nfs_bufq, bp, b_freelist);
		nfs_iodwant[i] = (struct proc *)0;
		wakeup((caddr_t)&nfs_iodwant[i]);
		return (0);
	    }

	/*
	 * If it is a read, or a write already marked B_WRITEINPROG or
	 * B_NOCACHE, return EIO so the process will call nfs_doio() and
	 * do it synchronously.
	 */
	if (bp->b_flags & (B_READ | B_WRITEINPROG | B_NOCACHE))
		return (EIO);

	/*
	 * Just turn the async write into a delayed write, instead of
	 * doing it synchronously. Hopefully, at least one of the nfsiods
	 * is currently doing a write for this file and will pick up the
	 * delayed writes before going back to sleep.
	 */
	bp->b_flags |= B_DELWRI;
	reassignbuf(bp, bp->b_vp);
	biodone(bp);
	return (0);
}

/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(bp, cr, p)
	register struct buf *bp;
	struct ucred *cr;
	struct proc *p;
{
	register struct uio *uiop;
	register struct vnode *vp;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, diff, len, iomode, must_commit = 0;
	struct uio uio;
	struct iovec io;

	vp = bp->b_vp;
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_procp = p;

	/*
	 * Historically, paging was done with physio, but no more.
	 */
	if (bp->b_flags & B_PHYS) {
	    /*
	     * ...though reading /dev/drum still gets us here.
	     */
	    io.iov_len = uiop->uio_resid = bp->b_bcount;
	    /* mapping was done by vmapbuf() */
	    io.iov_base = bp->b_data;
	    uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
	    if (bp->b_flags & B_READ) {
		uiop->uio_rw = UIO_READ;
		nfsstats.read_physios++;
		error = nfs_readrpc(vp, uiop, cr);
	    } else {
		int com;

		iomode = NFSV3WRITE_DATASYNC;
		uiop->uio_rw = UIO_WRITE;
		nfsstats.write_physios++;
		error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
	    }
	    if (error) {
		bp->b_flags |= B_ERROR;
		bp->b_error = error;
	    }
	} else if (bp->b_flags & B_READ) {
	    io.iov_len = uiop->uio_resid = bp->b_bcount;
	    io.iov_base = bp->b_data;
	    uiop->uio_rw = UIO_READ;
	    switch (vp->v_type) {
	    case VREG:
		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
		nfsstats.read_bios++;
		error = nfs_readrpc(vp, uiop, cr);
		if (!error) {
		    bp->b_validoff = 0;
		    if (uiop->uio_resid) {
			/*
			 * If len > 0, there is a hole in the file and
			 * no writes after the hole have been pushed to
			 * the server yet.
			 * Just zero fill the rest of the valid area.
			 */
			diff = bp->b_bcount - uiop->uio_resid;
			len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE
				+ diff);
			if (len > 0) {
			    len = min(len, uiop->uio_resid);
			    bzero((char *)bp->b_data + diff, len);
			    bp->b_validend = diff + len;
			} else
			    bp->b_validend = diff;
		    } else
			bp->b_validend = bp->b_bcount;
		}
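		/*
		 * If an executing text file has been modified on the
		 * server, the in-core image is no longer consistent, so
		 * kill the process and pin it in core (P_NOSWAP, or
		 * p_holdcnt on NetBSD), presumably since its backing
		 * text can no longer be trusted.
		 */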
		if (p && (vp->v_flag & VTEXT) &&
			(((nmp->nm_flag & NFSMNT_NQNFS) &&
			  NQNFS_CKINVALID(vp, np, ND_READ) &&
			  np->n_lrev != np->n_brev) ||
			 (!(nmp->nm_flag & NFSMNT_NQNFS) &&
			  np->n_mtime != np->n_vattr.va_mtime.ts_sec))) {
			uprintf("Process killed due to text file modification\n");
			psignal(p, SIGKILL);
#ifdef __NetBSD__
			p->p_holdcnt++;
#else
			p->p_flag |= P_NOSWAP;
#endif
		}
		break;
	    case VLNK:
		uiop->uio_offset = (off_t)0;
		nfsstats.readlink_bios++;
		error = nfs_readlinkrpc(vp, uiop, cr);
		break;
	    case VDIR:
		nfsstats.readdir_bios++;
		uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
		if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
			error = nfs_readdirplusrpc(vp, uiop, cr);
			if (error == NFSERR_NOTSUPP)
				nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
		}
		if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
			error = nfs_readdirrpc(vp, uiop, cr);
		break;
	    default:
		printf("nfs_doio: type %x unexpected\n", vp->v_type);
		break;
	    }
	    if (error) {
		bp->b_flags |= B_ERROR;
		bp->b_error = error;
	    }
	} else {
	    if (((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend) > np->n_size)
		bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);

	    if (bp->b_dirtyend > bp->b_dirtyoff) {
		io.iov_len = uiop->uio_resid = bp->b_dirtyend
		    - bp->b_dirtyoff;
		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE
		    + bp->b_dirtyoff;
		io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
		uiop->uio_rw = UIO_WRITE;
		nfsstats.write_bios++;
		if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) == B_ASYNC)
		    iomode = NFSV3WRITE_UNSTABLE;
		else
		    iomode = NFSV3WRITE_FILESYNC;
		bp->b_flags |= B_WRITEINPROG;
		error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
		if (!error && iomode == NFSV3WRITE_UNSTABLE)
		    bp->b_flags |= B_NEEDCOMMIT;
		else
		    bp->b_flags &= ~B_NEEDCOMMIT;
		bp->b_flags &= ~B_WRITEINPROG;

		/*
		 * For an interrupted write, the buffer is still valid
		 * and the write hasn't been pushed to the server yet,
		 * so we can't set B_ERROR; instead, report the
		 * interruption by setting B_EINTR. For the B_ASYNC case,
		 * B_EINTR is not relevant, so the rpc attempt is
		 * essentially a noop. For the case of a V3 write rpc not
		 * being committed to stable storage, the block is still
		 * dirty and requires either a commit rpc or another
		 * write rpc with iomode == NFSV3WRITE_FILESYNC before
		 * the block is reused. This is indicated by setting
		 * the B_DELWRI and B_NEEDCOMMIT flags.
		 */
		if (error == EINTR
		    || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
			bp->b_flags &= ~(B_INVAL|B_NOCACHE);
			bp->b_flags |= B_DELWRI;

			/*
			 * Since for the B_ASYNC case, nfs_bwrite() has
			 * reassigned the buffer to the clean list, we
			 * have to reassign it back to the dirty one. Ugh.
			 */
			if (bp->b_flags & B_ASYNC)
				reassignbuf(bp, vp);
			else
				bp->b_flags |= B_EINTR;
		} else {
			if (error) {
				bp->b_flags |= B_ERROR;
				bp->b_error = np->n_error = error;
				np->n_flag |= NWRITEERR;
			}
			bp->b_dirtyoff = bp->b_dirtyend = 0;
		}
	    } else {
		bp->b_resid = 0;
		biodone(bp);
		return (0);
	    }
	}
	bp->b_resid = uiop->uio_resid;
	if (must_commit)
		nfs_clearcommit(vp->v_mount);
	biodone(bp);
	return (error);
}