nfs_bio.c revision 9336
1/*
2 * Copyright (c) 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * Rick Macklem at The University of Guelph.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 *    must display the following acknowledgement:
18 *	This product includes software developed by the University of
19 *	California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 *    may be used to endorse or promote products derived from this software
22 *    without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 *	@(#)nfs_bio.c	8.5 (Berkeley) 1/4/94
37 * $Id: nfs_bio.c,v 1.14 1995/05/30 08:12:35 rgrimes Exp $
38 */
39
40#include <sys/param.h>
41#include <sys/systm.h>
42#include <sys/resourcevar.h>
43#include <sys/signalvar.h>
44#include <sys/proc.h>
45#include <sys/buf.h>
46#include <sys/vnode.h>
47#include <sys/mount.h>
48#include <sys/kernel.h>
49
50#include <vm/vm.h>
51
52#include <nfs/rpcv2.h>
53#include <nfs/nfsproto.h>
54#include <nfs/nfs.h>
55#include <nfs/nfsmount.h>
56#include <nfs/nqnfs.h>
57#include <nfs/nfsnode.h>
58
59struct buf *nfs_getcacheblk();
60extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON];
61extern int nfs_numasync;
62extern struct nfsstats nfsstats;
63
/*
 * Compatibility shims for FreeBSD-current's merged VM/buffer cache.
 * Without B_VMIO the vfs_*_pages() hooks expand to nothing; with B_VMIO
 * it is vnode_pager_uncache() that expands to nothing instead.
 * It is unfortunate that this isn't done inside getblk() and brelse()
 * so these calls wouldn't need to be here.
 */
#ifndef B_VMIO
#define vfs_busy_pages(bp, f)
#define vfs_unbusy_pages(bp)
#define vfs_dirty_pages(bp)
#else
#define vnode_pager_uncache(vp)
#endif
76
77/*
78 * Vnode op for read using bio
79 * Any similarity to readip() is purely coincidental
80 */
81int
82nfs_bioread(vp, uio, ioflag, cred)
83	register struct vnode *vp;
84	register struct uio *uio;
85	int ioflag;
86	struct ucred *cred;
87{
88	register struct nfsnode *np = VTONFS(vp);
89	register int biosize, diff, i;
90	struct buf *bp = 0, *rabp;
91	struct vattr vattr;
92	struct proc *p;
93	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
94	daddr_t lbn, rabn;
95	int bufsize;
96	int nra, error = 0, n = 0, on = 0, not_readin;
97	nfsquad_t tquad;
98
99#ifdef DIAGNOSTIC
100	if (uio->uio_rw != UIO_READ)
101		panic("nfs_read mode");
102#endif
103	if (uio->uio_resid == 0)
104		return (0);
105	if (uio->uio_offset < 0)
106		return (EINVAL);
107	p = uio->uio_procp;
108	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
109		(void)nfs_fsinfo(nmp, vp, cred, p);
110	biosize = nmp->nm_rsize;
111	/*
112	 * For nfs, cache consistency can only be maintained approximately.
113	 * Although RFC1094 does not specify the criteria, the following is
114	 * believed to be compatible with the reference port.
115	 * For nqnfs, full cache consistency is maintained within the loop.
116	 * For nfs:
117	 * If the file's modify time on the server has changed since the
118	 * last read rpc or you have written to the file,
119	 * you may have lost data cache consistency with the
120	 * server, so flush all of the file's data out of the cache.
121	 * Then force a getattr rpc to ensure that you have up to date
122	 * attributes.
123	 * NB: This implies that cache data can be read when up to
124	 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
125	 * attributes this could be forced by setting n_attrstamp to 0 before
126	 * the VOP_GETATTR() call.
127	 */
128	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0 && vp->v_type != VLNK) {
129		if (np->n_flag & NMODIFIED) {
130			if (vp->v_type != VREG) {
131				if (vp->v_type != VDIR)
132					panic("nfs: bioread, not dir");
133				nfs_invaldir(vp);
134				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
135				if (error)
136					return (error);
137			}
138			np->n_attrstamp = 0;
139			error = VOP_GETATTR(vp, &vattr, cred, p);
140			if (error)
141				return (error);
142			np->n_mtime = vattr.va_mtime.ts_sec;
143		} else {
144			error = VOP_GETATTR(vp, &vattr, cred, p);
145			if (error)
146				return (error);
147			if (np->n_mtime != vattr.va_mtime.ts_sec) {
148				if (vp->v_type == VDIR)
149					nfs_invaldir(vp);
150				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
151				if (error)
152					return (error);
153				np->n_mtime = vattr.va_mtime.ts_sec;
154			}
155		}
156	}
157	do {
158
159	    /*
160	     * Get a valid lease. If cached data is stale, flush it.
161	     */
162	    if (nmp->nm_flag & NFSMNT_NQNFS) {
163		if (NQNFS_CKINVALID(vp, np, ND_READ)) {
164		    do {
165			error = nqnfs_getlease(vp, ND_READ, cred, p);
166		    } while (error == NQNFS_EXPIRED);
167		    if (error)
168			return (error);
169		    if (np->n_lrev != np->n_brev ||
170			(np->n_flag & NQNFSNONCACHE) ||
171			((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
172			if (vp->v_type == VDIR)
173			    nfs_invaldir(vp);
174			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
175			if (error)
176			    return (error);
177			np->n_brev = np->n_lrev;
178		    }
179		} else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
180		    nfs_invaldir(vp);
181		    error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
182		    if (error)
183			return (error);
184		}
185	    }
186	    if (np->n_flag & NQNFSNONCACHE) {
187		switch (vp->v_type) {
188		case VREG:
189			return (nfs_readrpc(vp, uio, cred));
190		case VLNK:
191			return (nfs_readlinkrpc(vp, uio, cred));
192		case VDIR:
193			break;
194		default:
195			printf(" NQNFSNONCACHE: type %x unexpected\n",
196				vp->v_type);
197		};
198	    }
199	    switch (vp->v_type) {
200	    case VREG:
201		nfsstats.biocache_reads++;
202		lbn = uio->uio_offset / biosize;
203		on = uio->uio_offset & (biosize - 1);
204		not_readin = 1;
205
206		/*
207		 * Start the read ahead(s), as required.
208		 */
209		if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
210		    for (nra = 0; nra < nmp->nm_readahead &&
211			(lbn + 1 + nra) * biosize < np->n_size; nra++) {
212			rabn = lbn + 1 + nra;
213			if (!incore(vp, rabn)) {
214			    rabp = nfs_getcacheblk(vp, rabn, biosize, p);
215			    if (!rabp)
216				return (EINTR);
217			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
218				rabp->b_flags |= (B_READ | B_ASYNC);
219				vfs_busy_pages(rabp, 0);
220				if (nfs_asyncio(rabp, cred)) {
221				    rabp->b_flags |= B_INVAL|B_ERROR;
222				    vfs_unbusy_pages(rabp);
223				    brelse(rabp);
224				}
225			    } else {
226				brelse(rabp);
227			    }
228			}
229		    }
230		}
231
232		/*
233		 * If the block is in the cache and has the required data
234		 * in a valid region, just copy it out.
235		 * Otherwise, get the block and write back/read in,
236		 * as required.
237		 */
238again:
239		bufsize = biosize;
240		if ((lbn + 1) * biosize > np->n_size) {
241			bufsize = np->n_size - lbn * biosize;
242			bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
243		}
244		bp = nfs_getcacheblk(vp, lbn, bufsize, p);
245		if (!bp)
246			return (EINTR);
247		if ((bp->b_flags & B_CACHE) == 0) {
248			bp->b_flags |= B_READ;
249			not_readin = 0;
250			vfs_busy_pages(bp, 0);
251			error = nfs_doio(bp, cred, p);
252			if (error) {
253			    brelse(bp);
254			    return (error);
255			}
256		}
257		if (bufsize > on) {
258			n = min((unsigned)(bufsize - on), uio->uio_resid);
259		} else {
260			n = 0;
261		}
262		diff = np->n_size - uio->uio_offset;
263		if (diff < n)
264			n = diff;
265		if (not_readin && n > 0) {
266			if (on < bp->b_validoff || (on + n) > bp->b_validend) {
267				bp->b_flags |= B_NOCACHE;
268				if (bp->b_dirtyend > 0) {
269				    if ((bp->b_flags & B_DELWRI) == 0)
270					panic("nfsbioread");
271				    if (VOP_BWRITE(bp) == EINTR)
272					return (EINTR);
273				} else
274				    brelse(bp);
275				goto again;
276			}
277		}
278		vp->v_lastr = lbn;
279		diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
280		if (diff < n)
281			n = diff;
282		break;
283	    case VLNK:
284		nfsstats.biocache_readlinks++;
285		bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p);
286		if (!bp)
287			return (EINTR);
288		if ((bp->b_flags & B_CACHE) == 0) {
289			bp->b_flags |= B_READ;
290			vfs_busy_pages(bp, 0);
291			error = nfs_doio(bp, cred, p);
292			if (error) {
293				bp->b_flags |= B_ERROR;
294				brelse(bp);
295				return (error);
296			}
297		}
298		n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
299		on = 0;
300		break;
301	    case VDIR:
302		nfsstats.biocache_readdirs++;
303		lbn = uio->uio_offset / NFS_DIRBLKSIZ;
304		on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
305		bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p);
306		if (!bp)
307		    return (EINTR);
308		if ((bp->b_flags & B_CACHE) == 0) {
309		    bp->b_flags |= B_READ;
310		    vfs_busy_pages(bp, 0);
311		    error = nfs_doio(bp, cred, p);
312		    if (error) {
313			brelse(bp);
314			while (error == NFSERR_BAD_COOKIE) {
315			    nfs_invaldir(vp);
316			    error = nfs_vinvalbuf(vp, 0, cred, p, 1);
317			    /*
318			     * Yuck! The directory has been modified on the
319			     * server. The only way to get the block is by
320			     * reading from the beginning to get all the
321			     * offset cookies.
322			     */
323			    for (i = 0; i <= lbn && !error; i++) {
324				bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p);
325				if (!bp)
326				    return (EINTR);
327				if ((bp->b_flags & B_DONE) == 0) {
328				    bp->b_flags |= B_READ;
329				    vfs_busy_pages(bp, 0);
330				    error = nfs_doio(bp, cred, p);
331				    if (error)
332					brelse(bp);
333				}
334			    }
335			}
336			if (error)
337			    return (error);
338		    }
339		}
340
341		/*
342		 * If not eof and read aheads are enabled, start one.
343		 * (You need the current block first, so that you have the
344		 *  directory offset cookie of the next block.)
345		 */
346		if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
347		    (np->n_direofoffset == 0 ||
348		    (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
349		    !(np->n_flag & NQNFSNONCACHE) &&
350		    !incore(vp, lbn + 1)) {
351			rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p);
352			if (rabp) {
353			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
354				rabp->b_flags |= (B_READ | B_ASYNC);
355				vfs_busy_pages(rabp, 0);
356				if (nfs_asyncio(rabp, cred)) {
357				    rabp->b_flags |= B_INVAL|B_ERROR;
358				    vfs_unbusy_pages(rabp);
359				    brelse(rabp);
360				}
361			    } else {
362				brelse(rabp);
363			    }
364			}
365		}
366		n = min(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
367		break;
368	    default:
369		printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
370		break;
371	    };
372
373	    if (n > 0) {
374		error = uiomove(bp->b_data + on, (int)n, uio);
375	    }
376	    switch (vp->v_type) {
377	    case VREG:
378		break;
379	    case VLNK:
380		n = 0;
381		break;
382	    case VDIR:
383		if (np->n_flag & NQNFSNONCACHE)
384			bp->b_flags |= B_INVAL;
385		break;
386	    default:
387		printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
388	    }
389 	    brelse(bp);
390	} while (error == 0 && uio->uio_resid > 0 && n > 0);
391	return (error);
392}
393
394/*
395 * Vnode op for write using bio
396 */
397int
398nfs_write(ap)
399	struct vop_write_args /* {
400		struct vnode *a_vp;
401		struct uio *a_uio;
402		int  a_ioflag;
403		struct ucred *a_cred;
404	} */ *ap;
405{
406	register int biosize;
407	register struct uio *uio = ap->a_uio;
408	struct proc *p = uio->uio_procp;
409	register struct vnode *vp = ap->a_vp;
410	struct nfsnode *np = VTONFS(vp);
411	register struct ucred *cred = ap->a_cred;
412	int ioflag = ap->a_ioflag;
413	struct buf *bp;
414	struct vattr vattr;
415	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
416	daddr_t lbn, bn;
417	int bufsize;
418	int n, on, error = 0, iomode, must_commit;
419
420#ifdef DIAGNOSTIC
421	if (uio->uio_rw != UIO_WRITE)
422		panic("nfs_write mode");
423	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
424		panic("nfs_write proc");
425#endif
426	if (vp->v_type != VREG)
427		return (EIO);
428	if (np->n_flag & NWRITEERR) {
429		np->n_flag &= ~NWRITEERR;
430		return (np->n_error);
431	}
432	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
433		(void)nfs_fsinfo(nmp, vp, cred, p);
434	if (ioflag & (IO_APPEND | IO_SYNC)) {
435		if (np->n_flag & NMODIFIED) {
436			np->n_attrstamp = 0;
437			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
438			if (error)
439				return (error);
440		}
441		if (ioflag & IO_APPEND) {
442			np->n_attrstamp = 0;
443			error = VOP_GETATTR(vp, &vattr, cred, p);
444			if (error)
445				return (error);
446			uio->uio_offset = np->n_size;
447		}
448	}
449	if (uio->uio_offset < 0)
450		return (EINVAL);
451	if (uio->uio_resid == 0)
452		return (0);
453	/*
454	 * Maybe this should be above the vnode op call, but so long as
455	 * file servers have no limits, i don't think it matters
456	 */
457	if (p && uio->uio_offset + uio->uio_resid >
458	      p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
459		psignal(p, SIGXFSZ);
460		return (EFBIG);
461	}
462	/*
463	 * I use nm_rsize, not nm_wsize so that all buffer cache blocks
464	 * will be the same size within a filesystem. nfs_writerpc will
465	 * still use nm_wsize when sizing the rpc's.
466	 */
467	biosize = nmp->nm_rsize;
468	do {
469
470		/*
471		 * XXX make sure we aren't cached in the VM page cache
472		 */
473		/*
474		 * Check for a valid write lease.
475		 */
476		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
477		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
478			do {
479				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
480			} while (error == NQNFS_EXPIRED);
481			if (error)
482				return (error);
483			if (np->n_lrev != np->n_brev ||
484			    (np->n_flag & NQNFSNONCACHE)) {
485				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
486				if (error)
487					return (error);
488				np->n_brev = np->n_lrev;
489			}
490		}
491		if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
492		    iomode = NFSV3WRITE_FILESYNC;
493		    error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
494		    if (must_commit)
495			nfs_clearcommit(vp->v_mount);
496		    return (error);
497		}
498		nfsstats.biocache_writes++;
499		lbn = uio->uio_offset / biosize;
500		on = uio->uio_offset & (biosize-1);
501		n = min((unsigned)(biosize - on), uio->uio_resid);
502again:
503		if (uio->uio_offset + n > np->n_size) {
504			np->n_size = uio->uio_offset + n;
505			vnode_pager_setsize(vp, (u_long)np->n_size);
506		}
507		bufsize = biosize;
508		if ((lbn + 1) * biosize > np->n_size) {
509			bufsize = np->n_size - lbn * biosize;
510			bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
511		}
512		bp = nfs_getcacheblk(vp, lbn, bufsize, p);
513		if (!bp)
514			return (EINTR);
515		if (bp->b_wcred == NOCRED) {
516			crhold(cred);
517			bp->b_wcred = cred;
518		}
519		np->n_flag |= NMODIFIED;
520
521		if ((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend > np->n_size) {
522			bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);
523		}
524
525		/*
526		 * If the new write will leave a contiguous dirty
527		 * area, just update the b_dirtyoff and b_dirtyend,
528		 * otherwise force a write rpc of the old dirty area.
529		 */
530		if (bp->b_dirtyend > 0 &&
531		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
532			bp->b_proc = p;
533			if (VOP_BWRITE(bp) == EINTR)
534				return (EINTR);
535			goto again;
536		}
537
538		/*
539		 * Check for valid write lease and get one as required.
540		 * In case getblk() and/or bwrite() delayed us.
541		 */
542		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
543		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
544			do {
545				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
546			} while (error == NQNFS_EXPIRED);
547			if (error) {
548				brelse(bp);
549				return (error);
550			}
551			if (np->n_lrev != np->n_brev ||
552			    (np->n_flag & NQNFSNONCACHE)) {
553				brelse(bp);
554				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
555				if (error)
556					return (error);
557				np->n_brev = np->n_lrev;
558				goto again;
559			}
560		}
561		error = uiomove((char *)bp->b_data + on, n, uio);
562		if (error) {
563			bp->b_flags |= B_ERROR;
564			brelse(bp);
565			return (error);
566		}
567		if (bp->b_dirtyend > 0) {
568			bp->b_dirtyoff = min(on, bp->b_dirtyoff);
569			bp->b_dirtyend = max((on + n), bp->b_dirtyend);
570		} else {
571			bp->b_dirtyoff = on;
572			bp->b_dirtyend = on + n;
573		}
574		if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
575		    bp->b_validoff > bp->b_dirtyend) {
576			bp->b_validoff = bp->b_dirtyoff;
577			bp->b_validend = bp->b_dirtyend;
578		} else {
579			bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
580			bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
581		}
582		/*
583		 * If the lease is non-cachable or IO_SYNC do bwrite().
584		 */
585		if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
586			bp->b_proc = p;
587			error = VOP_BWRITE(bp);
588			if (error)
589				return (error);
590			if (np->n_flag & NQNFSNONCACHE) {
591				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
592				if (error)
593					return (error);
594			}
595		} else if ((n + on) == biosize &&
596			(nmp->nm_flag & NFSMNT_NQNFS) == 0) {
597			bp->b_proc = (struct proc *)0;
598			bp->b_flags |= B_ASYNC;
599			(void)nfs_writebp(bp, 0);
600		} else
601			bdwrite(bp);
602	} while (uio->uio_resid > 0 && n > 0);
603	return (0);
604}
605
606/*
607 * Get an nfs cache block.
608 * Allocate a new one if the block isn't currently in the cache
609 * and return the block marked busy. If the calling process is
610 * interrupted by a signal for an interruptible mount point, return
611 * NULL.
612 */
613struct buf *
614nfs_getcacheblk(vp, bn, size, p)
615	struct vnode *vp;
616	daddr_t bn;
617	int size;
618	struct proc *p;
619{
620	register struct buf *bp;
621	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
622	int biosize = nmp->nm_rsize;
623
624	if (nmp->nm_flag & NFSMNT_INT) {
625		bp = getblk(vp, bn, size, PCATCH, 0);
626		while (bp == (struct buf *)0) {
627			if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
628				return ((struct buf *)0);
629			bp = getblk(vp, bn, size, 0, 2 * hz);
630		}
631	} else
632		bp = getblk(vp, bn, size, 0, 0);
633
634	if( vp->v_type == VREG)
635		bp->b_blkno = (bn * biosize) / DEV_BSIZE;
636
637	return (bp);
638}
639
640/*
641 * Flush and invalidate all dirty buffers. If another process is already
642 * doing the flush, just wait for completion.
643 */
644int
645nfs_vinvalbuf(vp, flags, cred, p, intrflg)
646	struct vnode *vp;
647	int flags;
648	struct ucred *cred;
649	struct proc *p;
650	int intrflg;
651{
652	register struct nfsnode *np = VTONFS(vp);
653	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
654	int error = 0, slpflag, slptimeo;
655
656	if ((nmp->nm_flag & NFSMNT_INT) == 0)
657		intrflg = 0;
658	if (intrflg) {
659		slpflag = PCATCH;
660		slptimeo = 2 * hz;
661	} else {
662		slpflag = 0;
663		slptimeo = 0;
664	}
665	/*
666	 * First wait for any other process doing a flush to complete.
667	 */
668	while (np->n_flag & NFLUSHINPROG) {
669		np->n_flag |= NFLUSHWANT;
670		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
671			slptimeo);
672		if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
673			return (EINTR);
674	}
675
676	/*
677	 * Now, flush as required.
678	 */
679	np->n_flag |= NFLUSHINPROG;
680	error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
681	while (error) {
682		if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) {
683			np->n_flag &= ~NFLUSHINPROG;
684			if (np->n_flag & NFLUSHWANT) {
685				np->n_flag &= ~NFLUSHWANT;
686				wakeup((caddr_t)&np->n_flag);
687			}
688			return (EINTR);
689		}
690		error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
691	}
692	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
693	if (np->n_flag & NFLUSHWANT) {
694		np->n_flag &= ~NFLUSHWANT;
695		wakeup((caddr_t)&np->n_flag);
696	}
697	return (0);
698}
699
700/*
701 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
702 * This is mainly to avoid queueing async I/O requests when the nfsiods
703 * are all hung on a dead server.
704 */
705int
706nfs_asyncio(bp, cred)
707	register struct buf *bp;
708	struct ucred *cred;
709{
710	register int i;
711
712	if (nfs_numasync == 0)
713		return (EIO);
714	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
715	    if (nfs_iodwant[i]) {
716		if (bp->b_flags & B_READ) {
717			if (bp->b_rcred == NOCRED && cred != NOCRED) {
718				crhold(cred);
719				bp->b_rcred = cred;
720			}
721		} else {
722			bp->b_flags |= B_WRITEINPROG;
723			if (bp->b_wcred == NOCRED && cred != NOCRED) {
724				crhold(cred);
725				bp->b_wcred = cred;
726			}
727		}
728
729		TAILQ_INSERT_TAIL(&nfs_bufq, bp, b_freelist);
730		nfs_iodwant[i] = (struct proc *)0;
731		wakeup((caddr_t)&nfs_iodwant[i]);
732		return (0);
733	    }
734
735	/*
736	 * If it is a read or a write already marked B_WRITEINPROG or B_NOCACHE
737	 * return EIO so the process will call nfs_doio() and do it
738	 * synchronously.
739	 */
740	if (bp->b_flags & (B_READ | B_WRITEINPROG | B_NOCACHE))
741		return (EIO);
742
743	/*
744	 * Just turn the async write into a delayed write, instead of
745	 * doing in synchronously. Hopefully, at least one of the nfsiods
746	 * is currently doing a write for this file and will pick up the
747	 * delayed writes before going back to sleep.
748	 */
749	bp->b_flags |= B_DELWRI;
750	reassignbuf(bp, bp->b_vp);
751	biodone(bp);
752	return (0);
753}
754
755/*
756 * Do an I/O operation to/from a cache block. This may be called
757 * synchronously or from an nfsiod.
758 */
759int
760nfs_doio(bp, cr, p)
761	register struct buf *bp;
762	struct ucred *cr;
763	struct proc *p;
764{
765	register struct uio *uiop;
766	register struct vnode *vp;
767	struct nfsnode *np;
768	struct nfsmount *nmp;
769	int error = 0, diff, len, iomode, must_commit = 0;
770	struct uio uio;
771	struct iovec io;
772	nfsquad_t tquad;
773
774	vp = bp->b_vp;
775	np = VTONFS(vp);
776	nmp = VFSTONFS(vp->v_mount);
777	uiop = &uio;
778	uiop->uio_iov = &io;
779	uiop->uio_iovcnt = 1;
780	uiop->uio_segflg = UIO_SYSSPACE;
781	uiop->uio_procp = p;
782
783	/*
784	 * Historically, paging was done with physio, but no more.
785	 */
786	if (bp->b_flags & B_PHYS) {
787	    /*
788	     * ...though reading /dev/drum still gets us here.
789	     */
790	    io.iov_len = uiop->uio_resid = bp->b_bcount;
791	    /* mapping was done by vmapbuf() */
792	    io.iov_base = bp->b_data;
793	    uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
794	    if (bp->b_flags & B_READ) {
795		uiop->uio_rw = UIO_READ;
796		nfsstats.read_physios++;
797		error = nfs_readrpc(vp, uiop, cr);
798	    } else {
799		int com;
800
801		iomode = NFSV3WRITE_DATASYNC;
802		uiop->uio_rw = UIO_WRITE;
803		nfsstats.write_physios++;
804		error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
805	    }
806	    if (error) {
807		bp->b_flags |= B_ERROR;
808		bp->b_error = error;
809	    }
810	} else if (bp->b_flags & B_READ) {
811	    io.iov_len = uiop->uio_resid = bp->b_bcount;
812	    io.iov_base = bp->b_data;
813	    uiop->uio_rw = UIO_READ;
814	    switch (vp->v_type) {
815	    case VREG:
816		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
817		nfsstats.read_bios++;
818		error = nfs_readrpc(vp, uiop, cr);
819		if (!error) {
820		    bp->b_validoff = 0;
821		    if (uiop->uio_resid) {
822			/*
823			 * If len > 0, there is a hole in the file and
824			 * no writes after the hole have been pushed to
825			 * the server yet.
826			 * Just zero fill the rest of the valid area.
827			 */
828			diff = bp->b_bcount - uiop->uio_resid;
829			len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE
830				+ diff);
831			if (len > 0) {
832			    len = min(len, uiop->uio_resid);
833			    bzero((char *)bp->b_data + diff, len);
834			    bp->b_validend = diff + len;
835			} else
836			    bp->b_validend = diff;
837		    } else
838			bp->b_validend = bp->b_bcount;
839		}
840		if (p && (vp->v_flag & VTEXT) &&
841			(((nmp->nm_flag & NFSMNT_NQNFS) &&
842			  NQNFS_CKINVALID(vp, np, ND_READ) &&
843			  np->n_lrev != np->n_brev) ||
844			 (!(nmp->nm_flag & NFSMNT_NQNFS) &&
845			  np->n_mtime != np->n_vattr.va_mtime.ts_sec))) {
846			uprintf("Process killed due to text file modification\n");
847			psignal(p, SIGKILL);
848#ifdef __NetBSD__
849			p->p_holdcnt++;
850#else
851			p->p_flag |= P_NOSWAP;
852#endif
853		}
854		break;
855	    case VLNK:
856		uiop->uio_offset = (off_t)0;
857		nfsstats.readlink_bios++;
858		error = nfs_readlinkrpc(vp, uiop, cr);
859		break;
860	    case VDIR:
861		nfsstats.readdir_bios++;
862		uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
863		if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
864			error = nfs_readdirplusrpc(vp, uiop, cr);
865			if (error == NFSERR_NOTSUPP)
866				nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
867		}
868		if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
869			error = nfs_readdirrpc(vp, uiop, cr);
870		break;
871	    default:
872		printf("nfs_doio:  type %x unexpected\n",vp->v_type);
873		break;
874	    };
875	    if (error) {
876		bp->b_flags |= B_ERROR;
877		bp->b_error = error;
878	    }
879	} else {
880	    if (((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend) > np->n_size)
881		bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);
882
883	    if (bp->b_dirtyend > bp->b_dirtyoff) {
884		io.iov_len = uiop->uio_resid = bp->b_dirtyend
885		    - bp->b_dirtyoff;
886		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE
887		    + bp->b_dirtyoff;
888		io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
889		uiop->uio_rw = UIO_WRITE;
890		nfsstats.write_bios++;
891		if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) == B_ASYNC)
892		    iomode = NFSV3WRITE_UNSTABLE;
893		else
894		    iomode = NFSV3WRITE_FILESYNC;
895		bp->b_flags |= B_WRITEINPROG;
896		error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
897		if (!error && iomode == NFSV3WRITE_UNSTABLE)
898		    bp->b_flags |= B_NEEDCOMMIT;
899		else
900		    bp->b_flags &= ~B_NEEDCOMMIT;
901		bp->b_flags &= ~B_WRITEINPROG;
902
903		/*
904		 * For an interrupted write, the buffer is still valid
905		 * and the write hasn't been pushed to the server yet,
906		 * so we can't set B_ERROR and report the interruption
907		 * by setting B_EINTR. For the B_ASYNC case, B_EINTR
908		 * is not relevant, so the rpc attempt is essentially
909		 * a noop.  For the case of a V3 write rpc not being
910		 * committed to stable storage, the block is still
911		 * dirty and requires either a commit rpc or another
912		 * write rpc with iomode == NFSV3WRITE_FILESYNC before
913		 * the block is reused. This is indicated by setting
914		 * the B_DELWRI and B_NEEDCOMMIT flags.
915		 */
916    		if (error == EINTR
917		    || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
918			bp->b_flags &= ~(B_INVAL|B_NOCACHE);
919			bp->b_flags |= B_DELWRI;
920
921		/*
922		 * Since for the B_ASYNC case, nfs_bwrite() has reassigned the
923		 * buffer to the clean list, we have to reassign it back to the
924		 * dirty one. Ugh.
925		 */
926			if (bp->b_flags & B_ASYNC)
927				reassignbuf(bp, vp);
928			else
929				bp->b_flags |= B_EINTR;
930	    	} else {
931			if (error) {
932				bp->b_flags |= B_ERROR;
933				bp->b_error = np->n_error = error;
934				np->n_flag |= NWRITEERR;
935			}
936			bp->b_dirtyoff = bp->b_dirtyend = 0;
937		}
938	    } else {
939		bp->b_resid = 0;
940		biodone(bp);
941		return (0);
942	    }
943	}
944	bp->b_resid = uiop->uio_resid;
945	if (must_commit)
946		nfs_clearcommit(vp->v_mount);
947	biodone(bp);
948	return (error);
949}
950