nfs_bio.c revision 1549
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.5 (Berkeley) 1/4/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/trace.h>
#include <sys/mount.h>
#include <sys/kernel.h>

#include <vm/vm.h>

#include <nfs/nfsnode.h>
#include <nfs/rpcv2.h>
#include <nfs/nfsv2.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nqnfs.h>

struct buf *incore(), *nfs_getcacheblk();
extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON];
extern int nfs_numasync;

/*
 * Vnode op for read using bio
 * Any similarity to readip() is purely coincidental
 */
int
nfs_bioread(vp, uio, ioflag, cred)
	register struct vnode *vp;
	register struct uio *uio;
	int ioflag;
	struct ucred *cred;
{
	register struct nfsnode *np = VTONFS(vp);
	register int biosize, diff;
	struct buf *bp = 0, *rabp;
	struct vattr vattr;
	struct proc *p;
	struct nfsmount *nmp;
	daddr_t lbn, bn, rabn;
	caddr_t baddr;
	int got_buf = 0, nra, error = 0, n = 0, on = 0, not_readin;

#ifdef lint
	ioflag = ioflag;
#endif /* lint */
#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_offset < 0 && vp->v_type != VDIR)
		return (EINVAL);
	nmp = VFSTONFS(vp->v_mount);
	biosize = nmp->nm_rsize;
	p = uio->uio_procp;
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nqnfs, full cache consistency is maintained within the loop.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * The mount flag NFSMNT_MYWRITE says "Assume that my writes are
	 * the ones changing the modify time."
	 * NB: This implies that cache data can be read when up to
	 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
	 * attributes this could be forced by setting n_attrstamp to 0 before
	 * the VOP_GETATTR() call.
	 */
	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0 && vp->v_type != VLNK) {
		if (np->n_flag & NMODIFIED) {
			if ((nmp->nm_flag & NFSMNT_MYWRITE) == 0 ||
			     vp->v_type != VREG) {
				if (error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1))
					return (error);
			}
			np->n_attrstamp = 0;
			np->n_direofoffset = 0;
			if (error = VOP_GETATTR(vp, &vattr, cred, p))
				return (error);
			np->n_mtime = vattr.va_mtime.ts_sec;
		} else {
			if (error = VOP_GETATTR(vp, &vattr, cred, p))
				return (error);
			if (np->n_mtime != vattr.va_mtime.ts_sec) {
				np->n_direofoffset = 0;
				if (error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1))
					return (error);
				np->n_mtime = vattr.va_mtime.ts_sec;
			}
		}
	}
	do {

	    /*
	     * Get a valid lease. If cached data is stale, flush it.
	     */
	    if (nmp->nm_flag & NFSMNT_NQNFS) {
		if (NQNFS_CKINVALID(vp, np, NQL_READ)) {
		    do {
			error = nqnfs_getlease(vp, NQL_READ, cred, p);
		    } while (error == NQNFS_EXPIRED);
		    if (error)
			return (error);
		    if (np->n_lrev != np->n_brev ||
			(np->n_flag & NQNFSNONCACHE) ||
			((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
			if (vp->v_type == VDIR) {
			    np->n_direofoffset = 0;
			    cache_purge(vp);
			}
			if (error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1))
			    return (error);
			np->n_brev = np->n_lrev;
		    }
		} else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
		    np->n_direofoffset = 0;
		    cache_purge(vp);
		    if (error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1))
			return (error);
		}
	    }
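	    /*
	     * For a non-cacheable nqnfs lease, bypass the buffer cache
	     * entirely and do the read, readlink or readdir rpc directly.
	     */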
	    if (np->n_flag & NQNFSNONCACHE) {
		switch (vp->v_type) {
		case VREG:
			error = nfs_readrpc(vp, uio, cred);
			break;
		case VLNK:
			error = nfs_readlinkrpc(vp, uio, cred);
			break;
		case VDIR:
			error = nfs_readdirrpc(vp, uio, cred);
			break;
		}
		return (error);
	    }
	    baddr = (caddr_t)0;
	    switch (vp->v_type) {
	    case VREG:
		nfsstats.biocache_reads++;
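		/*
		 * Split the uio offset into a logical block number (lbn)
		 * and an offset within that block (on); bn is lbn scaled
		 * to the DEV_BSIZE units used by the buffer cache. As a
		 * purely illustrative example: with biosize 8192, DEV_BSIZE
		 * 512 and uio_offset 10000, lbn is 1, on is 1808 and bn
		 * is 16.
		 */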
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize-1);
		bn = lbn * (biosize / DEV_BSIZE);
		not_readin = 1;

		/*
		 * Start the read ahead(s), as required.
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
		    lbn == vp->v_lastr + 1) {
		    for (nra = 0; nra < nmp->nm_readahead &&
			(lbn + 1 + nra) * biosize < np->n_size; nra++) {
			rabn = (lbn + 1 + nra) * (biosize / DEV_BSIZE);
			if (!incore(vp, rabn)) {
			    rabp = nfs_getcacheblk(vp, rabn, biosize, p);
			    if (!rabp)
				return (EINTR);
			    if ((rabp->b_flags & (B_DELWRI | B_DONE)) == 0) {
				rabp->b_flags |= (B_READ | B_ASYNC);
				if (nfs_asyncio(rabp, cred)) {
				    rabp->b_flags |= B_INVAL;
				    brelse(rabp);
				}
			    }
			}
		    }
		}

		/*
		 * If the block is in the cache and has the required data
		 * in a valid region, just copy it out.
		 * Otherwise, get the block and write back/read in,
		 * as required.
		 */
		if ((bp = incore(vp, bn)) &&
		    (bp->b_flags & (B_BUSY | B_WRITEINPROG)) ==
		    (B_BUSY | B_WRITEINPROG))
			got_buf = 0;
		else {
again:
			bp = nfs_getcacheblk(vp, bn, biosize, p);
			if (!bp)
				return (EINTR);
			got_buf = 1;
			if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
				bp->b_flags |= B_READ;
				not_readin = 0;
				if (error = nfs_doio(bp, cred, p)) {
				    brelse(bp);
				    return (error);
				}
			}
		}
		n = min((unsigned)(biosize - on), uio->uio_resid);
		diff = np->n_size - uio->uio_offset;
		if (diff < n)
			n = diff;
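		/*
		 * The block may have been found in the cache without being
		 * read here (not_readin), so its valid region might not
		 * cover the request. If it does not, invalidate the block,
		 * pushing any dirty data to the server first, and retry
		 * with a fresh read of the whole block.
		 */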
		if (not_readin && n > 0) {
			if (on < bp->b_validoff || (on + n) > bp->b_validend) {
				if (!got_buf) {
				    bp = nfs_getcacheblk(vp, bn, biosize, p);
				    if (!bp)
					return (EINTR);
				    got_buf = 1;
				}
				bp->b_flags |= B_INVAL;
				if (bp->b_dirtyend > 0) {
				    if ((bp->b_flags & B_DELWRI) == 0)
					panic("nfsbioread");
				    if (VOP_BWRITE(bp) == EINTR)
					return (EINTR);
				} else
				    brelse(bp);
				goto again;
			}
		}
		vp->v_lastr = lbn;
		diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
		if (diff < n)
			n = diff;
		break;
	    case VLNK:
		nfsstats.biocache_readlinks++;
		bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p);
		if (!bp)
			return (EINTR);
		if ((bp->b_flags & B_DONE) == 0) {
			bp->b_flags |= B_READ;
			if (error = nfs_doio(bp, cred, p)) {
				brelse(bp);
				return (error);
			}
		}
		n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
		got_buf = 1;
		on = 0;
		break;
	    case VDIR:
		nfsstats.biocache_readdirs++;
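		/*
		 * For directories, the uio offset is the directory offset
		 * cookie returned by the previous readdir rpc, so it serves
		 * directly as the cache block number.
		 */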
		bn = (daddr_t)uio->uio_offset;
		bp = nfs_getcacheblk(vp, bn, NFS_DIRBLKSIZ, p);
		if (!bp)
			return (EINTR);
		if ((bp->b_flags & B_DONE) == 0) {
			bp->b_flags |= B_READ;
			if (error = nfs_doio(bp, cred, p)) {
				brelse(bp);
				return (error);
			}
		}

		/*
		 * If not eof and read aheads are enabled, start one.
		 * (You need the current block first, so that you have the
		 *  directory offset cookie of the next block.)
		 */
		rabn = bp->b_blkno;
		if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
		    rabn != 0 && rabn != np->n_direofoffset &&
		    !incore(vp, rabn)) {
			rabp = nfs_getcacheblk(vp, rabn, NFS_DIRBLKSIZ, p);
			if (rabp) {
			    if ((rabp->b_flags & (B_DONE | B_DELWRI)) == 0) {
				rabp->b_flags |= (B_READ | B_ASYNC);
				if (nfs_asyncio(rabp, cred)) {
				    rabp->b_flags |= B_INVAL;
				    brelse(rabp);
				}
			    }
			}
		}
		on = 0;
		n = min(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid);
		got_buf = 1;
		break;
	    }

	    if (n > 0) {
		if (!baddr)
			baddr = bp->b_data;
		error = uiomove(baddr + on, (int)n, uio);
	    }
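	    /*
	     * Post-copy housekeeping: mark fully consumed regular file
	     * blocks B_AGE for quick reuse, and for directories advance
	     * the uio offset to the cookie of the next block.
	     */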
	    switch (vp->v_type) {
	    case VREG:
		if (n + on == biosize || uio->uio_offset == np->n_size)
			bp->b_flags |= B_AGE;
		break;
	    case VLNK:
		n = 0;
		break;
	    case VDIR:
		uio->uio_offset = bp->b_blkno;
		break;
	    }
	    if (got_buf)
		brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}

/*
 * Vnode op for write using bio
 */
int
nfs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int  a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	register int biosize;
	register struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	register struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	register struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp;
	daddr_t lbn, bn;
	int n, on, error = 0;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			if (error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1))
				return (error);
		}
		if (ioflag & IO_APPEND) {
			np->n_attrstamp = 0;
			if (error = VOP_GETATTR(vp, &vattr, cred, p))
				return (error);
			uio->uio_offset = np->n_size;
		}
	}
	nmp = VFSTONFS(vp->v_mount);
	if (uio->uio_offset < 0)
		return (EINVAL);
	if (uio->uio_resid == 0)
		return (0);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	if (p && uio->uio_offset + uio->uio_resid >
	      p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		psignal(p, SIGXFSZ);
		return (EFBIG);
	}
	/*
	 * I use nm_rsize, not nm_wsize, so that all buffer cache blocks
	 * will be the same size within a filesystem. nfs_writerpc will
	 * still use nm_wsize when sizing the rpc's.
	 */
	biosize = nmp->nm_rsize;
	do {

		/*
		 * Check for a valid write lease.
		 * If non-cachable, just do the rpc.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, NQL_WRITE)) {
			do {
				error = nqnfs_getlease(vp, NQL_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error)
				return (error);
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				if (error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1))
					return (error);
				np->n_brev = np->n_lrev;
			}
		}
		if (np->n_flag & NQNFSNONCACHE)
			return (nfs_writerpc(vp, uio, cred, ioflag));
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize-1);
		n = min((unsigned)(biosize - on), uio->uio_resid);
		bn = lbn * (biosize / DEV_BSIZE);
again:
		bp = nfs_getcacheblk(vp, bn, biosize, p);
		if (!bp)
			return (EINTR);
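		/*
		 * The first time the block is dirtied, save a reference to
		 * the write credentials in it, so that later write rpc's
		 * for the block can be done with the correct credentials.
		 */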
		if (bp->b_wcred == NOCRED) {
			crhold(cred);
			bp->b_wcred = cred;
		}
		np->n_flag |= NMODIFIED;
		if (uio->uio_offset + n > np->n_size) {
			np->n_size = uio->uio_offset + n;
			vnode_pager_setsize(vp, (u_long)np->n_size);
		}

		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise force a write rpc of the old dirty area.
		 */
		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			bp->b_proc = p;
			if (VOP_BWRITE(bp) == EINTR)
				return (EINTR);
			goto again;
		}

		/*
		 * Check for valid write lease and get one as required.
		 * In case getblk() and/or bwrite() delayed us.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, NQL_WRITE)) {
			do {
				error = nqnfs_getlease(vp, NQL_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error) {
				brelse(bp);
				return (error);
			}
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				brelse(bp);
				if (error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1))
					return (error);
				np->n_brev = np->n_lrev;
				goto again;
			}
		}
		if (error = uiomove((char *)bp->b_data + on, n, uio)) {
			bp->b_flags |= B_ERROR;
			brelse(bp);
			return (error);
		}
		if (bp->b_dirtyend > 0) {
			bp->b_dirtyoff = min(on, bp->b_dirtyoff);
			bp->b_dirtyend = max((on + n), bp->b_dirtyend);
		} else {
			bp->b_dirtyoff = on;
			bp->b_dirtyend = on + n;
		}
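		/*
		 * Extend the valid region of the block to cover the newly
		 * dirtied bytes; if the old valid region is not contiguous
		 * with the dirty one, only the dirty region can be
		 * considered valid.
		 */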
#ifndef notdef
		if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
		    bp->b_validoff > bp->b_dirtyend) {
			bp->b_validoff = bp->b_dirtyoff;
			bp->b_validend = bp->b_dirtyend;
		} else {
			bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
			bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
		}
#else
		bp->b_validoff = bp->b_dirtyoff;
		bp->b_validend = bp->b_dirtyend;
#endif
		if (ioflag & IO_APPEND)
			bp->b_flags |= B_APPENDWRITE;

		/*
		 * If the lease is non-cachable or IO_SYNC do bwrite().
		 */
		if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
			bp->b_proc = p;
			if (error = VOP_BWRITE(bp))
				return (error);
		} else if ((n + on) == biosize &&
			(nmp->nm_flag & NFSMNT_NQNFS) == 0) {
			bp->b_proc = (struct proc *)0;
			bawrite(bp);
		} else
			bdwrite(bp);
	} while (uio->uio_resid > 0 && n > 0);
	return (0);
}

/*
 * Get an nfs cache block.
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 */
struct buf *
nfs_getcacheblk(vp, bn, size, p)
	struct vnode *vp;
	daddr_t bn;
	int size;
	struct proc *p;
{
	register struct buf *bp;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);

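	/*
	 * For an interruptible mount, sleep in getblk() with PCATCH so a
	 * signal wakes us up; if one is posted, give up, otherwise keep
	 * retrying with a two second timeout.
	 */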
	if (nmp->nm_flag & NFSMNT_INT) {
		bp = getblk(vp, bn, size, PCATCH, 0);
		while (bp == (struct buf *)0) {
			if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
				return ((struct buf *)0);
			bp = getblk(vp, bn, size, 0, 2 * hz);
		}
	} else
		bp = getblk(vp, bn, size, 0, 0);
	return (bp);
}

/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int intrflg;
{
	register struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}
	/*
	 * First wait for any other process doing a flush to complete.
	 */
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
			slptimeo);
		if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
			return (EINTR);
	}

	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
	while (error) {
		if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) {
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup((caddr_t)&np->n_flag);
			}
			return (EINTR);
		}
		error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
	}
	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
	return (0);
}

/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 */
int
nfs_asyncio(bp, cred)
	register struct buf *bp;
	struct ucred *cred;
{
	register int i;

	if (nfs_numasync == 0)
		return (EIO);
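	/*
	 * Look for an nfsiod sleeping in nfs_iodwant[]: give the buffer
	 * the appropriate credentials, queue it on nfs_bufq and wake the
	 * daemon up. If all of the daemons are busy, return EIO and let
	 * the caller deal with the buffer itself.
	 */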
	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
	    if (nfs_iodwant[i]) {
		if (bp->b_flags & B_READ) {
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
		} else {
			if (bp->b_wcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_wcred = cred;
			}
		}

		TAILQ_INSERT_TAIL(&nfs_bufq, bp, b_freelist);
		nfs_iodwant[i] = (struct proc *)0;
		wakeup((caddr_t)&nfs_iodwant[i]);
		return (0);
	    }
	return (EIO);
}

/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(bp, cr, p)
	register struct buf *bp;
	struct ucred *cr;
	struct proc *p;
{
	register struct uio *uiop;
	register struct vnode *vp;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, diff, len;
	struct uio uio;
	struct iovec io;

	vp = bp->b_vp;
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_procp = p;

	/*
	 * Historically, paging was done with physio, but no more.
	 */
	if (bp->b_flags & B_PHYS)
	    panic("doio phys");
	if (bp->b_flags & B_READ) {
	    io.iov_len = uiop->uio_resid = bp->b_bcount;
	    io.iov_base = bp->b_data;
	    uiop->uio_rw = UIO_READ;
	    switch (vp->v_type) {
	    case VREG:
		uiop->uio_offset = bp->b_blkno * DEV_BSIZE;
		nfsstats.read_bios++;
		error = nfs_readrpc(vp, uiop, cr);
		if (!error) {
		    bp->b_validoff = 0;
		    if (uiop->uio_resid) {
			/*
			 * If len > 0, there is a hole in the file and
			 * no writes after the hole have been pushed to
			 * the server yet.
			 * Just zero fill the rest of the valid area.
			 */
			diff = bp->b_bcount - uiop->uio_resid;
			len = np->n_size - (bp->b_blkno * DEV_BSIZE
				+ diff);
			if (len > 0) {
			    len = min(len, uiop->uio_resid);
			    bzero((char *)bp->b_data + diff, len);
			    bp->b_validend = diff + len;
			} else
			    bp->b_validend = diff;
		    } else
			bp->b_validend = bp->b_bcount;
		}
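		/*
		 * If this vnode backs a running executable (VTEXT) and the
		 * file has been modified on the server, the in-core text
		 * image is no longer consistent, so the process is killed.
		 */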
		if (p && (vp->v_flag & VTEXT) &&
			(((nmp->nm_flag & NFSMNT_NQNFS) &&
			  np->n_lrev != np->n_brev) ||
			 (!(nmp->nm_flag & NFSMNT_NQNFS) &&
			  np->n_mtime != np->n_vattr.va_mtime.ts_sec))) {
			uprintf("Process killed due to text file modification\n");
			psignal(p, SIGKILL);
			p->p_flag |= P_NOSWAP;
		}
		break;
	    case VLNK:
		uiop->uio_offset = 0;
		nfsstats.readlink_bios++;
		error = nfs_readlinkrpc(vp, uiop, cr);
		break;
	    case VDIR:
		uiop->uio_offset = bp->b_lblkno;
		nfsstats.readdir_bios++;
		if (VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NQNFS)
		    error = nfs_readdirlookrpc(vp, uiop, cr);
		else
		    error = nfs_readdirrpc(vp, uiop, cr);
		/*
		 * Save offset cookie in b_blkno.
		 */
		bp->b_blkno = uiop->uio_offset;
		break;
	    }
	    if (error) {
		bp->b_flags |= B_ERROR;
		bp->b_error = error;
	    }
	} else {
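	    /*
	     * For a write, push just the dirty region of the block to
	     * the server.
	     */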
	    io.iov_len = uiop->uio_resid = bp->b_dirtyend
		- bp->b_dirtyoff;
	    uiop->uio_offset = (bp->b_blkno * DEV_BSIZE)
		+ bp->b_dirtyoff;
	    io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
	    uiop->uio_rw = UIO_WRITE;
	    nfsstats.write_bios++;
	    if (bp->b_flags & B_APPENDWRITE)
		error = nfs_writerpc(vp, uiop, cr, IO_APPEND);
	    else
		error = nfs_writerpc(vp, uiop, cr, 0);
	    bp->b_flags &= ~(B_WRITEINPROG | B_APPENDWRITE);

	    /*
	     * For an interrupted write, the buffer is still valid and the
	     * write hasn't been pushed to the server yet, so we can't set
	     * B_ERROR; instead, report the interruption by setting B_EINTR.
	     * For the B_ASYNC case, B_EINTR is not relevant, so the rpc
	     * attempt is essentially a noop.
	     */
	    if (error == EINTR) {
		bp->b_flags &= ~B_INVAL;
		bp->b_flags |= B_DELWRI;

		/*
		 * Since for the B_ASYNC case, nfs_bwrite() has reassigned the
		 * buffer to the clean list, we have to reassign it back to the
		 * dirty one. Ugh.
		 */
		if (bp->b_flags & B_ASYNC)
		    reassignbuf(bp, vp);
		else
		    bp->b_flags |= B_EINTR;
	    } else {
		if (error) {
		    bp->b_flags |= B_ERROR;
		    bp->b_error = np->n_error = error;
		    np->n_flag |= NWRITEERR;
		}
		bp->b_dirtyoff = bp->b_dirtyend = 0;
	    }
	}
	bp->b_resid = uiop->uio_resid;
	biodone(bp);
	return (error);
}
804