nfs_bio.c revision 18866
1/*
2 * Copyright (c) 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * Rick Macklem at The University of Guelph.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 *    must display the following acknowledgement:
18 *	This product includes software developed by the University of
19 *	California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 *    may be used to endorse or promote products derived from this software
22 *    without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 *	@(#)nfs_bio.c	8.5 (Berkeley) 1/4/94
37 * $Id: nfs_bio.c,v 1.25 1996/09/19 18:20:54 nate Exp $
38 */
39
40#include <sys/param.h>
41#include <sys/systm.h>
42#include <sys/resourcevar.h>
43#include <sys/signalvar.h>
44#include <sys/proc.h>
45#include <sys/buf.h>
46#include <sys/vnode.h>
47#include <sys/mount.h>
48#include <sys/kernel.h>
49#include <sys/sysctl.h>
50
51#include <vm/vm.h>
52#include <vm/vm_param.h>
53#include <vm/vm_extern.h>
54
55#include <nfs/rpcv2.h>
56#include <nfs/nfsproto.h>
57#include <nfs/nfs.h>
58#include <nfs/nfsmount.h>
59#include <nfs/nqnfs.h>
60#include <nfs/nfsnode.h>
61
/*
 * Forward declaration: nfs_getcacheblk() is defined further down in this
 * file and is used by nfs_bioread() and nfs_write() to obtain busy cache
 * buffers.
 */
static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
					struct proc *p));

/*
 * State defined outside this file:
 *	nfs_iodwant[]	scanned by nfs_asyncio(); the first non-null slot is
 *			cleared and its address is used as the wakeup channel
 *			once a buffer has been queued
 *	nfs_numasync	when zero, nfs_asyncio() refuses requests and the
 *			read paths start no read-aheads
 *	nfsstats	statistics counters incremented by the routines below
 */
extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON];
extern int nfs_numasync;
extern struct nfsstats nfsstats;

/*
 * When zero, nfs_asyncio() returns EIO instead of turning an async write
 * into a delayed write when no nfsiod slot is free.  Registered through
 * SYSCTL_INT as "dwrite" under the _vfs_nfs node with CTLFLAG_RW.
 */
int nfs_dwrite = 1;
SYSCTL_INT(_vfs_nfs, OID_AUTO, dwrite, CTLFLAG_RW, &nfs_dwrite, 0, "");

/*
 * Ifdefs for FreeBSD-current's merged VM/buffer cache. It is unfortunate
 * that this isn't done inside getblk() and brelse() so these calls
 * wouldn't need to be here.
 *
 * With B_VMIO defined, vnode_pager_uncache() expands to nothing; without
 * it, the vfs_busy_pages()/vfs_unbusy_pages()/vfs_dirty_pages() calls
 * made in this file expand to nothing instead.
 */
#ifdef B_VMIO
#define vnode_pager_uncache(vp)
#else
#define vfs_busy_pages(bp, f)
#define vfs_unbusy_pages(bp)
#define vfs_dirty_pages(bp)
#endif
84
85/*
86 * Vnode op for read using bio
87 * Any similarity to readip() is purely coincidental
88 */
89int
90nfs_bioread(vp, uio, ioflag, cred)
91	register struct vnode *vp;
92	register struct uio *uio;
93	int ioflag;
94	struct ucred *cred;
95{
96	register struct nfsnode *np = VTONFS(vp);
97	register int biosize, diff, i;
98	struct buf *bp = 0, *rabp;
99	struct vattr vattr;
100	struct proc *p;
101	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
102	daddr_t lbn, rabn;
103	int bufsize;
104	int nra, error = 0, n = 0, on = 0, not_readin;
105
106#ifdef DIAGNOSTIC
107	if (uio->uio_rw != UIO_READ)
108		panic("nfs_read mode");
109#endif
110	if (uio->uio_resid == 0)
111		return (0);
112	if (uio->uio_offset < 0)
113		return (EINVAL);
114	p = uio->uio_procp;
115	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
116		(void)nfs_fsinfo(nmp, vp, cred, p);
117	biosize = vp->v_mount->mnt_stat.f_iosize;
118	/*
119	 * For nfs, cache consistency can only be maintained approximately.
120	 * Although RFC1094 does not specify the criteria, the following is
121	 * believed to be compatible with the reference port.
122	 * For nqnfs, full cache consistency is maintained within the loop.
123	 * For nfs:
124	 * If the file's modify time on the server has changed since the
125	 * last read rpc or you have written to the file,
126	 * you may have lost data cache consistency with the
127	 * server, so flush all of the file's data out of the cache.
128	 * Then force a getattr rpc to ensure that you have up to date
129	 * attributes.
130	 * NB: This implies that cache data can be read when up to
131	 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
132	 * attributes this could be forced by setting n_attrstamp to 0 before
133	 * the VOP_GETATTR() call.
134	 */
135	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
136		if (np->n_flag & NMODIFIED) {
137			if (vp->v_type != VREG) {
138				if (vp->v_type != VDIR)
139					panic("nfs: bioread, not dir");
140				nfs_invaldir(vp);
141				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
142				if (error)
143					return (error);
144			}
145			np->n_attrstamp = 0;
146			error = VOP_GETATTR(vp, &vattr, cred, p);
147			if (error)
148				return (error);
149			np->n_mtime = vattr.va_mtime.tv_sec;
150		} else {
151			error = VOP_GETATTR(vp, &vattr, cred, p);
152			if (error)
153				return (error);
154			if (np->n_mtime != vattr.va_mtime.tv_sec) {
155				if (vp->v_type == VDIR)
156					nfs_invaldir(vp);
157				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
158				if (error)
159					return (error);
160				np->n_mtime = vattr.va_mtime.tv_sec;
161			}
162		}
163	}
164	do {
165
166	    /*
167	     * Get a valid lease. If cached data is stale, flush it.
168	     */
169	    if (nmp->nm_flag & NFSMNT_NQNFS) {
170		if (NQNFS_CKINVALID(vp, np, ND_READ)) {
171		    do {
172			error = nqnfs_getlease(vp, ND_READ, cred, p);
173		    } while (error == NQNFS_EXPIRED);
174		    if (error)
175			return (error);
176		    if (np->n_lrev != np->n_brev ||
177			(np->n_flag & NQNFSNONCACHE) ||
178			((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
179			if (vp->v_type == VDIR)
180			    nfs_invaldir(vp);
181			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
182			if (error)
183			    return (error);
184			np->n_brev = np->n_lrev;
185		    }
186		} else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
187		    nfs_invaldir(vp);
188		    error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
189		    if (error)
190			return (error);
191		}
192	    }
193	    if (np->n_flag & NQNFSNONCACHE) {
194		switch (vp->v_type) {
195		case VREG:
196			return (nfs_readrpc(vp, uio, cred));
197		case VLNK:
198			return (nfs_readlinkrpc(vp, uio, cred));
199		case VDIR:
200			break;
201		default:
202			printf(" NQNFSNONCACHE: type %x unexpected\n",
203				vp->v_type);
204		};
205	    }
206	    switch (vp->v_type) {
207	    case VREG:
208		nfsstats.biocache_reads++;
209		lbn = uio->uio_offset / biosize;
210		on = uio->uio_offset & (biosize - 1);
211		not_readin = 1;
212
213		/*
214		 * Start the read ahead(s), as required.
215		 */
216		if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
217		    for (nra = 0; nra < nmp->nm_readahead &&
218			(off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
219			rabn = lbn + 1 + nra;
220			if (!incore(vp, rabn)) {
221			    rabp = nfs_getcacheblk(vp, rabn, biosize, p);
222			    if (!rabp)
223				return (EINTR);
224			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
225				rabp->b_flags |= (B_READ | B_ASYNC);
226				vfs_busy_pages(rabp, 0);
227				if (nfs_asyncio(rabp, cred)) {
228				    rabp->b_flags |= B_INVAL|B_ERROR;
229				    vfs_unbusy_pages(rabp);
230				    brelse(rabp);
231				}
232			    } else {
233				brelse(rabp);
234			    }
235			}
236		    }
237		}
238
239		/*
240		 * If the block is in the cache and has the required data
241		 * in a valid region, just copy it out.
242		 * Otherwise, get the block and write back/read in,
243		 * as required.
244		 */
245again:
246		bufsize = biosize;
247		if ((off_t)(lbn + 1) * biosize > np->n_size &&
248		    (off_t)(lbn + 1) * biosize - np->n_size < biosize) {
249			bufsize = np->n_size - lbn * biosize;
250			bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
251		}
252		bp = nfs_getcacheblk(vp, lbn, bufsize, p);
253		if (!bp)
254			return (EINTR);
255		if ((bp->b_flags & B_CACHE) == 0) {
256			bp->b_flags |= B_READ;
257			bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
258			not_readin = 0;
259			vfs_busy_pages(bp, 0);
260			error = nfs_doio(bp, cred, p);
261			if (error) {
262			    brelse(bp);
263			    return (error);
264			}
265		}
266		if (bufsize > on) {
267			n = min((unsigned)(bufsize - on), uio->uio_resid);
268		} else {
269			n = 0;
270		}
271		diff = np->n_size - uio->uio_offset;
272		if (diff < n)
273			n = diff;
274		if (not_readin && n > 0) {
275			if (on < bp->b_validoff || (on + n) > bp->b_validend) {
276				bp->b_flags |= B_NOCACHE;
277				if (bp->b_dirtyend > 0) {
278				    if ((bp->b_flags & B_DELWRI) == 0)
279					panic("nfsbioread");
280				    if (VOP_BWRITE(bp) == EINTR)
281					return (EINTR);
282				} else
283				    brelse(bp);
284				goto again;
285			}
286		}
287		vp->v_lastr = lbn;
288		diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
289		if (diff < n)
290			n = diff;
291		break;
292	    case VLNK:
293		nfsstats.biocache_readlinks++;
294		bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p);
295		if (!bp)
296			return (EINTR);
297		if ((bp->b_flags & B_CACHE) == 0) {
298			bp->b_flags |= B_READ;
299			vfs_busy_pages(bp, 0);
300			error = nfs_doio(bp, cred, p);
301			if (error) {
302				bp->b_flags |= B_ERROR;
303				brelse(bp);
304				return (error);
305			}
306		}
307		n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
308		on = 0;
309		break;
310	    case VDIR:
311		nfsstats.biocache_readdirs++;
312		lbn = uio->uio_offset / NFS_DIRBLKSIZ;
313		on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
314		bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p);
315		if (!bp)
316		    return (EINTR);
317		if ((bp->b_flags & B_CACHE) == 0) {
318		    bp->b_flags |= B_READ;
319		    vfs_busy_pages(bp, 0);
320		    error = nfs_doio(bp, cred, p);
321		    if (error) {
322			brelse(bp);
323			while (error == NFSERR_BAD_COOKIE) {
324			    nfs_invaldir(vp);
325			    error = nfs_vinvalbuf(vp, 0, cred, p, 1);
326			    /*
327			     * Yuck! The directory has been modified on the
328			     * server. The only way to get the block is by
329			     * reading from the beginning to get all the
330			     * offset cookies.
331			     */
332			    for (i = 0; i <= lbn && !error; i++) {
333				bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p);
334				if (!bp)
335				    return (EINTR);
336				if ((bp->b_flags & B_DONE) == 0) {
337				    bp->b_flags |= B_READ;
338				    vfs_busy_pages(bp, 0);
339				    error = nfs_doio(bp, cred, p);
340				    if (error)
341					brelse(bp);
342				}
343			    }
344			}
345			if (error)
346			    return (error);
347		    }
348		}
349
350		/*
351		 * If not eof and read aheads are enabled, start one.
352		 * (You need the current block first, so that you have the
353		 *  directory offset cookie of the next block.)
354		 */
355		if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
356		    (np->n_direofoffset == 0 ||
357		    (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
358		    !(np->n_flag & NQNFSNONCACHE) &&
359		    !incore(vp, lbn + 1)) {
360			rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p);
361			if (rabp) {
362			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
363				rabp->b_flags |= (B_READ | B_ASYNC);
364				vfs_busy_pages(rabp, 0);
365				if (nfs_asyncio(rabp, cred)) {
366				    rabp->b_flags |= B_INVAL|B_ERROR;
367				    vfs_unbusy_pages(rabp);
368				    brelse(rabp);
369				}
370			    } else {
371				brelse(rabp);
372			    }
373			}
374		}
375		n = min(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
376		break;
377	    default:
378		printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
379		break;
380	    };
381
382	    if (n > 0) {
383		error = uiomove(bp->b_data + on, (int)n, uio);
384	    }
385	    switch (vp->v_type) {
386	    case VREG:
387		break;
388	    case VLNK:
389		n = 0;
390		break;
391	    case VDIR:
392		if (np->n_flag & NQNFSNONCACHE)
393			bp->b_flags |= B_INVAL;
394		break;
395	    default:
396		printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
397	    }
398 	    brelse(bp);
399	} while (error == 0 && uio->uio_resid > 0 && n > 0);
400	return (error);
401}
402
403/*
404 * Vnode op for write using bio
405 */
406int
407nfs_write(ap)
408	struct vop_write_args /* {
409		struct vnode *a_vp;
410		struct uio *a_uio;
411		int  a_ioflag;
412		struct ucred *a_cred;
413	} */ *ap;
414{
415	register int biosize;
416	register struct uio *uio = ap->a_uio;
417	struct proc *p = uio->uio_procp;
418	register struct vnode *vp = ap->a_vp;
419	struct nfsnode *np = VTONFS(vp);
420	register struct ucred *cred = ap->a_cred;
421	int ioflag = ap->a_ioflag;
422	struct buf *bp;
423	struct vattr vattr;
424	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
425	daddr_t lbn;
426	int bufsize;
427	int n, on, error = 0, iomode, must_commit;
428
429#ifdef DIAGNOSTIC
430	if (uio->uio_rw != UIO_WRITE)
431		panic("nfs_write mode");
432	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
433		panic("nfs_write proc");
434#endif
435	if (vp->v_type != VREG)
436		return (EIO);
437	if (np->n_flag & NWRITEERR) {
438		np->n_flag &= ~NWRITEERR;
439		return (np->n_error);
440	}
441	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
442		(void)nfs_fsinfo(nmp, vp, cred, p);
443	if (ioflag & (IO_APPEND | IO_SYNC)) {
444		if (np->n_flag & NMODIFIED) {
445			np->n_attrstamp = 0;
446			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
447			if (error)
448				return (error);
449		}
450		if (ioflag & IO_APPEND) {
451			np->n_attrstamp = 0;
452			error = VOP_GETATTR(vp, &vattr, cred, p);
453			if (error)
454				return (error);
455			uio->uio_offset = np->n_size;
456		}
457	}
458	if (uio->uio_offset < 0)
459		return (EINVAL);
460	if (uio->uio_resid == 0)
461		return (0);
462	/*
463	 * Maybe this should be above the vnode op call, but so long as
464	 * file servers have no limits, i don't think it matters
465	 */
466	if (p && uio->uio_offset + uio->uio_resid >
467	      p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
468		psignal(p, SIGXFSZ);
469		return (EFBIG);
470	}
471	/*
472	 * I use nm_rsize, not nm_wsize so that all buffer cache blocks
473	 * will be the same size within a filesystem. nfs_writerpc will
474	 * still use nm_wsize when sizing the rpc's.
475	 */
476	biosize = vp->v_mount->mnt_stat.f_iosize;
477	do {
478
479		/*
480		 * XXX make sure we aren't cached in the VM page cache
481		 */
482		/*
483		 * Check for a valid write lease.
484		 */
485		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
486		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
487			do {
488				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
489			} while (error == NQNFS_EXPIRED);
490			if (error)
491				return (error);
492			if (np->n_lrev != np->n_brev ||
493			    (np->n_flag & NQNFSNONCACHE)) {
494				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
495				if (error)
496					return (error);
497				np->n_brev = np->n_lrev;
498			}
499		}
500		if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
501		    iomode = NFSV3WRITE_FILESYNC;
502		    error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
503		    if (must_commit)
504			nfs_clearcommit(vp->v_mount);
505		    return (error);
506		}
507		nfsstats.biocache_writes++;
508		lbn = uio->uio_offset / biosize;
509		on = uio->uio_offset & (biosize-1);
510		n = min((unsigned)(biosize - on), uio->uio_resid);
511again:
512		if (uio->uio_offset + n > np->n_size) {
513			np->n_size = uio->uio_offset + n;
514			vnode_pager_setsize(vp, (u_long)np->n_size);
515		}
516		bufsize = biosize;
517		if ((lbn + 1) * biosize > np->n_size) {
518			bufsize = np->n_size - lbn * biosize;
519			bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
520		}
521		bp = nfs_getcacheblk(vp, lbn, bufsize, p);
522		if (!bp)
523			return (EINTR);
524		if (bp->b_wcred == NOCRED) {
525			crhold(cred);
526			bp->b_wcred = cred;
527		}
528		np->n_flag |= NMODIFIED;
529
530		if ((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend > np->n_size) {
531			bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);
532		}
533
534		/*
535		 * If the new write will leave a contiguous dirty
536		 * area, just update the b_dirtyoff and b_dirtyend,
537		 * otherwise force a write rpc of the old dirty area.
538		 */
539		if (bp->b_dirtyend > 0 &&
540		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
541			bp->b_proc = p;
542			if (VOP_BWRITE(bp) == EINTR)
543				return (EINTR);
544			goto again;
545		}
546
547		/*
548		 * Check for valid write lease and get one as required.
549		 * In case getblk() and/or bwrite() delayed us.
550		 */
551		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
552		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
553			do {
554				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
555			} while (error == NQNFS_EXPIRED);
556			if (error) {
557				brelse(bp);
558				return (error);
559			}
560			if (np->n_lrev != np->n_brev ||
561			    (np->n_flag & NQNFSNONCACHE)) {
562				brelse(bp);
563				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
564				if (error)
565					return (error);
566				np->n_brev = np->n_lrev;
567				goto again;
568			}
569		}
570		error = uiomove((char *)bp->b_data + on, n, uio);
571		if (error) {
572			bp->b_flags |= B_ERROR;
573			brelse(bp);
574			return (error);
575		}
576		if (bp->b_dirtyend > 0) {
577			bp->b_dirtyoff = min(on, bp->b_dirtyoff);
578			bp->b_dirtyend = max((on + n), bp->b_dirtyend);
579		} else {
580			bp->b_dirtyoff = on;
581			bp->b_dirtyend = on + n;
582		}
583		if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
584		    bp->b_validoff > bp->b_dirtyend) {
585			bp->b_validoff = bp->b_dirtyoff;
586			bp->b_validend = bp->b_dirtyend;
587		} else {
588			bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
589			bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
590		}
591
592		/*
593		 * Since this block is being modified, it must be written
594		 * again and not just committed.
595		 */
596		bp->b_flags &= ~B_NEEDCOMMIT;
597
598		/*
599		 * If the lease is non-cachable or IO_SYNC do bwrite().
600		 */
601		if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
602			bp->b_proc = p;
603			error = VOP_BWRITE(bp);
604			if (error)
605				return (error);
606			if (np->n_flag & NQNFSNONCACHE) {
607				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
608				if (error)
609					return (error);
610			}
611		} else if ((n + on) == biosize &&
612			(nmp->nm_flag & NFSMNT_NQNFS) == 0) {
613			bp->b_proc = (struct proc *)0;
614			bp->b_flags |= B_ASYNC;
615			(void)nfs_writebp(bp, 0);
616		} else
617			bdwrite(bp);
618	} while (uio->uio_resid > 0 && n > 0);
619	return (0);
620}
621
622/*
623 * Get an nfs cache block.
624 * Allocate a new one if the block isn't currently in the cache
625 * and return the block marked busy. If the calling process is
626 * interrupted by a signal for an interruptible mount point, return
627 * NULL.
628 */
629static struct buf *
630nfs_getcacheblk(vp, bn, size, p)
631	struct vnode *vp;
632	daddr_t bn;
633	int size;
634	struct proc *p;
635{
636	register struct buf *bp;
637	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
638	int biosize = vp->v_mount->mnt_stat.f_iosize;
639
640	if (nmp->nm_flag & NFSMNT_INT) {
641		bp = getblk(vp, bn, size, PCATCH, 0);
642		while (bp == (struct buf *)0) {
643			if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
644				return ((struct buf *)0);
645			bp = getblk(vp, bn, size, 0, 2 * hz);
646		}
647	} else
648		bp = getblk(vp, bn, size, 0, 0);
649
650	if( vp->v_type == VREG)
651		bp->b_blkno = (bn * biosize) / DEV_BSIZE;
652
653	return (bp);
654}
655
656/*
657 * Flush and invalidate all dirty buffers. If another process is already
658 * doing the flush, just wait for completion.
659 */
660int
661nfs_vinvalbuf(vp, flags, cred, p, intrflg)
662	struct vnode *vp;
663	int flags;
664	struct ucred *cred;
665	struct proc *p;
666	int intrflg;
667{
668	register struct nfsnode *np = VTONFS(vp);
669	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
670	int error = 0, slpflag, slptimeo;
671
672	if ((nmp->nm_flag & NFSMNT_INT) == 0)
673		intrflg = 0;
674	if (intrflg) {
675		slpflag = PCATCH;
676		slptimeo = 2 * hz;
677	} else {
678		slpflag = 0;
679		slptimeo = 0;
680	}
681	/*
682	 * First wait for any other process doing a flush to complete.
683	 */
684	while (np->n_flag & NFLUSHINPROG) {
685		np->n_flag |= NFLUSHWANT;
686		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
687			slptimeo);
688		if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
689			return (EINTR);
690	}
691
692	/*
693	 * Now, flush as required.
694	 */
695	np->n_flag |= NFLUSHINPROG;
696	error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
697	while (error) {
698		if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) {
699			np->n_flag &= ~NFLUSHINPROG;
700			if (np->n_flag & NFLUSHWANT) {
701				np->n_flag &= ~NFLUSHWANT;
702				wakeup((caddr_t)&np->n_flag);
703			}
704			return (EINTR);
705		}
706		error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
707	}
708	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
709	if (np->n_flag & NFLUSHWANT) {
710		np->n_flag &= ~NFLUSHWANT;
711		wakeup((caddr_t)&np->n_flag);
712	}
713	return (0);
714}
715
716/*
717 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
718 * This is mainly to avoid queueing async I/O requests when the nfsiods
719 * are all hung on a dead server.
720 */
721int
722nfs_asyncio(bp, cred)
723	register struct buf *bp;
724	struct ucred *cred;
725{
726	register int i;
727
728	if (nfs_numasync == 0)
729		return (EIO);
730	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
731	    if (nfs_iodwant[i]) {
732		if (bp->b_flags & B_READ) {
733			if (bp->b_rcred == NOCRED && cred != NOCRED) {
734				crhold(cred);
735				bp->b_rcred = cred;
736			}
737		} else {
738			bp->b_flags |= B_WRITEINPROG;
739			if (bp->b_wcred == NOCRED && cred != NOCRED) {
740				crhold(cred);
741				bp->b_wcred = cred;
742			}
743		}
744
745		TAILQ_INSERT_TAIL(&nfs_bufq, bp, b_freelist);
746		nfs_iodwant[i] = (struct proc *)0;
747		wakeup((caddr_t)&nfs_iodwant[i]);
748		return (0);
749	    }
750
751	/*
752	 * If it is a read or a write already marked B_WRITEINPROG or B_NOCACHE
753	 * return EIO so the process will call nfs_doio() and do it
754	 * synchronously.
755	 */
756	if (bp->b_flags & (B_READ | B_WRITEINPROG | B_NOCACHE))
757		return (EIO);
758
759	/*
760	 * Allow the administrator to override the choice of using a delayed
761	 * write since it is a pessimization for some servers, notably some
762	 * Solaris servers.
763	 */
764	if (!nfs_dwrite)
765		return (EIO);
766
767	/*
768	 * Just turn the async write into a delayed write, instead of
769	 * doing in synchronously. Hopefully, at least one of the nfsiods
770	 * is currently doing a write for this file and will pick up the
771	 * delayed writes before going back to sleep.
772	 */
773	bp->b_flags |= B_DELWRI;
774	reassignbuf(bp, bp->b_vp);
775	biodone(bp);
776	return (0);
777}
778
779/*
780 * Do an I/O operation to/from a cache block. This may be called
781 * synchronously or from an nfsiod.
782 */
783int
784nfs_doio(bp, cr, p)
785	register struct buf *bp;
786	struct ucred *cr;
787	struct proc *p;
788{
789	register struct uio *uiop;
790	register struct vnode *vp;
791	struct nfsnode *np;
792	struct nfsmount *nmp;
793	int error = 0, diff, len, iomode, must_commit = 0;
794	struct uio uio;
795	struct iovec io;
796
797	vp = bp->b_vp;
798	np = VTONFS(vp);
799	nmp = VFSTONFS(vp->v_mount);
800	uiop = &uio;
801	uiop->uio_iov = &io;
802	uiop->uio_iovcnt = 1;
803	uiop->uio_segflg = UIO_SYSSPACE;
804	uiop->uio_procp = p;
805
806	/*
807	 * Historically, paging was done with physio, but no more.
808	 */
809	if (bp->b_flags & B_PHYS) {
810	    /*
811	     * ...though reading /dev/drum still gets us here.
812	     */
813	    io.iov_len = uiop->uio_resid = bp->b_bcount;
814	    /* mapping was done by vmapbuf() */
815	    io.iov_base = bp->b_data;
816	    uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
817	    if (bp->b_flags & B_READ) {
818		uiop->uio_rw = UIO_READ;
819		nfsstats.read_physios++;
820		error = nfs_readrpc(vp, uiop, cr);
821	    } else {
822		int com;
823
824		iomode = NFSV3WRITE_DATASYNC;
825		uiop->uio_rw = UIO_WRITE;
826		nfsstats.write_physios++;
827		error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
828	    }
829	    if (error) {
830		bp->b_flags |= B_ERROR;
831		bp->b_error = error;
832	    }
833	} else if (bp->b_flags & B_READ) {
834	    io.iov_len = uiop->uio_resid = bp->b_bcount;
835	    io.iov_base = bp->b_data;
836	    uiop->uio_rw = UIO_READ;
837	    switch (vp->v_type) {
838	    case VREG:
839		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
840		nfsstats.read_bios++;
841		error = nfs_readrpc(vp, uiop, cr);
842		if (!error) {
843		    bp->b_validoff = 0;
844		    if (uiop->uio_resid) {
845			/*
846			 * If len > 0, there is a hole in the file and
847			 * no writes after the hole have been pushed to
848			 * the server yet.
849			 * Just zero fill the rest of the valid area.
850			 */
851			diff = bp->b_bcount - uiop->uio_resid;
852			len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE
853				+ diff);
854			if (len > 0) {
855			    len = min(len, uiop->uio_resid);
856			    bzero((char *)bp->b_data + diff, len);
857			    bp->b_validend = diff + len;
858			} else
859			    bp->b_validend = diff;
860		    } else
861			bp->b_validend = bp->b_bcount;
862		}
863		if (p && (vp->v_flag & VTEXT) &&
864			(((nmp->nm_flag & NFSMNT_NQNFS) &&
865			  NQNFS_CKINVALID(vp, np, ND_READ) &&
866			  np->n_lrev != np->n_brev) ||
867			 (!(nmp->nm_flag & NFSMNT_NQNFS) &&
868			  np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
869			uprintf("Process killed due to text file modification\n");
870			psignal(p, SIGKILL);
871#ifdef __NetBSD__
872			p->p_holdcnt++;
873#else
874			p->p_flag |= P_NOSWAP;
875#endif
876		}
877		break;
878	    case VLNK:
879		uiop->uio_offset = (off_t)0;
880		nfsstats.readlink_bios++;
881		error = nfs_readlinkrpc(vp, uiop, cr);
882		break;
883	    case VDIR:
884		nfsstats.readdir_bios++;
885		uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
886		if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
887			error = nfs_readdirplusrpc(vp, uiop, cr);
888			if (error == NFSERR_NOTSUPP)
889				nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
890		}
891		if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
892			error = nfs_readdirrpc(vp, uiop, cr);
893		break;
894	    default:
895		printf("nfs_doio:  type %x unexpected\n",vp->v_type);
896		break;
897	    };
898	    if (error) {
899		bp->b_flags |= B_ERROR;
900		bp->b_error = error;
901	    }
902	} else {
903	    if (((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend) > np->n_size)
904		bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);
905
906	    if (bp->b_dirtyend > bp->b_dirtyoff) {
907		io.iov_len = uiop->uio_resid = bp->b_dirtyend
908		    - bp->b_dirtyoff;
909		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE
910		    + bp->b_dirtyoff;
911		io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
912		uiop->uio_rw = UIO_WRITE;
913		nfsstats.write_bios++;
914		if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) == B_ASYNC)
915		    iomode = NFSV3WRITE_UNSTABLE;
916		else
917		    iomode = NFSV3WRITE_FILESYNC;
918		bp->b_flags |= B_WRITEINPROG;
919		error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
920		if (!error && iomode == NFSV3WRITE_UNSTABLE)
921		    bp->b_flags |= B_NEEDCOMMIT;
922		else
923		    bp->b_flags &= ~B_NEEDCOMMIT;
924		bp->b_flags &= ~B_WRITEINPROG;
925
926		/*
927		 * For an interrupted write, the buffer is still valid
928		 * and the write hasn't been pushed to the server yet,
929		 * so we can't set B_ERROR and report the interruption
930		 * by setting B_EINTR. For the B_ASYNC case, B_EINTR
931		 * is not relevant, so the rpc attempt is essentially
932		 * a noop.  For the case of a V3 write rpc not being
933		 * committed to stable storage, the block is still
934		 * dirty and requires either a commit rpc or another
935		 * write rpc with iomode == NFSV3WRITE_FILESYNC before
936		 * the block is reused. This is indicated by setting
937		 * the B_DELWRI and B_NEEDCOMMIT flags.
938		 */
939    		if (error == EINTR
940		    || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
941			bp->b_flags &= ~(B_INVAL|B_NOCACHE);
942			bp->b_flags |= B_DELWRI;
943
944		/*
945		 * Since for the B_ASYNC case, nfs_bwrite() has reassigned the
946		 * buffer to the clean list, we have to reassign it back to the
947		 * dirty one. Ugh.
948		 */
949			if (bp->b_flags & B_ASYNC)
950				reassignbuf(bp, vp);
951			else
952				bp->b_flags |= B_EINTR;
953	    	} else {
954			if (error) {
955				bp->b_flags |= B_ERROR;
956				bp->b_error = np->n_error = error;
957				np->n_flag |= NWRITEERR;
958			}
959			bp->b_dirtyoff = bp->b_dirtyend = 0;
960		}
961	    } else {
962		bp->b_resid = 0;
963		biodone(bp);
964		return (0);
965	    }
966	}
967	bp->b_resid = uiop->uio_resid;
968	if (must_commit)
969		nfs_clearcommit(vp->v_mount);
970	biodone(bp);
971	return (error);
972}
973