sys/nfsclient/nfs_bio.c

1541Srgrimes/*
1541Srgrimes * Copyright (c) 1989, 1993
1541Srgrimes *	The Regents of the University of California.  All rights reserved.
1541Srgrimes *
1541Srgrimes * This code is derived from software contributed to Berkeley by
1541Srgrimes * Rick Macklem at The University of Guelph.
1541Srgrimes *
1541Srgrimes * Redistribution and use in source and binary forms, with or without
1541Srgrimes * modification, are permitted provided that the following conditions
1541Srgrimes * are met:
1541Srgrimes * 1. Redistributions of source code must retain the above copyright
1541Srgrimes *    notice, this list of conditions and the following disclaimer.
1541Srgrimes * 2. Redistributions in binary form must reproduce the above copyright
1541Srgrimes *    notice, this list of conditions and the following disclaimer in the
1541Srgrimes *    documentation and/or other materials provided with the distribution.
1541Srgrimes * 3. All advertising materials mentioning features or use of this software
1541Srgrimes *    must display the following acknowledgement:
1541Srgrimes *	This product includes software developed by the University of
1541Srgrimes *	California, Berkeley and its contributors.
1541Srgrimes * 4. Neither the name of the University nor the names of its contributors
1541Srgrimes *    may be used to endorse or promote products derived from this software
1541Srgrimes *    without specific prior written permission.
1541Srgrimes *
1541Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
1541Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
1541Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
1541Srgrimes * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
1541Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
1541Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
1541Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
1541Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
1541Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
1541Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50477Speter * SUCH DAMAGE.
1541Srgrimes *
1541Srgrimes *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
106369Srwatson * $FreeBSD: head/sys/nfsclient/nfs_bio.c 83629 2001-09-18 18:37:52Z imp $
106369Srwatson */
1541Srgrimes
48274Speter
76166Smarkm#include <sys/param.h>
76166Smarkm#include <sys/systm.h>
12221Sbde#include <sys/bio.h>
1541Srgrimes#include <sys/buf.h>
3308Sphk#include <sys/kernel.h>
1541Srgrimes#include <sys/mount.h>
106369Srwatson#include <sys/proc.h>
1541Srgrimes#include <sys/resourcevar.h>
25583Speter#include <sys/signalvar.h>
1541Srgrimes#include <sys/vmmeter.h>
25656Speter#include <sys/vnode.h>
58377Sphk
1541Srgrimes#include <vm/vm.h>
76166Smarkm#include <vm/vm_extern.h>
26335Speter#include <vm/vm_page.h>
26335Speter#include <vm/vm_object.h>
1541Srgrimes#include <vm/vm_pager.h>
110299Sphk#include <vm/vnode_pager.h>
110299Sphk
9369Sdg#include <nfs/rpcv2.h>
8876Srgrimes#include <nfs/nfsproto.h>
1541Srgrimes#include <nfs/nfs.h>
1541Srgrimes#include <nfs/nfsmount.h>
1541Srgrimes#include <nfs/nqnfs.h>
1541Srgrimes#include <nfs/nfsnode.h>
1541Srgrimes
1541Srgrimes/*
1541Srgrimes * Just call nfs_writebp() with the force argument set to 1.
1541Srgrimes *
1541Srgrimes * NOTE: B_DONE may or may not be set in a_bp on call.
92723Salfred */
92723Salfredstatic int
94343Sjhbnfs_bwrite(struct buf *bp)
92723Salfred{
92723Salfred	return (nfs_writebp(bp, 1, curthread));
13016Sbde}
30739Sphk
30739Sphkstruct buf_ops buf_ops_nfs = {
30739Sphk	"buf_ops_nfs",
30739Sphk	nfs_bwrite
30739Sphk};
30739Sphk
92723Salfred
30739Sphkstatic struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
25583Speter					struct thread *td));
102074Sphk
25583Speterextern int nfs_numasync;
45433Snsayerextern int nfs_pbuf_freecnt;
45438Snsayerextern struct nfsstats nfsstats;
33690Sphk
25583Speter/*
25583Speter * Vnode op for VM getpages.
25656Speter */
33818Sbdeint
35029Sphknfs_getpages(ap)
35029Sphk	struct vop_getpages_args /* {
25583Speter		struct vnode *a_vp;
25583Speter		vm_page_t *a_m;
33818Sbde		int a_count;
45433Snsayer		int a_reqpage;
45433Snsayer		vm_ooffset_t a_offset;
45433Snsayer	} */ *ap;
45433Snsayer{
45438Snsayer	int i, error, nextoff, size, toff, count, npages;
45438Snsayer	struct uio uio;
45438Snsayer	struct iovec iov;
45438Snsayer	vm_offset_t kva;
25583Speter	struct buf *bp;
94343Sjhb	struct vnode *vp;
45433Snsayer	struct thread *td;
45437Smjacob	struct ucred *cred;
45438Snsayer	struct nfsmount *nmp;
45437Smjacob	vm_page_t *pages;
45437Smjacob
45437Smjacob	GIANT_REQUIRED;
45437Smjacob
45437Smjacob	vp = ap->a_vp;
45437Smjacob	td = curthread;				/* XXX */
45437Smjacob	cred = curthread->td_proc->p_ucred;		/* XXX */
45433Snsayer	nmp = VFSTONFS(vp->v_mount);
45433Snsayer	pages = ap->a_m;
45437Smjacob	count = ap->a_count;
45438Snsayer
45438Snsayer	if (vp->v_object == NULL) {
45438Snsayer		printf("nfs_getpages: called with non-merged cache vnode??\n");
45438Snsayer		return VM_PAGER_ERROR;
45438Snsayer	}
45438Snsayer
45438Snsayer	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
45438Snsayer	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
45438Snsayer		(void)nfs_fsinfo(nmp, vp, cred, td);
45433Snsayer	}
33818Sbde
33818Sbde	npages = btoc(count);
33690Sphk
33690Sphk	/*
94343Sjhb	 * If the requested page is partially valid, just return it and
58377Sphk	 * allow the pager to zero-out the blanks.  Partially valid pages
25583Speter	 * can only occur at the file EOF.
25583Speter	 */
25583Speter
25583Speter	{
94343Sjhb		vm_page_t m = pages[ap->a_reqpage];
25583Speter
25583Speter		if (m->valid != 0) {
25583Speter			/* handled by vm_fault now	  */
12221Sbde			/* vm_page_zero_invalid(m, TRUE); */
25583Speter			for (i = 0; i < npages; ++i) {
25583Speter				if (i != ap->a_reqpage)
25583Speter					vm_page_free(pages[i]);
25583Speter			}
25583Speter			return(0);
25656Speter		}
82746Sdillon	}
82746Sdillon
82746Sdillon	/*
25583Speter	 * We use only the kva address for the buffer, but this is extremely
25583Speter	 * convienient and fast.
102074Sphk	 */
25583Speter	bp = getpbuf(&nfs_pbuf_freecnt);
25583Speter
25583Speter	kva = (vm_offset_t) bp->b_data;
107849Salfred	pmap_qenter(kva, pages, npages);
25583Speter	cnt.v_vnodein++;
33690Sphk	cnt.v_vnodepgsin += npages;
107849Salfred
25583Speter	iov.iov_base = (caddr_t) kva;
25583Speter	iov.iov_len = count;
25583Speter	uio.uio_iov = &iov;
25583Speter	uio.uio_iovcnt = 1;
25583Speter	uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
25583Speter	uio.uio_resid = count;
25583Speter	uio.uio_segflg = UIO_SYSSPACE;
25583Speter	uio.uio_rw = UIO_READ;
25656Speter	uio.uio_td = td;
82746Sdillon
82746Sdillon	error = nfs_readrpc(vp, &uio, cred);
82746Sdillon	pmap_qremove(kva, npages);
25583Speter
25583Speter	relpbuf(bp, &nfs_pbuf_freecnt);
102074Sphk
25583Speter	if (error && (uio.uio_resid == count)) {
25583Speter		printf("nfs_getpages: error %d\n", error);
25583Speter		for (i = 0; i < npages; ++i) {
25583Speter			if (i != ap->a_reqpage)
25583Speter				vm_page_free(pages[i]);
106369Srwatson		}
106369Srwatson		return VM_PAGER_ERROR;
106369Srwatson	}
106369Srwatson
106369Srwatson	/*
93593Sjhb	 * Calculate the number of bytes read and validate only that number
94343Sjhb	 * of bytes.  Note that due to pending writes, size may be 0.  This
107849Salfred	 * does not mean that the remaining data is invalid!
94343Sjhb	 */
107849Salfred
94343Sjhb	size = count - uio.uio_resid;
94343Sjhb
94343Sjhb	for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
34901Sphk		vm_page_t m;
25583Speter		nextoff = toff + PAGE_SIZE;
94343Sjhb		m = pages[i];
82746Sdillon
25583Speter		m->flags &= ~PG_ZERO;
25583Speter
25583Speter		if (nextoff <= size) {
25583Speter			/*
25583Speter			 * Read operation filled an entire page
25583Speter			 */
25583Speter			m->valid = VM_PAGE_BITS_ALL;
25583Speter			vm_page_undirty(m);
25656Speter		} else if (size > toff) {
25583Speter			/*
102074Sphk			 * Read operation filled a partial page.
25583Speter			 */
25583Speter			m->valid = 0;
25656Speter			vm_page_set_validclean(m, 0, size - toff);
25583Speter			/* handled by vm_fault now	  */
107849Salfred			/* vm_page_zero_invalid(m, TRUE); */
25583Speter		}
25656Speter
107849Salfred		if (i != ap->a_reqpage) {
25583Speter			/*
103964Sbde			 * Whether or not to leave the page activated is up in
103964Sbde			 * the air, but we should put the page on a page queue
103964Sbde			 * somewhere (it already is in the object).  Result:
103964Sbde			 * It appears that emperical results show that
103964Sbde			 * deactivating pages is best.
103964Sbde			 */
107849Salfred
25583Speter			/*
25656Speter			 * Just in case someone was asking for this page we
25583Speter			 * now tell them that it is ok to use.
25583Speter			 */
26335Speter			if (!error) {
25656Speter				if (m->flags & PG_WANTED)
26335Speter					vm_page_activate(m);
102074Sphk				else
25583Speter					vm_page_deactivate(m);
35045Sphk				vm_page_wakeup(m);
35042Sphk			} else {
35042Sphk				vm_page_free(m);
25583Speter			}
28773Sbde		}
25656Speter	}
43301Sdillon	return 0;
28773Sbde}
36119Sphk
35029Sphk/*
35042Sphk * Vnode op for VM putpages.
35042Sphk */
35042Sphkint
35042Sphknfs_putpages(ap)
36119Sphk	struct vop_putpages_args /* {
35042Sphk		struct vnode *a_vp;
35042Sphk		vm_page_t *a_m;
35042Sphk		int a_count;
35042Sphk		int a_sync;
35042Sphk		int *a_rtvals;
35042Sphk		vm_ooffset_t a_offset;
35042Sphk	} */ *ap;
35042Sphk{
35042Sphk	struct uio uio;
35042Sphk	struct iovec iov;
35042Sphk	vm_offset_t kva;
35029Sphk	struct buf *bp;
35042Sphk	int iomode, must_commit, i, error, npages, count;
35045Sphk	off_t offset;
35045Sphk	int *rtvals;
35045Sphk	struct vnode *vp;
26335Speter	struct thread *td;
26335Speter	struct ucred *cred;
25583Speter	struct nfsmount *nmp;
26335Speter	struct nfsnode *np;
26335Speter	vm_page_t *pages;
26335Speter
26335Speter	GIANT_REQUIRED;
26335Speter
26335Speter	vp = ap->a_vp;
26335Speter	np = VTONFS(vp);
82746Sdillon	td = curthread;				/* XXX */
82746Sdillon	cred = curthread->td_proc->p_ucred;		/* XXX */
82746Sdillon	nmp = VFSTONFS(vp->v_mount);
26335Speter	pages = ap->a_m;
26335Speter	count = ap->a_count;
102074Sphk	rtvals = ap->a_rtvals;
26335Speter	npages = btoc(count);
26335Speter	offset = IDX_TO_OFF(pages[0]->pindex);
82746Sdillon
26335Speter	GIANT_REQUIRED;
107849Salfred
26335Speter	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
26335Speter	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
82746Sdillon		(void)nfs_fsinfo(nmp, vp, cred, td);
109521Salfred	}
109521Salfred
109521Salfred	for (i = 0; i < npages; i++) {
83366Sjulian		rtvals[i] = VM_PAGER_AGAIN;
107849Salfred	}
82746Sdillon
82746Sdillon	/*
107849Salfred	 * When putting pages, do not extend file past EOF.
109521Salfred	 */
82746Sdillon
25583Speter	if (offset + count > np->n_size) {
25656Speter		count = np->n_size - offset;
25583Speter		if (count < 0)
25583Speter			count = 0;
26335Speter	}
1541Srgrimes
1541Srgrimes	/*
1541Srgrimes	 * We use only the kva address for the buffer, but this is extremely
1541Srgrimes	 * convienient and fast.
12221Sbde	 */
82746Sdillon	bp = getpbuf(&nfs_pbuf_freecnt);
82746Sdillon
82746Sdillon	kva = (vm_offset_t) bp->b_data;
1541Srgrimes	pmap_qenter(kva, pages, npages);
1549Srgrimes	cnt.v_vnodeout++;
102074Sphk	cnt.v_vnodepgsout += count;
1541Srgrimes
1541Srgrimes	iov.iov_base = (caddr_t) kva;
110286Stjr	iov.iov_len = count;
1541Srgrimes	uio.uio_iov = &iov;
1541Srgrimes	uio.uio_iovcnt = 1;
1541Srgrimes	uio.uio_offset = offset;
1541Srgrimes	uio.uio_resid = count;
99012Salfred	uio.uio_segflg = UIO_SYSSPACE;
1541Srgrimes	uio.uio_rw = UIO_WRITE;
90836Sphk	uio.uio_td = td;
110299Sphk
110299Sphk	if ((ap->a_sync & VM_PAGER_PUT_SYNC) == 0)
110286Stjr	    iomode = NFSV3WRITE_UNSTABLE;
82746Sdillon	else
1541Srgrimes	    iomode = NFSV3WRITE_FILESYNC;
1541Srgrimes
1541Srgrimes	error = nfs_writerpc(vp, &uio, cred, &iomode, &must_commit);
12221Sbde
1541Srgrimes	pmap_qremove(kva, npages);
1541Srgrimes	relpbuf(bp, &nfs_pbuf_freecnt);
1541Srgrimes
1541Srgrimes	if (!error) {
12221Sbde		int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE;
82746Sdillon		for (i = 0; i < nwritten; i++) {
82746Sdillon			rtvals[i] = VM_PAGER_OK;
82746Sdillon			vm_page_undirty(pages[i]);
1541Srgrimes		}
1549Srgrimes		if (must_commit) {
102074Sphk			nfs_clearcommit(vp->v_mount);
1541Srgrimes		}
25656Speter	}
1541Srgrimes	return rtvals[0];
82746Sdillon}
1541Srgrimes
106369Srwatson/*
106369Srwatson * Vnode op for read using bio
106369Srwatson */
106369Srwatsonint
106369Srwatsonnfs_bioread(vp, uio, ioflag, cred)
93593Sjhb	register struct vnode *vp;
94343Sjhb	register struct uio *uio;
1541Srgrimes	int ioflag;
25656Speter	struct ucred *cred;
99012Salfred{
94343Sjhb	register struct nfsnode *np = VTONFS(vp);
94343Sjhb	register int biosize, i;
94343Sjhb	struct buf *bp = 0, *rabp;
25656Speter	struct vattr vattr;
1541Srgrimes	struct thread *td;
99012Salfred	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
94343Sjhb	daddr_t lbn, rabn;
94343Sjhb	int bcount;
94343Sjhb	int seqcount;
94343Sjhb	int nra, error = 0, n = 0, on = 0;
94343Sjhb
110299Sphk#ifdef DIAGNOSTIC
110299Sphk	if (uio->uio_rw != UIO_READ)
82746Sdillon		panic("nfs_read mode");
82746Sdillon#endif
1541Srgrimes	if (uio->uio_resid == 0)
82746Sdillon		return (0);
1541Srgrimes	if (uio->uio_offset < 0)	/* XXX VDIR cookies can be negative */
1541Srgrimes		return (EINVAL);
1541Srgrimes	td = uio->uio_td;
1541Srgrimes
1541Srgrimes	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
1541Srgrimes	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
1541Srgrimes		(void)nfs_fsinfo(nmp, vp, cred, td);
1541Srgrimes	if (vp->v_type != VDIR &&
1541Srgrimes	    (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
1541Srgrimes		return (EFBIG);
1541Srgrimes	biosize = vp->v_mount->mnt_stat.f_iosize;
1541Srgrimes	seqcount = (int)((off_t)(ioflag >> 16) * biosize / BKVASIZE);
1541Srgrimes	/*
1541Srgrimes	 * For nfs, cache consistency can only be maintained approximately.
1541Srgrimes	 * Although RFC1094 does not specify the criteria, the following is
1541Srgrimes	 * believed to be compatible with the reference port.
1541Srgrimes	 * For nqnfs, full cache consistency is maintained within the loop.
1541Srgrimes	 * For nfs:
1541Srgrimes	 * If the file's modify time on the server has changed since the
1541Srgrimes	 * last read rpc or you have written to the file,
12221Sbde	 * you may have lost data cache consistency with the
1541Srgrimes	 * server, so flush all of the file's data out of the cache.
1541Srgrimes	 * Then force a getattr rpc to ensure that you have up to date
1541Srgrimes	 * attributes.
1541Srgrimes	 * NB: This implies that cache data can be read when up to
12221Sbde	 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
82746Sdillon	 * attributes this could be forced by setting n_attrstamp to 0 before
82746Sdillon	 * the VOP_GETATTR() call.
82746Sdillon	 */
1549Srgrimes	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
102074Sphk		if (np->n_flag & NMODIFIED) {
1541Srgrimes			if (vp->v_type != VREG) {
83366Sjulian				if (vp->v_type != VDIR)
34961Sphk					panic("nfs: bioread, not dir");
1541Srgrimes				nfs_invaldir(vp);
1541Srgrimes				error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1);
1541Srgrimes				if (error)
1541Srgrimes					return (error);
82746Sdillon			}
1541Srgrimes			np->n_attrstamp = 0;
1541Srgrimes			error = VOP_GETATTR(vp, &vattr, cred, td);
36128Sbde			if (error)
1541Srgrimes				return (error);
1541Srgrimes			np->n_mtime = vattr.va_mtime.tv_sec;
1541Srgrimes		} else {
1541Srgrimes			error = VOP_GETATTR(vp, &vattr, cred, td);
111034Stjr			if (error)
1541Srgrimes				return (error);
111034Stjr			if (np->n_mtime != vattr.va_mtime.tv_sec) {
35058Sphk				if (vp->v_type == VDIR)
36119Sphk					nfs_invaldir(vp);
35058Sphk				error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1);
35058Sphk				if (error)
1541Srgrimes					return (error);
34961Sphk				np->n_mtime = vattr.va_mtime.tv_sec;
34961Sphk			}
82746Sdillon		}
111034Stjr	}
1541Srgrimes	do {
111034Stjr
82746Sdillon	    /*
110286Stjr	     * Get a valid lease. If cached data is stale, flush it.
1541Srgrimes	     */
1541Srgrimes	    if (nmp->nm_flag & NFSMNT_NQNFS) {
12221Sbde		if (NQNFS_CKINVALID(vp, np, ND_READ)) {
1541Srgrimes		    do {
1541Srgrimes			error = nqnfs_getlease(vp, ND_READ, cred, td);
1541Srgrimes		    } while (error == NQNFS_EXPIRED);
1541Srgrimes		    if (error)
12221Sbde			return (error);
82746Sdillon		    if (np->n_lrev != np->n_brev ||
82746Sdillon			(np->n_flag & NQNFSNONCACHE) ||
82746Sdillon			((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
1549Srgrimes			if (vp->v_type == VDIR)
102074Sphk			    nfs_invaldir(vp);
1541Srgrimes			error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1);
83366Sjulian			if (error)
111034Stjr			    return (error);
34961Sphk			np->n_brev = np->n_lrev;
111034Stjr		    }
1541Srgrimes		} else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
111034Stjr		    nfs_invaldir(vp);
111034Stjr		    error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1);
111034Stjr		    if (error)
111034Stjr			return (error);
111034Stjr		}
1541Srgrimes	    }
1541Srgrimes	    if (np->n_flag & NQNFSNONCACHE) {
111034Stjr		switch (vp->v_type) {
1541Srgrimes		case VREG:
111034Stjr			return (nfs_readrpc(vp, uio, cred));
111034Stjr		case VLNK:
111034Stjr			return (nfs_readlinkrpc(vp, uio, cred));
111034Stjr		case VDIR:
111034Stjr			break;
111034Stjr		default:
82746Sdillon			printf(" NQNFSNONCACHE: type %x unexpected\n",
1541Srgrimes				vp->v_type);
111034Stjr		};
35058Sphk	    }
69286Sjake	    switch (vp->v_type) {
35058Sphk	    case VREG:
69286Sjake		nfsstats.biocache_reads++;
69286Sjake		lbn = uio->uio_offset / biosize;
36119Sphk		on = uio->uio_offset & (biosize - 1);
35044Sphk
111034Stjr		/*
1541Srgrimes		 * Start the read ahead(s), as required.
111034Stjr		 */
111034Stjr		if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
111034Stjr		    for (nra = 0; nra < nmp->nm_readahead && nra < seqcount &&
111034Stjr			(off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
111034Stjr			rabn = lbn + 1 + nra;
111034Stjr			if (!incore(vp, rabn)) {
111034Stjr			    rabp = nfs_getcacheblk(vp, rabn, biosize, td);
82746Sdillon			    if (!rabp)
111034Stjr				return (EINTR);
111034Stjr			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
1541Srgrimes				rabp->b_flags |= B_ASYNC;
111034Stjr				rabp->b_iocmd = BIO_READ;
82746Sdillon				vfs_busy_pages(rabp, 0);
111034Stjr				if (nfs_asyncio(rabp, cred, td)) {
111034Stjr				    rabp->b_flags |= B_INVAL;
111034Stjr				    rabp->b_ioflags |= BIO_ERROR;
1541Srgrimes				    vfs_unbusy_pages(rabp);
1541Srgrimes				    brelse(rabp);
1541Srgrimes				    break;
1541Srgrimes				}
1541Srgrimes			    } else {
1541Srgrimes				brelse(rabp);
1541Srgrimes			    }
1541Srgrimes			}
1541Srgrimes		    }
36127Sbde		}
9327Sbde
9327Sbde		/*
9327Sbde		 * Obtain the buffer cache block.  Figure out the buffer size
1541Srgrimes		 * when we are at EOF.  If we are modifying the size of the
1541Srgrimes		 * buffer based on an EOF condition we need to hold
102074Sphk		 * nfs_rslock() through obtaining the buffer to prevent
1541Srgrimes		 * a potential writer-appender from messing with n_size.
102074Sphk		 * Otherwise we may accidently truncate the buffer and
35044Sphk		 * lose dirty data.
1541Srgrimes		 *
1541Srgrimes		 * Note that bcount is *not* DEV_BSIZE aligned.
73916Sjhb		 */
1541Srgrimes
35058Sphkagain:
35058Sphk		bcount = biosize;
73916Sjhb		if ((off_t)lbn * biosize >= np->n_size) {
1541Srgrimes			bcount = 0;
1541Srgrimes		} else if ((off_t)(lbn + 1) * biosize > np->n_size) {
1541Srgrimes			bcount = np->n_size - (off_t)lbn * biosize;
1541Srgrimes		}
1541Srgrimes		if (bcount != biosize) {
36119Sphk			switch(nfs_rslock(np, td)) {
35058Sphk			case ENOLCK:
35044Sphk				goto again;
35044Sphk				/* not reached */
69286Sjake			case EINTR:
69286Sjake			case ERESTART:
73916Sjhb				return(EINTR);
1541Srgrimes				/* not reached */
1541Srgrimes			default:
1541Srgrimes				break;
73916Sjhb			}
1541Srgrimes		}
1541Srgrimes
1541Srgrimes		bp = nfs_getcacheblk(vp, lbn, bcount, td);
1541Srgrimes
1541Srgrimes		if (bcount != biosize)
1541Srgrimes			nfs_rsunlock(np, td);
1541Srgrimes		if (!bp)
1541Srgrimes			return (EINTR);
1549Srgrimes
102074Sphk		/*
1541Srgrimes		 * If B_CACHE is not set, we must issue the read.  If this
1541Srgrimes		 * fails, we return an error.
1541Srgrimes		 */
1541Srgrimes
1541Srgrimes		if ((bp->b_flags & B_CACHE) == 0) {
1541Srgrimes		    bp->b_iocmd = BIO_READ;
1541Srgrimes		    vfs_busy_pages(bp, 0);
1541Srgrimes		    error = nfs_doio(bp, cred, td);
1541Srgrimes		    if (error) {
1541Srgrimes			brelse(bp);
1541Srgrimes			return (error);
1541Srgrimes		    }
1541Srgrimes		}
1541Srgrimes
1541Srgrimes		/*
1541Srgrimes		 * on is the offset into the current bp.  Figure out how many
1541Srgrimes		 * bytes we can copy out of the bp.  Note that bcount is
1541Srgrimes		 * NOT DEV_BSIZE aligned.
1541Srgrimes		 *
1541Srgrimes		 * Then figure out how many bytes we can copy into the uio.
1549Srgrimes		 */
102074Sphk
1541Srgrimes		n = 0;
1541Srgrimes		if (on < bcount)
1541Srgrimes			n = min((unsigned)(bcount - on), uio->uio_resid);
1541Srgrimes		break;
1541Srgrimes	    case VLNK:
1541Srgrimes		nfsstats.biocache_readlinks++;
1541Srgrimes		bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, td);
1541Srgrimes		if (!bp)
1541Srgrimes			return (EINTR);
1541Srgrimes		if ((bp->b_flags & B_CACHE) == 0) {
1541Srgrimes		    bp->b_iocmd = BIO_READ;
1541Srgrimes		    vfs_busy_pages(bp, 0);
1541Srgrimes		    error = nfs_doio(bp, cred, td);
35058Sphk		    if (error) {
1541Srgrimes			bp->b_ioflags |= BIO_ERROR;
1541Srgrimes			brelse(bp);
1541Srgrimes			return (error);
35058Sphk		    }
1541Srgrimes		}
1541Srgrimes		n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
1541Srgrimes		on = 0;
1541Srgrimes		break;
1541Srgrimes	    case VDIR:
1541Srgrimes		nfsstats.biocache_readdirs++;
1541Srgrimes		if (np->n_direofoffset
1541Srgrimes		    && uio->uio_offset >= np->n_direofoffset) {
1541Srgrimes		    return (0);
1541Srgrimes		}
1541Srgrimes		lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ;
1541Srgrimes		on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
1541Srgrimes		bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, td);
1541Srgrimes		if (!bp)
1541Srgrimes		    return (EINTR);
1541Srgrimes		if ((bp->b_flags & B_CACHE) == 0) {
1541Srgrimes		    bp->b_iocmd = BIO_READ;
1541Srgrimes		    vfs_busy_pages(bp, 0);
1549Srgrimes		    error = nfs_doio(bp, cred, td);
102074Sphk		    if (error) {
1541Srgrimes			    brelse(bp);
1541Srgrimes		    }
1541Srgrimes		    while (error == NFSERR_BAD_COOKIE) {
1541Srgrimes			printf("got bad cookie vp %p bp %p\n", vp, bp);
1541Srgrimes			nfs_invaldir(vp);
1541Srgrimes			error = nfs_vinvalbuf(vp, 0, cred, td, 1);
1541Srgrimes			/*
1549Srgrimes			 * Yuck! The directory has been modified on the
102074Sphk			 * server. The only way to get the block is by
1541Srgrimes			 * reading from the beginning to get all the
1541Srgrimes			 * offset cookies.
1541Srgrimes			 *
1541Srgrimes			 * Leave the last bp intact unless there is an error.
1541Srgrimes			 * Loop back up to the while if the error is another
1541Srgrimes			 * NFSERR_BAD_COOKIE (double yuch!).
1541Srgrimes			 */
12819Sphk			for (i = 0; i <= lbn && !error; i++) {
102074Sphk			    if (np->n_direofoffset
1541Srgrimes				&& (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
1541Srgrimes				    return (0);
1541Srgrimes			    bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, td);
1541Srgrimes			    if (!bp)
1541Srgrimes				return (EINTR);
1541Srgrimes			    if ((bp->b_flags & B_CACHE) == 0) {
1541Srgrimes				    bp->b_iocmd = BIO_READ;
1541Srgrimes				    vfs_busy_pages(bp, 0);
1541Srgrimes				    error = nfs_doio(bp, cred, td);
1541Srgrimes				    /*
1541Srgrimes				     * no error + B_INVAL == directory EOF,
108142Ssam				     * use the block.
108142Ssam				     */
108511Ssam				    if (error == 0 && (bp->b_flags & B_INVAL))
108142Ssam					    break;
108142Ssam			    }
108142Ssam			    /*
108142Ssam			     * An error will throw away the block and the
108142Ssam			     * for loop will break out.  If no error and this
108142Ssam			     * is not the block we want, we throw away the
108142Ssam			     * block and go for the next one via the for loop.
108511Ssam			     */
108511Ssam			    if (error || i < lbn)
108511Ssam				    brelse(bp);
108142Ssam			}
108142Ssam		    }
108142Ssam		    /*
108142Ssam		     * The above while is repeated if we hit another cookie
108142Ssam		     * error.  If we hit an error and it wasn't a cookie error,
108142Ssam		     * we give up.
108142Ssam		     */
108142Ssam		    if (error)
108142Ssam			    return (error);
108142Ssam		}
108142Ssam
108142Ssam		/*
108142Ssam		 * If not eof and read aheads are enabled, start one.
108142Ssam		 * (You need the current block first, so that you have the
108142Ssam		 *  directory offset cookie of the next block.)
108142Ssam		 */
108511Ssam		if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
108511Ssam		    (bp->b_flags & B_INVAL) == 0 &&
108511Ssam		    (np->n_direofoffset == 0 ||
108511Ssam		    (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
108511Ssam		    !(np->n_flag & NQNFSNONCACHE) &&
108511Ssam		    !incore(vp, lbn + 1)) {
108511Ssam			rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, td);
108142Ssam			if (rabp) {
108142Ssam			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
108142Ssam				rabp->b_flags |= B_ASYNC;
108142Ssam				rabp->b_iocmd = BIO_READ;
108511Ssam				vfs_busy_pages(rabp, 0);
108142Ssam				if (nfs_asyncio(rabp, cred, td)) {
108142Ssam				    rabp->b_flags |= B_INVAL;
108511Ssam				    rabp->b_ioflags |= BIO_ERROR;
108511Ssam				    vfs_unbusy_pages(rabp);
108511Ssam				    brelse(rabp);
108142Ssam				}
108511Ssam			    } else {
108511Ssam				brelse(rabp);
108511Ssam			    }
108511Ssam			}
108511Ssam		}
108511Ssam		/*
108511Ssam		 * Unlike VREG files, whos buffer size ( bp->b_bcount ) is
108511Ssam		 * chopped for the EOF condition, we cannot tell how large
108511Ssam		 * NFS directories are going to be until we hit EOF.  So
108142Ssam		 * an NFS directory buffer is *not* chopped to its EOF.  Now,
		 * it just so happens that b_resid will effectively chop it
		 * to EOF.  *BUT* this information is lost if the buffer goes
		 * away and is reconstituted into a B_CACHE state ( due to
		 * being VMIO ) later.  So we keep track of the directory eof
		 * in np->n_direofoffset and chop it off as an extra step
		 * right here.
		 */
		n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
		if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset)
			n = np->n_direofoffset - uio->uio_offset;
		break;
	    default:
		printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
		break;
	    };

	    if (n > 0) {
		    error = uiomove(bp->b_data + on, (int)n, uio);
	    }
	    switch (vp->v_type) {
	    case VREG:
		break;
	    case VLNK:
		n = 0;
		break;
	    case VDIR:
		/*
		 * Invalidate buffer if caching is disabled, forcing a
		 * re-read from the remote later.
		 */
		if (np->n_flag & NQNFSNONCACHE)
			bp->b_flags |= B_INVAL;
		break;
	    default:
		printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
	    }
	    brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}

/*
 * Vnode op for write using bio
 *
 * Copies the data described by ap->a_uio into NFS cache blocks for the
 * regular file ap->a_vp, one block at a time, growing np->n_size as
 * required.  Each block is then either written synchronously (IO_SYNC
 * or a non-cachable lease), started asynchronously when the write
 * reaches the end of the block on a non-NQNFS mount, or left as a
 * delayed write.
 *
 * Returns 0 on success or an errno value.  A write error recorded
 * earlier on the nfsnode (NWRITEERR) is reported, and cleared, before
 * any new data is accepted.
 *
 * The rslock is taken when appending or extending the file.  Once it is
 * held, every exit has to release it; failures inside the main loop
 * therefore break out to the common nfs_rsunlock() at the bottom
 * instead of returning directly.
 */
int
nfs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int  a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	int biosize;
	struct uio *uio = ap->a_uio;
	struct thread *td = uio->uio_td;
	struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn;
	int bcount;
	/*
	 * must_commit starts at 0 (as in nfs_doio()) so that the test
	 * after nfs_writerpc() never reads an indeterminate value.
	 */
	int n, on, error = 0, iomode, must_commit = 0;
	int haverslock = 0;
	struct proc *p = td?td->td_proc:NULL;

	GIANT_REQUIRED;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_td != curthread)
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
		(void)nfs_fsinfo(nmp, vp, cred, td);

	/*
	 * Synchronously flush pending buffers if we are in synchronous
	 * mode or if we are appending.
	 */
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1);
			if (error)
				return (error);
		}
	}

	/*
	 * If IO_APPEND then load uio_offset.  We restart here if we cannot
	 * get the append lock.
	 */
restart:
	if (ioflag & IO_APPEND) {
		np->n_attrstamp = 0;
		error = VOP_GETATTR(vp, &vattr, cred, td);
		if (error)
			return (error);
		uio->uio_offset = np->n_size;
	}

	if (uio->uio_offset < 0)
		return (EINVAL);
	if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	if (uio->uio_resid == 0)
		return (0);

	/*
	 * We need to obtain the rslock if we intend to modify np->n_size
	 * in order to guarantee the append point with multiple contending
	 * writers, to guarantee that no other appenders modify n_size
	 * while we are trying to obtain a truncated buffer (i.e. to avoid
	 * accidentally truncating data written by another appender due to
	 * the race), and to ensure that the buffer is populated prior to
	 * our extending of the file.  We hold rslock through the entire
	 * operation.
	 *
	 * Note that we do not synchronize the case where someone truncates
	 * the file while we are appending to it because attempting to lock
	 * this case may deadlock other parts of the system unexpectedly.
	 */
	if ((ioflag & IO_APPEND) ||
	    uio->uio_offset + uio->uio_resid > np->n_size) {
		switch(nfs_rslock(np, td)) {
		case ENOLCK:
			goto restart;
			/* not reached */
		case EINTR:
		case ERESTART:
			return(EINTR);
			/* not reached */
		default:
			break;
		}
		haverslock = 1;
	}

	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, i don't think it matters
	 */
	if (p && uio->uio_offset + uio->uio_resid >
	      p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		PROC_LOCK(p);
		psignal(p, SIGXFSZ);
		PROC_UNLOCK(p);
		if (haverslock)
			nfs_rsunlock(np, td);
		return (EFBIG);
	}

	biosize = vp->v_mount->mnt_stat.f_iosize;

	do {
		/*
		 * Check for a valid write lease.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, td);
			} while (error == NQNFS_EXPIRED);
			if (error)
				break;
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1);
				if (error)
					break;
				np->n_brev = np->n_lrev;
			}
		}

		/*
		 * Non-cachable lease with a single iovec: bypass the buffer
		 * cache and hand the whole request to one FILESYNC write rpc.
		 */
		if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
			iomode = NFSV3WRITE_FILESYNC;
			error = nfs_writerpc(vp, uio, cred, &iomode,
			    &must_commit);
			if (must_commit)
				nfs_clearcommit(vp->v_mount);
			break;
		}
		nfsstats.biocache_writes++;

		/*
		 * lbn is the logical block, on the offset within it, and n
		 * the number of bytes of this request that fit in the block.
		 */
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize-1);
		n = min((unsigned)(biosize - on), uio->uio_resid);
again:
		/*
		 * Handle direct append and file extension cases, calculate
		 * unaligned buffer size.
		 */

		if (uio->uio_offset == np->n_size && n) {
			/*
			 * Get the buffer (in its pre-append state to maintain
			 * B_CACHE if it was previously set).  Resize the
			 * nfsnode after we have locked the buffer to prevent
			 * readers from reading garbage.
			 */
			bcount = on;
			bp = nfs_getcacheblk(vp, lbn, bcount, td);

			if (bp != NULL) {
				long save;

				np->n_size = uio->uio_offset + n;
				np->n_flag |= NMODIFIED;
				vnode_pager_setsize(vp, np->n_size);

				/*
				 * Carry the pre-append B_CACHE state across
				 * the resize of the buffer.
				 */
				save = bp->b_flags & B_CACHE;
				bcount += n;
				allocbuf(bp, bcount);
				bp->b_flags |= save;
				bp->b_magic = B_MAGIC_NFS;
				bp->b_op = &buf_ops_nfs;
			}
		} else {
			/*
			 * Obtain the locked cache block first, and then
			 * adjust the file's size as appropriate.
			 */
			bcount = on + n;
			if ((off_t)lbn * biosize + bcount < np->n_size) {
				if ((off_t)(lbn + 1) * biosize < np->n_size)
					bcount = biosize;
				else
					bcount = np->n_size - (off_t)lbn * biosize;
			}

			bp = nfs_getcacheblk(vp, lbn, bcount, td);

			if (uio->uio_offset + n > np->n_size) {
				np->n_size = uio->uio_offset + n;
				np->n_flag |= NMODIFIED;
				vnode_pager_setsize(vp, np->n_size);
			}
		}

		/*
		 * nfs_getcacheblk() returns NULL when it was interrupted by
		 * a signal.
		 */
		if (!bp) {
			error = EINTR;
			break;
		}

		/*
		 * Issue a READ if B_CACHE is not set.  In special-append
		 * mode, B_CACHE is based on the buffer prior to the write
		 * op and is typically set, avoiding the read.  If a read
		 * is required in special append mode, the server will
		 * probably send us a short-read since we extended the file
		 * on our end, resulting in b_resid == 0 and, thusly,
		 * B_CACHE getting set.
		 *
		 * We can also avoid issuing the read if the write covers
		 * the entire buffer.  We have to make sure the buffer state
		 * is reasonable in this case since we will not be initiating
		 * I/O.  See the comments in kern/vfs_bio.c's getblk() for
		 * more information.
		 *
		 * B_CACHE may also be set due to the buffer being cached
		 * normally.
		 */

		if (on == 0 && n == bcount) {
			bp->b_flags |= B_CACHE;
			bp->b_flags &= ~B_INVAL;
			bp->b_ioflags &= ~BIO_ERROR;
		}

		if ((bp->b_flags & B_CACHE) == 0) {
			bp->b_iocmd = BIO_READ;
			vfs_busy_pages(bp, 0);
			error = nfs_doio(bp, cred, td);
			if (error) {
				brelse(bp);
				break;
			}
		}
		if (bp->b_wcred == NOCRED) {
			crhold(cred);
			bp->b_wcred = cred;
		}
		np->n_flag |= NMODIFIED;

		/*
		 * If dirtyend exceeds file size, chop it down.  This should
		 * not normally occur but there is an append race where it
		 * might occur XXX, so we log it.
		 *
		 * If the chopping creates a reverse-indexed or degenerate
		 * situation with dirtyoff/end, we 0 both of them.
		 */

		if (bp->b_dirtyend > bcount) {
			printf("NFS append race @%lx:%d\n",
			    (long)bp->b_blkno * DEV_BSIZE,
			    bp->b_dirtyend - bcount);
			bp->b_dirtyend = bcount;
		}

		if (bp->b_dirtyoff >= bp->b_dirtyend)
			bp->b_dirtyoff = bp->b_dirtyend = 0;

		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise force a write rpc of the old dirty area.
		 *
		 * While it is possible to merge discontiguous writes due to
		 * our having a B_CACHE buffer ( and thus valid read data
		 * for the hole), we don't because it could lead to
		 * significant cache coherency problems with multiple clients,
		 * especially if locking is implemented later on.
		 *
		 * as an optimization we could theoretically maintain
		 * a linked list of discontinuous areas, but we would still
		 * have to commit them separately so there isn't much
		 * advantage to it except perhaps a bit of asynchronization.
		 */

		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			/*
			 * Leave through the bottom of the function on
			 * interruption so that a held rslock is released;
			 * returning from here would leak it.
			 */
			if (BUF_WRITE(bp) == EINTR) {
				error = EINTR;
				break;
			}
			goto again;
		}

		/*
		 * Check for valid write lease and get one as required.
		 * In case getblk() and/or bwrite() delayed us.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, td);
			} while (error == NQNFS_EXPIRED);
			if (error) {
				brelse(bp);
				break;
			}
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				brelse(bp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1);
				if (error)
					break;
				np->n_brev = np->n_lrev;
				goto again;
			}
		}

		error = uiomove((char *)bp->b_data + on, n, uio);

		/*
		 * Since this block is being modified, it must be written
		 * again and not just committed.  Since write clustering does
		 * not work for the stage 1 data write, only the stage 2
		 * commit rpc, we have to clear B_CLUSTEROK as well.
		 */
		bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);

		if (error) {
			bp->b_ioflags |= BIO_ERROR;
			brelse(bp);
			break;
		}

		/*
		 * Only update dirtyoff/dirtyend if not a degenerate
		 * condition.
		 */
		if (n) {
			if (bp->b_dirtyend > 0) {
				bp->b_dirtyoff = min(on, bp->b_dirtyoff);
				bp->b_dirtyend = max((on + n), bp->b_dirtyend);
			} else {
				bp->b_dirtyoff = on;
				bp->b_dirtyend = on + n;
			}
			vfs_bio_set_validclean(bp, on, n);
		}

		/*
		 * If the lease is non-cachable or IO_SYNC do bwrite().
		 *
		 * IO_INVAL appears to be unused.  The idea appears to be
		 * to turn off caching in this case.  Very odd.  XXX
		 */
		if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
			if (ioflag & IO_INVAL)
				bp->b_flags |= B_NOCACHE;
			error = BUF_WRITE(bp);
			if (error)
				break;
			if (np->n_flag & NQNFSNONCACHE) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1);
				if (error)
					break;
			}
		} else if ((n + on) == biosize &&
			(nmp->nm_flag & NFSMNT_NQNFS) == 0) {
			/*
			 * The write reached the end of the block: start an
			 * asynchronous write of it now.
			 */
			bp->b_flags |= B_ASYNC;
			(void)nfs_writebp(bp, 0, 0);
		} else {
			bdwrite(bp);
		}
	} while (uio->uio_resid > 0 && n > 0);

	if (haverslock)
		nfs_rsunlock(np, td);

	return (error);
}

/*
 * Get an nfs cache block.
 *
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 *
 * The caller must carefully deal with the possible B_INVAL state of
 * the buffer.  nfs_doio() clears B_INVAL (and nfs_asyncio() clears it
 * indirectly), so synchronous reads can be issued without worrying about
 * the B_INVAL state.  We have to be a little more careful when dealing
 * with writes (see comments in nfs_write()) when extending a file past
 * its EOF.
 */
static struct buf *
nfs_getcacheblk(vp, bn, size, td)
	struct vnode *vp;
	daddr_t bn;
	int size;
	struct thread *td;
{
	register struct buf *bp;
	struct mount *mp;
	struct nfsmount *nmp;

	mp = vp->v_mount;
	nmp = VFSTONFS(mp);

	if (nmp->nm_flag & NFSMNT_INT) {
		bp = getblk(vp, bn, size, PCATCH, 0);
		while (bp == (struct buf *)0) {
			if (nfs_sigintr(nmp, (struct nfsreq *)0, td->td_proc))
				return ((struct buf *)0);
			bp = getblk(vp, bn, size, 0, 2 * hz);
		}
	} else {
		bp = getblk(vp, bn, size, 0, 0);
	}

	if (vp->v_type == VREG) {
		int biosize;

		biosize = mp->mnt_stat.f_iosize;
		bp->b_blkno = bn * (biosize / DEV_BSIZE);
	}
	return (bp);
}

/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(vp, flags, cred, td, intrflg)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct thread *td;
	int intrflg;
{
	register struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;

	if (vp->v_flag & VXLOCK) {
		return (0);
	}

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}
	/*
	 * First wait for any other process doing a flush to complete.
	 */
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
			slptimeo);
		if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, td->td_proc))
			return (EINTR);
	}

	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, cred, td, slpflag, 0);
	while (error) {
		if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, td->td_proc)) {
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup((caddr_t)&np->n_flag);
			}
			return (EINTR);
		}
		error = vinvalbuf(vp, flags, cred, td, 0, slptimeo);
	}
	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
	return (0);
}

/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 *
 * Note: nfs_asyncio() does not clear (BIO_ERROR|B_INVAL) but when the bp
 * is eventually dequeued by the async daemon, nfs_doio() *will*.
 */
int
nfs_asyncio(bp, cred, td)
	register struct buf *bp;
	struct ucred *cred;
	struct thread *td;
{
	struct nfsmount *nmp;
	int i;
	int gotiod;
	int slpflag = 0;
	int slptimeo = 0;
	int error;

	/*
	 * If no async daemons then return EIO to force caller to run the rpc
	 * synchronously.
	 */
	if (nfs_numasync == 0)
		return (EIO);

	nmp = VFSTONFS(bp->b_vp->v_mount);

	/*
	 * Commits are usually short and sweet so lets save some cpu and
	 * leave the async daemons for more important rpc's (such as reads
	 * and writes).
	 */
	if (bp->b_iocmd == BIO_WRITE && (bp->b_flags & B_NEEDCOMMIT) &&
	    (nmp->nm_bufqiods > nfs_numasync / 2)) {
		return(EIO);
	}

again:
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	gotiod = FALSE;

	/*
	 * Find a free iod to process this request.
	 */
	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
		if (nfs_iodwant[i]) {
			/*
			 * Found one, so wake it up and tell it which
			 * mount to process.
			 */
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waking iod %d for mount %p\n",
				 i, nmp));
			nfs_iodwant[i] = (struct proc *)0;
			nfs_iodmount[i] = nmp;
			nmp->nm_bufqiods++;
			wakeup((caddr_t)&nfs_iodwant[i]);
			gotiod = TRUE;
			break;
		}

	/*
	 * If none are free, we may already have an iod working on this mount
	 * point.  If so, it will process our request.
	 */
	if (!gotiod) {
		if (nmp->nm_bufqiods > 0) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: %d iods are already processing mount %p\n",
				 nmp->nm_bufqiods, nmp));
			gotiod = TRUE;
		}
	}

	/*
	 * If we have an iod which can process the request, then queue
	 * the buffer.
	 */
	if (gotiod) {
		/*
		 * Ensure that the queue never grows too large.  We still want
		 * to asynchronize so we block rather then return EIO.
		 */
		while (nmp->nm_bufqlen >= 2*nfs_numasync) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
			nmp->nm_bufqwant = TRUE;
			error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
				       "nfsaio", slptimeo);
			if (error) {
				if (nfs_sigintr(nmp, NULL, td ? td->td_proc : NULL))
					return (EINTR);
				if (slpflag == PCATCH) {
					slpflag = 0;
					slptimeo = 2 * hz;
				}
			}
			/*
			 * We might have lost our iod while sleeping,
			 * so check and loop if nescessary.
			 */
			if (nmp->nm_bufqiods == 0) {
				NFS_DPF(ASYNCIO,
					("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
				goto again;
			}
		}

		if (bp->b_iocmd == BIO_READ) {
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
		} else {
			bp->b_flags |= B_WRITEINPROG;
			if (bp->b_wcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_wcred = cred;
			}
		}

		BUF_KERNPROC(bp);
		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
		nmp->nm_bufqlen++;
		return (0);
	}

	/*
	 * All the iods are busy on other mounts, so return EIO to
	 * force the caller to process the i/o synchronously.
	 */
	NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
	return (EIO);
}

/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 *
 * bp - buffer to read into or write from; b_iocmd selects the direction
 * cr - credentials passed to the read/write rpc routines
 * td - calling thread, may be NULL
 *
 * Every path completes the buffer with bufdone() before returning.
 * Returns 0 or the error from the rpc; except for an interrupted
 * write, a non-zero error is also recorded in bp->b_error with
 * BIO_ERROR set.
 */
int
nfs_doio(bp, cr, td)
	struct buf *bp;
	struct ucred *cr;
	struct thread *td;
{
	struct uio *uiop;
	struct vnode *vp;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, iomode, must_commit = 0;
	struct uio uio;
	struct iovec io;
	struct proc *p = td?td->td_proc:NULL;

	/*
	 * Describe the transfer with a local, single-iovec, kernel-space
	 * uio; offset, length and direction are filled in per case below.
	 */
	vp = bp->b_vp;
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_td = td;

	/*
	 * clear BIO_ERROR and B_INVAL state prior to initiating the I/O.  We
	 * do this here so we do not have to do it in all the code that
	 * calls us.
	 */
	bp->b_flags &= ~B_INVAL;
	bp->b_ioflags &= ~BIO_ERROR;

	KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp));

	/*
	 * Historically, paging was done with physio, but no more.
	 */
	if (bp->b_flags & B_PHYS) {
	    /*
	     * ...though reading /dev/drum still gets us here.
	     */
	    /* Transfer the whole buffer at byte offset b_blkno * DEV_BSIZE. */
	    io.iov_len = uiop->uio_resid = bp->b_bcount;
	    /* mapping was done by vmapbuf() */
	    io.iov_base = bp->b_data;
	    uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
	    if (bp->b_iocmd == BIO_READ) {
		uiop->uio_rw = UIO_READ;
		nfsstats.read_physios++;
		error = nfs_readrpc(vp, uiop, cr);
	    } else {
		/* Receives the must-commit result, which is not used here. */
		int com;

		iomode = NFSV3WRITE_DATASYNC;
		uiop->uio_rw = UIO_WRITE;
		nfsstats.write_physios++;
		error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
	    }
	    if (error) {
		bp->b_ioflags |= BIO_ERROR;
		bp->b_error = error;
	    }
	} else if (bp->b_iocmd == BIO_READ) {
	    /* Cache-block read: the rpc used depends on the vnode type. */
	    io.iov_len = uiop->uio_resid = bp->b_bcount;
	    io.iov_base = bp->b_data;
	    uiop->uio_rw = UIO_READ;
	    switch (vp->v_type) {
	    case VREG:
		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
		nfsstats.read_bios++;
		error = nfs_readrpc(vp, uiop, cr);
		if (!error) {
		    if (uiop->uio_resid) {
			/*
			 * If we had a short read with no error, we must have
			 * hit a file hole.  We should zero-fill the remainder.
			 * This can also occur if the server hits the file EOF.
			 *
			 * Holes used to be able to occur due to pending
			 * writes, but that is not possible any longer.
			 */
			int nread = bp->b_bcount - uiop->uio_resid;
			int left  = bp->b_bcount - nread;

			if (left > 0)
				bzero((char *)bp->b_data + nread, left);
			/* The buffer is now fully populated. */
			uiop->uio_resid = 0;
		    }
		}
		/*
		 * If the vnode is in use as program text and the lease
		 * revision (NQNFS) or the modification time (otherwise)
		 * no longer matches what we have recorded, send the calling
		 * process SIGKILL.
		 */
		if (p && (vp->v_flag & VTEXT) &&
			(((nmp->nm_flag & NFSMNT_NQNFS) &&
			  NQNFS_CKINVALID(vp, np, ND_READ) &&
			  np->n_lrev != np->n_brev) ||
			 (!(nmp->nm_flag & NFSMNT_NQNFS) &&
			  np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
			uprintf("Process killed due to text file modification\n");
			PROC_LOCK(p);
			psignal(p, SIGKILL);
			_PHOLD(p);
			PROC_UNLOCK(p);
		}
		break;
	    case VLNK:
		/* A symlink is always read from offset 0. */
		uiop->uio_offset = (off_t)0;
		nfsstats.readlink_bios++;
		error = nfs_readlinkrpc(vp, uiop, cr);
		break;
	    case VDIR:
		nfsstats.readdir_bios++;
		/* Directory blocks are addressed by b_lblkno, not b_blkno. */
		uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
		/*
		 * Try readdirplus first when enabled.  If the server
		 * reports it as unsupported, turn it off for the mount and
		 * fall through to the plain readdir below.
		 */
		if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
			error = nfs_readdirplusrpc(vp, uiop, cr);
			if (error == NFSERR_NOTSUPP)
				nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
		}
		if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
			error = nfs_readdirrpc(vp, uiop, cr);
		/*
		 * end-of-directory sets B_INVAL but does not generate an
		 * error.
		 */
		if (error == 0 && uiop->uio_resid == bp->b_bcount)
			bp->b_flags |= B_INVAL;
		break;
	    default:
		printf("nfs_doio:  type %x unexpected\n",vp->v_type);
		break;
	    };
	    if (error) {
		bp->b_ioflags |= BIO_ERROR;
		bp->b_error = error;
	    }
	} else {
	    /*
	     * If we only need to commit, try to commit
	     */
	    if (bp->b_flags & B_NEEDCOMMIT) {
		    int retv;
		    off_t off;

		    /* Commit exactly the dirty byte range of this buffer. */
		    off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff;
		    bp->b_flags |= B_WRITEINPROG;
		    retv = nfs_commit(
				bp->b_vp, off, bp->b_dirtyend-bp->b_dirtyoff,
				bp->b_wcred, td);
		    bp->b_flags &= ~B_WRITEINPROG;
		    if (retv == 0) {
			    /* Committed: the buffer is clean, we are done. */
			    bp->b_dirtyoff = bp->b_dirtyend = 0;
			    bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
			    bp->b_resid = 0;
			    bufdone(bp);
			    return (0);
		    }
		    /*
		     * Any commit failure falls through to the write below.
		     * For a stale write verifier, nfs_clearcommit() is
		     * called on the whole mount first.
		     */
		    if (retv == NFSERR_STALEWRITEVERF) {
			    nfs_clearcommit(bp->b_vp->v_mount);
		    }
	    }

	    /*
	     * Setup for actual write
	     */

	    /* Never write past the current file size. */
	    if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
		bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE;

	    if (bp->b_dirtyend > bp->b_dirtyoff) {
		/* Only the dirty range [b_dirtyoff, b_dirtyend) is sent. */
		io.iov_len = uiop->uio_resid = bp->b_dirtyend
		    - bp->b_dirtyoff;
		uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE
		    + bp->b_dirtyoff;
		io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
		uiop->uio_rw = UIO_WRITE;
		nfsstats.write_bios++;

		/*
		 * Use an unstable write only for a B_ASYNC buffer that has
		 * none of B_NEEDCOMMIT, B_NOCACHE or B_CLUSTER set;
		 * everything else is written FILESYNC.
		 */
		if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC)
		    iomode = NFSV3WRITE_UNSTABLE;
		else
		    iomode = NFSV3WRITE_FILESYNC;

		bp->b_flags |= B_WRITEINPROG;
		error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);

		/*
		 * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try
		 * to cluster the buffers needing commit.  This will allow
		 * the system to submit a single commit rpc for the whole
		 * cluster.  We can do this even if the buffer is not 100%
		 * dirty (relative to the NFS blocksize), so we optimize the
		 * append-to-file-case.
		 *
		 * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be
		 * cleared because write clustering only works for commit
		 * rpc's, not for the data portion of the write).
		 */

		if (!error && iomode == NFSV3WRITE_UNSTABLE) {
		    bp->b_flags |= B_NEEDCOMMIT;
		    if (bp->b_dirtyoff == 0
			&& bp->b_dirtyend == bp->b_bcount)
			bp->b_flags |= B_CLUSTEROK;
		} else {
		    bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
		}
		bp->b_flags &= ~B_WRITEINPROG;

		/*
		 * For an interrupted write, the buffer is still valid
		 * and the write hasn't been pushed to the server yet,
		 * so we can't set BIO_ERROR and report the interruption
		 * by setting B_EINTR. For the B_ASYNC case, B_EINTR
		 * is not relevant, so the rpc attempt is essentially
		 * a noop.  For the case of a V3 write rpc not being
		 * committed to stable storage, the block is still
		 * dirty and requires either a commit rpc or another
		 * write rpc with iomode == NFSV3WRITE_FILESYNC before
		 * the block is reused. This is indicated by setting
		 * the B_DELWRI and B_NEEDCOMMIT flags.
		 *
		 * If the buffer is marked B_PAGING, it does not reside on
		 * the vp's paging queues so we cannot call bdirty().  The
		 * bp in this case is not an NFS cache block so we should
		 * be safe. XXX
		 */
    		if (error == EINTR
		    || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
			int s;

			s = splbio();
			bp->b_flags &= ~(B_INVAL|B_NOCACHE);
			if ((bp->b_flags & B_PAGING) == 0) {
			    bdirty(bp);
			    bp->b_flags &= ~B_DONE;
			}
			if (error && (bp->b_flags & B_ASYNC) == 0)
			    bp->b_flags |= B_EINTR;
			splx(s);
	    	} else {
		    /*
		     * The write either completed or failed for good.  A
		     * failure is also recorded on the nfsnode (n_error and
		     * NWRITEERR).  Either way the dirty range is dropped.
		     */
		    if (error) {
			bp->b_ioflags |= BIO_ERROR;
			bp->b_error = np->n_error = error;
			np->n_flag |= NWRITEERR;
		    }
		    bp->b_dirtyoff = bp->b_dirtyend = 0;
		}
	    } else {
		/* Nothing dirty to send: complete the buffer as is. */
		bp->b_resid = 0;
		bufdone(bp);
		return (0);
	    }
	}
	/* Report whatever the rpc left untransferred and finish the I/O. */
	bp->b_resid = uiop->uio_resid;
	if (must_commit)
	    nfs_clearcommit(vp->v_mount);
	bufdone(bp);
	return (error);
}