nfs_bio.c revision 58345
11590Srgrimes/* 294589Sobrien * Copyright (c) 1989, 1993 394589Sobrien * The Regents of the University of California. All rights reserved. 45814Sjkh * 51590Srgrimes * This code is derived from software contributed to Berkeley by 61590Srgrimes * Rick Macklem at The University of Guelph. 71590Srgrimes * 81590Srgrimes * Redistribution and use in source and binary forms, with or without 91590Srgrimes * modification, are permitted provided that the following conditions 101590Srgrimes * are met: 111590Srgrimes * 1. Redistributions of source code must retain the above copyright 121590Srgrimes * notice, this list of conditions and the following disclaimer. 131590Srgrimes * 2. Redistributions in binary form must reproduce the above copyright 141590Srgrimes * notice, this list of conditions and the following disclaimer in the 151590Srgrimes * documentation and/or other materials provided with the distribution. 161590Srgrimes * 3. All advertising materials mentioning features or use of this software 171590Srgrimes * must display the following acknowledgement: 181590Srgrimes * This product includes software developed by the University of 191590Srgrimes * California, Berkeley and its contributors. 201590Srgrimes * 4. Neither the name of the University nor the names of its contributors 211590Srgrimes * may be used to endorse or promote products derived from this software 221590Srgrimes * without specific prior written permission. 231590Srgrimes * 241590Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 251590Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 261590Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 271590Srgrimes * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 281590Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 291590Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 301590Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 311590Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 321590Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 331590Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 341590Srgrimes * SUCH DAMAGE. 351590Srgrimes * 361590Srgrimes * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95 371590Srgrimes * $FreeBSD: head/sys/nfsclient/nfs_bio.c 58345 2000-03-20 10:44:49Z phk $ 3862833Swsanchez */ 3962833Swsanchez 401590Srgrimes 411590Srgrimes#include <sys/param.h> 4262833Swsanchez#include <sys/systm.h> 4394587Sobrien#include <sys/resourcevar.h> 441590Srgrimes#include <sys/signalvar.h> 4535483Simp#include <sys/proc.h> 46103503Sjmallett#include <sys/buf.h> 4735483Simp#include <sys/vnode.h> 4835483Simp#include <sys/mount.h> 491590Srgrimes#include <sys/kernel.h> 501590Srgrimes 511590Srgrimes#include <vm/vm.h> 521590Srgrimes#include <vm/vm_extern.h> 531590Srgrimes#include <vm/vm_page.h> 541590Srgrimes#include <vm/vm_object.h> 551590Srgrimes#include <vm/vm_pager.h> 561590Srgrimes#include <vm/vnode_pager.h> 571590Srgrimes 581590Srgrimes#include <nfs/rpcv2.h> 591590Srgrimes#include <nfs/nfsproto.h> 601590Srgrimes#include <nfs/nfs.h> 611590Srgrimes#include <nfs/nfsmount.h> 621590Srgrimes#include <nfs/nqnfs.h> 631590Srgrimes#include <nfs/nfsnode.h> 641590Srgrimes 651590Srgrimesstatic struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size, 661590Srgrimes struct proc *p)); 671590Srgrimes 681590Srgrimesextern int nfs_numasync; 691590Srgrimesextern int nfs_pbuf_freecnt; 701590Srgrimesextern struct nfsstats nfsstats; 711590Srgrimes 721590Srgrimes/* 731590Srgrimes * Vnode op for VM getpages. 741590Srgrimes */ 751590Srgrimesint 761590Srgrimesnfs_getpages(ap) 771590Srgrimes struct vop_getpages_args /* { 781590Srgrimes struct vnode *a_vp; 791590Srgrimes vm_page_t *a_m; 801590Srgrimes int a_count; 811590Srgrimes int a_reqpage; 821590Srgrimes vm_ooffset_t a_offset; 831590Srgrimes } */ *ap; 841590Srgrimes{ 851590Srgrimes int i, error, nextoff, size, toff, count, npages; 8694594Sobrien struct uio uio; 871590Srgrimes struct iovec iov; 881590Srgrimes vm_offset_t kva; 891590Srgrimes struct buf *bp; 901590Srgrimes struct vnode *vp; 911590Srgrimes struct proc *p; 921590Srgrimes struct ucred *cred; 931590Srgrimes struct nfsmount *nmp; 941590Srgrimes vm_page_t *pages; 951590Srgrimes 961590Srgrimes vp = ap->a_vp; 971590Srgrimes p = curproc; /* XXX */ 981590Srgrimes cred = curproc->p_ucred; /* XXX */ 991590Srgrimes nmp = VFSTONFS(vp->v_mount); 1001590Srgrimes pages = ap->a_m; 1011590Srgrimes count = ap->a_count; 1021590Srgrimes 1031590Srgrimes if (vp->v_object == NULL) { 1041590Srgrimes printf("nfs_getpages: called with non-merged cache vnode??\n"); 1051590Srgrimes return VM_PAGER_ERROR; 1061590Srgrimes } 1071590Srgrimes 1081590Srgrimes if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && 1091590Srgrimes (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) 11094506Scharnier (void)nfs_fsinfo(nmp, vp, cred, p); 11194506Scharnier 1125814Sjkh npages = btoc(count); 1131590Srgrimes 1141590Srgrimes /* 1155814Sjkh * If the requested page is partially valid, just return it and 11680381Ssheldonh * allow the pager to zero-out the blanks. Partially valid pages 11794506Scharnier * can only occur at the file EOF. 1181590Srgrimes */ 1191590Srgrimes 1201590Srgrimes { 1211590Srgrimes vm_page_t m = pages[ap->a_reqpage]; 1221590Srgrimes 12318730Ssteve if (m->valid != 0) { 12418730Ssteve /* handled by vm_fault now */ 12518730Ssteve /* vm_page_zero_invalid(m, TRUE); */ 12618730Ssteve for (i = 0; i < npages; ++i) { 12718730Ssteve if (i != ap->a_reqpage) 12818730Ssteve vnode_pager_freepage(pages[i]); 1291590Srgrimes } 1301590Srgrimes return(0); 1318874Srgrimes } 1321590Srgrimes } 1331590Srgrimes 1341590Srgrimes /* 135103503Sjmallett * We use only the kva address for the buffer, but this is extremely 136103503Sjmallett * convienient and fast. 137103503Sjmallett */ 1381590Srgrimes bp = getpbuf(&nfs_pbuf_freecnt); 13918730Ssteve 14018730Ssteve kva = (vm_offset_t) bp->b_data; 14118730Ssteve pmap_qenter(kva, pages, npages); 14218730Ssteve 143103503Sjmallett iov.iov_base = (caddr_t) kva; 1441590Srgrimes iov.iov_len = count; 1451590Srgrimes uio.uio_iov = &iov; 1461590Srgrimes uio.uio_iovcnt = 1; 1471590Srgrimes uio.uio_offset = IDX_TO_OFF(pages[0]->pindex); 1481590Srgrimes uio.uio_resid = count; 1491590Srgrimes uio.uio_segflg = UIO_SYSSPACE; 1501590Srgrimes uio.uio_rw = UIO_READ; 1511590Srgrimes uio.uio_procp = p; 1521590Srgrimes 1531590Srgrimes error = nfs_readrpc(vp, &uio, cred); 1541590Srgrimes pmap_qremove(kva, npages); 1551590Srgrimes 1561590Srgrimes relpbuf(bp, &nfs_pbuf_freecnt); 1571590Srgrimes 1581590Srgrimes if (error && (uio.uio_resid == count)) { 159103503Sjmallett printf("nfs_getpages: error %d\n", error); 160103503Sjmallett for (i = 0; i < npages; ++i) { 161103503Sjmallett if (i != ap->a_reqpage) 162103503Sjmallett vnode_pager_freepage(pages[i]); 1631590Srgrimes } 1641590Srgrimes return VM_PAGER_ERROR; 16568898Skris } 16668898Skris 16768898Skris /* 16868898Skris * Calculate the number of bytes read and validate only that number 16968898Skris * of bytes. Note that due to pending writes, size may be 0. This 17068898Skris * does not mean that the remaining data is invalid! 17168898Skris */ 1721590Srgrimes 1731590Srgrimes size = count - uio.uio_resid; 1741590Srgrimes 1751590Srgrimes for (i = 0, toff = 0; i < npages; i++, toff = nextoff) { 1761590Srgrimes vm_page_t m; 1771590Srgrimes nextoff = toff + PAGE_SIZE; 1781590Srgrimes m = pages[i]; 1791590Srgrimes 1801590Srgrimes m->flags &= ~PG_ZERO; 1811590Srgrimes 1821590Srgrimes if (nextoff <= size) { 1831590Srgrimes /* 1841590Srgrimes * Read operation filled an entire page 1851590Srgrimes */ 1861590Srgrimes m->valid = VM_PAGE_BITS_ALL; 1871590Srgrimes vm_page_undirty(m); 1881590Srgrimes } else if (size > toff) { 1891590Srgrimes /* 1901590Srgrimes * Read operation filled a partial page. 1911590Srgrimes */ 1921590Srgrimes m->valid = 0; 19318730Ssteve vm_page_set_validclean(m, 0, size - toff); 19418730Ssteve /* handled by vm_fault now */ 1951590Srgrimes /* vm_page_zero_invalid(m, TRUE); */ 19618730Ssteve } 1971590Srgrimes 1981590Srgrimes if (i != ap->a_reqpage) { 1991590Srgrimes /* 20064739Sgreen * Whether or not to leave the page activated is up in 20164739Sgreen * the air, but we should put the page on a page queue 20264739Sgreen * somewhere (it already is in the object). Result: 20364739Sgreen * It appears that emperical results show that 20464739Sgreen * deactivating pages is best. 20564739Sgreen */ 20664739Sgreen 20764739Sgreen /* 20864739Sgreen * Just in case someone was asking for this page we 20964739Sgreen * now tell them that it is ok to use. 2101590Srgrimes */ 2111590Srgrimes if (!error) { 2121590Srgrimes if (m->flags & PG_WANTED) 21318730Ssteve vm_page_activate(m); 21418730Ssteve else 21518730Ssteve vm_page_deactivate(m); 21618730Ssteve vm_page_wakeup(m); 2171590Srgrimes } else { 2181590Srgrimes vnode_pager_freepage(m); 2191590Srgrimes } 2201590Srgrimes } 2211590Srgrimes } 2221590Srgrimes return 0; 2231590Srgrimes} 22418730Ssteve 2251590Srgrimes/* 2261590Srgrimes * Vnode op for VM putpages. 2271590Srgrimes */ 2281590Srgrimesint 2291590Srgrimesnfs_putpages(ap) 2301590Srgrimes struct vop_putpages_args /* { 23118730Ssteve struct vnode *a_vp; 23218730Ssteve vm_page_t *a_m; 23318730Ssteve int a_count; 23418730Ssteve int a_sync; 2351590Srgrimes int *a_rtvals; 2361590Srgrimes vm_ooffset_t a_offset; 2371590Srgrimes } */ *ap; 2381590Srgrimes{ 2391590Srgrimes struct uio uio; 2401590Srgrimes struct iovec iov; 2411590Srgrimes vm_offset_t kva; 2421590Srgrimes struct buf *bp; 2431590Srgrimes int iomode, must_commit, i, error, npages, count; 24418730Ssteve off_t offset; 2451590Srgrimes int *rtvals; 24618730Ssteve struct vnode *vp; 2471590Srgrimes struct proc *p; 2481590Srgrimes struct ucred *cred; 2491590Srgrimes struct nfsmount *nmp; 25018730Ssteve struct nfsnode *np; 25118730Ssteve vm_page_t *pages; 25218730Ssteve 25318730Ssteve vp = ap->a_vp; 25418730Ssteve np = VTONFS(vp); 25518730Ssteve p = curproc; /* XXX */ 25618730Ssteve cred = curproc->p_ucred; /* XXX */ 25718730Ssteve nmp = VFSTONFS(vp->v_mount); 25818730Ssteve pages = ap->a_m; 25918730Ssteve count = ap->a_count; 2601590Srgrimes rtvals = ap->a_rtvals; 2611590Srgrimes npages = btoc(count); 2621590Srgrimes offset = IDX_TO_OFF(pages[0]->pindex); 2631590Srgrimes 2648874Srgrimes if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && 2651590Srgrimes (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) 26618730Ssteve (void)nfs_fsinfo(nmp, vp, cred, p); 2671590Srgrimes 2681590Srgrimes for (i = 0; i < npages; i++) { 2691590Srgrimes rtvals[i] = VM_PAGER_AGAIN; 2701590Srgrimes } 2711590Srgrimes 27218730Ssteve /* 2731590Srgrimes * When putting pages, do not extend file past EOF. 2741590Srgrimes */ 27518730Ssteve 2761590Srgrimes if (offset + count > np->n_size) { 27718730Ssteve count = np->n_size - offset; 2781590Srgrimes if (count < 0) 2791590Srgrimes count = 0; 2801590Srgrimes } 28118730Ssteve 28218730Ssteve /* 28318730Ssteve * We use only the kva address for the buffer, but this is extremely 28418730Ssteve * convienient and fast. 28518730Ssteve */ 28618730Ssteve bp = getpbuf(&nfs_pbuf_freecnt); 287103503Sjmallett 28818730Ssteve kva = (vm_offset_t) bp->b_data; 28918730Ssteve pmap_qenter(kva, pages, npages); 29018730Ssteve 29118730Ssteve iov.iov_base = (caddr_t) kva; 29218730Ssteve iov.iov_len = count; 29318730Ssteve uio.uio_iov = &iov; 29418730Ssteve uio.uio_iovcnt = 1; 29518730Ssteve uio.uio_offset = offset; 29618730Ssteve uio.uio_resid = count; 297103503Sjmallett uio.uio_segflg = UIO_SYSSPACE; 298103503Sjmallett uio.uio_rw = UIO_WRITE; 29918730Ssteve uio.uio_procp = p; 30018730Ssteve 30192921Simp if ((ap->a_sync & VM_PAGER_PUT_SYNC) == 0) 30292921Simp iomode = NFSV3WRITE_UNSTABLE; 30392921Simp else 30492921Simp iomode = NFSV3WRITE_FILESYNC; 30592921Simp 30692921Simp error = nfs_writerpc(vp, &uio, cred, &iomode, &must_commit); 30718730Ssteve 30892921Simp pmap_qremove(kva, npages); 30918730Ssteve relpbuf(bp, &nfs_pbuf_freecnt); 31092921Simp 31118730Ssteve if (!error) { 31218730Ssteve int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE; 31392921Simp for (i = 0; i < nwritten; i++) { 31492921Simp rtvals[i] = VM_PAGER_OK; 31518730Ssteve vm_page_undirty(pages[i]); 31692921Simp } 31792921Simp if (must_commit) 31892921Simp nfs_clearcommit(vp->v_mount); 31992921Simp } 32092921Simp return rtvals[0]; 32192921Simp} 32292921Simp 32392921Simp/* 3241590Srgrimes * Vnode op for read using bio 3251590Srgrimes */ 3261590Srgrimesint 3271590Srgrimesnfs_bioread(vp, uio, ioflag, cred) 3281590Srgrimes register struct vnode *vp; 3291590Srgrimes register struct uio *uio; 3301590Srgrimes int ioflag; 3311590Srgrimes struct ucred *cred; 3321590Srgrimes{ 3331590Srgrimes register struct nfsnode *np = VTONFS(vp); 3341590Srgrimes register int biosize, i; 3351590Srgrimes struct buf *bp = 0, *rabp; 3361590Srgrimes struct vattr vattr; 3371590Srgrimes struct proc *p; 3381590Srgrimes struct nfsmount *nmp = VFSTONFS(vp->v_mount); 3391590Srgrimes daddr_t lbn, rabn; 3405814Sjkh int bcount; 34169531Swill int seqcount; 34269531Swill int nra, error = 0, n = 0, on = 0; 3431590Srgrimes 3445814Sjkh#ifdef DIAGNOSTIC 3455814Sjkh if (uio->uio_rw != UIO_READ) 3461590Srgrimes panic("nfs_read mode"); 3471590Srgrimes#endif 34818730Ssteve if (uio->uio_resid == 0) 3491590Srgrimes return (0); 3501590Srgrimes if (uio->uio_offset < 0) /* XXX VDIR cookies can be negative */ 3511590Srgrimes return (EINVAL); 3521590Srgrimes p = uio->uio_procp; 3531590Srgrimes 3541590Srgrimes if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && 3551590Srgrimes (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) 3561590Srgrimes (void)nfs_fsinfo(nmp, vp, cred, p); 35718730Ssteve if (vp->v_type != VDIR && 35818730Ssteve (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize) 35918730Ssteve return (EFBIG); 36018730Ssteve biosize = vp->v_mount->mnt_stat.f_iosize; 36118730Ssteve seqcount = (int)((off_t)(ioflag >> 16) * biosize / BKVASIZE); 36218730Ssteve /* 3631590Srgrimes * For nfs, cache consistency can only be maintained approximately. 3641590Srgrimes * Although RFC1094 does not specify the criteria, the following is 36518730Ssteve * believed to be compatible with the reference port. 3661590Srgrimes * For nqnfs, full cache consistency is maintained within the loop. 3671590Srgrimes * For nfs: 3681590Srgrimes * If the file's modify time on the server has changed since the 3691590Srgrimes * last read rpc or you have written to the file, 3701590Srgrimes * you may have lost data cache consistency with the 3711590Srgrimes * server, so flush all of the file's data out of the cache. 3721590Srgrimes * Then force a getattr rpc to ensure that you have up to date 3731590Srgrimes * attributes. 3741590Srgrimes * NB: This implies that cache data can be read when up to 3751590Srgrimes * NFS_ATTRTIMEO seconds out of date. If you find that you need current 3761590Srgrimes * attributes this could be forced by setting n_attrstamp to 0 before 3771590Srgrimes * the VOP_GETATTR() call. 3781590Srgrimes */ 3798874Srgrimes if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) { 3801590Srgrimes if (np->n_flag & NMODIFIED) { 3811590Srgrimes if (vp->v_type != VREG) { 3821590Srgrimes if (vp->v_type != VDIR) 3831590Srgrimes panic("nfs: bioread, not dir"); 3841590Srgrimes nfs_invaldir(vp); 3851590Srgrimes error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 38618730Ssteve if (error) 38718730Ssteve return (error); 3888874Srgrimes } 38918730Ssteve np->n_attrstamp = 0; 39018730Ssteve error = VOP_GETATTR(vp, &vattr, cred, p); 39118730Ssteve if (error) 39218730Ssteve return (error); 39369531Swill np->n_mtime = vattr.va_mtime.tv_sec; 3941590Srgrimes } else { 3951590Srgrimes error = VOP_GETATTR(vp, &vattr, cred, p); 3961590Srgrimes if (error) 3971590Srgrimes return (error); 3981590Srgrimes if (np->n_mtime != vattr.va_mtime.tv_sec) { 3991590Srgrimes if (vp->v_type == VDIR) 4001590Srgrimes nfs_invaldir(vp); 40118730Ssteve error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 4021590Srgrimes if (error) 40318730Ssteve return (error); 4041590Srgrimes np->n_mtime = vattr.va_mtime.tv_sec; 4058874Srgrimes } 4061590Srgrimes } 4071590Srgrimes } 4081590Srgrimes do { 4091590Srgrimes 41038520Scracauer /* 4111590Srgrimes * Get a valid lease. If cached data is stale, flush it. 4128874Srgrimes */ 4131590Srgrimes if (nmp->nm_flag & NFSMNT_NQNFS) { 4141590Srgrimes if (NQNFS_CKINVALID(vp, np, ND_READ)) { 4151590Srgrimes do { 4161590Srgrimes error = nqnfs_getlease(vp, ND_READ, cred, p); 4171590Srgrimes } while (error == NQNFS_EXPIRED); 4181590Srgrimes if (error) 41918730Ssteve return (error); 42018730Ssteve if (np->n_lrev != np->n_brev || 42118730Ssteve (np->n_flag & NQNFSNONCACHE) || 42218730Ssteve ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) { 42318730Ssteve if (vp->v_type == VDIR) 42418730Ssteve nfs_invaldir(vp); 42518730Ssteve error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 4261590Srgrimes if (error) 42718730Ssteve return (error); 42818730Ssteve np->n_brev = np->n_lrev; 42918730Ssteve } 43018730Ssteve } else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) { 43118730Ssteve nfs_invaldir(vp); 43218730Ssteve error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 43318730Ssteve if (error) 4341590Srgrimes return (error); 43518730Ssteve } 43618730Ssteve } 4375814Sjkh if (np->n_flag & NQNFSNONCACHE) { 43869531Swill switch (vp->v_type) { 4391590Srgrimes case VREG: 44018730Ssteve return (nfs_readrpc(vp, uio, cred)); 44118730Ssteve case VLNK: 44218730Ssteve return (nfs_readlinkrpc(vp, uio, cred)); 44318730Ssteve case VDIR: 4441590Srgrimes break; 4451590Srgrimes default: 4461590Srgrimes printf(" NQNFSNONCACHE: type %x unexpected\n", 4471590Srgrimes vp->v_type); 4481590Srgrimes }; 4491590Srgrimes } 4501590Srgrimes switch (vp->v_type) { 4511590Srgrimes case VREG: 4521590Srgrimes nfsstats.biocache_reads++; 4531590Srgrimes lbn = uio->uio_offset / biosize; 4541590Srgrimes on = uio->uio_offset & (biosize - 1); 4551590Srgrimes 4561590Srgrimes /* 4571590Srgrimes * Start the read ahead(s), as required. 4581590Srgrimes */ 4591590Srgrimes if (nfs_numasync > 0 && nmp->nm_readahead > 0) { 4601590Srgrimes for (nra = 0; nra < nmp->nm_readahead && nra < seqcount && 46118730Ssteve (off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) { 46269531Swill rabn = lbn + 1 + nra; 46369531Swill if (!incore(vp, rabn)) { 4641590Srgrimes rabp = nfs_getcacheblk(vp, rabn, biosize, p); 46518730Ssteve if (!rabp) 4661590Srgrimes return (EINTR); 4671590Srgrimes if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { 46818730Ssteve rabp->b_flags |= B_ASYNC; 4691590Srgrimes rabp->b_iocmd = BIO_READ; 4701590Srgrimes vfs_busy_pages(rabp, 0); 47118730Ssteve if (nfs_asyncio(rabp, cred, p)) { 47218730Ssteve rabp->b_flags |= B_INVAL|B_ERROR; 47318730Ssteve vfs_unbusy_pages(rabp); 47418730Ssteve brelse(rabp); 47518730Ssteve break; 47618730Ssteve } 47718730Ssteve } else { 47818730Ssteve brelse(rabp); 47918730Ssteve } 48018730Ssteve } 48118730Ssteve } 48218730Ssteve } 48318730Ssteve 48469531Swill /* 48569531Swill * Obtain the buffer cache block. Figure out the buffer size 48618730Ssteve * when we are at EOF. If we are modifying the size of the 48718730Ssteve * buffer based on an EOF condition we need to hold 48818730Ssteve * nfs_rslock() through obtaining the buffer to prevent 48918730Ssteve * a potential writer-appender from messing with n_size. 49018730Ssteve * Otherwise we may accidently truncate the buffer and 49118730Ssteve * lose dirty data. 49218730Ssteve * 4931590Srgrimes * Note that bcount is *not* DEV_BSIZE aligned. 4941590Srgrimes */ 4951590Srgrimes 4961590Srgrimesagain: 4971590Srgrimes bcount = biosize; 4981590Srgrimes if ((off_t)lbn * biosize >= np->n_size) { 4991590Srgrimes bcount = 0; 5001590Srgrimes } else if ((off_t)(lbn + 1) * biosize > np->n_size) { 5011590Srgrimes bcount = np->n_size - (off_t)lbn * biosize; 5021590Srgrimes } 5031590Srgrimes if (bcount != biosize) { 50494594Sobrien switch(nfs_rslock(np, p)) { 5051590Srgrimes case ENOLCK: 5061590Srgrimes goto again; 5071590Srgrimes /* not reached */ 5081590Srgrimes case EINTR: 5091590Srgrimes case ERESTART: 5101590Srgrimes return(EINTR); 5111590Srgrimes /* not reached */ 5121590Srgrimes default: 5131590Srgrimes break; 5141590Srgrimes } 5151590Srgrimes } 5161590Srgrimes 5171590Srgrimes bp = nfs_getcacheblk(vp, lbn, bcount, p); 5181590Srgrimes 51918730Ssteve if (bcount != biosize) 52069531Swill nfs_rsunlock(np, p); 52169531Swill if (!bp) 5221590Srgrimes return (EINTR); 5231590Srgrimes 5241590Srgrimes /* 5251590Srgrimes * If B_CACHE is not set, we must issue the read. If this 5261590Srgrimes * fails, we return an error. 5271590Srgrimes */ 5281590Srgrimes 5291590Srgrimes if ((bp->b_flags & B_CACHE) == 0) { 5301590Srgrimes bp->b_iocmd = BIO_READ; 5311590Srgrimes vfs_busy_pages(bp, 0); 5321590Srgrimes error = nfs_doio(bp, cred, p); 5331590Srgrimes if (error) { 5341590Srgrimes brelse(bp); 5355814Sjkh return (error); 5368874Srgrimes } 5371590Srgrimes } 53818730Ssteve 5391590Srgrimes /* 54018730Ssteve * on is the offset into the current bp. Figure out how many 5418874Srgrimes * bytes we can copy out of the bp. Note that bcount is 5421590Srgrimes * NOT DEV_BSIZE aligned. 54318730Ssteve * 54469531Swill * Then figure out how many bytes we can copy into the uio. 54518730Ssteve */ 5461590Srgrimes 54718730Ssteve n = 0; 5481590Srgrimes if (on < bcount) 5491590Srgrimes n = min((unsigned)(bcount - on), uio->uio_resid); 550103503Sjmallett break; 55118730Ssteve case VLNK: 55218730Ssteve nfsstats.biocache_readlinks++; 55318730Ssteve bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p); 55418730Ssteve if (!bp) 55518730Ssteve return (EINTR); 5561590Srgrimes if ((bp->b_flags & B_CACHE) == 0) { 5571590Srgrimes bp->b_iocmd = BIO_READ; 5581590Srgrimes vfs_busy_pages(bp, 0); 5591590Srgrimes error = nfs_doio(bp, cred, p); 5601590Srgrimes if (error) { 5611590Srgrimes bp->b_flags |= B_ERROR; 5621590Srgrimes brelse(bp); 56369531Swill return (error); 56418730Ssteve } 56569531Swill } 5661590Srgrimes n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid); 5671590Srgrimes on = 0; 5681590Srgrimes break; 5691590Srgrimes case VDIR: 5701590Srgrimes nfsstats.biocache_readdirs++; 5711590Srgrimes if (np->n_direofoffset 5721590Srgrimes && uio->uio_offset >= np->n_direofoffset) { 5731590Srgrimes return (0); 57460569Swill } 5751590Srgrimes lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ; 5761590Srgrimes on = uio->uio_offset & (NFS_DIRBLKSIZ - 1); 5771590Srgrimes bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p); 5781590Srgrimes if (!bp) 5791590Srgrimes return (EINTR); 5801590Srgrimes if ((bp->b_flags & B_CACHE) == 0) { 5811590Srgrimes bp->b_iocmd = BIO_READ; 5821590Srgrimes vfs_busy_pages(bp, 0); 5831590Srgrimes error = nfs_doio(bp, cred, p); 5841590Srgrimes if (error) { 58518730Ssteve brelse(bp); 5861590Srgrimes } 58718730Ssteve while (error == NFSERR_BAD_COOKIE) { 5881590Srgrimes printf("got bad cookie vp %p bp %p\n", vp, bp); 5891590Srgrimes nfs_invaldir(vp); 5901590Srgrimes error = nfs_vinvalbuf(vp, 0, cred, p, 1); 5911590Srgrimes /* 5921590Srgrimes * Yuck! The directory has been modified on the 5931590Srgrimes * server. The only way to get the block is by 59418730Ssteve * reading from the beginning to get all the 5951590Srgrimes * offset cookies. 5961590Srgrimes * 5971590Srgrimes * Leave the last bp intact unless there is an error. 5981590Srgrimes * Loop back up to the while if the error is another 5991590Srgrimes * NFSERR_BAD_COOKIE (double yuch!). 6001590Srgrimes */ 6011590Srgrimes for (i = 0; i <= lbn && !error; i++) { 6021590Srgrimes if (np->n_direofoffset 6031590Srgrimes && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset) 60418730Ssteve return (0); 6051590Srgrimes bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p); 60618730Ssteve if (!bp) 60718730Ssteve return (EINTR); 60818730Ssteve if ((bp->b_flags & B_CACHE) == 0) { 6091590Srgrimes bp->b_iocmd = BIO_READ; 61018730Ssteve vfs_busy_pages(bp, 0); 6111590Srgrimes error = nfs_doio(bp, cred, p); 6121590Srgrimes /* 61318730Ssteve * no error + B_INVAL == directory EOF, 6141590Srgrimes * use the block. 6151590Srgrimes */ 6161590Srgrimes if (error == 0 && (bp->b_flags & B_INVAL)) 6171590Srgrimes break; 6181590Srgrimes } 6191590Srgrimes /* 6201590Srgrimes * An error will throw away the block and the 6211590Srgrimes * for loop will break out. If no error and this 6221590Srgrimes * is not the block we want, we throw away the 6231590Srgrimes * block and go for the next one via the for loop. 62418730Ssteve */ 6251590Srgrimes if (error || i < lbn) 62618730Ssteve brelse(bp); 62718730Ssteve } 6281590Srgrimes } 6291590Srgrimes /* 6301590Srgrimes * The above while is repeated if we hit another cookie 6311590Srgrimes * error. If we hit an error and it wasn't a cookie error, 6321590Srgrimes * we give up. 6331590Srgrimes */ 6341590Srgrimes if (error) 6351590Srgrimes return (error); 6361590Srgrimes } 6371590Srgrimes 6381590Srgrimes /* 6391590Srgrimes * If not eof and read aheads are enabled, start one. 6401590Srgrimes * (You need the current block first, so that you have the 6411590Srgrimes * directory offset cookie of the next block.) 6421590Srgrimes */ 6431590Srgrimes if (nfs_numasync > 0 && nmp->nm_readahead > 0 && 6448874Srgrimes (bp->b_flags & B_INVAL) == 0 && 64518730Ssteve (np->n_direofoffset == 0 || 6468874Srgrimes (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) && 6471590Srgrimes !(np->n_flag & NQNFSNONCACHE) && 6481590Srgrimes !incore(vp, lbn + 1)) { 6491590Srgrimes rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p); 6501590Srgrimes if (rabp) { 6511590Srgrimes if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { 6521590Srgrimes rabp->b_flags |= B_ASYNC; 6531590Srgrimes rabp->b_iocmd = BIO_READ; 65418730Ssteve vfs_busy_pages(rabp, 0); 6551590Srgrimes if (nfs_asyncio(rabp, cred, p)) { 6561590Srgrimes rabp->b_flags |= B_INVAL|B_ERROR; 65718730Ssteve vfs_unbusy_pages(rabp); 6581590Srgrimes brelse(rabp); 6591590Srgrimes } 66018730Ssteve } else { 6611590Srgrimes brelse(rabp); 66218730Ssteve } 6631590Srgrimes } 6641590Srgrimes } 6651590Srgrimes /* 6661590Srgrimes * Unlike VREG files, whos buffer size ( bp->b_bcount ) is 6671590Srgrimes * chopped for the EOF condition, we cannot tell how large 6681590Srgrimes * NFS directories are going to be until we hit EOF. So 6691590Srgrimes * an NFS directory buffer is *not* chopped to its EOF. Now, 6701590Srgrimes * it just so happens that b_resid will effectively chop it 6711590Srgrimes * to EOF. *BUT* this information is lost if the buffer goes 6721590Srgrimes * away and is reconstituted into a B_CACHE state ( due to 6731590Srgrimes * being VMIO ) later. So we keep track of the directory eof 6741590Srgrimes * in np->n_direofoffset and chop it off as an extra step 6751590Srgrimes * right here. 6761590Srgrimes */ 6771590Srgrimes n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on); 6781590Srgrimes if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset) 6791590Srgrimes n = np->n_direofoffset - uio->uio_offset; 68018730Ssteve break; 68169531Swill default: 68269531Swill printf(" nfs_bioread: type %x unexpected\n",vp->v_type); 6831590Srgrimes break; 68469531Swill }; 68518730Ssteve 68618730Ssteve if (n > 0) { 6871590Srgrimes error = uiomove(bp->b_data + on, (int)n, uio); 6881590Srgrimes } 68918730Ssteve switch (vp->v_type) { 6901590Srgrimes case VREG: 6911590Srgrimes break; 69218730Ssteve case VLNK: 69318730Ssteve n = 0; 69418730Ssteve break; 69518730Ssteve case VDIR: 69618730Ssteve /* 69718730Ssteve * Invalidate buffer if caching is disabled, forcing a 69818730Ssteve * re-read from the remote later. 69918730Ssteve */ 70018730Ssteve if (np->n_flag & NQNFSNONCACHE) 70118730Ssteve bp->b_flags |= B_INVAL; 70218730Ssteve break; 70318730Ssteve default: 70418730Ssteve printf(" nfs_bioread: type %x unexpected\n",vp->v_type); 70518730Ssteve } 70618730Ssteve brelse(bp); 70718730Ssteve } while (error == 0 && uio->uio_resid > 0 && n > 0); 70818730Ssteve return (error); 70918730Ssteve} 71018730Ssteve 71118730Ssteve/* 71218730Ssteve * Vnode op for write using bio 71318730Ssteve */ 71418730Ssteveint 71518730Sstevenfs_write(ap) 71618730Ssteve struct vop_write_args /* { 71718730Ssteve struct vnode *a_vp; 71818730Ssteve struct uio *a_uio; 71918730Ssteve int a_ioflag; 72018730Ssteve struct ucred *a_cred; 72118730Ssteve } */ *ap; 72218730Ssteve{ 72318730Ssteve int biosize; 72418730Ssteve struct uio *uio = ap->a_uio; 72518730Ssteve struct proc *p = uio->uio_procp; 7261590Srgrimes struct vnode *vp = ap->a_vp; 7271590Srgrimes struct nfsnode *np = VTONFS(vp); 7281590Srgrimes struct ucred *cred = ap->a_cred; 7291590Srgrimes int ioflag = ap->a_ioflag; 7301590Srgrimes struct buf *bp; 7311590Srgrimes struct vattr vattr; 7321590Srgrimes struct nfsmount *nmp = VFSTONFS(vp->v_mount); 7331590Srgrimes daddr_t lbn; 7341590Srgrimes int bcount; 7351590Srgrimes int n, on, error = 0, iomode, must_commit; 7361590Srgrimes int haverslock = 0; 7371590Srgrimes 7381590Srgrimes#ifdef DIAGNOSTIC 7391590Srgrimes if (uio->uio_rw != UIO_WRITE) 7401590Srgrimes panic("nfs_write mode"); 7411590Srgrimes if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc) 7421590Srgrimes panic("nfs_write proc"); 7431590Srgrimes#endif 7441590Srgrimes if (vp->v_type != VREG) 7451590Srgrimes return (EIO); 7461590Srgrimes if (np->n_flag & NWRITEERR) { 7471590Srgrimes np->n_flag &= ~NWRITEERR; 7481590Srgrimes return (np->n_error); 74918730Ssteve } 75018730Ssteve if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && 75118730Ssteve (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) 7521590Srgrimes (void)nfs_fsinfo(nmp, vp, cred, p); 75318730Ssteve 7541590Srgrimes /* 75518730Ssteve * Synchronously flush pending buffers if we are in synchronous 75618730Ssteve * mode or if we are appending. 75718730Ssteve */ 7581590Srgrimes if (ioflag & (IO_APPEND | IO_SYNC)) { 7591590Srgrimes if (np->n_flag & NMODIFIED) { 7601590Srgrimes np->n_attrstamp = 0; 7611590Srgrimes error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 7621590Srgrimes if (error) 7631590Srgrimes return (error); 7641590Srgrimes } 7651590Srgrimes } 7661590Srgrimes 76718730Ssteve /* 76818730Ssteve * If IO_APPEND then load uio_offset. We restart here if we cannot 76918730Ssteve * get the append lock. 77018730Ssteve */ 7711590Srgrimesrestart: 77218730Ssteve if (ioflag & IO_APPEND) { 7731590Srgrimes np->n_attrstamp = 0; 7741590Srgrimes error = VOP_GETATTR(vp, &vattr, cred, p); 77518730Ssteve if (error) 77618730Ssteve return (error); 77718730Ssteve uio->uio_offset = np->n_size; 77818730Ssteve } 77918730Ssteve 7801590Srgrimes if (uio->uio_offset < 0) 7811590Srgrimes return (EINVAL); 7821590Srgrimes if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize) 7831590Srgrimes return (EFBIG); 78418730Ssteve if (uio->uio_resid == 0) 78518730Ssteve return (0); 78618730Ssteve 78718730Ssteve /* 78818730Ssteve * We need to obtain the rslock if we intend to modify np->n_size 7891590Srgrimes * in order to guarentee the append point with multiple contending 79018730Ssteve * writers, to guarentee that no other appenders modify n_size 79118730Ssteve * while we are trying to obtain a truncated buffer (i.e. to avoid 79218730Ssteve * accidently truncating data written by another appender due to 7931590Srgrimes * the race), and to ensure that the buffer is populated prior to 79418730Ssteve * our extending of the file. We hold rslock through the entire 79518730Ssteve * operation. 79618730Ssteve * 79718730Ssteve * Note that we do not synchronize the case where someone truncates 79818730Ssteve * the file while we are appending to it because attempting to lock 7991590Srgrimes * this case may deadlock other parts of the system unexpectedly. 8001590Srgrimes */ 8011590Srgrimes if ((ioflag & IO_APPEND) || 8021590Srgrimes uio->uio_offset + uio->uio_resid > np->n_size) { 8031590Srgrimes switch(nfs_rslock(np, p)) { 8041590Srgrimes case ENOLCK: 8058874Srgrimes goto restart; 8061590Srgrimes /* not reached */ 80718730Ssteve case EINTR: 80818730Ssteve case ERESTART: 8091590Srgrimes return(EINTR); 8101590Srgrimes /* not reached */ 8111590Srgrimes default: 8128874Srgrimes break; 81318730Ssteve } 8141590Srgrimes haverslock = 1; 8151590Srgrimes } 8161590Srgrimes 8171590Srgrimes /* 8181590Srgrimes * Maybe this should be above the vnode op call, but so long as 81918730Ssteve * file servers have no limits, i don't think it matters 82094582Sobrien */ 82194582Sobrien if (p && uio->uio_offset + uio->uio_resid > 8221590Srgrimes p->p_rlimit[RLIMIT_FSIZE].rlim_cur) { 8231590Srgrimes psignal(p, SIGXFSZ); 8241590Srgrimes if (haverslock) 8251590Srgrimes nfs_rsunlock(np, p); 82618730Ssteve return (EFBIG); 82718730Ssteve } 82818730Ssteve 82918730Ssteve biosize = vp->v_mount->mnt_stat.f_iosize; 83018730Ssteve 83118730Ssteve do { 8321590Srgrimes /* 83318730Ssteve * Check for a valid write lease. 8341590Srgrimes */ 8351590Srgrimes if ((nmp->nm_flag & NFSMNT_NQNFS) && 83618730Ssteve NQNFS_CKINVALID(vp, np, ND_WRITE)) { 83718730Ssteve do { 83818730Ssteve error = nqnfs_getlease(vp, ND_WRITE, cred, p); 8391590Srgrimes } while (error == NQNFS_EXPIRED); 8401590Srgrimes if (error) 84118730Ssteve break; 8421590Srgrimes if (np->n_lrev != np->n_brev || 8431590Srgrimes (np->n_flag & NQNFSNONCACHE)) { 8441590Srgrimes error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 84518730Ssteve if (error) 8461590Srgrimes break; 8471590Srgrimes np->n_brev = np->n_lrev; 84818730Ssteve } 8491590Srgrimes } 85018730Ssteve if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) { 85118730Ssteve iomode = NFSV3WRITE_FILESYNC; 85218730Ssteve error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit); 85318730Ssteve if (must_commit) 85418730Ssteve nfs_clearcommit(vp->v_mount); 8551590Srgrimes break; 85618730Ssteve } 8571590Srgrimes nfsstats.biocache_writes++; 8581590Srgrimes lbn = uio->uio_offset / biosize; 85918730Ssteve on = uio->uio_offset & (biosize-1); 86018730Ssteve n = min((unsigned)(biosize - on), uio->uio_resid); 86118730Ssteveagain: 8621590Srgrimes /* 8631590Srgrimes * Handle direct append and file extension cases, calculate 86469531Swill * unaligned buffer size. 86518730Ssteve */ 86618730Ssteve 86718730Ssteve if (uio->uio_offset == np->n_size && n) { 86818730Ssteve /* 86918730Ssteve * Get the buffer (in its pre-append state to maintain 8701590Srgrimes * B_CACHE if it was previously set). Resize the 87118730Ssteve * nfsnode after we have locked the buffer to prevent 8721590Srgrimes * readers from reading garbage. 8731590Srgrimes */ 8741590Srgrimes bcount = on; 8751590Srgrimes bp = nfs_getcacheblk(vp, lbn, bcount, p); 8761590Srgrimes 8771590Srgrimes if (bp != NULL) { 8781590Srgrimes long save; 87918730Ssteve 8801590Srgrimes np->n_size = uio->uio_offset + n; 8811590Srgrimes np->n_flag |= NMODIFIED; 88218730Ssteve vnode_pager_setsize(vp, np->n_size); 8831590Srgrimes 88418730Ssteve save = bp->b_flags & B_CACHE; 88518730Ssteve bcount += n; 88618730Ssteve allocbuf(bp, bcount); 88718730Ssteve bp->b_flags |= save; 88818730Ssteve } 88918730Ssteve } else { 89018730Ssteve /* 89118730Ssteve * Obtain the locked cache block first, and then 89218730Ssteve * adjust the file's size as appropriate. 89318730Ssteve */ 89418730Ssteve bcount = on + n; 89518730Ssteve if ((off_t)lbn * biosize + bcount < np->n_size) { 89618730Ssteve if ((off_t)(lbn + 1) * biosize < np->n_size) 89718730Ssteve bcount = biosize; 8981590Srgrimes else 89918730Ssteve bcount = np->n_size - (off_t)lbn * biosize; 9001590Srgrimes } 90118730Ssteve 90269531Swill bp = nfs_getcacheblk(vp, lbn, bcount, p); 90318730Ssteve 90418730Ssteve if (uio->uio_offset + n > np->n_size) { 90518730Ssteve np->n_size = uio->uio_offset + n; 90618730Ssteve np->n_flag |= NMODIFIED; 90718730Ssteve vnode_pager_setsize(vp, np->n_size); 90818730Ssteve } 90918730Ssteve } 91018730Ssteve 91118730Ssteve if (!bp) { 91218730Ssteve error = EINTR; 91318730Ssteve break; 91418730Ssteve } 91518730Ssteve 91618730Ssteve /* 91718730Ssteve * Issue a READ if B_CACHE is not set. In special-append 91818730Ssteve * mode, B_CACHE is based on the buffer prior to the write 91918730Ssteve * op and is typically set, avoiding the read. If a read 92018730Ssteve * is required in special append mode, the server will 92118730Ssteve * probably send us a short-read since we extended the file 9221590Srgrimes * on our end, resulting in b_resid == 0 and, thusly, 9231590Srgrimes * B_CACHE getting set. 92418730Ssteve * 9251590Srgrimes * We can also avoid issuing the read if the write covers 9261590Srgrimes * the entire buffer. We have to make sure the buffer state 92718730Ssteve * is reasonable in this case since we will not be initiating 9281590Srgrimes * I/O. See the comments in kern/vfs_bio.c's getblk() for 9291590Srgrimes * more information. 93018730Ssteve * 9311590Srgrimes * B_CACHE may also be set due to the buffer being cached 9321590Srgrimes * normally. 9331590Srgrimes */ 9341590Srgrimes 9351590Srgrimes if (on == 0 && n == bcount) { 9361590Srgrimes bp->b_flags |= B_CACHE; 9371590Srgrimes bp->b_flags &= ~(B_ERROR | B_INVAL); 93818730Ssteve } 93918730Ssteve 94018730Ssteve if ((bp->b_flags & B_CACHE) == 0) { 94118730Ssteve bp->b_iocmd = BIO_READ; 94218730Ssteve vfs_busy_pages(bp, 0); 94318730Ssteve error = nfs_doio(bp, cred, p); 94418730Ssteve if (error) { 94518730Ssteve brelse(bp); 94618730Ssteve break; 94718730Ssteve } 94818730Ssteve } 94918730Ssteve if (!bp) { 95018730Ssteve error = EINTR; 95118730Ssteve break; 95218730Ssteve } 95318730Ssteve if (bp->b_wcred == NOCRED) { 95418730Ssteve crhold(cred); 95518730Ssteve bp->b_wcred = cred; 95618730Ssteve } 95718730Ssteve np->n_flag |= NMODIFIED; 9581590Srgrimes 9591590Srgrimes /* 9601590Srgrimes * If dirtyend exceeds file size, chop it down. This should 9611590Srgrimes * not normally occur but there is an append race where it 9621590Srgrimes * might occur XXX, so we log it. 9638874Srgrimes * 9641590Srgrimes * If the chopping creates a reverse-indexed or degenerate 9651590Srgrimes * situation with dirtyoff/end, we 0 both of them. 9661590Srgrimes */ 96718730Ssteve 9681590Srgrimes if (bp->b_dirtyend > bcount) { 9691590Srgrimes printf("NFS append race @%lx:%d\n", 9701590Srgrimes (long)bp->b_blkno * DEV_BSIZE, 9711590Srgrimes bp->b_dirtyend - bcount); 9721590Srgrimes bp->b_dirtyend = bcount; 9731590Srgrimes } 9741590Srgrimes 97569527Swill if (bp->b_dirtyoff >= bp->b_dirtyend) 97618730Ssteve bp->b_dirtyoff = bp->b_dirtyend = 0; 9771590Srgrimes 97869531Swill /* 9791590Srgrimes * If the new write will leave a contiguous dirty 9801590Srgrimes * area, just update the b_dirtyoff and b_dirtyend, 98118730Ssteve * otherwise force a write rpc of the old dirty area. 98269531Swill * 98318730Ssteve * While it is possible to merge discontiguous writes due to 9841590Srgrimes * our having a B_CACHE buffer ( and thus valid read data 98569531Swill * for the hole), we don't because it could lead to 9861590Srgrimes * significant cache coherency problems with multiple clients, 9871590Srgrimes * especially if locking is implemented later on. 98818730Ssteve * 9891590Srgrimes * as an optimization we could theoretically maintain 9901590Srgrimes * a linked list of discontinuous areas, but we would still 9911590Srgrimes * have to commit them separately so there isn't much 9921590Srgrimes * advantage to it except perhaps a bit of asynchronization. 9931590Srgrimes */ 9941590Srgrimes 9951590Srgrimes if (bp->b_dirtyend > 0 && 9961590Srgrimes (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) { 9971590Srgrimes if (VOP_BWRITE(bp->b_vp, bp) == EINTR) 9981590Srgrimes return (EINTR); 9991590Srgrimes goto again; 10001590Srgrimes } 10018874Srgrimes 100268898Skris /* 10031590Srgrimes * Check for valid write lease and get one as required. 10041590Srgrimes * In case getblk() and/or bwrite() delayed us. 10051590Srgrimes */ 100618730Ssteve if ((nmp->nm_flag & NFSMNT_NQNFS) && 10071590Srgrimes NQNFS_CKINVALID(vp, np, ND_WRITE)) { 10081590Srgrimes do { 10091590Srgrimes error = nqnfs_getlease(vp, ND_WRITE, cred, p); 10101590Srgrimes } while (error == NQNFS_EXPIRED); 10111590Srgrimes if (error) { 10121590Srgrimes brelse(bp); 10131590Srgrimes break; 10141590Srgrimes } 10151590Srgrimes if (np->n_lrev != np->n_brev || 10161590Srgrimes (np->n_flag & NQNFSNONCACHE)) { 10171590Srgrimes brelse(bp); 10181590Srgrimes error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 10191590Srgrimes if (error) 10201590Srgrimes break; 10211590Srgrimes np->n_brev = np->n_lrev; 10221590Srgrimes goto again; 10231590Srgrimes } 102418730Ssteve } 10251590Srgrimes 10261590Srgrimes error = uiomove((char *)bp->b_data + on, n, uio); 10271590Srgrimes 10281590Srgrimes /* 102918730Ssteve * Since this block is being modified, it must be written 10301590Srgrimes * again and not just committed. Since write clustering does 10311590Srgrimes * not work for the stage 1 data write, only the stage 2 10321590Srgrimes * commit rpc, we have to clear B_CLUSTEROK as well. 10331590Srgrimes */ 10341590Srgrimes bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); 10351590Srgrimes 10361590Srgrimes if (error) { 10371590Srgrimes bp->b_flags |= B_ERROR; 10388874Srgrimes brelse(bp); 10391590Srgrimes break; 104018730Ssteve } 104118730Ssteve 10421590Srgrimes /* 10431590Srgrimes * Only update dirtyoff/dirtyend if not a degenerate 10441590Srgrimes * condition. 10451590Srgrimes */ 10461590Srgrimes if (n) { 10471590Srgrimes if (bp->b_dirtyend > 0) { 10481590Srgrimes bp->b_dirtyoff = min(on, bp->b_dirtyoff); 104918730Ssteve bp->b_dirtyend = max((on + n), bp->b_dirtyend); 10501590Srgrimes } else { 105118730Ssteve bp->b_dirtyoff = on; 10521590Srgrimes bp->b_dirtyend = on + n; 10531590Srgrimes } 10541590Srgrimes vfs_bio_set_validclean(bp, on, n); 105518730Ssteve } 105618730Ssteve 105718730Ssteve /* 10581590Srgrimes * If the lease is non-cachable or IO_SYNC do bwrite(). 10591590Srgrimes * 10601590Srgrimes * IO_INVAL appears to be unused. The idea appears to be 10611590Srgrimes * to turn off caching in this case. Very odd. XXX 10621590Srgrimes */ 10631590Srgrimes if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) { 10641590Srgrimes if (ioflag & IO_INVAL) 10651590Srgrimes bp->b_flags |= B_NOCACHE; 10661590Srgrimes error = VOP_BWRITE(bp->b_vp, bp); 106794506Scharnier if (error) 106818730Ssteve break; 10691590Srgrimes if (np->n_flag & NQNFSNONCACHE) { 10708874Srgrimes error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 107118730Ssteve if (error) 107218730Ssteve break; 107318730Ssteve } 107418730Ssteve } else if ((n + on) == biosize && 107518730Ssteve (nmp->nm_flag & NFSMNT_NQNFS) == 0) { 107618730Ssteve bp->b_flags |= B_ASYNC; 10771590Srgrimes (void)nfs_writebp(bp, 0, 0); 10781590Srgrimes } else { 10791590Srgrimes bdwrite(bp); 10801590Srgrimes } 10811590Srgrimes } while (uio->uio_resid > 0 && n > 0); 10821590Srgrimes 10831590Srgrimes if (haverslock) 10848874Srgrimes nfs_rsunlock(np, p); 10851590Srgrimes 10861590Srgrimes return (error); 10871590Srgrimes} 10881590Srgrimes 10891590Srgrimes/* 10901590Srgrimes * Get an nfs cache block. 10911590Srgrimes * 10921590Srgrimes * Allocate a new one if the block isn't currently in the cache 10931590Srgrimes * and return the block marked busy. If the calling process is 10941590Srgrimes * interrupted by a signal for an interruptible mount point, return 109518730Ssteve * NULL. 10961590Srgrimes * 10971590Srgrimes * The caller must carefully deal with the possible B_INVAL state of 109898136Sjmallett * the buffer. nfs_doio() clears B_INVAL (and nfs_asyncio() clears it 10991590Srgrimes * indirectly), so synchronous reads can be issued without worrying about 11001590Srgrimes * the B_INVAL state. We have to be a little more careful when dealing 110118730Ssteve * with writes (see comments in nfs_write()) when extending a file past 11021590Srgrimes * its EOF. 11031590Srgrimes */ 11041590Srgrimesstatic struct buf * 11058874Srgrimesnfs_getcacheblk(vp, bn, size, p) 11061590Srgrimes struct vnode *vp; 110769527Swill daddr_t bn; 11085814Sjkh int size; 11091590Srgrimes struct proc *p; 11101590Srgrimes{ 11111590Srgrimes register struct buf *bp; 11121590Srgrimes struct mount *mp; 11131590Srgrimes struct nfsmount *nmp; 11141590Srgrimes 11151590Srgrimes mp = vp->v_mount; 11161590Srgrimes nmp = VFSTONFS(mp); 11171590Srgrimes 11181590Srgrimes if (nmp->nm_flag & NFSMNT_INT) { 111918730Ssteve bp = getblk(vp, bn, size, PCATCH, 0); 112049938Shoek while (bp == (struct buf *)0) { 112118730Ssteve if (nfs_sigintr(nmp, (struct nfsreq *)0, p)) 11221590Srgrimes return ((struct buf *)0); 11231590Srgrimes bp = getblk(vp, bn, size, 0, 2 * hz); 11241590Srgrimes } 11251590Srgrimes } else { 11261590Srgrimes bp = getblk(vp, bn, size, 0, 0); 11278874Srgrimes } 11281590Srgrimes 112918730Ssteve if (vp->v_type == VREG) { 113018730Ssteve int biosize; 11311590Srgrimes 113218730Ssteve biosize = mp->mnt_stat.f_iosize; 113318730Ssteve bp->b_blkno = bn * (biosize / DEV_BSIZE); 11341590Srgrimes } 113518730Ssteve return (bp); 113618730Ssteve} 113718730Ssteve 11381590Srgrimes/* 113935483Simp * Flush and invalidate all dirty buffers. If another process is already 114035483Simp * doing the flush, just wait for completion. 114135483Simp */ 114235483Simpint 114335483Simpnfs_vinvalbuf(vp, flags, cred, p, intrflg) 114435483Simp struct vnode *vp; 114518730Ssteve int flags; 11461590Srgrimes struct ucred *cred; 11471590Srgrimes struct proc *p; 11481590Srgrimes int intrflg; 114918730Ssteve{ 11501590Srgrimes register struct nfsnode *np = VTONFS(vp); 11511590Srgrimes struct nfsmount *nmp = VFSTONFS(vp->v_mount); 11521590Srgrimes int error = 0, slpflag, slptimeo; 11531590Srgrimes 11541590Srgrimes if (vp->v_flag & VXLOCK) { 11551590Srgrimes return (0); 11561590Srgrimes } 11571590Srgrimes 11581590Srgrimes if ((nmp->nm_flag & NFSMNT_INT) == 0) 11591590Srgrimes intrflg = 0; 11601590Srgrimes if (intrflg) { 11611590Srgrimes slpflag = PCATCH; 11628874Srgrimes slptimeo = 2 * hz; 11631590Srgrimes } else { 11641590Srgrimes slpflag = 0; 11651590Srgrimes slptimeo = 0; 11661590Srgrimes } 11671590Srgrimes /* 11681590Srgrimes * First wait for any other process doing a flush to complete. 11691590Srgrimes */ 11701590Srgrimes while (np->n_flag & NFLUSHINPROG) { 11711590Srgrimes np->n_flag |= NFLUSHWANT; 11721590Srgrimes error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval", 11731590Srgrimes slptimeo); 11741590Srgrimes if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) 11751590Srgrimes return (EINTR); 11761590Srgrimes } 11771590Srgrimes 11781590Srgrimes /* 11791590Srgrimes * Now, flush as required. 11801590Srgrimes */ 11811590Srgrimes np->n_flag |= NFLUSHINPROG; 11821590Srgrimes error = vinvalbuf(vp, flags, cred, p, slpflag, 0); 11831590Srgrimes while (error) { 11841590Srgrimes if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) { 11851590Srgrimes np->n_flag &= ~NFLUSHINPROG; 11861590Srgrimes if (np->n_flag & NFLUSHWANT) { 11871590Srgrimes np->n_flag &= ~NFLUSHWANT; 11881590Srgrimes wakeup((caddr_t)&np->n_flag); 11891590Srgrimes } 11901590Srgrimes return (EINTR); 11911590Srgrimes } 11921590Srgrimes error = vinvalbuf(vp, flags, cred, p, 0, slptimeo); 11931590Srgrimes } 11941590Srgrimes np->n_flag &= ~(NMODIFIED | NFLUSHINPROG); 11951590Srgrimes if (np->n_flag & NFLUSHWANT) { 11968874Srgrimes np->n_flag &= ~NFLUSHWANT; 11971590Srgrimes wakeup((caddr_t)&np->n_flag); 11981590Srgrimes } 11998874Srgrimes return (0); 120018730Ssteve} 120118730Ssteve 120218730Ssteve/* 120318730Ssteve * Initiate asynchronous I/O. Return an error if no nfsiods are available. 120418730Ssteve * This is mainly to avoid queueing async I/O requests when the nfsiods 12051590Srgrimes * are all hung on a dead server. 120618730Ssteve * 120718730Ssteve * Note: nfs_asyncio() does not clear (B_ERROR|B_INVAL) but when the bp 12081590Srgrimes * is eventually dequeued by the async daemon, nfs_doio() *will*. 12098874Srgrimes */ 12101590Srgrimesint 12111590Srgrimesnfs_asyncio(bp, cred, procp) 12121590Srgrimes register struct buf *bp; 12131590Srgrimes struct ucred *cred; 12141590Srgrimes struct proc *procp; 12151590Srgrimes{ 12161590Srgrimes struct nfsmount *nmp; 121718730Ssteve int i; 121818730Ssteve int gotiod; 12191590Srgrimes int slpflag = 0; 12201590Srgrimes int slptimeo = 0; 12218874Srgrimes int error; 12221590Srgrimes 12231590Srgrimes /* 12241590Srgrimes * If no async daemons then return EIO to force caller to run the rpc 12251590Srgrimes * synchronously. 12261590Srgrimes */ 12271590Srgrimes if (nfs_numasync == 0) 122818730Ssteve return (EIO); 122918730Ssteve 12301590Srgrimes nmp = VFSTONFS(bp->b_vp->v_mount); 12311590Srgrimes 12321590Srgrimes /* 12331590Srgrimes * Commits are usually short and sweet so lets save some cpu and 12341590Srgrimes * leave the async daemons for more important rpc's (such as reads 12351590Srgrimes * and writes). 12361590Srgrimes */ 123718730Ssteve if (bp->b_iocmd == BIO_WRITE && (bp->b_flags & B_NEEDCOMMIT) && 123818730Ssteve (nmp->nm_bufqiods > nfs_numasync / 2)) { 123918730Ssteve return(EIO); 124094506Scharnier } 12418874Srgrimes 12421590Srgrimesagain: 12431590Srgrimes if (nmp->nm_flag & NFSMNT_INT) 12441590Srgrimes slpflag = PCATCH; 12451590Srgrimes gotiod = FALSE; 12461590Srgrimes 124718730Ssteve /* 124818730Ssteve * Find a free iod to process this request. 12491590Srgrimes */ 12501590Srgrimes for (i = 0; i < NFS_MAXASYNCDAEMON; i++) 12511590Srgrimes if (nfs_iodwant[i]) { 12521590Srgrimes /* 12531590Srgrimes * Found one, so wake it up and tell it which 12541590Srgrimes * mount to process. 125518730Ssteve */ 125618730Ssteve NFS_DPF(ASYNCIO, 12571590Srgrimes ("nfs_asyncio: waking iod %d for mount %p\n", 12581590Srgrimes i, nmp)); 12591590Srgrimes nfs_iodwant[i] = (struct proc *)0; 12601590Srgrimes nfs_iodmount[i] = nmp; 12611590Srgrimes nmp->nm_bufqiods++; 12621590Srgrimes wakeup((caddr_t)&nfs_iodwant[i]); 12631590Srgrimes gotiod = TRUE; 126418730Ssteve break; 126518730Ssteve } 126618730Ssteve 12671590Srgrimes /* 12681590Srgrimes * If none are free, we may already have an iod working on this mount 12691590Srgrimes * point. If so, it will process our request. 12701590Srgrimes */ 12711590Srgrimes if (!gotiod) { 12721590Srgrimes if (nmp->nm_bufqiods > 0) { 12731590Srgrimes NFS_DPF(ASYNCIO, 127418730Ssteve ("nfs_asyncio: %d iods are already processing mount %p\n", 127518730Ssteve nmp->nm_bufqiods, nmp)); 127618730Ssteve gotiod = TRUE; 127718730Ssteve } 127818730Ssteve } 127918730Ssteve 12808874Srgrimes /* 128118730Ssteve * If we have an iod which can process the request, then queue 128218730Ssteve * the buffer. 128318730Ssteve */ 128418730Ssteve if (gotiod) { 128518730Ssteve /* 128697251Sru * Ensure that the queue never grows too large. We still want 12871590Srgrimes * to asynchronize so we block rather then return EIO. 128880381Ssheldonh */ 128918730Ssteve while (nmp->nm_bufqlen >= 2*nfs_numasync) { 129018730Ssteve NFS_DPF(ASYNCIO, 12911590Srgrimes ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp)); 129218730Ssteve nmp->nm_bufqwant = TRUE; 129318730Ssteve error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO, 129418730Ssteve "nfsaio", slptimeo); 12951590Srgrimes if (error) { 12961590Srgrimes if (nfs_sigintr(nmp, NULL, procp)) 12971590Srgrimes return (EINTR); 12981590Srgrimes if (slpflag == PCATCH) { 12991590Srgrimes slpflag = 0; 13001590Srgrimes slptimeo = 2 * hz; 13011590Srgrimes } 13021590Srgrimes } 13031590Srgrimes /* 13048874Srgrimes * We might have lost our iod while sleeping, 13051590Srgrimes * so check and loop if nescessary. 13061590Srgrimes */ 13071590Srgrimes if (nmp->nm_bufqiods == 0) { 13081590Srgrimes NFS_DPF(ASYNCIO, 13091590Srgrimes ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp)); 13101590Srgrimes goto again; 13111590Srgrimes } 13121590Srgrimes } 131318730Ssteve 13141590Srgrimes if (bp->b_iocmd == BIO_READ) { 131518730Ssteve if (bp->b_rcred == NOCRED && cred != NOCRED) { 131618730Ssteve crhold(cred); 131718730Ssteve bp->b_rcred = cred; 13181590Srgrimes } 13191590Srgrimes } else { 13201590Srgrimes bp->b_flags |= B_WRITEINPROG; 132118730Ssteve if (bp->b_wcred == NOCRED && cred != NOCRED) { 13221590Srgrimes crhold(cred); 132318730Ssteve bp->b_wcred = cred; 132418730Ssteve } 13251590Srgrimes } 13261590Srgrimes 13271590Srgrimes BUF_KERNPROC(bp); 132818730Ssteve TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist); 132918730Ssteve nmp->nm_bufqlen++; 133018730Ssteve return (0); 13311590Srgrimes } 13321590Srgrimes 13331590Srgrimes /* 13348874Srgrimes * All the iods are busy on other mounts, so return EIO to 13351590Srgrimes * force the caller to process the i/o synchronously. 13361590Srgrimes */ 13371590Srgrimes NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n")); 13381590Srgrimes return (EIO); 13391590Srgrimes} 134069531Swill 13411590Srgrimes/* 13421590Srgrimes * Do an I/O operation to/from a cache block. This may be called 13431590Srgrimes * synchronously or from an nfsiod. 13441590Srgrimes */ 13451590Srgrimesint 13461590Srgrimesnfs_doio(bp, cr, p) 13471590Srgrimes struct buf *bp; 13481590Srgrimes struct ucred *cr; 13491590Srgrimes struct proc *p; 13501590Srgrimes{ 13518874Srgrimes struct uio *uiop; 13521590Srgrimes struct vnode *vp; 13531590Srgrimes struct nfsnode *np; 13541590Srgrimes struct nfsmount *nmp; 13551590Srgrimes int error = 0, iomode, must_commit = 0; 13561590Srgrimes struct uio uio; 13571590Srgrimes struct iovec io; 13581590Srgrimes 13591590Srgrimes vp = bp->b_vp; 13601590Srgrimes np = VTONFS(vp); 13611590Srgrimes nmp = VFSTONFS(vp->v_mount); 13621590Srgrimes uiop = &uio; 13631590Srgrimes uiop->uio_iov = &io; 13641590Srgrimes uiop->uio_iovcnt = 1; 13658874Srgrimes uiop->uio_segflg = UIO_SYSSPACE; 13661590Srgrimes uiop->uio_procp = p; 13671590Srgrimes 13681590Srgrimes /* 13691590Srgrimes * clear B_ERROR and B_INVAL state prior to initiating the I/O. We 13701590Srgrimes * do this here so we do not have to do it in all the code that 13711590Srgrimes * calls us. 13721590Srgrimes */ 13731590Srgrimes bp->b_flags &= ~(B_ERROR | B_INVAL); 13741590Srgrimes 13751590Srgrimes KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp)); 13761590Srgrimes 13771590Srgrimes /* 13781590Srgrimes * Historically, paging was done with physio, but no more. 13791590Srgrimes */ 13801590Srgrimes if (bp->b_flags & B_PHYS) { 13811590Srgrimes /* 13821590Srgrimes * ...though reading /dev/drum still gets us here. 13831590Srgrimes */ 13841590Srgrimes io.iov_len = uiop->uio_resid = bp->b_bcount; 13851590Srgrimes /* mapping was done by vmapbuf() */ 13861590Srgrimes io.iov_base = bp->b_data; 13871590Srgrimes uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE; 13881590Srgrimes if (bp->b_iocmd == BIO_READ) { 13891590Srgrimes uiop->uio_rw = UIO_READ; 13901590Srgrimes nfsstats.read_physios++; 13911590Srgrimes error = nfs_readrpc(vp, uiop, cr); 13921590Srgrimes } else { 13931590Srgrimes int com; 13941590Srgrimes 13951590Srgrimes iomode = NFSV3WRITE_DATASYNC; 13961590Srgrimes uiop->uio_rw = UIO_WRITE; 13971590Srgrimes nfsstats.write_physios++; 139818730Ssteve error = nfs_writerpc(vp, uiop, cr, &iomode, &com); 13991590Srgrimes } 14001590Srgrimes if (error) { 14011590Srgrimes bp->b_flags |= B_ERROR; 14021590Srgrimes bp->b_error = error; 14031590Srgrimes } 14048874Srgrimes } else if (bp->b_iocmd == BIO_READ) { 14051590Srgrimes io.iov_len = uiop->uio_resid = bp->b_bcount; 14061590Srgrimes io.iov_base = bp->b_data; 14071590Srgrimes uiop->uio_rw = UIO_READ; 14081590Srgrimes switch (vp->v_type) { 14091590Srgrimes case VREG: 14101590Srgrimes uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE; 14111590Srgrimes nfsstats.read_bios++; 14121590Srgrimes error = nfs_readrpc(vp, uiop, cr); 14131590Srgrimes if (!error) { 14141590Srgrimes if (uiop->uio_resid) { 14151590Srgrimes /* 14161590Srgrimes * If we had a short read with no error, we must have 14171590Srgrimes * hit a file hole. We should zero-fill the remainder. 141818730Ssteve * This can also occur if the server hits the file EOF. 141918730Ssteve * 142018730Ssteve * Holes used to be able to occur due to pending 142118730Ssteve * writes, but that is not possible any longer. 14221590Srgrimes */ 142318730Ssteve int nread = bp->b_bcount - uiop->uio_resid; 142418730Ssteve int left = bp->b_bcount - nread; 142518730Ssteve 142618730Ssteve if (left > 0) 142718730Ssteve bzero((char *)bp->b_data + nread, left); 142818730Ssteve uiop->uio_resid = 0; 142918730Ssteve } 143018730Ssteve } 14311590Srgrimes if (p && (vp->v_flag & VTEXT) && 143218730Ssteve (((nmp->nm_flag & NFSMNT_NQNFS) && 143318730Ssteve NQNFS_CKINVALID(vp, np, ND_READ) && 143418730Ssteve np->n_lrev != np->n_brev) || 143518730Ssteve (!(nmp->nm_flag & NFSMNT_NQNFS) && 143618730Ssteve np->n_mtime != np->n_vattr.va_mtime.tv_sec))) { 143718730Ssteve uprintf("Process killed due to text file modification\n"); 143818730Ssteve psignal(p, SIGKILL); 143918730Ssteve PHOLD(p); 144018730Ssteve } 14411590Srgrimes break; 14421590Srgrimes case VLNK: 14431590Srgrimes uiop->uio_offset = (off_t)0; 14441590Srgrimes nfsstats.readlink_bios++; 14451590Srgrimes error = nfs_readlinkrpc(vp, uiop, cr); 144618730Ssteve break; 144718730Ssteve case VDIR: 144818730Ssteve nfsstats.readdir_bios++; 144918730Ssteve uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ; 145018730Ssteve if (nmp->nm_flag & NFSMNT_RDIRPLUS) { 145118730Ssteve error = nfs_readdirplusrpc(vp, uiop, cr); 145218730Ssteve if (error == NFSERR_NOTSUPP) 145318730Ssteve nmp->nm_flag &= ~NFSMNT_RDIRPLUS; 14541590Srgrimes } 14551590Srgrimes if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0) 145618730Ssteve error = nfs_readdirrpc(vp, uiop, cr); 145718730Ssteve /* 145818730Ssteve * end-of-directory sets B_INVAL but does not generate an 145918730Ssteve * error. 14601590Srgrimes */ 146118730Ssteve if (error == 0 && uiop->uio_resid == bp->b_bcount) 14621590Srgrimes bp->b_flags |= B_INVAL; 14631590Srgrimes break; 14641590Srgrimes default: 14651590Srgrimes printf("nfs_doio: type %x unexpected\n",vp->v_type); 14661590Srgrimes break; 146718730Ssteve }; 146818730Ssteve if (error) { 146918730Ssteve bp->b_flags |= B_ERROR; 147018730Ssteve bp->b_error = error; 147118730Ssteve } 147218730Ssteve } else { 147318730Ssteve /* 147418730Ssteve * If we only need to commit, try to commit 147569531Swill */ 14761590Srgrimes if (bp->b_flags & B_NEEDCOMMIT) { 14771590Srgrimes int retv; 147818730Ssteve off_t off; 147918730Ssteve 14801590Srgrimes off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; 14811590Srgrimes bp->b_flags |= B_WRITEINPROG; 148218730Ssteve retv = nfs_commit( 148318730Ssteve bp->b_vp, off, bp->b_dirtyend-bp->b_dirtyoff, 148418730Ssteve bp->b_wcred, p); 148518730Ssteve bp->b_flags &= ~B_WRITEINPROG; 148618730Ssteve if (retv == 0) { 148718730Ssteve bp->b_dirtyoff = bp->b_dirtyend = 0; 148818730Ssteve bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); 148918730Ssteve bp->b_resid = 0; 149018730Ssteve biodone(bp); 149118730Ssteve return (0); 149218730Ssteve } 14931590Srgrimes if (retv == NFSERR_STALEWRITEVERF) { 149418730Ssteve nfs_clearcommit(bp->b_vp->v_mount); 14958874Srgrimes } 149669531Swill } 14971590Srgrimes 14981590Srgrimes /* 14991590Srgrimes * Setup for actual write 15001590Srgrimes */ 150118730Ssteve 150218730Ssteve if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size) 15031590Srgrimes bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE; 15041590Srgrimes 15051590Srgrimes if (bp->b_dirtyend > bp->b_dirtyoff) { 15061590Srgrimes io.iov_len = uiop->uio_resid = bp->b_dirtyend 15071590Srgrimes - bp->b_dirtyoff; 15081590Srgrimes uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE 15091590Srgrimes + bp->b_dirtyoff; 15101590Srgrimes io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; 15111590Srgrimes uiop->uio_rw = UIO_WRITE; 15128874Srgrimes nfsstats.write_bios++; 15131590Srgrimes 15141590Srgrimes if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC) 15158874Srgrimes iomode = NFSV3WRITE_UNSTABLE; 15161590Srgrimes else 15178874Srgrimes iomode = NFSV3WRITE_FILESYNC; 15181590Srgrimes 151918730Ssteve bp->b_flags |= B_WRITEINPROG; 152018730Ssteve error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit); 15211590Srgrimes 152218730Ssteve /* 152318730Ssteve * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try 152418730Ssteve * to cluster the buffers needing commit. This will allow 152518730Ssteve * the system to submit a single commit rpc for the whole 152618730Ssteve * cluster. We can do this even if the buffer is not 100% 152718730Ssteve * dirty (relative to the NFS blocksize), so we optimize the 152818730Ssteve * append-to-file-case. 152918730Ssteve * 153018730Ssteve * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be 153118730Ssteve * cleared because write clustering only works for commit 153218730Ssteve * rpc's, not for the data portion of the write). 15331590Srgrimes */ 15341590Srgrimes 15351590Srgrimes if (!error && iomode == NFSV3WRITE_UNSTABLE) { 15361590Srgrimes bp->b_flags |= B_NEEDCOMMIT; 15371590Srgrimes if (bp->b_dirtyoff == 0 153818730Ssteve && bp->b_dirtyend == bp->b_bcount) 153918730Ssteve bp->b_flags |= B_CLUSTEROK; 15401590Srgrimes } else { 154169531Swill bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); 15421590Srgrimes } 15431590Srgrimes bp->b_flags &= ~B_WRITEINPROG; 154418730Ssteve 154518730Ssteve /* 15461590Srgrimes * For an interrupted write, the buffer is still valid 15471590Srgrimes * and the write hasn't been pushed to the server yet, 154818730Ssteve * so we can't set B_ERROR and report the interruption 15491590Srgrimes * by setting B_EINTR. For the B_ASYNC case, B_EINTR 15501590Srgrimes * is not relevant, so the rpc attempt is essentially 15511590Srgrimes * a noop. For the case of a V3 write rpc not being 15521590Srgrimes * committed to stable storage, the block is still 155318730Ssteve * dirty and requires either a commit rpc or another 155418730Ssteve * write rpc with iomode == NFSV3WRITE_FILESYNC before 15551590Srgrimes * the block is reused. This is indicated by setting 15561590Srgrimes * the B_DELWRI and B_NEEDCOMMIT flags. 155718730Ssteve * 15581590Srgrimes * If the buffer is marked B_PAGING, it does not reside on 155918730Ssteve * the vp's paging queues so we cannot call bdirty(). The 156018730Ssteve * bp in this case is not an NFS cache block so we should 156118730Ssteve * be safe. XXX 156218730Ssteve */ 156318730Ssteve if (error == EINTR 156418730Ssteve || (!error && (bp->b_flags & B_NEEDCOMMIT))) { 156518730Ssteve int s; 156618730Ssteve 156718730Ssteve s = splbio(); 156818730Ssteve bp->b_flags &= ~(B_INVAL|B_NOCACHE); 156918730Ssteve if ((bp->b_flags & B_PAGING) == 0) { 157018730Ssteve bdirty(bp); 15711590Srgrimes bp->b_flags &= ~B_DONE; 15721590Srgrimes } 15731590Srgrimes if (error && (bp->b_flags & B_ASYNC) == 0) 15741590Srgrimes bp->b_flags |= B_EINTR; 15751590Srgrimes splx(s); 15761590Srgrimes } else { 15771590Srgrimes if (error) { 157818730Ssteve bp->b_flags |= B_ERROR; 157918730Ssteve bp->b_error = np->n_error = error; 15801590Srgrimes np->n_flag |= NWRITEERR; 15811590Srgrimes } 158218730Ssteve bp->b_dirtyoff = bp->b_dirtyend = 0; 158318730Ssteve } 158418730Ssteve } else { 158518730Ssteve bp->b_resid = 0; 158618730Ssteve biodone(bp); 158718730Ssteve return (0); 158818730Ssteve } 158918730Ssteve } 159018730Ssteve bp->b_resid = uiop->uio_resid; 159118730Ssteve if (must_commit) 15921590Srgrimes nfs_clearcommit(vp->v_mount); 15931590Srgrimes biodone(bp); 15941590Srgrimes return (error); 15951590Srgrimes} 15961590Srgrimes