/*-
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 241541Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 251541Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 261541Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 271541Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 281541Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 291541Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 301541Srgrimes * SUCH DAMAGE. 311541Srgrimes * 3222521Sdyson * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95 331541Srgrimes */ 341541Srgrimes 3583651Speter#include <sys/cdefs.h> 3683654Speter__FBSDID("$FreeBSD$"); 3722521Sdyson 38190380Srwatson#include "opt_kdtrace.h" 39190380Srwatson 401541Srgrimes#include <sys/param.h> 411541Srgrimes#include <sys/systm.h> 4279247Sjhb#include <sys/bio.h> 4379247Sjhb#include <sys/buf.h> 4479247Sjhb#include <sys/kernel.h> 45192578Srwatson#include <sys/mbuf.h> 4679247Sjhb#include <sys/mount.h> 4779247Sjhb#include <sys/proc.h> 48248084Sattilio#include <sys/rwlock.h> 4979247Sjhb#include <sys/vmmeter.h> 501541Srgrimes#include <sys/vnode.h> 511541Srgrimes 521541Srgrimes#include <vm/vm.h> 53239065Skib#include <vm/vm_param.h> 5412662Sdg#include <vm/vm_extern.h> 5525930Sdfr#include <vm/vm_page.h> 5625930Sdfr#include <vm/vm_object.h> 5725930Sdfr#include <vm/vm_pager.h> 5825930Sdfr#include <vm/vnode_pager.h> 591541Srgrimes 609336Sdfr#include <nfs/nfsproto.h> 6183651Speter#include <nfsclient/nfs.h> 6283651Speter#include <nfsclient/nfsmount.h> 6383651Speter#include <nfsclient/nfsnode.h> 64221543Srmacklem#include <nfs/nfs_kdtrace.h> 651541Srgrimes 6683651Speterstatic struct buf *nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size, 6783651Speter struct thread *td); 68138899Spsstatic int nfs_directio_write(struct vnode *vp, struct uio *uiop, 69138899Sps struct ucred 
*cred, int ioflag); 7075580Sphk 71138899Spsextern int nfs_directio_enable; 72138899Spsextern int nfs_directio_allow_mmap; 73158739Smohans 741541Srgrimes/* 7525930Sdfr * Vnode op for VM getpages. 7625930Sdfr */ 7725930Sdfrint 7883651Speternfs_getpages(struct vop_getpages_args *ap) 7925930Sdfr{ 8046349Salc int i, error, nextoff, size, toff, count, npages; 8132755Sdyson struct uio uio; 8232755Sdyson struct iovec iov; 8332755Sdyson vm_offset_t kva; 8434206Sdyson struct buf *bp; 8536563Speter struct vnode *vp; 8683366Sjulian struct thread *td; 8736563Speter struct ucred *cred; 8836563Speter struct nfsmount *nmp; 89116461Salc vm_object_t object; 9036563Speter vm_page_t *pages; 91138899Sps struct nfsnode *np; 9225930Sdfr 9336563Speter vp = ap->a_vp; 94138899Sps np = VTONFS(vp); 9583366Sjulian td = curthread; /* XXX */ 9691406Sjhb cred = curthread->td_ucred; /* XXX */ 9736563Speter nmp = VFSTONFS(vp->v_mount); 9836563Speter pages = ap->a_m; 9936563Speter count = ap->a_count; 10036563Speter 101116461Salc if ((object = vp->v_object) == NULL) { 102158739Smohans nfs_printf("nfs_getpages: called with non-merged cache vnode??\n"); 103194425Salc return (VM_PAGER_ERROR); 10425930Sdfr } 10525930Sdfr 106158739Smohans if (nfs_directio_enable && !nfs_directio_allow_mmap) { 107158739Smohans mtx_lock(&np->n_mtx); 108158739Smohans if ((np->n_flag & NNONCACHE) && (vp->v_type == VREG)) { 109158739Smohans mtx_unlock(&np->n_mtx); 110158739Smohans nfs_printf("nfs_getpages: called on non-cacheable vnode??\n"); 111194425Salc return (VM_PAGER_ERROR); 112158739Smohans } else 113158739Smohans mtx_unlock(&np->n_mtx); 114138899Sps } 115138899Sps 116158739Smohans mtx_lock(&nmp->nm_mtx); 11736563Speter if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && 118158739Smohans (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) { 119158739Smohans mtx_unlock(&nmp->nm_mtx); 120122698Salfred /* We'll never get here for v4, because we always have fsinfo */ 12183366Sjulian (void)nfs_fsinfo(nmp, vp, cred, td); 122158739Smohans } else 
123158739Smohans mtx_unlock(&nmp->nm_mtx); 12446349Salc 12546349Salc npages = btoc(count); 12646349Salc 12734206Sdyson /* 12846349Salc * If the requested page is partially valid, just return it and 12946349Salc * allow the pager to zero-out the blanks. Partially valid pages 13046349Salc * can only occur at the file EOF. 13146349Salc */ 132248084Sattilio VM_OBJECT_WLOCK(object); 133194425Salc if (pages[ap->a_reqpage]->valid != 0) { 134194425Salc for (i = 0; i < npages; ++i) { 135207669Salc if (i != ap->a_reqpage) { 136207669Salc vm_page_lock(pages[i]); 137194425Salc vm_page_free(pages[i]); 138207669Salc vm_page_unlock(pages[i]); 139207669Salc } 14046349Salc } 141248084Sattilio VM_OBJECT_WUNLOCK(object); 142194425Salc return (0); 14346349Salc } 144248084Sattilio VM_OBJECT_WUNLOCK(object); 14546349Salc 14646349Salc /* 14734206Sdyson * We use only the kva address for the buffer, but this is extremely 14834206Sdyson * convienient and fast. 14934206Sdyson */ 15042957Sdillon bp = getpbuf(&nfs_pbuf_freecnt); 15125930Sdfr 15234206Sdyson kva = (vm_offset_t) bp->b_data; 15336563Speter pmap_qenter(kva, pages, npages); 154170292Sattilio PCPU_INC(cnt.v_vnodein); 155170292Sattilio PCPU_ADD(cnt.v_vnodepgsin, npages); 15634206Sdyson 15732755Sdyson iov.iov_base = (caddr_t) kva; 15836563Speter iov.iov_len = count; 15932755Sdyson uio.uio_iov = &iov; 16032755Sdyson uio.uio_iovcnt = 1; 16136563Speter uio.uio_offset = IDX_TO_OFF(pages[0]->pindex); 16236563Speter uio.uio_resid = count; 16332755Sdyson uio.uio_segflg = UIO_SYSSPACE; 16432755Sdyson uio.uio_rw = UIO_READ; 16583366Sjulian uio.uio_td = td; 16625930Sdfr 167122953Salfred error = (nmp->nm_rpcops->nr_readrpc)(vp, &uio, cred); 16834206Sdyson pmap_qremove(kva, npages); 16932755Sdyson 17042957Sdillon relpbuf(bp, &nfs_pbuf_freecnt); 17134206Sdyson 17242957Sdillon if (error && (uio.uio_resid == count)) { 173158739Smohans nfs_printf("nfs_getpages: error %d\n", error); 174248084Sattilio VM_OBJECT_WLOCK(object); 17542957Sdillon for (i = 0; 
i < npages; ++i) { 176207669Salc if (i != ap->a_reqpage) { 177207669Salc vm_page_lock(pages[i]); 17875692Salfred vm_page_free(pages[i]); 179207669Salc vm_page_unlock(pages[i]); 180207669Salc } 18142957Sdillon } 182248084Sattilio VM_OBJECT_WUNLOCK(object); 183194425Salc return (VM_PAGER_ERROR); 18442957Sdillon } 18534206Sdyson 18645347Sjulian /* 18745347Sjulian * Calculate the number of bytes read and validate only that number 18845347Sjulian * of bytes. Note that due to pending writes, size may be 0. This 18945347Sjulian * does not mean that the remaining data is invalid! 19045347Sjulian */ 19145347Sjulian 19236563Speter size = count - uio.uio_resid; 193248084Sattilio VM_OBJECT_WLOCK(object); 19434206Sdyson for (i = 0, toff = 0; i < npages; i++, toff = nextoff) { 19534206Sdyson vm_page_t m; 19634206Sdyson nextoff = toff + PAGE_SIZE; 19736563Speter m = pages[i]; 19834206Sdyson 19934206Sdyson if (nextoff <= size) { 20045347Sjulian /* 20145347Sjulian * Read operation filled an entire page 20245347Sjulian */ 20334206Sdyson m->valid = VM_PAGE_BITS_ALL; 204192010Salc KASSERT(m->dirty == 0, 205192010Salc ("nfs_getpages: page %p is dirty", m)); 20645347Sjulian } else if (size > toff) { 20745347Sjulian /* 20846349Salc * Read operation filled a partial page. 20945347Sjulian */ 21046349Salc m->valid = 0; 211228156Skib vm_page_set_valid_range(m, 0, size - toff); 212192986Salc KASSERT(m->dirty == 0, 213192134Salc ("nfs_getpages: page %p is dirty", m)); 21487834Sdillon } else { 21587834Sdillon /* 216239246Skib * Read operation was short. If no error 217239246Skib * occured we may have hit a zero-fill 218239246Skib * section. We leave valid set to 0, and page 219239246Skib * is freed by vm_page_readahead_finish() if 220239246Skib * its index is not equal to requested, or 221239246Skib * page is zeroed and set valid by 222239246Skib * vm_pager_get_pages() for requested page. 
22387834Sdillon */ 22487834Sdillon ; 22534206Sdyson } 226239040Skib if (i != ap->a_reqpage) 227239246Skib vm_page_readahead_finish(m); 22825930Sdfr } 229248084Sattilio VM_OBJECT_WUNLOCK(object); 230194425Salc return (0); 23125930Sdfr} 23225930Sdfr 23325930Sdfr/* 23434206Sdyson * Vnode op for VM putpages. 23534096Smsmith */ 23634096Smsmithint 23783651Speternfs_putpages(struct vop_putpages_args *ap) 23834096Smsmith{ 23934206Sdyson struct uio uio; 24034206Sdyson struct iovec iov; 24134206Sdyson vm_offset_t kva; 24234206Sdyson struct buf *bp; 24336563Speter int iomode, must_commit, i, error, npages, count; 24446349Salc off_t offset; 24534206Sdyson int *rtvals; 24636563Speter struct vnode *vp; 24783366Sjulian struct thread *td; 24836563Speter struct ucred *cred; 24936563Speter struct nfsmount *nmp; 25046349Salc struct nfsnode *np; 25136563Speter vm_page_t *pages; 25234206Sdyson 25336563Speter vp = ap->a_vp; 25446349Salc np = VTONFS(vp); 25583366Sjulian td = curthread; /* XXX */ 256235332Srmacklem /* Set the cred to n_writecred for the write rpcs. 
*/ 257235332Srmacklem if (np->n_writecred != NULL) 258235332Srmacklem cred = crhold(np->n_writecred); 259235332Srmacklem else 260235332Srmacklem cred = crhold(curthread->td_ucred); /* XXX */ 26136563Speter nmp = VFSTONFS(vp->v_mount); 26236563Speter pages = ap->a_m; 26336563Speter count = ap->a_count; 26434206Sdyson rtvals = ap->a_rtvals; 26536563Speter npages = btoc(count); 26646349Salc offset = IDX_TO_OFF(pages[0]->pindex); 267158739Smohans 268158739Smohans mtx_lock(&nmp->nm_mtx); 26936563Speter if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && 27076827Salfred (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) { 271158739Smohans mtx_unlock(&nmp->nm_mtx); 27283366Sjulian (void)nfs_fsinfo(nmp, vp, cred, td); 273158739Smohans } else 274158739Smohans mtx_unlock(&nmp->nm_mtx); 27534206Sdyson 276158739Smohans mtx_lock(&np->n_mtx); 277157557Smohans if (nfs_directio_enable && !nfs_directio_allow_mmap && 278158739Smohans (np->n_flag & NNONCACHE) && (vp->v_type == VREG)) { 279158739Smohans mtx_unlock(&np->n_mtx); 280158739Smohans nfs_printf("nfs_putpages: called on noncache-able vnode??\n"); 281158739Smohans mtx_lock(&np->n_mtx); 282158739Smohans } 283138899Sps 28483651Speter for (i = 0; i < npages; i++) 285222586Skib rtvals[i] = VM_PAGER_ERROR; 28634206Sdyson 28734206Sdyson /* 28846349Salc * When putting pages, do not extend file past EOF. 28946349Salc */ 29046349Salc if (offset + count > np->n_size) { 29146349Salc count = np->n_size - offset; 29246349Salc if (count < 0) 29346349Salc count = 0; 29446349Salc } 295158739Smohans mtx_unlock(&np->n_mtx); 29646349Salc 29746349Salc /* 29834206Sdyson * We use only the kva address for the buffer, but this is extremely 29934206Sdyson * convienient and fast. 
30034206Sdyson */ 30142957Sdillon bp = getpbuf(&nfs_pbuf_freecnt); 30234206Sdyson 30334206Sdyson kva = (vm_offset_t) bp->b_data; 30436563Speter pmap_qenter(kva, pages, npages); 305170292Sattilio PCPU_INC(cnt.v_vnodeout); 306170292Sattilio PCPU_ADD(cnt.v_vnodepgsout, count); 30734206Sdyson 30834206Sdyson iov.iov_base = (caddr_t) kva; 30936563Speter iov.iov_len = count; 31034206Sdyson uio.uio_iov = &iov; 31134206Sdyson uio.uio_iovcnt = 1; 31246349Salc uio.uio_offset = offset; 31336563Speter uio.uio_resid = count; 31434206Sdyson uio.uio_segflg = UIO_SYSSPACE; 31534206Sdyson uio.uio_rw = UIO_WRITE; 31683366Sjulian uio.uio_td = td; 31734206Sdyson 31834206Sdyson if ((ap->a_sync & VM_PAGER_PUT_SYNC) == 0) 31934206Sdyson iomode = NFSV3WRITE_UNSTABLE; 32034206Sdyson else 32134206Sdyson iomode = NFSV3WRITE_FILESYNC; 32234206Sdyson 323122953Salfred error = (nmp->nm_rpcops->nr_writerpc)(vp, &uio, cred, &iomode, &must_commit); 324235332Srmacklem crfree(cred); 32534206Sdyson 32634206Sdyson pmap_qremove(kva, npages); 32742957Sdillon relpbuf(bp, &nfs_pbuf_freecnt); 32834206Sdyson 32934206Sdyson if (!error) { 330222586Skib vnode_pager_undirty_pages(pages, rtvals, count - uio.uio_resid); 33176827Salfred if (must_commit) { 33236563Speter nfs_clearcommit(vp->v_mount); 33376827Salfred } 33434206Sdyson } 33536563Speter return rtvals[0]; 33634096Smsmith} 33734096Smsmith 33834096Smsmith/* 339158739Smohans * For nfs, cache consistency can only be maintained approximately. 340158739Smohans * Although RFC1094 does not specify the criteria, the following is 341158739Smohans * believed to be compatible with the reference port. 342158739Smohans * For nfs: 343158739Smohans * If the file's modify time on the server has changed since the 344158739Smohans * last read rpc or you have written to the file, 345158739Smohans * you may have lost data cache consistency with the 346158739Smohans * server, so flush all of the file's data out of the cache. 
347158739Smohans * Then force a getattr rpc to ensure that you have up to date 348158739Smohans * attributes. 349158739Smohans * NB: This implies that cache data can be read when up to 350158739Smohans * NFS_ATTRTIMEO seconds out of date. If you find that you need current 351158739Smohans * attributes this could be forced by setting n_attrstamp to 0 before 352158739Smohans * the VOP_GETATTR() call. 353158739Smohans */ 354158739Smohansstatic inline int 355158739Smohansnfs_bioread_check_cons(struct vnode *vp, struct thread *td, struct ucred *cred) 356158739Smohans{ 357158739Smohans int error = 0; 358158739Smohans struct vattr vattr; 359158739Smohans struct nfsnode *np = VTONFS(vp); 360158739Smohans int old_lock; 361158739Smohans struct nfsmount *nmp = VFSTONFS(vp->v_mount); 362158739Smohans 363158739Smohans /* 364158739Smohans * Grab the exclusive lock before checking whether the cache is 365158739Smohans * consistent. 366158739Smohans * XXX - We can make this cheaper later (by acquiring cheaper locks). 367158739Smohans * But for now, this suffices. 
368158739Smohans */ 369176134Sattilio old_lock = nfs_upgrade_vnlock(vp); 370193952Srmacklem if (vp->v_iflag & VI_DOOMED) { 371193952Srmacklem nfs_downgrade_vnlock(vp, old_lock); 372193952Srmacklem return (EBADF); 373193952Srmacklem } 374193952Srmacklem 375158739Smohans mtx_lock(&np->n_mtx); 376158739Smohans if (np->n_flag & NMODIFIED) { 377158739Smohans mtx_unlock(&np->n_mtx); 378158739Smohans if (vp->v_type != VREG) { 379158739Smohans if (vp->v_type != VDIR) 380158739Smohans panic("nfs: bioread, not dir"); 381158739Smohans (nmp->nm_rpcops->nr_invaldir)(vp); 382158739Smohans error = nfs_vinvalbuf(vp, V_SAVE, td, 1); 383158739Smohans if (error) 384158739Smohans goto out; 385158739Smohans } 386158739Smohans np->n_attrstamp = 0; 387190380Srwatson KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp); 388182371Sattilio error = VOP_GETATTR(vp, &vattr, cred); 389158739Smohans if (error) 390158739Smohans goto out; 391158739Smohans mtx_lock(&np->n_mtx); 392158739Smohans np->n_mtime = vattr.va_mtime; 393158739Smohans mtx_unlock(&np->n_mtx); 394158739Smohans } else { 395158739Smohans mtx_unlock(&np->n_mtx); 396182371Sattilio error = VOP_GETATTR(vp, &vattr, cred); 397158739Smohans if (error) 398158739Smohans return (error); 399158739Smohans mtx_lock(&np->n_mtx); 400158739Smohans if ((np->n_flag & NSIZECHANGED) 401158739Smohans || (NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime))) { 402158739Smohans mtx_unlock(&np->n_mtx); 403158739Smohans if (vp->v_type == VDIR) 404158739Smohans (nmp->nm_rpcops->nr_invaldir)(vp); 405158739Smohans error = nfs_vinvalbuf(vp, V_SAVE, td, 1); 406158739Smohans if (error) 407158739Smohans goto out; 408158739Smohans mtx_lock(&np->n_mtx); 409158739Smohans np->n_mtime = vattr.va_mtime; 410158739Smohans np->n_flag &= ~NSIZECHANGED; 411158739Smohans } 412158739Smohans mtx_unlock(&np->n_mtx); 413158739Smohans } 414158739Smohansout: 415176134Sattilio nfs_downgrade_vnlock(vp, old_lock); 416158739Smohans return error; 417158739Smohans} 418158739Smohans 419158739Smohans/* 
4201541Srgrimes * Vnode op for read using bio 4211541Srgrimes */ 4221549Srgrimesint 42383651Speternfs_bioread(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred) 4241541Srgrimes{ 42583651Speter struct nfsnode *np = VTONFS(vp); 42683651Speter int biosize, i; 427143822Sdas struct buf *bp, *rabp; 42883366Sjulian struct thread *td; 4299336Sdfr struct nfsmount *nmp = VFSTONFS(vp->v_mount); 4305455Sdg daddr_t lbn, rabn; 431224733Sjhb off_t end; 43246349Salc int bcount; 43351344Sdillon int seqcount; 43446349Salc int nra, error = 0, n = 0, on = 0; 4351541Srgrimes 436209120Skib KASSERT(uio->uio_rw == UIO_READ, ("nfs_read mode")); 4371541Srgrimes if (uio->uio_resid == 0) 4381541Srgrimes return (0); 43936473Speter if (uio->uio_offset < 0) /* XXX VDIR cookies can be negative */ 4401541Srgrimes return (EINVAL); 44183366Sjulian td = uio->uio_td; 44251344Sdillon 443158739Smohans mtx_lock(&nmp->nm_mtx); 44436176Speter if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && 445158739Smohans (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) { 446158739Smohans mtx_unlock(&nmp->nm_mtx); 44783366Sjulian (void)nfs_fsinfo(nmp, vp, cred, td); 448158739Smohans } else 449158739Smohans mtx_unlock(&nmp->nm_mtx); 450158739Smohans 451224733Sjhb end = uio->uio_offset + uio->uio_resid; 45236473Speter if (vp->v_type != VDIR && 453224733Sjhb (end > nmp->nm_maxfilesize || end < uio->uio_offset)) 45436473Speter return (EFBIG); 455138899Sps 456138899Sps if (nfs_directio_enable && (ioflag & IO_DIRECT) && (vp->v_type == VREG)) 457138899Sps /* No caching/ no readaheads. 
Just read data into the user buffer */ 458138899Sps return nfs_readrpc(vp, uio, cred); 459138899Sps 460230605Srmacklem biosize = vp->v_bufobj.bo_bsize; 461108357Sdillon seqcount = (int)((off_t)(ioflag >> IO_SEQSHIFT) * biosize / BKVASIZE); 462158739Smohans 463158739Smohans error = nfs_bioread_check_cons(vp, td, cred); 464158739Smohans if (error) 465158739Smohans return error; 466158739Smohans 46783651Speter do { 468158739Smohans u_quad_t nsize; 469158739Smohans 470158739Smohans mtx_lock(&np->n_mtx); 471158739Smohans nsize = np->n_size; 472158739Smohans mtx_unlock(&np->n_mtx); 473158739Smohans 4741541Srgrimes switch (vp->v_type) { 4751541Srgrimes case VREG: 4761541Srgrimes nfsstats.biocache_reads++; 4771541Srgrimes lbn = uio->uio_offset / biosize; 478248500Semaste on = uio->uio_offset - (lbn * biosize); 4791541Srgrimes 4801541Srgrimes /* 4811541Srgrimes * Start the read ahead(s), as required. 4821541Srgrimes */ 483158739Smohans if (nmp->nm_readahead > 0) { 48451344Sdillon for (nra = 0; nra < nmp->nm_readahead && nra < seqcount && 485158739Smohans (off_t)(lbn + 1 + nra) * biosize < nsize; nra++) { 4865455Sdg rabn = lbn + 1 + nra; 487136767Sphk if (incore(&vp->v_bufobj, rabn) == NULL) { 48883366Sjulian rabp = nfs_getcacheblk(vp, rabn, biosize, td); 489131691Salfred if (!rabp) { 490195203Sdfr error = nfs_sigintr(nmp, td); 491131691Salfred return (error ? 
error : EINTR); 492131691Salfred } 4938692Sdg if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { 49458345Sphk rabp->b_flags |= B_ASYNC; 49558345Sphk rabp->b_iocmd = BIO_READ; 4965455Sdg vfs_busy_pages(rabp, 0); 497134898Sphk if (nfs_asyncio(nmp, rabp, cred, td)) { 49858934Sphk rabp->b_flags |= B_INVAL; 49958934Sphk rabp->b_ioflags |= BIO_ERROR; 5005455Sdg vfs_unbusy_pages(rabp); 5011541Srgrimes brelse(rabp); 50255431Sdillon break; 5031541Srgrimes } 50455431Sdillon } else { 5055471Sdg brelse(rabp); 50655431Sdillon } 5071541Srgrimes } 5081541Srgrimes } 5091541Srgrimes } 5101541Srgrimes 511148268Sps /* Note that bcount is *not* DEV_BSIZE aligned. */ 51246349Salc bcount = biosize; 513158739Smohans if ((off_t)lbn * biosize >= nsize) { 51446349Salc bcount = 0; 515158739Smohans } else if ((off_t)(lbn + 1) * biosize > nsize) { 516158739Smohans bcount = nsize - (off_t)lbn * biosize; 5178692Sdg } 51883366Sjulian bp = nfs_getcacheblk(vp, lbn, bcount, td); 51954605Sdillon 520131691Salfred if (!bp) { 521195203Sdfr error = nfs_sigintr(nmp, td); 522131691Salfred return (error ? error : EINTR); 523131691Salfred } 52442957Sdillon 52525930Sdfr /* 52646349Salc * If B_CACHE is not set, we must issue the read. If this 52746349Salc * fails, we return an error. 52825930Sdfr */ 52946349Salc 5307871Sdg if ((bp->b_flags & B_CACHE) == 0) { 53158345Sphk bp->b_iocmd = BIO_READ; 53232755Sdyson vfs_busy_pages(bp, 0); 533134898Sphk error = nfs_doio(vp, bp, cred, td); 53432755Sdyson if (error) { 53532755Sdyson brelse(bp); 53632755Sdyson return (error); 53732755Sdyson } 5381541Srgrimes } 53946349Salc 54046349Salc /* 54146349Salc * on is the offset into the current bp. Figure out how many 54246349Salc * bytes we can copy out of the bp. Note that bcount is 54346349Salc * NOT DEV_BSIZE aligned. 54446349Salc * 54546349Salc * Then figure out how many bytes we can copy into the uio. 
54646349Salc */ 54746349Salc 54846349Salc n = 0; 54946349Salc if (on < bcount) 550231949Skib n = MIN((unsigned)(bcount - on), uio->uio_resid); 5511541Srgrimes break; 5521541Srgrimes case VLNK: 5531541Srgrimes nfsstats.biocache_readlinks++; 55483366Sjulian bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, td); 555131691Salfred if (!bp) { 556195203Sdfr error = nfs_sigintr(nmp, td); 557131691Salfred return (error ? error : EINTR); 558131691Salfred } 5597871Sdg if ((bp->b_flags & B_CACHE) == 0) { 56058345Sphk bp->b_iocmd = BIO_READ; 56132755Sdyson vfs_busy_pages(bp, 0); 562134898Sphk error = nfs_doio(vp, bp, cred, td); 56332755Sdyson if (error) { 56458934Sphk bp->b_ioflags |= BIO_ERROR; 56532755Sdyson brelse(bp); 56632755Sdyson return (error); 56732755Sdyson } 5681541Srgrimes } 569231949Skib n = MIN(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid); 5701541Srgrimes on = 0; 5711541Srgrimes break; 5721541Srgrimes case VDIR: 5731541Srgrimes nfsstats.biocache_readdirs++; 57424577Sdfr if (np->n_direofoffset 57524577Sdfr && uio->uio_offset >= np->n_direofoffset) { 57624577Sdfr return (0); 57724577Sdfr } 57836979Sbde lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ; 5799336Sdfr on = uio->uio_offset & (NFS_DIRBLKSIZ - 1); 58083366Sjulian bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, td); 581131691Salfred if (!bp) { 582195203Sdfr error = nfs_sigintr(nmp, td); 583131691Salfred return (error ? error : EINTR); 584131691Salfred } 5857871Sdg if ((bp->b_flags & B_CACHE) == 0) { 58658345Sphk bp->b_iocmd = BIO_READ; 5879336Sdfr vfs_busy_pages(bp, 0); 588134898Sphk error = nfs_doio(vp, bp, cred, td); 58932912Stegge if (error) { 59032912Stegge brelse(bp); 59132912Stegge } 59232755Sdyson while (error == NFSERR_BAD_COOKIE) { 593122953Salfred (nmp->nm_rpcops->nr_invaldir)(vp); 594140731Sphk error = nfs_vinvalbuf(vp, 0, td, 1); 59532755Sdyson /* 59632755Sdyson * Yuck! The directory has been modified on the 59732755Sdyson * server. 
The only way to get the block is by 59832755Sdyson * reading from the beginning to get all the 59932755Sdyson * offset cookies. 60046349Salc * 60146349Salc * Leave the last bp intact unless there is an error. 60246349Salc * Loop back up to the while if the error is another 60346349Salc * NFSERR_BAD_COOKIE (double yuch!). 60432755Sdyson */ 60532755Sdyson for (i = 0; i <= lbn && !error; i++) { 60632755Sdyson if (np->n_direofoffset 60732755Sdyson && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset) 60824577Sdfr return (0); 60983366Sjulian bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, td); 610131691Salfred if (!bp) { 611195203Sdfr error = nfs_sigintr(nmp, td); 612131691Salfred return (error ? error : EINTR); 613131691Salfred } 61446349Salc if ((bp->b_flags & B_CACHE) == 0) { 61558345Sphk bp->b_iocmd = BIO_READ; 61646349Salc vfs_busy_pages(bp, 0); 617134898Sphk error = nfs_doio(vp, bp, cred, td); 61846349Salc /* 61946349Salc * no error + B_INVAL == directory EOF, 62046349Salc * use the block. 62146349Salc */ 62246349Salc if (error == 0 && (bp->b_flags & B_INVAL)) 62346349Salc break; 62446349Salc } 62546349Salc /* 62646349Salc * An error will throw away the block and the 62746349Salc * for loop will break out. If no error and this 62846349Salc * is not the block we want, we throw away the 62946349Salc * block and go for the next one via the for loop. 63046349Salc */ 63146349Salc if (error || i < lbn) 63232755Sdyson brelse(bp); 6331541Srgrimes } 63432912Stegge } 63546349Salc /* 63646349Salc * The above while is repeated if we hit another cookie 63746349Salc * error. If we hit an error and it wasn't a cookie error, 63846349Salc * we give up. 63946349Salc */ 64032912Stegge if (error) 6419336Sdfr return (error); 6421541Srgrimes } 6431541Srgrimes 6441541Srgrimes /* 6451541Srgrimes * If not eof and read aheads are enabled, start one. 6461541Srgrimes * (You need the current block first, so that you have the 6479336Sdfr * directory offset cookie of the next block.) 
6481541Srgrimes */ 64989324Speter if (nmp->nm_readahead > 0 && 65039782Smckusick (bp->b_flags & B_INVAL) == 0 && 6519336Sdfr (np->n_direofoffset == 0 || 6529336Sdfr (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) && 653136767Sphk incore(&vp->v_bufobj, lbn + 1) == NULL) { 65483366Sjulian rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, td); 6551541Srgrimes if (rabp) { 6568692Sdg if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { 65758345Sphk rabp->b_flags |= B_ASYNC; 65858345Sphk rabp->b_iocmd = BIO_READ; 6595455Sdg vfs_busy_pages(rabp, 0); 660134898Sphk if (nfs_asyncio(nmp, rabp, cred, td)) { 66158934Sphk rabp->b_flags |= B_INVAL; 66258934Sphk rabp->b_ioflags |= BIO_ERROR; 6635455Sdg vfs_unbusy_pages(rabp); 6641541Srgrimes brelse(rabp); 6651541Srgrimes } 6665471Sdg } else { 6675471Sdg brelse(rabp); 6681541Srgrimes } 6691541Srgrimes } 6701541Srgrimes } 67126469Sdfr /* 67246349Salc * Unlike VREG files, whos buffer size ( bp->b_bcount ) is 67346349Salc * chopped for the EOF condition, we cannot tell how large 67446349Salc * NFS directories are going to be until we hit EOF. So 67546349Salc * an NFS directory buffer is *not* chopped to its EOF. Now, 67646349Salc * it just so happens that b_resid will effectively chop it 67746349Salc * to EOF. *BUT* this information is lost if the buffer goes 67846349Salc * away and is reconstituted into a B_CACHE state ( due to 67946349Salc * being VMIO ) later. So we keep track of the directory eof 68083651Speter * in np->n_direofoffset and chop it off as an extra step 68146349Salc * right here. 
68226469Sdfr */ 68326469Sdfr n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on); 68446349Salc if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset) 68546349Salc n = np->n_direofoffset - uio->uio_offset; 6861541Srgrimes break; 6873305Sphk default: 688158739Smohans nfs_printf(" nfs_bioread: type %x unexpected\n", vp->v_type); 689143822Sdas bp = NULL; 6903305Sphk break; 6911541Srgrimes }; 6921541Srgrimes 6931541Srgrimes if (n > 0) { 69434206Sdyson error = uiomove(bp->b_data + on, (int)n, uio); 6951541Srgrimes } 696143822Sdas if (vp->v_type == VLNK) 6971541Srgrimes n = 0; 698143822Sdas if (bp != NULL) 699143822Sdas brelse(bp); 7001541Srgrimes } while (error == 0 && uio->uio_resid > 0 && n > 0); 7011541Srgrimes return (error); 7021541Srgrimes} 7031541Srgrimes 7041541Srgrimes/* 705138899Sps * The NFS write path cannot handle iovecs with len > 1. So we need to 706138899Sps * break up iovecs accordingly (restricting them to wsize). 707138899Sps * For the SYNC case, we can do this with 1 copy (user buffer -> mbuf). 708138899Sps * For the ASYNC case, 2 copies are needed. The first a copy from the 709138899Sps * user buffer to a staging buffer and then a second copy from the staging 710138899Sps * buffer to mbufs. This can be optimized by copying from the user buffer 711138899Sps * directly into mbufs and passing the chain down, but that requires a 712138899Sps * fair amount of re-working of the relevant codepaths (and can be done 713138899Sps * later). 
 */
static int
nfs_directio_write(vp, uiop, cred, ioflag)
	struct vnode *vp;
	struct uio *uiop;
	struct ucred *cred;
	int ioflag;
{
	int error;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	struct thread *td = uiop->uio_td;
	int size;
	int wsize;

	/* Snapshot the mount's write transfer size under the mount mutex. */
	mtx_lock(&nmp->nm_mtx);
	wsize = nmp->nm_wsize;
	mtx_unlock(&nmp->nm_mtx);
	if (ioflag & IO_SYNC) {
		int iomode, must_commit;
		struct uio uio;
		struct iovec iov;
do_sync:
		/*
		 * Synchronous path: issue FILESYNC write RPCs directly from
		 * the caller's buffer, at most wsize bytes (and at most one
		 * iovec element) per RPC, advancing the caller's uio by hand
		 * after each successful transfer.
		 */
		while (uiop->uio_resid > 0) {
			size = MIN(uiop->uio_resid, wsize);
			size = MIN(uiop->uio_iov->iov_len, size);
			iov.iov_base = uiop->uio_iov->iov_base;
			iov.iov_len = size;
			uio.uio_iov = &iov;
			uio.uio_iovcnt = 1;
			uio.uio_offset = uiop->uio_offset;
			uio.uio_resid = size;
			uio.uio_segflg = UIO_USERSPACE;
			uio.uio_rw = UIO_WRITE;
			uio.uio_td = td;
			iomode = NFSV3WRITE_FILESYNC;
			error = (nmp->nm_rpcops->nr_writerpc)(vp, &uio, cred,
			    &iomode, &must_commit);
			KASSERT((must_commit == 0),
			    ("nfs_directio_write: Did not commit write"));
			if (error)
				return (error);
			uiop->uio_offset += size;
			uiop->uio_resid -= size;
			/* Step to the next iovec once this one is consumed. */
			if (uiop->uio_iov->iov_len <= size) {
				uiop->uio_iovcnt--;
				uiop->uio_iov++;
			} else {
				uiop->uio_iov->iov_base =
				    (char *)uiop->uio_iov->iov_base + size;
				uiop->uio_iov->iov_len -= size;
			}
		}
	} else {
		struct uio *t_uio;
		struct iovec *t_iov;
		struct buf *bp;

		/*
		 * Break up the write into blocksize chunks and hand these
		 * over to nfsiod's for write back.
		 * Unfortunately, this incurs a copy of the data. Since
		 * the user could modify the buffer before the write is
		 * initiated.
		 *
		 * The obvious optimization here is that one of the 2 copies
		 * in the async write path can be eliminated by copying the
		 * data here directly into mbufs and passing the mbuf chain
		 * down. But that will require a fair amount of re-working
		 * of the code and can be done if there's enough interest
		 * in NFS directio access.
		 */
		while (uiop->uio_resid > 0) {
			size = MIN(uiop->uio_resid, wsize);
			size = MIN(uiop->uio_iov->iov_len, size);
			/*
			 * Each chunk gets a private pbuf plus a heap copy of
			 * the user data; ownership of t_uio/t_iov/data passes
			 * to the nfsiod (freed in nfs_doio_directwrite) on
			 * successful queueing.
			 */
			bp = getpbuf(&nfs_pbuf_freecnt);
			t_uio = malloc(sizeof(struct uio), M_NFSDIRECTIO, M_WAITOK);
			t_iov = malloc(sizeof(struct iovec), M_NFSDIRECTIO, M_WAITOK);
			t_iov->iov_base = malloc(size, M_NFSDIRECTIO, M_WAITOK);
			t_iov->iov_len = size;
			t_uio->uio_iov = t_iov;
			t_uio->uio_iovcnt = 1;
			t_uio->uio_offset = uiop->uio_offset;
			t_uio->uio_resid = size;
			t_uio->uio_segflg = UIO_SYSSPACE;
			t_uio->uio_rw = UIO_WRITE;
			t_uio->uio_td = td;
			KASSERT(uiop->uio_segflg == UIO_USERSPACE ||
			    uiop->uio_segflg == UIO_SYSSPACE,
			    ("nfs_directio_write: Bad uio_segflg"));
			if (uiop->uio_segflg == UIO_USERSPACE) {
				error = copyin(uiop->uio_iov->iov_base,
				    t_iov->iov_base, size);
				if (error != 0)
					goto err_free;
			} else
				/*
				 * UIO_SYSSPACE may never happen, but handle
				 * it just in case it does.
				 */
				bcopy(uiop->uio_iov->iov_base, t_iov->iov_base,
				    size);
			bp->b_flags |= B_DIRECT;
			bp->b_iocmd = BIO_WRITE;
			if (cred != NOCRED) {
				crhold(cred);
				bp->b_wcred = cred;
			} else
				bp->b_wcred = NOCRED;
			bp->b_caller1 = (void *)t_uio;
			bp->b_vp = vp;
			error = nfs_asyncio(nmp, bp, NOCRED, td);
err_free:
			if (error) {
				/*
				 * Queueing (or copyin) failed: reclaim the
				 * chunk resources here.  A signal aborts the
				 * write; any other error falls back to the
				 * synchronous path above for the remainder.
				 */
				free(t_iov->iov_base, M_NFSDIRECTIO);
				free(t_iov, M_NFSDIRECTIO);
				free(t_uio, M_NFSDIRECTIO);
				bp->b_vp = NULL;
				relpbuf(bp, &nfs_pbuf_freecnt);
				if (error == EINTR)
					return (error);
				goto do_sync;
			}
			uiop->uio_offset += size;
			uiop->uio_resid -= size;
			if (uiop->uio_iov->iov_len <= size) {
				uiop->uio_iovcnt--;
				uiop->uio_iov++;
			} else {
				uiop->uio_iov->iov_base =
				    (char *)uiop->uio_iov->iov_base + size;
				uiop->uio_iov->iov_len -= size;
			}
		}
	}
	return (0);
}

/*
 * Vnode op for write using bio
 */
int
nfs_write(struct vop_write_args *ap)
{
	int biosize;
	struct uio *uio = ap->a_uio;
	struct thread *td = uio->uio_td;
	struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn;
	off_t end;
	int bcount;
	int n, on, error = 0;

	KASSERT(uio->uio_rw == UIO_WRITE, ("nfs_write mode"));
	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
	    ("nfs_write proc"));
	if (vp->v_type != VREG)
		return (EIO);
	/*
	 * Report (and clear) any error recorded by an earlier asynchronous
	 * write before accepting new data.
	 */
	mtx_lock(&np->n_mtx);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		mtx_unlock(&np->n_mtx);
		return (np->n_error);
	} else
		mtx_unlock(&np->n_mtx);
	/* Fetch fsinfo once for NFSv3 mounts that don't have it yet. */
	mtx_lock(&nmp->nm_mtx);
	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
		mtx_unlock(&nmp->nm_mtx);
		(void)nfs_fsinfo(nmp, vp, cred, td);
	} else
		mtx_unlock(&nmp->nm_mtx);

	/*
	 * Synchronously flush pending buffers if we are in synchronous
	 * mode or if we are appending.
	 */
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		mtx_lock(&np->n_mtx);
		if (np->n_flag & NMODIFIED) {
			mtx_unlock(&np->n_mtx);
#ifdef notyet /* Needs matching nonblock semantics elsewhere, too. */
			/*
			 * Require non-blocking, synchronous writes to
			 * dirty files to inform the program it needs
			 * to fsync(2) explicitly.
			 */
			if (ioflag & IO_NDELAY)
				return (EAGAIN);
#endif
flush_and_restart:
			np->n_attrstamp = 0;
			KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
			error = nfs_vinvalbuf(vp, V_SAVE, td, 1);
			if (error)
				return (error);
		} else
			mtx_unlock(&np->n_mtx);
	}

	/*
	 * If IO_APPEND then load uio_offset.  We restart here if we cannot
	 * get the append lock.
	 */
	if (ioflag & IO_APPEND) {
		np->n_attrstamp = 0;
		KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp);
		error = VOP_GETATTR(vp, &vattr, cred);
		if (error)
			return (error);
		mtx_lock(&np->n_mtx);
		uio->uio_offset = np->n_size;
		mtx_unlock(&np->n_mtx);
	}

	if (uio->uio_offset < 0)
		return (EINVAL);
	/* The second test also catches signed overflow of the end offset. */
	end = uio->uio_offset + uio->uio_resid;
	if (end > nmp->nm_maxfilesize || end < uio->uio_offset)
		return (EFBIG);
	if (uio->uio_resid == 0)
		return (0);

	if (nfs_directio_enable && (ioflag & IO_DIRECT) && vp->v_type == VREG)
		return nfs_directio_write(vp, uio, cred, ioflag);

	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, i don't think it matters
	 */
	if (vn_rlimit_fsize(vp, uio, td))
		return (EFBIG);

	biosize = vp->v_bufobj.bo_bsize;
	/*
	 * Find all of this file's B_NEEDCOMMIT buffers.  If our writes
	 * would exceed the local maximum per-file write commit size when
	 * combined with those, we must decide whether to flush,
	 * go synchronous, or return error.  We don't bother checking
	 * IO_UNIT -- we just make all writes atomic anyway, as there's
	 * no point optimizing for something that really won't ever happen.
	 */
	if (!(ioflag & IO_SYNC)) {
		int nflag;

		mtx_lock(&np->n_mtx);
		nflag = np->n_flag;
		mtx_unlock(&np->n_mtx);
		int needrestart = 0;
		if (nmp->nm_wcommitsize < uio->uio_resid) {
			/*
			 * If this request could not possibly be completed
			 * without exceeding the maximum outstanding write
			 * commit size, see if we can convert it into a
			 * synchronous write operation.
			 */
			if (ioflag & IO_NDELAY)
				return (EAGAIN);
			ioflag |= IO_SYNC;
			if (nflag & NMODIFIED)
				needrestart = 1;
		} else if (nflag & NMODIFIED) {
			int wouldcommit = 0;
			/* Sum the uncommitted dirty buffers on this vnode. */
			BO_LOCK(&vp->v_bufobj);
			if (vp->v_bufobj.bo_dirty.bv_cnt != 0) {
				TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd,
				    b_bobufs) {
					if (bp->b_flags & B_NEEDCOMMIT)
						wouldcommit += bp->b_bcount;
				}
			}
			BO_UNLOCK(&vp->v_bufobj);
			/*
			 * Since we're not operating synchronously and
			 * bypassing the buffer cache, we are in a commit
			 * and holding all of these buffers whether
			 * transmitted or not.  If not limited, this
			 * will lead to the buffer cache deadlocking,
			 * as no one else can flush our uncommitted buffers.
			 */
			wouldcommit += uio->uio_resid;
			/*
			 * If we would initially exceed the maximum
			 * outstanding write commit size, flush and restart.
			 */
			if (wouldcommit > nmp->nm_wcommitsize)
				needrestart = 1;
		}
		if (needrestart)
			goto flush_and_restart;
	}

	do {
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset - (lbn * biosize);
		n = MIN((unsigned)(biosize - on), uio->uio_resid);
again:
		/*
		 * Handle direct append and file extension cases, calculate
		 * unaligned buffer size.
		 */
		mtx_lock(&np->n_mtx);
		if (uio->uio_offset == np->n_size && n) {
			mtx_unlock(&np->n_mtx);
			/*
			 * Get the buffer (in its pre-append state to maintain
			 * B_CACHE if it was previously set).  Resize the
			 * nfsnode after we have locked the buffer to prevent
			 * readers from reading garbage.
			 */
			bcount = on;
			bp = nfs_getcacheblk(vp, lbn, bcount, td);

			if (bp != NULL) {
				long save;

				mtx_lock(&np->n_mtx);
				np->n_size = uio->uio_offset + n;
				np->n_flag |= NMODIFIED;
				vnode_pager_setsize(vp, np->n_size);
				mtx_unlock(&np->n_mtx);

				/* allocbuf() may clear B_CACHE; preserve it. */
				save = bp->b_flags & B_CACHE;
				bcount += n;
				allocbuf(bp, bcount);
				bp->b_flags |= save;
			}
		} else {
			/*
			 * Obtain the locked cache block first, and then
			 * adjust the file's size as appropriate.
			 */
			bcount = on + n;
			if ((off_t)lbn * biosize + bcount < np->n_size) {
				if ((off_t)(lbn + 1) * biosize < np->n_size)
					bcount = biosize;
				else
					bcount = np->n_size - (off_t)lbn * biosize;
			}
			mtx_unlock(&np->n_mtx);
			bp = nfs_getcacheblk(vp, lbn, bcount, td);
			mtx_lock(&np->n_mtx);
			if (uio->uio_offset + n > np->n_size) {
				np->n_size = uio->uio_offset + n;
				np->n_flag |= NMODIFIED;
				vnode_pager_setsize(vp, np->n_size);
			}
			mtx_unlock(&np->n_mtx);
		}

		if (!bp) {
			/* getcacheblk failed: interrupted or mount gone. */
			error = nfs_sigintr(nmp, td);
			if (!error)
				error = EINTR;
			break;
		}

		/*
		 * Issue a READ if B_CACHE is not set.  In special-append
		 * mode, B_CACHE is based on the buffer prior to the write
		 * op and is typically set, avoiding the read.  If a read
		 * is required in special append mode, the server will
		 * probably send us a short-read since we extended the file
		 * on our end, resulting in b_resid == 0 and, thusly,
		 * B_CACHE getting set.
		 *
		 * We can also avoid issuing the read if the write covers
		 * the entire buffer.  We have to make sure the buffer state
		 * is reasonable in this case since we will not be initiating
		 * I/O.  See the comments in kern/vfs_bio.c's getblk() for
		 * more information.
		 *
		 * B_CACHE may also be set due to the buffer being cached
		 * normally.
		 */

		if (on == 0 && n == bcount) {
			bp->b_flags |= B_CACHE;
			bp->b_flags &= ~B_INVAL;
			bp->b_ioflags &= ~BIO_ERROR;
		}

		if ((bp->b_flags & B_CACHE) == 0) {
			bp->b_iocmd = BIO_READ;
			vfs_busy_pages(bp, 0);
			error = nfs_doio(vp, bp, cred, td);
			if (error) {
				brelse(bp);
				break;
			}
		}
		if (bp->b_wcred == NOCRED)
			bp->b_wcred = crhold(cred);
		mtx_lock(&np->n_mtx);
		np->n_flag |= NMODIFIED;
		mtx_unlock(&np->n_mtx);

		/*
		 * If dirtyend exceeds file size, chop it down.  This should
		 * not normally occur but there is an append race where it
		 * might occur XXX, so we log it.
		 *
		 * If the chopping creates a reverse-indexed or degenerate
		 * situation with dirtyoff/end, we 0 both of them.
		 */

		if (bp->b_dirtyend > bcount) {
			nfs_printf("NFS append race @%lx:%d\n",
			    (long)bp->b_blkno * DEV_BSIZE,
			    bp->b_dirtyend - bcount);
			bp->b_dirtyend = bcount;
		}

		if (bp->b_dirtyoff >= bp->b_dirtyend)
			bp->b_dirtyoff = bp->b_dirtyend = 0;

		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise force a write rpc of the old dirty area.
		 *
		 * While it is possible to merge discontiguous writes due to
		 * our having a B_CACHE buffer ( and thus valid read data
		 * for the hole), we don't because it could lead to
		 * significant cache coherency problems with multiple clients,
		 * especially if locking is implemented later on.
		 *
		 * as an optimization we could theoretically maintain
		 * a linked list of discontinuous areas, but we would still
		 * have to commit them separately so there isn't much
		 * advantage to it except perhaps a bit of asynchronization.
		 */

		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			if (bwrite(bp) == EINTR) {
				error = EINTR;
				break;
			}
			goto again;
		}

		error = uiomove((char *)bp->b_data + on, n, uio);

		/*
		 * Since this block is being modified, it must be written
		 * again and not just committed.  Since write clustering does
		 * not work for the stage 1 data write, only the stage 2
		 * commit rpc, we have to clear B_CLUSTEROK as well.
		 */
		bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);

		if (error) {
			bp->b_ioflags |= BIO_ERROR;
			brelse(bp);
			break;
		}

		/*
		 * Only update dirtyoff/dirtyend if not a degenerate
		 * condition.
		 */
		if (n) {
			if (bp->b_dirtyend > 0) {
				bp->b_dirtyoff = min(on, bp->b_dirtyoff);
				bp->b_dirtyend = max((on + n), bp->b_dirtyend);
			} else {
				bp->b_dirtyoff = on;
				bp->b_dirtyend = on + n;
			}
			vfs_bio_set_valid(bp, on, n);
		}

		/*
		 * If IO_SYNC do bwrite().
		 *
		 * IO_INVAL appears to be unused.  The idea appears to be
		 * to turn off caching in this case.  Very odd.  XXX
		 */
		if ((ioflag & IO_SYNC)) {
			if (ioflag & IO_INVAL)
				bp->b_flags |= B_NOCACHE;
			error = bwrite(bp);
			if (error)
				break;
		} else if ((n + on) == biosize) {
			/* Full block: push it asynchronously right away. */
			bp->b_flags |= B_ASYNC;
			(void) (nmp->nm_rpcops->nr_writebp)(bp, 0, NULL);
		} else {
			/* Partial block: delay the write. */
			bdwrite(bp);
		}
	} while (uio->uio_resid > 0 && n > 0);

	return (error);
}

/*
 * Get an nfs cache block.
 *
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy.
 * If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 *
 * The caller must carefully deal with the possible B_INVAL state of
 * the buffer.  nfs_doio() clears B_INVAL (and nfs_asyncio() clears it
 * indirectly), so synchronous reads can be issued without worrying about
 * the B_INVAL state.  We have to be a little more careful when dealing
 * with writes (see comments in nfs_write()) when extending a file past
 * its EOF.
 */
static struct buf *
nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size, struct thread *td)
{
	struct buf *bp;
	struct mount *mp;
	struct nfsmount *nmp;

	mp = vp->v_mount;
	nmp = VFSTONFS(mp);

	if (nmp->nm_flag & NFSMNT_INT) {
		sigset_t oldset;

		/*
		 * Interruptible mount: allow signals to break the first
		 * getblk(); on failure, retry with a timeout, checking for
		 * a pending signal each time around.
		 */
		nfs_set_sigmask(td, &oldset);
		bp = getblk(vp, bn, size, PCATCH, 0, 0);
		nfs_restore_sigmask(td, &oldset);
		while (bp == NULL) {
			if (nfs_sigintr(nmp, td))
				return (NULL);
			bp = getblk(vp, bn, size, 0, 2 * hz, 0);
		}
	} else {
		bp = getblk(vp, bn, size, 0, 0, 0);
	}

	/* Map the logical block number to a device block address. */
	if (vp->v_type == VREG)
		bp->b_blkno = bn * (vp->v_bufobj.bo_bsize / DEV_BSIZE);
	return (bp);
}

/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(struct vnode *vp, int flags, struct thread *td, int intrflg)
{
	struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;
	int old_lock = 0;

	ASSERT_VOP_LOCKED(vp, "nfs_vinvalbuf");

	/* Only interruptible mounts honor the intrflg request. */
	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}

	old_lock = nfs_upgrade_vnlock(vp);
	if (vp->v_iflag & VI_DOOMED) {
		/*
		 * Since vgonel() uses the generic vinvalbuf() to flush
		 * dirty buffers and it does not call this function, it
		 * is safe to just return OK when VI_DOOMED is set.
		 */
		nfs_downgrade_vnlock(vp, old_lock);
		return (0);
	}

	/*
	 * Now, flush as required.
	 */
	if ((flags & V_SAVE) && (vp->v_bufobj.bo_object != NULL)) {
		VM_OBJECT_WLOCK(vp->v_bufobj.bo_object);
		vm_object_page_clean(vp->v_bufobj.bo_object, 0, 0, OBJPC_SYNC);
		VM_OBJECT_WUNLOCK(vp->v_bufobj.bo_object);
		/*
		 * If the page clean was interrupted, fail the invalidation.
		 * Not doing so, we run the risk of losing dirty pages in the
		 * vinvalbuf() call below.
		 */
		if (intrflg && (error = nfs_sigintr(nmp, td)))
			goto out;
	}

	/* Retry the invalidation until it succeeds or a signal arrives. */
	error = vinvalbuf(vp, flags, slpflag, 0);
	while (error) {
		if (intrflg && (error = nfs_sigintr(nmp, td)))
			goto out;
		error = vinvalbuf(vp, flags, 0, slptimeo);
	}
	/*
	 * NMODIFIED stays set while direct async writes are still in
	 * flight; only clear it once they have all drained.
	 */
	mtx_lock(&np->n_mtx);
	if (np->n_directio_asyncwr == 0)
		np->n_flag &= ~NMODIFIED;
	mtx_unlock(&np->n_mtx);
out:
	nfs_downgrade_vnlock(vp, old_lock);
	return error;
}

/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 *
 * Note: nfs_asyncio() does not clear (BIO_ERROR|B_INVAL) but when the bp
 * is eventually dequeued by the async daemon, nfs_doio() *will*.
 */
int
nfs_asyncio(struct nfsmount *nmp, struct buf *bp, struct ucred *cred, struct thread *td)
{
	int iod;
	int gotiod;
	int slpflag = 0;
	int slptimeo = 0;
	int error, error2;

	/*
	 * Commits are usually short and sweet so lets save some cpu and
	 * leave the async daemons for more important rpc's (such as reads
	 * and writes).
	 *
	 * Readdirplus RPCs do vget()s to acquire the vnodes for entries
	 * in the directory in order to update attributes.  This can deadlock
	 * with another thread that is waiting for async I/O to be done by
	 * an nfsiod thread while holding a lock on one of these vnodes.
	 * To avoid this deadlock, don't allow the async nfsiod threads to
	 * perform Readdirplus RPCs.
	 */
	mtx_lock(&nfs_iod_mtx);
	if ((bp->b_iocmd == BIO_WRITE && (bp->b_flags & B_NEEDCOMMIT) &&
	     (nmp->nm_bufqiods > nfs_numasync / 2)) ||
	    (bp->b_vp->v_type == VDIR && (nmp->nm_flag & NFSMNT_RDIRPLUS))) {
		mtx_unlock(&nfs_iod_mtx);
		return(EIO);
	}
again:
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	gotiod = FALSE;

	/*
	 * Find a free iod to process this request.
	 */
	for (iod = 0; iod < nfs_numasync; iod++)
		if (nfs_iodwant[iod] == NFSIOD_AVAILABLE) {
			gotiod = TRUE;
			break;
		}

	/*
	 * Try to create one if none are free.
	 */
	if (!gotiod)
		nfs_nfsiodnew();
	else {
		/*
		 * Found one, so wake it up and tell it which
		 * mount to process.
		 */
		NFS_DPF(ASYNCIO, ("nfs_asyncio: waking iod %d for mount %p\n",
		    iod, nmp));
		nfs_iodwant[iod] = NFSIOD_NOT_AVAILABLE;
		nfs_iodmount[iod] = nmp;
		nmp->nm_bufqiods++;
		wakeup(&nfs_iodwant[iod]);
	}

	/*
	 * If none are free, we may already have an iod working on this mount
	 * point.  If so, it will process our request.
	 */
	if (!gotiod) {
		if (nmp->nm_bufqiods > 0) {
			NFS_DPF(ASYNCIO,
		    ("nfs_asyncio: %d iods are already processing mount %p\n",
			     nmp->nm_bufqiods, nmp));
			gotiod = TRUE;
		}
	}

	/*
	 * If we have an iod which can process the request, then queue
	 * the buffer.
	 */
	if (gotiod) {
		/*
		 * Ensure that the queue never grows too large.  We still want
		 * to asynchronize so we block rather then return EIO.
		 */
		while (nmp->nm_bufqlen >= 2 * nfs_numasync) {
			NFS_DPF(ASYNCIO,
	("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
			nmp->nm_bufqwant = TRUE;
			error = nfs_msleep(td, &nmp->nm_bufq, &nfs_iod_mtx,
			    slpflag | PRIBIO,
			    "nfsaio", slptimeo);
			if (error) {
				error2 = nfs_sigintr(nmp, td);
				if (error2) {
					mtx_unlock(&nfs_iod_mtx);
					return (error2);
				}
				/* Signal seen once: fall back to timeouts. */
				if (slpflag == PCATCH) {
					slpflag = 0;
					slptimeo = 2 * hz;
				}
			}
			/*
			 * We might have lost our iod while sleeping,
			 * so check and loop if necessary.
			 */
			goto again;
		}

		/* We might have lost our nfsiod */
		if (nmp->nm_bufqiods == 0) {
			NFS_DPF(ASYNCIO,
("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
			goto again;
		}

		/* Attach the caller's credential to the buffer if needed. */
		if (bp->b_iocmd == BIO_READ) {
			if (bp->b_rcred == NOCRED && cred != NOCRED)
				bp->b_rcred = crhold(cred);
		} else {
			if (bp->b_wcred == NOCRED && cred != NOCRED)
				bp->b_wcred = crhold(cred);
		}

		if (bp->b_flags & B_REMFREE)
			bremfreef(bp);
		BUF_KERNPROC(bp);
		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
		nmp->nm_bufqlen++;
		/* Track outstanding direct async writes on the nfsnode. */
		if ((bp->b_flags & B_DIRECT) && bp->b_iocmd == BIO_WRITE) {
			mtx_lock(&(VTONFS(bp->b_vp))->n_mtx);
			VTONFS(bp->b_vp)->n_flag |= NMODIFIED;
			VTONFS(bp->b_vp)->n_directio_asyncwr++;
			mtx_unlock(&(VTONFS(bp->b_vp))->n_mtx);
		}
		mtx_unlock(&nfs_iod_mtx);
		return (0);
	}

	mtx_unlock(&nfs_iod_mtx);

	/*
	 * All the iods are busy on other mounts, so return EIO to
	 * force the caller to process the i/o synchronously.
	 */
	NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
	return (EIO);
}

/*
 * Complete a direct-I/O write handed off by nfs_directio_write(): issue
 * the FILESYNC write RPC, free the chunk resources, and update the
 * nfsnode's outstanding direct-write count.  Runs in an nfsiod thread.
 */
void
nfs_doio_directwrite(struct buf *bp)
{
	int iomode, must_commit;
	struct uio *uiop = (struct uio *)bp->b_caller1;
	char *iov_base = uiop->uio_iov->iov_base;
	struct nfsmount *nmp = VFSTONFS(bp->b_vp->v_mount);

	iomode = NFSV3WRITE_FILESYNC;
	uiop->uio_td = NULL; /* NULL since we're in nfsiod */
	(nmp->nm_rpcops->nr_writerpc)(bp->b_vp, uiop, bp->b_wcred, &iomode, &must_commit);
	KASSERT((must_commit == 0), ("nfs_doio_directwrite: Did not commit write"));
	/* Release the chunk allocated in nfs_directio_write(). */
	free(iov_base, M_NFSDIRECTIO);
	free(uiop->uio_iov, M_NFSDIRECTIO);
	free(uiop, M_NFSDIRECTIO);
	if ((bp->b_flags & B_DIRECT) && bp->b_iocmd == BIO_WRITE) {
		struct nfsnode *np = VTONFS(bp->b_vp);
		mtx_lock(&np->n_mtx);
		np->n_directio_asyncwr--;
		if (np->n_directio_asyncwr == 0) {
			/* Last direct write drained: wake any fsync waiter. */
			VTONFS(bp->b_vp)->n_flag &= ~NMODIFIED;
			if ((np->n_flag & NFSYNCWAIT)) {
				np->n_flag &= ~NFSYNCWAIT;
				wakeup((caddr_t)&np->n_directio_asyncwr);
			}
		}
		mtx_unlock(&np->n_mtx);
	}
	bp->b_vp = NULL;
	relpbuf(bp, &nfs_pbuf_freecnt);
}

/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
15171541Srgrimes */ 15181541Srgrimesint 1519134898Sphknfs_doio(struct vnode *vp, struct buf *bp, struct ucred *cr, struct thread *td) 15201541Srgrimes{ 152144679Sjulian struct uio *uiop; 15221541Srgrimes struct nfsnode *np; 15231541Srgrimes struct nfsmount *nmp; 152446349Salc int error = 0, iomode, must_commit = 0; 15251541Srgrimes struct uio uio; 15261541Srgrimes struct iovec io; 152783651Speter struct proc *p = td ? td->td_proc : NULL; 1528158739Smohans uint8_t iocmd; 1529158739Smohans 15301541Srgrimes np = VTONFS(vp); 15311541Srgrimes nmp = VFSTONFS(vp->v_mount); 15321541Srgrimes uiop = &uio; 15331541Srgrimes uiop->uio_iov = &io; 15341541Srgrimes uiop->uio_iovcnt = 1; 15351541Srgrimes uiop->uio_segflg = UIO_SYSSPACE; 153683366Sjulian uiop->uio_td = td; 15371541Srgrimes 153846349Salc /* 153958934Sphk * clear BIO_ERROR and B_INVAL state prior to initiating the I/O. We 154046349Salc * do this here so we do not have to do it in all the code that 154146349Salc * calls us. 154246349Salc */ 154358934Sphk bp->b_flags &= ~B_INVAL; 154458934Sphk bp->b_ioflags &= ~BIO_ERROR; 154546349Salc 154644679Sjulian KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp)); 1547158739Smohans iocmd = bp->b_iocmd; 1548158739Smohans if (iocmd == BIO_READ) { 15493664Sphk io.iov_len = uiop->uio_resid = bp->b_bcount; 15503664Sphk io.iov_base = bp->b_data; 15511541Srgrimes uiop->uio_rw = UIO_READ; 155287834Sdillon 15531541Srgrimes switch (vp->v_type) { 15541541Srgrimes case VREG: 15559336Sdfr uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE; 15561541Srgrimes nfsstats.read_bios++; 1557122953Salfred error = (nmp->nm_rpcops->nr_readrpc)(vp, uiop, cr); 155887834Sdillon 15591541Srgrimes if (!error) { 15601541Srgrimes if (uiop->uio_resid) { 15611541Srgrimes /* 156246349Salc * If we had a short read with no error, we must have 156346349Salc * hit a file hole. We should zero-fill the remainder. 156446349Salc * This can also occur if the server hits the file EOF. 
156546349Salc * 156683651Speter * Holes used to be able to occur due to pending 156746349Salc * writes, but that is not possible any longer. 15681541Srgrimes */ 156946349Salc int nread = bp->b_bcount - uiop->uio_resid; 157087834Sdillon int left = uiop->uio_resid; 157146349Salc 157246349Salc if (left > 0) 157346349Salc bzero((char *)bp->b_data + nread, left); 157446349Salc uiop->uio_resid = 0; 157546349Salc } 15761541Srgrimes } 1577115041Srwatson /* ASSERT_VOP_LOCKED(vp, "nfs_doio"); */ 1578158739Smohans if (p && (vp->v_vflag & VV_TEXT)) { 1579158739Smohans mtx_lock(&np->n_mtx); 1580158739Smohans if (NFS_TIMESPEC_COMPARE(&np->n_mtime, &np->n_vattr.va_mtime)) { 1581158739Smohans mtx_unlock(&np->n_mtx); 1582158739Smohans PROC_LOCK(p); 1583158739Smohans killproc(p, "text file modification"); 1584158739Smohans PROC_UNLOCK(p); 1585158739Smohans } else 1586158739Smohans mtx_unlock(&np->n_mtx); 15871541Srgrimes } 15881541Srgrimes break; 15891541Srgrimes case VLNK: 15909336Sdfr uiop->uio_offset = (off_t)0; 15911541Srgrimes nfsstats.readlink_bios++; 1592122953Salfred error = (nmp->nm_rpcops->nr_readlinkrpc)(vp, uiop, cr); 15931541Srgrimes break; 15941541Srgrimes case VDIR: 15951541Srgrimes nfsstats.readdir_bios++; 15969336Sdfr uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ; 1597192578Srwatson if ((nmp->nm_flag & NFSMNT_RDIRPLUS) != 0) { 1598192578Srwatson error = nfs_readdirplusrpc(vp, uiop, cr); 1599192578Srwatson if (error == NFSERR_NOTSUPP) 1600192578Srwatson nmp->nm_flag &= ~NFSMNT_RDIRPLUS; 16019336Sdfr } 1602192578Srwatson if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0) 1603192578Srwatson error = nfs_readdirrpc(vp, uiop, cr); 160446349Salc /* 160546349Salc * end-of-directory sets B_INVAL but does not generate an 160646349Salc * error. 
160746349Salc */ 160839782Smckusick if (error == 0 && uiop->uio_resid == bp->b_bcount) 160939782Smckusick bp->b_flags |= B_INVAL; 16101541Srgrimes break; 16113305Sphk default: 1612158739Smohans nfs_printf("nfs_doio: type %x unexpected\n", vp->v_type); 16133305Sphk break; 16141541Srgrimes }; 16151541Srgrimes if (error) { 161658934Sphk bp->b_ioflags |= BIO_ERROR; 16171541Srgrimes bp->b_error = error; 16181541Srgrimes } 16191541Srgrimes } else { 162083651Speter /* 162151344Sdillon * If we only need to commit, try to commit 162251344Sdillon */ 162351344Sdillon if (bp->b_flags & B_NEEDCOMMIT) { 162451344Sdillon int retv; 162551344Sdillon off_t off; 162651344Sdillon 162751344Sdillon off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; 1628122953Salfred retv = (nmp->nm_rpcops->nr_commit)( 1629136927Sphk vp, off, bp->b_dirtyend-bp->b_dirtyoff, 163083366Sjulian bp->b_wcred, td); 163151344Sdillon if (retv == 0) { 163251344Sdillon bp->b_dirtyoff = bp->b_dirtyend = 0; 163354480Sdillon bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); 163451344Sdillon bp->b_resid = 0; 163559249Sphk bufdone(bp); 163651344Sdillon return (0); 163751344Sdillon } 163851344Sdillon if (retv == NFSERR_STALEWRITEVERF) { 1639136927Sphk nfs_clearcommit(vp->v_mount); 164051344Sdillon } 164151344Sdillon } 164251344Sdillon 164351344Sdillon /* 164451344Sdillon * Setup for actual write 164551344Sdillon */ 1646158739Smohans mtx_lock(&np->n_mtx); 164741791Sdt if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size) 164841791Sdt bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE; 1649158739Smohans mtx_unlock(&np->n_mtx); 16508692Sdg 16518692Sdg if (bp->b_dirtyend > bp->b_dirtyoff) { 16528692Sdg io.iov_len = uiop->uio_resid = bp->b_dirtyend 16539336Sdfr - bp->b_dirtyoff; 165441791Sdt uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE 16559336Sdfr + bp->b_dirtyoff; 16568692Sdg io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; 16578692Sdg uiop->uio_rw = UIO_WRITE; 16588692Sdg 
nfsstats.write_bios++; 165944679Sjulian 166025785Sdfr if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC) 16619336Sdfr iomode = NFSV3WRITE_UNSTABLE; 16628692Sdg else 16639336Sdfr iomode = NFSV3WRITE_FILESYNC; 166444679Sjulian 1665122953Salfred error = (nmp->nm_rpcops->nr_writerpc)(vp, uiop, cr, &iomode, &must_commit); 166651475Sdillon 166751475Sdillon /* 166851475Sdillon * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try 166951475Sdillon * to cluster the buffers needing commit. This will allow 167051475Sdillon * the system to submit a single commit rpc for the whole 167183651Speter * cluster. We can do this even if the buffer is not 100% 167254480Sdillon * dirty (relative to the NFS blocksize), so we optimize the 167354480Sdillon * append-to-file-case. 167454480Sdillon * 167554480Sdillon * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be 167654480Sdillon * cleared because write clustering only works for commit 167754480Sdillon * rpc's, not for the data portion of the write). 167851475Sdillon */ 167951475Sdillon 168025003Sdfr if (!error && iomode == NFSV3WRITE_UNSTABLE) { 168125003Sdfr bp->b_flags |= B_NEEDCOMMIT; 168225003Sdfr if (bp->b_dirtyoff == 0 168346349Salc && bp->b_dirtyend == bp->b_bcount) 168425003Sdfr bp->b_flags |= B_CLUSTEROK; 168544679Sjulian } else { 168654480Sdillon bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); 168744679Sjulian } 16888692Sdg 16899336Sdfr /* 16909336Sdfr * For an interrupted write, the buffer is still valid 16919336Sdfr * and the write hasn't been pushed to the server yet, 169258934Sphk * so we can't set BIO_ERROR and report the interruption 16939336Sdfr * by setting B_EINTR. For the B_ASYNC case, B_EINTR 16949336Sdfr * is not relevant, so the rpc attempt is essentially 16959336Sdfr * a noop. 
For the case of a V3 write rpc not being 16969336Sdfr * committed to stable storage, the block is still 16979336Sdfr * dirty and requires either a commit rpc or another 16989336Sdfr * write rpc with iomode == NFSV3WRITE_FILESYNC before 16999336Sdfr * the block is reused. This is indicated by setting 17009336Sdfr * the B_DELWRI and B_NEEDCOMMIT flags. 170142957Sdillon * 170242957Sdillon * If the buffer is marked B_PAGING, it does not reside on 170344679Sjulian * the vp's paging queues so we cannot call bdirty(). The 170444679Sjulian * bp in this case is not an NFS cache block so we should 170544679Sjulian * be safe. XXX 1706171189Sjhb * 1707171189Sjhb * The logic below breaks up errors into recoverable and 1708171189Sjhb * unrecoverable. For the former, we clear B_INVAL|B_NOCACHE 1709171189Sjhb * and keep the buffer around for potential write retries. 1710171189Sjhb * For the latter (eg ESTALE), we toss the buffer away (B_INVAL) 1711171189Sjhb * and save the error in the nfsnode. This is less than ideal 1712171189Sjhb * but necessary. Keeping such buffers around could potentially 1713171189Sjhb * cause buffer exhaustion eventually (they can never be written 1714171189Sjhb * out, so will get constantly be re-dirtied). It also causes 1715171189Sjhb * all sorts of vfs panics. For non-recoverable write errors, 1716171189Sjhb * also invalidate the attrcache, so we'll be forced to go over 1717171189Sjhb * the wire for this object, returning an error to user on next 1718171189Sjhb * call (most of the time). 
17199336Sdfr */ 1720152656Sps if (error == EINTR || error == EIO || error == ETIMEDOUT 17219336Sdfr || (!error && (bp->b_flags & B_NEEDCOMMIT))) { 172234266Sjulian int s; 172334266Sjulian 172444679Sjulian s = splbio(); 17258692Sdg bp->b_flags &= ~(B_INVAL|B_NOCACHE); 172642957Sdillon if ((bp->b_flags & B_PAGING) == 0) { 172744679Sjulian bdirty(bp); 172844679Sjulian bp->b_flags &= ~B_DONE; 172942957Sdillon } 173047749Speter if (error && (bp->b_flags & B_ASYNC) == 0) 173132755Sdyson bp->b_flags |= B_EINTR; 173244679Sjulian splx(s); 17338692Sdg } else { 173444679Sjulian if (error) { 173558934Sphk bp->b_ioflags |= BIO_ERROR; 1736171189Sjhb bp->b_flags |= B_INVAL; 173744679Sjulian bp->b_error = np->n_error = error; 1738158739Smohans mtx_lock(&np->n_mtx); 173944679Sjulian np->n_flag |= NWRITEERR; 1740171189Sjhb np->n_attrstamp = 0; 1741190380Srwatson KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp); 1742158739Smohans mtx_unlock(&np->n_mtx); 174344679Sjulian } 174444679Sjulian bp->b_dirtyoff = bp->b_dirtyend = 0; 17458692Sdg } 17461541Srgrimes } else { 17478692Sdg bp->b_resid = 0; 174859249Sphk bufdone(bp); 17498692Sdg return (0); 17501541Srgrimes } 17511541Srgrimes } 17521541Srgrimes bp->b_resid = uiop->uio_resid; 17539336Sdfr if (must_commit) 175444679Sjulian nfs_clearcommit(vp->v_mount); 175559249Sphk bufdone(bp); 17561541Srgrimes return (error); 17571541Srgrimes} 175887834Sdillon 175987834Sdillon/* 176087834Sdillon * Used to aid in handling ftruncate() operations on the NFS client side. 176187834Sdillon * Truncation creates a number of special problems for NFS. We have to 176287834Sdillon * throw away VM pages and buffer cache buffers that are beyond EOF, and 176387834Sdillon * we have to properly handle VM pages or (potentially dirty) buffers 176487834Sdillon * that straddle the truncation point. 
176587834Sdillon */ 176687834Sdillon 176787834Sdillonint 176887834Sdillonnfs_meta_setsize(struct vnode *vp, struct ucred *cred, struct thread *td, u_quad_t nsize) 176987834Sdillon{ 177087834Sdillon struct nfsnode *np = VTONFS(vp); 1771158739Smohans u_quad_t tsize; 1772230605Srmacklem int biosize = vp->v_bufobj.bo_bsize; 177387834Sdillon int error = 0; 177487834Sdillon 1775158739Smohans mtx_lock(&np->n_mtx); 1776158739Smohans tsize = np->n_size; 177787834Sdillon np->n_size = nsize; 1778158739Smohans mtx_unlock(&np->n_mtx); 177987834Sdillon 1780158739Smohans if (nsize < tsize) { 178187834Sdillon struct buf *bp; 178287834Sdillon daddr_t lbn; 178387834Sdillon int bufsize; 178487834Sdillon 178587834Sdillon /* 178687834Sdillon * vtruncbuf() doesn't get the buffer overlapping the 178787834Sdillon * truncation point. We may have a B_DELWRI and/or B_CACHE 178887834Sdillon * buffer that now needs to be truncated. 178987834Sdillon */ 1790234605Strasz error = vtruncbuf(vp, cred, nsize, biosize); 179187834Sdillon lbn = nsize / biosize; 1792248500Semaste bufsize = nsize - (lbn * biosize); 179387834Sdillon bp = nfs_getcacheblk(vp, lbn, bufsize, td); 1794138496Sps if (!bp) 1795138496Sps return EINTR; 179687834Sdillon if (bp->b_dirtyoff > bp->b_bcount) 179787834Sdillon bp->b_dirtyoff = bp->b_bcount; 179887834Sdillon if (bp->b_dirtyend > bp->b_bcount) 179987834Sdillon bp->b_dirtyend = bp->b_bcount; 180087834Sdillon bp->b_flags |= B_RELBUF; /* don't leave garbage around */ 180187834Sdillon brelse(bp); 180287834Sdillon } else { 180387834Sdillon vnode_pager_setsize(vp, nsize); 180487834Sdillon } 180587834Sdillon return(error); 180687834Sdillon} 180787834Sdillon 1808