nfs_bio.c revision 158906
1204076Spjd/*- 2204076Spjd * Copyright (c) 1989, 1993 3219351Spjd * The Regents of the University of California. All rights reserved. 4204076Spjd * 5204076Spjd * This code is derived from software contributed to Berkeley by 6204076Spjd * Rick Macklem at The University of Guelph. 7204076Spjd * 8204076Spjd * Redistribution and use in source and binary forms, with or without 9204076Spjd * modification, are permitted provided that the following conditions 10204076Spjd * are met: 11204076Spjd * 1. Redistributions of source code must retain the above copyright 12204076Spjd * notice, this list of conditions and the following disclaimer. 13204076Spjd * 2. Redistributions in binary form must reproduce the above copyright 14204076Spjd * notice, this list of conditions and the following disclaimer in the 15204076Spjd * documentation and/or other materials provided with the distribution. 16204076Spjd * 4. Neither the name of the University nor the names of its contributors 17204076Spjd * may be used to endorse or promote products derived from this software 18204076Spjd * without specific prior written permission. 19204076Spjd * 20204076Spjd * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21204076Spjd * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22204076Spjd * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23204076Spjd * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24204076Spjd * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25204076Spjd * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26204076Spjd * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27204076Spjd * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28204076Spjd * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29204076Spjd * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30204076Spjd * SUCH DAMAGE. 31204076Spjd * 32204076Spjd * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95 33204076Spjd */ 34204076Spjd 35204076Spjd#include <sys/cdefs.h> 36204076Spjd__FBSDID("$FreeBSD: head/sys/nfsclient/nfs_bio.c 158906 2006-05-25 01:00:35Z ups $"); 37204076Spjd 38204076Spjd#include <sys/param.h> 39204076Spjd#include <sys/systm.h> 40204076Spjd#include <sys/bio.h> 41204076Spjd#include <sys/buf.h> 42204076Spjd#include <sys/kernel.h> 43204076Spjd#include <sys/mount.h> 44204076Spjd#include <sys/proc.h> 45204076Spjd#include <sys/resourcevar.h> 46204076Spjd#include <sys/signalvar.h> 47204076Spjd#include <sys/vmmeter.h> 48211982Spjd#include <sys/vnode.h> 49204076Spjd 50204076Spjd#include <vm/vm.h> 51204076Spjd#include <vm/vm_extern.h> 52204076Spjd#include <vm/vm_page.h> 53204076Spjd#include <vm/vm_object.h> 54204076Spjd#include <vm/vm_pager.h> 55204076Spjd#include <vm/vnode_pager.h> 56204076Spjd 57204076Spjd#include <rpc/rpcclnt.h> 58204076Spjd 59204076Spjd#include <nfs/rpcv2.h> 60212038Spjd#include <nfs/nfsproto.h> 61204076Spjd#include <nfsclient/nfs.h> 62204076Spjd#include <nfsclient/nfsmount.h> 63204076Spjd#include <nfsclient/nfsnode.h> 64211886Spjd 65204076Spjd#include <nfs4client/nfs4.h> 66204076Spjd 67204076Spjdstatic struct buf *nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size, 68204076Spjd struct thread *td); 69204076Spjdstatic int nfs_directio_write(struct vnode *vp, struct uio 
*uiop, 70204076Spjd struct ucred *cred, int ioflag); 71210886Spjd 72210886Spjdextern int nfs_directio_enable; 73210886Spjdextern int nfs_directio_allow_mmap; 74204076Spjd 75204076Spjd/* 76204076Spjd * Vnode op for VM getpages. 77204076Spjd */ 78204076Spjdint 79204076Spjdnfs_getpages(struct vop_getpages_args *ap) 80204076Spjd{ 81204076Spjd int i, error, nextoff, size, toff, count, npages; 82204076Spjd struct uio uio; 83204076Spjd struct iovec iov; 84204076Spjd vm_offset_t kva; 85204076Spjd struct buf *bp; 86204076Spjd struct vnode *vp; 87204076Spjd struct thread *td; 88204076Spjd struct ucred *cred; 89219818Spjd struct nfsmount *nmp; 90204076Spjd vm_object_t object; 91204076Spjd vm_page_t *pages; 92204076Spjd struct nfsnode *np; 93204076Spjd 94204076Spjd vp = ap->a_vp; 95204076Spjd np = VTONFS(vp); 96204076Spjd td = curthread; /* XXX */ 97204076Spjd cred = curthread->td_ucred; /* XXX */ 98204076Spjd nmp = VFSTONFS(vp->v_mount); 99204076Spjd pages = ap->a_m; 100204076Spjd count = ap->a_count; 101204076Spjd 102204076Spjd if ((object = vp->v_object) == NULL) { 103204076Spjd nfs_printf("nfs_getpages: called with non-merged cache vnode??\n"); 104204076Spjd return VM_PAGER_ERROR; 105204076Spjd } 106204076Spjd 107204076Spjd if (nfs_directio_enable && !nfs_directio_allow_mmap) { 108204076Spjd mtx_lock(&np->n_mtx); 109204076Spjd if ((np->n_flag & NNONCACHE) && (vp->v_type == VREG)) { 110204076Spjd mtx_unlock(&np->n_mtx); 111204076Spjd nfs_printf("nfs_getpages: called on non-cacheable vnode??\n"); 112204076Spjd return VM_PAGER_ERROR; 113204076Spjd } else 114204076Spjd mtx_unlock(&np->n_mtx); 115204076Spjd } 116204076Spjd 117204076Spjd mtx_lock(&nmp->nm_mtx); 118204076Spjd if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && 119204076Spjd (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) { 120204076Spjd mtx_unlock(&nmp->nm_mtx); 121204076Spjd /* We'll never get here for v4, because we always have fsinfo */ 122204076Spjd (void)nfs_fsinfo(nmp, vp, cred, td); 123204076Spjd } else 124204076Spjd 
mtx_unlock(&nmp->nm_mtx); 125204076Spjd 126204076Spjd npages = btoc(count); 127204076Spjd 128204076Spjd /* 129204076Spjd * If the requested page is partially valid, just return it and 130204076Spjd * allow the pager to zero-out the blanks. Partially valid pages 131204076Spjd * can only occur at the file EOF. 132204076Spjd */ 133204076Spjd 134204076Spjd { 135204076Spjd vm_page_t m = pages[ap->a_reqpage]; 136204076Spjd 137204076Spjd VM_OBJECT_LOCK(object); 138204076Spjd vm_page_lock_queues(); 139204076Spjd if (m->valid != 0) { 140204076Spjd /* handled by vm_fault now */ 141204076Spjd /* vm_page_zero_invalid(m, TRUE); */ 142204076Spjd for (i = 0; i < npages; ++i) { 143204076Spjd if (i != ap->a_reqpage) 144204076Spjd vm_page_free(pages[i]); 145204076Spjd } 146204076Spjd vm_page_unlock_queues(); 147204076Spjd VM_OBJECT_UNLOCK(object); 148204076Spjd return(0); 149204076Spjd } 150204076Spjd vm_page_unlock_queues(); 151204076Spjd VM_OBJECT_UNLOCK(object); 152204076Spjd } 153204076Spjd 154204076Spjd /* 155204076Spjd * We use only the kva address for the buffer, but this is extremely 156204076Spjd * convienient and fast. 
157204076Spjd */ 158204076Spjd bp = getpbuf(&nfs_pbuf_freecnt); 159204076Spjd 160204076Spjd kva = (vm_offset_t) bp->b_data; 161204076Spjd pmap_qenter(kva, pages, npages); 162204076Spjd cnt.v_vnodein++; 163204076Spjd cnt.v_vnodepgsin += npages; 164204076Spjd 165204076Spjd iov.iov_base = (caddr_t) kva; 166204076Spjd iov.iov_len = count; 167204076Spjd uio.uio_iov = &iov; 168204076Spjd uio.uio_iovcnt = 1; 169204076Spjd uio.uio_offset = IDX_TO_OFF(pages[0]->pindex); 170204076Spjd uio.uio_resid = count; 171204076Spjd uio.uio_segflg = UIO_SYSSPACE; 172204076Spjd uio.uio_rw = UIO_READ; 173204076Spjd uio.uio_td = td; 174204076Spjd 175204076Spjd error = (nmp->nm_rpcops->nr_readrpc)(vp, &uio, cred); 176204076Spjd pmap_qremove(kva, npages); 177204076Spjd 178214692Spjd relpbuf(bp, &nfs_pbuf_freecnt); 179214692Spjd 180214692Spjd if (error && (uio.uio_resid == count)) { 181204076Spjd nfs_printf("nfs_getpages: error %d\n", error); 182214692Spjd VM_OBJECT_LOCK(object); 183214692Spjd vm_page_lock_queues(); 184214692Spjd for (i = 0; i < npages; ++i) { 185214692Spjd if (i != ap->a_reqpage) 186219864Spjd vm_page_free(pages[i]); 187214692Spjd } 188204076Spjd vm_page_unlock_queues(); 189214692Spjd VM_OBJECT_UNLOCK(object); 190214692Spjd return VM_PAGER_ERROR; 191214692Spjd } 192214692Spjd 193204076Spjd /* 194204076Spjd * Calculate the number of bytes read and validate only that number 195204076Spjd * of bytes. Note that due to pending writes, size may be 0. This 196204076Spjd * does not mean that the remaining data is invalid! 
197204076Spjd */ 198204076Spjd 199204076Spjd size = count - uio.uio_resid; 200204076Spjd VM_OBJECT_LOCK(object); 201204076Spjd vm_page_lock_queues(); 202204076Spjd for (i = 0, toff = 0; i < npages; i++, toff = nextoff) { 203204076Spjd vm_page_t m; 204204076Spjd nextoff = toff + PAGE_SIZE; 205209183Spjd m = pages[i]; 206209183Spjd 207209183Spjd if (nextoff <= size) { 208209183Spjd /* 209204076Spjd * Read operation filled an entire page 210204076Spjd */ 211204076Spjd m->valid = VM_PAGE_BITS_ALL; 212204076Spjd vm_page_undirty(m); 213204076Spjd } else if (size > toff) { 214204076Spjd /* 215204076Spjd * Read operation filled a partial page. 216204076Spjd */ 217204076Spjd m->valid = 0; 218204076Spjd vm_page_set_validclean(m, 0, size - toff); 219204076Spjd /* handled by vm_fault now */ 220204076Spjd /* vm_page_zero_invalid(m, TRUE); */ 221204076Spjd } else { 222220898Spjd /* 223204076Spjd * Read operation was short. If no error occured 224204076Spjd * we may have hit a zero-fill section. We simply 225204076Spjd * leave valid set to 0. 226204076Spjd */ 227204076Spjd ; 228204076Spjd } 229204076Spjd if (i != ap->a_reqpage) { 230204076Spjd /* 231204076Spjd * Whether or not to leave the page activated is up in 232211982Spjd * the air, but we should put the page on a page queue 233204076Spjd * somewhere (it already is in the object). Result: 234204076Spjd * It appears that emperical results show that 235204076Spjd * deactivating pages is best. 236204076Spjd */ 237204076Spjd 238204076Spjd /* 239204076Spjd * Just in case someone was asking for this page we 240204076Spjd * now tell them that it is ok to use. 
241204076Spjd */ 242204076Spjd if (!error) { 243204076Spjd if (m->flags & PG_WANTED) 244213533Spjd vm_page_activate(m); 245204076Spjd else 246204076Spjd vm_page_deactivate(m); 247204076Spjd vm_page_wakeup(m); 248204076Spjd } else { 249213531Spjd vm_page_free(m); 250213531Spjd } 251204076Spjd } 252204076Spjd } 253204076Spjd vm_page_unlock_queues(); 254204076Spjd VM_OBJECT_UNLOCK(object); 255204076Spjd return 0; 256204076Spjd} 257204076Spjd 258204076Spjd/* 259204076Spjd * Vnode op for VM putpages. 260212899Spjd */ 261204076Spjdint 262204076Spjdnfs_putpages(struct vop_putpages_args *ap) 263204076Spjd{ 264204076Spjd struct uio uio; 265218138Spjd struct iovec iov; 266204076Spjd vm_offset_t kva; 267204076Spjd struct buf *bp; 268204076Spjd int iomode, must_commit, i, error, npages, count; 269204076Spjd off_t offset; 270204076Spjd int *rtvals; 271204076Spjd struct vnode *vp; 272204076Spjd struct thread *td; 273212899Spjd struct ucred *cred; 274204076Spjd struct nfsmount *nmp; 275204076Spjd struct nfsnode *np; 276204076Spjd vm_page_t *pages; 277204076Spjd 278204076Spjd vp = ap->a_vp; 279204076Spjd np = VTONFS(vp); 280204076Spjd td = curthread; /* XXX */ 281204076Spjd cred = curthread->td_ucred; /* XXX */ 282204076Spjd nmp = VFSTONFS(vp->v_mount); 283204076Spjd pages = ap->a_m; 284204076Spjd count = ap->a_count; 285204076Spjd rtvals = ap->a_rtvals; 286204076Spjd npages = btoc(count); 287204076Spjd offset = IDX_TO_OFF(pages[0]->pindex); 288204076Spjd 289204076Spjd mtx_lock(&nmp->nm_mtx); 290204076Spjd if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && 291204076Spjd (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) { 292218138Spjd mtx_unlock(&nmp->nm_mtx); 293218138Spjd (void)nfs_fsinfo(nmp, vp, cred, td); 294204076Spjd } else 295204076Spjd mtx_unlock(&nmp->nm_mtx); 296204076Spjd 297204076Spjd mtx_lock(&np->n_mtx); 298204076Spjd if (nfs_directio_enable && !nfs_directio_allow_mmap && 299204076Spjd (np->n_flag & NNONCACHE) && (vp->v_type == VREG)) { 300204076Spjd mtx_unlock(&np->n_mtx); 
301204076Spjd nfs_printf("nfs_putpages: called on noncache-able vnode??\n"); 302204076Spjd mtx_lock(&np->n_mtx); 303210881Spjd } 304210881Spjd 305210881Spjd for (i = 0; i < npages; i++) 306210881Spjd rtvals[i] = VM_PAGER_AGAIN; 307210881Spjd 308210881Spjd /* 309210881Spjd * When putting pages, do not extend file past EOF. 310204076Spjd */ 311204076Spjd if (offset + count > np->n_size) { 312204076Spjd count = np->n_size - offset; 313204076Spjd if (count < 0) 314204076Spjd count = 0; 315204076Spjd } 316204076Spjd mtx_unlock(&np->n_mtx); 317204076Spjd 318204076Spjd /* 319204076Spjd * We use only the kva address for the buffer, but this is extremely 320204076Spjd * convienient and fast. 321204076Spjd */ 322204076Spjd bp = getpbuf(&nfs_pbuf_freecnt); 323204076Spjd 324204076Spjd kva = (vm_offset_t) bp->b_data; 325204076Spjd pmap_qenter(kva, pages, npages); 326204076Spjd cnt.v_vnodeout++; 327204076Spjd cnt.v_vnodepgsout += count; 328204076Spjd 329204076Spjd iov.iov_base = (caddr_t) kva; 330204076Spjd iov.iov_len = count; 331204076Spjd uio.uio_iov = &iov; 332204076Spjd uio.uio_iovcnt = 1; 333204076Spjd uio.uio_offset = offset; 334204076Spjd uio.uio_resid = count; 335204076Spjd uio.uio_segflg = UIO_SYSSPACE; 336204076Spjd uio.uio_rw = UIO_WRITE; 337204076Spjd uio.uio_td = td; 338204076Spjd 339204076Spjd if ((ap->a_sync & VM_PAGER_PUT_SYNC) == 0) 340204076Spjd iomode = NFSV3WRITE_UNSTABLE; 341204076Spjd else 342204076Spjd iomode = NFSV3WRITE_FILESYNC; 343204076Spjd 344204076Spjd error = (nmp->nm_rpcops->nr_writerpc)(vp, &uio, cred, &iomode, &must_commit); 345204076Spjd 346204076Spjd pmap_qremove(kva, npages); 347204076Spjd relpbuf(bp, &nfs_pbuf_freecnt); 348204076Spjd 349204076Spjd if (!error) { 350204076Spjd int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE; 351204076Spjd for (i = 0; i < nwritten; i++) { 352204076Spjd rtvals[i] = VM_PAGER_OK; 353204076Spjd vm_page_undirty(pages[i]); 354204076Spjd } 355204076Spjd if (must_commit) { 356204076Spjd 
nfs_clearcommit(vp->v_mount); 357204076Spjd } 358204076Spjd } 359204076Spjd return rtvals[0]; 360204076Spjd} 361204076Spjd 362204076Spjd/* 363204076Spjd * For nfs, cache consistency can only be maintained approximately. 364204076Spjd * Although RFC1094 does not specify the criteria, the following is 365204076Spjd * believed to be compatible with the reference port. 366204076Spjd * For nfs: 367204076Spjd * If the file's modify time on the server has changed since the 368204076Spjd * last read rpc or you have written to the file, 369204076Spjd * you may have lost data cache consistency with the 370204076Spjd * server, so flush all of the file's data out of the cache. 371204076Spjd * Then force a getattr rpc to ensure that you have up to date 372204076Spjd * attributes. 373204076Spjd * NB: This implies that cache data can be read when up to 374204076Spjd * NFS_ATTRTIMEO seconds out of date. If you find that you need current 375204076Spjd * attributes this could be forced by setting n_attrstamp to 0 before 376204076Spjd * the VOP_GETATTR() call. 377204076Spjd */ 378204076Spjdstatic inline int 379204076Spjdnfs_bioread_check_cons(struct vnode *vp, struct thread *td, struct ucred *cred) 380204076Spjd{ 381204076Spjd int error = 0; 382204076Spjd struct vattr vattr; 383204076Spjd struct nfsnode *np = VTONFS(vp); 384204076Spjd int old_lock; 385204076Spjd struct nfsmount *nmp = VFSTONFS(vp->v_mount); 386204076Spjd 387204076Spjd /* 388204076Spjd * Grab the exclusive lock before checking whether the cache is 389204076Spjd * consistent. 390204076Spjd * XXX - We can make this cheaper later (by acquiring cheaper locks). 391204076Spjd * But for now, this suffices. 
392204076Spjd */ 393204076Spjd old_lock = nfs_upgrade_vnlock(vp, td); 394204076Spjd mtx_lock(&np->n_mtx); 395204076Spjd if (np->n_flag & NMODIFIED) { 396204076Spjd mtx_unlock(&np->n_mtx); 397204076Spjd if (vp->v_type != VREG) { 398204076Spjd if (vp->v_type != VDIR) 399204076Spjd panic("nfs: bioread, not dir"); 400204076Spjd (nmp->nm_rpcops->nr_invaldir)(vp); 401204076Spjd error = nfs_vinvalbuf(vp, V_SAVE, td, 1); 402204076Spjd if (error) 403204076Spjd goto out; 404204076Spjd } 405204076Spjd np->n_attrstamp = 0; 406204076Spjd error = VOP_GETATTR(vp, &vattr, cred, td); 407204076Spjd if (error) 408204076Spjd goto out; 409204076Spjd mtx_lock(&np->n_mtx); 410204076Spjd np->n_mtime = vattr.va_mtime; 411204076Spjd mtx_unlock(&np->n_mtx); 412204076Spjd } else { 413204076Spjd mtx_unlock(&np->n_mtx); 414204076Spjd error = VOP_GETATTR(vp, &vattr, cred, td); 415204076Spjd if (error) 416204076Spjd return (error); 417204076Spjd mtx_lock(&np->n_mtx); 418204076Spjd if ((np->n_flag & NSIZECHANGED) 419204076Spjd || (NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime))) { 420204076Spjd mtx_unlock(&np->n_mtx); 421204076Spjd if (vp->v_type == VDIR) 422204076Spjd (nmp->nm_rpcops->nr_invaldir)(vp); 423214284Spjd error = nfs_vinvalbuf(vp, V_SAVE, td, 1); 424214284Spjd if (error) 425214284Spjd goto out; 426214284Spjd mtx_lock(&np->n_mtx); 427214284Spjd np->n_mtime = vattr.va_mtime; 428214284Spjd np->n_flag &= ~NSIZECHANGED; 429214284Spjd } 430214284Spjd mtx_unlock(&np->n_mtx); 431214284Spjd } 432214284Spjdout: 433214284Spjd nfs_downgrade_vnlock(vp, td, old_lock); 434214284Spjd return error; 435214284Spjd} 436214284Spjd 437214284Spjd/* 438214284Spjd * Vnode op for read using bio 439214284Spjd */ 440214284Spjdint 441204076Spjdnfs_bioread(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred) 442204076Spjd{ 443204076Spjd struct nfsnode *np = VTONFS(vp); 444204076Spjd int biosize, i; 445204076Spjd struct buf *bp, *rabp; 446204076Spjd struct thread *td; 447204076Spjd struct nfsmount 
*nmp = VFSTONFS(vp->v_mount); 448204076Spjd daddr_t lbn, rabn; 449204076Spjd int bcount; 450204076Spjd int seqcount; 451204076Spjd int nra, error = 0, n = 0, on = 0; 452204076Spjd 453204076Spjd#ifdef DIAGNOSTIC 454204076Spjd if (uio->uio_rw != UIO_READ) 455204076Spjd panic("nfs_read mode"); 456204076Spjd#endif 457204076Spjd if (uio->uio_resid == 0) 458204076Spjd return (0); 459204076Spjd if (uio->uio_offset < 0) /* XXX VDIR cookies can be negative */ 460204076Spjd return (EINVAL); 461204076Spjd td = uio->uio_td; 462204076Spjd 463204076Spjd mtx_lock(&nmp->nm_mtx); 464204076Spjd if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && 465204076Spjd (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) { 466204076Spjd mtx_unlock(&nmp->nm_mtx); 467204076Spjd (void)nfs_fsinfo(nmp, vp, cred, td); 468204076Spjd } else 469204076Spjd mtx_unlock(&nmp->nm_mtx); 470204076Spjd 471204076Spjd if (vp->v_type != VDIR && 472209181Spjd (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize) 473204076Spjd return (EFBIG); 474204076Spjd 475204076Spjd if (nfs_directio_enable && (ioflag & IO_DIRECT) && (vp->v_type == VREG)) 476214284Spjd /* No caching/ no readaheads. Just read data into the user buffer */ 477214284Spjd return nfs_readrpc(vp, uio, cred); 478214284Spjd 479214284Spjd biosize = vp->v_mount->mnt_stat.f_iosize; 480214284Spjd seqcount = (int)((off_t)(ioflag >> IO_SEQSHIFT) * biosize / BKVASIZE); 481204076Spjd 482219844Spjd error = nfs_bioread_check_cons(vp, td, cred); 483204076Spjd if (error) 484204076Spjd return error; 485204076Spjd 486204076Spjd do { 487204076Spjd u_quad_t nsize; 488218218Spjd 489218218Spjd mtx_lock(&np->n_mtx); 490218218Spjd nsize = np->n_size; 491218218Spjd mtx_unlock(&np->n_mtx); 492218218Spjd 493218218Spjd switch (vp->v_type) { 494218218Spjd case VREG: 495218218Spjd nfsstats.biocache_reads++; 496218218Spjd lbn = uio->uio_offset / biosize; 497218218Spjd on = uio->uio_offset & (biosize - 1); 498218218Spjd 499218218Spjd /* 500218218Spjd * Start the read ahead(s), as required. 
501218218Spjd */ 502218218Spjd if (nmp->nm_readahead > 0) { 503218218Spjd for (nra = 0; nra < nmp->nm_readahead && nra < seqcount && 504218218Spjd (off_t)(lbn + 1 + nra) * biosize < nsize; nra++) { 505218218Spjd rabn = lbn + 1 + nra; 506218218Spjd if (incore(&vp->v_bufobj, rabn) == NULL) { 507218218Spjd rabp = nfs_getcacheblk(vp, rabn, biosize, td); 508218218Spjd if (!rabp) { 509218218Spjd error = nfs_sigintr(nmp, NULL, td); 510218218Spjd return (error ? error : EINTR); 511218218Spjd } 512218218Spjd if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { 513220006Spjd rabp->b_flags |= B_ASYNC; 514218218Spjd rabp->b_iocmd = BIO_READ; 515218218Spjd vfs_busy_pages(rabp, 0); 516218218Spjd if (nfs_asyncio(nmp, rabp, cred, td)) { 517218218Spjd rabp->b_flags |= B_INVAL; 518218218Spjd rabp->b_ioflags |= BIO_ERROR; 519218218Spjd vfs_unbusy_pages(rabp); 520218218Spjd brelse(rabp); 521218218Spjd break; 522218218Spjd } 523218218Spjd } else { 524218218Spjd brelse(rabp); 525218218Spjd } 526218218Spjd } 527218218Spjd } 528220898Spjd } 529205738Spjd 530205738Spjd /* Note that bcount is *not* DEV_BSIZE aligned. */ 531204076Spjd bcount = biosize; 532205738Spjd if ((off_t)lbn * biosize >= nsize) { 533204076Spjd bcount = 0; 534204076Spjd } else if ((off_t)(lbn + 1) * biosize > nsize) { 535204076Spjd bcount = nsize - (off_t)lbn * biosize; 536204076Spjd } 537204076Spjd bp = nfs_getcacheblk(vp, lbn, bcount, td); 538204076Spjd 539204076Spjd if (!bp) { 540204076Spjd error = nfs_sigintr(nmp, NULL, td); 541220898Spjd return (error ? error : EINTR); 542204076Spjd } 543218138Spjd 544218138Spjd /* 545205738Spjd * If B_CACHE is not set, we must issue the read. If this 546205738Spjd * fails, we return an error. 
547211983Spjd */ 548205738Spjd 549218218Spjd if ((bp->b_flags & B_CACHE) == 0) { 550220898Spjd bp->b_iocmd = BIO_READ; 551218218Spjd vfs_busy_pages(bp, 0); 552220898Spjd error = nfs_doio(vp, bp, cred, td); 553220898Spjd if (error) { 554204076Spjd brelse(bp); 555204076Spjd return (error); 556204076Spjd } 557204076Spjd } 558204076Spjd 559204076Spjd /* 560204076Spjd * on is the offset into the current bp. Figure out how many 561204076Spjd * bytes we can copy out of the bp. Note that bcount is 562204076Spjd * NOT DEV_BSIZE aligned. 563204076Spjd * 564204076Spjd * Then figure out how many bytes we can copy into the uio. 565204076Spjd */ 566204076Spjd 567205738Spjd n = 0; 568204076Spjd if (on < bcount) 569204076Spjd n = min((unsigned)(bcount - on), uio->uio_resid); 570204076Spjd break; 571204076Spjd case VLNK: 572204076Spjd nfsstats.biocache_readlinks++; 573204076Spjd bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, td); 574204076Spjd if (!bp) { 575205738Spjd error = nfs_sigintr(nmp, NULL, td); 576204076Spjd return (error ? 
error : EINTR); 577204076Spjd } 578204076Spjd if ((bp->b_flags & B_CACHE) == 0) { 579204076Spjd bp->b_iocmd = BIO_READ; 580204076Spjd vfs_busy_pages(bp, 0); 581204076Spjd error = nfs_doio(vp, bp, cred, td); 582204076Spjd if (error) { 583204076Spjd bp->b_ioflags |= BIO_ERROR; 584220898Spjd brelse(bp); 585220898Spjd return (error); 586204076Spjd } 587204076Spjd } 588204076Spjd n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid); 589204076Spjd on = 0; 590204076Spjd break; 591204076Spjd case VDIR: 592204076Spjd nfsstats.biocache_readdirs++; 593204076Spjd if (np->n_direofoffset 594204076Spjd && uio->uio_offset >= np->n_direofoffset) { 595204076Spjd return (0); 596204076Spjd } 597204076Spjd lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ; 598204076Spjd on = uio->uio_offset & (NFS_DIRBLKSIZ - 1); 599204076Spjd bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, td); 600204076Spjd if (!bp) { 601204076Spjd error = nfs_sigintr(nmp, NULL, td); 602204076Spjd return (error ? error : EINTR); 603204076Spjd } 604204076Spjd if ((bp->b_flags & B_CACHE) == 0) { 605204076Spjd bp->b_iocmd = BIO_READ; 606204076Spjd vfs_busy_pages(bp, 0); 607204076Spjd error = nfs_doio(vp, bp, cred, td); 608204076Spjd if (error) { 609218218Spjd brelse(bp); 610204076Spjd } 611218218Spjd while (error == NFSERR_BAD_COOKIE) { 612204076Spjd (nmp->nm_rpcops->nr_invaldir)(vp); 613204076Spjd error = nfs_vinvalbuf(vp, 0, td, 1); 614204076Spjd /* 615204076Spjd * Yuck! The directory has been modified on the 616214284Spjd * server. The only way to get the block is by 617214284Spjd * reading from the beginning to get all the 618214284Spjd * offset cookies. 619214284Spjd * 620214284Spjd * Leave the last bp intact unless there is an error. 621214284Spjd * Loop back up to the while if the error is another 622214284Spjd * NFSERR_BAD_COOKIE (double yuch!). 
623214284Spjd */ 624214284Spjd for (i = 0; i <= lbn && !error; i++) { 625214284Spjd if (np->n_direofoffset 626214284Spjd && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset) 627214284Spjd return (0); 628214284Spjd bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, td); 629204076Spjd if (!bp) { 630204076Spjd error = nfs_sigintr(nmp, NULL, td); 631204076Spjd return (error ? error : EINTR); 632204076Spjd } 633204076Spjd if ((bp->b_flags & B_CACHE) == 0) { 634204076Spjd bp->b_iocmd = BIO_READ; 635204076Spjd vfs_busy_pages(bp, 0); 636204076Spjd error = nfs_doio(vp, bp, cred, td); 637204076Spjd /* 638204076Spjd * no error + B_INVAL == directory EOF, 639205738Spjd * use the block. 640204076Spjd */ 641204076Spjd if (error == 0 && (bp->b_flags & B_INVAL)) 642204076Spjd break; 643204076Spjd } 644204076Spjd /* 645204076Spjd * An error will throw away the block and the 646204076Spjd * for loop will break out. If no error and this 647205738Spjd * is not the block we want, we throw away the 648204076Spjd * block and go for the next one via the for loop. 649204076Spjd */ 650204076Spjd if (error || i < lbn) 651204076Spjd brelse(bp); 652204076Spjd } 653204076Spjd } 654204076Spjd /* 655204076Spjd * The above while is repeated if we hit another cookie 656204076Spjd * error. If we hit an error and it wasn't a cookie error, 657204076Spjd * we give up. 658204076Spjd */ 659204076Spjd if (error) 660204076Spjd return (error); 661204076Spjd } 662204076Spjd 663204076Spjd /* 664204076Spjd * If not eof and read aheads are enabled, start one. 665204076Spjd * (You need the current block first, so that you have the 666204076Spjd * directory offset cookie of the next block.) 
667204076Spjd */ 668204076Spjd if (nmp->nm_readahead > 0 && 669204076Spjd (bp->b_flags & B_INVAL) == 0 && 670204076Spjd (np->n_direofoffset == 0 || 671204076Spjd (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) && 672204076Spjd incore(&vp->v_bufobj, lbn + 1) == NULL) { 673204076Spjd rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, td); 674204076Spjd if (rabp) { 675204076Spjd if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { 676220865Spjd rabp->b_flags |= B_ASYNC; 677220865Spjd rabp->b_iocmd = BIO_READ; 678220865Spjd vfs_busy_pages(rabp, 0); 679220865Spjd if (nfs_asyncio(nmp, rabp, cred, td)) { 680220865Spjd rabp->b_flags |= B_INVAL; 681220865Spjd rabp->b_ioflags |= BIO_ERROR; 682220865Spjd vfs_unbusy_pages(rabp); 683220865Spjd brelse(rabp); 684220865Spjd } 685220865Spjd } else { 686220865Spjd brelse(rabp); 687220865Spjd } 688220865Spjd } 689220865Spjd } 690220865Spjd /* 691220865Spjd * Unlike VREG files, whos buffer size ( bp->b_bcount ) is 692220865Spjd * chopped for the EOF condition, we cannot tell how large 693220865Spjd * NFS directories are going to be until we hit EOF. So 694220865Spjd * an NFS directory buffer is *not* chopped to its EOF. Now, 695204076Spjd * it just so happens that b_resid will effectively chop it 696204076Spjd * to EOF. *BUT* this information is lost if the buffer goes 697204076Spjd * away and is reconstituted into a B_CACHE state ( due to 698204076Spjd * being VMIO ) later. So we keep track of the directory eof 699204076Spjd * in np->n_direofoffset and chop it off as an extra step 700204076Spjd * right here. 
701204076Spjd */ 702204076Spjd n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on); 703204076Spjd if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset) 704204076Spjd n = np->n_direofoffset - uio->uio_offset; 705204076Spjd break; 706204076Spjd default: 707204076Spjd nfs_printf(" nfs_bioread: type %x unexpected\n", vp->v_type); 708204076Spjd bp = NULL; 709205738Spjd break; 710204076Spjd }; 711204076Spjd 712204076Spjd if (n > 0) { 713204076Spjd error = uiomove(bp->b_data + on, (int)n, uio); 714204076Spjd } 715204076Spjd if (vp->v_type == VLNK) 716204076Spjd n = 0; 717204076Spjd if (bp != NULL) 718204076Spjd brelse(bp); 719204076Spjd } while (error == 0 && uio->uio_resid > 0 && n > 0); 720204076Spjd return (error); 721204076Spjd} 722204076Spjd 723204076Spjd/* 724204076Spjd * The NFS write path cannot handle iovecs with len > 1. So we need to 725204076Spjd * break up iovecs accordingly (restricting them to wsize). 726204076Spjd * For the SYNC case, we can do this with 1 copy (user buffer -> mbuf). 727204076Spjd * For the ASYNC case, 2 copies are needed. The first a copy from the 728214274Spjd * user buffer to a staging buffer and then a second copy from the staging 729223181Strociny * buffer to mbufs. This can be optimized by copying from the user buffer 730220271Spjd * directly into mbufs and passing the chain down, but that requires a 731220271Spjd * fair amount of re-working of the relevant codepaths (and can be done 732220271Spjd * later). 
 */
/*
 * Write directly to the server, bypassing the buffer cache.
 * In the IO_SYNC case each chunk is pushed with a synchronous
 * FILESYNC write RPC; otherwise the data is copied into per-chunk
 * pbufs and handed to the nfsiod's via nfs_asyncio(), falling back
 * to the synchronous path if no iod can take the request.
 */
static int
nfs_directio_write(vp, uiop, cred, ioflag)
	struct vnode *vp;
	struct uio *uiop;
	struct ucred *cred;
	int ioflag;
{
	int error;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	struct thread *td = uiop->uio_td;
	int size;
	int wsize;
	
	/* Snapshot the mount's write size under the mount mutex. */
	mtx_lock(&nmp->nm_mtx);
	wsize = nmp->nm_wsize;
	mtx_unlock(&nmp->nm_mtx);
	if (ioflag & IO_SYNC) {
		int iomode, must_commit;
		struct uio uio;
		struct iovec iov;
do_sync:
		/*
		 * Push the user data one wsize-bounded chunk at a time,
		 * each as a single-iovec FILESYNC write RPC straight from
		 * the caller's (user-space) buffer.
		 */
		while (uiop->uio_resid > 0) {
			size = min(uiop->uio_resid, wsize);
			size = min(uiop->uio_iov->iov_len, size);
			iov.iov_base = uiop->uio_iov->iov_base;
			iov.iov_len = size;
			uio.uio_iov = &iov;
			uio.uio_iovcnt = 1;
			uio.uio_offset = uiop->uio_offset;
			uio.uio_resid = size;
			uio.uio_segflg = UIO_USERSPACE;
			uio.uio_rw = UIO_WRITE;
			uio.uio_td = td;
			iomode = NFSV3WRITE_FILESYNC;
			error = (nmp->nm_rpcops->nr_writerpc)(vp, &uio, cred,
						     &iomode, &must_commit);
			/* FILESYNC writes must never require a commit. */
			KASSERT((must_commit == 0),
				("nfs_directio_write: Did not commit write"));
			if (error)
				return (error);
			/* Advance the caller's uio past the chunk just written. */
			uiop->uio_offset += size;
			uiop->uio_resid -= size;
			if (uiop->uio_iov->iov_len <= size) {
				uiop->uio_iovcnt--;
				uiop->uio_iov++;
			} else {
				uiop->uio_iov->iov_base =
					(char *)uiop->uio_iov->iov_base + size;
				uiop->uio_iov->iov_len -= size;
			}
		}
	} else {
		struct uio *t_uio;
		struct iovec *t_iov;
		struct buf *bp;
		
		/*
		 * Break up the write into blocksize chunks and hand these
		 * over to nfsiod's for write back.
		 * Unfortunately, this incurs a copy of the data. Since
		 * the user could modify the buffer before the write is
		 * initiated.
		 *
		 * The obvious optimization here is that one of the 2 copies
		 * in the async write path can be eliminated by copying the
		 * data here directly into mbufs and passing the mbuf chain
		 * down. But that will require a fair amount of re-working
		 * of the code and can be done if there's enough interest
		 * in NFS directio access.
		 */
		while (uiop->uio_resid > 0) {
			size = min(uiop->uio_resid, wsize);
			size = min(uiop->uio_iov->iov_len, size);
			/* Private pbuf + uio/iovec + data copy for the iod. */
			bp = getpbuf(&nfs_pbuf_freecnt);
			t_uio = malloc(sizeof(struct uio), M_NFSDIRECTIO, M_WAITOK);
			t_iov = malloc(sizeof(struct iovec), M_NFSDIRECTIO, M_WAITOK);
			t_iov->iov_base = malloc(size, M_NFSDIRECTIO, M_WAITOK);
			t_iov->iov_len = size;
			t_uio->uio_iov = t_iov;
			t_uio->uio_iovcnt = 1;
			t_uio->uio_offset = uiop->uio_offset;
			t_uio->uio_resid = size;
			t_uio->uio_segflg = UIO_SYSSPACE;
			t_uio->uio_rw = UIO_WRITE;
			t_uio->uio_td = td;
			bcopy(uiop->uio_iov->iov_base, t_iov->iov_base, size);
			bp->b_flags |= B_DIRECT;
			bp->b_iocmd = BIO_WRITE;
			if (cred != NOCRED) {
				crhold(cred);
				bp->b_wcred = cred;
			} else 
				bp->b_wcred = NOCRED;
			bp->b_caller1 = (void *)t_uio;
			bp->b_vp = vp;
			/* Hold the vnode until nfs_doio_directwrite() drops it. */
			vhold(vp);
			error = nfs_asyncio(nmp, bp, NOCRED, td);
			if (error) {
				/*
				 * Undo everything set up for this chunk.  On
				 * EINTR propagate the error; otherwise (no
				 * iods available) fall back to the synchronous
				 * path for the remaining data.
				 */
				free(t_iov->iov_base, M_NFSDIRECTIO);
				free(t_iov, M_NFSDIRECTIO);
				free(t_uio, M_NFSDIRECTIO);
				vdrop(bp->b_vp);
				bp->b_vp = NULL;
				relpbuf(bp, &nfs_pbuf_freecnt);
				if (error == EINTR)
					return (error);
				goto do_sync;
			}
			uiop->uio_offset += size;
			uiop->uio_resid -= size;
			if (uiop->uio_iov->iov_len <= size) {
				uiop->uio_iovcnt--;
				uiop->uio_iov++;
			} else {
				uiop->uio_iov->iov_base =
					(char *)uiop->uio_iov->iov_base + size;
				uiop->uio_iov->iov_len -= size;
			}
		}
	}
	return (0);
}

/*
 * Vnode op for write using bio
 */
int
nfs_write(struct vop_write_args *ap)
{
	int biosize;
	struct uio *uio = ap->a_uio;
	struct thread *td = uio->uio_td;
	struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn;
	int bcount;
	int n, on, error = 0;
	struct proc *p = td?td->td_proc:NULL;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_td != curthread)
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	/*
	 * Report (and clear) any write error recorded by an earlier
	 * asynchronous write before accepting new data.
	 */
	mtx_lock(&np->n_mtx);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		mtx_unlock(&np->n_mtx);
		return (np->n_error);
	} else
		mtx_unlock(&np->n_mtx);
	/* Fetch fsinfo once for v3 mounts that don't have it yet. */
	mtx_lock(&nmp->nm_mtx);
	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
		mtx_unlock(&nmp->nm_mtx);
		(void)nfs_fsinfo(nmp, vp, cred, td);
	} else
		mtx_unlock(&nmp->nm_mtx);

	/*
	 * Synchronously flush pending buffers if we are in synchronous
	 * mode or if we are appending.
	 */
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		mtx_lock(&np->n_mtx);
		if (np->n_flag & NMODIFIED) {
			mtx_unlock(&np->n_mtx);
#ifdef notyet /* Needs matching nonblock semantics elsewhere, too. */
			/*
			 * Require non-blocking, synchronous writes to
			 * dirty files to inform the program it needs
			 * to fsync(2) explicitly.
			 */
			if (ioflag & IO_NDELAY)
				return (EAGAIN);
#endif
flush_and_restart:
			/* Invalidate cached attributes; server state will change. */
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, td, 1);
			if (error)
				return (error);
		} else
			mtx_unlock(&np->n_mtx);
	}

	/*
	 * If IO_APPEND then load uio_offset.  We restart here if we cannot
	 * get the append lock.
	 */
	if (ioflag & IO_APPEND) {
		np->n_attrstamp = 0;
		error = VOP_GETATTR(vp, &vattr, cred, td);
		if (error)
			return (error);
		mtx_lock(&np->n_mtx);
		uio->uio_offset = np->n_size;
		mtx_unlock(&np->n_mtx);
	}

	if (uio->uio_offset < 0)
		return (EINVAL);
	if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	if (uio->uio_resid == 0)
		return (0);

	/* O_DIRECT writes bypass the buffer cache entirely. */
	if (nfs_directio_enable && (ioflag & IO_DIRECT) && vp->v_type == VREG)
		return nfs_directio_write(vp, uio, cred, ioflag);

	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, i don't think it matters
	 */
	if (p != NULL) {
		PROC_LOCK(p);
		if (uio->uio_offset + uio->uio_resid >
		    lim_cur(p, RLIMIT_FSIZE)) {
			psignal(p, SIGXFSZ);
			PROC_UNLOCK(p);
			return (EFBIG);
		}
		PROC_UNLOCK(p);
	}

	biosize = vp->v_mount->mnt_stat.f_iosize;
	/*
	 * Find all of this file's B_NEEDCOMMIT buffers.  If our writes
	 * would exceed the local maximum per-file write commit size when
	 * combined with those, we must decide whether to flush,
	 * go synchronous, or return error.  We don't bother checking
	 * IO_UNIT -- we just make all writes atomic anyway, as there's
	 * no point optimizing for something that really won't ever happen.
	 */
	if (!(ioflag & IO_SYNC)) {
		int nflag;

		mtx_lock(&np->n_mtx);
		nflag = np->n_flag;
		mtx_unlock(&np->n_mtx);
		int needrestart = 0;
		if (nmp->nm_wcommitsize < uio->uio_resid) {
			/*
			 * If this request could not possibly be completed
			 * without exceeding the maximum outstanding write
			 * commit size, see if we can convert it into a
			 * synchronous write operation.
			 */
			if (ioflag & IO_NDELAY)
				return (EAGAIN);
			ioflag |= IO_SYNC;
			if (nflag & NMODIFIED)
				needrestart = 1;
		} else if (nflag & NMODIFIED) {
			int wouldcommit = 0;
			/* Sum the commit-pending bytes already buffered. */
			BO_LOCK(&vp->v_bufobj);
			if (vp->v_bufobj.bo_dirty.bv_cnt != 0) {
				TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd,
				    b_bobufs) {
					if (bp->b_flags & B_NEEDCOMMIT)
						wouldcommit += bp->b_bcount;
				}
			}
			BO_UNLOCK(&vp->v_bufobj);
			/*
			 * Since we're not operating synchronously and
			 * bypassing the buffer cache, we are in a commit
			 * and holding all of these buffers whether
			 * transmitted or not.  If not limited, this
			 * will lead to the buffer cache deadlocking,
			 * as no one else can flush our uncommitted buffers.
			 */
			wouldcommit += uio->uio_resid;
			/*
			 * If we would initially exceed the maximum
			 * outstanding write commit size, flush and restart.
			 */
			if (wouldcommit > nmp->nm_wcommitsize)
				needrestart = 1;
		}
		if (needrestart)
			goto flush_and_restart;
	}

	do {
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize-1);
		n = min((unsigned)(biosize - on), uio->uio_resid);
again:
		/*
		 * Handle direct append and file extension cases, calculate
		 * unaligned buffer size.
		 */
		mtx_lock(&np->n_mtx);
		if (uio->uio_offset == np->n_size && n) {
			mtx_unlock(&np->n_mtx);
			/*
			 * Get the buffer (in its pre-append state to maintain
			 * B_CACHE if it was previously set).  Resize the
			 * nfsnode after we have locked the buffer to prevent
			 * readers from reading garbage.
			 */
			bcount = on;
			bp = nfs_getcacheblk(vp, lbn, bcount, td);

			if (bp != NULL) {
				long save;

				mtx_lock(&np->n_mtx);
				np->n_size = uio->uio_offset + n;
				np->n_flag |= NMODIFIED;
				vnode_pager_setsize(vp, np->n_size);
				mtx_unlock(&np->n_mtx);

				/* allocbuf() may clear B_CACHE; preserve it. */
				save = bp->b_flags & B_CACHE;
				bcount += n;
				allocbuf(bp, bcount);
				bp->b_flags |= save;
			}
		} else {
			/*
			 * Obtain the locked cache block first, and then
			 * adjust the file's size as appropriate.
			 */
			bcount = on + n;
			if ((off_t)lbn * biosize + bcount < np->n_size) {
				if ((off_t)(lbn + 1) * biosize < np->n_size)
					bcount = biosize;
				else
					bcount = np->n_size - (off_t)lbn * biosize;
			}
			mtx_unlock(&np->n_mtx);
			bp = nfs_getcacheblk(vp, lbn, bcount, td);
			mtx_lock(&np->n_mtx);
			if (uio->uio_offset + n > np->n_size) {
				np->n_size = uio->uio_offset + n;
				np->n_flag |= NMODIFIED;
				vnode_pager_setsize(vp, np->n_size);
			}
			mtx_unlock(&np->n_mtx);
		}

		if (!bp) {
			/* getcacheblk failed: interrupted, or treat as EINTR. */
			error = nfs_sigintr(nmp, NULL, td);
			if (!error)
				error = EINTR;
			break;
		}

		/*
		 * Issue a READ if B_CACHE is not set.  In special-append
		 * mode, B_CACHE is based on the buffer prior to the write
		 * op and is typically set, avoiding the read.  If a read
		 * is required in special append mode, the server will
		 * probably send us a short-read since we extended the file
		 * on our end, resulting in b_resid == 0 and, thusly,
		 * B_CACHE getting set.
		 *
		 * We can also avoid issuing the read if the write covers
		 * the entire buffer.  We have to make sure the buffer state
		 * is reasonable in this case since we will not be initiating
		 * I/O.  See the comments in kern/vfs_bio.c's getblk() for
		 * more information.
		 *
		 * B_CACHE may also be set due to the buffer being cached
		 * normally.
		 */

		if (on == 0 && n == bcount) {
			bp->b_flags |= B_CACHE;
			bp->b_flags &= ~B_INVAL;
			bp->b_ioflags &= ~BIO_ERROR;
		}

		if ((bp->b_flags & B_CACHE) == 0) {
			bp->b_iocmd = BIO_READ;
			vfs_busy_pages(bp, 0);
			error = nfs_doio(vp, bp, cred, td);
			if (error) {
				brelse(bp);
				break;
			}
		}
		if (bp->b_wcred == NOCRED)
			bp->b_wcred = crhold(cred);
		mtx_lock(&np->n_mtx);
		np->n_flag |= NMODIFIED;
		mtx_unlock(&np->n_mtx);

		/*
		 * If dirtyend exceeds file size, chop it down.  This should
		 * not normally occur but there is an append race where it
		 * might occur XXX, so we log it.
		 *
		 * If the chopping creates a reverse-indexed or degenerate
		 * situation with dirtyoff/end, we 0 both of them.
		 */

		if (bp->b_dirtyend > bcount) {
			nfs_printf("NFS append race @%lx:%d\n",
			    (long)bp->b_blkno * DEV_BSIZE,
			    bp->b_dirtyend - bcount);
			bp->b_dirtyend = bcount;
		}

		if (bp->b_dirtyoff >= bp->b_dirtyend)
			bp->b_dirtyoff = bp->b_dirtyend = 0;

		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise force a write rpc of the old dirty area.
		 *
		 * While it is possible to merge discontiguous writes due to
		 * our having a B_CACHE buffer ( and thus valid read data
		 * for the hole), we don't because it could lead to
		 * significant cache coherency problems with multiple clients,
		 * especially if locking is implemented later on.
		 *
		 * as an optimization we could theoretically maintain
		 * a linked list of discontinuous areas, but we would still
		 * have to commit them separately so there isn't much
		 * advantage to it except perhaps a bit of asynchronization.
		 */

		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			if (bwrite(bp) == EINTR) {
				error = EINTR;
				break;
			}
			goto again;
		}

		error = uiomove((char *)bp->b_data + on, n, uio);

		/*
		 * Since this block is being modified, it must be written
		 * again and not just committed.  Since write clustering does
		 * not work for the stage 1 data write, only the stage 2
		 * commit rpc, we have to clear B_CLUSTEROK as well.
		 */
		bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);

		if (error) {
			bp->b_ioflags |= BIO_ERROR;
			brelse(bp);
			break;
		}

		/*
		 * Only update dirtyoff/dirtyend if not a degenerate
		 * condition.
		 */
		if (n) {
			if (bp->b_dirtyend > 0) {
				bp->b_dirtyoff = min(on, bp->b_dirtyoff);
				bp->b_dirtyend = max((on + n), bp->b_dirtyend);
			} else {
				bp->b_dirtyoff = on;
				bp->b_dirtyend = on + n;
			}
			vfs_bio_set_validclean(bp, on, n);
		}

		/*
		 * If IO_SYNC do bwrite().
		 *
		 * IO_INVAL appears to be unused.  The idea appears to be
		 * to turn off caching in this case.  Very odd.  XXX
		 */
		if ((ioflag & IO_SYNC)) {
			if (ioflag & IO_INVAL)
				bp->b_flags |= B_NOCACHE;
			error = bwrite(bp);
			if (error)
				break;
		} else if ((n + on) == biosize) {
			/* Full block: start an async write immediately. */
			bp->b_flags |= B_ASYNC;
			(void) (nmp->nm_rpcops->nr_writebp)(bp, 0, NULL);
		} else {
			/* Partial block: delay the write. */
			bdwrite(bp);
		}
	} while (uio->uio_resid > 0 && n > 0);

	return (error);
}

/*
 * Get an nfs cache block.
 *
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 *
 * The caller must carefully deal with the possible B_INVAL state of
 * the buffer.  nfs_doio() clears B_INVAL (and nfs_asyncio() clears it
 * indirectly), so synchronous reads can be issued without worrying about
 * the B_INVAL state.  We have to be a little more careful when dealing
 * with writes (see comments in nfs_write()) when extending a file past
 * its EOF.
 */
static struct buf *
nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size, struct thread *td)
{
	struct buf *bp;
	struct mount *mp;
	struct nfsmount *nmp;

	mp = vp->v_mount;
	nmp = VFSTONFS(mp);

	if (nmp->nm_flag & NFSMNT_INT) {
		sigset_t oldset;

		/*
		 * Interruptible mount: allow a signal to break out of
		 * getblk(), then retry with a 2*hz timeout until we either
		 * get a buffer or detect a pending interrupt.
		 */
		nfs_set_sigmask(td, &oldset);
		bp = getblk(vp, bn, size, PCATCH, 0, 0);
		nfs_restore_sigmask(td, &oldset);
		while (bp == NULL) {
			if (nfs_sigintr(nmp, NULL, td))
				return (NULL);
			bp = getblk(vp, bn, size, 0, 2 * hz, 0);
		}
	} else {
		/* Non-interruptible mount: block indefinitely. */
		bp = getblk(vp, bn, size, 0, 0, 0);
	}

	if (vp->v_type == VREG) {
		int biosize;

		/* Map the logical block number to a DEV_BSIZE block address. */
		biosize = mp->mnt_stat.f_iosize;
		bp->b_blkno = bn * (biosize / DEV_BSIZE);
	}
	return (bp);
}

/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(struct vnode *vp, int flags, struct thread *td, int intrflg)
{
	struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;
	int old_lock = 0;

	ASSERT_VOP_LOCKED(vp, "nfs_vinvalbuf");

	/*
	 * XXX This check stops us from needlessly doing a vinvalbuf when
	 * being called through vclean().  It is not clear that this is
	 * unsafe.
	 */
	if (vp->v_iflag & VI_DOOMED)
		return (0);

	/* Interruptible sleeps only make sense on interruptible mounts. */
	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}

	old_lock = nfs_upgrade_vnlock(vp, td);
	/*
	 * Now, flush as required.
	 */
	if ((flags & V_SAVE) && (vp->v_bufobj.bo_object != NULL)) {
		vm_object_page_clean(vp->v_bufobj.bo_object, 0, 0, OBJPC_SYNC);
		/*
		 * If the page clean was interrupted, fail the invalidation.
		 * Not doing so, we run the risk of losing dirty pages in the
		 * vinvalbuf() call below.
		 */
		if (intrflg && (error = nfs_sigintr(nmp, NULL, td)))
			goto out;
	}

	/* Retry the invalidation until it succeeds or we are interrupted. */
	error = vinvalbuf(vp, flags, td, slpflag, 0);
	while (error) {
		if (intrflg && (error = nfs_sigintr(nmp, NULL, td)))
			goto out;
		error = vinvalbuf(vp, flags, td, 0, slptimeo);
	}
	/*
	 * Only clear NMODIFIED once no direct async writes remain
	 * outstanding; they still hold modified data for this node.
	 */
	mtx_lock(&np->n_mtx);
	if (np->n_directio_asyncwr == 0)
		np->n_flag &= ~NMODIFIED;
	mtx_unlock(&np->n_mtx);
out:
	nfs_downgrade_vnlock(vp, td, old_lock);
	return error;
}

/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 *
 * Note: nfs_asyncio() does not clear (BIO_ERROR|B_INVAL) but when the bp
 * is eventually dequeued by the async daemon, nfs_doio() *will*.
 */
int
nfs_asyncio(struct nfsmount *nmp, struct buf *bp, struct ucred *cred, struct thread *td)
{
	int iod;
	int gotiod;
	int slpflag = 0;
	int slptimeo = 0;
	int error, error2;

	/*
	 * Commits are usually short and sweet so lets save some cpu and
	 * leave the async daemons for more important rpc's (such as reads
	 * and writes).
	 */
	mtx_lock(&nfs_iod_mtx);
	if (bp->b_iocmd == BIO_WRITE && (bp->b_flags & B_NEEDCOMMIT) &&
	    (nmp->nm_bufqiods > nfs_numasync / 2)) {
		mtx_unlock(&nfs_iod_mtx);
		return(EIO);
	}
again:
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	gotiod = FALSE;

	/*
	 * Find a free iod to process this request.
	 */
	for (iod = 0; iod < nfs_numasync; iod++)
		if (nfs_iodwant[iod]) {
			gotiod = TRUE;
			break;
		}

	/*
	 * Try to create one if none are free.
	 */
	if (!gotiod) {
		iod = nfs_nfsiodnew();
		if (iod != -1)
			gotiod = TRUE;
	}

	if (gotiod) {
		/*
		 * Found one, so wake it up and tell it which
		 * mount to process.
		 */
		NFS_DPF(ASYNCIO, ("nfs_asyncio: waking iod %d for mount %p\n",
		    iod, nmp));
		nfs_iodwant[iod] = NULL;
		nfs_iodmount[iod] = nmp;
		nmp->nm_bufqiods++;
		wakeup(&nfs_iodwant[iod]);
	}

	/*
	 * If none are free, we may already have an iod working on this mount
	 * point.  If so, it will process our request.
	 */
	if (!gotiod) {
		if (nmp->nm_bufqiods > 0) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: %d iods are already processing mount %p\n",
				 nmp->nm_bufqiods, nmp));
			gotiod = TRUE;
		}
	}

	/*
	 * If we have an iod which can process the request, then queue
	 * the buffer.
	 */
	if (gotiod) {
		/*
		 * Ensure that the queue never grows too large.  We still want
		 * to asynchronize so we block rather then return EIO.
		 */
		while (nmp->nm_bufqlen >= 2*nfs_numasync) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
			nmp->nm_bufqwant = TRUE;
			error = nfs_msleep(td, &nmp->nm_bufq, &nfs_iod_mtx, 
					   slpflag | PRIBIO,
					   "nfsaio", slptimeo);
			if (error) {
				error2 = nfs_sigintr(nmp, NULL, td);
				if (error2) {
					mtx_unlock(&nfs_iod_mtx);
					return (error2);
				}
				/*
				 * After the first interrupted sleep, fall
				 * back to an uninterruptible timed sleep.
				 */
				if (slpflag == PCATCH) {
					slpflag = 0;
					slptimeo = 2 * hz;
				}
			}
			/*
			 * We might have lost our iod while sleeping,
			 * so check and loop if nescessary.
			 */
			if (nmp->nm_bufqiods == 0) {
				NFS_DPF(ASYNCIO,
					("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
				goto again;
			}
		}

		/* Attach the caller's credential for the iod to use. */
		if (bp->b_iocmd == BIO_READ) {
			if (bp->b_rcred == NOCRED && cred != NOCRED)
				bp->b_rcred = crhold(cred);
		} else {
			if (bp->b_wcred == NOCRED && cred != NOCRED)
				bp->b_wcred = crhold(cred);
		}

		if (bp->b_flags & B_REMFREE)
			bremfreef(bp);
		BUF_KERNPROC(bp);
		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
		nmp->nm_bufqlen++;
		/* Track outstanding direct async writes on the nfsnode. */
		if ((bp->b_flags & B_DIRECT) && bp->b_iocmd == BIO_WRITE) {
			mtx_lock(&(VTONFS(bp->b_vp))->n_mtx);			
			VTONFS(bp->b_vp)->n_directio_asyncwr++;
			mtx_unlock(&(VTONFS(bp->b_vp))->n_mtx);
		}
		mtx_unlock(&nfs_iod_mtx);
		return (0);
	}

	mtx_unlock(&nfs_iod_mtx);

	/*
	 * All the iods are busy on other mounts, so return EIO to
	 * force the caller to process the i/o synchronously.
	 */
	NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
	return (EIO);
}

/*
 * Complete a direct (O_DIRECT) async write queued by
 * nfs_directio_write(): issue the FILESYNC write RPC, free the
 * chunk's uio/iovec/data allocated there, drop the vnode hold,
 * and release the pbuf.  Runs in nfsiod context.
 */
void
nfs_doio_directwrite(struct buf *bp)
{
	int iomode, must_commit;
	struct uio *uiop = (struct uio *)bp->b_caller1;
	char *iov_base = uiop->uio_iov->iov_base;
	struct nfsmount *nmp = VFSTONFS(bp->b_vp->v_mount);
	
	iomode = NFSV3WRITE_FILESYNC;
	uiop->uio_td = NULL; /* NULL since we're in nfsiod */
	(nmp->nm_rpcops->nr_writerpc)(bp->b_vp, uiop, bp->b_wcred, &iomode, &must_commit);
	KASSERT((must_commit == 0), ("nfs_doio_directwrite: Did not commit write"));
	free(iov_base, M_NFSDIRECTIO);
	free(uiop->uio_iov, M_NFSDIRECTIO);
	free(uiop, M_NFSDIRECTIO);
	if ((bp->b_flags & B_DIRECT) && bp->b_iocmd == BIO_WRITE) {
		struct nfsnode *np = VTONFS(bp->b_vp);
		mtx_lock(&np->n_mtx);
		np->n_directio_asyncwr--;
		/* Wake anyone (e.g. fsync) waiting for these writes to drain. */
		if ((np->n_flag & NFSYNCWAIT) && np->n_directio_asyncwr == 0) {
			np->n_flag &= ~NFSYNCWAIT;
			wakeup((caddr_t)&np->n_directio_asyncwr);
		}
		mtx_unlock(&np->n_mtx);
	}
	vdrop(bp->b_vp);
	bp->b_vp = NULL;
	relpbuf(bp, &nfs_pbuf_freecnt);
}

/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(struct vnode *vp, struct buf *bp, struct ucred *cr, struct thread *td)
{
	struct uio *uiop;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, iomode, must_commit = 0;
	struct uio uio;
	struct iovec io;
	struct proc *p = td ? td->td_proc : NULL;
	uint8_t	 iocmd;
	
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_td = td;

	/*
	 * clear BIO_ERROR and B_INVAL state prior to initiating the I/O.  We
	 * do this here so we do not have to do it in all the code that
	 * calls us.
	 */
	bp->b_flags &= ~B_INVAL;
	bp->b_ioflags &= ~BIO_ERROR;

	KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp));
	iocmd = bp->b_iocmd;
	if (iocmd == BIO_READ) {
	    io.iov_len = uiop->uio_resid = bp->b_bcount;
	    io.iov_base = bp->b_data;
	    uiop->uio_rw = UIO_READ;

	    /* Dispatch the read RPC by vnode type. */
	    switch (vp->v_type) {
	    case VREG:
		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
		nfsstats.read_bios++;
		error = (nmp->nm_rpcops->nr_readrpc)(vp, uiop, cr);

		if (!error) {
		    if (uiop->uio_resid) {
			/*
			 * If we had a short read with no error, we must have
			 * hit a file hole.  We should zero-fill the remainder.
			 * This can also occur if the server hits the file EOF.
			 *
			 * Holes used to be able to occur due to pending
			 * writes, but that is not possible any longer.
			 */
			int nread = bp->b_bcount - uiop->uio_resid;
			int left  = uiop->uio_resid;

			if (left > 0)
				bzero((char *)bp->b_data + nread, left);
			uiop->uio_resid = 0;
		    }
		}
		/* ASSERT_VOP_LOCKED(vp, "nfs_doio"); */
		/*
		 * Kill the process if an executing text file changed on
		 * the server (mtime no longer matches what we recorded).
		 */
		if (p && (vp->v_vflag & VV_TEXT)) {
			mtx_lock(&np->n_mtx);
			if (NFS_TIMESPEC_COMPARE(&np->n_mtime, &np->n_vattr.va_mtime)) {
				mtx_unlock(&np->n_mtx);
				PROC_LOCK(p);
				killproc(p, "text file modification");
				PROC_UNLOCK(p);
			} else
				mtx_unlock(&np->n_mtx);
		}
		break;
	    case VLNK:
		uiop->uio_offset = (off_t)0;
		nfsstats.readlink_bios++;
		error = (nmp->nm_rpcops->nr_readlinkrpc)(vp, uiop, cr);
		break;
	    case VDIR:
		nfsstats.readdir_bios++;
		uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
		if ((nmp->nm_flag & NFSMNT_NFSV4) != 0)
			error = nfs4_readdirrpc(vp, uiop, cr);
		else {
			/*
			 * Prefer READDIRPLUS; fall back permanently to plain
			 * READDIR if the server doesn't support it.
			 */
			if ((nmp->nm_flag & NFSMNT_RDIRPLUS) != 0) {
				error = nfs_readdirplusrpc(vp, uiop, cr);
				if (error == NFSERR_NOTSUPP)
					nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
			}
			if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
				error = nfs_readdirrpc(vp, uiop, cr);
		}
		/*
		 * end-of-directory sets B_INVAL but does not generate an
		 * error.
		 */
		if (error == 0 && uiop->uio_resid == bp->b_bcount)
			bp->b_flags |= B_INVAL;
		break;
	    default:
		nfs_printf("nfs_doio:  type %x unexpected\n", vp->v_type);
		break;
	    };
	    if (error) {
		bp->b_ioflags |= BIO_ERROR;
		bp->b_error = error;
	    }
	} else {
	    /*
	     * If we only need to commit, try to commit
	     */
	    if (bp->b_flags & B_NEEDCOMMIT) {
		    int retv;
		    off_t off;

		    off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff;
		    retv = (nmp->nm_rpcops->nr_commit)(
				vp, off, bp->b_dirtyend-bp->b_dirtyoff,
				bp->b_wcred, td);
		    if (retv == 0) {
			    bp->b_dirtyoff = bp->b_dirtyend = 0;
			    bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
			    bp->b_resid = 0;
			    bufdone(bp);
			    return (0);
		    }
		    /*
		     * A stale write verifier means the server rebooted;
		     * every uncommitted buffer on the mount must be
		     * rewritten, not just committed.
		     */
		    if (retv == NFSERR_STALEWRITEVERF) {
			    nfs_clearcommit(vp->v_mount);
		    }
	    }

	    /*
	     * Setup for actual write
	     */
	    mtx_lock(&np->n_mtx);
	    if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
		bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE;
	    mtx_unlock(&np->n_mtx);

	    if (bp->b_dirtyend > bp->b_dirtyoff) {
		/* Write only the dirty region of the buffer. */
		io.iov_len = uiop->uio_resid = bp->b_dirtyend
		    - bp->b_dirtyoff;
		uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE
		    + bp->b_dirtyoff;
		io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
		uiop->uio_rw = UIO_WRITE;
		nfsstats.write_bios++;

		if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC)
		    iomode = NFSV3WRITE_UNSTABLE;
		else
		    iomode = NFSV3WRITE_FILESYNC;

		error = (nmp->nm_rpcops->nr_writerpc)(vp, uiop, cr, &iomode, &must_commit);

		/*
		 * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try
		 * to cluster the buffers needing commit.  This will allow
		 * the system to submit a single commit rpc for the whole
		 * cluster.  We can do this even if the buffer is not 100%
		 * dirty (relative to the NFS blocksize), so we optimize the
		 * append-to-file-case.
		 *
		 * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be
		 * cleared because write clustering only works for commit
		 * rpc's, not for the data portion of the write).
		 */

		if (!error && iomode == NFSV3WRITE_UNSTABLE) {
		    bp->b_flags |= B_NEEDCOMMIT;
		    if (bp->b_dirtyoff == 0
			&& bp->b_dirtyend == bp->b_bcount)
			bp->b_flags |= B_CLUSTEROK;
		} else {
		    bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
		}

		/*
		 * For an interrupted write, the buffer is still valid
		 * and the write hasn't been pushed to the server yet,
		 * so we can't set BIO_ERROR and report the interruption
		 * by setting B_EINTR.  For the B_ASYNC case, B_EINTR
		 * is not relevant, so the rpc attempt is essentially
		 * a noop.  For the case of a V3 write rpc not being
		 * committed to stable storage, the block is still
		 * dirty and requires either a commit rpc or another
		 * write rpc with iomode == NFSV3WRITE_FILESYNC before
		 * the block is reused. This is indicated by setting
		 * the B_DELWRI and B_NEEDCOMMIT flags.
		 *
		 * If the buffer is marked B_PAGING, it does not reside on
		 * the vp's paging queues so we cannot call bdirty().  The
		 * bp in this case is not an NFS cache block so we should
		 * be safe. XXX
		 */
		if (error == EINTR || error == EIO || error == ETIMEDOUT
		    || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
			int s;

			s = splbio();
			bp->b_flags &= ~(B_INVAL|B_NOCACHE);
			if ((bp->b_flags & B_PAGING) == 0) {
			    bdirty(bp);
			    bp->b_flags &= ~B_DONE;
			}
			if (error && (bp->b_flags & B_ASYNC) == 0)
			    bp->b_flags |= B_EINTR;
			splx(s);
	    	} else {
		    if (error) {
			/* Record the error on the nfsnode for nfs_write(). */
			bp->b_ioflags |= BIO_ERROR;
			bp->b_error = np->n_error = error;
			mtx_lock(&np->n_mtx);
			np->n_flag |= NWRITEERR;
			mtx_unlock(&np->n_mtx);
		    }
		    bp->b_dirtyoff = bp->b_dirtyend = 0;
		}
	    } else {
		/* Nothing dirty to write; complete the buffer. */
		bp->b_resid = 0;
		bufdone(bp);
		return (0);
	    }
	}
	bp->b_resid = uiop->uio_resid;
	if (must_commit)
	    nfs_clearcommit(vp->v_mount);
	bufdone(bp);
	return (error);
}

/*
 * Used to aid in handling ftruncate() operations on the NFS client side.
 * Truncation creates a number of special problems for NFS.  We have to
 * throw away VM pages and buffer cache buffers that are beyond EOF, and
 * we have to properly handle VM pages or (potentially dirty) buffers
 * that straddle the truncation point.
 */

int
nfs_meta_setsize(struct vnode *vp, struct ucred *cred, struct thread *td, u_quad_t nsize)
{
	struct nfsnode *np = VTONFS(vp);
	u_quad_t tsize;			/* file size before this call */
	int biosize = vp->v_mount->mnt_stat.f_iosize;
	int error = 0;

	/*
	 * Update the cached size under the node mutex, remembering the
	 * previous size so we can tell a shrink from a grow below.
	 */
	mtx_lock(&np->n_mtx);
	tsize = np->n_size;
	np->n_size = nsize;
	mtx_unlock(&np->n_mtx);

	if (nsize < tsize) {
		/* Shrinking: cached data beyond the new EOF must go. */
		struct buf *bp;
		daddr_t lbn;
		int bufsize;

		/*
		 * vtruncbuf() doesn't get the buffer overlapping the
		 * truncation point.  We may have a B_DELWRI and/or B_CACHE
		 * buffer that now needs to be truncated.
		 */
		error = vtruncbuf(vp, cred, td, nsize, biosize);
		/*
		 * lbn is the logical block containing the new EOF; bufsize
		 * is the number of valid bytes remaining in that block.
		 */
		lbn = nsize / biosize;
		bufsize = nsize & (biosize - 1);
		bp = nfs_getcacheblk(vp, lbn, bufsize, td);
		if (!bp)
			return EINTR;	/* NOTE(review): any prior vtruncbuf()
					 * error is dropped here; presumably the
					 * NULL return means an interrupted
					 * sleep — confirm against
					 * nfs_getcacheblk(). */
		/* Clip any dirty region to the (possibly shrunken) buffer. */
		if (bp->b_dirtyoff > bp->b_bcount)
			bp->b_dirtyoff = bp->b_bcount;
		if (bp->b_dirtyend > bp->b_bcount)
			bp->b_dirtyend = bp->b_bcount;
		bp->b_flags |= B_RELBUF;  /* don't leave garbage around */
		brelse(bp);
	} else {
		/*
		 * Growing (or unchanged): no cached data becomes stale; just
		 * tell the VM/pager about the new size.
		 */
		vnode_pager_setsize(vp, nsize);
	}
	return(error);
}
