nfs_bio.c revision 83629
11541Srgrimes/* 21541Srgrimes * Copyright (c) 1989, 1993 31541Srgrimes * The Regents of the University of California. All rights reserved. 41541Srgrimes * 51541Srgrimes * This code is derived from software contributed to Berkeley by 61541Srgrimes * Rick Macklem at The University of Guelph. 71541Srgrimes * 81541Srgrimes * Redistribution and use in source and binary forms, with or without 91541Srgrimes * modification, are permitted provided that the following conditions 101541Srgrimes * are met: 111541Srgrimes * 1. Redistributions of source code must retain the above copyright 121541Srgrimes * notice, this list of conditions and the following disclaimer. 131541Srgrimes * 2. Redistributions in binary form must reproduce the above copyright 141541Srgrimes * notice, this list of conditions and the following disclaimer in the 151541Srgrimes * documentation and/or other materials provided with the distribution. 161541Srgrimes * 3. All advertising materials mentioning features or use of this software 171541Srgrimes * must display the following acknowledgement: 181541Srgrimes * This product includes software developed by the University of 191541Srgrimes * California, Berkeley and its contributors. 201541Srgrimes * 4. Neither the name of the University nor the names of its contributors 211541Srgrimes * may be used to endorse or promote products derived from this software 221541Srgrimes * without specific prior written permission. 231541Srgrimes * 241541Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 251541Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 261541Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 271541Srgrimes * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 281541Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 291541Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 301541Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 311541Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 321541Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 331541Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 3450477Speter * SUCH DAMAGE. 351541Srgrimes * 361541Srgrimes * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95 37106369Srwatson * $FreeBSD: head/sys/nfsclient/nfs_bio.c 83629 2001-09-18 18:37:52Z imp $ 38106369Srwatson */ 391541Srgrimes 4048274Speter 4176166Smarkm#include <sys/param.h> 4276166Smarkm#include <sys/systm.h> 4312221Sbde#include <sys/bio.h> 441541Srgrimes#include <sys/buf.h> 453308Sphk#include <sys/kernel.h> 461541Srgrimes#include <sys/mount.h> 47106369Srwatson#include <sys/proc.h> 481541Srgrimes#include <sys/resourcevar.h> 4925583Speter#include <sys/signalvar.h> 501541Srgrimes#include <sys/vmmeter.h> 5125656Speter#include <sys/vnode.h> 5258377Sphk 531541Srgrimes#include <vm/vm.h> 5476166Smarkm#include <vm/vm_extern.h> 5526335Speter#include <vm/vm_page.h> 5626335Speter#include <vm/vm_object.h> 571541Srgrimes#include <vm/vm_pager.h> 58110299Sphk#include <vm/vnode_pager.h> 59110299Sphk 609369Sdg#include <nfs/rpcv2.h> 618876Srgrimes#include <nfs/nfsproto.h> 621541Srgrimes#include <nfs/nfs.h> 631541Srgrimes#include <nfs/nfsmount.h> 641541Srgrimes#include <nfs/nqnfs.h> 651541Srgrimes#include <nfs/nfsnode.h> 661541Srgrimes 671541Srgrimes/* 681541Srgrimes * Just call nfs_writebp() with the force argument set to 1. 691541Srgrimes * 701541Srgrimes * NOTE: B_DONE may or may not be set in a_bp on call. 7192723Salfred */ 7292723Salfredstatic int 7394343Sjhbnfs_bwrite(struct buf *bp) 7492723Salfred{ 7592723Salfred return (nfs_writebp(bp, 1, curthread)); 7613016Sbde} 7730739Sphk 7830739Sphkstruct buf_ops buf_ops_nfs = { 7930739Sphk "buf_ops_nfs", 8030739Sphk nfs_bwrite 8130739Sphk}; 8230739Sphk 8392723Salfred 8430739Sphkstatic struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size, 8525583Speter struct thread *td)); 86102074Sphk 8725583Speterextern int nfs_numasync; 8845433Snsayerextern int nfs_pbuf_freecnt; 8945438Snsayerextern struct nfsstats nfsstats; 9033690Sphk 9125583Speter/* 9225583Speter * Vnode op for VM getpages. 9325656Speter */ 9433818Sbdeint 9535029Sphknfs_getpages(ap) 9635029Sphk struct vop_getpages_args /* { 9725583Speter struct vnode *a_vp; 9825583Speter vm_page_t *a_m; 9933818Sbde int a_count; 10045433Snsayer int a_reqpage; 10145433Snsayer vm_ooffset_t a_offset; 10245433Snsayer } */ *ap; 10345433Snsayer{ 10445438Snsayer int i, error, nextoff, size, toff, count, npages; 10545438Snsayer struct uio uio; 10645438Snsayer struct iovec iov; 10745438Snsayer vm_offset_t kva; 10825583Speter struct buf *bp; 10994343Sjhb struct vnode *vp; 11045433Snsayer struct thread *td; 11145437Smjacob struct ucred *cred; 11245438Snsayer struct nfsmount *nmp; 11345437Smjacob vm_page_t *pages; 11445437Smjacob 11545437Smjacob GIANT_REQUIRED; 11645437Smjacob 11745437Smjacob vp = ap->a_vp; 11845437Smjacob td = curthread; /* XXX */ 11945437Smjacob cred = curthread->td_proc->p_ucred; /* XXX */ 12045433Snsayer nmp = VFSTONFS(vp->v_mount); 12145433Snsayer pages = ap->a_m; 12245437Smjacob count = ap->a_count; 12345438Snsayer 12445438Snsayer if (vp->v_object == NULL) { 12545438Snsayer printf("nfs_getpages: called with non-merged cache vnode??\n"); 12645438Snsayer return VM_PAGER_ERROR; 12745438Snsayer } 12845438Snsayer 12945438Snsayer if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && 13045438Snsayer (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) { 13145438Snsayer (void)nfs_fsinfo(nmp, vp, cred, td); 13245433Snsayer } 13333818Sbde 13433818Sbde npages = btoc(count); 13533690Sphk 13633690Sphk /* 13794343Sjhb * If the requested page is partially valid, just return it and 13858377Sphk * allow the pager to zero-out the blanks. Partially valid pages 13925583Speter * can only occur at the file EOF. 14025583Speter */ 14125583Speter 14225583Speter { 14394343Sjhb vm_page_t m = pages[ap->a_reqpage]; 14425583Speter 14525583Speter if (m->valid != 0) { 14625583Speter /* handled by vm_fault now */ 14712221Sbde /* vm_page_zero_invalid(m, TRUE); */ 14825583Speter for (i = 0; i < npages; ++i) { 14925583Speter if (i != ap->a_reqpage) 15025583Speter vm_page_free(pages[i]); 15125583Speter } 15225583Speter return(0); 15325656Speter } 15482746Sdillon } 15582746Sdillon 15682746Sdillon /* 15725583Speter * We use only the kva address for the buffer, but this is extremely 15825583Speter * convienient and fast. 159102074Sphk */ 16025583Speter bp = getpbuf(&nfs_pbuf_freecnt); 16125583Speter 16225583Speter kva = (vm_offset_t) bp->b_data; 163107849Salfred pmap_qenter(kva, pages, npages); 16425583Speter cnt.v_vnodein++; 16533690Sphk cnt.v_vnodepgsin += npages; 166107849Salfred 16725583Speter iov.iov_base = (caddr_t) kva; 16825583Speter iov.iov_len = count; 16925583Speter uio.uio_iov = &iov; 17025583Speter uio.uio_iovcnt = 1; 17125583Speter uio.uio_offset = IDX_TO_OFF(pages[0]->pindex); 17225583Speter uio.uio_resid = count; 17325583Speter uio.uio_segflg = UIO_SYSSPACE; 17425583Speter uio.uio_rw = UIO_READ; 17525656Speter uio.uio_td = td; 17682746Sdillon 17782746Sdillon error = nfs_readrpc(vp, &uio, cred); 17882746Sdillon pmap_qremove(kva, npages); 17925583Speter 18025583Speter relpbuf(bp, &nfs_pbuf_freecnt); 181102074Sphk 18225583Speter if (error && (uio.uio_resid == count)) { 18325583Speter printf("nfs_getpages: error %d\n", error); 18425583Speter for (i = 0; i < npages; ++i) { 18525583Speter if (i != ap->a_reqpage) 18625583Speter vm_page_free(pages[i]); 187106369Srwatson } 188106369Srwatson return VM_PAGER_ERROR; 189106369Srwatson } 190106369Srwatson 191106369Srwatson /* 19293593Sjhb * Calculate the number of bytes read and validate only that number 19394343Sjhb * of bytes. Note that due to pending writes, size may be 0. This 194107849Salfred * does not mean that the remaining data is invalid! 19594343Sjhb */ 196107849Salfred 19794343Sjhb size = count - uio.uio_resid; 19894343Sjhb 19994343Sjhb for (i = 0, toff = 0; i < npages; i++, toff = nextoff) { 20034901Sphk vm_page_t m; 20125583Speter nextoff = toff + PAGE_SIZE; 20294343Sjhb m = pages[i]; 20382746Sdillon 20425583Speter m->flags &= ~PG_ZERO; 20525583Speter 20625583Speter if (nextoff <= size) { 20725583Speter /* 20825583Speter * Read operation filled an entire page 20925583Speter */ 21025583Speter m->valid = VM_PAGE_BITS_ALL; 21125583Speter vm_page_undirty(m); 21225656Speter } else if (size > toff) { 21325583Speter /* 214102074Sphk * Read operation filled a partial page. 21525583Speter */ 21625583Speter m->valid = 0; 21725656Speter vm_page_set_validclean(m, 0, size - toff); 21825583Speter /* handled by vm_fault now */ 219107849Salfred /* vm_page_zero_invalid(m, TRUE); */ 22025583Speter } 22125656Speter 222107849Salfred if (i != ap->a_reqpage) { 22325583Speter /* 224103964Sbde * Whether or not to leave the page activated is up in 225103964Sbde * the air, but we should put the page on a page queue 226103964Sbde * somewhere (it already is in the object). Result: 227103964Sbde * It appears that emperical results show that 228103964Sbde * deactivating pages is best. 229103964Sbde */ 230107849Salfred 23125583Speter /* 23225656Speter * Just in case someone was asking for this page we 23325583Speter * now tell them that it is ok to use. 23425583Speter */ 23526335Speter if (!error) { 23625656Speter if (m->flags & PG_WANTED) 23726335Speter vm_page_activate(m); 238102074Sphk else 23925583Speter vm_page_deactivate(m); 24035045Sphk vm_page_wakeup(m); 24135042Sphk } else { 24235042Sphk vm_page_free(m); 24325583Speter } 24428773Sbde } 24525656Speter } 24643301Sdillon return 0; 24728773Sbde} 24836119Sphk 24935029Sphk/* 25035042Sphk * Vnode op for VM putpages. 25135042Sphk */ 25235042Sphkint 25335042Sphknfs_putpages(ap) 25436119Sphk struct vop_putpages_args /* { 25535042Sphk struct vnode *a_vp; 25635042Sphk vm_page_t *a_m; 25735042Sphk int a_count; 25835042Sphk int a_sync; 25935042Sphk int *a_rtvals; 26035042Sphk vm_ooffset_t a_offset; 26135042Sphk } */ *ap; 26235042Sphk{ 26335042Sphk struct uio uio; 26435042Sphk struct iovec iov; 26535042Sphk vm_offset_t kva; 26635029Sphk struct buf *bp; 26735042Sphk int iomode, must_commit, i, error, npages, count; 26835045Sphk off_t offset; 26935045Sphk int *rtvals; 27035045Sphk struct vnode *vp; 27126335Speter struct thread *td; 27226335Speter struct ucred *cred; 27325583Speter struct nfsmount *nmp; 27426335Speter struct nfsnode *np; 27526335Speter vm_page_t *pages; 27626335Speter 27726335Speter GIANT_REQUIRED; 27826335Speter 27926335Speter vp = ap->a_vp; 28026335Speter np = VTONFS(vp); 28182746Sdillon td = curthread; /* XXX */ 28282746Sdillon cred = curthread->td_proc->p_ucred; /* XXX */ 28382746Sdillon nmp = VFSTONFS(vp->v_mount); 28426335Speter pages = ap->a_m; 28526335Speter count = ap->a_count; 286102074Sphk rtvals = ap->a_rtvals; 28726335Speter npages = btoc(count); 28826335Speter offset = IDX_TO_OFF(pages[0]->pindex); 28982746Sdillon 29026335Speter GIANT_REQUIRED; 291107849Salfred 29226335Speter if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && 29326335Speter (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) { 29482746Sdillon (void)nfs_fsinfo(nmp, vp, cred, td); 295109521Salfred } 296109521Salfred 297109521Salfred for (i = 0; i < npages; i++) { 29883366Sjulian rtvals[i] = VM_PAGER_AGAIN; 299107849Salfred } 30082746Sdillon 30182746Sdillon /* 302107849Salfred * When putting pages, do not extend file past EOF. 303109521Salfred */ 30482746Sdillon 30525583Speter if (offset + count > np->n_size) { 30625656Speter count = np->n_size - offset; 30725583Speter if (count < 0) 30825583Speter count = 0; 30926335Speter } 3101541Srgrimes 3111541Srgrimes /* 3121541Srgrimes * We use only the kva address for the buffer, but this is extremely 3131541Srgrimes * convienient and fast. 31412221Sbde */ 31582746Sdillon bp = getpbuf(&nfs_pbuf_freecnt); 31682746Sdillon 31782746Sdillon kva = (vm_offset_t) bp->b_data; 3181541Srgrimes pmap_qenter(kva, pages, npages); 3191549Srgrimes cnt.v_vnodeout++; 320102074Sphk cnt.v_vnodepgsout += count; 3211541Srgrimes 3221541Srgrimes iov.iov_base = (caddr_t) kva; 323110286Stjr iov.iov_len = count; 3241541Srgrimes uio.uio_iov = &iov; 3251541Srgrimes uio.uio_iovcnt = 1; 3261541Srgrimes uio.uio_offset = offset; 3271541Srgrimes uio.uio_resid = count; 32899012Salfred uio.uio_segflg = UIO_SYSSPACE; 3291541Srgrimes uio.uio_rw = UIO_WRITE; 33090836Sphk uio.uio_td = td; 331110299Sphk 332110299Sphk if ((ap->a_sync & VM_PAGER_PUT_SYNC) == 0) 333110286Stjr iomode = NFSV3WRITE_UNSTABLE; 33482746Sdillon else 3351541Srgrimes iomode = NFSV3WRITE_FILESYNC; 3361541Srgrimes 3371541Srgrimes error = nfs_writerpc(vp, &uio, cred, &iomode, &must_commit); 33812221Sbde 3391541Srgrimes pmap_qremove(kva, npages); 3401541Srgrimes relpbuf(bp, &nfs_pbuf_freecnt); 3411541Srgrimes 3421541Srgrimes if (!error) { 34312221Sbde int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE; 34482746Sdillon for (i = 0; i < nwritten; i++) { 34582746Sdillon rtvals[i] = VM_PAGER_OK; 34682746Sdillon vm_page_undirty(pages[i]); 3471541Srgrimes } 3481549Srgrimes if (must_commit) { 349102074Sphk nfs_clearcommit(vp->v_mount); 3501541Srgrimes } 35125656Speter } 3521541Srgrimes return rtvals[0]; 35382746Sdillon} 3541541Srgrimes 355106369Srwatson/* 356106369Srwatson * Vnode op for read using bio 357106369Srwatson */ 358106369Srwatsonint 359106369Srwatsonnfs_bioread(vp, uio, ioflag, cred) 36093593Sjhb register struct vnode *vp; 36194343Sjhb register struct uio *uio; 3621541Srgrimes int ioflag; 36325656Speter struct ucred *cred; 36499012Salfred{ 36594343Sjhb register struct nfsnode *np = VTONFS(vp); 36694343Sjhb register int biosize, i; 36794343Sjhb struct buf *bp = 0, *rabp; 36825656Speter struct vattr vattr; 3691541Srgrimes struct thread *td; 37099012Salfred struct nfsmount *nmp = VFSTONFS(vp->v_mount); 37194343Sjhb daddr_t lbn, rabn; 37294343Sjhb int bcount; 37394343Sjhb int seqcount; 37494343Sjhb int nra, error = 0, n = 0, on = 0; 37594343Sjhb 376110299Sphk#ifdef DIAGNOSTIC 377110299Sphk if (uio->uio_rw != UIO_READ) 37882746Sdillon panic("nfs_read mode"); 37982746Sdillon#endif 3801541Srgrimes if (uio->uio_resid == 0) 38182746Sdillon return (0); 3821541Srgrimes if (uio->uio_offset < 0) /* XXX VDIR cookies can be negative */ 3831541Srgrimes return (EINVAL); 3841541Srgrimes td = uio->uio_td; 3851541Srgrimes 3861541Srgrimes if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && 3871541Srgrimes (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) 3881541Srgrimes (void)nfs_fsinfo(nmp, vp, cred, td); 3891541Srgrimes if (vp->v_type != VDIR && 3901541Srgrimes (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize) 3911541Srgrimes return (EFBIG); 3921541Srgrimes biosize = vp->v_mount->mnt_stat.f_iosize; 3931541Srgrimes seqcount = (int)((off_t)(ioflag >> 16) * biosize / BKVASIZE); 3941541Srgrimes /* 3951541Srgrimes * For nfs, cache consistency can only be maintained approximately. 3961541Srgrimes * Although RFC1094 does not specify the criteria, the following is 3971541Srgrimes * believed to be compatible with the reference port. 3981541Srgrimes * For nqnfs, full cache consistency is maintained within the loop. 3991541Srgrimes * For nfs: 4001541Srgrimes * If the file's modify time on the server has changed since the 4011541Srgrimes * last read rpc or you have written to the file, 40212221Sbde * you may have lost data cache consistency with the 4031541Srgrimes * server, so flush all of the file's data out of the cache. 4041541Srgrimes * Then force a getattr rpc to ensure that you have up to date 4051541Srgrimes * attributes. 4061541Srgrimes * NB: This implies that cache data can be read when up to 40712221Sbde * NFS_ATTRTIMEO seconds out of date. If you find that you need current 40882746Sdillon * attributes this could be forced by setting n_attrstamp to 0 before 40982746Sdillon * the VOP_GETATTR() call. 41082746Sdillon */ 4111549Srgrimes if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) { 412102074Sphk if (np->n_flag & NMODIFIED) { 4131541Srgrimes if (vp->v_type != VREG) { 41483366Sjulian if (vp->v_type != VDIR) 41534961Sphk panic("nfs: bioread, not dir"); 4161541Srgrimes nfs_invaldir(vp); 4171541Srgrimes error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1); 4181541Srgrimes if (error) 4191541Srgrimes return (error); 42082746Sdillon } 4211541Srgrimes np->n_attrstamp = 0; 4221541Srgrimes error = VOP_GETATTR(vp, &vattr, cred, td); 42336128Sbde if (error) 4241541Srgrimes return (error); 4251541Srgrimes np->n_mtime = vattr.va_mtime.tv_sec; 4261541Srgrimes } else { 4271541Srgrimes error = VOP_GETATTR(vp, &vattr, cred, td); 428111034Stjr if (error) 4291541Srgrimes return (error); 430111034Stjr if (np->n_mtime != vattr.va_mtime.tv_sec) { 43135058Sphk if (vp->v_type == VDIR) 43236119Sphk nfs_invaldir(vp); 43335058Sphk error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1); 43435058Sphk if (error) 4351541Srgrimes return (error); 43634961Sphk np->n_mtime = vattr.va_mtime.tv_sec; 43734961Sphk } 43882746Sdillon } 439111034Stjr } 4401541Srgrimes do { 441111034Stjr 44282746Sdillon /* 443110286Stjr * Get a valid lease. If cached data is stale, flush it. 4441541Srgrimes */ 4451541Srgrimes if (nmp->nm_flag & NFSMNT_NQNFS) { 44612221Sbde if (NQNFS_CKINVALID(vp, np, ND_READ)) { 4471541Srgrimes do { 4481541Srgrimes error = nqnfs_getlease(vp, ND_READ, cred, td); 4491541Srgrimes } while (error == NQNFS_EXPIRED); 4501541Srgrimes if (error) 45112221Sbde return (error); 45282746Sdillon if (np->n_lrev != np->n_brev || 45382746Sdillon (np->n_flag & NQNFSNONCACHE) || 45482746Sdillon ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) { 4551549Srgrimes if (vp->v_type == VDIR) 456102074Sphk nfs_invaldir(vp); 4571541Srgrimes error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1); 45883366Sjulian if (error) 459111034Stjr return (error); 46034961Sphk np->n_brev = np->n_lrev; 461111034Stjr } 4621541Srgrimes } else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) { 463111034Stjr nfs_invaldir(vp); 464111034Stjr error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1); 465111034Stjr if (error) 466111034Stjr return (error); 467111034Stjr } 4681541Srgrimes } 4691541Srgrimes if (np->n_flag & NQNFSNONCACHE) { 470111034Stjr switch (vp->v_type) { 4711541Srgrimes case VREG: 472111034Stjr return (nfs_readrpc(vp, uio, cred)); 473111034Stjr case VLNK: 474111034Stjr return (nfs_readlinkrpc(vp, uio, cred)); 475111034Stjr case VDIR: 476111034Stjr break; 477111034Stjr default: 47882746Sdillon printf(" NQNFSNONCACHE: type %x unexpected\n", 4791541Srgrimes vp->v_type); 480111034Stjr }; 48135058Sphk } 48269286Sjake switch (vp->v_type) { 48335058Sphk case VREG: 48469286Sjake nfsstats.biocache_reads++; 48569286Sjake lbn = uio->uio_offset / biosize; 48636119Sphk on = uio->uio_offset & (biosize - 1); 48735044Sphk 488111034Stjr /* 4891541Srgrimes * Start the read ahead(s), as required. 490111034Stjr */ 491111034Stjr if (nfs_numasync > 0 && nmp->nm_readahead > 0) { 492111034Stjr for (nra = 0; nra < nmp->nm_readahead && nra < seqcount && 493111034Stjr (off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) { 494111034Stjr rabn = lbn + 1 + nra; 495111034Stjr if (!incore(vp, rabn)) { 496111034Stjr rabp = nfs_getcacheblk(vp, rabn, biosize, td); 49782746Sdillon if (!rabp) 498111034Stjr return (EINTR); 499111034Stjr if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { 5001541Srgrimes rabp->b_flags |= B_ASYNC; 501111034Stjr rabp->b_iocmd = BIO_READ; 50282746Sdillon vfs_busy_pages(rabp, 0); 503111034Stjr if (nfs_asyncio(rabp, cred, td)) { 504111034Stjr rabp->b_flags |= B_INVAL; 505111034Stjr rabp->b_ioflags |= BIO_ERROR; 5061541Srgrimes vfs_unbusy_pages(rabp); 5071541Srgrimes brelse(rabp); 5081541Srgrimes break; 5091541Srgrimes } 5101541Srgrimes } else { 5111541Srgrimes brelse(rabp); 5121541Srgrimes } 5131541Srgrimes } 5141541Srgrimes } 51536127Sbde } 5169327Sbde 5179327Sbde /* 5189327Sbde * Obtain the buffer cache block. Figure out the buffer size 5191541Srgrimes * when we are at EOF. If we are modifying the size of the 5201541Srgrimes * buffer based on an EOF condition we need to hold 521102074Sphk * nfs_rslock() through obtaining the buffer to prevent 5221541Srgrimes * a potential writer-appender from messing with n_size. 523102074Sphk * Otherwise we may accidently truncate the buffer and 52435044Sphk * lose dirty data. 5251541Srgrimes * 5261541Srgrimes * Note that bcount is *not* DEV_BSIZE aligned. 52773916Sjhb */ 5281541Srgrimes 52935058Sphkagain: 53035058Sphk bcount = biosize; 53173916Sjhb if ((off_t)lbn * biosize >= np->n_size) { 5321541Srgrimes bcount = 0; 5331541Srgrimes } else if ((off_t)(lbn + 1) * biosize > np->n_size) { 5341541Srgrimes bcount = np->n_size - (off_t)lbn * biosize; 5351541Srgrimes } 5361541Srgrimes if (bcount != biosize) { 53736119Sphk switch(nfs_rslock(np, td)) { 53835058Sphk case ENOLCK: 53935044Sphk goto again; 54035044Sphk /* not reached */ 54169286Sjake case EINTR: 54269286Sjake case ERESTART: 54373916Sjhb return(EINTR); 5441541Srgrimes /* not reached */ 5451541Srgrimes default: 5461541Srgrimes break; 54773916Sjhb } 5481541Srgrimes } 5491541Srgrimes 5501541Srgrimes bp = nfs_getcacheblk(vp, lbn, bcount, td); 5511541Srgrimes 5521541Srgrimes if (bcount != biosize) 5531541Srgrimes nfs_rsunlock(np, td); 5541541Srgrimes if (!bp) 5551541Srgrimes return (EINTR); 5561549Srgrimes 557102074Sphk /* 5581541Srgrimes * If B_CACHE is not set, we must issue the read. If this 5591541Srgrimes * fails, we return an error. 5601541Srgrimes */ 5611541Srgrimes 5621541Srgrimes if ((bp->b_flags & B_CACHE) == 0) { 5631541Srgrimes bp->b_iocmd = BIO_READ; 5641541Srgrimes vfs_busy_pages(bp, 0); 5651541Srgrimes error = nfs_doio(bp, cred, td); 5661541Srgrimes if (error) { 5671541Srgrimes brelse(bp); 5681541Srgrimes return (error); 5691541Srgrimes } 5701541Srgrimes } 5711541Srgrimes 5721541Srgrimes /* 5731541Srgrimes * on is the offset into the current bp. Figure out how many 5741541Srgrimes * bytes we can copy out of the bp. Note that bcount is 5751541Srgrimes * NOT DEV_BSIZE aligned. 5761541Srgrimes * 5771541Srgrimes * Then figure out how many bytes we can copy into the uio. 5781549Srgrimes */ 579102074Sphk 5801541Srgrimes n = 0; 5811541Srgrimes if (on < bcount) 5821541Srgrimes n = min((unsigned)(bcount - on), uio->uio_resid); 5831541Srgrimes break; 5841541Srgrimes case VLNK: 5851541Srgrimes nfsstats.biocache_readlinks++; 5861541Srgrimes bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, td); 5871541Srgrimes if (!bp) 5881541Srgrimes return (EINTR); 5891541Srgrimes if ((bp->b_flags & B_CACHE) == 0) { 5901541Srgrimes bp->b_iocmd = BIO_READ; 5911541Srgrimes vfs_busy_pages(bp, 0); 5921541Srgrimes error = nfs_doio(bp, cred, td); 59335058Sphk if (error) { 5941541Srgrimes bp->b_ioflags |= BIO_ERROR; 5951541Srgrimes brelse(bp); 5961541Srgrimes return (error); 59735058Sphk } 5981541Srgrimes } 5991541Srgrimes n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid); 6001541Srgrimes on = 0; 6011541Srgrimes break; 6021541Srgrimes case VDIR: 6031541Srgrimes nfsstats.biocache_readdirs++; 6041541Srgrimes if (np->n_direofoffset 6051541Srgrimes && uio->uio_offset >= np->n_direofoffset) { 6061541Srgrimes return (0); 6071541Srgrimes } 6081541Srgrimes lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ; 6091541Srgrimes on = uio->uio_offset & (NFS_DIRBLKSIZ - 1); 6101541Srgrimes bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, td); 6111541Srgrimes if (!bp) 6121541Srgrimes return (EINTR); 6131541Srgrimes if ((bp->b_flags & B_CACHE) == 0) { 6141541Srgrimes bp->b_iocmd = BIO_READ; 6151541Srgrimes vfs_busy_pages(bp, 0); 6161549Srgrimes error = nfs_doio(bp, cred, td); 617102074Sphk if (error) { 6181541Srgrimes brelse(bp); 6191541Srgrimes } 6201541Srgrimes while (error == NFSERR_BAD_COOKIE) { 6211541Srgrimes printf("got bad cookie vp %p bp %p\n", vp, bp); 6221541Srgrimes nfs_invaldir(vp); 6231541Srgrimes error = nfs_vinvalbuf(vp, 0, cred, td, 1); 6241541Srgrimes /* 6251549Srgrimes * Yuck! The directory has been modified on the 626102074Sphk * server. The only way to get the block is by 6271541Srgrimes * reading from the beginning to get all the 6281541Srgrimes * offset cookies. 6291541Srgrimes * 6301541Srgrimes * Leave the last bp intact unless there is an error. 6311541Srgrimes * Loop back up to the while if the error is another 6321541Srgrimes * NFSERR_BAD_COOKIE (double yuch!). 6331541Srgrimes */ 63412819Sphk for (i = 0; i <= lbn && !error; i++) { 635102074Sphk if (np->n_direofoffset 6361541Srgrimes && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset) 6371541Srgrimes return (0); 6381541Srgrimes bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, td); 6391541Srgrimes if (!bp) 6401541Srgrimes return (EINTR); 6411541Srgrimes if ((bp->b_flags & B_CACHE) == 0) { 6421541Srgrimes bp->b_iocmd = BIO_READ; 6431541Srgrimes vfs_busy_pages(bp, 0); 6441541Srgrimes error = nfs_doio(bp, cred, td); 6451541Srgrimes /* 6461541Srgrimes * no error + B_INVAL == directory EOF, 647108142Ssam * use the block. 648108142Ssam */ 649108511Ssam if (error == 0 && (bp->b_flags & B_INVAL)) 650108142Ssam break; 651108142Ssam } 652108142Ssam /* 653108142Ssam * An error will throw away the block and the 654108142Ssam * for loop will break out. If no error and this 655108142Ssam * is not the block we want, we throw away the 656108142Ssam * block and go for the next one via the for loop. 657108511Ssam */ 658108511Ssam if (error || i < lbn) 659108511Ssam brelse(bp); 660108142Ssam } 661108142Ssam } 662108142Ssam /* 663108142Ssam * The above while is repeated if we hit another cookie 664108142Ssam * error. If we hit an error and it wasn't a cookie error, 665108142Ssam * we give up. 666108142Ssam */ 667108142Ssam if (error) 668108142Ssam return (error); 669108142Ssam } 670108142Ssam 671108142Ssam /* 672108142Ssam * If not eof and read aheads are enabled, start one. 673108142Ssam * (You need the current block first, so that you have the 674108142Ssam * directory offset cookie of the next block.) 675108142Ssam */ 676108511Ssam if (nfs_numasync > 0 && nmp->nm_readahead > 0 && 677108511Ssam (bp->b_flags & B_INVAL) == 0 && 678108511Ssam (np->n_direofoffset == 0 || 679108511Ssam (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) && 680108511Ssam !(np->n_flag & NQNFSNONCACHE) && 681108511Ssam !incore(vp, lbn + 1)) { 682108511Ssam rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, td); 683108142Ssam if (rabp) { 684108142Ssam if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { 685108142Ssam rabp->b_flags |= B_ASYNC; 686108142Ssam rabp->b_iocmd = BIO_READ; 687108511Ssam vfs_busy_pages(rabp, 0); 688108142Ssam if (nfs_asyncio(rabp, cred, td)) { 689108142Ssam rabp->b_flags |= B_INVAL; 690108511Ssam rabp->b_ioflags |= BIO_ERROR; 691108511Ssam vfs_unbusy_pages(rabp); 692108511Ssam brelse(rabp); 693108142Ssam } 694108511Ssam } else { 695108511Ssam brelse(rabp); 696108511Ssam } 697108511Ssam } 698108511Ssam } 699108511Ssam /* 700108511Ssam * Unlike VREG files, whos buffer size ( bp->b_bcount ) is 701108511Ssam * chopped for the EOF condition, we cannot tell how large 702108511Ssam * NFS directories are going to be until we hit EOF. So 703108142Ssam * an NFS directory buffer is *not* chopped to its EOF. Now, 704 * it just so happens that b_resid will effectively chop it 705 * to EOF. *BUT* this information is lost if the buffer goes 706 * away and is reconstituted into a B_CACHE state ( due to 707 * being VMIO ) later. So we keep track of the directory eof 708 * in np->n_direofoffset and chop it off as an extra step 709 * right here. 710 */ 711 n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on); 712 if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset) 713 n = np->n_direofoffset - uio->uio_offset; 714 break; 715 default: 716 printf(" nfs_bioread: type %x unexpected\n",vp->v_type); 717 break; 718 }; 719 720 if (n > 0) { 721 error = uiomove(bp->b_data + on, (int)n, uio); 722 } 723 switch (vp->v_type) { 724 case VREG: 725 break; 726 case VLNK: 727 n = 0; 728 break; 729 case VDIR: 730 /* 731 * Invalidate buffer if caching is disabled, forcing a 732 * re-read from the remote later. 733 */ 734 if (np->n_flag & NQNFSNONCACHE) 735 bp->b_flags |= B_INVAL; 736 break; 737 default: 738 printf(" nfs_bioread: type %x unexpected\n",vp->v_type); 739 } 740 brelse(bp); 741 } while (error == 0 && uio->uio_resid > 0 && n > 0); 742 return (error); 743} 744 745/* 746 * Vnode op for write using bio 747 */ 748int 749nfs_write(ap) 750 struct vop_write_args /* { 751 struct vnode *a_vp; 752 struct uio *a_uio; 753 int a_ioflag; 754 struct ucred *a_cred; 755 } */ *ap; 756{ 757 int biosize; 758 struct uio *uio = ap->a_uio; 759 struct thread *td = uio->uio_td; 760 struct vnode *vp = ap->a_vp; 761 struct nfsnode *np = VTONFS(vp); 762 struct ucred *cred = ap->a_cred; 763 int ioflag = ap->a_ioflag; 764 struct buf *bp; 765 struct vattr vattr; 766 struct nfsmount *nmp = VFSTONFS(vp->v_mount); 767 daddr_t lbn; 768 int bcount; 769 int n, on, error = 0, iomode, must_commit; 770 int haverslock = 0; 771 struct proc *p = td?td->td_proc:NULL; 772 773 GIANT_REQUIRED; 774 775#ifdef DIAGNOSTIC 776 if (uio->uio_rw != UIO_WRITE) 777 panic("nfs_write mode"); 778 if (uio->uio_segflg == UIO_USERSPACE && uio->uio_td != curthread) 779 panic("nfs_write proc"); 780#endif 781 if (vp->v_type != VREG) 782 return (EIO); 783 if (np->n_flag & NWRITEERR) { 784 np->n_flag &= ~NWRITEERR; 785 return (np->n_error); 786 } 787 if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && 788 (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) 789 (void)nfs_fsinfo(nmp, vp, cred, td); 790 791 /* 792 * Synchronously flush pending buffers if we are in synchronous 793 * mode or if we are appending. 794 */ 795 if (ioflag & (IO_APPEND | IO_SYNC)) { 796 if (np->n_flag & NMODIFIED) { 797 np->n_attrstamp = 0; 798 error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1); 799 if (error) 800 return (error); 801 } 802 } 803 804 /* 805 * If IO_APPEND then load uio_offset. We restart here if we cannot 806 * get the append lock. 807 */ 808restart: 809 if (ioflag & IO_APPEND) { 810 np->n_attrstamp = 0; 811 error = VOP_GETATTR(vp, &vattr, cred, td); 812 if (error) 813 return (error); 814 uio->uio_offset = np->n_size; 815 } 816 817 if (uio->uio_offset < 0) 818 return (EINVAL); 819 if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize) 820 return (EFBIG); 821 if (uio->uio_resid == 0) 822 return (0); 823 824 /* 825 * We need to obtain the rslock if we intend to modify np->n_size 826 * in order to guarentee the append point with multiple contending 827 * writers, to guarentee that no other appenders modify n_size 828 * while we are trying to obtain a truncated buffer (i.e. to avoid 829 * accidently truncating data written by another appender due to 830 * the race), and to ensure that the buffer is populated prior to 831 * our extending of the file. We hold rslock through the entire 832 * operation. 833 * 834 * Note that we do not synchronize the case where someone truncates 835 * the file while we are appending to it because attempting to lock 836 * this case may deadlock other parts of the system unexpectedly. 837 */ 838 if ((ioflag & IO_APPEND) || 839 uio->uio_offset + uio->uio_resid > np->n_size) { 840 switch(nfs_rslock(np, td)) { 841 case ENOLCK: 842 goto restart; 843 /* not reached */ 844 case EINTR: 845 case ERESTART: 846 return(EINTR); 847 /* not reached */ 848 default: 849 break; 850 } 851 haverslock = 1; 852 } 853 854 /* 855 * Maybe this should be above the vnode op call, but so long as 856 * file servers have no limits, i don't think it matters 857 */ 858 if (p && uio->uio_offset + uio->uio_resid > 859 p->p_rlimit[RLIMIT_FSIZE].rlim_cur) { 860 PROC_LOCK(p); 861 psignal(p, SIGXFSZ); 862 PROC_UNLOCK(p); 863 if (haverslock) 864 nfs_rsunlock(np, td); 865 return (EFBIG); 866 } 867 868 biosize = vp->v_mount->mnt_stat.f_iosize; 869 870 do { 871 /* 872 * Check for a valid write lease. 873 */ 874 if ((nmp->nm_flag & NFSMNT_NQNFS) && 875 NQNFS_CKINVALID(vp, np, ND_WRITE)) { 876 do { 877 error = nqnfs_getlease(vp, ND_WRITE, cred, td); 878 } while (error == NQNFS_EXPIRED); 879 if (error) 880 break; 881 if (np->n_lrev != np->n_brev || 882 (np->n_flag & NQNFSNONCACHE)) { 883 error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1); 884 if (error) 885 break; 886 np->n_brev = np->n_lrev; 887 } 888 } 889 if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) { 890 iomode = NFSV3WRITE_FILESYNC; 891 error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit); 892 if (must_commit) 893 nfs_clearcommit(vp->v_mount); 894 break; 895 } 896 nfsstats.biocache_writes++; 897 lbn = uio->uio_offset / biosize; 898 on = uio->uio_offset & (biosize-1); 899 n = min((unsigned)(biosize - on), uio->uio_resid); 900again: 901 /* 902 * Handle direct append and file extension cases, calculate 903 * unaligned buffer size. 904 */ 905 906 if (uio->uio_offset == np->n_size && n) { 907 /* 908 * Get the buffer (in its pre-append state to maintain 909 * B_CACHE if it was previously set). Resize the 910 * nfsnode after we have locked the buffer to prevent 911 * readers from reading garbage. 912 */ 913 bcount = on; 914 bp = nfs_getcacheblk(vp, lbn, bcount, td); 915 916 if (bp != NULL) { 917 long save; 918 919 np->n_size = uio->uio_offset + n; 920 np->n_flag |= NMODIFIED; 921 vnode_pager_setsize(vp, np->n_size); 922 923 save = bp->b_flags & B_CACHE; 924 bcount += n; 925 allocbuf(bp, bcount); 926 bp->b_flags |= save; 927 bp->b_magic = B_MAGIC_NFS; 928 bp->b_op = &buf_ops_nfs; 929 } 930 } else { 931 /* 932 * Obtain the locked cache block first, and then 933 * adjust the file's size as appropriate. 934 */ 935 bcount = on + n; 936 if ((off_t)lbn * biosize + bcount < np->n_size) { 937 if ((off_t)(lbn + 1) * biosize < np->n_size) 938 bcount = biosize; 939 else 940 bcount = np->n_size - (off_t)lbn * biosize; 941 } 942 943 bp = nfs_getcacheblk(vp, lbn, bcount, td); 944 945 if (uio->uio_offset + n > np->n_size) { 946 np->n_size = uio->uio_offset + n; 947 np->n_flag |= NMODIFIED; 948 vnode_pager_setsize(vp, np->n_size); 949 } 950 } 951 952 if (!bp) { 953 error = EINTR; 954 break; 955 } 956 957 /* 958 * Issue a READ if B_CACHE is not set. In special-append 959 * mode, B_CACHE is based on the buffer prior to the write 960 * op and is typically set, avoiding the read. If a read 961 * is required in special append mode, the server will 962 * probably send us a short-read since we extended the file 963 * on our end, resulting in b_resid == 0 and, thusly, 964 * B_CACHE getting set. 965 * 966 * We can also avoid issuing the read if the write covers 967 * the entire buffer. We have to make sure the buffer state 968 * is reasonable in this case since we will not be initiating 969 * I/O. See the comments in kern/vfs_bio.c's getblk() for 970 * more information. 971 * 972 * B_CACHE may also be set due to the buffer being cached 973 * normally. 974 */ 975 976 if (on == 0 && n == bcount) { 977 bp->b_flags |= B_CACHE; 978 bp->b_flags &= ~B_INVAL; 979 bp->b_ioflags &= ~BIO_ERROR; 980 } 981 982 if ((bp->b_flags & B_CACHE) == 0) { 983 bp->b_iocmd = BIO_READ; 984 vfs_busy_pages(bp, 0); 985 error = nfs_doio(bp, cred, td); 986 if (error) { 987 brelse(bp); 988 break; 989 } 990 } 991 if (!bp) { 992 error = EINTR; 993 break; 994 } 995 if (bp->b_wcred == NOCRED) { 996 crhold(cred); 997 bp->b_wcred = cred; 998 } 999 np->n_flag |= NMODIFIED; 1000 1001 /* 1002 * If dirtyend exceeds file size, chop it down. This should 1003 * not normally occur but there is an append race where it 1004 * might occur XXX, so we log it. 1005 * 1006 * If the chopping creates a reverse-indexed or degenerate 1007 * situation with dirtyoff/end, we 0 both of them. 1008 */ 1009 1010 if (bp->b_dirtyend > bcount) { 1011 printf("NFS append race @%lx:%d\n", 1012 (long)bp->b_blkno * DEV_BSIZE, 1013 bp->b_dirtyend - bcount); 1014 bp->b_dirtyend = bcount; 1015 } 1016 1017 if (bp->b_dirtyoff >= bp->b_dirtyend) 1018 bp->b_dirtyoff = bp->b_dirtyend = 0; 1019 1020 /* 1021 * If the new write will leave a contiguous dirty 1022 * area, just update the b_dirtyoff and b_dirtyend, 1023 * otherwise force a write rpc of the old dirty area. 1024 * 1025 * While it is possible to merge discontiguous writes due to 1026 * our having a B_CACHE buffer ( and thus valid read data 1027 * for the hole), we don't because it could lead to 1028 * significant cache coherency problems with multiple clients, 1029 * especially if locking is implemented later on. 1030 * 1031 * as an optimization we could theoretically maintain 1032 * a linked list of discontinuous areas, but we would still 1033 * have to commit them separately so there isn't much 1034 * advantage to it except perhaps a bit of asynchronization. 1035 */ 1036 1037 if (bp->b_dirtyend > 0 && 1038 (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) { 1039 if (BUF_WRITE(bp) == EINTR) 1040 return (EINTR); 1041 goto again; 1042 } 1043 1044 /* 1045 * Check for valid write lease and get one as required. 1046 * In case getblk() and/or bwrite() delayed us. 1047 */ 1048 if ((nmp->nm_flag & NFSMNT_NQNFS) && 1049 NQNFS_CKINVALID(vp, np, ND_WRITE)) { 1050 do { 1051 error = nqnfs_getlease(vp, ND_WRITE, cred, td); 1052 } while (error == NQNFS_EXPIRED); 1053 if (error) { 1054 brelse(bp); 1055 break; 1056 } 1057 if (np->n_lrev != np->n_brev || 1058 (np->n_flag & NQNFSNONCACHE)) { 1059 brelse(bp); 1060 error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1); 1061 if (error) 1062 break; 1063 np->n_brev = np->n_lrev; 1064 goto again; 1065 } 1066 } 1067 1068 error = uiomove((char *)bp->b_data + on, n, uio); 1069 1070 /* 1071 * Since this block is being modified, it must be written 1072 * again and not just committed. Since write clustering does 1073 * not work for the stage 1 data write, only the stage 2 1074 * commit rpc, we have to clear B_CLUSTEROK as well. 1075 */ 1076 bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); 1077 1078 if (error) { 1079 bp->b_ioflags |= BIO_ERROR; 1080 brelse(bp); 1081 break; 1082 } 1083 1084 /* 1085 * Only update dirtyoff/dirtyend if not a degenerate 1086 * condition. 1087 */ 1088 if (n) { 1089 if (bp->b_dirtyend > 0) { 1090 bp->b_dirtyoff = min(on, bp->b_dirtyoff); 1091 bp->b_dirtyend = max((on + n), bp->b_dirtyend); 1092 } else { 1093 bp->b_dirtyoff = on; 1094 bp->b_dirtyend = on + n; 1095 } 1096 vfs_bio_set_validclean(bp, on, n); 1097 } 1098 1099 /* 1100 * If the lease is non-cachable or IO_SYNC do bwrite(). 1101 * 1102 * IO_INVAL appears to be unused. The idea appears to be 1103 * to turn off caching in this case. Very odd. XXX 1104 */ 1105 if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) { 1106 if (ioflag & IO_INVAL) 1107 bp->b_flags |= B_NOCACHE; 1108 error = BUF_WRITE(bp); 1109 if (error) 1110 break; 1111 if (np->n_flag & NQNFSNONCACHE) { 1112 error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1); 1113 if (error) 1114 break; 1115 } 1116 } else if ((n + on) == biosize && 1117 (nmp->nm_flag & NFSMNT_NQNFS) == 0) { 1118 bp->b_flags |= B_ASYNC; 1119 (void)nfs_writebp(bp, 0, 0); 1120 } else { 1121 bdwrite(bp); 1122 } 1123 } while (uio->uio_resid > 0 && n > 0); 1124 1125 if (haverslock) 1126 nfs_rsunlock(np, td); 1127 1128 return (error); 1129} 1130 1131/* 1132 * Get an nfs cache block. 1133 * 1134 * Allocate a new one if the block isn't currently in the cache 1135 * and return the block marked busy. If the calling process is 1136 * interrupted by a signal for an interruptible mount point, return 1137 * NULL. 1138 * 1139 * The caller must carefully deal with the possible B_INVAL state of 1140 * the buffer. nfs_doio() clears B_INVAL (and nfs_asyncio() clears it 1141 * indirectly), so synchronous reads can be issued without worrying about 1142 * the B_INVAL state. We have to be a little more careful when dealing 1143 * with writes (see comments in nfs_write()) when extending a file past 1144 * its EOF. 1145 */ 1146static struct buf * 1147nfs_getcacheblk(vp, bn, size, td) 1148 struct vnode *vp; 1149 daddr_t bn; 1150 int size; 1151 struct thread *td; 1152{ 1153 register struct buf *bp; 1154 struct mount *mp; 1155 struct nfsmount *nmp; 1156 1157 mp = vp->v_mount; 1158 nmp = VFSTONFS(mp); 1159 1160 if (nmp->nm_flag & NFSMNT_INT) { 1161 bp = getblk(vp, bn, size, PCATCH, 0); 1162 while (bp == (struct buf *)0) { 1163 if (nfs_sigintr(nmp, (struct nfsreq *)0, td->td_proc)) 1164 return ((struct buf *)0); 1165 bp = getblk(vp, bn, size, 0, 2 * hz); 1166 } 1167 } else { 1168 bp = getblk(vp, bn, size, 0, 0); 1169 } 1170 1171 if (vp->v_type == VREG) { 1172 int biosize; 1173 1174 biosize = mp->mnt_stat.f_iosize; 1175 bp->b_blkno = bn * (biosize / DEV_BSIZE); 1176 } 1177 return (bp); 1178} 1179 1180/* 1181 * Flush and invalidate all dirty buffers. If another process is already 1182 * doing the flush, just wait for completion. 1183 */ 1184int 1185nfs_vinvalbuf(vp, flags, cred, td, intrflg) 1186 struct vnode *vp; 1187 int flags; 1188 struct ucred *cred; 1189 struct thread *td; 1190 int intrflg; 1191{ 1192 register struct nfsnode *np = VTONFS(vp); 1193 struct nfsmount *nmp = VFSTONFS(vp->v_mount); 1194 int error = 0, slpflag, slptimeo; 1195 1196 if (vp->v_flag & VXLOCK) { 1197 return (0); 1198 } 1199 1200 if ((nmp->nm_flag & NFSMNT_INT) == 0) 1201 intrflg = 0; 1202 if (intrflg) { 1203 slpflag = PCATCH; 1204 slptimeo = 2 * hz; 1205 } else { 1206 slpflag = 0; 1207 slptimeo = 0; 1208 } 1209 /* 1210 * First wait for any other process doing a flush to complete. 1211 */ 1212 while (np->n_flag & NFLUSHINPROG) { 1213 np->n_flag |= NFLUSHWANT; 1214 error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval", 1215 slptimeo); 1216 if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, td->td_proc)) 1217 return (EINTR); 1218 } 1219 1220 /* 1221 * Now, flush as required. 1222 */ 1223 np->n_flag |= NFLUSHINPROG; 1224 error = vinvalbuf(vp, flags, cred, td, slpflag, 0); 1225 while (error) { 1226 if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, td->td_proc)) { 1227 np->n_flag &= ~NFLUSHINPROG; 1228 if (np->n_flag & NFLUSHWANT) { 1229 np->n_flag &= ~NFLUSHWANT; 1230 wakeup((caddr_t)&np->n_flag); 1231 } 1232 return (EINTR); 1233 } 1234 error = vinvalbuf(vp, flags, cred, td, 0, slptimeo); 1235 } 1236 np->n_flag &= ~(NMODIFIED | NFLUSHINPROG); 1237 if (np->n_flag & NFLUSHWANT) { 1238 np->n_flag &= ~NFLUSHWANT; 1239 wakeup((caddr_t)&np->n_flag); 1240 } 1241 return (0); 1242} 1243 1244/* 1245 * Initiate asynchronous I/O. Return an error if no nfsiods are available. 1246 * This is mainly to avoid queueing async I/O requests when the nfsiods 1247 * are all hung on a dead server. 1248 * 1249 * Note: nfs_asyncio() does not clear (BIO_ERROR|B_INVAL) but when the bp 1250 * is eventually dequeued by the async daemon, nfs_doio() *will*. 1251 */ 1252int 1253nfs_asyncio(bp, cred, td) 1254 register struct buf *bp; 1255 struct ucred *cred; 1256 struct thread *td; 1257{ 1258 struct nfsmount *nmp; 1259 int i; 1260 int gotiod; 1261 int slpflag = 0; 1262 int slptimeo = 0; 1263 int error; 1264 1265 /* 1266 * If no async daemons then return EIO to force caller to run the rpc 1267 * synchronously. 1268 */ 1269 if (nfs_numasync == 0) 1270 return (EIO); 1271 1272 nmp = VFSTONFS(bp->b_vp->v_mount); 1273 1274 /* 1275 * Commits are usually short and sweet so lets save some cpu and 1276 * leave the async daemons for more important rpc's (such as reads 1277 * and writes). 1278 */ 1279 if (bp->b_iocmd == BIO_WRITE && (bp->b_flags & B_NEEDCOMMIT) && 1280 (nmp->nm_bufqiods > nfs_numasync / 2)) { 1281 return(EIO); 1282 } 1283 1284again: 1285 if (nmp->nm_flag & NFSMNT_INT) 1286 slpflag = PCATCH; 1287 gotiod = FALSE; 1288 1289 /* 1290 * Find a free iod to process this request. 1291 */ 1292 for (i = 0; i < NFS_MAXASYNCDAEMON; i++) 1293 if (nfs_iodwant[i]) { 1294 /* 1295 * Found one, so wake it up and tell it which 1296 * mount to process. 1297 */ 1298 NFS_DPF(ASYNCIO, 1299 ("nfs_asyncio: waking iod %d for mount %p\n", 1300 i, nmp)); 1301 nfs_iodwant[i] = (struct proc *)0; 1302 nfs_iodmount[i] = nmp; 1303 nmp->nm_bufqiods++; 1304 wakeup((caddr_t)&nfs_iodwant[i]); 1305 gotiod = TRUE; 1306 break; 1307 } 1308 1309 /* 1310 * If none are free, we may already have an iod working on this mount 1311 * point. If so, it will process our request. 1312 */ 1313 if (!gotiod) { 1314 if (nmp->nm_bufqiods > 0) { 1315 NFS_DPF(ASYNCIO, 1316 ("nfs_asyncio: %d iods are already processing mount %p\n", 1317 nmp->nm_bufqiods, nmp)); 1318 gotiod = TRUE; 1319 } 1320 } 1321 1322 /* 1323 * If we have an iod which can process the request, then queue 1324 * the buffer. 1325 */ 1326 if (gotiod) { 1327 /* 1328 * Ensure that the queue never grows too large. We still want 1329 * to asynchronize so we block rather then return EIO. 1330 */ 1331 while (nmp->nm_bufqlen >= 2*nfs_numasync) { 1332 NFS_DPF(ASYNCIO, 1333 ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp)); 1334 nmp->nm_bufqwant = TRUE; 1335 error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO, 1336 "nfsaio", slptimeo); 1337 if (error) { 1338 if (nfs_sigintr(nmp, NULL, td ? td->td_proc : NULL)) 1339 return (EINTR); 1340 if (slpflag == PCATCH) { 1341 slpflag = 0; 1342 slptimeo = 2 * hz; 1343 } 1344 } 1345 /* 1346 * We might have lost our iod while sleeping, 1347 * so check and loop if nescessary. 1348 */ 1349 if (nmp->nm_bufqiods == 0) { 1350 NFS_DPF(ASYNCIO, 1351 ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp)); 1352 goto again; 1353 } 1354 } 1355 1356 if (bp->b_iocmd == BIO_READ) { 1357 if (bp->b_rcred == NOCRED && cred != NOCRED) { 1358 crhold(cred); 1359 bp->b_rcred = cred; 1360 } 1361 } else { 1362 bp->b_flags |= B_WRITEINPROG; 1363 if (bp->b_wcred == NOCRED && cred != NOCRED) { 1364 crhold(cred); 1365 bp->b_wcred = cred; 1366 } 1367 } 1368 1369 BUF_KERNPROC(bp); 1370 TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist); 1371 nmp->nm_bufqlen++; 1372 return (0); 1373 } 1374 1375 /* 1376 * All the iods are busy on other mounts, so return EIO to 1377 * force the caller to process the i/o synchronously. 1378 */ 1379 NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n")); 1380 return (EIO); 1381} 1382 1383/* 1384 * Do an I/O operation to/from a cache block. This may be called 1385 * synchronously or from an nfsiod. 1386 */ 1387int 1388nfs_doio(bp, cr, td) 1389 struct buf *bp; 1390 struct ucred *cr; 1391 struct thread *td; 1392{ 1393 struct uio *uiop; 1394 struct vnode *vp; 1395 struct nfsnode *np; 1396 struct nfsmount *nmp; 1397 int error = 0, iomode, must_commit = 0; 1398 struct uio uio; 1399 struct iovec io; 1400 struct proc *p = td?td->td_proc:NULL; 1401 1402 vp = bp->b_vp; 1403 np = VTONFS(vp); 1404 nmp = VFSTONFS(vp->v_mount); 1405 uiop = &uio; 1406 uiop->uio_iov = &io; 1407 uiop->uio_iovcnt = 1; 1408 uiop->uio_segflg = UIO_SYSSPACE; 1409 uiop->uio_td = td; 1410 1411 /* 1412 * clear BIO_ERROR and B_INVAL state prior to initiating the I/O. We 1413 * do this here so we do not have to do it in all the code that 1414 * calls us. 1415 */ 1416 bp->b_flags &= ~B_INVAL; 1417 bp->b_ioflags &= ~BIO_ERROR; 1418 1419 KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp)); 1420 1421 /* 1422 * Historically, paging was done with physio, but no more. 1423 */ 1424 if (bp->b_flags & B_PHYS) { 1425 /* 1426 * ...though reading /dev/drum still gets us here. 1427 */ 1428 io.iov_len = uiop->uio_resid = bp->b_bcount; 1429 /* mapping was done by vmapbuf() */ 1430 io.iov_base = bp->b_data; 1431 uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE; 1432 if (bp->b_iocmd == BIO_READ) { 1433 uiop->uio_rw = UIO_READ; 1434 nfsstats.read_physios++; 1435 error = nfs_readrpc(vp, uiop, cr); 1436 } else { 1437 int com; 1438 1439 iomode = NFSV3WRITE_DATASYNC; 1440 uiop->uio_rw = UIO_WRITE; 1441 nfsstats.write_physios++; 1442 error = nfs_writerpc(vp, uiop, cr, &iomode, &com); 1443 } 1444 if (error) { 1445 bp->b_ioflags |= BIO_ERROR; 1446 bp->b_error = error; 1447 } 1448 } else if (bp->b_iocmd == BIO_READ) { 1449 io.iov_len = uiop->uio_resid = bp->b_bcount; 1450 io.iov_base = bp->b_data; 1451 uiop->uio_rw = UIO_READ; 1452 switch (vp->v_type) { 1453 case VREG: 1454 uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE; 1455 nfsstats.read_bios++; 1456 error = nfs_readrpc(vp, uiop, cr); 1457 if (!error) { 1458 if (uiop->uio_resid) { 1459 /* 1460 * If we had a short read with no error, we must have 1461 * hit a file hole. We should zero-fill the remainder. 1462 * This can also occur if the server hits the file EOF. 1463 * 1464 * Holes used to be able to occur due to pending 1465 * writes, but that is not possible any longer. 1466 */ 1467 int nread = bp->b_bcount - uiop->uio_resid; 1468 int left = bp->b_bcount - nread; 1469 1470 if (left > 0) 1471 bzero((char *)bp->b_data + nread, left); 1472 uiop->uio_resid = 0; 1473 } 1474 } 1475 if (p && (vp->v_flag & VTEXT) && 1476 (((nmp->nm_flag & NFSMNT_NQNFS) && 1477 NQNFS_CKINVALID(vp, np, ND_READ) && 1478 np->n_lrev != np->n_brev) || 1479 (!(nmp->nm_flag & NFSMNT_NQNFS) && 1480 np->n_mtime != np->n_vattr.va_mtime.tv_sec))) { 1481 uprintf("Process killed due to text file modification\n"); 1482 PROC_LOCK(p); 1483 psignal(p, SIGKILL); 1484 _PHOLD(p); 1485 PROC_UNLOCK(p); 1486 } 1487 break; 1488 case VLNK: 1489 uiop->uio_offset = (off_t)0; 1490 nfsstats.readlink_bios++; 1491 error = nfs_readlinkrpc(vp, uiop, cr); 1492 break; 1493 case VDIR: 1494 nfsstats.readdir_bios++; 1495 uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ; 1496 if (nmp->nm_flag & NFSMNT_RDIRPLUS) { 1497 error = nfs_readdirplusrpc(vp, uiop, cr); 1498 if (error == NFSERR_NOTSUPP) 1499 nmp->nm_flag &= ~NFSMNT_RDIRPLUS; 1500 } 1501 if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0) 1502 error = nfs_readdirrpc(vp, uiop, cr); 1503 /* 1504 * end-of-directory sets B_INVAL but does not generate an 1505 * error. 1506 */ 1507 if (error == 0 && uiop->uio_resid == bp->b_bcount) 1508 bp->b_flags |= B_INVAL; 1509 break; 1510 default: 1511 printf("nfs_doio: type %x unexpected\n",vp->v_type); 1512 break; 1513 }; 1514 if (error) { 1515 bp->b_ioflags |= BIO_ERROR; 1516 bp->b_error = error; 1517 } 1518 } else { 1519 /* 1520 * If we only need to commit, try to commit 1521 */ 1522 if (bp->b_flags & B_NEEDCOMMIT) { 1523 int retv; 1524 off_t off; 1525 1526 off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; 1527 bp->b_flags |= B_WRITEINPROG; 1528 retv = nfs_commit( 1529 bp->b_vp, off, bp->b_dirtyend-bp->b_dirtyoff, 1530 bp->b_wcred, td); 1531 bp->b_flags &= ~B_WRITEINPROG; 1532 if (retv == 0) { 1533 bp->b_dirtyoff = bp->b_dirtyend = 0; 1534 bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); 1535 bp->b_resid = 0; 1536 bufdone(bp); 1537 return (0); 1538 } 1539 if (retv == NFSERR_STALEWRITEVERF) { 1540 nfs_clearcommit(bp->b_vp->v_mount); 1541 } 1542 } 1543 1544 /* 1545 * Setup for actual write 1546 */ 1547 1548 if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size) 1549 bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE; 1550 1551 if (bp->b_dirtyend > bp->b_dirtyoff) { 1552 io.iov_len = uiop->uio_resid = bp->b_dirtyend 1553 - bp->b_dirtyoff; 1554 uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE 1555 + bp->b_dirtyoff; 1556 io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; 1557 uiop->uio_rw = UIO_WRITE; 1558 nfsstats.write_bios++; 1559 1560 if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC) 1561 iomode = NFSV3WRITE_UNSTABLE; 1562 else 1563 iomode = NFSV3WRITE_FILESYNC; 1564 1565 bp->b_flags |= B_WRITEINPROG; 1566 error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit); 1567 1568 /* 1569 * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try 1570 * to cluster the buffers needing commit. This will allow 1571 * the system to submit a single commit rpc for the whole 1572 * cluster. We can do this even if the buffer is not 100% 1573 * dirty (relative to the NFS blocksize), so we optimize the 1574 * append-to-file-case. 1575 * 1576 * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be 1577 * cleared because write clustering only works for commit 1578 * rpc's, not for the data portion of the write). 1579 */ 1580 1581 if (!error && iomode == NFSV3WRITE_UNSTABLE) { 1582 bp->b_flags |= B_NEEDCOMMIT; 1583 if (bp->b_dirtyoff == 0 1584 && bp->b_dirtyend == bp->b_bcount) 1585 bp->b_flags |= B_CLUSTEROK; 1586 } else { 1587 bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); 1588 } 1589 bp->b_flags &= ~B_WRITEINPROG; 1590 1591 /* 1592 * For an interrupted write, the buffer is still valid 1593 * and the write hasn't been pushed to the server yet, 1594 * so we can't set BIO_ERROR and report the interruption 1595 * by setting B_EINTR. For the B_ASYNC case, B_EINTR 1596 * is not relevant, so the rpc attempt is essentially 1597 * a noop. For the case of a V3 write rpc not being 1598 * committed to stable storage, the block is still 1599 * dirty and requires either a commit rpc or another 1600 * write rpc with iomode == NFSV3WRITE_FILESYNC before 1601 * the block is reused. This is indicated by setting 1602 * the B_DELWRI and B_NEEDCOMMIT flags. 1603 * 1604 * If the buffer is marked B_PAGING, it does not reside on 1605 * the vp's paging queues so we cannot call bdirty(). The 1606 * bp in this case is not an NFS cache block so we should 1607 * be safe. XXX 1608 */ 1609 if (error == EINTR 1610 || (!error && (bp->b_flags & B_NEEDCOMMIT))) { 1611 int s; 1612 1613 s = splbio(); 1614 bp->b_flags &= ~(B_INVAL|B_NOCACHE); 1615 if ((bp->b_flags & B_PAGING) == 0) { 1616 bdirty(bp); 1617 bp->b_flags &= ~B_DONE; 1618 } 1619 if (error && (bp->b_flags & B_ASYNC) == 0) 1620 bp->b_flags |= B_EINTR; 1621 splx(s); 1622 } else { 1623 if (error) { 1624 bp->b_ioflags |= BIO_ERROR; 1625 bp->b_error = np->n_error = error; 1626 np->n_flag |= NWRITEERR; 1627 } 1628 bp->b_dirtyoff = bp->b_dirtyend = 0; 1629 } 1630 } else { 1631 bp->b_resid = 0; 1632 bufdone(bp); 1633 return (0); 1634 } 1635 } 1636 bp->b_resid = uiop->uio_resid; 1637 if (must_commit) 1638 nfs_clearcommit(vp->v_mount); 1639 bufdone(bp); 1640 return (error); 1641} 1642