/* nfs_bio.c, revision 75858 */
1/* 2 * Copyright (c) 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * Rick Macklem at The University of Guelph. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. All advertising materials mentioning features or use of this software 17 * must display the following acknowledgement: 18 * This product includes software developed by the University of 19 * California, Berkeley and its contributors. 20 * 4. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 
 *
 * @(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 * $FreeBSD: head/sys/nfsclient/nfs_bio.c 75858 2001-04-23 09:05:15Z grog $
 */


#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <net/radix.h>
#include <sys/socket.h>
#include <sys/mount.h>
#include <sys/kernel.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nqnfs.h>
#include <nfs/nfsnode.h>

/*
 * Just call nfs_writebp() with the force argument set to 1.
 *
 * NOTE: B_DONE may or may not be set in a_bp on call.
 */
static int
nfs_bwrite(struct buf *bp)
{
	return (nfs_writebp(bp, 1, curproc));
}

/*
 * Buffer-ops vector installed on NFS-owned buffers (see nfs_write(), which
 * sets bp->b_op = &buf_ops_nfs) so that generic BUF_WRITE()s are routed
 * through nfs_writebp().
 */
struct buf_ops buf_ops_nfs = {
	"buf_ops_nfs",
	nfs_bwrite
};


static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
					struct proc *p));

extern int nfs_numasync;
extern int nfs_pbuf_freecnt;
extern struct nfsstats nfsstats;

/*
 * Vnode op for VM getpages.
 */
int
nfs_getpages(ap)
	struct vop_getpages_args /* {
		struct vnode *a_vp;
		vm_page_t *a_m;
		int a_count;
		int a_reqpage;
		vm_ooffset_t a_offset;
	} */ *ap;
{
	int i, error, nextoff, size, toff, count, npages;
	struct uio uio;
	struct iovec iov;
	vm_offset_t kva;
	struct buf *bp;
	struct vnode *vp;
	struct proc *p;
	struct ucred *cred;
	struct nfsmount *nmp;
	vm_page_t *pages;

	vp = ap->a_vp;
	p = curproc;				/* XXX */
	cred = curproc->p_ucred;		/* XXX */
	nmp = VFSTONFS(vp->v_mount);
	pages = ap->a_m;
	count = ap->a_count;

	if (vp->v_object == NULL) {
		printf("nfs_getpages: called with non-merged cache vnode??\n");
		return VM_PAGER_ERROR;
	}

	/* Fetch server fsinfo once for NFSv3 mounts that don't have it yet. */
	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
		(void)nfs_fsinfo(nmp, vp, cred, p);

	npages = btoc(count);

	/*
	 * If the requested page is partially valid, just return it and
	 * allow the pager to zero-out the blanks.  Partially valid pages
	 * can only occur at the file EOF.
	 */

	{
		vm_page_t m = pages[ap->a_reqpage];

		if (m->valid != 0) {
			/* handled by vm_fault now	  */
			/* vm_page_zero_invalid(m, TRUE); */
			for (i = 0; i < npages; ++i) {
				if (i != ap->a_reqpage)
					vm_page_free(pages[i]);
			}
			return(0);
		}
	}

	/*
	 * We use only the kva address for the buffer, but this is extremely
	 * convenient and fast.
	 */
	bp = getpbuf(&nfs_pbuf_freecnt);

	/* Map the pages contiguously into the pbuf's KVA for the read RPC. */
	kva = (vm_offset_t) bp->b_data;
	pmap_qenter(kva, pages, npages);

	iov.iov_base = (caddr_t) kva;
	iov.iov_len = count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
	uio.uio_resid = count;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_READ;
	uio.uio_procp = p;

	error = nfs_readrpc(vp, &uio, cred);
	pmap_qremove(kva, npages);

	relpbuf(bp, &nfs_pbuf_freecnt);

	/*
	 * NOTE(review): only a total failure (error with *no* data
	 * transferred) is fatal here; a partial read falls through and
	 * validates whatever data did arrive.
	 */
	if (error && (uio.uio_resid == count)) {
		printf("nfs_getpages: error %d\n", error);
		for (i = 0; i < npages; ++i) {
			if (i != ap->a_reqpage)
				vm_page_free(pages[i]);
		}
		return VM_PAGER_ERROR;
	}

	/*
	 * Calculate the number of bytes read and validate only that number
	 * of bytes.  Note that due to pending writes, size may be 0.  This
	 * does not mean that the remaining data is invalid!
	 */

	size = count - uio.uio_resid;

	for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
		vm_page_t m;
		nextoff = toff + PAGE_SIZE;
		m = pages[i];

		m->flags &= ~PG_ZERO;

		if (nextoff <= size) {
			/*
			 * Read operation filled an entire page
			 */
			m->valid = VM_PAGE_BITS_ALL;
			vm_page_undirty(m);
		} else if (size > toff) {
			/*
			 * Read operation filled a partial page.
			 */
			m->valid = 0;
			vm_page_set_validclean(m, 0, size - toff);
			/* handled by vm_fault now	  */
			/* vm_page_zero_invalid(m, TRUE); */
		}

		if (i != ap->a_reqpage) {
			/*
			 * Whether or not to leave the page activated is up in
			 * the air, but we should put the page on a page queue
			 * somewhere (it already is in the object).  Result:
			 * It appears that empirical results show that
			 * deactivating pages is best.
			 */

			/*
			 * Just in case someone was asking for this page we
			 * now tell them that it is ok to use.
			 */
			if (!error) {
				if (m->flags & PG_WANTED)
					vm_page_activate(m);
				else
					vm_page_deactivate(m);
				vm_page_wakeup(m);
			} else {
				vm_page_free(m);
			}
		}
	}
	return 0;
}

/*
 * Vnode op for VM putpages.
 */
int
nfs_putpages(ap)
	struct vop_putpages_args /* {
		struct vnode *a_vp;
		vm_page_t *a_m;
		int a_count;
		int a_sync;
		int *a_rtvals;
		vm_ooffset_t a_offset;
	} */ *ap;
{
	struct uio uio;
	struct iovec iov;
	vm_offset_t kva;
	struct buf *bp;
	int iomode, must_commit, i, error, npages, count;
	off_t offset;
	int *rtvals;
	struct vnode *vp;
	struct proc *p;
	struct ucred *cred;
	struct nfsmount *nmp;
	struct nfsnode *np;
	vm_page_t *pages;

	vp = ap->a_vp;
	np = VTONFS(vp);
	p = curproc;				/* XXX */
	cred = curproc->p_ucred;		/* XXX */
	nmp = VFSTONFS(vp->v_mount);
	pages = ap->a_m;
	count = ap->a_count;
	rtvals = ap->a_rtvals;
	npages = btoc(count);
	offset = IDX_TO_OFF(pages[0]->pindex);

	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
		(void)nfs_fsinfo(nmp, vp, cred, p);

	/* Default every page to "try again"; success overwrites below. */
	for (i = 0; i < npages; i++) {
		rtvals[i] = VM_PAGER_AGAIN;
	}

	/*
	 * When putting pages, do not extend file past EOF.
	 */

	if (offset + count > np->n_size) {
		count = np->n_size - offset;
		if (count < 0)
			count = 0;
	}

	/*
	 * We use only the kva address for the buffer, but this is extremely
	 * convenient and fast.
	 */
	bp = getpbuf(&nfs_pbuf_freecnt);

	kva = (vm_offset_t) bp->b_data;
	pmap_qenter(kva, pages, npages);

	iov.iov_base = (caddr_t) kva;
	iov.iov_len = count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = offset;
	uio.uio_resid = count;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_WRITE;
	uio.uio_procp = p;

	/* Unstable write unless the pager demands a synchronous put. */
	if ((ap->a_sync & VM_PAGER_PUT_SYNC) == 0)
	    iomode = NFSV3WRITE_UNSTABLE;
	else
	    iomode = NFSV3WRITE_FILESYNC;

	error = nfs_writerpc(vp, &uio, cred, &iomode, &must_commit);

	pmap_qremove(kva, npages);
	relpbuf(bp, &nfs_pbuf_freecnt);

	if (!error) {
		int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE;
		for (i = 0; i < nwritten; i++) {
			rtvals[i] = VM_PAGER_OK;
			vm_page_undirty(pages[i]);
		}
		if (must_commit)
			nfs_clearcommit(vp->v_mount);
	}
	return rtvals[0];
}

/*
 * Vnode op for read using bio.
 *
 * Reads VREG, VLNK and VDIR vnodes through the buffer cache, doing
 * read-ahead where the mount allows it.
 */
int
nfs_bioread(vp, uio, ioflag, cred)
	register struct vnode *vp;
	register struct uio *uio;
	int ioflag;
	struct ucred *cred;
{
	register struct nfsnode *np = VTONFS(vp);
	register int biosize, i;
	struct buf *bp = 0, *rabp;
	struct vattr vattr;
	struct proc *p;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn, rabn;
	int bcount;
	int seqcount;
	int nra, error = 0, n = 0, on = 0;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_offset < 0)	/* XXX VDIR cookies can be negative */
		return (EINVAL);
	p = uio->uio_procp;

	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	if (vp->v_type != VDIR &&
	    (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	biosize = vp->v_mount->mnt_stat.f_iosize;
	/* Sequential-access hint is packed into the high bits of ioflag. */
	seqcount = (int)((off_t)(ioflag >> 16) * biosize / BKVASIZE);
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nqnfs, full cache consistency is maintained within the loop.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * NB: This implies that cache data can be read when up to
	 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
	 * attributes this could be forced by setting n_attrstamp to 0 before
	 * the VOP_GETATTR() call.
	 */
	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
		if (np->n_flag & NMODIFIED) {
			if (vp->v_type != VREG) {
				if (vp->v_type != VDIR)
					panic("nfs: bioread, not dir");
				nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			np->n_mtime = vattr.va_mtime.tv_sec;
		} else {
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			if (np->n_mtime != vattr.va_mtime.tv_sec) {
				if (vp->v_type == VDIR)
					nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_mtime = vattr.va_mtime.tv_sec;
			}
		}
	}
	do {

	    /*
	     * Get a valid lease. If cached data is stale, flush it.
	     */
	    if (nmp->nm_flag & NFSMNT_NQNFS) {
		if (NQNFS_CKINVALID(vp, np, ND_READ)) {
		    do {
			error = nqnfs_getlease(vp, ND_READ, cred, p);
		    } while (error == NQNFS_EXPIRED);
		    if (error)
			return (error);
		    if (np->n_lrev != np->n_brev ||
			(np->n_flag & NQNFSNONCACHE) ||
			((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
			if (vp->v_type == VDIR)
			    nfs_invaldir(vp);
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
			    return (error);
			np->n_brev = np->n_lrev;
		    }
		} else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
		    nfs_invaldir(vp);
		    error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
		    if (error)
			return (error);
		}
	    }
	    /* Non-cachable lease: bypass the buffer cache entirely. */
	    if (np->n_flag & NQNFSNONCACHE) {
		switch (vp->v_type) {
		case VREG:
			return (nfs_readrpc(vp, uio, cred));
		case VLNK:
			return (nfs_readlinkrpc(vp, uio, cred));
		case VDIR:
			break;
		default:
			printf(" NQNFSNONCACHE: type %x unexpected\n",
				vp->v_type);
		};
	    }
	    switch (vp->v_type) {
	    case VREG:
		nfsstats.biocache_reads++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);

		/*
		 * Start the read ahead(s), as required.
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
		    for (nra = 0; nra < nmp->nm_readahead && nra < seqcount &&
			(off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
			rabn = lbn + 1 + nra;
			if (!incore(vp, rabn)) {
			    rabp = nfs_getcacheblk(vp, rabn, biosize, p);
			    if (!rabp)
				return (EINTR);
			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
				rabp->b_flags |= B_ASYNC;
				rabp->b_iocmd = BIO_READ;
				vfs_busy_pages(rabp, 0);
				if (nfs_asyncio(rabp, cred, p)) {
				    rabp->b_flags |= B_INVAL;
				    rabp->b_ioflags |= BIO_ERROR;
				    vfs_unbusy_pages(rabp);
				    brelse(rabp);
				    break;
				}
			    } else {
				brelse(rabp);
			    }
			}
		    }
		}

		/*
		 * Obtain the buffer cache block.  Figure out the buffer size
		 * when we are at EOF.  If we are modifying the size of the
		 * buffer based on an EOF condition we need to hold
		 * nfs_rslock() through obtaining the buffer to prevent
		 * a potential writer-appender from messing with n_size.
		 * Otherwise we may accidentally truncate the buffer and
		 * lose dirty data.
		 *
		 * Note that bcount is *not* DEV_BSIZE aligned.
		 */

again:
		bcount = biosize;
		if ((off_t)lbn * biosize >= np->n_size) {
			bcount = 0;
		} else if ((off_t)(lbn + 1) * biosize > np->n_size) {
			bcount = np->n_size - (off_t)lbn * biosize;
		}
		if (bcount != biosize) {
			switch(nfs_rslock(np, p)) {
			case ENOLCK:
				goto again;
				/* not reached */
			case EINTR:
			case ERESTART:
				return(EINTR);
				/* not reached */
			default:
				break;
			}
		}

		bp = nfs_getcacheblk(vp, lbn, bcount, p);

		if (bcount != biosize)
			nfs_rsunlock(np, p);
		if (!bp)
			return (EINTR);

		/*
		 * If B_CACHE is not set, we must issue the read.  If this
		 * fails, we return an error.
		 */

		if ((bp->b_flags & B_CACHE) == 0) {
		    bp->b_iocmd = BIO_READ;
		    vfs_busy_pages(bp, 0);
		    error = nfs_doio(bp, cred, p);
		    if (error) {
			brelse(bp);
			return (error);
		    }
		}

		/*
		 * on is the offset into the current bp.  Figure out how many
		 * bytes we can copy out of the bp.  Note that bcount is
		 * NOT DEV_BSIZE aligned.
		 *
		 * Then figure out how many bytes we can copy into the uio.
		 */

		n = 0;
		if (on < bcount)
			n = min((unsigned)(bcount - on), uio->uio_resid);
		break;
	    case VLNK:
		nfsstats.biocache_readlinks++;
		bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p);
		if (!bp)
			return (EINTR);
		if ((bp->b_flags & B_CACHE) == 0) {
		    bp->b_iocmd = BIO_READ;
		    vfs_busy_pages(bp, 0);
		    error = nfs_doio(bp, cred, p);
		    if (error) {
			bp->b_ioflags |= BIO_ERROR;
			brelse(bp);
			return (error);
		    }
		}
		n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
		on = 0;
		break;
	    case VDIR:
		nfsstats.biocache_readdirs++;
		if (np->n_direofoffset
		    && uio->uio_offset >= np->n_direofoffset) {
		    return (0);
		}
		lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ;
		on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
		bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p);
		if (!bp)
		    return (EINTR);
		if ((bp->b_flags & B_CACHE) == 0) {
		    bp->b_iocmd = BIO_READ;
		    vfs_busy_pages(bp, 0);
		    error = nfs_doio(bp, cred, p);
		    if (error) {
			    brelse(bp);
		    }
		    while (error == NFSERR_BAD_COOKIE) {
			printf("got bad cookie vp %p bp %p\n", vp, bp);
			nfs_invaldir(vp);
			error = nfs_vinvalbuf(vp, 0, cred, p, 1);
			/*
			 * Yuck! The directory has been modified on the
			 * server. The only way to get the block is by
			 * reading from the beginning to get all the
			 * offset cookies.
			 *
			 * Leave the last bp intact unless there is an error.
			 * Loop back up to the while if the error is another
			 * NFSERR_BAD_COOKIE (double yuch!).
			 */
			for (i = 0; i <= lbn && !error; i++) {
			    if (np->n_direofoffset
				&& (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
				    return (0);
			    bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p);
			    if (!bp)
				return (EINTR);
			    if ((bp->b_flags & B_CACHE) == 0) {
				    bp->b_iocmd = BIO_READ;
				    vfs_busy_pages(bp, 0);
				    error = nfs_doio(bp, cred, p);
				    /*
				     * no error + B_INVAL == directory EOF,
				     * use the block.
				     */
				    if (error == 0 && (bp->b_flags & B_INVAL))
					    break;
			    }
			    /*
			     * An error will throw away the block and the
			     * for loop will break out.  If no error and this
			     * is not the block we want, we throw away the
			     * block and go for the next one via the for loop.
			     */
			    if (error || i < lbn)
				    brelse(bp);
			}
		    }
		    /*
		     * The above while is repeated if we hit another cookie
		     * error.  If we hit an error and it wasn't a cookie error,
		     * we give up.
		     */
		    if (error)
			    return (error);
		}

		/*
		 * If not eof and read aheads are enabled, start one.
		 * (You need the current block first, so that you have the
		 *  directory offset cookie of the next block.)
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
		    (bp->b_flags & B_INVAL) == 0 &&
		    (np->n_direofoffset == 0 ||
		    (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
		    !(np->n_flag & NQNFSNONCACHE) &&
		    !incore(vp, lbn + 1)) {
			rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p);
			if (rabp) {
			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
				rabp->b_flags |= B_ASYNC;
				rabp->b_iocmd = BIO_READ;
				vfs_busy_pages(rabp, 0);
				if (nfs_asyncio(rabp, cred, p)) {
				    rabp->b_flags |= B_INVAL;
				    rabp->b_ioflags |= BIO_ERROR;
				    vfs_unbusy_pages(rabp);
				    brelse(rabp);
				}
			    } else {
				brelse(rabp);
			    }
			}
		}
		/*
		 * Unlike VREG files, whose buffer size ( bp->b_bcount ) is
		 * chopped for the EOF condition, we cannot tell how large
		 * NFS directories are going to be until we hit EOF.  So
		 * an NFS directory buffer is *not* chopped to its EOF.  Now,
		 * it just so happens that b_resid will effectively chop it
		 * to EOF.  *BUT* this information is lost if the buffer goes
		 * away and is reconstituted into a B_CACHE state ( due to
		 * being VMIO ) later.  So we keep track of the directory eof
		 * in np->n_direofoffset and chop it off as an extra step
		 * right here.
		 */
		n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
		if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset)
			n = np->n_direofoffset - uio->uio_offset;
		break;
	    default:
		printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
		break;
	    };

	    if (n > 0) {
		    error = uiomove(bp->b_data + on, (int)n, uio);
	    }
	    switch (vp->v_type) {
	    case VREG:
		break;
	    case VLNK:
		/* Symlinks are read in one shot; force loop termination. */
		n = 0;
		break;
	    case VDIR:
		/*
		 * Invalidate buffer if caching is disabled, forcing a
		 * re-read from the remote later.
		 */
		if (np->n_flag & NQNFSNONCACHE)
			bp->b_flags |= B_INVAL;
		break;
	    default:
		printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
	    }
	    brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}

/*
 * Vnode op for write using bio
 */
int
nfs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	int biosize;
	struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn;
	int bcount;
	int n, on, error = 0, iomode, must_commit;
	int haverslock = 0;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	/* Report (and clear) any write error left behind by async I/O. */
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
		(void)nfs_fsinfo(nmp, vp, cred, p);

	/*
	 * Synchronously flush pending buffers
if we are in synchronous
	 * mode or if we are appending.
	 */
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
				return (error);
		}
	}

	/*
	 * If IO_APPEND then load uio_offset.  We restart here if we cannot
	 * get the append lock.
	 */
restart:
	if (ioflag & IO_APPEND) {
		np->n_attrstamp = 0;
		error = VOP_GETATTR(vp, &vattr, cred, p);
		if (error)
			return (error);
		uio->uio_offset = np->n_size;
	}

	if (uio->uio_offset < 0)
		return (EINVAL);
	if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	if (uio->uio_resid == 0)
		return (0);

	/*
	 * We need to obtain the rslock if we intend to modify np->n_size
	 * in order to guarantee the append point with multiple contending
	 * writers, to guarantee that no other appenders modify n_size
	 * while we are trying to obtain a truncated buffer (i.e. to avoid
	 * accidentally truncating data written by another appender due to
	 * the race), and to ensure that the buffer is populated prior to
	 * our extending of the file.  We hold rslock through the entire
	 * operation.
	 *
	 * Note that we do not synchronize the case where someone truncates
	 * the file while we are appending to it because attempting to lock
	 * this case may deadlock other parts of the system unexpectedly.
	 */
	if ((ioflag & IO_APPEND) ||
	    uio->uio_offset + uio->uio_resid > np->n_size) {
		switch(nfs_rslock(np, p)) {
		case ENOLCK:
			goto restart;
			/* not reached */
		case EINTR:
		case ERESTART:
			return(EINTR);
			/* not reached */
		default:
			break;
		}
		haverslock = 1;
	}

	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, i don't think it matters
	 */
	if (p && uio->uio_offset + uio->uio_resid >
	      p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		PROC_LOCK(p);
		psignal(p, SIGXFSZ);
		PROC_UNLOCK(p);
		if (haverslock)
			nfs_rsunlock(np, p);
		return (EFBIG);
	}

	biosize = vp->v_mount->mnt_stat.f_iosize;

	/* Main loop: one buffer-cache block per iteration until uio drains. */
	do {
		/*
		 * Check for a valid write lease.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error)
				break;
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					break;
				np->n_brev = np->n_lrev;
			}
		}
		if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
		    iomode = NFSV3WRITE_FILESYNC;
		    error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
		    if (must_commit)
			    nfs_clearcommit(vp->v_mount);
		    break;
		}
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize-1);
		n = min((unsigned)(biosize - on), uio->uio_resid);
again:
		/*
		 * Handle direct append and file extension cases, calculate
		 * unaligned buffer size.
		 */

		if (uio->uio_offset == np->n_size && n) {
			/*
			 * Get the buffer (in its pre-append state to maintain
			 * B_CACHE if it was previously set).  Resize the
			 * nfsnode after we have locked the buffer to prevent
			 * readers from reading garbage.
			 */
			bcount = on;
			bp = nfs_getcacheblk(vp, lbn, bcount, p);

			if (bp != NULL) {
				long save;

				np->n_size = uio->uio_offset + n;
				np->n_flag |= NMODIFIED;
				vnode_pager_setsize(vp, np->n_size);

				/* Preserve B_CACHE across the allocbuf(). */
				save = bp->b_flags & B_CACHE;
				bcount += n;
				allocbuf(bp, bcount);
				bp->b_flags |= save;
				bp->b_magic = B_MAGIC_NFS;
				bp->b_op = &buf_ops_nfs;
			}
		} else {
			/*
			 * Obtain the locked cache block first, and then
			 * adjust the file's size as appropriate.
			 */
			bcount = on + n;
			if ((off_t)lbn * biosize + bcount < np->n_size) {
				if ((off_t)(lbn + 1) * biosize < np->n_size)
					bcount = biosize;
				else
					bcount = np->n_size - (off_t)lbn * biosize;
			}

			bp = nfs_getcacheblk(vp, lbn, bcount, p);

			if (uio->uio_offset + n > np->n_size) {
				np->n_size = uio->uio_offset + n;
				np->n_flag |= NMODIFIED;
				vnode_pager_setsize(vp, np->n_size);
			}
		}

		if (!bp) {
			error = EINTR;
			break;
		}

		/*
		 * Issue a READ if B_CACHE is not set.  In special-append
		 * mode, B_CACHE is based on the buffer prior to the write
		 * op and is typically set, avoiding the read.  If a read
		 * is required in special append mode, the server will
		 * probably send us a short-read since we extended the file
		 * on our end, resulting in b_resid == 0 and, thusly,
		 * B_CACHE getting set.
		 *
		 * We can also avoid issuing the read if the write covers
		 * the entire buffer.  We have to make sure the buffer state
		 * is reasonable in this case since we will not be initiating
		 * I/O.  See the comments in kern/vfs_bio.c's getblk() for
		 * more information.
		 *
		 * B_CACHE may also be set due to the buffer being cached
		 * normally.
		 */

		if (on == 0 && n == bcount) {
			bp->b_flags |= B_CACHE;
			bp->b_flags &= ~B_INVAL;
			bp->b_ioflags &= ~BIO_ERROR;
		}

		if ((bp->b_flags & B_CACHE) == 0) {
			bp->b_iocmd = BIO_READ;
			vfs_busy_pages(bp, 0);
			error = nfs_doio(bp, cred, p);
			if (error) {
				brelse(bp);
				break;
			}
		}
		if (!bp) {
			error = EINTR;
			break;
		}
		if (bp->b_wcred == NOCRED) {
			crhold(cred);
			bp->b_wcred = cred;
		}
		np->n_flag |= NMODIFIED;

		/*
		 * If dirtyend exceeds file size, chop it down.  This should
		 * not normally occur but there is an append race where it
		 * might occur XXX, so we log it.
		 *
		 * If the chopping creates a reverse-indexed or degenerate
		 * situation with dirtyoff/end, we 0 both of them.
		 */

		if (bp->b_dirtyend > bcount) {
			printf("NFS append race @%lx:%d\n",
			    (long)bp->b_blkno * DEV_BSIZE,
			    bp->b_dirtyend - bcount);
			bp->b_dirtyend = bcount;
		}

		if (bp->b_dirtyoff >= bp->b_dirtyend)
			bp->b_dirtyoff = bp->b_dirtyend = 0;

		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise force a write rpc of the old dirty area.
		 *
		 * While it is possible to merge discontiguous writes due to
		 * our having a B_CACHE buffer ( and thus valid read data
		 * for the hole), we don't because it could lead to
		 * significant cache coherency problems with multiple clients,
		 * especially if locking is implemented later on.
		 *
		 * as an optimization we could theoretically maintain
		 * a linked list of discontinuous areas, but we would still
		 * have to commit them separately so there isn't much
		 * advantage to it except perhaps a bit of asynchronization.
		 */

		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			if (BUF_WRITE(bp) == EINTR)
				return (EINTR);
			goto again;
		}

		/*
		 * Check for valid write lease and get one as required.
		 * In case getblk() and/or bwrite() delayed us.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error) {
				brelse(bp);
				break;
			}
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				brelse(bp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					break;
				np->n_brev = np->n_lrev;
				goto again;
			}
		}

		error = uiomove((char *)bp->b_data + on, n, uio);

		/*
		 * Since this block is being modified, it must be written
		 * again and not just committed.  Since write clustering does
		 * not work for the stage 1 data write, only the stage 2
		 * commit rpc, we have to clear B_CLUSTEROK as well.
		 */
		bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);

		if (error) {
			bp->b_ioflags |= BIO_ERROR;
			brelse(bp);
			break;
		}

		/*
		 * Only update dirtyoff/dirtyend if not a degenerate
		 * condition.
		 */
		if (n) {
			if (bp->b_dirtyend > 0) {
				bp->b_dirtyoff = min(on, bp->b_dirtyoff);
				bp->b_dirtyend = max((on + n), bp->b_dirtyend);
			} else {
				bp->b_dirtyoff = on;
				bp->b_dirtyend = on + n;
			}
			vfs_bio_set_validclean(bp, on, n);
		}

		/*
		 * If the lease is non-cachable or IO_SYNC do bwrite().
		 *
		 * IO_INVAL appears to be unused.  The idea appears to be
		 * to turn off caching in this case.  Very odd.  XXX
		 */
		if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
			if (ioflag & IO_INVAL)
				bp->b_flags |= B_NOCACHE;
			error = BUF_WRITE(bp);
			if (error)
				break;
			if (np->n_flag & NQNFSNONCACHE) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					break;
			}
		} else if ((n + on) == biosize &&
			(nmp->nm_flag & NFSMNT_NQNFS) == 0) {
			/* Full block: kick off an async write immediately. */
			bp->b_flags |= B_ASYNC;
			(void)nfs_writebp(bp, 0, 0);
		} else {
			/* Partial block: leave it delayed-write. */
			bdwrite(bp);
		}
	} while (uio->uio_resid > 0 && n > 0);

	if (haverslock)
		nfs_rsunlock(np, p);

	return (error);
}

/*
 * Get an nfs cache block.
 *
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy.  If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 *
 * The caller must carefully deal with the possible B_INVAL state of
 * the buffer.  nfs_doio() clears B_INVAL (and nfs_asyncio() clears it
 * indirectly), so synchronous reads can be issued without worrying about
 * the B_INVAL state.  We have to be a little more careful when dealing
 * with writes (see comments in nfs_write()) when extending a file past
 * its EOF.
 */
static struct buf *
nfs_getcacheblk(vp, bn, size, p)
	struct vnode *vp;
	daddr_t bn;
	int size;
	struct proc *p;
{
	register struct buf *bp;
	struct mount *mp;
	struct nfsmount *nmp;

	mp = vp->v_mount;
	nmp = VFSTONFS(mp);

	if (nmp->nm_flag & NFSMNT_INT) {
		/*
		 * Interruptible mount: let getblk() catch signals, and poll
		 * for a pending signal between retries.
		 */
		bp = getblk(vp, bn, size, PCATCH, 0);
		while (bp == (struct buf *)0) {
			if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
				return ((struct buf *)0);
			bp = getblk(vp, bn, size, 0, 2 * hz);
		}
	} else {
		bp = getblk(vp, bn, size, 0, 0);
	}

	if (vp->v_type == VREG) {
		int biosize;

		/* Translate the logical block number into DEV_BSIZE units. */
		biosize = mp->mnt_stat.f_iosize;
		bp->b_blkno = bn * (biosize / DEV_BSIZE);
	}
	return (bp);
}

/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int intrflg;
{
	register struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;

	/* Vnode is being torn down; nothing to flush. */
	if (vp->v_flag & VXLOCK) {
		return (0);
	}

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}
	/*
	 * First wait for any other process doing a flush to complete.
	 */
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
			slptimeo);
		if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
			return (EINTR);
	}

	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
	while (error) {
		if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) {
			/* Interrupted: release the flush lock before bailing. */
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup((caddr_t)&np->n_flag);
			}
			return (EINTR);
		}
		/* Retry without PCATCH so the flush eventually completes. */
		error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
	}
	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
	return (0);
}

/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 *
 * Note: nfs_asyncio() does not clear (BIO_ERROR|B_INVAL) but when the bp
 * is eventually dequeued by the async daemon, nfs_doio() *will*.
 */
int
nfs_asyncio(bp, cred, procp)
	register struct buf *bp;
	struct ucred *cred;
	struct proc *procp;
{
	struct nfsmount *nmp;
	int i;
	int gotiod;
	int slpflag = 0;
	int slptimeo = 0;
	int error;

	/*
	 * If no async daemons then return EIO to force caller to run the rpc
	 * synchronously.
	 */
	if (nfs_numasync == 0)
		return (EIO);

	nmp = VFSTONFS(bp->b_vp->v_mount);

	/*
	 * Commits are usually short and sweet so lets save some cpu and
	 * leave the async daemons for more important rpc's (such as reads
	 * and writes).
	 */
	if (bp->b_iocmd == BIO_WRITE && (bp->b_flags & B_NEEDCOMMIT) &&
	    (nmp->nm_bufqiods > nfs_numasync / 2)) {
		return(EIO);
	}

again:
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	gotiod = FALSE;

	/*
	 * Find a free iod to process this request.
1276 */ 1277 for (i = 0; i < NFS_MAXASYNCDAEMON; i++) 1278 if (nfs_iodwant[i]) { 1279 /* 1280 * Found one, so wake it up and tell it which 1281 * mount to process. 1282 */ 1283 NFS_DPF(ASYNCIO, 1284 ("nfs_asyncio: waking iod %d for mount %p\n", 1285 i, nmp)); 1286 nfs_iodwant[i] = (struct proc *)0; 1287 nfs_iodmount[i] = nmp; 1288 nmp->nm_bufqiods++; 1289 wakeup((caddr_t)&nfs_iodwant[i]); 1290 gotiod = TRUE; 1291 break; 1292 } 1293 1294 /* 1295 * If none are free, we may already have an iod working on this mount 1296 * point. If so, it will process our request. 1297 */ 1298 if (!gotiod) { 1299 if (nmp->nm_bufqiods > 0) { 1300 NFS_DPF(ASYNCIO, 1301 ("nfs_asyncio: %d iods are already processing mount %p\n", 1302 nmp->nm_bufqiods, nmp)); 1303 gotiod = TRUE; 1304 } 1305 } 1306 1307 /* 1308 * If we have an iod which can process the request, then queue 1309 * the buffer. 1310 */ 1311 if (gotiod) { 1312 /* 1313 * Ensure that the queue never grows too large. We still want 1314 * to asynchronize so we block rather then return EIO. 1315 */ 1316 while (nmp->nm_bufqlen >= 2*nfs_numasync) { 1317 NFS_DPF(ASYNCIO, 1318 ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp)); 1319 nmp->nm_bufqwant = TRUE; 1320 error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO, 1321 "nfsaio", slptimeo); 1322 if (error) { 1323 if (nfs_sigintr(nmp, NULL, procp)) 1324 return (EINTR); 1325 if (slpflag == PCATCH) { 1326 slpflag = 0; 1327 slptimeo = 2 * hz; 1328 } 1329 } 1330 /* 1331 * We might have lost our iod while sleeping, 1332 * so check and loop if nescessary. 
1333 */ 1334 if (nmp->nm_bufqiods == 0) { 1335 NFS_DPF(ASYNCIO, 1336 ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp)); 1337 goto again; 1338 } 1339 } 1340 1341 if (bp->b_iocmd == BIO_READ) { 1342 if (bp->b_rcred == NOCRED && cred != NOCRED) { 1343 crhold(cred); 1344 bp->b_rcred = cred; 1345 } 1346 } else { 1347 bp->b_flags |= B_WRITEINPROG; 1348 if (bp->b_wcred == NOCRED && cred != NOCRED) { 1349 crhold(cred); 1350 bp->b_wcred = cred; 1351 } 1352 } 1353 1354 BUF_KERNPROC(bp); 1355 TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist); 1356 nmp->nm_bufqlen++; 1357 return (0); 1358 } 1359 1360 /* 1361 * All the iods are busy on other mounts, so return EIO to 1362 * force the caller to process the i/o synchronously. 1363 */ 1364 NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n")); 1365 return (EIO); 1366} 1367 1368/* 1369 * Do an I/O operation to/from a cache block. This may be called 1370 * synchronously or from an nfsiod. 1371 */ 1372int 1373nfs_doio(bp, cr, p) 1374 struct buf *bp; 1375 struct ucred *cr; 1376 struct proc *p; 1377{ 1378 struct uio *uiop; 1379 struct vnode *vp; 1380 struct nfsnode *np; 1381 struct nfsmount *nmp; 1382 int error = 0, iomode, must_commit = 0; 1383 struct uio uio; 1384 struct iovec io; 1385 1386 vp = bp->b_vp; 1387 np = VTONFS(vp); 1388 nmp = VFSTONFS(vp->v_mount); 1389 uiop = &uio; 1390 uiop->uio_iov = &io; 1391 uiop->uio_iovcnt = 1; 1392 uiop->uio_segflg = UIO_SYSSPACE; 1393 uiop->uio_procp = p; 1394 1395 /* 1396 * clear BIO_ERROR and B_INVAL state prior to initiating the I/O. We 1397 * do this here so we do not have to do it in all the code that 1398 * calls us. 1399 */ 1400 bp->b_flags &= ~B_INVAL; 1401 bp->b_ioflags &= ~BIO_ERROR; 1402 1403 KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp)); 1404 1405 /* 1406 * Historically, paging was done with physio, but no more. 
1407 */ 1408 if (bp->b_flags & B_PHYS) { 1409 /* 1410 * ...though reading /dev/drum still gets us here. 1411 */ 1412 io.iov_len = uiop->uio_resid = bp->b_bcount; 1413 /* mapping was done by vmapbuf() */ 1414 io.iov_base = bp->b_data; 1415 uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE; 1416 if (bp->b_iocmd == BIO_READ) { 1417 uiop->uio_rw = UIO_READ; 1418 nfsstats.read_physios++; 1419 error = nfs_readrpc(vp, uiop, cr); 1420 } else { 1421 int com; 1422 1423 iomode = NFSV3WRITE_DATASYNC; 1424 uiop->uio_rw = UIO_WRITE; 1425 nfsstats.write_physios++; 1426 error = nfs_writerpc(vp, uiop, cr, &iomode, &com); 1427 } 1428 if (error) { 1429 bp->b_ioflags |= BIO_ERROR; 1430 bp->b_error = error; 1431 } 1432 } else if (bp->b_iocmd == BIO_READ) { 1433 io.iov_len = uiop->uio_resid = bp->b_bcount; 1434 io.iov_base = bp->b_data; 1435 uiop->uio_rw = UIO_READ; 1436 switch (vp->v_type) { 1437 case VREG: 1438 uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE; 1439 nfsstats.read_bios++; 1440 error = nfs_readrpc(vp, uiop, cr); 1441 if (!error) { 1442 if (uiop->uio_resid) { 1443 /* 1444 * If we had a short read with no error, we must have 1445 * hit a file hole. We should zero-fill the remainder. 1446 * This can also occur if the server hits the file EOF. 1447 * 1448 * Holes used to be able to occur due to pending 1449 * writes, but that is not possible any longer. 
1450 */ 1451 int nread = bp->b_bcount - uiop->uio_resid; 1452 int left = bp->b_bcount - nread; 1453 1454 if (left > 0) 1455 bzero((char *)bp->b_data + nread, left); 1456 uiop->uio_resid = 0; 1457 } 1458 } 1459 if (p && (vp->v_flag & VTEXT) && 1460 (((nmp->nm_flag & NFSMNT_NQNFS) && 1461 NQNFS_CKINVALID(vp, np, ND_READ) && 1462 np->n_lrev != np->n_brev) || 1463 (!(nmp->nm_flag & NFSMNT_NQNFS) && 1464 np->n_mtime != np->n_vattr.va_mtime.tv_sec))) { 1465 uprintf("Process killed due to text file modification\n"); 1466 PROC_LOCK(p); 1467 psignal(p, SIGKILL); 1468 _PHOLD(p); 1469 PROC_UNLOCK(p); 1470 } 1471 break; 1472 case VLNK: 1473 uiop->uio_offset = (off_t)0; 1474 nfsstats.readlink_bios++; 1475 error = nfs_readlinkrpc(vp, uiop, cr); 1476 break; 1477 case VDIR: 1478 nfsstats.readdir_bios++; 1479 uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ; 1480 if (nmp->nm_flag & NFSMNT_RDIRPLUS) { 1481 error = nfs_readdirplusrpc(vp, uiop, cr); 1482 if (error == NFSERR_NOTSUPP) 1483 nmp->nm_flag &= ~NFSMNT_RDIRPLUS; 1484 } 1485 if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0) 1486 error = nfs_readdirrpc(vp, uiop, cr); 1487 /* 1488 * end-of-directory sets B_INVAL but does not generate an 1489 * error. 
1490 */ 1491 if (error == 0 && uiop->uio_resid == bp->b_bcount) 1492 bp->b_flags |= B_INVAL; 1493 break; 1494 default: 1495 printf("nfs_doio: type %x unexpected\n",vp->v_type); 1496 break; 1497 }; 1498 if (error) { 1499 bp->b_ioflags |= BIO_ERROR; 1500 bp->b_error = error; 1501 } 1502 } else { 1503 /* 1504 * If we only need to commit, try to commit 1505 */ 1506 if (bp->b_flags & B_NEEDCOMMIT) { 1507 int retv; 1508 off_t off; 1509 1510 off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; 1511 bp->b_flags |= B_WRITEINPROG; 1512 retv = nfs_commit( 1513 bp->b_vp, off, bp->b_dirtyend-bp->b_dirtyoff, 1514 bp->b_wcred, p); 1515 bp->b_flags &= ~B_WRITEINPROG; 1516 if (retv == 0) { 1517 bp->b_dirtyoff = bp->b_dirtyend = 0; 1518 bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); 1519 bp->b_resid = 0; 1520 bufdone(bp); 1521 return (0); 1522 } 1523 if (retv == NFSERR_STALEWRITEVERF) { 1524 nfs_clearcommit(bp->b_vp->v_mount); 1525 } 1526 } 1527 1528 /* 1529 * Setup for actual write 1530 */ 1531 1532 if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size) 1533 bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE; 1534 1535 if (bp->b_dirtyend > bp->b_dirtyoff) { 1536 io.iov_len = uiop->uio_resid = bp->b_dirtyend 1537 - bp->b_dirtyoff; 1538 uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE 1539 + bp->b_dirtyoff; 1540 io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; 1541 uiop->uio_rw = UIO_WRITE; 1542 nfsstats.write_bios++; 1543 1544 if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC) 1545 iomode = NFSV3WRITE_UNSTABLE; 1546 else 1547 iomode = NFSV3WRITE_FILESYNC; 1548 1549 bp->b_flags |= B_WRITEINPROG; 1550 error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit); 1551 1552 /* 1553 * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try 1554 * to cluster the buffers needing commit. This will allow 1555 * the system to submit a single commit rpc for the whole 1556 * cluster. 
We can do this even if the buffer is not 100% 1557 * dirty (relative to the NFS blocksize), so we optimize the 1558 * append-to-file-case. 1559 * 1560 * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be 1561 * cleared because write clustering only works for commit 1562 * rpc's, not for the data portion of the write). 1563 */ 1564 1565 if (!error && iomode == NFSV3WRITE_UNSTABLE) { 1566 bp->b_flags |= B_NEEDCOMMIT; 1567 if (bp->b_dirtyoff == 0 1568 && bp->b_dirtyend == bp->b_bcount) 1569 bp->b_flags |= B_CLUSTEROK; 1570 } else { 1571 bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); 1572 } 1573 bp->b_flags &= ~B_WRITEINPROG; 1574 1575 /* 1576 * For an interrupted write, the buffer is still valid 1577 * and the write hasn't been pushed to the server yet, 1578 * so we can't set BIO_ERROR and report the interruption 1579 * by setting B_EINTR. For the B_ASYNC case, B_EINTR 1580 * is not relevant, so the rpc attempt is essentially 1581 * a noop. For the case of a V3 write rpc not being 1582 * committed to stable storage, the block is still 1583 * dirty and requires either a commit rpc or another 1584 * write rpc with iomode == NFSV3WRITE_FILESYNC before 1585 * the block is reused. This is indicated by setting 1586 * the B_DELWRI and B_NEEDCOMMIT flags. 1587 * 1588 * If the buffer is marked B_PAGING, it does not reside on 1589 * the vp's paging queues so we cannot call bdirty(). The 1590 * bp in this case is not an NFS cache block so we should 1591 * be safe. 
XXX 1592 */ 1593 if (error == EINTR 1594 || (!error && (bp->b_flags & B_NEEDCOMMIT))) { 1595 int s; 1596 1597 s = splbio(); 1598 bp->b_flags &= ~(B_INVAL|B_NOCACHE); 1599 if ((bp->b_flags & B_PAGING) == 0) { 1600 bdirty(bp); 1601 bp->b_flags &= ~B_DONE; 1602 } 1603 if (error && (bp->b_flags & B_ASYNC) == 0) 1604 bp->b_flags |= B_EINTR; 1605 splx(s); 1606 } else { 1607 if (error) { 1608 bp->b_ioflags |= BIO_ERROR; 1609 bp->b_error = np->n_error = error; 1610 np->n_flag |= NWRITEERR; 1611 } 1612 bp->b_dirtyoff = bp->b_dirtyend = 0; 1613 } 1614 } else { 1615 bp->b_resid = 0; 1616 bufdone(bp); 1617 return (0); 1618 } 1619 } 1620 bp->b_resid = uiop->uio_resid; 1621 if (must_commit) 1622 nfs_clearcommit(vp->v_mount); 1623 bufdone(bp); 1624 return (error); 1625} 1626