/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 * $FreeBSD: head/sys/nfsclient/nfs_bio.c 76117 2001-04-29 02:45:39Z grog $
 */


#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nqnfs.h>
#include <nfs/nfsnode.h>

/*
 * Just call nfs_writebp() with the force argument set to 1.
 *
 * NOTE: B_DONE may or may not be set in a_bp on call.
 */
static int
nfs_bwrite(struct buf *bp)
{
	return (nfs_writebp(bp, 1, curproc));
}

struct buf_ops buf_ops_nfs = {
	"buf_ops_nfs",
	nfs_bwrite
};


static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
		    struct proc *p));

extern int nfs_numasync;
extern int nfs_pbuf_freecnt;
extern struct nfsstats nfsstats;

/*
 * Vnode op for VM getpages.
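 *
 * We map the supplied pages into kernel space with a pbuf and fill
 * them with a single read RPC, validating only the bytes the server
 * actually returned; pages other than the requested one are freed or
 * placed on a page queue below.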
 */
int
nfs_getpages(ap)
	struct vop_getpages_args /* {
		struct vnode *a_vp;
		vm_page_t *a_m;
		int a_count;
		int a_reqpage;
		vm_ooffset_t a_offset;
	} */ *ap;
{
	int i, error, nextoff, size, toff, count, npages;
	struct uio uio;
	struct iovec iov;
	vm_offset_t kva;
	struct buf *bp;
	struct vnode *vp;
	struct proc *p;
	struct ucred *cred;
	struct nfsmount *nmp;
	vm_page_t *pages;

	vp = ap->a_vp;
	p = curproc;				/* XXX */
	cred = curproc->p_ucred;		/* XXX */
	nmp = VFSTONFS(vp->v_mount);
	pages = ap->a_m;
	count = ap->a_count;

	if (vp->v_object == NULL) {
		printf("nfs_getpages: called with non-merged cache vnode??\n");
		return VM_PAGER_ERROR;
	}

	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
		(void)nfs_fsinfo(nmp, vp, cred, p);

	npages = btoc(count);

	/*
	 * If the requested page is partially valid, just return it and
	 * allow the pager to zero-out the blanks.  Partially valid pages
	 * can only occur at the file EOF.
	 */

	{
		vm_page_t m = pages[ap->a_reqpage];

		if (m->valid != 0) {
			/* handled by vm_fault now	  */
			/* vm_page_zero_invalid(m, TRUE); */
			for (i = 0; i < npages; ++i) {
				if (i != ap->a_reqpage)
					vm_page_free(pages[i]);
			}
			return(0);
		}
	}

	/*
	 * We use only the kva address for the buffer, but this is extremely
	 * convenient and fast.
	 */
	bp = getpbuf(&nfs_pbuf_freecnt);

	kva = (vm_offset_t) bp->b_data;
	pmap_qenter(kva, pages, npages);

	iov.iov_base = (caddr_t) kva;
	iov.iov_len = count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
	uio.uio_resid = count;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_READ;
	uio.uio_procp = p;

	error = nfs_readrpc(vp, &uio, cred);
	pmap_qremove(kva, npages);

	relpbuf(bp, &nfs_pbuf_freecnt);

	if (error && (uio.uio_resid == count)) {
		printf("nfs_getpages: error %d\n", error);
		for (i = 0; i < npages; ++i) {
			if (i != ap->a_reqpage)
				vm_page_free(pages[i]);
		}
		return VM_PAGER_ERROR;
	}

	/*
	 * Calculate the number of bytes read and validate only that number
	 * of bytes.  Note that due to pending writes, size may be 0.  This
	 * does not mean that the remaining data is invalid!
	 */

	size = count - uio.uio_resid;

	for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
		vm_page_t m;
		nextoff = toff + PAGE_SIZE;
		m = pages[i];

		m->flags &= ~PG_ZERO;

		if (nextoff <= size) {
			/*
			 * Read operation filled an entire page
			 */
			m->valid = VM_PAGE_BITS_ALL;
			vm_page_undirty(m);
		} else if (size > toff) {
			/*
			 * Read operation filled a partial page.
			 */
			m->valid = 0;
			vm_page_set_validclean(m, 0, size - toff);
			/* handled by vm_fault now	  */
			/* vm_page_zero_invalid(m, TRUE); */
		}

		if (i != ap->a_reqpage) {
			/*
			 * Whether or not to leave the page activated is up in
			 * the air, but we should put the page on a page queue
			 * somewhere (it already is in the object).  Empirical
			 * results show that deactivating pages is best.
			 */

			/*
			 * Just in case someone was asking for this page we
			 * now tell them that it is ok to use.
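			 * A sleeper on a PG_WANTED page is woken via
			 * vm_page_wakeup() below; on error the page is
			 * freed instead.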
			 */
			if (!error) {
				if (m->flags & PG_WANTED)
					vm_page_activate(m);
				else
					vm_page_deactivate(m);
				vm_page_wakeup(m);
			} else {
				vm_page_free(m);
			}
		}
	}
	return 0;
}

/*
 * Vnode op for VM putpages.
 */
int
nfs_putpages(ap)
	struct vop_putpages_args /* {
		struct vnode *a_vp;
		vm_page_t *a_m;
		int a_count;
		int a_sync;
		int *a_rtvals;
		vm_ooffset_t a_offset;
	} */ *ap;
{
	struct uio uio;
	struct iovec iov;
	vm_offset_t kva;
	struct buf *bp;
	int iomode, must_commit, i, error, npages, count;
	off_t offset;
	int *rtvals;
	struct vnode *vp;
	struct proc *p;
	struct ucred *cred;
	struct nfsmount *nmp;
	struct nfsnode *np;
	vm_page_t *pages;

	vp = ap->a_vp;
	np = VTONFS(vp);
	p = curproc;				/* XXX */
	cred = curproc->p_ucred;		/* XXX */
	nmp = VFSTONFS(vp->v_mount);
	pages = ap->a_m;
	count = ap->a_count;
	rtvals = ap->a_rtvals;
	npages = btoc(count);
	offset = IDX_TO_OFF(pages[0]->pindex);

	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
		(void)nfs_fsinfo(nmp, vp, cred, p);

	for (i = 0; i < npages; i++) {
		rtvals[i] = VM_PAGER_AGAIN;
	}

	/*
	 * When putting pages, do not extend file past EOF.
	 */

	if (offset + count > np->n_size) {
		count = np->n_size - offset;
		if (count < 0)
			count = 0;
	}

	/*
	 * We use only the kva address for the buffer, but this is extremely
	 * convenient and fast.
	 */
	bp = getpbuf(&nfs_pbuf_freecnt);

	kva = (vm_offset_t) bp->b_data;
	pmap_qenter(kva, pages, npages);

	iov.iov_base = (caddr_t) kva;
	iov.iov_len = count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = offset;
	uio.uio_resid = count;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_WRITE;
	uio.uio_procp = p;

	if ((ap->a_sync & VM_PAGER_PUT_SYNC) == 0)
		iomode = NFSV3WRITE_UNSTABLE;
	else
		iomode = NFSV3WRITE_FILESYNC;

	error = nfs_writerpc(vp, &uio, cred, &iomode, &must_commit);

	pmap_qremove(kva, npages);
	relpbuf(bp, &nfs_pbuf_freecnt);

	if (!error) {
		int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE;
		for (i = 0; i < nwritten; i++) {
			rtvals[i] = VM_PAGER_OK;
			vm_page_undirty(pages[i]);
		}
		if (must_commit)
			nfs_clearcommit(vp->v_mount);
	}
	return rtvals[0];
}

/*
 * Vnode op for read using bio
 */
int
nfs_bioread(vp, uio, ioflag, cred)
	register struct vnode *vp;
	register struct uio *uio;
	int ioflag;
	struct ucred *cred;
{
	register struct nfsnode *np = VTONFS(vp);
	register int biosize, i;
	struct buf *bp = 0, *rabp;
	struct vattr vattr;
	struct proc *p;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn, rabn;
	int bcount;
	int seqcount;
	int nra, error = 0, n = 0, on = 0;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_offset < 0)	/* XXX VDIR cookies can be negative */
		return (EINVAL);
	p = uio->uio_procp;

	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	if (vp->v_type != VDIR &&
	    (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	biosize = vp->v_mount->mnt_stat.f_iosize;
	seqcount = (int)((off_t)(ioflag >> 16) * biosize / BKVASIZE);
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nqnfs, full cache consistency is maintained within the loop.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * NB: This implies that cache data can be read when up to
	 * NFS_ATTRTIMEO seconds out of date. If you find that you need
	 * current attributes, this could be forced by setting n_attrstamp
	 * to 0 before the VOP_GETATTR() call.
	 */
	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
		if (np->n_flag & NMODIFIED) {
			if (vp->v_type != VREG) {
				if (vp->v_type != VDIR)
					panic("nfs: bioread, not dir");
				nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			np->n_mtime = vattr.va_mtime.tv_sec;
		} else {
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			if (np->n_mtime != vattr.va_mtime.tv_sec) {
				if (vp->v_type == VDIR)
					nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_mtime = vattr.va_mtime.tv_sec;
			}
		}
	}
	do {

	    /*
	     * Get a valid lease. If cached data is stale, flush it.
	     */
	    if (nmp->nm_flag & NFSMNT_NQNFS) {
		if (NQNFS_CKINVALID(vp, np, ND_READ)) {
		    do {
			error = nqnfs_getlease(vp, ND_READ, cred, p);
		    } while (error == NQNFS_EXPIRED);
		    if (error)
			return (error);
		    if (np->n_lrev != np->n_brev ||
			(np->n_flag & NQNFSNONCACHE) ||
			((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
			if (vp->v_type == VDIR)
			    nfs_invaldir(vp);
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
			    return (error);
			np->n_brev = np->n_lrev;
		    }
		} else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
		    nfs_invaldir(vp);
		    error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
		    if (error)
			return (error);
		}
	    }
	    if (np->n_flag & NQNFSNONCACHE) {
		switch (vp->v_type) {
		case VREG:
			return (nfs_readrpc(vp, uio, cred));
		case VLNK:
			return (nfs_readlinkrpc(vp, uio, cred));
		case VDIR:
			break;
		default:
			printf(" NQNFSNONCACHE: type %x unexpected\n",
				vp->v_type);
		};
	    }
	    switch (vp->v_type) {
	    case VREG:
		nfsstats.biocache_reads++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);

		/*
		 * Start the read ahead(s), as required.
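		 * Read-aheads are bounded by nm_readahead and by the
		 * sequential-access heuristic (seqcount), and blocks
		 * already incore are skipped.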
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
		    for (nra = 0; nra < nmp->nm_readahead && nra < seqcount &&
			(off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
			rabn = lbn + 1 + nra;
			if (!incore(vp, rabn)) {
			    rabp = nfs_getcacheblk(vp, rabn, biosize, p);
			    if (!rabp)
				return (EINTR);
			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
				rabp->b_flags |= B_ASYNC;
				rabp->b_iocmd = BIO_READ;
				vfs_busy_pages(rabp, 0);
				if (nfs_asyncio(rabp, cred, p)) {
				    rabp->b_flags |= B_INVAL;
				    rabp->b_ioflags |= BIO_ERROR;
				    vfs_unbusy_pages(rabp);
				    brelse(rabp);
				    break;
				}
			    } else {
				brelse(rabp);
			    }
			}
		    }
		}

		/*
		 * Obtain the buffer cache block.  Figure out the buffer size
		 * when we are at EOF.  If we are modifying the size of the
		 * buffer based on an EOF condition we need to hold
		 * nfs_rslock() through obtaining the buffer to prevent
		 * a potential writer-appender from messing with n_size.
		 * Otherwise we may accidentally truncate the buffer and
		 * lose dirty data.
		 *
		 * Note that bcount is *not* DEV_BSIZE aligned.
		 */

again:
		bcount = biosize;
		if ((off_t)lbn * biosize >= np->n_size) {
			bcount = 0;
		} else if ((off_t)(lbn + 1) * biosize > np->n_size) {
			bcount = np->n_size - (off_t)lbn * biosize;
		}
		if (bcount != biosize) {
			switch(nfs_rslock(np, p)) {
			case ENOLCK:
				goto again;
				/* not reached */
			case EINTR:
			case ERESTART:
				return(EINTR);
				/* not reached */
			default:
				break;
			}
		}

		bp = nfs_getcacheblk(vp, lbn, bcount, p);

		if (bcount != biosize)
			nfs_rsunlock(np, p);
		if (!bp)
			return (EINTR);

		/*
		 * If B_CACHE is not set, we must issue the read.  If this
		 * fails, we return an error.
		 */

		if ((bp->b_flags & B_CACHE) == 0) {
		    bp->b_iocmd = BIO_READ;
		    vfs_busy_pages(bp, 0);
		    error = nfs_doio(bp, cred, p);
		    if (error) {
			brelse(bp);
			return (error);
		    }
		}

		/*
		 * on is the offset into the current bp.  Figure out how many
		 * bytes we can copy out of the bp.  Note that bcount is
		 * NOT DEV_BSIZE aligned.
		 *
		 * Then figure out how many bytes we can copy into the uio.
		 */

		n = 0;
		if (on < bcount)
			n = min((unsigned)(bcount - on), uio->uio_resid);
		break;
	    case VLNK:
		nfsstats.biocache_readlinks++;
		bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p);
		if (!bp)
			return (EINTR);
		if ((bp->b_flags & B_CACHE) == 0) {
		    bp->b_iocmd = BIO_READ;
		    vfs_busy_pages(bp, 0);
		    error = nfs_doio(bp, cred, p);
		    if (error) {
			bp->b_ioflags |= BIO_ERROR;
			brelse(bp);
			return (error);
		    }
		}
		n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
		on = 0;
		break;
	    case VDIR:
		nfsstats.biocache_readdirs++;
		if (np->n_direofoffset
		    && uio->uio_offset >= np->n_direofoffset) {
			return (0);
		}
		lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ;
		on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
		bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p);
		if (!bp)
		    return (EINTR);
		if ((bp->b_flags & B_CACHE) == 0) {
		    bp->b_iocmd = BIO_READ;
		    vfs_busy_pages(bp, 0);
		    error = nfs_doio(bp, cred, p);
		    if (error) {
			    brelse(bp);
		    }
		    while (error == NFSERR_BAD_COOKIE) {
			printf("got bad cookie vp %p bp %p\n", vp, bp);
			nfs_invaldir(vp);
			error = nfs_vinvalbuf(vp, 0, cred, p, 1);
			/*
			 * Yuck! The directory has been modified on the
			 * server.
			 * The only way to get the block is by
			 * reading from the beginning to get all the
			 * offset cookies.
			 *
			 * Leave the last bp intact unless there is an error.
			 * Loop back up to the while if the error is another
			 * NFSERR_BAD_COOKIE (double yuck!).
			 */
			for (i = 0; i <= lbn && !error; i++) {
			    if (np->n_direofoffset
				&& (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
				    return (0);
			    bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p);
			    if (!bp)
				return (EINTR);
			    if ((bp->b_flags & B_CACHE) == 0) {
				    bp->b_iocmd = BIO_READ;
				    vfs_busy_pages(bp, 0);
				    error = nfs_doio(bp, cred, p);
				    /*
				     * no error + B_INVAL == directory EOF,
				     * use the block.
				     */
				    if (error == 0 && (bp->b_flags & B_INVAL))
					    break;
			    }
			    /*
			     * An error will throw away the block and the
			     * for loop will break out.  If no error and this
			     * is not the block we want, we throw away the
			     * block and go for the next one via the for loop.
			     */
			    if (error || i < lbn)
				    brelse(bp);
			}
		    }
		    /*
		     * The above while is repeated if we hit another cookie
		     * error.  If we hit an error and it wasn't a cookie error,
		     * we give up.
		     */
		    if (error)
			    return (error);
		}

		/*
		 * If not eof and read aheads are enabled, start one.
		 * (You need the current block first, so that you have the
		 *  directory offset cookie of the next block.)
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
		    (bp->b_flags & B_INVAL) == 0 &&
		    (np->n_direofoffset == 0 ||
		    (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
		    !(np->n_flag & NQNFSNONCACHE) &&
		    !incore(vp, lbn + 1)) {
			rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p);
			if (rabp) {
			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
				rabp->b_flags |= B_ASYNC;
				rabp->b_iocmd = BIO_READ;
				vfs_busy_pages(rabp, 0);
				if (nfs_asyncio(rabp, cred, p)) {
				    rabp->b_flags |= B_INVAL;
				    rabp->b_ioflags |= BIO_ERROR;
				    vfs_unbusy_pages(rabp);
				    brelse(rabp);
				}
			    } else {
				brelse(rabp);
			    }
			}
		}
		/*
		 * Unlike VREG files, whose buffer size ( bp->b_bcount ) is
		 * chopped for the EOF condition, we cannot tell how large
		 * NFS directories are going to be until we hit EOF.  So
		 * an NFS directory buffer is *not* chopped to its EOF.  Now,
		 * it just so happens that b_resid will effectively chop it
		 * to EOF.  *BUT* this information is lost if the buffer goes
		 * away and is reconstituted into a B_CACHE state ( due to
		 * being VMIO ) later.  So we keep track of the directory eof
		 * in np->n_direofoffset and chop it off as an extra step
		 * right here.
		 */
		n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
		if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset)
			n = np->n_direofoffset - uio->uio_offset;
		break;
	    default:
		printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
		break;
	    };

	    if (n > 0) {
		    error = uiomove(bp->b_data + on, (int)n, uio);
	    }
	    switch (vp->v_type) {
	    case VREG:
		break;
	    case VLNK:
		n = 0;
		break;
	    case VDIR:
		/*
		 * Invalidate buffer if caching is disabled, forcing a
		 * re-read from the remote later.
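		 * (NQNFSNONCACHE directories are still read through the
		 * buffer cache above, so B_INVAL is what keeps their
		 * blocks from being reused.)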
		 */
		if (np->n_flag & NQNFSNONCACHE)
			bp->b_flags |= B_INVAL;
		break;
	    default:
		printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
	    }
	    brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}

/*
 * Vnode op for write using bio
 */
int
nfs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	int biosize;
	struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn;
	int bcount;
	int n, on, error = 0, iomode, must_commit;
	int haverslock = 0;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
		(void)nfs_fsinfo(nmp, vp, cred, p);

	/*
	 * Synchronously flush pending buffers if we are in synchronous
	 * mode or if we are appending.
	 */
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
				return (error);
		}
	}

	/*
	 * If IO_APPEND then load uio_offset.  We restart here if we cannot
	 * get the append lock.
	 */
restart:
	if (ioflag & IO_APPEND) {
		np->n_attrstamp = 0;
		error = VOP_GETATTR(vp, &vattr, cred, p);
		if (error)
			return (error);
		uio->uio_offset = np->n_size;
	}

	if (uio->uio_offset < 0)
		return (EINVAL);
	if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	if (uio->uio_resid == 0)
		return (0);

	/*
	 * We need to obtain the rslock if we intend to modify np->n_size
	 * in order to guarantee the append point with multiple contending
	 * writers, to guarantee that no other appenders modify n_size
	 * while we are trying to obtain a truncated buffer (i.e. to avoid
	 * accidentally truncating data written by another appender due to
	 * the race), and to ensure that the buffer is populated prior to
	 * our extending of the file.  We hold rslock through the entire
	 * operation.
	 *
	 * Note that we do not synchronize the case where someone truncates
	 * the file while we are appending to it because attempting to lock
	 * this case may deadlock other parts of the system unexpectedly.
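	 *
	 * (On success haverslock is set and the lock is released via
	 * nfs_rsunlock() at the bottom of this function.)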
	 */
	if ((ioflag & IO_APPEND) ||
	    uio->uio_offset + uio->uio_resid > np->n_size) {
		switch(nfs_rslock(np, p)) {
		case ENOLCK:
			goto restart;
			/* not reached */
		case EINTR:
		case ERESTART:
			return(EINTR);
			/* not reached */
		default:
			break;
		}
		haverslock = 1;
	}

	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	if (p && uio->uio_offset + uio->uio_resid >
	      p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		PROC_LOCK(p);
		psignal(p, SIGXFSZ);
		PROC_UNLOCK(p);
		if (haverslock)
			nfs_rsunlock(np, p);
		return (EFBIG);
	}

	biosize = vp->v_mount->mnt_stat.f_iosize;

	do {
		/*
		 * Check for a valid write lease.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error)
				break;
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					break;
				np->n_brev = np->n_lrev;
			}
		}
		if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
			iomode = NFSV3WRITE_FILESYNC;
			error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
			if (must_commit)
				nfs_clearcommit(vp->v_mount);
			break;
		}
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize-1);
		n = min((unsigned)(biosize - on), uio->uio_resid);
again:
		/*
		 * Handle direct append and file extension cases, calculate
		 * unaligned buffer size.
		 */

		if (uio->uio_offset == np->n_size && n) {
			/*
			 * Get the buffer (in its pre-append state to maintain
			 * B_CACHE if it was previously set).  Resize the
			 * nfsnode after we have locked the buffer to prevent
			 * readers from reading garbage.
			 */
			bcount = on;
			bp = nfs_getcacheblk(vp, lbn, bcount, p);

			if (bp != NULL) {
				long save;

				np->n_size = uio->uio_offset + n;
				np->n_flag |= NMODIFIED;
				vnode_pager_setsize(vp, np->n_size);

				save = bp->b_flags & B_CACHE;
				bcount += n;
				allocbuf(bp, bcount);
				bp->b_flags |= save;
				bp->b_magic = B_MAGIC_NFS;
				bp->b_op = &buf_ops_nfs;
			}
		} else {
			/*
			 * Obtain the locked cache block first, and then
			 * adjust the file's size as appropriate.
			 */
			bcount = on + n;
			if ((off_t)lbn * biosize + bcount < np->n_size) {
				if ((off_t)(lbn + 1) * biosize < np->n_size)
					bcount = biosize;
				else
					bcount = np->n_size - (off_t)lbn * biosize;
			}

			bp = nfs_getcacheblk(vp, lbn, bcount, p);

			if (uio->uio_offset + n > np->n_size) {
				np->n_size = uio->uio_offset + n;
				np->n_flag |= NMODIFIED;
				vnode_pager_setsize(vp, np->n_size);
			}
		}

		if (!bp) {
			error = EINTR;
			break;
		}

		/*
		 * Issue a READ if B_CACHE is not set.  In special-append
		 * mode, B_CACHE is based on the buffer prior to the write
		 * op and is typically set, avoiding the read.  If a read
		 * is required in special append mode, the server will
		 * probably send us a short-read since we extended the file
		 * on our end, resulting in b_resid == 0 and, thus,
		 * B_CACHE getting set.
		 *
		 * We can also avoid issuing the read if the write covers
		 * the entire buffer.
		 * We have to make sure the buffer state
		 * is reasonable in this case since we will not be initiating
		 * I/O.  See the comments in kern/vfs_bio.c's getblk() for
		 * more information.
		 *
		 * B_CACHE may also be set due to the buffer being cached
		 * normally.
		 */

		if (on == 0 && n == bcount) {
			bp->b_flags |= B_CACHE;
			bp->b_flags &= ~B_INVAL;
			bp->b_ioflags &= ~BIO_ERROR;
		}

		if ((bp->b_flags & B_CACHE) == 0) {
			bp->b_iocmd = BIO_READ;
			vfs_busy_pages(bp, 0);
			error = nfs_doio(bp, cred, p);
			if (error) {
				brelse(bp);
				break;
			}
		}
		if (!bp) {
			error = EINTR;
			break;
		}
		if (bp->b_wcred == NOCRED) {
			crhold(cred);
			bp->b_wcred = cred;
		}
		np->n_flag |= NMODIFIED;

		/*
		 * If dirtyend exceeds file size, chop it down.  This should
		 * not normally occur but there is an append race where it
		 * might occur XXX, so we log it.
		 *
		 * If the chopping creates a reverse-indexed or degenerate
		 * situation with dirtyoff/end, we 0 both of them.
		 */

		if (bp->b_dirtyend > bcount) {
			printf("NFS append race @%lx:%d\n",
			    (long)bp->b_blkno * DEV_BSIZE,
			    bp->b_dirtyend - bcount);
			bp->b_dirtyend = bcount;
		}

		if (bp->b_dirtyoff >= bp->b_dirtyend)
			bp->b_dirtyoff = bp->b_dirtyend = 0;

		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise force a write rpc of the old dirty area.
		 *
		 * While it is possible to merge discontiguous writes due to
		 * our having a B_CACHE buffer ( and thus valid read data
		 * for the hole), we don't because it could lead to
		 * significant cache coherency problems with multiple clients,
		 * especially if locking is implemented later on.
		 *
		 * As an optimization we could theoretically maintain
		 * a linked list of discontinuous areas, but we would still
		 * have to commit them separately so there isn't much
		 * advantage to it except perhaps a bit of asynchronization.
		 */

		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			if (BUF_WRITE(bp) == EINTR) {
				error = EINTR;
				break;
			}
			goto again;
		}

		/*
		 * Check for valid write lease and get one as required.
		 * In case getblk() and/or bwrite() delayed us.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error) {
				brelse(bp);
				break;
			}
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				brelse(bp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					break;
				np->n_brev = np->n_lrev;
				goto again;
			}
		}

		error = uiomove((char *)bp->b_data + on, n, uio);

		/*
		 * Since this block is being modified, it must be written
		 * again and not just committed.  Since write clustering does
		 * not work for the stage 1 data write, only the stage 2
		 * commit rpc, we have to clear B_CLUSTEROK as well.
		 */
		bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);

		if (error) {
			bp->b_ioflags |= BIO_ERROR;
			brelse(bp);
			break;
		}

		/*
		 * Only update dirtyoff/dirtyend if not a degenerate
		 * condition.
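		 * The new range [on, on + n) is merged with any existing
		 * dirty range using min/max; e.g. a dirty [64, 512) plus
		 * a write at on == 256 with n == 512 yields [64, 768).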
		 */
		if (n) {
			if (bp->b_dirtyend > 0) {
				bp->b_dirtyoff = min(on, bp->b_dirtyoff);
				bp->b_dirtyend = max((on + n), bp->b_dirtyend);
			} else {
				bp->b_dirtyoff = on;
				bp->b_dirtyend = on + n;
			}
			vfs_bio_set_validclean(bp, on, n);
		}

		/*
		 * If the lease is non-cacheable or IO_SYNC do bwrite().
		 *
		 * IO_INVAL appears to be unused.  The idea appears to be
		 * to turn off caching in this case.  Very odd.  XXX
		 */
		if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
			if (ioflag & IO_INVAL)
				bp->b_flags |= B_NOCACHE;
			error = BUF_WRITE(bp);
			if (error)
				break;
			if (np->n_flag & NQNFSNONCACHE) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					break;
			}
		} else if ((n + on) == biosize &&
			(nmp->nm_flag & NFSMNT_NQNFS) == 0) {
			bp->b_flags |= B_ASYNC;
			(void)nfs_writebp(bp, 0, 0);
		} else {
			bdwrite(bp);
		}
	} while (uio->uio_resid > 0 && n > 0);

	if (haverslock)
		nfs_rsunlock(np, p);

	return (error);
}

/*
 * Get an nfs cache block.
 *
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 *
 * The caller must carefully deal with the possible B_INVAL state of
 * the buffer.  nfs_doio() clears B_INVAL (and nfs_asyncio() clears it
 * indirectly), so synchronous reads can be issued without worrying about
 * the B_INVAL state.  We have to be a little more careful when dealing
 * with writes (see comments in nfs_write()) when extending a file past
 * its EOF.
 */
static struct buf *
nfs_getcacheblk(vp, bn, size, p)
	struct vnode *vp;
	daddr_t bn;
	int size;
	struct proc *p;
{
	register struct buf *bp;
	struct mount *mp;
	struct nfsmount *nmp;

	mp = vp->v_mount;
	nmp = VFSTONFS(mp);

	if (nmp->nm_flag & NFSMNT_INT) {
		bp = getblk(vp, bn, size, PCATCH, 0);
		while (bp == (struct buf *)0) {
			if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
				return ((struct buf *)0);
			bp = getblk(vp, bn, size, 0, 2 * hz);
		}
	} else {
		bp = getblk(vp, bn, size, 0, 0);
	}

	if (vp->v_type == VREG) {
		int biosize;

		biosize = mp->mnt_stat.f_iosize;
		bp->b_blkno = bn * (biosize / DEV_BSIZE);
	}
	return (bp);
}

/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int intrflg;
{
	register struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;

	if (vp->v_flag & VXLOCK) {
		return (0);
	}

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}
	/*
	 * First wait for any other process doing a flush to complete.
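	 * NFLUSHINPROG serializes flushers; NFLUSHWANT records that the
	 * holder must wakeup() sleepers on np->n_flag when it finishes.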
	 */
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
			slptimeo);
		if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
			return (EINTR);
	}

	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
	while (error) {
		if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) {
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup((caddr_t)&np->n_flag);
			}
			return (EINTR);
		}
		error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
	}
	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
	return (0);
}

/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 *
 * Note: nfs_asyncio() does not clear (BIO_ERROR|B_INVAL) but when the bp
 * is eventually dequeued by the async daemon, nfs_doio() *will*.
 */
int
nfs_asyncio(bp, cred, procp)
	register struct buf *bp;
	struct ucred *cred;
	struct proc *procp;
{
	struct nfsmount *nmp;
	int i;
	int gotiod;
	int slpflag = 0;
	int slptimeo = 0;
	int error;

	/*
	 * If no async daemons then return EIO to force caller to run the rpc
	 * synchronously.
	 */
	if (nfs_numasync == 0)
		return (EIO);

	nmp = VFSTONFS(bp->b_vp->v_mount);

	/*
	 * Commits are usually short and sweet so let's save some CPU and
	 * leave the async daemons for more important rpc's (such as reads
	 * and writes).
	 */
	if (bp->b_iocmd == BIO_WRITE && (bp->b_flags & B_NEEDCOMMIT) &&
	    (nmp->nm_bufqiods > nfs_numasync / 2)) {
		return(EIO);
	}

again:
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	gotiod = FALSE;

	/*
	 * Find a free iod to process this request.
	 */
	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
		if (nfs_iodwant[i]) {
			/*
			 * Found one, so wake it up and tell it which
			 * mount to process.
			 */
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waking iod %d for mount %p\n",
				 i, nmp));
			nfs_iodwant[i] = (struct proc *)0;
			nfs_iodmount[i] = nmp;
			nmp->nm_bufqiods++;
			wakeup((caddr_t)&nfs_iodwant[i]);
			gotiod = TRUE;
			break;
		}

	/*
	 * If none are free, we may already have an iod working on this mount
	 * point.  If so, it will process our request.
	 */
	if (!gotiod) {
		if (nmp->nm_bufqiods > 0) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: %d iods are already processing mount %p\n",
				 nmp->nm_bufqiods, nmp));
			gotiod = TRUE;
		}
	}

	/*
	 * If we have an iod which can process the request, then queue
	 * the buffer.
	 */
	if (gotiod) {
		/*
		 * Ensure that the queue never grows too large.  We still want
		 * to asynchronize so we block rather than return EIO.
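		 * (The cap is twice the number of iods; nm_bufqwant asks
		 * the iod to wake us once the queue drains.)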
		 */
		while (nmp->nm_bufqlen >= 2*nfs_numasync) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
			nmp->nm_bufqwant = TRUE;
			error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
				       "nfsaio", slptimeo);
			if (error) {
				if (nfs_sigintr(nmp, NULL, procp))
					return (EINTR);
				if (slpflag == PCATCH) {
					slpflag = 0;
					slptimeo = 2 * hz;
				}
			}
			/*
			 * We might have lost our iod while sleeping,
			 * so check and loop if necessary.
			 */
			if (nmp->nm_bufqiods == 0) {
				NFS_DPF(ASYNCIO,
					("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
				goto again;
			}
		}

		if (bp->b_iocmd == BIO_READ) {
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
		} else {
			bp->b_flags |= B_WRITEINPROG;
			if (bp->b_wcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_wcred = cred;
			}
		}

		BUF_KERNPROC(bp);
		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
		nmp->nm_bufqlen++;
		return (0);
	}

	/*
	 * All the iods are busy on other mounts, so return EIO to
	 * force the caller to process the i/o synchronously.
	 */
	NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
	return (EIO);
}

/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(bp, cr, p)
	struct buf *bp;
	struct ucred *cr;
	struct proc *p;
{
	struct uio *uiop;
	struct vnode *vp;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, iomode, must_commit = 0;
	struct uio uio;
	struct iovec io;

	vp = bp->b_vp;
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_procp = p;

	/*
	 * clear BIO_ERROR and B_INVAL state prior to initiating the I/O.  We
	 * do this here so we do not have to do it in all the code that
	 * calls us.
	 */
	bp->b_flags &= ~B_INVAL;
	bp->b_ioflags &= ~BIO_ERROR;

	KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp));

	/*
	 * Historically, paging was done with physio, but no more.
	 */
	if (bp->b_flags & B_PHYS) {
	    /*
	     * ...though reading /dev/drum still gets us here.
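	     * A B_PHYS transfer covers the whole b_bcount starting at
	     * b_blkno (in DEV_BSIZE units); the buffer was mapped
	     * earlier by vmapbuf().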
	     */
	    io.iov_len = uiop->uio_resid = bp->b_bcount;
	    /* mapping was done by vmapbuf() */
	    io.iov_base = bp->b_data;
	    uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
	    if (bp->b_iocmd == BIO_READ) {
		uiop->uio_rw = UIO_READ;
		nfsstats.read_physios++;
		error = nfs_readrpc(vp, uiop, cr);
	    } else {
		int com;

		iomode = NFSV3WRITE_DATASYNC;
		uiop->uio_rw = UIO_WRITE;
		nfsstats.write_physios++;
		error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
	    }
	    if (error) {
		bp->b_ioflags |= BIO_ERROR;
		bp->b_error = error;
	    }
	} else if (bp->b_iocmd == BIO_READ) {
	    io.iov_len = uiop->uio_resid = bp->b_bcount;
	    io.iov_base = bp->b_data;
	    uiop->uio_rw = UIO_READ;
	    switch (vp->v_type) {
	    case VREG:
		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
		nfsstats.read_bios++;
		error = nfs_readrpc(vp, uiop, cr);
		if (!error) {
		    if (uiop->uio_resid) {
			/*
			 * If we had a short read with no error, we must have
			 * hit a file hole.  We should zero-fill the remainder.
			 * This can also occur if the server hits the file EOF.
			 *
			 * Holes used to be able to occur due to pending
			 * writes, but that is not possible any longer.
			 */
			int nread = bp->b_bcount - uiop->uio_resid;
			int left = bp->b_bcount - nread;

			if (left > 0)
			    bzero((char *)bp->b_data + nread, left);
			uiop->uio_resid = 0;
		    }
		}
		if (p && (vp->v_flag & VTEXT) &&
			(((nmp->nm_flag & NFSMNT_NQNFS) &&
			  NQNFS_CKINVALID(vp, np, ND_READ) &&
			  np->n_lrev != np->n_brev) ||
			 (!(nmp->nm_flag & NFSMNT_NQNFS) &&
			  np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
			uprintf("Process killed due to text file modification\n");
			PROC_LOCK(p);
			psignal(p, SIGKILL);
			_PHOLD(p);
			PROC_UNLOCK(p);
		}
		break;
	    case VLNK:
		uiop->uio_offset = (off_t)0;
		nfsstats.readlink_bios++;
		error = nfs_readlinkrpc(vp, uiop, cr);
		break;
	    case VDIR:
		nfsstats.readdir_bios++;
		uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
		if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
			error = nfs_readdirplusrpc(vp, uiop, cr);
			if (error == NFSERR_NOTSUPP)
				nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
		}
		if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
			error = nfs_readdirrpc(vp, uiop, cr);
		/*
		 * end-of-directory sets B_INVAL but does not generate an
		 * error.
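		 * (Detected below by a read that returned no data at
		 * all: uio_resid still equals b_bcount.)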
		 */
		if (error == 0 && uiop->uio_resid == bp->b_bcount)
			bp->b_flags |= B_INVAL;
		break;
	    default:
		printf("nfs_doio: type %x unexpected\n",vp->v_type);
		break;
	    };
	    if (error) {
		bp->b_ioflags |= BIO_ERROR;
		bp->b_error = error;
	    }
	} else {
	    /*
	     * If we only need to commit, try to commit
	     */
	    if (bp->b_flags & B_NEEDCOMMIT) {
		    int retv;
		    off_t off;

		    off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff;
		    bp->b_flags |= B_WRITEINPROG;
		    retv = nfs_commit(
				bp->b_vp, off, bp->b_dirtyend-bp->b_dirtyoff,
				bp->b_wcred, p);
		    bp->b_flags &= ~B_WRITEINPROG;
		    if (retv == 0) {
			    bp->b_dirtyoff = bp->b_dirtyend = 0;
			    bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
			    bp->b_resid = 0;
			    bufdone(bp);
			    return (0);
		    }
		    if (retv == NFSERR_STALEWRITEVERF) {
			    nfs_clearcommit(bp->b_vp->v_mount);
		    }
	    }

	    /*
	     * Setup for actual write
	     */

	    if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
		bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE;

	    if (bp->b_dirtyend > bp->b_dirtyoff) {
		io.iov_len = uiop->uio_resid = bp->b_dirtyend
		    - bp->b_dirtyoff;
		uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE
		    + bp->b_dirtyoff;
		io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
		uiop->uio_rw = UIO_WRITE;
		nfsstats.write_bios++;

		if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC)
		    iomode = NFSV3WRITE_UNSTABLE;
		else
		    iomode = NFSV3WRITE_FILESYNC;

		bp->b_flags |= B_WRITEINPROG;
		error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);

		/*
		 * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try
		 * to cluster the buffers needing commit.  This will allow
		 * the system to submit a single commit rpc for the whole
		 * cluster.  We can do this even if the buffer is not 100%
		 * dirty (relative to the NFS blocksize), so we optimize the
		 * append-to-file-case.
		 *
		 * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be
		 * cleared because write clustering only works for commit
		 * rpc's, not for the data portion of the write).
		 */

		if (!error && iomode == NFSV3WRITE_UNSTABLE) {
		    bp->b_flags |= B_NEEDCOMMIT;
		    if (bp->b_dirtyoff == 0
			&& bp->b_dirtyend == bp->b_bcount)
			bp->b_flags |= B_CLUSTEROK;
		} else {
		    bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
		}
		bp->b_flags &= ~B_WRITEINPROG;

		/*
		 * For an interrupted write, the buffer is still valid
		 * and the write hasn't been pushed to the server yet, so
		 * we can't set BIO_ERROR; instead we report the
		 * interruption by setting B_EINTR.  For the B_ASYNC case,
		 * B_EINTR is not relevant, so the rpc attempt is
		 * essentially a noop.  For the case of a V3 write rpc not
		 * being committed to stable storage, the block is still
		 * dirty and requires either a commit rpc or another
		 * write rpc with iomode == NFSV3WRITE_FILESYNC before
		 * the block is reused. This is indicated by setting
		 * the B_DELWRI and B_NEEDCOMMIT flags.
		 *
		 * If the buffer is marked B_PAGING, it does not reside on
		 * the vp's paging queues so we cannot call bdirty().  The
		 * bp in this case is not an NFS cache block so we should
		 * be safe.  XXX
		 */
		if (error == EINTR
		    || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
			int s;

			s = splbio();
			bp->b_flags &= ~(B_INVAL|B_NOCACHE);
			if ((bp->b_flags & B_PAGING) == 0) {
				bdirty(bp);
				bp->b_flags &= ~B_DONE;
			}
			if (error && (bp->b_flags & B_ASYNC) == 0)
				bp->b_flags |= B_EINTR;
			splx(s);
		} else {
			if (error) {
				bp->b_ioflags |= BIO_ERROR;
				bp->b_error = np->n_error = error;
				np->n_flag |= NWRITEERR;
			}
			bp->b_dirtyoff = bp->b_dirtyend = 0;
		}
	    } else {
		bp->b_resid = 0;
		bufdone(bp);
		return (0);
	    }
	}
	bp->b_resid = uiop->uio_resid;
	if (must_commit)
	    nfs_clearcommit(vp->v_mount);
	bufdone(bp);
	return (error);
}