/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 * $FreeBSD: head/sys/nfsclient/nfs_bio.c 83366 2001-09-12 08:38:13Z julian $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nqnfs.h>
#include <nfs/nfsnode.h>

/*
 * Just call nfs_writebp() with the force argument set to 1.
 *
 * NOTE: B_DONE may or may not be set in a_bp on call.
 */
static int
nfs_bwrite(struct buf *bp)
{
	return (nfs_writebp(bp, 1, curthread));
}

struct buf_ops buf_ops_nfs = {
	"buf_ops_nfs",
	nfs_bwrite
};

static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
		    struct thread *td));

extern int nfs_numasync;
extern int nfs_pbuf_freecnt;
extern struct nfsstats nfsstats;

/*
 * Vnode op for VM getpages.
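 * All of the requested pages are read with a single READ RPC: the
 * pages are mapped into a pageable buffer's KVA and nfs_readrpc()
 * fills them in place.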
 */
int
nfs_getpages(ap)
	struct vop_getpages_args /* {
		struct vnode *a_vp;
		vm_page_t *a_m;
		int a_count;
		int a_reqpage;
		vm_ooffset_t a_offset;
	} */ *ap;
{
	int i, error, nextoff, size, toff, count, npages;
	struct uio uio;
	struct iovec iov;
	vm_offset_t kva;
	struct buf *bp;
	struct vnode *vp;
	struct thread *td;
	struct ucred *cred;
	struct nfsmount *nmp;
	vm_page_t *pages;

	GIANT_REQUIRED;

	vp = ap->a_vp;
	td = curthread;				/* XXX */
	cred = curthread->td_proc->p_ucred;	/* XXX */
	nmp = VFSTONFS(vp->v_mount);
	pages = ap->a_m;
	count = ap->a_count;

	if (vp->v_object == NULL) {
		printf("nfs_getpages: called with non-merged cache vnode??\n");
		return VM_PAGER_ERROR;
	}

	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
		(void)nfs_fsinfo(nmp, vp, cred, td);
	}

	npages = btoc(count);

	/*
	 * If the requested page is partially valid, just return it and
	 * allow the pager to zero-out the blanks.  Partially valid pages
	 * can only occur at the file EOF.
	 */

	{
		vm_page_t m = pages[ap->a_reqpage];

		if (m->valid != 0) {
			/* handled by vm_fault now */
			/* vm_page_zero_invalid(m, TRUE); */
			for (i = 0; i < npages; ++i) {
				if (i != ap->a_reqpage)
					vm_page_free(pages[i]);
			}
			return (0);
		}
	}

	/*
	 * We use only the kva address for the buffer, but this is extremely
	 * convenient and fast.
	 */
	bp = getpbuf(&nfs_pbuf_freecnt);

	kva = (vm_offset_t) bp->b_data;
	pmap_qenter(kva, pages, npages);
	cnt.v_vnodein++;
	cnt.v_vnodepgsin += npages;

	iov.iov_base = (caddr_t) kva;
	iov.iov_len = count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
	uio.uio_resid = count;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_READ;
	uio.uio_td = td;

	error = nfs_readrpc(vp, &uio, cred);
	pmap_qremove(kva, npages);

	relpbuf(bp, &nfs_pbuf_freecnt);

	if (error && (uio.uio_resid == count)) {
		printf("nfs_getpages: error %d\n", error);
		for (i = 0; i < npages; ++i) {
			if (i != ap->a_reqpage)
				vm_page_free(pages[i]);
		}
		return VM_PAGER_ERROR;
	}

	/*
	 * Calculate the number of bytes read and validate only that number
	 * of bytes.  Note that due to pending writes, size may be 0.  This
	 * does not mean that the remaining data is invalid!
	 */

	size = count - uio.uio_resid;

	for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
		vm_page_t m;
		nextoff = toff + PAGE_SIZE;
		m = pages[i];

		m->flags &= ~PG_ZERO;

		if (nextoff <= size) {
			/*
			 * Read operation filled an entire page
			 */
			m->valid = VM_PAGE_BITS_ALL;
			vm_page_undirty(m);
		} else if (size > toff) {
			/*
			 * Read operation filled a partial page.
			 */
			m->valid = 0;
			vm_page_set_validclean(m, 0, size - toff);
			/* handled by vm_fault now */
			/* vm_page_zero_invalid(m, TRUE); */
		}

		if (i != ap->a_reqpage) {
			/*
			 * Whether or not to leave the page activated is up in
			 * the air, but we should put the page on a page queue
			 * somewhere (it already is in the object).  Empirical
			 * results show that deactivating pages is best.
			 */

			/*
			 * Just in case someone was asking for this page we
			 * now tell them that it is ok to use.
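			 * A page someone is sleeping on (PG_WANTED) is
			 * activated rather than deactivated; vm_page_wakeup()
			 * then clears its busy state and wakes the sleeper.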
			 */
			if (!error) {
				if (m->flags & PG_WANTED)
					vm_page_activate(m);
				else
					vm_page_deactivate(m);
				vm_page_wakeup(m);
			} else {
				vm_page_free(m);
			}
		}
	}
	return 0;
}

/*
 * Vnode op for VM putpages.
 */
int
nfs_putpages(ap)
	struct vop_putpages_args /* {
		struct vnode *a_vp;
		vm_page_t *a_m;
		int a_count;
		int a_sync;
		int *a_rtvals;
		vm_ooffset_t a_offset;
	} */ *ap;
{
	struct uio uio;
	struct iovec iov;
	vm_offset_t kva;
	struct buf *bp;
	int iomode, must_commit, i, error, npages, count;
	off_t offset;
	int *rtvals;
	struct vnode *vp;
	struct thread *td;
	struct ucred *cred;
	struct nfsmount *nmp;
	struct nfsnode *np;
	vm_page_t *pages;

	GIANT_REQUIRED;

	vp = ap->a_vp;
	np = VTONFS(vp);
	td = curthread;				/* XXX */
	cred = curthread->td_proc->p_ucred;	/* XXX */
	nmp = VFSTONFS(vp->v_mount);
	pages = ap->a_m;
	count = ap->a_count;
	rtvals = ap->a_rtvals;
	npages = btoc(count);
	offset = IDX_TO_OFF(pages[0]->pindex);

	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
		(void)nfs_fsinfo(nmp, vp, cred, td);
	}

	for (i = 0; i < npages; i++) {
		rtvals[i] = VM_PAGER_AGAIN;
	}

	/*
	 * When putting pages, do not extend file past EOF.
	 */

	if (offset + count > np->n_size) {
		count = np->n_size - offset;
		if (count < 0)
			count = 0;
	}

	/*
	 * We use only the kva address for the buffer, but this is extremely
	 * convenient and fast.
	 */
	bp = getpbuf(&nfs_pbuf_freecnt);

	kva = (vm_offset_t) bp->b_data;
	pmap_qenter(kva, pages, npages);
	cnt.v_vnodeout++;
	cnt.v_vnodepgsout += count;

	iov.iov_base = (caddr_t) kva;
	iov.iov_len = count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = offset;
	uio.uio_resid = count;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_WRITE;
	uio.uio_td = td;

	if ((ap->a_sync & VM_PAGER_PUT_SYNC) == 0)
		iomode = NFSV3WRITE_UNSTABLE;
	else
		iomode = NFSV3WRITE_FILESYNC;

	error = nfs_writerpc(vp, &uio, cred, &iomode, &must_commit);

	pmap_qremove(kva, npages);
	relpbuf(bp, &nfs_pbuf_freecnt);

	if (!error) {
		int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE;
		for (i = 0; i < nwritten; i++) {
			rtvals[i] = VM_PAGER_OK;
			vm_page_undirty(pages[i]);
		}
		if (must_commit) {
			nfs_clearcommit(vp->v_mount);
		}
	}
	return rtvals[0];
}

/*
 * Vnode op for read using bio
 */
int
nfs_bioread(vp, uio, ioflag, cred)
	register struct vnode *vp;
	register struct uio *uio;
	int ioflag;
	struct ucred *cred;
{
	register struct nfsnode *np = VTONFS(vp);
	register int biosize, i;
	struct buf *bp = 0, *rabp;
	struct vattr vattr;
	struct thread *td;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn, rabn;
	int bcount;
	int seqcount;
	int nra, error = 0, n = 0, on = 0;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_offset < 0)	/* XXX VDIR cookies can be negative */
		return (EINVAL);
	td = uio->uio_td;

	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
		(void)nfs_fsinfo(nmp, vp, cred, td);
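	/*
	 * Reads beyond the server's maximum file size cannot succeed, so
	 * reject them up front.  Directories are exempt because their
	 * offsets are opaque cookies rather than byte offsets.
	 */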
	if (vp->v_type != VDIR &&
	    (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	biosize = vp->v_mount->mnt_stat.f_iosize;
	seqcount = (int)((off_t)(ioflag >> 16) * biosize / BKVASIZE);
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nqnfs, full cache consistency is maintained within the loop.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up-to-date
	 * attributes.
	 * NB: This implies that cache data can be read when up to
	 * NFS_ATTRTIMEO seconds out of date.  If you find that you need
	 * current attributes, this can be forced by setting n_attrstamp
	 * to 0 before the VOP_GETATTR() call.
	 */
	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
		if (np->n_flag & NMODIFIED) {
			if (vp->v_type != VREG) {
				if (vp->v_type != VDIR)
					panic("nfs: bioread, not dir");
				nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1);
				if (error)
					return (error);
			}
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, td);
			if (error)
				return (error);
			np->n_mtime = vattr.va_mtime.tv_sec;
		} else {
			error = VOP_GETATTR(vp, &vattr, cred, td);
			if (error)
				return (error);
			if (np->n_mtime != vattr.va_mtime.tv_sec) {
				if (vp->v_type == VDIR)
					nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1);
				if (error)
					return (error);
				np->n_mtime = vattr.va_mtime.tv_sec;
			}
		}
	}
	do {

		/*
		 * Get a valid lease.  If cached data is stale, flush it.
		 */
		if (nmp->nm_flag & NFSMNT_NQNFS) {
			if (NQNFS_CKINVALID(vp, np, ND_READ)) {
				do {
					error = nqnfs_getlease(vp, ND_READ, cred, td);
				} while (error == NQNFS_EXPIRED);
				if (error)
					return (error);
				if (np->n_lrev != np->n_brev ||
				    (np->n_flag & NQNFSNONCACHE) ||
				    ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
					if (vp->v_type == VDIR)
						nfs_invaldir(vp);
					error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1);
					if (error)
						return (error);
					np->n_brev = np->n_lrev;
				}
			} else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
				nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1);
				if (error)
					return (error);
			}
		}
		if (np->n_flag & NQNFSNONCACHE) {
			switch (vp->v_type) {
			case VREG:
				return (nfs_readrpc(vp, uio, cred));
			case VLNK:
				return (nfs_readlinkrpc(vp, uio, cred));
			case VDIR:
				break;
			default:
				printf(" NQNFSNONCACHE: type %x unexpected\n",
				    vp->v_type);
			}
		}
		switch (vp->v_type) {
		case VREG:
			nfsstats.biocache_reads++;
			lbn = uio->uio_offset / biosize;
			on = uio->uio_offset & (biosize - 1);

			/*
			 * Start the read ahead(s), as required.
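			 * Read-ahead blocks are handed to the nfsiods through
			 * nfs_asyncio(); if that fails the block is
			 * invalidated and released rather than left
			 * half-constructed.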
			 */
			if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
				for (nra = 0; nra < nmp->nm_readahead && nra < seqcount &&
				    (off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
					rabn = lbn + 1 + nra;
					if (!incore(vp, rabn)) {
						rabp = nfs_getcacheblk(vp, rabn, biosize, td);
						if (!rabp)
							return (EINTR);
						if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
							rabp->b_flags |= B_ASYNC;
							rabp->b_iocmd = BIO_READ;
							vfs_busy_pages(rabp, 0);
							if (nfs_asyncio(rabp, cred, td)) {
								rabp->b_flags |= B_INVAL;
								rabp->b_ioflags |= BIO_ERROR;
								vfs_unbusy_pages(rabp);
								brelse(rabp);
								break;
							}
						} else {
							brelse(rabp);
						}
					}
				}
			}

			/*
			 * Obtain the buffer cache block.  Figure out the buffer size
			 * when we are at EOF.  If we are modifying the size of the
			 * buffer based on an EOF condition we need to hold
			 * nfs_rslock() through obtaining the buffer to prevent
			 * a potential writer-appender from messing with n_size.
			 * Otherwise we may accidentally truncate the buffer and
			 * lose dirty data.
			 *
			 * Note that bcount is *not* DEV_BSIZE aligned.
			 */

again:
			bcount = biosize;
			if ((off_t)lbn * biosize >= np->n_size) {
				bcount = 0;
			} else if ((off_t)(lbn + 1) * biosize > np->n_size) {
				bcount = np->n_size - (off_t)lbn * biosize;
			}
			if (bcount != biosize) {
				switch (nfs_rslock(np, td)) {
				case ENOLCK:
					goto again;
					/* not reached */
				case EINTR:
				case ERESTART:
					return (EINTR);
					/* not reached */
				default:
					break;
				}
			}

			bp = nfs_getcacheblk(vp, lbn, bcount, td);

			if (bcount != biosize)
				nfs_rsunlock(np, td);
			if (!bp)
				return (EINTR);

			/*
			 * If B_CACHE is not set, we must issue the read.  If this
			 * fails, we return an error.
			 */

			if ((bp->b_flags & B_CACHE) == 0) {
				bp->b_iocmd = BIO_READ;
				vfs_busy_pages(bp, 0);
				error = nfs_doio(bp, cred, td);
				if (error) {
					brelse(bp);
					return (error);
				}
			}

			/*
			 * on is the offset into the current bp.  Figure out how many
			 * bytes we can copy out of the bp.  Note that bcount is
			 * NOT DEV_BSIZE aligned.
			 *
			 * Then figure out how many bytes we can copy into the uio.
			 */

			n = 0;
			if (on < bcount)
				n = min((unsigned)(bcount - on), uio->uio_resid);
			break;
		case VLNK:
			nfsstats.biocache_readlinks++;
			bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, td);
			if (!bp)
				return (EINTR);
			if ((bp->b_flags & B_CACHE) == 0) {
				bp->b_iocmd = BIO_READ;
				vfs_busy_pages(bp, 0);
				error = nfs_doio(bp, cred, td);
				if (error) {
					bp->b_ioflags |= BIO_ERROR;
					brelse(bp);
					return (error);
				}
			}
			n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
			on = 0;
			break;
		case VDIR:
			nfsstats.biocache_readdirs++;
			if (np->n_direofoffset
			    && uio->uio_offset >= np->n_direofoffset) {
				return (0);
			}
			lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ;
			on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
			bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, td);
			if (!bp)
				return (EINTR);
			if ((bp->b_flags & B_CACHE) == 0) {
				bp->b_iocmd = BIO_READ;
				vfs_busy_pages(bp, 0);
				error = nfs_doio(bp, cred, td);
				if (error) {
					brelse(bp);
				}
				while (error == NFSERR_BAD_COOKIE) {
					printf("got bad cookie vp %p bp %p\n", vp, bp);
					nfs_invaldir(vp);
					error = nfs_vinvalbuf(vp, 0, cred, td, 1);
					/*
					 * Yuck!  The directory has been modified on the
					 * server.  The only way to get the block is by
					 * reading from the beginning to get all the
					 * offset cookies.
					 *
					 * Leave the last bp intact unless there is an
					 * error.  Loop back up to the while if the error
					 * is another NFSERR_BAD_COOKIE (double yuck!).
					 */
					for (i = 0; i <= lbn && !error; i++) {
						if (np->n_direofoffset
						    && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
							return (0);
						bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, td);
						if (!bp)
							return (EINTR);
						if ((bp->b_flags & B_CACHE) == 0) {
							bp->b_iocmd = BIO_READ;
							vfs_busy_pages(bp, 0);
							error = nfs_doio(bp, cred, td);
							/*
							 * no error + B_INVAL == directory
							 * EOF, use the block.
							 */
							if (error == 0 && (bp->b_flags & B_INVAL))
								break;
						}
						/*
						 * An error will throw away the block and the
						 * for loop will break out.  If no error and this
						 * is not the block we want, we throw away the
						 * block and go for the next one via the for loop.
						 */
						if (error || i < lbn)
							brelse(bp);
					}
				}
				/*
				 * The above while is repeated if we hit another cookie
				 * error.  If we hit an error and it wasn't a cookie error,
				 * we give up.
				 */
				if (error)
					return (error);
			}

			/*
			 * If not at EOF and read-aheads are enabled, start one.
			 * (You need the current block first, so that you have the
			 *  directory offset cookie of the next block.)
			 */
			if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
			    (bp->b_flags & B_INVAL) == 0 &&
			    (np->n_direofoffset == 0 ||
			    (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
			    !(np->n_flag & NQNFSNONCACHE) &&
			    !incore(vp, lbn + 1)) {
				rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, td);
				if (rabp) {
					if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
						rabp->b_flags |= B_ASYNC;
						rabp->b_iocmd = BIO_READ;
						vfs_busy_pages(rabp, 0);
						if (nfs_asyncio(rabp, cred, td)) {
							rabp->b_flags |= B_INVAL;
							rabp->b_ioflags |= BIO_ERROR;
							vfs_unbusy_pages(rabp);
							brelse(rabp);
						}
					} else {
						brelse(rabp);
					}
				}
			}
			/*
			 * Unlike VREG files, whose buffer size ( bp->b_bcount ) is
			 * chopped for the EOF condition, we cannot tell how large
			 * NFS directories are going to be until we hit EOF.  So
			 * an NFS directory buffer is *not* chopped to its EOF.  Now,
			 * it just so happens that b_resid will effectively chop it
			 * to EOF.  *BUT* this information is lost if the buffer goes
			 * away and is reconstituted into a B_CACHE state ( due to
			 * being VMIO ) later.  So we keep track of the directory EOF
			 * in np->n_direofoffset and chop it off as an extra step
			 * right here.
			 */
			n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
			if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset)
				n = np->n_direofoffset - uio->uio_offset;
			break;
		default:
			printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
			break;
		}

		if (n > 0) {
			error = uiomove(bp->b_data + on, (int)n, uio);
		}
		switch (vp->v_type) {
		case VREG:
			break;
		case VLNK:
			n = 0;
			break;
		case VDIR:
			/*
			 * Invalidate buffer if caching is disabled, forcing a
			 * re-read from the remote later.
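			 * NQNFSNONCACHE means the nqnfs lease forbids caching,
			 * so the block must be discarded as soon as this read
			 * has consumed it.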
			 */
			if (np->n_flag & NQNFSNONCACHE)
				bp->b_flags |= B_INVAL;
			break;
		default:
			printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
		}
		brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}

/*
 * Vnode op for write using bio
 */
int
nfs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	int biosize;
	struct uio *uio = ap->a_uio;
	struct thread *td = uio->uio_td;
	struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn;
	int bcount;
	int n, on, error = 0, iomode, must_commit;
	int haverslock = 0;
	struct proc *p = td ? td->td_proc : NULL;

	GIANT_REQUIRED;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_td != curthread)
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
		(void)nfs_fsinfo(nmp, vp, cred, td);

	/*
	 * Synchronously flush pending buffers if we are in synchronous
	 * mode or if we are appending.
	 */
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1);
			if (error)
				return (error);
		}
	}

	/*
	 * If IO_APPEND then load uio_offset.  We restart here if we cannot
	 * get the append lock.
	 */
restart:
	if (ioflag & IO_APPEND) {
		np->n_attrstamp = 0;
		error = VOP_GETATTR(vp, &vattr, cred, td);
		if (error)
			return (error);
		uio->uio_offset = np->n_size;
	}

	if (uio->uio_offset < 0)
		return (EINVAL);
	if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	if (uio->uio_resid == 0)
		return (0);

	/*
	 * We need to obtain the rslock if we intend to modify np->n_size
	 * in order to guarantee the append point with multiple contending
	 * writers, to guarantee that no other appenders modify n_size
	 * while we are trying to obtain a truncated buffer (i.e. to avoid
	 * accidentally truncating data written by another appender due to
	 * the race), and to ensure that the buffer is populated prior to
	 * our extending of the file.  We hold rslock through the entire
	 * operation.
	 *
	 * Note that we do not synchronize the case where someone truncates
	 * the file while we are appending to it because attempting to lock
	 * this case may deadlock other parts of the system unexpectedly.
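	 *
	 * nfs_rslock() is the same lock nfs_bioread() takes when it sizes
	 * a buffer at EOF, so readers and appenders agree on n_size.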
	 */
	if ((ioflag & IO_APPEND) ||
	    uio->uio_offset + uio->uio_resid > np->n_size) {
		switch (nfs_rslock(np, td)) {
		case ENOLCK:
			goto restart;
			/* not reached */
		case EINTR:
		case ERESTART:
			return (EINTR);
			/* not reached */
		default:
			break;
		}
		haverslock = 1;
	}

	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	if (p && uio->uio_offset + uio->uio_resid >
	    p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		PROC_LOCK(p);
		psignal(p, SIGXFSZ);
		PROC_UNLOCK(p);
		if (haverslock)
			nfs_rsunlock(np, td);
		return (EFBIG);
	}

	biosize = vp->v_mount->mnt_stat.f_iosize;

	do {
		/*
		 * Check for a valid write lease.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, td);
			} while (error == NQNFS_EXPIRED);
			if (error)
				break;
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1);
				if (error)
					break;
				np->n_brev = np->n_lrev;
			}
		}
		if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
			iomode = NFSV3WRITE_FILESYNC;
			error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
			if (must_commit)
				nfs_clearcommit(vp->v_mount);
			break;
		}
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);
		n = min((unsigned)(biosize - on), uio->uio_resid);
again:
		/*
		 * Handle direct append and file extension cases, calculate
		 * unaligned buffer size.
		 */

		if (uio->uio_offset == np->n_size && n) {
			/*
			 * Get the buffer (in its pre-append state to maintain
			 * B_CACHE if it was previously set).  Resize the
			 * nfsnode after we have locked the buffer to prevent
			 * readers from reading garbage.
			 */
			bcount = on;
			bp = nfs_getcacheblk(vp, lbn, bcount, td);

			if (bp != NULL) {
				long save;

				np->n_size = uio->uio_offset + n;
				np->n_flag |= NMODIFIED;
				vnode_pager_setsize(vp, np->n_size);

				save = bp->b_flags & B_CACHE;
				bcount += n;
				allocbuf(bp, bcount);
				bp->b_flags |= save;
				bp->b_magic = B_MAGIC_NFS;
				bp->b_op = &buf_ops_nfs;
			}
		} else {
			/*
			 * Obtain the locked cache block first, and then
			 * adjust the file's size as appropriate.
			 */
			bcount = on + n;
			if ((off_t)lbn * biosize + bcount < np->n_size) {
				if ((off_t)(lbn + 1) * biosize < np->n_size)
					bcount = biosize;
				else
					bcount = np->n_size - (off_t)lbn * biosize;
			}

			bp = nfs_getcacheblk(vp, lbn, bcount, td);

			if (uio->uio_offset + n > np->n_size) {
				np->n_size = uio->uio_offset + n;
				np->n_flag |= NMODIFIED;
				vnode_pager_setsize(vp, np->n_size);
			}
		}

		if (!bp) {
			error = EINTR;
			break;
		}

		/*
		 * Issue a READ if B_CACHE is not set.  In special-append
		 * mode, B_CACHE is based on the buffer prior to the write
		 * op and is typically set, avoiding the read.  If a read
		 * is required in special append mode, the server will
		 * probably send us a short-read since we extended the file
		 * on our end, resulting in b_resid == 0 and thus
		 * B_CACHE getting set.
		 *
		 * We can also avoid issuing the read if the write covers
		 * the entire buffer.
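		 * In that case the read would be wasted work, since the
		 * uiomove() below overwrites every byte of the block.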
		 * We have to make sure the buffer state
		 * is reasonable in this case since we will not be initiating
		 * I/O.  See the comments in kern/vfs_bio.c's getblk() for
		 * more information.
		 *
		 * B_CACHE may also be set due to the buffer being cached
		 * normally.
		 */

		if (on == 0 && n == bcount) {
			bp->b_flags |= B_CACHE;
			bp->b_flags &= ~B_INVAL;
			bp->b_ioflags &= ~BIO_ERROR;
		}

		if ((bp->b_flags & B_CACHE) == 0) {
			bp->b_iocmd = BIO_READ;
			vfs_busy_pages(bp, 0);
			error = nfs_doio(bp, cred, td);
			if (error) {
				brelse(bp);
				break;
			}
		}
		if (!bp) {
			error = EINTR;
			break;
		}
		if (bp->b_wcred == NOCRED) {
			crhold(cred);
			bp->b_wcred = cred;
		}
		np->n_flag |= NMODIFIED;

		/*
		 * If dirtyend exceeds file size, chop it down.  This should
		 * not normally occur but there is an append race where it
		 * might occur XXX, so we log it.
		 *
		 * If the chopping creates a reverse-indexed or degenerate
		 * situation with dirtyoff/end, we 0 both of them.
		 */

		if (bp->b_dirtyend > bcount) {
			printf("NFS append race @%lx:%d\n",
			    (long)bp->b_blkno * DEV_BSIZE,
			    bp->b_dirtyend - bcount);
			bp->b_dirtyend = bcount;
		}

		if (bp->b_dirtyoff >= bp->b_dirtyend)
			bp->b_dirtyoff = bp->b_dirtyend = 0;

		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise force a write rpc of the old dirty area.
		 *
		 * While it is possible to merge discontiguous writes due to
		 * our having a B_CACHE buffer ( and thus valid read data
		 * for the hole), we don't because it could lead to
		 * significant cache coherency problems with multiple clients,
		 * especially if locking is implemented later on.
		 *
		 * As an optimization we could theoretically maintain
		 * a linked list of discontinuous areas, but we would still
		 * have to commit them separately so there isn't much
		 * advantage to it except perhaps a bit of asynchronization.
		 */

		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			if (BUF_WRITE(bp) == EINTR)
				return (EINTR);
			goto again;
		}

		/*
		 * Check for valid write lease and get one as required.
		 * In case getblk() and/or bwrite() delayed us.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, td);
			} while (error == NQNFS_EXPIRED);
			if (error) {
				brelse(bp);
				break;
			}
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				brelse(bp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1);
				if (error)
					break;
				np->n_brev = np->n_lrev;
				goto again;
			}
		}

		error = uiomove((char *)bp->b_data + on, n, uio);

		/*
		 * Since this block is being modified, it must be written
		 * again and not just committed.  Since write clustering does
		 * not work for the stage 1 data write, only the stage 2
		 * commit rpc, we have to clear B_CLUSTEROK as well.
		 */
		bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);

		if (error) {
			bp->b_ioflags |= BIO_ERROR;
			brelse(bp);
			break;
		}

		/*
		 * Only update dirtyoff/dirtyend if not a degenerate
		 * condition.
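		 * (A zero-length write must not manufacture a bogus
		 * dirty region.)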
		 */
		if (n) {
			if (bp->b_dirtyend > 0) {
				bp->b_dirtyoff = min(on, bp->b_dirtyoff);
				bp->b_dirtyend = max((on + n), bp->b_dirtyend);
			} else {
				bp->b_dirtyoff = on;
				bp->b_dirtyend = on + n;
			}
			vfs_bio_set_validclean(bp, on, n);
		}

		/*
		 * If the lease is non-cacheable or IO_SYNC do bwrite().
		 *
		 * IO_INVAL appears to be unused.  The idea appears to be
		 * to turn off caching in this case.  Very odd.  XXX
		 */
		if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
			if (ioflag & IO_INVAL)
				bp->b_flags |= B_NOCACHE;
			error = BUF_WRITE(bp);
			if (error)
				break;
			if (np->n_flag & NQNFSNONCACHE) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1);
				if (error)
					break;
			}
		} else if ((n + on) == biosize &&
		    (nmp->nm_flag & NFSMNT_NQNFS) == 0) {
			bp->b_flags |= B_ASYNC;
			(void)nfs_writebp(bp, 0, 0);
		} else {
			bdwrite(bp);
		}
	} while (uio->uio_resid > 0 && n > 0);

	if (haverslock)
		nfs_rsunlock(np, td);

	return (error);
}

/*
 * Get an nfs cache block.
 *
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy.  If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 *
 * The caller must carefully deal with the possible B_INVAL state of
 * the buffer.  nfs_doio() clears B_INVAL (and nfs_asyncio() clears it
 * indirectly), so synchronous reads can be issued without worrying about
 * the B_INVAL state.  We have to be a little more careful when dealing
 * with writes (see comments in nfs_write()) when extending a file past
 * its EOF.
 */
static struct buf *
nfs_getcacheblk(vp, bn, size, td)
	struct vnode *vp;
	daddr_t bn;
	int size;
	struct thread *td;
{
	register struct buf *bp;
	struct mount *mp;
	struct nfsmount *nmp;

	mp = vp->v_mount;
	nmp = VFSTONFS(mp);

	if (nmp->nm_flag & NFSMNT_INT) {
		bp = getblk(vp, bn, size, PCATCH, 0);
		while (bp == (struct buf *)0) {
			if (nfs_sigintr(nmp, (struct nfsreq *)0, td->td_proc))
				return ((struct buf *)0);
			bp = getblk(vp, bn, size, 0, 2 * hz);
		}
	} else {
		bp = getblk(vp, bn, size, 0, 0);
	}

	if (vp->v_type == VREG) {
		int biosize;

		biosize = mp->mnt_stat.f_iosize;
		bp->b_blkno = bn * (biosize / DEV_BSIZE);
	}
	return (bp);
}

/*
 * Flush and invalidate all dirty buffers.  If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(vp, flags, cred, td, intrflg)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct thread *td;
	int intrflg;
{
	register struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;

	if (vp->v_flag & VXLOCK) {
		return (0);
	}

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}
	/*
	 * First wait for any other process doing a flush to complete.
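	 * NFLUSHINPROG marks the active flusher; NFLUSHWANT records that
	 * someone is sleeping on np->n_flag waiting for it to finish.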
	 */
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
		    slptimeo);
		if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, td->td_proc))
			return (EINTR);
	}

	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, cred, td, slpflag, 0);
	while (error) {
		if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, td->td_proc)) {
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup((caddr_t)&np->n_flag);
			}
			return (EINTR);
		}
		error = vinvalbuf(vp, flags, cred, td, 0, slptimeo);
	}
	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
	return (0);
}

/*
 * Initiate asynchronous I/O.  Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 *
 * Note: nfs_asyncio() does not clear (BIO_ERROR|B_INVAL) but when the bp
 * is eventually dequeued by the async daemon, nfs_doio() *will*.
 */
int
nfs_asyncio(bp, cred, td)
	register struct buf *bp;
	struct ucred *cred;
	struct thread *td;
{
	struct nfsmount *nmp;
	int i;
	int gotiod;
	int slpflag = 0;
	int slptimeo = 0;
	int error;

	/*
	 * If no async daemons then return EIO to force caller to run the rpc
	 * synchronously.
	 */
	if (nfs_numasync == 0)
		return (EIO);

	nmp = VFSTONFS(bp->b_vp->v_mount);

	/*
	 * Commits are usually short and sweet so let's save some cpu and
	 * leave the async daemons for more important rpc's (such as reads
	 * and writes).
	 */
	if (bp->b_iocmd == BIO_WRITE && (bp->b_flags & B_NEEDCOMMIT) &&
	    (nmp->nm_bufqiods > nfs_numasync / 2)) {
		return (EIO);
	}

again:
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	gotiod = FALSE;

	/*
	 * Find a free iod to process this request.
	 */
	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
		if (nfs_iodwant[i]) {
			/*
			 * Found one, so wake it up and tell it which
			 * mount to process.
			 */
			NFS_DPF(ASYNCIO,
			    ("nfs_asyncio: waking iod %d for mount %p\n",
			    i, nmp));
			nfs_iodwant[i] = (struct proc *)0;
			nfs_iodmount[i] = nmp;
			nmp->nm_bufqiods++;
			wakeup((caddr_t)&nfs_iodwant[i]);
			gotiod = TRUE;
			break;
		}

	/*
	 * If none are free, we may already have an iod working on this mount
	 * point.  If so, it will process our request.
	 */
	if (!gotiod) {
		if (nmp->nm_bufqiods > 0) {
			NFS_DPF(ASYNCIO,
			    ("nfs_asyncio: %d iods are already processing mount %p\n",
			    nmp->nm_bufqiods, nmp));
			gotiod = TRUE;
		}
	}

	/*
	 * If we have an iod which can process the request, then queue
	 * the buffer.
	 */
	if (gotiod) {
		/*
		 * Ensure that the queue never grows too large.  We still want
		 * to asynchronize so we block rather than return EIO.
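		 * The queue is capped at twice the number of nfsiods so
		 * that a slow server cannot tie up unbounded buffer memory.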
		 */
		while (nmp->nm_bufqlen >= 2 * nfs_numasync) {
			NFS_DPF(ASYNCIO,
			    ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
			nmp->nm_bufqwant = TRUE;
			error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
			    "nfsaio", slptimeo);
			if (error) {
				if (nfs_sigintr(nmp, NULL, td->td_proc))
					return (EINTR);
				if (slpflag == PCATCH) {
					slpflag = 0;
					slptimeo = 2 * hz;
				}
			}
			/*
			 * We might have lost our iod while sleeping,
			 * so check and loop if necessary.
			 */
			if (nmp->nm_bufqiods == 0) {
				NFS_DPF(ASYNCIO,
				    ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
				goto again;
			}
		}

		if (bp->b_iocmd == BIO_READ) {
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
		} else {
			bp->b_flags |= B_WRITEINPROG;
			if (bp->b_wcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_wcred = cred;
			}
		}

		BUF_KERNPROC(bp);
		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
		nmp->nm_bufqlen++;
		return (0);
	}

	/*
	 * All the iods are busy on other mounts, so return EIO to
	 * force the caller to process the i/o synchronously.
	 */
	NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
	return (EIO);
}

/*
 * Do an I/O operation to/from a cache block.  This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(bp, cr, td)
	struct buf *bp;
	struct ucred *cr;
	struct thread *td;
{
	struct uio *uiop;
	struct vnode *vp;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, iomode, must_commit = 0;
	struct uio uio;
	struct iovec io;
	struct proc *p = td ? td->td_proc : NULL;

	vp = bp->b_vp;
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_td = td;

	/*
	 * clear BIO_ERROR and B_INVAL state prior to initiating the I/O.  We
	 * do this here so we do not have to do it in all the code that
	 * calls us.
	 */
	bp->b_flags &= ~B_INVAL;
	bp->b_ioflags &= ~BIO_ERROR;

	KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp));

	/*
	 * Historically, paging was done with physio, but no more.
	 */
	if (bp->b_flags & B_PHYS) {
		/*
		 * ...though reading /dev/drum still gets us here.
		 */
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		/* mapping was done by vmapbuf() */
		io.iov_base = bp->b_data;
		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
		if (bp->b_iocmd == BIO_READ) {
			uiop->uio_rw = UIO_READ;
			nfsstats.read_physios++;
			error = nfs_readrpc(vp, uiop, cr);
		} else {
			int com;

			iomode = NFSV3WRITE_DATASYNC;
			uiop->uio_rw = UIO_WRITE;
			nfsstats.write_physios++;
			error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
		}
		if (error) {
			bp->b_ioflags |= BIO_ERROR;
			bp->b_error = error;
		}
	} else if (bp->b_iocmd == BIO_READ) {
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		io.iov_base = bp->b_data;
		uiop->uio_rw = UIO_READ;
		switch (vp->v_type) {
		case VREG:
			uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
			nfsstats.read_bios++;
			error = nfs_readrpc(vp, uiop, cr);
			if (!error) {
				if (uiop->uio_resid) {
					/*
					 * If we had a short read with no error, we must
					 * have hit a file hole.  We should zero-fill the
					 * remainder.  This can also occur if the server
					 * hits the file EOF.
					 *
					 * Holes used to be able to occur due to pending
					 * writes, but that is not possible any longer.
					 */
					int nread = bp->b_bcount - uiop->uio_resid;
					int left = bp->b_bcount - nread;

					if (left > 0)
						bzero((char *)bp->b_data + nread, left);
					uiop->uio_resid = 0;
				}
			}
			if (p && (vp->v_flag & VTEXT) &&
			    (((nmp->nm_flag & NFSMNT_NQNFS) &&
			    NQNFS_CKINVALID(vp, np, ND_READ) &&
			    np->n_lrev != np->n_brev) ||
			    (!(nmp->nm_flag & NFSMNT_NQNFS) &&
			    np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
				uprintf("Process killed due to text file modification\n");
				PROC_LOCK(p);
				psignal(p, SIGKILL);
				_PHOLD(p);
				PROC_UNLOCK(p);
			}
			break;
		case VLNK:
			uiop->uio_offset = (off_t)0;
			nfsstats.readlink_bios++;
			error = nfs_readlinkrpc(vp, uiop, cr);
			break;
		case VDIR:
			nfsstats.readdir_bios++;
			uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
			if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
				error = nfs_readdirplusrpc(vp, uiop, cr);
				if (error == NFSERR_NOTSUPP)
					nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
			}
			if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
				error = nfs_readdirrpc(vp, uiop, cr);
			/*
			 * end-of-directory sets B_INVAL but does not generate an
			 * error.
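			 * (uio_resid still equal to b_bcount means the server
			 * returned no entries at all.)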
			 */
			if (error == 0 && uiop->uio_resid == bp->b_bcount)
				bp->b_flags |= B_INVAL;
			break;
		default:
			printf("nfs_doio: type %x unexpected\n", vp->v_type);
			break;
		}
		if (error) {
			bp->b_ioflags |= BIO_ERROR;
			bp->b_error = error;
		}
	} else {
		/*
		 * If we only need to commit, try to commit
		 */
		if (bp->b_flags & B_NEEDCOMMIT) {
			int retv;
			off_t off;

			off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff;
			bp->b_flags |= B_WRITEINPROG;
			retv = nfs_commit(
				bp->b_vp, off, bp->b_dirtyend - bp->b_dirtyoff,
				bp->b_wcred, td);
			bp->b_flags &= ~B_WRITEINPROG;
			if (retv == 0) {
				bp->b_dirtyoff = bp->b_dirtyend = 0;
				bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
				bp->b_resid = 0;
				bufdone(bp);
				return (0);
			}
			if (retv == NFSERR_STALEWRITEVERF) {
				nfs_clearcommit(bp->b_vp->v_mount);
			}
		}

		/*
		 * Setup for actual write
		 */

		if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
			bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE;

		if (bp->b_dirtyend > bp->b_dirtyoff) {
			io.iov_len = uiop->uio_resid = bp->b_dirtyend
			    - bp->b_dirtyoff;
			uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE
			    + bp->b_dirtyoff;
			io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
			uiop->uio_rw = UIO_WRITE;
			nfsstats.write_bios++;

			if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC)
				iomode = NFSV3WRITE_UNSTABLE;
			else
				iomode = NFSV3WRITE_FILESYNC;

			bp->b_flags |= B_WRITEINPROG;
			error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);

			/*
			 * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try
			 * to cluster the buffers needing commit.  This will allow
			 * the system to submit a single commit rpc for the whole
			 * cluster.  We can do this even if the buffer is not 100%
			 * dirty (relative to the NFS blocksize), so we optimize the
			 * append-to-file case.
			 *
			 * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be
			 * cleared because write clustering only works for commit
			 * rpc's, not for the data portion of the write).
			 */

			if (!error && iomode == NFSV3WRITE_UNSTABLE) {
				bp->b_flags |= B_NEEDCOMMIT;
				if (bp->b_dirtyoff == 0
				    && bp->b_dirtyend == bp->b_bcount)
					bp->b_flags |= B_CLUSTEROK;
			} else {
				bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
			}
			bp->b_flags &= ~B_WRITEINPROG;

			/*
			 * For an interrupted write, the buffer is still valid
			 * and the write hasn't been pushed to the server yet,
			 * so we can't set BIO_ERROR and report the interruption
			 * by setting B_EINTR.  For the B_ASYNC case, B_EINTR
			 * is not relevant, so the rpc attempt is essentially
			 * a noop.  For the case of a V3 write rpc not being
			 * committed to stable storage, the block is still
			 * dirty and requires either a commit rpc or another
			 * write rpc with iomode == NFSV3WRITE_FILESYNC before
			 * the block is reused.  This is indicated by setting
			 * the B_DELWRI and B_NEEDCOMMIT flags.
			 *
			 * If the buffer is marked B_PAGING, it does not reside on
			 * the vp's paging queues so we cannot call bdirty().  The
			 * bp in this case is not an NFS cache block so we should
			 * be safe. XXX
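			 *
			 * The splbio() window below keeps the buffer queues
			 * consistent while the buffer is redirtied.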
			 */
			if (error == EINTR
			    || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
				int s;

				s = splbio();
				bp->b_flags &= ~(B_INVAL|B_NOCACHE);
				if ((bp->b_flags & B_PAGING) == 0) {
					bdirty(bp);
					bp->b_flags &= ~B_DONE;
				}
				if (error && (bp->b_flags & B_ASYNC) == 0)
					bp->b_flags |= B_EINTR;
				splx(s);
			} else {
				if (error) {
					bp->b_ioflags |= BIO_ERROR;
					bp->b_error = np->n_error = error;
					np->n_flag |= NWRITEERR;
				}
				bp->b_dirtyoff = bp->b_dirtyend = 0;
			}
		} else {
			bp->b_resid = 0;
			bufdone(bp);
			return (0);
		}
	}
	bp->b_resid = uiop->uio_resid;
	if (must_commit)
		nfs_clearcommit(vp->v_mount);
	bufdone(bp);
	return (error);
}