/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/nfsclient/nfs_bio.c 127977 2004-04-07 05:00:01Z imp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <rpc/rpcclnt.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfsclient/nfs.h>
#include <nfsclient/nfsmount.h>
#include <nfsclient/nfsnode.h>

#include <nfs4client/nfs4.h>

/*
 * Just call nfs_writebp() with the force argument set to 1.
 *
 * NOTE: B_DONE may or may not be set in a_bp on call.
 */
static int
nfs4_bwrite(struct buf *bp)
{

	return (nfs4_writebp(bp, 1, curthread));
}

static int
nfs_bwrite(struct buf *bp)
{

	return (nfs_writebp(bp, 1, curthread));
}

struct buf_ops buf_ops_nfs4 = {
	"buf_ops_nfs4",
	nfs4_bwrite
};

struct buf_ops buf_ops_nfs = {
	"buf_ops_nfs",
	nfs_bwrite
};

static struct buf *nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size,
		    struct thread *td);

/*
 * Vnode op for VM getpages.
 */
int
nfs_getpages(struct vop_getpages_args *ap)
{
	int i, error, nextoff, size, toff, count, npages;
	struct uio uio;
	struct iovec iov;
	vm_offset_t kva;
	struct buf *bp;
	struct vnode *vp;
	struct thread *td;
	struct ucred *cred;
	struct nfsmount *nmp;
	vm_object_t object;
	vm_page_t *pages;

	GIANT_REQUIRED;

	vp = ap->a_vp;
	td = curthread;				/* XXX */
	cred = curthread->td_ucred;		/* XXX */
	nmp = VFSTONFS(vp->v_mount);
	pages = ap->a_m;
	count = ap->a_count;

	if ((object = vp->v_object) == NULL) {
		printf("nfs_getpages: called with non-merged cache vnode??\n");
		return VM_PAGER_ERROR;
	}

	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
		/* We'll never get here for v4, because we always have fsinfo */
		(void)nfs_fsinfo(nmp, vp, cred, td);
	}

	npages = btoc(count);

	/*
	 * If the requested page is partially valid, just return it and
	 * allow the pager to zero-out the blanks.  Partially valid pages
	 * can only occur at the file EOF.
	 */

	{
		vm_page_t m = pages[ap->a_reqpage];

		VM_OBJECT_LOCK(object);
		vm_page_lock_queues();
		if (m->valid != 0) {
			/* handled by vm_fault now	  */
			/* vm_page_zero_invalid(m, TRUE); */
			for (i = 0; i < npages; ++i) {
				if (i != ap->a_reqpage)
					vm_page_free(pages[i]);
			}
			vm_page_unlock_queues();
			VM_OBJECT_UNLOCK(object);
			return(0);
		}
		vm_page_unlock_queues();
		VM_OBJECT_UNLOCK(object);
	}

	/*
	 * We use only the kva address for the buffer, but this is extremely
	 * convenient and fast.
	 */
	bp = getpbuf(&nfs_pbuf_freecnt);

	kva = (vm_offset_t) bp->b_data;
	pmap_qenter(kva, pages, npages);
	cnt.v_vnodein++;
	cnt.v_vnodepgsin += npages;

	iov.iov_base = (caddr_t) kva;
	iov.iov_len = count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
	uio.uio_resid = count;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_READ;
	uio.uio_td = td;

	error = (nmp->nm_rpcops->nr_readrpc)(vp, &uio, cred);
	pmap_qremove(kva, npages);

	relpbuf(bp, &nfs_pbuf_freecnt);

	if (error && (uio.uio_resid == count)) {
		printf("nfs_getpages: error %d\n", error);
		VM_OBJECT_LOCK(object);
		vm_page_lock_queues();
		for (i = 0; i < npages; ++i) {
			if (i != ap->a_reqpage)
				vm_page_free(pages[i]);
		}
		vm_page_unlock_queues();
		VM_OBJECT_UNLOCK(object);
		return VM_PAGER_ERROR;
	}

	/*
	 * Calculate the number of bytes read and validate only that number
	 * of bytes.  Note that due to pending writes, size may be 0.  This
	 * does not mean that the remaining data is invalid!
	 */

	size = count - uio.uio_resid;
	VM_OBJECT_LOCK(object);
	vm_page_lock_queues();
	for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
		vm_page_t m;
		nextoff = toff + PAGE_SIZE;
		m = pages[i];

		m->flags &= ~PG_ZERO;

		if (nextoff <= size) {
			/*
			 * Read operation filled an entire page
			 */
			m->valid = VM_PAGE_BITS_ALL;
			vm_page_undirty(m);
		} else if (size > toff) {
			/*
			 * Read operation filled a partial page.
			 */
			m->valid = 0;
			vm_page_set_validclean(m, 0, size - toff);
			/* handled by vm_fault now	  */
			/* vm_page_zero_invalid(m, TRUE); */
		} else {
			/*
			 * Read operation was short.  If no error occurred
			 * we may have hit a zero-fill section.  We simply
			 * leave valid set to 0.
			 */
			;
		}
		if (i != ap->a_reqpage) {
			/*
			 * Whether or not to leave the page activated is up in
			 * the air, but we should put the page on a page queue
			 * somewhere (it already is in the object).  Empirical
			 * results show that deactivating pages is best.
			 */

			/*
			 * Just in case someone was asking for this page we
			 * now tell them that it is ok to use.
			 */
			if (!error) {
				if (m->flags & PG_WANTED)
					vm_page_activate(m);
				else
					vm_page_deactivate(m);
				vm_page_wakeup(m);
			} else {
				vm_page_free(m);
			}
		}
	}
	vm_page_unlock_queues();
	VM_OBJECT_UNLOCK(object);
	return 0;
}
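
/*
 * Illustrative example (not part of the original file): the partial-page
 * validation above.  If the read RPC returns size = 10000 bytes with
 * PAGE_SIZE = 4096, pages 0 and 1 end at nextoff 4096 and 8192, both
 * <= size, so they become fully valid.  Page 2 starts at toff = 8192,
 * so only size - toff = 1808 bytes are marked valid and clean; the
 * remainder is left invalid for the VM system to zero on demand.
 */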

/*
 * Vnode op for VM putpages.
 */
int
nfs_putpages(struct vop_putpages_args *ap)
{
	struct uio uio;
	struct iovec iov;
	vm_offset_t kva;
	struct buf *bp;
	int iomode, must_commit, i, error, npages, count;
	off_t offset;
	int *rtvals;
	struct vnode *vp;
	struct thread *td;
	struct ucred *cred;
	struct nfsmount *nmp;
	struct nfsnode *np;
	vm_page_t *pages;

	GIANT_REQUIRED;

	vp = ap->a_vp;
	np = VTONFS(vp);
	td = curthread;				/* XXX */
	cred = curthread->td_ucred;		/* XXX */
	nmp = VFSTONFS(vp->v_mount);
	pages = ap->a_m;
	count = ap->a_count;
	rtvals = ap->a_rtvals;
	npages = btoc(count);
	offset = IDX_TO_OFF(pages[0]->pindex);

	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
		(void)nfs_fsinfo(nmp, vp, cred, td);
	}

	for (i = 0; i < npages; i++)
		rtvals[i] = VM_PAGER_AGAIN;

	/*
	 * When putting pages, do not extend file past EOF.
	 */

	if (offset + count > np->n_size) {
		count = np->n_size - offset;
		if (count < 0)
			count = 0;
	}

	/*
	 * We use only the kva address for the buffer, but this is extremely
	 * convenient and fast.
	 */
	bp = getpbuf(&nfs_pbuf_freecnt);

	kva = (vm_offset_t) bp->b_data;
	pmap_qenter(kva, pages, npages);
	cnt.v_vnodeout++;
	cnt.v_vnodepgsout += count;

	iov.iov_base = (caddr_t) kva;
	iov.iov_len = count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = offset;
	uio.uio_resid = count;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_WRITE;
	uio.uio_td = td;

	if ((ap->a_sync & VM_PAGER_PUT_SYNC) == 0)
		iomode = NFSV3WRITE_UNSTABLE;
	else
		iomode = NFSV3WRITE_FILESYNC;

	error = (nmp->nm_rpcops->nr_writerpc)(vp, &uio, cred, &iomode, &must_commit);

	pmap_qremove(kva, npages);
	relpbuf(bp, &nfs_pbuf_freecnt);

	if (!error) {
		int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE;
		for (i = 0; i < nwritten; i++) {
			rtvals[i] = VM_PAGER_OK;
			vm_page_undirty(pages[i]);
		}
		if (must_commit) {
			nfs_clearcommit(vp->v_mount);
		}
	}
	return rtvals[0];
}
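
/*
 * Illustrative sketch (not part of the original file): the pager-buffer
 * pattern shared by nfs_getpages() and nfs_putpages() above.  A pbuf is
 * borrowed only for its pre-allocated KVA window; the pages are mapped
 * into that window so one UIO_SYSSPACE uio can address all of them.
 * The helper below is hypothetical and compiled out; it only restates
 * the steps already used above.
 */
#if 0
static void
nfs_pager_kva_sketch(vm_page_t *pages, int npages)
{
	struct buf *bp;
	vm_offset_t kva;

	bp = getpbuf(&nfs_pbuf_freecnt);	/* borrow a pager buffer */
	kva = (vm_offset_t) bp->b_data;		/* its KVA window */
	pmap_qenter(kva, pages, npages);	/* map pages into the window */

	/* ... build a UIO_SYSSPACE uio over kva and issue the RPC ... */

	pmap_qremove(kva, npages);		/* unmap before release */
	relpbuf(bp, &nfs_pbuf_freecnt);		/* return the pbuf */
}
#endif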

/*
 * Vnode op for read using bio
 */
int
nfs_bioread(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred)
{
	struct nfsnode *np = VTONFS(vp);
	int biosize, i;
	struct buf *bp = 0, *rabp;
	struct vattr vattr;
	struct thread *td;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn, rabn;
	int bcount;
	int seqcount;
	int nra, error = 0, n = 0, on = 0;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_offset < 0)	/* XXX VDIR cookies can be negative */
		return (EINVAL);
	td = uio->uio_td;

	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
		(void)nfs_fsinfo(nmp, vp, cred, td);
	if (vp->v_type != VDIR &&
	    (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	biosize = vp->v_mount->mnt_stat.f_iosize;
	seqcount = (int)((off_t)(ioflag >> IO_SEQSHIFT) * biosize / BKVASIZE);
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * NB: This implies that cache data can be read when up to
	 * NFS_ATTRTIMEO seconds out of date.  If you find that you need
	 * current attributes, this can be forced by setting n_attrstamp
	 * to 0 before the VOP_GETATTR() call.
	 */
	if (np->n_flag & NMODIFIED) {
		if (vp->v_type != VREG) {
			if (vp->v_type != VDIR)
				panic("nfs: bioread, not dir");
			(nmp->nm_rpcops->nr_invaldir)(vp);
			error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1);
			if (error)
				return (error);
		}
		np->n_attrstamp = 0;
		error = VOP_GETATTR(vp, &vattr, cred, td);
		if (error)
			return (error);
		np->n_mtime = vattr.va_mtime.tv_sec;
	} else {
		error = VOP_GETATTR(vp, &vattr, cred, td);
		if (error)
			return (error);
		if (np->n_mtime != vattr.va_mtime.tv_sec) {
			if (vp->v_type == VDIR)
				(nmp->nm_rpcops->nr_invaldir)(vp);
			error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1);
			if (error)
				return (error);
			np->n_mtime = vattr.va_mtime.tv_sec;
		}
	}
	do {
		switch (vp->v_type) {
		case VREG:
			nfsstats.biocache_reads++;
			lbn = uio->uio_offset / biosize;
			on = uio->uio_offset & (biosize - 1);

			/*
			 * Start the read ahead(s), as required.
			 */
			if (nmp->nm_readahead > 0) {
				for (nra = 0; nra < nmp->nm_readahead &&
				    nra < seqcount &&
				    (off_t)(lbn + 1 + nra) * biosize < np->n_size;
				    nra++) {
					rabn = lbn + 1 + nra;
					if (incore(vp, rabn) == NULL) {
						rabp = nfs_getcacheblk(vp, rabn,
						    biosize, td);
						if (!rabp)
							return (EINTR);
						if ((rabp->b_flags &
						    (B_CACHE|B_DELWRI)) == 0) {
							rabp->b_flags |= B_ASYNC;
							rabp->b_iocmd = BIO_READ;
							vfs_busy_pages(rabp, 0);
							if (nfs_asyncio(rabp, cred, td)) {
								rabp->b_flags |= B_INVAL;
								rabp->b_ioflags |= BIO_ERROR;
								vfs_unbusy_pages(rabp);
								brelse(rabp);
								break;
							}
						} else {
							brelse(rabp);
						}
					}
				}
			}

			/*
			 * Obtain the buffer cache block.  Figure out the
			 * buffer size when we are at EOF.  If we are
			 * modifying the size of the buffer based on an EOF
			 * condition we need to hold nfs_rslock() through
			 * obtaining the buffer to prevent a potential
			 * writer-appender from messing with n_size.
			 * Otherwise we may accidentally truncate the buffer
			 * and lose dirty data.
			 *
			 * Note that bcount is *not* DEV_BSIZE aligned.
			 */

again:
			bcount = biosize;
			if ((off_t)lbn * biosize >= np->n_size) {
				bcount = 0;
			} else if ((off_t)(lbn + 1) * biosize > np->n_size) {
				bcount = np->n_size - (off_t)lbn * biosize;
			}
			if (bcount != biosize) {
				switch(nfs_rslock(np, td)) {
				case ENOLCK:
					goto again;
					/* not reached */
				case EINTR:
				case ERESTART:
					return(EINTR);
					/* not reached */
				default:
					break;
				}
			}

			bp = nfs_getcacheblk(vp, lbn, bcount, td);

			if (bcount != biosize)
				nfs_rsunlock(np, td);
			if (!bp)
				return (EINTR);

			/*
			 * If B_CACHE is not set, we must issue the read.  If
			 * this fails, we return an error.
			 */

			if ((bp->b_flags & B_CACHE) == 0) {
				bp->b_iocmd = BIO_READ;
				vfs_busy_pages(bp, 0);
				error = nfs_doio(bp, cred, td);
				if (error) {
					brelse(bp);
					return (error);
				}
			}

			/*
			 * on is the offset into the current bp.  Figure out
			 * how many bytes we can copy out of the bp.  Note
			 * that bcount is NOT DEV_BSIZE aligned.
			 *
			 * Then figure out how many bytes we can copy into
			 * the uio.
			 */

			n = 0;
			if (on < bcount)
				n = min((unsigned)(bcount - on), uio->uio_resid);
			break;
		case VLNK:
			nfsstats.biocache_readlinks++;
			bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, td);
			if (!bp)
				return (EINTR);
			if ((bp->b_flags & B_CACHE) == 0) {
				bp->b_iocmd = BIO_READ;
				vfs_busy_pages(bp, 0);
				error = nfs_doio(bp, cred, td);
				if (error) {
					bp->b_ioflags |= BIO_ERROR;
					brelse(bp);
					return (error);
				}
			}
			n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
			on = 0;
			break;
		case VDIR:
			nfsstats.biocache_readdirs++;
			if (np->n_direofoffset
			    && uio->uio_offset >= np->n_direofoffset) {
				return (0);
			}
			lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ;
			on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
			bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, td);
			if (!bp)
				return (EINTR);
			if ((bp->b_flags & B_CACHE) == 0) {
				bp->b_iocmd = BIO_READ;
				vfs_busy_pages(bp, 0);
				error = nfs_doio(bp, cred, td);
				if (error) {
					brelse(bp);
				}
				while (error == NFSERR_BAD_COOKIE) {
					printf("got bad cookie vp %p bp %p\n", vp, bp);
					(nmp->nm_rpcops->nr_invaldir)(vp);
					error = nfs_vinvalbuf(vp, 0, cred, td, 1);
					/*
					 * Yuck!  The directory has been modified
					 * on the server.  The only way to get the
					 * block is by reading from the beginning
					 * to get all the offset cookies.
					 *
					 * Leave the last bp intact unless there
					 * is an error.  Loop back up to the while
					 * if the error is another
					 * NFSERR_BAD_COOKIE (double yuck!).
					 */
					for (i = 0; i <= lbn && !error; i++) {
						if (np->n_direofoffset
						    && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
							return (0);
						bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, td);
						if (!bp)
							return (EINTR);
						if ((bp->b_flags & B_CACHE) == 0) {
							bp->b_iocmd = BIO_READ;
							vfs_busy_pages(bp, 0);
							error = nfs_doio(bp, cred, td);
							/*
							 * no error + B_INVAL ==
							 * directory EOF, use the block.
							 */
							if (error == 0 && (bp->b_flags & B_INVAL))
								break;
						}
						/*
						 * An error will throw away the block
						 * and the for loop will break out.
						 * If no error and this is not the
						 * block we want, we throw away the
						 * block and go for the next one via
						 * the for loop.
						 */
						if (error || i < lbn)
							brelse(bp);
					}
				}
				/*
				 * The above while is repeated if we hit another
				 * cookie error.  If we hit an error and it wasn't
				 * a cookie error, we give up.
				 */
				if (error)
					return (error);
			}

			/*
			 * If not eof and read aheads are enabled, start one.
			 * (You need the current block first, so that you have the
			 *  directory offset cookie of the next block.)
			 */
			if (nmp->nm_readahead > 0 &&
			    (bp->b_flags & B_INVAL) == 0 &&
			    (np->n_direofoffset == 0 ||
			    (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
			    incore(vp, lbn + 1) == NULL) {
				rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, td);
				if (rabp) {
					if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
						rabp->b_flags |= B_ASYNC;
						rabp->b_iocmd = BIO_READ;
						vfs_busy_pages(rabp, 0);
						if (nfs_asyncio(rabp, cred, td)) {
							rabp->b_flags |= B_INVAL;
							rabp->b_ioflags |= BIO_ERROR;
							vfs_unbusy_pages(rabp);
							brelse(rabp);
						}
					} else {
						brelse(rabp);
					}
				}
			}
			/*
			 * Unlike VREG files, whose buffer size ( bp->b_bcount ) is
			 * chopped for the EOF condition, we cannot tell how large
			 * NFS directories are going to be until we hit EOF.  So
			 * an NFS directory buffer is *not* chopped to its EOF.  Now,
			 * it just so happens that b_resid will effectively chop it
			 * to EOF.  *BUT* this information is lost if the buffer goes
			 * away and is reconstituted into a B_CACHE state ( due to
			 * being VMIO ) later.  So we keep track of the directory eof
			 * in np->n_direofoffset and chop it off as an extra step
			 * right here.
			 */
			n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
			if (np->n_direofoffset &&
			    n > np->n_direofoffset - uio->uio_offset)
				n = np->n_direofoffset - uio->uio_offset;
			break;
		default:
			printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
			break;
		}

		if (n > 0) {
			error = uiomove(bp->b_data + on, (int)n, uio);
		}
		switch (vp->v_type) {
		case VREG:
			break;
		case VLNK:
			n = 0;
			break;
		case VDIR:
			break;
		default:
			printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
		}
		brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}
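
/*
 * Illustrative example (not part of the original file): the lbn/on split
 * used in the VREG case of nfs_bioread() above.  With biosize = 8192 and
 * a read of 1000 bytes at uio_offset = 20480:
 *
 *	lbn = 20480 / 8192       = 2	(logical block number)
 *	on  = 20480 & (8192 - 1) = 4096	(offset within the block)
 *	n   = min(8192 - 4096, 1000) = 1000
 *
 * so the data is copied from bp->b_data + 4096 of block 2's buffer.
 */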

/*
 * Vnode op for write using bio
 */
int
nfs_write(struct vop_write_args *ap)
{
	int biosize;
	struct uio *uio = ap->a_uio;
	struct thread *td = uio->uio_td;
	struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn;
	int bcount;
	int n, on, error = 0;
	int haverslock = 0;
	struct proc *p = td ? td->td_proc : NULL;

	GIANT_REQUIRED;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_td != curthread)
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
		(void)nfs_fsinfo(nmp, vp, cred, td);

	/*
	 * Synchronously flush pending buffers if we are in synchronous
	 * mode or if we are appending.
	 */
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1);
			if (error)
				return (error);
		}
	}

	/*
	 * If IO_APPEND then load uio_offset.  We restart here if we cannot
	 * get the append lock.
	 */
restart:
	if (ioflag & IO_APPEND) {
		np->n_attrstamp = 0;
		error = VOP_GETATTR(vp, &vattr, cred, td);
		if (error)
			return (error);
		uio->uio_offset = np->n_size;
	}

	if (uio->uio_offset < 0)
		return (EINVAL);
	if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	if (uio->uio_resid == 0)
		return (0);

	/*
	 * We need to obtain the rslock if we intend to modify np->n_size
	 * in order to guarantee the append point with multiple contending
	 * writers, to guarantee that no other appenders modify n_size
	 * while we are trying to obtain a truncated buffer (i.e. to avoid
	 * accidentally truncating data written by another appender due to
	 * the race), and to ensure that the buffer is populated prior to
	 * our extending of the file.  We hold rslock through the entire
	 * operation.
783 * 784 * Note that we do not synchronize the case where someone truncates 785 * the file while we are appending to it because attempting to lock 786 * this case may deadlock other parts of the system unexpectedly. 787 */ 788 if ((ioflag & IO_APPEND) || 789 uio->uio_offset + uio->uio_resid > np->n_size) { 790 switch(nfs_rslock(np, td)) { 791 case ENOLCK: 792 goto restart; 793 /* not reached */ 794 case EINTR: 795 case ERESTART: 796 return(EINTR); 797 /* not reached */ 798 default: 799 break; 800 } 801 haverslock = 1; 802 } 803 804 /* 805 * Maybe this should be above the vnode op call, but so long as 806 * file servers have no limits, i don't think it matters 807 */ 808 if (p != NULL) { 809 PROC_LOCK(p); 810 if (uio->uio_offset + uio->uio_resid > 811 lim_cur(p, RLIMIT_FSIZE)) { 812 psignal(p, SIGXFSZ); 813 PROC_UNLOCK(p); 814 if (haverslock) 815 nfs_rsunlock(np, td); 816 return (EFBIG); 817 } 818 PROC_UNLOCK(p); 819 } 820 821 biosize = vp->v_mount->mnt_stat.f_iosize; 822 823 do { 824 nfsstats.biocache_writes++; 825 lbn = uio->uio_offset / biosize; 826 on = uio->uio_offset & (biosize-1); 827 n = min((unsigned)(biosize - on), uio->uio_resid); 828again: 829 /* 830 * Handle direct append and file extension cases, calculate 831 * unaligned buffer size. 832 */ 833 834 if (uio->uio_offset == np->n_size && n) { 835 /* 836 * Get the buffer (in its pre-append state to maintain 837 * B_CACHE if it was previously set). Resize the 838 * nfsnode after we have locked the buffer to prevent 839 * readers from reading garbage. 840 */ 841 bcount = on; 842 bp = nfs_getcacheblk(vp, lbn, bcount, td); 843 844 if (bp != NULL) { 845 long save; 846 847 np->n_size = uio->uio_offset + n; 848 np->n_flag |= NMODIFIED; 849 vnode_pager_setsize(vp, np->n_size); 850 851 save = bp->b_flags & B_CACHE; 852 bcount += n; 853 allocbuf(bp, bcount); 854 bp->b_flags |= save; 855 bp->b_magic = B_MAGIC_NFS; 856 if ((nmp->nm_flag & NFSMNT_NFSV4) != 0) 857 bp->b_op = &buf_ops_nfs4; 858 else 859 bp->b_op = &buf_ops_nfs; 860 } 861 } else { 862 /* 863 * Obtain the locked cache block first, and then 864 * adjust the file's size as appropriate. 865 */ 866 bcount = on + n; 867 if ((off_t)lbn * biosize + bcount < np->n_size) { 868 if ((off_t)(lbn + 1) * biosize < np->n_size) 869 bcount = biosize; 870 else 871 bcount = np->n_size - (off_t)lbn * biosize; 872 } 873 bp = nfs_getcacheblk(vp, lbn, bcount, td); 874 if (uio->uio_offset + n > np->n_size) { 875 np->n_size = uio->uio_offset + n; 876 np->n_flag |= NMODIFIED; 877 vnode_pager_setsize(vp, np->n_size); 878 } 879 } 880 881 if (!bp) { 882 error = EINTR; 883 break; 884 } 885 886 /* 887 * Issue a READ if B_CACHE is not set. In special-append 888 * mode, B_CACHE is based on the buffer prior to the write 889 * op and is typically set, avoiding the read. If a read 890 * is required in special append mode, the server will 891 * probably send us a short-read since we extended the file 892 * on our end, resulting in b_resid == 0 and, thusly, 893 * B_CACHE getting set. 894 * 895 * We can also avoid issuing the read if the write covers 896 * the entire buffer. We have to make sure the buffer state 897 * is reasonable in this case since we will not be initiating 898 * I/O. See the comments in kern/vfs_bio.c's getblk() for 899 * more information. 900 * 901 * B_CACHE may also be set due to the buffer being cached 902 * normally. 

		if (on == 0 && n == bcount) {
			bp->b_flags |= B_CACHE;
			bp->b_flags &= ~B_INVAL;
			bp->b_ioflags &= ~BIO_ERROR;
		}

		if ((bp->b_flags & B_CACHE) == 0) {
			bp->b_iocmd = BIO_READ;
			vfs_busy_pages(bp, 0);
			error = nfs_doio(bp, cred, td);
			if (error) {
				brelse(bp);
				break;
			}
		}
		if (!bp) {
			error = EINTR;
			break;
		}
		if (bp->b_wcred == NOCRED)
			bp->b_wcred = crhold(cred);
		np->n_flag |= NMODIFIED;

		/*
		 * If dirtyend exceeds file size, chop it down.  This should
		 * not normally occur but there is an append race where it
		 * might occur XXX, so we log it.
		 *
		 * If the chopping creates a reverse-indexed or degenerate
		 * situation with dirtyoff/end, we 0 both of them.
		 */

		if (bp->b_dirtyend > bcount) {
			printf("NFS append race @%lx:%d\n",
			    (long)bp->b_blkno * DEV_BSIZE,
			    bp->b_dirtyend - bcount);
			bp->b_dirtyend = bcount;
		}

		if (bp->b_dirtyoff >= bp->b_dirtyend)
			bp->b_dirtyoff = bp->b_dirtyend = 0;

		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise force a write rpc of the old dirty area.
		 *
		 * While it is possible to merge discontiguous writes due to
		 * our having a B_CACHE buffer ( and thus valid read data
		 * for the hole), we don't because it could lead to
		 * significant cache coherency problems with multiple clients,
		 * especially if locking is implemented later on.
		 *
		 * As an optimization we could theoretically maintain
		 * a linked list of discontinuous areas, but we would still
		 * have to commit them separately so there isn't much
		 * advantage to it except perhaps a bit of asynchronization.
		 */

		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			if (bwrite(bp) == EINTR) {
				error = EINTR;
				break;
			}
			goto again;
		}

		error = uiomove((char *)bp->b_data + on, n, uio);

		/*
		 * Since this block is being modified, it must be written
		 * again and not just committed.  Since write clustering does
		 * not work for the stage 1 data write, only the stage 2
		 * commit rpc, we have to clear B_CLUSTEROK as well.
		 */
		bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);

		if (error) {
			bp->b_ioflags |= BIO_ERROR;
			brelse(bp);
			break;
		}

		/*
		 * Only update dirtyoff/dirtyend if not a degenerate
		 * condition.
		 */
		if (n) {
			if (bp->b_dirtyend > 0) {
				bp->b_dirtyoff = min(on, bp->b_dirtyoff);
				bp->b_dirtyend = max((on + n), bp->b_dirtyend);
			} else {
				bp->b_dirtyoff = on;
				bp->b_dirtyend = on + n;
			}
			vfs_bio_set_validclean(bp, on, n);
		}

		/*
		 * If IO_SYNC do bwrite().
		 *
		 * IO_INVAL appears to be unused.  The idea appears to be
		 * to turn off caching in this case.  Very odd.  XXX
		 */
		if ((ioflag & IO_SYNC)) {
			if (ioflag & IO_INVAL)
				bp->b_flags |= B_NOCACHE;
			error = bwrite(bp);
			if (error)
				break;
		} else if ((n + on) == biosize) {
			bp->b_flags |= B_ASYNC;
			(void) (nmp->nm_rpcops->nr_writebp)(bp, 0, 0);
		} else {
			bdwrite(bp);
		}
	} while (uio->uio_resid > 0 && n > 0);

	if (haverslock)
		nfs_rsunlock(np, td);

	return (error);
}
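
/*
 * Illustrative example (not part of the original file): the contiguity
 * test in nfs_write() above.  With b_dirtyoff = 0 and b_dirtyend = 512,
 * a new write at on = 1024, n = 256 has on > b_dirtyend, so the old
 * dirty area is pushed out with bwrite() before the write proceeds.  A
 * write at on = 512 instead merges: b_dirtyoff = min(512, 0) = 0 and
 * b_dirtyend = max(512 + 256, 512) = 768 leave one contiguous region.
 */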

/*
 * Get an nfs cache block.
 *
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy.  If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 *
 * The caller must carefully deal with the possible B_INVAL state of
 * the buffer.  nfs_doio() clears B_INVAL (and nfs_asyncio() clears it
 * indirectly), so synchronous reads can be issued without worrying about
 * the B_INVAL state.  We have to be a little more careful when dealing
 * with writes (see comments in nfs_write()) when extending a file past
 * its EOF.
 */
static struct buf *
nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size, struct thread *td)
{
	struct buf *bp;
	struct mount *mp;
	struct nfsmount *nmp;

	mp = vp->v_mount;
	nmp = VFSTONFS(mp);

	if (nmp->nm_flag & NFSMNT_INT) {
		bp = getblk(vp, bn, size, PCATCH, 0, 0);
		while (bp == NULL) {
			if (nfs_sigintr(nmp, NULL, td))
				return (NULL);
			bp = getblk(vp, bn, size, 0, 2 * hz, 0);
		}
	} else {
		bp = getblk(vp, bn, size, 0, 0, 0);
	}

	if (vp->v_type == VREG) {
		int biosize;

		biosize = mp->mnt_stat.f_iosize;
		bp->b_blkno = bn * (biosize / DEV_BSIZE);
	}
	return (bp);
}
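
/*
 * Illustrative example (not part of the original file): the b_blkno
 * computation above.  With f_iosize = 8192 and DEV_BSIZE = 512, each
 * logical block covers 8192 / 512 = 16 device-sized sectors, so
 * logical block bn = 3 maps to b_blkno = 3 * 16 = 48.
 */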

/*
 * Flush and invalidate all dirty buffers.  If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(struct vnode *vp, int flags, struct ucred *cred,
    struct thread *td, int intrflg)
{
	struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;

	ASSERT_VOP_LOCKED(vp, "nfs_vinvalbuf");

	/*
	 * XXX This check stops us from needlessly doing a vinvalbuf when
	 * being called through vclean().  It is not clear that this is
	 * unsafe.
	 */
	if (vp->v_iflag & VI_XLOCK)
		return (0);

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}
	/*
	 * First wait for any other process doing a flush to complete.
	 */
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		error = tsleep(&np->n_flag, PRIBIO + 2, "nfsvinval",
		    slptimeo);
		if (error && intrflg &&
		    nfs_sigintr(nmp, NULL, td))
			return (EINTR);
	}

	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, cred, td, slpflag, 0);
	while (error) {
		if (intrflg &&
		    nfs_sigintr(nmp, NULL, td)) {
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup(&np->n_flag);
			}
			return (EINTR);
		}
		error = vinvalbuf(vp, flags, cred, td, 0, slptimeo);
	}
	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup(&np->n_flag);
	}
	return (0);
}

/*
 * Initiate asynchronous I/O.  Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 *
 * Note: nfs_asyncio() does not clear (BIO_ERROR|B_INVAL) but when the bp
 * is eventually dequeued by the async daemon, nfs_doio() *will*.
 */
int
nfs_asyncio(struct buf *bp, struct ucred *cred, struct thread *td)
{
	struct nfsmount *nmp;
	int iod;
	int gotiod;
	int slpflag = 0;
	int slptimeo = 0;
	int error;

	nmp = VFSTONFS(bp->b_vp->v_mount);

	/*
	 * Commits are usually short and sweet so let's save some cpu and
	 * leave the async daemons for more important rpc's (such as reads
	 * and writes).
	 */
	if (bp->b_iocmd == BIO_WRITE && (bp->b_flags & B_NEEDCOMMIT) &&
	    (nmp->nm_bufqiods > nfs_numasync / 2)) {
		return(EIO);
	}

again:
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	gotiod = FALSE;

	/*
	 * Find a free iod to process this request.
	 */
	for (iod = 0; iod < nfs_numasync; iod++)
		if (nfs_iodwant[iod]) {
			gotiod = TRUE;
			break;
		}

	/*
	 * Try to create one if none are free.
	 */
	if (!gotiod) {
		iod = nfs_nfsiodnew();
		if (iod != -1)
			gotiod = TRUE;
	}

	if (gotiod) {
		/*
		 * Found one, so wake it up and tell it which
		 * mount to process.
		 */
		NFS_DPF(ASYNCIO, ("nfs_asyncio: waking iod %d for mount %p\n",
		    iod, nmp));
		nfs_iodwant[iod] = NULL;
		nfs_iodmount[iod] = nmp;
		nmp->nm_bufqiods++;
		wakeup(&nfs_iodwant[iod]);
	}

	/*
	 * If none are free, we may already have an iod working on this mount
	 * point.  If so, it will process our request.
	 */
	if (!gotiod) {
		if (nmp->nm_bufqiods > 0) {
			NFS_DPF(ASYNCIO,
			    ("nfs_asyncio: %d iods are already processing mount %p\n",
			    nmp->nm_bufqiods, nmp));
			gotiod = TRUE;
		}
	}

	/*
	 * If we have an iod which can process the request, then queue
	 * the buffer.
	 */
	if (gotiod) {
		/*
		 * Ensure that the queue never grows too large.  We still want
		 * to asynchronize so we block rather than return EIO.
		 */
		while (nmp->nm_bufqlen >= 2*nfs_numasync) {
			NFS_DPF(ASYNCIO,
			    ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
			nmp->nm_bufqwant = TRUE;
			error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
			    "nfsaio", slptimeo);
			if (error) {
				if (nfs_sigintr(nmp, NULL, td))
					return (EINTR);
				if (slpflag == PCATCH) {
					slpflag = 0;
					slptimeo = 2 * hz;
				}
			}
			/*
			 * We might have lost our iod while sleeping,
			 * so check and loop if necessary.
			 */
			if (nmp->nm_bufqiods == 0) {
				NFS_DPF(ASYNCIO,
				    ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
				goto again;
			}
		}

		if (bp->b_iocmd == BIO_READ) {
			if (bp->b_rcred == NOCRED && cred != NOCRED)
				bp->b_rcred = crhold(cred);
		} else {
			bp->b_flags |= B_WRITEINPROG;
			if (bp->b_wcred == NOCRED && cred != NOCRED)
				bp->b_wcred = crhold(cred);
		}

		BUF_KERNPROC(bp);
		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
		nmp->nm_bufqlen++;
		return (0);
	}

	/*
	 * All the iods are busy on other mounts, so return EIO to
	 * force the caller to process the i/o synchronously.
	 */
	NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
	return (EIO);
}

/*
 * Do an I/O operation to/from a cache block.  This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(struct buf *bp, struct ucred *cr, struct thread *td)
{
	struct uio *uiop;
	struct vnode *vp;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, iomode, must_commit = 0;
	struct uio uio;
	struct iovec io;
	struct proc *p = td ? td->td_proc : NULL;

	vp = bp->b_vp;
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_td = td;

	/*
	 * clear BIO_ERROR and B_INVAL state prior to initiating the I/O.  We
	 * do this here so we do not have to do it in all the code that
	 * calls us.
	 */
	bp->b_flags &= ~B_INVAL;
	bp->b_ioflags &= ~BIO_ERROR;

	KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp));

	if (bp->b_iocmd == BIO_READ) {
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		io.iov_base = bp->b_data;
		uiop->uio_rw = UIO_READ;

		switch (vp->v_type) {
		case VREG:
			uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
			nfsstats.read_bios++;
			error = (nmp->nm_rpcops->nr_readrpc)(vp, uiop, cr);

			if (!error) {
				if (uiop->uio_resid) {
					/*
					 * If we had a short read with no error,
					 * we must have hit a file hole.  We
					 * should zero-fill the remainder.  This
					 * can also occur if the server hits the
					 * file EOF.
					 *
					 * Holes used to be able to occur due to
					 * pending writes, but that is not
					 * possible any longer.
					 */
					int nread = bp->b_bcount - uiop->uio_resid;
					int left = uiop->uio_resid;

					if (left > 0)
						bzero((char *)bp->b_data + nread, left);
					uiop->uio_resid = 0;
				}
			}
			/* ASSERT_VOP_LOCKED(vp, "nfs_doio"); */
			if (p && (vp->v_vflag & VV_TEXT) &&
			    (np->n_mtime != np->n_vattr.va_mtime.tv_sec)) {
				uprintf("Process killed due to text file modification\n");
				PROC_LOCK(p);
				psignal(p, SIGKILL);
				_PHOLD(p);
				PROC_UNLOCK(p);
			}
			break;
		case VLNK:
			uiop->uio_offset = (off_t)0;
			nfsstats.readlink_bios++;
			error = (nmp->nm_rpcops->nr_readlinkrpc)(vp, uiop, cr);
			break;
		case VDIR:
			nfsstats.readdir_bios++;
			uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
			if ((nmp->nm_flag & NFSMNT_NFSV4) != 0)
				error = nfs4_readdirrpc(vp, uiop, cr);
			else {
				if ((nmp->nm_flag & NFSMNT_RDIRPLUS) != 0) {
					error = nfs_readdirplusrpc(vp, uiop, cr);
					if (error == NFSERR_NOTSUPP)
						nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
				}
				if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
					error = nfs_readdirrpc(vp, uiop, cr);
			}
			/*
			 * end-of-directory sets B_INVAL but does not generate
			 * an error.
			 */
1375 */ 1376 if (error == 0 && uiop->uio_resid == bp->b_bcount) 1377 bp->b_flags |= B_INVAL; 1378 break; 1379 default: 1380 printf("nfs_doio: type %x unexpected\n", vp->v_type); 1381 break; 1382 }; 1383 if (error) { 1384 bp->b_ioflags |= BIO_ERROR; 1385 bp->b_error = error; 1386 } 1387 } else { 1388 /* 1389 * If we only need to commit, try to commit 1390 */ 1391 if (bp->b_flags & B_NEEDCOMMIT) { 1392 int retv; 1393 off_t off; 1394 1395 off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; 1396 bp->b_flags |= B_WRITEINPROG; 1397 retv = (nmp->nm_rpcops->nr_commit)( 1398 bp->b_vp, off, bp->b_dirtyend-bp->b_dirtyoff, 1399 bp->b_wcred, td); 1400 bp->b_flags &= ~B_WRITEINPROG; 1401 if (retv == 0) { 1402 bp->b_dirtyoff = bp->b_dirtyend = 0; 1403 bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); 1404 bp->b_resid = 0; 1405 bufdone(bp); 1406 return (0); 1407 } 1408 if (retv == NFSERR_STALEWRITEVERF) { 1409 nfs_clearcommit(bp->b_vp->v_mount); 1410 } 1411 } 1412 1413 /* 1414 * Setup for actual write 1415 */ 1416 1417 if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size) 1418 bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE; 1419 1420 if (bp->b_dirtyend > bp->b_dirtyoff) { 1421 io.iov_len = uiop->uio_resid = bp->b_dirtyend 1422 - bp->b_dirtyoff; 1423 uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE 1424 + bp->b_dirtyoff; 1425 io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; 1426 uiop->uio_rw = UIO_WRITE; 1427 nfsstats.write_bios++; 1428 1429 if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC) 1430 iomode = NFSV3WRITE_UNSTABLE; 1431 else 1432 iomode = NFSV3WRITE_FILESYNC; 1433 1434 bp->b_flags |= B_WRITEINPROG; 1435 error = (nmp->nm_rpcops->nr_writerpc)(vp, uiop, cr, &iomode, &must_commit); 1436 1437 /* 1438 * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try 1439 * to cluster the buffers needing commit. This will allow 1440 * the system to submit a single commit rpc for the whole 1441 * cluster. We can do this even if the buffer is not 100% 1442 * dirty (relative to the NFS blocksize), so we optimize the 1443 * append-to-file-case. 1444 * 1445 * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be 1446 * cleared because write clustering only works for commit 1447 * rpc's, not for the data portion of the write). 1448 */ 1449 1450 if (!error && iomode == NFSV3WRITE_UNSTABLE) { 1451 bp->b_flags |= B_NEEDCOMMIT; 1452 if (bp->b_dirtyoff == 0 1453 && bp->b_dirtyend == bp->b_bcount) 1454 bp->b_flags |= B_CLUSTEROK; 1455 } else { 1456 bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); 1457 } 1458 bp->b_flags &= ~B_WRITEINPROG; 1459 1460 /* 1461 * For an interrupted write, the buffer is still valid 1462 * and the write hasn't been pushed to the server yet, 1463 * so we can't set BIO_ERROR and report the interruption 1464 * by setting B_EINTR. For the B_ASYNC case, B_EINTR 1465 * is not relevant, so the rpc attempt is essentially 1466 * a noop. For the case of a V3 write rpc not being 1467 * committed to stable storage, the block is still 1468 * dirty and requires either a commit rpc or another 1469 * write rpc with iomode == NFSV3WRITE_FILESYNC before 1470 * the block is reused. This is indicated by setting 1471 * the B_DELWRI and B_NEEDCOMMIT flags. 1472 * 1473 * If the buffer is marked B_PAGING, it does not reside on 1474 * the vp's paging queues so we cannot call bdirty(). The 1475 * bp in this case is not an NFS cache block so we should 1476 * be safe. 

/*
 * Used to aid in handling ftruncate() operations on the NFS client side.
 * Truncation creates a number of special problems for NFS.  We have to
 * throw away VM pages and buffer cache buffers that are beyond EOF, and
 * we have to properly handle VM pages or (potentially dirty) buffers
 * that straddle the truncation point.
 */

int
nfs_meta_setsize(struct vnode *vp, struct ucred *cred, struct thread *td, u_quad_t nsize)
{
	struct nfsnode *np = VTONFS(vp);
	u_quad_t tsize = np->n_size;
	int biosize = vp->v_mount->mnt_stat.f_iosize;
	int error = 0;

	np->n_size = nsize;

	if (np->n_size < tsize) {
		struct buf *bp;
		daddr_t lbn;
		int bufsize;

		/*
		 * vtruncbuf() doesn't get the buffer overlapping the
		 * truncation point.  We may have a B_DELWRI and/or B_CACHE
		 * buffer that now needs to be truncated.
		 */
		error = vtruncbuf(vp, cred, td, nsize, biosize);
		lbn = nsize / biosize;
		bufsize = nsize & (biosize - 1);
		bp = nfs_getcacheblk(vp, lbn, bufsize, td);
		if (bp->b_dirtyoff > bp->b_bcount)
			bp->b_dirtyoff = bp->b_bcount;
		if (bp->b_dirtyend > bp->b_bcount)
			bp->b_dirtyend = bp->b_bcount;
		bp->b_flags |= B_RELBUF;  /* don't leave garbage around */
		brelse(bp);
	} else {
		vnode_pager_setsize(vp, nsize);
	}
	return(error);
}
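
/*
 * Illustrative example (not part of the original file): the straddling
 * buffer handled in nfs_meta_setsize() above.  Truncating to
 * nsize = 20000 with biosize = 8192 gives lbn = 20000 / 8192 = 2 and
 * bufsize = 20000 & (8192 - 1) = 3616, so logical block 2 is re-obtained
 * at its shortened size and any dirty range is clipped to b_bcount.
 */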