/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/nfsclient/nfs_bio.c 138469 2004-12-06 18:52:28Z ps $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <rpc/rpcclnt.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfsclient/nfs.h>
#include <nfsclient/nfsmount.h>
#include <nfsclient/nfsnode.h>

#include <nfs4client/nfs4.h>

static struct buf *nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size,
		    struct thread *td);

/*
 * Vnode op for VM getpages.
 */
int
nfs_getpages(struct vop_getpages_args *ap)
{
	int i, error, nextoff, size, toff, count, npages;
	struct uio uio;
	struct iovec iov;
	vm_offset_t kva;
	struct buf *bp;
	struct vnode *vp;
	struct thread *td;
	struct ucred *cred;
	struct nfsmount *nmp;
	vm_object_t object;
	vm_page_t *pages;

	GIANT_REQUIRED;

	vp = ap->a_vp;
	td = curthread;				/* XXX */
	cred = curthread->td_ucred;		/* XXX */
	nmp = VFSTONFS(vp->v_mount);
	pages = ap->a_m;
	count = ap->a_count;

	if ((object = vp->v_object) == NULL) {
		printf("nfs_getpages: called with non-merged cache vnode??\n");
		return VM_PAGER_ERROR;
	}

	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
		/* We'll never get here for v4, because we always have fsinfo */
		(void)nfs_fsinfo(nmp, vp, cred, td);
	}

	npages = btoc(count);

	/*
	 * If the requested page is partially valid, just return it and
	 * allow the pager to zero-out the blanks.  Partially valid pages
	 * can only occur at the file EOF.
	 */

	{
		vm_page_t m = pages[ap->a_reqpage];

		VM_OBJECT_LOCK(object);
		vm_page_lock_queues();
		if (m->valid != 0) {
			/* handled by vm_fault now	  */
			/* vm_page_zero_invalid(m, TRUE); */
			for (i = 0; i < npages; ++i) {
				if (i != ap->a_reqpage)
					vm_page_free(pages[i]);
			}
			vm_page_unlock_queues();
			VM_OBJECT_UNLOCK(object);
			return (0);
		}
		vm_page_unlock_queues();
		VM_OBJECT_UNLOCK(object);
	}

	/*
	 * We use only the kva address for the buffer, but this is extremely
	 * convenient and fast.
	 */
	bp = getpbuf(&nfs_pbuf_freecnt);

	kva = (vm_offset_t) bp->b_data;
	pmap_qenter(kva, pages, npages);
	cnt.v_vnodein++;
	cnt.v_vnodepgsin += npages;

	iov.iov_base = (caddr_t) kva;
	iov.iov_len = count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
	uio.uio_resid = count;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_READ;
	uio.uio_td = td;

	error = (nmp->nm_rpcops->nr_readrpc)(vp, &uio, cred);
	pmap_qremove(kva, npages);

	relpbuf(bp, &nfs_pbuf_freecnt);

	if (error && (uio.uio_resid == count)) {
		printf("nfs_getpages: error %d\n", error);
		VM_OBJECT_LOCK(object);
		vm_page_lock_queues();
		for (i = 0; i < npages; ++i) {
			if (i != ap->a_reqpage)
				vm_page_free(pages[i]);
		}
		vm_page_unlock_queues();
		VM_OBJECT_UNLOCK(object);
		return VM_PAGER_ERROR;
	}

	/*
	 * Calculate the number of bytes read and validate only that number
	 * of bytes.  Note that due to pending writes, size may be 0.  This
	 * does not mean that the remaining data is invalid!
	 */

	size = count - uio.uio_resid;
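
	/*
	 * Worked example (illustrative, not from the original source): with
	 * PAGE_SIZE 4096, count 12288 (npages == 3) and a short read where
	 * size == 9000, the loop below marks pages 0 and 1 fully valid,
	 * marks the first 9000 - 8192 = 808 bytes of page 2 valid and clean
	 * via vm_page_set_validclean(), and leaves the rest of page 2
	 * invalid for vm_fault to zero on demand.
	 */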
	VM_OBJECT_LOCK(object);
	vm_page_lock_queues();
	for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
		vm_page_t m;
		nextoff = toff + PAGE_SIZE;
		m = pages[i];

		if (nextoff <= size) {
			/*
			 * Read operation filled an entire page
			 */
			m->valid = VM_PAGE_BITS_ALL;
			vm_page_undirty(m);
		} else if (size > toff) {
			/*
			 * Read operation filled a partial page.
			 */
			m->valid = 0;
			vm_page_set_validclean(m, 0, size - toff);
			/* handled by vm_fault now	  */
			/* vm_page_zero_invalid(m, TRUE); */
		} else {
			/*
			 * Read operation was short.  If no error occurred
			 * we may have hit a zero-fill section.  We simply
			 * leave valid set to 0.
			 */
			;
		}
		if (i != ap->a_reqpage) {
			/*
			 * Whether or not to leave the page activated is up in
			 * the air, but we should put the page on a page queue
			 * somewhere (it already is in the object).  Empirical
			 * results suggest that deactivating pages is best.
			 */

			/*
			 * Just in case someone was asking for this page we
			 * now tell them that it is ok to use.
			 */
			if (!error) {
				if (m->flags & PG_WANTED)
					vm_page_activate(m);
				else
					vm_page_deactivate(m);
				vm_page_wakeup(m);
			} else {
				vm_page_free(m);
			}
		}
	}
	vm_page_unlock_queues();
	VM_OBJECT_UNLOCK(object);
	return 0;
}
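
/*
 * Illustrative sketch (not compiled, hypothetical wrapper): the pattern
 * that both nfs_getpages() above and nfs_putpages() below follow to turn
 * an array of vm_page_t into one contiguous kernel buffer for a single
 * RPC.  A pbuf is borrowed purely for its KVA window, the pages are
 * temporarily entered into that window, a UIO_SYSSPACE uio is pointed at
 * it, and everything is torn down once the RPC returns.
 */
#if 0
static int
nfs_pager_uio_sketch(struct vnode *vp, vm_page_t *pages, int npages,
    int count, struct ucred *cred, struct thread *td)
{
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	struct buf *bp;
	struct uio uio;
	struct iovec iov;
	vm_offset_t kva;
	int error;

	bp = getpbuf(&nfs_pbuf_freecnt);	/* borrow a pbuf for its KVA */
	kva = (vm_offset_t) bp->b_data;
	pmap_qenter(kva, pages, npages);	/* map the pages contiguously */

	iov.iov_base = (caddr_t) kva;
	iov.iov_len = count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
	uio.uio_resid = count;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_READ;			/* UIO_WRITE on the put side */
	uio.uio_td = td;

	error = (nmp->nm_rpcops->nr_readrpc)(vp, &uio, cred);

	pmap_qremove(kva, npages);		/* unmap before releasing */
	relpbuf(bp, &nfs_pbuf_freecnt);
	return (error);
}
#endif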

/*
 * Vnode op for VM putpages.
 */
int
nfs_putpages(struct vop_putpages_args *ap)
{
	struct uio uio;
	struct iovec iov;
	vm_offset_t kva;
	struct buf *bp;
	int iomode, must_commit, i, error, npages, count;
	off_t offset;
	int *rtvals;
	struct vnode *vp;
	struct thread *td;
	struct ucred *cred;
	struct nfsmount *nmp;
	struct nfsnode *np;
	vm_page_t *pages;

	GIANT_REQUIRED;

	vp = ap->a_vp;
	np = VTONFS(vp);
	td = curthread;				/* XXX */
	cred = curthread->td_ucred;		/* XXX */
	nmp = VFSTONFS(vp->v_mount);
	pages = ap->a_m;
	count = ap->a_count;
	rtvals = ap->a_rtvals;
	npages = btoc(count);
	offset = IDX_TO_OFF(pages[0]->pindex);

	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
		(void)nfs_fsinfo(nmp, vp, cred, td);
	}

	for (i = 0; i < npages; i++)
		rtvals[i] = VM_PAGER_AGAIN;

	/*
	 * When putting pages, do not extend file past EOF.
	 */

	if (offset + count > np->n_size) {
		count = np->n_size - offset;
		if (count < 0)
			count = 0;
	}

	/*
	 * We use only the kva address for the buffer, but this is extremely
	 * convenient and fast.
	 */
	bp = getpbuf(&nfs_pbuf_freecnt);

	kva = (vm_offset_t) bp->b_data;
	pmap_qenter(kva, pages, npages);
	cnt.v_vnodeout++;
	cnt.v_vnodepgsout += count;

	iov.iov_base = (caddr_t) kva;
	iov.iov_len = count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = offset;
	uio.uio_resid = count;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_WRITE;
	uio.uio_td = td;

	if ((ap->a_sync & VM_PAGER_PUT_SYNC) == 0)
		iomode = NFSV3WRITE_UNSTABLE;
	else
		iomode = NFSV3WRITE_FILESYNC;

	error = (nmp->nm_rpcops->nr_writerpc)(vp, &uio, cred, &iomode, &must_commit);

	pmap_qremove(kva, npages);
	relpbuf(bp, &nfs_pbuf_freecnt);

	if (!error) {
		int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE;
		for (i = 0; i < nwritten; i++) {
			rtvals[i] = VM_PAGER_OK;
			vm_page_undirty(pages[i]);
		}
		if (must_commit) {
			nfs_clearcommit(vp->v_mount);
		}
	}
	return rtvals[0];
}
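
/*
 * Note on the success path above (illustrative numbers): if count was
 * 20000 and the write rpc consumed all of it, round_page(20000) with a
 * 4096-byte page yields 20480, so nwritten == 5 and pages 0-4 are marked
 * VM_PAGER_OK and undirtied even though page 4 was only partially
 * written -- the file simply ends inside it.
 */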

/*
 * Vnode op for read using bio
 */
int
nfs_bioread(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred)
{
	struct nfsnode *np = VTONFS(vp);
	int biosize, i;
	struct buf *bp = 0, *rabp;
	struct vattr vattr;
	struct thread *td;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn, rabn;
	int bcount;
	int seqcount;
	int nra, error = 0, n = 0, on = 0;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_offset < 0)	/* XXX VDIR cookies can be negative */
		return (EINVAL);
	td = uio->uio_td;

	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
		(void)nfs_fsinfo(nmp, vp, cred, td);
	if (vp->v_type != VDIR &&
	    (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	biosize = vp->v_mount->mnt_stat.f_iosize;
	seqcount = (int)((off_t)(ioflag >> IO_SEQSHIFT) * biosize / BKVASIZE);
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * NB: This implies that cache data can be read when up to
	 * NFS_ATTRTIMEO seconds out of date.  If you find that you need
	 * current attributes, this can be forced by setting n_attrstamp
	 * to 0 before the VOP_GETATTR() call.
	 */
	if (np->n_flag & NMODIFIED) {
		if (vp->v_type != VREG) {
			if (vp->v_type != VDIR)
				panic("nfs: bioread, not dir");
			(nmp->nm_rpcops->nr_invaldir)(vp);
			error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1);
			if (error)
				return (error);
		}
		np->n_attrstamp = 0;
		error = VOP_GETATTR(vp, &vattr, cred, td);
		if (error)
			return (error);
		np->n_mtime = vattr.va_mtime.tv_sec;
	} else {
		error = VOP_GETATTR(vp, &vattr, cred, td);
		if (error)
			return (error);
		if ((np->n_flag & NSIZECHANGED)
		    || (np->n_mtime != vattr.va_mtime.tv_sec)) {
			if (vp->v_type == VDIR)
				(nmp->nm_rpcops->nr_invaldir)(vp);
			error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1);
			if (error)
				return (error);
			np->n_mtime = vattr.va_mtime.tv_sec;
			np->n_flag &= ~NSIZECHANGED;
		}
	}
	do {
	    switch (vp->v_type) {
	    case VREG:
		nfsstats.biocache_reads++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);

		/*
		 * Start the read ahead(s), as required.
		 */
		if (nmp->nm_readahead > 0) {
		    for (nra = 0; nra < nmp->nm_readahead && nra < seqcount &&
			(off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
			rabn = lbn + 1 + nra;
			if (incore(&vp->v_bufobj, rabn) == NULL) {
			    rabp = nfs_getcacheblk(vp, rabn, biosize, td);
			    if (!rabp) {
				error = nfs_sigintr(nmp, NULL, td);
				return (error ? error : EINTR);
			    }
			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
				rabp->b_flags |= B_ASYNC;
				rabp->b_iocmd = BIO_READ;
				vfs_busy_pages(rabp, 0);
				if (nfs_asyncio(nmp, rabp, cred, td)) {
				    rabp->b_flags |= B_INVAL;
				    rabp->b_ioflags |= BIO_ERROR;
				    vfs_unbusy_pages(rabp);
				    brelse(rabp);
				    break;
				}
			    } else {
				brelse(rabp);
			    }
			}
		    }
		}

		/*
		 * Obtain the buffer cache block.  Figure out the buffer size
		 * when we are at EOF.  If we are modifying the size of the
		 * buffer based on an EOF condition we need to hold
		 * nfs_rslock() through obtaining the buffer to prevent
		 * a potential writer-appender from messing with n_size.
		 * Otherwise we may accidentally truncate the buffer and
		 * lose dirty data.
		 *
		 * Note that bcount is *not* DEV_BSIZE aligned.
		 */

again:
		bcount = biosize;
		if ((off_t)lbn * biosize >= np->n_size) {
			bcount = 0;
		} else if ((off_t)(lbn + 1) * biosize > np->n_size) {
			bcount = np->n_size - (off_t)lbn * biosize;
		}
		if (bcount != biosize) {
			switch(nfs_rslock(np, td)) {
			case ENOLCK:
				goto again;
				/* not reached */
			case EIO:
				return (EIO);
			case EINTR:
			case ERESTART:
				return (EINTR);
				/* not reached */
			default:
				break;
			}
		}

		bp = nfs_getcacheblk(vp, lbn, bcount, td);

		if (bcount != biosize)
			nfs_rsunlock(np, td);
		if (!bp) {
			error = nfs_sigintr(nmp, NULL, td);
			return (error ? error : EINTR);
		}

		/*
		 * If B_CACHE is not set, we must issue the read.  If this
		 * fails, we return an error.
		 */

		if ((bp->b_flags & B_CACHE) == 0) {
		    bp->b_iocmd = BIO_READ;
		    vfs_busy_pages(bp, 0);
		    error = nfs_doio(vp, bp, cred, td);
		    if (error) {
			brelse(bp);
			return (error);
		    }
		}

		/*
		 * on is the offset into the current bp.  Figure out how many
		 * bytes we can copy out of the bp.  Note that bcount is
		 * NOT DEV_BSIZE aligned.
		 *
		 * Then figure out how many bytes we can copy into the uio.
		 */
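
		/*
		 * Worked example (illustrative): with biosize 8192, a
		 * uio_offset of 12000 gives lbn 1 and on 3808.  If the file
		 * is 14000 bytes long, bcount is chopped to 14000 - 8192 =
		 * 5808 above, so at most 5808 - 3808 = 2000 bytes can be
		 * copied out of this buffer; any remaining uio_resid is
		 * satisfied on the next trip through the loop.
		 */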
		n = 0;
		if (on < bcount)
			n = min((unsigned)(bcount - on), uio->uio_resid);
		break;
	    case VLNK:
		nfsstats.biocache_readlinks++;
		bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, td);
		if (!bp) {
			error = nfs_sigintr(nmp, NULL, td);
			return (error ? error : EINTR);
		}
		if ((bp->b_flags & B_CACHE) == 0) {
		    bp->b_iocmd = BIO_READ;
		    vfs_busy_pages(bp, 0);
		    error = nfs_doio(vp, bp, cred, td);
		    if (error) {
			bp->b_ioflags |= BIO_ERROR;
			brelse(bp);
			return (error);
		    }
		}
		n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
		on = 0;
		break;
	    case VDIR:
		nfsstats.biocache_readdirs++;
		if (np->n_direofoffset
		    && uio->uio_offset >= np->n_direofoffset) {
			return (0);
		}
		lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ;
		on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
		bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, td);
		if (!bp) {
			error = nfs_sigintr(nmp, NULL, td);
			return (error ? error : EINTR);
		}
		if ((bp->b_flags & B_CACHE) == 0) {
		    bp->b_iocmd = BIO_READ;
		    vfs_busy_pages(bp, 0);
		    error = nfs_doio(vp, bp, cred, td);
		    if (error) {
			    brelse(bp);
		    }
		    while (error == NFSERR_BAD_COOKIE) {
			(nmp->nm_rpcops->nr_invaldir)(vp);
			error = nfs_vinvalbuf(vp, 0, cred, td, 1);
			/*
			 * Yuck! The directory has been modified on the
			 * server.  The only way to get the block is by
			 * reading from the beginning to get all the
			 * offset cookies.
			 *
			 * Leave the last bp intact unless there is an error.
			 * Loop back up to the while if the error is another
			 * NFSERR_BAD_COOKIE (double yuck!).
			 */
			for (i = 0; i <= lbn && !error; i++) {
			    if (np->n_direofoffset
				&& (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
				    return (0);
			    bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, td);
			    if (!bp) {
				error = nfs_sigintr(nmp, NULL, td);
				return (error ? error : EINTR);
			    }
			    if ((bp->b_flags & B_CACHE) == 0) {
				    bp->b_iocmd = BIO_READ;
				    vfs_busy_pages(bp, 0);
				    error = nfs_doio(vp, bp, cred, td);
				    /*
				     * no error + B_INVAL == directory EOF,
				     * use the block.
				     */
				    if (error == 0 && (bp->b_flags & B_INVAL))
					    break;
			    }
			    /*
			     * An error will throw away the block and the
			     * for loop will break out.  If no error and this
			     * is not the block we want, we throw away the
			     * block and go for the next one via the for loop.
			     */
			    if (error || i < lbn)
				    brelse(bp);
			}
		    }
		    /*
		     * The above while is repeated if we hit another cookie
		     * error.  If we hit an error and it wasn't a cookie error,
		     * we give up.
		     */
		    if (error)
			    return (error);
		}
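
		/*
		 * Background note (illustrative): NFS readdir replies are
		 * addressed by opaque per-entry cookies rather than byte
		 * offsets, so directory block N + 1 can only be requested
		 * with a cookie taken from block N.  That chaining is why
		 * the recovery path above must re-read from block 0 after
		 * NFSERR_BAD_COOKIE, and why the read ahead below can start
		 * only once the current block (and with it the next cookie)
		 * is resident.
		 */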

		/*
		 * If not eof and read aheads are enabled, start one.
		 * (You need the current block first, so that you have the
		 * directory offset cookie of the next block.)
		 */
		if (nmp->nm_readahead > 0 &&
		    (bp->b_flags & B_INVAL) == 0 &&
		    (np->n_direofoffset == 0 ||
		    (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
		    incore(&vp->v_bufobj, lbn + 1) == NULL) {
			rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, td);
			if (rabp) {
			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
				rabp->b_flags |= B_ASYNC;
				rabp->b_iocmd = BIO_READ;
				vfs_busy_pages(rabp, 0);
				if (nfs_asyncio(nmp, rabp, cred, td)) {
				    rabp->b_flags |= B_INVAL;
				    rabp->b_ioflags |= BIO_ERROR;
				    vfs_unbusy_pages(rabp);
				    brelse(rabp);
				}
			    } else {
				brelse(rabp);
			    }
			}
		}
		/*
		 * Unlike VREG files, whose buffer size ( bp->b_bcount ) is
		 * chopped for the EOF condition, we cannot tell how large
		 * NFS directories are going to be until we hit EOF.  So
		 * an NFS directory buffer is *not* chopped to its EOF.  Now,
		 * it just so happens that b_resid will effectively chop it
		 * to EOF.  *BUT* this information is lost if the buffer goes
		 * away and is reconstituted into a B_CACHE state ( due to
		 * being VMIO ) later.  So we keep track of the directory eof
		 * in np->n_direofoffset and chop it off as an extra step
		 * right here.
		 */
		n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
		if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset)
			n = np->n_direofoffset - uio->uio_offset;
		break;
	    default:
		printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
		break;
	    }

	    if (n > 0) {
		    error = uiomove(bp->b_data + on, (int)n, uio);
	    }
	    switch (vp->v_type) {
	    case VREG:
		break;
	    case VLNK:
		n = 0;
		break;
	    case VDIR:
		break;
	    default:
		printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
	    }
	    brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}

/*
 * Vnode op for write using bio
 */
int
nfs_write(struct vop_write_args *ap)
{
	int biosize;
	struct uio *uio = ap->a_uio;
	struct thread *td = uio->uio_td;
	struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn;
	int bcount;
	int n, on, error = 0;
	int haverslock = 0;
	struct proc *p = td ? td->td_proc : NULL;

	GIANT_REQUIRED;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_td != curthread)
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
		(void)nfs_fsinfo(nmp, vp, cred, td);

	/*
	 * Synchronously flush pending buffers if we are in synchronous
	 * mode or if we are appending.
	 */
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1);
			if (error)
				return (error);
		}
	}

	/*
	 * If IO_APPEND then load uio_offset.  We restart here if we cannot
	 * get the append lock.
	 */
restart:
	if (ioflag & IO_APPEND) {
		np->n_attrstamp = 0;
		error = VOP_GETATTR(vp, &vattr, cred, td);
		if (error)
			return (error);
		uio->uio_offset = np->n_size;
	}

	if (uio->uio_offset < 0)
		return (EINVAL);
	if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	if (uio->uio_resid == 0)
		return (0);

	/*
	 * We need to obtain the rslock if we intend to modify np->n_size
	 * in order to guarantee the append point with multiple contending
	 * writers, to guarantee that no other appenders modify n_size
	 * while we are trying to obtain a truncated buffer (i.e. to avoid
	 * accidentally truncating data written by another appender due to
	 * the race), and to ensure that the buffer is populated prior to
	 * our extending of the file.  We hold rslock through the entire
	 * operation.
	 *
	 * Note that we do not synchronize the case where someone truncates
	 * the file while we are appending to it because attempting to lock
	 * this case may deadlock other parts of the system unexpectedly.
	 */
	if ((ioflag & IO_APPEND) ||
	    uio->uio_offset + uio->uio_resid > np->n_size) {
		switch(nfs_rslock(np, td)) {
		case ENOLCK:
			goto restart;
			/* not reached */
		case EIO:
			return (EIO);
		case EINTR:
		case ERESTART:
			return (EINTR);
			/* not reached */
		default:
			break;
		}
		haverslock = 1;
	}

	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	if (p != NULL) {
		PROC_LOCK(p);
		if (uio->uio_offset + uio->uio_resid >
		    lim_cur(p, RLIMIT_FSIZE)) {
			psignal(p, SIGXFSZ);
			PROC_UNLOCK(p);
			if (haverslock)
				nfs_rsunlock(np, td);
			return (EFBIG);
		}
		PROC_UNLOCK(p);
	}

	biosize = vp->v_mount->mnt_stat.f_iosize;

	do {
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize-1);
		n = min((unsigned)(biosize - on), uio->uio_resid);
again:
		/*
		 * Handle direct append and file extension cases, calculate
		 * unaligned buffer size.
		 */

		if (uio->uio_offset == np->n_size && n) {
			/*
			 * Get the buffer (in its pre-append state to maintain
			 * B_CACHE if it was previously set).  Resize the
			 * nfsnode after we have locked the buffer to prevent
			 * readers from reading garbage.
			 */
			bcount = on;
			bp = nfs_getcacheblk(vp, lbn, bcount, td);

			if (bp != NULL) {
				long save;

				np->n_size = uio->uio_offset + n;
				np->n_flag |= NMODIFIED;
				vnode_pager_setsize(vp, np->n_size);

				save = bp->b_flags & B_CACHE;
				bcount += n;
				allocbuf(bp, bcount);
				bp->b_flags |= save;
			}
		} else {
			/*
			 * Obtain the locked cache block first, and then
			 * adjust the file's size as appropriate.
			 */
			bcount = on + n;
			if ((off_t)lbn * biosize + bcount < np->n_size) {
				if ((off_t)(lbn + 1) * biosize < np->n_size)
					bcount = biosize;
				else
					bcount = np->n_size - (off_t)lbn * biosize;
			}
			bp = nfs_getcacheblk(vp, lbn, bcount, td);
			if (uio->uio_offset + n > np->n_size) {
				np->n_size = uio->uio_offset + n;
				np->n_flag |= NMODIFIED;
				vnode_pager_setsize(vp, np->n_size);
			}
		}

		if (!bp) {
			error = nfs_sigintr(nmp, NULL, td);
			if (!error)
				error = EINTR;
			break;
		}
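
		/*
		 * Worked example (illustrative): appending n == 1000 bytes
		 * at uio_offset == n_size == 6000 with biosize 8192 takes
		 * the first branch above: lbn 0, on 6000.  The buffer is
		 * obtained at its pre-append size (bcount == 6000) so any
		 * existing B_CACHE state is observed, n_size is extended to
		 * 7000, and allocbuf() then grows the buffer to 7000 with
		 * B_CACHE restored.
		 */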

		/*
		 * Issue a READ if B_CACHE is not set.  In special-append
		 * mode, B_CACHE is based on the buffer prior to the write
		 * op and is typically set, avoiding the read.  If a read
		 * is required in special append mode, the server will
		 * probably send us a short-read since we extended the file
		 * on our end, resulting in b_resid == 0 and, thus,
		 * B_CACHE getting set.
		 *
		 * We can also avoid issuing the read if the write covers
		 * the entire buffer.  We have to make sure the buffer state
		 * is reasonable in this case since we will not be initiating
		 * I/O.  See the comments in kern/vfs_bio.c's getblk() for
		 * more information.
		 *
		 * B_CACHE may also be set due to the buffer being cached
		 * normally.
		 */

		if (on == 0 && n == bcount) {
			bp->b_flags |= B_CACHE;
			bp->b_flags &= ~B_INVAL;
			bp->b_ioflags &= ~BIO_ERROR;
		}

		if ((bp->b_flags & B_CACHE) == 0) {
			bp->b_iocmd = BIO_READ;
			vfs_busy_pages(bp, 0);
			error = nfs_doio(vp, bp, cred, td);
			if (error) {
				brelse(bp);
				break;
			}
		}
		if (!bp) {
			error = nfs_sigintr(nmp, NULL, td);
			if (!error)
				error = EINTR;
			break;
		}
		if (bp->b_wcred == NOCRED)
			bp->b_wcred = crhold(cred);
		np->n_flag |= NMODIFIED;

		/*
		 * If dirtyend exceeds file size, chop it down.  This should
		 * not normally occur but there is an append race where it
		 * might occur XXX, so we log it.
		 *
		 * If the chopping creates a reverse-indexed or degenerate
		 * situation with dirtyoff/end, we 0 both of them.
		 */

		if (bp->b_dirtyend > bcount) {
			printf("NFS append race @%lx:%d\n",
			    (long)bp->b_blkno * DEV_BSIZE,
			    bp->b_dirtyend - bcount);
			bp->b_dirtyend = bcount;
		}

		if (bp->b_dirtyoff >= bp->b_dirtyend)
			bp->b_dirtyoff = bp->b_dirtyend = 0;

		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise force a write rpc of the old dirty area.
		 *
		 * While it is possible to merge discontiguous writes due to
		 * our having a B_CACHE buffer ( and thus valid read data
		 * for the hole), we don't because it could lead to
		 * significant cache coherency problems with multiple clients,
		 * especially if locking is implemented later on.
		 *
		 * As an optimization we could theoretically maintain
		 * a linked list of discontinuous areas, but we would still
		 * have to commit them separately so there isn't much
		 * advantage to it except perhaps a bit of asynchronization.
		 */

		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			if (bwrite(bp) == EINTR) {
				error = EINTR;
				break;
			}
			goto again;
		}

		error = uiomove((char *)bp->b_data + on, n, uio);

		/*
		 * Since this block is being modified, it must be written
		 * again and not just committed.  Since write clustering does
		 * not work for the stage 1 data write, only the stage 2
		 * commit rpc, we have to clear B_CLUSTEROK as well.
		 */
		bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);

		if (error) {
			bp->b_ioflags |= BIO_ERROR;
			brelse(bp);
			break;
		}

		/*
		 * Only update dirtyoff/dirtyend if not a degenerate
		 * condition.
		 */
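
		/*
		 * Worked example (illustrative): with an existing dirty
		 * region [512, 1024) in this buffer, a new write at
		 * on == 1024, n == 512 is contiguous, so the update below
		 * simply extends the region to [512, 1536).  A write at
		 * on == 2048 would have been discontiguous and would have
		 * forced the bwrite() of the old region above before the
		 * goto again retry.
		 */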
		if (n) {
			if (bp->b_dirtyend > 0) {
				bp->b_dirtyoff = min(on, bp->b_dirtyoff);
				bp->b_dirtyend = max((on + n), bp->b_dirtyend);
			} else {
				bp->b_dirtyoff = on;
				bp->b_dirtyend = on + n;
			}
			vfs_bio_set_validclean(bp, on, n);
		}

		/*
		 * If IO_SYNC do bwrite().
		 *
		 * IO_INVAL appears to be unused.  The idea appears to be
		 * to turn off caching in this case.  Very odd.  XXX
		 */
		if ((ioflag & IO_SYNC)) {
			if (ioflag & IO_INVAL)
				bp->b_flags |= B_NOCACHE;
			error = bwrite(bp);
			if (error)
				break;
		} else if ((n + on) == biosize) {
			bp->b_flags |= B_ASYNC;
			(void) (nmp->nm_rpcops->nr_writebp)(bp, 0, 0);
		} else {
			bdwrite(bp);
		}
	} while (uio->uio_resid > 0 && n > 0);

	if (haverslock)
		nfs_rsunlock(np, td);

	return (error);
}

/*
 * Get an nfs cache block.
 *
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy.  If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 *
 * The caller must carefully deal with the possible B_INVAL state of
 * the buffer.  nfs_doio() clears B_INVAL (and nfs_asyncio() clears it
 * indirectly), so synchronous reads can be issued without worrying about
 * the B_INVAL state.  We have to be a little more careful when dealing
 * with writes (see comments in nfs_write()) when extending a file past
 * its EOF.
 */
static struct buf *
nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size, struct thread *td)
{
	struct buf *bp;
	struct mount *mp;
	struct nfsmount *nmp;

	mp = vp->v_mount;
	nmp = VFSTONFS(mp);

	if (nmp->nm_flag & NFSMNT_INT) {
		bp = getblk(vp, bn, size, PCATCH, 0, 0);
		while (bp == NULL) {
			if (nfs_sigintr(nmp, NULL, td))
				return (NULL);
			bp = getblk(vp, bn, size, 0, 2 * hz, 0);
		}
	} else {
		bp = getblk(vp, bn, size, 0, 0, 0);
	}

	if (vp->v_type == VREG) {
		int biosize;

		biosize = mp->mnt_stat.f_iosize;
		bp->b_blkno = bn * (biosize / DEV_BSIZE);
	}
	return (bp);
}
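
/*
 * Worked example (illustrative): with f_iosize 8192 and DEV_BSIZE 512,
 * logical block bn maps to b_blkno == bn * 16, i.e. the buffer's position
 * expressed in 512-byte device blocks.  nfs_doio() relies on this when it
 * reconstructs the byte offset as b_blkno * DEV_BSIZE.
 */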

/*
 * Flush and invalidate all dirty buffers.  If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(struct vnode *vp, int flags, struct ucred *cred,
    struct thread *td, int intrflg)
{
	struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;
	int old_lock = 0;

	ASSERT_VOP_LOCKED(vp, "nfs_vinvalbuf");

	/*
	 * XXX This check stops us from needlessly doing a vinvalbuf when
	 * being called through vclean().  It is not clear that this is
	 * unsafe.
	 */
	if (vp->v_iflag & VI_XLOCK)
		return (0);

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}

	if ((old_lock = VOP_ISLOCKED(vp, td)) != LK_EXCLUSIVE) {
		if (old_lock == LK_SHARED) {
			/* Upgrade to exclusive lock, this might block */
			vn_lock(vp, LK_UPGRADE | LK_RETRY, td);
		} else {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
		}
	}

	/*
	 * Now, flush as required.
	 */
	error = vinvalbuf(vp, flags, cred, td, slpflag, 0);
	while (error) {
		if (intrflg && (error = nfs_sigintr(nmp, NULL, td)))
			goto out;
		error = vinvalbuf(vp, flags, cred, td, 0, slptimeo);
	}
	np->n_flag &= ~NMODIFIED;
out:
	if (old_lock != LK_EXCLUSIVE) {
		if (old_lock == LK_SHARED) {
			/* Downgrade from exclusive lock, this might block */
			vn_lock(vp, LK_DOWNGRADE, td);
		} else {
			VOP_UNLOCK(vp, 0, td);
		}
	}
	return (error);
}

/*
 * Initiate asynchronous I/O.  Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 *
 * Note: nfs_asyncio() does not clear (BIO_ERROR|B_INVAL) but when the bp
 * is eventually dequeued by the async daemon, nfs_doio() *will*.
 */
int
nfs_asyncio(struct nfsmount *nmp, struct buf *bp, struct ucred *cred, struct thread *td)
{
	int iod;
	int gotiod;
	int slpflag = 0;
	int slptimeo = 0;
	int error, error2;

	/*
	 * Commits are usually short and sweet so let's save some cpu and
	 * leave the async daemons for more important rpc's (such as reads
	 * and writes).
	 */
	if (bp->b_iocmd == BIO_WRITE && (bp->b_flags & B_NEEDCOMMIT) &&
	    (nmp->nm_bufqiods > nfs_numasync / 2)) {
		return (EIO);
	}

again:
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	gotiod = FALSE;

	/*
	 * Find a free iod to process this request.
	 */
	for (iod = 0; iod < nfs_numasync; iod++)
		if (nfs_iodwant[iod]) {
			gotiod = TRUE;
			break;
		}

	/*
	 * Try to create one if none are free.
	 */
	if (!gotiod) {
		iod = nfs_nfsiodnew();
		if (iod != -1)
			gotiod = TRUE;
	}

	if (gotiod) {
		/*
		 * Found one, so wake it up and tell it which
		 * mount to process.
		 */
		NFS_DPF(ASYNCIO, ("nfs_asyncio: waking iod %d for mount %p\n",
		    iod, nmp));
		nfs_iodwant[iod] = NULL;
		nfs_iodmount[iod] = nmp;
		nmp->nm_bufqiods++;
		wakeup(&nfs_iodwant[iod]);
	}

	/*
	 * If none are free, we may already have an iod working on this mount
	 * point.  If so, it will process our request.
	 */
	if (!gotiod) {
		if (nmp->nm_bufqiods > 0) {
			NFS_DPF(ASYNCIO,
			    ("nfs_asyncio: %d iods are already processing mount %p\n",
			     nmp->nm_bufqiods, nmp));
			gotiod = TRUE;
		}
	}

	/*
	 * If we have an iod which can process the request, then queue
	 * the buffer.
	 */
	if (gotiod) {
		/*
		 * Ensure that the queue never grows too large.  We still want
		 * to asynchronize so we block rather than return EIO.
		 */
		while (nmp->nm_bufqlen >= 2*nfs_numasync) {
			NFS_DPF(ASYNCIO,
			    ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
			nmp->nm_bufqwant = TRUE;
			error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
			    "nfsaio", slptimeo);
			if (error) {
				error2 = nfs_sigintr(nmp, NULL, td);
				if (error2)
					return (error2);
				if (slpflag == PCATCH) {
					slpflag = 0;
					slptimeo = 2 * hz;
				}
			}
			/*
			 * We might have lost our iod while sleeping,
			 * so check and loop if necessary.
			 */
			if (nmp->nm_bufqiods == 0) {
				NFS_DPF(ASYNCIO,
				    ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
				goto again;
			}
		}
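
		/*
		 * Note (illustrative): BUF_KERNPROC() below hands ownership
		 * of the locked buffer over to the kernel, since the nfsiod
		 * thread that dequeues it -- not this thread -- is the one
		 * that will complete and release it.
		 */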
		if (bp->b_iocmd == BIO_READ) {
			if (bp->b_rcred == NOCRED && cred != NOCRED)
				bp->b_rcred = crhold(cred);
		} else {
			if (bp->b_wcred == NOCRED && cred != NOCRED)
				bp->b_wcred = crhold(cred);
		}

		if (bp->b_flags & B_REMFREE)
			bremfreef(bp);
		BUF_KERNPROC(bp);
		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
		nmp->nm_bufqlen++;
		return (0);
	}

	/*
	 * All the iods are busy on other mounts, so return EIO to
	 * force the caller to process the i/o synchronously.
	 */
	NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
	return (EIO);
}

/*
 * Do an I/O operation to/from a cache block.  This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(struct vnode *vp, struct buf *bp, struct ucred *cr, struct thread *td)
{
	struct uio *uiop;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, iomode, must_commit = 0;
	struct uio uio;
	struct iovec io;
	struct proc *p = td ? td->td_proc : NULL;

	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_td = td;

	/*
	 * Clear BIO_ERROR and B_INVAL state prior to initiating the I/O.  We
	 * do this here so we do not have to do it in all the code that
	 * calls us.
	 */
	bp->b_flags &= ~B_INVAL;
	bp->b_ioflags &= ~BIO_ERROR;

	KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp));

	if (bp->b_iocmd == BIO_READ) {
	    io.iov_len = uiop->uio_resid = bp->b_bcount;
	    io.iov_base = bp->b_data;
	    uiop->uio_rw = UIO_READ;

	    switch (vp->v_type) {
	    case VREG:
		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
		nfsstats.read_bios++;
		error = (nmp->nm_rpcops->nr_readrpc)(vp, uiop, cr);

		if (!error) {
		    if (uiop->uio_resid) {
			/*
			 * If we had a short read with no error, we must have
			 * hit a file hole.  We should zero-fill the remainder.
			 * This can also occur if the server hits the file EOF.
			 *
			 * Holes used to be able to occur due to pending
			 * writes, but that is not possible any longer.
			 */
			int nread = bp->b_bcount - uiop->uio_resid;
			int left = uiop->uio_resid;

			if (left > 0)
				bzero((char *)bp->b_data + nread, left);
			uiop->uio_resid = 0;
		    }
		}
		/* ASSERT_VOP_LOCKED(vp, "nfs_doio"); */
		if (p && (vp->v_vflag & VV_TEXT) &&
		    (np->n_mtime != np->n_vattr.va_mtime.tv_sec)) {
			PROC_LOCK(p);
			killproc(p, "text file modification");
			PROC_UNLOCK(p);
		}
		break;
	    case VLNK:
		uiop->uio_offset = (off_t)0;
		nfsstats.readlink_bios++;
		error = (nmp->nm_rpcops->nr_readlinkrpc)(vp, uiop, cr);
		break;
	    case VDIR:
		nfsstats.readdir_bios++;
		uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
		if ((nmp->nm_flag & NFSMNT_NFSV4) != 0)
			error = nfs4_readdirrpc(vp, uiop, cr);
		else {
			if ((nmp->nm_flag & NFSMNT_RDIRPLUS) != 0) {
				error = nfs_readdirplusrpc(vp, uiop, cr);
				if (error == NFSERR_NOTSUPP)
					nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
			}
			if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
				error = nfs_readdirrpc(vp, uiop, cr);
		}
		/*
		 * end-of-directory sets B_INVAL but does not generate an
		 * error.
		 */
		if (error == 0 && uiop->uio_resid == bp->b_bcount)
			bp->b_flags |= B_INVAL;
		break;
	    default:
		printf("nfs_doio: type %x unexpected\n", vp->v_type);
		break;
	    }
	    if (error) {
		bp->b_ioflags |= BIO_ERROR;
		bp->b_error = error;
	    }
	} else {
	    /*
	     * If we only need to commit, try to commit
	     */
	    if (bp->b_flags & B_NEEDCOMMIT) {
		    int retv;
		    off_t off;

		    off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff;
		    retv = (nmp->nm_rpcops->nr_commit)(
				vp, off, bp->b_dirtyend-bp->b_dirtyoff,
				bp->b_wcred, td);
		    if (retv == 0) {
			    bp->b_dirtyoff = bp->b_dirtyend = 0;
			    bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
			    bp->b_resid = 0;
			    bufdone(bp);
			    return (0);
		    }
		    if (retv == NFSERR_STALEWRITEVERF) {
			    nfs_clearcommit(vp->v_mount);
		    }
	    }

	    /*
	     * Setup for actual write
	     */

	    if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
		bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE;

	    if (bp->b_dirtyend > bp->b_dirtyoff) {
		io.iov_len = uiop->uio_resid = bp->b_dirtyend
		    - bp->b_dirtyoff;
		uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE
		    + bp->b_dirtyoff;
		io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
		uiop->uio_rw = UIO_WRITE;
		nfsstats.write_bios++;

		if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC)
		    iomode = NFSV3WRITE_UNSTABLE;
		else
		    iomode = NFSV3WRITE_FILESYNC;

		error = (nmp->nm_rpcops->nr_writerpc)(vp, uiop, cr, &iomode, &must_commit);

		/*
		 * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try
		 * to cluster the buffers needing commit.  This will allow
		 * the system to submit a single commit rpc for the whole
		 * cluster.  We can do this even if the buffer is not 100%
		 * dirty (relative to the NFS blocksize), so we optimize the
		 * append-to-file-case.
		 *
		 * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be
		 * cleared because write clustering only works for commit
		 * rpc's, not for the data portion of the write).
		 */
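
		/*
		 * Flag life cycle of an NFSv3 unstable write (illustrative
		 * summary): a successful WRITE with iomode UNSTABLE leaves
		 * the buffer dirty with B_NEEDCOMMIT (plus B_CLUSTEROK when
		 * the whole buffer is dirty); a later successful COMMIT
		 * cleans it.  If COMMIT returns NFSERR_STALEWRITEVERF (the
		 * server rebooted), nfs_clearcommit() strips B_NEEDCOMMIT
		 * mount-wide and the data is simply written again.
		 */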
		if (!error && iomode == NFSV3WRITE_UNSTABLE) {
		    bp->b_flags |= B_NEEDCOMMIT;
		    if (bp->b_dirtyoff == 0
			&& bp->b_dirtyend == bp->b_bcount)
			bp->b_flags |= B_CLUSTEROK;
		} else {
		    bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
		}

		/*
		 * For an interrupted write, the buffer is still valid
		 * and the write hasn't been pushed to the server yet,
		 * so we can't set BIO_ERROR and report the interruption
		 * by setting B_EINTR.  For the B_ASYNC case, B_EINTR
		 * is not relevant, so the rpc attempt is essentially
		 * a noop.  For the case of a V3 write rpc not being
		 * committed to stable storage, the block is still
		 * dirty and requires either a commit rpc or another
		 * write rpc with iomode == NFSV3WRITE_FILESYNC before
		 * the block is reused.  This is indicated by setting
		 * the B_DELWRI and B_NEEDCOMMIT flags.
		 *
		 * If the buffer is marked B_PAGING, it does not reside on
		 * the vp's paging queues so we cannot call bdirty().  The
		 * bp in this case is not an NFS cache block so we should
		 * be safe.  XXX
		 */
		if (error == EINTR || error == EIO
		    || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
			int s;

			s = splbio();
			bp->b_flags &= ~(B_INVAL|B_NOCACHE);
			if ((bp->b_flags & B_PAGING) == 0) {
				bdirty(bp);
				bp->b_flags &= ~B_DONE;
			}
			if (error && (bp->b_flags & B_ASYNC) == 0)
				bp->b_flags |= B_EINTR;
			splx(s);
		} else {
			if (error) {
				bp->b_ioflags |= BIO_ERROR;
				bp->b_error = np->n_error = error;
				np->n_flag |= NWRITEERR;
			}
			bp->b_dirtyoff = bp->b_dirtyend = 0;
		}
	    } else {
		bp->b_resid = 0;
		bufdone(bp);
		return (0);
	    }
	}
	bp->b_resid = uiop->uio_resid;
	if (must_commit)
	    nfs_clearcommit(vp->v_mount);
	bufdone(bp);
	return (error);
}

/*
 * Used to aid in handling ftruncate() operations on the NFS client side.
 * Truncation creates a number of special problems for NFS.  We have to
 * throw away VM pages and buffer cache buffers that are beyond EOF, and
 * we have to properly handle VM pages or (potentially dirty) buffers
 * that straddle the truncation point.
 */

int
nfs_meta_setsize(struct vnode *vp, struct ucred *cred, struct thread *td, u_quad_t nsize)
{
	struct nfsnode *np = VTONFS(vp);
	u_quad_t tsize = np->n_size;
	int biosize = vp->v_mount->mnt_stat.f_iosize;
	int error = 0;

	np->n_size = nsize;

	if (np->n_size < tsize) {
		struct buf *bp;
		daddr_t lbn;
		int bufsize;

		/*
		 * vtruncbuf() doesn't get the buffer overlapping the
		 * truncation point.  We may have a B_DELWRI and/or B_CACHE
		 * buffer that now needs to be truncated.
		 */
		error = vtruncbuf(vp, cred, td, nsize, biosize);
		lbn = nsize / biosize;
		bufsize = nsize & (biosize - 1);
		bp = nfs_getcacheblk(vp, lbn, bufsize, td);
		if (bp->b_dirtyoff > bp->b_bcount)
			bp->b_dirtyoff = bp->b_bcount;
		if (bp->b_dirtyend > bp->b_bcount)
			bp->b_dirtyend = bp->b_bcount;
		bp->b_flags |= B_RELBUF;	/* don't leave garbage around */
		brelse(bp);
	} else {
		vnode_pager_setsize(vp, nsize);
	}
	return (error);
}