nfs_bio.c revision 138644
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/nfsclient/nfs_bio.c 138644 2004-12-10 03:27:12Z ps $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <rpc/rpcclnt.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfsclient/nfs.h>
#include <nfsclient/nfsmount.h>
#include <nfsclient/nfsnode.h>

#include <nfs4client/nfs4.h>

static struct buf *nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size,
		    struct thread *td);

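/*
 * Note that nfs_getpages() and nfs_putpages() below bypass the nfs
 * buffer cache entirely: they map the VM pages into a pbuf's kernel
 * virtual address space and issue the read or write RPC directly, so
 * they never create buffer-cache blocks for the vnode.
 */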
/*
 * Vnode op for VM getpages.
 */
int
nfs_getpages(struct vop_getpages_args *ap)
{
	int i, error, nextoff, size, toff, count, npages;
	struct uio uio;
	struct iovec iov;
	vm_offset_t kva;
	struct buf *bp;
	struct vnode *vp;
	struct thread *td;
	struct ucred *cred;
	struct nfsmount *nmp;
	vm_object_t object;
	vm_page_t *pages;

	GIANT_REQUIRED;

	vp = ap->a_vp;
	td = curthread;				/* XXX */
	cred = curthread->td_ucred;		/* XXX */
	nmp = VFSTONFS(vp->v_mount);
	pages = ap->a_m;
	count = ap->a_count;

	if ((object = vp->v_object) == NULL) {
		printf("nfs_getpages: called with non-merged cache vnode??\n");
		return VM_PAGER_ERROR;
	}

	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
		/* We'll never get here for v4, because we always have fsinfo */
		(void)nfs_fsinfo(nmp, vp, cred, td);
	}

	npages = btoc(count);

	/*
	 * If the requested page is partially valid, just return it and
	 * allow the pager to zero-out the blanks.  Partially valid pages
	 * can only occur at the file EOF.
	 */

	{
		vm_page_t m = pages[ap->a_reqpage];

		VM_OBJECT_LOCK(object);
		vm_page_lock_queues();
		if (m->valid != 0) {
			/* handled by vm_fault now	  */
			/* vm_page_zero_invalid(m, TRUE); */
			for (i = 0; i < npages; ++i) {
				if (i != ap->a_reqpage)
					vm_page_free(pages[i]);
			}
			vm_page_unlock_queues();
			VM_OBJECT_UNLOCK(object);
			return (0);
		}
		vm_page_unlock_queues();
		VM_OBJECT_UNLOCK(object);
	}

	/*
	 * We use only the kva address for the buffer, but this is extremely
	 * convenient and fast.
	 */
	bp = getpbuf(&nfs_pbuf_freecnt);

	kva = (vm_offset_t) bp->b_data;
	pmap_qenter(kva, pages, npages);
	cnt.v_vnodein++;
	cnt.v_vnodepgsin += npages;

	iov.iov_base = (caddr_t) kva;
	iov.iov_len = count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
	uio.uio_resid = count;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_READ;
	uio.uio_td = td;

	error = (nmp->nm_rpcops->nr_readrpc)(vp, &uio, cred);
	pmap_qremove(kva, npages);

	relpbuf(bp, &nfs_pbuf_freecnt);

	if (error && (uio.uio_resid == count)) {
		printf("nfs_getpages: error %d\n", error);
		VM_OBJECT_LOCK(object);
		vm_page_lock_queues();
		for (i = 0; i < npages; ++i) {
			if (i != ap->a_reqpage)
				vm_page_free(pages[i]);
		}
		vm_page_unlock_queues();
		VM_OBJECT_UNLOCK(object);
		return VM_PAGER_ERROR;
	}
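
	/*
	 * Example (assuming PAGE_SIZE == 4096): if the RPC returned 6000
	 * bytes for a two-page request, page 0 is marked fully valid and
	 * page 1 is marked valid and clean for its first
	 * 6000 - 4096 == 1904 bytes in the loop below; any pages past the
	 * read are left with valid == 0.
	 */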

	/*
	 * Calculate the number of bytes read and validate only that number
	 * of bytes.  Note that due to pending writes, size may be 0.  This
	 * does not mean that the remaining data is invalid!
	 */

	size = count - uio.uio_resid;
	VM_OBJECT_LOCK(object);
	vm_page_lock_queues();
	for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
		vm_page_t m;
		nextoff = toff + PAGE_SIZE;
		m = pages[i];

		if (nextoff <= size) {
			/*
			 * Read operation filled an entire page
			 */
			m->valid = VM_PAGE_BITS_ALL;
			vm_page_undirty(m);
		} else if (size > toff) {
			/*
			 * Read operation filled a partial page.
			 */
			m->valid = 0;
			vm_page_set_validclean(m, 0, size - toff);
			/* handled by vm_fault now	  */
			/* vm_page_zero_invalid(m, TRUE); */
		} else {
			/*
			 * Read operation was short.  If no error occurred
			 * we may have hit a zero-fill section.  We simply
			 * leave valid set to 0.
			 */
			;
		}
		if (i != ap->a_reqpage) {
			/*
			 * Whether or not to leave the page activated is up in
			 * the air, but we should put the page on a page queue
			 * somewhere (it already is in the object).  Empirical
			 * results show that deactivating pages is best.
			 */

			/*
			 * Just in case someone was asking for this page we
			 * now tell them that it is ok to use.
			 */
			if (!error) {
				if (m->flags & PG_WANTED)
					vm_page_activate(m);
				else
					vm_page_deactivate(m);
				vm_page_wakeup(m);
			} else {
				vm_page_free(m);
			}
		}
	}
	vm_page_unlock_queues();
	VM_OBJECT_UNLOCK(object);
	return 0;
}

/*
 * Vnode op for VM putpages.
 */
int
nfs_putpages(struct vop_putpages_args *ap)
{
	struct uio uio;
	struct iovec iov;
	vm_offset_t kva;
	struct buf *bp;
	int iomode, must_commit, i, error, npages, count;
	off_t offset;
	int *rtvals;
	struct vnode *vp;
	struct thread *td;
	struct ucred *cred;
	struct nfsmount *nmp;
	struct nfsnode *np;
	vm_page_t *pages;

	GIANT_REQUIRED;

	vp = ap->a_vp;
	np = VTONFS(vp);
	td = curthread;				/* XXX */
	cred = curthread->td_ucred;		/* XXX */
	nmp = VFSTONFS(vp->v_mount);
	pages = ap->a_m;
	count = ap->a_count;
	rtvals = ap->a_rtvals;
	npages = btoc(count);
	offset = IDX_TO_OFF(pages[0]->pindex);

	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
		(void)nfs_fsinfo(nmp, vp, cred, td);
	}

	for (i = 0; i < npages; i++)
		rtvals[i] = VM_PAGER_AGAIN;
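
	/*
	 * Every slot starts out as VM_PAGER_AGAIN; only the pages that the
	 * write RPC below actually pushes out are upgraded to VM_PAGER_OK,
	 * so the VM system will retry anything left over.
	 */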

	/*
	 * When putting pages, do not extend file past EOF.
	 */

	if (offset + count > np->n_size) {
		count = np->n_size - offset;
		if (count < 0)
			count = 0;
	}

	/*
	 * We use only the kva address for the buffer, but this is extremely
	 * convenient and fast.
	 */
	bp = getpbuf(&nfs_pbuf_freecnt);

	kva = (vm_offset_t) bp->b_data;
	pmap_qenter(kva, pages, npages);
	cnt.v_vnodeout++;
	cnt.v_vnodepgsout += count;

	iov.iov_base = (caddr_t) kva;
	iov.iov_len = count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = offset;
	uio.uio_resid = count;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_WRITE;
	uio.uio_td = td;

	if ((ap->a_sync & VM_PAGER_PUT_SYNC) == 0)
		iomode = NFSV3WRITE_UNSTABLE;
	else
		iomode = NFSV3WRITE_FILESYNC;

	error = (nmp->nm_rpcops->nr_writerpc)(vp, &uio, cred, &iomode, &must_commit);

	pmap_qremove(kva, npages);
	relpbuf(bp, &nfs_pbuf_freecnt);

	if (!error) {
		int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE;
		for (i = 0; i < nwritten; i++) {
			rtvals[i] = VM_PAGER_OK;
			vm_page_undirty(pages[i]);
		}
		if (must_commit) {
			nfs_clearcommit(vp->v_mount);
		}
	}
	return rtvals[0];
}

/*
 * Vnode op for read using bio
 */
int
nfs_bioread(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred)
{
	struct nfsnode *np = VTONFS(vp);
	int biosize, i;
	struct buf *bp = 0, *rabp;
	struct vattr vattr;
	struct thread *td;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn, rabn;
	int bcount;
	int seqcount;
	int nra, error = 0, n = 0, on = 0;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_offset < 0)	/* XXX VDIR cookies can be negative */
		return (EINVAL);
	td = uio->uio_td;

	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
		(void)nfs_fsinfo(nmp, vp, cred, td);
	if (vp->v_type != VDIR &&
	    (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	biosize = vp->v_mount->mnt_stat.f_iosize;
	seqcount = (int)((off_t)(ioflag >> IO_SEQSHIFT) * biosize / BKVASIZE);
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * NB: This implies that cache data can be read when up to
	 * NFS_ATTRTIMEO seconds out of date. If you find that you need
	 * current attributes, this could be forced by setting n_attrstamp
	 * to 0 before the VOP_GETATTR() call.
	 */
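
	/*
	 * Concretely: if another client wrote the file, the VOP_GETATTR()
	 * below (possibly answered from attributes cached up to
	 * NFS_ATTRTIMEO seconds ago) returns a va_mtime newer than the
	 * n_mtime recorded at the last flush, and the vnode's entire
	 * buffer cache is invalidated before any cached data is handed out.
	 */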
	if (np->n_flag & NMODIFIED) {
		if (vp->v_type != VREG) {
			if (vp->v_type != VDIR)
				panic("nfs: bioread, not dir");
			(nmp->nm_rpcops->nr_invaldir)(vp);
			error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1);
			if (error)
				return (error);
		}
		np->n_attrstamp = 0;
		error = VOP_GETATTR(vp, &vattr, cred, td);
		if (error)
			return (error);
		np->n_mtime = vattr.va_mtime;
	} else {
		error = VOP_GETATTR(vp, &vattr, cred, td);
		if (error)
			return (error);
		if ((np->n_flag & NSIZECHANGED)
		    || (NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime))) {
			if (vp->v_type == VDIR)
				(nmp->nm_rpcops->nr_invaldir)(vp);
			error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1);
			if (error)
				return (error);
			np->n_mtime = vattr.va_mtime;
			np->n_flag &= ~NSIZECHANGED;
		}
	}
	do {
	    switch (vp->v_type) {
	    case VREG:
		nfsstats.biocache_reads++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);

		/*
		 * Start the read ahead(s), as required.
		 * The readahead is kicked off only if sequential access
		 * is detected, based on the readahead hint (ra_expect_lbn).
		 */
		if (nmp->nm_readahead > 0 && np->ra_expect_lbn == lbn) {
		    for (nra = 0; nra < nmp->nm_readahead && nra < seqcount &&
			(off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
			rabn = lbn + 1 + nra;
			if (incore(&vp->v_bufobj, rabn) == NULL) {
			    rabp = nfs_getcacheblk(vp, rabn, biosize, td);
			    if (!rabp) {
				error = nfs_sigintr(nmp, NULL, td);
				return (error ? error : EINTR);
			    }
			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
				rabp->b_flags |= B_ASYNC;
				rabp->b_iocmd = BIO_READ;
				vfs_busy_pages(rabp, 0);
				if (nfs_asyncio(nmp, rabp, cred, td)) {
				    rabp->b_flags |= B_INVAL;
				    rabp->b_ioflags |= BIO_ERROR;
				    vfs_unbusy_pages(rabp);
				    brelse(rabp);
				    break;
				}
			    } else {
				brelse(rabp);
			    }
			}
		    }
		    np->ra_expect_lbn = lbn + 1;
		}

		/*
		 * Obtain the buffer cache block.  Figure out the buffer size
		 * when we are at EOF.  If we are modifying the size of the
		 * buffer based on an EOF condition we need to hold
		 * nfs_rslock() through obtaining the buffer to prevent
		 * a potential writer-appender from messing with n_size.
		 * Otherwise we may accidentally truncate the buffer and
		 * lose dirty data.
		 *
		 * Note that bcount is *not* DEV_BSIZE aligned.
		 */

again:
		bcount = biosize;
		if ((off_t)lbn * biosize >= np->n_size) {
			bcount = 0;
		} else if ((off_t)(lbn + 1) * biosize > np->n_size) {
			bcount = np->n_size - (off_t)lbn * biosize;
		}
		if (bcount != biosize) {
			switch (nfs_rslock(np, td)) {
			case ENOLCK:
				goto again;
				/* not reached */
			case EIO:
				return (EIO);
			case EINTR:
			case ERESTART:
				return (EINTR);
				/* not reached */
			default:
				break;
			}
		}

		bp = nfs_getcacheblk(vp, lbn, bcount, td);

		if (bcount != biosize)
			nfs_rsunlock(np, td);
		if (!bp) {
			error = nfs_sigintr(nmp, NULL, td);
			return (error ? error : EINTR);
		}

		/*
		 * If B_CACHE is not set, we must issue the read.  If this
		 * fails, we return an error.
		 */

		if ((bp->b_flags & B_CACHE) == 0) {
		    bp->b_iocmd = BIO_READ;
		    vfs_busy_pages(bp, 0);
		    error = nfs_doio(vp, bp, cred, td);
		    if (error) {
			brelse(bp);
			return (error);
		    }
		}
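
		/*
		 * Example (assuming biosize == 8192): a read at uio_offset
		 * 20000 with 10000 bytes of resid gives lbn == 2 and
		 * on == 3616, so the copy below moves
		 * min(8192 - 3616, 10000) == 4576 bytes out of this block.
		 */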

		/*
		 * on is the offset into the current bp.  Figure out how many
		 * bytes we can copy out of the bp.  Note that bcount is
		 * NOT DEV_BSIZE aligned.
		 *
		 * Then figure out how many bytes we can copy into the uio.
		 */

		n = 0;
		if (on < bcount)
			n = min((unsigned)(bcount - on), uio->uio_resid);
		break;
	    case VLNK:
		nfsstats.biocache_readlinks++;
		bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, td);
		if (!bp) {
			error = nfs_sigintr(nmp, NULL, td);
			return (error ? error : EINTR);
		}
		if ((bp->b_flags & B_CACHE) == 0) {
		    bp->b_iocmd = BIO_READ;
		    vfs_busy_pages(bp, 0);
		    error = nfs_doio(vp, bp, cred, td);
		    if (error) {
			bp->b_ioflags |= BIO_ERROR;
			brelse(bp);
			return (error);
		    }
		}
		n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
		on = 0;
		break;
	    case VDIR:
		nfsstats.biocache_readdirs++;
		if (np->n_direofoffset
		    && uio->uio_offset >= np->n_direofoffset) {
			return (0);
		}
		lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ;
		on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
		bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, td);
		if (!bp) {
			error = nfs_sigintr(nmp, NULL, td);
			return (error ? error : EINTR);
		}
		if ((bp->b_flags & B_CACHE) == 0) {
		    bp->b_iocmd = BIO_READ;
		    vfs_busy_pages(bp, 0);
		    error = nfs_doio(vp, bp, cred, td);
		    if (error) {
			    brelse(bp);
		    }
		    while (error == NFSERR_BAD_COOKIE) {
			(nmp->nm_rpcops->nr_invaldir)(vp);
			error = nfs_vinvalbuf(vp, 0, cred, td, 1);
			/*
			 * Yuck! The directory has been modified on the
			 * server. The only way to get the block is by
			 * reading from the beginning to get all the
			 * offset cookies.
			 *
			 * Leave the last bp intact unless there is an error.
			 * Loop back up to the while if the error is another
			 * NFSERR_BAD_COOKIE (double yuck!).
			 */
			for (i = 0; i <= lbn && !error; i++) {
			    if (np->n_direofoffset
				&& (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
				    return (0);
			    bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, td);
			    if (!bp) {
				error = nfs_sigintr(nmp, NULL, td);
				return (error ? error : EINTR);
			    }
			    if ((bp->b_flags & B_CACHE) == 0) {
				    bp->b_iocmd = BIO_READ;
				    vfs_busy_pages(bp, 0);
				    error = nfs_doio(vp, bp, cred, td);
				    /*
				     * no error + B_INVAL == directory EOF,
				     * use the block.
				     */
				    if (error == 0 && (bp->b_flags & B_INVAL))
					    break;
			    }
			    /*
			     * An error will throw away the block and the
			     * for loop will break out.  If no error and this
			     * is not the block we want, we throw away the
			     * block and go for the next one via the for loop.
			     */
			    if (error || i < lbn)
				    brelse(bp);
			}
		    }
		    /*
		     * The above while is repeated if we hit another cookie
		     * error.  If we hit an error and it wasn't a cookie error,
		     * we give up.
		     */
		    if (error)
			    return (error);
		}
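
		/*
		 * Note that cookie recovery is expensive: it rereads the
		 * directory from the beginning, one NFS_DIRBLKSIZ block at
		 * a time up to and including lbn, just to recover the
		 * offset cookies for the block we actually want.
		 */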
		/*
		 * If not eof and read aheads are enabled, start one.
		 * (You need the current block first, so that you have the
		 * directory offset cookie of the next block.)
		 */
		if (nmp->nm_readahead > 0 &&
		    (bp->b_flags & B_INVAL) == 0 &&
		    (np->n_direofoffset == 0 ||
		    (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
		    incore(&vp->v_bufobj, lbn + 1) == NULL) {
			rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, td);
			if (rabp) {
			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
				rabp->b_flags |= B_ASYNC;
				rabp->b_iocmd = BIO_READ;
				vfs_busy_pages(rabp, 0);
				if (nfs_asyncio(nmp, rabp, cred, td)) {
				    rabp->b_flags |= B_INVAL;
				    rabp->b_ioflags |= BIO_ERROR;
				    vfs_unbusy_pages(rabp);
				    brelse(rabp);
				}
			    } else {
				brelse(rabp);
			    }
			}
		}
		/*
		 * Unlike VREG files, whose buffer size ( bp->b_bcount ) is
		 * chopped for the EOF condition, we cannot tell how large
		 * NFS directories are going to be until we hit EOF.  So
		 * an NFS directory buffer is *not* chopped to its EOF.  Now,
		 * it just so happens that b_resid will effectively chop it
		 * to EOF.  *BUT* this information is lost if the buffer goes
		 * away and is reconstituted into a B_CACHE state ( due to
		 * being VMIO ) later.  So we keep track of the directory eof
		 * in np->n_direofoffset and chop it off as an extra step
		 * right here.
		 */
		n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
		if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset)
			n = np->n_direofoffset - uio->uio_offset;
		break;
	    default:
		printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
		break;
	    }

	    if (n > 0) {
		    error = uiomove(bp->b_data + on, (int)n, uio);
	    }
	    switch (vp->v_type) {
	    case VREG:
		break;
	    case VLNK:
		n = 0;
		break;
	    case VDIR:
		break;
	    default:
		printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
	    }
	    brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}

/*
 * Vnode op for write using bio
 */
int
nfs_write(struct vop_write_args *ap)
{
	int biosize;
	struct uio *uio = ap->a_uio;
	struct thread *td = uio->uio_td;
	struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn;
	int bcount;
	int n, on, error = 0;
	int haverslock = 0;
	struct proc *p = td ? td->td_proc : NULL;

	GIANT_REQUIRED;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_td != curthread)
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
		(void)nfs_fsinfo(nmp, vp, cred, td);

	/*
	 * Synchronously flush pending buffers if we are in synchronous
	 * mode or if we are appending.
	 */
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1);
			if (error)
				return (error);
		}
	}

	/*
	 * If IO_APPEND then load uio_offset.  We restart here if we cannot
	 * get the append lock.
	 */
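
	/*
	 * Clearing n_attrstamp before the VOP_GETATTR() below defeats the
	 * attribute cache, forcing a fresh GETATTR RPC so that the append
	 * offset reflects the server's current idea of the file size.
	 */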
restart:
	if (ioflag & IO_APPEND) {
		np->n_attrstamp = 0;
		error = VOP_GETATTR(vp, &vattr, cred, td);
		if (error)
			return (error);
		uio->uio_offset = np->n_size;
	}

	if (uio->uio_offset < 0)
		return (EINVAL);
	if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	if (uio->uio_resid == 0)
		return (0);

	/*
	 * We need to obtain the rslock if we intend to modify np->n_size
	 * in order to guarantee the append point with multiple contending
	 * writers, to guarantee that no other appenders modify n_size
	 * while we are trying to obtain a truncated buffer (i.e. to avoid
	 * accidentally truncating data written by another appender due to
	 * the race), and to ensure that the buffer is populated prior to
	 * our extending of the file.  We hold rslock through the entire
	 * operation.
	 *
	 * Note that we do not synchronize the case where someone truncates
	 * the file while we are appending to it because attempting to lock
	 * this case may deadlock other parts of the system unexpectedly.
	 */
	if ((ioflag & IO_APPEND) ||
	    uio->uio_offset + uio->uio_resid > np->n_size) {
		switch (nfs_rslock(np, td)) {
		case ENOLCK:
			goto restart;
			/* not reached */
		case EIO:
			return (EIO);
		case EINTR:
		case ERESTART:
			return (EINTR);
			/* not reached */
		default:
			break;
		}
		haverslock = 1;
	}

	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	if (p != NULL) {
		PROC_LOCK(p);
		if (uio->uio_offset + uio->uio_resid >
		    lim_cur(p, RLIMIT_FSIZE)) {
			psignal(p, SIGXFSZ);
			PROC_UNLOCK(p);
			if (haverslock)
				nfs_rsunlock(np, td);
			return (EFBIG);
		}
		PROC_UNLOCK(p);
	}

	biosize = vp->v_mount->mnt_stat.f_iosize;

	do {
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);
		n = min((unsigned)(biosize - on), uio->uio_resid);
again:
		/*
		 * Handle direct append and file extension cases, calculate
		 * unaligned buffer size.
		 */

		if (uio->uio_offset == np->n_size && n) {
			/*
			 * Get the buffer (in its pre-append state to maintain
			 * B_CACHE if it was previously set).  Resize the
			 * nfsnode after we have locked the buffer to prevent
			 * readers from reading garbage.
			 */
			bcount = on;
			bp = nfs_getcacheblk(vp, lbn, bcount, td);

			if (bp != NULL) {
				long save;

				np->n_size = uio->uio_offset + n;
				np->n_flag |= NMODIFIED;
				vnode_pager_setsize(vp, np->n_size);

				save = bp->b_flags & B_CACHE;
				bcount += n;
				allocbuf(bp, bcount);
				bp->b_flags |= save;
			}
		} else {
			/*
			 * Obtain the locked cache block first, and then
			 * adjust the file's size as appropriate.
			 */
			bcount = on + n;
			if ((off_t)lbn * biosize + bcount < np->n_size) {
				if ((off_t)(lbn + 1) * biosize < np->n_size)
					bcount = biosize;
				else
					bcount = np->n_size - (off_t)lbn * biosize;
			}
			bp = nfs_getcacheblk(vp, lbn, bcount, td);
			if (uio->uio_offset + n > np->n_size) {
				np->n_size = uio->uio_offset + n;
				np->n_flag |= NMODIFIED;
				vnode_pager_setsize(vp, np->n_size);
			}
		}

		if (!bp) {
			error = nfs_sigintr(nmp, NULL, td);
			if (!error)
				error = EINTR;
			break;
		}
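
		/*
		 * Append example (assuming biosize == 8192): with n_size ==
		 * 10000, a 500-byte append lands at lbn == 1 with on ==
		 * 1808; the block is fetched at its pre-append size of 1808
		 * bytes and then grown to 2308 bytes with allocbuf() above,
		 * keeping B_CACHE intact.
		 */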

		/*
		 * Issue a READ if B_CACHE is not set.  In special-append
		 * mode, B_CACHE is based on the buffer prior to the write
		 * op and is typically set, avoiding the read.  If a read
		 * is required in special append mode, the server will
		 * probably send us a short-read since we extended the file
		 * on our end, resulting in b_resid == 0 and, thus,
		 * B_CACHE getting set.
		 *
		 * We can also avoid issuing the read if the write covers
		 * the entire buffer.  We have to make sure the buffer state
		 * is reasonable in this case since we will not be initiating
		 * I/O.  See the comments in kern/vfs_bio.c's getblk() for
		 * more information.
		 *
		 * B_CACHE may also be set due to the buffer being cached
		 * normally.
		 */

		if (on == 0 && n == bcount) {
			bp->b_flags |= B_CACHE;
			bp->b_flags &= ~B_INVAL;
			bp->b_ioflags &= ~BIO_ERROR;
		}

		if ((bp->b_flags & B_CACHE) == 0) {
			bp->b_iocmd = BIO_READ;
			vfs_busy_pages(bp, 0);
			error = nfs_doio(vp, bp, cred, td);
			if (error) {
				brelse(bp);
				break;
			}
		}
		if (!bp) {
			error = nfs_sigintr(nmp, NULL, td);
			if (!error)
				error = EINTR;
			break;
		}
		if (bp->b_wcred == NOCRED)
			bp->b_wcred = crhold(cred);
		np->n_flag |= NMODIFIED;

		/*
		 * If dirtyend exceeds file size, chop it down.  This should
		 * not normally occur but there is an append race where it
		 * might occur XXX, so we log it.
		 *
		 * If the chopping creates a reverse-indexed or degenerate
		 * situation with dirtyoff/end, we 0 both of them.
		 */

		if (bp->b_dirtyend > bcount) {
			printf("NFS append race @%lx:%d\n",
			    (long)bp->b_blkno * DEV_BSIZE,
			    bp->b_dirtyend - bcount);
			bp->b_dirtyend = bcount;
		}

		if (bp->b_dirtyoff >= bp->b_dirtyend)
			bp->b_dirtyoff = bp->b_dirtyend = 0;

		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise force a write rpc of the old dirty area.
		 *
		 * While it is possible to merge discontiguous writes due to
		 * our having a B_CACHE buffer ( and thus valid read data
		 * for the hole), we don't because it could lead to
		 * significant cache coherency problems with multiple clients,
		 * especially if locking is implemented later on.
		 *
		 * As an optimization we could theoretically maintain
		 * a linked list of discontinuous areas, but we would still
		 * have to commit them separately so there isn't much
		 * advantage to it except perhaps a bit of asynchronization.
		 */

		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			if (bwrite(bp) == EINTR) {
				error = EINTR;
				break;
			}
			goto again;
		}
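
		/*
		 * Example: with an existing dirty range of [0, 512) in this
		 * block, a new write at on == 4096 is discontiguous
		 * (4096 > b_dirtyend), so the old range is pushed out with
		 * bwrite() above and the buffer is re-fetched before the
		 * new data is merged in.
		 */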

		error = uiomove((char *)bp->b_data + on, n, uio);

		/*
		 * Since this block is being modified, it must be written
		 * again and not just committed.  Since write clustering does
		 * not work for the stage 1 data write, only the stage 2
		 * commit rpc, we have to clear B_CLUSTEROK as well.
		 */
		bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);

		if (error) {
			bp->b_ioflags |= BIO_ERROR;
			brelse(bp);
			break;
		}

		/*
		 * Only update dirtyoff/dirtyend if not a degenerate
		 * condition.
		 */
		if (n) {
			if (bp->b_dirtyend > 0) {
				bp->b_dirtyoff = min(on, bp->b_dirtyoff);
				bp->b_dirtyend = max((on + n), bp->b_dirtyend);
			} else {
				bp->b_dirtyoff = on;
				bp->b_dirtyend = on + n;
			}
			vfs_bio_set_validclean(bp, on, n);
		}

		/*
		 * If IO_SYNC do bwrite().
		 *
		 * IO_INVAL appears to be unused.  The idea appears to be
		 * to turn off caching in this case.  Very odd.  XXX
		 */
		if ((ioflag & IO_SYNC)) {
			if (ioflag & IO_INVAL)
				bp->b_flags |= B_NOCACHE;
			error = bwrite(bp);
			if (error)
				break;
		} else if ((n + on) == biosize) {
			bp->b_flags |= B_ASYNC;
			(void) (nmp->nm_rpcops->nr_writebp)(bp, 0, 0);
		} else {
			bdwrite(bp);
		}
	} while (uio->uio_resid > 0 && n > 0);

	if (haverslock)
		nfs_rsunlock(np, td);

	return (error);
}

/*
 * Get an nfs cache block.
 *
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 *
 * The caller must carefully deal with the possible B_INVAL state of
 * the buffer.  nfs_doio() clears B_INVAL (and nfs_asyncio() clears it
 * indirectly), so synchronous reads can be issued without worrying about
 * the B_INVAL state.  We have to be a little more careful when dealing
 * with writes (see comments in nfs_write()) when extending a file past
 * its EOF.
 */
static struct buf *
nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size, struct thread *td)
{
	struct buf *bp;
	struct mount *mp;
	struct nfsmount *nmp;

	mp = vp->v_mount;
	nmp = VFSTONFS(mp);

	if (nmp->nm_flag & NFSMNT_INT) {
		sigset_t oldset;

		nfs_set_sigmask(td, &oldset);
		bp = getblk(vp, bn, size, PCATCH, 0, 0);
		nfs_restore_sigmask(td, &oldset);
		while (bp == NULL) {
			if (nfs_sigintr(nmp, NULL, td))
				return (NULL);
			bp = getblk(vp, bn, size, 0, 2 * hz, 0);
		}
	} else {
		bp = getblk(vp, bn, size, 0, 0, 0);
	}

	if (vp->v_type == VREG) {
		int biosize;

		biosize = mp->mnt_stat.f_iosize;
		bp->b_blkno = bn * (biosize / DEV_BSIZE);
	}
	return (bp);
}
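
/*
 * A NULL return from nfs_getcacheblk() only happens on interruptible
 * mounts; callers such as nfs_bioread() and nfs_write() above translate
 * it via nfs_sigintr() into the pending signal's error, or EINTR.
 */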

/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(struct vnode *vp, int flags, struct ucred *cred,
    struct thread *td, int intrflg)
{
	struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;
	int old_lock = 0;

	ASSERT_VOP_LOCKED(vp, "nfs_vinvalbuf");

	/*
	 * XXX This check stops us from needlessly doing a vinvalbuf when
	 * being called through vclean().  It is not clear that this is
	 * unsafe.
	 */
	if (vp->v_iflag & VI_XLOCK)
		return (0);

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}

	if ((old_lock = VOP_ISLOCKED(vp, td)) != LK_EXCLUSIVE) {
		if (old_lock == LK_SHARED) {
			/* Upgrade to exclusive lock, this might block */
			vn_lock(vp, LK_UPGRADE | LK_RETRY, td);
		} else {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
		}
	}

	/*
	 * Now, flush as required.
	 */
	error = vinvalbuf(vp, flags, cred, td, slpflag, 0);
	while (error) {
		if (intrflg && (error = nfs_sigintr(nmp, NULL, td)))
			goto out;
		error = vinvalbuf(vp, flags, cred, td, 0, slptimeo);
	}
	np->n_flag &= ~NMODIFIED;
out:
	if (old_lock != LK_EXCLUSIVE) {
		if (old_lock == LK_SHARED) {
			/* Downgrade from exclusive lock, this might block */
			vn_lock(vp, LK_DOWNGRADE, td);
		} else {
			VOP_UNLOCK(vp, 0, td);
		}
	}
	return error;
}

/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 *
 * Note: nfs_asyncio() does not clear (BIO_ERROR|B_INVAL) but when the bp
 * is eventually dequeued by the async daemon, nfs_doio() *will*.
 */
int
nfs_asyncio(struct nfsmount *nmp, struct buf *bp, struct ucred *cred, struct thread *td)
{
	int iod;
	int gotiod;
	int slpflag = 0;
	int slptimeo = 0;
	int error, error2;

	/*
	 * Commits are usually short and sweet so let's save some CPU and
	 * leave the async daemons for more important rpc's (such as reads
	 * and writes).
	 */
	if (bp->b_iocmd == BIO_WRITE && (bp->b_flags & B_NEEDCOMMIT) &&
	    (nmp->nm_bufqiods > nfs_numasync / 2)) {
		return (EIO);
	}

again:
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	gotiod = FALSE;

	/*
	 * Find a free iod to process this request.
	 */
	for (iod = 0; iod < nfs_numasync; iod++)
		if (nfs_iodwant[iod]) {
			gotiod = TRUE;
			break;
		}

	/*
	 * Try to create one if none are free.
	 */
	if (!gotiod) {
		iod = nfs_nfsiodnew();
		if (iod != -1)
			gotiod = TRUE;
	}

	if (gotiod) {
		/*
		 * Found one, so wake it up and tell it which
		 * mount to process.
		 */
		NFS_DPF(ASYNCIO, ("nfs_asyncio: waking iod %d for mount %p\n",
		    iod, nmp));
		nfs_iodwant[iod] = NULL;
		nfs_iodmount[iod] = nmp;
		nmp->nm_bufqiods++;
		wakeup(&nfs_iodwant[iod]);
	}

	/*
	 * If none are free, we may already have an iod working on this mount
	 * point.  If so, it will process our request.
	 */
	if (!gotiod) {
		if (nmp->nm_bufqiods > 0) {
			NFS_DPF(ASYNCIO,
			    ("nfs_asyncio: %d iods are already processing mount %p\n",
			    nmp->nm_bufqiods, nmp));
			gotiod = TRUE;
		}
	}

	/*
	 * If we have an iod which can process the request, then queue
	 * the buffer.
	 */
	if (gotiod) {
		/*
		 * Ensure that the queue never grows too large.  We still want
		 * to asynchronize so we block rather than return EIO.
		 */
		while (nmp->nm_bufqlen >= 2 * nfs_numasync) {
			NFS_DPF(ASYNCIO,
			    ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
			nmp->nm_bufqwant = TRUE;
			error = nfs_tsleep(td, &nmp->nm_bufq, slpflag | PRIBIO,
			    "nfsaio", slptimeo);
			if (error) {
				error2 = nfs_sigintr(nmp, NULL, td);
				if (error2)
					return (error2);
				if (slpflag == PCATCH) {
					slpflag = 0;
					slptimeo = 2 * hz;
				}
			}
			/*
			 * We might have lost our iod while sleeping,
			 * so check and loop if necessary.
			 */
			if (nmp->nm_bufqiods == 0) {
				NFS_DPF(ASYNCIO,
				    ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
				goto again;
			}
		}

		if (bp->b_iocmd == BIO_READ) {
			if (bp->b_rcred == NOCRED && cred != NOCRED)
				bp->b_rcred = crhold(cred);
		} else {
			if (bp->b_wcred == NOCRED && cred != NOCRED)
				bp->b_wcred = crhold(cred);
		}

		if (bp->b_flags & B_REMFREE)
			bremfreef(bp);
		BUF_KERNPROC(bp);
		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
		nmp->nm_bufqlen++;
		return (0);
	}

	/*
	 * All the iods are busy on other mounts, so return EIO to
	 * force the caller to process the i/o synchronously.
	 */
	NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
	return (EIO);
}
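
/*
 * nfs_doio() below is the single completion path for buffer-cache I/O:
 * it is invoked both synchronously (e.g. from nfs_bioread() and
 * nfs_write() above) and from the nfsiod daemons for buffers queued by
 * nfs_asyncio(), and it always finishes the buffer with bufdone().
 */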

/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(struct vnode *vp, struct buf *bp, struct ucred *cr, struct thread *td)
{
	struct uio *uiop;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, iomode, must_commit = 0;
	struct uio uio;
	struct iovec io;
	struct proc *p = td ? td->td_proc : NULL;

	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_td = td;

	/*
	 * clear BIO_ERROR and B_INVAL state prior to initiating the I/O.  We
	 * do this here so we do not have to do it in all the code that
	 * calls us.
	 */
	bp->b_flags &= ~B_INVAL;
	bp->b_ioflags &= ~BIO_ERROR;

	KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp));

	if (bp->b_iocmd == BIO_READ) {
	    io.iov_len = uiop->uio_resid = bp->b_bcount;
	    io.iov_base = bp->b_data;
	    uiop->uio_rw = UIO_READ;

	    switch (vp->v_type) {
	    case VREG:
		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
		nfsstats.read_bios++;
		error = (nmp->nm_rpcops->nr_readrpc)(vp, uiop, cr);

		if (!error) {
		    if (uiop->uio_resid) {
			/*
			 * If we had a short read with no error, we must have
			 * hit a file hole.  We should zero-fill the remainder.
			 * This can also occur if the server hits the file EOF.
			 *
			 * Holes used to be able to occur due to pending
			 * writes, but that is not possible any longer.
			 */
			int nread = bp->b_bcount - uiop->uio_resid;
			int left = uiop->uio_resid;

			if (left > 0)
				bzero((char *)bp->b_data + nread, left);
			uiop->uio_resid = 0;
		    }
		}
		/* ASSERT_VOP_LOCKED(vp, "nfs_doio"); */
		if (p && (vp->v_vflag & VV_TEXT) &&
		    (NFS_TIMESPEC_COMPARE(&np->n_mtime, &np->n_vattr.va_mtime))) {
			PROC_LOCK(p);
			killproc(p, "text file modification");
			PROC_UNLOCK(p);
		}
		break;
	    case VLNK:
		uiop->uio_offset = (off_t)0;
		nfsstats.readlink_bios++;
		error = (nmp->nm_rpcops->nr_readlinkrpc)(vp, uiop, cr);
		break;
	    case VDIR:
		nfsstats.readdir_bios++;
		uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
		if ((nmp->nm_flag & NFSMNT_NFSV4) != 0)
			error = nfs4_readdirrpc(vp, uiop, cr);
		else {
			if ((nmp->nm_flag & NFSMNT_RDIRPLUS) != 0) {
				error = nfs_readdirplusrpc(vp, uiop, cr);
				if (error == NFSERR_NOTSUPP)
					nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
			}
			if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
				error = nfs_readdirrpc(vp, uiop, cr);
		}
		/*
		 * end-of-directory sets B_INVAL but does not generate an
		 * error.
		 */
		if (error == 0 && uiop->uio_resid == bp->b_bcount)
			bp->b_flags |= B_INVAL;
		break;
	    default:
		printf("nfs_doio: type %x unexpected\n", vp->v_type);
		break;
	    }
	    if (error) {
		bp->b_ioflags |= BIO_ERROR;
		bp->b_error = error;
	    }
	} else {
	    /*
	     * If we only need to commit, try to commit
	     */
	    if (bp->b_flags & B_NEEDCOMMIT) {
		    int retv;
		    off_t off;

		    off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff;
		    retv = (nmp->nm_rpcops->nr_commit)(
				vp, off, bp->b_dirtyend - bp->b_dirtyoff,
				bp->b_wcred, td);
		    if (retv == 0) {
			    bp->b_dirtyoff = bp->b_dirtyend = 0;
			    bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
			    bp->b_resid = 0;
			    bufdone(bp);
			    return (0);
		    }
		    if (retv == NFSERR_STALEWRITEVERF) {
			    nfs_clearcommit(vp->v_mount);
		    }
	    }

	    /*
	     * Setup for actual write
	     */

	    if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
		bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE;

	    if (bp->b_dirtyend > bp->b_dirtyoff) {
		io.iov_len = uiop->uio_resid = bp->b_dirtyend
		    - bp->b_dirtyoff;
		uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE
		    + bp->b_dirtyoff;
		io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
		uiop->uio_rw = UIO_WRITE;
		nfsstats.write_bios++;

		if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC)
		    iomode = NFSV3WRITE_UNSTABLE;
		else
		    iomode = NFSV3WRITE_FILESYNC;

		error = (nmp->nm_rpcops->nr_writerpc)(vp, uiop, cr, &iomode, &must_commit);

		/*
		 * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try
		 * to cluster the buffers needing commit.  This will allow
		 * the system to submit a single commit rpc for the whole
		 * cluster.  We can do this even if the buffer is not 100%
		 * dirty (relative to the NFS blocksize), so we optimize the
		 * append-to-file-case.
		 *
		 * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be
		 * cleared because write clustering only works for commit
		 * rpc's, not for the data portion of the write).
		 */
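
		/*
		 * NFSv3 two-stage write in brief: an UNSTABLE write lets
		 * the server reply before the data reaches stable storage,
		 * and the client must later pair it with a COMMIT RPC
		 * (hence B_NEEDCOMMIT below); a FILESYNC write is durable
		 * when the reply arrives and needs no commit.
		 */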

		if (!error && iomode == NFSV3WRITE_UNSTABLE) {
		    bp->b_flags |= B_NEEDCOMMIT;
		    if (bp->b_dirtyoff == 0
			&& bp->b_dirtyend == bp->b_bcount)
			bp->b_flags |= B_CLUSTEROK;
		} else {
		    bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
		}

		/*
		 * For an interrupted write, the buffer is still valid
		 * and the write hasn't been pushed to the server yet,
		 * so we can't set BIO_ERROR and report the interruption
		 * by setting B_EINTR.  For the B_ASYNC case, B_EINTR
		 * is not relevant, so the rpc attempt is essentially
		 * a noop.  For the case of a V3 write rpc not being
		 * committed to stable storage, the block is still
		 * dirty and requires either a commit rpc or another
		 * write rpc with iomode == NFSV3WRITE_FILESYNC before
		 * the block is reused. This is indicated by setting
		 * the B_DELWRI and B_NEEDCOMMIT flags.
		 *
		 * If the buffer is marked B_PAGING, it does not reside on
		 * the vp's paging queues so we cannot call bdirty().  The
		 * bp in this case is not an NFS cache block so we should
		 * be safe. XXX
		 */
		if (error == EINTR || error == EIO
		    || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
			int s;

			s = splbio();
			bp->b_flags &= ~(B_INVAL|B_NOCACHE);
			if ((bp->b_flags & B_PAGING) == 0) {
				bdirty(bp);
				bp->b_flags &= ~B_DONE;
			}
			if (error && (bp->b_flags & B_ASYNC) == 0)
				bp->b_flags |= B_EINTR;
			splx(s);
		} else {
			if (error) {
				bp->b_ioflags |= BIO_ERROR;
				bp->b_error = np->n_error = error;
				np->n_flag |= NWRITEERR;
			}
			bp->b_dirtyoff = bp->b_dirtyend = 0;
		}
	    } else {
		bp->b_resid = 0;
		bufdone(bp);
		return (0);
	    }
	}
	bp->b_resid = uiop->uio_resid;
	if (must_commit)
		nfs_clearcommit(vp->v_mount);
	bufdone(bp);
	return (error);
}

/*
 * Used to aid in handling ftruncate() operations on the NFS client side.
 * Truncation creates a number of special problems for NFS.  We have to
 * throw away VM pages and buffer cache buffers that are beyond EOF, and
 * we have to properly handle VM pages or (potentially dirty) buffers
 * that straddle the truncation point.
 */

int
nfs_meta_setsize(struct vnode *vp, struct ucred *cred, struct thread *td, u_quad_t nsize)
{
	struct nfsnode *np = VTONFS(vp);
	u_quad_t tsize = np->n_size;
	int biosize = vp->v_mount->mnt_stat.f_iosize;
	int error = 0;

	np->n_size = nsize;

	if (np->n_size < tsize) {
		struct buf *bp;
		daddr_t lbn;
		int bufsize;

		/*
		 * vtruncbuf() doesn't get the buffer overlapping the
		 * truncation point.  We may have a B_DELWRI and/or B_CACHE
		 * buffer that now needs to be truncated.
		 */
		error = vtruncbuf(vp, cred, td, nsize, biosize);
		lbn = nsize / biosize;
		bufsize = nsize & (biosize - 1);
		bp = nfs_getcacheblk(vp, lbn, bufsize, td);
		if (!bp)
			return (EINTR);
		if (bp->b_dirtyoff > bp->b_bcount)
			bp->b_dirtyoff = bp->b_bcount;
		if (bp->b_dirtyend > bp->b_bcount)
			bp->b_dirtyend = bp->b_bcount;
		bp->b_flags |= B_RELBUF;	/* don't leave garbage around */
		brelse(bp);
	} else {
		vnode_pager_setsize(vp, nsize);
	}
	return (error);
}