/* nfs_bio.c — FreeBSD revision 36473 */
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
35 * 36 * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95 37 * $Id: nfs_bio.c,v 1.56 1998/05/20 08:02:23 peter Exp $ 38 */ 39 40 41#include <sys/param.h> 42#include <sys/systm.h> 43#include <sys/resourcevar.h> 44#include <sys/signalvar.h> 45#include <sys/proc.h> 46#include <sys/buf.h> 47#include <sys/vnode.h> 48#include <sys/mount.h> 49#include <sys/kernel.h> 50 51#include <vm/vm.h> 52#include <vm/vm_extern.h> 53#include <vm/vm_prot.h> 54#include <vm/vm_page.h> 55#include <vm/vm_object.h> 56#include <vm/vm_pager.h> 57#include <vm/vnode_pager.h> 58 59#include <nfs/rpcv2.h> 60#include <nfs/nfsproto.h> 61#include <nfs/nfs.h> 62#include <nfs/nfsmount.h> 63#include <nfs/nqnfs.h> 64#include <nfs/nfsnode.h> 65 66static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size, 67 struct proc *p)); 68static void nfs_prot_buf __P((struct buf *bp, int off, int n)); 69 70extern int nfs_numasync; 71extern struct nfsstats nfsstats; 72 73/* 74 * Vnode op for VM getpages. 75 */ 76int 77nfs_getpages(ap) 78 struct vop_getpages_args *ap; 79{ 80 int i, error, nextoff, size, toff, npages; 81 struct uio uio; 82 struct iovec iov; 83 vm_page_t m; 84 vm_offset_t kva; 85 struct buf *bp; 86 87 if ((ap->a_vp->v_object) == NULL) { 88 printf("nfs_getpages: called with non-merged cache vnode??\n"); 89 return EOPNOTSUPP; 90 } 91 92 /* 93 * We use only the kva address for the buffer, but this is extremely 94 * convienient and fast. 
95 */ 96 bp = getpbuf(); 97 98 npages = btoc(ap->a_count); 99 kva = (vm_offset_t) bp->b_data; 100 pmap_qenter(kva, ap->a_m, npages); 101 102 iov.iov_base = (caddr_t) kva; 103 iov.iov_len = ap->a_count; 104 uio.uio_iov = &iov; 105 uio.uio_iovcnt = 1; 106 uio.uio_offset = IDX_TO_OFF(ap->a_m[0]->pindex); 107 uio.uio_resid = ap->a_count; 108 uio.uio_segflg = UIO_SYSSPACE; 109 uio.uio_rw = UIO_READ; 110 uio.uio_procp = curproc; 111 112 error = nfs_readrpc(ap->a_vp, &uio, curproc->p_ucred); 113 pmap_qremove(kva, npages); 114 115 relpbuf(bp); 116 117 if (error && (uio.uio_resid == ap->a_count)) 118 return VM_PAGER_ERROR; 119 120 size = ap->a_count - uio.uio_resid; 121 122 for (i = 0, toff = 0; i < npages; i++, toff = nextoff) { 123 vm_page_t m; 124 nextoff = toff + PAGE_SIZE; 125 m = ap->a_m[i]; 126 127 m->flags &= ~PG_ZERO; 128 129 if (nextoff <= size) { 130 m->valid = VM_PAGE_BITS_ALL; 131 m->dirty = 0; 132 } else { 133 int nvalid = ((size + DEV_BSIZE - 1) - toff) & ~(DEV_BSIZE - 1); 134 vm_page_set_validclean(m, 0, nvalid); 135 } 136 137 if (i != ap->a_reqpage) { 138 /* 139 * Whether or not to leave the page activated is up in 140 * the air, but we should put the page on a page queue 141 * somewhere (it already is in the object). Result: 142 * It appears that emperical results show that 143 * deactivating pages is best. 144 */ 145 146 /* 147 * Just in case someone was asking for this page we 148 * now tell them that it is ok to use. 149 */ 150 if (!error) { 151 if (m->flags & PG_WANTED) 152 vm_page_activate(m); 153 else 154 vm_page_deactivate(m); 155 PAGE_WAKEUP(m); 156 } else { 157 vnode_pager_freepage(m); 158 } 159 } 160 } 161 return 0; 162} 163 164/* 165 * Vnode op for VM putpages. 
166 */ 167int 168nfs_putpages(ap) 169 struct vop_putpages_args *ap; 170{ 171 struct uio uio; 172 struct iovec iov; 173 vm_page_t m; 174 vm_offset_t kva; 175 struct buf *bp; 176 int iomode, must_commit, i, error, npages; 177 int *rtvals; 178 179 rtvals = ap->a_rtvals; 180 181 npages = btoc(ap->a_count); 182 183 for (i = 0; i < npages; i++) { 184 rtvals[i] = VM_PAGER_AGAIN; 185 } 186 187 /* 188 * We use only the kva address for the buffer, but this is extremely 189 * convienient and fast. 190 */ 191 bp = getpbuf(); 192 193 kva = (vm_offset_t) bp->b_data; 194 pmap_qenter(kva, ap->a_m, npages); 195 196 iov.iov_base = (caddr_t) kva; 197 iov.iov_len = ap->a_count; 198 uio.uio_iov = &iov; 199 uio.uio_iovcnt = 1; 200 uio.uio_offset = IDX_TO_OFF(ap->a_m[0]->pindex); 201 uio.uio_resid = ap->a_count; 202 uio.uio_segflg = UIO_SYSSPACE; 203 uio.uio_rw = UIO_WRITE; 204 uio.uio_procp = curproc; 205 206 if ((ap->a_sync & VM_PAGER_PUT_SYNC) == 0) 207 iomode = NFSV3WRITE_UNSTABLE; 208 else 209 iomode = NFSV3WRITE_FILESYNC; 210 211 error = nfs_writerpc(ap->a_vp, &uio, 212 curproc->p_ucred, &iomode, &must_commit); 213 214 pmap_qremove(kva, npages); 215 relpbuf(bp); 216 217 if (!error) { 218 int nwritten = round_page(ap->a_count - uio.uio_resid) / PAGE_SIZE; 219 for (i = 0; i < nwritten; i++) { 220 rtvals[i] = VM_PAGER_OK; 221 ap->a_m[i]->dirty = 0; 222 } 223 if (must_commit) 224 nfs_clearcommit(ap->a_vp->v_mount); 225 } 226 return ap->a_rtvals[0]; 227} 228 229/* 230 * Vnode op for read using bio 231 * Any similarity to readip() is purely coincidental 232 */ 233int 234nfs_bioread(vp, uio, ioflag, cred, getpages) 235 register struct vnode *vp; 236 register struct uio *uio; 237 int ioflag; 238 struct ucred *cred; 239 int getpages; 240{ 241 register struct nfsnode *np = VTONFS(vp); 242 register int biosize, diff, i; 243 struct buf *bp = 0, *rabp; 244 struct vattr vattr; 245 struct proc *p; 246 struct nfsmount *nmp = VFSTONFS(vp->v_mount); 247 daddr_t lbn, rabn; 248 int bufsize; 249 int 
nra, error = 0, n = 0, on = 0, not_readin; 250 251#ifdef DIAGNOSTIC 252 if (uio->uio_rw != UIO_READ) 253 panic("nfs_read mode"); 254#endif 255 if (uio->uio_resid == 0) 256 return (0); 257 if (uio->uio_offset < 0) /* XXX VDIR cookies can be negative */ 258 return (EINVAL); 259 p = uio->uio_procp; 260 if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && 261 (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) 262 (void)nfs_fsinfo(nmp, vp, cred, p); 263 if (vp->v_type != VDIR && 264 (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize) 265 return (EFBIG); 266 biosize = vp->v_mount->mnt_stat.f_iosize; 267 /* 268 * For nfs, cache consistency can only be maintained approximately. 269 * Although RFC1094 does not specify the criteria, the following is 270 * believed to be compatible with the reference port. 271 * For nqnfs, full cache consistency is maintained within the loop. 272 * For nfs: 273 * If the file's modify time on the server has changed since the 274 * last read rpc or you have written to the file, 275 * you may have lost data cache consistency with the 276 * server, so flush all of the file's data out of the cache. 277 * Then force a getattr rpc to ensure that you have up to date 278 * attributes. 279 * NB: This implies that cache data can be read when up to 280 * NFS_ATTRTIMEO seconds out of date. If you find that you need current 281 * attributes this could be forced by setting n_attrstamp to 0 before 282 * the VOP_GETATTR() call. 
283 */ 284 if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) { 285 if (np->n_flag & NMODIFIED) { 286 if (vp->v_type != VREG) { 287 if (vp->v_type != VDIR) 288 panic("nfs: bioread, not dir"); 289 nfs_invaldir(vp); 290 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 291 if (error) 292 return (error); 293 } 294 np->n_attrstamp = 0; 295 error = VOP_GETATTR(vp, &vattr, cred, p); 296 if (error) 297 return (error); 298 np->n_mtime = vattr.va_mtime.tv_sec; 299 } else { 300 error = VOP_GETATTR(vp, &vattr, cred, p); 301 if (error) 302 return (error); 303 if (np->n_mtime != vattr.va_mtime.tv_sec) { 304 if (vp->v_type == VDIR) 305 nfs_invaldir(vp); 306 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 307 if (error) 308 return (error); 309 np->n_mtime = vattr.va_mtime.tv_sec; 310 } 311 } 312 } 313 do { 314 315 /* 316 * Get a valid lease. If cached data is stale, flush it. 317 */ 318 if (nmp->nm_flag & NFSMNT_NQNFS) { 319 if (NQNFS_CKINVALID(vp, np, ND_READ)) { 320 do { 321 error = nqnfs_getlease(vp, ND_READ, cred, p); 322 } while (error == NQNFS_EXPIRED); 323 if (error) 324 return (error); 325 if (np->n_lrev != np->n_brev || 326 (np->n_flag & NQNFSNONCACHE) || 327 ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) { 328 if (vp->v_type == VDIR) 329 nfs_invaldir(vp); 330 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 331 if (error) 332 return (error); 333 np->n_brev = np->n_lrev; 334 } 335 } else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) { 336 nfs_invaldir(vp); 337 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 338 if (error) 339 return (error); 340 } 341 } 342 if (np->n_flag & NQNFSNONCACHE) { 343 switch (vp->v_type) { 344 case VREG: 345 return (nfs_readrpc(vp, uio, cred)); 346 case VLNK: 347 return (nfs_readlinkrpc(vp, uio, cred)); 348 case VDIR: 349 break; 350 default: 351 printf(" NQNFSNONCACHE: type %x unexpected\n", 352 vp->v_type); 353 }; 354 } 355 switch (vp->v_type) { 356 case VREG: 357 nfsstats.biocache_reads++; 358 lbn = uio->uio_offset / biosize; 359 on = uio->uio_offset 
& (biosize - 1); 360 not_readin = 1; 361 362 /* 363 * Start the read ahead(s), as required. 364 */ 365 if (nfs_numasync > 0 && nmp->nm_readahead > 0) { 366 for (nra = 0; nra < nmp->nm_readahead && 367 (off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) { 368 rabn = lbn + 1 + nra; 369 if (!incore(vp, rabn)) { 370 rabp = nfs_getcacheblk(vp, rabn, biosize, p); 371 if (!rabp) 372 return (EINTR); 373 if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { 374 rabp->b_flags |= (B_READ | B_ASYNC); 375 vfs_busy_pages(rabp, 0); 376 if (nfs_asyncio(rabp, cred)) { 377 rabp->b_flags |= B_INVAL|B_ERROR; 378 vfs_unbusy_pages(rabp); 379 brelse(rabp); 380 } 381 } else 382 brelse(rabp); 383 } 384 } 385 } 386 387 /* 388 * If the block is in the cache and has the required data 389 * in a valid region, just copy it out. 390 * Otherwise, get the block and write back/read in, 391 * as required. 392 */ 393again: 394 bufsize = biosize; 395 if ((off_t)(lbn + 1) * biosize > np->n_size && 396 (off_t)(lbn + 1) * biosize - np->n_size < biosize) { 397 bufsize = np->n_size - lbn * biosize; 398 bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); 399 } 400 bp = nfs_getcacheblk(vp, lbn, bufsize, p); 401 if (!bp) 402 return (EINTR); 403 /* 404 * If we are being called from nfs_getpages, we must 405 * make sure the buffer is a vmio buffer. The vp will 406 * already be setup for vmio but there may be some old 407 * non-vmio buffers attached to it. 
408 */ 409 if (getpages && !(bp->b_flags & B_VMIO)) { 410#ifdef DIAGNOSTIC 411 printf("nfs_bioread: non vmio buf found, discarding\n"); 412#endif 413 bp->b_flags |= B_NOCACHE; 414 bp->b_flags |= B_INVAFTERWRITE; 415 if (bp->b_dirtyend > 0) { 416 if ((bp->b_flags & B_DELWRI) == 0) 417 panic("nfsbioread"); 418 if (VOP_BWRITE(bp) == EINTR) 419 return (EINTR); 420 } else 421 brelse(bp); 422 goto again; 423 } 424 if ((bp->b_flags & B_CACHE) == 0) { 425 bp->b_flags |= B_READ; 426 bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL); 427 not_readin = 0; 428 vfs_busy_pages(bp, 0); 429 error = nfs_doio(bp, cred, p); 430 if (error) { 431 brelse(bp); 432 return (error); 433 } 434 } 435 if (bufsize > on) { 436 n = min((unsigned)(bufsize - on), uio->uio_resid); 437 } else { 438 n = 0; 439 } 440 diff = np->n_size - uio->uio_offset; 441 if (diff < n) 442 n = diff; 443 if (not_readin && n > 0) { 444 if (on < bp->b_validoff || (on + n) > bp->b_validend) { 445 bp->b_flags |= B_NOCACHE; 446 bp->b_flags |= B_INVAFTERWRITE; 447 if (bp->b_dirtyend > 0) { 448 if ((bp->b_flags & B_DELWRI) == 0) 449 panic("nfsbioread"); 450 if (VOP_BWRITE(bp) == EINTR) 451 return (EINTR); 452 } else 453 brelse(bp); 454 goto again; 455 } 456 } 457 vp->v_lastr = lbn; 458 diff = (on >= bp->b_validend) ? 
0 : (bp->b_validend - on); 459 if (diff < n) 460 n = diff; 461 break; 462 case VLNK: 463 nfsstats.biocache_readlinks++; 464 bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p); 465 if (!bp) 466 return (EINTR); 467 if ((bp->b_flags & B_CACHE) == 0) { 468 bp->b_flags |= B_READ; 469 vfs_busy_pages(bp, 0); 470 error = nfs_doio(bp, cred, p); 471 if (error) { 472 bp->b_flags |= B_ERROR; 473 brelse(bp); 474 return (error); 475 } 476 } 477 n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid); 478 on = 0; 479 break; 480 case VDIR: 481 nfsstats.biocache_readdirs++; 482 if (np->n_direofoffset 483 && uio->uio_offset >= np->n_direofoffset) { 484 return (0); 485 } 486 lbn = uio->uio_offset / NFS_DIRBLKSIZ; 487 on = uio->uio_offset & (NFS_DIRBLKSIZ - 1); 488 bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p); 489 if (!bp) 490 return (EINTR); 491 if ((bp->b_flags & B_CACHE) == 0) { 492 bp->b_flags |= B_READ; 493 vfs_busy_pages(bp, 0); 494 error = nfs_doio(bp, cred, p); 495 if (error) { 496 brelse(bp); 497 } 498 while (error == NFSERR_BAD_COOKIE) { 499 nfs_invaldir(vp); 500 error = nfs_vinvalbuf(vp, 0, cred, p, 1); 501 /* 502 * Yuck! The directory has been modified on the 503 * server. The only way to get the block is by 504 * reading from the beginning to get all the 505 * offset cookies. 506 */ 507 for (i = 0; i <= lbn && !error; i++) { 508 if (np->n_direofoffset 509 && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset) 510 return (0); 511 bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p); 512 if (!bp) 513 return (EINTR); 514 if ((bp->b_flags & B_DONE) == 0) { 515 bp->b_flags |= B_READ; 516 vfs_busy_pages(bp, 0); 517 error = nfs_doio(bp, cred, p); 518 if (error) { 519 brelse(bp); 520 } else if (i < lbn) { 521 brelse(bp); 522 } 523 } 524 } 525 } 526 if (error) 527 return (error); 528 } 529 530 /* 531 * If not eof and read aheads are enabled, start one. 532 * (You need the current block first, so that you have the 533 * directory offset cookie of the next block.) 
534 */ 535 if (nfs_numasync > 0 && nmp->nm_readahead > 0 && 536 (np->n_direofoffset == 0 || 537 (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) && 538 !(np->n_flag & NQNFSNONCACHE) && 539 !incore(vp, lbn + 1)) { 540 rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p); 541 if (rabp) { 542 if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { 543 rabp->b_flags |= (B_READ | B_ASYNC); 544 vfs_busy_pages(rabp, 0); 545 if (nfs_asyncio(rabp, cred)) { 546 rabp->b_flags |= B_INVAL|B_ERROR; 547 vfs_unbusy_pages(rabp); 548 brelse(rabp); 549 } 550 } else { 551 brelse(rabp); 552 } 553 } 554 } 555 /* 556 * Make sure we use a signed variant of min() since 557 * the second term may be negative. 558 */ 559 n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on); 560 break; 561 default: 562 printf(" nfs_bioread: type %x unexpected\n",vp->v_type); 563 break; 564 }; 565 566 if (n > 0) { 567 error = uiomove(bp->b_data + on, (int)n, uio); 568 } 569 switch (vp->v_type) { 570 case VREG: 571 break; 572 case VLNK: 573 n = 0; 574 break; 575 case VDIR: 576 if (np->n_flag & NQNFSNONCACHE) 577 bp->b_flags |= B_INVAL; 578 break; 579 default: 580 printf(" nfs_bioread: type %x unexpected\n",vp->v_type); 581 } 582 brelse(bp); 583 } while (error == 0 && uio->uio_resid > 0 && n > 0); 584 return (error); 585} 586 587static void 588nfs_prot_buf(bp, off, n) 589 struct buf *bp; 590 int off; 591 int n; 592{ 593 int pindex, boff, end; 594 595 if ((bp->b_flags & B_VMIO) == 0) 596 return; 597 598 end = round_page(off + n); 599 for (boff = trunc_page(off); boff < end; boff += PAGE_SIZE) { 600 pindex = boff >> PAGE_SHIFT; 601 vm_page_protect(bp->b_pages[pindex], VM_PROT_NONE); 602 } 603} 604 605/* 606 * Vnode op for write using bio 607 */ 608int 609nfs_write(ap) 610 struct vop_write_args /* { 611 struct vnode *a_vp; 612 struct uio *a_uio; 613 int a_ioflag; 614 struct ucred *a_cred; 615 } */ *ap; 616{ 617 register int biosize; 618 register struct uio *uio = ap->a_uio; 619 struct proc *p = uio->uio_procp; 620 
register struct vnode *vp = ap->a_vp; 621 struct nfsnode *np = VTONFS(vp); 622 register struct ucred *cred = ap->a_cred; 623 int ioflag = ap->a_ioflag; 624 struct buf *bp; 625 struct vattr vattr; 626 struct nfsmount *nmp = VFSTONFS(vp->v_mount); 627 daddr_t lbn; 628 int bufsize; 629 int n, on, error = 0, iomode, must_commit; 630 631#ifdef DIAGNOSTIC 632 if (uio->uio_rw != UIO_WRITE) 633 panic("nfs_write mode"); 634 if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc) 635 panic("nfs_write proc"); 636#endif 637 if (vp->v_type != VREG) 638 return (EIO); 639 if (np->n_flag & NWRITEERR) { 640 np->n_flag &= ~NWRITEERR; 641 return (np->n_error); 642 } 643 if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && 644 (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) 645 (void)nfs_fsinfo(nmp, vp, cred, p); 646 if (ioflag & (IO_APPEND | IO_SYNC)) { 647 if (np->n_flag & NMODIFIED) { 648 np->n_attrstamp = 0; 649 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 650 if (error) 651 return (error); 652 } 653 if (ioflag & IO_APPEND) { 654 np->n_attrstamp = 0; 655 error = VOP_GETATTR(vp, &vattr, cred, p); 656 if (error) 657 return (error); 658 uio->uio_offset = np->n_size; 659 } 660 } 661 if (uio->uio_offset < 0) 662 return (EINVAL); 663 if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize) 664 return (EFBIG); 665 if (uio->uio_resid == 0) 666 return (0); 667 /* 668 * Maybe this should be above the vnode op call, but so long as 669 * file servers have no limits, i don't think it matters 670 */ 671 if (p && uio->uio_offset + uio->uio_resid > 672 p->p_rlimit[RLIMIT_FSIZE].rlim_cur) { 673 psignal(p, SIGXFSZ); 674 return (EFBIG); 675 } 676 /* 677 * I use nm_rsize, not nm_wsize so that all buffer cache blocks 678 * will be the same size within a filesystem. nfs_writerpc will 679 * still use nm_wsize when sizing the rpc's. 680 */ 681 biosize = vp->v_mount->mnt_stat.f_iosize; 682 do { 683 /* 684 * Check for a valid write lease. 
685 */ 686 if ((nmp->nm_flag & NFSMNT_NQNFS) && 687 NQNFS_CKINVALID(vp, np, ND_WRITE)) { 688 do { 689 error = nqnfs_getlease(vp, ND_WRITE, cred, p); 690 } while (error == NQNFS_EXPIRED); 691 if (error) 692 return (error); 693 if (np->n_lrev != np->n_brev || 694 (np->n_flag & NQNFSNONCACHE)) { 695 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 696 if (error) 697 return (error); 698 np->n_brev = np->n_lrev; 699 } 700 } 701 if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) { 702 iomode = NFSV3WRITE_FILESYNC; 703 error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit); 704 if (must_commit) 705 nfs_clearcommit(vp->v_mount); 706 return (error); 707 } 708 nfsstats.biocache_writes++; 709 lbn = uio->uio_offset / biosize; 710 on = uio->uio_offset & (biosize-1); 711 n = min((unsigned)(biosize - on), uio->uio_resid); 712again: 713 if (uio->uio_offset + n > np->n_size) { 714 np->n_size = uio->uio_offset + n; 715 np->n_flag |= NMODIFIED; 716 vnode_pager_setsize(vp, (u_long)np->n_size); 717 } 718 bufsize = biosize; 719 if ((lbn + 1) * biosize > np->n_size) { 720 bufsize = np->n_size - lbn * biosize; 721 bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); 722 } 723 bp = nfs_getcacheblk(vp, lbn, bufsize, p); 724 if (!bp) 725 return (EINTR); 726 if (bp->b_wcred == NOCRED) { 727 crhold(cred); 728 bp->b_wcred = cred; 729 } 730 np->n_flag |= NMODIFIED; 731 732 if ((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend > np->n_size) { 733 bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE); 734 } 735 736 /* 737 * If the new write will leave a contiguous dirty 738 * area, just update the b_dirtyoff and b_dirtyend, 739 * otherwise force a write rpc of the old dirty area. 740 */ 741 if (bp->b_dirtyend > 0 && 742 (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) { 743 bp->b_proc = p; 744 if (VOP_BWRITE(bp) == EINTR) 745 return (EINTR); 746 goto again; 747 } 748 749 /* 750 * Check for valid write lease and get one as required. 751 * In case getblk() and/or bwrite() delayed us. 
752 */ 753 if ((nmp->nm_flag & NFSMNT_NQNFS) && 754 NQNFS_CKINVALID(vp, np, ND_WRITE)) { 755 do { 756 error = nqnfs_getlease(vp, ND_WRITE, cred, p); 757 } while (error == NQNFS_EXPIRED); 758 if (error) { 759 brelse(bp); 760 return (error); 761 } 762 if (np->n_lrev != np->n_brev || 763 (np->n_flag & NQNFSNONCACHE)) { 764 brelse(bp); 765 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 766 if (error) 767 return (error); 768 np->n_brev = np->n_lrev; 769 goto again; 770 } 771 } 772 773 error = uiomove((char *)bp->b_data + on, n, uio); 774 if (error) { 775 bp->b_flags |= B_ERROR; 776 brelse(bp); 777 return (error); 778 } 779 780 /* 781 * This will keep the buffer and mmaped regions more coherent. 782 */ 783 nfs_prot_buf(bp, on, n); 784 785 if (bp->b_dirtyend > 0) { 786 bp->b_dirtyoff = min(on, bp->b_dirtyoff); 787 bp->b_dirtyend = max((on + n), bp->b_dirtyend); 788 } else { 789 bp->b_dirtyoff = on; 790 bp->b_dirtyend = on + n; 791 } 792 if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff || 793 bp->b_validoff > bp->b_dirtyend) { 794 bp->b_validoff = bp->b_dirtyoff; 795 bp->b_validend = bp->b_dirtyend; 796 } else { 797 bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff); 798 bp->b_validend = max(bp->b_validend, bp->b_dirtyend); 799 } 800 801 /* 802 * Since this block is being modified, it must be written 803 * again and not just committed. 804 */ 805 bp->b_flags &= ~B_NEEDCOMMIT; 806 807 /* 808 * If the lease is non-cachable or IO_SYNC do bwrite(). 
809 */ 810 if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) { 811 bp->b_proc = p; 812 if (ioflag & IO_INVAL) 813 bp->b_flags |= B_INVAL; 814 error = VOP_BWRITE(bp); 815 if (error) 816 return (error); 817 if (np->n_flag & NQNFSNONCACHE) { 818 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 819 if (error) 820 return (error); 821 } 822 } else if ((n + on) == biosize && 823 (nmp->nm_flag & NFSMNT_NQNFS) == 0) { 824 bp->b_proc = (struct proc *)0; 825 bp->b_flags |= B_ASYNC; 826 (void)nfs_writebp(bp, 0); 827 } else 828 bdwrite(bp); 829 } while (uio->uio_resid > 0 && n > 0); 830 return (0); 831} 832 833/* 834 * Get an nfs cache block. 835 * Allocate a new one if the block isn't currently in the cache 836 * and return the block marked busy. If the calling process is 837 * interrupted by a signal for an interruptible mount point, return 838 * NULL. 839 */ 840static struct buf * 841nfs_getcacheblk(vp, bn, size, p) 842 struct vnode *vp; 843 daddr_t bn; 844 int size; 845 struct proc *p; 846{ 847 register struct buf *bp; 848 struct mount *mp; 849 struct nfsmount *nmp; 850 851 mp = vp->v_mount; 852 nmp = VFSTONFS(mp); 853 854 if (nmp->nm_flag & NFSMNT_INT) { 855 bp = getblk(vp, bn, size, PCATCH, 0); 856 while (bp == (struct buf *)0) { 857 if (nfs_sigintr(nmp, (struct nfsreq *)0, p)) 858 return ((struct buf *)0); 859 bp = getblk(vp, bn, size, 0, 2 * hz); 860 } 861 } else 862 bp = getblk(vp, bn, size, 0, 0); 863 864 if( vp->v_type == VREG) { 865 int biosize; 866 biosize = mp->mnt_stat.f_iosize; 867 bp->b_blkno = (bn * biosize) / DEV_BSIZE; 868 } 869 870 return (bp); 871} 872 873/* 874 * Flush and invalidate all dirty buffers. If another process is already 875 * doing the flush, just wait for completion. 
876 */ 877int 878nfs_vinvalbuf(vp, flags, cred, p, intrflg) 879 struct vnode *vp; 880 int flags; 881 struct ucred *cred; 882 struct proc *p; 883 int intrflg; 884{ 885 register struct nfsnode *np = VTONFS(vp); 886 struct nfsmount *nmp = VFSTONFS(vp->v_mount); 887 int error = 0, slpflag, slptimeo; 888 889 if (vp->v_flag & VXLOCK) { 890 return (0); 891 } 892 893 if ((nmp->nm_flag & NFSMNT_INT) == 0) 894 intrflg = 0; 895 if (intrflg) { 896 slpflag = PCATCH; 897 slptimeo = 2 * hz; 898 } else { 899 slpflag = 0; 900 slptimeo = 0; 901 } 902 /* 903 * First wait for any other process doing a flush to complete. 904 */ 905 while (np->n_flag & NFLUSHINPROG) { 906 np->n_flag |= NFLUSHWANT; 907 error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval", 908 slptimeo); 909 if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) 910 return (EINTR); 911 } 912 913 /* 914 * Now, flush as required. 915 */ 916 np->n_flag |= NFLUSHINPROG; 917 error = vinvalbuf(vp, flags, cred, p, slpflag, 0); 918 while (error) { 919 if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) { 920 np->n_flag &= ~NFLUSHINPROG; 921 if (np->n_flag & NFLUSHWANT) { 922 np->n_flag &= ~NFLUSHWANT; 923 wakeup((caddr_t)&np->n_flag); 924 } 925 return (EINTR); 926 } 927 error = vinvalbuf(vp, flags, cred, p, 0, slptimeo); 928 } 929 np->n_flag &= ~(NMODIFIED | NFLUSHINPROG); 930 if (np->n_flag & NFLUSHWANT) { 931 np->n_flag &= ~NFLUSHWANT; 932 wakeup((caddr_t)&np->n_flag); 933 } 934 return (0); 935} 936 937/* 938 * Initiate asynchronous I/O. Return an error if no nfsiods are available. 939 * This is mainly to avoid queueing async I/O requests when the nfsiods 940 * are all hung on a dead server. 
941 */ 942int 943nfs_asyncio(bp, cred) 944 register struct buf *bp; 945 struct ucred *cred; 946{ 947 struct nfsmount *nmp; 948 int i; 949 int gotiod; 950 int slpflag = 0; 951 int slptimeo = 0; 952 int error; 953 954 if (nfs_numasync == 0) 955 return (EIO); 956 957 nmp = VFSTONFS(bp->b_vp->v_mount); 958again: 959 if (nmp->nm_flag & NFSMNT_INT) 960 slpflag = PCATCH; 961 gotiod = FALSE; 962 963 /* 964 * Find a free iod to process this request. 965 */ 966 for (i = 0; i < NFS_MAXASYNCDAEMON; i++) 967 if (nfs_iodwant[i]) { 968 /* 969 * Found one, so wake it up and tell it which 970 * mount to process. 971 */ 972 NFS_DPF(ASYNCIO, 973 ("nfs_asyncio: waking iod %d for mount %p\n", 974 i, nmp)); 975 nfs_iodwant[i] = (struct proc *)0; 976 nfs_iodmount[i] = nmp; 977 nmp->nm_bufqiods++; 978 wakeup((caddr_t)&nfs_iodwant[i]); 979 gotiod = TRUE; 980 break; 981 } 982 983 /* 984 * If none are free, we may already have an iod working on this mount 985 * point. If so, it will process our request. 986 */ 987 if (!gotiod) { 988 if (nmp->nm_bufqiods > 0) { 989 NFS_DPF(ASYNCIO, 990 ("nfs_asyncio: %d iods are already processing mount %p\n", 991 nmp->nm_bufqiods, nmp)); 992 gotiod = TRUE; 993 } 994 } 995 996 /* 997 * If we have an iod which can process the request, then queue 998 * the buffer. 999 */ 1000 if (gotiod) { 1001 /* 1002 * Ensure that the queue never grows too large. 1003 */ 1004 while (nmp->nm_bufqlen >= 2*nfs_numasync) { 1005 NFS_DPF(ASYNCIO, 1006 ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp)); 1007 nmp->nm_bufqwant = TRUE; 1008 error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO, 1009 "nfsaio", slptimeo); 1010 if (error) { 1011 if (nfs_sigintr(nmp, NULL, bp->b_proc)) 1012 return (EINTR); 1013 if (slpflag == PCATCH) { 1014 slpflag = 0; 1015 slptimeo = 2 * hz; 1016 } 1017 } 1018 /* 1019 * We might have lost our iod while sleeping, 1020 * so check and loop if nescessary. 
1021 */ 1022 if (nmp->nm_bufqiods == 0) { 1023 NFS_DPF(ASYNCIO, 1024 ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp)); 1025 goto again; 1026 } 1027 } 1028 1029 if (bp->b_flags & B_READ) { 1030 if (bp->b_rcred == NOCRED && cred != NOCRED) { 1031 crhold(cred); 1032 bp->b_rcred = cred; 1033 } 1034 } else { 1035 bp->b_flags |= B_WRITEINPROG; 1036 if (bp->b_wcred == NOCRED && cred != NOCRED) { 1037 crhold(cred); 1038 bp->b_wcred = cred; 1039 } 1040 } 1041 1042 TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist); 1043 nmp->nm_bufqlen++; 1044 return (0); 1045 } 1046 1047 /* 1048 * All the iods are busy on other mounts, so return EIO to 1049 * force the caller to process the i/o synchronously. 1050 */ 1051 NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n")); 1052 return (EIO); 1053} 1054 1055/* 1056 * Do an I/O operation to/from a cache block. This may be called 1057 * synchronously or from an nfsiod. 1058 */ 1059int 1060nfs_doio(bp, cr, p) 1061 register struct buf *bp; 1062 struct ucred *cr; 1063 struct proc *p; 1064{ 1065 register struct uio *uiop; 1066 register struct vnode *vp; 1067 struct nfsnode *np; 1068 struct nfsmount *nmp; 1069 int error = 0, diff, len, iomode, must_commit = 0; 1070 struct uio uio; 1071 struct iovec io; 1072 1073 vp = bp->b_vp; 1074 np = VTONFS(vp); 1075 nmp = VFSTONFS(vp->v_mount); 1076 uiop = &uio; 1077 uiop->uio_iov = &io; 1078 uiop->uio_iovcnt = 1; 1079 uiop->uio_segflg = UIO_SYSSPACE; 1080 uiop->uio_procp = p; 1081 1082 /* 1083 * Historically, paging was done with physio, but no more. 1084 */ 1085 if (bp->b_flags & B_PHYS) { 1086 /* 1087 * ...though reading /dev/drum still gets us here. 
1088 */ 1089 io.iov_len = uiop->uio_resid = bp->b_bcount; 1090 /* mapping was done by vmapbuf() */ 1091 io.iov_base = bp->b_data; 1092 uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE; 1093 if (bp->b_flags & B_READ) { 1094 uiop->uio_rw = UIO_READ; 1095 nfsstats.read_physios++; 1096 error = nfs_readrpc(vp, uiop, cr); 1097 } else { 1098 int com; 1099 1100 iomode = NFSV3WRITE_DATASYNC; 1101 uiop->uio_rw = UIO_WRITE; 1102 nfsstats.write_physios++; 1103 error = nfs_writerpc(vp, uiop, cr, &iomode, &com); 1104 } 1105 if (error) { 1106 bp->b_flags |= B_ERROR; 1107 bp->b_error = error; 1108 } 1109 } else if (bp->b_flags & B_READ) { 1110 io.iov_len = uiop->uio_resid = bp->b_bcount; 1111 io.iov_base = bp->b_data; 1112 uiop->uio_rw = UIO_READ; 1113 switch (vp->v_type) { 1114 case VREG: 1115 uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE; 1116 nfsstats.read_bios++; 1117 error = nfs_readrpc(vp, uiop, cr); 1118 if (!error) { 1119 bp->b_validoff = 0; 1120 if (uiop->uio_resid) { 1121 /* 1122 * If len > 0, there is a hole in the file and 1123 * no writes after the hole have been pushed to 1124 * the server yet. 1125 * Just zero fill the rest of the valid area. 
1126 */ 1127 diff = bp->b_bcount - uiop->uio_resid; 1128 len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE 1129 + diff); 1130 if (len > 0) { 1131 len = min(len, uiop->uio_resid); 1132 bzero((char *)bp->b_data + diff, len); 1133 bp->b_validend = diff + len; 1134 } else 1135 bp->b_validend = diff; 1136 } else 1137 bp->b_validend = bp->b_bcount; 1138 } 1139 if (p && (vp->v_flag & VTEXT) && 1140 (((nmp->nm_flag & NFSMNT_NQNFS) && 1141 NQNFS_CKINVALID(vp, np, ND_READ) && 1142 np->n_lrev != np->n_brev) || 1143 (!(nmp->nm_flag & NFSMNT_NQNFS) && 1144 np->n_mtime != np->n_vattr.va_mtime.tv_sec))) { 1145 uprintf("Process killed due to text file modification\n"); 1146 psignal(p, SIGKILL); 1147 p->p_flag |= P_NOSWAP; 1148 } 1149 break; 1150 case VLNK: 1151 uiop->uio_offset = (off_t)0; 1152 nfsstats.readlink_bios++; 1153 error = nfs_readlinkrpc(vp, uiop, cr); 1154 break; 1155 case VDIR: 1156 nfsstats.readdir_bios++; 1157 uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ; 1158 if (nmp->nm_flag & NFSMNT_RDIRPLUS) { 1159 error = nfs_readdirplusrpc(vp, uiop, cr); 1160 if (error == NFSERR_NOTSUPP) 1161 nmp->nm_flag &= ~NFSMNT_RDIRPLUS; 1162 } 1163 if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0) 1164 error = nfs_readdirrpc(vp, uiop, cr); 1165 break; 1166 default: 1167 printf("nfs_doio: type %x unexpected\n",vp->v_type); 1168 break; 1169 }; 1170 if (error) { 1171 bp->b_flags |= B_ERROR; 1172 bp->b_error = error; 1173 } 1174 } else { 1175 if (((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend) > np->n_size) 1176 bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE); 1177 1178 if (bp->b_dirtyend > bp->b_dirtyoff) { 1179 io.iov_len = uiop->uio_resid = bp->b_dirtyend 1180 - bp->b_dirtyoff; 1181 uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE 1182 + bp->b_dirtyoff; 1183 io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; 1184 uiop->uio_rw = UIO_WRITE; 1185 nfsstats.write_bios++; 1186 if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC) 1187 iomode = 
NFSV3WRITE_UNSTABLE; 1188 else 1189 iomode = NFSV3WRITE_FILESYNC; 1190 bp->b_flags |= B_WRITEINPROG; 1191 error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit); 1192 if (!error && iomode == NFSV3WRITE_UNSTABLE) { 1193 bp->b_flags |= B_NEEDCOMMIT; 1194 if (bp->b_dirtyoff == 0 1195 && bp->b_dirtyend == bp->b_bufsize) 1196 bp->b_flags |= B_CLUSTEROK; 1197 } else 1198 bp->b_flags &= ~B_NEEDCOMMIT; 1199 bp->b_flags &= ~B_WRITEINPROG; 1200 1201 /* 1202 * For an interrupted write, the buffer is still valid 1203 * and the write hasn't been pushed to the server yet, 1204 * so we can't set B_ERROR and report the interruption 1205 * by setting B_EINTR. For the B_ASYNC case, B_EINTR 1206 * is not relevant, so the rpc attempt is essentially 1207 * a noop. For the case of a V3 write rpc not being 1208 * committed to stable storage, the block is still 1209 * dirty and requires either a commit rpc or another 1210 * write rpc with iomode == NFSV3WRITE_FILESYNC before 1211 * the block is reused. This is indicated by setting 1212 * the B_DELWRI and B_NEEDCOMMIT flags. 1213 */ 1214 if (error == EINTR 1215 || (!error && (bp->b_flags & B_NEEDCOMMIT))) { 1216 int s; 1217 1218 bp->b_flags &= ~(B_INVAL|B_NOCACHE); 1219 ++numdirtybuffers; 1220 bp->b_flags |= B_DELWRI; 1221 s = splbio(); 1222 reassignbuf(bp, vp); 1223 splx(s); 1224 if ((bp->b_flags & B_ASYNC) == 0) 1225 bp->b_flags |= B_EINTR; 1226 } else { 1227 if (error) { 1228 bp->b_flags |= B_ERROR; 1229 bp->b_error = np->n_error = error; 1230 np->n_flag |= NWRITEERR; 1231 } 1232 bp->b_dirtyoff = bp->b_dirtyend = 0; 1233 } 1234 } else { 1235 bp->b_resid = 0; 1236 biodone(bp); 1237 return (0); 1238 } 1239 } 1240 bp->b_resid = uiop->uio_resid; 1241 if (must_commit) 1242 nfs_clearcommit(vp->v_mount); 1243 biodone(bp); 1244 return (error); 1245} 1246