nfs_bio.c revision 33108
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 * $Id: nfs_bio.c,v 1.48 1998/01/31 01:27:18 tegge Exp $
 */

#include "opt_diagnostic.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_prot.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nqnfs.h>
#include <nfs/nfsnode.h>

static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
		struct proc *p));

extern int nfs_numasync;
extern struct nfsstats nfsstats;

/*
 * Vnode op for VM getpages.
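 * The requested page is read synchronously through nfs_readrpc() into a
 * temporary kernel mapping; all other pages handed in by the pager are
 * simply freed back to the VM system.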
 */
int
nfs_getpages(ap)
	struct vop_getpages_args *ap;
{
	int i, pcount, error;
	struct uio uio;
	struct iovec iov;
	vm_page_t m;
	vm_offset_t kva;

	if ((ap->a_vp->v_object) == NULL) {
		printf("nfs_getpages: called with non-merged cache vnode??\n");
		return EOPNOTSUPP;
	}

	m = ap->a_m[ap->a_reqpage];
	kva = vm_pager_map_page(m);

	iov.iov_base = (caddr_t) kva;
	iov.iov_len = PAGE_SIZE;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = IDX_TO_OFF(m->pindex);
	uio.uio_resid = PAGE_SIZE;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_READ;
	uio.uio_procp = curproc;

	error = nfs_readrpc(ap->a_vp, &uio, curproc->p_ucred);
	vm_pager_unmap_page(kva);

	pcount = round_page(ap->a_count) / PAGE_SIZE;
	for (i = 0; i < pcount; i++) {
		if (i != ap->a_reqpage) {
			vnode_pager_freepage(ap->a_m[i]);
		}
	}

	if (error && (uio.uio_resid == PAGE_SIZE))
		return VM_PAGER_ERROR;
	return 0;
}

/*
 * Vnode op for read using bio
 * Any similarity to readip() is purely coincidental
 */
int
nfs_bioread(vp, uio, ioflag, cred, getpages)
	register struct vnode *vp;
	register struct uio *uio;
	int ioflag;
	struct ucred *cred;
	int getpages;
{
	register struct nfsnode *np = VTONFS(vp);
	register int biosize, diff, i;
	struct buf *bp = 0, *rabp;
	struct vattr vattr;
	struct proc *p;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn, rabn;
	int bufsize;
	int nra, error = 0, n = 0, on = 0, not_readin;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_offset < 0)
		return (EINVAL);
	p = uio->uio_procp;
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	biosize = vp->v_mount->mnt_stat.f_iosize;
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nqnfs, full cache consistency is maintained within the loop.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * NB: This implies that cache data can be read when up to
	 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
	 * attributes this could be forced by setting n_attrstamp to 0 before
	 * the VOP_GETATTR() call.
	 */
	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
		if (np->n_flag & NMODIFIED) {
			if (vp->v_type != VREG) {
				if (vp->v_type != VDIR)
					panic("nfs: bioread, not dir");
				nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			np->n_mtime = vattr.va_mtime.tv_sec;
		} else {
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			if (np->n_mtime != vattr.va_mtime.tv_sec) {
				if (vp->v_type == VDIR)
					nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_mtime = vattr.va_mtime.tv_sec;
			}
		}
	}
	do {

		/*
		 * Get a valid lease. If cached data is stale, flush it.
		 */
		if (nmp->nm_flag & NFSMNT_NQNFS) {
			if (NQNFS_CKINVALID(vp, np, ND_READ)) {
				do {
					error = nqnfs_getlease(vp, ND_READ, cred, p);
				} while (error == NQNFS_EXPIRED);
				if (error)
					return (error);
				if (np->n_lrev != np->n_brev ||
				    (np->n_flag & NQNFSNONCACHE) ||
				    ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
					if (vp->v_type == VDIR)
						nfs_invaldir(vp);
					error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
					if (error)
						return (error);
					np->n_brev = np->n_lrev;
				}
			} else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
				nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
		}
		if (np->n_flag & NQNFSNONCACHE) {
			switch (vp->v_type) {
			case VREG:
				return (nfs_readrpc(vp, uio, cred));
			case VLNK:
				return (nfs_readlinkrpc(vp, uio, cred));
			case VDIR:
				break;
			default:
				printf(" NQNFSNONCACHE: type %x unexpected\n",
					vp->v_type);
			};
		}
		switch (vp->v_type) {
		case VREG:
			nfsstats.biocache_reads++;
			lbn = uio->uio_offset / biosize;
			on = uio->uio_offset & (biosize - 1);
			not_readin = 1;

			/*
			 * Start the read ahead(s), as required.
			 */
			if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
				for (nra = 0; nra < nmp->nm_readahead &&
				    (off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
					rabn = lbn + 1 + nra;
					if (!incore(vp, rabn)) {
						rabp = nfs_getcacheblk(vp, rabn, biosize, p);
						if (!rabp)
							return (EINTR);
						if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
							rabp->b_flags |= (B_READ | B_ASYNC);
							vfs_busy_pages(rabp, 0);
							if (nfs_asyncio(rabp, cred)) {
								rabp->b_flags |= B_INVAL|B_ERROR;
								vfs_unbusy_pages(rabp);
								brelse(rabp);
							}
						} else
							brelse(rabp);
					}
				}
			}

			/*
			 * If the block is in the cache and has the required data
			 * in a valid region, just copy it out.
			 * Otherwise, get the block and write back/read in,
			 * as required.
			 */
again:
			bufsize = biosize;
			if ((off_t)(lbn + 1) * biosize > np->n_size &&
			    (off_t)(lbn + 1) * biosize - np->n_size < biosize) {
				bufsize = np->n_size - lbn * biosize;
				bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
			}
			bp = nfs_getcacheblk(vp, lbn, bufsize, p);
			if (!bp)
				return (EINTR);
			/*
			 * If we are being called from nfs_getpages, we must
			 * make sure the buffer is a vmio buffer.  The vp will
			 * already be setup for vmio but there may be some old
			 * non-vmio buffers attached to it.
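			 * Mark such a buffer B_NOCACHE, push it out (or just
			 * release it if clean), and retry to get a vmio buffer.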
			 */
			if (getpages && !(bp->b_flags & B_VMIO)) {
#ifdef DIAGNOSTIC
				printf("nfs_bioread: non vmio buf found, discarding\n");
#endif
				bp->b_flags |= B_NOCACHE;
				bp->b_flags |= B_INVAFTERWRITE;
				if (bp->b_dirtyend > 0) {
					if ((bp->b_flags & B_DELWRI) == 0)
						panic("nfsbioread");
					if (VOP_BWRITE(bp) == EINTR)
						return (EINTR);
				} else
					brelse(bp);
				goto again;
			}
			if ((bp->b_flags & B_CACHE) == 0) {
				bp->b_flags |= B_READ;
				bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
				not_readin = 0;
				vfs_busy_pages(bp, 0);
				error = nfs_doio(bp, cred, p);
				if (error) {
					brelse(bp);
					return (error);
				}
			}
			if (bufsize > on) {
				n = min((unsigned)(bufsize - on), uio->uio_resid);
			} else {
				n = 0;
			}
			diff = np->n_size - uio->uio_offset;
			if (diff < n)
				n = diff;
			if (not_readin && n > 0) {
				if (on < bp->b_validoff || (on + n) > bp->b_validend) {
					bp->b_flags |= B_NOCACHE;
					bp->b_flags |= B_INVAFTERWRITE;
					if (bp->b_dirtyend > 0) {
						if ((bp->b_flags & B_DELWRI) == 0)
							panic("nfsbioread");
						if (VOP_BWRITE(bp) == EINTR)
							return (EINTR);
					} else
						brelse(bp);
					goto again;
				}
			}
			vp->v_lastr = lbn;
			diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
			if (diff < n)
				n = diff;
			break;
		case VLNK:
			nfsstats.biocache_readlinks++;
			bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p);
			if (!bp)
				return (EINTR);
			if ((bp->b_flags & B_CACHE) == 0) {
				bp->b_flags |= B_READ;
				vfs_busy_pages(bp, 0);
				error = nfs_doio(bp, cred, p);
				if (error) {
					bp->b_flags |= B_ERROR;
					brelse(bp);
					return (error);
				}
			}
			n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
			on = 0;
			break;
		case VDIR:
			nfsstats.biocache_readdirs++;
			if (np->n_direofoffset
			    && uio->uio_offset >= np->n_direofoffset) {
				return (0);
			}
			lbn = uio->uio_offset / NFS_DIRBLKSIZ;
			on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
			bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p);
			if (!bp)
				return (EINTR);
			if ((bp->b_flags & B_CACHE) == 0) {
				bp->b_flags |= B_READ;
				vfs_busy_pages(bp, 0);
				error = nfs_doio(bp, cred, p);
				if (error) {
					brelse(bp);
				}
				while (error == NFSERR_BAD_COOKIE) {
					nfs_invaldir(vp);
					error = nfs_vinvalbuf(vp, 0, cred, p, 1);
					/*
					 * Yuck! The directory has been modified on the
					 * server. The only way to get the block is by
					 * reading from the beginning to get all the
					 * offset cookies.
					 */
					for (i = 0; i <= lbn && !error; i++) {
						if (np->n_direofoffset
						    && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
							return (0);
						bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p);
						if (!bp)
							return (EINTR);
						if ((bp->b_flags & B_DONE) == 0) {
							bp->b_flags |= B_READ;
							vfs_busy_pages(bp, 0);
							error = nfs_doio(bp, cred, p);
							if (error) {
								brelse(bp);
							} else if (i < lbn) {
								brelse(bp);
							}
						}
					}
				}
				if (error)
					return (error);
			}

			/*
			 * If not eof and read aheads are enabled, start one.
			 * (You need the current block first, so that you have the
			 * directory offset cookie of the next block.)
			 */
			if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
			    (np->n_direofoffset == 0 ||
			    (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
			    !(np->n_flag & NQNFSNONCACHE) &&
			    !incore(vp, lbn + 1)) {
				rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p);
				if (rabp) {
					if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
						rabp->b_flags |= (B_READ | B_ASYNC);
						vfs_busy_pages(rabp, 0);
						if (nfs_asyncio(rabp, cred)) {
							rabp->b_flags |= B_INVAL|B_ERROR;
							vfs_unbusy_pages(rabp);
							brelse(rabp);
						}
					} else {
						brelse(rabp);
					}
				}
			}
			/*
			 * Make sure we use a signed variant of min() since
			 * the second term may be negative.
			 */
			n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
			break;
		default:
			printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
			break;
		};

		if (n > 0) {
			error = uiomove(bp->b_data + on, (int)n, uio);
		}
		switch (vp->v_type) {
		case VREG:
			break;
		case VLNK:
			n = 0;
			break;
		case VDIR:
			if (np->n_flag & NQNFSNONCACHE)
				bp->b_flags |= B_INVAL;
			break;
		default:
			printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
		}
		brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}

/*
 * Vnode op for write using bio
 */
int
nfs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int  a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	register int biosize;
	register struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	register struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	register struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn;
	int bufsize;
	int n, on, error = 0, iomode, must_commit;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
				return (error);
		}
		if (ioflag & IO_APPEND) {
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			uio->uio_offset = np->n_size;
		}
	}
	if (uio->uio_offset < 0)
		return (EINVAL);
	if (uio->uio_resid == 0)
		return (0);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, i don't think it matters
	 */
	if (p && uio->uio_offset + uio->uio_resid >
	    p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		psignal(p, SIGXFSZ);
		return (EFBIG);
	}
	/*
	 * I use nm_rsize, not nm_wsize so that all buffer cache blocks
	 * will be the same size within a filesystem. nfs_writerpc will
	 * still use nm_wsize when sizing the rpc's.
	 */
	biosize = vp->v_mount->mnt_stat.f_iosize;
	do {
		/*
		 * Check for a valid write lease.
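		 * (NQNFS mounts only; for plain NFS the NFSMNT_NQNFS test
		 * below fails and this step is skipped.)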
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error)
				return (error);
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
			}
		}
		if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
			iomode = NFSV3WRITE_FILESYNC;
			error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
			if (must_commit)
				nfs_clearcommit(vp->v_mount);
			return (error);
		}
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize-1);
		n = min((unsigned)(biosize - on), uio->uio_resid);
again:
		if (uio->uio_offset + n > np->n_size) {
			np->n_size = uio->uio_offset + n;
			np->n_flag |= NMODIFIED;
			vnode_pager_setsize(vp, (u_long)np->n_size);
		}
		bufsize = biosize;
		if ((lbn + 1) * biosize > np->n_size) {
			bufsize = np->n_size - lbn * biosize;
			bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		}
		bp = nfs_getcacheblk(vp, lbn, bufsize, p);
		if (!bp)
			return (EINTR);
		if (bp->b_wcred == NOCRED) {
			crhold(cred);
			bp->b_wcred = cred;
		}
		np->n_flag |= NMODIFIED;

		if ((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend > np->n_size) {
			bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);
		}

		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise force a write rpc of the old dirty area.
		 */
		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			bp->b_proc = p;
			if (VOP_BWRITE(bp) == EINTR)
				return (EINTR);
			goto again;
		}

		/*
		 * Check for valid write lease and get one as required.
		 * In case getblk() and/or bwrite() delayed us.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error) {
				brelse(bp);
				return (error);
			}
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				brelse(bp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
				goto again;
			}
		}
		error = uiomove((char *)bp->b_data + on, n, uio);
		if (error) {
			bp->b_flags |= B_ERROR;
			brelse(bp);
			return (error);
		}
		if (bp->b_dirtyend > 0) {
			bp->b_dirtyoff = min(on, bp->b_dirtyoff);
			bp->b_dirtyend = max((on + n), bp->b_dirtyend);
		} else {
			bp->b_dirtyoff = on;
			bp->b_dirtyend = on + n;
		}
		if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
		    bp->b_validoff > bp->b_dirtyend) {
			bp->b_validoff = bp->b_dirtyoff;
			bp->b_validend = bp->b_dirtyend;
		} else {
			bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
			bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
		}

		/*
		 * Since this block is being modified, it must be written
		 * again and not just committed.
		 */
		bp->b_flags &= ~B_NEEDCOMMIT;

		/*
		 * If the lease is non-cachable or IO_SYNC do bwrite().
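		 * Otherwise a completely filled block is pushed asynchronously
		 * (non-nqnfs only) and anything else is simply delay-written.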
		 */
		if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
			bp->b_proc = p;
			error = VOP_BWRITE(bp);
			if (error)
				return (error);
			if (np->n_flag & NQNFSNONCACHE) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
		} else if ((n + on) == biosize &&
		    (nmp->nm_flag & NFSMNT_NQNFS) == 0) {
			bp->b_proc = (struct proc *)0;
			bp->b_flags |= B_ASYNC;
			(void)nfs_writebp(bp, 0);
		} else
			bdwrite(bp);
	} while (uio->uio_resid > 0 && n > 0);
	return (0);
}

/*
 * Get an nfs cache block.
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 */
static struct buf *
nfs_getcacheblk(vp, bn, size, p)
	struct vnode *vp;
	daddr_t bn;
	int size;
	struct proc *p;
{
	register struct buf *bp;
	struct mount *mp;
	struct nfsmount *nmp;

	mp = vp->v_mount;
	nmp = VFSTONFS(mp);

	if (nmp->nm_flag & NFSMNT_INT) {
		bp = getblk(vp, bn, size, PCATCH, 0);
		while (bp == (struct buf *)0) {
			if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
				return ((struct buf *)0);
			bp = getblk(vp, bn, size, 0, 2 * hz);
		}
	} else
		bp = getblk(vp, bn, size, 0, 0);

	if (vp->v_type == VREG) {
		int biosize;
		biosize = mp->mnt_stat.f_iosize;
		bp->b_blkno = (bn * biosize) / DEV_BSIZE;
	}

	return (bp);
}

/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int intrflg;
{
	register struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;

	if (vp->v_flag & VXLOCK) {
		return (0);
	}

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}
	/*
	 * First wait for any other process doing a flush to complete.
	 */
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
			slptimeo);
		if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
			return (EINTR);
	}

	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
	while (error) {
		if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) {
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup((caddr_t)&np->n_flag);
			}
			return (EINTR);
		}
		error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
	}
	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
	return (0);
}

/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 */
int
nfs_asyncio(bp, cred)
	register struct buf *bp;
	struct ucred *cred;
{
	struct nfsmount *nmp;
	int i;
	int gotiod;
	int slpflag = 0;
	int slptimeo = 0;
	int error;

	if (nfs_numasync == 0)
		return (EIO);

	nmp = VFSTONFS(bp->b_vp->v_mount);
again:
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	gotiod = FALSE;

	/*
	 * Find a free iod to process this request.
	 */
	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
		if (nfs_iodwant[i]) {
			/*
			 * Found one, so wake it up and tell it which
			 * mount to process.
			 */
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waking iod %d for mount %p\n",
				 i, nmp));
			nfs_iodwant[i] = (struct proc *)0;
			nfs_iodmount[i] = nmp;
			nmp->nm_bufqiods++;
			wakeup((caddr_t)&nfs_iodwant[i]);
			gotiod = TRUE;
			break;
		}

	/*
	 * If none are free, we may already have an iod working on this mount
	 * point.  If so, it will process our request.
	 */
	if (!gotiod) {
		if (nmp->nm_bufqiods > 0) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: %d iods are already processing mount %p\n",
				 nmp->nm_bufqiods, nmp));
			gotiod = TRUE;
		}
	}

	/*
	 * If we have an iod which can process the request, then queue
	 * the buffer.
	 */
	if (gotiod) {
		/*
		 * Ensure that the queue never grows too large.
		 */
		while (nmp->nm_bufqlen >= 2*nfs_numasync) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
			nmp->nm_bufqwant = TRUE;
			error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
				"nfsaio", slptimeo);
			if (error) {
				if (nfs_sigintr(nmp, NULL, bp->b_proc))
					return (EINTR);
				if (slpflag == PCATCH) {
					slpflag = 0;
					slptimeo = 2 * hz;
				}
			}
			/*
			 * We might have lost our iod while sleeping,
			 * so check and loop if necessary.
			 */
			if (nmp->nm_bufqiods == 0) {
				NFS_DPF(ASYNCIO,
					("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
				goto again;
			}
		}

		if (bp->b_flags & B_READ) {
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
		} else {
			bp->b_flags |= B_WRITEINPROG;
			if (bp->b_wcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_wcred = cred;
			}
		}

		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
		nmp->nm_bufqlen++;
		return (0);
	}

	/*
	 * All the iods are busy on other mounts, so return EIO to
	 * force the caller to process the i/o synchronously.
	 */
	NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
	return (EIO);
}

/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(bp, cr, p)
	register struct buf *bp;
	struct ucred *cr;
	struct proc *p;
{
	register struct uio *uiop;
	register struct vnode *vp;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, diff, len, iomode, must_commit = 0;
	struct uio uio;
	struct iovec io;

	vp = bp->b_vp;
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_procp = p;

	/*
	 * Historically, paging was done with physio, but no more.
	 */
	if (bp->b_flags & B_PHYS) {
		/*
		 * ...though reading /dev/drum still gets us here.
		 */
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		/* mapping was done by vmapbuf() */
		io.iov_base = bp->b_data;
		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
		if (bp->b_flags & B_READ) {
			uiop->uio_rw = UIO_READ;
			nfsstats.read_physios++;
			error = nfs_readrpc(vp, uiop, cr);
		} else {
			int com;

			iomode = NFSV3WRITE_DATASYNC;
			uiop->uio_rw = UIO_WRITE;
			nfsstats.write_physios++;
			error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
		}
		if (error) {
			bp->b_flags |= B_ERROR;
			bp->b_error = error;
		}
	} else if (bp->b_flags & B_READ) {
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		io.iov_base = bp->b_data;
		uiop->uio_rw = UIO_READ;
		switch (vp->v_type) {
		case VREG:
			uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
			nfsstats.read_bios++;
			error = nfs_readrpc(vp, uiop, cr);
			if (!error) {
				bp->b_validoff = 0;
				if (uiop->uio_resid) {
					/*
					 * If len > 0, there is a hole in the file and
					 * no writes after the hole have been pushed to
					 * the server yet.
					 * Just zero fill the rest of the valid area.
					 */
					diff = bp->b_bcount - uiop->uio_resid;
					len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE
						+ diff);
					if (len > 0) {
						len = min(len, uiop->uio_resid);
						bzero((char *)bp->b_data + diff, len);
						bp->b_validend = diff + len;
					} else
						bp->b_validend = diff;
				} else
					bp->b_validend = bp->b_bcount;
			}
			if (p && (vp->v_flag & VTEXT) &&
			    (((nmp->nm_flag & NFSMNT_NQNFS) &&
			      NQNFS_CKINVALID(vp, np, ND_READ) &&
			      np->n_lrev != np->n_brev) ||
			     (!(nmp->nm_flag & NFSMNT_NQNFS) &&
			      np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
				uprintf("Process killed due to text file modification\n");
				psignal(p, SIGKILL);
				p->p_flag |= P_NOSWAP;
			}
			break;
		case VLNK:
			uiop->uio_offset = (off_t)0;
			nfsstats.readlink_bios++;
			error = nfs_readlinkrpc(vp, uiop, cr);
			break;
		case VDIR:
			nfsstats.readdir_bios++;
			uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
			if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
				error = nfs_readdirplusrpc(vp, uiop, cr);
				if (error == NFSERR_NOTSUPP)
					nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
			}
			if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
				error = nfs_readdirrpc(vp, uiop, cr);
			break;
		default:
			printf("nfs_doio: type %x unexpected\n",vp->v_type);
			break;
		};
		if (error) {
			bp->b_flags |= B_ERROR;
			bp->b_error = error;
		}
	} else {
		if (((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend) > np->n_size)
			bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);

		if (bp->b_dirtyend > bp->b_dirtyoff) {
			io.iov_len = uiop->uio_resid = bp->b_dirtyend
				- bp->b_dirtyoff;
			uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE
				+ bp->b_dirtyoff;
			io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
			uiop->uio_rw = UIO_WRITE;
			nfsstats.write_bios++;
			if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC)
				iomode = NFSV3WRITE_UNSTABLE;
			else
				iomode = NFSV3WRITE_FILESYNC;
			bp->b_flags |= B_WRITEINPROG;
			error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
			if (!error && iomode == NFSV3WRITE_UNSTABLE) {
				bp->b_flags |= B_NEEDCOMMIT;
				if (bp->b_dirtyoff == 0
				    && bp->b_dirtyend == bp->b_bufsize)
					bp->b_flags |= B_CLUSTEROK;
			} else
				bp->b_flags &= ~B_NEEDCOMMIT;
			bp->b_flags &= ~B_WRITEINPROG;

			/*
			 * For an interrupted write, the buffer is still valid
			 * and the write hasn't been pushed to the server yet,
			 * so we can't set B_ERROR and report the interruption
			 * by setting B_EINTR. For the B_ASYNC case, B_EINTR
			 * is not relevant, so the rpc attempt is essentially
			 * a noop.  For the case of a V3 write rpc not being
			 * committed to stable storage, the block is still
			 * dirty and requires either a commit rpc or another
			 * write rpc with iomode == NFSV3WRITE_FILESYNC before
			 * the block is reused. This is indicated by setting
			 * the B_DELWRI and B_NEEDCOMMIT flags.
			 */
			if (error == EINTR
			    || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
				bp->b_flags &= ~(B_INVAL|B_NOCACHE);
				++numdirtybuffers;
				bp->b_flags |= B_DELWRI;
				reassignbuf(bp, vp);
				if ((bp->b_flags & B_ASYNC) == 0)
					bp->b_flags |= B_EINTR;
			} else {
				if (error) {
					bp->b_flags |= B_ERROR;
					bp->b_error = np->n_error = error;
					np->n_flag |= NWRITEERR;
				}
				bp->b_dirtyoff = bp->b_dirtyend = 0;
			}
		} else {
			bp->b_resid = 0;
			biodone(bp);
			return (0);
		}
	}
	bp->b_resid = uiop->uio_resid;
	if (must_commit)
		nfs_clearcommit(vp->v_mount);
	biodone(bp);
	return (error);
}
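The block arithmetic used above (lbn/on derived from the file offset and the mount's biosize, plus the contiguous dirty-region merge in nfs_write()) can be exercised outside the kernel. The following stand-alone sketch is not part of nfs_bio.c; BIOSIZE, struct dirty, and merge() are names invented purely for illustration.

/*
 * Stand-alone, user-space illustration (not kernel code) of the nfs_write()
 * block arithmetic: lbn/on computation and the contiguous-dirty-region merge.
 */
#include <stdio.h>
#include <sys/types.h>

#define BIOSIZE	8192	/* example f_iosize; the real value comes from the mount */

struct dirty {
	int off;	/* like b_dirtyoff */
	int end;	/* like b_dirtyend, 0 when the buffer is clean */
};

/* Merge a write of [on, on+n) into the dirty region, as nfs_write() does. */
static int
merge(struct dirty *d, int on, int n)
{
	if (d->end > 0 && (on > d->end || on + n < d->off))
		return (1);	/* not contiguous: caller must flush first */
	if (d->end > 0) {
		d->off = on < d->off ? on : d->off;
		d->end = on + n > d->end ? on + n : d->end;
	} else {
		d->off = on;
		d->end = on + n;
	}
	return (0);
}

int
main(void)
{
	off_t offset = 20000;			/* file offset being written */
	long lbn = offset / BIOSIZE;		/* logical block number */
	int on = offset & (BIOSIZE - 1);	/* offset within that block */
	struct dirty d = { 0, 0 };

	printf("offset %ld -> lbn %ld, on %d\n", (long)offset, lbn, on);
	merge(&d, on, 1000);			/* first write dirties [on, on+1000) */
	printf("dirty [%d, %d)\n", d.off, d.end);
	if (merge(&d, on + 1000, 500) == 0)	/* contiguous, so the region grows */
		printf("dirty [%d, %d)\n", d.off, d.end);
	return (0);
}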