nfs_bio.c revision 11921
1/* 2 * Copyright (c) 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * Rick Macklem at The University of Guelph. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. All advertising materials mentioning features or use of this software 17 * must display the following acknowledgement: 18 * This product includes software developed by the University of 19 * California, Berkeley and its contributors. 20 * 4. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 * 36 * @(#)nfs_bio.c 8.5 (Berkeley) 1/4/94 37 * $Id: nfs_bio.c,v 1.17 1995/08/24 10:17:32 dfr Exp $ 38 */ 39 40#include <sys/param.h> 41#include <sys/systm.h> 42#include <sys/resourcevar.h> 43#include <sys/signalvar.h> 44#include <sys/proc.h> 45#include <sys/buf.h> 46#include <sys/vnode.h> 47#include <sys/mount.h> 48#include <sys/kernel.h> 49 50#include <vm/vm.h> 51 52#include <nfs/rpcv2.h> 53#include <nfs/nfsproto.h> 54#include <nfs/nfs.h> 55#include <nfs/nfsmount.h> 56#include <nfs/nqnfs.h> 57#include <nfs/nfsnode.h> 58 59struct buf *nfs_getcacheblk(); 60extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON]; 61extern int nfs_numasync; 62extern struct nfsstats nfsstats; 63 64/* 65 * Ifdefs for FreeBSD-current's merged VM/buffer cache. It is unfortunate 66 * that this isn't done inside getblk() and brelse() so these calls 67 * wouldn't need to be here. 68 */ 69#ifdef B_VMIO 70#define vnode_pager_uncache(vp) 71#else 72#define vfs_busy_pages(bp, f) 73#define vfs_unbusy_pages(bp) 74#define vfs_dirty_pages(bp) 75#endif 76 77/* 78 * Vnode op for read using bio 79 * Any similarity to readip() is purely coincidental 80 */ 81int 82nfs_bioread(vp, uio, ioflag, cred) 83 register struct vnode *vp; 84 register struct uio *uio; 85 int ioflag; 86 struct ucred *cred; 87{ 88 register struct nfsnode *np = VTONFS(vp); 89 register int biosize, diff, i; 90 struct buf *bp = 0, *rabp; 91 struct vattr vattr; 92 struct proc *p; 93 struct nfsmount *nmp = VFSTONFS(vp->v_mount); 94 daddr_t lbn, rabn; 95 int bufsize; 96 int nra, error = 0, n = 0, on = 0, not_readin; 97 98#ifdef DIAGNOSTIC 99 if (uio->uio_rw != UIO_READ) 100 panic("nfs_read mode"); 101#endif 102 if (uio->uio_resid == 0) 103 return (0); 104 if (uio->uio_offset < 0) 105 return (EINVAL); 106 p = uio->uio_procp; 107 if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3) 108 (void)nfs_fsinfo(nmp, vp, cred, p); 109 biosize = vp->v_mount->mnt_stat.f_iosize; 110 /* 111 * For nfs, cache consistency can only be maintained approximately. 112 * Although RFC1094 does not specify the criteria, the following is 113 * believed to be compatible with the reference port. 114 * For nqnfs, full cache consistency is maintained within the loop. 115 * For nfs: 116 * If the file's modify time on the server has changed since the 117 * last read rpc or you have written to the file, 118 * you may have lost data cache consistency with the 119 * server, so flush all of the file's data out of the cache. 120 * Then force a getattr rpc to ensure that you have up to date 121 * attributes. 122 * NB: This implies that cache data can be read when up to 123 * NFS_ATTRTIMEO seconds out of date. If you find that you need current 124 * attributes this could be forced by setting n_attrstamp to 0 before 125 * the VOP_GETATTR() call. 126 */ 127 if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) { 128 if (np->n_flag & NMODIFIED) { 129 if (vp->v_type != VREG) { 130 if (vp->v_type != VDIR) 131 panic("nfs: bioread, not dir"); 132 nfs_invaldir(vp); 133 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 134 if (error) 135 return (error); 136 } 137 np->n_attrstamp = 0; 138 error = VOP_GETATTR(vp, &vattr, cred, p); 139 if (error) 140 return (error); 141 np->n_mtime = vattr.va_mtime.ts_sec; 142 } else { 143 error = VOP_GETATTR(vp, &vattr, cred, p); 144 if (error) 145 return (error); 146 if (np->n_mtime != vattr.va_mtime.ts_sec) { 147 if (vp->v_type == VDIR) 148 nfs_invaldir(vp); 149 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 150 if (error) 151 return (error); 152 np->n_mtime = vattr.va_mtime.ts_sec; 153 } 154 } 155 } 156 do { 157 158 /* 159 * Get a valid lease. If cached data is stale, flush it. 160 */ 161 if (nmp->nm_flag & NFSMNT_NQNFS) { 162 if (NQNFS_CKINVALID(vp, np, ND_READ)) { 163 do { 164 error = nqnfs_getlease(vp, ND_READ, cred, p); 165 } while (error == NQNFS_EXPIRED); 166 if (error) 167 return (error); 168 if (np->n_lrev != np->n_brev || 169 (np->n_flag & NQNFSNONCACHE) || 170 ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) { 171 if (vp->v_type == VDIR) 172 nfs_invaldir(vp); 173 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 174 if (error) 175 return (error); 176 np->n_brev = np->n_lrev; 177 } 178 } else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) { 179 nfs_invaldir(vp); 180 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 181 if (error) 182 return (error); 183 } 184 } 185 if (np->n_flag & NQNFSNONCACHE) { 186 switch (vp->v_type) { 187 case VREG: 188 return (nfs_readrpc(vp, uio, cred)); 189 case VLNK: 190 return (nfs_readlinkrpc(vp, uio, cred)); 191 case VDIR: 192 break; 193 default: 194 printf(" NQNFSNONCACHE: type %x unexpected\n", 195 vp->v_type); 196 }; 197 } 198 switch (vp->v_type) { 199 case VREG: 200 nfsstats.biocache_reads++; 201 lbn = uio->uio_offset / biosize; 202 on = uio->uio_offset & (biosize - 1); 203 not_readin = 1; 204 205 /* 206 * Start the read ahead(s), as required. 207 */ 208 if (nfs_numasync > 0 && nmp->nm_readahead > 0) { 209 for (nra = 0; nra < nmp->nm_readahead && 210 (lbn + 1 + nra) * biosize < np->n_size; nra++) { 211 rabn = lbn + 1 + nra; 212 if (!incore(vp, rabn)) { 213 rabp = nfs_getcacheblk(vp, rabn, biosize, p); 214 if (!rabp) 215 return (EINTR); 216 if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { 217 rabp->b_flags |= (B_READ | B_ASYNC); 218 vfs_busy_pages(rabp, 0); 219 if (nfs_asyncio(rabp, cred)) { 220 rabp->b_flags |= B_INVAL|B_ERROR; 221 vfs_unbusy_pages(rabp); 222 brelse(rabp); 223 } 224 } else { 225 brelse(rabp); 226 } 227 } 228 } 229 } 230 231 /* 232 * If the block is in the cache and has the required data 233 * in a valid region, just copy it out. 234 * Otherwise, get the block and write back/read in, 235 * as required. 236 */ 237again: 238 bufsize = biosize; 239 if ((lbn + 1) * biosize > np->n_size) { 240 bufsize = np->n_size - lbn * biosize; 241 bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); 242 } 243 bp = nfs_getcacheblk(vp, lbn, bufsize, p); 244 if (!bp) 245 return (EINTR); 246 if ((bp->b_flags & B_CACHE) == 0) { 247 bp->b_flags |= B_READ; 248 not_readin = 0; 249 vfs_busy_pages(bp, 0); 250 error = nfs_doio(bp, cred, p); 251 if (error) { 252 brelse(bp); 253 return (error); 254 } 255 } 256 if (bufsize > on) { 257 n = min((unsigned)(bufsize - on), uio->uio_resid); 258 } else { 259 n = 0; 260 } 261 diff = np->n_size - uio->uio_offset; 262 if (diff < n) 263 n = diff; 264 if (not_readin && n > 0) { 265 if (on < bp->b_validoff || (on + n) > bp->b_validend) { 266 bp->b_flags |= B_NOCACHE; 267 if (bp->b_dirtyend > 0) { 268 if ((bp->b_flags & B_DELWRI) == 0) 269 panic("nfsbioread"); 270 if (VOP_BWRITE(bp) == EINTR) 271 return (EINTR); 272 } else 273 brelse(bp); 274 goto again; 275 } 276 } 277 vp->v_lastr = lbn; 278 diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on); 279 if (diff < n) 280 n = diff; 281 break; 282 case VLNK: 283 nfsstats.biocache_readlinks++; 284 bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p); 285 if (!bp) 286 return (EINTR); 287 if ((bp->b_flags & B_CACHE) == 0) { 288 bp->b_flags |= B_READ; 289 vfs_busy_pages(bp, 0); 290 error = nfs_doio(bp, cred, p); 291 if (error) { 292 bp->b_flags |= B_ERROR; 293 brelse(bp); 294 return (error); 295 } 296 } 297 n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid); 298 on = 0; 299 break; 300 case VDIR: 301 nfsstats.biocache_readdirs++; 302 lbn = uio->uio_offset / NFS_DIRBLKSIZ; 303 on = uio->uio_offset & (NFS_DIRBLKSIZ - 1); 304 bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p); 305 if (!bp) 306 return (EINTR); 307 if ((bp->b_flags & B_CACHE) == 0) { 308 bp->b_flags |= B_READ; 309 vfs_busy_pages(bp, 0); 310 error = nfs_doio(bp, cred, p); 311 if (error) { 312 brelse(bp); 313 while (error == NFSERR_BAD_COOKIE) { 314 nfs_invaldir(vp); 315 error = nfs_vinvalbuf(vp, 0, cred, p, 1); 316 /* 317 * Yuck! The directory has been modified on the 318 * server. The only way to get the block is by 319 * reading from the beginning to get all the 320 * offset cookies. 321 */ 322 for (i = 0; i <= lbn && !error; i++) { 323 bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p); 324 if (!bp) 325 return (EINTR); 326 if ((bp->b_flags & B_DONE) == 0) { 327 bp->b_flags |= B_READ; 328 vfs_busy_pages(bp, 0); 329 error = nfs_doio(bp, cred, p); 330 if (error) 331 brelse(bp); 332 } 333 } 334 } 335 if (error) 336 return (error); 337 } 338 } 339 340 /* 341 * If not eof and read aheads are enabled, start one. 342 * (You need the current block first, so that you have the 343 * directory offset cookie of the next block.) 344 */ 345 if (nfs_numasync > 0 && nmp->nm_readahead > 0 && 346 (np->n_direofoffset == 0 || 347 (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) && 348 !(np->n_flag & NQNFSNONCACHE) && 349 !incore(vp, lbn + 1)) { 350 rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p); 351 if (rabp) { 352 if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { 353 rabp->b_flags |= (B_READ | B_ASYNC); 354 vfs_busy_pages(rabp, 0); 355 if (nfs_asyncio(rabp, cred)) { 356 rabp->b_flags |= B_INVAL|B_ERROR; 357 vfs_unbusy_pages(rabp); 358 brelse(rabp); 359 } 360 } else { 361 brelse(rabp); 362 } 363 } 364 } 365 n = min(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on); 366 break; 367 default: 368 printf(" nfs_bioread: type %x unexpected\n",vp->v_type); 369 break; 370 }; 371 372 if (n > 0) { 373 error = uiomove(bp->b_data + on, (int)n, uio); 374 } 375 switch (vp->v_type) { 376 case VREG: 377 break; 378 case VLNK: 379 n = 0; 380 break; 381 case VDIR: 382 if (np->n_flag & NQNFSNONCACHE) 383 bp->b_flags |= B_INVAL; 384 break; 385 default: 386 printf(" nfs_bioread: type %x unexpected\n",vp->v_type); 387 } 388 brelse(bp); 389 } while (error == 0 && uio->uio_resid > 0 && n > 0); 390 return (error); 391} 392 393/* 394 * Vnode op for write using bio 395 */ 396int 397nfs_write(ap) 398 struct vop_write_args /* { 399 struct vnode *a_vp; 400 struct uio *a_uio; 401 int a_ioflag; 402 struct ucred *a_cred; 403 } */ *ap; 404{ 405 register int biosize; 406 register struct uio *uio = ap->a_uio; 407 struct proc *p = uio->uio_procp; 408 register struct vnode *vp = ap->a_vp; 409 struct nfsnode *np = VTONFS(vp); 410 register struct ucred *cred = ap->a_cred; 411 int ioflag = ap->a_ioflag; 412 struct buf *bp; 413 struct vattr vattr; 414 struct nfsmount *nmp = VFSTONFS(vp->v_mount); 415 daddr_t lbn; 416 int bufsize; 417 int n, on, error = 0, iomode, must_commit; 418 419#ifdef DIAGNOSTIC 420 if (uio->uio_rw != UIO_WRITE) 421 panic("nfs_write mode"); 422 if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc) 423 panic("nfs_write proc"); 424#endif 425 if (vp->v_type != VREG) 426 return (EIO); 427 if (np->n_flag & NWRITEERR) { 428 np->n_flag &= ~NWRITEERR; 429 return (np->n_error); 430 } 431 if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3) 432 (void)nfs_fsinfo(nmp, vp, cred, p); 433 if (ioflag & (IO_APPEND | IO_SYNC)) { 434 if (np->n_flag & NMODIFIED) { 435 np->n_attrstamp = 0; 436 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 437 if (error) 438 return (error); 439 } 440 if (ioflag & IO_APPEND) { 441 np->n_attrstamp = 0; 442 error = VOP_GETATTR(vp, &vattr, cred, p); 443 if (error) 444 return (error); 445 uio->uio_offset = np->n_size; 446 } 447 } 448 if (uio->uio_offset < 0) 449 return (EINVAL); 450 if (uio->uio_resid == 0) 451 return (0); 452 /* 453 * Maybe this should be above the vnode op call, but so long as 454 * file servers have no limits, i don't think it matters 455 */ 456 if (p && uio->uio_offset + uio->uio_resid > 457 p->p_rlimit[RLIMIT_FSIZE].rlim_cur) { 458 psignal(p, SIGXFSZ); 459 return (EFBIG); 460 } 461 /* 462 * I use nm_rsize, not nm_wsize so that all buffer cache blocks 463 * will be the same size within a filesystem. nfs_writerpc will 464 * still use nm_wsize when sizing the rpc's. 465 */ 466 biosize = vp->v_mount->mnt_stat.f_iosize; 467 do { 468 469 /* 470 * XXX make sure we aren't cached in the VM page cache 471 */ 472 /* 473 * Check for a valid write lease. 474 */ 475 if ((nmp->nm_flag & NFSMNT_NQNFS) && 476 NQNFS_CKINVALID(vp, np, ND_WRITE)) { 477 do { 478 error = nqnfs_getlease(vp, ND_WRITE, cred, p); 479 } while (error == NQNFS_EXPIRED); 480 if (error) 481 return (error); 482 if (np->n_lrev != np->n_brev || 483 (np->n_flag & NQNFSNONCACHE)) { 484 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 485 if (error) 486 return (error); 487 np->n_brev = np->n_lrev; 488 } 489 } 490 if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) { 491 iomode = NFSV3WRITE_FILESYNC; 492 error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit); 493 if (must_commit) 494 nfs_clearcommit(vp->v_mount); 495 return (error); 496 } 497 nfsstats.biocache_writes++; 498 lbn = uio->uio_offset / biosize; 499 on = uio->uio_offset & (biosize-1); 500 n = min((unsigned)(biosize - on), uio->uio_resid); 501again: 502 if (uio->uio_offset + n > np->n_size) { 503 np->n_size = uio->uio_offset + n; 504 vnode_pager_setsize(vp, (u_long)np->n_size); 505 } 506 bufsize = biosize; 507 if ((lbn + 1) * biosize > np->n_size) { 508 bufsize = np->n_size - lbn * biosize; 509 bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); 510 } 511 bp = nfs_getcacheblk(vp, lbn, bufsize, p); 512 if (!bp) 513 return (EINTR); 514 if (bp->b_wcred == NOCRED) { 515 crhold(cred); 516 bp->b_wcred = cred; 517 } 518 np->n_flag |= NMODIFIED; 519 520 if ((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend > np->n_size) { 521 bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE); 522 } 523 524 /* 525 * If the new write will leave a contiguous dirty 526 * area, just update the b_dirtyoff and b_dirtyend, 527 * otherwise force a write rpc of the old dirty area. 528 */ 529 if (bp->b_dirtyend > 0 && 530 (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) { 531 bp->b_proc = p; 532 if (VOP_BWRITE(bp) == EINTR) 533 return (EINTR); 534 goto again; 535 } 536 537 /* 538 * Check for valid write lease and get one as required. 539 * In case getblk() and/or bwrite() delayed us. 540 */ 541 if ((nmp->nm_flag & NFSMNT_NQNFS) && 542 NQNFS_CKINVALID(vp, np, ND_WRITE)) { 543 do { 544 error = nqnfs_getlease(vp, ND_WRITE, cred, p); 545 } while (error == NQNFS_EXPIRED); 546 if (error) { 547 brelse(bp); 548 return (error); 549 } 550 if (np->n_lrev != np->n_brev || 551 (np->n_flag & NQNFSNONCACHE)) { 552 brelse(bp); 553 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 554 if (error) 555 return (error); 556 np->n_brev = np->n_lrev; 557 goto again; 558 } 559 } 560 error = uiomove((char *)bp->b_data + on, n, uio); 561 if (error) { 562 bp->b_flags |= B_ERROR; 563 brelse(bp); 564 return (error); 565 } 566 if (bp->b_dirtyend > 0) { 567 bp->b_dirtyoff = min(on, bp->b_dirtyoff); 568 bp->b_dirtyend = max((on + n), bp->b_dirtyend); 569 } else { 570 bp->b_dirtyoff = on; 571 bp->b_dirtyend = on + n; 572 } 573 if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff || 574 bp->b_validoff > bp->b_dirtyend) { 575 bp->b_validoff = bp->b_dirtyoff; 576 bp->b_validend = bp->b_dirtyend; 577 } else { 578 bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff); 579 bp->b_validend = max(bp->b_validend, bp->b_dirtyend); 580 } 581 /* 582 * If the lease is non-cachable or IO_SYNC do bwrite(). 583 */ 584 if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) { 585 bp->b_proc = p; 586 error = VOP_BWRITE(bp); 587 if (error) 588 return (error); 589 if (np->n_flag & NQNFSNONCACHE) { 590 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 591 if (error) 592 return (error); 593 } 594 } else if ((n + on) == biosize && 595 (nmp->nm_flag & NFSMNT_NQNFS) == 0) { 596 bp->b_proc = (struct proc *)0; 597 bp->b_flags |= B_ASYNC; 598 (void)nfs_writebp(bp, 0); 599 } else 600 bdwrite(bp); 601 } while (uio->uio_resid > 0 && n > 0); 602 return (0); 603} 604 605/* 606 * Get an nfs cache block. 607 * Allocate a new one if the block isn't currently in the cache 608 * and return the block marked busy. If the calling process is 609 * interrupted by a signal for an interruptible mount point, return 610 * NULL. 611 */ 612struct buf * 613nfs_getcacheblk(vp, bn, size, p) 614 struct vnode *vp; 615 daddr_t bn; 616 int size; 617 struct proc *p; 618{ 619 register struct buf *bp; 620 struct nfsmount *nmp = VFSTONFS(vp->v_mount); 621 int biosize = vp->v_mount->mnt_stat.f_iosize; 622 623 if (nmp->nm_flag & NFSMNT_INT) { 624 bp = getblk(vp, bn, size, PCATCH, 0); 625 while (bp == (struct buf *)0) { 626 if (nfs_sigintr(nmp, (struct nfsreq *)0, p)) 627 return ((struct buf *)0); 628 bp = getblk(vp, bn, size, 0, 2 * hz); 629 } 630 } else 631 bp = getblk(vp, bn, size, 0, 0); 632 633 if( vp->v_type == VREG) 634 bp->b_blkno = (bn * biosize) / DEV_BSIZE; 635 636 return (bp); 637} 638 639/* 640 * Flush and invalidate all dirty buffers. If another process is already 641 * doing the flush, just wait for completion. 642 */ 643int 644nfs_vinvalbuf(vp, flags, cred, p, intrflg) 645 struct vnode *vp; 646 int flags; 647 struct ucred *cred; 648 struct proc *p; 649 int intrflg; 650{ 651 register struct nfsnode *np = VTONFS(vp); 652 struct nfsmount *nmp = VFSTONFS(vp->v_mount); 653 int error = 0, slpflag, slptimeo; 654 655 if ((nmp->nm_flag & NFSMNT_INT) == 0) 656 intrflg = 0; 657 if (intrflg) { 658 slpflag = PCATCH; 659 slptimeo = 2 * hz; 660 } else { 661 slpflag = 0; 662 slptimeo = 0; 663 } 664 /* 665 * First wait for any other process doing a flush to complete. 666 */ 667 while (np->n_flag & NFLUSHINPROG) { 668 np->n_flag |= NFLUSHWANT; 669 error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval", 670 slptimeo); 671 if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) 672 return (EINTR); 673 } 674 675 /* 676 * Now, flush as required. 677 */ 678 np->n_flag |= NFLUSHINPROG; 679 error = vinvalbuf(vp, flags, cred, p, slpflag, 0); 680 while (error) { 681 if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) { 682 np->n_flag &= ~NFLUSHINPROG; 683 if (np->n_flag & NFLUSHWANT) { 684 np->n_flag &= ~NFLUSHWANT; 685 wakeup((caddr_t)&np->n_flag); 686 } 687 return (EINTR); 688 } 689 error = vinvalbuf(vp, flags, cred, p, 0, slptimeo); 690 } 691 np->n_flag &= ~(NMODIFIED | NFLUSHINPROG); 692 if (np->n_flag & NFLUSHWANT) { 693 np->n_flag &= ~NFLUSHWANT; 694 wakeup((caddr_t)&np->n_flag); 695 } 696 return (0); 697} 698 699/* 700 * Initiate asynchronous I/O. Return an error if no nfsiods are available. 701 * This is mainly to avoid queueing async I/O requests when the nfsiods 702 * are all hung on a dead server. 703 */ 704int 705nfs_asyncio(bp, cred) 706 register struct buf *bp; 707 struct ucred *cred; 708{ 709 register int i; 710 711 if (nfs_numasync == 0) 712 return (EIO); 713 for (i = 0; i < NFS_MAXASYNCDAEMON; i++) 714 if (nfs_iodwant[i]) { 715 if (bp->b_flags & B_READ) { 716 if (bp->b_rcred == NOCRED && cred != NOCRED) { 717 crhold(cred); 718 bp->b_rcred = cred; 719 } 720 } else { 721 bp->b_flags |= B_WRITEINPROG; 722 if (bp->b_wcred == NOCRED && cred != NOCRED) { 723 crhold(cred); 724 bp->b_wcred = cred; 725 } 726 } 727 728 TAILQ_INSERT_TAIL(&nfs_bufq, bp, b_freelist); 729 nfs_iodwant[i] = (struct proc *)0; 730 wakeup((caddr_t)&nfs_iodwant[i]); 731 return (0); 732 } 733 734 /* 735 * If it is a read or a write already marked B_WRITEINPROG or B_NOCACHE 736 * return EIO so the process will call nfs_doio() and do it 737 * synchronously. 738 */ 739 if (bp->b_flags & (B_READ | B_WRITEINPROG | B_NOCACHE)) 740 return (EIO); 741 742 /* 743 * Just turn the async write into a delayed write, instead of 744 * doing in synchronously. Hopefully, at least one of the nfsiods 745 * is currently doing a write for this file and will pick up the 746 * delayed writes before going back to sleep. 747 */ 748 bp->b_flags |= B_DELWRI; 749 reassignbuf(bp, bp->b_vp); 750 biodone(bp); 751 return (0); 752} 753 754/* 755 * Do an I/O operation to/from a cache block. This may be called 756 * synchronously or from an nfsiod. 757 */ 758int 759nfs_doio(bp, cr, p) 760 register struct buf *bp; 761 struct ucred *cr; 762 struct proc *p; 763{ 764 register struct uio *uiop; 765 register struct vnode *vp; 766 struct nfsnode *np; 767 struct nfsmount *nmp; 768 int error = 0, diff, len, iomode, must_commit = 0; 769 struct uio uio; 770 struct iovec io; 771 772 vp = bp->b_vp; 773 np = VTONFS(vp); 774 nmp = VFSTONFS(vp->v_mount); 775 uiop = &uio; 776 uiop->uio_iov = &io; 777 uiop->uio_iovcnt = 1; 778 uiop->uio_segflg = UIO_SYSSPACE; 779 uiop->uio_procp = p; 780 781 /* 782 * Historically, paging was done with physio, but no more. 783 */ 784 if (bp->b_flags & B_PHYS) { 785 /* 786 * ...though reading /dev/drum still gets us here. 787 */ 788 io.iov_len = uiop->uio_resid = bp->b_bcount; 789 /* mapping was done by vmapbuf() */ 790 io.iov_base = bp->b_data; 791 uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE; 792 if (bp->b_flags & B_READ) { 793 uiop->uio_rw = UIO_READ; 794 nfsstats.read_physios++; 795 error = nfs_readrpc(vp, uiop, cr); 796 } else { 797 int com; 798 799 iomode = NFSV3WRITE_DATASYNC; 800 uiop->uio_rw = UIO_WRITE; 801 nfsstats.write_physios++; 802 error = nfs_writerpc(vp, uiop, cr, &iomode, &com); 803 } 804 if (error) { 805 bp->b_flags |= B_ERROR; 806 bp->b_error = error; 807 } 808 } else if (bp->b_flags & B_READ) { 809 io.iov_len = uiop->uio_resid = bp->b_bcount; 810 io.iov_base = bp->b_data; 811 uiop->uio_rw = UIO_READ; 812 switch (vp->v_type) { 813 case VREG: 814 uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE; 815 nfsstats.read_bios++; 816 error = nfs_readrpc(vp, uiop, cr); 817 if (!error) { 818 bp->b_validoff = 0; 819 if (uiop->uio_resid) { 820 /* 821 * If len > 0, there is a hole in the file and 822 * no writes after the hole have been pushed to 823 * the server yet. 824 * Just zero fill the rest of the valid area. 825 */ 826 diff = bp->b_bcount - uiop->uio_resid; 827 len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE 828 + diff); 829 if (len > 0) { 830 len = min(len, uiop->uio_resid); 831 bzero((char *)bp->b_data + diff, len); 832 bp->b_validend = diff + len; 833 } else 834 bp->b_validend = diff; 835 } else 836 bp->b_validend = bp->b_bcount; 837 } 838 if (p && (vp->v_flag & VTEXT) && 839 (((nmp->nm_flag & NFSMNT_NQNFS) && 840 NQNFS_CKINVALID(vp, np, ND_READ) && 841 np->n_lrev != np->n_brev) || 842 (!(nmp->nm_flag & NFSMNT_NQNFS) && 843 np->n_mtime != np->n_vattr.va_mtime.ts_sec))) { 844 uprintf("Process killed due to text file modification\n"); 845 psignal(p, SIGKILL); 846#ifdef __NetBSD__ 847 p->p_holdcnt++; 848#else 849 p->p_flag |= P_NOSWAP; 850#endif 851 } 852 break; 853 case VLNK: 854 uiop->uio_offset = (off_t)0; 855 nfsstats.readlink_bios++; 856 error = nfs_readlinkrpc(vp, uiop, cr); 857 break; 858 case VDIR: 859 nfsstats.readdir_bios++; 860 uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ; 861 if (nmp->nm_flag & NFSMNT_RDIRPLUS) { 862 error = nfs_readdirplusrpc(vp, uiop, cr); 863 if (error == NFSERR_NOTSUPP) 864 nmp->nm_flag &= ~NFSMNT_RDIRPLUS; 865 } 866 if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0) 867 error = nfs_readdirrpc(vp, uiop, cr); 868 break; 869 default: 870 printf("nfs_doio: type %x unexpected\n",vp->v_type); 871 break; 872 }; 873 if (error) { 874 bp->b_flags |= B_ERROR; 875 bp->b_error = error; 876 } 877 } else { 878 if (((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend) > np->n_size) 879 bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE); 880 881 if (bp->b_dirtyend > bp->b_dirtyoff) { 882 io.iov_len = uiop->uio_resid = bp->b_dirtyend 883 - bp->b_dirtyoff; 884 uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE 885 + bp->b_dirtyoff; 886 io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; 887 uiop->uio_rw = UIO_WRITE; 888 nfsstats.write_bios++; 889 if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) == B_ASYNC) 890 iomode = NFSV3WRITE_UNSTABLE; 891 else 892 iomode = NFSV3WRITE_FILESYNC; 893 bp->b_flags |= B_WRITEINPROG; 894 error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit); 895 if (!error && iomode == NFSV3WRITE_UNSTABLE) 896 bp->b_flags |= B_NEEDCOMMIT; 897 else 898 bp->b_flags &= ~B_NEEDCOMMIT; 899 bp->b_flags &= ~B_WRITEINPROG; 900 901 /* 902 * For an interrupted write, the buffer is still valid 903 * and the write hasn't been pushed to the server yet, 904 * so we can't set B_ERROR and report the interruption 905 * by setting B_EINTR. For the B_ASYNC case, B_EINTR 906 * is not relevant, so the rpc attempt is essentially 907 * a noop. For the case of a V3 write rpc not being 908 * committed to stable storage, the block is still 909 * dirty and requires either a commit rpc or another 910 * write rpc with iomode == NFSV3WRITE_FILESYNC before 911 * the block is reused. This is indicated by setting 912 * the B_DELWRI and B_NEEDCOMMIT flags. 913 */ 914 if (error == EINTR 915 || (!error && (bp->b_flags & B_NEEDCOMMIT))) { 916 bp->b_flags &= ~(B_INVAL|B_NOCACHE); 917 bp->b_flags |= B_DELWRI; 918 919 /* 920 * Since for the B_ASYNC case, nfs_bwrite() has reassigned the 921 * buffer to the clean list, we have to reassign it back to the 922 * dirty one. Ugh. 923 */ 924 if (bp->b_flags & B_ASYNC) 925 reassignbuf(bp, vp); 926 else 927 bp->b_flags |= B_EINTR; 928 } else { 929 if (error) { 930 bp->b_flags |= B_ERROR; 931 bp->b_error = np->n_error = error; 932 np->n_flag |= NWRITEERR; 933 } 934 bp->b_dirtyoff = bp->b_dirtyend = 0; 935 } 936 } else { 937 bp->b_resid = 0; 938 biodone(bp); 939 return (0); 940 } 941 } 942 bp->b_resid = uiop->uio_resid; 943 if (must_commit) 944 nfs_clearcommit(vp->v_mount); 945 biodone(bp); 946 return (error); 947} 948