nfs_bio.c revision 18397
1/* 2 * Copyright (c) 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * Rick Macklem at The University of Guelph. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. All advertising materials mentioning features or use of this software 17 * must display the following acknowledgement: 18 * This product includes software developed by the University of 19 * California, Berkeley and its contributors. 20 * 4. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 * 36 * @(#)nfs_bio.c 8.5 (Berkeley) 1/4/94 37 * $Id: nfs_bio.c,v 1.24 1996/07/16 10:19:43 dfr Exp $ 38 */ 39 40#include <sys/param.h> 41#include <sys/systm.h> 42#include <sys/resourcevar.h> 43#include <sys/signalvar.h> 44#include <sys/proc.h> 45#include <sys/buf.h> 46#include <sys/vnode.h> 47#include <sys/mount.h> 48#include <sys/kernel.h> 49 50#include <vm/vm.h> 51#include <vm/vm_param.h> 52#include <vm/vm_extern.h> 53 54#include <nfs/rpcv2.h> 55#include <nfs/nfsproto.h> 56#include <nfs/nfs.h> 57#include <nfs/nfsmount.h> 58#include <nfs/nqnfs.h> 59#include <nfs/nfsnode.h> 60 61static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size, 62 struct proc *p)); 63 64extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON]; 65extern int nfs_numasync; 66extern struct nfsstats nfsstats; 67 68/* 69 * Ifdefs for FreeBSD-current's merged VM/buffer cache. It is unfortunate 70 * that this isn't done inside getblk() and brelse() so these calls 71 * wouldn't need to be here. 72 */ 73#ifdef B_VMIO 74#define vnode_pager_uncache(vp) 75#else 76#define vfs_busy_pages(bp, f) 77#define vfs_unbusy_pages(bp) 78#define vfs_dirty_pages(bp) 79#endif 80 81/* 82 * Vnode op for read using bio 83 * Any similarity to readip() is purely coincidental 84 */ 85int 86nfs_bioread(vp, uio, ioflag, cred) 87 register struct vnode *vp; 88 register struct uio *uio; 89 int ioflag; 90 struct ucred *cred; 91{ 92 register struct nfsnode *np = VTONFS(vp); 93 register int biosize, diff, i; 94 struct buf *bp = 0, *rabp; 95 struct vattr vattr; 96 struct proc *p; 97 struct nfsmount *nmp = VFSTONFS(vp->v_mount); 98 daddr_t lbn, rabn; 99 int bufsize; 100 int nra, error = 0, n = 0, on = 0, not_readin; 101 102#ifdef DIAGNOSTIC 103 if (uio->uio_rw != UIO_READ) 104 panic("nfs_read mode"); 105#endif 106 if (uio->uio_resid == 0) 107 return (0); 108 if (uio->uio_offset < 0) 109 return (EINVAL); 110 p = uio->uio_procp; 111 if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3) 112 (void)nfs_fsinfo(nmp, vp, cred, p); 113 biosize = vp->v_mount->mnt_stat.f_iosize; 114 /* 115 * For nfs, cache consistency can only be maintained approximately. 116 * Although RFC1094 does not specify the criteria, the following is 117 * believed to be compatible with the reference port. 118 * For nqnfs, full cache consistency is maintained within the loop. 119 * For nfs: 120 * If the file's modify time on the server has changed since the 121 * last read rpc or you have written to the file, 122 * you may have lost data cache consistency with the 123 * server, so flush all of the file's data out of the cache. 124 * Then force a getattr rpc to ensure that you have up to date 125 * attributes. 126 * NB: This implies that cache data can be read when up to 127 * NFS_ATTRTIMEO seconds out of date. If you find that you need current 128 * attributes this could be forced by setting n_attrstamp to 0 before 129 * the VOP_GETATTR() call. 130 */ 131 if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) { 132 if (np->n_flag & NMODIFIED) { 133 if (vp->v_type != VREG) { 134 if (vp->v_type != VDIR) 135 panic("nfs: bioread, not dir"); 136 nfs_invaldir(vp); 137 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 138 if (error) 139 return (error); 140 } 141 np->n_attrstamp = 0; 142 error = VOP_GETATTR(vp, &vattr, cred, p); 143 if (error) 144 return (error); 145 np->n_mtime = vattr.va_mtime.tv_sec; 146 } else { 147 error = VOP_GETATTR(vp, &vattr, cred, p); 148 if (error) 149 return (error); 150 if (np->n_mtime != vattr.va_mtime.tv_sec) { 151 if (vp->v_type == VDIR) 152 nfs_invaldir(vp); 153 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 154 if (error) 155 return (error); 156 np->n_mtime = vattr.va_mtime.tv_sec; 157 } 158 } 159 } 160 do { 161 162 /* 163 * Get a valid lease. If cached data is stale, flush it. 164 */ 165 if (nmp->nm_flag & NFSMNT_NQNFS) { 166 if (NQNFS_CKINVALID(vp, np, ND_READ)) { 167 do { 168 error = nqnfs_getlease(vp, ND_READ, cred, p); 169 } while (error == NQNFS_EXPIRED); 170 if (error) 171 return (error); 172 if (np->n_lrev != np->n_brev || 173 (np->n_flag & NQNFSNONCACHE) || 174 ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) { 175 if (vp->v_type == VDIR) 176 nfs_invaldir(vp); 177 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 178 if (error) 179 return (error); 180 np->n_brev = np->n_lrev; 181 } 182 } else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) { 183 nfs_invaldir(vp); 184 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 185 if (error) 186 return (error); 187 } 188 } 189 if (np->n_flag & NQNFSNONCACHE) { 190 switch (vp->v_type) { 191 case VREG: 192 return (nfs_readrpc(vp, uio, cred)); 193 case VLNK: 194 return (nfs_readlinkrpc(vp, uio, cred)); 195 case VDIR: 196 break; 197 default: 198 printf(" NQNFSNONCACHE: type %x unexpected\n", 199 vp->v_type); 200 }; 201 } 202 switch (vp->v_type) { 203 case VREG: 204 nfsstats.biocache_reads++; 205 lbn = uio->uio_offset / biosize; 206 on = uio->uio_offset & (biosize - 1); 207 not_readin = 1; 208 209 /* 210 * Start the read ahead(s), as required. 211 */ 212 if (nfs_numasync > 0 && nmp->nm_readahead > 0) { 213 for (nra = 0; nra < nmp->nm_readahead && 214 (off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) { 215 rabn = lbn + 1 + nra; 216 if (!incore(vp, rabn)) { 217 rabp = nfs_getcacheblk(vp, rabn, biosize, p); 218 if (!rabp) 219 return (EINTR); 220 if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { 221 rabp->b_flags |= (B_READ | B_ASYNC); 222 vfs_busy_pages(rabp, 0); 223 if (nfs_asyncio(rabp, cred)) { 224 rabp->b_flags |= B_INVAL|B_ERROR; 225 vfs_unbusy_pages(rabp); 226 brelse(rabp); 227 } 228 } else { 229 brelse(rabp); 230 } 231 } 232 } 233 } 234 235 /* 236 * If the block is in the cache and has the required data 237 * in a valid region, just copy it out. 238 * Otherwise, get the block and write back/read in, 239 * as required. 240 */ 241again: 242 bufsize = biosize; 243 if ((off_t)(lbn + 1) * biosize > np->n_size && 244 (off_t)(lbn + 1) * biosize - np->n_size < biosize) { 245 bufsize = np->n_size - lbn * biosize; 246 bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); 247 } 248 bp = nfs_getcacheblk(vp, lbn, bufsize, p); 249 if (!bp) 250 return (EINTR); 251 if ((bp->b_flags & B_CACHE) == 0) { 252 bp->b_flags |= B_READ; 253 bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL); 254 not_readin = 0; 255 vfs_busy_pages(bp, 0); 256 error = nfs_doio(bp, cred, p); 257 if (error) { 258 brelse(bp); 259 return (error); 260 } 261 } 262 if (bufsize > on) { 263 n = min((unsigned)(bufsize - on), uio->uio_resid); 264 } else { 265 n = 0; 266 } 267 diff = np->n_size - uio->uio_offset; 268 if (diff < n) 269 n = diff; 270 if (not_readin && n > 0) { 271 if (on < bp->b_validoff || (on + n) > bp->b_validend) { 272 bp->b_flags |= B_NOCACHE; 273 if (bp->b_dirtyend > 0) { 274 if ((bp->b_flags & B_DELWRI) == 0) 275 panic("nfsbioread"); 276 if (VOP_BWRITE(bp) == EINTR) 277 return (EINTR); 278 } else 279 brelse(bp); 280 goto again; 281 } 282 } 283 vp->v_lastr = lbn; 284 diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on); 285 if (diff < n) 286 n = diff; 287 break; 288 case VLNK: 289 nfsstats.biocache_readlinks++; 290 bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p); 291 if (!bp) 292 return (EINTR); 293 if ((bp->b_flags & B_CACHE) == 0) { 294 bp->b_flags |= B_READ; 295 vfs_busy_pages(bp, 0); 296 error = nfs_doio(bp, cred, p); 297 if (error) { 298 bp->b_flags |= B_ERROR; 299 brelse(bp); 300 return (error); 301 } 302 } 303 n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid); 304 on = 0; 305 break; 306 case VDIR: 307 nfsstats.biocache_readdirs++; 308 lbn = uio->uio_offset / NFS_DIRBLKSIZ; 309 on = uio->uio_offset & (NFS_DIRBLKSIZ - 1); 310 bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p); 311 if (!bp) 312 return (EINTR); 313 if ((bp->b_flags & B_CACHE) == 0) { 314 bp->b_flags |= B_READ; 315 vfs_busy_pages(bp, 0); 316 error = nfs_doio(bp, cred, p); 317 if (error) { 318 brelse(bp); 319 while (error == NFSERR_BAD_COOKIE) { 320 nfs_invaldir(vp); 321 error = nfs_vinvalbuf(vp, 0, cred, p, 1); 322 /* 323 * Yuck! The directory has been modified on the 324 * server. The only way to get the block is by 325 * reading from the beginning to get all the 326 * offset cookies. 327 */ 328 for (i = 0; i <= lbn && !error; i++) { 329 bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p); 330 if (!bp) 331 return (EINTR); 332 if ((bp->b_flags & B_DONE) == 0) { 333 bp->b_flags |= B_READ; 334 vfs_busy_pages(bp, 0); 335 error = nfs_doio(bp, cred, p); 336 if (error) 337 brelse(bp); 338 } 339 } 340 } 341 if (error) 342 return (error); 343 } 344 } 345 346 /* 347 * If not eof and read aheads are enabled, start one. 348 * (You need the current block first, so that you have the 349 * directory offset cookie of the next block.) 350 */ 351 if (nfs_numasync > 0 && nmp->nm_readahead > 0 && 352 (np->n_direofoffset == 0 || 353 (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) && 354 !(np->n_flag & NQNFSNONCACHE) && 355 !incore(vp, lbn + 1)) { 356 rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p); 357 if (rabp) { 358 if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { 359 rabp->b_flags |= (B_READ | B_ASYNC); 360 vfs_busy_pages(rabp, 0); 361 if (nfs_asyncio(rabp, cred)) { 362 rabp->b_flags |= B_INVAL|B_ERROR; 363 vfs_unbusy_pages(rabp); 364 brelse(rabp); 365 } 366 } else { 367 brelse(rabp); 368 } 369 } 370 } 371 n = min(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on); 372 break; 373 default: 374 printf(" nfs_bioread: type %x unexpected\n",vp->v_type); 375 break; 376 }; 377 378 if (n > 0) { 379 error = uiomove(bp->b_data + on, (int)n, uio); 380 } 381 switch (vp->v_type) { 382 case VREG: 383 break; 384 case VLNK: 385 n = 0; 386 break; 387 case VDIR: 388 if (np->n_flag & NQNFSNONCACHE) 389 bp->b_flags |= B_INVAL; 390 break; 391 default: 392 printf(" nfs_bioread: type %x unexpected\n",vp->v_type); 393 } 394 brelse(bp); 395 } while (error == 0 && uio->uio_resid > 0 && n > 0); 396 return (error); 397} 398 399/* 400 * Vnode op for write using bio 401 */ 402int 403nfs_write(ap) 404 struct vop_write_args /* { 405 struct vnode *a_vp; 406 struct uio *a_uio; 407 int a_ioflag; 408 struct ucred *a_cred; 409 } */ *ap; 410{ 411 register int biosize; 412 register struct uio *uio = ap->a_uio; 413 struct proc *p = uio->uio_procp; 414 register struct vnode *vp = ap->a_vp; 415 struct nfsnode *np = VTONFS(vp); 416 register struct ucred *cred = ap->a_cred; 417 int ioflag = ap->a_ioflag; 418 struct buf *bp; 419 struct vattr vattr; 420 struct nfsmount *nmp = VFSTONFS(vp->v_mount); 421 daddr_t lbn; 422 int bufsize; 423 int n, on, error = 0, iomode, must_commit; 424 425#ifdef DIAGNOSTIC 426 if (uio->uio_rw != UIO_WRITE) 427 panic("nfs_write mode"); 428 if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc) 429 panic("nfs_write proc"); 430#endif 431 if (vp->v_type != VREG) 432 return (EIO); 433 if (np->n_flag & NWRITEERR) { 434 np->n_flag &= ~NWRITEERR; 435 return (np->n_error); 436 } 437 if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3) 438 (void)nfs_fsinfo(nmp, vp, cred, p); 439 if (ioflag & (IO_APPEND | IO_SYNC)) { 440 if (np->n_flag & NMODIFIED) { 441 np->n_attrstamp = 0; 442 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 443 if (error) 444 return (error); 445 } 446 if (ioflag & IO_APPEND) { 447 np->n_attrstamp = 0; 448 error = VOP_GETATTR(vp, &vattr, cred, p); 449 if (error) 450 return (error); 451 uio->uio_offset = np->n_size; 452 } 453 } 454 if (uio->uio_offset < 0) 455 return (EINVAL); 456 if (uio->uio_resid == 0) 457 return (0); 458 /* 459 * Maybe this should be above the vnode op call, but so long as 460 * file servers have no limits, i don't think it matters 461 */ 462 if (p && uio->uio_offset + uio->uio_resid > 463 p->p_rlimit[RLIMIT_FSIZE].rlim_cur) { 464 psignal(p, SIGXFSZ); 465 return (EFBIG); 466 } 467 /* 468 * I use nm_rsize, not nm_wsize so that all buffer cache blocks 469 * will be the same size within a filesystem. nfs_writerpc will 470 * still use nm_wsize when sizing the rpc's. 471 */ 472 biosize = vp->v_mount->mnt_stat.f_iosize; 473 do { 474 475 /* 476 * XXX make sure we aren't cached in the VM page cache 477 */ 478 /* 479 * Check for a valid write lease. 480 */ 481 if ((nmp->nm_flag & NFSMNT_NQNFS) && 482 NQNFS_CKINVALID(vp, np, ND_WRITE)) { 483 do { 484 error = nqnfs_getlease(vp, ND_WRITE, cred, p); 485 } while (error == NQNFS_EXPIRED); 486 if (error) 487 return (error); 488 if (np->n_lrev != np->n_brev || 489 (np->n_flag & NQNFSNONCACHE)) { 490 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 491 if (error) 492 return (error); 493 np->n_brev = np->n_lrev; 494 } 495 } 496 if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) { 497 iomode = NFSV3WRITE_FILESYNC; 498 error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit); 499 if (must_commit) 500 nfs_clearcommit(vp->v_mount); 501 return (error); 502 } 503 nfsstats.biocache_writes++; 504 lbn = uio->uio_offset / biosize; 505 on = uio->uio_offset & (biosize-1); 506 n = min((unsigned)(biosize - on), uio->uio_resid); 507again: 508 if (uio->uio_offset + n > np->n_size) { 509 np->n_size = uio->uio_offset + n; 510 vnode_pager_setsize(vp, (u_long)np->n_size); 511 } 512 bufsize = biosize; 513 if ((lbn + 1) * biosize > np->n_size) { 514 bufsize = np->n_size - lbn * biosize; 515 bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); 516 } 517 bp = nfs_getcacheblk(vp, lbn, bufsize, p); 518 if (!bp) 519 return (EINTR); 520 if (bp->b_wcred == NOCRED) { 521 crhold(cred); 522 bp->b_wcred = cred; 523 } 524 np->n_flag |= NMODIFIED; 525 526 if ((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend > np->n_size) { 527 bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE); 528 } 529 530 /* 531 * If the new write will leave a contiguous dirty 532 * area, just update the b_dirtyoff and b_dirtyend, 533 * otherwise force a write rpc of the old dirty area. 534 */ 535 if (bp->b_dirtyend > 0 && 536 (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) { 537 bp->b_proc = p; 538 if (VOP_BWRITE(bp) == EINTR) 539 return (EINTR); 540 goto again; 541 } 542 543 /* 544 * Check for valid write lease and get one as required. 545 * In case getblk() and/or bwrite() delayed us. 546 */ 547 if ((nmp->nm_flag & NFSMNT_NQNFS) && 548 NQNFS_CKINVALID(vp, np, ND_WRITE)) { 549 do { 550 error = nqnfs_getlease(vp, ND_WRITE, cred, p); 551 } while (error == NQNFS_EXPIRED); 552 if (error) { 553 brelse(bp); 554 return (error); 555 } 556 if (np->n_lrev != np->n_brev || 557 (np->n_flag & NQNFSNONCACHE)) { 558 brelse(bp); 559 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 560 if (error) 561 return (error); 562 np->n_brev = np->n_lrev; 563 goto again; 564 } 565 } 566 error = uiomove((char *)bp->b_data + on, n, uio); 567 if (error) { 568 bp->b_flags |= B_ERROR; 569 brelse(bp); 570 return (error); 571 } 572 if (bp->b_dirtyend > 0) { 573 bp->b_dirtyoff = min(on, bp->b_dirtyoff); 574 bp->b_dirtyend = max((on + n), bp->b_dirtyend); 575 } else { 576 bp->b_dirtyoff = on; 577 bp->b_dirtyend = on + n; 578 } 579 if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff || 580 bp->b_validoff > bp->b_dirtyend) { 581 bp->b_validoff = bp->b_dirtyoff; 582 bp->b_validend = bp->b_dirtyend; 583 } else { 584 bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff); 585 bp->b_validend = max(bp->b_validend, bp->b_dirtyend); 586 } 587 588 /* 589 * Since this block is being modified, it must be written 590 * again and not just committed. 591 */ 592 bp->b_flags &= ~B_NEEDCOMMIT; 593 594 /* 595 * If the lease is non-cachable or IO_SYNC do bwrite(). 596 */ 597 if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) { 598 bp->b_proc = p; 599 error = VOP_BWRITE(bp); 600 if (error) 601 return (error); 602 if (np->n_flag & NQNFSNONCACHE) { 603 error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); 604 if (error) 605 return (error); 606 } 607 } else if ((n + on) == biosize && 608 (nmp->nm_flag & NFSMNT_NQNFS) == 0) { 609 bp->b_proc = (struct proc *)0; 610 bp->b_flags |= B_ASYNC; 611 (void)nfs_writebp(bp, 0); 612 } else 613 bdwrite(bp); 614 } while (uio->uio_resid > 0 && n > 0); 615 return (0); 616} 617 618/* 619 * Get an nfs cache block. 620 * Allocate a new one if the block isn't currently in the cache 621 * and return the block marked busy. If the calling process is 622 * interrupted by a signal for an interruptible mount point, return 623 * NULL. 624 */ 625static struct buf * 626nfs_getcacheblk(vp, bn, size, p) 627 struct vnode *vp; 628 daddr_t bn; 629 int size; 630 struct proc *p; 631{ 632 register struct buf *bp; 633 struct nfsmount *nmp = VFSTONFS(vp->v_mount); 634 int biosize = vp->v_mount->mnt_stat.f_iosize; 635 636 if (nmp->nm_flag & NFSMNT_INT) { 637 bp = getblk(vp, bn, size, PCATCH, 0); 638 while (bp == (struct buf *)0) { 639 if (nfs_sigintr(nmp, (struct nfsreq *)0, p)) 640 return ((struct buf *)0); 641 bp = getblk(vp, bn, size, 0, 2 * hz); 642 } 643 } else 644 bp = getblk(vp, bn, size, 0, 0); 645 646 if( vp->v_type == VREG) 647 bp->b_blkno = (bn * biosize) / DEV_BSIZE; 648 649 return (bp); 650} 651 652/* 653 * Flush and invalidate all dirty buffers. If another process is already 654 * doing the flush, just wait for completion. 655 */ 656int 657nfs_vinvalbuf(vp, flags, cred, p, intrflg) 658 struct vnode *vp; 659 int flags; 660 struct ucred *cred; 661 struct proc *p; 662 int intrflg; 663{ 664 register struct nfsnode *np = VTONFS(vp); 665 struct nfsmount *nmp = VFSTONFS(vp->v_mount); 666 int error = 0, slpflag, slptimeo; 667 668 if ((nmp->nm_flag & NFSMNT_INT) == 0) 669 intrflg = 0; 670 if (intrflg) { 671 slpflag = PCATCH; 672 slptimeo = 2 * hz; 673 } else { 674 slpflag = 0; 675 slptimeo = 0; 676 } 677 /* 678 * First wait for any other process doing a flush to complete. 679 */ 680 while (np->n_flag & NFLUSHINPROG) { 681 np->n_flag |= NFLUSHWANT; 682 error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval", 683 slptimeo); 684 if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) 685 return (EINTR); 686 } 687 688 /* 689 * Now, flush as required. 690 */ 691 np->n_flag |= NFLUSHINPROG; 692 error = vinvalbuf(vp, flags, cred, p, slpflag, 0); 693 while (error) { 694 if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) { 695 np->n_flag &= ~NFLUSHINPROG; 696 if (np->n_flag & NFLUSHWANT) { 697 np->n_flag &= ~NFLUSHWANT; 698 wakeup((caddr_t)&np->n_flag); 699 } 700 return (EINTR); 701 } 702 error = vinvalbuf(vp, flags, cred, p, 0, slptimeo); 703 } 704 np->n_flag &= ~(NMODIFIED | NFLUSHINPROG); 705 if (np->n_flag & NFLUSHWANT) { 706 np->n_flag &= ~NFLUSHWANT; 707 wakeup((caddr_t)&np->n_flag); 708 } 709 return (0); 710} 711 712/* 713 * Initiate asynchronous I/O. Return an error if no nfsiods are available. 714 * This is mainly to avoid queueing async I/O requests when the nfsiods 715 * are all hung on a dead server. 716 */ 717int 718nfs_asyncio(bp, cred) 719 register struct buf *bp; 720 struct ucred *cred; 721{ 722 register int i; 723 724 if (nfs_numasync == 0) 725 return (EIO); 726 for (i = 0; i < NFS_MAXASYNCDAEMON; i++) 727 if (nfs_iodwant[i]) { 728 if (bp->b_flags & B_READ) { 729 if (bp->b_rcred == NOCRED && cred != NOCRED) { 730 crhold(cred); 731 bp->b_rcred = cred; 732 } 733 } else { 734 bp->b_flags |= B_WRITEINPROG; 735 if (bp->b_wcred == NOCRED && cred != NOCRED) { 736 crhold(cred); 737 bp->b_wcred = cred; 738 } 739 } 740 741 TAILQ_INSERT_TAIL(&nfs_bufq, bp, b_freelist); 742 nfs_iodwant[i] = (struct proc *)0; 743 wakeup((caddr_t)&nfs_iodwant[i]); 744 return (0); 745 } 746 747 /* 748 * If it is a read or a write already marked B_WRITEINPROG or B_NOCACHE 749 * return EIO so the process will call nfs_doio() and do it 750 * synchronously. 751 */ 752 if (bp->b_flags & (B_READ | B_WRITEINPROG | B_NOCACHE)) 753 return (EIO); 754 755 /* 756 * Just turn the async write into a delayed write, instead of 757 * doing in synchronously. Hopefully, at least one of the nfsiods 758 * is currently doing a write for this file and will pick up the 759 * delayed writes before going back to sleep. 760 */ 761 bp->b_flags |= B_DELWRI; 762 reassignbuf(bp, bp->b_vp); 763 biodone(bp); 764 return (0); 765} 766 767/* 768 * Do an I/O operation to/from a cache block. This may be called 769 * synchronously or from an nfsiod. 770 */ 771int 772nfs_doio(bp, cr, p) 773 register struct buf *bp; 774 struct ucred *cr; 775 struct proc *p; 776{ 777 register struct uio *uiop; 778 register struct vnode *vp; 779 struct nfsnode *np; 780 struct nfsmount *nmp; 781 int error = 0, diff, len, iomode, must_commit = 0; 782 struct uio uio; 783 struct iovec io; 784 785 vp = bp->b_vp; 786 np = VTONFS(vp); 787 nmp = VFSTONFS(vp->v_mount); 788 uiop = &uio; 789 uiop->uio_iov = &io; 790 uiop->uio_iovcnt = 1; 791 uiop->uio_segflg = UIO_SYSSPACE; 792 uiop->uio_procp = p; 793 794 /* 795 * Historically, paging was done with physio, but no more. 796 */ 797 if (bp->b_flags & B_PHYS) { 798 /* 799 * ...though reading /dev/drum still gets us here. 800 */ 801 io.iov_len = uiop->uio_resid = bp->b_bcount; 802 /* mapping was done by vmapbuf() */ 803 io.iov_base = bp->b_data; 804 uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE; 805 if (bp->b_flags & B_READ) { 806 uiop->uio_rw = UIO_READ; 807 nfsstats.read_physios++; 808 error = nfs_readrpc(vp, uiop, cr); 809 } else { 810 int com; 811 812 iomode = NFSV3WRITE_DATASYNC; 813 uiop->uio_rw = UIO_WRITE; 814 nfsstats.write_physios++; 815 error = nfs_writerpc(vp, uiop, cr, &iomode, &com); 816 } 817 if (error) { 818 bp->b_flags |= B_ERROR; 819 bp->b_error = error; 820 } 821 } else if (bp->b_flags & B_READ) { 822 io.iov_len = uiop->uio_resid = bp->b_bcount; 823 io.iov_base = bp->b_data; 824 uiop->uio_rw = UIO_READ; 825 switch (vp->v_type) { 826 case VREG: 827 uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE; 828 nfsstats.read_bios++; 829 error = nfs_readrpc(vp, uiop, cr); 830 if (!error) { 831 bp->b_validoff = 0; 832 if (uiop->uio_resid) { 833 /* 834 * If len > 0, there is a hole in the file and 835 * no writes after the hole have been pushed to 836 * the server yet. 837 * Just zero fill the rest of the valid area. 838 */ 839 diff = bp->b_bcount - uiop->uio_resid; 840 len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE 841 + diff); 842 if (len > 0) { 843 len = min(len, uiop->uio_resid); 844 bzero((char *)bp->b_data + diff, len); 845 bp->b_validend = diff + len; 846 } else 847 bp->b_validend = diff; 848 } else 849 bp->b_validend = bp->b_bcount; 850 } 851 if (p && (vp->v_flag & VTEXT) && 852 (((nmp->nm_flag & NFSMNT_NQNFS) && 853 NQNFS_CKINVALID(vp, np, ND_READ) && 854 np->n_lrev != np->n_brev) || 855 (!(nmp->nm_flag & NFSMNT_NQNFS) && 856 np->n_mtime != np->n_vattr.va_mtime.tv_sec))) { 857 uprintf("Process killed due to text file modification\n"); 858 psignal(p, SIGKILL); 859#ifdef __NetBSD__ 860 p->p_holdcnt++; 861#else 862 p->p_flag |= P_NOSWAP; 863#endif 864 } 865 break; 866 case VLNK: 867 uiop->uio_offset = (off_t)0; 868 nfsstats.readlink_bios++; 869 error = nfs_readlinkrpc(vp, uiop, cr); 870 break; 871 case VDIR: 872 nfsstats.readdir_bios++; 873 uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ; 874 if (nmp->nm_flag & NFSMNT_RDIRPLUS) { 875 error = nfs_readdirplusrpc(vp, uiop, cr); 876 if (error == NFSERR_NOTSUPP) 877 nmp->nm_flag &= ~NFSMNT_RDIRPLUS; 878 } 879 if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0) 880 error = nfs_readdirrpc(vp, uiop, cr); 881 break; 882 default: 883 printf("nfs_doio: type %x unexpected\n",vp->v_type); 884 break; 885 }; 886 if (error) { 887 bp->b_flags |= B_ERROR; 888 bp->b_error = error; 889 } 890 } else { 891 if (((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend) > np->n_size) 892 bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE); 893 894 if (bp->b_dirtyend > bp->b_dirtyoff) { 895 io.iov_len = uiop->uio_resid = bp->b_dirtyend 896 - bp->b_dirtyoff; 897 uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE 898 + bp->b_dirtyoff; 899 io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; 900 uiop->uio_rw = UIO_WRITE; 901 nfsstats.write_bios++; 902 if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) == B_ASYNC) 903 iomode = NFSV3WRITE_UNSTABLE; 904 else 905 iomode = NFSV3WRITE_FILESYNC; 906 bp->b_flags |= B_WRITEINPROG; 907 error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit); 908 if (!error && iomode == NFSV3WRITE_UNSTABLE) 909 bp->b_flags |= B_NEEDCOMMIT; 910 else 911 bp->b_flags &= ~B_NEEDCOMMIT; 912 bp->b_flags &= ~B_WRITEINPROG; 913 914 /* 915 * For an interrupted write, the buffer is still valid 916 * and the write hasn't been pushed to the server yet, 917 * so we can't set B_ERROR and report the interruption 918 * by setting B_EINTR. For the B_ASYNC case, B_EINTR 919 * is not relevant, so the rpc attempt is essentially 920 * a noop. For the case of a V3 write rpc not being 921 * committed to stable storage, the block is still 922 * dirty and requires either a commit rpc or another 923 * write rpc with iomode == NFSV3WRITE_FILESYNC before 924 * the block is reused. This is indicated by setting 925 * the B_DELWRI and B_NEEDCOMMIT flags. 926 */ 927 if (error == EINTR 928 || (!error && (bp->b_flags & B_NEEDCOMMIT))) { 929 bp->b_flags &= ~(B_INVAL|B_NOCACHE); 930 bp->b_flags |= B_DELWRI; 931 932 /* 933 * Since for the B_ASYNC case, nfs_bwrite() has reassigned the 934 * buffer to the clean list, we have to reassign it back to the 935 * dirty one. Ugh. 936 */ 937 if (bp->b_flags & B_ASYNC) 938 reassignbuf(bp, vp); 939 else 940 bp->b_flags |= B_EINTR; 941 } else { 942 if (error) { 943 bp->b_flags |= B_ERROR; 944 bp->b_error = np->n_error = error; 945 np->n_flag |= NWRITEERR; 946 } 947 bp->b_dirtyoff = bp->b_dirtyend = 0; 948 } 949 } else { 950 bp->b_resid = 0; 951 biodone(bp); 952 return (0); 953 } 954 } 955 bp->b_resid = uiop->uio_resid; 956 if (must_commit) 957 nfs_clearcommit(vp->v_mount); 958 biodone(bp); 959 return (error); 960} 961