vfs_bio.c revision 12767
/*
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. This work was done expressly for inclusion into FreeBSD.  Other use
 *    is allowed if this notation is included.
 * 5. Modifications may be freely made to this file if the above conditions
 *    are met.
 *
 * $Id: vfs_bio.c,v 1.75 1995/12/07 12:47:02 davidg Exp $
 */

/*
 * this file contains a new buffer I/O scheme implementing a coherent
 * VM object and buffer cache scheme.  Pains have been taken to make
 * sure that the performance degradation associated with schemes such
 * as this is not realized.
 *
 * Author: John S. Dyson
 * Significant help during the development and debugging phases
 * was provided by David Greenman, also of the FreeBSD core team.
 */

#define VMIO
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_prot.h>
#include <vm/vm_kern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <sys/proc.h>

#include <miscfs/specfs/specdev.h>

static void vfs_update __P((void));
struct proc *updateproc;
static struct kproc_desc up_kp = {
	"update",
	vfs_update,
	&updateproc
};
SYSINIT_KT(update, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)

struct buf *buf;		/* buffer header pool */
struct swqueue bswlist;

int count_lock_queue __P((void));
void vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to);
void vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to);
void vfs_clean_pages(struct buf * bp);
static void vfs_setdirty(struct buf *bp);

int needsbuffer;

/*
 * Internal update daemon, process 3
 *	The variable vfs_update_wakeup allows for internal syncs.
 */
int vfs_update_wakeup;


/*
 * buffers base kva
 */
caddr_t buffers_kva;

/*
 * bogus page -- for I/O to/from partially complete buffers
 * this is a temporary solution to the problem, but it is not
 * really that bad.  it would be better to split the buffer
 * for input in the case of buffers partially already in memory,
 * but the code is intricate enough already.
 */
vm_page_t bogus_page;
vm_offset_t bogus_offset;

int bufspace, maxbufspace;

struct bufhashhdr bufhashtbl[BUFHSZ], invalhash;
struct bqueues bufqueues[BUFFER_QUEUES];

#define BUF_MAXUSE 8
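
/*
 * BUF_MAXUSE caps the b_usecount aging counter: getblk() bumps
 * b_usecount (up to BUF_MAXUSE) on each cache hit, and getnewbuf()
 * decrements it while scanning the LRU queue, so recently re-used
 * buffers get requeued several times before being reclaimed.
 */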
/*
 * Initialize buffer headers and related structures.
 */
void
bufinit()
{
	struct buf *bp;
	int i;

	TAILQ_INIT(&bswlist);
	LIST_INIT(&invalhash);

	/* first, make a null hash table */
	for (i = 0; i < BUFHSZ; i++)
		LIST_INIT(&bufhashtbl[i]);

	/* next, make a null set of free lists */
	for (i = 0; i < BUFFER_QUEUES; i++)
		TAILQ_INIT(&bufqueues[i]);

	buffers_kva = (caddr_t) kmem_alloc_pageable(buffer_map, MAXBSIZE * nbuf);
	/* finally, initialize each buffer header and stick on empty q */
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bzero(bp, sizeof *bp);
		bp->b_flags = B_INVAL;	/* we're just an empty header */
		bp->b_dev = NODEV;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_qindex = QUEUE_EMPTY;
		bp->b_vnbufs.le_next = NOLIST;
		bp->b_data = buffers_kva + i * MAXBSIZE;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	}
/*
 * maxbufspace is currently calculated to support all filesystem blocks
 * to be 8K.  If you happen to use a 16K filesystem, the size of the buffer
 * cache is still the same as it would be for 8K filesystems.  This
 * keeps the size of the buffer cache "in check" for big block filesystems.
 */
	maxbufspace = 2 * (nbuf + 8) * PAGE_SIZE;

	bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
	bogus_page = vm_page_alloc(kernel_object,
	    ((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
	    VM_ALLOC_NORMAL);

}

/*
 * remove the buffer from the appropriate free list
 */
void
bremfree(struct buf * bp)
{
	int s = splbio();

	if (bp->b_qindex != QUEUE_NONE) {
		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
		bp->b_qindex = QUEUE_NONE;
	} else {
		panic("bremfree: removing a buffer when not on a queue");
	}
	splx(s);
}

/*
 * Get a buffer with the specified data.  Look in the cache first.
 */
int
bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
    struct buf ** bpp)
{
	struct buf *bp;

	bp = getblk(vp, blkno, size, 0, 0);
	*bpp = bp;

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc != NULL)
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
		if (bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		vfs_busy_pages(bp, 0);
		VOP_STRATEGY(bp);
		return (biowait(bp));
	}
	return (0);
}
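
/*
 * A minimal sketch of the usual caller pattern (illustrative only;
 * lblkno and bsize here are hypothetical):
 *
 *	error = bread(vp, lblkno, bsize, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);		(a buffer is returned even on error)
 *		return (error);
 *	}
 *	...inspect or copy bp->b_data...
 *	brelse(bp);			(or bdwrite()/bawrite() if modified)
 */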
/*
 * Operates like bread, but also starts asynchronous I/O on
 * read-ahead blocks.
 */
int
breadn(struct vnode * vp, daddr_t blkno, int size,
    daddr_t * rablkno, int *rabsize,
    int cnt, struct ucred * cred, struct buf ** bpp)
{
	struct buf *bp, *rabp;
	int i;
	int rv = 0, readwait = 0;

	*bpp = bp = getblk(vp, blkno, size, 0, 0);

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc != NULL)
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
		if (bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		vfs_busy_pages(bp, 0);
		VOP_STRATEGY(bp);
		++readwait;
	}
	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
		if (inmem(vp, *rablkno))
			continue;
		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);

		if ((rabp->b_flags & B_CACHE) == 0) {
			if (curproc != NULL)
				curproc->p_stats->p_ru.ru_inblock++;
			rabp->b_flags |= B_READ | B_ASYNC;
			rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
			if (rabp->b_rcred == NOCRED) {
				if (cred != NOCRED)
					crhold(cred);
				rabp->b_rcred = cred;
			}
			vfs_busy_pages(rabp, 0);
			VOP_STRATEGY(rabp);
		} else {
			brelse(rabp);
		}
	}

	if (readwait) {
		rv = biowait(bp);
	}
	return (rv);
}

/*
 * Write, release buffer on completion.  (Done by iodone
 * if async.)
 */
int
bwrite(struct buf * bp)
{
	int oldflags = bp->b_flags;

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return (0);
	}
	if (!(bp->b_flags & B_BUSY))
		panic("bwrite: buffer is not busy???");

	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	bp->b_flags |= B_WRITEINPROG;

	if ((oldflags & (B_ASYNC|B_DELWRI)) == (B_ASYNC|B_DELWRI)) {
		reassignbuf(bp, bp->b_vp);
	}

	bp->b_vp->v_numoutput++;
	vfs_busy_pages(bp, 1);
	if (curproc != NULL)
		curproc->p_stats->p_ru.ru_oublock++;
	VOP_STRATEGY(bp);

	if ((oldflags & B_ASYNC) == 0) {
		int rtval = biowait(bp);

		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		}
		brelse(bp);
		return (rtval);
	}
	return (0);
}

int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return (bwrite(ap->a_bp));
}
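
/*
 * The three write interfaces differ only in when the data reaches the
 * disk and when the caller gets the buffer back: bwrite() starts the
 * I/O and (unless B_ASYNC was already set) sleeps in biowait() for the
 * result; bawrite() sets B_ASYNC and starts the I/O without waiting,
 * letting biodone() release the buffer; bdwrite() below does no I/O at
 * all, just marks the buffer B_DELWRI so the update daemon or a later
 * reclaim writes it out.
 */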
/*
 * Delayed write.  (Buffer is marked dirty).
 */
void
bdwrite(struct buf * bp)
{

	if ((bp->b_flags & B_BUSY) == 0) {
		panic("bdwrite: buffer is not busy");
	}
	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return;
	}
	if (bp->b_flags & B_TAPE) {
		bawrite(bp);
		return;
	}
	bp->b_flags &= ~(B_READ|B_RELBUF);
	if ((bp->b_flags & B_DELWRI) == 0) {
		bp->b_flags |= B_DONE | B_DELWRI;
		reassignbuf(bp, bp->b_vp);
	}

	/*
	 * This bmap keeps the system from needing to do the bmap later,
	 * perhaps when the system is attempting to do a sync.  Since it
	 * is likely that the indirect block -- or whatever other data
	 * structure the filesystem needs -- is still in memory now, it
	 * is a good thing to do this.  Note also that if the pageout
	 * daemon is requesting a sync, there might not be enough memory
	 * to do the bmap then...  So, this is important to do.
	 */
	if (bp->b_lblkno == bp->b_blkno) {
		VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
	}

	/*
	 * Set the *dirty* buffer range based upon the VM system dirty pages.
	 */
	vfs_setdirty(bp);

	/*
	 * We need to do this here to satisfy the vnode_pager and the
	 * pageout daemon, so that it thinks that the pages have been
	 * "cleaned".  Note that since the pages are in a delayed write
	 * buffer -- the VFS layer "will" see that the pages get written
	 * out on the next sync, or perhaps the cluster will be completed.
	 */
	vfs_clean_pages(bp);
	brelse(bp);
	return;
}

/*
 * Asynchronous write.
 * Start output on a buffer, but do not wait for it to complete.
 * The buffer is released when the output completes.
 */
void
bawrite(struct buf * bp)
{
	bp->b_flags |= B_ASYNC;
	(void) VOP_BWRITE(bp);
}
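
/*
 * brelse() files a buffer on one of the free queues according to the
 * state of its contents: QUEUE_EMPTY for headers with no memory,
 * QUEUE_AGE for buffers whose contents are junk or merely stale (these
 * are reclaimed first), QUEUE_LOCKED for B_LOCKED buffers, and
 * QUEUE_LRU for buffers with valid, probably reusable contents.
 */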
/*
 * Release a buffer.
 */
void
brelse(struct buf * bp)
{
	int s;

	if (bp->b_flags & B_CLUSTER) {
		relpbuf(bp);
		return;
	}
	/* anyone need a "free" block? */
	s = splbio();

	if (needsbuffer) {
		needsbuffer = 0;
		wakeup(&needsbuffer);
	}

	/* anyone need this block? */
	if (bp->b_flags & B_WANTED) {
		bp->b_flags &= ~(B_WANTED | B_AGE);
		wakeup(bp);
	}

	if (bp->b_flags & B_LOCKED)
		bp->b_flags &= ~B_ERROR;

	if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
	    (bp->b_bufsize <= 0)) {
		bp->b_flags |= B_INVAL;
		bp->b_flags &= ~(B_DELWRI | B_CACHE);
		if (((bp->b_flags & B_VMIO) == 0) && bp->b_vp)
			brelvp(bp);
	}

	/*
	 * VMIO buffer rundown.  It is not necessary to keep a VMIO buffer
	 * constituted, so the B_INVAL flag is used to *invalidate* the buffer,
	 * but the VM object is kept around.  The B_NOCACHE flag is used to
	 * invalidate the pages in the VM object.
	 */
	if (bp->b_flags & B_VMIO) {
		vm_ooffset_t foff;
		vm_object_t obj;
		int i, resid;
		vm_page_t m;
		struct vnode *vp;
		int iototal = bp->b_bufsize;

		vp = bp->b_vp;
		if (!vp)
			panic("brelse: missing vp");

		if (bp->b_npages) {
			vm_pindex_t poff;

			obj = (vm_object_t) vp->v_object;
			if (vp->v_type == VBLK)
				foff = ((vm_ooffset_t) bp->b_lblkno) << DEV_BSHIFT;
			else
				foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
			poff = OFF_TO_IDX(foff);
			for (i = 0; i < bp->b_npages; i++) {
				m = bp->b_pages[i];
				if (m == bogus_page) {
					m = vm_page_lookup(obj, poff + i);
					if (!m) {
						panic("brelse: page missing\n");
					}
					bp->b_pages[i] = m;
					pmap_qenter(trunc_page(bp->b_data),
					    bp->b_pages, bp->b_npages);
				}
				resid = IDX_TO_OFF(m->pindex+1) - foff;
				if (resid > iototal)
					resid = iototal;
				if (resid > 0) {
					/*
					 * Don't invalidate the page if the local machine has already
					 * modified it.  This is the lesser of two evils, and should
					 * be fixed.
					 */
					if (bp->b_flags & (B_NOCACHE | B_ERROR)) {
						vm_page_test_dirty(m);
						if (m->dirty == 0) {
							vm_page_set_invalid(m, (vm_offset_t) foff, resid);
							if (m->valid == 0)
								vm_page_protect(m, VM_PROT_NONE);
						}
					}
				}
				foff += resid;
				iototal -= resid;
			}
		}

		if (bp->b_flags & (B_INVAL | B_RELBUF)) {
			for (i = 0; i < bp->b_npages; i++) {
				m = bp->b_pages[i];
				--m->bmapped;
				if (m->bmapped == 0) {
					if (m->flags & PG_WANTED) {
						m->flags &= ~PG_WANTED;
						wakeup(m);
					}
					if ((m->busy == 0) && ((m->flags & PG_BUSY) == 0)) {
						if (m->object->flags & OBJ_MIGHTBEDIRTY) {
							vm_page_test_dirty(m);
						}
						/*
						 * if page isn't valid, no sense in keeping it around
						 */
						if (m->valid == 0) {
							vm_page_protect(m, VM_PROT_NONE);
							vm_page_free(m);
						/*
						 * if page isn't dirty and hasn't been referenced by
						 * a process, then cache it
						 */
						} else if ((m->dirty & m->valid) == 0 &&
						    (m->flags & PG_REFERENCED) == 0 &&
						    !pmap_is_referenced(VM_PAGE_TO_PHYS(m))) {
							vm_page_cache(m);
						/*
						 * otherwise activate it
						 */
						} else if ((m->flags & PG_ACTIVE) == 0) {
							vm_page_activate(m);
							m->act_count = 0;
						}
					}
				}
			}
			bufspace -= bp->b_bufsize;
			pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
			bp->b_npages = 0;
			bp->b_bufsize = 0;
			bp->b_flags &= ~B_VMIO;
			if (bp->b_vp)
				brelvp(bp);
		}
	}
	if (bp->b_qindex != QUEUE_NONE)
		panic("brelse: free buffer onto another queue???");

	/* enqueue */
	/* buffers with no memory */
	if (bp->b_bufsize == 0) {
		bp->b_qindex = QUEUE_EMPTY;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	/* buffers with junk contents */
	} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	/* buffers that are locked */
	} else if (bp->b_flags & B_LOCKED) {
		bp->b_qindex = QUEUE_LOCKED;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
	/* buffers with stale but valid contents */
	} else if (bp->b_flags & B_AGE) {
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
	/* buffers with valid and quite potentially reusable contents */
	} else {
		bp->b_qindex = QUEUE_LRU;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
	}

	/* unlock */
	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
	splx(s);
}

/*
 * Check to see if a block is currently memory resident.
 */
__inline struct buf *
gbincore(struct vnode * vp, daddr_t blkno)
{
	struct buf *bp;
	struct bufhashhdr *bh;

	bh = BUFHASH(vp, blkno);
	bp = bh->lh_first;

	/* Search hash chain */
	while (bp != NULL) {
		/* hit */
		if (bp->b_vp == vp && bp->b_lblkno == blkno) {
			break;
		}
		bp = bp->b_hash.le_next;
	}
	return (bp);
}
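
/*
 * Note the difference from incore() below: gbincore() does no splbio()
 * protection of its own and will return a buffer even if it is marked
 * B_INVAL, so callers are expected to run at splbio() and check the
 * flags themselves.
 */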
/*
 * this routine implements clustered async writes for
 * clearing out B_DELWRI buffers...  This is much better
 * than the old way of writing only one buffer at a time.
 */
int
vfs_bio_awrite(struct buf * bp)
{
	int i;
	daddr_t lblkno = bp->b_lblkno;
	struct vnode *vp = bp->b_vp;
	int s;
	int ncl;
	struct buf *bpa;
	int nwritten;

	s = splbio();
	if (/* (vp->v_type != VBLK) && */
	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
		int size;
		int maxcl;

		size = vp->v_mount->mnt_stat.f_iosize;
		maxcl = MAXPHYS / size;

		for (i = 1; i < maxcl; i++) {
			if ((bpa = gbincore(vp, lblkno + i)) &&
			    ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
			    (B_DELWRI | B_CLUSTEROK)) &&
			    (bpa->b_bufsize == size)) {
				if ((bpa->b_blkno == bpa->b_lblkno) ||
				    (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
					break;
			} else {
				break;
			}
		}
		ncl = i;
		/*
		 * this is a possible cluster write
		 */
		if (ncl != 1) {
			nwritten = cluster_wbuild(vp, size, lblkno, ncl);
			splx(s);
			return nwritten;
		}
	}
	bremfree(bp);
	splx(s);
	/*
	 * default (old) behavior, writing out only one block
	 */
	bp->b_flags |= B_BUSY | B_ASYNC;
	nwritten = bp->b_bufsize;
	(void) VOP_BWRITE(bp);
	return nwritten;
}
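
/*
 * For a sense of scale: with an 8K filesystem block size and a
 * (typical, architecture-dependent) MAXPHYS of 64K, maxcl is 8, so the
 * scan above can gather at most eight logically and physically
 * contiguous delayed-write buffers into one cluster_wbuild() call.
 */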
/*
 * Find a buffer header which is available for use.
 */
static struct buf *
getnewbuf(int slpflag, int slptimeo, int doingvmio)
{
	struct buf *bp;
	int s;
	int nbyteswritten = 0;

	s = splbio();
start:
	if (bufspace >= maxbufspace)
		goto trytofreespace;

	/* can we constitute a new buffer? */
	if ((bp = bufqueues[QUEUE_EMPTY].tqh_first)) {
		if (bp->b_qindex != QUEUE_EMPTY)
			panic("getnewbuf: inconsistent EMPTY queue");
		bremfree(bp);
		goto fillbuf;
	}
trytofreespace:
	/*
	 * We keep the file I/O from hogging metadata I/O
	 * This is desirable because file data is cached in the
	 * VM/Buffer cache even if a buffer is freed.
	 */
	if ((bp = bufqueues[QUEUE_AGE].tqh_first)) {
		if (bp->b_qindex != QUEUE_AGE)
			panic("getnewbuf: inconsistent AGE queue");
	} else if ((bp = bufqueues[QUEUE_LRU].tqh_first)) {
		if (bp->b_qindex != QUEUE_LRU)
			panic("getnewbuf: inconsistent LRU queue");
	}
	if (!bp) {
		/* wait for a free buffer of any kind */
		needsbuffer = 1;
		tsleep(&needsbuffer,
		    (PRIBIO + 1) | slpflag, "newbuf", slptimeo);
		splx(s);
		return (0);
	}

	if ((bp->b_qindex == QUEUE_LRU) && (bp->b_usecount > 0)) {
		--bp->b_usecount;
		TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist);
		if (bufqueues[QUEUE_LRU].tqh_first != NULL) {
			TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
			goto start;
		}
	}

	/* if we are a delayed write, convert to an async write */
	if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
		nbyteswritten += vfs_bio_awrite(bp);
		if (!slpflag && !slptimeo) {
			splx(s);
			return (0);
		}
		goto start;
	}

	if (bp->b_flags & B_WANTED) {
		bp->b_flags &= ~B_WANTED;
		wakeup(bp);
	}
	bremfree(bp);

	if (bp->b_flags & B_VMIO) {
		bp->b_flags |= B_RELBUF | B_BUSY | B_DONE;
		brelse(bp);
		bremfree(bp);
	}

	if (bp->b_vp)
		brelvp(bp);

	/* we are not free, nor do we contain interesting data */
	if (bp->b_rcred != NOCRED)
		crfree(bp->b_rcred);
	if (bp->b_wcred != NOCRED)
		crfree(bp->b_wcred);
fillbuf:
	bp->b_flags |= B_BUSY;
	LIST_REMOVE(bp, b_hash);
	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	splx(s);
	if (bp->b_bufsize) {
		allocbuf(bp, 0);
	}
	bp->b_flags = B_BUSY;
	bp->b_dev = NODEV;
	bp->b_vp = NULL;
	bp->b_blkno = bp->b_lblkno = 0;
	bp->b_iodone = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	bp->b_bcount = 0;
	bp->b_npages = 0;
	bp->b_wcred = bp->b_rcred = NOCRED;
	bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	bp->b_usecount = 2;
	if (bufspace >= maxbufspace + nbyteswritten) {
		s = splbio();
		bp->b_flags |= B_INVAL;
		brelse(bp);
		goto trytofreespace;
	}
	return (bp);
}
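
/*
 * getnewbuf() can return NULL instead of a buffer: when it had to
 * sleep waiting for one, or after it pushed out a delayed write (with
 * default sleep behavior).  Callers such as getblk() and geteblk()
 * are expected to loop until a buffer is actually returned.
 */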
/*
 * Check to see if a block is currently memory resident.
 */
struct buf *
incore(struct vnode * vp, daddr_t blkno)
{
	struct buf *bp;
	struct bufhashhdr *bh;

	int s = splbio();

	bh = BUFHASH(vp, blkno);
	bp = bh->lh_first;

	/* Search hash chain */
	while (bp != NULL) {
		/* hit */
		if (bp->b_vp == vp && bp->b_lblkno == blkno &&
		    (bp->b_flags & B_INVAL) == 0) {
			break;
		}
		bp = bp->b_hash.le_next;
	}
	splx(s);
	return (bp);
}

/*
 * Returns true if no I/O is needed to access the
 * associated VM object.  This is like incore except
 * it also hunts around in the VM system for the data.
 */

int
inmem(struct vnode * vp, daddr_t blkno)
{
	vm_object_t obj;
	vm_offset_t toff, tinc;
	vm_page_t m;
	vm_ooffset_t off;

	if (incore(vp, blkno))
		return 1;
	if (vp->v_mount == NULL)
		return 0;
	if ((vp->v_object == NULL) || (vp->v_flag & VVMIO) == 0)
		return 0;

	obj = vp->v_object;
	tinc = PAGE_SIZE;
	if (tinc > vp->v_mount->mnt_stat.f_iosize)
		tinc = vp->v_mount->mnt_stat.f_iosize;
	off = blkno * vp->v_mount->mnt_stat.f_iosize;

	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {

		m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
		if (!m)
			return 0;
		if (vm_page_is_valid(m, (vm_offset_t) (toff + off), tinc) == 0)
			return 0;
	}
	return 1;
}

/*
 * now we set the dirty range for the buffer --
 * for NFS -- if the file is mapped and pages have
 * been written to, let it know.  We want the
 * entire range of the buffer to be marked dirty if
 * any of the pages have been written to for consistency
 * with the b_validoff, b_validend set in the nfs write
 * code, and used by the nfs read code.
 */
static void
vfs_setdirty(struct buf *bp)
{
	int i;
	vm_object_t object;
	vm_offset_t boffset, offset;

	/*
	 * We qualify the scan for modified pages on whether the
	 * object has been flushed yet.  The OBJ_WRITEABLE flag
	 * is not cleared simply by protecting pages off.
	 */
	if ((bp->b_flags & B_VMIO) &&
	    ((object = bp->b_pages[0]->object)->flags & (OBJ_WRITEABLE|OBJ_CLEANING))) {
		/*
		 * test the pages to see if they have been modified directly
		 * by users through the VM system.
		 */
		for (i = 0; i < bp->b_npages; i++)
			vm_page_test_dirty(bp->b_pages[i]);

		/*
		 * scan forwards for the first page modified
		 */
		for (i = 0; i < bp->b_npages; i++) {
			if (bp->b_pages[i]->dirty) {
				break;
			}
		}
		boffset = (i << PAGE_SHIFT);
		if (boffset < bp->b_dirtyoff) {
			bp->b_dirtyoff = boffset;
		}

		/*
		 * scan backwards for the last page modified
		 */
		for (i = bp->b_npages - 1; i >= 0; --i) {
			if (bp->b_pages[i]->dirty) {
				break;
			}
		}
		boffset = (i + 1);
		offset = boffset + bp->b_pages[0]->pindex;
		if (offset >= object->size)
			boffset = object->size - bp->b_pages[0]->pindex;
		if (bp->b_dirtyend < (boffset << PAGE_SHIFT))
			bp->b_dirtyend = (boffset << PAGE_SHIFT);
	}
}
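
/*
 * The resulting dirty range is page-granular.  For example (assuming
 * 4K pages), if only the second page of an 8K buffer was dirtied
 * through a mapping, the scans above yield b_dirtyoff <= 4096 and
 * b_dirtyend >= 8192.
 */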
/*
 * Get a block given a specified block and offset into a file/device.
 */
struct buf *
getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
{
	struct buf *bp;
	int s;
	struct bufhashhdr *bh;

	s = splbio();
loop:
	if ((bp = gbincore(vp, blkno))) {
		if (bp->b_flags & (B_BUSY|B_INVAL)) {
			bp->b_flags |= B_WANTED;
			if (bp->b_usecount < BUF_MAXUSE)
				++bp->b_usecount;
			if (!tsleep(bp,
			    (PRIBIO + 1) | slpflag, "getblk", slptimeo))
				goto loop;

			splx(s);
			return (struct buf *) NULL;
		}
		bp->b_flags |= B_BUSY | B_CACHE;
		bremfree(bp);

		/*
		 * check for size inconsistencies (note that they shouldn't
		 * happen, but do when filesystems don't handle the size
		 * changes correctly.)  We are conservative on metadata and
		 * don't just extend the buffer but write and re-constitute it.
		 */

		if (bp->b_bcount != size) {
			if (bp->b_flags & B_VMIO) {
				allocbuf(bp, size);
			} else {
				bp->b_flags |= B_NOCACHE;
				VOP_BWRITE(bp);
				goto loop;
			}
		}

		/*
		 * make sure that all pages in the buffer are valid, if they
		 * aren't, clear the cache flag.
		 * ASSUMPTION:
		 *  if the buffer is greater than 1 page in size, it is assumed
		 *  that the buffer address starts on a page boundary...
		 */
		if (bp->b_flags & B_VMIO) {
			int szleft, i;

			szleft = size;
			for (i = 0; i < bp->b_npages; i++) {
				if (szleft > PAGE_SIZE) {
					if ((bp->b_pages[i]->valid & VM_PAGE_BITS_ALL) !=
					    VM_PAGE_BITS_ALL) {
						bp->b_flags &= ~(B_CACHE|B_DONE);
						break;
					}
					szleft -= PAGE_SIZE;
				} else {
					if (!vm_page_is_valid(bp->b_pages[i],
					    (((vm_offset_t) bp->b_data) & PAGE_MASK),
					    szleft)) {
						bp->b_flags &= ~(B_CACHE|B_DONE);
						break;
					}
					szleft = 0;
				}
			}
		}
		if (bp->b_usecount < BUF_MAXUSE)
			++bp->b_usecount;
		splx(s);
		return (bp);
	} else {
		vm_object_t obj;
		int doingvmio;

		if ((obj = vp->v_object) && (vp->v_flag & VVMIO)) {
			doingvmio = 1;
		} else {
			doingvmio = 0;
		}
		if ((bp = getnewbuf(slpflag, slptimeo, doingvmio)) == 0) {
			if (slpflag || slptimeo) {
				splx(s);
				return NULL;
			}
			goto loop;
		}

		/*
		 * This code is used to make sure that a buffer is not
		 * created while the getnewbuf routine is blocked.
		 * Normally the vnode is locked so this isn't a problem.
		 * VBLK type I/O requests, however, don't lock the vnode.
		 */
		if (!VOP_ISLOCKED(vp) && gbincore(vp, blkno)) {
			bp->b_flags |= B_INVAL;
			brelse(bp);
			goto loop;
		}

		/*
		 * Insert the buffer into the hash, so that it can
		 * be found by incore.
		 */
		bp->b_blkno = bp->b_lblkno = blkno;
		bgetvp(vp, bp);
		LIST_REMOVE(bp, b_hash);
		bh = BUFHASH(vp, blkno);
		LIST_INSERT_HEAD(bh, bp, b_hash);

		if (doingvmio) {
			bp->b_flags |= (B_VMIO | B_CACHE);
#if defined(VFS_BIO_DEBUG)
			if (vp->v_type != VREG)
				printf("getblk: vmioing file type %d???\n", vp->v_type);
#endif
		} else {
			bp->b_flags &= ~B_VMIO;
		}
		splx(s);

		allocbuf(bp, size);
		return (bp);
	}
}

/*
 * Get an empty, disassociated buffer of given size.
 */
struct buf *
geteblk(int size)
{
	struct buf *bp;

	while ((bp = getnewbuf(0, 0, 0)) == 0);
	allocbuf(bp, size);
	bp->b_flags |= B_INVAL;
	return (bp);
}
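
/*
 * Since geteblk() marks its buffer B_INVAL, brelse() will send it to
 * QUEUE_AGE rather than caching its contents; this suits the transient
 * scratch buffers (not tied to any vnode) that geteblk() is meant for.
 */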
/*
 * This code constitutes the buffer memory from either anonymous system
 * memory (in the case of non-VMIO operations) or from an associated
 * VM object (in the case of VMIO operations).
 *
 * Note that this code is tricky, and has many complications to resolve
 * deadlock or inconsistent data situations.  Tread lightly!!!
 *
 * Modify the length of a buffer's underlying buffer storage without
 * destroying information (unless, of course, the buffer is shrinking).
 */
int
allocbuf(struct buf * bp, int size)
{

	int s;
	int newbsize, mbsize;
	int i;

	if (!(bp->b_flags & B_BUSY))
		panic("allocbuf: buffer not busy");

	if ((bp->b_flags & B_VMIO) == 0) {
		/*
		 * Just get anonymous memory from the kernel
		 */
		mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		newbsize = round_page(size);

		if (newbsize < bp->b_bufsize) {
			vm_hold_free_pages(
			    bp,
			    (vm_offset_t) bp->b_data + newbsize,
			    (vm_offset_t) bp->b_data + bp->b_bufsize);
		} else if (newbsize > bp->b_bufsize) {
			vm_hold_load_pages(
			    bp,
			    (vm_offset_t) bp->b_data + bp->b_bufsize,
			    (vm_offset_t) bp->b_data + newbsize);
		}
	} else {
		vm_page_t m;
		int desiredpages;

		newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		desiredpages = (round_page(newbsize) >> PAGE_SHIFT);

		if (newbsize < bp->b_bufsize) {
			if (desiredpages < bp->b_npages) {
				pmap_qremove((vm_offset_t) trunc_page(bp->b_data) +
				    (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
				for (i = desiredpages; i < bp->b_npages; i++) {
					m = bp->b_pages[i];
					s = splhigh();
					while ((m->flags & PG_BUSY) || (m->busy != 0)) {
						m->flags |= PG_WANTED;
						tsleep(m, PVM, "biodep", 0);
					}
					splx(s);

					if (m->bmapped == 0) {
						printf("allocbuf: bmapped is zero for page %d\n", i);
						panic("allocbuf: error");
					}
					--m->bmapped;
					if (m->bmapped == 0) {
						vm_page_protect(m, VM_PROT_NONE);
						vm_page_free(m);
					}
					bp->b_pages[i] = NULL;
				}
				bp->b_npages = desiredpages;
			}
		} else if (newbsize > bp->b_bufsize) {
			vm_object_t obj;
			vm_offset_t tinc, toff;
			vm_ooffset_t off;
			vm_pindex_t objoff;
			int pageindex, curbpnpages;
			struct vnode *vp;
			int bsize;

			vp = bp->b_vp;

			if (vp->v_type == VBLK)
				bsize = DEV_BSIZE;
			else
				bsize = vp->v_mount->mnt_stat.f_iosize;

			if (bp->b_npages < desiredpages) {
				obj = vp->v_object;
				tinc = PAGE_SIZE;
				if (tinc > bsize)
					tinc = bsize;
				off = (vm_ooffset_t) bp->b_lblkno * bsize;
	doretry:
				curbpnpages = bp->b_npages;
				bp->b_flags |= B_CACHE;
				for (toff = 0; toff < newbsize; toff += tinc) {
					int bytesinpage;

					pageindex = toff >> PAGE_SHIFT;
					objoff = OFF_TO_IDX(off + toff);
					if (pageindex < curbpnpages) {

						m = bp->b_pages[pageindex];
						if (m->pindex != objoff)
							panic("allocbuf: page changed offset??!!!?");
						bytesinpage = tinc;
						if (tinc > (newbsize - toff))
							bytesinpage = newbsize - toff;
						if (!vm_page_is_valid(m,
						    (vm_offset_t) ((toff + off) & (PAGE_SIZE - 1)),
						    bytesinpage)) {
							bp->b_flags &= ~B_CACHE;
						}
						if ((m->flags & PG_ACTIVE) == 0) {
							vm_page_activate(m);
							m->act_count = 0;
						}
						continue;
					}
					m = vm_page_lookup(obj, objoff);
					if (!m) {
						m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
						if (!m) {
							int j;

							for (j = bp->b_npages; j < pageindex; j++) {
								PAGE_WAKEUP(bp->b_pages[j]);
							}
							VM_WAIT;
							goto doretry;
						}
						vm_page_activate(m);
						m->act_count = 0;
						m->valid = 0;
						bp->b_flags &= ~B_CACHE;
					} else if (m->flags & PG_BUSY) {
						int j;

						for (j = bp->b_npages; j < pageindex; j++) {
							PAGE_WAKEUP(bp->b_pages[j]);
						}

						s = splbio();
						m->flags |= PG_WANTED;
						tsleep(m, PVM, "pgtblk", 0);
						splx(s);

						goto doretry;
					} else {
						if ((curproc != pageproc) &&
						    (m->flags & PG_CACHE) &&
						    (cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) {
							pagedaemon_wakeup();
						}
						bytesinpage = tinc;
						if (tinc > (newbsize - toff))
							bytesinpage = newbsize - toff;
						if (!vm_page_is_valid(m,
						    (vm_offset_t) ((toff + off) & (PAGE_SIZE - 1)),
						    bytesinpage)) {
							bp->b_flags &= ~B_CACHE;
						}
						if ((m->flags & PG_ACTIVE) == 0) {
							vm_page_activate(m);
							m->act_count = 0;
						}
						m->flags |= PG_BUSY;
					}
					bp->b_pages[pageindex] = m;
					curbpnpages = pageindex + 1;
				}
				for (i = bp->b_npages; i < curbpnpages; i++) {
					m = bp->b_pages[i];
					m->bmapped++;
					PAGE_WAKEUP(m);
				}
				bp->b_npages = curbpnpages;
				bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
				pmap_qenter((vm_offset_t) bp->b_data, bp->b_pages, bp->b_npages);
				bp->b_data += off & (PAGE_SIZE - 1);
			}
		}
	}
	bufspace += (newbsize - bp->b_bufsize);
	bp->b_bufsize = newbsize;
	bp->b_bcount = size;
	return 1;
}
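
/*
 * The size arithmetic above, worked through with hypothetical numbers
 * (DEV_BSIZE of 512 and 4K pages): a request of 3000 bytes rounds up
 * to a newbsize of 3072 in the VMIO case, needing one page, while the
 * non-VMIO case rounds the allocation to a full page (4096) but still
 * sets b_bcount to the requested 3000.
 */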
/*
 * Wait for buffer I/O completion, returning error status.
 */
int
biowait(register struct buf * bp)
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		tsleep(bp, PRIBIO, "biowait", 0);
	splx(s);
	if (bp->b_flags & B_EINTR) {
		bp->b_flags &= ~B_EINTR;
		return (EINTR);
	}
	if (bp->b_flags & B_ERROR) {
		return (bp->b_error ? bp->b_error : EIO);
	} else {
		return (0);
	}
}
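
/*
 * The error mapping above: B_EINTR yields EINTR (and is cleared here),
 * B_ERROR yields b_error if the driver set one and EIO otherwise, and
 * anything else is success.
 */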
/*
 * Finish I/O on a buffer, calling an optional function.
 * This is usually called from interrupt level, so process blocking
 * is not *a good idea*.
 */
void
biodone(register struct buf * bp)
{
	int s;

	s = splbio();
	if (!(bp->b_flags & B_BUSY))
		panic("biodone: buffer not busy");

	if (bp->b_flags & B_DONE) {
		splx(s);
		printf("biodone: buffer already done\n");
		return;
	}
	bp->b_flags |= B_DONE;

	if ((bp->b_flags & B_READ) == 0) {
		vwakeup(bp);
	}
#ifdef BOUNCE_BUFFERS
	if (bp->b_flags & B_BOUNCE)
		vm_bounce_free(bp);
#endif

	/* call optional completion function if requested */
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone) (bp);
		splx(s);
		return;
	}
	if (bp->b_flags & B_VMIO) {
		int i, resid;
		vm_ooffset_t foff;
		vm_page_t m;
		vm_object_t obj;
		int iosize;
		struct vnode *vp = bp->b_vp;

		if (vp->v_type == VBLK)
			foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
		else
			foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
		obj = vp->v_object;
		if (!obj) {
			panic("biodone: no object");
		}
#if defined(VFS_BIO_DEBUG)
		if (obj->paging_in_progress < bp->b_npages) {
			printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
			    obj->paging_in_progress, bp->b_npages);
		}
#endif
		iosize = bp->b_bufsize;
		for (i = 0; i < bp->b_npages; i++) {
			int bogusflag = 0;

			m = bp->b_pages[i];
			if (m == bogus_page) {
				bogusflag = 1;
				m = vm_page_lookup(obj, OFF_TO_IDX(foff));
				if (!m) {
#if defined(VFS_BIO_DEBUG)
					printf("biodone: page disappeared\n");
#endif
					--obj->paging_in_progress;
					continue;
				}
				bp->b_pages[i] = m;
				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
			}
#if defined(VFS_BIO_DEBUG)
			if (OFF_TO_IDX(foff) != m->pindex) {
				printf("biodone: foff(%d)/m->pindex(%d) mismatch\n", foff, m->pindex);
			}
#endif
			resid = IDX_TO_OFF(m->pindex + 1) - foff;
			if (resid > iosize)
				resid = iosize;
			/*
			 * In the write case, the valid and clean bits are
			 * already changed correctly, so we only need to do this
			 * here in the read case.
			 */
			if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
				vm_page_set_validclean(m,
				    (vm_offset_t) (foff & (PAGE_SIZE-1)), resid);
			}

			/*
			 * when debugging new filesystems or buffer I/O methods, this
			 * is the most common error that pops up.  if you see this, you
			 * have not set the page busy flag correctly!!!
			 */
			if (m->busy == 0) {
				printf("biodone: page busy < 0, "
				    "pindex: %d, foff: 0x(%x,%x), "
				    "resid: %d, index: %d\n",
				    (int) m->pindex, (int)(foff >> 32),
				    (int) foff & 0xffffffff, resid, i);
				if (vp->v_type != VBLK)
					printf(" iosize: %d, lblkno: %d, flags: 0x%lx, npages: %d\n",
					    bp->b_vp->v_mount->mnt_stat.f_iosize,
					    (int) bp->b_lblkno,
					    bp->b_flags, bp->b_npages);
				else
					printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n",
					    (int) bp->b_lblkno,
					    bp->b_flags, bp->b_npages);
				printf(" valid: 0x%x, dirty: 0x%x, mapped: %d\n",
				    m->valid, m->dirty, m->bmapped);
				panic("biodone: page busy < 0\n");
			}
			--m->busy;
			if ((m->busy == 0) && (m->flags & PG_WANTED)) {
				m->flags &= ~PG_WANTED;
				wakeup(m);
			}
			--obj->paging_in_progress;
			foff += resid;
			iosize -= resid;
		}
		if (obj && obj->paging_in_progress == 0 &&
		    (obj->flags & OBJ_PIPWNT)) {
			obj->flags &= ~OBJ_PIPWNT;
			wakeup(obj);
		}
	}
	/*
	 * For asynchronous completions, release the buffer now. The brelse
	 * checks for B_WANTED and will do the wakeup there if necessary - so
	 * no need to do a wakeup here in the async case.
	 */

	if (bp->b_flags & B_ASYNC) {
		brelse(bp);
	} else {
		bp->b_flags &= ~B_WANTED;
		wakeup(bp);
	}
	splx(s);
}
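
/*
 * The B_CALL hook is how consumers get per-buffer completion callbacks.
 * A hypothetical sketch of the setup a driver or pager would do before
 * handing the buffer to VOP_STRATEGY (my_done is illustrative, not a
 * real routine):
 *
 *	bp->b_flags |= B_CALL;
 *	bp->b_iodone = my_done;
 *
 * biodone() then clears B_CALL, invokes my_done(bp), and returns
 * immediately, so the callback takes over responsibility for
 * releasing the buffer.
 */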
int
count_lock_queue()
{
	int count;
	struct buf *bp;

	count = 0;
	for (bp = bufqueues[QUEUE_LOCKED].tqh_first;
	    bp != NULL;
	    bp = bp->b_freelist.tqe_next)
		count++;
	return (count);
}

int vfs_update_interval = 30;

static void
vfs_update()
{
	(void) spl0();		/* XXX redundant?  wrong place? */
	while (1) {
		tsleep(&vfs_update_wakeup, PUSER, "update",
		    hz * vfs_update_interval);
		vfs_update_wakeup = 0;
		sync(curproc, NULL, NULL);
	}
}

static int
sysctl_kern_updateinterval SYSCTL_HANDLER_ARGS
{
	int error = sysctl_handle_int(oidp,
	    oidp->oid_arg1, oidp->oid_arg2, req);

	if (!error)
		wakeup(&vfs_update_wakeup);
	return error;
}

SYSCTL_PROC(_kern, KERN_UPDATEINTERVAL, update, CTLTYPE_INT|CTLFLAG_RW,
    &vfs_update_interval, 0, sysctl_kern_updateinterval, "I", "");


/*
 * This routine is called in lieu of iodone in the case of
 * incomplete I/O.  This keeps the busy status for pages
 * consistent.
 */
void
vfs_unbusy_pages(struct buf * bp)
{
	int i;

	if (bp->b_flags & B_VMIO) {
		struct vnode *vp = bp->b_vp;
		vm_object_t obj = vp->v_object;
		vm_ooffset_t foff;

		foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;

		for (i = 0; i < bp->b_npages; i++) {
			vm_page_t m = bp->b_pages[i];

			if (m == bogus_page) {
				m = vm_page_lookup(obj, OFF_TO_IDX(foff) + i);
				if (!m) {
					panic("vfs_unbusy_pages: page missing\n");
				}
				bp->b_pages[i] = m;
				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
			}
			--obj->paging_in_progress;
			--m->busy;
			if ((m->busy == 0) && (m->flags & PG_WANTED)) {
				m->flags &= ~PG_WANTED;
				wakeup(m);
			}
		}
		if (obj->paging_in_progress == 0 &&
		    (obj->flags & OBJ_PIPWNT)) {
			obj->flags &= ~OBJ_PIPWNT;
			wakeup(obj);
		}
	}
}
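
/*
 * vfs_busy_pages() below and vfs_unbusy_pages() above form a pair
 * around a strategy call: the former marks the pages busy and bumps
 * paging_in_progress before the I/O is handed to the driver, and
 * either biodone() (on completion) or vfs_unbusy_pages() (when the
 * I/O is abandoned) reverses that accounting.
 */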
/*
 * This routine is called before a device strategy routine.
 * It is used to tell the VM system that paging I/O is in
 * progress, and treat the pages associated with the buffer
 * almost as being PG_BUSY.  Also the object paging_in_progress
 * flag is handled to make sure that the object doesn't become
 * inconsistent.
 */
void
vfs_busy_pages(struct buf * bp, int clear_modify)
{
	int i;

	if (bp->b_flags & B_VMIO) {
		vm_object_t obj = bp->b_vp->v_object;
		vm_ooffset_t foff;
		int iocount = bp->b_bufsize;

		if (bp->b_vp->v_type == VBLK)
			foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
		else
			foff = (vm_ooffset_t) bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
		vfs_setdirty(bp);
		for (i = 0; i < bp->b_npages; i++) {
			vm_page_t m = bp->b_pages[i];
			int resid = IDX_TO_OFF(m->pindex + 1) - foff;

			if (resid > iocount)
				resid = iocount;
			if ((bp->b_flags & B_CLUSTER) == 0) {
				obj->paging_in_progress++;
				m->busy++;
			}
			if (clear_modify) {
				vm_page_protect(m, VM_PROT_READ);
				vm_page_set_validclean(m,
				    (vm_offset_t) (foff & (PAGE_SIZE-1)), resid);
			} else if (bp->b_bcount >= PAGE_SIZE) {
				if (m->valid && (bp->b_flags & B_CACHE) == 0) {
					bp->b_pages[i] = bogus_page;
					pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
				}
			}
			foff += resid;
			iocount -= resid;
		}
	}
}

/*
 * Tell the VM system that the pages associated with this buffer
 * are clean.  This is used for delayed writes where the data is
 * going to go to disk eventually without additional VM intervention.
 */
void
vfs_clean_pages(struct buf * bp)
{
	int i;

	if (bp->b_flags & B_VMIO) {
		vm_ooffset_t foff;
		int iocount = bp->b_bufsize;

		if (bp->b_vp->v_type == VBLK)
			foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
		else
			foff = (vm_ooffset_t) bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;

		for (i = 0; i < bp->b_npages; i++) {
			vm_page_t m = bp->b_pages[i];
			int resid = IDX_TO_OFF(m->pindex + 1) - foff;

			if (resid > iocount)
				resid = iocount;
			if (resid > 0) {
				vm_page_set_validclean(m,
				    ((vm_offset_t) foff & (PAGE_SIZE-1)), resid);
			}
			foff += resid;
			iocount -= resid;
		}
	}
}

void
vfs_bio_clrbuf(struct buf *bp)
{
	int i;

	if (bp->b_flags & B_VMIO) {
		if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE)) {
			int mask;

			mask = 0;
			for (i = 0; i < bp->b_bufsize; i += DEV_BSIZE)
				mask |= (1 << (i / DEV_BSIZE));
			if (bp->b_pages[0]->valid != mask) {
				bzero(bp->b_data, bp->b_bufsize);
			}
			bp->b_pages[0]->valid = mask;
			bp->b_resid = 0;
			return;
		}
		for (i = 0; i < bp->b_npages; i++) {
			if (bp->b_pages[i]->valid == VM_PAGE_BITS_ALL)
				continue;
			if (bp->b_pages[i]->valid == 0) {
				if ((bp->b_pages[i]->flags & PG_ZERO) == 0)
					bzero(bp->b_data + (i << PAGE_SHIFT), PAGE_SIZE);
			} else {
				int j;

				for (j = 0; j < PAGE_SIZE / DEV_BSIZE; j++) {
					if ((bp->b_pages[i]->valid & (1 << j)) == 0)
						bzero(bp->b_data + (i << PAGE_SHIFT) + j * DEV_BSIZE, DEV_BSIZE);
				}
			}
			bp->b_pages[i]->valid = VM_PAGE_BITS_ALL;
		}
		bp->b_resid = 0;
	} else {
		clrbuf(bp);
	}
}
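
/*
 * The valid bits are one per DEV_BSIZE chunk of the page.  For example
 * (assuming DEV_BSIZE of 512), a one-page buffer with b_bufsize of 1024
 * builds mask = 0x3 above: the first two 512-byte chunks are valid,
 * the rest of the page is not.
 */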
/*
 * vm_hold_load_pages and vm_hold_free_pages get pages into and out of
 * a buffer's address space.  The pages are anonymous and are
 * not associated with a file object.
 */
void
vm_hold_load_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
{
	vm_offset_t pg;
	vm_page_t p;
	vm_offset_t from = round_page(froma);
	vm_offset_t to = round_page(toa);

	for (pg = from; pg < to; pg += PAGE_SIZE) {

tryagain:

		p = vm_page_alloc(kernel_object, ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
		    VM_ALLOC_NORMAL);
		if (!p) {
			VM_WAIT;
			goto tryagain;
		}
		vm_page_wire(p);
		pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
		bp->b_pages[((caddr_t) pg - bp->b_data) >> PAGE_SHIFT] = p;
		PAGE_WAKEUP(p);
		bp->b_npages++;
	}
}

void
vm_hold_free_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
{
	vm_offset_t pg;
	vm_page_t p;
	vm_offset_t from = round_page(froma);
	vm_offset_t to = round_page(toa);

	for (pg = from; pg < to; pg += PAGE_SIZE) {
		int index = ((caddr_t) pg - bp->b_data) >> PAGE_SHIFT;

		p = bp->b_pages[index];
		bp->b_pages[index] = 0;
		pmap_kremove(pg);
		vm_page_free(p);
		--bp->b_npages;
	}
}