vfs_bio.c revision 3374
/*
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 *
 * $Id: vfs_bio.c,v 1.13 1994/10/04 03:10:47 davidg Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <vm/vm.h>
#include <vm/vm_pageout.h>

#include <miscfs/specfs/specdev.h>

struct buf *buf;		/* buffer header pool */
int nbuf;			/* number of buffer headers calculated elsewhere */
struct swqueue bswlist;
struct buf *bclnlist;		/* Head of cleaned page list. */

extern vm_map_t buffer_map, io_map;

void vm_hold_free_pages(vm_offset_t from, vm_offset_t to);
void vm_hold_load_pages(vm_offset_t from, vm_offset_t to);

int needsbuffer;

/*
 * Internal update daemon, process 3
 *	The variable vfs_update_wakeup allows for internal syncs.
 */
int vfs_update_wakeup;

/*
 * Initialize buffer headers and related structures.
 */
void
bufinit()
{
	struct buf *bp;
	int i;
	caddr_t baddr;

	TAILQ_INIT(&bswlist);
	LIST_INIT(&invalhash);

	/* first, make a null hash table */
	for (i = 0; i < BUFHSZ; i++)
		LIST_INIT(&bufhashtbl[i]);

	/* next, make a null set of free lists */
	for (i = 0; i < BUFFER_QUEUES; i++)
		TAILQ_INIT(&bufqueues[i]);

	baddr = (caddr_t) kmem_alloc_pageable(buffer_map, MAXBSIZE * nbuf);
	/* finally, initialize each buffer header and stick on empty q */
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bzero(bp, sizeof *bp);
		bp->b_flags = B_INVAL;	/* we're just an empty header */
		bp->b_dev = NODEV;
		bp->b_vp = NULL;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_qindex = QUEUE_EMPTY;
		bp->b_vnbufs.le_next = NOLIST;
		bp->b_data = baddr + i * MAXBSIZE;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	}
}

/*
 * Remove the buffer from the appropriate free list.
 */
void
bremfree(struct buf *bp)
{
	int s = splbio();

	if (bp->b_qindex != QUEUE_NONE) {
		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
		bp->b_qindex = QUEUE_NONE;
	} else {
		panic("bremfree: removing a buffer when not on a queue");
	}
	splx(s);
}
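/*
 * Note on the setup above: bufinit() reserves only pageable kernel
 * virtual address space (MAXBSIZE * nbuf bytes) for buffer contents;
 * physical pages are attached and detached later by allocbuf() through
 * vm_hold_load_pages() and vm_hold_free_pages() below.  Every header
 * starts life on the QUEUE_EMPTY free list and on the invalhash chain,
 * and bremfree() is the only routine in this file that ever removes a
 * buffer from a free queue.
 */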
/*
 * Get a buffer with the specified data.  Look in the cache first.
 */
int
bread(struct vnode *vp, daddr_t blkno, int size, struct ucred *cred,
    struct buf **bpp)
{
	struct buf *bp;

	bp = getblk(vp, blkno, size, 0, 0);
	*bpp = bp;

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc && curproc->p_stats)	/* count block I/O */
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
		if (bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);
		return (biowait(bp));
	}

	return (0);
}

/*
 * Operates like bread, but also starts asynchronous I/O on
 * read-ahead blocks.
 */
int
breadn(struct vnode *vp, daddr_t blkno, int size,
    daddr_t *rablkno, int *rabsize,
    int cnt, struct ucred *cred, struct buf **bpp)
{
	struct buf *bp, *rabp;
	int i;
	int rv = 0, readwait = 0;

	*bpp = bp = getblk(vp, blkno, size, 0, 0);

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc && curproc->p_stats)	/* count block I/O */
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
		if (bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);
		++readwait;
	}

	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
		if (incore(vp, *rablkno)) {
			continue;
		}
		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);

		if ((rabp->b_flags & B_CACHE) == 0) {
			if (curproc && curproc->p_stats)
				curproc->p_stats->p_ru.ru_inblock++;
			rabp->b_flags |= B_READ | B_ASYNC;
			rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
			if (rabp->b_rcred == NOCRED) {
				if (cred != NOCRED)
					crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
		} else {
			brelse(rabp);
		}
	}

	if (readwait) {
		rv = biowait(bp);
	}

	return (rv);
}

/*
 * Write, release buffer on completion.  (Done by iodone
 * if async.)
 */
int
bwrite(struct buf *bp)
{
	int oldflags = bp->b_flags;

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return (0);
	}

	if (!(bp->b_flags & B_BUSY))
		panic("bwrite: buffer is not busy???");

	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	bp->b_flags |= B_WRITEINPROG;

	if (oldflags & B_ASYNC) {
		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		} else if (curproc) {
			++curproc->p_stats->p_ru.ru_oublock;
		}
	}

	bp->b_vp->v_numoutput++;
	VOP_STRATEGY(bp);

	if ((oldflags & B_ASYNC) == 0) {
		int rtval = biowait(bp);

		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		} else if (curproc) {
			++curproc->p_stats->p_ru.ru_oublock;
		}
		brelse(bp);
		return (rtval);
	}

	return (0);
}

int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return (bwrite(ap->a_bp));
}
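#if 0
/*
 * Illustrative sketch only, not part of this file's interface: how a
 * filesystem read path typically drives bread()/brelse().  The routine
 * name "example_read" and its arguments are hypothetical values a
 * caller would supply.
 */
static int
example_read(struct vnode *vp, daddr_t lbn, int bsize, struct ucred *cred)
{
	struct buf *bp;
	int error;

	error = bread(vp, lbn, bsize, cred, &bp);
	if (error) {
		/* biowait() already marked the buffer B_INVAL */
		brelse(bp);
		return (error);
	}
	/* ... consume bsize bytes at bp->b_data ... */
	brelse(bp);		/* leave the block cached for reuse */
	return (0);
}
#endif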
/*
 * Delayed write.  (Buffer is marked dirty.)
 */
void
bdwrite(struct buf *bp)
{

	if ((bp->b_flags & B_BUSY) == 0) {
		panic("bdwrite: buffer is not busy");
	}

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return;
	}

	if (bp->b_flags & B_TAPE) {
		bawrite(bp);
		return;
	}

	bp->b_flags &= ~B_READ;
	if ((bp->b_flags & B_DELWRI) == 0) {
		if (curproc)
			++curproc->p_stats->p_ru.ru_oublock;
		bp->b_flags |= B_DONE | B_DELWRI;
		reassignbuf(bp, bp->b_vp);
	}
	brelse(bp);
	return;
}

/*
 * Asynchronous write.
 * Start output on a buffer, but do not wait for it to complete.
 * The buffer is released when the output completes.
 */
void
bawrite(struct buf *bp)
{
	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}

/*
 * Release a buffer.
 */
void
brelse(struct buf *bp)
{
	int x;

	/* anyone need a "free" block? */
	x = splbio();
	if (needsbuffer) {
		needsbuffer = 0;
		wakeup((caddr_t) &needsbuffer);
	}

	/* anyone need this block? */
	if (bp->b_flags & B_WANTED) {
		bp->b_flags &= ~(B_WANTED | B_AGE);
		wakeup((caddr_t) bp);
	}

	if (bp->b_flags & B_LOCKED)
		bp->b_flags &= ~B_ERROR;

	if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
	    (bp->b_bufsize <= 0)) {
		bp->b_flags |= B_INVAL;
		bp->b_flags &= ~(B_DELWRI | B_CACHE);
		if (bp->b_vp)
			brelvp(bp);
	}

	if (bp->b_qindex != QUEUE_NONE)
		panic("brelse: free buffer onto another queue???");

	/* enqueue */
	/* buffers with no memory */
	if (bp->b_bufsize == 0) {
		bp->b_qindex = QUEUE_EMPTY;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	/* buffers with junk contents */
	} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE)) {
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	/* buffers that are locked */
	} else if (bp->b_flags & B_LOCKED) {
		bp->b_qindex = QUEUE_LOCKED;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
	/* buffers with stale but valid contents */
	} else if (bp->b_flags & B_AGE) {
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
	/* buffers with valid and quite potentially reusable contents */
	} else {
		bp->b_qindex = QUEUE_LRU;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
	}

	/* unlock */
	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_NOCACHE | B_AGE);
	splx(x);
}

int freebufspace;
int allocbufspace;
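#if 0
/*
 * Illustrative sketch only: choosing among the write interfaces above
 * for a buffer the caller holds B_BUSY.  "example_write" and
 * "need_status" are hypothetical; a B_DELWRI buffer is eventually
 * pushed out either by vfs_update()'s periodic sync() or when
 * getnewbuf() reclaims it and converts it to an asynchronous write.
 */
static int
example_write(struct buf *bp, int need_status)
{
	if (need_status)
		return (bwrite(bp));	/* synchronous: sleeps in biowait() */
	/*
	 * Delayed write: marks B_DELWRI and releases the buffer; bdwrite()
	 * itself falls back to bawrite() for B_TAPE devices, which cannot
	 * defer output.
	 */
	bdwrite(bp);
	return (0);
}
#endif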
/*
 * Find a buffer header which is available for use.
 */
struct buf *
getnewbuf(int slpflag, int slptimeo)
{
	struct buf *bp;
	int s;

	s = splbio();
start:
	/* can we constitute a new buffer? */
	if ((bp = bufqueues[QUEUE_EMPTY].tqh_first)) {
		if (bp->b_qindex != QUEUE_EMPTY)
			panic("getnewbuf: inconsistent EMPTY queue");
		bremfree(bp);
		goto fillbuf;
	}

	if ((bp = bufqueues[QUEUE_AGE].tqh_first)) {
		if (bp->b_qindex != QUEUE_AGE)
			panic("getnewbuf: inconsistent AGE queue");
		bremfree(bp);
	} else if ((bp = bufqueues[QUEUE_LRU].tqh_first)) {
		if (bp->b_qindex != QUEUE_LRU)
			panic("getnewbuf: inconsistent LRU queue");
		bremfree(bp);
	} else {
		/* wait for a free buffer of any kind */
		needsbuffer = 1;
		tsleep((caddr_t) &needsbuffer, PRIBIO, "newbuf", 0);
		splx(s);
		return (0);
	}

	/* if we are a delayed write, convert to an async write */
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_BUSY;
		bawrite(bp);
		goto start;
	}

	if (bp->b_vp)
		brelvp(bp);

	/* we are not free, nor do we contain interesting data */
	if (bp->b_rcred != NOCRED)
		crfree(bp->b_rcred);
	if (bp->b_wcred != NOCRED)
		crfree(bp->b_wcred);
fillbuf:
	bp->b_flags = B_BUSY;
	LIST_REMOVE(bp, b_hash);
	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	splx(s);
	bp->b_dev = NODEV;
	bp->b_vp = NULL;
	bp->b_blkno = bp->b_lblkno = 0;
	bp->b_iodone = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	bp->b_bcount = 0;
	bp->b_wcred = bp->b_rcred = NOCRED;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	return (bp);
}

/*
 * Check to see if a block is currently memory resident.
 */
struct buf *
incore(struct vnode *vp, daddr_t blkno)
{
	struct buf *bp;
	struct bufhashhdr *bh;
	int s = splbio();

	bh = BUFHASH(vp, blkno);
	bp = bh->lh_first;

	/* Search hash chain */
	while (bp) {
#ifdef DEBUG
		if ((bp < buf) || (bp >= buf + nbuf)) {
			printf("incore: buf out of range: %p, hash: %d\n",
			    bp, bh - bufhashtbl);
			panic("incore: buf fault");
		}
#endif
		/* hit */
		if (bp->b_lblkno == blkno && bp->b_vp == vp
		    && (bp->b_flags & B_INVAL) == 0) {
			splx(s);
			return (bp);
		}
		bp = bp->b_hash.le_next;
	}
	splx(s);

	return (0);
}

/*
 * Get a block given a specified block and offset into a file/device.
 */
struct buf *
getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo)
{
	struct buf *bp;
	int s;
	struct bufhashhdr *bh;

	s = splbio();
loop:
	if ((bp = incore(vp, blkno))) {
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			tsleep((caddr_t) bp, PRIBIO, "getblk", 0);
			goto loop;
		}
		bp->b_flags |= B_BUSY | B_CACHE;
		bremfree(bp);
		/*
		 * check for size inconsistencies
		 */
		if (bp->b_bcount != size) {
			printf("getblk: invalid buffer size: %ld\n",
			    bp->b_bcount);
			bp->b_flags |= B_INVAL;
			bwrite(bp);
			goto loop;
		}
	} else {
		if ((bp = getnewbuf(0, 0)) == 0)
			goto loop;
		bp->b_blkno = bp->b_lblkno = blkno;
		bgetvp(vp, bp);
		LIST_REMOVE(bp, b_hash);
		bh = BUFHASH(vp, blkno);
		LIST_INSERT_HEAD(bh, bp, b_hash);
		allocbuf(bp, size);
	}
	splx(s);
	return (bp);
}
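#if 0
/*
 * Illustrative sketch only: allocating a cache block that will be
 * completely overwritten, so no read is needed first.  getblk() returns
 * the buffer B_BUSY; a caller that fills all of bp->b_data can mark it
 * dirty with bdwrite() without ever doing input.  "example_overwrite"
 * and its arguments are hypothetical.
 */
static void
example_overwrite(struct vnode *vp, daddr_t lbn, int bsize, caddr_t src)
{
	struct buf *bp;

	bp = getblk(vp, lbn, bsize, 0, 0);
	bcopy(src, bp->b_data, bsize);
	bdwrite(bp);		/* dirty the block; written out later */
}
#endif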
/*
 * Get an empty, disassociated buffer of given size.
 */
struct buf *
geteblk(int size)
{
	struct buf *bp;

	while ((bp = getnewbuf(0, 0)) == 0)
		;
	allocbuf(bp, size);
	bp->b_flags |= B_INVAL;
	return (bp);
}

/*
 * Modify the length of a buffer's underlying buffer storage without
 * destroying information (unless, of course, the buffer is shrinking).
 */
void
allocbuf(struct buf *bp, int size)
{
	int newbsize = round_page(size);

	if (newbsize == bp->b_bufsize) {
		bp->b_bcount = size;
		return;
	} else if (newbsize < bp->b_bufsize) {
		vm_hold_free_pages(
		    (vm_offset_t) bp->b_data + newbsize,
		    (vm_offset_t) bp->b_data + bp->b_bufsize);
	} else if (newbsize > bp->b_bufsize) {
		vm_hold_load_pages(
		    (vm_offset_t) bp->b_data + bp->b_bufsize,
		    (vm_offset_t) bp->b_data + newbsize);
	}

	/* adjust buffer cache's idea of memory allocated to buffer contents */
	freebufspace -= newbsize - bp->b_bufsize;
	allocbufspace += newbsize - bp->b_bufsize;

	bp->b_bufsize = newbsize;
	bp->b_bcount = size;
}

/*
 * Wait for buffer I/O completion, returning error status.
 */
int
biowait(register struct buf *bp)
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		tsleep((caddr_t) bp, PRIBIO, "biowait", 0);
	if ((bp->b_flags & B_ERROR) || bp->b_error) {
		if ((bp->b_flags & B_INVAL) == 0) {
			bp->b_flags |= B_INVAL;
			bp->b_dev = NODEV;
			LIST_REMOVE(bp, b_hash);
			LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		}
		if (!bp->b_error)
			bp->b_error = EIO;
		else
			bp->b_flags |= B_ERROR;
		splx(s);
		return (bp->b_error);
	} else {
		splx(s);
		return (0);
	}
}

/*
 * Finish I/O on a buffer, calling an optional function.
 * This is usually called from interrupt level, so process blocking
 * is not *a good idea*.
 */
void
biodone(register struct buf *bp)
{
	int s;

	s = splbio();
	bp->b_flags |= B_DONE;

	if ((bp->b_flags & B_READ) == 0) {
		vwakeup(bp);
	}

#ifdef BOUNCE_BUFFERS
	if (bp->b_flags & B_BOUNCE)
		vm_bounce_free(bp);
#endif

	/* call optional completion function if requested */
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		splx(s);
		return;
	}

	/*
	 * For asynchronous completions, release the buffer now.  The brelse
	 * checks for B_WANTED and will do the wakeup there if necessary -
	 * so no need to do a wakeup here in the async case.
	 */
	if (bp->b_flags & B_ASYNC) {
		brelse(bp);
	} else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t) bp);
	}
	splx(s);
}

int
count_lock_queue()
{
	int count;
	struct buf *bp;

	count = 0;
	for (bp = bufqueues[QUEUE_LOCKED].tqh_first;
	    bp != NULL;
	    bp = bp->b_freelist.tqe_next)
		count++;
	return (count);
}

int vfs_update_interval = 30;

void
vfs_update()
{
	(void) spl0();
	while (1) {
		tsleep((caddr_t) &vfs_update_wakeup, PRIBIO, "update",
		    hz * vfs_update_interval);
		vfs_update_wakeup = 0;
		sync(curproc, NULL, NULL);
	}
}

#if 0
#define MAXFREEBP	128
#define LDFREE_BUSY	1
#define LDFREE_WANT	2
int loadfreeing;
struct buf *freebp[MAXFREEBP];
#endif
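#if 0
/*
 * Illustrative sketch only: how a driver built on this interface might
 * complete I/O.  The interrupt handler calls biodone(), which either
 * invokes the B_CALL callback, releases a B_ASYNC buffer via brelse(),
 * or wakes the process sleeping in biowait().  "example_intr",
 * "example_done" and "hw_error" are hypothetical driver names, not
 * anything defined in this file.
 */
static void
example_done(struct buf *bp)
{
	/* runs at interrupt level via biodone() -- must not sleep */
}

static void
example_intr(struct buf *bp, int hw_error)
{
	if (hw_error) {
		bp->b_flags |= B_ERROR;
		bp->b_error = EIO;
	} else
		bp->b_resid = 0;
	bp->b_flags |= B_CALL;		/* have biodone() call us back */
	bp->b_iodone = example_done;
	biodone(bp);
}
#endif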
670 */ 671void 672vm_hold_load_pages(vm_offset_t froma, vm_offset_t toa) { 673 vm_offset_t pg; 674 vm_page_t p; 675 vm_offset_t from = round_page(froma); 676 vm_offset_t to = round_page(toa); 677 678 for(pg = from ; pg < to ; pg += PAGE_SIZE) { 679 680 tryagain: 681#if 0 682/* 683 * don't allow buffer cache to cause VM paging 684 */ 685 if ( cnt.v_free_count < cnt.v_free_min) { 686 if( !loadfreeing ) { 687 int n=0; 688 struct buf *bp; 689 loadfreeing = LDFREE_BUSY; 690 while( (cnt.v_free_count <= cnt.v_free_min) && 691 (n < MAXFREEBP)) { 692 bp = geteblk(0); 693 if( bp) 694 freebp[n++] = bp; 695 else 696 break; 697 } 698 while(--n >= 0) { 699 brelse(freebp[n]); 700 } 701 if( loadfreeing & LDFREE_WANT) 702 wakeup((caddr_t) &loadfreeing); 703 loadfreeing = 0; 704 } else { 705 loadfreeing |= LDFREE_WANT; 706 tsleep(&loadfreeing, PRIBIO, "biofree", 0); 707 } 708 } 709#endif 710 if (cnt.v_free_count <= 711 cnt.v_free_reserved + (toa-froma) / PAGE_SIZE) { 712 VM_WAIT; 713 goto tryagain; 714 } 715 716 p = vm_page_alloc(kernel_object, pg - VM_MIN_KERNEL_ADDRESS); 717 if( !p) { 718 VM_WAIT; 719 goto tryagain; 720 } 721 722 vm_page_wire(p); 723 pmap_kenter( pg, VM_PAGE_TO_PHYS(p)); 724 } 725} 726 727void 728vm_hold_free_pages(vm_offset_t froma, vm_offset_t toa) 729{ 730 vm_offset_t pg; 731 vm_page_t p; 732 vm_offset_t from = round_page(froma); 733 vm_offset_t to = round_page(toa); 734 735 for(pg = from ; pg < to ; pg += PAGE_SIZE) { 736 p = PHYS_TO_VM_PAGE( pmap_kextract( pg)); 737 pmap_kremove( pg); 738 vm_page_free(p); 739 } 740} 741 742void 743bufstats() 744{ 745} 746 747