vfs_bio.c revision 3098
/*
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 *
 * $Id: vfs_bio.c,v 1.11 1994/08/31 06:17:37 davidg Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <vm/vm.h>
#include <vm/vm_pageout.h>

#include <miscfs/specfs/specdev.h>

struct buf *buf;		/* buffer header pool */
int nbuf;			/* number of buffer headers calculated elsewhere */
struct swqueue bswlist;
struct buf *bclnlist;		/* Head of cleaned page list. */

extern vm_map_t buffer_map, io_map;

void vm_hold_free_pages(vm_offset_t from, vm_offset_t to);
void vm_hold_load_pages(vm_offset_t from, vm_offset_t to);

int needsbuffer;

/*
 * Internal update daemon, process 3
 *	The variable vfs_update_wakeup allows for internal syncs.
 */
int vfs_update_wakeup;

/*
 * Initialize buffer headers and related structures.
 */
void
bufinit()
{
	struct buf *bp;
	int i;

	TAILQ_INIT(&bswlist);
	LIST_INIT(&invalhash);

	/* first, make a null hash table */
	for (i = 0; i < BUFHSZ; i++)
		LIST_INIT(&bufhashtbl[i]);

	/* next, make a null set of free lists */
	for (i = 0; i < BUFFER_QUEUES; i++)
		TAILQ_INIT(&bufqueues[i]);

	/* finally, initialize each buffer header and stick on empty q */
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bzero(bp, sizeof *bp);
		bp->b_flags = B_INVAL;	/* we're just an empty header */
		bp->b_dev = NODEV;
		bp->b_vp = NULL;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_qindex = QUEUE_EMPTY;
		bp->b_vnbufs.le_next = NOLIST;
		bp->b_data = (caddr_t) kmem_alloc_pageable(buffer_map, MAXBSIZE);
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	}
}

/*
 * Remove the buffer from the appropriate free list.
 */
void
bremfree(struct buf *bp)
{
	int s = splbio();

	if (bp->b_qindex != QUEUE_NONE) {
		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
		bp->b_qindex = QUEUE_NONE;
	} else {
		panic("bremfree: removing a buffer when not on a queue");
	}
	splx(s);
}
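/*
 * A minimal sketch of the discipline the rest of this file follows
 * when claiming a buffer off a free list: a header only becomes
 * private to the caller once it has been pulled off its queue with
 * bremfree() and marked B_BUSY, all under splbio().  (example_claim
 * is a hypothetical helper, shown for illustration only.)
 */
#ifdef notdef
static struct buf *
example_claim(struct buf *bp)
{
	int s = splbio();

	bremfree(bp);		/* take bp off its free queue */
	bp->b_flags |= B_BUSY;	/* now exclusively ours */
	splx(s);
	return (bp);
}
#endif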
/*
 * Get a buffer with the specified data.  Look in the cache first.
 */
int
bread(struct vnode *vp, daddr_t blkno, int size, struct ucred *cred,
    struct buf **bpp)
{
	struct buf *bp;

	bp = getblk(vp, blkno, size, 0, 0);
	*bpp = bp;

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc && curproc->p_stats)	/* count block I/O */
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
		if (bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);
		return (biowait(bp));
	}

	return (0);
}

/*
 * Operates like bread, but also starts asynchronous I/O on
 * read-ahead blocks.
 */
int
breadn(struct vnode *vp, daddr_t blkno, int size,
    daddr_t *rablkno, int *rabsize,
    int cnt, struct ucred *cred, struct buf **bpp)
{
	struct buf *bp, *rabp;
	int i;
	int rv = 0, readwait = 0;

	*bpp = bp = getblk(vp, blkno, size, 0, 0);

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc && curproc->p_stats)	/* count block I/O */
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
		if (bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);
		++readwait;
	}

	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
		if (incore(vp, *rablkno))
			continue;
		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);

		if ((rabp->b_flags & B_CACHE) == 0) {
			if (curproc && curproc->p_stats)
				curproc->p_stats->p_ru.ru_inblock++;
			rabp->b_flags |= B_READ | B_ASYNC;
			rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
			if (rabp->b_rcred == NOCRED) {
				if (cred != NOCRED)
					crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
		} else {
			brelse(rabp);
		}
	}

	if (readwait)
		rv = biowait(bp);

	return (rv);
}

/*
 * Write, release buffer on completion.  (Done by iodone
 * if async.)
 */
int
bwrite(struct buf *bp)
{
	int oldflags = bp->b_flags;

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return (0);
	}

	if (!(bp->b_flags & B_BUSY))
		panic("bwrite: buffer is not busy???");

	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	bp->b_flags |= B_WRITEINPROG;

	if (oldflags & B_ASYNC) {
		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		} else if (curproc) {
			++curproc->p_stats->p_ru.ru_oublock;
		}
	}

	bp->b_vp->v_numoutput++;
	VOP_STRATEGY(bp);

	if ((oldflags & B_ASYNC) == 0) {
		int rtval = biowait(bp);

		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		} else if (curproc) {
			++curproc->p_stats->p_ru.ru_oublock;
		}
		brelse(bp);
		return (rtval);
	}

	return (0);
}

int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return (bwrite(ap->a_bp));
}
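/*
 * A minimal sketch of the caller-side contract for bread(): on
 * success the caller owns a busy, valid buffer and must brelse() it
 * when done; on error the buffer must still be released.
 * (example_read_block is a hypothetical caller, for illustration
 * only.)
 */
#ifdef notdef
static int
example_read_block(struct vnode *vp, daddr_t lbn, int size,
    struct ucred *cred)
{
	struct buf *bp;
	int error;

	error = bread(vp, lbn, size, cred, &bp);
	if (error) {
		brelse(bp);
		return (error);
	}
	/* ... inspect bp->b_data here ... */
	brelse(bp);		/* or bdwrite()/bawrite() if modified */
	return (0);
}
#endif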
/*
 * Delayed write.  (Buffer is marked dirty.)
 */
void
bdwrite(struct buf *bp)
{

	if ((bp->b_flags & B_BUSY) == 0)
		panic("bdwrite: buffer is not busy");

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return;
	}

	if (bp->b_flags & B_TAPE) {
		bawrite(bp);
		return;
	}

	bp->b_flags &= ~B_READ;
	if ((bp->b_flags & B_DELWRI) == 0) {
		if (curproc)
			++curproc->p_stats->p_ru.ru_oublock;
		bp->b_flags |= B_DONE | B_DELWRI;
		reassignbuf(bp, bp->b_vp);
	}
	brelse(bp);
	return;
}

/*
 * Asynchronous write.
 * Start output on a buffer, but do not wait for it to complete.
 * The buffer is released when the output completes.
 */
void
bawrite(struct buf *bp)
{
	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}

/*
 * Release a buffer.
 */
void
brelse(struct buf *bp)
{
	int x;

	/* anyone need a "free" block? */
	x = splbio();
	if (needsbuffer) {
		needsbuffer = 0;
		wakeup((caddr_t) &needsbuffer);
	}

	/* anyone need this block? */
	if (bp->b_flags & B_WANTED) {
		bp->b_flags &= ~(B_WANTED | B_AGE);
		wakeup((caddr_t) bp);
	}

	if (bp->b_flags & B_LOCKED)
		bp->b_flags &= ~B_ERROR;

	if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
	    (bp->b_bufsize <= 0)) {
		bp->b_flags |= B_INVAL;
		bp->b_flags &= ~(B_DELWRI | B_CACHE);
		if (bp->b_vp)
			brelvp(bp);
	}

	if (bp->b_qindex != QUEUE_NONE)
		panic("brelse: free buffer onto another queue???");

	/* enqueue */
	if (bp->b_bufsize == 0) {
		/* buffers with no memory */
		bp->b_qindex = QUEUE_EMPTY;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE)) {
		/* buffers with junk contents */
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	} else if (bp->b_flags & B_LOCKED) {
		/* buffers that are locked */
		bp->b_qindex = QUEUE_LOCKED;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
	} else if (bp->b_flags & B_AGE) {
		/* buffers with stale but valid contents */
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
	} else {
		/* buffers with valid and quite potentially reuseable contents */
		bp->b_qindex = QUEUE_LRU;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
	}

	/* unlock */
	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_NOCACHE | B_AGE);
	splx(x);
}

int freebufspace;
int allocbufspace;
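/*
 * A minimal sketch of how the three write paths above differ for a
 * dirty, busy buffer (example_writeback is hypothetical, for
 * illustration only): bwrite() starts I/O and sleeps in biowait();
 * bawrite() starts I/O and returns, with biodone()/brelse() finishing
 * up; bdwrite() just marks the buffer B_DELWRI and requeues it for a
 * later flush.
 */
#ifdef notdef
static void
example_writeback(struct buf *bp, int how)
{
	if (how == 0)
		(void) bwrite(bp);	/* synchronous: wait for completion */
	else if (how == 1)
		bawrite(bp);		/* asynchronous: fire and forget */
	else
		bdwrite(bp);		/* delayed: dirty now, write later */
}
#endif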
/*
 * Find a buffer header which is available for use.
 */
struct buf *
getnewbuf(int slpflag, int slptimeo)
{
	struct buf *bp;
	int s;

	s = splbio();
start:
	/* can we constitute a new buffer? */
	if ((bp = bufqueues[QUEUE_EMPTY].tqh_first)) {
		if (bp->b_qindex != QUEUE_EMPTY)
			panic("getnewbuf: inconsistent EMPTY queue");
		bremfree(bp);
		goto fillbuf;
	}

	if ((bp = bufqueues[QUEUE_AGE].tqh_first)) {
		if (bp->b_qindex != QUEUE_AGE)
			panic("getnewbuf: inconsistent AGE queue");
		bremfree(bp);
	} else if ((bp = bufqueues[QUEUE_LRU].tqh_first)) {
		if (bp->b_qindex != QUEUE_LRU)
			panic("getnewbuf: inconsistent LRU queue");
		bremfree(bp);
	} else {
		/* wait for a free buffer of any kind */
		needsbuffer = 1;
		tsleep((caddr_t) &needsbuffer, PRIBIO, "newbuf", 0);
		splx(s);
		return (0);
	}

	/* if we are a delayed write, convert to an async write */
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_BUSY;
		bawrite(bp);
		goto start;
	}

	if (bp->b_vp)
		brelvp(bp);

	/* we are not free, nor do we contain interesting data */
	if (bp->b_rcred != NOCRED)
		crfree(bp->b_rcred);
	if (bp->b_wcred != NOCRED)
		crfree(bp->b_wcred);
fillbuf:
	bp->b_flags = B_BUSY;
	LIST_REMOVE(bp, b_hash);
	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	splx(s);
	bp->b_dev = NODEV;
	bp->b_vp = NULL;
	bp->b_blkno = bp->b_lblkno = 0;
	bp->b_iodone = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	bp->b_bcount = 0;
	bp->b_wcred = bp->b_rcred = NOCRED;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	return (bp);
}

/*
 * Check to see if a block is currently memory resident.
 */
struct buf *
incore(struct vnode *vp, daddr_t blkno)
{
	struct buf *bp;
	struct bufhashhdr *bh;
	int s = splbio();

	bh = BUFHASH(vp, blkno);
	bp = bh->lh_first;

	/* Search hash chain */
	while (bp) {
		if ((bp < buf) || (bp >= buf + nbuf)) {
			printf("incore: buf out of range: %p, hash: %d\n",
			    bp, bh - bufhashtbl);
			panic("incore: buf fault");
		}
		/* hit */
		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
		    (bp->b_flags & B_INVAL) == 0) {
			splx(s);
			return (bp);
		}
		bp = bp->b_hash.le_next;
	}
	splx(s);

	return (0);
}

/*
 * Get a block given a specified block and offset into a file/device.
 */
struct buf *
getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo)
{
	struct buf *bp;
	int s;
	struct bufhashhdr *bh;

	s = splbio();
loop:
	if ((bp = incore(vp, blkno))) {
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			tsleep((caddr_t) bp, PRIBIO, "getblk", 0);
			goto loop;
		}
		bp->b_flags |= B_BUSY | B_CACHE;
		bremfree(bp);
		/*
		 * check for size inconsistencies
		 */
		if (bp->b_bcount != size) {
			printf("getblk: invalid buffer size: %ld\n",
			    bp->b_bcount);
			bp->b_flags |= B_INVAL;
			bwrite(bp);
			goto loop;
		}
	} else {
		if ((bp = getnewbuf(0, 0)) == 0)
			goto loop;
		allocbuf(bp, size);
		/*
		 * have to check again, because of a possible
		 * race condition.
		 */
		if (incore(vp, blkno)) {
			allocbuf(bp, 0);
			bp->b_flags |= B_INVAL;
			brelse(bp);
			goto loop;
		}
		bp->b_blkno = bp->b_lblkno = blkno;
		bgetvp(vp, bp);
		LIST_REMOVE(bp, b_hash);
		bh = BUFHASH(vp, blkno);
		LIST_INSERT_HEAD(bh, bp, b_hash);
	}
	splx(s);
	return (bp);
}
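/*
 * A minimal sketch of the usual getblk() consumer (example_get_valid
 * is hypothetical, for illustration only): if B_CACHE is set the
 * contents are already valid; otherwise the caller must fill the
 * buffer itself, which is exactly what bread() above does with
 * VOP_STRATEGY() and biowait().
 */
#ifdef notdef
static struct buf *
example_get_valid(struct vnode *vp, daddr_t lbn, int size)
{
	struct buf *bp;

	bp = getblk(vp, lbn, size, 0, 0);
	if ((bp->b_flags & B_CACHE) == 0) {
		bp->b_flags |= B_READ;
		VOP_STRATEGY(bp);
		if (biowait(bp)) {
			brelse(bp);
			return (NULL);
		}
	}
	return (bp);	/* busy and valid; caller must brelse() */
}
#endif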
/*
 * Get an empty, disassociated buffer of given size.
 */
struct buf *
geteblk(int size)
{
	struct buf *bp;

	while ((bp = getnewbuf(0, 0)) == 0)
		;
	allocbuf(bp, size);
	bp->b_flags |= B_INVAL;
	return (bp);
}

/*
 * Modify the length of a buffer's underlying buffer storage without
 * destroying information (unless, of course, the buffer is shrinking).
 */
void
allocbuf(struct buf *bp, int size)
{
	int newbsize = round_page(size);

	if (newbsize == bp->b_bufsize) {
		bp->b_bcount = size;
		return;
	} else if (newbsize < bp->b_bufsize) {
		vm_hold_free_pages(
		    (vm_offset_t) bp->b_data + newbsize,
		    (vm_offset_t) bp->b_data + bp->b_bufsize);
	} else if (newbsize > bp->b_bufsize) {
		vm_hold_load_pages(
		    (vm_offset_t) bp->b_data + bp->b_bufsize,
		    (vm_offset_t) bp->b_data + newbsize);
	}

	/* adjust buffer cache's idea of memory allocated to buffer contents */
	freebufspace -= newbsize - bp->b_bufsize;
	allocbufspace += newbsize - bp->b_bufsize;

	bp->b_bufsize = newbsize;
	bp->b_bcount = size;
}

/*
 * Wait for buffer I/O completion, returning error status.
 */
int
biowait(register struct buf *bp)
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		tsleep((caddr_t) bp, PRIBIO, "biowait", 0);
	if ((bp->b_flags & B_ERROR) || bp->b_error) {
		if ((bp->b_flags & B_INVAL) == 0) {
			bp->b_flags |= B_INVAL;
			bp->b_dev = NODEV;
			LIST_REMOVE(bp, b_hash);
			LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		}
		if (!bp->b_error)
			bp->b_error = EIO;
		else
			bp->b_flags |= B_ERROR;
		splx(s);
		return (bp->b_error);
	} else {
		splx(s);
		return (0);
	}
}

/*
 * Finish I/O on a buffer, calling an optional function.
 * This is usually called from interrupt level, so process blocking
 * is not *a good idea*.
 */
void
biodone(register struct buf *bp)
{
	int s;

	s = splbio();
	bp->b_flags |= B_DONE;

	if ((bp->b_flags & B_READ) == 0)
		vwakeup(bp);

#ifdef BOUNCE_BUFFERS
	if (bp->b_flags & B_BOUNCE)
		vm_bounce_free(bp);
#endif

	/* call optional completion function if requested */
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		splx(s);
		return;
	}

	/*
	 * For asynchronous completions, release the buffer now. The brelse
	 * checks for B_WANTED and will do the wakeup there if necessary -
	 * so no need to do a wakeup here in the async case.
	 */
	if (bp->b_flags & B_ASYNC) {
		brelse(bp);
	} else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t) bp);
	}
	splx(s);
}

int
count_lock_queue()
{
	int count;
	struct buf *bp;

	count = 0;
	for (bp = bufqueues[QUEUE_LOCKED].tqh_first;
	    bp != NULL;
	    bp = bp->b_freelist.tqe_next)
		count++;
	return (count);
}

int vfs_update_interval = 30;

void
vfs_update()
{
	(void) spl0();
	while (1) {
		tsleep((caddr_t) &vfs_update_wakeup, PRIBIO, "update",
		    hz * vfs_update_interval);
		vfs_update_wakeup = 0;
		sync(curproc, NULL, NULL);
	}
}

#define MAXFREEBP	128
#define LDFREE_BUSY	1
#define LDFREE_WANT	2
int loadfreeing;
struct buf *freebp[MAXFREEBP];
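/*
 * A minimal sketch of the driver-side completion convention that
 * biodone() above services (example_start/example_done are
 * hypothetical, for illustration only): a caller may hang a callout
 * on b_iodone and set B_CALL, and biodone() will invoke it instead of
 * waking sleepers.
 */
#ifdef notdef
static void example_done(struct buf *bp);

static void
example_start(struct buf *bp)
{
	bp->b_iodone = example_done;
	bp->b_flags |= B_CALL | B_ASYNC;
	VOP_STRATEGY(bp);	/* biodone() will call example_done() */
}

static void
example_done(struct buf *bp)
{
	/* runs at interrupt level; must not sleep */
	brelse(bp);
}
#endif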
/*
 * These routines are not in the correct place (yet).
 * Also, they work *ONLY* for kernel_pmap!!!
 */
void
vm_hold_load_pages(vm_offset_t froma, vm_offset_t toa)
{
	vm_offset_t pg;
	vm_page_t p;
	vm_offset_t from = round_page(froma);
	vm_offset_t to = round_page(toa);

	for (pg = from; pg < to; pg += PAGE_SIZE) {

tryagain:
		/*
		 * don't allow buffer cache to cause VM paging
		 */
		if (cnt.v_free_count < cnt.v_free_min) {
			if (!loadfreeing) {
				int n = 0;
				struct buf *bp;

				loadfreeing = LDFREE_BUSY;
				while ((cnt.v_free_count <= cnt.v_free_min) &&
				    (n < MAXFREEBP)) {
					bp = geteblk(0);
					if (bp)
						freebp[n++] = bp;
					else
						break;
				}
				while (--n >= 0)
					brelse(freebp[n]);
				if (loadfreeing & LDFREE_WANT)
					wakeup((caddr_t) &loadfreeing);
				loadfreeing = 0;
			} else {
				loadfreeing |= LDFREE_WANT;
				tsleep(&loadfreeing, PRIBIO, "biofree", 0);
			}
		}

		if (cnt.v_free_count <=
		    cnt.v_free_reserved + (toa - froma) / PAGE_SIZE) {
			VM_WAIT;
			goto tryagain;
		}

		p = vm_page_alloc(kernel_object, pg - VM_MIN_KERNEL_ADDRESS);
		if (!p) {
			VM_WAIT;
			goto tryagain;
		}

		vm_page_wire(p);
		pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
	}
}

void
vm_hold_free_pages(vm_offset_t froma, vm_offset_t toa)
{
	vm_offset_t pg;
	vm_page_t p;
	vm_offset_t from = round_page(froma);
	vm_offset_t to = round_page(toa);

	for (pg = from; pg < to; pg += PAGE_SIZE) {
		p = PHYS_TO_VM_PAGE(pmap_kextract(pg));
		pmap_kremove(pg);
		vm_page_free(p);
	}
}

void
bufstats()
{
}
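/*
 * A worked example of the allocbuf()/vm_hold_load_pages() arithmetic
 * above (assuming PAGE_SIZE is 4096): growing a buffer from 4096 to
 * 6000 bytes gives newbsize = round_page(6000) = 8192, so exactly one
 * new page is wired and pmap_kenter()ed at b_data + 4096, while
 * b_bcount records the caller's 6000 and b_bufsize the backed 8192.
 */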