/* vfs_bio.c, revision 1549 */
/*
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <vm/vm.h>
#include <vm/vm_pageout.h>

#include <miscfs/specfs/specdev.h>

struct buf *buf;		/* the buffer pool itself */
int nbuf;			/* number of buffer headers */
int bufpages;			/* number of memory pages in the buffer pool */
struct buf *swbuf;		/* swap I/O headers */
int nswbuf;
#define BUFHSZ 512
int bufhash = BUFHSZ - 1;

struct buf *getnewbuf(int, int);
extern vm_map_t buffer_map, io_map;
void vm_hold_free_pages(vm_offset_t from, vm_offset_t to);
void vm_hold_load_pages(vm_offset_t from, vm_offset_t to);

/*
 * Definitions for the buffer hash lists.
 */
#define BUFHASH(dvp, lbn) \
	(&bufhashtbl[((int)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])

/*
 * Definitions for the buffer free lists.
 */
#define BQUEUES 5		/* number of free buffer queues */

LIST_HEAD(bufhashhdr, buf) bufhashtbl[BUFHSZ], invalhash;
TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];

#define BQ_NONE		0	/* on no queue */
#define BQ_LOCKED	1	/* locked buffers */
#define BQ_LRU		2	/* useful buffers */
#define BQ_AGE		3	/* less useful buffers */
#define BQ_EMPTY	4	/* empty buffer headers */

int needsbuffer;

/*
 * Internal update daemon, process 3
 *	The variable vfs_update_wakeup allows for internal syncs.
 */
int vfs_update_wakeup;

/*
 * Initialize buffer headers and related structures.
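 *
 * Editorial sketch of the layout this sets up (not in the original
 * source): every one of the nbuf headers receives a MAXBSIZE window of
 * pageable kernel VA from buffer_map, but no physical pages yet;
 * allocbuf() later wires real pages into just the first b_bufsize
 * bytes of that window through vm_hold_load_pages().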
 */
void
bufinit()
{
	struct buf *bp;
	int i;

	TAILQ_INIT(&bswlist);
	LIST_INIT(&invalhash);

	/* first, make a null hash table */
	for (i = 0; i < BUFHSZ; i++)
		LIST_INIT(&bufhashtbl[i]);

	/* next, make a null set of free lists */
	for (i = 0; i < BQUEUES; i++)
		TAILQ_INIT(&bufqueues[i]);

	/* finally, initialize each buffer header and stick on empty q */
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bzero(bp, sizeof *bp);
		bp->b_flags = B_INVAL;	/* we're just an empty header */
		bp->b_dev = NODEV;
		bp->b_vp = NULL;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_qindex = BQ_EMPTY;
		bp->b_vnbufs.le_next = NOLIST;
		bp->b_data = (caddr_t) kmem_alloc_pageable(buffer_map, MAXBSIZE);
		TAILQ_INSERT_TAIL(&bufqueues[BQ_EMPTY], bp, b_freelist);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	}
}

/*
 * remove the buffer from the appropriate free list
 */
void
bremfree(struct buf *bp)
{
	int s = splbio();

	if (bp->b_qindex != BQ_NONE) {
		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
		bp->b_qindex = BQ_NONE;
	} else {
		panic("bremfree: removing a buffer when not on a queue");
	}
	splx(s);
}

/*
 * Get a buffer with the specified data.  Look in the cache first.
 */
int
bread(struct vnode *vp, daddr_t blkno, int size, struct ucred *cred,
    struct buf **bpp)
{
	struct buf *bp;

	bp = getblk(vp, blkno, size, 0, 0);
	*bpp = bp;

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc && curproc->p_stats)	/* count block I/O */
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
		if (bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);
		return (biowait(bp));
	}

	return (0);
}

/*
 * Operates like bread, but also starts asynchronous I/O on
 * read-ahead blocks.
 */
int
breadn(struct vnode *vp, daddr_t blkno, int size,
    daddr_t *rablkno, int *rabsize,
    int cnt, struct ucred *cred, struct buf **bpp)
{
	struct buf *bp, *rabp;
	int i;
	int rv = 0, readwait = 0;

	*bpp = bp = getblk(vp, blkno, size, 0, 0);

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc && curproc->p_stats)	/* count block I/O */
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
		if (bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);
		++readwait;
	}

	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
		if (incore(vp, *rablkno))
			continue;
		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);

		if ((rabp->b_flags & B_CACHE) == 0) {
			if (curproc && curproc->p_stats)
				curproc->p_stats->p_ru.ru_inblock++;
			rabp->b_flags |= B_READ | B_ASYNC;
			rabp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
			if (rabp->b_rcred == NOCRED) {
				if (cred != NOCRED)
					crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
		} else {
			brelse(rabp);
		}
	}

	if (readwait)
		rv = biowait(bp);

	return (rv);
}

/*
 * Write, release buffer on completion.  (Done by iodone
 * if async.)
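 *
 * Editorial usage sketch (not part of the original source): the caller
 * must own the buffer B_BUSY, which getblk() guarantees, e.g.
 *
 *	bp = getblk(vp, lbn, size, 0, 0);
 *	... fill in bp->b_data ...
 *	error = bwrite(bp);	releases bp when the write is done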
 */
int
bwrite(struct buf *bp)
{
	int oldflags = bp->b_flags;

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return (0);
	}

	if (!(bp->b_flags & B_BUSY))
		panic("bwrite: buffer is not busy???");

	bp->b_flags &= ~(B_READ|B_DONE|B_ERROR|B_DELWRI);
	bp->b_flags |= B_WRITEINPROG;

	if (oldflags & B_ASYNC) {
		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		} else if (curproc) {
			++curproc->p_stats->p_ru.ru_oublock;
		}
	}

	bp->b_vp->v_numoutput++;
	VOP_STRATEGY(bp);

	if ((oldflags & B_ASYNC) == 0) {
		int rtval = biowait(bp);

		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		} else if (curproc) {
			++curproc->p_stats->p_ru.ru_oublock;
		}
		brelse(bp);
		return (rtval);
	}

	return (0);
}

int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return (bwrite(ap->a_bp));
}

/*
 * Delayed write.  (Buffer is marked dirty).
 */
void
bdwrite(struct buf *bp)
{

	if ((bp->b_flags & B_BUSY) == 0) {
		panic("bdwrite: buffer is not busy");
	}

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return;
	}

	if (bp->b_flags & B_TAPE) {
		bawrite(bp);
		return;
	}

	bp->b_flags &= ~B_READ;
	if ((bp->b_flags & B_DELWRI) == 0) {
		if (curproc)
			++curproc->p_stats->p_ru.ru_oublock;
		bp->b_flags |= B_DONE|B_DELWRI;
		reassignbuf(bp, bp->b_vp);
	}
	brelse(bp);
	return;
}

/*
 * Asynchronous write.
 * Start output on a buffer, but do not wait for it to complete.
 * The buffer is released when the output completes.
 */
void
bawrite(struct buf *bp)
{
	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}

/*
 * Release a buffer.
 */
void
brelse(struct buf *bp)
{
	int x;

	/* anyone need a "free" block? */
	x = splbio();
	if (needsbuffer) {
		needsbuffer = 0;
		wakeup((caddr_t)&needsbuffer);
	}
	/* anyone need this very block?
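	 * (editorial note, not in the original: B_WANTED is set by
	 * getblk() when it finds the buffer B_BUSY and sleeps on the
	 * buffer's address; the wakeup below completes that handshake)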
	 */
	if (bp->b_flags & B_WANTED) {
		bp->b_flags &= ~(B_WANTED|B_AGE);
		wakeup((caddr_t)bp);
	}

	if (bp->b_flags & B_LOCKED)
		bp->b_flags &= ~B_ERROR;

	if ((bp->b_flags & (B_NOCACHE|B_INVAL|B_ERROR)) ||
	    (bp->b_bufsize <= 0)) {
		bp->b_flags |= B_INVAL;
		bp->b_flags &= ~(B_DELWRI|B_CACHE);
		if (bp->b_vp)
			brelvp(bp);
	}

	if (bp->b_qindex != BQ_NONE)
		panic("brelse: free buffer onto another queue???");

	/* enqueue */
	/* buffers with junk contents */
	if (bp->b_bufsize == 0) {
		bp->b_qindex = BQ_EMPTY;
		TAILQ_INSERT_HEAD(&bufqueues[BQ_EMPTY], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	} else if (bp->b_flags & (B_ERROR|B_INVAL|B_NOCACHE)) {
		bp->b_qindex = BQ_AGE;
		TAILQ_INSERT_HEAD(&bufqueues[BQ_AGE], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	/* buffers that are locked */
	} else if (bp->b_flags & B_LOCKED) {
		bp->b_qindex = BQ_LOCKED;
		TAILQ_INSERT_TAIL(&bufqueues[BQ_LOCKED], bp, b_freelist);
	/* buffers with stale but valid contents */
	} else if (bp->b_flags & B_AGE) {
		bp->b_qindex = BQ_AGE;
		TAILQ_INSERT_TAIL(&bufqueues[BQ_AGE], bp, b_freelist);
	/* buffers with valid and quite potentially reusable contents */
	} else {
		bp->b_qindex = BQ_LRU;
		TAILQ_INSERT_TAIL(&bufqueues[BQ_LRU], bp, b_freelist);
	}

	/* unlock */
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_NOCACHE|B_AGE);
	splx(x);
}

int freebufspace;
int allocbufspace;

/*
 * Find a buffer header which is available for use.
 */
struct buf *
getnewbuf(int slpflag, int slptimeo)
{
	struct buf *bp;
	int x;

	x = splbio();
start:
	/* can we constitute a new buffer? */
	if ((bp = bufqueues[BQ_EMPTY].tqh_first) != NULL) {
		if (bp->b_qindex != BQ_EMPTY)
			panic("getnewbuf: inconsistent EMPTY queue");
		bremfree(bp);
		goto fillbuf;
	}

tryfree:
	if ((bp = bufqueues[BQ_AGE].tqh_first) != NULL) {
		if (bp->b_qindex != BQ_AGE)
			panic("getnewbuf: inconsistent AGE queue");
		bremfree(bp);
	} else if ((bp = bufqueues[BQ_LRU].tqh_first) != NULL) {
		if (bp->b_qindex != BQ_LRU)
			panic("getnewbuf: inconsistent LRU queue");
		bremfree(bp);
	} else {
		/* wait for a free buffer of any kind */
		needsbuffer = 1;
		tsleep((caddr_t)&needsbuffer, PRIBIO, "newbuf", 0);
		splx(x);
		return (0);
	}

	/* if we are a delayed write, convert to an async write */
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_BUSY;
		bawrite(bp);
		goto start;
	}

	if (bp->b_vp)
		brelvp(bp);

	/* we are not free, nor do we contain interesting data */
	if (bp->b_rcred != NOCRED)
		crfree(bp->b_rcred);
	if (bp->b_wcred != NOCRED)
		crfree(bp->b_wcred);
fillbuf:
	bp->b_flags = B_BUSY;
	LIST_REMOVE(bp, b_hash);
	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	splx(x);
	bp->b_dev = NODEV;
	bp->b_vp = NULL;
	bp->b_blkno = bp->b_lblkno = 0;
	bp->b_iodone = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	bp->b_bcount = 0;
	bp->b_wcred = bp->b_rcred = NOCRED;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	return (bp);
}

/*
 * Check to see if a block is currently memory resident.
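 *
 * (editorial note, not in the original: the lookup hashes (vp, blkno)
 * through BUFHASH and walks a single chain; a hit must match both the
 * vnode and the logical block number, and must not be B_INVAL)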
 */
struct buf *
incore(struct vnode *vp, daddr_t blkno)
{
	struct buf *bp;
	struct bufhashhdr *bh;
	int s = splbio();

	bh = BUFHASH(vp, blkno);
	bp = bh->lh_first;

	/* Search hash chain */
	while (bp) {
		if ((bp < buf) || (bp >= buf + nbuf)) {
			printf("incore: buf out of range: %lx, hash: %d\n",
			    bp, bh - bufhashtbl);
			panic("incore: buf fault");
		}
		/* hit */
		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
		    (bp->b_flags & B_INVAL) == 0) {
			splx(s);
			return (bp);
		}
		bp = bp->b_hash.le_next;
	}
	splx(s);

	return (0);
}

/*
 * Get a block given a specified block and offset into a file/device.
 */
struct buf *
getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo)
{
	struct buf *bp;
	int x;
	struct bufhashhdr *bh;

	x = splbio();
loop:
	if ((bp = incore(vp, blkno)) != NULL) {
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			tsleep((caddr_t)bp, PRIBIO, "getblk", 0);
			goto loop;
		}
		bp->b_flags |= B_BUSY | B_CACHE;
		bremfree(bp);
		/*
		 * check for size inconsistencies
		 */
		if (bp->b_bcount != size) {
			printf("getblk: invalid buffer size: %d\n",
			    bp->b_bcount);
			bp->b_flags |= B_INVAL;
			bwrite(bp);
			goto loop;
		}
	} else {
		if ((bp = getnewbuf(0, 0)) == 0)
			goto loop;
		allocbuf(bp, size);
		/*
		 * have to check again, because of a possible
		 * race condition.
		 */
		if (incore(vp, blkno)) {
			allocbuf(bp, 0);
			bp->b_flags |= B_INVAL;
			brelse(bp);
			goto loop;
		}
		bp->b_blkno = bp->b_lblkno = blkno;
		bgetvp(vp, bp);
		LIST_REMOVE(bp, b_hash);
		bh = BUFHASH(vp, blkno);
		LIST_INSERT_HEAD(bh, bp, b_hash);
	}
	splx(x);
	return (bp);
}

/*
 * Get an empty, disassociated buffer of given size.
 */
struct buf *
geteblk(int size)
{
	struct buf *bp;

	while ((bp = getnewbuf(0, 0)) == 0)
		;
	allocbuf(bp, size);
	bp->b_flags |= B_INVAL;
	return (bp);
}

/*
 * Modify the length of a buffer's underlying buffer storage without
 * destroying information (unless, of course, the buffer is shrinking).
 */
void
allocbuf(struct buf *bp, int size)
{
	int newbsize = round_page(size);

	if (newbsize == bp->b_bufsize) {
		bp->b_bcount = size;
		return;
	} else if (newbsize < bp->b_bufsize) {
		vm_hold_free_pages(
		    (vm_offset_t) bp->b_data + newbsize,
		    (vm_offset_t) bp->b_data + bp->b_bufsize);
	} else if (newbsize > bp->b_bufsize) {
		vm_hold_load_pages(
		    (vm_offset_t) bp->b_data + bp->b_bufsize,
		    (vm_offset_t) bp->b_data + newbsize);
	}

	/* adjust buffer cache's idea of memory allocated to buffer contents */
	freebufspace -= newbsize - bp->b_bufsize;
	allocbufspace += newbsize - bp->b_bufsize;

	bp->b_bufsize = newbsize;
	bp->b_bcount = size;
}

/*
 * Wait for buffer I/O completion, returning error status.
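 *
 * (editorial note, not in the original: bread() and the synchronous
 * path of bwrite() block here until biodone() sets B_DONE and issues
 * the wakeup on the buffer address)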
 */
int
biowait(register struct buf *bp)
{
	int x;

	x = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		tsleep((caddr_t)bp, PRIBIO, "biowait", 0);
	if ((bp->b_flags & B_ERROR) || bp->b_error) {
		if ((bp->b_flags & B_INVAL) == 0) {
			bp->b_flags |= B_INVAL;
			bp->b_dev = NODEV;
			LIST_REMOVE(bp, b_hash);
			LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		}
		if (!bp->b_error)
			bp->b_error = EIO;
		else
			bp->b_flags |= B_ERROR;
		splx(x);
		return (bp->b_error);
	} else {
		splx(x);
		return (0);
	}
}

/*
 * Finish I/O on a buffer, calling an optional function.
 * This is usually called from interrupt level, so process blocking
 * is not *a good idea*.
 */
void
biodone(register struct buf *bp)
{
	int s;

	s = splbio();
	bp->b_flags |= B_DONE;

	if ((bp->b_flags & B_READ) == 0) {
		vwakeup(bp);
	}

	/* call optional completion function if requested */
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		splx(s);
		return;
	}

	/*
	 * For asynchronous completions, release the buffer now. The brelse
	 * checks for B_WANTED and will do the wakeup there if necessary -
	 * so no need to do a wakeup here in the async case.
	 */
	if (bp->b_flags & B_ASYNC) {
		brelse(bp);
	} else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t) bp);
	}
	splx(s);
}

int
count_lock_queue()
{
	int count;
	struct buf *bp;

	count = 0;
	for (bp = bufqueues[BQ_LOCKED].tqh_first;
	    bp != NULL;
	    bp = bp->b_freelist.tqe_next)
		count++;
	return (count);
}

#ifndef UPDATE_INTERVAL
int vfs_update_interval = 30;
#else
int vfs_update_interval = UPDATE_INTERVAL;
#endif

void
vfs_update()
{
	(void) spl0();
	while (1) {
		tsleep((caddr_t)&vfs_update_wakeup, PRIBIO, "update",
		    hz * vfs_update_interval);
		vfs_update_wakeup = 0;
		sync(curproc, NULL, NULL);
	}
}

/*
 * these routines are not in the correct place (yet)
 * also they work *ONLY* for kernel_pmap!!!
 */
void
vm_hold_load_pages(vm_offset_t froma, vm_offset_t toa)
{
	vm_offset_t pg;
	vm_page_t p;
	vm_offset_t from = round_page(froma);
	vm_offset_t to = round_page(toa);

	for (pg = from; pg < to; pg += PAGE_SIZE) {
tryagain:
		p = vm_page_alloc(kernel_object, pg - VM_MIN_KERNEL_ADDRESS);
		if (!p) {
			VM_WAIT;
			goto tryagain;
		}
		vm_page_wire(p);
		pmap_enter(kernel_pmap, pg, VM_PAGE_TO_PHYS(p),
		    VM_PROT_READ|VM_PROT_WRITE, 1);
	}
}

void
vm_hold_free_pages(vm_offset_t froma, vm_offset_t toa)
{
	vm_offset_t pg;
	vm_page_t p;
	vm_offset_t from = round_page(froma);
	vm_offset_t to = round_page(toa);

	for (pg = from; pg < to; pg += PAGE_SIZE) {
		vm_offset_t pa;

		pa = pmap_kextract(pg);
		if (!pa) {
			printf("No pa for va: %x\n", pg);
		} else {
			p = PHYS_TO_VM_PAGE(pa);
			pmap_remove(kernel_pmap, pg, pg + PAGE_SIZE);
			vm_page_free(p);
		}
	}
}

void
bufstats()
{
}
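
/*
 * Editorial usage sketch (not part of the original source): a typical
 * filesystem read path drives this cache roughly as follows; "fs_bsize",
 * "off", "dst", and "len" are illustrative names only.
 *
 *	struct buf *bp;
 *	int error;
 *
 *	error = bread(vp, lbn, fs_bsize, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	bcopy(bp->b_data + off, dst, len);
 *	brelse(bp);
 *
 * A write path would modify bp->b_data and then finish with bwrite(bp)
 * (synchronous), bawrite(bp) (asynchronous), or bdwrite(bp) (delayed).
 */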