/*
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 *
 * $Id: vfs_bio.c,v 1.6 1994/08/06 09:15:28 davidg Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <vm/vm.h>
#include <vm/vm_pageout.h>

#include <miscfs/specfs/specdev.h>

struct buf *buf;	/* buffer header pool */
int nbuf;		/* number of buffer headers, calculated elsewhere */

extern vm_map_t buffer_map, io_map;

void vm_hold_free_pages(vm_offset_t from, vm_offset_t to);
void vm_hold_load_pages(vm_offset_t from, vm_offset_t to);

int needsbuffer;

/*
 * Internal update daemon, process 3
 * The variable vfs_update_wakeup allows for internal syncs.
 */
int vfs_update_wakeup;

/*
 * Initialize buffer headers and related structures.
 */
void
bufinit()
{
	struct buf *bp;
	int i;

	TAILQ_INIT(&bswlist);
	LIST_INIT(&invalhash);

	/* first, make a null hash table */
	for (i = 0; i < BUFHSZ; i++)
		LIST_INIT(&bufhashtbl[i]);

	/* next, make a null set of free lists */
	for (i = 0; i < BUFFER_QUEUES; i++)
		TAILQ_INIT(&bufqueues[i]);

	/* finally, initialize each buffer header and stick on empty q */
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bzero(bp, sizeof *bp);
		bp->b_flags = B_INVAL;	/* we're just an empty header */
		bp->b_dev = NODEV;
		bp->b_vp = NULL;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_qindex = QUEUE_EMPTY;
		bp->b_vnbufs.le_next = NOLIST;
		bp->b_data = (caddr_t)kmem_alloc_pageable(buffer_map, MAXBSIZE);
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	}
}

/*
 * Remove the buffer from the appropriate free list.
 */
void
bremfree(struct buf *bp)
{
	int s = splbio();

	if (bp->b_qindex != QUEUE_NONE) {
		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
		bp->b_qindex = QUEUE_NONE;
	} else {
		panic("bremfree: removing a buffer when not on a queue");
	}
	splx(s);
}
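/*
 * Usage sketch (editorial, not part of the original file): a buffer
 * pulled off a free list with bremfree() is marked B_BUSY by the
 * caller under splbio() and eventually requeued through brelse(),
 * as getblk() does below:
 *
 *	s = splbio();
 *	bp->b_flags |= B_BUSY;
 *	bremfree(bp);
 *	splx(s);
 *	...
 *	brelse(bp);
 */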
/*
 * Get a buffer with the specified data.  Look in the cache first.
 */
int
bread(struct vnode *vp, daddr_t blkno, int size, struct ucred *cred,
    struct buf **bpp)
{
	struct buf *bp;

	bp = getblk(vp, blkno, size, 0, 0);
	*bpp = bp;

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc && curproc->p_stats)	/* count block I/O */
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
		if (bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);
		return (biowait(bp));
	}

	return (0);
}

/*
 * Operates like bread, but also starts asynchronous I/O on
 * read-ahead blocks.
 */
int
breadn(struct vnode *vp, daddr_t blkno, int size,
    daddr_t *rablkno, int *rabsize,
    int cnt, struct ucred *cred, struct buf **bpp)
{
	struct buf *bp, *rabp;
	int i;
	int rv = 0, readwait = 0;

	*bpp = bp = getblk(vp, blkno, size, 0, 0);

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc && curproc->p_stats)	/* count block I/O */
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
		if (bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);
		++readwait;
	}

	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
		if (incore(vp, *rablkno))
			continue;
		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);

		if ((rabp->b_flags & B_CACHE) == 0) {
			if (curproc && curproc->p_stats)
				curproc->p_stats->p_ru.ru_inblock++;
			rabp->b_flags |= B_READ | B_ASYNC;
			rabp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
			if (rabp->b_rcred == NOCRED) {
				if (cred != NOCRED)
					crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
		} else {
			brelse(rabp);
		}
	}

	if (readwait)
		rv = biowait(bp);

	return (rv);
}

/*
 * Write, release buffer on completion.  (Done by iodone
 * if async.)
 */
int
bwrite(struct buf *bp)
{
	int oldflags = bp->b_flags;

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return (0);
	}

	if (!(bp->b_flags & B_BUSY))
		panic("bwrite: buffer is not busy???");

	bp->b_flags &= ~(B_READ|B_DONE|B_ERROR|B_DELWRI);
	bp->b_flags |= B_WRITEINPROG;

	if (oldflags & B_ASYNC) {
		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		} else if (curproc && curproc->p_stats) {
			++curproc->p_stats->p_ru.ru_oublock;
		}
	}

	bp->b_vp->v_numoutput++;
	VOP_STRATEGY(bp);

	if ((oldflags & B_ASYNC) == 0) {
		int rtval = biowait(bp);

		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		} else if (curproc && curproc->p_stats) {
			++curproc->p_stats->p_ru.ru_oublock;
		}
		brelse(bp);
		return (rtval);
	}

	return (0);
}

int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return (bwrite(ap->a_bp));
}
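/*
 * Editorial summary: callers choose among three write flavors, all of
 * which funnel through bwrite() above (bdwrite() and bawrite() follow
 * below):
 *
 *	bwrite(bp);	synchronous; waits for completion via biowait()
 *	bdwrite(bp);	delayed; marks B_DELWRI and requeues, I/O happens later
 *	bawrite(bp);	asynchronous; sets B_ASYNC, buffer released in biodone()
 */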
/*
 * Delayed write.  (Buffer is marked dirty.)
 */
void
bdwrite(struct buf *bp)
{

	if ((bp->b_flags & B_BUSY) == 0)
		panic("bdwrite: buffer is not busy");

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return;
	}

	if (bp->b_flags & B_TAPE) {
		bawrite(bp);
		return;
	}

	bp->b_flags &= ~B_READ;
	if ((bp->b_flags & B_DELWRI) == 0) {
		if (curproc && curproc->p_stats)
			++curproc->p_stats->p_ru.ru_oublock;
		bp->b_flags |= B_DONE|B_DELWRI;
		reassignbuf(bp, bp->b_vp);
	}
	brelse(bp);
}

/*
 * Asynchronous write.
 * Start output on a buffer, but do not wait for it to complete.
 * The buffer is released when the output completes.
 */
void
bawrite(struct buf *bp)
{
	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}

/*
 * Release a buffer.
 */
void
brelse(struct buf *bp)
{
	int x;

	/* anyone need a "free" block? */
	x = splbio();
	if (needsbuffer) {
		needsbuffer = 0;
		wakeup((caddr_t)&needsbuffer);
	}

	/* anyone need this block? */
	if (bp->b_flags & B_WANTED) {
		bp->b_flags &= ~(B_WANTED|B_AGE);
		wakeup((caddr_t)bp);
	}

	if (bp->b_flags & B_LOCKED)
		bp->b_flags &= ~B_ERROR;

	if ((bp->b_flags & (B_NOCACHE|B_INVAL|B_ERROR)) ||
	    (bp->b_bufsize <= 0)) {
		bp->b_flags |= B_INVAL;
		bp->b_flags &= ~(B_DELWRI|B_CACHE);
		if (bp->b_vp)
			brelvp(bp);
	}

	if (bp->b_qindex != QUEUE_NONE)
		panic("brelse: free buffer onto another queue???");

	/* enqueue */
	if (bp->b_bufsize == 0) {
		/* buffers with no memory */
		bp->b_qindex = QUEUE_EMPTY;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	} else if (bp->b_flags & (B_ERROR|B_INVAL|B_NOCACHE)) {
		/* buffers with junk contents */
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	} else if (bp->b_flags & B_LOCKED) {
		/* buffers that are locked */
		bp->b_qindex = QUEUE_LOCKED;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
	} else if (bp->b_flags & B_AGE) {
		/* buffers with stale but valid contents */
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
	} else {
		/* buffers with valid and quite possibly reusable contents */
		bp->b_qindex = QUEUE_LRU;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
	}

	/* unlock */
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_NOCACHE|B_AGE);
	splx(x);
}

int freebufspace;
int allocbufspace;
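/*
 * Editorial note, reconstructed from brelse() above and getnewbuf()
 * below: the free lists form a reclaim order of EMPTY, then AGE,
 * then LRU:
 *
 *	QUEUE_EMPTY	headers with no backing store (b_bufsize == 0)
 *	QUEUE_AGE	junk contents (inserted at head) or stale-but-valid
 *			B_AGE contents (inserted at tail)
 *	QUEUE_LRU	valid, likely-reusable contents
 *	QUEUE_LOCKED	B_LOCKED buffers; never reclaimed by getnewbuf()
 */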
/*
 * Find a buffer header which is available for use.
 */
struct buf *
getnewbuf(int slpflag, int slptimeo)
{
	struct buf *bp;
	int s;

	s = splbio();
start:
	/* can we constitute a new buffer? */
	if ((bp = bufqueues[QUEUE_EMPTY].tqh_first) != NULL) {
		if (bp->b_qindex != QUEUE_EMPTY)
			panic("getnewbuf: inconsistent EMPTY queue");
		bremfree(bp);
		goto fillbuf;
	}

	if ((bp = bufqueues[QUEUE_AGE].tqh_first) != NULL) {
		if (bp->b_qindex != QUEUE_AGE)
			panic("getnewbuf: inconsistent AGE queue");
		bremfree(bp);
	} else if ((bp = bufqueues[QUEUE_LRU].tqh_first) != NULL) {
		if (bp->b_qindex != QUEUE_LRU)
			panic("getnewbuf: inconsistent LRU queue");
		bremfree(bp);
	} else {
		/* wait for a free buffer of any kind */
		needsbuffer = 1;
		tsleep((caddr_t)&needsbuffer, PRIBIO, "newbuf", 0);
		splx(s);
		return (0);
	}

	/* if we are a delayed write, convert to an async write */
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_BUSY;
		bawrite(bp);
		goto start;
	}

	if (bp->b_vp)
		brelvp(bp);

	/* we are not free, nor do we contain interesting data */
	if (bp->b_rcred != NOCRED)
		crfree(bp->b_rcred);
	if (bp->b_wcred != NOCRED)
		crfree(bp->b_wcred);
fillbuf:
	bp->b_flags = B_BUSY;
	LIST_REMOVE(bp, b_hash);
	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	splx(s);
	bp->b_dev = NODEV;
	bp->b_vp = NULL;
	bp->b_blkno = bp->b_lblkno = 0;
	bp->b_iodone = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	bp->b_bcount = 0;
	bp->b_wcred = bp->b_rcred = NOCRED;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	return (bp);
}

/*
 * Check to see if a block is currently memory resident.
 */
struct buf *
incore(struct vnode *vp, daddr_t blkno)
{
	struct buf *bp;
	struct bufhashhdr *bh;
	int s = splbio();

	bh = BUFHASH(vp, blkno);
	bp = bh->lh_first;

	/* Search hash chain */
	while (bp) {
		if ((bp < buf) || (bp >= buf + nbuf)) {
			printf("incore: buf out of range: %lx, hash: %d\n",
			    (u_long)bp, (int)(bh - bufhashtbl));
			panic("incore: buf fault");
		}
		/* hit */
		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
		    (bp->b_flags & B_INVAL) == 0) {
			splx(s);
			return (bp);
		}
		bp = bp->b_hash.le_next;
	}
	splx(s);

	return (0);
}

/*
 * Get a block given a specified block and offset into a file/device.
 */
struct buf *
getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo)
{
	struct buf *bp;
	int s;
	struct bufhashhdr *bh;

	s = splbio();
loop:
	if ((bp = incore(vp, blkno)) != NULL) {
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			tsleep((caddr_t)bp, PRIBIO, "getblk", 0);
			goto loop;
		}
		bp->b_flags |= B_BUSY | B_CACHE;
		bremfree(bp);
		/*
		 * check for size inconsistencies
		 */
		if (bp->b_bcount != size) {
			printf("getblk: invalid buffer size: %ld\n",
			    (long)bp->b_bcount);
			bp->b_flags |= B_INVAL;
			bwrite(bp);
			goto loop;
		}
	} else {
		if ((bp = getnewbuf(0, 0)) == 0)
			goto loop;
		allocbuf(bp, size);
		/*
		 * have to check again, because of a possible
		 * race condition.
		 */
		if (incore(vp, blkno)) {
			allocbuf(bp, 0);
			bp->b_flags |= B_INVAL;
			brelse(bp);
			goto loop;
		}
		bp->b_blkno = bp->b_lblkno = blkno;
		bgetvp(vp, bp);
		LIST_REMOVE(bp, b_hash);
		bh = BUFHASH(vp, blkno);
		LIST_INSERT_HEAD(bh, bp, b_hash);
	}
	splx(s);
	return (bp);
}
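/*
 * Usage sketch (editorial; an assumed caller, not code from this
 * file): when a block will be completely overwritten there is no
 * need to read the old contents first, so callers use getblk()
 * directly instead of bread():
 *
 *	bp = getblk(vp, lbn, size, 0, 0);
 *	bzero(bp->b_data, size);
 *	...fill in bp->b_data...
 *	bdwrite(bp);
 */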
/*
 * Get an empty, disassociated buffer of given size.
 */
struct buf *
geteblk(int size)
{
	struct buf *bp;

	/* getnewbuf() sleeps internally when no buffers are free */
	while ((bp = getnewbuf(0, 0)) == 0)
		;
	allocbuf(bp, size);
	bp->b_flags |= B_INVAL;
	return (bp);
}

/*
 * Modify the length of a buffer's underlying buffer storage without
 * destroying information (unless, of course, the buffer is shrinking).
 */
void
allocbuf(struct buf *bp, int size)
{
	int newbsize = round_page(size);

	if (newbsize == bp->b_bufsize) {
		bp->b_bcount = size;
		return;
	} else if (newbsize < bp->b_bufsize) {
		vm_hold_free_pages(
		    (vm_offset_t)bp->b_data + newbsize,
		    (vm_offset_t)bp->b_data + bp->b_bufsize);
	} else if (newbsize > bp->b_bufsize) {
		vm_hold_load_pages(
		    (vm_offset_t)bp->b_data + bp->b_bufsize,
		    (vm_offset_t)bp->b_data + newbsize);
	}

	/* adjust buffer cache's idea of memory allocated to buffer contents */
	freebufspace -= newbsize - bp->b_bufsize;
	allocbufspace += newbsize - bp->b_bufsize;

	bp->b_bufsize = newbsize;
	bp->b_bcount = size;
}

/*
 * Wait for buffer I/O completion, returning error status.
 */
int
biowait(register struct buf *bp)
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		tsleep((caddr_t)bp, PRIBIO, "biowait", 0);
	if ((bp->b_flags & B_ERROR) || bp->b_error) {
		if ((bp->b_flags & B_INVAL) == 0) {
			bp->b_flags |= B_INVAL;
			bp->b_dev = NODEV;
			LIST_REMOVE(bp, b_hash);
			LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		}
		if (!bp->b_error)
			bp->b_error = EIO;
		else
			bp->b_flags |= B_ERROR;
		splx(s);
		return (bp->b_error);
	} else {
		splx(s);
		return (0);
	}
}

/*
 * Finish I/O on a buffer, calling an optional function.
 * This is usually called from interrupt level, so process blocking
 * is not *a good idea*.
 */
void
biodone(register struct buf *bp)
{
	int s;

	s = splbio();
	bp->b_flags |= B_DONE;

	if ((bp->b_flags & B_READ) == 0)
		vwakeup(bp);

	if (bp->b_flags & B_BOUNCE)
		vm_bounce_free(bp);

	/* call optional completion function if requested */
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		splx(s);
		return;
	}

	/*
	 * For asynchronous completions, release the buffer now.  The brelse
	 * checks for B_WANTED and will do the wakeup there if necessary -
	 * so no need to do a wakeup here in the async case.
	 */
	if (bp->b_flags & B_ASYNC) {
		brelse(bp);
	} else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
	splx(s);
}

int
count_lock_queue()
{
	int count;
	struct buf *bp;

	count = 0;
	for (bp = bufqueues[QUEUE_LOCKED].tqh_first;
	    bp != NULL;
	    bp = bp->b_freelist.tqe_next)
		count++;
	return (count);
}

#ifndef UPDATE_INTERVAL
int vfs_update_interval = 30;
#else
int vfs_update_interval = UPDATE_INTERVAL;
#endif

void
vfs_update()
{
	(void) spl0();
	while (1) {
		tsleep((caddr_t)&vfs_update_wakeup, PRIBIO, "update",
		    hz * vfs_update_interval);
		vfs_update_wakeup = 0;
		sync(curproc, NULL, NULL);
	}
}
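/*
 * Sketch (editorial assumption, based on the vfs_update_wakeup
 * comment near the top of this file): a caller wanting an internal
 * sync ahead of the interval would set the flag and wake the daemon
 * out of its tsleep():
 *
 *	vfs_update_wakeup = 1;
 *	wakeup((caddr_t)&vfs_update_wakeup);
 */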
/*
 * these routines are not in the correct place (yet)
 * also they work *ONLY* for kernel_pmap!!!
 */
void
vm_hold_load_pages(vm_offset_t froma, vm_offset_t toa)
{
	vm_offset_t pg;
	vm_page_t p;
	vm_offset_t from = round_page(froma);
	vm_offset_t to = round_page(toa);

	for (pg = from; pg < to; pg += PAGE_SIZE) {

tryagain:
		if (cnt.v_free_count <= cnt.v_free_reserved) {
			VM_WAIT;
			goto tryagain;
		}

		p = vm_page_alloc(kernel_object, pg - VM_MIN_KERNEL_ADDRESS);
		if (!p) {
			VM_WAIT;
			goto tryagain;
		}

		vm_page_wire(p);
		pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
	}
}

void
vm_hold_free_pages(vm_offset_t froma, vm_offset_t toa)
{
	vm_offset_t pg;
	vm_page_t p;
	vm_offset_t from = round_page(froma);
	vm_offset_t to = round_page(toa);

	for (pg = from; pg < to; pg += PAGE_SIZE) {
		p = PHYS_TO_VM_PAGE(pmap_kextract(pg));
		pmap_kremove(pg);
		vm_page_free(p);
	}
}

void
bufstats()
{
}
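/*
 * Editorial note (a sketch, reconstructed from bufinit() and
 * allocbuf() above): b_data is pageable KVA obtained from
 * kmem_alloc_pageable(buffer_map, MAXBSIZE), so it is address space
 * only until allocbuf() wires real pages behind it page by page via
 * vm_hold_load_pages().  Growing a one-page buffer to two pages
 * reduces to:
 *
 *	vm_hold_load_pages((vm_offset_t)bp->b_data + PAGE_SIZE,
 *	    (vm_offset_t)bp->b_data + 2 * PAGE_SIZE);
 */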