vfs_bio.c revision 136767
/*
 * Copyright (c) 1994,1997 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 */

/*
 * this file contains a new buffer I/O scheme implementing a coherent
 * VM object and buffer cache scheme.  Pains have been taken to make
 * sure that the performance degradation associated with schemes such
 * as this is not realized.
 *
 * Author:  John S. Dyson
 * Significant help during the development and debugging phases
 * had been provided by David Greenman, also of the FreeBSD core team.
 *
 * see man buf(9) for more info.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/vfs_bio.c 136767 2004-10-22 08:47:20Z phk $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/conf.h>
#include <sys/buf.h>
#include <sys/devicestat.h>
#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>
#include "opt_directio.h"
#include "opt_swap.h"

static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer");

struct bio_ops bioops;		/* I/O operation notification */

static int ibwrite(struct buf *);
static int inmem(struct vnode * vp, daddr_t blkno);

struct buf_ops buf_ops_bio = {
	"buf_ops_bio",
	ibwrite
};

/*
 * XXX buf is global because kern_shutdown.c and ffs_checkoverlap have
 * carnal knowledge of buffers.  This knowledge should be moved to vfs_bio.c.
 */
struct buf *buf;		/* buffer header pool */

static struct proc *bufdaemonproc;

static void vm_hold_free_pages(struct buf *bp, vm_offset_t from,
		vm_offset_t to);
static void vm_hold_load_pages(struct buf *bp, vm_offset_t from,
		vm_offset_t to);
static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off,
		int pageno, vm_page_t m);
static void vfs_clean_pages(struct buf *bp);
static void vfs_setdirty(struct buf *bp);
static void vfs_vmio_release(struct buf *bp);
static void vfs_backgroundwritedone(struct buf *bp);
static int vfs_bio_clcheck(struct vnode *vp, int size,
		daddr_t lblkno, daddr_t blkno);
static int flushbufqueues(int flushdeps);
static void buf_daemon(void);
void bremfreel(struct buf *bp);

int vmiodirenable = TRUE;
SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0,
    "Use the VM system for directory writes");
int runningbufspace;
SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0,
    "Amount of presently outstanding async buffer io");
static int bufspace;
SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0,
    "KVA memory used for bufs");
static int maxbufspace;
SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0,
    "Maximum allowed value of bufspace (including buf_daemon)");
static int bufmallocspace;
SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
    "Amount of malloced memory for buffers");
static int maxbufmallocspace;
SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0,
    "Maximum amount of malloced memory for buffers");
static int lobufspace;
SYSCTL_INT(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0,
    "Minimum amount of buffers we want to have");
static int hibufspace;
SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0,
    "Maximum allowed value of bufspace (excluding buf_daemon)");
static int bufreusecnt;
SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW, &bufreusecnt, 0,
    "Number of times we have reused a buffer");
static int buffreekvacnt;
SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0,
    "Number of times we have freed the KVA space from some buffer");
static int bufdefragcnt;
SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0,
    "Number of times we have had to repeat buffer allocation to defragment");
static int lorunningspace;
SYSCTL_INT(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW, &lorunningspace, 0,
    "Minimum preferred space used for in-progress I/O");
static int hirunningspace;
SYSCTL_INT(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW, &hirunningspace, 0,
    "Maximum amount of space to use for in-progress I/O");
static int dirtybufferflushes;
SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes,
    0, "Number of bdwrite to bawrite conversions to limit dirty buffers");
static int altbufferflushes;
SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes,
    0, "Number of fsync flushes to limit dirty buffers");
static int recursiveflushes;
SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes,
    0, "Number of flushes skipped due to being recursive");
static int numdirtybuffers;
SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0,
    "Number of buffers that are dirty (have unwritten changes) at the moment");
static int lodirtybuffers;
SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0,
    "How many buffers we want to have free before bufdaemon can sleep");
static int hidirtybuffers;
SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0,
    "When the number of dirty buffers is considered severe");
static int dirtybufthresh;
SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh,
    0, "Number of bdwrite to bawrite conversions to clear dirty buffers");
static int numfreebuffers;
SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0,
    "Number of free buffers");
static int lofreebuffers;
SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0,
    "XXX Unused");
static int hifreebuffers;
SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0,
    "XXX Complicatedly unused");
static int getnewbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0,
    "Number of calls to getnewbuf");
static int getnewbufrestarts;
SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0,
    "Number of times getnewbuf has had to restart a buffer acquisition");
static int dobkgrdwrite = 1;
SYSCTL_INT(_debug, OID_AUTO, dobkgrdwrite, CTLFLAG_RW, &dobkgrdwrite, 0,
    "Do background writes (honoring the BV_BKGRDWRITE flag)?");

/*
 * Wakeup point for bufdaemon, as well as indicator of whether it is already
 * active.  Set to 1 when the bufdaemon is already "on" the queue, 0 when it
 * is idling.
 */
static int bd_request;

/*
 * This lock synchronizes access to bd_request.
 */
static struct mtx bdlock;

/*
 * bogus page -- for I/O to/from partially complete buffers
 * this is a temporary solution to the problem, but it is not
 * really that bad.  it would be better to split the buffer
 * for input in the case of buffers partially already in memory,
 * but the code is intricate enough already.
 */
vm_page_t bogus_page;

/*
 * Synchronization (sleep/wakeup) variable for active buffer space requests.
 * Set when wait starts, cleared prior to wakeup().
 * Used in runningbufwakeup() and waitrunningbufspace().
 */
static int runningbufreq;

/*
 * This lock protects the runningbufreq and synchronizes runningbufwakeup and
 * waitrunningbufspace().
 */
static struct mtx rbreqlock;

/*
 * Synchronization (sleep/wakeup) variable for buffer requests.
 * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
 * by and/or.
 * Used in numdirtywakeup(), bufspacewakeup(), bufcountwakeup(), bwillwrite(),
 * getnewbuf(), and getblk().
 */
static int needsbuffer;

/*
 * Lock that protects needsbuffer and the sleeps/wakeups surrounding it.
 */
static struct mtx nblock;

/*
 * Lock that protects against bwait()/bdone()/B_DONE races.
 */
static struct mtx bdonelock;

/*
 * Definitions for the buffer free lists.
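 *
 * A buffer sits on at most one of these queues at a time; its current
 * queue is recorded in b_qindex, with QUEUE_NONE meaning the buffer is
 * busy and not on any free list.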
 */
#define BUFFER_QUEUES	5	/* number of free buffer queues */

#define QUEUE_NONE	0	/* on no queue */
#define QUEUE_CLEAN	1	/* non-B_DELWRI buffers */
#define QUEUE_DIRTY	2	/* B_DELWRI buffers */
#define QUEUE_EMPTYKVA	3	/* empty buffer headers w/KVA assignment */
#define QUEUE_EMPTY	4	/* empty buffer headers */

/* Queues for free buffers with various properties */
static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };

/* Lock for the bufqueues */
static struct mtx bqlock;

/*
 * Single global constant for BUF_WMESG, to avoid getting multiple references.
 * buf_wmesg is referred from macros.
 */
const char *buf_wmesg = BUF_WMESG;

#define VFS_BIO_NEED_ANY	0x01	/* any freeable buffer */
#define VFS_BIO_NEED_DIRTYFLUSH	0x02	/* waiting for dirty buffer flush */
#define VFS_BIO_NEED_FREE	0x04	/* wait for free bufs, hi hysteresis */
#define VFS_BIO_NEED_BUFSPACE	0x08	/* wait for buf space, lo hysteresis */

#ifdef DIRECTIO
extern void ffs_rawread_setup(void);
#endif /* DIRECTIO */

/*
 * numdirtywakeup:
 *
 *	If someone is blocked due to there being too many dirty buffers,
 *	and numdirtybuffers is now reasonable, wake them up.
 */
static __inline void
numdirtywakeup(int level)
{

	if (numdirtybuffers <= level) {
		mtx_lock(&nblock);
		if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) {
			needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH;
			wakeup(&needsbuffer);
		}
		mtx_unlock(&nblock);
	}
}

/*
 * bufspacewakeup:
 *
 *	Called when buffer space is potentially available for recovery.
 *	getnewbuf() will block on this flag when it is unable to free
 *	sufficient buffer space.  Buffer space becomes recoverable when
 *	bp's get placed back in the queues.
 */
static __inline void
bufspacewakeup(void)
{

	/*
	 * If someone is waiting for BUF space, wake them up.  Even
	 * though we haven't freed the kva space yet, the waiting
	 * process will be able to now.
	 */
	mtx_lock(&nblock);
	if (needsbuffer & VFS_BIO_NEED_BUFSPACE) {
		needsbuffer &= ~VFS_BIO_NEED_BUFSPACE;
		wakeup(&needsbuffer);
	}
	mtx_unlock(&nblock);
}

/*
 * runningbufwakeup() - in-progress I/O accounting.
 */
static __inline void
runningbufwakeup(struct buf *bp)
{

	if (bp->b_runningbufspace) {
		atomic_subtract_int(&runningbufspace, bp->b_runningbufspace);
		bp->b_runningbufspace = 0;
		mtx_lock(&rbreqlock);
		if (runningbufreq && runningbufspace <= lorunningspace) {
			runningbufreq = 0;
			wakeup(&runningbufreq);
		}
		mtx_unlock(&rbreqlock);
	}
}

/*
 * bufcountwakeup:
 *
 *	Called when a buffer has been added to one of the free queues to
 *	account for the buffer and to wakeup anyone waiting for free buffers.
 *	This typically occurs when large amounts of metadata are being handled
 *	by the buffer cache ( else buffer space runs out first, usually ).
 */
static __inline void
bufcountwakeup(void)
{

	atomic_add_int(&numfreebuffers, 1);
	mtx_lock(&nblock);
	if (needsbuffer) {
		needsbuffer &= ~VFS_BIO_NEED_ANY;
		if (numfreebuffers >= hifreebuffers)
			needsbuffer &= ~VFS_BIO_NEED_FREE;
		wakeup(&needsbuffer);
	}
	mtx_unlock(&nblock);
}

/*
 * waitrunningbufspace()
 *
 *	runningbufspace is a measure of the amount of I/O currently
 *	running.
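 *	It is charged in ibwrite() when a write is started and credited
 *	back in runningbufwakeup() when the I/O completes.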
 *	This routine is used in async-write situations to
 *	prevent creating huge backups of pending writes to a device.
 *	Only asynchronous writes are governed by this function.
 *
 *	Reads will adjust runningbufspace, but will not block based on it.
 *	The read load has a side effect of reducing the allowed write load.
 *
 *	This does NOT turn an async write into a sync write.  It waits
 *	for earlier writes to complete and generally returns before the
 *	caller's write has reached the device.
 */
static __inline void
waitrunningbufspace(void)
{

	mtx_lock(&rbreqlock);
	while (runningbufspace > hirunningspace) {
		++runningbufreq;
		msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0);
	}
	mtx_unlock(&rbreqlock);
}

/*
 * vfs_buf_test_cache:
 *
 *	Called when a buffer is extended.  This function clears the B_CACHE
 *	bit if the newly extended portion of the buffer does not contain
 *	valid data.
 */
static __inline
void
vfs_buf_test_cache(struct buf *bp,
		vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
		vm_page_t m)
{

	GIANT_REQUIRED;

	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	if (bp->b_flags & B_CACHE) {
		int base = (foff + off) & PAGE_MASK;
		if (vm_page_is_valid(m, base, size) == 0)
			bp->b_flags &= ~B_CACHE;
	}
}

/* Wake up the buffer daemon if necessary */
static __inline
void
bd_wakeup(int dirtybuflevel)
{

	mtx_lock(&bdlock);
	if (bd_request == 0 && numdirtybuffers >= dirtybuflevel) {
		bd_request = 1;
		wakeup(&bd_request);
	}
	mtx_unlock(&bdlock);
}

/*
 * bd_speedup - speedup the buffer cache flushing code
 */
static __inline
void
bd_speedup(void)
{

	bd_wakeup(1);
}

/*
 * Calculate buffer cache scaling values and reserve space for buffer
 * headers.  This is called during low level kernel initialization and
 * may be called more than once.  We CANNOT write to the memory area
 * being reserved at this time.
 */
caddr_t
kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est)
{

	/*
	 * physmem_est is in pages.  Convert it to kilobytes (assumes
	 * PAGE_SIZE is >= 1K)
	 */
	physmem_est = physmem_est * (PAGE_SIZE / 1024);

	/*
	 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
	 * For the first 64MB of ram nominally allocate sufficient buffers to
	 * cover 1/4 of our ram.  Beyond the first 64MB allocate additional
	 * buffers to cover 1/20 of our ram over 64MB.  When auto-sizing
	 * the buffer cache we limit the eventual kva reservation to
	 * maxbcache bytes.
	 *
	 * factor represents the 1/4 x ram conversion.
	 */
	if (nbuf == 0) {
		int factor = 4 * BKVASIZE / 1024;

		nbuf = 50;
		if (physmem_est > 4096)
			nbuf += min((physmem_est - 4096) / factor,
			    65536 / factor);
		if (physmem_est > 65536)
			nbuf += (physmem_est - 65536) * 2 / (factor * 5);

		if (maxbcache && nbuf > maxbcache / BKVASIZE)
			nbuf = maxbcache / BKVASIZE;
	}

#if 0
	/*
	 * Do not allow the buffer_map to be more than 1/2 the size of the
	 * kernel_map.
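	 * For example, assuming a 16K BKVASIZE and a 1GB kernel_map
	 * span, this cap would limit nbuf to 1GB / 32K = 32768 buffers.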
	 */
	if (nbuf > (kernel_map->max_offset - kernel_map->min_offset) /
	    (BKVASIZE * 2)) {
		nbuf = (kernel_map->max_offset - kernel_map->min_offset) /
		    (BKVASIZE * 2);
		printf("Warning: nbufs capped at %d\n", nbuf);
	}
#endif

	/*
	 * swbufs are used as temporary holders for I/O, such as paging I/O.
	 * We have no fewer than 16 and no more than 256.
	 */
	nswbuf = max(min(nbuf/4, 256), 16);
#ifdef NSWBUF_MIN
	if (nswbuf < NSWBUF_MIN)
		nswbuf = NSWBUF_MIN;
#endif
#ifdef DIRECTIO
	ffs_rawread_setup();
#endif

	/*
	 * Reserve space for the buffer cache buffers
	 */
	swbuf = (void *)v;
	v = (caddr_t)(swbuf + nswbuf);
	buf = (void *)v;
	v = (caddr_t)(buf + nbuf);

	return(v);
}

/* Initialize the buffer subsystem.  Called before use of any buffers. */
void
bufinit(void)
{
	struct buf *bp;
	int i;

	GIANT_REQUIRED;

	mtx_init(&bqlock, "buf queue lock", NULL, MTX_DEF);
	mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
	mtx_init(&nblock, "needsbuffer lock", NULL, MTX_DEF);
	mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
	mtx_init(&bdonelock, "bdone lock", NULL, MTX_DEF);

	/* next, make a null set of free lists */
	for (i = 0; i < BUFFER_QUEUES; i++)
		TAILQ_INIT(&bufqueues[i]);

	/* finally, initialize each buffer header and stick on empty q */
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bzero(bp, sizeof *bp);
		bp->b_flags = B_INVAL;	/* we're just an empty header */
		bp->b_dev = NULL;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_qindex = QUEUE_EMPTY;
		bp->b_vflags = 0;
		bp->b_xflags = 0;
		LIST_INIT(&bp->b_dep);
		BUF_LOCKINIT(bp);
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
	}

	/*
	 * maxbufspace is the absolute maximum amount of buffer space we are
	 * allowed to reserve in KVM and in real terms.  The absolute maximum
	 * is nominally used by buf_daemon.  hibufspace is the nominal maximum
	 * used by most other processes.  The differential is required to
	 * ensure that buf_daemon is able to run when other processes might
	 * be blocked waiting for buffer space.
	 *
	 * maxbufspace is based on BKVASIZE.  Allocating buffers larger than
	 * this may result in KVM fragmentation which is not handled optimally
	 * by the system.
	 */
	maxbufspace = nbuf * BKVASIZE;
	hibufspace = imax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10);
	lobufspace = hibufspace - MAXBSIZE;

	lorunningspace = 512 * 1024;
	hirunningspace = 1024 * 1024;

/*
 * Limit the amount of malloc memory since it is wired permanently into
 * the kernel space.  Even though this is accounted for in the buffer
 * allocation, we don't want the malloced region to grow uncontrolled.
 * The malloc scheme improves memory utilization significantly on average
 * (small) directories.
 */
	maxbufmallocspace = hibufspace / 20;

/*
 * Reduce the chance of a deadlock occurring by limiting the number
 * of delayed-write dirty buffers we allow to stack up.
 */
	hidirtybuffers = nbuf / 4 + 20;
	dirtybufthresh = hidirtybuffers * 9 / 10;
	numdirtybuffers = 0;
/*
 * To support extreme low-memory systems, make sure hidirtybuffers cannot
 * eat up all available buffer space.  This occurs when our minimum cannot
 * be met.  We try to size hidirtybuffers to 3/4 our buffer space assuming
 * BKVASIZE'd (8K) buffers.
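 *
 * For example, assuming the 8K BKVASIZE mentioned above and only 1MB of
 * hibufspace, the loop below halves hidirtybuffers until it is at most
 * 96, since 96 * 8192 == 3 * hibufspace / 4.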
 */
	while (hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
		hidirtybuffers >>= 1;
	}
	lodirtybuffers = hidirtybuffers / 2;

/*
 * Try to keep the number of free buffers in the specified range,
 * and give special processes (e.g. buf_daemon) access to an
 * emergency reserve.
 */
	lofreebuffers = nbuf / 18 + 5;
	hifreebuffers = 2 * lofreebuffers;
	numfreebuffers = nbuf;

/*
 * Maximum number of async ops initiated per buf_daemon loop.  This is
 * somewhat of a hack at the moment, we really need to limit ourselves
 * based on the number of bytes of I/O in-transit that were initiated
 * from buf_daemon.
 */

	bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ |
	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
}

/*
 * bfreekva() - free the kva allocation for a buffer.
 *
 *	Must be called at splbio() or higher as this is the only locking for
 *	buffer_map.
 *
 *	Since this call frees up buffer space, we call bufspacewakeup().
 */
static void
bfreekva(struct buf *bp)
{

	GIANT_REQUIRED;

	if (bp->b_kvasize) {
		atomic_add_int(&buffreekvacnt, 1);
		atomic_subtract_int(&bufspace, bp->b_kvasize);
		vm_map_delete(buffer_map,
		    (vm_offset_t) bp->b_kvabase,
		    (vm_offset_t) bp->b_kvabase + bp->b_kvasize
		);
		bp->b_kvasize = 0;
		bufspacewakeup();
	}
}

/*
 * bremfree:
 *
 *	Remove the buffer from the appropriate free list.
 */
void
bremfree(struct buf *bp)
{

	mtx_lock(&bqlock);
	bremfreel(bp);
	mtx_unlock(&bqlock);
}

void
bremfreel(struct buf *bp)
{
	int s = splbio();
	int old_qindex = bp->b_qindex;

	GIANT_REQUIRED;

	if (bp->b_qindex != QUEUE_NONE) {
		KASSERT(BUF_REFCNT(bp) == 1, ("bremfree: bp %p not locked",bp));
		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
		bp->b_qindex = QUEUE_NONE;
	} else {
		if (BUF_REFCNT(bp) <= 1)
			panic("bremfree: removing a buffer not on a queue");
	}

	/*
	 * Fixup numfreebuffers count.  If the buffer is invalid or not
	 * delayed-write, and it was on the EMPTY, LRU, or AGE queues,
	 * the buffer was free and we must decrement numfreebuffers.
	 */
	if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
		switch(old_qindex) {
		case QUEUE_DIRTY:
		case QUEUE_CLEAN:
		case QUEUE_EMPTY:
		case QUEUE_EMPTYKVA:
			atomic_subtract_int(&numfreebuffers, 1);
			break;
		default:
			break;
		}
	}
	splx(s);
}

/*
 * Get a buffer with the specified data.  Look in the cache first.  We
 * must clear BIO_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE
 * is set, the buffer is valid and we do not have to do anything ( see
 * getblk() ).  This is really just a special case of breadn().
 */
int
bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
    struct buf **bpp)
{

	return (breadn(vp, blkno, size, 0, 0, 0, cred, bpp));
}

/*
 * Operates like bread, but also starts asynchronous I/O on
 * read-ahead blocks.  We must clear BIO_ERROR and B_INVAL prior
 * to initiating I/O.  If B_CACHE is set, the buffer is valid
 * and we do not have to do anything.
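 *
 * A typical call reading one block plus one read-ahead block might look
 * like the following sketch (lbn, rabn and bsize are hypothetical caller
 * variables, not part of this file):
 *
 *	error = breadn(vp, lbn, bsize, &rabn, &bsize, 1, NOCRED, &bp);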
 */
int
breadn(struct vnode * vp, daddr_t blkno, int size,
    daddr_t * rablkno, int *rabsize,
    int cnt, struct ucred * cred, struct buf **bpp)
{
	struct buf *bp, *rabp;
	int i;
	int rv = 0, readwait = 0;

	*bpp = bp = getblk(vp, blkno, size, 0, 0, 0);

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curthread != PCPU_GET(idlethread))
			curthread->td_proc->p_stats->p_ru.ru_inblock++;
		bp->b_iocmd = BIO_READ;
		bp->b_flags &= ~B_INVAL;
		bp->b_ioflags &= ~BIO_ERROR;
		if (bp->b_rcred == NOCRED && cred != NOCRED)
			bp->b_rcred = crhold(cred);
		vfs_busy_pages(bp, 0);
		bp->b_iooffset = dbtob(bp->b_blkno);
		if (vp->v_type == VCHR)
			VOP_SPECSTRATEGY(vp, bp);
		else
			VOP_STRATEGY(vp, bp);
		++readwait;
	}

	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
		if (inmem(vp, *rablkno))
			continue;
		rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0);

		if ((rabp->b_flags & B_CACHE) == 0) {
			if (curthread != PCPU_GET(idlethread))
				curthread->td_proc->p_stats->p_ru.ru_inblock++;
			rabp->b_flags |= B_ASYNC;
			rabp->b_flags &= ~B_INVAL;
			rabp->b_ioflags &= ~BIO_ERROR;
			rabp->b_iocmd = BIO_READ;
			if (rabp->b_rcred == NOCRED && cred != NOCRED)
				rabp->b_rcred = crhold(cred);
			vfs_busy_pages(rabp, 0);
			BUF_KERNPROC(rabp);
			rabp->b_iooffset = dbtob(rabp->b_blkno);
			if (vp->v_type == VCHR)
				VOP_SPECSTRATEGY(vp, rabp);
			else
				VOP_STRATEGY(vp, rabp);
		} else {
			brelse(rabp);
		}
	}

	if (readwait) {
		rv = bufwait(bp);
	}
	return (rv);
}

/*
 * Write, release buffer on completion.  (Done by iodone
 * if async).  Do not bother writing anything if the buffer
 * is invalid.
 *
 * Note that we set B_CACHE here, indicating that the buffer is
 * fully valid and thus cacheable.  This is true even of NFS
 * now so we set it generally.  This could be set either here
 * or in biodone() since the I/O is synchronous.  We put it
 * here.
 */
int
bwrite(struct buf *bp)
{

	KASSERT(bp->b_op != NULL && bp->b_op->bop_write != NULL,
	    ("Martian buffer %p in bwrite: nobody to write it.", bp));
	return (bp->b_op->bop_write(bp));
}

static int
ibwrite(struct buf *bp)
{
	int oldflags, s;
	struct buf *newbp;

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return (0);
	}

	oldflags = bp->b_flags;

	if (BUF_REFCNT(bp) == 0)
		panic("ibwrite: buffer is not busy???");
	s = splbio();
	/*
	 * If a background write is already in progress, delay
	 * writing this block if it is asynchronous.  Otherwise
	 * wait for the background write to complete.
	 */
	BO_LOCK(bp->b_bufobj);
	if (bp->b_vflags & BV_BKGRDINPROG) {
		if (bp->b_flags & B_ASYNC) {
			BO_UNLOCK(bp->b_bufobj);
			splx(s);
			bdwrite(bp);
			return (0);
		}
		bp->b_vflags |= BV_BKGRDWAIT;
		msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj), PRIBIO, "bwrbg", 0);
		if (bp->b_vflags & BV_BKGRDINPROG)
			panic("ibwrite: still writing");
	}
	BO_UNLOCK(bp->b_bufobj);

	/* Mark the buffer clean */
	bundirty(bp);

	/*
	 * If this buffer is marked for background writing and we
	 * do not have to wait for it, make a copy and write the
	 * copy so as to leave this buffer ready for further use.
	 *
	 * This optimization eats a lot of memory.  If we have a page
	 * or buffer shortfall we can't do it.
	 */
	if (dobkgrdwrite && (bp->b_xflags & BX_BKGRDWRITE) &&
	    (bp->b_flags & B_ASYNC) &&
	    !vm_page_count_severe() &&
	    !buf_dirty_count_severe()) {
		KASSERT(bp->b_iodone == NULL,
		    ("bufwrite: needs chained iodone (%p)", bp->b_iodone));

		/* get a new block */
		newbp = geteblk(bp->b_bufsize);

		/*
		 * set it to be identical to the old block.  We have to
		 * set b_lblkno and BKGRDMARKER before calling bgetvp()
		 * to avoid confusing the splay tree and gbincore().
		 */
		memcpy(newbp->b_data, bp->b_data, bp->b_bufsize);
		newbp->b_lblkno = bp->b_lblkno;
		newbp->b_xflags |= BX_BKGRDMARKER;
		BO_LOCK(bp->b_bufobj);
		bp->b_vflags |= BV_BKGRDINPROG;
		bgetvp(bp->b_vp, newbp);
		BO_UNLOCK(bp->b_bufobj);
		newbp->b_bufobj = &bp->b_vp->v_bufobj;
		newbp->b_blkno = bp->b_blkno;
		newbp->b_offset = bp->b_offset;
		newbp->b_iodone = vfs_backgroundwritedone;
		newbp->b_flags |= B_ASYNC;
		newbp->b_flags &= ~B_INVAL;

		/* move over the dependencies */
		if (LIST_FIRST(&bp->b_dep) != NULL)
			buf_movedeps(bp, newbp);

		/*
		 * Initiate write on the copy, release the original to
		 * the B_LOCKED queue so that it cannot go away until
		 * the background write completes.  If not locked it could go
		 * away and then be reconstituted while it was being written.
		 * If the reconstituted buffer were written, we could end up
		 * with two background copies being written at the same time.
		 */
		bqrelse(bp);
		bp = newbp;
	}

	bp->b_flags &= ~B_DONE;
	bp->b_ioflags &= ~BIO_ERROR;
	bp->b_flags |= B_CACHE;
	bp->b_iocmd = BIO_WRITE;

	bufobj_wref(bp->b_bufobj);
	vfs_busy_pages(bp, 1);

	/*
	 * Normal bwrites pipeline writes
	 */
	bp->b_runningbufspace = bp->b_bufsize;
	atomic_add_int(&runningbufspace, bp->b_runningbufspace);

	if (curthread != PCPU_GET(idlethread))
		curthread->td_proc->p_stats->p_ru.ru_oublock++;
	splx(s);
	if (oldflags & B_ASYNC)
		BUF_KERNPROC(bp);
	bp->b_iooffset = dbtob(bp->b_blkno);
	if (bp->b_vp->v_type == VCHR) {
		if (!buf_prewrite(bp->b_vp, bp))
			VOP_SPECSTRATEGY(bp->b_vp, bp);
	} else {
		VOP_STRATEGY(bp->b_vp, bp);
	}

	if ((oldflags & B_ASYNC) == 0) {
		int rtval = bufwait(bp);
		brelse(bp);
		return (rtval);
	} else {
		/*
		 * don't allow the async write to saturate the I/O
		 * system.  We will not deadlock here because
		 * we are blocking waiting for I/O that is already in-progress
		 * to complete.  We do not block here if it is the update
		 * or syncer daemon trying to clean up as that can lead
		 * to deadlock.
		 */
		if (curthread->td_proc != bufdaemonproc &&
		    curthread->td_proc != updateproc)
			waitrunningbufspace();
	}

	return (0);
}

/*
 * Complete a background write started from bwrite.
 */
static void
vfs_backgroundwritedone(struct buf *bp)
{
	struct buf *origbp;

	/*
	 * Find the original buffer that we are writing.
	 */
	BO_LOCK(bp->b_bufobj);
	if ((origbp = gbincore(bp->b_bufobj, bp->b_lblkno)) == NULL)
		panic("backgroundwritedone: lost buffer");

	/*
	 * Clear the BV_BKGRDINPROG flag in the original buffer
	 * and awaken it if it is waiting for the write to complete.
	 * If BV_BKGRDINPROG is not set in the original buffer it must
	 * have been released and re-instantiated - which is not legal.
	 */
	KASSERT((origbp->b_vflags & BV_BKGRDINPROG),
	    ("backgroundwritedone: lost buffer2"));
	origbp->b_vflags &= ~BV_BKGRDINPROG;
	if (origbp->b_vflags & BV_BKGRDWAIT) {
		origbp->b_vflags &= ~BV_BKGRDWAIT;
		wakeup(&origbp->b_xflags);
	}
	BO_UNLOCK(bp->b_bufobj);
	/*
	 * Process dependencies then return any unfinished ones.
	 */
	if (LIST_FIRST(&bp->b_dep) != NULL)
		buf_complete(bp);
	if (LIST_FIRST(&bp->b_dep) != NULL)
		buf_movedeps(bp, origbp);

	/*
	 * This buffer is marked B_NOCACHE, so when it is released
	 * by biodone, it will be tossed.  We mark it with BIO_READ
	 * to avoid biodone doing a second bufobj_wdrop.
	 */
	bp->b_flags |= B_NOCACHE;
	bp->b_iocmd = BIO_READ;
	bp->b_flags &= ~(B_CACHE | B_DONE);
	bp->b_iodone = 0;
	bufdone(bp);
}

/*
 * Delayed write.  (Buffer is marked dirty).  Do not bother writing
 * anything if the buffer is marked invalid.
 *
 * Note that since the buffer must be completely valid, we can safely
 * set B_CACHE.  In fact, we have to set B_CACHE here rather than in
 * biodone() in order to prevent getblk from writing the buffer
 * out synchronously.
 */
void
bdwrite(struct buf *bp)
{
	struct thread *td = curthread;
	struct vnode *vp;
	struct buf *nbp;
	struct bufobj *bo;

	GIANT_REQUIRED;

	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
	KASSERT(BUF_REFCNT(bp) != 0, ("bdwrite: buffer is not busy"));

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return;
	}

	/*
	 * If we have too many dirty buffers, don't create any more.
	 * If we are wildly over our limit, then force a complete
	 * cleanup.  Otherwise, just keep the situation from getting
	 * out of control.  Note that we have to avoid a recursive
	 * disaster and not try to clean up after our own cleanup!
	 */
	vp = bp->b_vp;
	bo = bp->b_bufobj;
	BO_LOCK(bo);
	if (td->td_pflags & TDP_COWINPROGRESS) {
		recursiveflushes++;
	} else if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) {
		BO_UNLOCK(bo);
		(void) VOP_FSYNC(vp, td->td_ucred, MNT_NOWAIT, td);
		BO_LOCK(bo);
		altbufferflushes++;
	} else if (bo->bo_dirty.bv_cnt > dirtybufthresh) {
		/*
		 * Try to find a buffer to flush.
		 */
		TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
			if ((nbp->b_vflags & BV_BKGRDINPROG) ||
			    buf_countdeps(nbp, 0) ||
			    BUF_LOCK(nbp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
				continue;
			if (bp == nbp)
				panic("bdwrite: found ourselves");
			BO_UNLOCK(bo);
			if (nbp->b_flags & B_CLUSTEROK) {
				vfs_bio_awrite(nbp);
			} else {
				bremfree(nbp);
				bawrite(nbp);
			}
			BO_LOCK(bo);
			dirtybufferflushes++;
			break;
		}
	}
	BO_UNLOCK(bo);

	bdirty(bp);
	/*
	 * Set B_CACHE, indicating that the buffer is fully valid.  This is
	 * true even of NFS now.
	 */
	bp->b_flags |= B_CACHE;

	/*
	 * This bmap keeps the system from needing to do the bmap later,
	 * perhaps when the system is attempting to do a sync.  Since it
	 * is likely that the indirect block -- or whatever other data
	 * structure that the filesystem needs is still in memory now, it
	 * is a good thing to do this.  Note also, that if the pageout
	 * daemon is requesting a sync -- there might not be enough memory
	 * to do the bmap then...  So, this is important to do.
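	 *
	 * (An as-yet-unmapped buffer comes back from getblk() with
	 * b_blkno == b_lblkno, which is what the check below tests for.)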
	 */
	if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) {
		VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
	}

	/*
	 * Set the *dirty* buffer range based upon the VM system dirty pages.
	 */
	vfs_setdirty(bp);

	/*
	 * We need to do this here to satisfy the vnode_pager and the
	 * pageout daemon, so that it thinks that the pages have been
	 * "cleaned".  Note that since the pages are in a delayed write
	 * buffer -- the VFS layer "will" see that the pages get written
	 * out on the next sync, or perhaps the cluster will be completed.
	 */
	vfs_clean_pages(bp);
	bqrelse(bp);

	/*
	 * Wakeup the buffer flushing daemon if we have a lot of dirty
	 * buffers (midpoint between our recovery point and our stall
	 * point).
	 */
	bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);

	/*
	 * note: we cannot initiate I/O from a bdwrite even if we wanted to,
	 * due to the softdep code.
	 */
}

/*
 * bdirty:
 *
 *	Turn buffer into delayed write request.  We must clear BIO_READ and
 *	B_RELBUF, and we must set B_DELWRI.  We reassign the buffer to
 *	itself to properly update it in the dirty/clean lists.  We mark it
 *	B_DONE to ensure that any asynchronization of the buffer properly
 *	clears B_DONE ( else a panic will occur later ).
 *
 *	bdirty() is kinda like bdwrite() - we have to clear B_INVAL which
 *	might have been set pre-getblk().  Unlike bwrite/bdwrite, bdirty()
 *	should only be called if the buffer is known-good.
 *
 *	Since the buffer is not on a queue, we do not update the numfreebuffers
 *	count.
 *
 *	Must be called at splbio().
 *	The buffer must be on QUEUE_NONE.
 */
void
bdirty(struct buf *bp)
{

	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
	KASSERT(bp->b_qindex == QUEUE_NONE,
	    ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex));
	bp->b_flags &= ~(B_RELBUF);
	bp->b_iocmd = BIO_WRITE;

	if ((bp->b_flags & B_DELWRI) == 0) {
		bp->b_flags |= B_DONE | B_DELWRI;
		reassignbuf(bp);
		atomic_add_int(&numdirtybuffers, 1);
		bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
	}
}

/*
 * bundirty:
 *
 *	Clear B_DELWRI for buffer.
 *
 *	Since the buffer is not on a queue, we do not update the numfreebuffers
 *	count.
 *
 *	Must be called at splbio().
 *	The buffer must be on QUEUE_NONE.
 */
void
bundirty(struct buf *bp)
{

	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
	KASSERT(bp->b_qindex == QUEUE_NONE,
	    ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex));

	if (bp->b_flags & B_DELWRI) {
		bp->b_flags &= ~B_DELWRI;
		reassignbuf(bp);
		atomic_subtract_int(&numdirtybuffers, 1);
		numdirtywakeup(lodirtybuffers);
	}
	/*
	 * Since it is now being written, we can clear its deferred write flag.
	 */
	bp->b_flags &= ~B_DEFERRED;
}

/*
 * bawrite:
 *
 *	Asynchronous write.  Start output on a buffer, but do not wait for
 *	it to complete.  The buffer is released when the output completes.
 *
 *	bwrite() ( or the VOP routine anyway ) is responsible for handling
 *	B_INVAL buffers.  Not us.
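 *
 *	(This amounts to bwrite() with B_ASYNC set; the buffer is then
 *	released from the iodone path once the write completes.)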
 */
void
bawrite(struct buf *bp)
{

	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}

/*
 * bwillwrite:
 *
 *	Called prior to the locking of any vnodes when we are expecting to
 *	write.  We do not want to starve the buffer cache with too many
 *	dirty buffers so we block here.  By blocking prior to the locking
 *	of any vnodes we attempt to avoid the situation where a locked vnode
 *	prevents the various system daemons from flushing related buffers.
 */
void
bwillwrite(void)
{

	if (numdirtybuffers >= hidirtybuffers) {
		int s;

		mtx_lock(&Giant);
		s = splbio();
		mtx_lock(&nblock);
		while (numdirtybuffers >= hidirtybuffers) {
			bd_wakeup(1);
			needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH;
			msleep(&needsbuffer, &nblock,
			    (PRIBIO + 4), "flswai", 0);
		}
		splx(s);
		mtx_unlock(&nblock);
		mtx_unlock(&Giant);
	}
}

/*
 * Return true if we have too many dirty buffers.
 */
int
buf_dirty_count_severe(void)
{

	return(numdirtybuffers >= hidirtybuffers);
}

/*
 * brelse:
 *
 *	Release a busy buffer and, if requested, free its resources.  The
 *	buffer will be stashed in the appropriate bufqueue[] allowing it
 *	to be accessed later as a cache entity or reused for other purposes.
 */
void
brelse(struct buf *bp)
{
	int s;

	GIANT_REQUIRED;

	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
	    ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));

	s = splbio();

	if (bp->b_iocmd == BIO_WRITE &&
	    (bp->b_ioflags & BIO_ERROR) &&
	    !(bp->b_flags & B_INVAL)) {
		/*
		 * Failed write, redirty.  Must clear BIO_ERROR to prevent
		 * pages from being scrapped.  If B_INVAL is set then
		 * this case is not run and the next case is run to
		 * destroy the buffer.  B_INVAL can occur if the buffer
		 * is outside the range supported by the underlying device.
		 */
		bp->b_ioflags &= ~BIO_ERROR;
		bdirty(bp);
	} else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) ||
	    (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) {
		/*
		 * Either a failed I/O or we were asked to free or not
		 * cache the buffer.
		 */
		bp->b_flags |= B_INVAL;
		if (LIST_FIRST(&bp->b_dep) != NULL)
			buf_deallocate(bp);
		if (bp->b_flags & B_DELWRI) {
			atomic_subtract_int(&numdirtybuffers, 1);
			numdirtywakeup(lodirtybuffers);
		}
		bp->b_flags &= ~(B_DELWRI | B_CACHE);
		if ((bp->b_flags & B_VMIO) == 0) {
			if (bp->b_bufsize)
				allocbuf(bp, 0);
			if (bp->b_vp)
				brelvp(bp);
		}
	}

	/*
	 * We must clear B_RELBUF if B_DELWRI is set.  If vfs_vmio_release()
	 * is called with B_DELWRI set, the underlying pages may wind up
	 * getting freed causing a previous write (bdwrite()) to get 'lost'
	 * because pages associated with a B_DELWRI bp are marked clean.
	 *
	 * We still allow the B_INVAL case to call vfs_vmio_release(), even
	 * if B_DELWRI is set.
	 *
	 * If B_DELWRI is not set we may have to set B_RELBUF if we are low
	 * on pages to return pages to the VM page queues.
	 */
	if (bp->b_flags & B_DELWRI)
		bp->b_flags &= ~B_RELBUF;
	else if (vm_page_count_severe()) {
		/*
		 * XXX This lock may not be necessary since BKGRDINPROG
		 * cannot be set while we hold the buf lock, it can only be
		 * cleared if it is already pending.
		 */
		if (bp->b_vp) {
			BO_LOCK(bp->b_bufobj);
			if (!(bp->b_vflags & BV_BKGRDINPROG))
				bp->b_flags |= B_RELBUF;
			BO_UNLOCK(bp->b_bufobj);
		} else
			bp->b_flags |= B_RELBUF;
	}

	/*
	 * VMIO buffer rundown.  It is not very necessary to keep a VMIO buffer
	 * constituted, not even NFS buffers now.  Two flags affect this.  If
	 * B_INVAL, the struct buf is invalidated but the VM object is kept
	 * around ( i.e. so it is trivial to reconstitute the buffer later ).
	 *
	 * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be
	 * invalidated.  BIO_ERROR cannot be set for a failed write unless the
	 * buffer is also B_INVAL because it hits the re-dirtying code above.
	 *
	 * Normally we can do this whether a buffer is B_DELWRI or not.  If
	 * the buffer is an NFS buffer, it is tracking piecemeal writes or
	 * the commit state and we cannot afford to lose the buffer.  If the
	 * buffer has a background write in progress, we need to keep it
	 * around to prevent it from being reconstituted and starting a second
	 * background write.
	 */
	if ((bp->b_flags & B_VMIO)
	    && !(bp->b_vp->v_mount != NULL &&
		(bp->b_vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 &&
		!vn_isdisk(bp->b_vp, NULL) &&
		(bp->b_flags & B_DELWRI))
	    ) {

		int i, j, resid;
		vm_page_t m;
		off_t foff;
		vm_pindex_t poff;
		vm_object_t obj;

		obj = bp->b_object;

		/*
		 * Get the base offset and length of the buffer.  Note that
		 * in the VMIO case if the buffer block size is not
		 * page-aligned then b_data pointer may not be page-aligned.
		 * But our b_pages[] array *IS* page aligned.
		 *
		 * block sizes less than DEV_BSIZE (usually 512) are not
		 * supported due to the page granularity bits (m->valid,
		 * m->dirty, etc...).
		 *
		 * See man buf(9) for more information
		 */
		resid = bp->b_bufsize;
		foff = bp->b_offset;
		VM_OBJECT_LOCK(obj);
		for (i = 0; i < bp->b_npages; i++) {
			int had_bogus = 0;

			m = bp->b_pages[i];

			/*
			 * If we hit a bogus page, fixup *all* the bogus pages
			 * now.
			 */
			if (m == bogus_page) {
				poff = OFF_TO_IDX(bp->b_offset);
				had_bogus = 1;

				for (j = i; j < bp->b_npages; j++) {
					vm_page_t mtmp;
					mtmp = bp->b_pages[j];
					if (mtmp == bogus_page) {
						mtmp = vm_page_lookup(obj, poff + j);
						if (!mtmp) {
							panic("brelse: page missing\n");
						}
						bp->b_pages[j] = mtmp;
					}
				}

				if ((bp->b_flags & B_INVAL) == 0) {
					pmap_qenter(
					    trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
				}
				m = bp->b_pages[i];
			}
			if ((bp->b_flags & B_NOCACHE) ||
			    (bp->b_ioflags & BIO_ERROR)) {
				int poffset = foff & PAGE_MASK;
				int presid = resid > (PAGE_SIZE - poffset) ?
					(PAGE_SIZE - poffset) : resid;

				KASSERT(presid >= 0, ("brelse: extra page"));
				vm_page_lock_queues();
				vm_page_set_invalid(m, poffset, presid);
				vm_page_unlock_queues();
				if (had_bogus)
					printf("avoided corruption bug in bogus_page/brelse code\n");
			}
			resid -= PAGE_SIZE - (foff & PAGE_MASK);
			foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
		}
		VM_OBJECT_UNLOCK(obj);
		if (bp->b_flags & (B_INVAL | B_RELBUF))
			vfs_vmio_release(bp);

	} else if (bp->b_flags & B_VMIO) {

		if (bp->b_flags & (B_INVAL | B_RELBUF)) {
			vfs_vmio_release(bp);
		}

	}

	if (bp->b_qindex != QUEUE_NONE)
		panic("brelse: free buffer onto another queue???");
	if (BUF_REFCNT(bp) > 1) {
		/* do not release to free list */
		BUF_UNLOCK(bp);
		splx(s);
		return;
	}

	/* enqueue */
	mtx_lock(&bqlock);

	/* buffers with no memory */
	if (bp->b_bufsize == 0) {
		bp->b_flags |= B_INVAL;
		bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
		if (bp->b_vflags & BV_BKGRDINPROG)
			panic("losing buffer 1");
		if (bp->b_kvasize) {
			bp->b_qindex = QUEUE_EMPTYKVA;
		} else {
			bp->b_qindex = QUEUE_EMPTY;
		}
		TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
		bp->b_dev = NULL;
	/* buffers with junk contents */
	} else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) ||
	    (bp->b_ioflags & BIO_ERROR)) {
		bp->b_flags |= B_INVAL;
		bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
		if (bp->b_vflags & BV_BKGRDINPROG)
			panic("losing buffer 2");
		bp->b_qindex = QUEUE_CLEAN;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
		bp->b_dev = NULL;
	/* remaining buffers */
	} else {
		if (bp->b_flags & B_DELWRI)
			bp->b_qindex = QUEUE_DIRTY;
		else
			bp->b_qindex = QUEUE_CLEAN;
		if (bp->b_flags & B_AGE)
			TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
		else
			TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
	}
	mtx_unlock(&bqlock);

	/*
	 * If B_INVAL and B_DELWRI is set, clear B_DELWRI.  We have already
	 * placed the buffer on the correct queue.  We must also disassociate
	 * the device and vnode for a B_INVAL buffer so gbincore() doesn't
	 * find it.
	 */
	if (bp->b_flags & B_INVAL) {
		if (bp->b_flags & B_DELWRI)
			bundirty(bp);
		if (bp->b_vp)
			brelvp(bp);
	}

	/*
	 * Fixup numfreebuffers count.  The bp is on an appropriate queue
	 * unless locked.  We then bump numfreebuffers if it is not B_DELWRI.
	 * We've already handled the B_INVAL case ( B_DELWRI will be clear
	 * if B_INVAL is set ).
	 */
	if (!(bp->b_flags & B_DELWRI))
		bufcountwakeup();

	/*
	 * Something we can maybe free or reuse
	 */
	if (bp->b_bufsize || bp->b_kvasize)
		bufspacewakeup();

	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT);
	if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
		panic("brelse: not dirty");
	/* unlock */
	BUF_UNLOCK(bp);
	splx(s);
}

/*
 * Release a buffer back to the appropriate queue but do not try to free
 * it.  The buffer is expected to be used again soon.
 *
 * bqrelse() is used by bdwrite() to requeue a delayed write, and used by
 * biodone() to requeue an async I/O on completion.
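 * Unlike brelse(), it does not attempt to release the buffer's backing
 * pages unless the page count is severe, in which case it falls back to
 * brelse().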
 * It is also used when known good buffers need to be requeued but we
 * think we may need the data again soon.
 *
 * XXX we should be able to leave the B_RELBUF hint set on completion.
 */
void
bqrelse(struct buf *bp)
{
	int s;

	s = splbio();

	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
	    ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));

	if (bp->b_qindex != QUEUE_NONE)
		panic("bqrelse: free buffer onto another queue???");
	if (BUF_REFCNT(bp) > 1) {
		/* do not release to free list */
		BUF_UNLOCK(bp);
		splx(s);
		return;
	}
	mtx_lock(&bqlock);
	/* buffers with stale but valid contents */
	if (bp->b_flags & B_DELWRI) {
		bp->b_qindex = QUEUE_DIRTY;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
	} else {
		/*
		 * XXX This lock may not be necessary since BKGRDINPROG
		 * cannot be set while we hold the buf lock, it can only be
		 * cleared if it is already pending.
		 */
		BO_LOCK(bp->b_bufobj);
		if (!vm_page_count_severe() || bp->b_vflags & BV_BKGRDINPROG) {
			BO_UNLOCK(bp->b_bufobj);
			bp->b_qindex = QUEUE_CLEAN;
			TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp,
			    b_freelist);
		} else {
			/*
			 * We are too low on memory, we have to try to free
			 * the buffer (most importantly: the wired pages
			 * making up its backing store) *now*.
			 */
			BO_UNLOCK(bp->b_bufobj);
			mtx_unlock(&bqlock);
			splx(s);
			brelse(bp);
			return;
		}
	}
	mtx_unlock(&bqlock);

	if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))
		bufcountwakeup();

	/*
	 * Something we can maybe free or reuse.
	 */
	if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
		bufspacewakeup();

	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
	if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
		panic("bqrelse: not dirty");
	/* unlock */
	BUF_UNLOCK(bp);
	splx(s);
}

/* Give pages used by the bp back to the VM system (where possible) */
static void
vfs_vmio_release(struct buf *bp)
{
	int i;
	vm_page_t m;

	GIANT_REQUIRED;
	VM_OBJECT_LOCK(bp->b_object);
	vm_page_lock_queues();
	for (i = 0; i < bp->b_npages; i++) {
		m = bp->b_pages[i];
		bp->b_pages[i] = NULL;
		/*
		 * In order to keep page LRU ordering consistent, put
		 * everything on the inactive queue.
		 */
		vm_page_unwire(m, 0);
		/*
		 * We don't mess with busy pages, it is
		 * the responsibility of the process that
		 * busied the pages to deal with them.
		 */
		if ((m->flags & PG_BUSY) || (m->busy != 0))
			continue;

		if (m->wire_count == 0) {
			/*
			 * Might as well free the page if we can and it has
			 * no valid data.
			 * We also free the page if the
			 * buffer was used for direct I/O
			 */
			if ((bp->b_flags & B_ASYNC) == 0 && !m->valid &&
			    m->hold_count == 0) {
				vm_page_busy(m);
				pmap_remove_all(m);
				vm_page_free(m);
			} else if (bp->b_flags & B_DIRECT) {
				vm_page_try_to_free(m);
			} else if (vm_page_count_severe()) {
				vm_page_try_to_cache(m);
			}
		}
	}
	vm_page_unlock_queues();
	VM_OBJECT_UNLOCK(bp->b_object);
	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);

	if (bp->b_bufsize) {
		bufspacewakeup();
		bp->b_bufsize = 0;
	}
	bp->b_npages = 0;
	bp->b_flags &= ~B_VMIO;
	if (bp->b_vp)
		brelvp(bp);
}

/*
 * Check to see if a block at a particular lbn is available for a clustered
 * write.
 */
static int
vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno)
{
	struct buf *bpa;
	int match;

	match = 0;

	/* If the buf isn't in core skip it */
	if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL)
		return (0);

	/* If the buf is busy we don't want to wait for it */
	if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
		return (0);

	/* Only cluster with valid clusterable delayed write buffers */
	if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) !=
	    (B_DELWRI | B_CLUSTEROK))
		goto done;

	if (bpa->b_bufsize != size)
		goto done;

	/*
	 * Check to see if it is in the expected place on disk and that the
	 * block has been mapped.
	 */
	if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno))
		match = 1;
done:
	BUF_UNLOCK(bpa);
	return (match);
}

/*
 * vfs_bio_awrite:
 *
 *	Implement clustered async writes for clearing out B_DELWRI buffers.
 *	This is much better than the old way of writing only one buffer at
 *	a time.  Note that we may not be presented with the buffers in the
 *	correct order, so we search for the cluster in both directions.
 */
int
vfs_bio_awrite(struct buf *bp)
{
	int i;
	int j;
	daddr_t lblkno = bp->b_lblkno;
	struct vnode *vp = bp->b_vp;
	int s;
	int ncl;
	int nwritten;
	int size;
	int maxcl;

	s = splbio();
	/*
	 * right now we support clustered writing only to regular files.  If
	 * we find a clusterable block we could be in the middle of a cluster
	 * rather than at the beginning.
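	 *
	 * The scans below probe adjacent logical blocks with
	 * vfs_bio_clcheck().  For example, with a 16K f_iosize and the
	 * usual 128K MAXPHYS, maxcl is 8, so at most eight contiguous
	 * delayed-write buffers are combined into one cluster write.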
	 */
	if ((vp->v_type == VREG) &&
	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {

		size = vp->v_mount->mnt_stat.f_iosize;
		maxcl = MAXPHYS / size;

		VI_LOCK(vp);
		for (i = 1; i < maxcl; i++)
			if (vfs_bio_clcheck(vp, size, lblkno + i,
			    bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0)
				break;

		for (j = 1; i + j <= maxcl && j <= lblkno; j++)
			if (vfs_bio_clcheck(vp, size, lblkno - j,
			    bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0)
				break;

		VI_UNLOCK(vp);
		--j;
		ncl = i + j;
		/*
		 * this is a possible cluster write
		 */
		if (ncl != 1) {
			BUF_UNLOCK(bp);
			nwritten = cluster_wbuild(vp, size, lblkno - j, ncl);
			splx(s);
			return nwritten;
		}
	}

	bremfree(bp);
	bp->b_flags |= B_ASYNC;

	splx(s);
	/*
	 * default (old) behavior, writing out only one block
	 *
	 * XXX returns b_bufsize instead of b_bcount for nwritten?
	 */
	nwritten = bp->b_bufsize;
	(void) bwrite(bp);

	return nwritten;
}

/*
 * getnewbuf:
 *
 *	Find and initialize a new buffer header, freeing up existing buffers
 *	in the bufqueues as necessary.  The new buffer is returned locked.
 *
 *	Important:  B_INVAL is not set.  If the caller wishes to throw the
 *	buffer away, the caller must set B_INVAL prior to calling brelse().
 *
 *	We block if:
 *		We have insufficient buffer headers
 *		We have insufficient buffer space
 *		buffer_map is too fragmented ( space reservation fails )
 *		If we have to flush dirty buffers ( but we try to avoid this )
 *
 *	To avoid VFS layer recursion we do not flush dirty buffers ourselves.
 *	Instead we ask the buf daemon to do it for us.  We attempt to
 *	avoid piecemeal wakeups of the pageout daemon.
 */
static struct buf *
getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
{
	struct buf *bp;
	struct buf *nbp;
	int defrag = 0;
	int nqindex;
	static int flushingbufs;

	GIANT_REQUIRED;

	/*
	 * We can't afford to block since we might be holding a vnode lock,
	 * which may prevent system daemons from running.  We deal with
	 * low-memory situations by proactively returning memory and running
	 * async I/O rather than sync I/O.
	 */
	atomic_add_int(&getnewbufcalls, 1);
	atomic_subtract_int(&getnewbufrestarts, 1);
restart:
	atomic_add_int(&getnewbufrestarts, 1);

	/*
	 * Setup for scan.  If we do not have enough free buffers,
	 * we set up a degenerate case that immediately fails.  Note
	 * that if we are a specially marked process, we are allowed to
	 * dip into our reserves.
	 *
	 * The scanning sequence is nominally:  EMPTY->EMPTYKVA->CLEAN
	 *
	 * We start with EMPTYKVA.  If the list is empty we backup to EMPTY.
	 * However, there are a number of cases (defragging, reusing, ...)
	 * where we cannot backup.
	 */
	mtx_lock(&bqlock);
	nqindex = QUEUE_EMPTYKVA;
	nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);

	if (nbp == NULL) {
		/*
		 * If no EMPTYKVA buffers and we are either
		 * defragging or reusing, locate a CLEAN buffer
		 * to free or reuse.  If bufspace usage is low
		 * skip this step so we can allocate a new buffer.
		 */
		if (defrag || bufspace >= lobufspace) {
			nqindex = QUEUE_CLEAN;
			nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
		}

		/*
		 * If we could not find or were not allowed to reuse a
		 * CLEAN buffer, check to see if it is ok to use an EMPTY
		 * buffer.  We can only use an EMPTY buffer if allocating
		 * its KVA would not otherwise run us out of buffer space.
		 */
		if (nbp == NULL && defrag == 0 &&
		    bufspace + maxsize < hibufspace) {
			nqindex = QUEUE_EMPTY;
			nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
		}
	}

	/*
	 * Run scan, possibly freeing data and/or kva mappings on the fly
	 * depending.
	 */
	while ((bp = nbp) != NULL) {
		int qindex = nqindex;

		/*
		 * Calculate next bp ( we can only use it if we do not block
		 * or do other fancy things ).
		 */
		if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) {
			switch(qindex) {
			case QUEUE_EMPTY:
				nqindex = QUEUE_EMPTYKVA;
				if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA])))
					break;
				/* FALLTHROUGH */
			case QUEUE_EMPTYKVA:
				nqindex = QUEUE_CLEAN;
				if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN])))
					break;
				/* FALLTHROUGH */
			case QUEUE_CLEAN:
				/*
				 * nbp is NULL.
				 */
				break;
			}
		}
		if (bp->b_vp) {
			BO_LOCK(bp->b_bufobj);
			if (bp->b_vflags & BV_BKGRDINPROG) {
				BO_UNLOCK(bp->b_bufobj);
				continue;
			}
			BO_UNLOCK(bp->b_bufobj);
		}

		/*
		 * Sanity Checks
		 */
		KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));

		/*
		 * Note: we no longer distinguish between VMIO and non-VMIO
		 * buffers.
		 */
		KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex));

		/*
		 * If we are defragging then we need a buffer with
		 * b_kvasize != 0.  XXX this situation should no longer
		 * occur, if defrag is non-zero the buffer's b_kvasize
		 * should also be non-zero at this point.  XXX
		 */
		if (defrag && bp->b_kvasize == 0) {
			printf("Warning: defrag empty buffer %p\n", bp);
			continue;
		}

		/*
		 * Start freeing the bp.  This is somewhat involved.  nbp
		 * remains valid only for QUEUE_EMPTY[KVA] bp's.
		 */
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
			panic("getnewbuf: locked buf");
		bremfreel(bp);
		mtx_unlock(&bqlock);

		if (qindex == QUEUE_CLEAN) {
			if (bp->b_flags & B_VMIO) {
				bp->b_flags &= ~B_ASYNC;
				vfs_vmio_release(bp);
			}
			if (bp->b_vp)
				brelvp(bp);
		}

		/*
		 * NOTE:  nbp is now entirely invalid.  We can only restart
		 * the scan from this point on.
		 *
		 * Get the rest of the buffer freed up.  b_kva* is still
		 * valid after this operation.
1914 */ 1915 1916 if (bp->b_rcred != NOCRED) { 1917 crfree(bp->b_rcred); 1918 bp->b_rcred = NOCRED; 1919 } 1920 if (bp->b_wcred != NOCRED) { 1921 crfree(bp->b_wcred); 1922 bp->b_wcred = NOCRED; 1923 } 1924 if (LIST_FIRST(&bp->b_dep) != NULL) 1925 buf_deallocate(bp); 1926 if (bp->b_vflags & BV_BKGRDINPROG) 1927 panic("losing buffer 3"); 1928 1929 if (bp->b_bufsize) 1930 allocbuf(bp, 0); 1931 1932 bp->b_flags = 0; 1933 bp->b_ioflags = 0; 1934 bp->b_xflags = 0; 1935 bp->b_vflags = 0; 1936 bp->b_dev = NULL; 1937 bp->b_vp = NULL; 1938 bp->b_blkno = bp->b_lblkno = 0; 1939 bp->b_offset = NOOFFSET; 1940 bp->b_iodone = 0; 1941 bp->b_error = 0; 1942 bp->b_resid = 0; 1943 bp->b_bcount = 0; 1944 bp->b_npages = 0; 1945 bp->b_dirtyoff = bp->b_dirtyend = 0; 1946 bp->b_magic = B_MAGIC_BIO; 1947 bp->b_op = &buf_ops_bio; 1948 bp->b_object = NULL; 1949 bp->b_bufobj = NULL; 1950 1951 LIST_INIT(&bp->b_dep); 1952 1953 /* 1954 * If we are defragging then free the buffer. 1955 */ 1956 if (defrag) { 1957 bp->b_flags |= B_INVAL; 1958 bfreekva(bp); 1959 brelse(bp); 1960 defrag = 0; 1961 goto restart; 1962 } 1963 1964 /* 1965 * If we are overcommitted then recover the buffer and its 1966 * KVM space. This occurs in rare situations when multiple 1967 * processes are blocked in getnewbuf() or allocbuf(). 1968 */ 1969 if (bufspace >= hibufspace) 1970 flushingbufs = 1; 1971 if (flushingbufs && bp->b_kvasize != 0) { 1972 bp->b_flags |= B_INVAL; 1973 bfreekva(bp); 1974 brelse(bp); 1975 goto restart; 1976 } 1977 if (bufspace < lobufspace) 1978 flushingbufs = 0; 1979 break; 1980 } 1981 1982 /* 1983 * If we exhausted our list, sleep as appropriate. We may have to 1984 * wake up various daemons and write out some dirty buffers. 1985 * 1986 * Generally we are sleeping due to insufficient buffer space. 1987 */ 1988 1989 if (bp == NULL) { 1990 int flags; 1991 char *waitmsg; 1992 1993 mtx_unlock(&bqlock); 1994 if (defrag) { 1995 flags = VFS_BIO_NEED_BUFSPACE; 1996 waitmsg = "nbufkv"; 1997 } else if (bufspace >= hibufspace) { 1998 waitmsg = "nbufbs"; 1999 flags = VFS_BIO_NEED_BUFSPACE; 2000 } else { 2001 waitmsg = "newbuf"; 2002 flags = VFS_BIO_NEED_ANY; 2003 } 2004 2005 bd_speedup(); /* heeeelp */ 2006 2007 mtx_lock(&nblock); 2008 needsbuffer |= flags; 2009 while (needsbuffer & flags) { 2010 if (msleep(&needsbuffer, &nblock, 2011 (PRIBIO + 4) | slpflag, waitmsg, slptimeo)) { 2012 mtx_unlock(&nblock); 2013 return (NULL); 2014 } 2015 } 2016 mtx_unlock(&nblock); 2017 } else { 2018 /* 2019 * We finally have a valid bp. We aren't quite out of the 2020 * woods; we still have to reserve kva space. In order 2021 * to keep fragmentation sane we only allocate kva in 2022 * BKVASIZE chunks. 2023 */ 2024 maxsize = (maxsize + BKVAMASK) & ~BKVAMASK; 2025 2026 if (maxsize != bp->b_kvasize) { 2027 vm_offset_t addr = 0; 2028 2029 bfreekva(bp); 2030 2031 if (vm_map_findspace(buffer_map, 2032 vm_map_min(buffer_map), maxsize, &addr)) { 2033 /* 2034 * Uh oh. Buffer map is too fragmented. We 2035 * must defragment the map.
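 *
 * (Defragmenting works by restarting the whole scan with defrag
 * set; on that pass the first reusable buffer found has its KVA
 * freed outright via bfreekva(), after which the reservation is
 * retried. For scale, assuming the usual BKVASIZE of 16384, so
 * BKVAMASK == 16383, the rounding above turns a request of, say,
 * maxsize = 9000 into (9000 + 16383) & ~16383 == 16384, i.e.
 * exactly one chunk.)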
2036 */ 2037 atomic_add_int(&bufdefragcnt, 1); 2038 defrag = 1; 2039 bp->b_flags |= B_INVAL; 2040 brelse(bp); 2041 goto restart; 2042 } 2043 if (addr) { 2044 vm_map_insert(buffer_map, NULL, 0, 2045 addr, addr + maxsize, 2046 VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); 2047 2048 bp->b_kvabase = (caddr_t) addr; 2049 bp->b_kvasize = maxsize; 2050 atomic_add_int(&bufspace, bp->b_kvasize); 2051 atomic_add_int(&bufreusecnt, 1); 2052 } 2053 } 2054 bp->b_saveaddr = bp->b_kvabase; 2055 bp->b_data = bp->b_saveaddr; 2056 } 2057 return(bp); 2058} 2059 2060/* 2061 * buf_daemon: 2062 * 2063 * buffer flushing daemon. Buffers are normally flushed by the 2064 * update daemon but if it cannot keep up this process starts to 2065 * take the load in an attempt to prevent getnewbuf() from blocking. 2066 */ 2067 2068static struct kproc_desc buf_kp = { 2069 "bufdaemon", 2070 buf_daemon, 2071 &bufdaemonproc 2072}; 2073SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp) 2074 2075static void 2076buf_daemon() 2077{ 2078 int s; 2079 2080 mtx_lock(&Giant); 2081 2082 /* 2083 * This process needs to be suspended prior to shutdown sync. 2084 */ 2085 EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc, 2086 SHUTDOWN_PRI_LAST); 2087 2088 /* 2089 * This process is allowed to take the buffer cache to the limit 2090 */ 2091 s = splbio(); 2092 mtx_lock(&bdlock); 2093 2094 for (;;) { 2095 bd_request = 0; 2096 mtx_unlock(&bdlock); 2097 2098 kthread_suspend_check(bufdaemonproc); 2099 2100 /* 2101 * Do the flush. Limit the amount of in-transit I/O we 2102 * allow to build up, otherwise we would completely saturate 2103 * the I/O system. Wake up any waiting processes before we 2104 * normally would so they can run in parallel with our drain. 2105 */ 2106 while (numdirtybuffers > lodirtybuffers) { 2107 if (flushbufqueues(0) == 0) { 2108 /* 2109 * Could not find any buffers without rollback 2110 * dependencies, so just write the first one 2111 * in the hopes of eventually making progress. 2112 */ 2113 flushbufqueues(1); 2114 break; 2115 } 2116 waitrunningbufspace(); 2117 numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2); 2118 } 2119 2120 /* 2121 * Only clear bd_request if we have reached our low water 2122 * mark. The buf_daemon normally waits 1 second and 2123 * then incrementally flushes any dirty buffers that have 2124 * built up, within reason. 2125 * 2126 * If we were unable to hit our low water mark and couldn't 2127 * find any flushable buffers, we sleep half a second. 2128 * Otherwise we loop immediately. 2129 */ 2130 mtx_lock(&bdlock); 2131 if (numdirtybuffers <= lodirtybuffers) { 2132 /* 2133 * We reached our low water mark, reset the 2134 * request and sleep until we are needed again. 2135 * The sleep is just so the suspend code works. 2136 */ 2137 bd_request = 0; 2138 msleep(&bd_request, &bdlock, PVM, "psleep", hz); 2139 } else { 2140 /* 2141 * We couldn't find any flushable dirty buffers but 2142 * still have too many dirty buffers, so we 2143 * have to sleep and try again. (rare) 2144 */ 2145 msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10); 2146 } 2147 } 2148} 2149 2150/* 2151 * flushbufqueues: 2152 * 2153 * Try to flush a buffer in the dirty queue. We must be careful to 2154 * free up B_INVAL buffers instead of writing them, which NFS is 2155 * particularly sensitive to.
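 *
 * (Illustrative arithmetic for the daemon loop above, using made-up
 * tunable values: were lodirtybuffers 100 and hidirtybuffers 200,
 * the daemon would flush while numdirtybuffers > 100, and the
 * numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2) call would
 * release writers blocked on dirty-buffer space once the count
 * dropped to 150.)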
2156 */ 2157int flushwithdeps = 0; 2158SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps, 2159 0, "Number of buffers flushed with dependencies that require rollbacks"); 2160 2161static int 2162flushbufqueues(int flushdeps) 2163{ 2164 struct thread *td = curthread; 2165 struct vnode *vp; 2166 struct mount *mp; 2167 struct buf *bp; 2168 int hasdeps; 2169 2170 mtx_lock(&bqlock); 2171 TAILQ_FOREACH(bp, &bufqueues[QUEUE_DIRTY], b_freelist) { 2172 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) 2173 continue; 2174 KASSERT((bp->b_flags & B_DELWRI), 2175 ("unexpected clean buffer %p", bp)); 2176 BO_LOCK(bp->b_bufobj); 2177 if ((bp->b_vflags & BV_BKGRDINPROG) != 0) { 2178 BO_UNLOCK(bp->b_bufobj); 2179 BUF_UNLOCK(bp); 2180 continue; 2181 } 2182 BO_UNLOCK(bp->b_bufobj); 2183 if (bp->b_flags & B_INVAL) { 2184 bremfreel(bp); 2185 mtx_unlock(&bqlock); 2186 brelse(bp); 2187 return (1); 2188 } 2189 2190 if (LIST_FIRST(&bp->b_dep) != NULL && buf_countdeps(bp, 0)) { 2191 if (flushdeps == 0) { 2192 BUF_UNLOCK(bp); 2193 continue; 2194 } 2195 hasdeps = 1; 2196 } else 2197 hasdeps = 0; 2198 /* 2199 * We must hold the lock on a vnode before writing 2200 * one of its buffers. Otherwise we may cause confusion, or 2201 * in the case of a snapshot vnode, deadlock the 2202 * system. 2203 * 2204 * The lock order here is the reverse of the normal 2205 * order of vnode lock followed by buf lock. This is ok because 2206 * the NOWAIT will prevent deadlock. 2207 */ 2208 vp = bp->b_vp; 2209 if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { 2210 BUF_UNLOCK(bp); 2211 continue; 2212 } 2213 if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT, td) == 0) { 2214 mtx_unlock(&bqlock); 2215 vfs_bio_awrite(bp); 2216 vn_finished_write(mp); 2217 VOP_UNLOCK(vp, 0, td); 2218 flushwithdeps += hasdeps; 2219 return (1); 2220 } 2221 vn_finished_write(mp); 2222 BUF_UNLOCK(bp); 2223 } 2224 mtx_unlock(&bqlock); 2225 return (0); 2226} 2227 2228/* 2229 * Check to see if a block is currently memory resident. 2230 */ 2231struct buf * 2232incore(struct bufobj *bo, daddr_t blkno) 2233{ 2234 struct buf *bp; 2235 2236 int s = splbio(); 2237 BO_LOCK(bo); 2238 bp = gbincore(bo, blkno); 2239 BO_UNLOCK(bo); 2240 splx(s); 2241 return (bp); 2242} 2243 2244/* 2245 * Returns true if no I/O is needed to access the 2246 * associated VM object. This is like incore except 2247 * it also hunts around in the VM system for the data.
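 *
 * (Editor's illustration of the incore()/inmem() distinction; the
 * caller below is hypothetical, not code from this file:
 *
 *	if (incore(&vp->v_bufobj, blkno) != NULL)
 *		a buffer header for the block is already in-core;
 *	else if (inmem(vp, blkno))
 *		no buffer exists, but every backing VM page is valid,
 *		so the data could be had without device I/O;
 *	else
 *		a real read would be required.)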
2248 */ 2249 2250static int 2251inmem(struct vnode * vp, daddr_t blkno) 2252{ 2253 vm_object_t obj; 2254 vm_offset_t toff, tinc, size; 2255 vm_page_t m; 2256 vm_ooffset_t off; 2257 2258 GIANT_REQUIRED; 2259 ASSERT_VOP_LOCKED(vp, "inmem"); 2260 2261 if (incore(&vp->v_bufobj, blkno)) 2262 return 1; 2263 if (vp->v_mount == NULL) 2264 return 0; 2265 if (VOP_GETVOBJECT(vp, &obj) != 0 || (vp->v_vflag & VV_OBJBUF) == 0) 2266 return 0; 2267 2268 size = PAGE_SIZE; 2269 if (size > vp->v_mount->mnt_stat.f_iosize) 2270 size = vp->v_mount->mnt_stat.f_iosize; 2271 off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize; 2272 2273 VM_OBJECT_LOCK(obj); 2274 for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { 2275 m = vm_page_lookup(obj, OFF_TO_IDX(off + toff)); 2276 if (!m) 2277 goto notinmem; 2278 tinc = size; 2279 if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK)) 2280 tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK); 2281 if (vm_page_is_valid(m, 2282 (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0) 2283 goto notinmem; 2284 } 2285 VM_OBJECT_UNLOCK(obj); 2286 return 1; 2287 2288notinmem: 2289 VM_OBJECT_UNLOCK(obj); 2290 return (0); 2291} 2292 2293/* 2294 * vfs_setdirty: 2295 * 2296 * Sets the dirty range for a buffer based on the status of the dirty 2297 * bits in the pages comprising the buffer. 2298 * 2299 * The range is limited to the size of the buffer. 2300 * 2301 * This routine is primarily used by NFS, but is generalized for the 2302 * B_VMIO case. 2303 */ 2304static void 2305vfs_setdirty(struct buf *bp) 2306{ 2307 int i; 2308 vm_object_t object; 2309 2310 GIANT_REQUIRED; 2311 /* 2312 * Degenerate case - empty buffer 2313 */ 2314 2315 if (bp->b_bufsize == 0) 2316 return; 2317 2318 /* 2319 * We qualify the scan for modified pages on whether the 2320 * object has been flushed yet. The OBJ_WRITEABLE flag 2321 * is not cleared simply by protecting pages off. 2322 */ 2323 2324 if ((bp->b_flags & B_VMIO) == 0) 2325 return; 2326 2327 object = bp->b_pages[0]->object; 2328 VM_OBJECT_LOCK(object); 2329 if ((object->flags & OBJ_WRITEABLE) && !(object->flags & OBJ_MIGHTBEDIRTY)) 2330 printf("Warning: object %p writeable but not mightbedirty\n", object); 2331 if (!(object->flags & OBJ_WRITEABLE) && (object->flags & OBJ_MIGHTBEDIRTY)) 2332 printf("Warning: object %p mightbedirty but not writeable\n", object); 2333 2334 if (object->flags & (OBJ_MIGHTBEDIRTY|OBJ_CLEANING)) { 2335 vm_offset_t boffset; 2336 vm_offset_t eoffset; 2337 2338 vm_page_lock_queues(); 2339 /* 2340 * test the pages to see if they have been modified directly 2341 * by users through the VM system. 2342 */ 2343 for (i = 0; i < bp->b_npages; i++) 2344 vm_page_test_dirty(bp->b_pages[i]); 2345 2346 /* 2347 * Calculate the encompassing dirty range, boffset and eoffset, 2348 * (eoffset - boffset) bytes. 2349 */ 2350 2351 for (i = 0; i < bp->b_npages; i++) { 2352 if (bp->b_pages[i]->dirty) 2353 break; 2354 } 2355 boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); 2356 2357 for (i = bp->b_npages - 1; i >= 0; --i) { 2358 if (bp->b_pages[i]->dirty) { 2359 break; 2360 } 2361 } 2362 eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); 2363 2364 vm_page_unlock_queues(); 2365 /* 2366 * Fit it to the buffer. 2367 */ 2368 2369 if (eoffset > bp->b_bcount) 2370 eoffset = bp->b_bcount; 2371 2372 /* 2373 * If we have a good dirty range, merge with the existing 2374 * dirty range. 
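 *
 * (Worked example of the range computation above, assuming PAGE_SIZE
 * 4096 and a page-aligned b_offset: if only page 1 of a 4-page
 * buffer was dirtied through the VM system, the two loops yield
 * boffset = 1 << PAGE_SHIFT = 4096 and eoffset = (1 + 1) <<
 * PAGE_SHIFT = 8192, so bytes [4096, 8192) are merged into the
 * buffer's dirty range below.)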
2375 */ 2376 2377 if (boffset < eoffset) { 2378 if (bp->b_dirtyoff > boffset) 2379 bp->b_dirtyoff = boffset; 2380 if (bp->b_dirtyend < eoffset) 2381 bp->b_dirtyend = eoffset; 2382 } 2383 } 2384 VM_OBJECT_UNLOCK(object); 2385} 2386 2387/* 2388 * getblk: 2389 * 2390 * Get a block given a specified block and offset into a file/device. 2391 * The buffer's B_DONE bit will be cleared on return, making it almost 2392 * ready for an I/O initiation. B_INVAL may or may not be set on 2393 * return. The caller should clear B_INVAL prior to initiating a 2394 * READ. 2395 * 2396 * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for 2397 * an existing buffer. 2398 * 2399 * For a VMIO buffer, B_CACHE is modified according to the backing VM. 2400 * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set 2401 * and then cleared based on the backing VM. If the previous buffer is 2402 * non-0-sized but invalid, B_CACHE will be cleared. 2403 * 2404 * If getblk() must create a new buffer, the new buffer is returned with 2405 * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which 2406 * case it is returned with B_INVAL clear and B_CACHE set based on the 2407 * backing VM. 2408 * 2409 * getblk() also forces a bwrite() for any B_DELWRI buffer whose 2410 * B_CACHE bit is clear. 2411 * 2412 * What this means, basically, is that the caller should use B_CACHE to 2413 * determine whether the buffer is fully valid or not and should clear 2414 * B_INVAL prior to issuing a read. If the caller intends to validate 2415 * the buffer by loading its data area with something, the caller needs 2416 * to clear B_INVAL. If the caller does this without issuing an I/O, 2417 * the caller should set B_CACHE ( as an optimization ), else the caller 2418 * should issue the I/O and biodone() will set B_CACHE if the I/O was 2419 * a write attempt or if it was a successful read. If the caller 2420 * intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR 2421 * prior to issuing the READ. biodone() will *not* clear B_INVAL. 2422 */ 2423struct buf * 2424getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo, 2425 int flags) 2426{ 2427 struct buf *bp; 2428 struct bufobj *bo; 2429 int s; 2430 int error; 2431 ASSERT_VOP_LOCKED(vp, "getblk"); 2432 2433 if (size > MAXBSIZE) 2434 panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE); 2435 2436 bo = &vp->v_bufobj; 2437 s = splbio(); 2438loop: 2439 /* 2440 * Block if we are low on buffers. Certain processes are allowed 2441 * to completely exhaust the buffer cache. 2442 * 2443 * If this check ever becomes a bottleneck it may be better to 2444 * move it into the else, when gbincore() fails. At the moment 2445 * it isn't a problem. 2446 * 2447 * XXX remove if 0 sections (clean this up after it's proven) 2448 */ 2449 if (numfreebuffers == 0) { 2450 if (curthread == PCPU_GET(idlethread)) 2451 return NULL; 2452 mtx_lock(&nblock); 2453 needsbuffer |= VFS_BIO_NEED_ANY; 2454 mtx_unlock(&nblock); 2455 } 2456 2457 VI_LOCK(vp); 2458 bp = gbincore(bo, blkno); 2459 if (bp != NULL) { 2460 int lockflags; 2461 /* 2462 * Buffer is in-core. If the buffer is not busy, it must 2463 * be on a queue. 2464 */ 2465 lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK; 2466 2467 if (flags & GB_LOCK_NOWAIT) 2468 lockflags |= LK_NOWAIT; 2469 2470 error = BUF_TIMELOCK(bp, lockflags, 2471 VI_MTX(vp), "getblk", slpflag, slptimeo); 2472 2473 /* 2474 * If we slept and got the lock we have to restart in case 2475 * the buffer changed identities.
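 *
 * (LK_SLEEPFAIL in the lock flags above is what makes this safe: if
 * BUF_TIMELOCK() had to sleep, the lock attempt deliberately fails
 * with ENOLCK instead of succeeding, forcing the goto below to
 * repeat the lookup against the possibly-recycled buffer.)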
2476 */ 2477 if (error == ENOLCK) 2478 goto loop; 2479 /* We timed out or were interrupted. */ 2480 else if (error) 2481 return (NULL); 2482 2483 /* 2484 * The buffer is locked. B_CACHE is cleared if the buffer is 2485 * invalid. Otherwise, for a non-VMIO buffer, B_CACHE is set 2486 * and for a VMIO buffer B_CACHE is adjusted according to the 2487 * backing VM cache. 2488 */ 2489 if (bp->b_flags & B_INVAL) 2490 bp->b_flags &= ~B_CACHE; 2491 else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0) 2492 bp->b_flags |= B_CACHE; 2493 bremfree(bp); 2494 2495 /* 2496 * Check for size inconsistencies for the non-VMIO case. 2497 */ 2498 2499 if (bp->b_bcount != size) { 2500 if ((bp->b_flags & B_VMIO) == 0 || 2501 (size > bp->b_kvasize)) { 2502 if (bp->b_flags & B_DELWRI) { 2503 bp->b_flags |= B_NOCACHE; 2504 bwrite(bp); 2505 } else { 2506 if ((bp->b_flags & B_VMIO) && 2507 (LIST_FIRST(&bp->b_dep) == NULL)) { 2508 bp->b_flags |= B_RELBUF; 2509 brelse(bp); 2510 } else { 2511 bp->b_flags |= B_NOCACHE; 2512 bwrite(bp); 2513 } 2514 } 2515 goto loop; 2516 } 2517 } 2518 2519 /* 2520 * If the size is inconsistent in the VMIO case, we can resize 2521 * the buffer. This might lead to B_CACHE getting set or 2522 * cleared. If the size has not changed, B_CACHE remains 2523 * unchanged from its previous state. 2524 */ 2525 2526 if (bp->b_bcount != size) 2527 allocbuf(bp, size); 2528 2529 KASSERT(bp->b_offset != NOOFFSET, 2530 ("getblk: no buffer offset")); 2531 2532 /* 2533 * A buffer with B_DELWRI set and B_CACHE clear must 2534 * be committed before we can return the buffer in 2535 * order to prevent the caller from issuing a read 2536 * ( due to B_CACHE not being set ) and overwriting 2537 * it. 2538 * 2539 * Most callers, including NFS and FFS, need this to 2540 * operate properly either because they assume they 2541 * can issue a read if B_CACHE is not set, or because 2542 * ( for example ) an uncached B_DELWRI might loop due 2543 * to softupdates re-dirtying the buffer. In the latter 2544 * case, B_CACHE is set after the first write completes, 2545 * preventing further loops. 2546 * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE 2547 * above while extending the buffer, we cannot allow the 2548 * buffer to remain with B_CACHE set after the write 2549 * completes or it will represent a corrupt state. To 2550 * deal with this we set B_NOCACHE to scrap the buffer 2551 * after the write. 2552 * 2553 * We might be able to do something fancy, like setting 2554 * B_CACHE in bwrite() except if B_DELWRI is already set, 2555 * so the below call doesn't set B_CACHE, but that gets real 2556 * confusing. This is much easier. 2557 */ 2558 2559 if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { 2560 bp->b_flags |= B_NOCACHE; 2561 bwrite(bp); 2562 goto loop; 2563 } 2564 2565 splx(s); 2566 bp->b_flags &= ~B_DONE; 2567 } else { 2568 int bsize, maxsize, vmio; 2569 off_t offset; 2570 2571 /* 2572 * Buffer is not in-core, create new buffer. The buffer 2573 * returned by getnewbuf() is locked. Note that the returned 2574 * buffer is also considered valid (not marked B_INVAL). 2575 */ 2576 VI_UNLOCK(vp); 2577 /* 2578 * If the user does not want us to create the buffer, bail out 2579 * here.
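 *
 * (Hypothetical caller's view, for illustration only:
 *
 *	bp = getblk(vp, blkno, size, 0, 0, GB_NOCREAT);
 *	if (bp == NULL)
 *		the block was not cached; nothing was constructed;
 *
 * such a caller takes the early return just below.)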
2580 */ 2581 if (flags & GB_NOCREAT) { 2582 splx(s); 2583 return NULL; 2584 } 2585 if (vn_isdisk(vp, NULL)) 2586 bsize = DEV_BSIZE; 2587 else if (vp->v_mountedhere) 2588 bsize = vp->v_mountedhere->mnt_stat.f_iosize; 2589 else if (vp->v_mount) 2590 bsize = vp->v_mount->mnt_stat.f_iosize; 2591 else 2592 bsize = size; 2593 2594 if (vp->v_bsize != bsize) { 2595#if 0 2596 printf("WARNING: Wrong block size on vnode: %d should be %d\n", vp->v_bsize, bsize); 2597#endif 2598 vp->v_bsize = bsize; 2599 } 2600 2601 offset = blkno * bsize; 2602 vmio = (VOP_GETVOBJECT(vp, NULL) == 0) && 2603 (vp->v_vflag & VV_OBJBUF); 2604 maxsize = vmio ? size + (offset & PAGE_MASK) : size; 2605 maxsize = imax(maxsize, bsize); 2606 2607 bp = getnewbuf(slpflag, slptimeo, size, maxsize); 2608 if (bp == NULL) { 2609 if (slpflag || slptimeo) { 2610 splx(s); 2611 return NULL; 2612 } 2613 goto loop; 2614 } 2615 2616 /* 2617 * This code is used to make sure that a buffer is not 2618 * created while the getnewbuf routine is blocked. 2619 * This can be a problem whether the vnode is locked or not. 2620 * If the buffer is created out from under us, we have to 2621 * throw away the one we just created. There is no window 2622 * for a race because we are safely running at splbio() from the 2623 * point of the duplicate buffer creation through to here, 2624 * and we've locked the buffer. 2625 * 2626 * Note: this must occur before we associate the buffer 2627 * with the vp especially considering limitations in 2628 * the splay tree implementation when dealing with duplicate 2629 * lblkno's. 2630 */ 2631 BO_LOCK(bo); 2632 if (gbincore(bo, blkno)) { 2633 BO_UNLOCK(bo); 2634 bp->b_flags |= B_INVAL; 2635 brelse(bp); 2636 goto loop; 2637 } 2638 2639 /* 2640 * Insert the buffer into the hash, so that it can 2641 * be found by incore. 2642 */ 2643 bp->b_blkno = bp->b_lblkno = blkno; 2644 bp->b_offset = offset; 2645 2646 bgetvp(vp, bp); 2647 BO_UNLOCK(bo); 2648 2649 /* 2650 * Set the B_VMIO bit. allocbuf() the buffer bigger. Since the 2651 * buffer size starts out as 0, B_CACHE will be set by 2652 * allocbuf() for the VMIO case prior to it testing the 2653 * backing store for validity. 2654 */ 2655 2656 if (vmio) { 2657 bp->b_flags |= B_VMIO; 2658#if defined(VFS_BIO_DEBUG) 2659 if (vn_canvmio(vp) != TRUE) 2660 printf("getblk: VMIO on vnode type %d\n", 2661 vp->v_type); 2662#endif 2663 VOP_GETVOBJECT(vp, &bp->b_object); 2664 } else { 2665 bp->b_flags &= ~B_VMIO; 2666 bp->b_object = NULL; 2667 } 2668 2669 allocbuf(bp, size); 2670 2671 splx(s); 2672 bp->b_flags &= ~B_DONE; 2673 } 2674 KASSERT(BUF_REFCNT(bp) == 1, ("getblk: bp %p not locked",bp)); 2675 KASSERT(bp->b_bufobj == bo, 2676 ("wrong b_bufobj %p should be %p", bp->b_bufobj, bo)); 2677 return (bp); 2678} 2679 2680/* 2681 * Get an empty, disassociated buffer of given size. The buffer is initially 2682 * set to B_INVAL. 2683 */ 2684struct buf * 2685geteblk(int size) 2686{ 2687 struct buf *bp; 2688 int s; 2689 int maxsize; 2690 2691 maxsize = (size + BKVAMASK) & ~BKVAMASK; 2692 2693 s = splbio(); 2694 while ((bp = getnewbuf(0, 0, size, maxsize)) == 0) 2695 continue; 2696 splx(s); 2697 allocbuf(bp, size); 2698 bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ 2699 KASSERT(BUF_REFCNT(bp) == 1, ("geteblk: bp %p not locked",bp)); 2700 return (bp); 2701} 2702 2703 2704/* 2705 * This code constructs the buffer memory from either anonymous system 2706 * memory (in the case of non-VMIO operations) or from an associated 2707 * VM object (in the case of VMIO operations).
This code is able to 2708 * resize a buffer up or down. 2709 * 2710 * Note that this code is tricky, and has many complications to resolve 2711 * deadlock or inconsistent data situations. Tread lightly!!! 2712 * There are B_CACHE and B_DELWRI interactions that must be dealt with by 2713 * the caller. Calling this code willy nilly can result in the loss of data. 2714 * 2715 * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with 2716 * B_CACHE for the non-VMIO case. 2717 */ 2718 2719int 2720allocbuf(struct buf *bp, int size) 2721{ 2722 int newbsize, mbsize; 2723 int i; 2724 2725 GIANT_REQUIRED; 2726 2727 if (BUF_REFCNT(bp) == 0) 2728 panic("allocbuf: buffer not busy"); 2729 2730 if (bp->b_kvasize < size) 2731 panic("allocbuf: buffer too small"); 2732 2733 if ((bp->b_flags & B_VMIO) == 0) { 2734 caddr_t origbuf; 2735 int origbufsize; 2736 /* 2737 * Just get anonymous memory from the kernel. Don't 2738 * mess with B_CACHE. 2739 */ 2740 mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); 2741 if (bp->b_flags & B_MALLOC) 2742 newbsize = mbsize; 2743 else 2744 newbsize = round_page(size); 2745 2746 if (newbsize < bp->b_bufsize) { 2747 /* 2748 * malloced buffers are not shrunk 2749 */ 2750 if (bp->b_flags & B_MALLOC) { 2751 if (newbsize) { 2752 bp->b_bcount = size; 2753 } else { 2754 free(bp->b_data, M_BIOBUF); 2755 if (bp->b_bufsize) { 2756 atomic_subtract_int( 2757 &bufmallocspace, 2758 bp->b_bufsize); 2759 bufspacewakeup(); 2760 bp->b_bufsize = 0; 2761 } 2762 bp->b_saveaddr = bp->b_kvabase; 2763 bp->b_data = bp->b_saveaddr; 2764 bp->b_bcount = 0; 2765 bp->b_flags &= ~B_MALLOC; 2766 } 2767 return 1; 2768 } 2769 vm_hold_free_pages( 2770 bp, 2771 (vm_offset_t) bp->b_data + newbsize, 2772 (vm_offset_t) bp->b_data + bp->b_bufsize); 2773 } else if (newbsize > bp->b_bufsize) { 2774 /* 2775 * We only use malloced memory on the first allocation, 2776 * and revert to page-allocated memory when the buffer 2777 * grows. 2778 */ 2779 /* 2780 * There is a potential SMP race here that could lead 2781 * to bufmallocspace slightly exceeding the max. It 2782 * is probably extremely rare and not worth worrying 2783 * over. 2784 */ 2785 if ( (bufmallocspace < maxbufmallocspace) && 2786 (bp->b_bufsize == 0) && 2787 (mbsize <= PAGE_SIZE/2)) { 2788 2789 bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK); 2790 bp->b_bufsize = mbsize; 2791 bp->b_bcount = size; 2792 bp->b_flags |= B_MALLOC; 2793 atomic_add_int(&bufmallocspace, mbsize); 2794 return 1; 2795 } 2796 origbuf = NULL; 2797 origbufsize = 0; 2798 /* 2799 * If the buffer is growing on its other-than-first allocation, 2800 * then we revert to the page-allocation scheme. 2801 */ 2802 if (bp->b_flags & B_MALLOC) { 2803 origbuf = bp->b_data; 2804 origbufsize = bp->b_bufsize; 2805 bp->b_data = bp->b_kvabase; 2806 if (bp->b_bufsize) { 2807 atomic_subtract_int(&bufmallocspace, 2808 bp->b_bufsize); 2809 bufspacewakeup(); 2810 bp->b_bufsize = 0; 2811 } 2812 bp->b_flags &= ~B_MALLOC; 2813 newbsize = round_page(newbsize); 2814 } 2815 vm_hold_load_pages( 2816 bp, 2817 (vm_offset_t) bp->b_data + bp->b_bufsize, 2818 (vm_offset_t) bp->b_data + newbsize); 2819 if (origbuf) { 2820 bcopy(origbuf, bp->b_data, origbufsize); 2821 free(origbuf, M_BIOBUF); 2822 } 2823 } 2824 } else { 2825 int desiredpages; 2826 2827 newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); 2828 desiredpages = (size == 0) ?
0 : 2829 num_pages((bp->b_offset & PAGE_MASK) + newbsize); 2830 2831 if (bp->b_flags & B_MALLOC) 2832 panic("allocbuf: VMIO buffer can't be malloced"); 2833 /* 2834 * Set B_CACHE initially if buffer is 0 length or will become 2835 * 0-length. 2836 */ 2837 if (size == 0 || bp->b_bufsize == 0) 2838 bp->b_flags |= B_CACHE; 2839 2840 if (newbsize < bp->b_bufsize) { 2841 /* 2842 * DEV_BSIZE aligned new buffer size is less than the 2843 * DEV_BSIZE aligned existing buffer size. Figure out 2844 * if we have to remove any pages. 2845 */ 2846 if (desiredpages < bp->b_npages) { 2847 vm_page_t m; 2848 2849 vm_page_lock_queues(); 2850 for (i = desiredpages; i < bp->b_npages; i++) { 2851 /* 2852 * the page is not freed here -- it 2853 * is the responsibility of 2854 * vnode_pager_setsize 2855 */ 2856 m = bp->b_pages[i]; 2857 KASSERT(m != bogus_page, 2858 ("allocbuf: bogus page found")); 2859 while (vm_page_sleep_if_busy(m, TRUE, "biodep")) 2860 vm_page_lock_queues(); 2861 2862 bp->b_pages[i] = NULL; 2863 vm_page_unwire(m, 0); 2864 } 2865 vm_page_unlock_queues(); 2866 pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) + 2867 (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages)); 2868 bp->b_npages = desiredpages; 2869 } 2870 } else if (size > bp->b_bcount) { 2871 /* 2872 * We are growing the buffer, possibly in a 2873 * byte-granular fashion. 2874 */ 2875 struct vnode *vp; 2876 vm_object_t obj; 2877 vm_offset_t toff; 2878 vm_offset_t tinc; 2879 2880 /* 2881 * Step 1, bring in the VM pages from the object, 2882 * allocating them if necessary. We must clear 2883 * B_CACHE if these pages are not valid for the 2884 * range covered by the buffer. 2885 */ 2886 2887 vp = bp->b_vp; 2888 obj = bp->b_object; 2889 2890 VM_OBJECT_LOCK(obj); 2891 while (bp->b_npages < desiredpages) { 2892 vm_page_t m; 2893 vm_pindex_t pi; 2894 2895 pi = OFF_TO_IDX(bp->b_offset) + bp->b_npages; 2896 if ((m = vm_page_lookup(obj, pi)) == NULL) { 2897 /* 2898 * note: must allocate system pages 2899 * since blocking here could interfere 2900 * with paging I/O, no matter which 2901 * process we are. 2902 */ 2903 m = vm_page_alloc(obj, pi, 2904 VM_ALLOC_SYSTEM | VM_ALLOC_WIRED); 2905 if (m == NULL) { 2906 atomic_add_int(&vm_pageout_deficit, 2907 desiredpages - bp->b_npages); 2908 VM_OBJECT_UNLOCK(obj); 2909 VM_WAIT; 2910 VM_OBJECT_LOCK(obj); 2911 } else { 2912 vm_page_lock_queues(); 2913 vm_page_wakeup(m); 2914 vm_page_unlock_queues(); 2915 bp->b_flags &= ~B_CACHE; 2916 bp->b_pages[bp->b_npages] = m; 2917 ++bp->b_npages; 2918 } 2919 continue; 2920 } 2921 2922 /* 2923 * We found a page. If we have to sleep on it, 2924 * retry because it might have gotten freed out 2925 * from under us. 2926 * 2927 * We can only test PG_BUSY here. Blocking on 2928 * m->busy might lead to a deadlock: 2929 * 2930 * vm_fault->getpages->cluster_read->allocbuf 2931 * 2932 */ 2933 vm_page_lock_queues(); 2934 if (vm_page_sleep_if_busy(m, FALSE, "pgtblk")) 2935 continue; 2936 2937 /* 2938 * We have a good page. Should we wake up the 2939 * page daemon? 2940 */ 2941 if ((curproc != pageproc) && 2942 ((m->queue - m->pc) == PQ_CACHE) && 2943 ((cnt.v_free_count + cnt.v_cache_count) < 2944 (cnt.v_free_min + cnt.v_cache_min))) { 2945 pagedaemon_wakeup(); 2946 } 2947 vm_page_wire(m); 2948 vm_page_unlock_queues(); 2949 bp->b_pages[bp->b_npages] = m; 2950 ++bp->b_npages; 2951 } 2952 2953 /* 2954 * Step 2. We've loaded the pages into the buffer; now 2955 * we have to figure out if we can still have B_CACHE 2956 * set.
Note that B_CACHE is set according to the 2957 * byte-granular range ( bcount and size ), not the 2958 * aligned range ( newbsize ). 2959 * 2960 * The VM test is against m->valid, which is DEV_BSIZE 2961 * aligned. Needless to say, the validity of the data 2962 * needs to also be DEV_BSIZE aligned. Note that this 2963 * fails with NFS if the server or some other client 2964 * extends the file's EOF. If our buffer is resized, 2965 * B_CACHE may remain set! XXX 2966 */ 2967 2968 toff = bp->b_bcount; 2969 tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK); 2970 2971 while ((bp->b_flags & B_CACHE) && toff < size) { 2972 vm_pindex_t pi; 2973 2974 if (tinc > (size - toff)) 2975 tinc = size - toff; 2976 2977 pi = ((bp->b_offset & PAGE_MASK) + toff) >> 2978 PAGE_SHIFT; 2979 2980 vfs_buf_test_cache( 2981 bp, 2982 bp->b_offset, 2983 toff, 2984 tinc, 2985 bp->b_pages[pi] 2986 ); 2987 toff += tinc; 2988 tinc = PAGE_SIZE; 2989 } 2990 VM_OBJECT_UNLOCK(obj); 2991 2992 /* 2993 * Step 3, fix up the KVM pmap. Remember that 2994 * bp->b_data is relative to bp->b_offset, but 2995 * bp->b_offset may be offset into the first page. 2996 */ 2997 2998 bp->b_data = (caddr_t) 2999 trunc_page((vm_offset_t)bp->b_data); 3000 pmap_qenter( 3001 (vm_offset_t)bp->b_data, 3002 bp->b_pages, 3003 bp->b_npages 3004 ); 3005 3006 bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | 3007 (vm_offset_t)(bp->b_offset & PAGE_MASK)); 3008 } 3009 } 3010 if (newbsize < bp->b_bufsize) 3011 bufspacewakeup(); 3012 bp->b_bufsize = newbsize; /* actual buffer allocation */ 3013 bp->b_bcount = size; /* requested buffer size */ 3014 return 1; 3015} 3016 3017void 3018biodone(struct bio *bp) 3019{ 3020 3021 mtx_lock(&bdonelock); 3022 bp->bio_flags |= BIO_DONE; 3023 if (bp->bio_done == NULL) 3024 wakeup(bp); 3025 mtx_unlock(&bdonelock); 3026 if (bp->bio_done != NULL) 3027 bp->bio_done(bp); 3028} 3029 3030/* 3031 * Wait for a BIO to finish. 3032 * 3033 * XXX: resort to a timeout for now. The optimal locking (if any) for this 3034 * case is not yet clear. 3035 */ 3036int 3037biowait(struct bio *bp, const char *wchan) 3038{ 3039 3040 mtx_lock(&bdonelock); 3041 while ((bp->bio_flags & BIO_DONE) == 0) 3042 msleep(bp, &bdonelock, PRIBIO, wchan, hz / 10); 3043 mtx_unlock(&bdonelock); 3044 if (bp->bio_error != 0) 3045 return (bp->bio_error); 3046 if (!(bp->bio_flags & BIO_ERROR)) 3047 return (0); 3048 return (EIO); 3049} 3050 3051void 3052biofinish(struct bio *bp, struct devstat *stat, int error) 3053{ 3054 3055 if (error) { 3056 bp->bio_error = error; 3057 bp->bio_flags |= BIO_ERROR; 3058 } 3059 if (stat != NULL) 3060 devstat_end_transaction_bio(stat, bp); 3061 biodone(bp); 3062} 3063 3064/* 3065 * bufwait: 3066 * 3067 * Wait for buffer I/O completion, returning error status. The buffer 3068 * is left locked and B_DONE on return. B_EINTR is converted into an EINTR 3069 * error and cleared. 3070 */ 3071int 3072bufwait(struct buf *bp) 3073{ 3074 int s; 3075 3076 s = splbio(); 3077 if (bp->b_iocmd == BIO_READ) 3078 bwait(bp, PRIBIO, "biord"); 3079 else 3080 bwait(bp, PRIBIO, "biowr"); 3081 splx(s); 3082 if (bp->b_flags & B_EINTR) { 3083 bp->b_flags &= ~B_EINTR; 3084 return (EINTR); 3085 } 3086 if (bp->b_ioflags & BIO_ERROR) { 3087 return (bp->b_error ? bp->b_error : EIO); 3088 } else { 3089 return (0); 3090 } 3091} 3092 3093 /* 3094 * Call back function from struct bio back up to struct buf. 3095 */ 3096static void 3097bufdonebio(struct bio *bp) 3098{ 3099 3100 /* Device drivers may or may not hold Giant; hold it here.
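 *
 * (bufdonebio() is the glue from the struct bio layer back up to the
 * struct buf layer: dev_strategy() below points bio_done at this
 * function and stashes the owning buf in bio_caller2, so completing
 * the bio completes the buf.)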
*/ 3101 mtx_lock(&Giant); 3102 bufdone(bp->bio_caller2); 3103 mtx_unlock(&Giant); 3104} 3105 3106void 3107dev_strategy(struct buf *bp) 3108{ 3109 struct cdevsw *csw; 3110 struct cdev *dev; 3111 3112 if ((!bp->b_iocmd) || (bp->b_iocmd & (bp->b_iocmd - 1))) 3113 panic("b_iocmd botch"); 3114 bp->b_io.bio_done = bufdonebio; 3115 bp->b_io.bio_caller2 = bp; 3116 dev = bp->b_io.bio_dev; 3117 KASSERT(dev->si_refcount > 0, 3118 ("dev_strategy on un-referenced struct cdev *(%s)", 3119 devtoname(dev))); 3120 csw = dev_refthread(dev); 3121 if (csw == NULL) { 3122 bp->b_error = ENXIO; 3123 bp->b_ioflags = BIO_ERROR; 3124 mtx_lock(&Giant); /* XXX: too defensive ? */ 3125 bufdone(bp); 3126 mtx_unlock(&Giant); /* XXX: too defensive ? */ 3127 return; 3128 } 3129 (*csw->d_strategy)(&bp->b_io); 3130 dev_relthread(dev); 3131} 3132 3133/* 3134 * bufdone: 3135 * 3136 * Finish I/O on a buffer, optionally calling a completion function. 3137 * This is usually called from an interrupt so process blocking is 3138 * not allowed. 3139 * 3140 * biodone is also responsible for setting B_CACHE in a B_VMIO bp. 3141 * In a non-VMIO bp, B_CACHE will be set on the next getblk() 3142 * assuming B_INVAL is clear. 3143 * 3144 * For the VMIO case, we set B_CACHE if the op was a read and no 3145 * read error occurred, or if the op was a write. B_CACHE is never 3146 * set if the buffer is invalid or otherwise uncacheable. 3147 * 3148 * biodone does not mess with B_INVAL, allowing the I/O routine or the 3149 * initiator to leave B_INVAL set to brelse the buffer out of existence 3150 * in the biodone routine. 3151 */ 3152void 3153bufdone(struct buf *bp) 3154{ 3155 int s; 3156 void (*biodone)(struct buf *); 3157 3158 3159 s = splbio(); 3160 3161 KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp, BUF_REFCNT(bp))); 3162 KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp)); 3163 3164 bp->b_flags |= B_DONE; 3165 runningbufwakeup(bp); 3166 3167 if (bp->b_iocmd == BIO_WRITE && bp->b_bufobj != NULL) 3168 bufobj_wdrop(bp->b_bufobj); 3169 3170 /* call optional completion function if requested */ 3171 if (bp->b_iodone != NULL) { 3172 biodone = bp->b_iodone; 3173 bp->b_iodone = NULL; 3174 (*biodone) (bp); 3175 splx(s); 3176 return; 3177 } 3178 if (LIST_FIRST(&bp->b_dep) != NULL) 3179 buf_complete(bp); 3180 3181 if (bp->b_flags & B_VMIO) { 3182 int i; 3183 vm_ooffset_t foff; 3184 vm_page_t m; 3185 vm_object_t obj; 3186 int iosize; 3187 struct vnode *vp = bp->b_vp; 3188 3189 obj = bp->b_object; 3190 3191#if defined(VFS_BIO_DEBUG) 3192 mp_fixme("usecount and vflag accessed without locks."); 3193 if (vp->v_usecount == 0) { 3194 panic("biodone: zero vnode ref count"); 3195 } 3196 3197 if ((vp->v_vflag & VV_OBJBUF) == 0) { 3198 panic("biodone: vnode is not setup for merged cache"); 3199 } 3200#endif 3201 3202 foff = bp->b_offset; 3203 KASSERT(bp->b_offset != NOOFFSET, 3204 ("biodone: no buffer offset")); 3205 3206 VM_OBJECT_LOCK(obj); 3207#if defined(VFS_BIO_DEBUG) 3208 if (obj->paging_in_progress < bp->b_npages) { 3209 printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n", 3210 obj->paging_in_progress, bp->b_npages); 3211 } 3212#endif 3213 3214 /* 3215 * Set B_CACHE if the op was a normal read and no error 3216 * occurred. B_CACHE is set for writes in the b*write() 3217 * routines.
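 *
 * (iosize below is the number of bytes actually transferred: for
 * example, a 16384-byte request that came up 4096 bytes short has
 * b_resid == 4096 and thus iosize == 12288, and only that many
 * bytes worth of pages are marked valid by the loop that follows.)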
3218 */ 3219 iosize = bp->b_bcount - bp->b_resid; 3220 if (bp->b_iocmd == BIO_READ && 3221 !(bp->b_flags & (B_INVAL|B_NOCACHE)) && 3222 !(bp->b_ioflags & BIO_ERROR)) { 3223 bp->b_flags |= B_CACHE; 3224 } 3225 vm_page_lock_queues(); 3226 for (i = 0; i < bp->b_npages; i++) { 3227 int bogusflag = 0; 3228 int resid; 3229 3230 resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff; 3231 if (resid > iosize) 3232 resid = iosize; 3233 3234 /* 3235 * clean up bogus pages, restoring the originals 3236 */ 3237 m = bp->b_pages[i]; 3238 if (m == bogus_page) { 3239 bogusflag = 1; 3240 m = vm_page_lookup(obj, OFF_TO_IDX(foff)); 3241 if (m == NULL) 3242 panic("biodone: page disappeared!"); 3243 bp->b_pages[i] = m; 3244 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); 3245 } 3246#if defined(VFS_BIO_DEBUG) 3247 if (OFF_TO_IDX(foff) != m->pindex) { 3248 printf( 3249"biodone: foff(%jd)/m->pindex(%ju) mismatch\n", 3250 (intmax_t)foff, (uintmax_t)m->pindex); 3251 } 3252#endif 3253 3254 /* 3255 * In the write case, the valid and clean bits are 3256 * already changed correctly ( see bdwrite() ), so we 3257 * only need to do this here in the read case. 3258 */ 3259 if ((bp->b_iocmd == BIO_READ) && !bogusflag && resid > 0) { 3260 vfs_page_set_valid(bp, foff, i, m); 3261 } 3262 3263 /* 3264 * When debugging new filesystems or buffer I/O methods, this 3265 * is the most common error that pops up. If you see this, you 3266 * have not set the page busy flag correctly!!! 3267 */ 3268 if (m->busy == 0) { 3269 printf("biodone: page busy < 0, " 3270 "pindex: %d, foff: 0x(%x,%x), " 3271 "resid: %d, index: %d\n", 3272 (int) m->pindex, (int)(foff >> 32), 3273 (int) foff & 0xffffffff, resid, i); 3274 if (!vn_isdisk(vp, NULL)) 3275 printf(" iosize: %jd, lblkno: %jd, flags: 0x%x, npages: %d\n", 3276 (intmax_t)bp->b_vp->v_mount->mnt_stat.f_iosize, 3277 (intmax_t) bp->b_lblkno, 3278 bp->b_flags, bp->b_npages); 3279 else 3280 printf(" VDEV, lblkno: %jd, flags: 0x%x, npages: %d\n", 3281 (intmax_t) bp->b_lblkno, 3282 bp->b_flags, bp->b_npages); 3283 printf(" valid: 0x%lx, dirty: 0x%lx, wired: %d\n", 3284 (u_long)m->valid, (u_long)m->dirty, 3285 m->wire_count); 3286 panic("biodone: page busy < 0\n"); 3287 } 3288 vm_page_io_finish(m); 3289 vm_object_pip_subtract(obj, 1); 3290 foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; 3291 iosize -= resid; 3292 } 3293 vm_page_unlock_queues(); 3294 vm_object_pip_wakeupn(obj, 0); 3295 VM_OBJECT_UNLOCK(obj); 3296 } 3297 3298 /* 3299 * For asynchronous completions, release the buffer now. The brelse 3300 * will do a wakeup there if necessary - so no need to do a wakeup 3301 * here in the async case. The sync case always needs to do a wakeup. 3302 */ 3303 3304 if (bp->b_flags & B_ASYNC) { 3305 if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR)) 3306 brelse(bp); 3307 else 3308 bqrelse(bp); 3309 } else { 3310 bdone(bp); 3311 } 3312 splx(s); 3313} 3314 3315/* 3316 * This routine is called in lieu of iodone in the case of 3317 * incomplete I/O. This keeps the busy status for pages 3318 * consistent.
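 *
 * (Editor's sketch of the intended pairing, illustrative only:
 *
 *	vfs_busy_pages(bp, 0);
 *	... the transfer cannot be initiated ...
 *	vfs_unbusy_pages(bp);
 *
 * whereas a transfer that does run to completion has its pages
 * unbusied one by one inside bufdone() above.)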
3319 */ 3320void 3321vfs_unbusy_pages(struct buf *bp) 3322{ 3323 int i; 3324 vm_object_t obj; 3325 vm_page_t m; 3326 3327 runningbufwakeup(bp); 3328 if (!(bp->b_flags & B_VMIO)) 3329 return; 3330 3331 obj = bp->b_object; 3332 VM_OBJECT_LOCK(obj); 3333 vm_page_lock_queues(); 3334 for (i = 0; i < bp->b_npages; i++) { 3335 m = bp->b_pages[i]; 3336 if (m == bogus_page) { 3337 m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i); 3338 if (!m) 3339 panic("vfs_unbusy_pages: page missing\n"); 3340 bp->b_pages[i] = m; 3341 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), 3342 bp->b_pages, bp->b_npages); 3343 } 3344 vm_object_pip_subtract(obj, 1); 3345 vm_page_io_finish(m); 3346 } 3347 vm_page_unlock_queues(); 3348 vm_object_pip_wakeupn(obj, 0); 3349 VM_OBJECT_UNLOCK(obj); 3350} 3351 3352/* 3353 * vfs_page_set_valid: 3354 * 3355 * Set the valid bits in a page based on the supplied offset. The 3356 * range is restricted to the buffer's size. 3357 * 3358 * This routine is typically called after a read completes. 3359 */ 3360static void 3361vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m) 3362{ 3363 vm_ooffset_t soff, eoff; 3364 3365 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 3366 /* 3367 * Start and end offsets in buffer. eoff - soff may not cross a 3368 * page boundary or cross the end of the buffer. The end of the 3369 * buffer, in this case, is our file EOF, not the allocation size 3370 * of the buffer. 3371 */ 3372 soff = off; 3373 eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK; 3374 if (eoff > bp->b_offset + bp->b_bcount) 3375 eoff = bp->b_offset + bp->b_bcount; 3376 3377 /* 3378 * Set valid range. This is typically the entire buffer and thus the 3379 * entire page. 3380 */ 3381 if (eoff > soff) { 3382 vm_page_set_validclean( 3383 m, 3384 (vm_offset_t) (soff & PAGE_MASK), 3385 (vm_offset_t) (eoff - soff) 3386 ); 3387 } 3388} 3389 3390/* 3391 * This routine is called before a device strategy routine. 3392 * It is used to tell the VM system that paging I/O is in 3393 * progress, and treat the pages associated with the buffer 3394 * almost as being PG_BUSY. Also the object paging_in_progress 3395 * flag is handled to make sure that the object doesn't become 3396 * inconsistent. 3397 * 3398 * Since I/O has not been initiated yet, certain buffer flags 3399 * such as BIO_ERROR or B_INVAL may be in an inconsistent state 3400 * and should be ignored. 3401 */ 3402void 3403vfs_busy_pages(struct buf *bp, int clear_modify) 3404{ 3405 int i, bogus; 3406 vm_object_t obj; 3407 vm_ooffset_t foff; 3408 vm_page_t m; 3409 3410 if (!(bp->b_flags & B_VMIO)) 3411 return; 3412 3413 obj = bp->b_object; 3414 foff = bp->b_offset; 3415 KASSERT(bp->b_offset != NOOFFSET, 3416 ("vfs_busy_pages: no buffer offset")); 3417 vfs_setdirty(bp); 3418 VM_OBJECT_LOCK(obj); 3419retry: 3420 vm_page_lock_queues(); 3421 for (i = 0; i < bp->b_npages; i++) { 3422 m = bp->b_pages[i]; 3423 3424 if (vm_page_sleep_if_busy(m, FALSE, "vbpage")) 3425 goto retry; 3426 } 3427 bogus = 0; 3428 for (i = 0; i < bp->b_npages; i++) { 3429 m = bp->b_pages[i]; 3430 3431 if ((bp->b_flags & B_CLUSTER) == 0) { 3432 vm_object_pip_add(obj, 1); 3433 vm_page_io_start(m); 3434 } 3435 /* 3436 * When readying a buffer for a read ( i.e., 3437 * clear_modify == 0 ), it is important to do 3438 * bogus_page replacement for valid pages in 3439 * partially instantiated buffers. Partially 3440 * instantiated buffers can, in turn, occur when 3441 * reconstituting a buffer from its VM backing store 3442 * base.
We only have to do this if B_CACHE is 3443 * clear ( which causes the I/O to occur in the 3444 * first place ). The replacement prevents the read 3445 * I/O from overwriting potentially dirty VM-backed 3446 * pages. XXX bogus page replacement is, uh, bogus. 3447 * It may not work properly with small-block devices. 3448 * We need to find a better way. 3449 */ 3450 pmap_remove_all(m); 3451 if (clear_modify) 3452 vfs_page_set_valid(bp, foff, i, m); 3453 else if (m->valid == VM_PAGE_BITS_ALL && 3454 (bp->b_flags & B_CACHE) == 0) { 3455 bp->b_pages[i] = bogus_page; 3456 bogus++; 3457 } 3458 foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; 3459 } 3460 vm_page_unlock_queues(); 3461 VM_OBJECT_UNLOCK(obj); 3462 if (bogus) 3463 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), 3464 bp->b_pages, bp->b_npages); 3465} 3466 3467/* 3468 * Tell the VM system that the pages associated with this buffer 3469 * are clean. This is used for delayed writes where the data is 3470 * going to go to disk eventually without additional VM intervention. 3471 * 3472 * Note that while we only really need to clean through to b_bcount, we 3473 * just go ahead and clean through to b_bufsize. 3474 */ 3475static void 3476vfs_clean_pages(struct buf *bp) 3477{ 3478 int i; 3479 vm_ooffset_t foff, noff, eoff; 3480 vm_page_t m; 3481 3482 if (!(bp->b_flags & B_VMIO)) 3483 return; 3484 3485 foff = bp->b_offset; 3486 KASSERT(bp->b_offset != NOOFFSET, 3487 ("vfs_clean_pages: no buffer offset")); 3488 VM_OBJECT_LOCK(bp->b_object); 3489 vm_page_lock_queues(); 3490 for (i = 0; i < bp->b_npages; i++) { 3491 m = bp->b_pages[i]; 3492 noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; 3493 eoff = noff; 3494 3495 if (eoff > bp->b_offset + bp->b_bufsize) 3496 eoff = bp->b_offset + bp->b_bufsize; 3497 vfs_page_set_valid(bp, foff, i, m); 3498 /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */ 3499 foff = noff; 3500 } 3501 vm_page_unlock_queues(); 3502 VM_OBJECT_UNLOCK(bp->b_object); 3503} 3504 3505/* 3506 * vfs_bio_set_validclean: 3507 * 3508 * Set the range within the buffer to valid and clean. The range is 3509 * relative to the beginning of the buffer, b_offset. Note that b_offset 3510 * itself may be offset from the beginning of the first page. 3511 * 3512 */ 3513 3514void 3515vfs_bio_set_validclean(struct buf *bp, int base, int size) 3516{ 3517 int i, n; 3518 vm_page_t m; 3519 3520 if (!(bp->b_flags & B_VMIO)) 3521 return; 3522 3523 /* 3524 * Fix up base to be relative to beginning of first page. 3525 * Set initial n to be the maximum number of bytes in the 3526 * first page that can be validated. 3527 */ 3528 3529 base += (bp->b_offset & PAGE_MASK); 3530 n = PAGE_SIZE - (base & PAGE_MASK); 3531 3532 VM_OBJECT_LOCK(bp->b_object); 3533 vm_page_lock_queues(); 3534 for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { 3535 m = bp->b_pages[i]; 3536 3537 if (n > size) 3538 n = size; 3539 3540 vm_page_set_validclean(m, base & PAGE_MASK, n); 3541 base += n; 3542 size -= n; 3543 n = PAGE_SIZE; 3544 } 3545 vm_page_unlock_queues(); 3546 VM_OBJECT_UNLOCK(bp->b_object); 3547} 3548 3549/* 3550 * vfs_bio_clrbuf: 3551 * 3552 * clear a buffer. This routine essentially fakes an I/O, so we need 3553 * to clear BIO_ERROR and B_INVAL. 3554 * 3555 * Note that while we only theoretically need to clear through b_bcount, 3556 * we go ahead and clear through b_bufsize.
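 *
 * (The valid masks computed below carry one bit per DEV_BSIZE chunk
 * of a page.  Worked example assuming DEV_BSIZE 512: a page-aligned
 * 4096-byte buffer gives mask = (1 << (4096 / 512)) - 1 = 0xff, and
 * a page whose ->valid field already equals 0xff is left untouched.)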
3557 */ 3558 3559void 3560vfs_bio_clrbuf(struct buf *bp) 3561{ 3562 int i, j, mask = 0; 3563 caddr_t sa, ea; 3564 3565 GIANT_REQUIRED; 3566 3567 if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) { 3568 clrbuf(bp); 3569 return; 3570 } 3571 bp->b_flags &= ~B_INVAL; 3572 bp->b_ioflags &= ~BIO_ERROR; 3573 VM_OBJECT_LOCK(bp->b_object); 3574 if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) && 3575 (bp->b_offset & PAGE_MASK) == 0) { 3576 if (bp->b_pages[0] == bogus_page) 3577 goto unlock; 3578 mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1; 3579 VM_OBJECT_LOCK_ASSERT(bp->b_pages[0]->object, MA_OWNED); 3580 if ((bp->b_pages[0]->valid & mask) == mask) 3581 goto unlock; 3582 if (((bp->b_pages[0]->flags & PG_ZERO) == 0) && 3583 ((bp->b_pages[0]->valid & mask) == 0)) { 3584 bzero(bp->b_data, bp->b_bufsize); 3585 bp->b_pages[0]->valid |= mask; 3586 goto unlock; 3587 } 3588 } 3589 ea = sa = bp->b_data; 3590 for(i = 0; i < bp->b_npages; i++, sa = ea) { 3591 ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE); 3592 ea = (caddr_t)(vm_offset_t)ulmin( 3593 (u_long)(vm_offset_t)ea, 3594 (u_long)(vm_offset_t)bp->b_data + bp->b_bufsize); 3595 if (bp->b_pages[i] == bogus_page) 3596 continue; 3597 j = ((vm_offset_t)sa & PAGE_MASK) / DEV_BSIZE; 3598 mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; 3599 VM_OBJECT_LOCK_ASSERT(bp->b_pages[i]->object, MA_OWNED); 3600 if ((bp->b_pages[i]->valid & mask) == mask) 3601 continue; 3602 if ((bp->b_pages[i]->valid & mask) == 0) { 3603 if ((bp->b_pages[i]->flags & PG_ZERO) == 0) 3604 bzero(sa, ea - sa); 3605 } else { 3606 for (; sa < ea; sa += DEV_BSIZE, j++) { 3607 if (((bp->b_pages[i]->flags & PG_ZERO) == 0) && 3608 (bp->b_pages[i]->valid & (1<<j)) == 0) 3609 bzero(sa, DEV_BSIZE); 3610 } 3611 } 3612 bp->b_pages[i]->valid |= mask; 3613 } 3614unlock: 3615 VM_OBJECT_UNLOCK(bp->b_object); 3616 bp->b_resid = 0; 3617} 3618 3619/* 3620 * vm_hold_load_pages and vm_hold_free_pages get pages into 3621 * a buffer's address space. The pages are anonymous and are 3622 * not associated with a file object. 3623 */ 3624static void 3625vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) 3626{ 3627 vm_offset_t pg; 3628 vm_page_t p; 3629 int index; 3630 3631 to = round_page(to); 3632 from = round_page(from); 3633 index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; 3634 3635 VM_OBJECT_LOCK(kernel_object); 3636 for (pg = from; pg < to; pg += PAGE_SIZE, index++) { 3637tryagain: 3638 /* 3639 * note: must allocate system pages since blocking here 3640 * could interfere with paging I/O, no matter which 3641 * process we are.
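 *
 * (VM_ALLOC_SYSTEM lets this allocation dig deeper into the free-page
 * reserve than an ordinary allocation may, and VM_ALLOC_WIRED hands
 * the page back already wired; if the allocation still fails, the
 * tryagain loop sleeps in VM_WAIT and retries.)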
3642 */ 3643 p = vm_page_alloc(kernel_object, 3644 ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), 3645 VM_ALLOC_SYSTEM | VM_ALLOC_WIRED); 3646 if (!p) { 3647 atomic_add_int(&vm_pageout_deficit, 3648 (to - pg) >> PAGE_SHIFT); 3649 VM_OBJECT_UNLOCK(kernel_object); 3650 VM_WAIT; 3651 VM_OBJECT_LOCK(kernel_object); 3652 goto tryagain; 3653 } 3654 p->valid = VM_PAGE_BITS_ALL; 3655 pmap_qenter(pg, &p, 1); 3656 bp->b_pages[index] = p; 3657 vm_page_lock_queues(); 3658 vm_page_wakeup(p); 3659 vm_page_unlock_queues(); 3660 } 3661 VM_OBJECT_UNLOCK(kernel_object); 3662 bp->b_npages = index; 3663} 3664 3665/* Return pages associated with this buf to the vm system */ 3666static void 3667vm_hold_free_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) 3668{ 3669 vm_offset_t pg; 3670 vm_page_t p; 3671 int index, newnpages; 3672 3673 GIANT_REQUIRED; 3674 3675 from = round_page(from); 3676 to = round_page(to); 3677 newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; 3678 3679 VM_OBJECT_LOCK(kernel_object); 3680 for (pg = from; pg < to; pg += PAGE_SIZE, index++) { 3681 p = bp->b_pages[index]; 3682 if (p && (index < bp->b_npages)) { 3683 if (p->busy) { 3684 printf( 3685 "vm_hold_free_pages: blkno: %jd, lblkno: %jd\n", 3686 (intmax_t)bp->b_blkno, 3687 (intmax_t)bp->b_lblkno); 3688 } 3689 bp->b_pages[index] = NULL; 3690 pmap_qremove(pg, 1); 3691 vm_page_lock_queues(); 3692 vm_page_busy(p); 3693 vm_page_unwire(p, 0); 3694 vm_page_free(p); 3695 vm_page_unlock_queues(); 3696 } 3697 } 3698 VM_OBJECT_UNLOCK(kernel_object); 3699 bp->b_npages = newnpages; 3700} 3701 3702/* 3703 * Map an I/O request into kernel virtual address space. 3704 * 3705 * All requests are (re)mapped into kernel VA space. 3706 * Notice that we use b_bufsize for the size of the buffer 3707 * to be mapped. b_bcount might be modified by the driver. 3708 * 3709 * Note that even if the caller determines that the address space should 3710 * be valid, a race or a smaller file mapped into a larger space may 3711 * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST 3712 * check the return value. 3713 */ 3714int 3715vmapbuf(struct buf *bp) 3716{ 3717 caddr_t addr, kva; 3718 vm_prot_t prot; 3719 int pidx, i; 3720 struct vm_page *m; 3721 struct pmap *pmap = &curproc->p_vmspace->vm_pmap; 3722 3723 if (bp->b_bufsize < 0) 3724 return (-1); 3725 prot = VM_PROT_READ; 3726 if (bp->b_iocmd == BIO_READ) 3727 prot |= VM_PROT_WRITE; /* Less backwards than it looks */ 3728 for (addr = (caddr_t)trunc_page((vm_offset_t)bp->b_data), pidx = 0; 3729 addr < bp->b_data + bp->b_bufsize; 3730 addr += PAGE_SIZE, pidx++) { 3731 /* 3732 * Do the vm_fault if needed; do the copy-on-write thing 3733 * when reading stuff off device into memory. 3734 * 3735 * NOTE! Must use pmap_extract() because addr may be in 3736 * the userland address space, and kextract is only guaranteed 3737 * to work for the kernel address space (see: sparc64 port). 3738 */ 3739retry: 3740 if (vm_fault_quick(addr >= bp->b_data ?
addr : bp->b_data, 3741 prot) < 0) { 3742 vm_page_lock_queues(); 3743 for (i = 0; i < pidx; ++i) { 3744 vm_page_unhold(bp->b_pages[i]); 3745 bp->b_pages[i] = NULL; 3746 } 3747 vm_page_unlock_queues(); 3748 return(-1); 3749 } 3750 m = pmap_extract_and_hold(pmap, (vm_offset_t)addr, prot); 3751 if (m == NULL) 3752 goto retry; 3753 bp->b_pages[pidx] = m; 3754 } 3755 if (pidx > btoc(MAXPHYS)) 3756 panic("vmapbuf: mapped more than MAXPHYS"); 3757 pmap_qenter((vm_offset_t)bp->b_saveaddr, bp->b_pages, pidx); 3758 3759 kva = bp->b_saveaddr; 3760 bp->b_npages = pidx; 3761 bp->b_saveaddr = bp->b_data; 3762 bp->b_data = kva + (((vm_offset_t) bp->b_data) & PAGE_MASK); 3763 return(0); 3764} 3765 3766/* 3767 * Free the I/O map PTEs associated with this I/O operation. 3768 * We also invalidate the TLB entries and restore the original b_addr. 3769 */ 3770void 3771vunmapbuf(struct buf *bp) 3772{ 3773 int pidx; 3774 int npages; 3775 3776 npages = bp->b_npages; 3777 pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages); 3778 vm_page_lock_queues(); 3779 for (pidx = 0; pidx < npages; pidx++) 3780 vm_page_unhold(bp->b_pages[pidx]); 3781 vm_page_unlock_queues(); 3782 3783 bp->b_data = bp->b_saveaddr; 3784} 3785 3786void 3787bdone(struct buf *bp) 3788{ 3789 3790 mtx_lock(&bdonelock); 3791 bp->b_flags |= B_DONE; 3792 wakeup(bp); 3793 mtx_unlock(&bdonelock); 3794} 3795 3796void 3797bwait(struct buf *bp, u_char pri, const char *wchan) 3798{ 3799 3800 mtx_lock(&bdonelock); 3801 while ((bp->b_flags & B_DONE) == 0) 3802 msleep(bp, &bdonelock, pri, wchan, 0); 3803 mtx_unlock(&bdonelock); 3804} 3805 3806#if 0 /* this is here to unconfuse p4 diff */ 3807 3808void 3809bufstrategy(struct bufobj *bo, struct buf *bp) 3810{ 3811 int i = 0; 3812 struct vnode *vp; 3813 3814 vp = bp->b_vp; 3815 KASSERT(vp == bo->bo_vnode, ("Inconsistent vnode bufstrategy")); 3816 KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, 3817 ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp)); 3818 i = VOP_STRATEGY(vp, bp); 3819 KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp)); 3820} 3821 3822#endif 3823 3824void 3825bufobj_wref(struct bufobj *bo) 3826{ 3827 3828 KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); 3829 BO_LOCK(bo); 3830 bo->bo_numoutput++; 3831 BO_UNLOCK(bo); 3832} 3833 3834void 3835bufobj_wdrop(struct bufobj *bo) 3836{ 3837 3838 KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop")); 3839 BO_LOCK(bo); 3840 KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count")); 3841 if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) { 3842 bo->bo_flag &= ~BO_WWAIT; 3843 wakeup(&bo->bo_numoutput); 3844 } 3845 BO_UNLOCK(bo); 3846} 3847 3848int 3849bufobj_wwait(struct bufobj *bo, int slpflag, int timeo) 3850{ 3851 int error; 3852 3853 KASSERT(bo != NULL, ("NULL bo in bufobj_wwait")); 3854 ASSERT_BO_LOCKED(bo); 3855 error = 0; 3856 while (bo->bo_numoutput) { 3857 bo->bo_flag |= BO_WWAIT; 3858 error = msleep(&bo->bo_numoutput, BO_MTX(bo), 3859 slpflag | (PRIBIO + 1), "bo_wwait", timeo); 3860 if (error) 3861 break; 3862 } 3863 return (error); 3864} 3865 3866#include "opt_ddb.h" 3867#ifdef DDB 3868#include <ddb/ddb.h> 3869 3870/* DDB command to show buffer data */ 3871DB_SHOW_COMMAND(buffer, db_show_buffer) 3872{ 3873 /* get args */ 3874 struct buf *bp = (struct buf *)addr; 3875 3876 if (!have_addr) { 3877 db_printf("usage: show buffer <addr>\n"); 3878 return; 3879 } 3880 3881 db_printf("b_flags = 0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS); 3882 db_printf( 3883 "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n"
3884 "b_dev = (%d,%d), b_data = %p, b_blkno = %jd\n", 3885 bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, 3886 major(bp->b_dev), minor(bp->b_dev), bp->b_data, 3887 (intmax_t)bp->b_blkno); 3888 if (bp->b_npages) { 3889 int i; 3890 db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages); 3891 for (i = 0; i < bp->b_npages; i++) { 3892 vm_page_t m; 3893 m = bp->b_pages[i]; 3894 db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object, 3895 (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m)); 3896 if ((i + 1) < bp->b_npages) 3897 db_printf(","); 3898 } 3899 db_printf("\n"); 3900 } 3901} 3902#endif /* DDB */ 3903
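
/*
 * Editor's illustration (hypothetical helper, not part of the original
 * file): how the bdone()/bwait() rendezvous above is consumed.  A
 * synchronous caller that has already initiated a transfer on a locked
 * buffer parks in bufwait(), which sleeps in bwait() until the
 * completion path reaches bufdone(), whose synchronous tail calls
 * bdone() and wakes the sleeper.
 */
#if 0
static int
example_wait_for_io(struct buf *bp)
{

	/* The caller initiated the I/O; just wait for it to finish. */
	return (bufwait(bp));
}
#endif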