/*
 * Copyright (c) 1994,1997 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 *
 * $FreeBSD: head/sys/kern/vfs_bio.c 99589 2002-07-08 12:21:11Z bde $
 */

/*
 * this file contains a new buffer I/O scheme implementing a coherent
 * VM object and buffer cache scheme.  Pains have been taken to make
 * sure that the performance degradation associated with schemes such
 * as this is not realized.
 *
 * Author:  John S. Dyson
 * Significant help during the development and debugging phases
 * had been provided by David Greenman, also of the FreeBSD core team.
 *
 * see man buf(9) for more info.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/stdint.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/ktr.h>
#include <sys/proc.h>
#include <sys/reboot.h>
#include <sys/resourcevar.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>

static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer");

struct	bio_ops bioops;		/* I/O operation notification */

struct	buf_ops buf_ops_bio = {
	"buf_ops_bio",
	bwrite
};

/*
 * XXX buf is global because kern_shutdown.c and ffs_checkoverlap have
 * carnal knowledge of buffers.  This knowledge should be moved to vfs_bio.c.
 */
struct buf *buf;		/* buffer header pool */
struct mtx buftimelock;		/* Interlock on setting prio and timo */

static void vm_hold_free_pages(struct buf * bp, vm_offset_t from,
		vm_offset_t to);
static void vm_hold_load_pages(struct buf * bp, vm_offset_t from,
		vm_offset_t to);
static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off,
			       int pageno, vm_page_t m);
static void vfs_clean_pages(struct buf * bp);
static void vfs_setdirty(struct buf *bp);
static void vfs_vmio_release(struct buf *bp);
static void vfs_backgroundwritedone(struct buf *bp);
static int flushbufqueues(void);
static void buf_daemon(void);

int vmiodirenable = TRUE;
SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0,
    "Use the VM system for directory writes");
int runningbufspace;
SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0,
    "Amount of presently outstanding async buffer io");
static int bufspace;
SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0,
    "KVA memory used for bufs");
static int maxbufspace;
SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0,
    "Maximum allowed value of bufspace (including buf_daemon)");
static int bufmallocspace;
SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
    "Amount of malloced memory for buffers");
static int maxbufmallocspace;
SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0,
    "Maximum amount of malloced memory for buffers");
static int lobufspace;
SYSCTL_INT(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0,
    "Minimum amount of buffers we want to have");
static int hibufspace;
SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0,
    "Maximum allowed value of bufspace (excluding buf_daemon)");
static int bufreusecnt;
SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW, &bufreusecnt, 0,
    "Number of times we have reused a buffer");
static int buffreekvacnt;
SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0,
    "Number of times we have freed the KVA space from some buffer");
static int bufdefragcnt;
SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0,
    "Number of times we have had to repeat buffer allocation to defragment");
static int lorunningspace;
SYSCTL_INT(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW, &lorunningspace, 0,
    "Minimum preferred space used for in-progress I/O");
static int hirunningspace;
SYSCTL_INT(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW, &hirunningspace, 0,
    "Maximum amount of space to use for in-progress I/O");
static int numdirtybuffers;
SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0,
    "Number of buffers that are dirty (have unwritten changes) at the moment");
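
/*
 * Illustrative note: all of the SYSCTL_INT knobs above and below hang off
 * the "vfs" tree (dobkgrdwrite is under "debug"), so the counters can be
 * inspected, and the CTLFLAG_RW watermarks tuned, from userland with
 * sysctl(8) and no rebuild, e.g.:
 *
 *	sysctl vfs.numdirtybuffers		(read-only counter)
 *	sysctl vfs.hirunningspace=2097152	(tunable watermark)
 */
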
static int lodirtybuffers;
SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0,
    "How many buffers we want to have free before bufdaemon can sleep");
static int hidirtybuffers;
SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0,
    "When the number of dirty buffers is considered severe");
static int numfreebuffers;
SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0,
    "Number of free buffers");
static int lofreebuffers;
SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0,
    "XXX Unused");
static int hifreebuffers;
SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0,
    "XXX Complicatedly unused");
static int getnewbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0,
    "Number of calls to getnewbuf");
static int getnewbufrestarts;
SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0,
    "Number of times getnewbuf has had to restart a buffer acquisition");
static int dobkgrdwrite = 1;
SYSCTL_INT(_debug, OID_AUTO, dobkgrdwrite, CTLFLAG_RW, &dobkgrdwrite, 0,
    "Do background writes (honoring the BX_BKGRDWRITE flag)?");

/*
 * Wakeup point for bufdaemon, as well as indicator of whether it is already
 * active.  Set to 1 when the bufdaemon is already "on" the queue, 0 when it
 * is idling.
 */
static int bd_request;

/*
 * bogus page -- for I/O to/from partially complete buffers
 * this is a temporary solution to the problem, but it is not
 * really that bad.  it would be better to split the buffer
 * for input in the case of buffers partially already in memory,
 * but the code is intricate enough already.
 */
vm_page_t bogus_page;

/*
 * Offset for bogus_page.
 * XXX bogus_offset should be local to bufinit
 */
static vm_offset_t bogus_offset;

/*
 * Synchronization (sleep/wakeup) variable for active buffer space requests.
 * Set when wait starts, cleared prior to wakeup().
 * Used in runningbufwakeup() and waitrunningbufspace().
 */
static int runningbufreq;

/*
 * Synchronization (sleep/wakeup) variable for buffer requests.
 * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
 * by and/or.
 * Used in numdirtywakeup(), bufspacewakeup(), bufcountwakeup(), bwillwrite(),
 * getnewbuf(), and getblk().
 */
static int needsbuffer;
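
/*
 * Sketch of the bd_request handshake (the waking side, bd_wakeup(), is
 * defined below; the sleeping side lives in buf_daemon()).  Roughly:
 *
 *	waker (e.g. bdwrite):			buf_daemon:
 *		if (bd_request == 0) {			bd_request = 0;
 *			bd_request = 1;			tsleep(&bd_request, ...);
 *			wakeup(&bd_request);		... flush dirty bufs ...
 *		}
 *
 * A lost race here at worst delays the flush until the daemon's next
 * wakeup; the dirty buffers themselves are never lost.
 */
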
/*
 * Mask for index into the buffer hash table, which needs to be a power of 2
 * in size.  Set in kern_vfs_bio_buffer_alloc.
 */
static int bufhashmask;

/*
 * Hash table for all buffers, with a linked list hanging from each table
 * entry.  Set in kern_vfs_bio_buffer_alloc, initialized in buf_init.
 */
static LIST_HEAD(bufhashhdr, buf) *bufhashtbl;

/*
 * Somewhere to store buffers when they are not in another list, to always
 * have them in a list (and thus being able to use the same set of operations
 * on them.)
 */
static struct bufhashhdr invalhash;

/*
 * Definitions for the buffer free lists.
 */
#define BUFFER_QUEUES	6	/* number of free buffer queues */

#define QUEUE_NONE	0	/* on no queue */
#define QUEUE_LOCKED	1	/* locked buffers */
#define QUEUE_CLEAN	2	/* non-B_DELWRI buffers */
#define QUEUE_DIRTY	3	/* B_DELWRI buffers */
#define QUEUE_EMPTYKVA	4	/* empty buffer headers w/KVA assignment */
#define QUEUE_EMPTY	5	/* empty buffer headers */

/* Queues for free buffers with various properties */
static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };

/*
 * Single global constant for BUF_WMESG, to avoid getting multiple references.
 * buf_wmesg is referred from macros.
 */
const char *buf_wmesg = BUF_WMESG;

#define VFS_BIO_NEED_ANY	0x01	/* any freeable buffer */
#define VFS_BIO_NEED_DIRTYFLUSH	0x02	/* waiting for dirty buffer flush */
#define VFS_BIO_NEED_FREE	0x04	/* wait for free bufs, hi hysteresis */
#define VFS_BIO_NEED_BUFSPACE	0x08	/* wait for buf space, lo hysteresis */

/*
 * Buffer hash table code.  Note that the logical block scans linearly, which
 * gives us some L1 cache locality.
 */

static __inline
struct bufhashhdr *
bufhash(struct vnode *vnp, daddr_t bn)
{
	return(&bufhashtbl[(((uintptr_t)(vnp) >> 7) + (int)bn) & bufhashmask]);
}
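
/*
 * Example of the hash behavior: all buffers of one vnode share the vnode
 * pointer's low bits, so the pointer is shifted right before the logical
 * block number is added.  Consecutive blocks of the same file therefore
 * land in consecutive hash chains:
 *
 *	bufhash(vp, bn)     == &bufhashtbl[(((uintptr_t)vp >> 7) + bn) & bufhashmask]
 *	bufhash(vp, bn + 1) == the next chain in the table (modulo the mask)
 *
 * which is the linear scan / L1 locality property noted above.
 */
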
/*
 * numdirtywakeup:
 *
 *	If someone is blocked due to there being too many dirty buffers,
 *	and numdirtybuffers is now reasonable, wake them up.
 */

static __inline void
numdirtywakeup(int level)
{
	if (numdirtybuffers <= level) {
		if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) {
			needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH;
			wakeup(&needsbuffer);
		}
	}
}

/*
 * bufspacewakeup:
 *
 *	Called when buffer space is potentially available for recovery.
 *	getnewbuf() will block on this flag when it is unable to free
 *	sufficient buffer space.  Buffer space becomes recoverable when
 *	bp's get placed back in the queues.
 */

static __inline void
bufspacewakeup(void)
{
	/*
	 * If someone is waiting for BUF space, wake them up.  Even
	 * though we haven't freed the kva space yet, the waiting
	 * process will be able to now.
	 */
	if (needsbuffer & VFS_BIO_NEED_BUFSPACE) {
		needsbuffer &= ~VFS_BIO_NEED_BUFSPACE;
		wakeup(&needsbuffer);
	}
}

/*
 * runningbufwakeup() - in-progress I/O accounting.
 */
static __inline void
runningbufwakeup(struct buf *bp)
{
	if (bp->b_runningbufspace) {
		runningbufspace -= bp->b_runningbufspace;
		bp->b_runningbufspace = 0;
		if (runningbufreq && runningbufspace <= lorunningspace) {
			runningbufreq = 0;
			wakeup(&runningbufreq);
		}
	}
}

/*
 * bufcountwakeup:
 *
 *	Called when a buffer has been added to one of the free queues to
 *	account for the buffer and to wakeup anyone waiting for free buffers.
 *	This typically occurs when large amounts of metadata are being handled
 *	by the buffer cache ( else buffer space runs out first, usually ).
 */

static __inline void
bufcountwakeup(void)
{
	++numfreebuffers;
	if (needsbuffer) {
		needsbuffer &= ~VFS_BIO_NEED_ANY;
		if (numfreebuffers >= hifreebuffers)
			needsbuffer &= ~VFS_BIO_NEED_FREE;
		wakeup(&needsbuffer);
	}
}

/*
 * waitrunningbufspace()
 *
 *	runningbufspace is a measure of the amount of I/O currently
 *	running.  This routine is used in async-write situations to
 *	prevent creating huge backups of pending writes to a device.
 *	Only asynchronous writes are governed by this function.
 *
 *	Reads will adjust runningbufspace, but will not block based on it.
 *	The read load has a side effect of reducing the allowed write load.
 *
 *	This does NOT turn an async write into a sync write.  It waits
 *	for earlier writes to complete and generally returns before the
 *	caller's write has reached the device.
 */
static __inline void
waitrunningbufspace(void)
{
	/*
	 * XXX race against wakeup interrupt, currently
	 * protected by Giant.  FIXME!
	 */
	while (runningbufspace > hirunningspace) {
		++runningbufreq;
		tsleep(&runningbufreq, PVM, "wdrain", 0);
	}
}
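
/*
 * Illustration of how the two halves above pair up: bwrite() (below)
 * charges each write against the pool before starting it and then, for
 * async writes without B_NOWDRAIN, throttles itself:
 *
 *	bp->b_runningbufspace = bp->b_bufsize;
 *	runningbufspace += bp->b_runningbufspace;
 *	...
 *	waitrunningbufspace();
 *
 * while the completion side calls runningbufwakeup(bp), returning the
 * space and waking the sleeper once usage drops to lorunningspace.
 */
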
/*
 * vfs_buf_test_cache:
 *
 *	Called when a buffer is extended.  This function clears the B_CACHE
 *	bit if the newly extended portion of the buffer does not contain
 *	valid data.
 */
static __inline__
void
vfs_buf_test_cache(struct buf *bp,
		  vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
		  vm_page_t m)
{
	GIANT_REQUIRED;

	if (bp->b_flags & B_CACHE) {
		int base = (foff + off) & PAGE_MASK;
		if (vm_page_is_valid(m, base, size) == 0)
			bp->b_flags &= ~B_CACHE;
	}
}

/* Wake up the buffer daemon if necessary */
static __inline__
void
bd_wakeup(int dirtybuflevel)
{
	if (bd_request == 0 && numdirtybuffers >= dirtybuflevel) {
		bd_request = 1;
		wakeup(&bd_request);
	}
}

/*
 * bd_speedup - speedup the buffer cache flushing code
 */

static __inline__
void
bd_speedup(void)
{
	bd_wakeup(1);
}

/*
 * Calculate buffer cache scaling values and reserve space for buffer
 * headers.  This is called during low level kernel initialization and
 * may be called more than once.  We CANNOT write to the memory area
 * being reserved at this time.
 */
caddr_t
kern_vfs_bio_buffer_alloc(caddr_t v, int physmem_est)
{
	/*
	 * physmem_est is in pages.  Convert it to kilobytes (assumes
	 * PAGE_SIZE is >= 1K)
	 */
	physmem_est = physmem_est * (PAGE_SIZE / 1024);

	/*
	 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
	 * For the first 64MB of ram nominally allocate sufficient buffers to
	 * cover 1/4 of our ram.  Beyond the first 64MB allocate additional
	 * buffers to cover 1/20 of our ram over 64MB.  When auto-sizing
	 * the buffer cache we limit the eventual kva reservation to
	 * maxbcache bytes.
	 *
	 * factor represents the 1/4 x ram conversion.
	 */
	if (nbuf == 0) {
		int factor = 4 * BKVASIZE / 1024;

		nbuf = 50;
		if (physmem_est > 4096)
			nbuf += min((physmem_est - 4096) / factor,
			    65536 / factor);
		if (physmem_est > 65536)
			nbuf += (physmem_est - 65536) * 2 / (factor * 5);

		if (maxbcache && nbuf > maxbcache / BKVASIZE)
			nbuf = maxbcache / BKVASIZE;
	}

#if 0
	/*
	 * Do not allow the buffer_map to be more than 1/2 the size of the
	 * kernel_map.
	 */
	if (nbuf > (kernel_map->max_offset - kernel_map->min_offset) /
	    (BKVASIZE * 2)) {
		nbuf = (kernel_map->max_offset - kernel_map->min_offset) /
		    (BKVASIZE * 2);
		printf("Warning: nbufs capped at %d\n", nbuf);
	}
#endif
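
	/*
	 * Worked example of the sizing above, assuming BKVASIZE is 16K
	 * (so factor == 4 * 16384 / 1024 == 64): with 128MB of memory,
	 * physmem_est == 131072 (KB) and
	 *
	 *	nbuf = 50
	 *	     + min((131072 - 4096) / 64, 65536 / 64)	-> + 1024
	 *	     + (131072 - 65536) * 2 / (64 * 5)		-> + 409
	 *	     = 1483 buffers
	 *
	 * before any maxbcache clamp is applied.
	 */
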
	/*
	 * swbufs are used as temporary holders for I/O, such as paging I/O.
	 * We have no fewer than 16 and no more than 256.
	 */
	nswbuf = max(min(nbuf/4, 256), 16);

	/*
	 * Reserve space for the buffer cache buffers
	 */
	swbuf = (void *)v;
	v = (caddr_t)(swbuf + nswbuf);
	buf = (void *)v;
	v = (caddr_t)(buf + nbuf);

	/*
	 * Calculate the hash table size and reserve space
	 */
	for (bufhashmask = 8; bufhashmask < nbuf / 4; bufhashmask <<= 1)
		;
	bufhashtbl = (void *)v;
	v = (caddr_t)(bufhashtbl + bufhashmask);
	--bufhashmask;

	return(v);
}

/* Initialize the buffer subsystem.  Called before use of any buffers. */
void
bufinit(void)
{
	struct buf *bp;
	int i;

	GIANT_REQUIRED;

	LIST_INIT(&invalhash);
	mtx_init(&buftimelock, "buftime lock", NULL, MTX_DEF);

	for (i = 0; i <= bufhashmask; i++)
		LIST_INIT(&bufhashtbl[i]);

	/* next, make a null set of free lists */
	for (i = 0; i < BUFFER_QUEUES; i++)
		TAILQ_INIT(&bufqueues[i]);

	/* finally, initialize each buffer header and stick on empty q */
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bzero(bp, sizeof *bp);
		bp->b_flags = B_INVAL;	/* we're just an empty header */
		bp->b_dev = NODEV;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_qindex = QUEUE_EMPTY;
		bp->b_xflags = 0;
		LIST_INIT(&bp->b_dep);
		BUF_LOCKINIT(bp);
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	}

	/*
	 * maxbufspace is the absolute maximum amount of buffer space we are
	 * allowed to reserve in KVM and in real terms.  The absolute maximum
	 * is nominally used by buf_daemon.  hibufspace is the nominal maximum
	 * used by most other processes.  The differential is required to
	 * ensure that buf_daemon is able to run when other processes might
	 * be blocked waiting for buffer space.
	 *
	 * maxbufspace is based on BKVASIZE.  Allocating buffers larger than
	 * this may result in KVM fragmentation which is not handled optimally
	 * by the system.
	 */
	maxbufspace = nbuf * BKVASIZE;
	hibufspace = imax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10);
	lobufspace = hibufspace - MAXBSIZE;

	lorunningspace = 512 * 1024;
	hirunningspace = 1024 * 1024;

/*
 * Limit the amount of malloc memory since it is wired permanently into
 * the kernel space.  Even though this is accounted for in the buffer
 * allocation, we don't want the malloced region to grow uncontrolled.
 * The malloc scheme improves memory utilization significantly on average
 * (small) directories.
 */
	maxbufmallocspace = hibufspace / 20;

/*
 * Reduce the chance of a deadlock occurring by limiting the number
 * of delayed-write dirty buffers we allow to stack up.
 */
	hidirtybuffers = nbuf / 4 + 20;
	numdirtybuffers = 0;
/*
 * To support extreme low-memory systems, make sure hidirtybuffers cannot
 * eat up all available buffer space.  This occurs when our minimum cannot
 * be met.  We try to size hidirtybuffers to 3/4 our buffer space assuming
 * BKVASIZE'd (8K) buffers.
 */
	while (hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
		hidirtybuffers >>= 1;
	}
	lodirtybuffers = hidirtybuffers / 2;

/*
 * Try to keep the number of free buffers in the specified range,
 * and give special processes (e.g. like buf_daemon) access to an
 * emergency reserve.
 */
	lofreebuffers = nbuf / 18 + 5;
	hifreebuffers = 2 * lofreebuffers;
	numfreebuffers = nbuf;

/*
 * Maximum number of async ops initiated per buf_daemon loop.  This is
 * somewhat of a hack at the moment, we really need to limit ourselves
 * based on the number of bytes of I/O in-transit that were initiated
 * from buf_daemon.
 */

	bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
	bogus_page = vm_page_alloc(kernel_object,
	    ((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
	    VM_ALLOC_NORMAL);
	cnt.v_wire_count++;
}
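
/*
 * Worked example of the watermarks computed above, for nbuf == 1483 (see
 * the sizing example in kern_vfs_bio_buffer_alloc()):
 * hidirtybuffers = 1483 / 4 + 20 = 390, lodirtybuffers = 195,
 * lofreebuffers = 1483 / 18 + 5 = 87 and hifreebuffers = 174.  The
 * halving loop only triggers on machines so small that hidirtybuffers
 * BKVASIZE'd buffers would exceed 3/4 of hibufspace.
 */
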
/*
 * bfreekva() - free the kva allocation for a buffer.
 *
 *	Must be called at splbio() or higher as this is the only locking for
 *	buffer_map.
 *
 *	Since this call frees up buffer space, we call bufspacewakeup().
 */
static void
bfreekva(struct buf * bp)
{
	GIANT_REQUIRED;

	if (bp->b_kvasize) {
		++buffreekvacnt;
		bufspace -= bp->b_kvasize;
		vm_map_delete(buffer_map,
		    (vm_offset_t) bp->b_kvabase,
		    (vm_offset_t) bp->b_kvabase + bp->b_kvasize
		);
		bp->b_kvasize = 0;
		bufspacewakeup();
	}
}

/*
 * bremfree:
 *
 *	Remove the buffer from the appropriate free list.
 */
void
bremfree(struct buf * bp)
{
	int s = splbio();
	int old_qindex = bp->b_qindex;

	GIANT_REQUIRED;

	if (bp->b_qindex != QUEUE_NONE) {
		KASSERT(BUF_REFCNT(bp) == 1, ("bremfree: bp %p not locked",bp));
		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
		bp->b_qindex = QUEUE_NONE;
	} else {
		if (BUF_REFCNT(bp) <= 1)
			panic("bremfree: removing a buffer not on a queue");
	}

	/*
	 * Fixup numfreebuffers count.  If the buffer is invalid or not
	 * delayed-write, and it was on the EMPTY, LRU, or AGE queues,
	 * the buffer was free and we must decrement numfreebuffers.
	 */
	if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
		switch(old_qindex) {
		case QUEUE_DIRTY:
		case QUEUE_CLEAN:
		case QUEUE_EMPTY:
		case QUEUE_EMPTYKVA:
			--numfreebuffers;
			break;
		default:
			break;
		}
	}
	splx(s);
}

/*
 * Get a buffer with the specified data.  Look in the cache first.  We
 * must clear BIO_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE
 * is set, the buffer is valid and we do not have to do anything ( see
 * getblk() ).  This is really just a special case of breadn().
 */
int
bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
    struct buf ** bpp)
{

	return (breadn(vp, blkno, size, 0, 0, 0, cred, bpp));
}
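
/*
 * Typical caller pattern for the cache-first read above (illustrative
 * only; "lblkno" and "bsize" stand in for a filesystem's own values):
 *
 *	struct buf *bp;
 *	int error;
 *
 *	error = bread(vp, lblkno, bsize, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);		(a buffer is returned even on error)
 *		return (error);
 *	}
 *	... examine bp->b_data ...
 *	brelse(bp);			(or bdwrite(bp) if it was modified)
 */
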
/*
 * Operates like bread, but also starts asynchronous I/O on
 * read-ahead blocks.  We must clear BIO_ERROR and B_INVAL prior
 * to initiating I/O.  If B_CACHE is set, the buffer is valid
 * and we do not have to do anything.
 */
int
breadn(struct vnode * vp, daddr_t blkno, int size,
    daddr_t * rablkno, int *rabsize,
    int cnt, struct ucred * cred, struct buf ** bpp)
{
	struct buf *bp, *rabp;
	int i;
	int rv = 0, readwait = 0;

	*bpp = bp = getblk(vp, blkno, size, 0, 0);

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curthread != PCPU_GET(idlethread))
			curthread->td_proc->p_stats->p_ru.ru_inblock++;
		bp->b_iocmd = BIO_READ;
		bp->b_flags &= ~B_INVAL;
		bp->b_ioflags &= ~BIO_ERROR;
		if (bp->b_rcred == NOCRED && cred != NOCRED)
			bp->b_rcred = crhold(cred);
		vfs_busy_pages(bp, 0);
		VOP_STRATEGY(vp, bp);
		++readwait;
	}

	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
		if (inmem(vp, *rablkno))
			continue;
		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);

		if ((rabp->b_flags & B_CACHE) == 0) {
			if (curthread != PCPU_GET(idlethread))
				curthread->td_proc->p_stats->p_ru.ru_inblock++;
			rabp->b_flags |= B_ASYNC;
			rabp->b_flags &= ~B_INVAL;
			rabp->b_ioflags &= ~BIO_ERROR;
			rabp->b_iocmd = BIO_READ;
			if (rabp->b_rcred == NOCRED && cred != NOCRED)
				rabp->b_rcred = crhold(cred);
			vfs_busy_pages(rabp, 0);
			BUF_KERNPROC(rabp);
			VOP_STRATEGY(vp, rabp);
		} else {
			brelse(rabp);
		}
	}

	if (readwait) {
		rv = bufwait(bp);
	}
	return (rv);
}
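
/*
 * Hypothetical read-ahead use of breadn(): read block 10 of a file and
 * kick off asynchronous reads of the two blocks behind it; only the
 * first block is waited for:
 *
 *	daddr_t rablks[2] = { 11, 12 };
 *	int rasizes[2] = { bsize, bsize };
 *
 *	error = breadn(vp, 10, bsize, rablks, rasizes, 2, NOCRED, &bp);
 */
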
/*
 * Write, release buffer on completion.  (Done by iodone
 * if async).  Do not bother writing anything if the buffer
 * is invalid.
 *
 * Note that we set B_CACHE here, indicating that buffer is
 * fully valid and thus cacheable.  This is true even of NFS
 * now so we set it generally.  This could be set either here
 * or in biodone() since the I/O is synchronous.  We put it
 * here.
 */
int
bwrite(struct buf * bp)
{
	int oldflags, s;
	struct buf *newbp;

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return (0);
	}

	oldflags = bp->b_flags;

	if (BUF_REFCNT(bp) == 0)
		panic("bwrite: buffer is not busy???");
	s = splbio();
	/*
	 * If a background write is already in progress, delay
	 * writing this block if it is asynchronous. Otherwise
	 * wait for the background write to complete.
	 */
	if (bp->b_xflags & BX_BKGRDINPROG) {
		if (bp->b_flags & B_ASYNC) {
			splx(s);
			bdwrite(bp);
			return (0);
		}
		bp->b_xflags |= BX_BKGRDWAIT;
		tsleep(&bp->b_xflags, PRIBIO, "bwrbg", 0);
		if (bp->b_xflags & BX_BKGRDINPROG)
			panic("bwrite: still writing");
	}

	/* Mark the buffer clean */
	bundirty(bp);

	/*
	 * If this buffer is marked for background writing and we
	 * do not have to wait for it, make a copy and write the
	 * copy so as to leave this buffer ready for further use.
	 *
	 * This optimization eats a lot of memory.  If we have a page
	 * or buffer shortfall we can't do it.
	 */
	if (dobkgrdwrite && (bp->b_xflags & BX_BKGRDWRITE) &&
	    (bp->b_flags & B_ASYNC) &&
	    !vm_page_count_severe() &&
	    !buf_dirty_count_severe()) {
		if (bp->b_iodone != NULL) {
			printf("bp->b_iodone = %p\n", bp->b_iodone);
			panic("bwrite: need chained iodone");
		}

		/* get a new block */
		newbp = geteblk(bp->b_bufsize);

		/* set it to be identical to the old block */
		memcpy(newbp->b_data, bp->b_data, bp->b_bufsize);
		bgetvp(bp->b_vp, newbp);
		newbp->b_lblkno = bp->b_lblkno;
		newbp->b_blkno = bp->b_blkno;
		newbp->b_offset = bp->b_offset;
		newbp->b_iodone = vfs_backgroundwritedone;
		newbp->b_flags |= B_ASYNC;
		newbp->b_flags &= ~B_INVAL;

		/* move over the dependencies */
		if (LIST_FIRST(&bp->b_dep) != NULL)
			buf_movedeps(bp, newbp);

		/*
		 * Initiate write on the copy, release the original to
		 * the B_LOCKED queue so that it cannot go away until
		 * the background write completes. If not locked it could go
		 * away and then be reconstituted while it was being written.
		 * If the reconstituted buffer were written, we could end up
		 * with two background copies being written at the same time.
		 */
		bp->b_xflags |= BX_BKGRDINPROG;
		bp->b_flags |= B_LOCKED;
		bqrelse(bp);
		bp = newbp;
	}

	bp->b_flags &= ~B_DONE;
	bp->b_ioflags &= ~BIO_ERROR;
	bp->b_flags |= B_WRITEINPROG | B_CACHE;
	bp->b_iocmd = BIO_WRITE;

	bp->b_vp->v_numoutput++;
	vfs_busy_pages(bp, 1);

	/*
	 * Normal bwrites pipeline writes
	 */
	bp->b_runningbufspace = bp->b_bufsize;
	runningbufspace += bp->b_runningbufspace;

	if (curthread != PCPU_GET(idlethread))
		curthread->td_proc->p_stats->p_ru.ru_oublock++;
	splx(s);
	if (oldflags & B_ASYNC)
		BUF_KERNPROC(bp);
	BUF_STRATEGY(bp);

	if ((oldflags & B_ASYNC) == 0) {
		int rtval = bufwait(bp);
		brelse(bp);
		return (rtval);
	} else if ((oldflags & B_NOWDRAIN) == 0) {
		/*
		 * don't allow the async write to saturate the I/O
		 * system.  Deadlocks can occur only if a device strategy
		 * routine (like in MD) turns around and issues another
		 * high-level write, in which case B_NOWDRAIN is expected
		 * to be set.  Otherwise we will not deadlock here because
		 * we are blocking waiting for I/O that is already in-progress
		 * to complete.
		 */
		waitrunningbufspace();
	}

	return (0);
}

/*
 * Complete a background write started from bwrite.
 */
static void
vfs_backgroundwritedone(bp)
	struct buf *bp;
{
	struct buf *origbp;

	/*
	 * Find the original buffer that we are writing.
	 */
	if ((origbp = gbincore(bp->b_vp, bp->b_lblkno)) == NULL)
		panic("backgroundwritedone: lost buffer");
	/*
	 * Process dependencies then return any unfinished ones.
	 */
	if (LIST_FIRST(&bp->b_dep) != NULL)
		buf_complete(bp);
	if (LIST_FIRST(&bp->b_dep) != NULL)
		buf_movedeps(bp, origbp);
	/*
	 * Clear the BX_BKGRDINPROG flag in the original buffer
	 * and awaken it if it is waiting for the write to complete.
	 * If BX_BKGRDINPROG is not set in the original buffer it must
	 * have been released and re-instantiated - which is not legal.
	 */
	KASSERT((origbp->b_xflags & BX_BKGRDINPROG),
	    ("backgroundwritedone: lost buffer2"));
	origbp->b_xflags &= ~BX_BKGRDINPROG;
	if (origbp->b_xflags & BX_BKGRDWAIT) {
		origbp->b_xflags &= ~BX_BKGRDWAIT;
		wakeup(&origbp->b_xflags);
	}
	/*
	 * Clear the B_LOCKED flag and remove it from the locked
	 * queue if it currently resides there.
	 */
	origbp->b_flags &= ~B_LOCKED;
	if (BUF_LOCK(origbp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
		bremfree(origbp);
		bqrelse(origbp);
	}
	/*
	 * This buffer is marked B_NOCACHE, so when it is released
	 * by biodone, it will be tossed. We mark it with BIO_READ
	 * to avoid biodone doing a second vwakeup.
	 */
	bp->b_flags |= B_NOCACHE;
	bp->b_iocmd = BIO_READ;
	bp->b_flags &= ~(B_CACHE | B_DONE);
	bp->b_iodone = 0;
	bufdone(bp);
}

/*
 * Delayed write. (Buffer is marked dirty).  Do not bother writing
 * anything if the buffer is marked invalid.
 *
 * Note that since the buffer must be completely valid, we can safely
 * set B_CACHE.  In fact, we have to set B_CACHE here rather than in
 * biodone() in order to prevent getblk from writing the buffer
 * out synchronously.
 */
void
bdwrite(struct buf * bp)
{
	GIANT_REQUIRED;

	if (BUF_REFCNT(bp) == 0)
		panic("bdwrite: buffer is not busy");

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return;
	}
	bdirty(bp);

	/*
	 * Set B_CACHE, indicating that the buffer is fully valid.  This is
	 * true even of NFS now.
	 */
	bp->b_flags |= B_CACHE;

	/*
	 * This bmap keeps the system from needing to do the bmap later,
	 * perhaps when the system is attempting to do a sync.  Since it
	 * is likely that the indirect block -- or whatever other datastructure
	 * that the filesystem needs is still in memory now, it is a good
	 * thing to do this.  Note also, that if the pageout daemon is
	 * requesting a sync -- there might not be enough memory to do
	 * the bmap then...  So, this is important to do.
	 */
	if (bp->b_lblkno == bp->b_blkno) {
		VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
	}

	/*
	 * Set the *dirty* buffer range based upon the VM system dirty pages.
	 */
	vfs_setdirty(bp);

	/*
	 * We need to do this here to satisfy the vnode_pager and the
	 * pageout daemon, so that it thinks that the pages have been
	 * "cleaned".  Note that since the pages are in a delayed write
	 * buffer -- the VFS layer "will" see that the pages get written
	 * out on the next sync, or perhaps the cluster will be completed.
	 */
	vfs_clean_pages(bp);
	bqrelse(bp);

	/*
	 * Wakeup the buffer flushing daemon if we have a lot of dirty
	 * buffers (midpoint between our recovery point and our stall
	 * point).
	 */
	bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);

	/*
	 * note: we cannot initiate I/O from a bdwrite even if we wanted to,
	 * due to the softdep code.
	 */
}
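
/*
 * Hypothetical delayed-write sequence built on the routine above: read,
 * modify in place, then mark dirty and requeue rather than writing
 * synchronously:
 *
 *	error = bread(vp, lblkno, bsize, NOCRED, &bp);
 *	if (error == 0) {
 *		... modify bp->b_data ...
 *		bdwrite(bp);		(releases bp; the write happens later)
 *	} else
 *		brelse(bp);
 */
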
/*
 * bdirty:
 *
 *	Turn buffer into delayed write request.  We must clear BIO_READ and
 *	B_RELBUF, and we must set B_DELWRI.  We reassign the buffer to
 *	itself to properly update it in the dirty/clean lists.  We mark it
 *	B_DONE to ensure that any asynchronization of the buffer properly
 *	clears B_DONE ( else a panic will occur later ).
 *
 *	bdirty() is kinda like bdwrite() - we have to clear B_INVAL which
 *	might have been set pre-getblk().  Unlike bwrite/bdwrite, bdirty()
 *	should only be called if the buffer is known-good.
 *
 *	Since the buffer is not on a queue, we do not update the numfreebuffers
 *	count.
 *
 *	Must be called at splbio().
 *	The buffer must be on QUEUE_NONE.
 */
void
bdirty(bp)
	struct buf *bp;
{
	KASSERT(bp->b_qindex == QUEUE_NONE,
	    ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex));
	bp->b_flags &= ~(B_RELBUF);
	bp->b_iocmd = BIO_WRITE;

	if ((bp->b_flags & B_DELWRI) == 0) {
		bp->b_flags |= B_DONE | B_DELWRI;
		reassignbuf(bp, bp->b_vp);
		++numdirtybuffers;
		bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
	}
}

/*
 * bundirty:
 *
 *	Clear B_DELWRI for buffer.
 *
 *	Since the buffer is not on a queue, we do not update the numfreebuffers
 *	count.
 *
 *	Must be called at splbio().
 *	The buffer must be on QUEUE_NONE.
 */

void
bundirty(bp)
	struct buf *bp;
{
	KASSERT(bp->b_qindex == QUEUE_NONE,
	    ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex));

	if (bp->b_flags & B_DELWRI) {
		bp->b_flags &= ~B_DELWRI;
		reassignbuf(bp, bp->b_vp);
		--numdirtybuffers;
		numdirtywakeup(lodirtybuffers);
	}
	/*
	 * Since it is now being written, we can clear its deferred write flag.
	 */
	bp->b_flags &= ~B_DEFERRED;
}

/*
 * bawrite:
 *
 *	Asynchronous write.  Start output on a buffer, but do not wait for
 *	it to complete.  The buffer is released when the output completes.
 *
 *	bwrite() ( or the VOP routine anyway ) is responsible for handling
 *	B_INVAL buffers.  Not us.
 */
void
bawrite(struct buf * bp)
{
	bp->b_flags |= B_ASYNC;
	(void) BUF_WRITE(bp);
}
/*
 * bwillwrite:
 *
 *	Called prior to the locking of any vnodes when we are expecting to
 *	write.  We do not want to starve the buffer cache with too many
 *	dirty buffers so we block here.  By blocking prior to the locking
 *	of any vnodes we attempt to avoid the situation where a locked vnode
 *	prevents the various system daemons from flushing related buffers.
 */

void
bwillwrite(void)
{
	if (numdirtybuffers >= hidirtybuffers) {
		int s;

		mtx_lock(&Giant);
		s = splbio();
		while (numdirtybuffers >= hidirtybuffers) {
			bd_wakeup(1);
			needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH;
			tsleep(&needsbuffer, (PRIBIO + 4), "flswai", 0);
		}
		splx(s);
		mtx_unlock(&Giant);
	}
}
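
/*
 * Sketch of the intended call site (hypothetical): a write path throttles
 * itself before taking any vnode locks, e.g.
 *
 *	bwillwrite();
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 *	error = VOP_WRITE(vp, uio, ioflag, cred);
 *	VOP_UNLOCK(vp, 0, td);
 *
 * Blocking after the vnode lock was taken could stall the very daemons
 * that flush the dirty buffers being waited on.
 */
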
/*
 * Return true if we have too many dirty buffers.
 */
int
buf_dirty_count_severe(void)
{
	return(numdirtybuffers >= hidirtybuffers);
}

/*
 * brelse:
 *
 *	Release a busy buffer and, if requested, free its resources.  The
 *	buffer will be stashed in the appropriate bufqueue[] allowing it
 *	to be accessed later as a cache entity or reused for other purposes.
 */
void
brelse(struct buf * bp)
{
	int s;

	GIANT_REQUIRED;

	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
	    ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));

	s = splbio();

	if (bp->b_flags & B_LOCKED)
		bp->b_ioflags &= ~BIO_ERROR;

	if (bp->b_iocmd == BIO_WRITE &&
	    (bp->b_ioflags & BIO_ERROR) &&
	    !(bp->b_flags & B_INVAL)) {
		/*
		 * Failed write, redirty.  Must clear BIO_ERROR to prevent
		 * pages from being scrapped.  If B_INVAL is set then
		 * this case is not run and the next case is run to
		 * destroy the buffer.  B_INVAL can occur if the buffer
		 * is outside the range supported by the underlying device.
		 */
		bp->b_ioflags &= ~BIO_ERROR;
		bdirty(bp);
	} else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) ||
	    (bp->b_ioflags & BIO_ERROR) ||
	    bp->b_iocmd == BIO_DELETE || (bp->b_bufsize <= 0)) {
		/*
		 * Either a failed I/O or we were asked to free or not
		 * cache the buffer.
		 */
		bp->b_flags |= B_INVAL;
		if (LIST_FIRST(&bp->b_dep) != NULL)
			buf_deallocate(bp);
		if (bp->b_flags & B_DELWRI) {
			--numdirtybuffers;
			numdirtywakeup(lodirtybuffers);
		}
		bp->b_flags &= ~(B_DELWRI | B_CACHE);
		if ((bp->b_flags & B_VMIO) == 0) {
			if (bp->b_bufsize)
				allocbuf(bp, 0);
			if (bp->b_vp)
				brelvp(bp);
		}
	}

	/*
	 * We must clear B_RELBUF if B_DELWRI is set.  If vfs_vmio_release()
	 * is called with B_DELWRI set, the underlying pages may wind up
	 * getting freed causing a previous write (bdwrite()) to get 'lost'
	 * because pages associated with a B_DELWRI bp are marked clean.
	 *
	 * We still allow the B_INVAL case to call vfs_vmio_release(), even
	 * if B_DELWRI is set.
	 *
	 * If B_DELWRI is not set we may have to set B_RELBUF if we are low
	 * on pages to return pages to the VM page queues.
	 */
	if (bp->b_flags & B_DELWRI)
		bp->b_flags &= ~B_RELBUF;
	else if (vm_page_count_severe() && !(bp->b_xflags & BX_BKGRDINPROG))
		bp->b_flags |= B_RELBUF;

	/*
	 * VMIO buffer rundown.  It is not very necessary to keep a VMIO buffer
	 * constituted, not even NFS buffers now.  Two flags affect this.  If
	 * B_INVAL, the struct buf is invalidated but the VM object is kept
	 * around ( i.e. so it is trivial to reconstitute the buffer later ).
	 *
	 * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be
	 * invalidated.  BIO_ERROR cannot be set for a failed write unless the
	 * buffer is also B_INVAL because it hits the re-dirtying code above.
	 *
	 * Normally we can do this whether a buffer is B_DELWRI or not.  If
	 * the buffer is an NFS buffer, it is tracking piecemeal writes or
	 * the commit state and we cannot afford to lose the buffer.  If the
	 * buffer has a background write in progress, we need to keep it
	 * around to prevent it from being reconstituted and starting a second
	 * background write.
	 */
	if ((bp->b_flags & B_VMIO)
	    && !(bp->b_vp->v_tag == VT_NFS &&
		!vn_isdisk(bp->b_vp, NULL) &&
		(bp->b_flags & B_DELWRI))
	    ) {

		int i, j, resid;
		vm_page_t m;
		off_t foff;
		vm_pindex_t poff;
		vm_object_t obj;
		struct vnode *vp;

		vp = bp->b_vp;
		obj = bp->b_object;

		/*
		 * Get the base offset and length of the buffer.  Note that
		 * in the VMIO case if the buffer block size is not
		 * page-aligned then b_data pointer may not be page-aligned.
		 * But our b_pages[] array *IS* page aligned.
		 *
		 * block sizes less than DEV_BSIZE (usually 512) are not
		 * supported due to the page granularity bits (m->valid,
		 * m->dirty, etc...).
		 *
		 * See man buf(9) for more information
		 */
		resid = bp->b_bufsize;
		foff = bp->b_offset;

		for (i = 0; i < bp->b_npages; i++) {
			int had_bogus = 0;

			m = bp->b_pages[i];
			vm_page_flag_clear(m, PG_ZERO);

			/*
			 * If we hit a bogus page, fixup *all* the bogus pages
			 * now.
			 */
			if (m == bogus_page) {
				poff = OFF_TO_IDX(bp->b_offset);
				had_bogus = 1;

				for (j = i; j < bp->b_npages; j++) {
					vm_page_t mtmp;
					mtmp = bp->b_pages[j];
					if (mtmp == bogus_page) {
						mtmp = vm_page_lookup(obj, poff + j);
						if (!mtmp) {
							panic("brelse: page missing\n");
						}
						bp->b_pages[j] = mtmp;
					}
				}

				if ((bp->b_flags & B_INVAL) == 0) {
					pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
				}
				m = bp->b_pages[i];
			}
			if ((bp->b_flags & B_NOCACHE) || (bp->b_ioflags & BIO_ERROR)) {
				int poffset = foff & PAGE_MASK;
				int presid = resid > (PAGE_SIZE - poffset) ?
					(PAGE_SIZE - poffset) : resid;

				KASSERT(presid >= 0, ("brelse: extra page"));
				vm_page_set_invalid(m, poffset, presid);
				if (had_bogus)
					printf("avoided corruption bug in bogus_page/brelse code\n");
			}
			resid -= PAGE_SIZE - (foff & PAGE_MASK);
			foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
		}

		if (bp->b_flags & (B_INVAL | B_RELBUF))
			vfs_vmio_release(bp);

	} else if (bp->b_flags & B_VMIO) {

		if (bp->b_flags & (B_INVAL | B_RELBUF)) {
			vfs_vmio_release(bp);
		}

	}

	if (bp->b_qindex != QUEUE_NONE)
		panic("brelse: free buffer onto another queue???");
	if (BUF_REFCNT(bp) > 1) {
		/* do not release to free list */
		BUF_UNLOCK(bp);
		splx(s);
		return;
	}

	/* enqueue */

	/* buffers with no memory */
	if (bp->b_bufsize == 0) {
		bp->b_flags |= B_INVAL;
		bp->b_xflags &= ~BX_BKGRDWRITE;
		if (bp->b_xflags & BX_BKGRDINPROG)
			panic("losing buffer 1");
		if (bp->b_kvasize) {
			bp->b_qindex = QUEUE_EMPTYKVA;
		} else {
			bp->b_qindex = QUEUE_EMPTY;
		}
		TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	/* buffers with junk contents */
	} else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) ||
	    (bp->b_ioflags & BIO_ERROR)) {
		bp->b_flags |= B_INVAL;
		bp->b_xflags &= ~BX_BKGRDWRITE;
		if (bp->b_xflags & BX_BKGRDINPROG)
			panic("losing buffer 2");
		bp->b_qindex = QUEUE_CLEAN;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;

	/* buffers that are locked */
	} else if (bp->b_flags & B_LOCKED) {
		bp->b_qindex = QUEUE_LOCKED;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);

	/* remaining buffers */
	} else {
		if (bp->b_flags & B_DELWRI)
			bp->b_qindex = QUEUE_DIRTY;
		else
			bp->b_qindex = QUEUE_CLEAN;
		if (bp->b_flags & B_AGE)
			TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
		else
			TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
	}

	/*
	 * If B_INVAL, clear B_DELWRI.  We've already placed the buffer
	 * on the correct queue.
	 */
	if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI))
		bundirty(bp);

	/*
	 * Fixup numfreebuffers count.  The bp is on an appropriate queue
	 * unless locked.  We then bump numfreebuffers if it is not B_DELWRI.
	 * We've already handled the B_INVAL case ( B_DELWRI will be clear
	 * if B_INVAL is set ).
	 */
	if ((bp->b_flags & B_LOCKED) == 0 && !(bp->b_flags & B_DELWRI))
		bufcountwakeup();

	/*
	 * Something we can maybe free or reuse
	 */
	if (bp->b_bufsize || bp->b_kvasize)
		bufspacewakeup();

	/* unlock */
	BUF_UNLOCK(bp);
	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF |
	    B_DIRECT | B_NOWDRAIN);
	if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
		panic("brelse: not dirty");
	splx(s);
}
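
/*
 * Summary of the enqueue cascade above:
 *
 *	b_bufsize == 0				-> QUEUE_EMPTY/EMPTYKVA (head)
 *	B_INVAL/B_NOCACHE/B_RELBUF/BIO_ERROR	-> QUEUE_CLEAN (head),
 *						   rehashed onto invalhash
 *	B_LOCKED				-> QUEUE_LOCKED (tail)
 *	B_DELWRI				-> QUEUE_DIRTY
 *	anything else				-> QUEUE_CLEAN
 *
 * with B_AGE selecting head instead of tail insertion in the last two
 * cases.
 */
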
/*
 * Release a buffer back to the appropriate queue but do not try to free
 * it.  The buffer is expected to be used again soon.
 *
 * bqrelse() is used by bdwrite() to requeue a delayed write, and used by
 * biodone() to requeue an async I/O on completion.  It is also used when
 * known good buffers need to be requeued but we think we may need the data
 * again soon.
 *
 * XXX we should be able to leave the B_RELBUF hint set on completion.
 */
void
bqrelse(struct buf * bp)
{
	int s;

	s = splbio();

	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));

	if (bp->b_qindex != QUEUE_NONE)
		panic("bqrelse: free buffer onto another queue???");
	if (BUF_REFCNT(bp) > 1) {
		/* do not release to free list */
		BUF_UNLOCK(bp);
		splx(s);
		return;
	}
	if (bp->b_flags & B_LOCKED) {
		bp->b_ioflags &= ~BIO_ERROR;
		bp->b_qindex = QUEUE_LOCKED;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
		/* buffers with stale but valid contents */
	} else if (bp->b_flags & B_DELWRI) {
		bp->b_qindex = QUEUE_DIRTY;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
	} else if (vm_page_count_severe()) {
		/*
		 * We are too low on memory, we have to try to free the
		 * buffer (most importantly: the wired pages making up its
		 * backing store) *now*.
		 */
		splx(s);
		brelse(bp);
		return;
	} else {
		bp->b_qindex = QUEUE_CLEAN;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
	}

	if ((bp->b_flags & B_LOCKED) == 0 &&
	    ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))) {
		bufcountwakeup();
	}

	/*
	 * Something we can maybe free or reuse.
	 */
	if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
		bufspacewakeup();

	/* unlock */
	BUF_UNLOCK(bp);
	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
	if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
		panic("bqrelse: not dirty");
	splx(s);
}

/* Give pages used by the bp back to the VM system (where possible) */
static void
vfs_vmio_release(bp)
	struct buf *bp;
{
	int i;
	vm_page_t m;

	GIANT_REQUIRED;

	for (i = 0; i < bp->b_npages; i++) {
		m = bp->b_pages[i];
		bp->b_pages[i] = NULL;
		/*
		 * In order to keep page LRU ordering consistent, put
		 * everything on the inactive queue.
		 */
		vm_page_unwire(m, 0);
		/*
		 * We don't mess with busy pages, it is
		 * the responsibility of the process that
		 * busied the pages to deal with them.
		 */
		if ((m->flags & PG_BUSY) || (m->busy != 0))
			continue;

		if (m->wire_count == 0) {
			vm_page_flag_clear(m, PG_ZERO);
			/*
			 * Might as well free the page if we can and it has
			 * no valid data.  We also free the page if the
			 * buffer was used for direct I/O.
			 */
			if ((bp->b_flags & B_ASYNC) == 0 && !m->valid &&
			    m->hold_count == 0) {
				vm_page_busy(m);
				vm_page_protect(m, VM_PROT_NONE);
				vm_page_free(m);
			} else if (bp->b_flags & B_DIRECT) {
				vm_page_try_to_free(m);
			} else if (vm_page_count_severe()) {
				vm_page_try_to_cache(m);
			}
		}
	}
	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);

	if (bp->b_bufsize) {
		bufspacewakeup();
		bp->b_bufsize = 0;
	}
	bp->b_npages = 0;
	bp->b_flags &= ~B_VMIO;
	if (bp->b_vp)
		brelvp(bp);
}

/*
 * Check to see if a block is currently memory resident.
 */
struct buf *
gbincore(struct vnode * vp, daddr_t blkno)
{
	struct buf *bp;
	struct bufhashhdr *bh;

	bh = bufhash(vp, blkno);

	/* Search hash chain */
	LIST_FOREACH(bp, bh, b_hash) {
		/* hit */
		if (bp->b_vp == vp && bp->b_lblkno == blkno &&
		    (bp->b_flags & B_INVAL) == 0) {
			break;
		}
	}
	return (bp);
}
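/*
 * Note that gbincore() takes no locks of its own; callers are expected
 * to run it under splbio(), the way incore() and getblk() below do:
 *
 *	s = splbio();
 *	bp = gbincore(vp, blkno);
 *	splx(s);
 */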
/*
 * vfs_bio_awrite:
 *
 *	Implement clustered async writes for clearing out B_DELWRI buffers.
 *	This is much better than the old way of writing only one buffer at
 *	a time.  Note that we may not be presented with the buffers in the
 *	correct order, so we search for the cluster in both directions.
 */
int
vfs_bio_awrite(struct buf * bp)
{
	int i;
	int j;
	daddr_t lblkno = bp->b_lblkno;
	struct vnode *vp = bp->b_vp;
	int s;
	int ncl;
	struct buf *bpa;
	int nwritten;
	int size;
	int maxcl;

	s = splbio();
	/*
	 * right now we support clustered writing only to regular files.  If
	 * we find a clusterable block we could be in the middle of a cluster
	 * rather than at the beginning.
	 */
	if ((vp->v_type == VREG) &&
	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {

		size = vp->v_mount->mnt_stat.f_iosize;
		maxcl = MAXPHYS / size;

		for (i = 1; i < maxcl; i++) {
			if ((bpa = gbincore(vp, lblkno + i)) &&
			    BUF_REFCNT(bpa) == 0 &&
			    ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
			    (B_DELWRI | B_CLUSTEROK)) &&
			    (bpa->b_bufsize == size)) {
				if ((bpa->b_blkno == bpa->b_lblkno) ||
				    (bpa->b_blkno !=
				    bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
					break;
			} else {
				break;
			}
		}
		for (j = 1; i + j <= maxcl && j <= lblkno; j++) {
			if ((bpa = gbincore(vp, lblkno - j)) &&
			    BUF_REFCNT(bpa) == 0 &&
			    ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
			    (B_DELWRI | B_CLUSTEROK)) &&
			    (bpa->b_bufsize == size)) {
				if ((bpa->b_blkno == bpa->b_lblkno) ||
				    (bpa->b_blkno !=
				    bp->b_blkno - ((j * size) >> DEV_BSHIFT)))
					break;
			} else {
				break;
			}
		}
		--j;
		ncl = i + j;
		/*
		 * this is a possible cluster write
		 */
		if (ncl != 1) {
			nwritten = cluster_wbuild(vp, size, lblkno - j, ncl);
			splx(s);
			return nwritten;
		}
	}

	BUF_LOCK(bp, LK_EXCLUSIVE);
	bremfree(bp);
	bp->b_flags |= B_ASYNC;

	splx(s);
	/*
	 * default (old) behavior, writing out only one block
	 *
	 * XXX returns b_bufsize instead of b_bcount for nwritten?
	 */
	nwritten = bp->b_bufsize;
	(void) BUF_WRITE(bp);

	return nwritten;
}
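/*
 * A short worked example of the contiguity test above, assuming an 8K
 * filesystem block size: with size = 8192 and DEV_BSHIFT = 9 (512-byte
 * sectors), (i * size) >> DEV_BSHIFT is i * 16, so the buffer at
 * lblkno + i extends the cluster only if its b_blkno is exactly
 * bp->b_blkno + i * 16, i.e. the blocks are physically contiguous.
 */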
/*
 * getnewbuf:
 *
 *	Find and initialize a new buffer header, freeing up existing buffers
 *	in the bufqueues as necessary.  The new buffer is returned locked.
 *
 *	Important:  B_INVAL is not set.  If the caller wishes to throw the
 *	buffer away, the caller must set B_INVAL prior to calling brelse().
 *
 *	We block if:
 *		We have insufficient buffer headers
 *		We have insufficient buffer space
 *		buffer_map is too fragmented ( space reservation fails )
 *		If we have to flush dirty buffers ( but we try to avoid this )
 *
 *	To avoid VFS layer recursion we do not flush dirty buffers ourselves.
 *	Instead we ask the buf daemon to do it for us.  We attempt to
 *	avoid piecemeal wakeups of the pageout daemon.
 */
static struct buf *
getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
{
	struct buf *bp;
	struct buf *nbp;
	int defrag = 0;
	int nqindex;
	static int flushingbufs;

	GIANT_REQUIRED;

	/*
	 * We can't afford to block since we might be holding a vnode lock,
	 * which may prevent system daemons from running.  We deal with
	 * low-memory situations by proactively returning memory and running
	 * async I/O rather than sync I/O.
	 */

	++getnewbufcalls;
	--getnewbufrestarts;
restart:
	++getnewbufrestarts;

	/*
	 * Setup for scan.  If we do not have enough free buffers,
	 * we setup a degenerate case that immediately fails.  Note
	 * that if we are a specially marked process, we are allowed to
	 * dip into our reserves.
	 *
	 * The scanning sequence is nominally:  EMPTY->EMPTYKVA->CLEAN
	 *
	 * We start with EMPTYKVA.  If the list is empty we backup to EMPTY.
	 * However, there are a number of cases (defragging, reusing, ...)
	 * where we cannot backup.
	 */
	nqindex = QUEUE_EMPTYKVA;
	nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);

	if (nbp == NULL) {
		/*
		 * If no EMPTYKVA buffers and we are either
		 * defragging or reusing, locate a CLEAN buffer
		 * to free or reuse.  If bufspace usage is low
		 * skip this step so we can allocate a new buffer.
		 */
		if (defrag || bufspace >= lobufspace) {
			nqindex = QUEUE_CLEAN;
			nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
		}

		/*
		 * If we could not find or were not allowed to reuse a
		 * CLEAN buffer, check to see if it is ok to use an EMPTY
		 * buffer.  We can only use an EMPTY buffer if allocating
		 * its KVA would not otherwise run us out of buffer space.
		 */
		if (nbp == NULL && defrag == 0 &&
		    bufspace + maxsize < hibufspace) {
			nqindex = QUEUE_EMPTY;
			nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
		}
	}

	/*
	 * Run scan, possibly freeing data and/or kva mappings on the fly
	 * depending.
	 */
	while ((bp = nbp) != NULL) {
		int qindex = nqindex;

		/*
		 * Calculate next bp ( we can only use it if we do not block
		 * or do other fancy things ).
		 */
		if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) {
			switch(qindex) {
			case QUEUE_EMPTY:
				nqindex = QUEUE_EMPTYKVA;
				if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA])))
					break;
				/* fall through */
			case QUEUE_EMPTYKVA:
				nqindex = QUEUE_CLEAN;
				if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN])))
					break;
				/* fall through */
			case QUEUE_CLEAN:
				/*
				 * nbp is NULL.
				 */
				break;
			}
		}

		/*
		 * Sanity Checks
		 */
		KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));

		/*
		 * Note: we no longer distinguish between VMIO and non-VMIO
		 * buffers.
		 */
		KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex));

		/*
		 * If we are defragging then we need a buffer with
		 * b_kvasize != 0.  XXX this situation should no longer
		 * occur, if defrag is non-zero the buffer's b_kvasize
		 * should also be non-zero at this point.  XXX
		 */
		if (defrag && bp->b_kvasize == 0) {
			printf("Warning: defrag empty buffer %p\n", bp);
			continue;
		}

		/*
		 * Start freeing the bp.  This is somewhat involved.  nbp
		 * remains valid only for QUEUE_EMPTY[KVA] bp's.
		 */
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
			panic("getnewbuf: locked buf");
		bremfree(bp);

		if (qindex == QUEUE_CLEAN) {
			if (bp->b_flags & B_VMIO) {
				bp->b_flags &= ~B_ASYNC;
				vfs_vmio_release(bp);
			}
			if (bp->b_vp)
				brelvp(bp);
		}

		/*
		 * NOTE:  nbp is now entirely invalid.  We can only restart
		 * the scan from this point on.
		 *
		 * Get the rest of the buffer freed up.  b_kva* is still
		 * valid after this operation.
		 */
		if (bp->b_rcred != NOCRED) {
			crfree(bp->b_rcred);
			bp->b_rcred = NOCRED;
		}
		if (bp->b_wcred != NOCRED) {
			crfree(bp->b_wcred);
			bp->b_wcred = NOCRED;
		}
		if (LIST_FIRST(&bp->b_dep) != NULL)
			buf_deallocate(bp);
		if (bp->b_xflags & BX_BKGRDINPROG)
			panic("losing buffer 3");
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);

		if (bp->b_bufsize)
			allocbuf(bp, 0);

		bp->b_flags = 0;
		bp->b_ioflags = 0;
		bp->b_xflags = 0;
		bp->b_dev = NODEV;
		bp->b_vp = NULL;
		bp->b_blkno = bp->b_lblkno = 0;
		bp->b_offset = NOOFFSET;
		bp->b_iodone = 0;
		bp->b_error = 0;
		bp->b_resid = 0;
		bp->b_bcount = 0;
		bp->b_npages = 0;
		bp->b_dirtyoff = bp->b_dirtyend = 0;
		bp->b_magic = B_MAGIC_BIO;
		bp->b_op = &buf_ops_bio;
		bp->b_object = NULL;

		LIST_INIT(&bp->b_dep);

		/*
		 * If we are defragging then free the buffer.
		 */
		if (defrag) {
			bp->b_flags |= B_INVAL;
			bfreekva(bp);
			brelse(bp);
			defrag = 0;
			goto restart;
		}

		/*
		 * If we are overcommitted then recover the buffer and its
		 * KVM space.  This occurs in rare situations when multiple
		 * processes are blocked in getnewbuf() or allocbuf().
		 */
		if (bufspace >= hibufspace)
			flushingbufs = 1;
		if (flushingbufs && bp->b_kvasize != 0) {
			bp->b_flags |= B_INVAL;
			bfreekva(bp);
			brelse(bp);
			goto restart;
		}
		if (bufspace < lobufspace)
			flushingbufs = 0;
		break;
	}

	/*
	 * If we exhausted our list, sleep as appropriate.  We may have to
	 * wakeup various daemons and write out some dirty buffers.
	 *
	 * Generally we are sleeping due to insufficient buffer space.
	 */
	if (bp == NULL) {
		int flags;
		char *waitmsg;

		if (defrag) {
			flags = VFS_BIO_NEED_BUFSPACE;
			waitmsg = "nbufkv";
		} else if (bufspace >= hibufspace) {
			waitmsg = "nbufbs";
			flags = VFS_BIO_NEED_BUFSPACE;
		} else {
			waitmsg = "newbuf";
			flags = VFS_BIO_NEED_ANY;
		}

		bd_speedup();	/* heeeelp */

		needsbuffer |= flags;
		while (needsbuffer & flags) {
			if (tsleep(&needsbuffer, (PRIBIO + 4) | slpflag,
			    waitmsg, slptimeo))
				return (NULL);
		}
	} else {
		/*
		 * We finally have a valid bp.  We aren't quite out of the
		 * woods, we still have to reserve kva space.  In order
		 * to keep fragmentation sane we only allocate kva in
		 * BKVASIZE chunks.
		 */
		maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;

		if (maxsize != bp->b_kvasize) {
			vm_offset_t addr = 0;

			bfreekva(bp);

			if (vm_map_findspace(buffer_map,
			    vm_map_min(buffer_map), maxsize, &addr)) {
				/*
				 * Uh oh.  Buffer map is too fragmented.  We
				 * must defragment the map.
				 */
				++bufdefragcnt;
				defrag = 1;
				bp->b_flags |= B_INVAL;
				brelse(bp);
				goto restart;
			}
			if (addr) {
				vm_map_insert(buffer_map, NULL, 0,
				    addr, addr + maxsize,
				    VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);

				bp->b_kvabase = (caddr_t) addr;
				bp->b_kvasize = maxsize;
				bufspace += bp->b_kvasize;
				++bufreusecnt;
			}
		}
		bp->b_data = bp->b_kvabase;
	}
	return(bp);
}
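/*
 * Worked example of the KVA rounding above, assuming the default
 * BKVASIZE of 16384 (BKVAMASK == 16383): a request of maxsize = 20000
 * becomes (20000 + 16383) & ~16383 == 32768, two BKVASIZE chunks.
 * Rounding every reservation to a BKVASIZE multiple keeps buffer_map
 * fragmentation bounded.
 */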
/*
 * buf_daemon:
 *
 *	buffer flushing daemon.  Buffers are normally flushed by the
 *	update daemon but if it cannot keep up this process starts to
 *	take the load in an attempt to prevent getnewbuf() from blocking.
 */
static struct proc *bufdaemonproc;

static struct kproc_desc buf_kp = {
	"bufdaemon",
	buf_daemon,
	&bufdaemonproc
};
SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp)

static void
buf_daemon()
{
	int s;

	mtx_lock(&Giant);

	/*
	 * This process needs to be suspended prior to shutdown sync.
	 */
	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc,
	    SHUTDOWN_PRI_LAST);

	/*
	 * This process is allowed to take the buffer cache to the limit
	 */
	s = splbio();

	for (;;) {
		kthread_suspend_check(bufdaemonproc);

		bd_request = 0;

		/*
		 * Do the flush.  Limit the amount of in-transit I/O we
		 * allow to build up, otherwise we would completely saturate
		 * the I/O system.  Wakeup any waiting processes before we
		 * normally would so they can run in parallel with our drain.
		 */
		while (numdirtybuffers > lodirtybuffers) {
			if (flushbufqueues() == 0)
				break;
			waitrunningbufspace();
			numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2);
		}

		/*
		 * Only clear bd_request if we have reached our low water
		 * mark.  The buf_daemon normally waits 1 second and
		 * then incrementally flushes any dirty buffers that have
		 * built up, within reason.
		 *
		 * If we were unable to hit our low water mark and couldn't
		 * find any flushable buffers, we sleep half a second.
		 * Otherwise we loop immediately.
		 */
		if (numdirtybuffers <= lodirtybuffers) {
			/*
			 * We reached our low water mark, reset the
			 * request and sleep until we are needed again.
			 * The sleep is just so the suspend code works.
			 */
			bd_request = 0;
			tsleep(&bd_request, PVM, "psleep", hz);
		} else {
			/*
			 * We couldn't find any flushable dirty buffers but
			 * still have too many dirty buffers, we
			 * have to sleep and try again. (rare)
			 */
			tsleep(&bd_request, PVM, "qsleep", hz / 2);
		}
	}
}

/*
 * flushbufqueues:
 *
 *	Try to flush a buffer in the dirty queue.  We must be careful to
 *	free up B_INVAL buffers instead of writing them, which NFS is
 *	particularly sensitive to.
 */
static int
flushbufqueues(void)
{
	struct buf *bp;
	int r = 0;

	bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]);

	while (bp) {
		KASSERT((bp->b_flags & B_DELWRI), ("unexpected clean buffer %p", bp));
		if ((bp->b_flags & B_DELWRI) != 0 &&
		    (bp->b_xflags & BX_BKGRDINPROG) == 0) {
			if (bp->b_flags & B_INVAL) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
					panic("flushbufqueues: locked buf");
				bremfree(bp);
				brelse(bp);
				++r;
				break;
			}
			if (LIST_FIRST(&bp->b_dep) != NULL &&
			    (bp->b_flags & B_DEFERRED) == 0 &&
			    buf_countdeps(bp, 0)) {
				TAILQ_REMOVE(&bufqueues[QUEUE_DIRTY],
				    bp, b_freelist);
				TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY],
				    bp, b_freelist);
				bp->b_flags |= B_DEFERRED;
				bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]);
				continue;
			}
			vfs_bio_awrite(bp);
			++r;
			break;
		}
		bp = TAILQ_NEXT(bp, b_freelist);
	}
	return (r);
}

/*
 * Check to see if a block is currently memory resident.
 */
struct buf *
incore(struct vnode * vp, daddr_t blkno)
{
	struct buf *bp;

	int s = splbio();
	bp = gbincore(vp, blkno);
	splx(s);
	return (bp);
}
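/*
 * Note the division of labor between the two residency checks here:
 * incore() only asks whether a buffer header for (vp, blkno) exists,
 * while inmem() below also reports success when the backing VM pages
 * hold valid data even though no buffer is present, at the cost of
 * walking the vnode's VM object.
 */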
/*
 * Returns true if no I/O is needed to access the
 * associated VM object.  This is like incore except
 * it also hunts around in the VM system for the data.
 */
int
inmem(struct vnode * vp, daddr_t blkno)
{
	vm_object_t obj;
	vm_offset_t toff, tinc, size;
	vm_page_t m;
	vm_ooffset_t off;

	GIANT_REQUIRED;

	if (incore(vp, blkno))
		return 1;
	if (vp->v_mount == NULL)
		return 0;
	if (VOP_GETVOBJECT(vp, &obj) != 0 || (vp->v_flag & VOBJBUF) == 0)
		return 0;

	size = PAGE_SIZE;
	if (size > vp->v_mount->mnt_stat.f_iosize)
		size = vp->v_mount->mnt_stat.f_iosize;
	off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;

	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
		m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
		if (!m)
			goto notinmem;
		tinc = size;
		if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK))
			tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK);
		if (vm_page_is_valid(m,
		    (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0)
			goto notinmem;
	}
	return 1;

notinmem:
	return (0);
}
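/*
 * Worked example for the loop above, assuming PAGE_SIZE = 4096 and an
 * 8K f_iosize: block 3 begins at byte offset 24576, so the loop visits
 * page indices 6 and 7 (tinc = 4096 on each pass), and both pages must
 * be fully valid for the block to be considered resident.
 */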
/*
 * vfs_setdirty:
 *
 *	Sets the dirty range for a buffer based on the status of the dirty
 *	bits in the pages comprising the buffer.
 *
 *	The range is limited to the size of the buffer.
 *
 *	This routine is primarily used by NFS, but is generalized for the
 *	B_VMIO case.
 */
static void
vfs_setdirty(struct buf *bp)
{
	int i;
	vm_object_t object;

	GIANT_REQUIRED;
	/*
	 * Degenerate case - empty buffer
	 */
	if (bp->b_bufsize == 0)
		return;

	/*
	 * We qualify the scan for modified pages on whether the
	 * object has been flushed yet.  The OBJ_WRITEABLE flag
	 * is not cleared simply by protecting pages off.
	 */
	if ((bp->b_flags & B_VMIO) == 0)
		return;

	object = bp->b_pages[0]->object;

	if ((object->flags & OBJ_WRITEABLE) && !(object->flags & OBJ_MIGHTBEDIRTY))
		printf("Warning: object %p writeable but not mightbedirty\n", object);
	if (!(object->flags & OBJ_WRITEABLE) && (object->flags & OBJ_MIGHTBEDIRTY))
		printf("Warning: object %p mightbedirty but not writeable\n", object);

	if (object->flags & (OBJ_MIGHTBEDIRTY|OBJ_CLEANING)) {
		vm_offset_t boffset;
		vm_offset_t eoffset;

		/*
		 * test the pages to see if they have been modified directly
		 * by users through the VM system.
		 */
		for (i = 0; i < bp->b_npages; i++) {
			vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
			vm_page_test_dirty(bp->b_pages[i]);
		}

		/*
		 * Calculate the encompassing dirty range, boffset and eoffset,
		 * (eoffset - boffset) bytes.
		 */
		for (i = 0; i < bp->b_npages; i++) {
			if (bp->b_pages[i]->dirty)
				break;
		}
		boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);

		for (i = bp->b_npages - 1; i >= 0; --i) {
			if (bp->b_pages[i]->dirty) {
				break;
			}
		}
		eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);

		/*
		 * Fit it to the buffer.
		 */
		if (eoffset > bp->b_bcount)
			eoffset = bp->b_bcount;

		/*
		 * If we have a good dirty range, merge with the existing
		 * dirty range.
		 */
		if (boffset < eoffset) {
			if (bp->b_dirtyoff > boffset)
				bp->b_dirtyoff = boffset;
			if (bp->b_dirtyend < eoffset)
				bp->b_dirtyend = eoffset;
		}
	}
}
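/*
 * Example of the merge performed above: if the buffer already records
 * a dirty range of [4096, 8192) and the page scan finds dirty bits
 * covering [0, 4096), the merged range becomes [0, 8192).  b_dirtyoff
 * only ever moves down and b_dirtyend only ever moves up.
 */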
/*
 * getblk:
 *
 *	Get a block given a specified block and offset into a file/device.
 *	The buffers B_DONE bit will be cleared on return, making it almost
 *	ready for an I/O initiation.  B_INVAL may or may not be set on
 *	return.  The caller should clear B_INVAL prior to initiating a
 *	READ.
 *
 *	For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
 *	an existing buffer.
 *
 *	For a VMIO buffer, B_CACHE is modified according to the backing VM.
 *	If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
 *	and then cleared based on the backing VM.  If the previous buffer is
 *	non-0-sized but invalid, B_CACHE will be cleared.
 *
 *	If getblk() must create a new buffer, the new buffer is returned with
 *	both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
 *	case it is returned with B_INVAL clear and B_CACHE set based on the
 *	backing VM.
 *
 *	getblk() also forces a BUF_WRITE() for any B_DELWRI buffer whose
 *	B_CACHE bit is clear.
 *
 *	What this means, basically, is that the caller should use B_CACHE to
 *	determine whether the buffer is fully valid or not and should clear
 *	B_INVAL prior to issuing a read.  If the caller intends to validate
 *	the buffer by loading its data area with something, the caller needs
 *	to clear B_INVAL.  If the caller does this without issuing an I/O,
 *	the caller should set B_CACHE ( as an optimization ), else the caller
 *	should issue the I/O and biodone() will set B_CACHE if the I/O was
 *	a write attempt or if it was a successful read.  If the caller
 *	intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR
 *	prior to issuing the READ.  biodone() will *not* clear B_INVAL.
 */
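/*
 * A minimal sketch of the caller protocol described above, roughly the
 * shape of a bread()-style helper (illustrative only, error handling
 * omitted):
 *
 *	bp = getblk(vp, blkno, size, 0, 0);
 *	if ((bp->b_flags & B_CACHE) == 0) {
 *		bp->b_iocmd = BIO_READ;
 *		bp->b_ioflags &= ~BIO_ERROR;
 *		... issue the read and wait for completion ...
 *	}
 */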
struct buf *
getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
{
	struct buf *bp;
	int s;
	struct bufhashhdr *bh;

	if (size > MAXBSIZE)
		panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);

	s = splbio();
loop:
	/*
	 * Block if we are low on buffers.  Certain processes are allowed
	 * to completely exhaust the buffer cache.
	 *
	 * If this check ever becomes a bottleneck it may be better to
	 * move it into the else, when gbincore() fails.  At the moment
	 * it isn't a problem.
	 *
	 * XXX remove if 0 sections (clean this up after it's proven)
	 */
	if (numfreebuffers == 0) {
		if (curthread == PCPU_GET(idlethread))
			return NULL;
		needsbuffer |= VFS_BIO_NEED_ANY;
	}

	if ((bp = gbincore(vp, blkno))) {
		/*
		 * Buffer is in-core.  If the buffer is not busy, it must
		 * be on a queue.
		 */
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
			if (BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL,
			    "getblk", slpflag, slptimeo) == ENOLCK)
				goto loop;
			splx(s);
			return (struct buf *) NULL;
		}

		/*
		 * The buffer is locked.  B_CACHE is cleared if the buffer is
		 * invalid.  Otherwise, for a non-VMIO buffer, B_CACHE is set
		 * and for a VMIO buffer B_CACHE is adjusted according to the
		 * backing VM cache.
		 */
		if (bp->b_flags & B_INVAL)
			bp->b_flags &= ~B_CACHE;
		else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0)
			bp->b_flags |= B_CACHE;
		bremfree(bp);

		/*
		 * check for size inconsistencies for non-VMIO case.
		 */
		if (bp->b_bcount != size) {
			if ((bp->b_flags & B_VMIO) == 0 ||
			    (size > bp->b_kvasize)) {
				if (bp->b_flags & B_DELWRI) {
					bp->b_flags |= B_NOCACHE;
					BUF_WRITE(bp);
				} else {
					if ((bp->b_flags & B_VMIO) &&
					    (LIST_FIRST(&bp->b_dep) == NULL)) {
						bp->b_flags |= B_RELBUF;
						brelse(bp);
					} else {
						bp->b_flags |= B_NOCACHE;
						BUF_WRITE(bp);
					}
				}
				goto loop;
			}
		}

		/*
		 * If the size is inconsistent in the VMIO case, we can resize
		 * the buffer.  This might lead to B_CACHE getting set or
		 * cleared.  If the size has not changed, B_CACHE remains
		 * unchanged from its previous state.
		 */
		if (bp->b_bcount != size)
			allocbuf(bp, size);

		KASSERT(bp->b_offset != NOOFFSET,
		    ("getblk: no buffer offset"));

		/*
		 * A buffer with B_DELWRI set and B_CACHE clear must
		 * be committed before we can return the buffer in
		 * order to prevent the caller from issuing a read
		 * ( due to B_CACHE not being set ) and overwriting
		 * it.
		 *
		 * Most callers, including NFS and FFS, need this to
		 * operate properly either because they assume they
		 * can issue a read if B_CACHE is not set, or because
		 * ( for example ) an uncached B_DELWRI might loop due
		 * to softupdates re-dirtying the buffer.  In the latter
		 * case, B_CACHE is set after the first write completes,
		 * preventing further loops.
		 *
		 * NOTE!  b*write() sets B_CACHE.  If we cleared B_CACHE
		 * above while extending the buffer, we cannot allow the
		 * buffer to remain with B_CACHE set after the write
		 * completes or it will represent a corrupt state.  To
		 * deal with this we set B_NOCACHE to scrap the buffer
		 * after the write.
		 *
		 * We might be able to do something fancy, like setting
		 * B_CACHE in bwrite() except if B_DELWRI is already set,
		 * so the below call doesn't set B_CACHE, but that gets real
		 * confusing.  This is much easier.
		 */
		if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
			bp->b_flags |= B_NOCACHE;
			BUF_WRITE(bp);
			goto loop;
		}

		splx(s);
		bp->b_flags &= ~B_DONE;
	} else {
		/*
		 * Buffer is not in-core, create new buffer.  The buffer
		 * returned by getnewbuf() is locked.  Note that the returned
		 * buffer is also considered valid (not marked B_INVAL).
		 */
		int bsize, maxsize, vmio;
		off_t offset;

		if (vn_isdisk(vp, NULL))
			bsize = DEV_BSIZE;
		else if (vp->v_mountedhere)
			bsize = vp->v_mountedhere->mnt_stat.f_iosize;
		else if (vp->v_mount)
			bsize = vp->v_mount->mnt_stat.f_iosize;
		else
			bsize = size;

		offset = blkno * bsize;
		vmio = (VOP_GETVOBJECT(vp, NULL) == 0) && (vp->v_flag & VOBJBUF);
		maxsize = vmio ? size + (offset & PAGE_MASK) : size;
		maxsize = imax(maxsize, bsize);

		if ((bp = getnewbuf(slpflag, slptimeo, size, maxsize)) == NULL) {
			if (slpflag || slptimeo) {
				splx(s);
				return NULL;
			}
			goto loop;
		}

		/*
		 * This code is used to make sure that a buffer is not
		 * created while the getnewbuf routine is blocked.
		 * This can be a problem whether the vnode is locked or not.
		 * If the buffer is created out from under us, we have to
		 * throw away the one we just created.  There is no window
		 * race because we are safely running at splbio() from the
		 * point of the duplicate buffer creation through to here,
		 * and we've locked the buffer.
		 */
		if (gbincore(vp, blkno)) {
			bp->b_flags |= B_INVAL;
			brelse(bp);
			goto loop;
		}

		/*
		 * Insert the buffer into the hash, so that it can
		 * be found by incore.
		 */
		bp->b_blkno = bp->b_lblkno = blkno;
		bp->b_offset = offset;

		bgetvp(vp, bp);
		LIST_REMOVE(bp, b_hash);
		bh = bufhash(vp, blkno);
		LIST_INSERT_HEAD(bh, bp, b_hash);

		/*
		 * set B_VMIO bit.  allocbuf() the buffer bigger.  Since the
		 * buffer size starts out as 0, B_CACHE will be set by
		 * allocbuf() for the VMIO case prior to it testing the
		 * backing store for validity.
		 */
		if (vmio) {
			bp->b_flags |= B_VMIO;
#if defined(VFS_BIO_DEBUG)
			if (vp->v_type != VREG)
				printf("getblk: vmioing file type %d???\n", vp->v_type);
#endif
			VOP_GETVOBJECT(vp, &bp->b_object);
		} else {
			bp->b_flags &= ~B_VMIO;
			bp->b_object = NULL;
		}

		allocbuf(bp, size);

		splx(s);
		bp->b_flags &= ~B_DONE;
	}
	KASSERT(BUF_REFCNT(bp) == 1, ("getblk: bp %p not locked",bp));
	return (bp);
}

/*
 * Get an empty, disassociated buffer of given size.  The buffer is initially
 * set to B_INVAL.
 */
struct buf *
geteblk(int size)
{
	struct buf *bp;
	int s;
	int maxsize;

	maxsize = (size + BKVAMASK) & ~BKVAMASK;

	s = splbio();
	while ((bp = getnewbuf(0, 0, size, maxsize)) == 0);
	splx(s);
	allocbuf(bp, size);
	bp->b_flags |= B_INVAL;	/* b_dep cleared by getnewbuf() */
	KASSERT(BUF_REFCNT(bp) == 1, ("geteblk: bp %p not locked",bp));
	return (bp);
}
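/*
 * Usage note: a geteblk() buffer is anonymous scratch memory.  Since it
 * is already marked B_INVAL, a plain brelse() when the caller is done
 * sends it back to the empty queues rather than caching its contents.
 */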
/*
 * This code constitutes the buffer memory from either anonymous system
 * memory (in the case of non-VMIO operations) or from an associated
 * VM object (in the case of VMIO operations).  This code is able to
 * resize a buffer up or down.
 *
 * Note that this code is tricky, and has many complications to resolve
 * deadlock or inconsistent data situations.  Tread lightly!!!
 * There are B_CACHE and B_DELWRI interactions that must be dealt with by
 * the caller.  Calling this code willy nilly can result in the loss of data.
 *
 * allocbuf() only adjusts B_CACHE for VMIO buffers.  getblk() deals with
 * B_CACHE for the non-VMIO case.
 */
int
allocbuf(struct buf *bp, int size)
{
	int newbsize, mbsize;
	int i;

	GIANT_REQUIRED;

	if (BUF_REFCNT(bp) == 0)
		panic("allocbuf: buffer not busy");

	if (bp->b_kvasize < size)
		panic("allocbuf: buffer too small");

	if ((bp->b_flags & B_VMIO) == 0) {
		caddr_t origbuf;
		int origbufsize;
		/*
		 * Just get anonymous memory from the kernel.  Don't
		 * mess with B_CACHE.
		 */
		mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		if (bp->b_flags & B_MALLOC)
			newbsize = mbsize;
		else
			newbsize = round_page(size);

		if (newbsize < bp->b_bufsize) {
			/*
			 * malloced buffers are not shrunk
			 */
			if (bp->b_flags & B_MALLOC) {
				if (newbsize) {
					bp->b_bcount = size;
				} else {
					free(bp->b_data, M_BIOBUF);
					if (bp->b_bufsize) {
						bufmallocspace -= bp->b_bufsize;
						bufspacewakeup();
						bp->b_bufsize = 0;
					}
					bp->b_data = bp->b_kvabase;
					bp->b_bcount = 0;
					bp->b_flags &= ~B_MALLOC;
				}
				return 1;
			}
			vm_hold_free_pages(
			    bp,
			    (vm_offset_t) bp->b_data + newbsize,
			    (vm_offset_t) bp->b_data + bp->b_bufsize);
		} else if (newbsize > bp->b_bufsize) {
			/*
			 * We only use malloced memory on the first allocation,
			 * and revert to page-allocated memory when the buffer
			 * grows.
			 */
			if ( (bufmallocspace < maxbufmallocspace) &&
			    (bp->b_bufsize == 0) &&
			    (mbsize <= PAGE_SIZE/2)) {

				bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
				bp->b_bufsize = mbsize;
				bp->b_bcount = size;
				bp->b_flags |= B_MALLOC;
				bufmallocspace += mbsize;
				return 1;
			}
			origbuf = NULL;
			origbufsize = 0;
			/*
			 * If the buffer is growing on its other-than-first
			 * allocation, then we revert to the page-allocation
			 * scheme.
			 */
			if (bp->b_flags & B_MALLOC) {
				origbuf = bp->b_data;
				origbufsize = bp->b_bufsize;
				bp->b_data = bp->b_kvabase;
				if (bp->b_bufsize) {
					bufmallocspace -= bp->b_bufsize;
					bufspacewakeup();
					bp->b_bufsize = 0;
				}
				bp->b_flags &= ~B_MALLOC;
				newbsize = round_page(newbsize);
			}
			vm_hold_load_pages(
			    bp,
			    (vm_offset_t) bp->b_data + bp->b_bufsize,
			    (vm_offset_t) bp->b_data + newbsize);
			if (origbuf) {
				bcopy(origbuf, bp->b_data, origbufsize);
				free(origbuf, M_BIOBUF);
			}
		}
	} else {
		vm_page_t m;
		int desiredpages;

		newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		desiredpages = (size == 0) ? 0 :
		    num_pages((bp->b_offset & PAGE_MASK) + newbsize);
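		/*
		 * For example, with 4K pages: a buffer at b_offset 6144
		 * growing to newbsize 8192 spans bytes 2048..10239 of its
		 * page run, so desiredpages = num_pages(2048 + 8192) = 3,
		 * even though 8192 bytes alone would fit in two pages.
		 */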
		if (bp->b_flags & B_MALLOC)
			panic("allocbuf: VMIO buffer can't be malloced");
		/*
		 * Set B_CACHE initially if buffer is 0 length or will become
		 * 0-length.
		 */
		if (size == 0 || bp->b_bufsize == 0)
			bp->b_flags |= B_CACHE;

		if (newbsize < bp->b_bufsize) {
			/*
			 * DEV_BSIZE aligned new buffer size is less than the
			 * DEV_BSIZE aligned existing buffer size.  Figure out
			 * if we have to remove any pages.
			 */
			if (desiredpages < bp->b_npages) {
				for (i = desiredpages; i < bp->b_npages; i++) {
					/*
					 * the page is not freed here -- it
					 * is the responsibility of
					 * vnode_pager_setsize
					 */
					m = bp->b_pages[i];
					KASSERT(m != bogus_page,
					    ("allocbuf: bogus page found"));
					while (vm_page_sleep_busy(m, TRUE, "biodep"))
						;

					bp->b_pages[i] = NULL;
					vm_page_unwire(m, 0);
				}
				pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) +
				    (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
				bp->b_npages = desiredpages;
			}
		} else if (size > bp->b_bcount) {
			/*
			 * We are growing the buffer, possibly in a
			 * byte-granular fashion.
			 */
			struct vnode *vp;
			vm_object_t obj;
			vm_offset_t toff;
			vm_offset_t tinc;

			/*
			 * Step 1, bring in the VM pages from the object,
			 * allocating them if necessary.  We must clear
			 * B_CACHE if these pages are not valid for the
			 * range covered by the buffer.
			 */
			vp = bp->b_vp;
			obj = bp->b_object;

			while (bp->b_npages < desiredpages) {
				vm_page_t m;
				vm_pindex_t pi;

				pi = OFF_TO_IDX(bp->b_offset) + bp->b_npages;
				if ((m = vm_page_lookup(obj, pi)) == NULL) {
					/*
					 * note: must allocate system pages
					 * since blocking here could interfere
					 * with paging I/O, no matter which
					 * process we are.
					 */
					m = vm_page_alloc(obj, pi, VM_ALLOC_SYSTEM);
					if (m == NULL) {
						VM_WAIT;
						vm_pageout_deficit += desiredpages - bp->b_npages;
					} else {
						vm_page_wire(m);
						vm_page_wakeup(m);
						bp->b_flags &= ~B_CACHE;
						bp->b_pages[bp->b_npages] = m;
						++bp->b_npages;
					}
					continue;
				}

				/*
				 * We found a page.  If we have to sleep on it,
				 * retry because it might have gotten freed out
				 * from under us.
				 *
				 * We can only test PG_BUSY here.  Blocking on
				 * m->busy might lead to a deadlock:
				 *
				 *	vm_fault->getpages->cluster_read->allocbuf
				 */
				if (vm_page_sleep_busy(m, FALSE, "pgtblk"))
					continue;

				/*
				 * We have a good page.  Should we wakeup the
				 * page daemon?
				 */
				if ((curproc != pageproc) &&
				    ((m->queue - m->pc) == PQ_CACHE) &&
				    ((cnt.v_free_count + cnt.v_cache_count) <
				    (cnt.v_free_min + cnt.v_cache_min))) {
					pagedaemon_wakeup();
				}
				vm_page_flag_clear(m, PG_ZERO);
				vm_page_wire(m);
				bp->b_pages[bp->b_npages] = m;
				++bp->b_npages;
			}

			/*
			 * Step 2.  We've loaded the pages into the buffer,
			 * we have to figure out if we can still have B_CACHE
			 * set.  Note that B_CACHE is set according to the
			 * byte-granular range ( bcount and size ), not the
			 * aligned range ( newbsize ).
			 *
			 * The VM test is against m->valid, which is DEV_BSIZE
			 * aligned.  Needless to say, the validity of the data
			 * needs to also be DEV_BSIZE aligned.  Note that this
			 * fails with NFS if the server or some other client
			 * extends the file's EOF.  If our buffer is resized,
			 * B_CACHE may remain set! XXX
			 */
			toff = bp->b_bcount;
			tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK);

			while ((bp->b_flags & B_CACHE) && toff < size) {
				vm_pindex_t pi;

				if (tinc > (size - toff))
					tinc = size - toff;

				pi = ((bp->b_offset & PAGE_MASK) + toff) >>
				    PAGE_SHIFT;

				vfs_buf_test_cache(
				    bp,
				    bp->b_offset,
				    toff,
				    tinc,
				    bp->b_pages[pi]
				);
				toff += tinc;
				tinc = PAGE_SIZE;
			}

			/*
			 * Step 3, fixup the KVM pmap.  Remember that
			 * bp->b_data is relative to bp->b_offset, but
			 * bp->b_offset may be offset into the first page.
			 */
			bp->b_data = (caddr_t)
			    trunc_page((vm_offset_t)bp->b_data);
			pmap_qenter(
			    (vm_offset_t)bp->b_data,
			    bp->b_pages,
			    bp->b_npages
			);

			bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
			    (vm_offset_t)(bp->b_offset & PAGE_MASK));
		}
	}
	if (newbsize < bp->b_bufsize)
		bufspacewakeup();
	bp->b_bufsize = newbsize;	/* actual buffer allocation */
	bp->b_bcount = size;		/* requested buffer size */
	return 1;
}

/*
 * bufwait:
 *
 *	Wait for buffer I/O completion, returning error status.  The buffer
 *	is left locked and B_DONE on return.  B_EINTR is converted into an
 *	EINTR error and cleared.
 */
int
bufwait(register struct buf * bp)
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0) {
		if (bp->b_iocmd == BIO_READ)
			tsleep(bp, PRIBIO, "biord", 0);
		else
			tsleep(bp, PRIBIO, "biowr", 0);
	}
	splx(s);
	if (bp->b_flags & B_EINTR) {
		bp->b_flags &= ~B_EINTR;
		return (EINTR);
	}
	if (bp->b_ioflags & BIO_ERROR) {
		return (bp->b_error ? bp->b_error : EIO);
	} else {
		return (0);
	}
}
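/*
 * Illustrative caller pattern for bufwait(), roughly what a bread()-style
 * read does after getblk() (abbreviated sketch): initiate the I/O, then
 * sleep here until bufdone() below sets B_DONE:
 *
 *	vfs_busy_pages(bp, 0);
 *	VOP_STRATEGY(bp->b_vp, bp);
 *	error = bufwait(bp);
 */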
/*
 * Call back function from struct bio back up to struct buf.
 * The corresponding initialization lives in sys/conf.h:DEV_STRATEGY().
 */
void
bufdonebio(struct bio *bp)
{
	bufdone(bp->bio_caller2);
}

/*
 * bufdone:
 *
 *	Finish I/O on a buffer, optionally calling a completion function.
 *	This is usually called from an interrupt so process blocking is
 *	not allowed.
 *
 *	biodone is also responsible for setting B_CACHE in a B_VMIO bp.
 *	In a non-VMIO bp, B_CACHE will be set on the next getblk()
 *	assuming B_INVAL is clear.
 *
 *	For the VMIO case, we set B_CACHE if the op was a read and no
 *	read error occurred, or if the op was a write.  B_CACHE is never
 *	set if the buffer is invalid or otherwise uncacheable.
 *
 *	biodone does not mess with B_INVAL, allowing the I/O routine or the
 *	initiator to leave B_INVAL set to brelse the buffer out of existence
 *	in the biodone routine.
 */
void
bufdone(struct buf *bp)
{
	int s;
	void	(*biodone)(struct buf *);

	GIANT_REQUIRED;

	s = splbio();

	KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp, BUF_REFCNT(bp)));
	KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));

	bp->b_flags |= B_DONE;
	runningbufwakeup(bp);

	if (bp->b_iocmd == BIO_DELETE) {
		brelse(bp);
		splx(s);
		return;
	}

	if (bp->b_iocmd == BIO_WRITE) {
		vwakeup(bp);
	}

	/* call optional completion function if requested */
	if (bp->b_iodone != NULL) {
		biodone = bp->b_iodone;
		bp->b_iodone = NULL;
		(*biodone) (bp);
		splx(s);
		return;
	}
	if (LIST_FIRST(&bp->b_dep) != NULL)
		buf_complete(bp);

	if (bp->b_flags & B_VMIO) {
		int i;
		vm_ooffset_t foff;
		vm_page_t m;
		vm_object_t obj;
		int iosize;
		struct vnode *vp = bp->b_vp;

		obj = bp->b_object;

#if defined(VFS_BIO_DEBUG)
		if (vp->v_usecount == 0) {
			panic("biodone: zero vnode ref count");
		}

		if ((vp->v_flag & VOBJBUF) == 0) {
			panic("biodone: vnode is not setup for merged cache");
		}
#endif

		foff = bp->b_offset;
		KASSERT(bp->b_offset != NOOFFSET,
		    ("biodone: no buffer offset"));

#if defined(VFS_BIO_DEBUG)
		if (obj->paging_in_progress < bp->b_npages) {
			printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
			    obj->paging_in_progress, bp->b_npages);
		}
#endif

		/*
		 * Set B_CACHE if the op was a normal read and no error
		 * occurred.  B_CACHE is set for writes in the b*write()
		 * routines.
		 */
	if (bp->b_flags & B_VMIO) {
		int i;
		vm_ooffset_t foff;
		vm_page_t m;
		vm_object_t obj;
		int iosize;
		struct vnode *vp = bp->b_vp;

		obj = bp->b_object;

#if defined(VFS_BIO_DEBUG)
		if (vp->v_usecount == 0) {
			panic("biodone: zero vnode ref count");
		}

		if ((vp->v_flag & VOBJBUF) == 0) {
			panic("biodone: vnode is not setup for merged cache");
		}
#endif

		foff = bp->b_offset;
		KASSERT(bp->b_offset != NOOFFSET,
		    ("biodone: no buffer offset"));

#if defined(VFS_BIO_DEBUG)
		if (obj->paging_in_progress < bp->b_npages) {
			printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
			    obj->paging_in_progress, bp->b_npages);
		}
#endif

		/*
		 * Set B_CACHE if the op was a normal read and no error
		 * occurred.  B_CACHE is set for writes in the b*write()
		 * routines.
		 */
		iosize = bp->b_bcount - bp->b_resid;
		if (bp->b_iocmd == BIO_READ &&
		    !(bp->b_flags & (B_INVAL|B_NOCACHE)) &&
		    !(bp->b_ioflags & BIO_ERROR)) {
			bp->b_flags |= B_CACHE;
		}

		for (i = 0; i < bp->b_npages; i++) {
			int bogusflag = 0;
			int resid;

			resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
			if (resid > iosize)
				resid = iosize;

			/*
			 * cleanup bogus pages, restoring the originals
			 */
			m = bp->b_pages[i];
			if (m == bogus_page) {
				bogusflag = 1;
				m = vm_page_lookup(obj, OFF_TO_IDX(foff));
				if (m == NULL)
					panic("biodone: page disappeared!");
				bp->b_pages[i] = m;
				pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
				    bp->b_pages, bp->b_npages);
			}
#if defined(VFS_BIO_DEBUG)
			if (OFF_TO_IDX(foff) != m->pindex) {
				printf(
"biodone: foff(%jd)/m->pindex(%ju) mismatch\n",
				    (intmax_t)foff, (uintmax_t)m->pindex);
			}
#endif

			/*
			 * In the write case, the valid and clean bits are
			 * already changed correctly ( see bdwrite() ), so we
			 * only need to do this here in the read case.
			 */
			if ((bp->b_iocmd == BIO_READ) && !bogusflag && resid > 0) {
				vfs_page_set_valid(bp, foff, i, m);
			}
			vm_page_flag_clear(m, PG_ZERO);

			/*
			 * when debugging new filesystems or buffer I/O
			 * methods, this is the most common error that pops
			 * up.  if you see this, you have not set the page
			 * busy flag correctly!!!
			 */
			if (m->busy == 0) {
				printf("biodone: page busy < 0, "
				    "pindex: %d, foff: 0x(%x,%x), "
				    "resid: %d, index: %d\n",
				    (int) m->pindex, (int)(foff >> 32),
				    (int) foff & 0xffffffff, resid, i);
				if (!vn_isdisk(vp, NULL))
					printf(" iosize: %ld, lblkno: %jd, flags: 0x%lx, npages: %d\n",
					    bp->b_vp->v_mount->mnt_stat.f_iosize,
					    (intmax_t) bp->b_lblkno,
					    bp->b_flags, bp->b_npages);
				else
					printf(" VDEV, lblkno: %jd, flags: 0x%lx, npages: %d\n",
					    (intmax_t) bp->b_lblkno,
					    bp->b_flags, bp->b_npages);
				printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n",
				    m->valid, m->dirty, m->wire_count);
				panic("biodone: page busy < 0\n");
			}
			vm_page_io_finish(m);
			vm_object_pip_subtract(obj, 1);
			foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
			iosize -= resid;
		}
		if (obj)
			vm_object_pip_wakeupn(obj, 0);
	}

	/*
	 * For asynchronous completions, release the buffer now. The brelse
	 * will do a wakeup there if necessary - so no need to do a wakeup
	 * here in the async case. The sync case always needs to do a wakeup.
	 */
	if (bp->b_flags & B_ASYNC) {
		if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) ||
		    (bp->b_ioflags & BIO_ERROR))
			brelse(bp);
		else
			bqrelse(bp);
	} else {
		wakeup(bp);
	}
	splx(s);
}
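
/*
 * The busy-page protocol in brief: vfs_busy_pages() is called before a
 * strategy routine starts a transfer on a VMIO buffer, and either
 * bufdone() (completed I/O) or vfs_unbusy_pages() below (aborted or
 * incomplete I/O) later reverses its per-page busy and
 * paging-in-progress accounting.
 */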
/*
 * This routine is called in lieu of iodone in the case of
 * incomplete I/O.  This keeps the busy status for pages
 * consistent.
 */
void
vfs_unbusy_pages(struct buf * bp)
{
	int i;

	GIANT_REQUIRED;

	runningbufwakeup(bp);
	if (bp->b_flags & B_VMIO) {
		vm_object_t obj;

		obj = bp->b_object;

		for (i = 0; i < bp->b_npages; i++) {
			vm_page_t m = bp->b_pages[i];

			if (m == bogus_page) {
				m = vm_page_lookup(obj,
				    OFF_TO_IDX(bp->b_offset) + i);
				if (!m) {
					panic("vfs_unbusy_pages: page missing\n");
				}
				bp->b_pages[i] = m;
				pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
				    bp->b_pages, bp->b_npages);
			}
			vm_object_pip_subtract(obj, 1);
			vm_page_flag_clear(m, PG_ZERO);
			vm_page_io_finish(m);
		}
		vm_object_pip_wakeupn(obj, 0);
	}
}

/*
 *	vfs_page_set_valid:
 *
 *	Set the valid bits in a page based on the supplied offset.  The
 *	range is restricted to the buffer's size.
 *
 *	This routine is typically called after a read completes.
 */
static void
vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
{
	vm_ooffset_t soff, eoff;

	GIANT_REQUIRED;
	/*
	 * Start and end offsets in buffer.  eoff - soff may not cross a
	 * page boundary or cross the end of the buffer.  The end of the
	 * buffer, in this case, is our file EOF, not the allocation size
	 * of the buffer.
	 */
	soff = off;
	eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK;
	if (eoff > bp->b_offset + bp->b_bcount)
		eoff = bp->b_offset + bp->b_bcount;

	/*
	 * Set valid range.  This is typically the entire buffer and thus the
	 * entire page.
	 */
	if (eoff > soff) {
		vm_page_set_validclean(
		    m,
		    (vm_offset_t) (soff & PAGE_MASK),
		    (vm_offset_t) (eoff - soff)
		);
	}
}
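
/*
 * Worked example for vfs_page_set_valid() (illustrative numbers, 4K
 * pages): off == 0x11200 rounds eoff up to 0x12000; if the buffer ends
 * at b_offset + b_bcount == 0x11c00, eoff is clamped there, so bytes
 * 0x200 through 0xbff of the page are marked valid and clean via
 * vm_page_set_validclean(m, 0x200, 0xa00).
 */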
/*
 * This routine is called before a device strategy routine.
 * It is used to tell the VM system that paging I/O is in
 * progress, and treat the pages associated with the buffer
 * almost as being PG_BUSY.  Also the object paging_in_progress
 * flag is handled to make sure that the object doesn't become
 * inconsistent.
 *
 * Since I/O has not been initiated yet, certain buffer flags
 * such as BIO_ERROR or B_INVAL may be in an inconsistent state
 * and should be ignored.
 */
void
vfs_busy_pages(struct buf * bp, int clear_modify)
{
	int i, bogus;

	GIANT_REQUIRED;

	if (bp->b_flags & B_VMIO) {
		vm_object_t obj;
		vm_ooffset_t foff;

		obj = bp->b_object;
		foff = bp->b_offset;
		KASSERT(bp->b_offset != NOOFFSET,
		    ("vfs_busy_pages: no buffer offset"));
		vfs_setdirty(bp);

retry:
		for (i = 0; i < bp->b_npages; i++) {
			vm_page_t m = bp->b_pages[i];
			if (vm_page_sleep_busy(m, FALSE, "vbpage"))
				goto retry;
		}

		bogus = 0;
		for (i = 0; i < bp->b_npages; i++) {
			vm_page_t m = bp->b_pages[i];

			vm_page_flag_clear(m, PG_ZERO);
			if ((bp->b_flags & B_CLUSTER) == 0) {
				vm_object_pip_add(obj, 1);
				vm_page_io_start(m);
			}

			/*
			 * When readying a buffer for a read ( i.e.
			 * clear_modify == 0 ), it is important to do
			 * bogus_page replacement for valid pages in
			 * partially instantiated buffers.  Partially
			 * instantiated buffers can, in turn, occur when
			 * reconstituting a buffer from its VM backing store
			 * base.  We only have to do this if B_CACHE is
			 * clear ( which causes the I/O to occur in the
			 * first place ).  The replacement prevents the read
			 * I/O from overwriting potentially dirty VM-backed
			 * pages.  XXX bogus page replacement is, uh, bogus.
			 * It may not work properly with small-block devices.
			 * We need to find a better way.
			 */
			vm_page_protect(m, VM_PROT_NONE);
			if (clear_modify)
				vfs_page_set_valid(bp, foff, i, m);
			else if (m->valid == VM_PAGE_BITS_ALL &&
			    (bp->b_flags & B_CACHE) == 0) {
				bp->b_pages[i] = bogus_page;
				bogus++;
			}
			foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
		}
		if (bogus)
			pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
			    bp->b_pages, bp->b_npages);
	}
}
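
/*
 * Concrete case for the bogus_page replacement above (illustrative):
 * a 16K buffer is being reconstituted from its VM backing store with
 * B_CACHE clear, but its second 4K page is still fully valid (and
 * possibly dirty) in the VM object.  Substituting bogus_page for that
 * slot makes the device read scribble on a throwaway page instead of
 * overwriting the newer VM-backed data; bufdone() later looks the real
 * page back up and reinstalls it.
 */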
/*
 * Tell the VM system that the pages associated with this buffer
 * are clean.  This is used for delayed writes where the data is
 * going to go to disk eventually without additional VM intervention.
 *
 * Note that while we only really need to clean through to b_bcount, we
 * just go ahead and clean through to b_bufsize.
 */
static void
vfs_clean_pages(struct buf * bp)
{
	int i;

	GIANT_REQUIRED;

	if (bp->b_flags & B_VMIO) {
		vm_ooffset_t foff;

		foff = bp->b_offset;
		KASSERT(bp->b_offset != NOOFFSET,
		    ("vfs_clean_pages: no buffer offset"));
		for (i = 0; i < bp->b_npages; i++) {
			vm_page_t m = bp->b_pages[i];
			vm_ooffset_t noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
			vm_ooffset_t eoff = noff;

			if (eoff > bp->b_offset + bp->b_bufsize)
				eoff = bp->b_offset + bp->b_bufsize;
			vfs_page_set_valid(bp, foff, i, m);
			/* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */
			foff = noff;
		}
	}
}

/*
 *	vfs_bio_set_validclean:
 *
 *	Set the range within the buffer to valid and clean.  The range is
 *	relative to the beginning of the buffer, b_offset.  Note that b_offset
 *	itself may be offset from the beginning of the first page.
 */
void
vfs_bio_set_validclean(struct buf *bp, int base, int size)
{
	if (bp->b_flags & B_VMIO) {
		int i;
		int n;

		/*
		 * Fixup base to be relative to beginning of first page.
		 * Set initial n to be the maximum number of bytes in the
		 * first page that can be validated.
		 */
		base += (bp->b_offset & PAGE_MASK);
		n = PAGE_SIZE - (base & PAGE_MASK);

		for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
			vm_page_t m = bp->b_pages[i];

			if (n > size)
				n = size;

			vm_page_set_validclean(m, base & PAGE_MASK, n);
			base += n;
			size -= n;
			n = PAGE_SIZE;
		}
	}
}
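
/*
 * Worked example for vfs_bio_set_validclean() (illustrative numbers,
 * 4K pages): with b_offset == 0x1200, base == 0x100 and size == 0x2000,
 * base becomes 0x300, so page 0 validates bytes 0x300-0xfff (n = 0xd00),
 * page 1 validates the full page, and page 2 validates the remaining
 * 0x300 bytes.
 */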
/*
 *	vfs_bio_clrbuf:
 *
 *	clear a buffer.  This routine essentially fakes an I/O, so we need
 *	to clear BIO_ERROR and B_INVAL.
 *
 *	Note that while we only theoretically need to clear through b_bcount,
 *	we go ahead and clear through b_bufsize.
 */
void
vfs_bio_clrbuf(struct buf *bp)
{
	int i, mask = 0;
	caddr_t sa, ea;

	GIANT_REQUIRED;

	if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) {
		bp->b_flags &= ~B_INVAL;
		bp->b_ioflags &= ~BIO_ERROR;
		if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
		    (bp->b_offset & PAGE_MASK) == 0) {
			mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
			if ((bp->b_pages[0]->valid & mask) == mask) {
				bp->b_resid = 0;
				return;
			}
			if (((bp->b_pages[0]->flags & PG_ZERO) == 0) &&
			    ((bp->b_pages[0]->valid & mask) == 0)) {
				bzero(bp->b_data, bp->b_bufsize);
				bp->b_pages[0]->valid |= mask;
				bp->b_resid = 0;
				return;
			}
		}
		ea = sa = bp->b_data;
		for (i = 0; i < bp->b_npages; i++, sa = ea) {
			int j = ((vm_offset_t)sa & PAGE_MASK) / DEV_BSIZE;

			ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE);
			ea = (caddr_t)(vm_offset_t)ulmin(
			    (u_long)(vm_offset_t)ea,
			    (u_long)(vm_offset_t)bp->b_data + bp->b_bufsize);
			mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
			if ((bp->b_pages[i]->valid & mask) == mask)
				continue;
			if ((bp->b_pages[i]->valid & mask) == 0) {
				if ((bp->b_pages[i]->flags & PG_ZERO) == 0) {
					bzero(sa, ea - sa);
				}
			} else {
				for (; sa < ea; sa += DEV_BSIZE, j++) {
					if (((bp->b_pages[i]->flags & PG_ZERO) == 0) &&
					    (bp->b_pages[i]->valid & (1 << j)) == 0)
						bzero(sa, DEV_BSIZE);
				}
			}
			bp->b_pages[i]->valid |= mask;
			vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
		}
		bp->b_resid = 0;
	} else {
		clrbuf(bp);
	}
}
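
/*
 * Worked example for the valid-bit mask arithmetic above (illustrative
 * numbers, DEV_BSIZE == 512): a 2K buffer starting on a page boundary
 * gives mask = (1 << (0x800 / 512)) - 1 = 0x0f, one bit per 512-byte
 * chunk of the page.  If sa were instead offset half a page in, j would
 * be 4 and the same four bits would be shifted up to 0xf0.
 */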
/*
 * vm_hold_load_pages and vm_hold_free_pages get pages into
 * a buffer's address space.  The pages are anonymous and are
 * not associated with a file object.
 */
static void
vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
{
	vm_offset_t pg;
	vm_page_t p;
	int index;

	GIANT_REQUIRED;

	to = round_page(to);
	from = round_page(from);
	index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;

	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
tryagain:
		/*
		 * note: must allocate system pages since blocking here
		 * could interfere with paging I/O, no matter which
		 * process we are.
		 */
		p = vm_page_alloc(kernel_object,
		    ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
		    VM_ALLOC_SYSTEM);
		if (!p) {
			vm_pageout_deficit += (to - from) >> PAGE_SHIFT;
			VM_WAIT;
			goto tryagain;
		}
		vm_page_wire(p);
		p->valid = VM_PAGE_BITS_ALL;
		vm_page_flag_clear(p, PG_ZERO);
		pmap_qenter(pg, &p, 1);
		bp->b_pages[index] = p;
		vm_page_wakeup(p);
	}
	bp->b_npages = index;
}

/* Return pages associated with this buf to the vm system */
void
vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
{
	vm_offset_t pg;
	vm_page_t p;
	int index, newnpages;

	GIANT_REQUIRED;

	from = round_page(from);
	to = round_page(to);
	newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;

	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
		p = bp->b_pages[index];
		if (p && (index < bp->b_npages)) {
			if (p->busy) {
				printf(
			    "vm_hold_free_pages: blkno: %jd, lblkno: %jd\n",
				    (intmax_t)bp->b_blkno,
				    (intmax_t)bp->b_lblkno);
			}
			bp->b_pages[index] = NULL;
			pmap_qremove(pg, 1);
			vm_page_busy(p);
			vm_page_unwire(p, 0);
			vm_page_free(p);
		}
	}
	bp->b_npages = newnpages;
}

#include "opt_ddb.h"
#ifdef DDB
#include <ddb/ddb.h>

/* DDB command to show buffer data */
DB_SHOW_COMMAND(buffer, db_show_buffer)
{
	/* get args */
	struct buf *bp = (struct buf *)addr;

	if (!have_addr) {
		db_printf("usage: show buffer <addr>\n");
		return;
	}

	db_printf("b_flags = 0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS);
	db_printf(
	    "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n"
	    "b_dev = (%d,%d), b_data = %p, b_blkno = %jd, b_pblkno = %jd\n",
	    bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
	    major(bp->b_dev), minor(bp->b_dev), bp->b_data,
	    (intmax_t)bp->b_blkno, (intmax_t)bp->b_pblkno);
	if (bp->b_npages) {
		int i;

		db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
		for (i = 0; i < bp->b_npages; i++) {
			vm_page_t m;

			m = bp->b_pages[i];
			db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object,
			    (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m));
			if ((i + 1) < bp->b_npages)
				db_printf(",");
		}
		db_printf("\n");
	}
}
#endif /* DDB */
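
/*
 * Usage sketch for the DDB command above (the address is illustrative):
 * from the in-kernel debugger prompt,
 *
 *	db> show buffer 0xc3d2e000
 *
 * dumps the flags, sizes, device, block numbers and backing pages of
 * the given struct buf.
 */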