vfs_bio.c revision 99589
/*
 * Copyright (c) 1994,1997 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Absolutely no warranty of function or purpose is made by the author
 *		John S. Dyson.
 *
 * $FreeBSD: head/sys/kern/vfs_bio.c 99589 2002-07-08 12:21:11Z bde $
 */

/*
 * this file contains a new buffer I/O scheme implementing a coherent
 * VM object and buffer cache scheme.  Pains have been taken to make
 * sure that the performance degradation associated with schemes such
 * as this is not realized.
 *
 * Author:  John S. Dyson
 * Significant help during the development and debugging phases
 * had been provided by David Greenman, also of the FreeBSD core team.
 *
 * see man buf(9) for more info.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/stdint.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/ktr.h>
#include <sys/proc.h>
#include <sys/reboot.h>
#include <sys/resourcevar.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>

static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer");

struct	bio_ops bioops;		/* I/O operation notification */

struct	buf_ops buf_ops_bio = {
	"buf_ops_bio",
	bwrite
};

/*
 * XXX buf is global because kern_shutdown.c and ffs_checkoverlap have
 * carnal knowledge of buffers.  This knowledge should be moved to vfs_bio.c.
 */
struct buf *buf;		/* buffer header pool */
struct mtx buftimelock;		/* Interlock on setting prio and timo */

static void vm_hold_free_pages(struct buf * bp, vm_offset_t from,
		vm_offset_t to);
static void vm_hold_load_pages(struct buf * bp, vm_offset_t from,
		vm_offset_t to);
static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off,
			       int pageno, vm_page_t m);
static void vfs_clean_pages(struct buf * bp);
static void vfs_setdirty(struct buf *bp);
static void vfs_vmio_release(struct buf *bp);
static void vfs_backgroundwritedone(struct buf *bp);
static int flushbufqueues(void);
static void buf_daemon(void);

int vmiodirenable = TRUE;
SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0,
    "Use the VM system for directory writes");
int runningbufspace;
SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0,
    "Amount of presently outstanding async buffer io");
static int bufspace;
SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0,
    "KVA memory used for bufs");
static int maxbufspace;
SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0,
    "Maximum allowed value of bufspace (including buf_daemon)");
static int bufmallocspace;
SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
    "Amount of malloced memory for buffers");
static int maxbufmallocspace;
SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0,
    "Maximum amount of malloced memory for buffers");
static int lobufspace;
SYSCTL_INT(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0,
    "Minimum amount of buffers we want to have");
static int hibufspace;
SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0,
    "Maximum allowed value of bufspace (excluding buf_daemon)");
static int bufreusecnt;
SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW, &bufreusecnt, 0,
    "Number of times we have reused a buffer");
static int buffreekvacnt;
SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0,
    "Number of times we have freed the KVA space from some buffer");
static int bufdefragcnt;
SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0,
    "Number of times we have had to repeat buffer allocation to defragment");
static int lorunningspace;
SYSCTL_INT(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW, &lorunningspace, 0,
    "Minimum preferred space used for in-progress I/O");
static int hirunningspace;
SYSCTL_INT(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW, &hirunningspace, 0,
    "Maximum amount of space to use for in-progress I/O");
static int numdirtybuffers;
SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0,
    "Number of buffers that are dirty (have unwritten changes) at the moment");
static int lodirtybuffers;
SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0,
    "How many buffers we want to have free before bufdaemon can sleep");
static int hidirtybuffers;
SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0,
    "When the number of dirty buffers is considered severe");
static int numfreebuffers;
SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0,
    "Number of free buffers");
static int lofreebuffers;
SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0,
   "XXX Unused");
static int hifreebuffers;
SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0,
   "XXX Complicatedly unused");
static int getnewbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0,
   "Number of calls to getnewbuf");
static int getnewbufrestarts;
SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0,
    "Number of times getnewbuf has had to restart a buffer acquisition");
static int dobkgrdwrite = 1;
SYSCTL_INT(_debug, OID_AUTO, dobkgrdwrite, CTLFLAG_RW, &dobkgrdwrite, 0,
    "Do background writes (honoring the BX_BKGRDWRITE flag)?");

/*
 * Wakeup point for bufdaemon, as well as indicator of whether it is already
 * active.  Set to 1 when the bufdaemon is already "on" the queue, 0 when it
 * is idling.
 */
static int bd_request;

/*
 * bogus page -- for I/O to/from partially complete buffers
 * this is a temporary solution to the problem, but it is not
 * really that bad.  it would be better to split the buffer
 * for input in the case of buffers partially already in memory,
 * but the code is intricate enough already.
 */
vm_page_t bogus_page;

/*
 * Offset for bogus_page.
 * XXX bogus_offset should be local to bufinit
 */
static vm_offset_t bogus_offset;

/*
 * Synchronization (sleep/wakeup) variable for active buffer space requests.
 * Set when wait starts, cleared prior to wakeup().
 * Used in runningbufwakeup() and waitrunningbufspace().
 */
static int runningbufreq;

/*
 * Synchronization (sleep/wakeup) variable for buffer requests.
 * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
 * by and/or.
 * Used in numdirtywakeup(), bufspacewakeup(), bufcountwakeup(), bwillwrite(),
 * getnewbuf(), and getblk().
 */
static int needsbuffer;

/*
 * Mask for index into the buffer hash table, which needs to be power of 2 in
 * size.  Set in kern_vfs_bio_buffer_alloc.
 */
static int bufhashmask;

/*
 * Hash table for all buffers, with a linked list hanging from each table
 * entry.  Set in kern_vfs_bio_buffer_alloc, initialized in buf_init.
 */
static LIST_HEAD(bufhashhdr, buf) *bufhashtbl;

/*
 * Somewhere to store buffers when they are not in another list, to always
 * have them in a list (and thus being able to use the same set of operations
 * on them.)
 */
static struct bufhashhdr invalhash;

/*
 * Definitions for the buffer free lists.
 */
#define BUFFER_QUEUES	6	/* number of free buffer queues */

#define QUEUE_NONE	0	/* on no queue */
#define QUEUE_LOCKED	1	/* locked buffers */
#define QUEUE_CLEAN	2	/* non-B_DELWRI buffers */
#define QUEUE_DIRTY	3	/* B_DELWRI buffers */
#define QUEUE_EMPTYKVA	4	/* empty buffer headers w/KVA assignment */
#define QUEUE_EMPTY	5	/* empty buffer headers */

/* Queues for free buffers with various properties */
static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
/*
 * Single global constant for BUF_WMESG, to avoid getting multiple references.
 * buf_wmesg is referred from macros.
 */
const char *buf_wmesg = BUF_WMESG;

#define VFS_BIO_NEED_ANY	0x01	/* any freeable buffer */
#define VFS_BIO_NEED_DIRTYFLUSH	0x02	/* waiting for dirty buffer flush */
#define VFS_BIO_NEED_FREE	0x04	/* wait for free bufs, hi hysteresis */
#define VFS_BIO_NEED_BUFSPACE	0x08	/* wait for buf space, lo hysteresis */

/*
 * Buffer hash table code.  Note that the logical block scans linearly, which
 * gives us some L1 cache locality.
 */

static __inline
struct bufhashhdr *
bufhash(struct vnode *vnp, daddr_t bn)
{
	return(&bufhashtbl[(((uintptr_t)(vnp) >> 7) + (int)bn) & bufhashmask]);
}

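/*
 * Illustrative sketch (not in the original file): an incore lookup hashes
 * the (vnode, logical block) pair and then walks only that bucket's chain,
 * roughly
 *
 *	struct buf *nbp;
 *
 *	LIST_FOREACH(nbp, bufhash(vp, blkno), b_hash)
 *		if (nbp->b_vp == vp && nbp->b_lblkno == blkno)
 *			break;
 *
 * which is the kind of search gbincore() (used by vfs_backgroundwritedone()
 * below) performs.
 */
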
/*
 *	numdirtywakeup:
 *
 *	If someone is blocked due to there being too many dirty buffers,
 *	and numdirtybuffers is now reasonable, wake them up.
 */

static __inline void
numdirtywakeup(int level)
{
	if (numdirtybuffers <= level) {
		if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) {
			needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH;
			wakeup(&needsbuffer);
		}
	}
}

/*
 *	bufspacewakeup:
 *
 *	Called when buffer space is potentially available for recovery.
 *	getnewbuf() will block on this flag when it is unable to free
 *	sufficient buffer space.  Buffer space becomes recoverable when
 *	bp's get placed back in the queues.
 */

static __inline void
bufspacewakeup(void)
{
	/*
	 * If someone is waiting for BUF space, wake them up.  Even
	 * though we haven't freed the kva space yet, the waiting
	 * process will be able to now.
	 */
	if (needsbuffer & VFS_BIO_NEED_BUFSPACE) {
		needsbuffer &= ~VFS_BIO_NEED_BUFSPACE;
		wakeup(&needsbuffer);
	}
}

/*
 * runningbufwakeup() - in-progress I/O accounting.
 *
 */
static __inline void
runningbufwakeup(struct buf *bp)
{
	if (bp->b_runningbufspace) {
		runningbufspace -= bp->b_runningbufspace;
		bp->b_runningbufspace = 0;
		if (runningbufreq && runningbufspace <= lorunningspace) {
			runningbufreq = 0;
			wakeup(&runningbufreq);
		}
	}
}

/*
 *	bufcountwakeup:
 *
 *	Called when a buffer has been added to one of the free queues to
 *	account for the buffer and to wakeup anyone waiting for free buffers.
 *	This typically occurs when large amounts of metadata are being handled
 *	by the buffer cache ( else buffer space runs out first, usually ).
 */

static __inline void
bufcountwakeup(void)
{
	++numfreebuffers;
	if (needsbuffer) {
		needsbuffer &= ~VFS_BIO_NEED_ANY;
		if (numfreebuffers >= hifreebuffers)
			needsbuffer &= ~VFS_BIO_NEED_FREE;
		wakeup(&needsbuffer);
	}
}

/*
 *	waitrunningbufspace()
 *
 *	runningbufspace is a measure of the amount of I/O currently
 *	running.  This routine is used in async-write situations to
 *	prevent creating huge backups of pending writes to a device.
 *	Only asynchronous writes are governed by this function.
 *
 *	Reads will adjust runningbufspace, but will not block based on it.
 *	The read load has a side effect of reducing the allowed write load.
 *
 *	This does NOT turn an async write into a sync write.  It waits
 *	for earlier writes to complete and generally returns before the
 *	caller's write has reached the device.
 */
static __inline void
waitrunningbufspace(void)
{
	/*
	 * XXX race against wakeup interrupt, currently
	 * protected by Giant.  FIXME!
	 */
	while (runningbufspace > hirunningspace) {
		++runningbufreq;
		tsleep(&runningbufreq, PVM, "wdrain", 0);
	}
}

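/*
 * The accounting above pairs with the write path below: bwrite() charges
 * b_bufsize to runningbufspace before starting the I/O and, for async
 * writes without B_NOWDRAIN, calls waitrunningbufspace(); when the I/O
 * completes, runningbufwakeup() subtracts the charge again and wakes any
 * "wdrain" sleeper once usage drops back to lorunningspace.
 */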

/*
 *	vfs_buf_test_cache:
 *
 *	Called when a buffer is extended.  This function clears the B_CACHE
 *	bit if the newly extended portion of the buffer does not contain
 *	valid data.
 */
static __inline__
void
vfs_buf_test_cache(struct buf *bp,
		  vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
		  vm_page_t m)
{
	GIANT_REQUIRED;

	if (bp->b_flags & B_CACHE) {
		int base = (foff + off) & PAGE_MASK;
		if (vm_page_is_valid(m, base, size) == 0)
			bp->b_flags &= ~B_CACHE;
	}
}

/* Wake up the buffer daemon if necessary */
static __inline__
void
bd_wakeup(int dirtybuflevel)
{
	if (bd_request == 0 && numdirtybuffers >= dirtybuflevel) {
		bd_request = 1;
		wakeup(&bd_request);
	}
}

/*
 * bd_speedup - speedup the buffer cache flushing code
 */

static __inline__
void
bd_speedup(void)
{
	bd_wakeup(1);
}

/*
 * Calculating buffer cache scaling values and reserve space for buffer
 * headers.  This is called during low level kernel initialization and
 * may be called more than once.  We CANNOT write to the memory area
 * being reserved at this time.
 */
caddr_t
kern_vfs_bio_buffer_alloc(caddr_t v, int physmem_est)
{
	/*
	 * physmem_est is in pages.  Convert it to kilobytes (assumes
	 * PAGE_SIZE is >= 1K)
	 */
	physmem_est = physmem_est * (PAGE_SIZE / 1024);

	/*
	 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
	 * For the first 64MB of ram nominally allocate sufficient buffers to
	 * cover 1/4 of our ram.  Beyond the first 64MB allocate additional
	 * buffers to cover 1/20 of our ram over 64MB.  When auto-sizing
	 * the buffer cache we limit the eventual kva reservation to
	 * maxbcache bytes.
	 *
	 * factor represents the 1/4 x ram conversion.
	 */
	if (nbuf == 0) {
		int factor = 4 * BKVASIZE / 1024;

		nbuf = 50;
		if (physmem_est > 4096)
			nbuf += min((physmem_est - 4096) / factor,
			    65536 / factor);
		if (physmem_est > 65536)
			nbuf += (physmem_est - 65536) * 2 / (factor * 5);

		if (maxbcache && nbuf > maxbcache / BKVASIZE)
			nbuf = maxbcache / BKVASIZE;
	}
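	/*
	 * Worked example (a sketch, assuming the usual 16K BKVASIZE, i.e.
	 * factor == 64): with about 128MB usable, physmem_est is roughly
	 * 131072KB, so the first term adds min(1984, 1024) == 1024 buffers
	 * and the second adds (131072 - 65536) * 2 / 320 == 409, giving
	 * nbuf == 50 + 1024 + 409 == 1483 before any maxbcache clamp.
	 */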

#if 0
	/*
	 * Do not allow the buffer_map to be more than 1/2 the size of the
	 * kernel_map.
	 */
	if (nbuf > (kernel_map->max_offset - kernel_map->min_offset) /
	    (BKVASIZE * 2)) {
		nbuf = (kernel_map->max_offset - kernel_map->min_offset) /
		    (BKVASIZE * 2);
		printf("Warning: nbufs capped at %d\n", nbuf);
	}
#endif

	/*
	 * swbufs are used as temporary holders for I/O, such as paging I/O.
	 * We have no less than 16 and no more than 256.
	 */
	nswbuf = max(min(nbuf/4, 256), 16);

	/*
	 * Reserve space for the buffer cache buffers
	 */
	swbuf = (void *)v;
	v = (caddr_t)(swbuf + nswbuf);
	buf = (void *)v;
	v = (caddr_t)(buf + nbuf);

	/*
	 * Calculate the hash table size and reserve space
	 */
	for (bufhashmask = 8; bufhashmask < nbuf / 4; bufhashmask <<= 1)
		;
	bufhashtbl = (void *)v;
	v = (caddr_t)(bufhashtbl + bufhashmask);
	--bufhashmask;

	return(v);
}

/* Initialize the buffer subsystem.  Called before use of any buffers. */
void
bufinit(void)
{
	struct buf *bp;
	int i;

	GIANT_REQUIRED;

	LIST_INIT(&invalhash);
	mtx_init(&buftimelock, "buftime lock", NULL, MTX_DEF);

	for (i = 0; i <= bufhashmask; i++)
		LIST_INIT(&bufhashtbl[i]);

	/* next, make a null set of free lists */
	for (i = 0; i < BUFFER_QUEUES; i++)
		TAILQ_INIT(&bufqueues[i]);

	/* finally, initialize each buffer header and stick on empty q */
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bzero(bp, sizeof *bp);
		bp->b_flags = B_INVAL;	/* we're just an empty header */
		bp->b_dev = NODEV;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_qindex = QUEUE_EMPTY;
		bp->b_xflags = 0;
		LIST_INIT(&bp->b_dep);
		BUF_LOCKINIT(bp);
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	}

	/*
	 * maxbufspace is the absolute maximum amount of buffer space we are
	 * allowed to reserve in KVM and in real terms.  The absolute maximum
	 * is nominally used by buf_daemon.  hibufspace is the nominal maximum
	 * used by most other processes.  The differential is required to
	 * ensure that buf_daemon is able to run when other processes might
	 * be blocked waiting for buffer space.
	 *
	 * maxbufspace is based on BKVASIZE.  Allocating buffers larger than
	 * this may result in KVM fragmentation which is not handled optimally
	 * by the system.
	 */
	maxbufspace = nbuf * BKVASIZE;
	hibufspace = imax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10);
	lobufspace = hibufspace - MAXBSIZE;
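	/*
	 * For perspective (assuming the usual 16K BKVASIZE and 64K MAXBSIZE):
	 * once nbuf exceeds roughly 160, maxbufspace / 4 is larger than
	 * 10 * MAXBSIZE, so hibufspace works out to maxbufspace minus 640K
	 * and lobufspace sits one MAXBSIZE below that.
	 */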

	lorunningspace = 512 * 1024;
	hirunningspace = 1024 * 1024;

/*
 * Limit the amount of malloc memory since it is wired permanently into
 * the kernel space.  Even though this is accounted for in the buffer
 * allocation, we don't want the malloced region to grow uncontrolled.
 * The malloc scheme improves memory utilization significantly on average
 * (small) directories.
 */
	maxbufmallocspace = hibufspace / 20;

/*
 * Reduce the chance of a deadlock occurring by limiting the number
 * of delayed-write dirty buffers we allow to stack up.
 */
	hidirtybuffers = nbuf / 4 + 20;
	numdirtybuffers = 0;
/*
 * To support extreme low-memory systems, make sure hidirtybuffers cannot
 * eat up all available buffer space.  This occurs when our minimum cannot
 * be met.  We try to size hidirtybuffers to 3/4 our buffer space assuming
 * BKVASIZE'd (8K) buffers.
 */
	while (hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
		hidirtybuffers >>= 1;
	}
	lodirtybuffers = hidirtybuffers / 2;

/*
 * Try to keep the number of free buffers in the specified range,
 * and give special processes (e.g. like buf_daemon) access to an
 * emergency reserve.
 */
	lofreebuffers = nbuf / 18 + 5;
	hifreebuffers = 2 * lofreebuffers;
	numfreebuffers = nbuf;

/*
 * Maximum number of async ops initiated per buf_daemon loop.  This is
 * somewhat of a hack at the moment, we really need to limit ourselves
 * based on the number of bytes of I/O in-transit that were initiated
 * from buf_daemon.
 */

	bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
	bogus_page = vm_page_alloc(kernel_object,
			((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
			VM_ALLOC_NORMAL);
	cnt.v_wire_count++;
}

/*
 * bfreekva() - free the kva allocation for a buffer.
 *
 *	Must be called at splbio() or higher as this is the only locking for
 *	buffer_map.
 *
 *	Since this call frees up buffer space, we call bufspacewakeup().
 */
static void
bfreekva(struct buf * bp)
{
	GIANT_REQUIRED;

	if (bp->b_kvasize) {
		++buffreekvacnt;
		bufspace -= bp->b_kvasize;
		vm_map_delete(buffer_map,
		    (vm_offset_t) bp->b_kvabase,
		    (vm_offset_t) bp->b_kvabase + bp->b_kvasize
		);
		bp->b_kvasize = 0;
		bufspacewakeup();
	}
}

/*
 *	bremfree:
 *
 *	Remove the buffer from the appropriate free list.
 */
void
bremfree(struct buf * bp)
{
	int s = splbio();
	int old_qindex = bp->b_qindex;

	GIANT_REQUIRED;

	if (bp->b_qindex != QUEUE_NONE) {
		KASSERT(BUF_REFCNT(bp) == 1, ("bremfree: bp %p not locked",bp));
		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
		bp->b_qindex = QUEUE_NONE;
	} else {
		if (BUF_REFCNT(bp) <= 1)
			panic("bremfree: removing a buffer not on a queue");
	}

	/*
	 * Fixup numfreebuffers count.  If the buffer is invalid or not
	 * delayed-write, and it was on the EMPTY, LRU, or AGE queues,
	 * the buffer was free and we must decrement numfreebuffers.
	 */
	if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
		switch(old_qindex) {
		case QUEUE_DIRTY:
		case QUEUE_CLEAN:
		case QUEUE_EMPTY:
		case QUEUE_EMPTYKVA:
			--numfreebuffers;
			break;
		default:
			break;
		}
	}
	splx(s);
}


/*
 * Get a buffer with the specified data.  Look in the cache first.  We
 * must clear BIO_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE
 * is set, the buffer is valid and we do not have to do anything ( see
 * getblk() ).  This is really just a special case of breadn().
 */
int
bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
    struct buf ** bpp)
{

	return (breadn(vp, blkno, size, 0, 0, 0, cred, bpp));
}

/*
 * Operates like bread, but also starts asynchronous I/O on
 * read-ahead blocks.  We must clear BIO_ERROR and B_INVAL prior
 * to initiating I/O.  If B_CACHE is set, the buffer is valid
 * and we do not have to do anything.
 */
int
breadn(struct vnode * vp, daddr_t blkno, int size,
    daddr_t * rablkno, int *rabsize,
    int cnt, struct ucred * cred, struct buf ** bpp)
{
	struct buf *bp, *rabp;
	int i;
	int rv = 0, readwait = 0;

	*bpp = bp = getblk(vp, blkno, size, 0, 0);

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curthread != PCPU_GET(idlethread))
			curthread->td_proc->p_stats->p_ru.ru_inblock++;
		bp->b_iocmd = BIO_READ;
		bp->b_flags &= ~B_INVAL;
		bp->b_ioflags &= ~BIO_ERROR;
		if (bp->b_rcred == NOCRED && cred != NOCRED)
			bp->b_rcred = crhold(cred);
		vfs_busy_pages(bp, 0);
		VOP_STRATEGY(vp, bp);
		++readwait;
	}

	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
		if (inmem(vp, *rablkno))
			continue;
		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);

		if ((rabp->b_flags & B_CACHE) == 0) {
			if (curthread != PCPU_GET(idlethread))
				curthread->td_proc->p_stats->p_ru.ru_inblock++;
			rabp->b_flags |= B_ASYNC;
			rabp->b_flags &= ~B_INVAL;
			rabp->b_ioflags &= ~BIO_ERROR;
			rabp->b_iocmd = BIO_READ;
			if (rabp->b_rcred == NOCRED && cred != NOCRED)
				rabp->b_rcred = crhold(cred);
			vfs_busy_pages(rabp, 0);
			BUF_KERNPROC(rabp);
			VOP_STRATEGY(vp, rabp);
		} else {
			brelse(rabp);
		}
	}

	if (readwait) {
		rv = bufwait(bp);
	}
	return (rv);
}

/*
 * Write, release buffer on completion.  (Done by iodone
 * if async).  Do not bother writing anything if the buffer
 * is invalid.
 *
 * Note that we set B_CACHE here, indicating that buffer is
 * fully valid and thus cacheable.  This is true even of NFS
 * now so we set it generally.  This could be set either here
 * or in biodone() since the I/O is synchronous.  We put it
 * here.
 */

int
bwrite(struct buf * bp)
{
	int oldflags, s;
	struct buf *newbp;

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return (0);
	}

	oldflags = bp->b_flags;

	if (BUF_REFCNT(bp) == 0)
		panic("bwrite: buffer is not busy???");
	s = splbio();
	/*
	 * If a background write is already in progress, delay
	 * writing this block if it is asynchronous. Otherwise
	 * wait for the background write to complete.
	 */
	if (bp->b_xflags & BX_BKGRDINPROG) {
		if (bp->b_flags & B_ASYNC) {
			splx(s);
			bdwrite(bp);
			return (0);
		}
		bp->b_xflags |= BX_BKGRDWAIT;
		tsleep(&bp->b_xflags, PRIBIO, "bwrbg", 0);
		if (bp->b_xflags & BX_BKGRDINPROG)
			panic("bwrite: still writing");
	}

	/* Mark the buffer clean */
	bundirty(bp);

	/*
	 * If this buffer is marked for background writing and we
	 * do not have to wait for it, make a copy and write the
	 * copy so as to leave this buffer ready for further use.
	 *
	 * This optimization eats a lot of memory.  If we have a page
	 * or buffer shortfall we can't do it.
	 */
	if (dobkgrdwrite && (bp->b_xflags & BX_BKGRDWRITE) &&
	    (bp->b_flags & B_ASYNC) &&
	    !vm_page_count_severe() &&
	    !buf_dirty_count_severe()) {
		if (bp->b_iodone != NULL) {
			printf("bp->b_iodone = %p\n", bp->b_iodone);
			panic("bwrite: need chained iodone");
		}

		/* get a new block */
		newbp = geteblk(bp->b_bufsize);

		/* set it to be identical to the old block */
		memcpy(newbp->b_data, bp->b_data, bp->b_bufsize);
		bgetvp(bp->b_vp, newbp);
		newbp->b_lblkno = bp->b_lblkno;
		newbp->b_blkno = bp->b_blkno;
		newbp->b_offset = bp->b_offset;
		newbp->b_iodone = vfs_backgroundwritedone;
		newbp->b_flags |= B_ASYNC;
		newbp->b_flags &= ~B_INVAL;

		/* move over the dependencies */
		if (LIST_FIRST(&bp->b_dep) != NULL)
			buf_movedeps(bp, newbp);

		/*
		 * Initiate write on the copy, release the original to
		 * the B_LOCKED queue so that it cannot go away until
		 * the background write completes. If not locked it could go
		 * away and then be reconstituted while it was being written.
		 * If the reconstituted buffer were written, we could end up
		 * with two background copies being written at the same time.
		 */
		bp->b_xflags |= BX_BKGRDINPROG;
		bp->b_flags |= B_LOCKED;
		bqrelse(bp);
		bp = newbp;
	}

	bp->b_flags &= ~B_DONE;
	bp->b_ioflags &= ~BIO_ERROR;
	bp->b_flags |= B_WRITEINPROG | B_CACHE;
	bp->b_iocmd = BIO_WRITE;

	bp->b_vp->v_numoutput++;
	vfs_busy_pages(bp, 1);

	/*
	 * Normal bwrites pipeline writes
	 */
	bp->b_runningbufspace = bp->b_bufsize;
	runningbufspace += bp->b_runningbufspace;

	if (curthread != PCPU_GET(idlethread))
		curthread->td_proc->p_stats->p_ru.ru_oublock++;
	splx(s);
	if (oldflags & B_ASYNC)
		BUF_KERNPROC(bp);
	BUF_STRATEGY(bp);

	if ((oldflags & B_ASYNC) == 0) {
		int rtval = bufwait(bp);
		brelse(bp);
		return (rtval);
	} else if ((oldflags & B_NOWDRAIN) == 0) {
		/*
		 * don't allow the async write to saturate the I/O
		 * system.  Deadlocks can occur only if a device strategy
		 * routine (like in MD) turns around and issues another
		 * high-level write, in which case B_NOWDRAIN is expected
		 * to be set.  Otherwise we will not deadlock here because
		 * we are blocking waiting for I/O that is already in-progress
		 * to complete.
		 */
		waitrunningbufspace();
	}

	return (0);
}

/*
 * Complete a background write started from bwrite.
 */
static void
vfs_backgroundwritedone(bp)
	struct buf *bp;
{
	struct buf *origbp;

	/*
	 * Find the original buffer that we are writing.
	 */
	if ((origbp = gbincore(bp->b_vp, bp->b_lblkno)) == NULL)
		panic("backgroundwritedone: lost buffer");
	/*
	 * Process dependencies then return any unfinished ones.
	 */
	if (LIST_FIRST(&bp->b_dep) != NULL)
		buf_complete(bp);
	if (LIST_FIRST(&bp->b_dep) != NULL)
		buf_movedeps(bp, origbp);
	/*
	 * Clear the BX_BKGRDINPROG flag in the original buffer
	 * and awaken it if it is waiting for the write to complete.
	 * If BX_BKGRDINPROG is not set in the original buffer it must
	 * have been released and re-instantiated - which is not legal.
	 */
	KASSERT((origbp->b_xflags & BX_BKGRDINPROG),
	    ("backgroundwritedone: lost buffer2"));
	origbp->b_xflags &= ~BX_BKGRDINPROG;
	if (origbp->b_xflags & BX_BKGRDWAIT) {
		origbp->b_xflags &= ~BX_BKGRDWAIT;
		wakeup(&origbp->b_xflags);
	}
	/*
	 * Clear the B_LOCKED flag and remove it from the locked
	 * queue if it currently resides there.
	 */
	origbp->b_flags &= ~B_LOCKED;
	if (BUF_LOCK(origbp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
		bremfree(origbp);
		bqrelse(origbp);
	}
	/*
	 * This buffer is marked B_NOCACHE, so when it is released
	 * by biodone, it will be tossed. We mark it with BIO_READ
	 * to avoid biodone doing a second vwakeup.
	 */
	bp->b_flags |= B_NOCACHE;
	bp->b_iocmd = BIO_READ;
	bp->b_flags &= ~(B_CACHE | B_DONE);
	bp->b_iodone = 0;
	bufdone(bp);
}

/*
 * Delayed write. (Buffer is marked dirty).  Do not bother writing
 * anything if the buffer is marked invalid.
 *
 * Note that since the buffer must be completely valid, we can safely
 * set B_CACHE.  In fact, we have to set B_CACHE here rather than in
 * biodone() in order to prevent getblk from writing the buffer
 * out synchronously.
 */
void
bdwrite(struct buf * bp)
{
	GIANT_REQUIRED;

	if (BUF_REFCNT(bp) == 0)
		panic("bdwrite: buffer is not busy");

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return;
	}
	bdirty(bp);

	/*
	 * Set B_CACHE, indicating that the buffer is fully valid.  This is
	 * true even of NFS now.
	 */
	bp->b_flags |= B_CACHE;

	/*
	 * This bmap keeps the system from needing to do the bmap later,
	 * perhaps when the system is attempting to do a sync.  Since it
	 * is likely that the indirect block -- or whatever other datastructure
	 * that the filesystem needs is still in memory now, it is a good
	 * thing to do this.  Note also, that if the pageout daemon is
	 * requesting a sync -- there might not be enough memory to do
	 * the bmap then...  So, this is important to do.
	 */
	if (bp->b_lblkno == bp->b_blkno) {
		VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
	}

	/*
	 * Set the *dirty* buffer range based upon the VM system dirty pages.
	 */
	vfs_setdirty(bp);

	/*
	 * We need to do this here to satisfy the vnode_pager and the
	 * pageout daemon, so that it thinks that the pages have been
	 * "cleaned".  Note that since the pages are in a delayed write
	 * buffer -- the VFS layer "will" see that the pages get written
	 * out on the next sync, or perhaps the cluster will be completed.
	 */
	vfs_clean_pages(bp);
	bqrelse(bp);

	/*
	 * Wakeup the buffer flushing daemon if we have a lot of dirty
	 * buffers (midpoint between our recovery point and our stall
	 * point).
	 */
	bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);

	/*
	 * note: we cannot initiate I/O from a bdwrite even if we wanted to,
	 * due to the softdep code.
	 */
}

/*
 *	bdirty:
 *
 *	Turn buffer into delayed write request.  We must clear BIO_READ and
 *	B_RELBUF, and we must set B_DELWRI.  We reassign the buffer to
 *	itself to properly update it in the dirty/clean lists.  We mark it
 *	B_DONE to ensure that any asynchronization of the buffer properly
 *	clears B_DONE ( else a panic will occur later ).
 *
 *	bdirty() is kinda like bdwrite() - we have to clear B_INVAL which
 *	might have been set pre-getblk().  Unlike bwrite/bdwrite, bdirty()
 *	should only be called if the buffer is known-good.
 *
 *	Since the buffer is not on a queue, we do not update the numfreebuffers
 *	count.
 *
 *	Must be called at splbio().
 *	The buffer must be on QUEUE_NONE.
 */
void
bdirty(bp)
	struct buf *bp;
{
	KASSERT(bp->b_qindex == QUEUE_NONE,
	    ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex));
	bp->b_flags &= ~(B_RELBUF);
	bp->b_iocmd = BIO_WRITE;

	if ((bp->b_flags & B_DELWRI) == 0) {
		bp->b_flags |= B_DONE | B_DELWRI;
		reassignbuf(bp, bp->b_vp);
		++numdirtybuffers;
		bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
	}
}

/*
 *	bundirty:
 *
 *	Clear B_DELWRI for buffer.
 *
 *	Since the buffer is not on a queue, we do not update the numfreebuffers
 *	count.
 *
 *	Must be called at splbio().
 *	The buffer must be on QUEUE_NONE.
 */

void
bundirty(bp)
	struct buf *bp;
{
	KASSERT(bp->b_qindex == QUEUE_NONE,
	    ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex));

	if (bp->b_flags & B_DELWRI) {
		bp->b_flags &= ~B_DELWRI;
		reassignbuf(bp, bp->b_vp);
		--numdirtybuffers;
		numdirtywakeup(lodirtybuffers);
	}
	/*
	 * Since it is now being written, we can clear its deferred write flag.
	 */
	bp->b_flags &= ~B_DEFERRED;
}

/*
 *	bawrite:
 *
 *	Asynchronous write.  Start output on a buffer, but do not wait for
 *	it to complete.  The buffer is released when the output completes.
 *
 *	bwrite() ( or the VOP routine anyway ) is responsible for handling
 *	B_INVAL buffers.  Not us.
 */
void
bawrite(struct buf * bp)
{
	bp->b_flags |= B_ASYNC;
	(void) BUF_WRITE(bp);
}

/*
 *	bwillwrite:
 *
 *	Called prior to the locking of any vnodes when we are expecting to
 *	write.  We do not want to starve the buffer cache with too many
 *	dirty buffers so we block here.  By blocking prior to the locking
 *	of any vnodes we attempt to avoid the situation where a locked vnode
 *	prevents the various system daemons from flushing related buffers.
 */

void
bwillwrite(void)
{
	if (numdirtybuffers >= hidirtybuffers) {
		int s;

		mtx_lock(&Giant);
		s = splbio();
		while (numdirtybuffers >= hidirtybuffers) {
			bd_wakeup(1);
			needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH;
			tsleep(&needsbuffer, (PRIBIO + 4), "flswai", 0);
		}
		splx(s);
		mtx_unlock(&Giant);
	}
}

/*
 * Return true if we have too many dirty buffers.
 */
int
buf_dirty_count_severe(void)
{
	return(numdirtybuffers >= hidirtybuffers);
}

/*
 *	brelse:
 *
 *	Release a busy buffer and, if requested, free its resources.  The
 *	buffer will be stashed in the appropriate bufqueue[] allowing it
 *	to be accessed later as a cache entity or reused for other purposes.
 */
void
brelse(struct buf * bp)
{
	int s;

	GIANT_REQUIRED;

	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
	    ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));

	s = splbio();

	if (bp->b_flags & B_LOCKED)
		bp->b_ioflags &= ~BIO_ERROR;

	if (bp->b_iocmd == BIO_WRITE &&
	    (bp->b_ioflags & BIO_ERROR) &&
	    !(bp->b_flags & B_INVAL)) {
		/*
		 * Failed write, redirty.  Must clear BIO_ERROR to prevent
		 * pages from being scrapped.  If B_INVAL is set then
		 * this case is not run and the next case is run to
		 * destroy the buffer.  B_INVAL can occur if the buffer
		 * is outside the range supported by the underlying device.
		 */
		bp->b_ioflags &= ~BIO_ERROR;
		bdirty(bp);
	} else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) ||
	    (bp->b_ioflags & BIO_ERROR) ||
	    bp->b_iocmd == BIO_DELETE || (bp->b_bufsize <= 0)) {
		/*
		 * Either a failed I/O or we were asked to free or not
		 * cache the buffer.
		 */
		bp->b_flags |= B_INVAL;
		if (LIST_FIRST(&bp->b_dep) != NULL)
			buf_deallocate(bp);
		if (bp->b_flags & B_DELWRI) {
			--numdirtybuffers;
			numdirtywakeup(lodirtybuffers);
		}
		bp->b_flags &= ~(B_DELWRI | B_CACHE);
		if ((bp->b_flags & B_VMIO) == 0) {
			if (bp->b_bufsize)
				allocbuf(bp, 0);
			if (bp->b_vp)
				brelvp(bp);
		}
	}

	/*
	 * We must clear B_RELBUF if B_DELWRI is set.  If vfs_vmio_release()
	 * is called with B_DELWRI set, the underlying pages may wind up
	 * getting freed causing a previous write (bdwrite()) to get 'lost'
	 * because pages associated with a B_DELWRI bp are marked clean.
	 *
	 * We still allow the B_INVAL case to call vfs_vmio_release(), even
	 * if B_DELWRI is set.
	 *
	 * If B_DELWRI is not set we may have to set B_RELBUF if we are low
	 * on pages to return pages to the VM page queues.
	 */
	if (bp->b_flags & B_DELWRI)
		bp->b_flags &= ~B_RELBUF;
	else if (vm_page_count_severe() && !(bp->b_xflags & BX_BKGRDINPROG))
		bp->b_flags |= B_RELBUF;

	/*
	 * VMIO buffer rundown.  It is not very necessary to keep a VMIO buffer
	 * constituted, not even NFS buffers now.  Two flags affect this.  If
	 * B_INVAL, the struct buf is invalidated but the VM object is kept
	 * around ( i.e. so it is trivial to reconstitute the buffer later ).
	 *
	 * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be
	 * invalidated.  BIO_ERROR cannot be set for a failed write unless the
	 * buffer is also B_INVAL because it hits the re-dirtying code above.
	 *
	 * Normally we can do this whether a buffer is B_DELWRI or not.  If
	 * the buffer is an NFS buffer, it is tracking piecemeal writes or
	 * the commit state and we cannot afford to lose the buffer. If the
	 * buffer has a background write in progress, we need to keep it
	 * around to prevent it from being reconstituted and starting a second
	 * background write.
	 */
	if ((bp->b_flags & B_VMIO)
	    && !(bp->b_vp->v_tag == VT_NFS &&
		 !vn_isdisk(bp->b_vp, NULL) &&
		 (bp->b_flags & B_DELWRI))
	    ) {

		int i, j, resid;
		vm_page_t m;
		off_t foff;
		vm_pindex_t poff;
		vm_object_t obj;
		struct vnode *vp;

		vp = bp->b_vp;
		obj = bp->b_object;

		/*
		 * Get the base offset and length of the buffer.  Note that
		 * in the VMIO case if the buffer block size is not
		 * page-aligned then b_data pointer may not be page-aligned.
		 * But our b_pages[] array *IS* page aligned.
		 *
		 * block sizes less than DEV_BSIZE (usually 512) are not
		 * supported due to the page granularity bits (m->valid,
		 * m->dirty, etc...).
		 *
		 * See man buf(9) for more information
		 */
		resid = bp->b_bufsize;
		foff = bp->b_offset;

		for (i = 0; i < bp->b_npages; i++) {
			int had_bogus = 0;

			m = bp->b_pages[i];
			vm_page_flag_clear(m, PG_ZERO);

			/*
			 * If we hit a bogus page, fixup *all* the bogus pages
			 * now.
			 */
			if (m == bogus_page) {
				poff = OFF_TO_IDX(bp->b_offset);
				had_bogus = 1;

				for (j = i; j < bp->b_npages; j++) {
					vm_page_t mtmp;
					mtmp = bp->b_pages[j];
					if (mtmp == bogus_page) {
						mtmp = vm_page_lookup(obj, poff + j);
						if (!mtmp) {
							panic("brelse: page missing\n");
						}
						bp->b_pages[j] = mtmp;
					}
				}

				if ((bp->b_flags & B_INVAL) == 0) {
					pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
				}
				m = bp->b_pages[i];
			}
			if ((bp->b_flags & B_NOCACHE) || (bp->b_ioflags & BIO_ERROR)) {
				int poffset = foff & PAGE_MASK;
				int presid = resid > (PAGE_SIZE - poffset) ?
					(PAGE_SIZE - poffset) : resid;

				KASSERT(presid >= 0, ("brelse: extra page"));
				vm_page_set_invalid(m, poffset, presid);
				if (had_bogus)
					printf("avoided corruption bug in bogus_page/brelse code\n");
			}
			resid -= PAGE_SIZE - (foff & PAGE_MASK);
			foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
		}

		if (bp->b_flags & (B_INVAL | B_RELBUF))
			vfs_vmio_release(bp);

	} else if (bp->b_flags & B_VMIO) {

		if (bp->b_flags & (B_INVAL | B_RELBUF)) {
			vfs_vmio_release(bp);
		}

	}

	if (bp->b_qindex != QUEUE_NONE)
		panic("brelse: free buffer onto another queue???");
	if (BUF_REFCNT(bp) > 1) {
		/* do not release to free list */
		BUF_UNLOCK(bp);
		splx(s);
		return;
	}

	/* enqueue */

	/* buffers with no memory */
	if (bp->b_bufsize == 0) {
		bp->b_flags |= B_INVAL;
		bp->b_xflags &= ~BX_BKGRDWRITE;
		if (bp->b_xflags & BX_BKGRDINPROG)
			panic("losing buffer 1");
		if (bp->b_kvasize) {
			bp->b_qindex = QUEUE_EMPTYKVA;
		} else {
			bp->b_qindex = QUEUE_EMPTY;
		}
		TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	/* buffers with junk contents */
	} else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) ||
	    (bp->b_ioflags & BIO_ERROR)) {
		bp->b_flags |= B_INVAL;
		bp->b_xflags &= ~BX_BKGRDWRITE;
		if (bp->b_xflags & BX_BKGRDINPROG)
			panic("losing buffer 2");
		bp->b_qindex = QUEUE_CLEAN;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;

	/* buffers that are locked */
	} else if (bp->b_flags & B_LOCKED) {
		bp->b_qindex = QUEUE_LOCKED;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);

	/* remaining buffers */
	} else {
		if (bp->b_flags & B_DELWRI)
			bp->b_qindex = QUEUE_DIRTY;
		else
			bp->b_qindex = QUEUE_CLEAN;
		if (bp->b_flags & B_AGE)
			TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
		else
			TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
	}

	/*
	 * If B_INVAL, clear B_DELWRI.  We've already placed the buffer
	 * on the correct queue.
	 */
	if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI))
		bundirty(bp);

	/*
	 * Fixup numfreebuffers count.  The bp is on an appropriate queue
	 * unless locked.  We then bump numfreebuffers if it is not B_DELWRI.
	 * We've already handled the B_INVAL case ( B_DELWRI will be clear
	 * if B_INVAL is set ).
	 */

	if ((bp->b_flags & B_LOCKED) == 0 && !(bp->b_flags & B_DELWRI))
		bufcountwakeup();

	/*
	 * Something we can maybe free or reuse
	 */
	if (bp->b_bufsize || bp->b_kvasize)
		bufspacewakeup();

	/* unlock */
	BUF_UNLOCK(bp);
	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF |
			B_DIRECT | B_NOWDRAIN);
	if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
		panic("brelse: not dirty");
	splx(s);
}

/*
 * Release a buffer back to the appropriate queue but do not try to free
 * it.  The buffer is expected to be used again soon.
 *
 * bqrelse() is used by bdwrite() to requeue a delayed write, and used by
 * biodone() to requeue an async I/O on completion.  It is also used when
 * known good buffers need to be requeued but we think we may need the data
 * again soon.
 *
 * XXX we should be able to leave the B_RELBUF hint set on completion.
 */
void
bqrelse(struct buf * bp)
{
	int s;

	s = splbio();

	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));

	if (bp->b_qindex != QUEUE_NONE)
		panic("bqrelse: free buffer onto another queue???");
	if (BUF_REFCNT(bp) > 1) {
		/* do not release to free list */
		BUF_UNLOCK(bp);
		splx(s);
		return;
	}
	if (bp->b_flags & B_LOCKED) {
		bp->b_ioflags &= ~BIO_ERROR;
		bp->b_qindex = QUEUE_LOCKED;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
		/* buffers with stale but valid contents */
	} else if (bp->b_flags & B_DELWRI) {
		bp->b_qindex = QUEUE_DIRTY;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
	} else if (vm_page_count_severe()) {
		/*
		 * We are too low on memory, we have to try to free the
		 * buffer (most importantly: the wired pages making up its
		 * backing store) *now*.
		 */
		splx(s);
		brelse(bp);
		return;
	} else {
		bp->b_qindex = QUEUE_CLEAN;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
	}

	if ((bp->b_flags & B_LOCKED) == 0 &&
	    ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))) {
		bufcountwakeup();
	}

	/*
	 * Something we can maybe free or reuse.
	 */
1427235783Skib	 */
1428235783Skib	if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
1429235783Skib		bufspacewakeup();
1430235783Skib
1431235783Skib	/* unlock */
1432235783Skib	BUF_UNLOCK(bp);
1433235783Skib	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
1434235783Skib	if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
1435235783Skib		panic("bqrelse: not dirty");
1436235783Skib	splx(s);
1437235783Skib}
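/*
 * Illustrative sketch (editor's example, not part of the committed file):
 * a hypothetical caller deciding between bqrelse() and brelse() once it
 * is done with a buffer obtained from getblk().  The flags are real; the
 * surrounding decision logic is made up for illustration.
 *
 *	if (likely_to_reuse_soon)
 *		bqrelse(bp);			(retain contents, requeue)
 *	else {
 *		bp->b_flags |= B_RELBUF;	(hint: give pages back to VM)
 *		brelse(bp);
 *	}
 *
 * Setting B_INVAL instead of B_RELBUF throws the contents away entirely,
 * as described in the brelse() queueing code above.
 */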
1438235783Skib
1439235783Skib/* Give pages used by the bp back to the VM system (where possible) */
1440235783Skibstatic void
1441235783Skibvfs_vmio_release(bp)
1442235783Skib	struct buf *bp;
1443235783Skib{
1444235783Skib	int i;
1445235783Skib	vm_page_t m;
1446235783Skib
1447235783Skib	GIANT_REQUIRED;
1448235783Skib
1449235783Skib	for (i = 0; i < bp->b_npages; i++) {
1450235783Skib		m = bp->b_pages[i];
1451235783Skib		bp->b_pages[i] = NULL;
1452235783Skib		/*
1453235783Skib		 * In order to keep page LRU ordering consistent, put
1454235783Skib		 * everything on the inactive queue.
1455235783Skib		 */
1456235783Skib		vm_page_unwire(m, 0);
1457235783Skib		/*
1458235783Skib		 * We don't mess with busy pages, it is
1459235783Skib		 * We don't mess with busy pages; it is
1460235783Skib		 * busied the pages to deal with them.
1461235783Skib		 */
1462235783Skib		if ((m->flags & PG_BUSY) || (m->busy != 0))
1463235783Skib			continue;
1464235783Skib
1465235783Skib		if (m->wire_count == 0) {
1466235783Skib			vm_page_flag_clear(m, PG_ZERO);
1467235783Skib			/*
1468235783Skib			 * Might as well free the page if we can and it has
1469235783Skib			 * no valid data.  We also free the page if the
1470235783Skib			 * buffer was used for direct I/O
1471235783Skib			 */
1472235783Skib			if ((bp->b_flags & B_ASYNC) == 0 && !m->valid &&
1473280183Sdumbbell			    m->hold_count == 0) {
1474280183Sdumbbell				vm_page_busy(m);
1475280183Sdumbbell				vm_page_protect(m, VM_PROT_NONE);
1476235783Skib				vm_page_free(m);
1477235783Skib			} else if (bp->b_flags & B_DIRECT) {
1478280183Sdumbbell				vm_page_try_to_free(m);
1479235783Skib			} else if (vm_page_count_severe()) {
1480235783Skib				vm_page_try_to_cache(m);
1481235783Skib			}
1482235783Skib		}
1483235783Skib	}
1484235783Skib	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
1485235783Skib
1486235783Skib	if (bp->b_bufsize) {
1487235783Skib		bufspacewakeup();
1488235783Skib		bp->b_bufsize = 0;
1489280183Sdumbbell	}
1490235783Skib	bp->b_npages = 0;
1491235783Skib	bp->b_flags &= ~B_VMIO;
1492235783Skib	if (bp->b_vp)
1493235783Skib		brelvp(bp);
1494277487Skib}
1495235783Skib
1496235783Skib/*
1497235783Skib * Check to see if a block is currently memory resident.
1498235783Skib */
1499235783Skibstruct buf *
1500235783Skibgbincore(struct vnode * vp, daddr_t blkno)
1501235783Skib{
1502235783Skib	struct buf *bp;
1503235783Skib	struct bufhashhdr *bh;
1504235783Skib
1505235783Skib	bh = bufhash(vp, blkno);
1506235783Skib
1507235783Skib	/* Search hash chain */
1508235783Skib	LIST_FOREACH(bp, bh, b_hash) {
1509235783Skib		/* hit */
1510235783Skib		if (bp->b_vp == vp && bp->b_lblkno == blkno &&
1511235783Skib		    (bp->b_flags & B_INVAL) == 0) {
1512235783Skib			break;
1513235783Skib		}
1514235783Skib	}
1515235783Skib	return (bp);
1516235783Skib}
1517235783Skib
1518235783Skib/*
1519235783Skib *	vfs_bio_awrite:
1520235783Skib *
1521235783Skib *	Implement clustered async writes for clearing out B_DELWRI buffers.
1522235783Skib *	This is much better then the old way of writing only one buffer at
1523235783Skib *	This is much better than the old way of writing only one buffer at
1524235783Skib *	correct order, so we search for the cluster in both directions.
1525235783Skib */
1526235783Skibint
1527235783Skibvfs_bio_awrite(struct buf * bp)
1528235783Skib{
1529235783Skib	int i;
1530280183Sdumbbell	int j;
1531235783Skib	daddr_t lblkno = bp->b_lblkno;
1532235783Skib	struct vnode *vp = bp->b_vp;
1533280183Sdumbbell	int s;
1534280183Sdumbbell	int ncl;
1535280183Sdumbbell	struct buf *bpa;
1536235783Skib	int nwritten;
1537235783Skib	int size;
1538235783Skib	int maxcl;
1539235783Skib
1540235783Skib	s = splbio();
1541235783Skib	/*
1542235783Skib	 * right now we support clustered writing only to regular files.  If
1543235783Skib	 * we find a clusterable block we could be in the middle of a cluster
1544235783Skib	 * rather than at the beginning.
1545280183Sdumbbell	 */
1546280183Sdumbbell	if ((vp->v_type == VREG) &&
1547277487Skib	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
1548280183Sdumbbell	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
1549280183Sdumbbell
1550280183Sdumbbell		size = vp->v_mount->mnt_stat.f_iosize;
1551277487Skib		maxcl = MAXPHYS / size;
1552277487Skib
1553235783Skib		for (i = 1; i < maxcl; i++) {
1554280183Sdumbbell			if ((bpa = gbincore(vp, lblkno + i)) &&
1555280183Sdumbbell			    BUF_REFCNT(bpa) == 0 &&
1556280183Sdumbbell			    ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
1557277487Skib			    (B_DELWRI | B_CLUSTEROK)) &&
1558235783Skib			    (bpa->b_bufsize == size)) {
1559277487Skib				if ((bpa->b_blkno == bpa->b_lblkno) ||
1560235783Skib				    (bpa->b_blkno !=
1561235783Skib				     bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
1562235783Skib					break;
1563235783Skib			} else {
1564235783Skib				break;
1565235783Skib			}
1566280183Sdumbbell		}
1567235783Skib		for (j = 1; i + j <= maxcl && j <= lblkno; j++) {
1568235783Skib			if ((bpa = gbincore(vp, lblkno - j)) &&
1569280183Sdumbbell			    BUF_REFCNT(bpa) == 0 &&
1570280183Sdumbbell			    ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
1571280183Sdumbbell			    (B_DELWRI | B_CLUSTEROK)) &&
1572235783Skib			    (bpa->b_bufsize == size)) {
1573235783Skib				if ((bpa->b_blkno == bpa->b_lblkno) ||
1574235783Skib				    (bpa->b_blkno !=
1575235783Skib				     bp->b_blkno - ((j * size) >> DEV_BSHIFT)))
1576235783Skib					break;
1577235783Skib			} else {
1578235783Skib				break;
1579235783Skib			}
1580235783Skib		}
1581235783Skib		--j;
1582235783Skib		ncl = i + j;
1583235783Skib		/*
1584235783Skib		 * this is a possible cluster write
1585235783Skib		 */
1586235783Skib		if (ncl != 1) {
1587235783Skib			nwritten = cluster_wbuild(vp, size, lblkno - j, ncl);
1588235783Skib			splx(s);
1589235783Skib			return nwritten;
1590235783Skib		}
1591235783Skib	}
1592235783Skib
1593235783Skib	BUF_LOCK(bp, LK_EXCLUSIVE);
1594280183Sdumbbell	bremfree(bp);
1595235783Skib	bp->b_flags |= B_ASYNC;
1596235783Skib
1597235783Skib	splx(s);
1598235783Skib	/*
1599235783Skib	 * default (old) behavior, writing out only one block
1600280183Sdumbbell	 *
1601235783Skib	 * XXX returns b_bufsize instead of b_bcount for nwritten?
1602235783Skib	 */
1603235783Skib	nwritten = bp->b_bufsize;
1604235783Skib	(void) BUF_WRITE(bp);
1605235783Skib
1606235783Skib	return nwritten;
1607235783Skib}
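/*
 * Worked example for the contiguity test in vfs_bio_awrite() above
 * (editor's note, assuming a typical f_iosize of 8192 and DEV_BSIZE of
 * 512, i.e. DEV_BSHIFT == 9): each logical block then spans
 * 8192 >> 9 == 16 device blocks, so the buffer at lblkno + i can join
 * the cluster only when
 *
 *	bpa->b_blkno == bp->b_blkno + 16 * i
 *
 * and, symmetrically, the buffer at lblkno - j only when
 *
 *	bpa->b_blkno == bp->b_blkno - 16 * j
 *
 * Buffers whose b_blkno still equals their b_lblkno have, by the usual
 * convention, not had a physical block assigned yet and are skipped.
 */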
1608235783Skib
1609235783Skib/*
1610235783Skib *	getnewbuf:
1611235783Skib *
1612235783Skib *	Find and initialize a new buffer header, freeing up existing buffers
1613235783Skib *	in the bufqueues as necessary.  The new buffer is returned locked.
1614235783Skib *
1615235783Skib *	Important:  B_INVAL is not set.  If the caller wishes to throw the
1616235783Skib *	buffer away, the caller must set B_INVAL prior to calling brelse().
1617235783Skib *
1618235783Skib *	We block if:
1619235783Skib *		We have insufficient buffer headers
1620235783Skib *		We have insufficient buffer space
1621235783Skib *		buffer_map is too fragmented ( space reservation fails )
1622235783Skib *		If we have to flush dirty buffers ( but we try to avoid this )
1623235783Skib *
1624235783Skib *	To avoid VFS layer recursion we do not flush dirty buffers ourselves.
1625235783Skib *	Instead we ask the buf daemon to do it for us.  We attempt to
1626235783Skib *	avoid piecemeal wakeups of the pageout daemon.
1627235783Skib */
1628235783Skib
1629235783Skibstatic struct buf *
1630235783Skibgetnewbuf(int slpflag, int slptimeo, int size, int maxsize)
1631235783Skib{
1632235783Skib	struct buf *bp;
1633235783Skib	struct buf *nbp;
1634235783Skib	int defrag = 0;
1635235783Skib	int nqindex;
1636280183Sdumbbell	static int flushingbufs;
1637235783Skib
1638235783Skib	GIANT_REQUIRED;
1639235783Skib
1640280183Sdumbbell	/*
1641235783Skib	 * We can't afford to block since we might be holding a vnode lock,
1642235783Skib	 * which may prevent system daemons from running.  We deal with
1643235783Skib	 * low-memory situations by proactively returning memory and running
1644235783Skib	 * async I/O rather than sync I/O.
1645235783Skib	 */
1646235783Skib
1647235783Skib	++getnewbufcalls;
1648235783Skib	--getnewbufrestarts;
1649235783Skibrestart:
1650235783Skib	++getnewbufrestarts;
1651280183Sdumbbell
1652235783Skib	/*
1653235783Skib	 * Setup for scan.  If we do not have enough free buffers,
1654280183Sdumbbell	 * we set up a degenerate case that immediately fails.  Note
1655280183Sdumbbell	 * that if we are a specially marked process, we are allowed to
1656235783Skib	 * dip into our reserves.
1657235783Skib	 *
1658235783Skib	 * The scanning sequence is nominally:  EMPTY->EMPTYKVA->CLEAN
1659235783Skib	 *
1660235783Skib	 * We start with EMPTYKVA.  If the list is empty we back up to EMPTY.
1661235783Skib	 * However, there are a number of cases (defragging, reusing, ...)
1662235783Skib	 * where we cannot back up.
1663235783Skib	 */
1664235783Skib	nqindex = QUEUE_EMPTYKVA;
1665235783Skib	nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
1666235783Skib
1667235783Skib	if (nbp == NULL) {
1668235783Skib		/*
1669235783Skib		 * If no EMPTYKVA buffers and we are either
1670235783Skib		 * defragging or reusing, locate a CLEAN buffer
1671235783Skib		 * to free or reuse.  If bufspace usage is low
1672235783Skib		 * skip this step so we can allocate a new buffer.
1673235783Skib		 */
1674235783Skib		if (defrag || bufspace >= lobufspace) {
1675235783Skib			nqindex = QUEUE_CLEAN;
1676235783Skib			nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
1677235783Skib		}
1678235783Skib
1679235783Skib		/*
1680235783Skib		 * If we could not find or were not allowed to reuse a
1681235783Skib		 * CLEAN buffer, check to see if it is ok to use an EMPTY
1682235783Skib		 * buffer.  We can only use an EMPTY buffer if allocating
1683235783Skib		 * its KVA would not otherwise run us out of buffer space.
1684235783Skib		 */
1685235783Skib		if (nbp == NULL && defrag == 0 &&
1686280183Sdumbbell		    bufspace + maxsize < hibufspace) {
1687235783Skib			nqindex = QUEUE_EMPTY;
1688235783Skib			nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
1689235783Skib		}
1690280183Sdumbbell	}
1691235783Skib
1692235783Skib	/*
1693235783Skib	 * Run scan, possibly freeing data and/or kva mappings on the fly
1694235783Skib	 * depending.
1695235783Skib	 */
1696280183Sdumbbell
1697235783Skib	while ((bp = nbp) != NULL) {
1698235783Skib		int qindex = nqindex;
1699235783Skib
1700235783Skib		/*
1701235783Skib		 * Calculate next bp ( we can only use it if we do not block
1702235783Skib		 * or do other fancy things ).
1703235783Skib		 */
1704235783Skib		if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) {
1705235783Skib			switch(qindex) {
1706235783Skib			case QUEUE_EMPTY:
1707235783Skib				nqindex = QUEUE_EMPTYKVA;
1708235783Skib				if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA])))
1709235783Skib					break;
1710235783Skib				/* fall through */
1711235783Skib			case QUEUE_EMPTYKVA:
1712235783Skib				nqindex = QUEUE_CLEAN;
1713235783Skib				if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN])))
1714235783Skib					break;
1715235783Skib				/* fall through */
1716235783Skib			case QUEUE_CLEAN:
1717235783Skib				/*
1718235783Skib				 * nbp is NULL.
1719235783Skib				 */
1720235783Skib				break;
1721280183Sdumbbell			}
1722280183Sdumbbell		}
1723235783Skib
1724235783Skib		/*
1725280183Sdumbbell		 * Sanity Checks
1726235783Skib		 */
1727235783Skib		KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistant queue %d bp %p", qindex, bp));
1728235783Skib
1729235783Skib		/*
1730235783Skib		 * Note: we no longer distinguish between VMIO and non-VMIO
1731235783Skib		 * buffers.
1732235783Skib		 */
1733235783Skib
1734235783Skib		KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex));
1735235783Skib
1736235783Skib		/*
1737235783Skib		 * If we are defragging then we need a buffer with
1738235783Skib		 * b_kvasize != 0.  XXX this situation should no longer
1739235783Skib		 * occur; if defrag is non-zero, the buffer's b_kvasize
1740235783Skib		 * should also be non-zero at this point.  XXX
1741235783Skib		 */
1742235783Skib		if (defrag && bp->b_kvasize == 0) {
1743235783Skib			printf("Warning: defrag empty buffer %p\n", bp);
1744235783Skib			continue;
1745235783Skib		}
1746235783Skib
1747235783Skib		/*
1748235783Skib		 * Start freeing the bp.  This is somewhat involved.  nbp
1749235783Skib		 * remains valid only for QUEUE_EMPTY[KVA] bp's.
1750235783Skib		 */
1751235783Skib
1752235783Skib		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
1753235783Skib			panic("getnewbuf: locked buf");
1754235783Skib		bremfree(bp);
1755235783Skib
1756235783Skib		if (qindex == QUEUE_CLEAN) {
1757235783Skib			if (bp->b_flags & B_VMIO) {
1758235783Skib				bp->b_flags &= ~B_ASYNC;
1759235783Skib				vfs_vmio_release(bp);
1760235783Skib			}
1761280183Sdumbbell			if (bp->b_vp)
1762235783Skib				brelvp(bp);
1763235783Skib		}
1764235783Skib
1765235783Skib		/*
1766235783Skib		 * NOTE:  nbp is now entirely invalid.  We can only restart
1767235783Skib		 * the scan from this point on.
1768235783Skib		 *
1769235783Skib		 * Get the rest of the buffer freed up.  b_kva* is still
1770235783Skib		 * valid after this operation.
1771235783Skib		 */
1772235783Skib
1773235783Skib		if (bp->b_rcred != NOCRED) {
1774280183Sdumbbell			crfree(bp->b_rcred);
1775235783Skib			bp->b_rcred = NOCRED;
1776235783Skib		}
1777235783Skib		if (bp->b_wcred != NOCRED) {
1778235783Skib			crfree(bp->b_wcred);
1779235783Skib			bp->b_wcred = NOCRED;
1780235783Skib		}
1781235783Skib		if (LIST_FIRST(&bp->b_dep) != NULL)
1782235783Skib			buf_deallocate(bp);
1783235783Skib		if (bp->b_xflags & BX_BKGRDINPROG)
1784235783Skib			panic("losing buffer 3");
1785235783Skib		LIST_REMOVE(bp, b_hash);
1786235783Skib		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
1787235783Skib
1788235783Skib		if (bp->b_bufsize)
1789235783Skib			allocbuf(bp, 0);
1790235783Skib
1791235783Skib		bp->b_flags = 0;
1792280183Sdumbbell		bp->b_ioflags = 0;
1793235783Skib		bp->b_xflags = 0;
1794235783Skib		bp->b_dev = NODEV;
1795235783Skib		bp->b_vp = NULL;
1796235783Skib		bp->b_blkno = bp->b_lblkno = 0;
1797235783Skib		bp->b_offset = NOOFFSET;
1798235783Skib		bp->b_iodone = 0;
1799235783Skib		bp->b_error = 0;
1800235783Skib		bp->b_resid = 0;
1801235783Skib		bp->b_bcount = 0;
1802280183Sdumbbell		bp->b_npages = 0;
1803235783Skib		bp->b_dirtyoff = bp->b_dirtyend = 0;
1804235783Skib		bp->b_magic = B_MAGIC_BIO;
1805235783Skib		bp->b_op = &buf_ops_bio;
1806235783Skib		bp->b_object = NULL;
1807235783Skib
1808235783Skib		LIST_INIT(&bp->b_dep);
1809235783Skib
1810235783Skib		/*
1811235783Skib		 * If we are defragging then free the buffer.
1812235783Skib		 */
1813280183Sdumbbell		if (defrag) {
1814235783Skib			bp->b_flags |= B_INVAL;
1815235783Skib			bfreekva(bp);
1816235783Skib			brelse(bp);
1817235783Skib			defrag = 0;
1818235783Skib			goto restart;
1819235783Skib		}
1820235783Skib
1821235783Skib		/*
1822235783Skib		 * If we are overcommitted then recover the buffer and its
1823235783Skib		 * KVM space.  This occurs in rare situations when multiple
1824235783Skib		 * processes are blocked in getnewbuf() or allocbuf().
1825235783Skib		 */
1826235783Skib		if (bufspace >= hibufspace)
1827235783Skib			flushingbufs = 1;
1828235783Skib		if (flushingbufs && bp->b_kvasize != 0) {
1829235783Skib			bp->b_flags |= B_INVAL;
1830235783Skib			bfreekva(bp);
1831235783Skib			brelse(bp);
1832235783Skib			goto restart;
1833235783Skib		}
1834235783Skib		if (bufspace < lobufspace)
1835280183Sdumbbell			flushingbufs = 0;
1836235783Skib		break;
1837235783Skib	}
1838235783Skib
1839235783Skib	/*
1840235783Skib	 * If we exhausted our list, sleep as appropriate.  We may have to
1841235783Skib	 * wakeup various daemons and write out some dirty buffers.
1842235783Skib	 *
1843235783Skib	 * Generally we are sleeping due to insufficient buffer space.
1844235783Skib	 */
1845235783Skib
1846235783Skib	if (bp == NULL) {
1847280183Sdumbbell		int flags;
1848235783Skib		char *waitmsg;
1849235783Skib
1850235783Skib		if (defrag) {
1851280183Sdumbbell			flags = VFS_BIO_NEED_BUFSPACE;
1852235783Skib			waitmsg = "nbufkv";
1853235783Skib		} else if (bufspace >= hibufspace) {
1854235783Skib			waitmsg = "nbufbs";
1855235783Skib			flags = VFS_BIO_NEED_BUFSPACE;
1856235783Skib		} else {
1857235783Skib			waitmsg = "newbuf";
1858235783Skib			flags = VFS_BIO_NEED_ANY;
1859235783Skib		}
1860235783Skib
1861235783Skib		bd_speedup();	/* heeeelp */
1862235783Skib
1863235783Skib		needsbuffer |= flags;
1864235783Skib		while (needsbuffer & flags) {
1865235783Skib			if (tsleep(&needsbuffer, (PRIBIO + 4) | slpflag,
1866235783Skib			    waitmsg, slptimeo))
1867235783Skib				return (NULL);
1868235783Skib		}
1869235783Skib	} else {
1870235783Skib		/*
1871235783Skib		 * We finally have a valid bp.  We aren't quite out of the
1872235783Skib		 * woods; we still have to reserve kva space.  In order
1873235783Skib		 * to keep fragmentation sane we only allocate kva in
1874235783Skib		 * BKVASIZE chunks.
1875235783Skib		 */
1876235783Skib		maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;
1877235783Skib
1878235783Skib		if (maxsize != bp->b_kvasize) {
1879235783Skib			vm_offset_t addr = 0;
1880235783Skib
1881235783Skib			bfreekva(bp);
1882235783Skib
1883235783Skib			if (vm_map_findspace(buffer_map,
1884235783Skib				vm_map_min(buffer_map), maxsize, &addr)) {
1885235783Skib				/*
1886235783Skib				 * Uh oh.  Buffer map is too fragmented.  We
1887235783Skib				 * must defragment the map.
1888235783Skib				 */
1889235783Skib				++bufdefragcnt;
1890235783Skib				defrag = 1;
1891235783Skib				bp->b_flags |= B_INVAL;
1892235783Skib				brelse(bp);
1893235783Skib				goto restart;
1894235783Skib			}
1895280183Sdumbbell			if (addr) {
1896277487Skib				vm_map_insert(buffer_map, NULL, 0,
1897235783Skib					addr, addr + maxsize,
1898235783Skib					VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
1899235783Skib
1900280183Sdumbbell				bp->b_kvabase = (caddr_t) addr;
1901235783Skib				bp->b_kvasize = maxsize;
1902235783Skib				bufspace += bp->b_kvasize;
1903235783Skib				++bufreusecnt;
1904280183Sdumbbell			}
1905235783Skib		}
1906235783Skib		bp->b_data = bp->b_kvabase;
1907235783Skib	}
1908235783Skib	return(bp);
1909235783Skib}
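/*
 * Worked example for the KVA reservation above (editor's note): maxsize
 * is rounded up to a multiple of BKVASIZE, which is 16384 in the stock
 * sys/param.h of this era, so BKVAMASK == 16383.  A request for 9000
 * bytes therefore reserves
 *
 *	(9000 + 16383) & ~16383 == 16384
 *
 * bytes of buffer_map KVA, and a 20000 byte request reserves 32768.
 * Keeping all reservations in BKVASIZE chunks is what keeps buffer_map
 * fragmentation manageable.
 */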
1910235783Skib
1911280183Sdumbbell/*
1912235783Skib *	buf_daemon:
1913235783Skib *
1914235783Skib *	buffer flushing daemon.  Buffers are normally flushed by the
1915235783Skib *	update daemon but if it cannot keep up this process starts to
1916235783Skib *	take the load in an attempt to prevent getnewbuf() from blocking.
1917235783Skib */
1918280183Sdumbbell
1919235783Skibstatic struct proc *bufdaemonproc;
1920235783Skib
1921235783Skibstatic struct kproc_desc buf_kp = {
1922235783Skib	"bufdaemon",
1923235783Skib	buf_daemon,
1924235783Skib	&bufdaemonproc
1925235783Skib};
1926235783SkibSYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp)
1927235783Skib
1928235783Skibstatic void
1929235783Skibbuf_daemon()
1930235783Skib{
1931235783Skib	int s;
1932235783Skib
1933235783Skib	mtx_lock(&Giant);
1934280183Sdumbbell
1935235783Skib	/*
1936235783Skib	 * This process needs to be suspended prior to shutdown sync.
1937235783Skib	 */
1938235783Skib	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc,
1939235783Skib	    SHUTDOWN_PRI_LAST);
1940235783Skib
1941235783Skib	/*
1942280183Sdumbbell	 * This process is allowed to take the buffer cache to the limit
1943235783Skib	 */
1944235783Skib	s = splbio();
1945235783Skib
1946235783Skib	for (;;) {
1947235783Skib		kthread_suspend_check(bufdaemonproc);
1948235783Skib
1949235783Skib		bd_request = 0;
1950235783Skib
1951235783Skib		/*
1952235783Skib		 * Do the flush.  Limit the amount of in-transit I/O we
1953235783Skib		 * allow to build up, otherwise we would completely saturate
1954280183Sdumbbell		 * allow to build up; otherwise we would completely saturate
1955280183Sdumbbell		 * normally would so they can run in parallel with our drain.
1956280183Sdumbbell		 */
1957280183Sdumbbell		while (numdirtybuffers > lodirtybuffers) {
1958280183Sdumbbell			if (flushbufqueues() == 0)
1959280183Sdumbbell				break;
1960280183Sdumbbell			waitrunningbufspace();
1961280183Sdumbbell			numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2);
1962280183Sdumbbell		}
1963280183Sdumbbell
1964280183Sdumbbell		/*
1965280183Sdumbbell		 * Only clear bd_request if we have reached our low water
1966280183Sdumbbell		 * mark.  The buf_daemon normally waits 1 second and
1967280183Sdumbbell		 * then incrementally flushes any dirty buffers that have
1968280183Sdumbbell		 * built up, within reason.
1969280183Sdumbbell		 *
1970280183Sdumbbell		 * If we were unable to hit our low water mark and couldn't
1971280183Sdumbbell		 * find any flushable buffers, we sleep half a second.
1972280183Sdumbbell		 * Otherwise we loop immediately.
1973235783Skib		 */
1974235783Skib		if (numdirtybuffers <= lodirtybuffers) {
1975235783Skib			/*
1976235783Skib			 * We reached our low water mark, reset the
1977235783Skib			 * request and sleep until we are needed again.
1978235783Skib			 * The sleep is just so the suspend code works.
1979280183Sdumbbell			 */
1980235783Skib			bd_request = 0;
1981235783Skib			tsleep(&bd_request, PVM, "psleep", hz);
1982235783Skib		} else {
1983235783Skib			/*
1984235783Skib			 * We couldn't find any flushable dirty buffers but
1985235783Skib			 * still have too many dirty buffers, we
1986280183Sdumbbell			 * have to sleep and try again.  (rare)
1987235783Skib			 */
1988235783Skib			tsleep(&bd_request, PVM, "qsleep", hz / 2);
1989235783Skib		}
1990235783Skib	}
1991235783Skib}
1992235783Skib
1993235783Skib/*
1994235783Skib *	flushbufqueues:
1995280183Sdumbbell *
1996235783Skib *	Try to flush a buffer in the dirty queue.  We must be careful to
1997235783Skib *	free up B_INVAL buffers instead of writing them, which NFS is
1998235783Skib *	particularly sensitive to.
1999235783Skib */
2000280183Sdumbbell
2001293851Sdumbbellstatic int
2002280183Sdumbbellflushbufqueues(void)
2003280183Sdumbbell{
2004280183Sdumbbell	struct buf *bp;
2005280183Sdumbbell	int r = 0;
2006235783Skib
2007235783Skib	bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]);
2008280183Sdumbbell
2009280183Sdumbbell	while (bp) {
2010280183Sdumbbell		KASSERT((bp->b_flags & B_DELWRI), ("unexpected clean buffer %p", bp));
2011235783Skib		if ((bp->b_flags & B_DELWRI) != 0 &&
2012235783Skib		    (bp->b_xflags & BX_BKGRDINPROG) == 0) {
2013235783Skib			if (bp->b_flags & B_INVAL) {
2014235783Skib				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
2015235783Skib					panic("flushbufqueues: locked buf");
2016235783Skib				bremfree(bp);
2017235783Skib				brelse(bp);
2018235783Skib				++r;
2019280183Sdumbbell				break;
2020235783Skib			}
2021235783Skib			if (LIST_FIRST(&bp->b_dep) != NULL &&
2022235783Skib			    (bp->b_flags & B_DEFERRED) == 0 &&
2023235783Skib			    buf_countdeps(bp, 0)) {
2024235783Skib				TAILQ_REMOVE(&bufqueues[QUEUE_DIRTY],
2025235783Skib				    bp, b_freelist);
2026235783Skib				TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY],
2027235783Skib				    bp, b_freelist);
2028235783Skib				bp->b_flags |= B_DEFERRED;
2029235783Skib				bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]);
2030235783Skib				continue;
2031235783Skib			}
2032235783Skib			vfs_bio_awrite(bp);
2033235783Skib			++r;
2034235783Skib			break;
2035235783Skib		}
2036235783Skib		bp = TAILQ_NEXT(bp, b_freelist);
2037235783Skib	}
2038235783Skib	return (r);
2039235783Skib}
2040235783Skib
2041235783Skib/*
2042235783Skib * Check to see if a block is currently memory resident.
2043235783Skib */
2044235783Skibstruct buf *
2045235783Skibincore(struct vnode * vp, daddr_t blkno)
2046235783Skib{
2047235783Skib	struct buf *bp;
2048235783Skib
2049235783Skib	int s = splbio();
2050235783Skib	bp = gbincore(vp, blkno);
2051235783Skib	splx(s);
2052235783Skib	return (bp);
2053235783Skib}
2054235783Skib
2055235783Skib/*
2056280183Sdumbbell * Returns true if no I/O is needed to access the
2057235783Skib * associated VM object.  This is like incore except
2058280183Sdumbbell * it also hunts around in the VM system for the data.
2059280183Sdumbbell */
2060235783Skib
2061235783Skibint
2062235783Skibinmem(struct vnode * vp, daddr_t blkno)
2063235783Skib{
2064235783Skib	vm_object_t obj;
2065280183Sdumbbell	vm_offset_t toff, tinc, size;
2066235783Skib	vm_page_t m;
2067235783Skib	vm_ooffset_t off;
2068235783Skib
2069235783Skib	GIANT_REQUIRED;
2070235783Skib
2071235783Skib	if (incore(vp, blkno))
2072280183Sdumbbell		return 1;
2073235783Skib	if (vp->v_mount == NULL)
2074235783Skib		return 0;
2075235783Skib	if (VOP_GETVOBJECT(vp, &obj) != 0 || (vp->v_flag & VOBJBUF) == 0)
2076280183Sdumbbell		return 0;
2077235783Skib
2078235783Skib	size = PAGE_SIZE;
2079235783Skib	if (size > vp->v_mount->mnt_stat.f_iosize)
2080235783Skib		size = vp->v_mount->mnt_stat.f_iosize;
2081235783Skib	off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;
2082235783Skib
2083235783Skib	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
2084280183Sdumbbell		m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
2085235783Skib		if (!m)
2086235783Skib			goto notinmem;
2087235783Skib		tinc = size;
2088235783Skib		if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK))
2089235783Skib			tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK);
2090235783Skib		if (vm_page_is_valid(m,
2091235783Skib		    (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0)
2092235783Skib			goto notinmem;
2093235783Skib	}
2094235783Skib	return 1;
2095235783Skib
2096235783Skibnotinmem:
2097235783Skib	return (0);
2098235783Skib}
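/*
 * Illustrative sketch (editor's example): incore() and inmem() are
 * typically used to decide whether read-ahead style I/O can be skipped.
 * Only the two predicates below are real; the surrounding caller is
 * hypothetical.
 *
 *	if (incore(vp, lblkno) == NULL && !inmem(vp, lblkno)) {
 *		the data is neither in a buffer nor in resident VM pages,
 *		so the caller must schedule a real read
 *	}
 */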
2099235783Skib
2100280183Sdumbbell/*
2101235783Skib *	vfs_setdirty:
2102235783Skib *
2103235783Skib *	Sets the dirty range for a buffer based on the status of the dirty
2104235783Skib *	bits in the pages comprising the buffer.
2105235783Skib *
2106235783Skib *	The range is limited to the size of the buffer.
2107235783Skib *
2108235783Skib *	This routine is primarily used by NFS, but is generalized for the
2109235783Skib *	B_VMIO case.
2110235783Skib */
2111235783Skibstatic void
2112235783Skibvfs_setdirty(struct buf *bp)
2113235783Skib{
2114235783Skib	int i;
2115235783Skib	vm_object_t object;
2116235783Skib
2117235783Skib	GIANT_REQUIRED;
2118235783Skib	/*
2119235783Skib	 * Degenerate case - empty buffer
2120280183Sdumbbell	 */
2121235783Skib
2122235783Skib	if (bp->b_bufsize == 0)
2123235783Skib		return;
2124235783Skib
2125235783Skib	/*
2126235783Skib	 * We qualify the scan for modified pages on whether the
2127280183Sdumbbell	 * object has been flushed yet.  The OBJ_WRITEABLE flag
2128235783Skib	 * is not cleared simply by protecting pages off.
2129235783Skib	 */
2130235783Skib
2131235783Skib	if ((bp->b_flags & B_VMIO) == 0)
2132235783Skib		return;
2133235783Skib
2134235783Skib	object = bp->b_pages[0]->object;
2135235783Skib
2136235783Skib	if ((object->flags & OBJ_WRITEABLE) && !(object->flags & OBJ_MIGHTBEDIRTY))
2137235783Skib		printf("Warning: object %p writeable but not mightbedirty\n", object);
2138235783Skib	if (!(object->flags & OBJ_WRITEABLE) && (object->flags & OBJ_MIGHTBEDIRTY))
2139235783Skib		printf("Warning: object %p mightbedirty but not writeable\n", object);
2140235783Skib
2141235783Skib	if (object->flags & (OBJ_MIGHTBEDIRTY|OBJ_CLEANING)) {
2142235783Skib		vm_offset_t boffset;
2143235783Skib		vm_offset_t eoffset;
2144235783Skib
2145235783Skib		/*
2146235783Skib		 * test the pages to see if they have been modified directly
2147235783Skib		 * by users through the VM system.
2148235783Skib		 */
2149235783Skib		for (i = 0; i < bp->b_npages; i++) {
2150235783Skib			vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
2151235783Skib			vm_page_test_dirty(bp->b_pages[i]);
2152235783Skib		}
2153235783Skib
2154235783Skib		/*
2155235783Skib		 * Calculate the encompassing dirty range, boffset and eoffset,
2156235783Skib		 * (eoffset - boffset) bytes.
2157235783Skib		 */
2158235783Skib
2159235783Skib		for (i = 0; i < bp->b_npages; i++) {
2160235783Skib			if (bp->b_pages[i]->dirty)
2161235783Skib				break;
2162235783Skib		}
2163235783Skib		boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
2164280183Sdumbbell
2165235783Skib		for (i = bp->b_npages - 1; i >= 0; --i) {
2166235783Skib			if (bp->b_pages[i]->dirty) {
2167280183Sdumbbell				break;
2168280183Sdumbbell			}
2169235783Skib		}
2170280183Sdumbbell		eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
2171235783Skib
2172235783Skib		/*
2173235783Skib		 * Fit it to the buffer.
2174280183Sdumbbell		 */
2175280183Sdumbbell
2176280183Sdumbbell		if (eoffset > bp->b_bcount)
2177280183Sdumbbell			eoffset = bp->b_bcount;
2178235783Skib
2179280183Sdumbbell		/*
2180235783Skib		 * If we have a good dirty range, merge with the existing
2181235783Skib		 * dirty range.
2182235783Skib		 */
2183235783Skib
2184235783Skib		if (boffset < eoffset) {
2185235783Skib			if (bp->b_dirtyoff > boffset)
2186235783Skib				bp->b_dirtyoff = boffset;
2187235783Skib			if (bp->b_dirtyend < eoffset)
2188235783Skib				bp->b_dirtyend = eoffset;
2189235783Skib		}
2190235783Skib	}
2191235783Skib}
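/*
 * Worked example for vfs_setdirty() above (editor's note, assuming a
 * 4096 byte PAGE_SIZE and a page-aligned b_offset): for an 8192 byte
 * buffer in which only bp->b_pages[1] tests dirty,
 *
 *	boffset = (1 << PAGE_SHIFT) - 0 == 4096
 *	eoffset = (2 << PAGE_SHIFT) - 0 == 8192
 *
 * eoffset is then clamped to b_bcount and the range [4096, 8192) is
 * merged into b_dirtyoff/b_dirtyend.
 */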
2192280183Sdumbbell
2193235783Skib/*
2194235783Skib *	getblk:
2195235783Skib *
2196235783Skib *	Get a block given a specified block and offset into a file/device.
2197235783Skib *	The buffers B_DONE bit will be cleared on return, making it almost
2198235783Skib *	The buffer's B_DONE bit will be cleared on return, making it almost
2199235783Skib *	return.  The caller should clear B_INVAL prior to initiating a
2200235783Skib *	READ.
2201235783Skib *
2202235783Skib *	For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
2203235783Skib *	an existing buffer.
2204235783Skib *
2205235783Skib *	For a VMIO buffer, B_CACHE is modified according to the backing VM.
2206235783Skib *	If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
2207235783Skib *	and then cleared based on the backing VM.  If the previous buffer is
2208235783Skib *	non-0-sized but invalid, B_CACHE will be cleared.
2209235783Skib *
2210235783Skib *	If getblk() must create a new buffer, the new buffer is returned with
2211235783Skib *	both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
2212235783Skib *	case it is returned with B_INVAL clear and B_CACHE set based on the
2213235783Skib *	backing VM.
2214235783Skib *
2215235783Skib *	getblk() also forces a BUF_WRITE() for any B_DELWRI buffer whose
2216235783Skib *	B_CACHE bit is clear.
2217235783Skib *
2218235783Skib *	What this means, basically, is that the caller should use B_CACHE to
2219235783Skib *	determine whether the buffer is fully valid or not and should clear
2220235783Skib *	B_INVAL prior to issuing a read.  If the caller intends to validate
2221235783Skib *	the buffer by loading its data area with something, the caller needs
2222235783Skib *	to clear B_INVAL.  If the caller does this without issuing an I/O,
2223235783Skib *	the caller should set B_CACHE ( as an optimization ), else the caller
2224235783Skib *	should issue the I/O and biodone() will set B_CACHE if the I/O was
2225235783Skib *	a write attempt or if it was a successful read.  If the caller
2226235783Skib *	intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR
2227235783Skib *	prior to issuing the READ.  biodone() will *not* clear B_INVAL.
2228235783Skib */
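/*
 * Illustrative sketch (editor's addition): a simplified, hypothetical
 * rendering of the bread()-style read pattern the comment above
 * describes.  It roughly follows what bread() does in this era; credential
 * handling and statistics are omitted and the caller is made up.
 *
 *	bp = getblk(vp, blkno, size, 0, 0);
 *	if ((bp->b_flags & B_CACHE) == 0) {
 *		bp->b_iocmd = BIO_READ;
 *		bp->b_flags &= ~B_INVAL;
 *		bp->b_ioflags &= ~BIO_ERROR;
 *		vfs_busy_pages(bp, 0);
 *		VOP_STRATEGY(vp, bp);
 *		error = bufwait(bp);
 *		if (error) {
 *			brelse(bp);
 *			return (error);
 *		}
 *	}
 *	... use bp->b_data ...
 *	bqrelse(bp);
 */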
2229235783Skibstruct buf *
2230235783Skibgetblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
2231235783Skib{
2232235783Skib	struct buf *bp;
2233235783Skib	int s;
2234235783Skib	struct bufhashhdr *bh;
2235235783Skib
2236235783Skib	if (size > MAXBSIZE)
2237235783Skib		panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
2238235783Skib
2239235783Skib	s = splbio();
2240235783Skibloop:
2241235783Skib	/*
2242235783Skib	 * Block if we are low on buffers.   Certain processes are allowed
2243235783Skib	 * to completely exhaust the buffer cache.
2244235783Skib	 *
2245280183Sdumbbell	 * If this check ever becomes a bottleneck it may be better to
2246280183Sdumbbell	 * move it into the else, when gbincore() fails.  At the moment
2247235783Skib	 * it isn't a problem.
2248235783Skib	 *
2249235783Skib	 * XXX remove if 0 sections (clean this up after it's proven)
2250235783Skib	 */
2251235783Skib	if (numfreebuffers == 0) {
2252235783Skib		if (curthread == PCPU_GET(idlethread))
2253235783Skib			return NULL;
2254235783Skib		needsbuffer |= VFS_BIO_NEED_ANY;
2255235783Skib	}
2256235783Skib
2257235783Skib	if ((bp = gbincore(vp, blkno))) {
2258235783Skib		/*
2259280183Sdumbbell		 * Buffer is in-core.  If the buffer is not busy, it must
2260235783Skib		 * be on a queue.
2261235783Skib		 */
2262235783Skib
2263280183Sdumbbell		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
2264277487Skib			if (BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL,
2265277487Skib			    "getblk", slpflag, slptimeo) == ENOLCK)
2266277487Skib				goto loop;
2267277487Skib			splx(s);
2268277487Skib			return (struct buf *) NULL;
2269277487Skib		}
2270277487Skib
2271277487Skib		/*
2272277487Skib		 * The buffer is locked.  B_CACHE is cleared if the buffer is
2273277487Skib		 * invalid.  Otherwise, for a non-VMIO buffer, B_CACHE is set
2274277487Skib		 * and for a VMIO buffer B_CACHE is adjusted according to the
2275277487Skib		 * backing VM cache.
2276277487Skib		 */
2277277487Skib		if (bp->b_flags & B_INVAL)
2278277487Skib			bp->b_flags &= ~B_CACHE;
2279277487Skib		else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0)
2280277487Skib			bp->b_flags |= B_CACHE;
2281277487Skib		bremfree(bp);
2282277487Skib
2283277487Skib		/*
2284277487Skib		 * check for size inconsistencies for the non-VMIO case.
2285277487Skib		 */
2286277487Skib
2287277487Skib		if (bp->b_bcount != size) {
2288277487Skib			if ((bp->b_flags & B_VMIO) == 0 ||
2289280183Sdumbbell			    (size > bp->b_kvasize)) {
2290280183Sdumbbell				if (bp->b_flags & B_DELWRI) {
2291277487Skib					bp->b_flags |= B_NOCACHE;
2292277487Skib					BUF_WRITE(bp);
2293277487Skib				} else {
2294277487Skib					if ((bp->b_flags & B_VMIO) &&
2295277487Skib					   (LIST_FIRST(&bp->b_dep) == NULL)) {
2296277487Skib						bp->b_flags |= B_RELBUF;
2297280183Sdumbbell						brelse(bp);
2298280183Sdumbbell					} else {
2299280183Sdumbbell						bp->b_flags |= B_NOCACHE;
2300280183Sdumbbell						BUF_WRITE(bp);
2301280183Sdumbbell					}
2302280183Sdumbbell				}
2303280183Sdumbbell				goto loop;
2304277487Skib			}
2305277487Skib		}
2306277487Skib
2307277487Skib		/*
2308277487Skib		 * If the size is inconsistent in the VMIO case, we can resize
2309277487Skib		 * the buffer.  This might lead to B_CACHE getting set or
2310277487Skib		 * cleared.  If the size has not changed, B_CACHE remains
2311277487Skib		 * unchanged from its previous state.
2312235783Skib		 */
2313235783Skib
2314235783Skib		if (bp->b_bcount != size)
2315235783Skib			allocbuf(bp, size);
2316235783Skib
2317235783Skib		KASSERT(bp->b_offset != NOOFFSET,
2318235783Skib		    ("getblk: no buffer offset"));
2319235783Skib
2320235783Skib		/*
2321235783Skib		 * A buffer with B_DELWRI set and B_CACHE clear must
2322235783Skib		 * be committed before we can return the buffer in
2323235783Skib		 * order to prevent the caller from issuing a read
2324235783Skib		 * ( due to B_CACHE not being set ) and overwriting
2325235783Skib		 * it.
2326235783Skib		 *
2327235783Skib		 * Most callers, including NFS and FFS, need this to
2328235783Skib		 * operate properly either because they assume they
2329235783Skib		 * can issue a read if B_CACHE is not set, or because
2330235783Skib		 * ( for example ) an uncached B_DELWRI might loop due
2331235783Skib		 * to softupdates re-dirtying the buffer.  In the latter
2332235783Skib		 * case, B_CACHE is set after the first write completes,
2333235783Skib		 * preventing further loops.
2334235783Skib		 * NOTE!  b*write() sets B_CACHE.  If we cleared B_CACHE
2335280183Sdumbbell		 * above while extending the buffer, we cannot allow the
2336235783Skib		 * buffer to remain with B_CACHE set after the write
2337235783Skib		 * completes or it will represent a corrupt state.  To
2338280183Sdumbbell		 * deal with this we set B_NOCACHE to scrap the buffer
2339235783Skib		 * after the write.
2340280183Sdumbbell		 *
2341280183Sdumbbell		 * We might be able to do something fancy, like setting
2342280183Sdumbbell		 * B_CACHE in bwrite() except if B_DELWRI is already set,
2343280183Sdumbbell		 * so the below call doesn't set B_CACHE, but that gets real
2344280183Sdumbbell		 * confusing.  This is much easier.
2345235783Skib		 */
2346277487Skib
2347235783Skib		if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
2348280183Sdumbbell			bp->b_flags |= B_NOCACHE;
2349235783Skib			BUF_WRITE(bp);
2350235783Skib			goto loop;
2351277487Skib		}
2352235783Skib
2353280183Sdumbbell		splx(s);
2354235783Skib		bp->b_flags &= ~B_DONE;
2355235783Skib	} else {
2356277487Skib		/*
2357277487Skib		 * Buffer is not in-core, create new buffer.  The buffer
2358280183Sdumbbell		 * returned by getnewbuf() is locked.  Note that the returned
2359235783Skib		 * buffer is also considered valid (not marked B_INVAL).
2360235783Skib		 */
2361235783Skib		int bsize, maxsize, vmio;
2362280183Sdumbbell		off_t offset;
2363235783Skib
2364280183Sdumbbell		if (vn_isdisk(vp, NULL))
2365235783Skib			bsize = DEV_BSIZE;
2366235783Skib		else if (vp->v_mountedhere)
2367235783Skib			bsize = vp->v_mountedhere->mnt_stat.f_iosize;
2368235783Skib		else if (vp->v_mount)
2369235783Skib			bsize = vp->v_mount->mnt_stat.f_iosize;
2370235783Skib		else
2371235783Skib			bsize = size;
2372235783Skib
2373235783Skib		offset = blkno * bsize;
2374280183Sdumbbell		vmio = (VOP_GETVOBJECT(vp, NULL) == 0) && (vp->v_flag & VOBJBUF);
2375235783Skib		maxsize = vmio ? size + (offset & PAGE_MASK) : size;
2376235783Skib		maxsize = imax(maxsize, bsize);
2377235783Skib
2378235783Skib		if ((bp = getnewbuf(slpflag, slptimeo, size, maxsize)) == NULL) {
2379235783Skib			if (slpflag || slptimeo) {
2380235783Skib				splx(s);
2381235783Skib				return NULL;
2382235783Skib			}
2383235783Skib			goto loop;
2384235783Skib		}
2385235783Skib
2386235783Skib		/*
2387235783Skib		 * This code is used to make sure that a buffer is not
2388235783Skib		 * created while the getnewbuf routine is blocked.
2389235783Skib		 * This can be a problem whether the vnode is locked or not.
2390235783Skib		 * If the buffer is created out from under us, we have to
2391235783Skib		 * throw away the one we just created.  There is no window
2392235783Skib		 * race because we are safely running at splbio() from the
2393235783Skib		 * point of the duplicate buffer creation through to here,
2394235783Skib		 * and we've locked the buffer.
2395235783Skib		 */
2396235783Skib		if (gbincore(vp, blkno)) {
2397235783Skib			bp->b_flags |= B_INVAL;
2398235783Skib			brelse(bp);
2399235783Skib			goto loop;
2400235783Skib		}
2401235783Skib
2402235783Skib		/*
2403235783Skib		 * Insert the buffer into the hash, so that it can
2404235783Skib		 * be found by incore.
2405280183Sdumbbell		 */
2406235783Skib		bp->b_blkno = bp->b_lblkno = blkno;
2407235783Skib		bp->b_offset = offset;
2408235783Skib
2409235783Skib		bgetvp(vp, bp);
2410235783Skib		LIST_REMOVE(bp, b_hash);
2411280183Sdumbbell		bh = bufhash(vp, blkno);
2412235783Skib		LIST_INSERT_HEAD(bh, bp, b_hash);
2413235783Skib
2414235783Skib		/*
2415235783Skib		 * set B_VMIO bit.  allocbuf() the buffer bigger.  Since the
2416235783Skib		 * buffer size starts out as 0, B_CACHE will be set by
2417235783Skib		 * allocbuf() for the VMIO case prior to it testing the
2418235783Skib		 * backing store for validity.
2419235783Skib		 */
2420235783Skib
2421280183Sdumbbell		if (vmio) {
2422235783Skib			bp->b_flags |= B_VMIO;
2423235783Skib#if defined(VFS_BIO_DEBUG)
2424235783Skib			if (vp->v_type != VREG)
2425280183Sdumbbell				printf("getblk: vmioing file type %d???\n", vp->v_type);
2426235783Skib#endif
2427235783Skib			VOP_GETVOBJECT(vp, &bp->b_object);
2428235783Skib		} else {
2429235783Skib			bp->b_flags &= ~B_VMIO;
2430235783Skib			bp->b_object = NULL;
2431235783Skib		}
2432235783Skib
2433235783Skib		allocbuf(bp, size);
2434235783Skib
2435235783Skib		splx(s);
2436235783Skib		bp->b_flags &= ~B_DONE;
2437235783Skib	}
2438235783Skib	KASSERT(BUF_REFCNT(bp) == 1, ("getblk: bp %p not locked",bp));
2439235783Skib	return (bp);
2440235783Skib}
2441235783Skib
2442235783Skib/*
2443235783Skib * Get an empty, disassociated buffer of given size.  The buffer is initially
2444235783Skib * set to B_INVAL.
2445235783Skib */
2446235783Skibstruct buf *
2447235783Skibgeteblk(int size)
2448235783Skib{
2449235783Skib	struct buf *bp;
2450235783Skib	int s;
2451235783Skib	int maxsize;
2452235783Skib
2453235783Skib	maxsize = (size + BKVAMASK) & ~BKVAMASK;
2454235783Skib
2455235783Skib	s = splbio();
2456235783Skib	while ((bp = getnewbuf(0, 0, size, maxsize)) == 0);
2457235783Skib	splx(s);
2458280183Sdumbbell	allocbuf(bp, size);
2459235783Skib	bp->b_flags |= B_INVAL;	/* b_dep cleared by getnewbuf() */
2460235783Skib	KASSERT(BUF_REFCNT(bp) == 1, ("geteblk: bp %p not locked",bp));
2461235783Skib	return (bp);
2462235783Skib}
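/*
 * Illustrative sketch (editor's example): geteblk() buffers are typically
 * used as short-lived scratch space.  Because B_INVAL is already set, a
 * plain brelse() is enough to recycle the buffer when the caller is done;
 * the names below are hypothetical.
 *
 *	bp = geteblk(len);
 *	bcopy(src, bp->b_data, len);
 *	... hand bp->b_data to whatever needs a wired kernel buffer ...
 *	brelse(bp);
 */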
2463280183Sdumbbell
2464235783Skib
2465235783Skib/*
2466235783Skib * This code constitutes the buffer memory from either anonymous system
2467235783Skib * memory (in the case of non-VMIO operations) or from an associated
2468235783Skib * VM object (in the case of VMIO operations).  This code is able to
2469235783Skib * resize a buffer up or down.
2470235783Skib *
2471235783Skib * Note that this code is tricky, and has many complications to resolve
2472235783Skib * deadlock or inconsistent data situations.  Tread lightly!!!
2473257869Sdumbbell * There are B_CACHE and B_DELWRI interactions that must be dealt with by
2474235783Skib * the caller.  Calling this code willy nilly can result in the loss of data.
2475235783Skib *
2476235783Skib * allocbuf() only adjusts B_CACHE for VMIO buffers.  getblk() deals with
2477235783Skib * B_CACHE for the non-VMIO case.
2478235783Skib */
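/*
 * Worked example for the sizing logic below (editor's note, assuming
 * DEV_BSIZE == 512, PAGE_SIZE == 4096 and a page-aligned b_offset):
 * allocbuf(bp, 3000) on a VMIO buffer computes
 *
 *	newbsize     = (3000 + 511) & ~511 == 3072
 *	desiredpages = num_pages(0 + 3072) == 1
 *
 * while growing the same buffer with allocbuf(bp, 6000) gives
 * newbsize == 6144 and desiredpages == 2, so one additional page is
 * looked up (or allocated) from the backing VM object.
 */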
2479235783Skib
2480235783Skibint
2481235783Skiballocbuf(struct buf *bp, int size)
2482235783Skib{
2483235783Skib	int newbsize, mbsize;
2484235783Skib	int i;
2485235783Skib
2486235783Skib	GIANT_REQUIRED;
2487235783Skib
2488235783Skib	if (BUF_REFCNT(bp) == 0)
2489235783Skib		panic("allocbuf: buffer not busy");
2490235783Skib
2491277487Skib	if (bp->b_kvasize < size)
2492235783Skib		panic("allocbuf: buffer too small");
2493235783Skib
2494280183Sdumbbell	if ((bp->b_flags & B_VMIO) == 0) {
2495235783Skib		caddr_t origbuf;
2496235783Skib		int origbufsize;
2497235783Skib		/*
2498235783Skib		 * Just get anonymous memory from the kernel.  Don't
2499280183Sdumbbell		 * mess with B_CACHE.
2500235783Skib		 */
2501235783Skib		mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
2502235783Skib		if (bp->b_flags & B_MALLOC)
2503235783Skib			newbsize = mbsize;
2504235783Skib		else
2505280183Sdumbbell			newbsize = round_page(size);
2506235783Skib
2507235783Skib		if (newbsize < bp->b_bufsize) {
2508280183Sdumbbell			/*
2509235783Skib			 * malloced buffers are not shrunk
2510235783Skib			 */
2511235783Skib			if (bp->b_flags & B_MALLOC) {
2512235783Skib				if (newbsize) {
2513235783Skib					bp->b_bcount = size;
2514235783Skib				} else {
2515235783Skib					free(bp->b_data, M_BIOBUF);
2516280183Sdumbbell					if (bp->b_bufsize) {
2517235783Skib						bufmallocspace -= bp->b_bufsize;
2518235783Skib						bufspacewakeup();
2519235783Skib						bp->b_bufsize = 0;
2520235783Skib					}
2521235783Skib					bp->b_data = bp->b_kvabase;
2522280183Sdumbbell					bp->b_bcount = 0;
2523235783Skib					bp->b_flags &= ~B_MALLOC;
2524235783Skib				}
2525235783Skib				return 1;
2526293851Sdumbbell			}
2527280183Sdumbbell			vm_hold_free_pages(
2528280183Sdumbbell			    bp,
2529280183Sdumbbell			    (vm_offset_t) bp->b_data + newbsize,
2530280183Sdumbbell			    (vm_offset_t) bp->b_data + bp->b_bufsize);
2531235783Skib		} else if (newbsize > bp->b_bufsize) {
2532280183Sdumbbell			/*
2533280183Sdumbbell			 * We only use malloced memory on the first allocation,
2534280183Sdumbbell			 * and revert to page-allocated memory when the buffer
2535280183Sdumbbell			 * grows.
2536235783Skib			 */
2537280183Sdumbbell			if ( (bufmallocspace < maxbufmallocspace) &&
2538235783Skib				(bp->b_bufsize == 0) &&
2539235783Skib				(mbsize <= PAGE_SIZE/2)) {
2540235783Skib
2541280183Sdumbbell				bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
2542235783Skib				bp->b_bufsize = mbsize;
2543235783Skib				bp->b_bcount = size;
2544280183Sdumbbell				bp->b_flags |= B_MALLOC;
2545235783Skib				bufmallocspace += mbsize;
2546235783Skib				return 1;
2547235783Skib			}
2548235783Skib			origbuf = NULL;
2549235783Skib			origbufsize = 0;
2550235783Skib			/*
2551235783Skib			 * If the buffer is growing on its other-than-first allocation,
2552235783Skib			 * then we revert to the page-allocation scheme.
2553235783Skib			 */
2554235783Skib			if (bp->b_flags & B_MALLOC) {
2555235783Skib				origbuf = bp->b_data;
2556235783Skib				origbufsize = bp->b_bufsize;
2557235783Skib				bp->b_data = bp->b_kvabase;
2558235783Skib				if (bp->b_bufsize) {
2559235783Skib					bufmallocspace -= bp->b_bufsize;
2560235783Skib					bufspacewakeup();
2561235783Skib					bp->b_bufsize = 0;
2562235783Skib				}
2563235783Skib				bp->b_flags &= ~B_MALLOC;
2564235783Skib				newbsize = round_page(newbsize);
2565235783Skib			}
2566235783Skib			vm_hold_load_pages(
2567235783Skib			    bp,
2568235783Skib			    (vm_offset_t) bp->b_data + bp->b_bufsize,
2569235783Skib			    (vm_offset_t) bp->b_data + newbsize);
2570235783Skib			if (origbuf) {
2571235783Skib				bcopy(origbuf, bp->b_data, origbufsize);
2572235783Skib				free(origbuf, M_BIOBUF);
2573235783Skib			}
2574235783Skib		}
2575235783Skib	} else {
2576235783Skib		vm_page_t m;
2577280183Sdumbbell		int desiredpages;
2578235783Skib
2579235783Skib		newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
2580235783Skib		desiredpages = (size == 0) ? 0 :
2581235783Skib			num_pages((bp->b_offset & PAGE_MASK) + newbsize);
2582235783Skib
2583235783Skib		if (bp->b_flags & B_MALLOC)
2584235783Skib			panic("allocbuf: VMIO buffer can't be malloced");
2585235783Skib		/*
2586235783Skib		 * Set B_CACHE initially if buffer is 0 length or will become
2587235783Skib		 * 0-length.
2588235783Skib		 */
2589235783Skib		if (size == 0 || bp->b_bufsize == 0)
2590235783Skib			bp->b_flags |= B_CACHE;
2591235783Skib
2592235783Skib		if (newbsize < bp->b_bufsize) {
2593235783Skib			/*
2594235783Skib			 * DEV_BSIZE aligned new buffer size is less than the
2595235783Skib			 * DEV_BSIZE aligned existing buffer size.  Figure out
2596235783Skib			 * if we have to remove any pages.
2597235783Skib			 */
2598235783Skib			if (desiredpages < bp->b_npages) {
2599235783Skib				for (i = desiredpages; i < bp->b_npages; i++) {
2600235783Skib					/*
2601235783Skib					 * the page is not freed here -- it
2602235783Skib					 * is the responsibility of
2603235783Skib					 * vnode_pager_setsize
2604235783Skib					 */
2605235783Skib					m = bp->b_pages[i];
2606235783Skib					KASSERT(m != bogus_page,
2607235783Skib					    ("allocbuf: bogus page found"));
2608235783Skib					while (vm_page_sleep_busy(m, TRUE, "biodep"))
2609235783Skib						;
2610235783Skib
2611280183Sdumbbell					bp->b_pages[i] = NULL;
2612235783Skib					vm_page_unwire(m, 0);
2613235783Skib				}
2614235783Skib				pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) +
2615235783Skib				    (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
2616235783Skib				bp->b_npages = desiredpages;
2617235783Skib			}
2618235783Skib		} else if (size > bp->b_bcount) {
2619235783Skib			/*
2620235783Skib			 * We are growing the buffer, possibly in a
2621235783Skib			 * byte-granular fashion.
2622235783Skib			 */
2623235783Skib			struct vnode *vp;
2624235783Skib			vm_object_t obj;
2625235783Skib			vm_offset_t toff;
2626235783Skib			vm_offset_t tinc;
2627235783Skib
2628235783Skib			/*
2629235783Skib			 * Step 1, bring in the VM pages from the object,
2630235783Skib			 * allocating them if necessary.  We must clear
2631235783Skib			 * B_CACHE if these pages are not valid for the
2632235783Skib			 * range covered by the buffer.
2633280183Sdumbbell			 */
2634235783Skib
2635235783Skib			vp = bp->b_vp;
2636235783Skib			obj = bp->b_object;
2637235783Skib
2638235783Skib			while (bp->b_npages < desiredpages) {
2639235783Skib				vm_page_t m;
2640235783Skib				vm_pindex_t pi;
2641235783Skib
2642235783Skib				pi = OFF_TO_IDX(bp->b_offset) + bp->b_npages;
2643235783Skib				if ((m = vm_page_lookup(obj, pi)) == NULL) {
2644235783Skib					/*
2645235783Skib					 * note: must allocate system pages
2646235783Skib					 * since blocking here could interfere
2647235783Skib					 * with paging I/O, no matter which
2648235783Skib					 * process we are.
2649235783Skib					 */
2650235783Skib					m = vm_page_alloc(obj, pi, VM_ALLOC_SYSTEM);
2651235783Skib					if (m == NULL) {
2652235783Skib						VM_WAIT;
2653235783Skib						vm_pageout_deficit += desiredpages - bp->b_npages;
2654235783Skib					} else {
2655235783Skib						vm_page_wire(m);
2656235783Skib						vm_page_wakeup(m);
2657235783Skib						bp->b_flags &= ~B_CACHE;
2658235783Skib						bp->b_pages[bp->b_npages] = m;
2659235783Skib						++bp->b_npages;
2660235783Skib					}
2661235783Skib					continue;
2662235783Skib				}
2663235783Skib
2664235783Skib				/*
2665235783Skib				 * We found a page.  If we have to sleep on it,
2666235783Skib				 * retry because it might have gotten freed out
2667280183Sdumbbell				 * from under us.
2668235783Skib				 *
2669235783Skib				 * We can only test PG_BUSY here.  Blocking on
2670235783Skib				 * m->busy might lead to a deadlock:
2671235783Skib				 *
2672235783Skib				 *  vm_fault->getpages->cluster_read->allocbuf
2673235783Skib				 *
2674235783Skib				 */
2675235783Skib
2676235783Skib				if (vm_page_sleep_busy(m, FALSE, "pgtblk"))
2677235783Skib					continue;
2678235783Skib
2679235783Skib				/*
2680235783Skib				 * We have a good page.  Should we wakeup the
2681235783Skib				 * page daemon?
2682235783Skib				 */
2683235783Skib				if ((curproc != pageproc) &&
2684235783Skib				    ((m->queue - m->pc) == PQ_CACHE) &&
2685235783Skib				    ((cnt.v_free_count + cnt.v_cache_count) <
2686235783Skib					(cnt.v_free_min + cnt.v_cache_min))) {
2687235783Skib					pagedaemon_wakeup();
2688235783Skib				}
2689235783Skib				vm_page_flag_clear(m, PG_ZERO);
2690277487Skib				vm_page_wire(m);
2691235783Skib				bp->b_pages[bp->b_npages] = m;
2692235783Skib				++bp->b_npages;
2693235783Skib			}
2694235783Skib
2695235783Skib			/*
2696235783Skib			 * Step 2.  We've loaded the pages into the buffer,
2697235783Skib			 * we have to figure out if we can still have B_CACHE
2698235783Skib			 * set.  Note that B_CACHE is set according to the
2699235783Skib			 * byte-granular range ( bcount and size ), not the
2700235783Skib			 * aligned range ( newbsize ).
2701235783Skib			 *
2702235783Skib			 * The VM test is against m->valid, which is DEV_BSIZE
2703235783Skib			 * aligned.  Needless to say, the validity of the data
2704235783Skib			 * needs to also be DEV_BSIZE aligned.  Note that this
2705235783Skib			 * fails with NFS if the server or some other client
2706235783Skib			 * extends the file's EOF.  If our buffer is resized,
2707235783Skib			 * B_CACHE may remain set! XXX
2708235783Skib			 */
2709235783Skib
2710235783Skib			toff = bp->b_bcount;
2711235783Skib			tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK);
2712235783Skib
2713235783Skib			while ((bp->b_flags & B_CACHE) && toff < size) {
2714235783Skib				vm_pindex_t pi;
2715235783Skib
2716235783Skib				if (tinc > (size - toff))
2717235783Skib					tinc = size - toff;
2718235783Skib
2719235783Skib				pi = ((bp->b_offset & PAGE_MASK) + toff) >>
2720235783Skib				    PAGE_SHIFT;
2721235783Skib
2722235783Skib				vfs_buf_test_cache(
2723235783Skib				    bp,
2724235783Skib				    bp->b_offset,
2725235783Skib				    toff,
2726235783Skib				    tinc,
2727235783Skib				    bp->b_pages[pi]
2728235783Skib				);
2729235783Skib				toff += tinc;
2730235783Skib				tinc = PAGE_SIZE;
2731235783Skib			}
2732235783Skib
2733235783Skib			/*
2734235783Skib			 * Step 3, fixup the KVM pmap.  Remember that
2735235783Skib			 * bp->b_data is relative to bp->b_offset, but
2736235783Skib			 * bp->b_offset may be offset into the first page.
2737235783Skib			 */
2738235783Skib
2739235783Skib			bp->b_data = (caddr_t)
2740235783Skib			    trunc_page((vm_offset_t)bp->b_data);
2741235783Skib			pmap_qenter(
2742235783Skib			    (vm_offset_t)bp->b_data,
2743235783Skib			    bp->b_pages,
2744277487Skib			    bp->b_npages
2745235783Skib			);
2746235783Skib
2747235783Skib			bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
2748235783Skib			    (vm_offset_t)(bp->b_offset & PAGE_MASK));
2749235783Skib		}
2750235783Skib	}
2751235783Skib	if (newbsize < bp->b_bufsize)
2752235783Skib		bufspacewakeup();
2753235783Skib	bp->b_bufsize = newbsize;	/* actual buffer allocation	*/
2754235783Skib	bp->b_bcount = size;		/* requested buffer size	*/
2755235783Skib	return 1;
2756235783Skib}
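
/*
 * Worked example for the Step 2 loop above (illustrative only; the numbers
 * are assumed, not taken from the original code):  with PAGE_SIZE 0x1000,
 * b_offset 0x1a00, an old b_bcount of 0x1000 and a new size of 0x3000, the
 * walk starts at toff 0x1000 with tinc 0x600 so that the first chunk ends
 * exactly on a page boundary.  It then tests byte ranges [0x1000,0x1600),
 * [0x1600,0x2600) and [0x2600,0x3000) against pages 1, 2 and 3 of
 * b_pages[], stopping early as soon as vfs_buf_test_cache() finds a chunk
 * that is not fully valid and clears B_CACHE.
 */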
2757235783Skib
2758235783Skib/*
2759235783Skib *	bufwait:
2760235783Skib *
2761235783Skib *	Wait for buffer I/O completion, returning error status.  The buffer
2762235783Skib *	is left locked and B_DONE on return.  B_EINTR is converted into an EINTR
2763235783Skib *	error and cleared.
2764235783Skib */
2765235783Skibint
2766235783Skibbufwait(register struct buf * bp)
2767235783Skib{
2768235783Skib	int s;
2769235783Skib
2770235783Skib	s = splbio();
2771235783Skib	while ((bp->b_flags & B_DONE) == 0) {
2772235783Skib		if (bp->b_iocmd == BIO_READ)
2773235783Skib			tsleep(bp, PRIBIO, "biord", 0);
2774235783Skib		else
2775235783Skib			tsleep(bp, PRIBIO, "biowr", 0);
2776235783Skib	}
2777293851Sdumbbell	splx(s);
2778280183Sdumbbell	if (bp->b_flags & B_EINTR) {
2779280183Sdumbbell		bp->b_flags &= ~B_EINTR;
2780235783Skib		return (EINTR);
2781235783Skib	}
2782235783Skib	if (bp->b_ioflags & BIO_ERROR) {
2783293851Sdumbbell		return (bp->b_error ? bp->b_error : EIO);
2784280183Sdumbbell	} else {
2785280183Sdumbbell		return (0);
2786235783Skib	}
2787235783Skib}
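
/*
 * Illustrative sketch, kept under #if 0 so it is never compiled: the usual
 * caller-side pattern around bufwait().  The helper name and its arguments
 * (vp, blkno, size) are assumptions for the example, not part of this file;
 * the bread()/breadn() family follows essentially this shape.
 */
#if 0
static int
example_sync_read(struct vnode *vp, daddr_t blkno, int size, struct buf **bpp)
{
	struct buf *bp;

	bp = getblk(vp, blkno, size, 0, 0);
	*bpp = bp;
	if ((bp->b_flags & B_CACHE) == 0) {
		/* Not fully cached: issue the read and sleep until B_DONE. */
		bp->b_iocmd = BIO_READ;
		bp->b_flags &= ~B_INVAL;
		bp->b_ioflags &= ~BIO_ERROR;
		vfs_busy_pages(bp, 0);
		VOP_STRATEGY(vp, bp);
		return (bufwait(bp));	/* 0, EINTR, or the I/O error */
	}
	return (0);
}
#endif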
2788235783Skib
2789235783Skib /*
2790235783Skib  * Callback function from struct bio back up to struct buf.
2791280183Sdumbbell  * The corresponding initialization lives in sys/conf.h:DEV_STRATEGY().
2792235783Skib  */
2793235783Skibvoid
2794235783Skibbufdonebio(struct bio *bp)
2795235783Skib{
2796235783Skib	bufdone(bp->bio_caller2);
2797235783Skib}
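
/*
 * Illustrative sketch, #if 0'd out: the wiring bufdonebio() relies on.  A
 * struct bio handed to the driver keeps a back pointer to its struct buf in
 * bio_caller2 and uses bufdonebio() as its completion hook, so the driver
 * level biodone() eventually funnels into bufdone() below.  The helper name
 * is an assumption for the example; see DEV_STRATEGY() for the real setup.
 */
#if 0
static void
example_wire_bio(struct buf *bp, struct bio *bip)
{
	bip->bio_done = bufdonebio;	/* driver completion re-enters here */
	bip->bio_caller2 = bp;		/* ... and finds its buffer again */
}
#endif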
2798235783Skib
2799235783Skib/*
2800235783Skib *	bufdone:
2801235783Skib *
2802235783Skib *	Finish I/O on a buffer, optionally calling a completion function.
2803235783Skib *	This is usually called from an interrupt so process blocking is
2804235783Skib *	not allowed.
2805235783Skib *
2806280183Sdumbbell *	biodone is also responsible for setting B_CACHE in a B_VMIO bp.
2807235783Skib *	In a non-VMIO bp, B_CACHE will be set on the next getblk()
2808280183Sdumbbell *	assuming B_INVAL is clear.
2809235783Skib *
2810235783Skib *	For the VMIO case, we set B_CACHE if the op was a read and no
2811235783Skib *	read error occurred, or if the op was a write.  B_CACHE is never
2812235783Skib *	set if the buffer is invalid or otherwise uncacheable.
2813235783Skib *
2814235783Skib *	biodone does not mess with B_INVAL, allowing the I/O routine or the
2815235783Skib *	initiator to leave B_INVAL set to brelse the buffer out of existence
2816235783Skib *	in the biodone routine.
2817235783Skib */
2818235783Skibvoid
2819235783Skibbufdone(struct buf *bp)
2820235783Skib{
2821235783Skib	int s;
2822235783Skib	void    (*biodone)(struct buf *);
2823235783Skib
2824235783Skib	GIANT_REQUIRED;
2825235783Skib
2826235783Skib	s = splbio();
2827235783Skib
2828235783Skib	KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp, BUF_REFCNT(bp)));
2829235783Skib	KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
2830235783Skib
2831235783Skib	bp->b_flags |= B_DONE;
2832235783Skib	runningbufwakeup(bp);
2833235783Skib
2834235783Skib	if (bp->b_iocmd == BIO_DELETE) {
2835235783Skib		brelse(bp);
2836280183Sdumbbell		splx(s);
2837235783Skib		return;
2838277487Skib	}
2839277487Skib
2840277487Skib	if (bp->b_iocmd == BIO_WRITE) {
2841277487Skib		vwakeup(bp);
2842277487Skib	}
2843277487Skib
2844277487Skib	/* call optional completion function if requested */
2845277487Skib	if (bp->b_iodone != NULL) {
2846277487Skib		biodone = bp->b_iodone;
2847277487Skib		bp->b_iodone = NULL;
2848277487Skib		(*biodone) (bp);
2849277487Skib		splx(s);
2850277487Skib		return;
2851277487Skib	}
2852277487Skib	if (LIST_FIRST(&bp->b_dep) != NULL)
2853277487Skib		buf_complete(bp);
2854277487Skib
2855277487Skib	if (bp->b_flags & B_VMIO) {
2856277487Skib		int i;
2857277487Skib		vm_ooffset_t foff;
2858277487Skib		vm_page_t m;
2859277487Skib		vm_object_t obj;
2860277487Skib		int iosize;
2861277487Skib		struct vnode *vp = bp->b_vp;
2862277487Skib
2863277487Skib		obj = bp->b_object;
2864280183Sdumbbell
2865277487Skib#if defined(VFS_BIO_DEBUG)
2866235783Skib		if (vp->v_usecount == 0) {
2867235783Skib			panic("biodone: zero vnode ref count");
2868235783Skib		}
2869235783Skib
2870235783Skib		if ((vp->v_flag & VOBJBUF) == 0) {
2871235783Skib			panic("biodone: vnode is not setup for merged cache");
2872235783Skib		}
2873235783Skib#endif
2874235783Skib
2875235783Skib		foff = bp->b_offset;
2876235783Skib		KASSERT(bp->b_offset != NOOFFSET,
2877235783Skib		    ("biodone: no buffer offset"));
2878235783Skib
2879235783Skib#if defined(VFS_BIO_DEBUG)
2880235783Skib		if (obj->paging_in_progress < bp->b_npages) {
2881235783Skib			printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
2882235783Skib			    obj->paging_in_progress, bp->b_npages);
2883280183Sdumbbell		}
2884235783Skib#endif
2885235783Skib
2886235783Skib		/*
2887235783Skib		 * Set B_CACHE if the op was a normal read and no error
2888235783Skib		 * occurred.  B_CACHE is set for writes in the b*write()
2889235783Skib		 * routines.
2890277487Skib		 */
2891235783Skib		iosize = bp->b_bcount - bp->b_resid;
2892235783Skib		if (bp->b_iocmd == BIO_READ &&
2893277487Skib		    !(bp->b_flags & (B_INVAL|B_NOCACHE)) &&
2894277487Skib		    !(bp->b_ioflags & BIO_ERROR)) {
2895277487Skib			bp->b_flags |= B_CACHE;
2896277487Skib		}
2897277487Skib
2898277487Skib		for (i = 0; i < bp->b_npages; i++) {
2899277487Skib			int bogusflag = 0;
2900235783Skib			int resid;
2901235783Skib
2902235783Skib			resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
2903235783Skib			if (resid > iosize)
2904235783Skib				resid = iosize;
2905235783Skib
2906235783Skib			/*
2907235783Skib			 * Clean up bogus pages, restoring the originals
2908235783Skib			 */
2909235783Skib			m = bp->b_pages[i];
2910235783Skib			if (m == bogus_page) {
2911293851Sdumbbell				bogusflag = 1;
2912280183Sdumbbell				m = vm_page_lookup(obj, OFF_TO_IDX(foff));
2913280183Sdumbbell				if (m == NULL)
2914235783Skib					panic("biodone: page disappeared!");
2915235783Skib				bp->b_pages[i] = m;
2916235783Skib				pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
2917235783Skib			}
2918235783Skib#if defined(VFS_BIO_DEBUG)
2919235783Skib			if (OFF_TO_IDX(foff) != m->pindex) {
2920235783Skib				printf(
2921235783Skib"biodone: foff(%jd)/m->pindex(%ju) mismatch\n",
2922235783Skib				    (intmax_t)foff, (uintmax_t)m->pindex);
2923280183Sdumbbell			}
2924235783Skib#endif
2925235783Skib
2926235783Skib			/*
2927235783Skib			 * In the write case, the valid and clean bits are
2928235783Skib			 * already changed correctly ( see bdwrite() ), so we
2929235783Skib			 * only need to do this here in the read case.
2930235783Skib			 */
2931235783Skib			if ((bp->b_iocmd == BIO_READ) && !bogusflag && resid > 0) {
2932235783Skib				vfs_page_set_valid(bp, foff, i, m);
2933235783Skib			}
2934235783Skib			vm_page_flag_clear(m, PG_ZERO);
2935235783Skib
2936235783Skib			 * When debugging new filesystems or buffer I/O methods, this
2937235783Skib			 * is the most common error that pops up.  If you see this, you
2938235783Skib			 * have not set the page busy flag correctly!
2939235783Skib			 * have not set the page busy flag correctly!!!
2940280183Sdumbbell			 */
2941235783Skib			if (m->busy == 0) {
2942277487Skib				printf("biodone: page busy < 0, "
2943277487Skib				    "pindex: %d, foff: 0x(%x,%x), "
2944277487Skib				    "resid: %d, index: %d\n",
2945277487Skib				    (int) m->pindex, (int)(foff >> 32),
2946277487Skib						(int) foff & 0xffffffff, resid, i);
2947277487Skib				if (!vn_isdisk(vp, NULL))
2948277487Skib					printf(" iosize: %ld, lblkno: %jd, flags: 0x%lx, npages: %d\n",
2949280183Sdumbbell					    bp->b_vp->v_mount->mnt_stat.f_iosize,
2950277487Skib					    (intmax_t) bp->b_lblkno,
2951277487Skib					    bp->b_flags, bp->b_npages);
2952277487Skib				else
2953277487Skib					printf(" VDEV, lblkno: %jd, flags: 0x%lx, npages: %d\n",
2954280183Sdumbbell					    (intmax_t) bp->b_lblkno,
2955277487Skib					    bp->b_flags, bp->b_npages);
2956277487Skib				printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n",
2957277487Skib				    m->valid, m->dirty, m->wire_count);
2958277487Skib				panic("biodone: page busy < 0\n");
2959277487Skib			}
2960280183Sdumbbell			vm_page_io_finish(m);
2961277487Skib			vm_object_pip_subtract(obj, 1);
2962277487Skib			foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
2963277487Skib			iosize -= resid;
2964277487Skib		}
2965235783Skib		if (obj)
2966235783Skib			vm_object_pip_wakeupn(obj, 0);
2967277487Skib	}
2968277487Skib
2969277487Skib	/*
2970277487Skib	 * For asynchronous completions, release the buffer now. The brelse
2971235783Skib	 * will do a wakeup there if necessary - so no need to do a wakeup
2972235783Skib	 * here in the async case. The sync case always needs to do a wakeup.
2973235783Skib	 */
2974277487Skib
2975235783Skib	if (bp->b_flags & B_ASYNC) {
2976280183Sdumbbell		if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR))
2977235783Skib			brelse(bp);
2978277487Skib		else
2979235783Skib			bqrelse(bp);
2980235783Skib	} else {
2981235783Skib		wakeup(bp);
2982235783Skib	}
2983277487Skib	splx(s);
2984277487Skib}
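
/*
 * Illustrative sketch, #if 0'd out: the b_iodone contract enforced by
 * bufdone() above.  When a completion function is installed, bufdone()
 * calls it and returns immediately, so the handler itself must release the
 * buffer.  The helper names and the greatly simplified write setup are
 * assumptions for the example only.
 */
#if 0
static void
example_iodone(struct buf *bp)
{
	/* Runs from bufdone(); bp->b_iodone has already been cleared. */
	if (bp->b_ioflags & BIO_ERROR)
		bp->b_flags |= B_INVAL;	/* toss the failed buffer */
	brelse(bp);
}

static void
example_async_write(struct buf *bp)
{
	bp->b_iocmd = BIO_WRITE;
	bp->b_flags |= B_ASYNC;
	bp->b_iodone = example_iodone;	/* consumed by bufdone() */
	vfs_busy_pages(bp, 1);
	VOP_STRATEGY(bp->b_vp, bp);
}
#endif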
2985277487Skib
2986277487Skib/*
2987235783Skib * This routine is called in lieu of iodone in the case of
2988235783Skib * incomplete I/O.  This keeps the busy status for pages
2989235783Skib * consistent.
2990277487Skib */
2991235783Skibvoid
2992280183Sdumbbellvfs_unbusy_pages(struct buf * bp)
2993235783Skib{
2994235783Skib	int i;
2995235783Skib
2996235783Skib	GIANT_REQUIRED;
2997235783Skib
2998235783Skib	runningbufwakeup(bp);
2999235783Skib	if (bp->b_flags & B_VMIO) {
3000235783Skib		vm_object_t obj;
3001235783Skib
3002235783Skib		obj = bp->b_object;
3003235783Skib
3004235783Skib		for (i = 0; i < bp->b_npages; i++) {
3005235783Skib			vm_page_t m = bp->b_pages[i];
3006235783Skib
3007235783Skib			if (m == bogus_page) {
3008280183Sdumbbell				m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
3009280183Sdumbbell				if (!m) {
3010280183Sdumbbell					panic("vfs_unbusy_pages: page missing\n");
3011235783Skib				}
3012235783Skib				bp->b_pages[i] = m;
3013235783Skib				pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
3014235783Skib			}
3015235783Skib			vm_object_pip_subtract(obj, 1);
3016235783Skib			vm_page_flag_clear(m, PG_ZERO);
3017235783Skib			vm_page_io_finish(m);
3018235783Skib		}
3019235783Skib		vm_object_pip_wakeupn(obj, 0);
3020235783Skib	}
3021235783Skib}
3022235783Skib
3023277487Skib/*
3024235783Skib * vfs_page_set_valid:
3025235783Skib *
3026235783Skib *	Set the valid bits in a page based on the supplied offset.   The
3027235783Skib *	range is restricted to the buffer's size.
3028235783Skib *
3029235783Skib *	This routine is typically called after a read completes.
3030235783Skib */
3031235783Skibstatic void
3032235783Skibvfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
3033235783Skib{
3034235783Skib	vm_ooffset_t soff, eoff;
3035235783Skib
3036235783Skib	GIANT_REQUIRED;
3037235783Skib	/*
3038280183Sdumbbell	 * Start and end offsets in buffer.  eoff - soff may not cross a
3039235783Skib	 * page boundary or cross the end of the buffer.  The end of the
3040280183Sdumbbell	 * buffer, in this case, is our file EOF, not the allocation size
3041235783Skib	 * of the buffer.
3042235783Skib	 */
3043235783Skib	soff = off;
3044235783Skib	eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK;
3045235783Skib	if (eoff > bp->b_offset + bp->b_bcount)
3046235783Skib		eoff = bp->b_offset + bp->b_bcount;
3047235783Skib
3048277487Skib	/*
3049235783Skib	 * Set valid range.  This is typically the entire buffer and thus the
3050235783Skib	 * entire page.
3051280183Sdumbbell	 */
3052235783Skib	if (eoff > soff) {
3053235783Skib		vm_page_set_validclean(
3054280183Sdumbbell		    m,
3055235783Skib		   (vm_offset_t) (soff & PAGE_MASK),
3056235783Skib		   (vm_offset_t) (eoff - soff)
3057235783Skib		);
3058235783Skib	}
3059280183Sdumbbell}
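
/*
 * Worked example for the offset arithmetic above (illustrative only; the
 * numbers are assumed):  with PAGE_SIZE 0x1000, b_offset 0x1a00 and
 * b_bcount 0xc00, the buffer covers object offsets [0x1a00, 0x2600).  A
 * call for the first page (off == 0x1a00) rounds eoff up to 0x2000 and
 * marks bytes [0xa00, 0x1000) of that page valid and clean; a call for the
 * second page (off == 0x2000) clips eoff from 0x3000 down to 0x2600 and
 * marks bytes [0x000, 0x600) of that page.
 */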
3060280183Sdumbbell
3061235783Skib/*
3062235783Skib * This routine is called before a device strategy routine.
3063235783Skib * It is used to tell the VM system that paging I/O is in
3064235783Skib * progress, and treat the pages associated with the buffer
3065235783Skib * almost as being PG_BUSY.  Also the object paging_in_progress
3066235783Skib * flag is handled to make sure that the object doesn't become
3067235783Skib * inconsistent.
3068235783Skib *
3069235783Skib * Since I/O has not been initiated yet, certain buffer flags
3070235783Skib * such as BIO_ERROR or B_INVAL may be in an inconsistent state
3071235783Skib * and should be ignored.
3072235783Skib */
3073280183Sdumbbellvoid
3074280183Sdumbbellvfs_busy_pages(struct buf * bp, int clear_modify)
3075235783Skib{
3076235783Skib	int i, bogus;
3077280183Sdumbbell
3078235783Skib	GIANT_REQUIRED;
3079235783Skib
3080235783Skib	if (bp->b_flags & B_VMIO) {
3081235783Skib		vm_object_t obj;
3082280183Sdumbbell		vm_ooffset_t foff;
3083235783Skib
3084235783Skib		obj = bp->b_object;
3085235783Skib		foff = bp->b_offset;
3086235783Skib		KASSERT(bp->b_offset != NOOFFSET,
3087235783Skib		    ("vfs_busy_pages: no buffer offset"));
3088235783Skib		vfs_setdirty(bp);
3089235783Skib
3090235783Skibretry:
3091235783Skib		for (i = 0; i < bp->b_npages; i++) {
3092235783Skib			vm_page_t m = bp->b_pages[i];
3093235783Skib			if (vm_page_sleep_busy(m, FALSE, "vbpage"))
3094235783Skib				goto retry;
3095235783Skib		}
3096235783Skib
3097235783Skib		bogus = 0;
3098235783Skib		for (i = 0; i < bp->b_npages; i++) {
3099235783Skib			vm_page_t m = bp->b_pages[i];
3100235783Skib
3101235783Skib			vm_page_flag_clear(m, PG_ZERO);
3102235783Skib			if ((bp->b_flags & B_CLUSTER) == 0) {
3103235783Skib				vm_object_pip_add(obj, 1);
3104235783Skib				vm_page_io_start(m);
3105235783Skib			}
3106280183Sdumbbell
3107293851Sdumbbell			/*
3108280183Sdumbbell			 * When readying a buffer for a read ( i.e.
3109280183Sdumbbell			 * clear_modify == 0 ), it is important to do
3110235783Skib			 * bogus_page replacement for valid pages in
3111235783Skib			 * partially instantiated buffers.  Partially
3112235783Skib			 * instantiated buffers can, in turn, occur when
3113235783Skib			 * reconstituting a buffer from its VM backing store
3114280183Sdumbbell			 * base.  We only have to do this if B_CACHE is
3115235783Skib			 * clear ( which causes the I/O to occur in the
3116235783Skib			 * first place ).  The replacement prevents the read
3117235783Skib			 * I/O from overwriting potentially dirty VM-backed
3118235783Skib			 * pages.  XXX bogus page replacement is, uh, bogus.
3119235783Skib			 * It may not work properly with small-block devices.
3120235783Skib			 * We need to find a better way.
3121235783Skib			 */
3122235783Skib
3123235783Skib			vm_page_protect(m, VM_PROT_NONE);
3124235783Skib			if (clear_modify)
3125235783Skib				vfs_page_set_valid(bp, foff, i, m);
3126235783Skib			else if (m->valid == VM_PAGE_BITS_ALL &&
3127235783Skib				(bp->b_flags & B_CACHE) == 0) {
3128235783Skib				bp->b_pages[i] = bogus_page;
3129235783Skib				bogus++;
3130235783Skib			}
3131235783Skib			foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
3132235783Skib		}
3133235783Skib		if (bogus)
3134235783Skib			pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
3135235783Skib	}
3136235783Skib}
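
/*
 * Illustrative sketch, #if 0'd out: the usual pairing around a strategy
 * call.  vfs_busy_pages() is invoked just before the transfer is handed to
 * the driver; on normal completion bufdone() finishes the pages, while a
 * transfer that is never issued at all must be unwound with
 * vfs_unbusy_pages() instead.  The helper name and the "abandon" condition
 * are assumptions for the example.
 */
#if 0
static void
example_start_read(struct buf *bp, int abandon)
{
	bp->b_iocmd = BIO_READ;
	vfs_busy_pages(bp, 0);		/* may substitute bogus_page */
	if (abandon) {
		/* bufdone() will never run; undo the accounting here. */
		vfs_unbusy_pages(bp);
		bp->b_flags |= B_INVAL;
		brelse(bp);
		return;
	}
	VOP_STRATEGY(bp->b_vp, bp);	/* pages are finished in bufdone() */
}
#endif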
3137235783Skib
3138235783Skib/*
3139235783Skib * Tell the VM system that the pages associated with this buffer
3140280183Sdumbbell * are clean.  This is used for delayed writes where the data is
3141235783Skib * going to go to disk eventually without additional VM intervention.
3142235783Skib *
3143235783Skib * Note that while we only really need to clean through to b_bcount, we
3144235783Skib * just go ahead and clean through to b_bufsize.
3145235783Skib */
3146235783Skibstatic void
3147235783Skibvfs_clean_pages(struct buf * bp)
3148235783Skib{
3149235783Skib	int i;
3150235783Skib
3151235783Skib	GIANT_REQUIRED;
3152235783Skib
3153235783Skib	if (bp->b_flags & B_VMIO) {
3154280183Sdumbbell		vm_ooffset_t foff;
3155280183Sdumbbell
3156235783Skib		foff = bp->b_offset;
3157235783Skib		KASSERT(bp->b_offset != NOOFFSET,
3158235783Skib		    ("vfs_clean_pages: no buffer offset"));
3159235783Skib		for (i = 0; i < bp->b_npages; i++) {
3160235783Skib			vm_page_t m = bp->b_pages[i];
3161235783Skib			vm_ooffset_t noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
3162235783Skib			vm_ooffset_t eoff = noff;
3163235783Skib
3164235783Skib			if (eoff > bp->b_offset + bp->b_bufsize)
3165235783Skib				eoff = bp->b_offset + bp->b_bufsize;
3166235783Skib			vfs_page_set_valid(bp, foff, i, m);
3167235783Skib			/* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */
3168235783Skib			foff = noff;
3169235783Skib		}
3170235783Skib	}
3171277487Skib}
3172235783Skib
3173235783Skib/*
3174235783Skib *	vfs_bio_set_validclean:
3175235783Skib *
3176235783Skib *	Set the range within the buffer to valid and clean.  The range is
3177235783Skib *	relative to the beginning of the buffer, b_offset.  Note that b_offset
3178235783Skib *	itself may be offset from the beginning of the first page.
3179280183Sdumbbell *
3180235783Skib */
3181235783Skib
3182235783Skibvoid
3183235783Skibvfs_bio_set_validclean(struct buf *bp, int base, int size)
3184235783Skib{
3185235783Skib	if (bp->b_flags & B_VMIO) {
3186280183Sdumbbell		int i;
3187280183Sdumbbell		int n;
3188235783Skib
3189280183Sdumbbell		/*
3190235783Skib		 * Fixup base to be relative to beginning of first page.
3191235783Skib		 * Set initial n to be the maximum number of bytes in the
3192235783Skib		 * first page that can be validated.
3193235783Skib		 */
3194235783Skib
3195280183Sdumbbell		base += (bp->b_offset & PAGE_MASK);
3196235783Skib		n = PAGE_SIZE - (base & PAGE_MASK);
3197277487Skib
3198280183Sdumbbell		for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
3199277487Skib			vm_page_t m = bp->b_pages[i];
3200277487Skib
3201277487Skib			if (n > size)
3202277487Skib				n = size;
3203277487Skib
3204277487Skib			vm_page_set_validclean(m, base & PAGE_MASK, n);
3205277487Skib			base += n;
3206277487Skib			size -= n;
3207277487Skib			n = PAGE_SIZE;
3208280183Sdumbbell		}
3209277487Skib	}
3210277487Skib}
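
/*
 * Worked example for the arithmetic above (illustrative only; the numbers
 * are assumed):  with PAGE_SIZE 0x1000 and b_offset & PAGE_MASK == 0x600,
 * a call with base 0x300 and size 0x1000 first rebases to 0x900, validates
 * bytes [0x900, 0x1000) of page 0 (n == 0x700), and then validates bytes
 * [0x000, 0x900) of page 1 with the remaining 0x900 bytes.
 */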
3211277487Skib
3212280183Sdumbbell/*
3213280183Sdumbbell *	vfs_bio_clrbuf:
3214280183Sdumbbell *
3215277487Skib *	Clear a buffer.  This routine essentially fakes an I/O, so we need
3216277487Skib *	to clear BIO_ERROR and B_INVAL.
3217277487Skib *
3218277487Skib *	Note that while we only theoretically need to clear through b_bcount,
3219277487Skib *	we go ahead and clear through b_bufsize.
3220277487Skib */
3221277487Skib
3222277487Skibvoid
3223277487Skibvfs_bio_clrbuf(struct buf *bp)
3224235783Skib{
3225235783Skib	int i, mask = 0;
3226235783Skib	caddr_t sa, ea;
3227277487Skib
3228277487Skib	GIANT_REQUIRED;
3229277487Skib
3230277487Skib	if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) {
3231277487Skib		bp->b_flags &= ~B_INVAL;
3232277487Skib		bp->b_ioflags &= ~BIO_ERROR;
3233277487Skib		if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
3234277487Skib		    (bp->b_offset & PAGE_MASK) == 0) {
3235277487Skib			mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
3236277487Skib			if ((bp->b_pages[0]->valid & mask) == mask) {
3237277487Skib				bp->b_resid = 0;
3238277487Skib				return;
3239277487Skib			}
3240277487Skib			if (((bp->b_pages[0]->flags & PG_ZERO) == 0) &&
3241277487Skib			    ((bp->b_pages[0]->valid & mask) == 0)) {
3242277487Skib				bzero(bp->b_data, bp->b_bufsize);
3243277487Skib				bp->b_pages[0]->valid |= mask;
3244277487Skib				bp->b_resid = 0;
3245277487Skib				return;
3246277487Skib			}
3247277487Skib		}
3248277487Skib		ea = sa = bp->b_data;
3249277487Skib		for(i=0;i<bp->b_npages;i++,sa=ea) {
3250277487Skib			int j = ((vm_offset_t)sa & PAGE_MASK) / DEV_BSIZE;
3251277487Skib			ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE);
3252277487Skib			ea = (caddr_t)(vm_offset_t)ulmin(
3253277487Skib			    (u_long)(vm_offset_t)ea,
3254277487Skib			    (u_long)(vm_offset_t)bp->b_data + bp->b_bufsize);
3255277487Skib			mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
3256280183Sdumbbell			if ((bp->b_pages[i]->valid & mask) == mask)
3257277487Skib				continue;
3258277487Skib			if ((bp->b_pages[i]->valid & mask) == 0) {
3259277487Skib				if ((bp->b_pages[i]->flags & PG_ZERO) == 0) {
3260277487Skib					bzero(sa, ea - sa);
3261277487Skib				}
3262277487Skib			} else {
3263277487Skib				for (; sa < ea; sa += DEV_BSIZE, j++) {
3264277487Skib					if (((bp->b_pages[i]->flags & PG_ZERO) == 0) &&
3265277487Skib						(bp->b_pages[i]->valid & (1<<j)) == 0)
3266277487Skib						bzero(sa, DEV_BSIZE);
3267277487Skib				}
3268277487Skib			}
3269277487Skib			bp->b_pages[i]->valid |= mask;
3270277487Skib			vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
3271277487Skib		}
3272277487Skib		bp->b_resid = 0;
3273277487Skib	} else {
3274277487Skib		clrbuf(bp);
3275277487Skib	}
3276277487Skib}
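
/*
 * Illustrative sketch, #if 0'd out: a typical use of vfs_bio_clrbuf().  A
 * filesystem that has just allocated a brand-new block has nothing useful
 * on disk to read, so it clears the buffer instead of issuing a BIO_READ;
 * only the DEV_BSIZE chunks that are not already valid get zeroed.  The
 * helper name and arguments are assumptions for the example.
 */
#if 0
static struct buf *
example_fresh_block(struct vnode *vp, daddr_t blkno, int size)
{
	struct buf *bp;

	bp = getblk(vp, blkno, size, 0, 0);
	vfs_bio_clrbuf(bp);		/* fakes the read; clears B_INVAL */
	return (bp);
}
#endif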
3277277487Skib
3278277487Skib/*
3279277487Skib * vm_hold_load_pages and vm_hold_free_pages get pages into
3280277487Skib * a buffer's address space.  The pages are anonymous and are
3281277487Skib * not associated with a file object.
3282277487Skib */
3283277487Skibstatic void
3284277487Skibvm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
3285277487Skib{
3286277487Skib	vm_offset_t pg;
3287277487Skib	vm_page_t p;
3288277487Skib	int index;
3289277487Skib
3290277487Skib	GIANT_REQUIRED;
3291277487Skib
3292277487Skib	to = round_page(to);
3293277487Skib	from = round_page(from);
3294235783Skib	index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
3295277487Skib
3296235783Skib	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
3297277487Skibtryagain:
3298277487Skib		/*
3299277487Skib		 * note: must allocate system pages since blocking here
3300277487Skib		 * could interfere with paging I/O, no matter which
3301235783Skib		 * process we are.
3302235783Skib		 */
3303235783Skib		p = vm_page_alloc(kernel_object,
3304235783Skib			((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
3305235783Skib		    VM_ALLOC_SYSTEM);
3306235783Skib		if (!p) {
3307277487Skib			vm_pageout_deficit += (to - from) >> PAGE_SHIFT;
3308235783Skib			VM_WAIT;
3309277487Skib			goto tryagain;
3310235783Skib		}
3311235783Skib		vm_page_wire(p);
3312277487Skib		p->valid = VM_PAGE_BITS_ALL;
3313277487Skib		vm_page_flag_clear(p, PG_ZERO);
3314277487Skib		pmap_qenter(pg, &p, 1);
3315277487Skib		bp->b_pages[index] = p;
3316235783Skib		vm_page_wakeup(p);
3317277487Skib	}
3318277487Skib	bp->b_npages = index;
3319277487Skib}
3320277487Skib
3321277487Skib/* Return pages associated with this buf to the vm system */
3322277487Skibvoid
3323277487Skibvm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
3324277487Skib{
3325277487Skib	vm_offset_t pg;
3326277487Skib	vm_page_t p;
3327280183Sdumbbell	int index, newnpages;
3328280183Sdumbbell
3329277487Skib	GIANT_REQUIRED;
3330277487Skib
3331277487Skib	from = round_page(from);
3332280183Sdumbbell	to = round_page(to);
3333280183Sdumbbell	newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
3334277487Skib
3335277487Skib	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
3336277487Skib		p = bp->b_pages[index];
3337277487Skib		if (p && (index < bp->b_npages)) {
3338277487Skib			if (p->busy) {
3339235783Skib				printf(
3340277487Skib			    "vm_hold_free_pages: blkno: %jd, lblkno: %jd\n",
3341277487Skib				    (intmax_t)bp->b_blkno,
3342277487Skib				    (intmax_t)bp->b_lblkno);
3343277487Skib			}
3344277487Skib			bp->b_pages[index] = NULL;
3345235783Skib			pmap_qremove(pg, 1);
3346277487Skib			vm_page_busy(p);
3347277487Skib			vm_page_unwire(p, 0);
3348277487Skib			vm_page_free(p);
3349277487Skib		}
3350277487Skib	}
3351277487Skib	bp->b_npages = newnpages;
3352277487Skib}
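
/*
 * Illustrative sketch, #if 0'd out: how the two routines above bracket the
 * anonymous pages behind a non-VMIO buffer when its allocation changes.
 * The helper name and the newbsize argument are assumptions for the
 * example; allocbuf() performs essentially these calls for non-B_VMIO
 * buffers.
 */
#if 0
static void
example_resize_kva(struct buf *bp, int newbsize)
{
	if (newbsize > bp->b_bufsize)
		vm_hold_load_pages(bp,
		    (vm_offset_t)bp->b_data + bp->b_bufsize,
		    (vm_offset_t)bp->b_data + newbsize);
	else
		vm_hold_free_pages(bp,
		    (vm_offset_t)bp->b_data + newbsize,
		    (vm_offset_t)bp->b_data + bp->b_bufsize);
}
#endif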
3353277487Skib
3354277487Skib
3355277487Skib#include "opt_ddb.h"
3356277487Skib#ifdef DDB
3357277487Skib#include <ddb/ddb.h>
3358277487Skib
3359277487Skib/* DDB command to show buffer data */
3360277487SkibDB_SHOW_COMMAND(buffer, db_show_buffer)
3361277487Skib{
3362277487Skib	/* get args */
3363235783Skib	struct buf *bp = (struct buf *)addr;
3364277487Skib
3365277487Skib	if (!have_addr) {
3366235783Skib		db_printf("usage: show buffer <addr>\n");
3367277487Skib		return;
3368277487Skib	}
3369277487Skib
3370277487Skib	db_printf("b_flags = 0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS);
3371277487Skib	db_printf(
3372235783Skib	    "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n"
3373235783Skib	    "b_dev = (%d,%d), b_data = %p, b_blkno = %jd, b_pblkno = %jd\n",
3374277487Skib	    bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
3375277487Skib	    major(bp->b_dev), minor(bp->b_dev), bp->b_data,
3376277487Skib	    (intmax_t)bp->b_blkno, (intmax_t)bp->b_pblkno);
3377235783Skib	if (bp->b_npages) {
3378277487Skib		int i;
3379235783Skib		db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
3380277487Skib		for (i = 0; i < bp->b_npages; i++) {
3381277487Skib			vm_page_t m;
3382235783Skib			m = bp->b_pages[i];
3383277487Skib			db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object,
3384277487Skib			    (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m));
3385277487Skib			if ((i + 1) < bp->b_npages)
3386277487Skib				db_printf(",");
3387277487Skib		}
3388277487Skib		db_printf("\n");
3389277487Skib	}
3390277487Skib}
3391277487Skib#endif /* DDB */
3392277487Skib