vfs_bio.c revision 252330
1/*-
2 * Copyright (c) 2004 Poul-Henning Kamp
3 * Copyright (c) 1994,1997 John S. Dyson
4 * Copyright (c) 2013 The FreeBSD Foundation
5 * All rights reserved.
6 *
7 * Portions of this software were developed by Konstantin Belousov
8 * under sponsorship from the FreeBSD Foundation.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32/*
33 * this file contains a new buffer I/O scheme implementing a coherent
34 * VM object and buffer cache scheme.  Pains have been taken to make
35 * sure that the performance degradation associated with schemes such
36 * as this is not realized.
37 *
38 * Author:  John S. Dyson
39 * Significant help during the development and debugging phases
40 * had been provided by David Greenman, also of the FreeBSD core team.
41 *
42 * see man buf(9) for more info.
43 */
44
45#include <sys/cdefs.h>
46__FBSDID("$FreeBSD: head/sys/kern/vfs_bio.c 252330 2013-06-28 03:51:20Z jeff $");
47
48#include <sys/param.h>
49#include <sys/systm.h>
50#include <sys/bio.h>
51#include <sys/conf.h>
52#include <sys/buf.h>
53#include <sys/devicestat.h>
54#include <sys/eventhandler.h>
55#include <sys/fail.h>
56#include <sys/limits.h>
57#include <sys/lock.h>
58#include <sys/malloc.h>
59#include <sys/mount.h>
60#include <sys/mutex.h>
61#include <sys/kernel.h>
62#include <sys/kthread.h>
63#include <sys/proc.h>
64#include <sys/resourcevar.h>
65#include <sys/rwlock.h>
66#include <sys/sysctl.h>
67#include <sys/vmem.h>
68#include <sys/vmmeter.h>
69#include <sys/vnode.h>
70#include <geom/geom.h>
71#include <vm/vm.h>
72#include <vm/vm_param.h>
73#include <vm/vm_kern.h>
74#include <vm/vm_pageout.h>
75#include <vm/vm_page.h>
76#include <vm/vm_object.h>
77#include <vm/vm_extern.h>
78#include <vm/vm_map.h>
79#include "opt_compat.h"
80#include "opt_directio.h"
81#include "opt_swap.h"
82
83static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer");
84
85struct	bio_ops bioops;		/* I/O operation notification */
86
87struct	buf_ops buf_ops_bio = {
88	.bop_name	=	"buf_ops_bio",
89	.bop_write	=	bufwrite,
90	.bop_strategy	=	bufstrategy,
91	.bop_sync	=	bufsync,
92	.bop_bdflush	=	bufbdflush,
93};
94
95/*
96 * XXX buf is global because kern_shutdown.c and ffs_checkoverlap has
97 * carnal knowledge of buffers.  This knowledge should be moved to vfs_bio.c.
98 */
99struct buf *buf;		/* buffer header pool */
100caddr_t unmapped_buf;
101
102static struct proc *bufdaemonproc;
103
104static int inmem(struct vnode *vp, daddr_t blkno);
105static void vm_hold_free_pages(struct buf *bp, int newbsize);
106static void vm_hold_load_pages(struct buf *bp, vm_offset_t from,
107		vm_offset_t to);
108static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m);
109static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off,
110		vm_page_t m);
111static void vfs_drain_busy_pages(struct buf *bp);
112static void vfs_clean_pages_dirty_buf(struct buf *bp);
113static void vfs_setdirty_locked_object(struct buf *bp);
114static void vfs_vmio_release(struct buf *bp);
115static int vfs_bio_clcheck(struct vnode *vp, int size,
116		daddr_t lblkno, daddr_t blkno);
117static int buf_flush(struct vnode *vp, int);
118static int flushbufqueues(struct vnode *, int, int);
119static void buf_daemon(void);
120static void bremfreel(struct buf *bp);
121static __inline void bd_wakeup(void);
122#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
123    defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
124static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
125#endif
126
127int vmiodirenable = TRUE;
128SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0,
129    "Use the VM system for directory writes");
130long runningbufspace;
131SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0,
132    "Amount of presently outstanding async buffer io");
133static long bufspace;
134#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
135    defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
136SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD,
137    &bufspace, 0, sysctl_bufspace, "L", "Virtual memory used for buffers");
138#else
139SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0,
140    "Virtual memory used for buffers");
141#endif
142static long unmapped_bufspace;
143SYSCTL_LONG(_vfs, OID_AUTO, unmapped_bufspace, CTLFLAG_RD,
144    &unmapped_bufspace, 0,
145    "Amount of unmapped buffers, inclusive in the bufspace");
146static long maxbufspace;
147SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0,
148    "Maximum allowed value of bufspace (including buf_daemon)");
149static long bufmallocspace;
150SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
151    "Amount of malloced memory for buffers");
152static long maxbufmallocspace;
153SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0,
154    "Maximum amount of malloced memory for buffers");
155static long lobufspace;
156SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0,
157    "Minimum amount of buffers we want to have");
158long hibufspace;
159SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0,
160    "Maximum allowed value of bufspace (excluding buf_daemon)");
161static int bufreusecnt;
162SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW, &bufreusecnt, 0,
163    "Number of times we have reused a buffer");
164static int buffreekvacnt;
165SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0,
166    "Number of times we have freed the KVA space from some buffer");
167static int bufdefragcnt;
168SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0,
169    "Number of times we have had to repeat buffer allocation to defragment");
170static long lorunningspace;
171SYSCTL_LONG(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW, &lorunningspace, 0,
172    "Minimum preferred space used for in-progress I/O");
173static long hirunningspace;
174SYSCTL_LONG(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW, &hirunningspace, 0,
175    "Maximum amount of space to use for in-progress I/O");
176int dirtybufferflushes;
177SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes,
178    0, "Number of bdwrite to bawrite conversions to limit dirty buffers");
179int bdwriteskip;
180SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip,
181    0, "Number of buffers supplied to bdwrite with snapshot deadlock risk");
182int altbufferflushes;
183SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes,
184    0, "Number of fsync flushes to limit dirty buffers");
185static int recursiveflushes;
186SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes,
187    0, "Number of flushes skipped due to being recursive");
188static int numdirtybuffers;
189SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0,
190    "Number of buffers that are dirty (has unwritten changes) at the moment");
191static int lodirtybuffers;
192SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0,
193    "How many buffers we want to have free before bufdaemon can sleep");
194static int hidirtybuffers;
195SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0,
196    "When the number of dirty buffers is considered severe");
197int dirtybufthresh;
198SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh,
199    0, "Number of bdwrite to bawrite conversions to clear dirty buffers");
200static int numfreebuffers;
201SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0,
202    "Number of free buffers");
203static int lofreebuffers;
204SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0,
205   "XXX Unused");
206static int hifreebuffers;
207SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0,
208   "XXX Complicatedly unused");
209static int getnewbufcalls;
210SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0,
211   "Number of calls to getnewbuf");
212static int getnewbufrestarts;
213SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0,
214    "Number of times getnewbuf has had to restart a buffer aquisition");
215static int mappingrestarts;
216SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0,
217    "Number of times getblk has had to restart a buffer mapping for "
218    "unmapped buffer");
219static int flushbufqtarget = 100;
220SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
221    "Amount of work to do in flushbufqueues when helping bufdaemon");
222static long notbufdflushes;
223SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes, 0,
224    "Number of dirty buffer flushes done by the bufdaemon helpers");
225static long barrierwrites;
226SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0,
227    "Number of barrier writes");
228SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD,
229    &unmapped_buf_allowed, 0,
230    "Permit the use of the unmapped i/o");
231
232/*
233 * Lock for the non-dirty bufqueues
234 */
235static struct mtx_padalign bqclean;
236
237/*
238 * Lock for the dirty queue.
239 */
240static struct mtx_padalign bqdirty;
241
242/*
243 * This lock synchronizes access to bd_request.
244 */
245static struct mtx_padalign bdlock;
246
247/*
248 * This lock protects the runningbufreq and synchronizes runningbufwakeup and
249 * waitrunningbufspace().
250 */
251static struct mtx_padalign rbreqlock;
252
253/*
254 * Lock that protects needsbuffer and the sleeps/wakeups surrounding it.
255 */
256static struct mtx_padalign nblock;
257
258/*
259 * Lock that protects bdirtywait.
260 */
261static struct mtx_padalign bdirtylock;
262
263/*
264 * Wakeup point for bufdaemon, as well as indicator of whether it is already
265 * active.  Set to 1 when the bufdaemon is already "on" the queue, 0 when it
266 * is idling.
267 */
268static int bd_request;
269
270/*
271 * Request for the buf daemon to write more buffers than is indicated by
272 * lodirtybuf.  This may be necessary to push out excess dependencies or
273 * defragment the address space where a simple count of the number of dirty
274 * buffers is insufficient to characterize the demand for flushing them.
275 */
276static int bd_speedupreq;
277
278/*
279 * bogus page -- for I/O to/from partially complete buffers
280 * this is a temporary solution to the problem, but it is not
281 * really that bad.  it would be better to split the buffer
282 * for input in the case of buffers partially already in memory,
283 * but the code is intricate enough already.
284 */
285vm_page_t bogus_page;
286
287/*
288 * Synchronization (sleep/wakeup) variable for active buffer space requests.
289 * Set when wait starts, cleared prior to wakeup().
290 * Used in runningbufwakeup() and waitrunningbufspace().
291 */
292static int runningbufreq;
293
294/*
295 * Synchronization (sleep/wakeup) variable for buffer requests.
296 * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
297 * by and/or.
298 * Used in numdirtywakeup(), bufspacewakeup(), bufcountadd(), bwillwrite(),
299 * getnewbuf(), and getblk().
300 */
301static int needsbuffer;
302
303/*
304 * Synchronization for bwillwrite() waiters.
305 */
306static int bdirtywait;
307
308/*
309 * Definitions for the buffer free lists.
310 */
311#define BUFFER_QUEUES	5	/* number of free buffer queues */
312
313#define QUEUE_NONE	0	/* on no queue */
314#define QUEUE_CLEAN	1	/* non-B_DELWRI buffers */
315#define QUEUE_DIRTY	2	/* B_DELWRI buffers */
316#define QUEUE_EMPTYKVA	3	/* empty buffer headers w/KVA assignment */
317#define QUEUE_EMPTY	4	/* empty buffer headers */
318#define QUEUE_SENTINEL	1024	/* not an queue index, but mark for sentinel */
319
320/* Queues for free buffers with various properties */
321static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
322#ifdef INVARIANTS
323static int bq_len[BUFFER_QUEUES];
324#endif
325
326/*
327 * Single global constant for BUF_WMESG, to avoid getting multiple references.
328 * buf_wmesg is referred from macros.
329 */
330const char *buf_wmesg = BUF_WMESG;
331
332#define VFS_BIO_NEED_ANY	0x01	/* any freeable buffer */
333#define VFS_BIO_NEED_FREE	0x04	/* wait for free bufs, hi hysteresis */
334#define VFS_BIO_NEED_BUFSPACE	0x08	/* wait for buf space, lo hysteresis */
335
336#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
337    defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
338static int
339sysctl_bufspace(SYSCTL_HANDLER_ARGS)
340{
341	long lvalue;
342	int ivalue;
343
344	if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long))
345		return (sysctl_handle_long(oidp, arg1, arg2, req));
346	lvalue = *(long *)arg1;
347	if (lvalue > INT_MAX)
348		/* On overflow, still write out a long to trigger ENOMEM. */
349		return (sysctl_handle_long(oidp, &lvalue, 0, req));
350	ivalue = lvalue;
351	return (sysctl_handle_int(oidp, &ivalue, 0, req));
352}
353#endif
354
355#ifdef DIRECTIO
356extern void ffs_rawread_setup(void);
357#endif /* DIRECTIO */
358
359/*
360 *	bqlock:
361 *
362 *	Return the appropriate queue lock based on the index.
363 */
364static inline struct mtx *
365bqlock(int qindex)
366{
367
368	if (qindex == QUEUE_DIRTY)
369		return (struct mtx *)(&bqdirty);
370	return (struct mtx *)(&bqclean);
371}
372
373/*
374 *	bdirtywakeup:
375 *
376 *	Wakeup any bwillwrite() waiters.
377 */
378static void
379bdirtywakeup(void)
380{
381	mtx_lock(&bdirtylock);
382	if (bdirtywait) {
383		bdirtywait = 0;
384		wakeup(&bdirtywait);
385	}
386	mtx_unlock(&bdirtylock);
387}
388
389/*
390 *	bdirtysub:
391 *
392 *	Decrement the numdirtybuffers count by one and wakeup any
393 *	threads blocked in bwillwrite().
394 */
395static void
396bdirtysub(void)
397{
398
399	if (atomic_fetchadd_int(&numdirtybuffers, -1) ==
400	    (lodirtybuffers + hidirtybuffers) / 2)
401		bdirtywakeup();
402}
403
404/*
405 *	bdirtyadd:
406 *
407 *	Increment the numdirtybuffers count by one and wakeup the buf
408 *	daemon if needed.
409 */
410static void
411bdirtyadd(void)
412{
413
414	/*
415	 * Only do the wakeup once as we cross the boundary.  The
416	 * buf daemon will keep running until the condition clears.
417	 */
418	if (atomic_fetchadd_int(&numdirtybuffers, 1) ==
419	    (lodirtybuffers + hidirtybuffers) / 2)
420		bd_wakeup();
421}
422
423/*
424 *	bufspacewakeup:
425 *
426 *	Called when buffer space is potentially available for recovery.
427 *	getnewbuf() will block on this flag when it is unable to free
428 *	sufficient buffer space.  Buffer space becomes recoverable when
429 *	bp's get placed back in the queues.
430 */
431
432static __inline void
433bufspacewakeup(void)
434{
435
436	/*
437	 * If someone is waiting for BUF space, wake them up.  Even
438	 * though we haven't freed the kva space yet, the waiting
439	 * process will be able to now.
440	 */
441	mtx_lock(&nblock);
442	if (needsbuffer & VFS_BIO_NEED_BUFSPACE) {
443		needsbuffer &= ~VFS_BIO_NEED_BUFSPACE;
444		wakeup(&needsbuffer);
445	}
446	mtx_unlock(&nblock);
447}
448
449/*
450 *	runningwakeup:
451 *
452 *	Wake up processes that are waiting on asynchronous writes to fall
453 *	below lorunningspace.
454 */
455static void
456runningwakeup(void)
457{
458
459	mtx_lock(&rbreqlock);
460	if (runningbufreq) {
461		runningbufreq = 0;
462		wakeup(&runningbufreq);
463	}
464	mtx_unlock(&rbreqlock);
465}
466
467/*
468 *	runningbufwakeup:
469 *
470 *	Decrement the outstanding write count according.
471 */
472void
473runningbufwakeup(struct buf *bp)
474{
475	long space, bspace;
476
477	if (bp->b_runningbufspace == 0)
478		return;
479	space = atomic_fetchadd_long(&runningbufspace, -bp->b_runningbufspace);
480	bspace = bp->b_runningbufspace;
481	bp->b_runningbufspace = 0;
482	/*
483	 * Only acquire the lock and wakeup on the transition from exceeding
484	 * the threshold to falling below it.
485	 */
486	if (space < lorunningspace)
487		return;
488	if (space - bspace > lorunningspace)
489		return;
490	runningwakeup();
491}
492
493/*
494 *	bufcountadd:
495 *
496 *	Called when a buffer has been added to one of the free queues to
497 *	account for the buffer and to wakeup anyone waiting for free buffers.
498 *	This typically occurs when large amounts of metadata are being handled
499 *	by the buffer cache ( else buffer space runs out first, usually ).
500 */
501static __inline void
502bufcountadd(struct buf *bp)
503{
504	int old;
505
506	KASSERT((bp->b_flags & B_INFREECNT) == 0,
507	    ("buf %p already counted as free", bp));
508	bp->b_flags |= B_INFREECNT;
509	old = atomic_fetchadd_int(&numfreebuffers, 1);
510	KASSERT(old >= 0 && old < nbuf,
511	    ("numfreebuffers climbed to %d", old + 1));
512	mtx_lock(&nblock);
513	if (needsbuffer) {
514		needsbuffer &= ~VFS_BIO_NEED_ANY;
515		if (numfreebuffers >= hifreebuffers)
516			needsbuffer &= ~VFS_BIO_NEED_FREE;
517		wakeup(&needsbuffer);
518	}
519	mtx_unlock(&nblock);
520}
521
522/*
523 *	bufcountsub:
524 *
525 *	Decrement the numfreebuffers count as needed.
526 */
527static void
528bufcountsub(struct buf *bp)
529{
530	int old;
531
532	/*
533	 * Fixup numfreebuffers count.  If the buffer is invalid or not
534	 * delayed-write, the buffer was free and we must decrement
535	 * numfreebuffers.
536	 */
537	if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
538		KASSERT((bp->b_flags & B_INFREECNT) != 0,
539		    ("buf %p not counted in numfreebuffers", bp));
540		bp->b_flags &= ~B_INFREECNT;
541		old = atomic_fetchadd_int(&numfreebuffers, -1);
542		KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1));
543	}
544}
545
546/*
547 *	waitrunningbufspace()
548 *
549 *	runningbufspace is a measure of the amount of I/O currently
550 *	running.  This routine is used in async-write situations to
551 *	prevent creating huge backups of pending writes to a device.
552 *	Only asynchronous writes are governed by this function.
553 *
554 *	This does NOT turn an async write into a sync write.  It waits
555 *	for earlier writes to complete and generally returns before the
556 *	caller's write has reached the device.
557 */
558void
559waitrunningbufspace(void)
560{
561
562	mtx_lock(&rbreqlock);
563	while (runningbufspace > hirunningspace) {
564		++runningbufreq;
565		msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0);
566	}
567	mtx_unlock(&rbreqlock);
568}
569
570
571/*
572 *	vfs_buf_test_cache:
573 *
574 *	Called when a buffer is extended.  This function clears the B_CACHE
575 *	bit if the newly extended portion of the buffer does not contain
576 *	valid data.
577 */
578static __inline
579void
580vfs_buf_test_cache(struct buf *bp,
581		  vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
582		  vm_page_t m)
583{
584
585	VM_OBJECT_ASSERT_WLOCKED(m->object);
586	if (bp->b_flags & B_CACHE) {
587		int base = (foff + off) & PAGE_MASK;
588		if (vm_page_is_valid(m, base, size) == 0)
589			bp->b_flags &= ~B_CACHE;
590	}
591}
592
593/* Wake up the buffer daemon if necessary */
594static __inline void
595bd_wakeup(void)
596{
597
598	mtx_lock(&bdlock);
599	if (bd_request == 0) {
600		bd_request = 1;
601		wakeup(&bd_request);
602	}
603	mtx_unlock(&bdlock);
604}
605
606/*
607 * bd_speedup - speedup the buffer cache flushing code
608 */
609void
610bd_speedup(void)
611{
612	int needwake;
613
614	mtx_lock(&bdlock);
615	needwake = 0;
616	if (bd_speedupreq == 0 || bd_request == 0)
617		needwake = 1;
618	bd_speedupreq = 1;
619	bd_request = 1;
620	if (needwake)
621		wakeup(&bd_request);
622	mtx_unlock(&bdlock);
623}
624
625#ifdef __i386__
626#define	TRANSIENT_DENOM	5
627#else
628#define	TRANSIENT_DENOM 10
629#endif
630
631/*
632 * Calculating buffer cache scaling values and reserve space for buffer
633 * headers.  This is called during low level kernel initialization and
634 * may be called more then once.  We CANNOT write to the memory area
635 * being reserved at this time.
636 */
637caddr_t
638kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est)
639{
640	int tuned_nbuf;
641	long maxbuf, maxbuf_sz, buf_sz,	biotmap_sz;
642
643	/*
644	 * physmem_est is in pages.  Convert it to kilobytes (assumes
645	 * PAGE_SIZE is >= 1K)
646	 */
647	physmem_est = physmem_est * (PAGE_SIZE / 1024);
648
649	/*
650	 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
651	 * For the first 64MB of ram nominally allocate sufficient buffers to
652	 * cover 1/4 of our ram.  Beyond the first 64MB allocate additional
653	 * buffers to cover 1/10 of our ram over 64MB.  When auto-sizing
654	 * the buffer cache we limit the eventual kva reservation to
655	 * maxbcache bytes.
656	 *
657	 * factor represents the 1/4 x ram conversion.
658	 */
659	if (nbuf == 0) {
660		int factor = 4 * BKVASIZE / 1024;
661
662		nbuf = 50;
663		if (physmem_est > 4096)
664			nbuf += min((physmem_est - 4096) / factor,
665			    65536 / factor);
666		if (physmem_est > 65536)
667			nbuf += min((physmem_est - 65536) * 2 / (factor * 5),
668			    32 * 1024 * 1024 / (factor * 5));
669
670		if (maxbcache && nbuf > maxbcache / BKVASIZE)
671			nbuf = maxbcache / BKVASIZE;
672		tuned_nbuf = 1;
673	} else
674		tuned_nbuf = 0;
675
676	/* XXX Avoid unsigned long overflows later on with maxbufspace. */
677	maxbuf = (LONG_MAX / 3) / BKVASIZE;
678	if (nbuf > maxbuf) {
679		if (!tuned_nbuf)
680			printf("Warning: nbufs lowered from %d to %ld\n", nbuf,
681			    maxbuf);
682		nbuf = maxbuf;
683	}
684
685	/*
686	 * Ideal allocation size for the transient bio submap if 10%
687	 * of the maximal space buffer map.  This roughly corresponds
688	 * to the amount of the buffer mapped for typical UFS load.
689	 *
690	 * Clip the buffer map to reserve space for the transient
691	 * BIOs, if its extent is bigger than 90% (80% on i386) of the
692	 * maximum buffer map extent on the platform.
693	 *
694	 * The fall-back to the maxbuf in case of maxbcache unset,
695	 * allows to not trim the buffer KVA for the architectures
696	 * with ample KVA space.
697	 */
698	if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) {
699		maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE;
700		buf_sz = (long)nbuf * BKVASIZE;
701		if (buf_sz < maxbuf_sz / TRANSIENT_DENOM *
702		    (TRANSIENT_DENOM - 1)) {
703			/*
704			 * There is more KVA than memory.  Do not
705			 * adjust buffer map size, and assign the rest
706			 * of maxbuf to transient map.
707			 */
708			biotmap_sz = maxbuf_sz - buf_sz;
709		} else {
710			/*
711			 * Buffer map spans all KVA we could afford on
712			 * this platform.  Give 10% (20% on i386) of
713			 * the buffer map to the transient bio map.
714			 */
715			biotmap_sz = buf_sz / TRANSIENT_DENOM;
716			buf_sz -= biotmap_sz;
717		}
718		if (biotmap_sz / INT_MAX > MAXPHYS)
719			bio_transient_maxcnt = INT_MAX;
720		else
721			bio_transient_maxcnt = biotmap_sz / MAXPHYS;
722		/*
723		 * Artifically limit to 1024 simultaneous in-flight I/Os
724		 * using the transient mapping.
725		 */
726		if (bio_transient_maxcnt > 1024)
727			bio_transient_maxcnt = 1024;
728		if (tuned_nbuf)
729			nbuf = buf_sz / BKVASIZE;
730	}
731
732	/*
733	 * swbufs are used as temporary holders for I/O, such as paging I/O.
734	 * We have no less then 16 and no more then 256.
735	 */
736	nswbuf = max(min(nbuf/4, 256), 16);
737#ifdef NSWBUF_MIN
738	if (nswbuf < NSWBUF_MIN)
739		nswbuf = NSWBUF_MIN;
740#endif
741#ifdef DIRECTIO
742	ffs_rawread_setup();
743#endif
744
745	/*
746	 * Reserve space for the buffer cache buffers
747	 */
748	swbuf = (void *)v;
749	v = (caddr_t)(swbuf + nswbuf);
750	buf = (void *)v;
751	v = (caddr_t)(buf + nbuf);
752
753	return(v);
754}
755
756/* Initialize the buffer subsystem.  Called before use of any buffers. */
757void
758bufinit(void)
759{
760	struct buf *bp;
761	int i;
762
763	mtx_init(&bqclean, "bufq clean lock", NULL, MTX_DEF);
764	mtx_init(&bqdirty, "bufq dirty lock", NULL, MTX_DEF);
765	mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
766	mtx_init(&nblock, "needsbuffer lock", NULL, MTX_DEF);
767	mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
768	mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF);
769
770	/* next, make a null set of free lists */
771	for (i = 0; i < BUFFER_QUEUES; i++)
772		TAILQ_INIT(&bufqueues[i]);
773
774	/* finally, initialize each buffer header and stick on empty q */
775	for (i = 0; i < nbuf; i++) {
776		bp = &buf[i];
777		bzero(bp, sizeof *bp);
778		bp->b_flags = B_INVAL | B_INFREECNT;
779		bp->b_rcred = NOCRED;
780		bp->b_wcred = NOCRED;
781		bp->b_qindex = QUEUE_EMPTY;
782		bp->b_xflags = 0;
783		LIST_INIT(&bp->b_dep);
784		BUF_LOCKINIT(bp);
785		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
786#ifdef INVARIANTS
787		bq_len[QUEUE_EMPTY]++;
788#endif
789	}
790
791	/*
792	 * maxbufspace is the absolute maximum amount of buffer space we are
793	 * allowed to reserve in KVM and in real terms.  The absolute maximum
794	 * is nominally used by buf_daemon.  hibufspace is the nominal maximum
795	 * used by most other processes.  The differential is required to
796	 * ensure that buf_daemon is able to run when other processes might
797	 * be blocked waiting for buffer space.
798	 *
799	 * maxbufspace is based on BKVASIZE.  Allocating buffers larger then
800	 * this may result in KVM fragmentation which is not handled optimally
801	 * by the system.
802	 */
803	maxbufspace = (long)nbuf * BKVASIZE;
804	hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10);
805	lobufspace = hibufspace - MAXBSIZE;
806
807	/*
808	 * Note: The 16 MiB upper limit for hirunningspace was chosen
809	 * arbitrarily and may need further tuning. It corresponds to
810	 * 128 outstanding write IO requests (if IO size is 128 KiB),
811	 * which fits with many RAID controllers' tagged queuing limits.
812	 * The lower 1 MiB limit is the historical upper limit for
813	 * hirunningspace.
814	 */
815	hirunningspace = lmax(lmin(roundup(hibufspace / 64, MAXBSIZE),
816	    16 * 1024 * 1024), 1024 * 1024);
817	lorunningspace = roundup((hirunningspace * 2) / 3, MAXBSIZE);
818
819/*
820 * Limit the amount of malloc memory since it is wired permanently into
821 * the kernel space.  Even though this is accounted for in the buffer
822 * allocation, we don't want the malloced region to grow uncontrolled.
823 * The malloc scheme improves memory utilization significantly on average
824 * (small) directories.
825 */
826	maxbufmallocspace = hibufspace / 20;
827
828/*
829 * Reduce the chance of a deadlock occuring by limiting the number
830 * of delayed-write dirty buffers we allow to stack up.
831 */
832	hidirtybuffers = nbuf / 4 + 20;
833	dirtybufthresh = hidirtybuffers * 9 / 10;
834	numdirtybuffers = 0;
835/*
836 * To support extreme low-memory systems, make sure hidirtybuffers cannot
837 * eat up all available buffer space.  This occurs when our minimum cannot
838 * be met.  We try to size hidirtybuffers to 3/4 our buffer space assuming
839 * BKVASIZE'd buffers.
840 */
841	while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
842		hidirtybuffers >>= 1;
843	}
844	lodirtybuffers = hidirtybuffers / 2;
845
846/*
847 * Try to keep the number of free buffers in the specified range,
848 * and give special processes (e.g. like buf_daemon) access to an
849 * emergency reserve.
850 */
851	lofreebuffers = nbuf / 18 + 5;
852	hifreebuffers = 2 * lofreebuffers;
853	numfreebuffers = nbuf;
854
855	bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ |
856	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
857	unmapped_buf = (caddr_t)kmem_alloc_nofault(kernel_map, MAXPHYS);
858}
859
860#ifdef INVARIANTS
861static inline void
862vfs_buf_check_mapped(struct buf *bp)
863{
864
865	KASSERT((bp->b_flags & B_UNMAPPED) == 0,
866	    ("mapped buf %p %x", bp, bp->b_flags));
867	KASSERT(bp->b_kvabase != unmapped_buf,
868	    ("mapped buf: b_kvabase was not updated %p", bp));
869	KASSERT(bp->b_data != unmapped_buf,
870	    ("mapped buf: b_data was not updated %p", bp));
871}
872
873static inline void
874vfs_buf_check_unmapped(struct buf *bp)
875{
876
877	KASSERT((bp->b_flags & B_UNMAPPED) == B_UNMAPPED,
878	    ("unmapped buf %p %x", bp, bp->b_flags));
879	KASSERT(bp->b_kvabase == unmapped_buf,
880	    ("unmapped buf: corrupted b_kvabase %p", bp));
881	KASSERT(bp->b_data == unmapped_buf,
882	    ("unmapped buf: corrupted b_data %p", bp));
883}
884
885#define	BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp)
886#define	BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp)
887#else
888#define	BUF_CHECK_MAPPED(bp) do {} while (0)
889#define	BUF_CHECK_UNMAPPED(bp) do {} while (0)
890#endif
891
892static void
893bpmap_qenter(struct buf *bp)
894{
895
896	BUF_CHECK_MAPPED(bp);
897
898	/*
899	 * bp->b_data is relative to bp->b_offset, but
900	 * bp->b_offset may be offset into the first page.
901	 */
902	bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data);
903	pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages);
904	bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
905	    (vm_offset_t)(bp->b_offset & PAGE_MASK));
906}
907
908/*
909 * bfreekva() - free the kva allocation for a buffer.
910 *
911 *	Since this call frees up buffer space, we call bufspacewakeup().
912 */
913static void
914bfreekva(struct buf *bp)
915{
916
917	if (bp->b_kvasize == 0)
918		return;
919
920	atomic_add_int(&buffreekvacnt, 1);
921	atomic_subtract_long(&bufspace, bp->b_kvasize);
922	if ((bp->b_flags & B_UNMAPPED) == 0) {
923		BUF_CHECK_MAPPED(bp);
924		vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase,
925		    bp->b_kvasize);
926	} else {
927		BUF_CHECK_UNMAPPED(bp);
928		if ((bp->b_flags & B_KVAALLOC) != 0) {
929			vmem_free(buffer_arena, (vm_offset_t)bp->b_kvaalloc,
930			    bp->b_kvasize);
931		}
932		atomic_subtract_long(&unmapped_bufspace, bp->b_kvasize);
933		bp->b_flags &= ~(B_UNMAPPED | B_KVAALLOC);
934	}
935	bp->b_kvasize = 0;
936	bufspacewakeup();
937}
938
939/*
940 *	binsfree:
941 *
942 *	Insert the buffer into the appropriate free list.
943 */
944static void
945binsfree(struct buf *bp, int qindex)
946{
947	struct mtx *olock, *nlock;
948
949	BUF_ASSERT_XLOCKED(bp);
950
951	olock = bqlock(bp->b_qindex);
952	nlock = bqlock(qindex);
953	mtx_lock(olock);
954	/* Handle delayed bremfree() processing. */
955	if (bp->b_flags & B_REMFREE)
956		bremfreel(bp);
957
958	if (bp->b_qindex != QUEUE_NONE)
959		panic("binsfree: free buffer onto another queue???");
960
961	bp->b_qindex = qindex;
962	if (olock != nlock) {
963		mtx_unlock(olock);
964		mtx_lock(nlock);
965	}
966	if (bp->b_flags & B_AGE)
967		TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
968	else
969		TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
970#ifdef INVARIANTS
971	bq_len[bp->b_qindex]++;
972#endif
973	mtx_unlock(nlock);
974
975	/*
976	 * Something we can maybe free or reuse.
977	 */
978	if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
979		bufspacewakeup();
980
981	if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))
982		bufcountadd(bp);
983}
984
985/*
986 *	bremfree:
987 *
988 *	Mark the buffer for removal from the appropriate free list.
989 *
990 */
991void
992bremfree(struct buf *bp)
993{
994
995	CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
996	KASSERT((bp->b_flags & B_REMFREE) == 0,
997	    ("bremfree: buffer %p already marked for delayed removal.", bp));
998	KASSERT(bp->b_qindex != QUEUE_NONE,
999	    ("bremfree: buffer %p not on a queue.", bp));
1000	BUF_ASSERT_XLOCKED(bp);
1001
1002	bp->b_flags |= B_REMFREE;
1003	bufcountsub(bp);
1004}
1005
1006/*
1007 *	bremfreef:
1008 *
1009 *	Force an immediate removal from a free list.  Used only in nfs when
1010 *	it abuses the b_freelist pointer.
1011 */
1012void
1013bremfreef(struct buf *bp)
1014{
1015	struct mtx *qlock;
1016
1017	qlock = bqlock(bp->b_qindex);
1018	mtx_lock(qlock);
1019	bremfreel(bp);
1020	mtx_unlock(qlock);
1021}
1022
1023/*
1024 *	bremfreel:
1025 *
1026 *	Removes a buffer from the free list, must be called with the
1027 *	correct qlock held.
1028 */
1029static void
1030bremfreel(struct buf *bp)
1031{
1032
1033	CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X",
1034	    bp, bp->b_vp, bp->b_flags);
1035	KASSERT(bp->b_qindex != QUEUE_NONE,
1036	    ("bremfreel: buffer %p not on a queue.", bp));
1037	BUF_ASSERT_XLOCKED(bp);
1038	mtx_assert(bqlock(bp->b_qindex), MA_OWNED);
1039
1040	TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
1041#ifdef INVARIANTS
1042	KASSERT(bq_len[bp->b_qindex] >= 1, ("queue %d underflow",
1043	    bp->b_qindex));
1044	bq_len[bp->b_qindex]--;
1045#endif
1046	bp->b_qindex = QUEUE_NONE;
1047	/*
1048	 * If this was a delayed bremfree() we only need to remove the buffer
1049	 * from the queue and return the stats are already done.
1050	 */
1051	if (bp->b_flags & B_REMFREE) {
1052		bp->b_flags &= ~B_REMFREE;
1053		return;
1054	}
1055	bufcountsub(bp);
1056}
1057
1058/*
1059 * Attempt to initiate asynchronous I/O on read-ahead blocks.  We must
1060 * clear BIO_ERROR and B_INVAL prior to initiating I/O . If B_CACHE is set,
1061 * the buffer is valid and we do not have to do anything.
1062 */
1063void
1064breada(struct vnode * vp, daddr_t * rablkno, int * rabsize,
1065    int cnt, struct ucred * cred)
1066{
1067	struct buf *rabp;
1068	int i;
1069
1070	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
1071		if (inmem(vp, *rablkno))
1072			continue;
1073		rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0);
1074
1075		if ((rabp->b_flags & B_CACHE) == 0) {
1076			if (!TD_IS_IDLETHREAD(curthread))
1077				curthread->td_ru.ru_inblock++;
1078			rabp->b_flags |= B_ASYNC;
1079			rabp->b_flags &= ~B_INVAL;
1080			rabp->b_ioflags &= ~BIO_ERROR;
1081			rabp->b_iocmd = BIO_READ;
1082			if (rabp->b_rcred == NOCRED && cred != NOCRED)
1083				rabp->b_rcred = crhold(cred);
1084			vfs_busy_pages(rabp, 0);
1085			BUF_KERNPROC(rabp);
1086			rabp->b_iooffset = dbtob(rabp->b_blkno);
1087			bstrategy(rabp);
1088		} else {
1089			brelse(rabp);
1090		}
1091	}
1092}
1093
1094/*
1095 * Entry point for bread() and breadn() via #defines in sys/buf.h.
1096 *
1097 * Get a buffer with the specified data.  Look in the cache first.  We
1098 * must clear BIO_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE
1099 * is set, the buffer is valid and we do not have to do anything, see
1100 * getblk(). Also starts asynchronous I/O on read-ahead blocks.
1101 */
1102int
1103breadn_flags(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablkno,
1104    int *rabsize, int cnt, struct ucred *cred, int flags, struct buf **bpp)
1105{
1106	struct buf *bp;
1107	int rv = 0, readwait = 0;
1108
1109	CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size);
1110	/*
1111	 * Can only return NULL if GB_LOCK_NOWAIT flag is specified.
1112	 */
1113	*bpp = bp = getblk(vp, blkno, size, 0, 0, flags);
1114	if (bp == NULL)
1115		return (EBUSY);
1116
1117	/* if not found in cache, do some I/O */
1118	if ((bp->b_flags & B_CACHE) == 0) {
1119		if (!TD_IS_IDLETHREAD(curthread))
1120			curthread->td_ru.ru_inblock++;
1121		bp->b_iocmd = BIO_READ;
1122		bp->b_flags &= ~B_INVAL;
1123		bp->b_ioflags &= ~BIO_ERROR;
1124		if (bp->b_rcred == NOCRED && cred != NOCRED)
1125			bp->b_rcred = crhold(cred);
1126		vfs_busy_pages(bp, 0);
1127		bp->b_iooffset = dbtob(bp->b_blkno);
1128		bstrategy(bp);
1129		++readwait;
1130	}
1131
1132	breada(vp, rablkno, rabsize, cnt, cred);
1133
1134	if (readwait) {
1135		rv = bufwait(bp);
1136	}
1137	return (rv);
1138}
1139
1140/*
1141 * Write, release buffer on completion.  (Done by iodone
1142 * if async).  Do not bother writing anything if the buffer
1143 * is invalid.
1144 *
1145 * Note that we set B_CACHE here, indicating that buffer is
1146 * fully valid and thus cacheable.  This is true even of NFS
1147 * now so we set it generally.  This could be set either here
1148 * or in biodone() since the I/O is synchronous.  We put it
1149 * here.
1150 */
1151int
1152bufwrite(struct buf *bp)
1153{
1154	int oldflags;
1155	struct vnode *vp;
1156	long space;
1157	int vp_md;
1158
1159	CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1160	if (bp->b_flags & B_INVAL) {
1161		brelse(bp);
1162		return (0);
1163	}
1164
1165	if (bp->b_flags & B_BARRIER)
1166		barrierwrites++;
1167
1168	oldflags = bp->b_flags;
1169
1170	BUF_ASSERT_HELD(bp);
1171
1172	if (bp->b_pin_count > 0)
1173		bunpin_wait(bp);
1174
1175	KASSERT(!(bp->b_vflags & BV_BKGRDINPROG),
1176	    ("FFS background buffer should not get here %p", bp));
1177
1178	vp = bp->b_vp;
1179	if (vp)
1180		vp_md = vp->v_vflag & VV_MD;
1181	else
1182		vp_md = 0;
1183
1184	/*
1185	 * Mark the buffer clean.  Increment the bufobj write count
1186	 * before bundirty() call, to prevent other thread from seeing
1187	 * empty dirty list and zero counter for writes in progress,
1188	 * falsely indicating that the bufobj is clean.
1189	 */
1190	bufobj_wref(bp->b_bufobj);
1191	bundirty(bp);
1192
1193	bp->b_flags &= ~B_DONE;
1194	bp->b_ioflags &= ~BIO_ERROR;
1195	bp->b_flags |= B_CACHE;
1196	bp->b_iocmd = BIO_WRITE;
1197
1198	vfs_busy_pages(bp, 1);
1199
1200	/*
1201	 * Normal bwrites pipeline writes
1202	 */
1203	bp->b_runningbufspace = bp->b_bufsize;
1204	space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace);
1205
1206	if (!TD_IS_IDLETHREAD(curthread))
1207		curthread->td_ru.ru_oublock++;
1208	if (oldflags & B_ASYNC)
1209		BUF_KERNPROC(bp);
1210	bp->b_iooffset = dbtob(bp->b_blkno);
1211	bstrategy(bp);
1212
1213	if ((oldflags & B_ASYNC) == 0) {
1214		int rtval = bufwait(bp);
1215		brelse(bp);
1216		return (rtval);
1217	} else if (space > hirunningspace) {
1218		/*
1219		 * don't allow the async write to saturate the I/O
1220		 * system.  We will not deadlock here because
1221		 * we are blocking waiting for I/O that is already in-progress
1222		 * to complete. We do not block here if it is the update
1223		 * or syncer daemon trying to clean up as that can lead
1224		 * to deadlock.
1225		 */
1226		if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md)
1227			waitrunningbufspace();
1228	}
1229
1230	return (0);
1231}
1232
1233void
1234bufbdflush(struct bufobj *bo, struct buf *bp)
1235{
1236	struct buf *nbp;
1237
1238	if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) {
1239		(void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread);
1240		altbufferflushes++;
1241	} else if (bo->bo_dirty.bv_cnt > dirtybufthresh) {
1242		BO_LOCK(bo);
1243		/*
1244		 * Try to find a buffer to flush.
1245		 */
1246		TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
1247			if ((nbp->b_vflags & BV_BKGRDINPROG) ||
1248			    BUF_LOCK(nbp,
1249				     LK_EXCLUSIVE | LK_NOWAIT, NULL))
1250				continue;
1251			if (bp == nbp)
1252				panic("bdwrite: found ourselves");
1253			BO_UNLOCK(bo);
1254			/* Don't countdeps with the bo lock held. */
1255			if (buf_countdeps(nbp, 0)) {
1256				BO_LOCK(bo);
1257				BUF_UNLOCK(nbp);
1258				continue;
1259			}
1260			if (nbp->b_flags & B_CLUSTEROK) {
1261				vfs_bio_awrite(nbp);
1262			} else {
1263				bremfree(nbp);
1264				bawrite(nbp);
1265			}
1266			dirtybufferflushes++;
1267			break;
1268		}
1269		if (nbp == NULL)
1270			BO_UNLOCK(bo);
1271	}
1272}
1273
1274/*
1275 * Delayed write. (Buffer is marked dirty).  Do not bother writing
1276 * anything if the buffer is marked invalid.
1277 *
1278 * Note that since the buffer must be completely valid, we can safely
1279 * set B_CACHE.  In fact, we have to set B_CACHE here rather then in
1280 * biodone() in order to prevent getblk from writing the buffer
1281 * out synchronously.
1282 */
1283void
1284bdwrite(struct buf *bp)
1285{
1286	struct thread *td = curthread;
1287	struct vnode *vp;
1288	struct bufobj *bo;
1289
1290	CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1291	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
1292	KASSERT((bp->b_flags & B_BARRIER) == 0,
1293	    ("Barrier request in delayed write %p", bp));
1294	BUF_ASSERT_HELD(bp);
1295
1296	if (bp->b_flags & B_INVAL) {
1297		brelse(bp);
1298		return;
1299	}
1300
1301	/*
1302	 * If we have too many dirty buffers, don't create any more.
1303	 * If we are wildly over our limit, then force a complete
1304	 * cleanup. Otherwise, just keep the situation from getting
1305	 * out of control. Note that we have to avoid a recursive
1306	 * disaster and not try to clean up after our own cleanup!
1307	 */
1308	vp = bp->b_vp;
1309	bo = bp->b_bufobj;
1310	if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) {
1311		td->td_pflags |= TDP_INBDFLUSH;
1312		BO_BDFLUSH(bo, bp);
1313		td->td_pflags &= ~TDP_INBDFLUSH;
1314	} else
1315		recursiveflushes++;
1316
1317	bdirty(bp);
1318	/*
1319	 * Set B_CACHE, indicating that the buffer is fully valid.  This is
1320	 * true even of NFS now.
1321	 */
1322	bp->b_flags |= B_CACHE;
1323
1324	/*
1325	 * This bmap keeps the system from needing to do the bmap later,
1326	 * perhaps when the system is attempting to do a sync.  Since it
1327	 * is likely that the indirect block -- or whatever other datastructure
1328	 * that the filesystem needs is still in memory now, it is a good
1329	 * thing to do this.  Note also, that if the pageout daemon is
1330	 * requesting a sync -- there might not be enough memory to do
1331	 * the bmap then...  So, this is important to do.
1332	 */
1333	if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) {
1334		VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
1335	}
1336
1337	/*
1338	 * Set the *dirty* buffer range based upon the VM system dirty
1339	 * pages.
1340	 *
1341	 * Mark the buffer pages as clean.  We need to do this here to
1342	 * satisfy the vnode_pager and the pageout daemon, so that it
1343	 * thinks that the pages have been "cleaned".  Note that since
1344	 * the pages are in a delayed write buffer -- the VFS layer
1345	 * "will" see that the pages get written out on the next sync,
1346	 * or perhaps the cluster will be completed.
1347	 */
1348	vfs_clean_pages_dirty_buf(bp);
1349	bqrelse(bp);
1350
1351	/*
1352	 * note: we cannot initiate I/O from a bdwrite even if we wanted to,
1353	 * due to the softdep code.
1354	 */
1355}
1356
1357/*
1358 *	bdirty:
1359 *
1360 *	Turn buffer into delayed write request.  We must clear BIO_READ and
1361 *	B_RELBUF, and we must set B_DELWRI.  We reassign the buffer to
1362 *	itself to properly update it in the dirty/clean lists.  We mark it
1363 *	B_DONE to ensure that any asynchronization of the buffer properly
1364 *	clears B_DONE ( else a panic will occur later ).
1365 *
1366 *	bdirty() is kinda like bdwrite() - we have to clear B_INVAL which
1367 *	might have been set pre-getblk().  Unlike bwrite/bdwrite, bdirty()
1368 *	should only be called if the buffer is known-good.
1369 *
1370 *	Since the buffer is not on a queue, we do not update the numfreebuffers
1371 *	count.
1372 *
1373 *	The buffer must be on QUEUE_NONE.
1374 */
1375void
1376bdirty(struct buf *bp)
1377{
1378
1379	CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X",
1380	    bp, bp->b_vp, bp->b_flags);
1381	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
1382	KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
1383	    ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex));
1384	BUF_ASSERT_HELD(bp);
1385	bp->b_flags &= ~(B_RELBUF);
1386	bp->b_iocmd = BIO_WRITE;
1387
1388	if ((bp->b_flags & B_DELWRI) == 0) {
1389		bp->b_flags |= /* XXX B_DONE | */ B_DELWRI;
1390		reassignbuf(bp);
1391		bdirtyadd();
1392	}
1393}
1394
1395/*
1396 *	bundirty:
1397 *
1398 *	Clear B_DELWRI for buffer.
1399 *
1400 *	Since the buffer is not on a queue, we do not update the numfreebuffers
1401 *	count.
1402 *
1403 *	The buffer must be on QUEUE_NONE.
1404 */
1405
1406void
1407bundirty(struct buf *bp)
1408{
1409
1410	CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1411	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
1412	KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
1413	    ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex));
1414	BUF_ASSERT_HELD(bp);
1415
1416	if (bp->b_flags & B_DELWRI) {
1417		bp->b_flags &= ~B_DELWRI;
1418		reassignbuf(bp);
1419		bdirtysub();
1420	}
1421	/*
1422	 * Since it is now being written, we can clear its deferred write flag.
1423	 */
1424	bp->b_flags &= ~B_DEFERRED;
1425}
1426
1427/*
1428 *	bawrite:
1429 *
1430 *	Asynchronous write.  Start output on a buffer, but do not wait for
1431 *	it to complete.  The buffer is released when the output completes.
1432 *
1433 *	bwrite() ( or the VOP routine anyway ) is responsible for handling
1434 *	B_INVAL buffers.  Not us.
1435 */
1436void
1437bawrite(struct buf *bp)
1438{
1439
1440	bp->b_flags |= B_ASYNC;
1441	(void) bwrite(bp);
1442}
1443
1444/*
1445 *	babarrierwrite:
1446 *
1447 *	Asynchronous barrier write.  Start output on a buffer, but do not
1448 *	wait for it to complete.  Place a write barrier after this write so
1449 *	that this buffer and all buffers written before it are committed to
1450 *	the disk before any buffers written after this write are committed
1451 *	to the disk.  The buffer is released when the output completes.
1452 */
1453void
1454babarrierwrite(struct buf *bp)
1455{
1456
1457	bp->b_flags |= B_ASYNC | B_BARRIER;
1458	(void) bwrite(bp);
1459}
1460
1461/*
1462 *	bbarrierwrite:
1463 *
1464 *	Synchronous barrier write.  Start output on a buffer and wait for
1465 *	it to complete.  Place a write barrier after this write so that
1466 *	this buffer and all buffers written before it are committed to
1467 *	the disk before any buffers written after this write are committed
1468 *	to the disk.  The buffer is released when the output completes.
1469 */
1470int
1471bbarrierwrite(struct buf *bp)
1472{
1473
1474	bp->b_flags |= B_BARRIER;
1475	return (bwrite(bp));
1476}
1477
1478/*
1479 *	bwillwrite:
1480 *
1481 *	Called prior to the locking of any vnodes when we are expecting to
1482 *	write.  We do not want to starve the buffer cache with too many
1483 *	dirty buffers so we block here.  By blocking prior to the locking
1484 *	of any vnodes we attempt to avoid the situation where a locked vnode
1485 *	prevents the various system daemons from flushing related buffers.
1486 */
1487void
1488bwillwrite(void)
1489{
1490
1491	if (numdirtybuffers >= hidirtybuffers) {
1492		mtx_lock(&bdirtylock);
1493		while (numdirtybuffers >= hidirtybuffers) {
1494			bdirtywait = 1;
1495			msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4),
1496			    "flswai", 0);
1497		}
1498		mtx_unlock(&bdirtylock);
1499	}
1500}
1501
1502/*
1503 * Return true if we have too many dirty buffers.
1504 */
1505int
1506buf_dirty_count_severe(void)
1507{
1508
1509	return(numdirtybuffers >= hidirtybuffers);
1510}
1511
1512static __noinline int
1513buf_vm_page_count_severe(void)
1514{
1515
1516	KFAIL_POINT_CODE(DEBUG_FP, buf_pressure, return 1);
1517
1518	return vm_page_count_severe();
1519}
1520
1521/*
1522 *	brelse:
1523 *
1524 *	Release a busy buffer and, if requested, free its resources.  The
1525 *	buffer will be stashed in the appropriate bufqueue[] allowing it
1526 *	to be accessed later as a cache entity or reused for other purposes.
1527 */
1528void
1529brelse(struct buf *bp)
1530{
1531	int qindex;
1532
1533	CTR3(KTR_BUF, "brelse(%p) vp %p flags %X",
1534	    bp, bp->b_vp, bp->b_flags);
1535	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
1536	    ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
1537
1538	if (BUF_LOCKRECURSED(bp)) {
1539		/*
1540		 * Do not process, in particular, do not handle the
1541		 * B_INVAL/B_RELBUF and do not release to free list.
1542		 */
1543		BUF_UNLOCK(bp);
1544		return;
1545	}
1546
1547	if (bp->b_flags & B_MANAGED) {
1548		bqrelse(bp);
1549		return;
1550	}
1551
1552	if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) &&
1553	    bp->b_error == EIO && !(bp->b_flags & B_INVAL)) {
1554		/*
1555		 * Failed write, redirty.  Must clear BIO_ERROR to prevent
1556		 * pages from being scrapped.  If the error is anything
1557		 * other than an I/O error (EIO), assume that retrying
1558		 * is futile.
1559		 */
1560		bp->b_ioflags &= ~BIO_ERROR;
1561		bdirty(bp);
1562	} else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) ||
1563	    (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) {
1564		/*
1565		 * Either a failed I/O or we were asked to free or not
1566		 * cache the buffer.
1567		 */
1568		bp->b_flags |= B_INVAL;
1569		if (!LIST_EMPTY(&bp->b_dep))
1570			buf_deallocate(bp);
1571		if (bp->b_flags & B_DELWRI)
1572			bdirtysub();
1573		bp->b_flags &= ~(B_DELWRI | B_CACHE);
1574		if ((bp->b_flags & B_VMIO) == 0) {
1575			if (bp->b_bufsize)
1576				allocbuf(bp, 0);
1577			if (bp->b_vp)
1578				brelvp(bp);
1579		}
1580	}
1581
1582	/*
1583	 * We must clear B_RELBUF if B_DELWRI is set.  If vfs_vmio_release()
1584	 * is called with B_DELWRI set, the underlying pages may wind up
1585	 * getting freed causing a previous write (bdwrite()) to get 'lost'
1586	 * because pages associated with a B_DELWRI bp are marked clean.
1587	 *
1588	 * We still allow the B_INVAL case to call vfs_vmio_release(), even
1589	 * if B_DELWRI is set.
1590	 *
1591	 * If B_DELWRI is not set we may have to set B_RELBUF if we are low
1592	 * on pages to return pages to the VM page queues.
1593	 */
1594	if (bp->b_flags & B_DELWRI)
1595		bp->b_flags &= ~B_RELBUF;
1596	else if (buf_vm_page_count_severe()) {
1597		/*
1598		 * BKGRDINPROG can only be set with the buf and bufobj
1599		 * locks both held.  We tolerate a race to clear it here.
1600		 */
1601		if (!(bp->b_vflags & BV_BKGRDINPROG))
1602			bp->b_flags |= B_RELBUF;
1603	}
1604
1605	/*
1606	 * VMIO buffer rundown.  It is not very necessary to keep a VMIO buffer
1607	 * constituted, not even NFS buffers now.  Two flags effect this.  If
1608	 * B_INVAL, the struct buf is invalidated but the VM object is kept
1609	 * around ( i.e. so it is trivial to reconstitute the buffer later ).
1610	 *
1611	 * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be
1612	 * invalidated.  BIO_ERROR cannot be set for a failed write unless the
1613	 * buffer is also B_INVAL because it hits the re-dirtying code above.
1614	 *
1615	 * Normally we can do this whether a buffer is B_DELWRI or not.  If
1616	 * the buffer is an NFS buffer, it is tracking piecemeal writes or
1617	 * the commit state and we cannot afford to lose the buffer. If the
1618	 * buffer has a background write in progress, we need to keep it
1619	 * around to prevent it from being reconstituted and starting a second
1620	 * background write.
1621	 */
1622	if ((bp->b_flags & B_VMIO)
1623	    && !(bp->b_vp->v_mount != NULL &&
1624		 (bp->b_vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 &&
1625		 !vn_isdisk(bp->b_vp, NULL) &&
1626		 (bp->b_flags & B_DELWRI))
1627	    ) {
1628
1629		int i, j, resid;
1630		vm_page_t m;
1631		off_t foff;
1632		vm_pindex_t poff;
1633		vm_object_t obj;
1634
1635		obj = bp->b_bufobj->bo_object;
1636
1637		/*
1638		 * Get the base offset and length of the buffer.  Note that
1639		 * in the VMIO case if the buffer block size is not
1640		 * page-aligned then b_data pointer may not be page-aligned.
1641		 * But our b_pages[] array *IS* page aligned.
1642		 *
1643		 * block sizes less then DEV_BSIZE (usually 512) are not
1644		 * supported due to the page granularity bits (m->valid,
1645		 * m->dirty, etc...).
1646		 *
1647		 * See man buf(9) for more information
1648		 */
1649		resid = bp->b_bufsize;
1650		foff = bp->b_offset;
1651		for (i = 0; i < bp->b_npages; i++) {
1652			int had_bogus = 0;
1653
1654			m = bp->b_pages[i];
1655
1656			/*
1657			 * If we hit a bogus page, fixup *all* the bogus pages
1658			 * now.
1659			 */
1660			if (m == bogus_page) {
1661				poff = OFF_TO_IDX(bp->b_offset);
1662				had_bogus = 1;
1663
1664				VM_OBJECT_RLOCK(obj);
1665				for (j = i; j < bp->b_npages; j++) {
1666					vm_page_t mtmp;
1667					mtmp = bp->b_pages[j];
1668					if (mtmp == bogus_page) {
1669						mtmp = vm_page_lookup(obj, poff + j);
1670						if (!mtmp) {
1671							panic("brelse: page missing\n");
1672						}
1673						bp->b_pages[j] = mtmp;
1674					}
1675				}
1676				VM_OBJECT_RUNLOCK(obj);
1677
1678				if ((bp->b_flags & (B_INVAL | B_UNMAPPED)) == 0) {
1679					BUF_CHECK_MAPPED(bp);
1680					pmap_qenter(
1681					    trunc_page((vm_offset_t)bp->b_data),
1682					    bp->b_pages, bp->b_npages);
1683				}
1684				m = bp->b_pages[i];
1685			}
1686			if ((bp->b_flags & B_NOCACHE) ||
1687			    (bp->b_ioflags & BIO_ERROR &&
1688			     bp->b_iocmd == BIO_READ)) {
1689				int poffset = foff & PAGE_MASK;
1690				int presid = resid > (PAGE_SIZE - poffset) ?
1691					(PAGE_SIZE - poffset) : resid;
1692
1693				KASSERT(presid >= 0, ("brelse: extra page"));
1694				VM_OBJECT_WLOCK(obj);
1695				vm_page_set_invalid(m, poffset, presid);
1696				VM_OBJECT_WUNLOCK(obj);
1697				if (had_bogus)
1698					printf("avoided corruption bug in bogus_page/brelse code\n");
1699			}
1700			resid -= PAGE_SIZE - (foff & PAGE_MASK);
1701			foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
1702		}
1703		if (bp->b_flags & (B_INVAL | B_RELBUF))
1704			vfs_vmio_release(bp);
1705
1706	} else if (bp->b_flags & B_VMIO) {
1707
1708		if (bp->b_flags & (B_INVAL | B_RELBUF)) {
1709			vfs_vmio_release(bp);
1710		}
1711
1712	} else if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0) {
1713		if (bp->b_bufsize != 0)
1714			allocbuf(bp, 0);
1715		if (bp->b_vp != NULL)
1716			brelvp(bp);
1717	}
1718
1719	/*
1720	 * If the buffer has junk contents signal it and eventually
1721	 * clean up B_DELWRI and diassociate the vnode so that gbincore()
1722	 * doesn't find it.
1723	 */
1724	if (bp->b_bufsize == 0 || (bp->b_ioflags & BIO_ERROR) != 0 ||
1725	    (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) != 0)
1726		bp->b_flags |= B_INVAL;
1727	if (bp->b_flags & B_INVAL) {
1728		if (bp->b_flags & B_DELWRI)
1729			bundirty(bp);
1730		if (bp->b_vp)
1731			brelvp(bp);
1732	}
1733
1734	/* buffers with no memory */
1735	if (bp->b_bufsize == 0) {
1736		bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
1737		if (bp->b_vflags & BV_BKGRDINPROG)
1738			panic("losing buffer 1");
1739		if (bp->b_kvasize)
1740			qindex = QUEUE_EMPTYKVA;
1741		else
1742			qindex = QUEUE_EMPTY;
1743		bp->b_flags |= B_AGE;
1744	/* buffers with junk contents */
1745	} else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) ||
1746	    (bp->b_ioflags & BIO_ERROR)) {
1747		bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
1748		if (bp->b_vflags & BV_BKGRDINPROG)
1749			panic("losing buffer 2");
1750		qindex = QUEUE_CLEAN;
1751		bp->b_flags |= B_AGE;
1752	/* remaining buffers */
1753	} else if (bp->b_flags & B_DELWRI)
1754		qindex = QUEUE_DIRTY;
1755	else
1756		qindex = QUEUE_CLEAN;
1757
1758	binsfree(bp, qindex);
1759
1760	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT);
1761	if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
1762		panic("brelse: not dirty");
1763	/* unlock */
1764	BUF_UNLOCK(bp);
1765}
1766
1767/*
1768 * Release a buffer back to the appropriate queue but do not try to free
1769 * it.  The buffer is expected to be used again soon.
1770 *
1771 * bqrelse() is used by bdwrite() to requeue a delayed write, and used by
1772 * biodone() to requeue an async I/O on completion.  It is also used when
1773 * known good buffers need to be requeued but we think we may need the data
1774 * again soon.
1775 *
1776 * XXX we should be able to leave the B_RELBUF hint set on completion.
1777 */
1778void
1779bqrelse(struct buf *bp)
1780{
1781	int qindex;
1782
1783	CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1784	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
1785	    ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
1786
1787	if (BUF_LOCKRECURSED(bp)) {
1788		/* do not release to free list */
1789		BUF_UNLOCK(bp);
1790		return;
1791	}
1792	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
1793
1794	if (bp->b_flags & B_MANAGED) {
1795		if (bp->b_flags & B_REMFREE)
1796			bremfreef(bp);
1797		goto out;
1798	}
1799
1800	/* buffers with stale but valid contents */
1801	if (bp->b_flags & B_DELWRI) {
1802		qindex = QUEUE_DIRTY;
1803	} else {
1804		if ((bp->b_flags & B_DELWRI) == 0 &&
1805		    (bp->b_xflags & BX_VNDIRTY))
1806			panic("bqrelse: not dirty");
1807		/*
1808		 * BKGRDINPROG can only be set with the buf and bufobj
1809		 * locks both held.  We tolerate a race to clear it here.
1810		 */
1811		if (buf_vm_page_count_severe() &&
1812		    (bp->b_vflags & BV_BKGRDINPROG) == 0) {
1813			/*
1814			 * We are too low on memory, we have to try to free
1815			 * the buffer (most importantly: the wired pages
1816			 * making up its backing store) *now*.
1817			 */
1818			brelse(bp);
1819			return;
1820		}
1821		qindex = QUEUE_CLEAN;
1822	}
1823	binsfree(bp, qindex);
1824
1825out:
1826	/* unlock */
1827	BUF_UNLOCK(bp);
1828}
1829
1830/* Give pages used by the bp back to the VM system (where possible) */
1831static void
1832vfs_vmio_release(struct buf *bp)
1833{
1834	int i;
1835	vm_page_t m;
1836
1837	if ((bp->b_flags & B_UNMAPPED) == 0) {
1838		BUF_CHECK_MAPPED(bp);
1839		pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages);
1840	} else
1841		BUF_CHECK_UNMAPPED(bp);
1842	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
1843	for (i = 0; i < bp->b_npages; i++) {
1844		m = bp->b_pages[i];
1845		bp->b_pages[i] = NULL;
1846		/*
1847		 * In order to keep page LRU ordering consistent, put
1848		 * everything on the inactive queue.
1849		 */
1850		vm_page_lock(m);
1851		vm_page_unwire(m, 0);
1852		/*
1853		 * We don't mess with busy pages, it is
1854		 * the responsibility of the process that
1855		 * busied the pages to deal with them.
1856		 */
1857		if ((m->oflags & VPO_BUSY) == 0 && m->busy == 0 &&
1858		    m->wire_count == 0) {
1859			/*
1860			 * Might as well free the page if we can and it has
1861			 * no valid data.  We also free the page if the
1862			 * buffer was used for direct I/O
1863			 */
1864			if ((bp->b_flags & B_ASYNC) == 0 && !m->valid) {
1865				vm_page_free(m);
1866			} else if (bp->b_flags & B_DIRECT) {
1867				vm_page_try_to_free(m);
1868			} else if (buf_vm_page_count_severe()) {
1869				vm_page_try_to_cache(m);
1870			}
1871		}
1872		vm_page_unlock(m);
1873	}
1874	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
1875
1876	if (bp->b_bufsize) {
1877		bufspacewakeup();
1878		bp->b_bufsize = 0;
1879	}
1880	bp->b_npages = 0;
1881	bp->b_flags &= ~B_VMIO;
1882	if (bp->b_vp)
1883		brelvp(bp);
1884}
1885
1886/*
1887 * Check to see if a block at a particular lbn is available for a clustered
1888 * write.
1889 */
1890static int
1891vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno)
1892{
1893	struct buf *bpa;
1894	int match;
1895
1896	match = 0;
1897
1898	/* If the buf isn't in core skip it */
1899	if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL)
1900		return (0);
1901
1902	/* If the buf is busy we don't want to wait for it */
1903	if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
1904		return (0);
1905
1906	/* Only cluster with valid clusterable delayed write buffers */
1907	if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) !=
1908	    (B_DELWRI | B_CLUSTEROK))
1909		goto done;
1910
1911	if (bpa->b_bufsize != size)
1912		goto done;
1913
1914	/*
1915	 * Check to see if it is in the expected place on disk and that the
1916	 * block has been mapped.
1917	 */
1918	if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno))
1919		match = 1;
1920done:
1921	BUF_UNLOCK(bpa);
1922	return (match);
1923}
1924
1925/*
1926 *	vfs_bio_awrite:
1927 *
1928 *	Implement clustered async writes for clearing out B_DELWRI buffers.
1929 *	This is much better then the old way of writing only one buffer at
1930 *	a time.  Note that we may not be presented with the buffers in the
1931 *	correct order, so we search for the cluster in both directions.
1932 */
1933int
1934vfs_bio_awrite(struct buf *bp)
1935{
1936	struct bufobj *bo;
1937	int i;
1938	int j;
1939	daddr_t lblkno = bp->b_lblkno;
1940	struct vnode *vp = bp->b_vp;
1941	int ncl;
1942	int nwritten;
1943	int size;
1944	int maxcl;
1945	int gbflags;
1946
1947	bo = &vp->v_bufobj;
1948	gbflags = (bp->b_flags & B_UNMAPPED) != 0 ? GB_UNMAPPED : 0;
1949	/*
1950	 * right now we support clustered writing only to regular files.  If
1951	 * we find a clusterable block we could be in the middle of a cluster
1952	 * rather then at the beginning.
1953	 */
1954	if ((vp->v_type == VREG) &&
1955	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
1956	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
1957
1958		size = vp->v_mount->mnt_stat.f_iosize;
1959		maxcl = MAXPHYS / size;
1960
1961		BO_RLOCK(bo);
1962		for (i = 1; i < maxcl; i++)
1963			if (vfs_bio_clcheck(vp, size, lblkno + i,
1964			    bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0)
1965				break;
1966
1967		for (j = 1; i + j <= maxcl && j <= lblkno; j++)
1968			if (vfs_bio_clcheck(vp, size, lblkno - j,
1969			    bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0)
1970				break;
1971		BO_RUNLOCK(bo);
1972		--j;
1973		ncl = i + j;
1974		/*
1975		 * this is a possible cluster write
1976		 */
1977		if (ncl != 1) {
1978			BUF_UNLOCK(bp);
1979			nwritten = cluster_wbuild(vp, size, lblkno - j, ncl,
1980			    gbflags);
1981			return (nwritten);
1982		}
1983	}
1984	bremfree(bp);
1985	bp->b_flags |= B_ASYNC;
1986	/*
1987	 * default (old) behavior, writing out only one block
1988	 *
1989	 * XXX returns b_bufsize instead of b_bcount for nwritten?
1990	 */
1991	nwritten = bp->b_bufsize;
1992	(void) bwrite(bp);
1993
1994	return (nwritten);
1995}
1996
1997static void
1998setbufkva(struct buf *bp, vm_offset_t addr, int maxsize, int gbflags)
1999{
2000
2001	KASSERT((bp->b_flags & (B_UNMAPPED | B_KVAALLOC)) == 0 &&
2002	    bp->b_kvasize == 0, ("call bfreekva(%p)", bp));
2003	if ((gbflags & GB_UNMAPPED) == 0) {
2004		bp->b_kvabase = (caddr_t)addr;
2005	} else if ((gbflags & GB_KVAALLOC) != 0) {
2006		KASSERT((gbflags & GB_UNMAPPED) != 0,
2007		    ("GB_KVAALLOC without GB_UNMAPPED"));
2008		bp->b_kvaalloc = (caddr_t)addr;
2009		bp->b_flags |= B_UNMAPPED | B_KVAALLOC;
2010		atomic_add_long(&unmapped_bufspace, bp->b_kvasize);
2011	}
2012	bp->b_kvasize = maxsize;
2013}
2014
2015/*
2016 * Allocate the buffer KVA and set b_kvasize. Also set b_kvabase if
2017 * needed.
2018 */
2019static int
2020allocbufkva(struct buf *bp, int maxsize, int gbflags)
2021{
2022	vm_offset_t addr;
2023
2024	bfreekva(bp);
2025	addr = 0;
2026
2027	if (vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr)) {
2028		/*
2029		 * Buffer map is too fragmented.  Request the caller
2030		 * to defragment the map.
2031		 */
2032		atomic_add_int(&bufdefragcnt, 1);
2033		return (1);
2034	}
2035	setbufkva(bp, addr, maxsize, gbflags);
2036	atomic_add_long(&bufspace, bp->b_kvasize);
2037	return (0);
2038}
2039
2040/*
2041 * Ask the bufdaemon for help, or act as bufdaemon itself, when a
2042 * locked vnode is supplied.
2043 */
2044static void
2045getnewbuf_bufd_help(struct vnode *vp, int gbflags, int slpflag, int slptimeo,
2046    int defrag)
2047{
2048	struct thread *td;
2049	char *waitmsg;
2050	int fl, flags, norunbuf;
2051
2052	mtx_assert(&bqclean, MA_OWNED);
2053
2054	if (defrag) {
2055		flags = VFS_BIO_NEED_BUFSPACE;
2056		waitmsg = "nbufkv";
2057	} else if (bufspace >= hibufspace) {
2058		waitmsg = "nbufbs";
2059		flags = VFS_BIO_NEED_BUFSPACE;
2060	} else {
2061		waitmsg = "newbuf";
2062		flags = VFS_BIO_NEED_ANY;
2063	}
2064	mtx_lock(&nblock);
2065	needsbuffer |= flags;
2066	mtx_unlock(&nblock);
2067	mtx_unlock(&bqclean);
2068
2069	bd_speedup();	/* heeeelp */
2070	if ((gbflags & GB_NOWAIT_BD) != 0)
2071		return;
2072
2073	td = curthread;
2074	mtx_lock(&nblock);
2075	while (needsbuffer & flags) {
2076		if (vp != NULL && (td->td_pflags & TDP_BUFNEED) == 0) {
2077			mtx_unlock(&nblock);
2078			/*
2079			 * getblk() is called with a vnode locked, and
2080			 * some majority of the dirty buffers may as
2081			 * well belong to the vnode.  Flushing the
2082			 * buffers there would make a progress that
2083			 * cannot be achieved by the buf_daemon, that
2084			 * cannot lock the vnode.
2085			 */
2086			norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
2087			    (td->td_pflags & TDP_NORUNNINGBUF);
2088			/* play bufdaemon */
2089			td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
2090			fl = buf_flush(vp, flushbufqtarget);
2091			td->td_pflags &= norunbuf;
2092			mtx_lock(&nblock);
2093			if (fl != 0)
2094				continue;
2095			if ((needsbuffer & flags) == 0)
2096				break;
2097		}
2098		if (msleep(&needsbuffer, &nblock, (PRIBIO + 4) | slpflag,
2099		    waitmsg, slptimeo))
2100			break;
2101	}
2102	mtx_unlock(&nblock);
2103}
2104
2105static void
2106getnewbuf_reuse_bp(struct buf *bp, int qindex)
2107{
2108
2109	CTR6(KTR_BUF, "getnewbuf(%p) vp %p flags %X kvasize %d bufsize %d "
2110	    "queue %d (recycling)", bp, bp->b_vp, bp->b_flags,
2111	     bp->b_kvasize, bp->b_bufsize, qindex);
2112	mtx_assert(&bqclean, MA_NOTOWNED);
2113
2114	/*
2115	 * Note: we no longer distinguish between VMIO and non-VMIO
2116	 * buffers.
2117	 */
2118	KASSERT((bp->b_flags & B_DELWRI) == 0,
2119	    ("delwri buffer %p found in queue %d", bp, qindex));
2120
2121	if (qindex == QUEUE_CLEAN) {
2122		if (bp->b_flags & B_VMIO) {
2123			bp->b_flags &= ~B_ASYNC;
2124			vfs_vmio_release(bp);
2125		}
2126		if (bp->b_vp != NULL)
2127			brelvp(bp);
2128	}
2129
2130	/*
2131	 * Get the rest of the buffer freed up.  b_kva* is still valid
2132	 * after this operation.
2133	 */
2134
2135	if (bp->b_rcred != NOCRED) {
2136		crfree(bp->b_rcred);
2137		bp->b_rcred = NOCRED;
2138	}
2139	if (bp->b_wcred != NOCRED) {
2140		crfree(bp->b_wcred);
2141		bp->b_wcred = NOCRED;
2142	}
2143	if (!LIST_EMPTY(&bp->b_dep))
2144		buf_deallocate(bp);
2145	if (bp->b_vflags & BV_BKGRDINPROG)
2146		panic("losing buffer 3");
2147	KASSERT(bp->b_vp == NULL, ("bp: %p still has vnode %p.  qindex: %d",
2148	    bp, bp->b_vp, qindex));
2149	KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
2150	    ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags));
2151
2152	if (bp->b_bufsize)
2153		allocbuf(bp, 0);
2154
2155	bp->b_flags &= B_UNMAPPED | B_KVAALLOC;
2156	bp->b_ioflags = 0;
2157	bp->b_xflags = 0;
2158	KASSERT((bp->b_flags & B_INFREECNT) == 0,
2159	    ("buf %p still counted as free?", bp));
2160	bp->b_vflags = 0;
2161	bp->b_vp = NULL;
2162	bp->b_blkno = bp->b_lblkno = 0;
2163	bp->b_offset = NOOFFSET;
2164	bp->b_iodone = 0;
2165	bp->b_error = 0;
2166	bp->b_resid = 0;
2167	bp->b_bcount = 0;
2168	bp->b_npages = 0;
2169	bp->b_dirtyoff = bp->b_dirtyend = 0;
2170	bp->b_bufobj = NULL;
2171	bp->b_pin_count = 0;
2172	bp->b_fsprivate1 = NULL;
2173	bp->b_fsprivate2 = NULL;
2174	bp->b_fsprivate3 = NULL;
2175
2176	LIST_INIT(&bp->b_dep);
2177}
2178
2179static int flushingbufs;
2180
2181static struct buf *
2182getnewbuf_scan(int maxsize, int defrag, int unmapped, int metadata)
2183{
2184	struct buf *bp, *nbp;
2185	int nqindex, qindex, pass;
2186
2187	KASSERT(!unmapped || !defrag, ("both unmapped and defrag"));
2188
2189	pass = 1;
2190restart:
2191	atomic_add_int(&getnewbufrestarts, 1);
2192
2193	/*
2194	 * Setup for scan.  If we do not have enough free buffers,
2195	 * we setup a degenerate case that immediately fails.  Note
2196	 * that if we are specially marked process, we are allowed to
2197	 * dip into our reserves.
2198	 *
2199	 * The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN
2200	 * for the allocation of the mapped buffer.  For unmapped, the
2201	 * easiest is to start with EMPTY outright.
2202	 *
2203	 * We start with EMPTYKVA.  If the list is empty we backup to EMPTY.
2204	 * However, there are a number of cases (defragging, reusing, ...)
2205	 * where we cannot backup.
2206	 */
2207	nbp = NULL;
2208	mtx_lock(&bqclean);
2209	if (!defrag && unmapped) {
2210		nqindex = QUEUE_EMPTY;
2211		nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
2212	}
2213	if (nbp == NULL) {
2214		nqindex = QUEUE_EMPTYKVA;
2215		nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
2216	}
2217
2218	/*
2219	 * If no EMPTYKVA buffers and we are either defragging or
2220	 * reusing, locate a CLEAN buffer to free or reuse.  If
2221	 * bufspace useage is low skip this step so we can allocate a
2222	 * new buffer.
2223	 */
2224	if (nbp == NULL && (defrag || bufspace >= lobufspace)) {
2225		nqindex = QUEUE_CLEAN;
2226		nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
2227	}
2228
2229	/*
2230	 * If we could not find or were not allowed to reuse a CLEAN
2231	 * buffer, check to see if it is ok to use an EMPTY buffer.
2232	 * We can only use an EMPTY buffer if allocating its KVA would
2233	 * not otherwise run us out of buffer space.  No KVA is needed
2234	 * for the unmapped allocation.
2235	 */
2236	if (nbp == NULL && defrag == 0 && (bufspace + maxsize < hibufspace ||
2237	    metadata)) {
2238		nqindex = QUEUE_EMPTY;
2239		nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
2240	}
2241
2242	/*
2243	 * All available buffers might be clean, retry ignoring the
2244	 * lobufspace as the last resort.
2245	 */
2246	if (nbp == NULL && !TAILQ_EMPTY(&bufqueues[QUEUE_CLEAN])) {
2247		nqindex = QUEUE_CLEAN;
2248		nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
2249	}
2250
2251	/*
2252	 * Run scan, possibly freeing data and/or kva mappings on the fly
2253	 * depending.
2254	 */
2255	while ((bp = nbp) != NULL) {
2256		qindex = nqindex;
2257
2258		/*
2259		 * Calculate next bp (we can only use it if we do not
2260		 * block or do other fancy things).
2261		 */
2262		if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) {
2263			switch (qindex) {
2264			case QUEUE_EMPTY:
2265				nqindex = QUEUE_EMPTYKVA;
2266				nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
2267				if (nbp != NULL)
2268					break;
2269				/* FALLTHROUGH */
2270			case QUEUE_EMPTYKVA:
2271				nqindex = QUEUE_CLEAN;
2272				nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
2273				if (nbp != NULL)
2274					break;
2275				/* FALLTHROUGH */
2276			case QUEUE_CLEAN:
2277				if (metadata && pass == 1) {
2278					pass = 2;
2279					nqindex = QUEUE_EMPTY;
2280					nbp = TAILQ_FIRST(
2281					    &bufqueues[QUEUE_EMPTY]);
2282				}
2283				/*
2284				 * nbp is NULL.
2285				 */
2286				break;
2287			}
2288		}
2289		/*
2290		 * If we are defragging then we need a buffer with
2291		 * b_kvasize != 0.  XXX this situation should no longer
2292		 * occur, if defrag is non-zero the buffer's b_kvasize
2293		 * should also be non-zero at this point.  XXX
2294		 */
2295		if (defrag && bp->b_kvasize == 0) {
2296			printf("Warning: defrag empty buffer %p\n", bp);
2297			continue;
2298		}
2299
2300		/*
2301		 * Start freeing the bp.  This is somewhat involved.  nbp
2302		 * remains valid only for QUEUE_EMPTY[KVA] bp's.
2303		 */
2304		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
2305			continue;
2306		/*
2307		 * BKGRDINPROG can only be set with the buf and bufobj
2308		 * locks both held.  We tolerate a race to clear it here.
2309		 */
2310		if (bp->b_vflags & BV_BKGRDINPROG) {
2311			BUF_UNLOCK(bp);
2312			continue;
2313		}
2314
2315		KASSERT(bp->b_qindex == qindex,
2316		    ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
2317
2318		bremfreel(bp);
2319		mtx_unlock(&bqclean);
2320		/*
2321		 * NOTE:  nbp is now entirely invalid.  We can only restart
2322		 * the scan from this point on.
2323		 */
2324
2325		getnewbuf_reuse_bp(bp, qindex);
2326		mtx_assert(&bqclean, MA_NOTOWNED);
2327
2328		/*
2329		 * If we are defragging then free the buffer.
2330		 */
2331		if (defrag) {
2332			bp->b_flags |= B_INVAL;
2333			bfreekva(bp);
2334			brelse(bp);
2335			defrag = 0;
2336			goto restart;
2337		}
2338
2339		/*
2340		 * Notify any waiters for the buffer lock about
2341		 * identity change by freeing the buffer.
2342		 */
2343		if (qindex == QUEUE_CLEAN && BUF_LOCKWAITERS(bp)) {
2344			bp->b_flags |= B_INVAL;
2345			bfreekva(bp);
2346			brelse(bp);
2347			goto restart;
2348		}
2349
2350		if (metadata)
2351			break;
2352
2353		/*
2354		 * If we are overcomitted then recover the buffer and its
2355		 * KVM space.  This occurs in rare situations when multiple
2356		 * processes are blocked in getnewbuf() or allocbuf().
2357		 */
2358		if (bufspace >= hibufspace)
2359			flushingbufs = 1;
2360		if (flushingbufs && bp->b_kvasize != 0) {
2361			bp->b_flags |= B_INVAL;
2362			bfreekva(bp);
2363			brelse(bp);
2364			goto restart;
2365		}
2366		if (bufspace < lobufspace)
2367			flushingbufs = 0;
2368		break;
2369	}
2370	return (bp);
2371}
2372
2373/*
2374 *	getnewbuf:
2375 *
2376 *	Find and initialize a new buffer header, freeing up existing buffers
2377 *	in the bufqueues as necessary.  The new buffer is returned locked.
2378 *
2379 *	Important:  B_INVAL is not set.  If the caller wishes to throw the
2380 *	buffer away, the caller must set B_INVAL prior to calling brelse().
2381 *
2382 *	We block if:
2383 *		We have insufficient buffer headers
2384 *		We have insufficient buffer space
2385 *		buffer_arena is too fragmented ( space reservation fails )
2386 *		If we have to flush dirty buffers ( but we try to avoid this )
2387 */
2388static struct buf *
2389getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize,
2390    int gbflags)
2391{
2392	struct buf *bp;
2393	int defrag, metadata;
2394
2395	KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
2396	    ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
2397	if (!unmapped_buf_allowed)
2398		gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC);
2399
2400	defrag = 0;
2401	if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 ||
2402	    vp->v_type == VCHR)
2403		metadata = 1;
2404	else
2405		metadata = 0;
2406	/*
2407	 * We can't afford to block since we might be holding a vnode lock,
2408	 * which may prevent system daemons from running.  We deal with
2409	 * low-memory situations by proactively returning memory and running
2410	 * async I/O rather then sync I/O.
2411	 */
2412	atomic_add_int(&getnewbufcalls, 1);
2413	atomic_subtract_int(&getnewbufrestarts, 1);
2414restart:
2415	bp = getnewbuf_scan(maxsize, defrag, (gbflags & (GB_UNMAPPED |
2416	    GB_KVAALLOC)) == GB_UNMAPPED, metadata);
2417	if (bp != NULL)
2418		defrag = 0;
2419
2420	/*
2421	 * If we exhausted our list, sleep as appropriate.  We may have to
2422	 * wakeup various daemons and write out some dirty buffers.
2423	 *
2424	 * Generally we are sleeping due to insufficient buffer space.
2425	 */
2426	if (bp == NULL) {
2427		mtx_assert(&bqclean, MA_OWNED);
2428		getnewbuf_bufd_help(vp, gbflags, slpflag, slptimeo, defrag);
2429		mtx_assert(&bqclean, MA_NOTOWNED);
2430	} else if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == GB_UNMAPPED) {
2431		mtx_assert(&bqclean, MA_NOTOWNED);
2432
2433		bfreekva(bp);
2434		bp->b_flags |= B_UNMAPPED;
2435		bp->b_kvabase = bp->b_data = unmapped_buf;
2436		bp->b_kvasize = maxsize;
2437		atomic_add_long(&bufspace, bp->b_kvasize);
2438		atomic_add_long(&unmapped_bufspace, bp->b_kvasize);
2439		atomic_add_int(&bufreusecnt, 1);
2440	} else {
2441		mtx_assert(&bqclean, MA_NOTOWNED);
2442
2443		/*
2444		 * We finally have a valid bp.  We aren't quite out of the
2445		 * woods, we still have to reserve kva space.  In order
2446		 * to keep fragmentation sane we only allocate kva in
2447		 * BKVASIZE chunks.
2448		 */
2449		maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;
2450
2451		if (maxsize != bp->b_kvasize || (bp->b_flags & (B_UNMAPPED |
2452		    B_KVAALLOC)) == B_UNMAPPED) {
2453			if (allocbufkva(bp, maxsize, gbflags)) {
2454				defrag = 1;
2455				bp->b_flags |= B_INVAL;
2456				brelse(bp);
2457				goto restart;
2458			}
2459			atomic_add_int(&bufreusecnt, 1);
2460		} else if ((bp->b_flags & B_KVAALLOC) != 0 &&
2461		    (gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == 0) {
2462			/*
2463			 * If the reused buffer has KVA allocated,
2464			 * reassign b_kvaalloc to b_kvabase.
2465			 */
2466			bp->b_kvabase = bp->b_kvaalloc;
2467			bp->b_flags &= ~B_KVAALLOC;
2468			atomic_subtract_long(&unmapped_bufspace,
2469			    bp->b_kvasize);
2470			atomic_add_int(&bufreusecnt, 1);
2471		} else if ((bp->b_flags & (B_UNMAPPED | B_KVAALLOC)) == 0 &&
2472		    (gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == (GB_UNMAPPED |
2473		    GB_KVAALLOC)) {
2474			/*
2475			 * The case of reused buffer already have KVA
2476			 * mapped, but the request is for unmapped
2477			 * buffer with KVA allocated.
2478			 */
2479			bp->b_kvaalloc = bp->b_kvabase;
2480			bp->b_data = bp->b_kvabase = unmapped_buf;
2481			bp->b_flags |= B_UNMAPPED | B_KVAALLOC;
2482			atomic_add_long(&unmapped_bufspace,
2483			    bp->b_kvasize);
2484			atomic_add_int(&bufreusecnt, 1);
2485		}
2486		if ((gbflags & GB_UNMAPPED) == 0) {
2487			bp->b_saveaddr = bp->b_kvabase;
2488			bp->b_data = bp->b_saveaddr;
2489			bp->b_flags &= ~B_UNMAPPED;
2490			BUF_CHECK_MAPPED(bp);
2491		}
2492	}
2493	return (bp);
2494}
2495
2496/*
2497 *	buf_daemon:
2498 *
2499 *	buffer flushing daemon.  Buffers are normally flushed by the
2500 *	update daemon but if it cannot keep up this process starts to
2501 *	take the load in an attempt to prevent getnewbuf() from blocking.
2502 */
2503
2504static struct kproc_desc buf_kp = {
2505	"bufdaemon",
2506	buf_daemon,
2507	&bufdaemonproc
2508};
2509SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp);
2510
2511static int
2512buf_flush(struct vnode *vp, int target)
2513{
2514	int flushed;
2515
2516	flushed = flushbufqueues(vp, target, 0);
2517	if (flushed == 0) {
2518		/*
2519		 * Could not find any buffers without rollback
2520		 * dependencies, so just write the first one
2521		 * in the hopes of eventually making progress.
2522		 */
2523		if (vp != NULL && target > 2)
2524			target /= 2;
2525		flushbufqueues(vp, target, 1);
2526	}
2527	return (flushed);
2528}
2529
2530static void
2531buf_daemon()
2532{
2533	int lodirty;
2534
2535	/*
2536	 * This process needs to be suspended prior to shutdown sync.
2537	 */
2538	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc,
2539	    SHUTDOWN_PRI_LAST);
2540
2541	/*
2542	 * This process is allowed to take the buffer cache to the limit
2543	 */
2544	curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED;
2545	mtx_lock(&bdlock);
2546	for (;;) {
2547		bd_request = 0;
2548		mtx_unlock(&bdlock);
2549
2550		kproc_suspend_check(bufdaemonproc);
2551		lodirty = lodirtybuffers;
2552		if (bd_speedupreq) {
2553			lodirty = numdirtybuffers / 2;
2554			bd_speedupreq = 0;
2555		}
2556		/*
2557		 * Do the flush.  Limit the amount of in-transit I/O we
2558		 * allow to build up, otherwise we would completely saturate
2559		 * the I/O system.
2560		 */
2561		while (numdirtybuffers > lodirty) {
2562			if (buf_flush(NULL, numdirtybuffers - lodirty) == 0)
2563				break;
2564			kern_yield(PRI_USER);
2565		}
2566
2567		/*
2568		 * Only clear bd_request if we have reached our low water
2569		 * mark.  The buf_daemon normally waits 1 second and
2570		 * then incrementally flushes any dirty buffers that have
2571		 * built up, within reason.
2572		 *
2573		 * If we were unable to hit our low water mark and couldn't
2574		 * find any flushable buffers, we sleep for a short period
2575		 * to avoid endless loops on unlockable buffers.
2576		 */
2577		mtx_lock(&bdlock);
2578		if (numdirtybuffers <= lodirtybuffers) {
2579			/*
2580			 * We reached our low water mark, reset the
2581			 * request and sleep until we are needed again.
2582			 * The sleep is just so the suspend code works.
2583			 */
2584			bd_request = 0;
2585			/*
2586			 * Do an extra wakeup in case dirty threshold
2587			 * changed via sysctl and the explicit transition
2588			 * out of shortfall was missed.
2589			 */
2590			bdirtywakeup();
2591			if (runningbufspace <= lorunningspace)
2592				runningwakeup();
2593			msleep(&bd_request, &bdlock, PVM, "psleep", hz);
2594		} else {
2595			/*
2596			 * We couldn't find any flushable dirty buffers but
2597			 * still have too many dirty buffers, we
2598			 * have to sleep and try again.  (rare)
2599			 */
2600			msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10);
2601		}
2602	}
2603}
2604
2605/*
2606 *	flushbufqueues:
2607 *
2608 *	Try to flush a buffer in the dirty queue.  We must be careful to
2609 *	free up B_INVAL buffers instead of write them, which NFS is
2610 *	particularly sensitive to.
2611 */
2612static int flushwithdeps = 0;
2613SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps,
2614    0, "Number of buffers flushed with dependecies that require rollbacks");
2615
2616static int
2617flushbufqueues(struct vnode *lvp, int target, int flushdeps)
2618{
2619	struct buf *sentinel;
2620	struct vnode *vp;
2621	struct mount *mp;
2622	struct buf *bp;
2623	int hasdeps;
2624	int flushed;
2625	int queue;
2626
2627	flushed = 0;
2628	queue = QUEUE_DIRTY;
2629	bp = NULL;
2630	sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO);
2631	sentinel->b_qindex = QUEUE_SENTINEL;
2632	mtx_lock(&bqdirty);
2633	TAILQ_INSERT_HEAD(&bufqueues[queue], sentinel, b_freelist);
2634	while (flushed != target) {
2635		bp = TAILQ_NEXT(sentinel, b_freelist);
2636		if (bp != NULL) {
2637			TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
2638			TAILQ_INSERT_AFTER(&bufqueues[queue], bp, sentinel,
2639			    b_freelist);
2640		} else
2641			break;
2642		/*
2643		 * Skip sentinels inserted by other invocations of the
2644		 * flushbufqueues(), taking care to not reorder them.
2645		 */
2646		if (bp->b_qindex == QUEUE_SENTINEL)
2647			continue;
2648		/*
2649		 * Only flush the buffers that belong to the
2650		 * vnode locked by the curthread.
2651		 */
2652		if (lvp != NULL && bp->b_vp != lvp)
2653			continue;
2654		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
2655			continue;
2656		if (bp->b_pin_count > 0) {
2657			BUF_UNLOCK(bp);
2658			continue;
2659		}
2660		/*
2661		 * BKGRDINPROG can only be set with the buf and bufobj
2662		 * locks both held.  We tolerate a race to clear it here.
2663		 */
2664		if ((bp->b_vflags & BV_BKGRDINPROG) != 0 ||
2665		    (bp->b_flags & B_DELWRI) == 0) {
2666			BUF_UNLOCK(bp);
2667			continue;
2668		}
2669		if (bp->b_flags & B_INVAL) {
2670			bremfreel(bp);
2671			mtx_unlock(&bqdirty);
2672			brelse(bp);
2673			flushed++;
2674			mtx_lock(&bqdirty);
2675			continue;
2676		}
2677
2678		if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) {
2679			if (flushdeps == 0) {
2680				BUF_UNLOCK(bp);
2681				continue;
2682			}
2683			hasdeps = 1;
2684		} else
2685			hasdeps = 0;
2686		/*
2687		 * We must hold the lock on a vnode before writing
2688		 * one of its buffers. Otherwise we may confuse, or
2689		 * in the case of a snapshot vnode, deadlock the
2690		 * system.
2691		 *
2692		 * The lock order here is the reverse of the normal
2693		 * of vnode followed by buf lock.  This is ok because
2694		 * the NOWAIT will prevent deadlock.
2695		 */
2696		vp = bp->b_vp;
2697		if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
2698			BUF_UNLOCK(bp);
2699			continue;
2700		}
2701		if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_CANRECURSE) == 0) {
2702			mtx_unlock(&bqdirty);
2703			CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X",
2704			    bp, bp->b_vp, bp->b_flags);
2705			if (curproc == bufdaemonproc)
2706				vfs_bio_awrite(bp);
2707			else {
2708				bremfree(bp);
2709				bwrite(bp);
2710				notbufdflushes++;
2711			}
2712			vn_finished_write(mp);
2713			VOP_UNLOCK(vp, 0);
2714			flushwithdeps += hasdeps;
2715			flushed++;
2716
2717			/*
2718			 * Sleeping on runningbufspace while holding
2719			 * vnode lock leads to deadlock.
2720			 */
2721			if (curproc == bufdaemonproc &&
2722			    runningbufspace > hirunningspace)
2723				waitrunningbufspace();
2724			mtx_lock(&bqdirty);
2725			continue;
2726		}
2727		vn_finished_write(mp);
2728		BUF_UNLOCK(bp);
2729	}
2730	TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
2731	mtx_unlock(&bqdirty);
2732	free(sentinel, M_TEMP);
2733	return (flushed);
2734}
2735
2736/*
2737 * Check to see if a block is currently memory resident.
2738 */
2739struct buf *
2740incore(struct bufobj *bo, daddr_t blkno)
2741{
2742	struct buf *bp;
2743
2744	BO_RLOCK(bo);
2745	bp = gbincore(bo, blkno);
2746	BO_RUNLOCK(bo);
2747	return (bp);
2748}
2749
2750/*
2751 * Returns true if no I/O is needed to access the
2752 * associated VM object.  This is like incore except
2753 * it also hunts around in the VM system for the data.
2754 */
2755
2756static int
2757inmem(struct vnode * vp, daddr_t blkno)
2758{
2759	vm_object_t obj;
2760	vm_offset_t toff, tinc, size;
2761	vm_page_t m;
2762	vm_ooffset_t off;
2763
2764	ASSERT_VOP_LOCKED(vp, "inmem");
2765
2766	if (incore(&vp->v_bufobj, blkno))
2767		return 1;
2768	if (vp->v_mount == NULL)
2769		return 0;
2770	obj = vp->v_object;
2771	if (obj == NULL)
2772		return (0);
2773
2774	size = PAGE_SIZE;
2775	if (size > vp->v_mount->mnt_stat.f_iosize)
2776		size = vp->v_mount->mnt_stat.f_iosize;
2777	off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;
2778
2779	VM_OBJECT_RLOCK(obj);
2780	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
2781		m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
2782		if (!m)
2783			goto notinmem;
2784		tinc = size;
2785		if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK))
2786			tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK);
2787		if (vm_page_is_valid(m,
2788		    (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0)
2789			goto notinmem;
2790	}
2791	VM_OBJECT_RUNLOCK(obj);
2792	return 1;
2793
2794notinmem:
2795	VM_OBJECT_RUNLOCK(obj);
2796	return (0);
2797}
2798
2799/*
2800 * Set the dirty range for a buffer based on the status of the dirty
2801 * bits in the pages comprising the buffer.  The range is limited
2802 * to the size of the buffer.
2803 *
2804 * Tell the VM system that the pages associated with this buffer
2805 * are clean.  This is used for delayed writes where the data is
2806 * going to go to disk eventually without additional VM intevention.
2807 *
2808 * Note that while we only really need to clean through to b_bcount, we
2809 * just go ahead and clean through to b_bufsize.
2810 */
2811static void
2812vfs_clean_pages_dirty_buf(struct buf *bp)
2813{
2814	vm_ooffset_t foff, noff, eoff;
2815	vm_page_t m;
2816	int i;
2817
2818	if ((bp->b_flags & B_VMIO) == 0 || bp->b_bufsize == 0)
2819		return;
2820
2821	foff = bp->b_offset;
2822	KASSERT(bp->b_offset != NOOFFSET,
2823	    ("vfs_clean_pages_dirty_buf: no buffer offset"));
2824
2825	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
2826	vfs_drain_busy_pages(bp);
2827	vfs_setdirty_locked_object(bp);
2828	for (i = 0; i < bp->b_npages; i++) {
2829		noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
2830		eoff = noff;
2831		if (eoff > bp->b_offset + bp->b_bufsize)
2832			eoff = bp->b_offset + bp->b_bufsize;
2833		m = bp->b_pages[i];
2834		vfs_page_set_validclean(bp, foff, m);
2835		/* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */
2836		foff = noff;
2837	}
2838	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
2839}
2840
2841static void
2842vfs_setdirty_locked_object(struct buf *bp)
2843{
2844	vm_object_t object;
2845	int i;
2846
2847	object = bp->b_bufobj->bo_object;
2848	VM_OBJECT_ASSERT_WLOCKED(object);
2849
2850	/*
2851	 * We qualify the scan for modified pages on whether the
2852	 * object has been flushed yet.
2853	 */
2854	if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) {
2855		vm_offset_t boffset;
2856		vm_offset_t eoffset;
2857
2858		/*
2859		 * test the pages to see if they have been modified directly
2860		 * by users through the VM system.
2861		 */
2862		for (i = 0; i < bp->b_npages; i++)
2863			vm_page_test_dirty(bp->b_pages[i]);
2864
2865		/*
2866		 * Calculate the encompassing dirty range, boffset and eoffset,
2867		 * (eoffset - boffset) bytes.
2868		 */
2869
2870		for (i = 0; i < bp->b_npages; i++) {
2871			if (bp->b_pages[i]->dirty)
2872				break;
2873		}
2874		boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
2875
2876		for (i = bp->b_npages - 1; i >= 0; --i) {
2877			if (bp->b_pages[i]->dirty) {
2878				break;
2879			}
2880		}
2881		eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
2882
2883		/*
2884		 * Fit it to the buffer.
2885		 */
2886
2887		if (eoffset > bp->b_bcount)
2888			eoffset = bp->b_bcount;
2889
2890		/*
2891		 * If we have a good dirty range, merge with the existing
2892		 * dirty range.
2893		 */
2894
2895		if (boffset < eoffset) {
2896			if (bp->b_dirtyoff > boffset)
2897				bp->b_dirtyoff = boffset;
2898			if (bp->b_dirtyend < eoffset)
2899				bp->b_dirtyend = eoffset;
2900		}
2901	}
2902}
2903
2904/*
2905 * Allocate the KVA mapping for an existing buffer. It handles the
2906 * cases of both B_UNMAPPED buffer, and buffer with the preallocated
2907 * KVA which is not mapped (B_KVAALLOC).
2908 */
2909static void
2910bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags)
2911{
2912	struct buf *scratch_bp;
2913	int bsize, maxsize, need_mapping, need_kva;
2914	off_t offset;
2915
2916	need_mapping = (bp->b_flags & B_UNMAPPED) != 0 &&
2917	    (gbflags & GB_UNMAPPED) == 0;
2918	need_kva = (bp->b_flags & (B_KVAALLOC | B_UNMAPPED)) == B_UNMAPPED &&
2919	    (gbflags & GB_KVAALLOC) != 0;
2920	if (!need_mapping && !need_kva)
2921		return;
2922
2923	BUF_CHECK_UNMAPPED(bp);
2924
2925	if (need_mapping && (bp->b_flags & B_KVAALLOC) != 0) {
2926		/*
2927		 * Buffer is not mapped, but the KVA was already
2928		 * reserved at the time of the instantiation.  Use the
2929		 * allocated space.
2930		 */
2931		bp->b_flags &= ~B_KVAALLOC;
2932		KASSERT(bp->b_kvaalloc != 0, ("kvaalloc == 0"));
2933		bp->b_kvabase = bp->b_kvaalloc;
2934		atomic_subtract_long(&unmapped_bufspace, bp->b_kvasize);
2935		goto has_addr;
2936	}
2937
2938	/*
2939	 * Calculate the amount of the address space we would reserve
2940	 * if the buffer was mapped.
2941	 */
2942	bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize;
2943	offset = blkno * bsize;
2944	maxsize = size + (offset & PAGE_MASK);
2945	maxsize = imax(maxsize, bsize);
2946
2947mapping_loop:
2948	if (allocbufkva(bp, maxsize, gbflags)) {
2949		/*
2950		 * Request defragmentation. getnewbuf() returns us the
2951		 * allocated space by the scratch buffer KVA.
2952		 */
2953		scratch_bp = getnewbuf(bp->b_vp, 0, 0, size, maxsize, gbflags |
2954		    (GB_UNMAPPED | GB_KVAALLOC));
2955		if (scratch_bp == NULL) {
2956			if ((gbflags & GB_NOWAIT_BD) != 0) {
2957				/*
2958				 * XXXKIB: defragmentation cannot
2959				 * succeed, not sure what else to do.
2960				 */
2961				panic("GB_NOWAIT_BD and B_UNMAPPED %p", bp);
2962			}
2963			atomic_add_int(&mappingrestarts, 1);
2964			goto mapping_loop;
2965		}
2966		KASSERT((scratch_bp->b_flags & B_KVAALLOC) != 0,
2967		    ("scratch bp !B_KVAALLOC %p", scratch_bp));
2968		setbufkva(bp, (vm_offset_t)scratch_bp->b_kvaalloc,
2969		    scratch_bp->b_kvasize, gbflags);
2970
2971		/* Get rid of the scratch buffer. */
2972		scratch_bp->b_kvasize = 0;
2973		scratch_bp->b_flags |= B_INVAL;
2974		scratch_bp->b_flags &= ~(B_UNMAPPED | B_KVAALLOC);
2975		brelse(scratch_bp);
2976	}
2977	if (!need_mapping)
2978		return;
2979
2980has_addr:
2981	bp->b_saveaddr = bp->b_kvabase;
2982	bp->b_data = bp->b_saveaddr; /* b_offset is handled by bpmap_qenter */
2983	bp->b_flags &= ~B_UNMAPPED;
2984	BUF_CHECK_MAPPED(bp);
2985	bpmap_qenter(bp);
2986}
2987
2988/*
2989 *	getblk:
2990 *
2991 *	Get a block given a specified block and offset into a file/device.
2992 *	The buffers B_DONE bit will be cleared on return, making it almost
2993 * 	ready for an I/O initiation.  B_INVAL may or may not be set on
2994 *	return.  The caller should clear B_INVAL prior to initiating a
2995 *	READ.
2996 *
2997 *	For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
2998 *	an existing buffer.
2999 *
3000 *	For a VMIO buffer, B_CACHE is modified according to the backing VM.
3001 *	If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
3002 *	and then cleared based on the backing VM.  If the previous buffer is
3003 *	non-0-sized but invalid, B_CACHE will be cleared.
3004 *
3005 *	If getblk() must create a new buffer, the new buffer is returned with
3006 *	both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
3007 *	case it is returned with B_INVAL clear and B_CACHE set based on the
3008 *	backing VM.
3009 *
3010 *	getblk() also forces a bwrite() for any B_DELWRI buffer whos
3011 *	B_CACHE bit is clear.
3012 *
3013 *	What this means, basically, is that the caller should use B_CACHE to
3014 *	determine whether the buffer is fully valid or not and should clear
3015 *	B_INVAL prior to issuing a read.  If the caller intends to validate
3016 *	the buffer by loading its data area with something, the caller needs
3017 *	to clear B_INVAL.  If the caller does this without issuing an I/O,
3018 *	the caller should set B_CACHE ( as an optimization ), else the caller
3019 *	should issue the I/O and biodone() will set B_CACHE if the I/O was
3020 *	a write attempt or if it was a successfull read.  If the caller
3021 *	intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR
3022 *	prior to issuing the READ.  biodone() will *not* clear B_INVAL.
3023 */
3024struct buf *
3025getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo,
3026    int flags)
3027{
3028	struct buf *bp;
3029	struct bufobj *bo;
3030	int bsize, error, maxsize, vmio;
3031	off_t offset;
3032
3033	CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size);
3034	KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
3035	    ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
3036	ASSERT_VOP_LOCKED(vp, "getblk");
3037	if (size > MAXBSIZE)
3038		panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
3039	if (!unmapped_buf_allowed)
3040		flags &= ~(GB_UNMAPPED | GB_KVAALLOC);
3041
3042	bo = &vp->v_bufobj;
3043loop:
3044	BO_RLOCK(bo);
3045	bp = gbincore(bo, blkno);
3046	if (bp != NULL) {
3047		int lockflags;
3048		/*
3049		 * Buffer is in-core.  If the buffer is not busy nor managed,
3050		 * it must be on a queue.
3051		 */
3052		lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK;
3053
3054		if (flags & GB_LOCK_NOWAIT)
3055			lockflags |= LK_NOWAIT;
3056
3057		error = BUF_TIMELOCK(bp, lockflags,
3058		    BO_LOCKPTR(bo), "getblk", slpflag, slptimeo);
3059
3060		/*
3061		 * If we slept and got the lock we have to restart in case
3062		 * the buffer changed identities.
3063		 */
3064		if (error == ENOLCK)
3065			goto loop;
3066		/* We timed out or were interrupted. */
3067		else if (error)
3068			return (NULL);
3069		/* If recursed, assume caller knows the rules. */
3070		else if (BUF_LOCKRECURSED(bp))
3071			goto end;
3072
3073		/*
3074		 * The buffer is locked.  B_CACHE is cleared if the buffer is
3075		 * invalid.  Otherwise, for a non-VMIO buffer, B_CACHE is set
3076		 * and for a VMIO buffer B_CACHE is adjusted according to the
3077		 * backing VM cache.
3078		 */
3079		if (bp->b_flags & B_INVAL)
3080			bp->b_flags &= ~B_CACHE;
3081		else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0)
3082			bp->b_flags |= B_CACHE;
3083		if (bp->b_flags & B_MANAGED)
3084			MPASS(bp->b_qindex == QUEUE_NONE);
3085		else
3086			bremfree(bp);
3087
3088		/*
3089		 * check for size inconsistencies for non-VMIO case.
3090		 */
3091		if (bp->b_bcount != size) {
3092			if ((bp->b_flags & B_VMIO) == 0 ||
3093			    (size > bp->b_kvasize)) {
3094				if (bp->b_flags & B_DELWRI) {
3095					/*
3096					 * If buffer is pinned and caller does
3097					 * not want sleep  waiting for it to be
3098					 * unpinned, bail out
3099					 * */
3100					if (bp->b_pin_count > 0) {
3101						if (flags & GB_LOCK_NOWAIT) {
3102							bqrelse(bp);
3103							return (NULL);
3104						} else {
3105							bunpin_wait(bp);
3106						}
3107					}
3108					bp->b_flags |= B_NOCACHE;
3109					bwrite(bp);
3110				} else {
3111					if (LIST_EMPTY(&bp->b_dep)) {
3112						bp->b_flags |= B_RELBUF;
3113						brelse(bp);
3114					} else {
3115						bp->b_flags |= B_NOCACHE;
3116						bwrite(bp);
3117					}
3118				}
3119				goto loop;
3120			}
3121		}
3122
3123		/*
3124		 * Handle the case of unmapped buffer which should
3125		 * become mapped, or the buffer for which KVA
3126		 * reservation is requested.
3127		 */
3128		bp_unmapped_get_kva(bp, blkno, size, flags);
3129
3130		/*
3131		 * If the size is inconsistant in the VMIO case, we can resize
3132		 * the buffer.  This might lead to B_CACHE getting set or
3133		 * cleared.  If the size has not changed, B_CACHE remains
3134		 * unchanged from its previous state.
3135		 */
3136		if (bp->b_bcount != size)
3137			allocbuf(bp, size);
3138
3139		KASSERT(bp->b_offset != NOOFFSET,
3140		    ("getblk: no buffer offset"));
3141
3142		/*
3143		 * A buffer with B_DELWRI set and B_CACHE clear must
3144		 * be committed before we can return the buffer in
3145		 * order to prevent the caller from issuing a read
3146		 * ( due to B_CACHE not being set ) and overwriting
3147		 * it.
3148		 *
3149		 * Most callers, including NFS and FFS, need this to
3150		 * operate properly either because they assume they
3151		 * can issue a read if B_CACHE is not set, or because
3152		 * ( for example ) an uncached B_DELWRI might loop due
3153		 * to softupdates re-dirtying the buffer.  In the latter
3154		 * case, B_CACHE is set after the first write completes,
3155		 * preventing further loops.
3156		 * NOTE!  b*write() sets B_CACHE.  If we cleared B_CACHE
3157		 * above while extending the buffer, we cannot allow the
3158		 * buffer to remain with B_CACHE set after the write
3159		 * completes or it will represent a corrupt state.  To
3160		 * deal with this we set B_NOCACHE to scrap the buffer
3161		 * after the write.
3162		 *
3163		 * We might be able to do something fancy, like setting
3164		 * B_CACHE in bwrite() except if B_DELWRI is already set,
3165		 * so the below call doesn't set B_CACHE, but that gets real
3166		 * confusing.  This is much easier.
3167		 */
3168
3169		if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
3170			bp->b_flags |= B_NOCACHE;
3171			bwrite(bp);
3172			goto loop;
3173		}
3174		bp->b_flags &= ~B_DONE;
3175	} else {
3176		/*
3177		 * Buffer is not in-core, create new buffer.  The buffer
3178		 * returned by getnewbuf() is locked.  Note that the returned
3179		 * buffer is also considered valid (not marked B_INVAL).
3180		 */
3181		BO_RUNLOCK(bo);
3182		/*
3183		 * If the user does not want us to create the buffer, bail out
3184		 * here.
3185		 */
3186		if (flags & GB_NOCREAT)
3187			return NULL;
3188		if (numfreebuffers == 0 && TD_IS_IDLETHREAD(curthread))
3189			return NULL;
3190
3191		bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize;
3192		offset = blkno * bsize;
3193		vmio = vp->v_object != NULL;
3194		if (vmio) {
3195			maxsize = size + (offset & PAGE_MASK);
3196		} else {
3197			maxsize = size;
3198			/* Do not allow non-VMIO notmapped buffers. */
3199			flags &= ~GB_UNMAPPED;
3200		}
3201		maxsize = imax(maxsize, bsize);
3202
3203		bp = getnewbuf(vp, slpflag, slptimeo, size, maxsize, flags);
3204		if (bp == NULL) {
3205			if (slpflag || slptimeo)
3206				return NULL;
3207			goto loop;
3208		}
3209
3210		/*
3211		 * This code is used to make sure that a buffer is not
3212		 * created while the getnewbuf routine is blocked.
3213		 * This can be a problem whether the vnode is locked or not.
3214		 * If the buffer is created out from under us, we have to
3215		 * throw away the one we just created.
3216		 *
3217		 * Note: this must occur before we associate the buffer
3218		 * with the vp especially considering limitations in
3219		 * the splay tree implementation when dealing with duplicate
3220		 * lblkno's.
3221		 */
3222		BO_LOCK(bo);
3223		if (gbincore(bo, blkno)) {
3224			BO_UNLOCK(bo);
3225			bp->b_flags |= B_INVAL;
3226			brelse(bp);
3227			goto loop;
3228		}
3229
3230		/*
3231		 * Insert the buffer into the hash, so that it can
3232		 * be found by incore.
3233		 */
3234		bp->b_blkno = bp->b_lblkno = blkno;
3235		bp->b_offset = offset;
3236		bgetvp(vp, bp);
3237		BO_UNLOCK(bo);
3238
3239		/*
3240		 * set B_VMIO bit.  allocbuf() the buffer bigger.  Since the
3241		 * buffer size starts out as 0, B_CACHE will be set by
3242		 * allocbuf() for the VMIO case prior to it testing the
3243		 * backing store for validity.
3244		 */
3245
3246		if (vmio) {
3247			bp->b_flags |= B_VMIO;
3248			KASSERT(vp->v_object == bp->b_bufobj->bo_object,
3249			    ("ARGH! different b_bufobj->bo_object %p %p %p\n",
3250			    bp, vp->v_object, bp->b_bufobj->bo_object));
3251		} else {
3252			bp->b_flags &= ~B_VMIO;
3253			KASSERT(bp->b_bufobj->bo_object == NULL,
3254			    ("ARGH! has b_bufobj->bo_object %p %p\n",
3255			    bp, bp->b_bufobj->bo_object));
3256			BUF_CHECK_MAPPED(bp);
3257		}
3258
3259		allocbuf(bp, size);
3260		bp->b_flags &= ~B_DONE;
3261	}
3262	CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp);
3263	BUF_ASSERT_HELD(bp);
3264end:
3265	KASSERT(bp->b_bufobj == bo,
3266	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
3267	return (bp);
3268}
3269
3270/*
3271 * Get an empty, disassociated buffer of given size.  The buffer is initially
3272 * set to B_INVAL.
3273 */
3274struct buf *
3275geteblk(int size, int flags)
3276{
3277	struct buf *bp;
3278	int maxsize;
3279
3280	maxsize = (size + BKVAMASK) & ~BKVAMASK;
3281	while ((bp = getnewbuf(NULL, 0, 0, size, maxsize, flags)) == NULL) {
3282		if ((flags & GB_NOWAIT_BD) &&
3283		    (curthread->td_pflags & TDP_BUFNEED) != 0)
3284			return (NULL);
3285	}
3286	allocbuf(bp, size);
3287	bp->b_flags |= B_INVAL;	/* b_dep cleared by getnewbuf() */
3288	BUF_ASSERT_HELD(bp);
3289	return (bp);
3290}
3291
3292
3293/*
3294 * This code constitutes the buffer memory from either anonymous system
3295 * memory (in the case of non-VMIO operations) or from an associated
3296 * VM object (in the case of VMIO operations).  This code is able to
3297 * resize a buffer up or down.
3298 *
3299 * Note that this code is tricky, and has many complications to resolve
3300 * deadlock or inconsistant data situations.  Tread lightly!!!
3301 * There are B_CACHE and B_DELWRI interactions that must be dealt with by
3302 * the caller.  Calling this code willy nilly can result in the loss of data.
3303 *
3304 * allocbuf() only adjusts B_CACHE for VMIO buffers.  getblk() deals with
3305 * B_CACHE for the non-VMIO case.
3306 */
3307
3308int
3309allocbuf(struct buf *bp, int size)
3310{
3311	int newbsize, mbsize;
3312	int i;
3313
3314	BUF_ASSERT_HELD(bp);
3315
3316	if (bp->b_kvasize < size)
3317		panic("allocbuf: buffer too small");
3318
3319	if ((bp->b_flags & B_VMIO) == 0) {
3320		caddr_t origbuf;
3321		int origbufsize;
3322		/*
3323		 * Just get anonymous memory from the kernel.  Don't
3324		 * mess with B_CACHE.
3325		 */
3326		mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
3327		if (bp->b_flags & B_MALLOC)
3328			newbsize = mbsize;
3329		else
3330			newbsize = round_page(size);
3331
3332		if (newbsize < bp->b_bufsize) {
3333			/*
3334			 * malloced buffers are not shrunk
3335			 */
3336			if (bp->b_flags & B_MALLOC) {
3337				if (newbsize) {
3338					bp->b_bcount = size;
3339				} else {
3340					free(bp->b_data, M_BIOBUF);
3341					if (bp->b_bufsize) {
3342						atomic_subtract_long(
3343						    &bufmallocspace,
3344						    bp->b_bufsize);
3345						bufspacewakeup();
3346						bp->b_bufsize = 0;
3347					}
3348					bp->b_saveaddr = bp->b_kvabase;
3349					bp->b_data = bp->b_saveaddr;
3350					bp->b_bcount = 0;
3351					bp->b_flags &= ~B_MALLOC;
3352				}
3353				return 1;
3354			}
3355			vm_hold_free_pages(bp, newbsize);
3356		} else if (newbsize > bp->b_bufsize) {
3357			/*
3358			 * We only use malloced memory on the first allocation.
3359			 * and revert to page-allocated memory when the buffer
3360			 * grows.
3361			 */
3362			/*
3363			 * There is a potential smp race here that could lead
3364			 * to bufmallocspace slightly passing the max.  It
3365			 * is probably extremely rare and not worth worrying
3366			 * over.
3367			 */
3368			if ( (bufmallocspace < maxbufmallocspace) &&
3369				(bp->b_bufsize == 0) &&
3370				(mbsize <= PAGE_SIZE/2)) {
3371
3372				bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
3373				bp->b_bufsize = mbsize;
3374				bp->b_bcount = size;
3375				bp->b_flags |= B_MALLOC;
3376				atomic_add_long(&bufmallocspace, mbsize);
3377				return 1;
3378			}
3379			origbuf = NULL;
3380			origbufsize = 0;
3381			/*
3382			 * If the buffer is growing on its other-than-first allocation,
3383			 * then we revert to the page-allocation scheme.
3384			 */
3385			if (bp->b_flags & B_MALLOC) {
3386				origbuf = bp->b_data;
3387				origbufsize = bp->b_bufsize;
3388				bp->b_data = bp->b_kvabase;
3389				if (bp->b_bufsize) {
3390					atomic_subtract_long(&bufmallocspace,
3391					    bp->b_bufsize);
3392					bufspacewakeup();
3393					bp->b_bufsize = 0;
3394				}
3395				bp->b_flags &= ~B_MALLOC;
3396				newbsize = round_page(newbsize);
3397			}
3398			vm_hold_load_pages(
3399			    bp,
3400			    (vm_offset_t) bp->b_data + bp->b_bufsize,
3401			    (vm_offset_t) bp->b_data + newbsize);
3402			if (origbuf) {
3403				bcopy(origbuf, bp->b_data, origbufsize);
3404				free(origbuf, M_BIOBUF);
3405			}
3406		}
3407	} else {
3408		int desiredpages;
3409
3410		newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
3411		desiredpages = (size == 0) ? 0 :
3412			num_pages((bp->b_offset & PAGE_MASK) + newbsize);
3413
3414		if (bp->b_flags & B_MALLOC)
3415			panic("allocbuf: VMIO buffer can't be malloced");
3416		/*
3417		 * Set B_CACHE initially if buffer is 0 length or will become
3418		 * 0-length.
3419		 */
3420		if (size == 0 || bp->b_bufsize == 0)
3421			bp->b_flags |= B_CACHE;
3422
3423		if (newbsize < bp->b_bufsize) {
3424			/*
3425			 * DEV_BSIZE aligned new buffer size is less then the
3426			 * DEV_BSIZE aligned existing buffer size.  Figure out
3427			 * if we have to remove any pages.
3428			 */
3429			if (desiredpages < bp->b_npages) {
3430				vm_page_t m;
3431
3432				if ((bp->b_flags & B_UNMAPPED) == 0) {
3433					BUF_CHECK_MAPPED(bp);
3434					pmap_qremove((vm_offset_t)trunc_page(
3435					    (vm_offset_t)bp->b_data) +
3436					    (desiredpages << PAGE_SHIFT),
3437					    (bp->b_npages - desiredpages));
3438				} else
3439					BUF_CHECK_UNMAPPED(bp);
3440				VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
3441				for (i = desiredpages; i < bp->b_npages; i++) {
3442					/*
3443					 * the page is not freed here -- it
3444					 * is the responsibility of
3445					 * vnode_pager_setsize
3446					 */
3447					m = bp->b_pages[i];
3448					KASSERT(m != bogus_page,
3449					    ("allocbuf: bogus page found"));
3450					while (vm_page_sleep_if_busy(m, TRUE,
3451					    "biodep"))
3452						continue;
3453
3454					bp->b_pages[i] = NULL;
3455					vm_page_lock(m);
3456					vm_page_unwire(m, 0);
3457					vm_page_unlock(m);
3458				}
3459				VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
3460				bp->b_npages = desiredpages;
3461			}
3462		} else if (size > bp->b_bcount) {
3463			/*
3464			 * We are growing the buffer, possibly in a
3465			 * byte-granular fashion.
3466			 */
3467			vm_object_t obj;
3468			vm_offset_t toff;
3469			vm_offset_t tinc;
3470
3471			/*
3472			 * Step 1, bring in the VM pages from the object,
3473			 * allocating them if necessary.  We must clear
3474			 * B_CACHE if these pages are not valid for the
3475			 * range covered by the buffer.
3476			 */
3477
3478			obj = bp->b_bufobj->bo_object;
3479
3480			VM_OBJECT_WLOCK(obj);
3481			while (bp->b_npages < desiredpages) {
3482				vm_page_t m;
3483
3484				/*
3485				 * We must allocate system pages since blocking
3486				 * here could interfere with paging I/O, no
3487				 * matter which process we are.
3488				 *
3489				 * We can only test VPO_BUSY here.  Blocking on
3490				 * m->busy might lead to a deadlock:
3491				 *  vm_fault->getpages->cluster_read->allocbuf
3492				 * Thus, we specify VM_ALLOC_IGN_SBUSY.
3493				 */
3494				m = vm_page_grab(obj, OFF_TO_IDX(bp->b_offset) +
3495				    bp->b_npages, VM_ALLOC_NOBUSY |
3496				    VM_ALLOC_SYSTEM | VM_ALLOC_WIRED |
3497				    VM_ALLOC_RETRY | VM_ALLOC_IGN_SBUSY |
3498				    VM_ALLOC_COUNT(desiredpages - bp->b_npages));
3499				if (m->valid == 0)
3500					bp->b_flags &= ~B_CACHE;
3501				bp->b_pages[bp->b_npages] = m;
3502				++bp->b_npages;
3503			}
3504
3505			/*
3506			 * Step 2.  We've loaded the pages into the buffer,
3507			 * we have to figure out if we can still have B_CACHE
3508			 * set.  Note that B_CACHE is set according to the
3509			 * byte-granular range ( bcount and size ), new the
3510			 * aligned range ( newbsize ).
3511			 *
3512			 * The VM test is against m->valid, which is DEV_BSIZE
3513			 * aligned.  Needless to say, the validity of the data
3514			 * needs to also be DEV_BSIZE aligned.  Note that this
3515			 * fails with NFS if the server or some other client
3516			 * extends the file's EOF.  If our buffer is resized,
3517			 * B_CACHE may remain set! XXX
3518			 */
3519
3520			toff = bp->b_bcount;
3521			tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK);
3522
3523			while ((bp->b_flags & B_CACHE) && toff < size) {
3524				vm_pindex_t pi;
3525
3526				if (tinc > (size - toff))
3527					tinc = size - toff;
3528
3529				pi = ((bp->b_offset & PAGE_MASK) + toff) >>
3530				    PAGE_SHIFT;
3531
3532				vfs_buf_test_cache(
3533				    bp,
3534				    bp->b_offset,
3535				    toff,
3536				    tinc,
3537				    bp->b_pages[pi]
3538				);
3539				toff += tinc;
3540				tinc = PAGE_SIZE;
3541			}
3542			VM_OBJECT_WUNLOCK(obj);
3543
3544			/*
3545			 * Step 3, fixup the KVM pmap.
3546			 */
3547			if ((bp->b_flags & B_UNMAPPED) == 0)
3548				bpmap_qenter(bp);
3549			else
3550				BUF_CHECK_UNMAPPED(bp);
3551		}
3552	}
3553	if (newbsize < bp->b_bufsize)
3554		bufspacewakeup();
3555	bp->b_bufsize = newbsize;	/* actual buffer allocation	*/
3556	bp->b_bcount = size;		/* requested buffer size	*/
3557	return 1;
3558}
3559
3560extern int inflight_transient_maps;
3561
3562void
3563biodone(struct bio *bp)
3564{
3565	struct mtx *mtxp;
3566	void (*done)(struct bio *);
3567	vm_offset_t start, end;
3568	int transient;
3569
3570	mtxp = mtx_pool_find(mtxpool_sleep, bp);
3571	mtx_lock(mtxp);
3572	bp->bio_flags |= BIO_DONE;
3573	if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) {
3574		start = trunc_page((vm_offset_t)bp->bio_data);
3575		end = round_page((vm_offset_t)bp->bio_data + bp->bio_length);
3576		transient = 1;
3577	} else {
3578		transient = 0;
3579		start = end = 0;
3580	}
3581	done = bp->bio_done;
3582	if (done == NULL)
3583		wakeup(bp);
3584	mtx_unlock(mtxp);
3585	if (done != NULL)
3586		done(bp);
3587	if (transient) {
3588		pmap_qremove(start, OFF_TO_IDX(end - start));
3589		vmem_free(transient_arena, start, end - start);
3590		atomic_add_int(&inflight_transient_maps, -1);
3591	}
3592}
3593
3594/*
3595 * Wait for a BIO to finish.
3596 *
3597 * XXX: resort to a timeout for now.  The optimal locking (if any) for this
3598 * case is not yet clear.
3599 */
3600int
3601biowait(struct bio *bp, const char *wchan)
3602{
3603	struct mtx *mtxp;
3604
3605	mtxp = mtx_pool_find(mtxpool_sleep, bp);
3606	mtx_lock(mtxp);
3607	while ((bp->bio_flags & BIO_DONE) == 0)
3608		msleep(bp, mtxp, PRIBIO, wchan, hz / 10);
3609	mtx_unlock(mtxp);
3610	if (bp->bio_error != 0)
3611		return (bp->bio_error);
3612	if (!(bp->bio_flags & BIO_ERROR))
3613		return (0);
3614	return (EIO);
3615}
3616
3617void
3618biofinish(struct bio *bp, struct devstat *stat, int error)
3619{
3620
3621	if (error) {
3622		bp->bio_error = error;
3623		bp->bio_flags |= BIO_ERROR;
3624	}
3625	if (stat != NULL)
3626		devstat_end_transaction_bio(stat, bp);
3627	biodone(bp);
3628}
3629
3630/*
3631 *	bufwait:
3632 *
3633 *	Wait for buffer I/O completion, returning error status.  The buffer
3634 *	is left locked and B_DONE on return.  B_EINTR is converted into an EINTR
3635 *	error and cleared.
3636 */
3637int
3638bufwait(struct buf *bp)
3639{
3640	if (bp->b_iocmd == BIO_READ)
3641		bwait(bp, PRIBIO, "biord");
3642	else
3643		bwait(bp, PRIBIO, "biowr");
3644	if (bp->b_flags & B_EINTR) {
3645		bp->b_flags &= ~B_EINTR;
3646		return (EINTR);
3647	}
3648	if (bp->b_ioflags & BIO_ERROR) {
3649		return (bp->b_error ? bp->b_error : EIO);
3650	} else {
3651		return (0);
3652	}
3653}
3654
3655 /*
3656  * Call back function from struct bio back up to struct buf.
3657  */
3658static void
3659bufdonebio(struct bio *bip)
3660{
3661	struct buf *bp;
3662
3663	bp = bip->bio_caller2;
3664	bp->b_resid = bp->b_bcount - bip->bio_completed;
3665	bp->b_resid = bip->bio_resid;	/* XXX: remove */
3666	bp->b_ioflags = bip->bio_flags;
3667	bp->b_error = bip->bio_error;
3668	if (bp->b_error)
3669		bp->b_ioflags |= BIO_ERROR;
3670	bufdone(bp);
3671	g_destroy_bio(bip);
3672}
3673
3674void
3675dev_strategy(struct cdev *dev, struct buf *bp)
3676{
3677	struct cdevsw *csw;
3678	int ref;
3679
3680	KASSERT(dev->si_refcount > 0,
3681	    ("dev_strategy on un-referenced struct cdev *(%s) %p",
3682	    devtoname(dev), dev));
3683
3684	csw = dev_refthread(dev, &ref);
3685	dev_strategy_csw(dev, csw, bp);
3686	dev_relthread(dev, ref);
3687}
3688
3689void
3690dev_strategy_csw(struct cdev *dev, struct cdevsw *csw, struct buf *bp)
3691{
3692	struct bio *bip;
3693
3694	KASSERT(bp->b_iocmd == BIO_READ || bp->b_iocmd == BIO_WRITE,
3695	    ("b_iocmd botch"));
3696	KASSERT(((dev->si_flags & SI_ETERNAL) != 0 && csw != NULL) ||
3697	    dev->si_threadcount > 0,
3698	    ("dev_strategy_csw threadcount cdev *(%s) %p", devtoname(dev),
3699	    dev));
3700	if (csw == NULL) {
3701		bp->b_error = ENXIO;
3702		bp->b_ioflags = BIO_ERROR;
3703		bufdone(bp);
3704		return;
3705	}
3706	for (;;) {
3707		bip = g_new_bio();
3708		if (bip != NULL)
3709			break;
3710		/* Try again later */
3711		tsleep(&bp, PRIBIO, "dev_strat", hz/10);
3712	}
3713	bip->bio_cmd = bp->b_iocmd;
3714	bip->bio_offset = bp->b_iooffset;
3715	bip->bio_length = bp->b_bcount;
3716	bip->bio_bcount = bp->b_bcount;	/* XXX: remove */
3717	bdata2bio(bp, bip);
3718	bip->bio_done = bufdonebio;
3719	bip->bio_caller2 = bp;
3720	bip->bio_dev = dev;
3721	(*csw->d_strategy)(bip);
3722}
3723
3724/*
3725 *	bufdone:
3726 *
3727 *	Finish I/O on a buffer, optionally calling a completion function.
3728 *	This is usually called from an interrupt so process blocking is
3729 *	not allowed.
3730 *
3731 *	biodone is also responsible for setting B_CACHE in a B_VMIO bp.
3732 *	In a non-VMIO bp, B_CACHE will be set on the next getblk()
3733 *	assuming B_INVAL is clear.
3734 *
3735 *	For the VMIO case, we set B_CACHE if the op was a read and no
3736 *	read error occured, or if the op was a write.  B_CACHE is never
3737 *	set if the buffer is invalid or otherwise uncacheable.
3738 *
3739 *	biodone does not mess with B_INVAL, allowing the I/O routine or the
3740 *	initiator to leave B_INVAL set to brelse the buffer out of existance
3741 *	in the biodone routine.
3742 */
3743void
3744bufdone(struct buf *bp)
3745{
3746	struct bufobj *dropobj;
3747	void    (*biodone)(struct buf *);
3748
3749	CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
3750	dropobj = NULL;
3751
3752	KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
3753	BUF_ASSERT_HELD(bp);
3754
3755	runningbufwakeup(bp);
3756	if (bp->b_iocmd == BIO_WRITE)
3757		dropobj = bp->b_bufobj;
3758	/* call optional completion function if requested */
3759	if (bp->b_iodone != NULL) {
3760		biodone = bp->b_iodone;
3761		bp->b_iodone = NULL;
3762		(*biodone) (bp);
3763		if (dropobj)
3764			bufobj_wdrop(dropobj);
3765		return;
3766	}
3767
3768	bufdone_finish(bp);
3769
3770	if (dropobj)
3771		bufobj_wdrop(dropobj);
3772}
3773
3774void
3775bufdone_finish(struct buf *bp)
3776{
3777	BUF_ASSERT_HELD(bp);
3778
3779	if (!LIST_EMPTY(&bp->b_dep))
3780		buf_complete(bp);
3781
3782	if (bp->b_flags & B_VMIO) {
3783		vm_ooffset_t foff;
3784		vm_page_t m;
3785		vm_object_t obj;
3786		struct vnode *vp;
3787		int bogus, i, iosize;
3788
3789		obj = bp->b_bufobj->bo_object;
3790		KASSERT(obj->paging_in_progress >= bp->b_npages,
3791		    ("biodone_finish: paging in progress(%d) < b_npages(%d)",
3792		    obj->paging_in_progress, bp->b_npages));
3793
3794		vp = bp->b_vp;
3795		KASSERT(vp->v_holdcnt > 0,
3796		    ("biodone_finish: vnode %p has zero hold count", vp));
3797		KASSERT(vp->v_object != NULL,
3798		    ("biodone_finish: vnode %p has no vm_object", vp));
3799
3800		foff = bp->b_offset;
3801		KASSERT(bp->b_offset != NOOFFSET,
3802		    ("biodone_finish: bp %p has no buffer offset", bp));
3803
3804		/*
3805		 * Set B_CACHE if the op was a normal read and no error
3806		 * occured.  B_CACHE is set for writes in the b*write()
3807		 * routines.
3808		 */
3809		iosize = bp->b_bcount - bp->b_resid;
3810		if (bp->b_iocmd == BIO_READ &&
3811		    !(bp->b_flags & (B_INVAL|B_NOCACHE)) &&
3812		    !(bp->b_ioflags & BIO_ERROR)) {
3813			bp->b_flags |= B_CACHE;
3814		}
3815		bogus = 0;
3816		VM_OBJECT_WLOCK(obj);
3817		for (i = 0; i < bp->b_npages; i++) {
3818			int bogusflag = 0;
3819			int resid;
3820
3821			resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
3822			if (resid > iosize)
3823				resid = iosize;
3824
3825			/*
3826			 * cleanup bogus pages, restoring the originals
3827			 */
3828			m = bp->b_pages[i];
3829			if (m == bogus_page) {
3830				bogus = bogusflag = 1;
3831				m = vm_page_lookup(obj, OFF_TO_IDX(foff));
3832				if (m == NULL)
3833					panic("biodone: page disappeared!");
3834				bp->b_pages[i] = m;
3835			}
3836			KASSERT(OFF_TO_IDX(foff) == m->pindex,
3837			    ("biodone_finish: foff(%jd)/pindex(%ju) mismatch",
3838			    (intmax_t)foff, (uintmax_t)m->pindex));
3839
3840			/*
3841			 * In the write case, the valid and clean bits are
3842			 * already changed correctly ( see bdwrite() ), so we
3843			 * only need to do this here in the read case.
3844			 */
3845			if ((bp->b_iocmd == BIO_READ) && !bogusflag && resid > 0) {
3846				KASSERT((m->dirty & vm_page_bits(foff &
3847				    PAGE_MASK, resid)) == 0, ("bufdone_finish:"
3848				    " page %p has unexpected dirty bits", m));
3849				vfs_page_set_valid(bp, foff, m);
3850			}
3851
3852			vm_page_io_finish(m);
3853			vm_object_pip_subtract(obj, 1);
3854			foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
3855			iosize -= resid;
3856		}
3857		vm_object_pip_wakeupn(obj, 0);
3858		VM_OBJECT_WUNLOCK(obj);
3859		if (bogus && (bp->b_flags & B_UNMAPPED) == 0) {
3860			BUF_CHECK_MAPPED(bp);
3861			pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
3862			    bp->b_pages, bp->b_npages);
3863		}
3864	}
3865
3866	/*
3867	 * For asynchronous completions, release the buffer now. The brelse
3868	 * will do a wakeup there if necessary - so no need to do a wakeup
3869	 * here in the async case. The sync case always needs to do a wakeup.
3870	 */
3871
3872	if (bp->b_flags & B_ASYNC) {
3873		if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR))
3874			brelse(bp);
3875		else
3876			bqrelse(bp);
3877	} else
3878		bdone(bp);
3879}
3880
3881/*
3882 * This routine is called in lieu of iodone in the case of
3883 * incomplete I/O.  This keeps the busy status for pages
3884 * consistant.
3885 */
3886void
3887vfs_unbusy_pages(struct buf *bp)
3888{
3889	int i;
3890	vm_object_t obj;
3891	vm_page_t m;
3892
3893	runningbufwakeup(bp);
3894	if (!(bp->b_flags & B_VMIO))
3895		return;
3896
3897	obj = bp->b_bufobj->bo_object;
3898	VM_OBJECT_WLOCK(obj);
3899	for (i = 0; i < bp->b_npages; i++) {
3900		m = bp->b_pages[i];
3901		if (m == bogus_page) {
3902			m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
3903			if (!m)
3904				panic("vfs_unbusy_pages: page missing\n");
3905			bp->b_pages[i] = m;
3906			if ((bp->b_flags & B_UNMAPPED) == 0) {
3907				BUF_CHECK_MAPPED(bp);
3908				pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
3909				    bp->b_pages, bp->b_npages);
3910			} else
3911				BUF_CHECK_UNMAPPED(bp);
3912		}
3913		vm_object_pip_subtract(obj, 1);
3914		vm_page_io_finish(m);
3915	}
3916	vm_object_pip_wakeupn(obj, 0);
3917	VM_OBJECT_WUNLOCK(obj);
3918}
3919
3920/*
3921 * vfs_page_set_valid:
3922 *
3923 *	Set the valid bits in a page based on the supplied offset.   The
3924 *	range is restricted to the buffer's size.
3925 *
3926 *	This routine is typically called after a read completes.
3927 */
3928static void
3929vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m)
3930{
3931	vm_ooffset_t eoff;
3932
3933	/*
3934	 * Compute the end offset, eoff, such that [off, eoff) does not span a
3935	 * page boundary and eoff is not greater than the end of the buffer.
3936	 * The end of the buffer, in this case, is our file EOF, not the
3937	 * allocation size of the buffer.
3938	 */
3939	eoff = (off + PAGE_SIZE) & ~(vm_ooffset_t)PAGE_MASK;
3940	if (eoff > bp->b_offset + bp->b_bcount)
3941		eoff = bp->b_offset + bp->b_bcount;
3942
3943	/*
3944	 * Set valid range.  This is typically the entire buffer and thus the
3945	 * entire page.
3946	 */
3947	if (eoff > off)
3948		vm_page_set_valid_range(m, off & PAGE_MASK, eoff - off);
3949}
3950
3951/*
3952 * vfs_page_set_validclean:
3953 *
3954 *	Set the valid bits and clear the dirty bits in a page based on the
3955 *	supplied offset.   The range is restricted to the buffer's size.
3956 */
3957static void
3958vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m)
3959{
3960	vm_ooffset_t soff, eoff;
3961
3962	/*
3963	 * Start and end offsets in buffer.  eoff - soff may not cross a
3964	 * page boundry or cross the end of the buffer.  The end of the
3965	 * buffer, in this case, is our file EOF, not the allocation size
3966	 * of the buffer.
3967	 */
3968	soff = off;
3969	eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK;
3970	if (eoff > bp->b_offset + bp->b_bcount)
3971		eoff = bp->b_offset + bp->b_bcount;
3972
3973	/*
3974	 * Set valid range.  This is typically the entire buffer and thus the
3975	 * entire page.
3976	 */
3977	if (eoff > soff) {
3978		vm_page_set_validclean(
3979		    m,
3980		   (vm_offset_t) (soff & PAGE_MASK),
3981		   (vm_offset_t) (eoff - soff)
3982		);
3983	}
3984}
3985
3986/*
3987 * Ensure that all buffer pages are not busied by VPO_BUSY flag. If
3988 * any page is busy, drain the flag.
3989 */
3990static void
3991vfs_drain_busy_pages(struct buf *bp)
3992{
3993	vm_page_t m;
3994	int i, last_busied;
3995
3996	VM_OBJECT_ASSERT_WLOCKED(bp->b_bufobj->bo_object);
3997	last_busied = 0;
3998	for (i = 0; i < bp->b_npages; i++) {
3999		m = bp->b_pages[i];
4000		if ((m->oflags & VPO_BUSY) != 0) {
4001			for (; last_busied < i; last_busied++)
4002				vm_page_busy(bp->b_pages[last_busied]);
4003			while ((m->oflags & VPO_BUSY) != 0)
4004				vm_page_sleep(m, "vbpage");
4005		}
4006	}
4007	for (i = 0; i < last_busied; i++)
4008		vm_page_wakeup(bp->b_pages[i]);
4009}
4010
4011/*
4012 * This routine is called before a device strategy routine.
4013 * It is used to tell the VM system that paging I/O is in
4014 * progress, and treat the pages associated with the buffer
4015 * almost as being VPO_BUSY.  Also the object paging_in_progress
4016 * flag is handled to make sure that the object doesn't become
4017 * inconsistant.
4018 *
4019 * Since I/O has not been initiated yet, certain buffer flags
4020 * such as BIO_ERROR or B_INVAL may be in an inconsistant state
4021 * and should be ignored.
4022 */
4023void
4024vfs_busy_pages(struct buf *bp, int clear_modify)
4025{
4026	int i, bogus;
4027	vm_object_t obj;
4028	vm_ooffset_t foff;
4029	vm_page_t m;
4030
4031	if (!(bp->b_flags & B_VMIO))
4032		return;
4033
4034	obj = bp->b_bufobj->bo_object;
4035	foff = bp->b_offset;
4036	KASSERT(bp->b_offset != NOOFFSET,
4037	    ("vfs_busy_pages: no buffer offset"));
4038	VM_OBJECT_WLOCK(obj);
4039	vfs_drain_busy_pages(bp);
4040	if (bp->b_bufsize != 0)
4041		vfs_setdirty_locked_object(bp);
4042	bogus = 0;
4043	for (i = 0; i < bp->b_npages; i++) {
4044		m = bp->b_pages[i];
4045
4046		if ((bp->b_flags & B_CLUSTER) == 0) {
4047			vm_object_pip_add(obj, 1);
4048			vm_page_io_start(m);
4049		}
4050		/*
4051		 * When readying a buffer for a read ( i.e
4052		 * clear_modify == 0 ), it is important to do
4053		 * bogus_page replacement for valid pages in
4054		 * partially instantiated buffers.  Partially
4055		 * instantiated buffers can, in turn, occur when
4056		 * reconstituting a buffer from its VM backing store
4057		 * base.  We only have to do this if B_CACHE is
4058		 * clear ( which causes the I/O to occur in the
4059		 * first place ).  The replacement prevents the read
4060		 * I/O from overwriting potentially dirty VM-backed
4061		 * pages.  XXX bogus page replacement is, uh, bogus.
4062		 * It may not work properly with small-block devices.
4063		 * We need to find a better way.
4064		 */
4065		if (clear_modify) {
4066			pmap_remove_write(m);
4067			vfs_page_set_validclean(bp, foff, m);
4068		} else if (m->valid == VM_PAGE_BITS_ALL &&
4069		    (bp->b_flags & B_CACHE) == 0) {
4070			bp->b_pages[i] = bogus_page;
4071			bogus++;
4072		}
4073		foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
4074	}
4075	VM_OBJECT_WUNLOCK(obj);
4076	if (bogus && (bp->b_flags & B_UNMAPPED) == 0) {
4077		BUF_CHECK_MAPPED(bp);
4078		pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
4079		    bp->b_pages, bp->b_npages);
4080	}
4081}
4082
4083/*
4084 *	vfs_bio_set_valid:
4085 *
4086 *	Set the range within the buffer to valid.  The range is
4087 *	relative to the beginning of the buffer, b_offset.  Note that
4088 *	b_offset itself may be offset from the beginning of the first
4089 *	page.
4090 */
4091void
4092vfs_bio_set_valid(struct buf *bp, int base, int size)
4093{
4094	int i, n;
4095	vm_page_t m;
4096
4097	if (!(bp->b_flags & B_VMIO))
4098		return;
4099
4100	/*
4101	 * Fixup base to be relative to beginning of first page.
4102	 * Set initial n to be the maximum number of bytes in the
4103	 * first page that can be validated.
4104	 */
4105	base += (bp->b_offset & PAGE_MASK);
4106	n = PAGE_SIZE - (base & PAGE_MASK);
4107
4108	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
4109	for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
4110		m = bp->b_pages[i];
4111		if (n > size)
4112			n = size;
4113		vm_page_set_valid_range(m, base & PAGE_MASK, n);
4114		base += n;
4115		size -= n;
4116		n = PAGE_SIZE;
4117	}
4118	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
4119}
4120
4121/*
4122 *	vfs_bio_clrbuf:
4123 *
4124 *	If the specified buffer is a non-VMIO buffer, clear the entire
4125 *	buffer.  If the specified buffer is a VMIO buffer, clear and
4126 *	validate only the previously invalid portions of the buffer.
4127 *	This routine essentially fakes an I/O, so we need to clear
4128 *	BIO_ERROR and B_INVAL.
4129 *
4130 *	Note that while we only theoretically need to clear through b_bcount,
4131 *	we go ahead and clear through b_bufsize.
4132 */
4133void
4134vfs_bio_clrbuf(struct buf *bp)
4135{
4136	int i, j, mask, sa, ea, slide;
4137
4138	if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) {
4139		clrbuf(bp);
4140		return;
4141	}
4142	bp->b_flags &= ~B_INVAL;
4143	bp->b_ioflags &= ~BIO_ERROR;
4144	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
4145	if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
4146	    (bp->b_offset & PAGE_MASK) == 0) {
4147		if (bp->b_pages[0] == bogus_page)
4148			goto unlock;
4149		mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
4150		VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[0]->object);
4151		if ((bp->b_pages[0]->valid & mask) == mask)
4152			goto unlock;
4153		if ((bp->b_pages[0]->valid & mask) == 0) {
4154			pmap_zero_page_area(bp->b_pages[0], 0, bp->b_bufsize);
4155			bp->b_pages[0]->valid |= mask;
4156			goto unlock;
4157		}
4158	}
4159	sa = bp->b_offset & PAGE_MASK;
4160	slide = 0;
4161	for (i = 0; i < bp->b_npages; i++, sa = 0) {
4162		slide = imin(slide + PAGE_SIZE, bp->b_offset + bp->b_bufsize);
4163		ea = slide & PAGE_MASK;
4164		if (ea == 0)
4165			ea = PAGE_SIZE;
4166		if (bp->b_pages[i] == bogus_page)
4167			continue;
4168		j = sa / DEV_BSIZE;
4169		mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
4170		VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[i]->object);
4171		if ((bp->b_pages[i]->valid & mask) == mask)
4172			continue;
4173		if ((bp->b_pages[i]->valid & mask) == 0)
4174			pmap_zero_page_area(bp->b_pages[i], sa, ea - sa);
4175		else {
4176			for (; sa < ea; sa += DEV_BSIZE, j++) {
4177				if ((bp->b_pages[i]->valid & (1 << j)) == 0) {
4178					pmap_zero_page_area(bp->b_pages[i],
4179					    sa, DEV_BSIZE);
4180				}
4181			}
4182		}
4183		bp->b_pages[i]->valid |= mask;
4184	}
4185unlock:
4186	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
4187	bp->b_resid = 0;
4188}
4189
4190void
4191vfs_bio_bzero_buf(struct buf *bp, int base, int size)
4192{
4193	vm_page_t m;
4194	int i, n;
4195
4196	if ((bp->b_flags & B_UNMAPPED) == 0) {
4197		BUF_CHECK_MAPPED(bp);
4198		bzero(bp->b_data + base, size);
4199	} else {
4200		BUF_CHECK_UNMAPPED(bp);
4201		n = PAGE_SIZE - (base & PAGE_MASK);
4202		for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
4203			m = bp->b_pages[i];
4204			if (n > size)
4205				n = size;
4206			pmap_zero_page_area(m, base & PAGE_MASK, n);
4207			base += n;
4208			size -= n;
4209			n = PAGE_SIZE;
4210		}
4211	}
4212}
4213
4214/*
4215 * vm_hold_load_pages and vm_hold_free_pages get pages into
4216 * a buffers address space.  The pages are anonymous and are
4217 * not associated with a file object.
4218 */
4219static void
4220vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to)
4221{
4222	vm_offset_t pg;
4223	vm_page_t p;
4224	int index;
4225
4226	BUF_CHECK_MAPPED(bp);
4227
4228	to = round_page(to);
4229	from = round_page(from);
4230	index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
4231
4232	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
4233tryagain:
4234		/*
4235		 * note: must allocate system pages since blocking here
4236		 * could interfere with paging I/O, no matter which
4237		 * process we are.
4238		 */
4239		p = vm_page_alloc(NULL, 0, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ |
4240		    VM_ALLOC_WIRED | VM_ALLOC_COUNT((to - pg) >> PAGE_SHIFT));
4241		if (p == NULL) {
4242			VM_WAIT;
4243			goto tryagain;
4244		}
4245		pmap_qenter(pg, &p, 1);
4246		bp->b_pages[index] = p;
4247	}
4248	bp->b_npages = index;
4249}
4250
4251/* Return pages associated with this buf to the vm system */
4252static void
4253vm_hold_free_pages(struct buf *bp, int newbsize)
4254{
4255	vm_offset_t from;
4256	vm_page_t p;
4257	int index, newnpages;
4258
4259	BUF_CHECK_MAPPED(bp);
4260
4261	from = round_page((vm_offset_t)bp->b_data + newbsize);
4262	newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
4263	if (bp->b_npages > newnpages)
4264		pmap_qremove(from, bp->b_npages - newnpages);
4265	for (index = newnpages; index < bp->b_npages; index++) {
4266		p = bp->b_pages[index];
4267		bp->b_pages[index] = NULL;
4268		if (p->busy != 0)
4269			printf("vm_hold_free_pages: blkno: %jd, lblkno: %jd\n",
4270			    (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno);
4271		p->wire_count--;
4272		vm_page_free(p);
4273		atomic_subtract_int(&cnt.v_wire_count, 1);
4274	}
4275	bp->b_npages = newnpages;
4276}
4277
4278/*
4279 * Map an IO request into kernel virtual address space.
4280 *
4281 * All requests are (re)mapped into kernel VA space.
4282 * Notice that we use b_bufsize for the size of the buffer
4283 * to be mapped.  b_bcount might be modified by the driver.
4284 *
4285 * Note that even if the caller determines that the address space should
4286 * be valid, a race or a smaller-file mapped into a larger space may
4287 * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST
4288 * check the return value.
4289 */
4290int
4291vmapbuf(struct buf *bp, int mapbuf)
4292{
4293	caddr_t kva;
4294	vm_prot_t prot;
4295	int pidx;
4296
4297	if (bp->b_bufsize < 0)
4298		return (-1);
4299	prot = VM_PROT_READ;
4300	if (bp->b_iocmd == BIO_READ)
4301		prot |= VM_PROT_WRITE;	/* Less backwards than it looks */
4302	if ((pidx = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
4303	    (vm_offset_t)bp->b_data, bp->b_bufsize, prot, bp->b_pages,
4304	    btoc(MAXPHYS))) < 0)
4305		return (-1);
4306	bp->b_npages = pidx;
4307	if (mapbuf || !unmapped_buf_allowed) {
4308		pmap_qenter((vm_offset_t)bp->b_saveaddr, bp->b_pages, pidx);
4309		kva = bp->b_saveaddr;
4310		bp->b_saveaddr = bp->b_data;
4311		bp->b_data = kva + (((vm_offset_t)bp->b_data) & PAGE_MASK);
4312		bp->b_flags &= ~B_UNMAPPED;
4313	} else {
4314		bp->b_flags |= B_UNMAPPED;
4315		bp->b_offset = ((vm_offset_t)bp->b_data) & PAGE_MASK;
4316		bp->b_saveaddr = bp->b_data;
4317		bp->b_data = unmapped_buf;
4318	}
4319	return(0);
4320}
4321
4322/*
4323 * Free the io map PTEs associated with this IO operation.
4324 * We also invalidate the TLB entries and restore the original b_addr.
4325 */
4326void
4327vunmapbuf(struct buf *bp)
4328{
4329	int npages;
4330
4331	npages = bp->b_npages;
4332	if (bp->b_flags & B_UNMAPPED)
4333		bp->b_flags &= ~B_UNMAPPED;
4334	else
4335		pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages);
4336	vm_page_unhold_pages(bp->b_pages, npages);
4337
4338	bp->b_data = bp->b_saveaddr;
4339}
4340
4341void
4342bdone(struct buf *bp)
4343{
4344	struct mtx *mtxp;
4345
4346	mtxp = mtx_pool_find(mtxpool_sleep, bp);
4347	mtx_lock(mtxp);
4348	bp->b_flags |= B_DONE;
4349	wakeup(bp);
4350	mtx_unlock(mtxp);
4351}
4352
4353void
4354bwait(struct buf *bp, u_char pri, const char *wchan)
4355{
4356	struct mtx *mtxp;
4357
4358	mtxp = mtx_pool_find(mtxpool_sleep, bp);
4359	mtx_lock(mtxp);
4360	while ((bp->b_flags & B_DONE) == 0)
4361		msleep(bp, mtxp, pri, wchan, 0);
4362	mtx_unlock(mtxp);
4363}
4364
4365int
4366bufsync(struct bufobj *bo, int waitfor)
4367{
4368
4369	return (VOP_FSYNC(bo->__bo_vnode, waitfor, curthread));
4370}
4371
4372void
4373bufstrategy(struct bufobj *bo, struct buf *bp)
4374{
4375	int i = 0;
4376	struct vnode *vp;
4377
4378	vp = bp->b_vp;
4379	KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy"));
4380	KASSERT(vp->v_type != VCHR && vp->v_type != VBLK,
4381	    ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp));
4382	i = VOP_STRATEGY(vp, bp);
4383	KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp));
4384}
4385
4386void
4387bufobj_wrefl(struct bufobj *bo)
4388{
4389
4390	KASSERT(bo != NULL, ("NULL bo in bufobj_wref"));
4391	ASSERT_BO_WLOCKED(bo);
4392	bo->bo_numoutput++;
4393}
4394
4395void
4396bufobj_wref(struct bufobj *bo)
4397{
4398
4399	KASSERT(bo != NULL, ("NULL bo in bufobj_wref"));
4400	BO_LOCK(bo);
4401	bo->bo_numoutput++;
4402	BO_UNLOCK(bo);
4403}
4404
4405void
4406bufobj_wdrop(struct bufobj *bo)
4407{
4408
4409	KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop"));
4410	BO_LOCK(bo);
4411	KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count"));
4412	if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) {
4413		bo->bo_flag &= ~BO_WWAIT;
4414		wakeup(&bo->bo_numoutput);
4415	}
4416	BO_UNLOCK(bo);
4417}
4418
4419int
4420bufobj_wwait(struct bufobj *bo, int slpflag, int timeo)
4421{
4422	int error;
4423
4424	KASSERT(bo != NULL, ("NULL bo in bufobj_wwait"));
4425	ASSERT_BO_WLOCKED(bo);
4426	error = 0;
4427	while (bo->bo_numoutput) {
4428		bo->bo_flag |= BO_WWAIT;
4429		error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo),
4430		    slpflag | (PRIBIO + 1), "bo_wwait", timeo);
4431		if (error)
4432			break;
4433	}
4434	return (error);
4435}
4436
4437void
4438bpin(struct buf *bp)
4439{
4440	struct mtx *mtxp;
4441
4442	mtxp = mtx_pool_find(mtxpool_sleep, bp);
4443	mtx_lock(mtxp);
4444	bp->b_pin_count++;
4445	mtx_unlock(mtxp);
4446}
4447
4448void
4449bunpin(struct buf *bp)
4450{
4451	struct mtx *mtxp;
4452
4453	mtxp = mtx_pool_find(mtxpool_sleep, bp);
4454	mtx_lock(mtxp);
4455	if (--bp->b_pin_count == 0)
4456		wakeup(bp);
4457	mtx_unlock(mtxp);
4458}
4459
4460void
4461bunpin_wait(struct buf *bp)
4462{
4463	struct mtx *mtxp;
4464
4465	mtxp = mtx_pool_find(mtxpool_sleep, bp);
4466	mtx_lock(mtxp);
4467	while (bp->b_pin_count > 0)
4468		msleep(bp, mtxp, PRIBIO, "bwunpin", 0);
4469	mtx_unlock(mtxp);
4470}
4471
4472/*
4473 * Set bio_data or bio_ma for struct bio from the struct buf.
4474 */
4475void
4476bdata2bio(struct buf *bp, struct bio *bip)
4477{
4478
4479	if ((bp->b_flags & B_UNMAPPED) != 0) {
4480		KASSERT(unmapped_buf_allowed, ("unmapped"));
4481		bip->bio_ma = bp->b_pages;
4482		bip->bio_ma_n = bp->b_npages;
4483		bip->bio_data = unmapped_buf;
4484		bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK;
4485		bip->bio_flags |= BIO_UNMAPPED;
4486		KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) /
4487		    PAGE_SIZE == bp->b_npages,
4488		    ("Buffer %p too short: %d %d %d", bp, bip->bio_ma_offset,
4489		    bip->bio_length, bip->bio_ma_n));
4490	} else {
4491		bip->bio_data = bp->b_data;
4492		bip->bio_ma = NULL;
4493	}
4494}
4495
4496#include "opt_ddb.h"
4497#ifdef DDB
4498#include <ddb/ddb.h>
4499
4500/* DDB command to show buffer data */
4501DB_SHOW_COMMAND(buffer, db_show_buffer)
4502{
4503	/* get args */
4504	struct buf *bp = (struct buf *)addr;
4505
4506	if (!have_addr) {
4507		db_printf("usage: show buffer <addr>\n");
4508		return;
4509	}
4510
4511	db_printf("buf at %p\n", bp);
4512	db_printf("b_flags = 0x%b, b_xflags=0x%b, b_vflags=0x%b\n",
4513	    (u_int)bp->b_flags, PRINT_BUF_FLAGS, (u_int)bp->b_xflags,
4514	    PRINT_BUF_XFLAGS, (u_int)bp->b_vflags, PRINT_BUF_VFLAGS);
4515	db_printf(
4516	    "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n"
4517	    "b_bufobj = (%p), b_data = %p, b_blkno = %jd, b_lblkno = %jd, "
4518	    "b_dep = %p\n",
4519	    bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
4520	    bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno,
4521	    (intmax_t)bp->b_lblkno, bp->b_dep.lh_first);
4522	if (bp->b_npages) {
4523		int i;
4524		db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
4525		for (i = 0; i < bp->b_npages; i++) {
4526			vm_page_t m;
4527			m = bp->b_pages[i];
4528			db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object,
4529			    (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m));
4530			if ((i + 1) < bp->b_npages)
4531				db_printf(",");
4532		}
4533		db_printf("\n");
4534	}
4535	db_printf(" ");
4536	BUF_LOCKPRINTINFO(bp);
4537}
4538
4539DB_SHOW_COMMAND(lockedbufs, lockedbufs)
4540{
4541	struct buf *bp;
4542	int i;
4543
4544	for (i = 0; i < nbuf; i++) {
4545		bp = &buf[i];
4546		if (BUF_ISLOCKED(bp)) {
4547			db_show_buffer((uintptr_t)bp, 1, 0, NULL);
4548			db_printf("\n");
4549		}
4550	}
4551}
4552
4553DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs)
4554{
4555	struct vnode *vp;
4556	struct buf *bp;
4557
4558	if (!have_addr) {
4559		db_printf("usage: show vnodebufs <addr>\n");
4560		return;
4561	}
4562	vp = (struct vnode *)addr;
4563	db_printf("Clean buffers:\n");
4564	TAILQ_FOREACH(bp, &vp->v_bufobj.bo_clean.bv_hd, b_bobufs) {
4565		db_show_buffer((uintptr_t)bp, 1, 0, NULL);
4566		db_printf("\n");
4567	}
4568	db_printf("Dirty buffers:\n");
4569	TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) {
4570		db_show_buffer((uintptr_t)bp, 1, 0, NULL);
4571		db_printf("\n");
4572	}
4573}
4574
4575DB_COMMAND(countfreebufs, db_coundfreebufs)
4576{
4577	struct buf *bp;
4578	int i, used = 0, nfree = 0;
4579
4580	if (have_addr) {
4581		db_printf("usage: countfreebufs\n");
4582		return;
4583	}
4584
4585	for (i = 0; i < nbuf; i++) {
4586		bp = &buf[i];
4587		if ((bp->b_flags & B_INFREECNT) != 0)
4588			nfree++;
4589		else
4590			used++;
4591	}
4592
4593	db_printf("Counted %d free, %d used (%d tot)\n", nfree, used,
4594	    nfree + used);
4595	db_printf("numfreebuffers is %d\n", numfreebuffers);
4596}
4597#endif /* DDB */
4598