vfs_bio.c revision 282933
/*-
 * Copyright (c) 2004 Poul-Henning Kamp
 * Copyright (c) 1994,1997 John S. Dyson
 * Copyright (c) 2013 The FreeBSD Foundation
 * All rights reserved.
 *
 * Portions of this software were developed by Konstantin Belousov
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * this file contains a new buffer I/O scheme implementing a coherent
 * VM object and buffer cache scheme.  Pains have been taken to make
 * sure that the performance degradation associated with schemes such
 * as this is not realized.
 *
 * Author:  John S. Dyson
 * Significant help during the development and debugging phases
 * had been provided by David Greenman, also of the FreeBSD core team.
 *
 * see man buf(9) for more info.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_bio.c 282933 2015-05-14 22:50:07Z rmacklem $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/conf.h>
#include <sys/buf.h>
#include <sys/devicestat.h>
#include <sys/eventhandler.h>
#include <sys/fail.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sysctl.h>
#include <sys/vmem.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <geom/geom.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>
#include "opt_compat.h"
#include "opt_swap.h"

static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer");

struct	bio_ops bioops;		/* I/O operation notification */

struct	buf_ops buf_ops_bio = {
	.bop_name	=	"buf_ops_bio",
	.bop_write	=	bufwrite,
	.bop_strategy	=	bufstrategy,
	.bop_sync	=	bufsync,
	.bop_bdflush	=	bufbdflush,
};

/*
 * XXX buf is global because kern_shutdown.c and ffs_checkoverlap have
 * carnal knowledge of buffers.  This knowledge should be moved to vfs_bio.c.
 */
struct buf *buf;		/* buffer header pool */
caddr_t unmapped_buf;

/* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */
struct proc *bufdaemonproc;

static int inmem(struct vnode *vp, daddr_t blkno);
static void vm_hold_free_pages(struct buf *bp, int newbsize);
static void vm_hold_load_pages(struct buf *bp, vm_offset_t from,
		vm_offset_t to);
static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m);
static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off,
		vm_page_t m);
static void vfs_clean_pages_dirty_buf(struct buf *bp);
static void vfs_setdirty_locked_object(struct buf *bp);
static void vfs_vmio_release(struct buf *bp);
static int vfs_bio_clcheck(struct vnode *vp, int size,
		daddr_t lblkno, daddr_t blkno);
static int buf_flush(struct vnode *vp, int);
static int flushbufqueues(struct vnode *, int, int);
static void buf_daemon(void);
static void bremfreel(struct buf *bp);
static __inline void bd_wakeup(void);
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
    defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
#endif

int vmiodirenable = TRUE;
SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0,
    "Use the VM system for directory writes");
long runningbufspace;
SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0,
    "Amount of presently outstanding async buffer io");
static long bufspace;
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
    defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD,
    &bufspace, 0, sysctl_bufspace, "L", "Virtual memory used for buffers");
#else
SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0,
    "Virtual memory used for buffers");
#endif
static long unmapped_bufspace;
SYSCTL_LONG(_vfs, OID_AUTO, unmapped_bufspace, CTLFLAG_RD,
    &unmapped_bufspace, 0,
    "Amount of unmapped buffers, inclusive in the bufspace");
static long maxbufspace;
SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0,
    "Maximum allowed value of bufspace (including buf_daemon)");
static long bufmallocspace;
SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
    "Amount of malloced memory for buffers");
static long maxbufmallocspace;
SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0,
    "Maximum amount of malloced memory for buffers");
static long lobufspace;
SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0,
    "Minimum amount of buffers we want to have");
long hibufspace;
SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0,
    "Maximum allowed value of bufspace (excluding buf_daemon)");
static int bufreusecnt;
SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW, &bufreusecnt, 0,
    "Number of times we have reused a buffer");
static int buffreekvacnt;
SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0,
    "Number of times we have freed the KVA space from some buffer");
static int bufdefragcnt;
SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0,
    "Number of times we have had to repeat buffer allocation to defragment");
static long lorunningspace;
SYSCTL_LONG(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW, &lorunningspace, 0,
    "Minimum preferred space used for in-progress I/O");
static long hirunningspace;
SYSCTL_LONG(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW, &hirunningspace, 0,
    "Maximum amount of space to use for in-progress I/O");
int dirtybufferflushes;
SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes,
    0, "Number of bdwrite to bawrite conversions to limit dirty buffers");
int bdwriteskip;
SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip,
    0, "Number of buffers supplied to bdwrite with snapshot deadlock risk");
int altbufferflushes;
SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes,
    0, "Number of fsync flushes to limit dirty buffers");
static int recursiveflushes;
SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes,
    0, "Number of flushes skipped due to being recursive");
static int numdirtybuffers;
SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0,
    "Number of buffers that are dirty (has unwritten changes) at the moment");
static int lodirtybuffers;
SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0,
    "How many buffers we want to have free before bufdaemon can sleep");
static int hidirtybuffers;
SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0,
    "When the number of dirty buffers is considered severe");
int dirtybufthresh;
SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh,
    0, "Number of bdwrite to bawrite conversions to clear dirty buffers");
static int numfreebuffers;
SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0,
    "Number of free buffers");
static int lofreebuffers;
SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0,
   "XXX Unused");
static int hifreebuffers;
SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0,
   "XXX Complicatedly unused");
static int getnewbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0,
   "Number of calls to getnewbuf");
static int getnewbufrestarts;
SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0,
    "Number of times getnewbuf has had to restart a buffer acquisition");
static int mappingrestarts;
SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0,
    "Number of times getblk has had to restart a buffer mapping for "
    "unmapped buffer");
static int flushbufqtarget = 100;
SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
    "Amount of work to do in flushbufqueues when helping bufdaemon");
static long notbufdflushes;
SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes, 0,
    "Number of dirty buffer flushes done by the bufdaemon helpers");
static long barrierwrites;
SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0,
    "Number of barrier writes");
SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD,
    &unmapped_buf_allowed, 0,
    "Permit the use of unmapped I/O");

/*
 * Lock for the non-dirty bufqueues
 */
static struct mtx_padalign bqclean;

/*
 * Lock for the dirty queue.
 */
static struct mtx_padalign bqdirty;

/*
 * This lock synchronizes access to bd_request.
 */
static struct mtx_padalign bdlock;

/*
 * This lock protects the runningbufreq and synchronizes runningbufwakeup and
 * waitrunningbufspace().
 */
static struct mtx_padalign rbreqlock;

/*
 * Lock that protects needsbuffer and the sleeps/wakeups surrounding it.
 */
static struct rwlock_padalign nblock;

/*
 * Lock that protects bdirtywait.
 */
static struct mtx_padalign bdirtylock;

/*
 * Wakeup point for bufdaemon, as well as indicator of whether it is already
 * active.  Set to 1 when the bufdaemon is already "on" the queue, 0 when it
 * is idling.
 */
static int bd_request;

/*
 * Request for the buf daemon to write more buffers than is indicated by
 * lodirtybuffers.  This may be necessary to push out excess dependencies or
 * defragment the address space where a simple count of the number of dirty
 * buffers is insufficient to characterize the demand for flushing them.
 */
static int bd_speedupreq;

/*
 * bogus page -- for I/O to/from partially complete buffers
 * this is a temporary solution to the problem, but it is not
 * really that bad.  it would be better to split the buffer
 * for input in the case of buffers partially already in memory,
 * but the code is intricate enough already.
 */
vm_page_t bogus_page;

/*
 * Synchronization (sleep/wakeup) variable for active buffer space requests.
 * Set when wait starts, cleared prior to wakeup().
 * Used in runningbufwakeup() and waitrunningbufspace().
 */
static int runningbufreq;

/*
 * Synchronization (sleep/wakeup) variable for buffer requests.
 * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
 * by and/or.
 * Used in numdirtywakeup(), bufspacewakeup(), bufcountadd(), bwillwrite(),
 * getnewbuf(), and getblk().
 */
static volatile int needsbuffer;

/*
 * Synchronization for bwillwrite() waiters.
 */
static int bdirtywait;

/*
 * Definitions for the buffer free lists.
 */
#define BUFFER_QUEUES	5	/* number of free buffer queues */

#define QUEUE_NONE	0	/* on no queue */
#define QUEUE_CLEAN	1	/* non-B_DELWRI buffers */
#define QUEUE_DIRTY	2	/* B_DELWRI buffers */
#define QUEUE_EMPTYKVA	3	/* empty buffer headers w/KVA assignment */
#define QUEUE_EMPTY	4	/* empty buffer headers */
#define QUEUE_SENTINEL	1024	/* not a queue index, but mark for sentinel */

/* Queues for free buffers with various properties */
static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
#ifdef INVARIANTS
static int bq_len[BUFFER_QUEUES];
#endif

/*
 * Single global constant for BUF_WMESG, to avoid getting multiple references.
 * buf_wmesg is referred from macros.
 */
const char *buf_wmesg = BUF_WMESG;

#define VFS_BIO_NEED_ANY	0x01	/* any freeable buffer */
#define VFS_BIO_NEED_FREE	0x04	/* wait for free bufs, hi hysteresis */
#define VFS_BIO_NEED_BUFSPACE	0x08	/* wait for buf space, lo hysteresis */

#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
    defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
static int
sysctl_bufspace(SYSCTL_HANDLER_ARGS)
{
	long lvalue;
	int ivalue;

	if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long))
		return (sysctl_handle_long(oidp, arg1, arg2, req));
	lvalue = *(long *)arg1;
	if (lvalue > INT_MAX)
		/* On overflow, still write out a long to trigger ENOMEM. */
		return (sysctl_handle_long(oidp, &lvalue, 0, req));
	ivalue = lvalue;
	return (sysctl_handle_int(oidp, &ivalue, 0, req));
}
#endif
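
/*
 * Usage sketch (illustrative; the 32-bit consumer below is assumed, not
 * part of this file): an old compat binary doing
 *
 *	int ival;
 *	size_t len = sizeof(ival);
 *	sysctlbyname("vfs.bufspace", &ival, &len, NULL, 0);
 *
 * is served by the handler above, which copies the long into an int and,
 * once bufspace exceeds INT_MAX, writes out a full long instead so that the
 * undersized request fails with ENOMEM rather than silently truncating.
 */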

/*
 *	bqlock:
 *
 *	Return the appropriate queue lock based on the index.
 */
static inline struct mtx *
bqlock(int qindex)
{

	if (qindex == QUEUE_DIRTY)
		return (struct mtx *)(&bqdirty);
	return (struct mtx *)(&bqclean);
}

/*
 *	bdirtywakeup:
 *
 *	Wakeup any bwillwrite() waiters.
 */
static void
bdirtywakeup(void)
{
	mtx_lock(&bdirtylock);
	if (bdirtywait) {
		bdirtywait = 0;
		wakeup(&bdirtywait);
	}
	mtx_unlock(&bdirtylock);
}

/*
 *	bdirtysub:
 *
 *	Decrement the numdirtybuffers count by one and wakeup any
 *	threads blocked in bwillwrite().
 */
static void
bdirtysub(void)
{

	if (atomic_fetchadd_int(&numdirtybuffers, -1) ==
	    (lodirtybuffers + hidirtybuffers) / 2)
		bdirtywakeup();
}

/*
 *	bdirtyadd:
 *
 *	Increment the numdirtybuffers count by one and wakeup the buf
 *	daemon if needed.
 */
static void
bdirtyadd(void)
{

	/*
	 * Only do the wakeup once as we cross the boundary.  The
	 * buf daemon will keep running until the condition clears.
	 */
	if (atomic_fetchadd_int(&numdirtybuffers, 1) ==
	    (lodirtybuffers + hidirtybuffers) / 2)
		bd_wakeup();
}
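
/*
 * Example (illustrative values): with lodirtybuffers = 1000 and
 * hidirtybuffers = 2000, both bdirtyadd() and bdirtysub() compare against
 * (1000 + 2000) / 2 = 1500, so the buf daemon is woken only when the dirty
 * count climbs through 1500 and bwillwrite() sleepers are woken only when
 * it drops back through 1500, rather than on every increment or decrement.
 */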

/*
 *	bufspacewakeup:
 *
 *	Called when buffer space is potentially available for recovery.
 *	getnewbuf() will block on this flag when it is unable to free
 *	sufficient buffer space.  Buffer space becomes recoverable when
 *	bp's get placed back in the queues.
 */

static __inline void
bufspacewakeup(void)
{
	int need_wakeup, on;

	/*
	 * If someone is waiting for BUF space, wake them up.  Even
	 * though we haven't freed the kva space yet, the waiting
	 * process will be able to now.
	 */
	rw_rlock(&nblock);
	for (;;) {
		need_wakeup = 0;
		on = needsbuffer;
		if ((on & VFS_BIO_NEED_BUFSPACE) == 0)
			break;
		need_wakeup = 1;
		if (atomic_cmpset_rel_int(&needsbuffer, on,
		    on & ~VFS_BIO_NEED_BUFSPACE))
			break;
	}
	if (need_wakeup)
		wakeup(__DEVOLATILE(void *, &needsbuffer));
	rw_runlock(&nblock);
}

/*
 *	runningwakeup:
 *
 *	Wake up processes that are waiting on asynchronous writes to fall
 *	below lorunningspace.
 */
static void
runningwakeup(void)
{

	mtx_lock(&rbreqlock);
	if (runningbufreq) {
		runningbufreq = 0;
		wakeup(&runningbufreq);
	}
	mtx_unlock(&rbreqlock);
}

/*
 *	runningbufwakeup:
 *
 *	Decrement the outstanding write count accordingly.
 */
void
runningbufwakeup(struct buf *bp)
{
	long space, bspace;

	bspace = bp->b_runningbufspace;
	if (bspace == 0)
		return;
	space = atomic_fetchadd_long(&runningbufspace, -bspace);
	KASSERT(space >= bspace, ("runningbufspace underflow %ld %ld",
	    space, bspace));
	bp->b_runningbufspace = 0;
	/*
	 * Only acquire the lock and wakeup on the transition from exceeding
	 * the threshold to falling below it.
	 */
	if (space < lorunningspace)
		return;
	if (space - bspace > lorunningspace)
		return;
	runningwakeup();
}

/*
 *	bufcountadd:
 *
 *	Called when a buffer has been added to one of the free queues to
 *	account for the buffer and to wakeup anyone waiting for free buffers.
 *	This typically occurs when large amounts of metadata are being handled
 *	by the buffer cache ( else buffer space runs out first, usually ).
 */
static __inline void
bufcountadd(struct buf *bp)
{
	int mask, need_wakeup, old, on;

	KASSERT((bp->b_flags & B_INFREECNT) == 0,
	    ("buf %p already counted as free", bp));
	bp->b_flags |= B_INFREECNT;
	old = atomic_fetchadd_int(&numfreebuffers, 1);
	KASSERT(old >= 0 && old < nbuf,
	    ("numfreebuffers climbed to %d", old + 1));
	mask = VFS_BIO_NEED_ANY;
	if (numfreebuffers >= hifreebuffers)
		mask |= VFS_BIO_NEED_FREE;
	rw_rlock(&nblock);
	for (;;) {
		need_wakeup = 0;
		on = needsbuffer;
		if (on == 0)
			break;
		need_wakeup = 1;
		if (atomic_cmpset_rel_int(&needsbuffer, on, on & ~mask))
			break;
	}
	if (need_wakeup)
		wakeup(__DEVOLATILE(void *, &needsbuffer));
	rw_runlock(&nblock);
}

/*
 *	bufcountsub:
 *
 *	Decrement the numfreebuffers count as needed.
 */
static void
bufcountsub(struct buf *bp)
{
	int old;

	/*
	 * Fixup numfreebuffers count.  If the buffer is invalid or not
	 * delayed-write, the buffer was free and we must decrement
	 * numfreebuffers.
	 */
	if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
		KASSERT((bp->b_flags & B_INFREECNT) != 0,
		    ("buf %p not counted in numfreebuffers", bp));
		bp->b_flags &= ~B_INFREECNT;
		old = atomic_fetchadd_int(&numfreebuffers, -1);
		KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1));
	}
}

/*
 *	waitrunningbufspace()
 *
 *	runningbufspace is a measure of the amount of I/O currently
 *	running.  This routine is used in async-write situations to
 *	prevent creating huge backups of pending writes to a device.
 *	Only asynchronous writes are governed by this function.
 *
 *	This does NOT turn an async write into a sync write.  It waits
 *	for earlier writes to complete and generally returns before the
 *	caller's write has reached the device.
 */
void
waitrunningbufspace(void)
{

	mtx_lock(&rbreqlock);
	while (runningbufspace > hirunningspace) {
		runningbufreq = 1;
		msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0);
	}
	mtx_unlock(&rbreqlock);
}
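
/*
 * Example (illustrative sizes): with hirunningspace at 16 MiB, a thread
 * streaming 128 KiB async writes can have roughly 128 of them in flight
 * before bufwrite() calls waitrunningbufspace() and sleeps in "wdrain";
 * because runningbufwakeup() only issues the wakeup once the total drops
 * below lorunningspace, the throttle releases with some hysteresis instead
 * of oscillating right at the limit.
 */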


/*
 *	vfs_buf_test_cache:
 *
 *	Called when a buffer is extended.  This function clears the B_CACHE
 *	bit if the newly extended portion of the buffer does not contain
 *	valid data.
 */
static __inline
void
vfs_buf_test_cache(struct buf *bp,
		  vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
		  vm_page_t m)
{

	VM_OBJECT_ASSERT_LOCKED(m->object);
	if (bp->b_flags & B_CACHE) {
		int base = (foff + off) & PAGE_MASK;
		if (vm_page_is_valid(m, base, size) == 0)
			bp->b_flags &= ~B_CACHE;
	}
}

/* Wake up the buffer daemon if necessary */
static __inline void
bd_wakeup(void)
{

	mtx_lock(&bdlock);
	if (bd_request == 0) {
		bd_request = 1;
		wakeup(&bd_request);
	}
	mtx_unlock(&bdlock);
}

/*
 * bd_speedup - speedup the buffer cache flushing code
 */
void
bd_speedup(void)
{
	int needwake;

	mtx_lock(&bdlock);
	needwake = 0;
	if (bd_speedupreq == 0 || bd_request == 0)
		needwake = 1;
	bd_speedupreq = 1;
	bd_request = 1;
	if (needwake)
		wakeup(&bd_request);
	mtx_unlock(&bdlock);
}

#ifndef NSWBUF_MIN
#define	NSWBUF_MIN	16
#endif

#ifdef __i386__
#define	TRANSIENT_DENOM	5
#else
#define	TRANSIENT_DENOM 10
#endif

/*
 * Calculating buffer cache scaling values and reserve space for buffer
 * headers.  This is called during low level kernel initialization and
652100523Salfred * may be called more then once.  We CANNOT write to the memory area
 * being reserved at this time.
 */
caddr_t
kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est)
{
	int tuned_nbuf;
	long maxbuf, maxbuf_sz, buf_sz,	biotmap_sz;

	/*
	 * physmem_est is in pages.  Convert it to kilobytes (assumes
	 * PAGE_SIZE is >= 1K)
	 */
	physmem_est = physmem_est * (PAGE_SIZE / 1024);

	/*
	 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
	 * For the first 64MB of ram nominally allocate sufficient buffers to
	 * cover 1/4 of our ram.  Beyond the first 64MB allocate additional
	 * buffers to cover 1/10 of our ram over 64MB.  When auto-sizing
	 * the buffer cache we limit the eventual kva reservation to
	 * maxbcache bytes.
	 *
	 * factor represents the 1/4 x ram conversion.
	 */
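	/*
	 * Worked example (illustrative, assuming BKVASIZE is 16 KiB so that
	 * factor = 64): with 512 MiB of RAM, physmem_est is about 524288 KiB
	 * and nbuf = 50 + min(8128, 1024) + min(2867, 104857) ~= 3941
	 * buffers, before any maxbcache clamp below is applied.
	 */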
	if (nbuf == 0) {
		int factor = 4 * BKVASIZE / 1024;

		nbuf = 50;
		if (physmem_est > 4096)
			nbuf += min((physmem_est - 4096) / factor,
			    65536 / factor);
		if (physmem_est > 65536)
			nbuf += min((physmem_est - 65536) * 2 / (factor * 5),
			    32 * 1024 * 1024 / (factor * 5));

		if (maxbcache && nbuf > maxbcache / BKVASIZE)
			nbuf = maxbcache / BKVASIZE;
		tuned_nbuf = 1;
	} else
		tuned_nbuf = 0;

	/* XXX Avoid unsigned long overflows later on with maxbufspace. */
	maxbuf = (LONG_MAX / 3) / BKVASIZE;
	if (nbuf > maxbuf) {
		if (!tuned_nbuf)
			printf("Warning: nbufs lowered from %d to %ld\n", nbuf,
			    maxbuf);
		nbuf = maxbuf;
	}

	/*
	 * Ideal allocation size for the transient bio submap is 10%
	 * of the maximal space buffer map.  This roughly corresponds
	 * to the amount of the buffer mapped for typical UFS load.
	 *
	 * Clip the buffer map to reserve space for the transient
	 * BIOs, if its extent is bigger than 90% (80% on i386) of the
	 * maximum buffer map extent on the platform.
	 *
	 * The fall-back to maxbuf when maxbcache is unset allows us
	 * to avoid trimming the buffer KVA on architectures with
	 * ample KVA space.
	 */
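	/*
	 * Example (illustrative, assuming the common MAXPHYS of 128 KiB):
	 * a 128 MiB transient map yields bio_transient_maxcnt =
	 * 128 MiB / 128 KiB = 1024, which is exactly the artificial cap
	 * applied further below.
	 */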
	if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) {
		maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE;
		buf_sz = (long)nbuf * BKVASIZE;
		if (buf_sz < maxbuf_sz / TRANSIENT_DENOM *
		    (TRANSIENT_DENOM - 1)) {
			/*
			 * There is more KVA than memory.  Do not
			 * adjust buffer map size, and assign the rest
			 * of maxbuf to transient map.
			 */
			biotmap_sz = maxbuf_sz - buf_sz;
		} else {
			/*
			 * Buffer map spans all KVA we could afford on
			 * this platform.  Give 10% (20% on i386) of
			 * the buffer map to the transient bio map.
			 */
			biotmap_sz = buf_sz / TRANSIENT_DENOM;
			buf_sz -= biotmap_sz;
		}
		if (biotmap_sz / INT_MAX > MAXPHYS)
			bio_transient_maxcnt = INT_MAX;
		else
			bio_transient_maxcnt = biotmap_sz / MAXPHYS;
		/*
		 * Artificially limit to 1024 simultaneous in-flight I/Os
		 * using the transient mapping.
		 */
		if (bio_transient_maxcnt > 1024)
			bio_transient_maxcnt = 1024;
		if (tuned_nbuf)
			nbuf = buf_sz / BKVASIZE;
	}

	/*
	 * swbufs are used as temporary holders for I/O, such as paging I/O.
	 * We have no fewer than 16 and no more than 256.
	 */
	nswbuf = min(nbuf / 4, 256);
	TUNABLE_INT_FETCH("kern.nswbuf", &nswbuf);
	if (nswbuf < NSWBUF_MIN)
		nswbuf = NSWBUF_MIN;

	/*
	 * Reserve space for the buffer cache buffers
	 */
	swbuf = (void *)v;
	v = (caddr_t)(swbuf + nswbuf);
	buf = (void *)v;
	v = (caddr_t)(buf + nbuf);

	return(v);
}

/* Initialize the buffer subsystem.  Called before use of any buffers. */
void
bufinit(void)
{
	struct buf *bp;
	int i;

	CTASSERT(MAXBCACHEBUF >= MAXBSIZE);
	mtx_init(&bqclean, "bufq clean lock", NULL, MTX_DEF);
	mtx_init(&bqdirty, "bufq dirty lock", NULL, MTX_DEF);
	mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
	rw_init(&nblock, "needsbuffer lock");
	mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
	mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF);

	/* next, make a null set of free lists */
	for (i = 0; i < BUFFER_QUEUES; i++)
		TAILQ_INIT(&bufqueues[i]);

	/* finally, initialize each buffer header and stick on empty q */
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bzero(bp, sizeof *bp);
		bp->b_flags = B_INVAL | B_INFREECNT;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_qindex = QUEUE_EMPTY;
		bp->b_xflags = 0;
		LIST_INIT(&bp->b_dep);
		BUF_LOCKINIT(bp);
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
#ifdef INVARIANTS
		bq_len[QUEUE_EMPTY]++;
#endif
	}

	/*
	 * maxbufspace is the absolute maximum amount of buffer space we are
	 * allowed to reserve in KVM and in real terms.  The absolute maximum
	 * is nominally used by buf_daemon.  hibufspace is the nominal maximum
	 * used by most other processes.  The differential is required to
	 * ensure that buf_daemon is able to run when other processes might
	 * be blocked waiting for buffer space.
	 *
	 * maxbufspace is based on BKVASIZE.  Allocating buffers larger than
	 * this may result in KVM fragmentation which is not handled optimally
	 * by the system.
	 */
	maxbufspace = (long)nbuf * BKVASIZE;
	hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBCACHEBUF * 10);
	lobufspace = hibufspace - MAXBCACHEBUF;
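	/*
	 * Example (illustrative, assuming BKVASIZE = 16 KiB, MAXBCACHEBUF =
	 * 64 KiB and nbuf = 4000): maxbufspace = 62.5 MiB and
	 * hibufspace = lmax(46.875 MiB, 61.875 MiB) = 61.875 MiB, leaving
	 * lobufspace one maximal buffer (64 KiB) below hibufspace.
	 */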

	/*
	 * Note: The 16 MiB upper limit for hirunningspace was chosen
	 * arbitrarily and may need further tuning. It corresponds to
	 * 128 outstanding write IO requests (if IO size is 128 KiB),
	 * which fits with many RAID controllers' tagged queuing limits.
	 * The lower 1 MiB limit is the historical upper limit for
	 * hirunningspace.
	 */
	hirunningspace = lmax(lmin(roundup(hibufspace / 64, MAXBCACHEBUF),
	    16 * 1024 * 1024), 1024 * 1024);
	lorunningspace = roundup((hirunningspace * 2) / 3, MAXBCACHEBUF);
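	/*
	 * Example (illustrative, continuing the numbers above): hibufspace /
	 * 64 is roughly 0.97 MiB, which rounds up to 1 MiB and already sits
	 * inside the [1 MiB, 16 MiB] clamp, so hirunningspace = 1 MiB and
	 * lorunningspace = roundup(2/3 MiB, 64 KiB) = 704 KiB.
	 */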

/*
 * Limit the amount of malloc memory since it is wired permanently into
 * the kernel space.  Even though this is accounted for in the buffer
 * allocation, we don't want the malloced region to grow uncontrolled.
 * The malloc scheme improves memory utilization significantly on average
 * (small) directories.
 */
	maxbufmallocspace = hibufspace / 20;

/*
 * Reduce the chance of a deadlock occurring by limiting the number
 * of delayed-write dirty buffers we allow to stack up.
 */
	hidirtybuffers = nbuf / 4 + 20;
	dirtybufthresh = hidirtybuffers * 9 / 10;
	numdirtybuffers = 0;
/*
 * To support extreme low-memory systems, make sure hidirtybuffers cannot
 * eat up all available buffer space.  This occurs when our minimum cannot
 * be met.  We try to size hidirtybuffers to 3/4 our buffer space assuming
 * BKVASIZE'd buffers.
 */
	while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
		hidirtybuffers >>= 1;
	}
	lodirtybuffers = hidirtybuffers / 2;

/*
 * Try to keep the number of free buffers in the specified range,
 * and give special processes (e.g. like buf_daemon) access to an
 * emergency reserve.
 */
	lofreebuffers = nbuf / 18 + 5;
	hifreebuffers = 2 * lofreebuffers;
	numfreebuffers = nbuf;

	bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ |
	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
	unmapped_buf = (caddr_t)kva_alloc(MAXPHYS);
}

#ifdef INVARIANTS
static inline void
vfs_buf_check_mapped(struct buf *bp)
{

	KASSERT((bp->b_flags & B_UNMAPPED) == 0,
	    ("mapped buf %p %x", bp, bp->b_flags));
	KASSERT(bp->b_kvabase != unmapped_buf,
	    ("mapped buf: b_kvabase was not updated %p", bp));
	KASSERT(bp->b_data != unmapped_buf,
	    ("mapped buf: b_data was not updated %p", bp));
}

static inline void
vfs_buf_check_unmapped(struct buf *bp)
{

	KASSERT((bp->b_flags & B_UNMAPPED) == B_UNMAPPED,
	    ("unmapped buf %p %x", bp, bp->b_flags));
	KASSERT(bp->b_kvabase == unmapped_buf,
	    ("unmapped buf: corrupted b_kvabase %p", bp));
	KASSERT(bp->b_data == unmapped_buf,
	    ("unmapped buf: corrupted b_data %p", bp));
}

#define	BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp)
#define	BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp)
#else
#define	BUF_CHECK_MAPPED(bp) do {} while (0)
#define	BUF_CHECK_UNMAPPED(bp) do {} while (0)
#endif

static void
bpmap_qenter(struct buf *bp)
{

	BUF_CHECK_MAPPED(bp);

	/*
	 * bp->b_data is relative to bp->b_offset, but
	 * bp->b_offset may be offset into the first page.
	 */
	bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data);
	pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages);
	bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
	    (vm_offset_t)(bp->b_offset & PAGE_MASK));
}

/*
 * bfreekva() - free the kva allocation for a buffer.
 *
 *	Since this call frees up buffer space, we call bufspacewakeup().
 */
static void
bfreekva(struct buf *bp)
{

	if (bp->b_kvasize == 0)
		return;

	atomic_add_int(&buffreekvacnt, 1);
	atomic_subtract_long(&bufspace, bp->b_kvasize);
	if ((bp->b_flags & B_UNMAPPED) == 0) {
		BUF_CHECK_MAPPED(bp);
		vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase,
		    bp->b_kvasize);
	} else {
		BUF_CHECK_UNMAPPED(bp);
		if ((bp->b_flags & B_KVAALLOC) != 0) {
			vmem_free(buffer_arena, (vm_offset_t)bp->b_kvaalloc,
			    bp->b_kvasize);
		}
		atomic_subtract_long(&unmapped_bufspace, bp->b_kvasize);
		bp->b_flags &= ~(B_UNMAPPED | B_KVAALLOC);
	}
	bp->b_kvasize = 0;
	bufspacewakeup();
}

/*
 *	binsfree:
 *
 *	Insert the buffer into the appropriate free list.
 */
static void
binsfree(struct buf *bp, int qindex)
{
	struct mtx *olock, *nlock;

	BUF_ASSERT_XLOCKED(bp);

	olock = bqlock(bp->b_qindex);
	nlock = bqlock(qindex);
	mtx_lock(olock);
	/* Handle delayed bremfree() processing. */
	if (bp->b_flags & B_REMFREE)
		bremfreel(bp);

	if (bp->b_qindex != QUEUE_NONE)
		panic("binsfree: free buffer onto another queue???");

	bp->b_qindex = qindex;
	if (olock != nlock) {
		mtx_unlock(olock);
		mtx_lock(nlock);
	}
	if (bp->b_flags & B_AGE)
		TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
	else
		TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
#ifdef INVARIANTS
	bq_len[bp->b_qindex]++;
#endif
	mtx_unlock(nlock);

	/*
	 * Something we can maybe free or reuse.
	 */
	if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
		bufspacewakeup();

	if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))
		bufcountadd(bp);
}

/*
 *	bremfree:
 *
 *	Mark the buffer for removal from the appropriate free list.
 *
 */
void
bremfree(struct buf *bp)
{

	CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
	KASSERT((bp->b_flags & B_REMFREE) == 0,
	    ("bremfree: buffer %p already marked for delayed removal.", bp));
	KASSERT(bp->b_qindex != QUEUE_NONE,
	    ("bremfree: buffer %p not on a queue.", bp));
	BUF_ASSERT_XLOCKED(bp);

	bp->b_flags |= B_REMFREE;
	bufcountsub(bp);
}

/*
 *	bremfreef:
 *
 *	Force an immediate removal from a free list.  Used only in nfs when
 *	it abuses the b_freelist pointer.
 */
void
bremfreef(struct buf *bp)
{
	struct mtx *qlock;

	qlock = bqlock(bp->b_qindex);
	mtx_lock(qlock);
	bremfreel(bp);
	mtx_unlock(qlock);
}

/*
 *	bremfreel:
 *
 *	Removes a buffer from the free list, must be called with the
 *	correct qlock held.
 */
static void
bremfreel(struct buf *bp)
{

	CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X",
	    bp, bp->b_vp, bp->b_flags);
	KASSERT(bp->b_qindex != QUEUE_NONE,
	    ("bremfreel: buffer %p not on a queue.", bp));
	BUF_ASSERT_XLOCKED(bp);
	mtx_assert(bqlock(bp->b_qindex), MA_OWNED);

	TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
#ifdef INVARIANTS
	KASSERT(bq_len[bp->b_qindex] >= 1, ("queue %d underflow",
	    bp->b_qindex));
	bq_len[bp->b_qindex]--;
#endif
	bp->b_qindex = QUEUE_NONE;
	/*
	 * If this was a delayed bremfree() we only need to remove the buffer
	 * from the queue and return; the stats are already done.
	 */
	if (bp->b_flags & B_REMFREE) {
		bp->b_flags &= ~B_REMFREE;
		return;
	}
	bufcountsub(bp);
}

/*
 * Attempt to initiate asynchronous I/O on read-ahead blocks.  We must
 * clear BIO_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE is set,
 * the buffer is valid and we do not have to do anything.
 */
void
breada(struct vnode * vp, daddr_t * rablkno, int * rabsize,
    int cnt, struct ucred * cred)
{
	struct buf *rabp;
	int i;

	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
		if (inmem(vp, *rablkno))
			continue;
		rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0);

		if ((rabp->b_flags & B_CACHE) == 0) {
			if (!TD_IS_IDLETHREAD(curthread))
				curthread->td_ru.ru_inblock++;
			rabp->b_flags |= B_ASYNC;
			rabp->b_flags &= ~B_INVAL;
			rabp->b_ioflags &= ~BIO_ERROR;
			rabp->b_iocmd = BIO_READ;
			if (rabp->b_rcred == NOCRED && cred != NOCRED)
				rabp->b_rcred = crhold(cred);
			vfs_busy_pages(rabp, 0);
			BUF_KERNPROC(rabp);
			rabp->b_iooffset = dbtob(rabp->b_blkno);
			bstrategy(rabp);
		} else {
			brelse(rabp);
		}
	}
}

/*
 * Entry point for bread() and breadn() via #defines in sys/buf.h.
 *
 * Get a buffer with the specified data.  Look in the cache first.  We
 * must clear BIO_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE
 * is set, the buffer is valid and we do not have to do anything, see
 * getblk(). Also starts asynchronous I/O on read-ahead blocks.
 */
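/*
 * Typical usage sketch (illustrative; "vp", "lbn" and "size" stand in for
 * the caller's vnode, logical block number and block size):
 *
 *	struct buf *bp;
 *	int error;
 *
 *	error = bread(vp, lbn, size, NOCRED, &bp);
 *	if (error == 0) {
 *		(consume bp->b_data here)
 *		brelse(bp);
 *	}
 */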
int
breadn_flags(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablkno,
    int *rabsize, int cnt, struct ucred *cred, int flags, struct buf **bpp)
{
	struct buf *bp;
	int rv = 0, readwait = 0;

	CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size);
	/*
	 * Can only return NULL if GB_LOCK_NOWAIT flag is specified.
	 */
	*bpp = bp = getblk(vp, blkno, size, 0, 0, flags);
	if (bp == NULL)
		return (EBUSY);

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (!TD_IS_IDLETHREAD(curthread))
			curthread->td_ru.ru_inblock++;
		bp->b_iocmd = BIO_READ;
		bp->b_flags &= ~B_INVAL;
		bp->b_ioflags &= ~BIO_ERROR;
		if (bp->b_rcred == NOCRED && cred != NOCRED)
			bp->b_rcred = crhold(cred);
		vfs_busy_pages(bp, 0);
		bp->b_iooffset = dbtob(bp->b_blkno);
		bstrategy(bp);
		++readwait;
	}

	breada(vp, rablkno, rabsize, cnt, cred);

	if (readwait) {
		rv = bufwait(bp);
	}
	return (rv);
}

/*
 * Write, release buffer on completion.  (Done by iodone
 * if async).  Do not bother writing anything if the buffer
 * is invalid.
 *
 * Note that we set B_CACHE here, indicating that buffer is
 * fully valid and thus cacheable.  This is true even of NFS
 * now so we set it generally.  This could be set either here
 * or in biodone() since the I/O is synchronous.  We put it
 * here.
 */
int
bufwrite(struct buf *bp)
{
	int oldflags;
	struct vnode *vp;
	long space;
	int vp_md;

	CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return (0);
	}

	if (bp->b_flags & B_BARRIER)
		barrierwrites++;

	oldflags = bp->b_flags;

	BUF_ASSERT_HELD(bp);

	if (bp->b_pin_count > 0)
		bunpin_wait(bp);

	KASSERT(!(bp->b_vflags & BV_BKGRDINPROG),
	    ("FFS background buffer should not get here %p", bp));

	vp = bp->b_vp;
	if (vp)
		vp_md = vp->v_vflag & VV_MD;
	else
		vp_md = 0;

	/*
	 * Mark the buffer clean.  Increment the bufobj write count
	 * before bundirty() call, to prevent other thread from seeing
	 * empty dirty list and zero counter for writes in progress,
	 * falsely indicating that the bufobj is clean.
	 */
	bufobj_wref(bp->b_bufobj);
	bundirty(bp);

	bp->b_flags &= ~B_DONE;
	bp->b_ioflags &= ~BIO_ERROR;
	bp->b_flags |= B_CACHE;
	bp->b_iocmd = BIO_WRITE;

	vfs_busy_pages(bp, 1);

	/*
	 * Normal bwrites pipeline writes
	 */
	bp->b_runningbufspace = bp->b_bufsize;
	space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace);

	if (!TD_IS_IDLETHREAD(curthread))
		curthread->td_ru.ru_oublock++;
	if (oldflags & B_ASYNC)
		BUF_KERNPROC(bp);
	bp->b_iooffset = dbtob(bp->b_blkno);
	bstrategy(bp);

	if ((oldflags & B_ASYNC) == 0) {
		int rtval = bufwait(bp);
		brelse(bp);
		return (rtval);
	} else if (space > hirunningspace) {
		/*
		 * don't allow the async write to saturate the I/O
		 * system.  We will not deadlock here because
		 * we are blocking waiting for I/O that is already in-progress
		 * to complete. We do not block here if it is the update
		 * or syncer daemon trying to clean up as that can lead
		 * to deadlock.
		 */
		if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md)
			waitrunningbufspace();
	}

	return (0);
}

void
bufbdflush(struct bufobj *bo, struct buf *bp)
{
	struct buf *nbp;

	if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) {
		(void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread);
		altbufferflushes++;
	} else if (bo->bo_dirty.bv_cnt > dirtybufthresh) {
		BO_LOCK(bo);
		/*
		 * Try to find a buffer to flush.
		 */
		TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
			if ((nbp->b_vflags & BV_BKGRDINPROG) ||
			    BUF_LOCK(nbp,
				     LK_EXCLUSIVE | LK_NOWAIT, NULL))
				continue;
			if (bp == nbp)
				panic("bdwrite: found ourselves");
			BO_UNLOCK(bo);
			/* Don't countdeps with the bo lock held. */
			if (buf_countdeps(nbp, 0)) {
				BO_LOCK(bo);
				BUF_UNLOCK(nbp);
				continue;
			}
			if (nbp->b_flags & B_CLUSTEROK) {
				vfs_bio_awrite(nbp);
			} else {
				bremfree(nbp);
				bawrite(nbp);
			}
			dirtybufferflushes++;
			break;
		}
		if (nbp == NULL)
			BO_UNLOCK(bo);
	}
}

/*
 * Delayed write. (Buffer is marked dirty).  Do not bother writing
 * anything if the buffer is marked invalid.
 *
 * Note that since the buffer must be completely valid, we can safely
 * set B_CACHE.  In fact, we have to set B_CACHE here rather than in
 * biodone() in order to prevent getblk from writing the buffer
 * out synchronously.
 */
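/*
 * Typical usage sketch (illustrative; "vp", "lbn" and "size" are the
 * caller's): a filesystem that has fully overwritten a block commonly does
 *
 *	bp = getblk(vp, lbn, size, 0, 0, 0);
 *	(fill bp->b_data)
 *	bdwrite(bp);
 *
 * leaving the actual write to the buf daemon or a later sync.
 */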
1298141710Scsjpvoid
1299141710Scsjpbdwrite(struct buf *bp)
1300141710Scsjp{
1301141710Scsjp	struct thread *td = curthread;
1302141710Scsjp	struct vnode *vp;
1303141710Scsjp	struct bufobj *bo;
1304141710Scsjp
1305141710Scsjp	CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1306141710Scsjp	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
1307141710Scsjp	KASSERT((bp->b_flags & B_BARRIER) == 0,
1308141710Scsjp	    ("Barrier request in delayed write %p", bp));
1309141710Scsjp	BUF_ASSERT_HELD(bp);
131077461Sdd
131177461Sdd	if (bp->b_flags & B_INVAL) {
1312		brelse(bp);
1313		return;
1314	}
1315
1316	/*
1317	 * If we have too many dirty buffers, don't create any more.
1318	 * If we are wildly over our limit, then force a complete
1319	 * cleanup. Otherwise, just keep the situation from getting
1320	 * out of control. Note that we have to avoid a recursive
1321	 * disaster and not try to clean up after our own cleanup!
1322	 */
1323	vp = bp->b_vp;
1324	bo = bp->b_bufobj;
1325	if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) {
1326		td->td_pflags |= TDP_INBDFLUSH;
1327		BO_BDFLUSH(bo, bp);
1328		td->td_pflags &= ~TDP_INBDFLUSH;
1329	} else
1330		recursiveflushes++;
1331
1332	bdirty(bp);
1333	/*
1334	 * Set B_CACHE, indicating that the buffer is fully valid.  This is
1335	 * true even of NFS now.
1336	 */
1337	bp->b_flags |= B_CACHE;
1338
1339	/*
1340	 * This bmap keeps the system from needing to do the bmap later,
1341	 * perhaps when the system is attempting to do a sync.  Since it
1342	 * is likely that the indirect block -- or whatever other datastructure
1343	 * that the filesystem needs is still in memory now, it is a good
1344	 * thing to do this.  Note also, that if the pageout daemon is
1345	 * requesting a sync -- there might not be enough memory to do
1346	 * the bmap then...  So, this is important to do.
1347	 */
1348	if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) {
1349		VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
1350	}
1351
1352	/*
1353	 * Set the *dirty* buffer range based upon the VM system dirty
1354	 * pages.
1355	 *
1356	 * Mark the buffer pages as clean.  We need to do this here to
1357	 * satisfy the vnode_pager and the pageout daemon, so that it
1358	 * thinks that the pages have been "cleaned".  Note that since
1359	 * the pages are in a delayed write buffer -- the VFS layer
1360	 * "will" see that the pages get written out on the next sync,
1361	 * or perhaps the cluster will be completed.
1362	 */
1363	vfs_clean_pages_dirty_buf(bp);
1364	bqrelse(bp);
1365
1366	/*
1367	 * note: we cannot initiate I/O from a bdwrite even if we wanted to,
1368	 * due to the softdep code.
1369	 */
1370}
1371
1372/*
1373 *	bdirty:
1374 *
1375 *	Turn buffer into delayed write request.  We must clear BIO_READ and
1376 *	B_RELBUF, and we must set B_DELWRI.  We reassign the buffer to
1377 *	itself to properly update it in the dirty/clean lists.  We mark it
1378 *	B_DONE to ensure that any asynchronization of the buffer properly
1379 *	clears B_DONE ( else a panic will occur later ).
1380 *
1381 *	bdirty() is kinda like bdwrite() - we have to clear B_INVAL which
1382 *	might have been set pre-getblk().  Unlike bwrite/bdwrite, bdirty()
1383 *	should only be called if the buffer is known-good.
1384 *
1385 *	Since the buffer is not on a queue, we do not update the numfreebuffers
1386 *	count.
1387 *
1388 *	The buffer must be on QUEUE_NONE.
1389 */
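/*
 * Illustrative sketch: the common consumer of bdirty() is failed-write
 * handling, where the buffer must stay dirty so the data is not lost;
 * compare the EIO handling in brelse() below:
 *
 *	if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) &&
 *	    bp->b_error == EIO) {
 *		bp->b_ioflags &= ~BIO_ERROR;
 *		bdirty(bp);
 *	}
 */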
1390void
1391bdirty(struct buf *bp)
1392{
1393
1394	CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X",
1395	    bp, bp->b_vp, bp->b_flags);
1396	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
1397	KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
1398	    ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex));
1399	BUF_ASSERT_HELD(bp);
1400	bp->b_flags &= ~(B_RELBUF);
1401	bp->b_iocmd = BIO_WRITE;
1402
1403	if ((bp->b_flags & B_DELWRI) == 0) {
1404		bp->b_flags |= /* XXX B_DONE | */ B_DELWRI;
1405		reassignbuf(bp);
1406		bdirtyadd();
1407	}
1408}
1409
1410/*
1411 *	bundirty:
1412 *
1413 *	Clear B_DELWRI for buffer.
1414 *
1415 *	Since the buffer is not on a queue, we do not update the numfreebuffers
1416 *	count.
1417 *
1418 *	The buffer must be on QUEUE_NONE.
1419 */
1420
1421void
1422bundirty(struct buf *bp)
1423{
1424
1425	CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1426	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
1427	KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
1428	    ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex));
1429	BUF_ASSERT_HELD(bp);
1430
1431	if (bp->b_flags & B_DELWRI) {
1432		bp->b_flags &= ~B_DELWRI;
1433		reassignbuf(bp);
1434		bdirtysub();
1435	}
1436	/*
1437	 * Since it is now being written, we can clear its deferred write flag.
1438	 */
1439	bp->b_flags &= ~B_DEFERRED;
1440}
1441
1442/*
1443 *	bawrite:
1444 *
1445 *	Asynchronous write.  Start output on a buffer, but do not wait for
1446 *	it to complete.  The buffer is released when the output completes.
1447 *
1448 *	bwrite() ( or the VOP routine anyway ) is responsible for handling
1449 *	B_INVAL buffers.  Not us.
1450 */
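/*
 * Illustrative sketch (hypothetical caller): the three common ways to
 * push out a modified, locked buffer, in increasing order of urgency:
 *
 *	bdwrite(bp);		delayed write, flushed some time later
 *	bawrite(bp);		start the write now, do not wait for it
 *	error = bwrite(bp);	write now and sleep until it completes
 */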
1451void
1452bawrite(struct buf *bp)
1453{
1454
1455	bp->b_flags |= B_ASYNC;
1456	(void) bwrite(bp);
1457}
1458
1459/*
1460 *	babarrierwrite:
1461 *
1462 *	Asynchronous barrier write.  Start output on a buffer, but do not
1463 *	wait for it to complete.  Place a write barrier after this write so
1464 *	that this buffer and all buffers written before it are committed to
1465 *	the disk before any buffers written after this write are committed
1466 *	to the disk.  The buffer is released when the output completes.
1467 */
1468void
1469babarrierwrite(struct buf *bp)
1470{
1471
1472	bp->b_flags |= B_ASYNC | B_BARRIER;
1473	(void) bwrite(bp);
1474}
1475
1476/*
1477 *	bbarrierwrite:
1478 *
1479 *	Synchronous barrier write.  Start output on a buffer and wait for
1480 *	it to complete.  Place a write barrier after this write so that
1481 *	this buffer and all buffers written before it are committed to
1482 *	the disk before any buffers written after this write are committed
1483 *	to the disk.  The buffer is released when the output completes.
1484 */
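/*
 * Illustrative sketch (hypothetical journaling caller, buffer names are
 * made up): the barrier keeps writes issued after it from reaching the
 * disk ahead of this write and the writes issued before it:
 *
 *	bawrite(journal_bp);			journal contents
 *	error = bbarrierwrite(commit_bp);	commit record, waits
 *	bawrite(data_bp);			ordered after the above
 */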
1485int
1486bbarrierwrite(struct buf *bp)
1487{
1488
1489	bp->b_flags |= B_BARRIER;
1490	return (bwrite(bp));
1491}
1492
1493/*
1494 *	bwillwrite:
1495 *
1496 *	Called prior to the locking of any vnodes when we are expecting to
1497 *	write.  We do not want to starve the buffer cache with too many
1498 *	dirty buffers so we block here.  By blocking prior to the locking
1499 *	of any vnodes we attempt to avoid the situation where a locked vnode
1500 *	prevents the various system daemons from flushing related buffers.
1501 */
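/*
 * Illustrative sketch of the intended call order in a write path
 * (compare vn_write()): any throttling sleep happens before the vnode
 * lock is taken, so the system daemons can still flush buffers:
 *
 *	bwillwrite();
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	error = VOP_WRITE(vp, uio, ioflag, cred);
 *	VOP_UNLOCK(vp, 0);
 */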
1502void
1503bwillwrite(void)
1504{
1505
1506	if (numdirtybuffers >= hidirtybuffers) {
1507		mtx_lock(&bdirtylock);
1508		while (numdirtybuffers >= hidirtybuffers) {
1509			bdirtywait = 1;
1510			msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4),
1511			    "flswai", 0);
1512		}
1513		mtx_unlock(&bdirtylock);
1514	}
1515}
1516
1517/*
1518 * Return true if we have too many dirty buffers.
1519 */
1520int
1521buf_dirty_count_severe(void)
1522{
1523
1524	return (numdirtybuffers >= hidirtybuffers);
1525}
1526
1527static __noinline int
1528buf_vm_page_count_severe(void)
1529{
1530
1531	KFAIL_POINT_CODE(DEBUG_FP, buf_pressure, return 1);
1532
1533	return vm_page_count_severe();
1534}
1535
1536/*
1537 *	brelse:
1538 *
1539 *	Release a busy buffer and, if requested, free its resources.  The
1540 *	buffer will be stashed in the appropriate bufqueue[] allowing it
1541 *	to be accessed later as a cache entity or reused for other purposes.
1542 */
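/*
 * Illustrative sketch (hypothetical caller): releasing a buffer that is
 * no longer wanted; setting B_INVAL first discards the contents instead
 * of leaving them cached:
 *
 *	bp->b_flags |= B_INVAL;
 *	brelse(bp);
 */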
1543void
1544brelse(struct buf *bp)
1545{
1546	int qindex;
1547
1548	CTR3(KTR_BUF, "brelse(%p) vp %p flags %X",
1549	    bp, bp->b_vp, bp->b_flags);
1550	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
1551	    ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
1552
1553	if (BUF_LOCKRECURSED(bp)) {
1554		/*
1555		 * Do not process, in particular, do not handle the
1556		 * Do not process; in particular, do not handle
1557		 * B_INVAL/B_RELBUF and do not release to the free list.
1558		BUF_UNLOCK(bp);
1559		return;
1560	}
1561
1562	if (bp->b_flags & B_MANAGED) {
1563		bqrelse(bp);
1564		return;
1565	}
1566
1567	if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) &&
1568	    bp->b_error == EIO && !(bp->b_flags & B_INVAL)) {
1569		/*
1570		 * Failed write, redirty.  Must clear BIO_ERROR to prevent
1571		 * pages from being scrapped.  If the error is anything
1572		 * other than an I/O error (EIO), assume that retrying
1573		 * is futile.
1574		 */
1575		bp->b_ioflags &= ~BIO_ERROR;
1576		bdirty(bp);
1577	} else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) ||
1578	    (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) {
1579		/*
1580		 * Either a failed I/O or we were asked to free or not
1581		 * cache the buffer.
1582		 */
1583		bp->b_flags |= B_INVAL;
1584		if (!LIST_EMPTY(&bp->b_dep))
1585			buf_deallocate(bp);
1586		if (bp->b_flags & B_DELWRI)
1587			bdirtysub();
1588		bp->b_flags &= ~(B_DELWRI | B_CACHE);
1589		if ((bp->b_flags & B_VMIO) == 0) {
1590			if (bp->b_bufsize)
1591				allocbuf(bp, 0);
1592			if (bp->b_vp)
1593				brelvp(bp);
1594		}
1595	}
1596
1597	/*
1598	 * We must clear B_RELBUF if B_DELWRI is set.  If vfs_vmio_release()
1599	 * is called with B_DELWRI set, the underlying pages may wind up
1600	 * getting freed causing a previous write (bdwrite()) to get 'lost'
1601	 * because pages associated with a B_DELWRI bp are marked clean.
1602	 *
1603	 * We still allow the B_INVAL case to call vfs_vmio_release(), even
1604	 * if B_DELWRI is set.
1605	 *
1606	 * If B_DELWRI is not set we may have to set B_RELBUF if we are low
1607	 * on pages to return pages to the VM page queues.
1608	 */
1609	if (bp->b_flags & B_DELWRI)
1610		bp->b_flags &= ~B_RELBUF;
1611	else if (buf_vm_page_count_severe()) {
1612		/*
1613		 * BKGRDINPROG can only be set with the buf and bufobj
1614		 * locks both held.  We tolerate a race to clear it here.
1615		 */
1616		if (!(bp->b_vflags & BV_BKGRDINPROG))
1617			bp->b_flags |= B_RELBUF;
1618	}
1619
1620	/*
1621	 * VMIO buffer rundown.  It is no longer necessary to keep a VMIO buffer
1622	 * constituted, not even for NFS buffers.  Two flags affect this.  If
1623	 * B_INVAL, the struct buf is invalidated but the VM object is kept
1624	 * around ( i.e. so it is trivial to reconstitute the buffer later ).
1625	 *
1626	 * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be
1627	 * invalidated.  BIO_ERROR cannot be set for a failed write unless the
1628	 * buffer is also B_INVAL because it hits the re-dirtying code above.
1629	 *
1630	 * Normally we can do this whether a buffer is B_DELWRI or not.  If
1631	 * the buffer is an NFS buffer, it is tracking piecemeal writes or
1632	 * the commit state and we cannot afford to lose the buffer. If the
1633	 * buffer has a background write in progress, we need to keep it
1634	 * around to prevent it from being reconstituted and starting a second
1635	 * background write.
1636	 */
1637	if ((bp->b_flags & B_VMIO)
1638	    && !(bp->b_vp->v_mount != NULL &&
1639		 (bp->b_vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 &&
1640		 !vn_isdisk(bp->b_vp, NULL) &&
1641		 (bp->b_flags & B_DELWRI))
1642	    ) {
1643
1644		int i, j, resid;
1645		vm_page_t m;
1646		off_t foff;
1647		vm_pindex_t poff;
1648		vm_object_t obj;
1649
1650		obj = bp->b_bufobj->bo_object;
1651
1652		/*
1653		 * Get the base offset and length of the buffer.  Note that
1654		 * in the VMIO case, if the buffer block size is not
1655		 * page-aligned, then the b_data pointer may not be page-aligned.
1656		 * But our b_pages[] array *IS* page aligned.
1657		 *
1658		 * block sizes less than DEV_BSIZE (usually 512) are not
1659		 * supported due to the page granularity bits (m->valid,
1660		 * m->dirty, etc...).
1661		 *
1662		 * See man buf(9) for more information
1663		 */
1664		resid = bp->b_bufsize;
1665		foff = bp->b_offset;
1666		for (i = 0; i < bp->b_npages; i++) {
1667			int had_bogus = 0;
1668
1669			m = bp->b_pages[i];
1670
1671			/*
1672			 * If we hit a bogus page, fixup *all* the bogus pages
1673			 * now.
1674			 */
1675			if (m == bogus_page) {
1676				poff = OFF_TO_IDX(bp->b_offset);
1677				had_bogus = 1;
1678
1679				VM_OBJECT_RLOCK(obj);
1680				for (j = i; j < bp->b_npages; j++) {
1681					vm_page_t mtmp;
1682					mtmp = bp->b_pages[j];
1683					if (mtmp == bogus_page) {
1684						mtmp = vm_page_lookup(obj, poff + j);
1685						if (!mtmp) {
1686							panic("brelse: page missing\n");
1687						}
1688						bp->b_pages[j] = mtmp;
1689					}
1690				}
1691				VM_OBJECT_RUNLOCK(obj);
1692
1693				if ((bp->b_flags & (B_INVAL | B_UNMAPPED)) == 0) {
1694					BUF_CHECK_MAPPED(bp);
1695					pmap_qenter(
1696					    trunc_page((vm_offset_t)bp->b_data),
1697					    bp->b_pages, bp->b_npages);
1698				}
1699				m = bp->b_pages[i];
1700			}
1701			if ((bp->b_flags & B_NOCACHE) ||
1702			    (bp->b_ioflags & BIO_ERROR &&
1703			     bp->b_iocmd == BIO_READ)) {
1704				int poffset = foff & PAGE_MASK;
1705				int presid = resid > (PAGE_SIZE - poffset) ?
1706					(PAGE_SIZE - poffset) : resid;
1707
1708				KASSERT(presid >= 0, ("brelse: extra page"));
1709				VM_OBJECT_WLOCK(obj);
1710				while (vm_page_xbusied(m)) {
1711					vm_page_lock(m);
1712					VM_OBJECT_WUNLOCK(obj);
1713					vm_page_busy_sleep(m, "mbncsh");
1714					VM_OBJECT_WLOCK(obj);
1715				}
1716				if (pmap_page_wired_mappings(m) == 0)
1717					vm_page_set_invalid(m, poffset, presid);
1718				VM_OBJECT_WUNLOCK(obj);
1719				if (had_bogus)
1720					printf("avoided corruption bug in bogus_page/brelse code\n");
1721			}
1722			resid -= PAGE_SIZE - (foff & PAGE_MASK);
1723			foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
1724		}
1725		if (bp->b_flags & (B_INVAL | B_RELBUF))
1726			vfs_vmio_release(bp);
1727
1728	} else if (bp->b_flags & B_VMIO) {
1729
1730		if (bp->b_flags & (B_INVAL | B_RELBUF)) {
1731			vfs_vmio_release(bp);
1732		}
1733
1734	} else if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0) {
1735		if (bp->b_bufsize != 0)
1736			allocbuf(bp, 0);
1737		if (bp->b_vp != NULL)
1738			brelvp(bp);
1739	}
1740
1741	/*
1742	 * If the buffer has junk contents signal it and eventually
1743	 * clean up B_DELWRI and disassociate the vnode so that gbincore()
1744	 * doesn't find it.
1745	 */
1746	if (bp->b_bufsize == 0 || (bp->b_ioflags & BIO_ERROR) != 0 ||
1747	    (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) != 0)
1748		bp->b_flags |= B_INVAL;
1749	if (bp->b_flags & B_INVAL) {
1750		if (bp->b_flags & B_DELWRI)
1751			bundirty(bp);
1752		if (bp->b_vp)
1753			brelvp(bp);
1754	}
1755
1756	/* buffers with no memory */
1757	if (bp->b_bufsize == 0) {
1758		bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
1759		if (bp->b_vflags & BV_BKGRDINPROG)
1760			panic("losing buffer 1");
1761		if (bp->b_kvasize)
1762			qindex = QUEUE_EMPTYKVA;
1763		else
1764			qindex = QUEUE_EMPTY;
1765		bp->b_flags |= B_AGE;
1766	/* buffers with junk contents */
1767	} else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) ||
1768	    (bp->b_ioflags & BIO_ERROR)) {
1769		bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
1770		if (bp->b_vflags & BV_BKGRDINPROG)
1771			panic("losing buffer 2");
1772		qindex = QUEUE_CLEAN;
1773		bp->b_flags |= B_AGE;
1774	/* remaining buffers */
1775	} else if (bp->b_flags & B_DELWRI)
1776		qindex = QUEUE_DIRTY;
1777	else
1778		qindex = QUEUE_CLEAN;
1779
1780	binsfree(bp, qindex);
1781
1782	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT);
1783	if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
1784		panic("brelse: not dirty");
1785	/* unlock */
1786	BUF_UNLOCK(bp);
1787}
1788
1789/*
1790 * Release a buffer back to the appropriate queue but do not try to free
1791 * it.  The buffer is expected to be used again soon.
1792 *
1793 * bqrelse() is used by bdwrite() to requeue a delayed write, and used by
1794 * biodone() to requeue an async I/O on completion.  It is also used when
1795 * known good buffers need to be requeued but we think we may need the data
1796 * again soon.
1797 *
1798 * XXX we should be able to leave the B_RELBUF hint set on completion.
1799 */
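/*
 * Illustrative sketch (hypothetical caller): prefer bqrelse() when the
 * contents remain useful, e.g. after reading metadata that will be
 * looked at again soon:
 *
 *	error = bread(vp, lbn, bsize, NOCRED, &bp);
 *	...
 *	bqrelse(bp);
 */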
1800void
1801bqrelse(struct buf *bp)
1802{
1803	int qindex;
1804
1805	CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1806	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
1807	    ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
1808
1809	if (BUF_LOCKRECURSED(bp)) {
1810		/* do not release to free list */
1811		BUF_UNLOCK(bp);
1812		return;
1813	}
1814	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
1815
1816	if (bp->b_flags & B_MANAGED) {
1817		if (bp->b_flags & B_REMFREE)
1818			bremfreef(bp);
1819		goto out;
1820	}
1821
1822	/* buffers with stale but valid contents */
1823	if (bp->b_flags & B_DELWRI) {
1824		qindex = QUEUE_DIRTY;
1825	} else {
1826		if ((bp->b_flags & B_DELWRI) == 0 &&
1827		    (bp->b_xflags & BX_VNDIRTY))
1828			panic("bqrelse: not dirty");
1829		/*
1830		 * BKGRDINPROG can only be set with the buf and bufobj
1831		 * locks both held.  We tolerate a race to clear it here.
1832		 */
1833		if (buf_vm_page_count_severe() &&
1834		    (bp->b_vflags & BV_BKGRDINPROG) == 0) {
1835			/*
1836			 * We are too low on memory, we have to try to free
1837			 * the buffer (most importantly: the wired pages
1838			 * making up its backing store) *now*.
1839			 */
1840			brelse(bp);
1841			return;
1842		}
1843		qindex = QUEUE_CLEAN;
1844	}
1845	binsfree(bp, qindex);
1846
1847out:
1848	/* unlock */
1849	BUF_UNLOCK(bp);
1850}
1851
1852/* Give pages used by the bp back to the VM system (where possible) */
1853static void
1854vfs_vmio_release(struct buf *bp)
1855{
1856	vm_object_t obj;
1857	vm_page_t m;
1858	int i;
1859
1860	if ((bp->b_flags & B_UNMAPPED) == 0) {
1861		BUF_CHECK_MAPPED(bp);
1862		pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages);
1863	} else
1864		BUF_CHECK_UNMAPPED(bp);
1865	obj = bp->b_bufobj->bo_object;
1866	if (obj != NULL)
1867		VM_OBJECT_WLOCK(obj);
1868	for (i = 0; i < bp->b_npages; i++) {
1869		m = bp->b_pages[i];
1870		bp->b_pages[i] = NULL;
1871		/*
1872		 * In order to keep page LRU ordering consistent, put
1873		 * everything on the inactive queue.
1874		 */
1875		vm_page_lock(m);
1876		vm_page_unwire(m, 0);
1877
1878		/*
1879		 * Might as well free the page if we can and it has
1880		 * no valid data.  We also free the page if the
1881		 * buffer was used for direct I/O
1882		 */
1883		if ((bp->b_flags & B_ASYNC) == 0 && !m->valid) {
1884			if (m->wire_count == 0 && !vm_page_busied(m))
1885				vm_page_free(m);
1886		} else if (bp->b_flags & B_DIRECT)
1887			vm_page_try_to_free(m);
1888		else if (buf_vm_page_count_severe())
1889			vm_page_try_to_cache(m);
1890		vm_page_unlock(m);
1891	}
1892	if (obj != NULL)
1893		VM_OBJECT_WUNLOCK(obj);
1894
1895	if (bp->b_bufsize) {
1896		bufspacewakeup();
1897		bp->b_bufsize = 0;
1898	}
1899	bp->b_npages = 0;
1900	bp->b_flags &= ~B_VMIO;
1901	if (bp->b_vp)
1902		brelvp(bp);
1903}
1904
1905/*
1906 * Check to see if a block at a particular lbn is available for a clustered
1907 * write.
1908 */
1909static int
1910vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno)
1911{
1912	struct buf *bpa;
1913	int match;
1914
1915	match = 0;
1916
1917	/* If the buf isn't in core skip it */
1918	if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL)
1919		return (0);
1920
1921	/* If the buf is busy we don't want to wait for it */
1922	if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
1923		return (0);
1924
1925	/* Only cluster with valid clusterable delayed write buffers */
1926	if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) !=
1927	    (B_DELWRI | B_CLUSTEROK))
1928		goto done;
1929
1930	if (bpa->b_bufsize != size)
1931		goto done;
1932
1933	/*
1934	 * Check to see if it is in the expected place on disk and that the
1935	 * block has been mapped.
1936	 */
1937	if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno))
1938		match = 1;
1939done:
1940	BUF_UNLOCK(bpa);
1941	return (match);
1942}
1943
1944/*
1945 *	vfs_bio_awrite:
1946 *
1947 *	Implement clustered async writes for clearing out B_DELWRI buffers.
1948 *	This is much better than the old way of writing only one buffer at
1949 *	a time.  Note that we may not be presented with the buffers in the
1950 *	correct order, so we search for the cluster in both directions.
1951 */
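/*
 * Worked example of the scan below, assuming a hypothetical f_iosize of
 * 16K and MAXPHYS of 128K (so maxcl == 8): if the forward loop stops at
 * i == 3 (lblkno + 1 and lblkno + 2 are clusterable) and the backward
 * loop, after the decrement, leaves j == 2 (lblkno - 1 and lblkno - 2
 * are clusterable), then ncl == 5 and cluster_wbuild() is asked to
 * write the five blocks starting at lblkno - 2.
 */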
1952int
1953vfs_bio_awrite(struct buf *bp)
1954{
1955	struct bufobj *bo;
1956	int i;
1957	int j;
1958	daddr_t lblkno = bp->b_lblkno;
1959	struct vnode *vp = bp->b_vp;
1960	int ncl;
1961	int nwritten;
1962	int size;
1963	int maxcl;
1964	int gbflags;
1965
1966	bo = &vp->v_bufobj;
1967	gbflags = (bp->b_flags & B_UNMAPPED) != 0 ? GB_UNMAPPED : 0;
1968	/*
1969	 * right now we support clustered writing only to regular files.  If
1970	 * we find a clusterable block we could be in the middle of a cluster
1971	 * rather than at the beginning.
1972	 */
1973	if ((vp->v_type == VREG) &&
1974	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
1975	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
1976
1977		size = vp->v_mount->mnt_stat.f_iosize;
1978		maxcl = MAXPHYS / size;
1979
1980		BO_RLOCK(bo);
1981		for (i = 1; i < maxcl; i++)
1982			if (vfs_bio_clcheck(vp, size, lblkno + i,
1983			    bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0)
1984				break;
1985
1986		for (j = 1; i + j <= maxcl && j <= lblkno; j++)
1987			if (vfs_bio_clcheck(vp, size, lblkno - j,
1988			    bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0)
1989				break;
1990		BO_RUNLOCK(bo);
1991		--j;
1992		ncl = i + j;
1993		/*
1994		 * this is a possible cluster write
1995		 */
1996		if (ncl != 1) {
1997			BUF_UNLOCK(bp);
1998			nwritten = cluster_wbuild(vp, size, lblkno - j, ncl,
1999			    gbflags);
2000			return (nwritten);
2001		}
2002	}
2003	bremfree(bp);
2004	bp->b_flags |= B_ASYNC;
2005	/*
2006	 * default (old) behavior, writing out only one block
2007	 *
2008	 * XXX returns b_bufsize instead of b_bcount for nwritten?
2009	 */
2010	nwritten = bp->b_bufsize;
2011	(void) bwrite(bp);
2012
2013	return (nwritten);
2014}
2015
2016static void
2017setbufkva(struct buf *bp, vm_offset_t addr, int maxsize, int gbflags)
2018{
2019
2020	KASSERT((bp->b_flags & (B_UNMAPPED | B_KVAALLOC)) == 0 &&
2021	    bp->b_kvasize == 0, ("call bfreekva(%p)", bp));
2022	if ((gbflags & GB_UNMAPPED) == 0) {
2023		bp->b_kvabase = (caddr_t)addr;
2024	} else if ((gbflags & GB_KVAALLOC) != 0) {
2025		KASSERT((gbflags & GB_UNMAPPED) != 0,
2026		    ("GB_KVAALLOC without GB_UNMAPPED"));
2027		bp->b_kvaalloc = (caddr_t)addr;
2028		bp->b_flags |= B_UNMAPPED | B_KVAALLOC;
2029		atomic_add_long(&unmapped_bufspace, bp->b_kvasize);
2030	}
2031	bp->b_kvasize = maxsize;
2032}
2033
2034/*
2035 * Allocate the buffer KVA and set b_kvasize. Also set b_kvabase if
2036 * needed.
2037 */
2038static int
2039allocbufkva(struct buf *bp, int maxsize, int gbflags)
2040{
2041	vm_offset_t addr;
2042
2043	bfreekva(bp);
2044	addr = 0;
2045
2046	if (vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr)) {
2047		/*
2048		 * Buffer map is too fragmented.  Request the caller
2049		 * to defragment the map.
2050		 */
2051		atomic_add_int(&bufdefragcnt, 1);
2052		return (1);
2053	}
2054	setbufkva(bp, addr, maxsize, gbflags);
2055	atomic_add_long(&bufspace, bp->b_kvasize);
2056	return (0);
2057}
2058
2059/*
2060 * Ask the bufdaemon for help, or act as bufdaemon itself, when a
2061 * locked vnode is supplied.
2062 */
2063static void
2064getnewbuf_bufd_help(struct vnode *vp, int gbflags, int slpflag, int slptimeo,
2065    int defrag)
2066{
2067	struct thread *td;
2068	char *waitmsg;
2069	int error, fl, flags, norunbuf;
2070
2071	mtx_assert(&bqclean, MA_OWNED);
2072
2073	if (defrag) {
2074		flags = VFS_BIO_NEED_BUFSPACE;
2075		waitmsg = "nbufkv";
2076	} else if (bufspace >= hibufspace) {
2077		waitmsg = "nbufbs";
2078		flags = VFS_BIO_NEED_BUFSPACE;
2079	} else {
2080		waitmsg = "newbuf";
2081		flags = VFS_BIO_NEED_ANY;
2082	}
2083	atomic_set_int(&needsbuffer, flags);
2084	mtx_unlock(&bqclean);
2085
2086	bd_speedup();	/* heeeelp */
2087	if ((gbflags & GB_NOWAIT_BD) != 0)
2088		return;
2089
2090	td = curthread;
2091	rw_wlock(&nblock);
2092	while ((needsbuffer & flags) != 0) {
2093		if (vp != NULL && vp->v_type != VCHR &&
2094		    (td->td_pflags & TDP_BUFNEED) == 0) {
2095			rw_wunlock(&nblock);
2096			/*
2097			 * getblk() is called with a vnode locked, and
2098			 * some majority of the dirty buffers may as
2099			 * well belong to the vnode.  Flushing the
2100			 * buffers there would make progress that
2101			 * cannot be achieved by the buf_daemon, which
2102			 * cannot lock the vnode.
2103			 */
2104			norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
2105			    (td->td_pflags & TDP_NORUNNINGBUF);
2106
2107			/*
2108			 * Play bufdaemon.  The getnewbuf() function
2109			 * may be called while the thread owns lock
2110			 * for another dirty buffer for the same
2111			 * vnode, which makes it impossible to use
2112			 * VOP_FSYNC() there, due to the buffer lock
2113			 * recursion.
2114			 */
2115			td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
2116			fl = buf_flush(vp, flushbufqtarget);
2117			td->td_pflags &= norunbuf;
2118			rw_wlock(&nblock);
2119			if (fl != 0)
2120				continue;
2121			if ((needsbuffer & flags) == 0)
2122				break;
2123		}
2124		error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock,
2125		    (PRIBIO + 4) | slpflag, waitmsg, slptimeo);
2126		if (error != 0)
2127			break;
2128	}
2129	rw_wunlock(&nblock);
2130}
2131
2132static void
2133getnewbuf_reuse_bp(struct buf *bp, int qindex)
2134{
2135
2136	CTR6(KTR_BUF, "getnewbuf(%p) vp %p flags %X kvasize %d bufsize %d "
2137	    "queue %d (recycling)", bp, bp->b_vp, bp->b_flags,
2138	     bp->b_kvasize, bp->b_bufsize, qindex);
2139	mtx_assert(&bqclean, MA_NOTOWNED);
2140
2141	/*
2142	 * Note: we no longer distinguish between VMIO and non-VMIO
2143	 * buffers.
2144	 */
2145	KASSERT((bp->b_flags & B_DELWRI) == 0,
2146	    ("delwri buffer %p found in queue %d", bp, qindex));
2147
2148	if (qindex == QUEUE_CLEAN) {
2149		if (bp->b_flags & B_VMIO) {
2150			bp->b_flags &= ~B_ASYNC;
2151			vfs_vmio_release(bp);
2152		}
2153		if (bp->b_vp != NULL)
2154			brelvp(bp);
2155	}
2156
2157	/*
2158	 * Get the rest of the buffer freed up.  b_kva* is still valid
2159	 * after this operation.
2160	 */
2161
2162	if (bp->b_rcred != NOCRED) {
2163		crfree(bp->b_rcred);
2164		bp->b_rcred = NOCRED;
2165	}
2166	if (bp->b_wcred != NOCRED) {
2167		crfree(bp->b_wcred);
2168		bp->b_wcred = NOCRED;
2169	}
2170	if (!LIST_EMPTY(&bp->b_dep))
2171		buf_deallocate(bp);
2172	if (bp->b_vflags & BV_BKGRDINPROG)
2173		panic("losing buffer 3");
2174	KASSERT(bp->b_vp == NULL, ("bp: %p still has vnode %p.  qindex: %d",
2175	    bp, bp->b_vp, qindex));
2176	KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
2177	    ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags));
2178
2179	if (bp->b_bufsize)
2180		allocbuf(bp, 0);
2181
2182	bp->b_flags &= B_UNMAPPED | B_KVAALLOC;
2183	bp->b_ioflags = 0;
2184	bp->b_xflags = 0;
2185	KASSERT((bp->b_flags & B_INFREECNT) == 0,
2186	    ("buf %p still counted as free?", bp));
2187	bp->b_vflags = 0;
2188	bp->b_vp = NULL;
2189	bp->b_blkno = bp->b_lblkno = 0;
2190	bp->b_offset = NOOFFSET;
2191	bp->b_iodone = 0;
2192	bp->b_error = 0;
2193	bp->b_resid = 0;
2194	bp->b_bcount = 0;
2195	bp->b_npages = 0;
2196	bp->b_dirtyoff = bp->b_dirtyend = 0;
2197	bp->b_bufobj = NULL;
2198	bp->b_pin_count = 0;
2199	bp->b_fsprivate1 = NULL;
2200	bp->b_fsprivate2 = NULL;
2201	bp->b_fsprivate3 = NULL;
2202
2203	LIST_INIT(&bp->b_dep);
2204}
2205
2206static int flushingbufs;
2207
2208static struct buf *
2209getnewbuf_scan(int maxsize, int defrag, int unmapped, int metadata)
2210{
2211	struct buf *bp, *nbp;
2212	int nqindex, qindex, pass;
2213
2214	KASSERT(!unmapped || !defrag, ("both unmapped and defrag"));
2215
2216	pass = 1;
2217restart:
2218	atomic_add_int(&getnewbufrestarts, 1);
2219
2220	/*
2221	 * Set up for the scan.  If we do not have enough free buffers,
2222	 * we set up a degenerate case that immediately fails.  Note
2223	 * that if we are a specially marked process, we are allowed to
2224	 * dip into our reserves.
2225	 *
2226	 * The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN
2227	 * for the allocation of the mapped buffer.  For unmapped, the
2228	 * easiest is to start with EMPTY outright.
2229	 *
2230	 * We start with EMPTYKVA.  If the list is empty we back up to EMPTY.
2231	 * However, there are a number of cases (defragging, reusing, ...)
2232	 * where we cannot back up.
2233	 */
2234	nbp = NULL;
2235	mtx_lock(&bqclean);
2236	if (!defrag && unmapped) {
2237		nqindex = QUEUE_EMPTY;
2238		nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
2239	}
2240	if (nbp == NULL) {
2241		nqindex = QUEUE_EMPTYKVA;
2242		nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
2243	}
2244
2245	/*
2246	 * If no EMPTYKVA buffers and we are either defragging or
2247	 * reusing, locate a CLEAN buffer to free or reuse.  If
2248	 * bufspace usage is low, skip this step so we can allocate a
2249	 * new buffer.
2250	 */
2251	if (nbp == NULL && (defrag || bufspace >= lobufspace)) {
2252		nqindex = QUEUE_CLEAN;
2253		nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
2254	}
2255
2256	/*
2257	 * If we could not find or were not allowed to reuse a CLEAN
2258	 * buffer, check to see if it is ok to use an EMPTY buffer.
2259	 * We can only use an EMPTY buffer if allocating its KVA would
2260	 * not otherwise run us out of buffer space.  No KVA is needed
2261	 * for the unmapped allocation.
2262	 */
2263	if (nbp == NULL && defrag == 0 && (bufspace + maxsize < hibufspace ||
2264	    metadata)) {
2265		nqindex = QUEUE_EMPTY;
2266		nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
2267	}
2268
2269	/*
2270	 * All available buffers might be clean, retry ignoring the
2271	 * lobufspace as the last resort.
2272	 */
2273	if (nbp == NULL && !TAILQ_EMPTY(&bufqueues[QUEUE_CLEAN])) {
2274		nqindex = QUEUE_CLEAN;
2275		nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
2276	}
2277
2278	/*
2279	 * Run scan, possibly freeing data and/or kva mappings on the fly
2280	 * depending.
2281	 */
2282	while ((bp = nbp) != NULL) {
2283		qindex = nqindex;
2284
2285		/*
2286		 * Calculate next bp (we can only use it if we do not
2287		 * block or do other fancy things).
2288		 */
2289		if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) {
2290			switch (qindex) {
2291			case QUEUE_EMPTY:
2292				nqindex = QUEUE_EMPTYKVA;
2293				nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
2294				if (nbp != NULL)
2295					break;
2296				/* FALLTHROUGH */
2297			case QUEUE_EMPTYKVA:
2298				nqindex = QUEUE_CLEAN;
2299				nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
2300				if (nbp != NULL)
2301					break;
2302				/* FALLTHROUGH */
2303			case QUEUE_CLEAN:
2304				if (metadata && pass == 1) {
2305					pass = 2;
2306					nqindex = QUEUE_EMPTY;
2307					nbp = TAILQ_FIRST(
2308					    &bufqueues[QUEUE_EMPTY]);
2309				}
2310				/*
2311				 * nbp is NULL.
2312				 */
2313				break;
2314			}
2315		}
2316		/*
2317		 * If we are defragging then we need a buffer with
2318		 * b_kvasize != 0.  XXX this situation should no longer
2319		 * occur, if defrag is non-zero the buffer's b_kvasize
2320		 * should also be non-zero at this point.  XXX
2321		 */
2322		if (defrag && bp->b_kvasize == 0) {
2323			printf("Warning: defrag empty buffer %p\n", bp);
2324			continue;
2325		}
2326
2327		/*
2328		 * Start freeing the bp.  This is somewhat involved.  nbp
2329		 * remains valid only for QUEUE_EMPTY[KVA] bp's.
2330		 */
2331		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
2332			continue;
2333		/*
2334		 * BKGRDINPROG can only be set with the buf and bufobj
2335		 * locks both held.  We tolerate a race to clear it here.
2336		 */
2337		if (bp->b_vflags & BV_BKGRDINPROG) {
2338			BUF_UNLOCK(bp);
2339			continue;
2340		}
2341
2342		KASSERT(bp->b_qindex == qindex,
2343		    ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
2344
2345		bremfreel(bp);
2346		mtx_unlock(&bqclean);
2347		/*
2348		 * NOTE:  nbp is now entirely invalid.  We can only restart
2349		 * the scan from this point on.
2350		 */
2351
2352		getnewbuf_reuse_bp(bp, qindex);
2353		mtx_assert(&bqclean, MA_NOTOWNED);
2354
2355		/*
2356		 * If we are defragging then free the buffer.
2357		 */
2358		if (defrag) {
2359			bp->b_flags |= B_INVAL;
2360			bfreekva(bp);
2361			brelse(bp);
2362			defrag = 0;
2363			goto restart;
2364		}
2365
2366		/*
2367		 * Notify any waiters for the buffer lock about
2368		 * identity change by freeing the buffer.
2369		 */
2370		if (qindex == QUEUE_CLEAN && BUF_LOCKWAITERS(bp)) {
2371			bp->b_flags |= B_INVAL;
2372			bfreekva(bp);
2373			brelse(bp);
2374			goto restart;
2375		}
2376
2377		if (metadata)
2378			break;
2379
2380		/*
2381		 * If we are overcommitted then recover the buffer and its
2382		 * KVM space.  This occurs in rare situations when multiple
2383		 * processes are blocked in getnewbuf() or allocbuf().
2384		 */
2385		if (bufspace >= hibufspace)
2386			flushingbufs = 1;
2387		if (flushingbufs && bp->b_kvasize != 0) {
2388			bp->b_flags |= B_INVAL;
2389			bfreekva(bp);
2390			brelse(bp);
2391			goto restart;
2392		}
2393		if (bufspace < lobufspace)
2394			flushingbufs = 0;
2395		break;
2396	}
2397	return (bp);
2398}
2399
2400/*
2401 *	getnewbuf:
2402 *
2403 *	Find and initialize a new buffer header, freeing up existing buffers
2404 *	in the bufqueues as necessary.  The new buffer is returned locked.
2405 *
2406 *	Important:  B_INVAL is not set.  If the caller wishes to throw the
2407 *	buffer away, the caller must set B_INVAL prior to calling brelse().
2408 *
2409 *	We block if:
2410 *		We have insufficient buffer headers
2411 *		We have insufficient buffer space
2412 *		buffer_arena is too fragmented ( space reservation fails )
2413 *		If we have to flush dirty buffers ( but we try to avoid this )
2414 */
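/*
 * Illustrative sketch: because B_INVAL is not set, a caller that ends
 * up not using the buffer must invalidate it itself before releasing,
 * as getblk() does below when it loses an identity race:
 *
 *	bp->b_flags |= B_INVAL;
 *	brelse(bp);
 */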
2415static struct buf *
2416getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize,
2417    int gbflags)
2418{
2419	struct buf *bp;
2420	int defrag, metadata;
2421
2422	KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
2423	    ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
2424	if (!unmapped_buf_allowed)
2425		gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC);
2426
2427	defrag = 0;
2428	if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 ||
2429	    vp->v_type == VCHR)
2430		metadata = 1;
2431	else
2432		metadata = 0;
2433	/*
2434	 * We can't afford to block since we might be holding a vnode lock,
2435	 * which may prevent system daemons from running.  We deal with
2436	 * low-memory situations by proactively returning memory and running
2437	 * async I/O rather than sync I/O.
2438	 */
2439	atomic_add_int(&getnewbufcalls, 1);
2440	atomic_subtract_int(&getnewbufrestarts, 1);
2441restart:
2442	bp = getnewbuf_scan(maxsize, defrag, (gbflags & (GB_UNMAPPED |
2443	    GB_KVAALLOC)) == GB_UNMAPPED, metadata);
2444	if (bp != NULL)
2445		defrag = 0;
2446
2447	/*
2448	 * If we exhausted our list, sleep as appropriate.  We may have to
2449	 * wake up various daemons and write out some dirty buffers.
2450	 *
2451	 * Generally we are sleeping due to insufficient buffer space.
2452	 */
2453	if (bp == NULL) {
2454		mtx_assert(&bqclean, MA_OWNED);
2455		getnewbuf_bufd_help(vp, gbflags, slpflag, slptimeo, defrag);
2456		mtx_assert(&bqclean, MA_NOTOWNED);
2457	} else if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == GB_UNMAPPED) {
2458		mtx_assert(&bqclean, MA_NOTOWNED);
2459
2460		bfreekva(bp);
2461		bp->b_flags |= B_UNMAPPED;
2462		bp->b_kvabase = bp->b_data = unmapped_buf;
2463		bp->b_kvasize = maxsize;
2464		atomic_add_long(&bufspace, bp->b_kvasize);
2465		atomic_add_long(&unmapped_bufspace, bp->b_kvasize);
2466		atomic_add_int(&bufreusecnt, 1);
2467	} else {
2468		mtx_assert(&bqclean, MA_NOTOWNED);
2469
2470		/*
2471		 * We finally have a valid bp.  We aren't quite out of the
2472		 * woods, we still have to reserve kva space.  In order
2473		 * woods; we still have to reserve kva space.  In order
2474		 * BKVASIZE chunks.
2475		 */
2476		maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;
2477
2478		if (maxsize != bp->b_kvasize || (bp->b_flags & (B_UNMAPPED |
2479		    B_KVAALLOC)) == B_UNMAPPED) {
2480			if (allocbufkva(bp, maxsize, gbflags)) {
2481				defrag = 1;
2482				bp->b_flags |= B_INVAL;
2483				brelse(bp);
2484				goto restart;
2485			}
2486			atomic_add_int(&bufreusecnt, 1);
2487		} else if ((bp->b_flags & B_KVAALLOC) != 0 &&
2488		    (gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == 0) {
2489			/*
2490			 * If the reused buffer has KVA allocated,
2491			 * reassign b_kvaalloc to b_kvabase.
2492			 */
2493			bp->b_kvabase = bp->b_kvaalloc;
2494			bp->b_flags &= ~B_KVAALLOC;
2495			atomic_subtract_long(&unmapped_bufspace,
2496			    bp->b_kvasize);
2497			atomic_add_int(&bufreusecnt, 1);
2498		} else if ((bp->b_flags & (B_UNMAPPED | B_KVAALLOC)) == 0 &&
2499		    (gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == (GB_UNMAPPED |
2500		    GB_KVAALLOC)) {
2501			/*
2502			 * The reused buffer already has KVA
2503			 * mapped, but the request is for an unmapped
2504			 * buffer with KVA allocated.
2505			 */
2506			bp->b_kvaalloc = bp->b_kvabase;
2507			bp->b_data = bp->b_kvabase = unmapped_buf;
2508			bp->b_flags |= B_UNMAPPED | B_KVAALLOC;
2509			atomic_add_long(&unmapped_bufspace,
2510			    bp->b_kvasize);
2511			atomic_add_int(&bufreusecnt, 1);
2512		}
2513		if ((gbflags & GB_UNMAPPED) == 0) {
2514			bp->b_saveaddr = bp->b_kvabase;
2515			bp->b_data = bp->b_saveaddr;
2516			bp->b_flags &= ~B_UNMAPPED;
2517			BUF_CHECK_MAPPED(bp);
2518		}
2519	}
2520	return (bp);
2521}
2522
2523/*
2524 *	buf_daemon:
2525 *
2526 *	buffer flushing daemon.  Buffers are normally flushed by the
2527 *	update daemon but if it cannot keep up this process starts to
2528 *	take the load in an attempt to prevent getnewbuf() from blocking.
2529 */
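/*
 * The flushing target forms a simple hysteresis with bwillwrite():
 * writers block once numdirtybuffers reaches hidirtybuffers, and each
 * daemon pass flushes down to lodirtybuffers, or down to half of the
 * current numdirtybuffers when bd_speedup() requested extra effort.
 * For example (hypothetical numbers), with lodirtybuffers == 1000 and
 * a speedup request arriving while numdirtybuffers == 5000, the pass
 * aims for 2500 instead of 1000.
 */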
2530
2531static struct kproc_desc buf_kp = {
2532	"bufdaemon",
2533	buf_daemon,
2534	&bufdaemonproc
2535};
2536SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp);
2537
2538static int
2539buf_flush(struct vnode *vp, int target)
2540{
2541	int flushed;
2542
2543	flushed = flushbufqueues(vp, target, 0);
2544	if (flushed == 0) {
2545		/*
2546		 * Could not find any buffers without rollback
2547		 * dependencies, so just write the first one
2548		 * in the hopes of eventually making progress.
2549		 */
2550		if (vp != NULL && target > 2)
2551			target /= 2;
2552		flushbufqueues(vp, target, 1);
2553	}
2554	return (flushed);
2555}
2556
2557static void
2558buf_daemon(void)
2559{
2560	int lodirty;
2561
2562	/*
2563	 * This process needs to be suspended prior to shutdown sync.
2564	 */
2565	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc,
2566	    SHUTDOWN_PRI_LAST);
2567
2568	/*
2569	 * This process is allowed to take the buffer cache to the limit
2570	 */
2571	curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED;
2572	mtx_lock(&bdlock);
2573	for (;;) {
2574		bd_request = 0;
2575		mtx_unlock(&bdlock);
2576
2577		kproc_suspend_check(bufdaemonproc);
2578		lodirty = lodirtybuffers;
2579		if (bd_speedupreq) {
2580			lodirty = numdirtybuffers / 2;
2581			bd_speedupreq = 0;
2582		}
2583		/*
2584		 * Do the flush.  Limit the amount of in-transit I/O we
2585		 * allow to build up, otherwise we would completely saturate
2586		 * the I/O system.
2587		 */
2588		while (numdirtybuffers > lodirty) {
2589			if (buf_flush(NULL, numdirtybuffers - lodirty) == 0)
2590				break;
2591			kern_yield(PRI_USER);
2592		}
2593
2594		/*
2595		 * Only clear bd_request if we have reached our low water
2596		 * mark.  The buf_daemon normally waits 1 second and
2597		 * then incrementally flushes any dirty buffers that have
2598		 * built up, within reason.
2599		 *
2600		 * If we were unable to hit our low water mark and couldn't
2601		 * find any flushable buffers, we sleep for a short period
2602		 * to avoid endless loops on unlockable buffers.
2603		 */
2604		mtx_lock(&bdlock);
2605		if (numdirtybuffers <= lodirtybuffers) {
2606			/*
2607			 * We reached our low water mark, reset the
2608			 * request and sleep until we are needed again.
2609			 * The sleep is just so the suspend code works.
2610			 */
2611			bd_request = 0;
2612			/*
2613			 * Do an extra wakeup in case dirty threshold
2614			 * changed via sysctl and the explicit transition
2615			 * out of shortfall was missed.
2616			 */
2617			bdirtywakeup();
2618			if (runningbufspace <= lorunningspace)
2619				runningwakeup();
2620			msleep(&bd_request, &bdlock, PVM, "psleep", hz);
2621		} else {
2622			/*
2623			 * We couldn't find any flushable dirty buffers but
2624			 * still have too many dirty buffers, we
2625			 * have to sleep and try again.  (rare)
2626			 */
2627			msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10);
2628		}
2629	}
2630}
2631
2632/*
2633 *	flushbufqueues:
2634 *
2635 *	Try to flush a buffer in the dirty queue.  We must be careful to
2636 *	free up B_INVAL buffers instead of writing them, which NFS is
2637 *	particularly sensitive to.
2638 */
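/*
 * flushbufqueues() walks QUEUE_DIRTY with a per-call sentinel so that
 * the queue mutex can be dropped while a victim is being written.  A
 * minimal sketch of the traversal pattern (locking elided; see the
 * real loop for the details):
 *
 *	TAILQ_INSERT_HEAD(&bufqueues[queue], sentinel, b_freelist);
 *	while ((bp = TAILQ_NEXT(sentinel, b_freelist)) != NULL) {
 *		TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
 *		TAILQ_INSERT_AFTER(&bufqueues[queue], bp, sentinel,
 *		    b_freelist);
 *		... try to lock and flush bp ...
 *	}
 *	TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
 */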
2639static int flushwithdeps = 0;
2640SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps,
2641    0, "Number of buffers flushed with dependencies that require rollbacks");
2642
2643static int
2644flushbufqueues(struct vnode *lvp, int target, int flushdeps)
2645{
2646	struct buf *sentinel;
2647	struct vnode *vp;
2648	struct mount *mp;
2649	struct buf *bp;
2650	int hasdeps;
2651	int flushed;
2652	int queue;
2653	int error;
2654	bool unlock;
2655
2656	flushed = 0;
2657	queue = QUEUE_DIRTY;
2658	bp = NULL;
2659	sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO);
2660	sentinel->b_qindex = QUEUE_SENTINEL;
2661	mtx_lock(&bqdirty);
2662	TAILQ_INSERT_HEAD(&bufqueues[queue], sentinel, b_freelist);
2663	mtx_unlock(&bqdirty);
2664	while (flushed != target) {
2665		maybe_yield();
2666		mtx_lock(&bqdirty);
2667		bp = TAILQ_NEXT(sentinel, b_freelist);
2668		if (bp != NULL) {
2669			TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
2670			TAILQ_INSERT_AFTER(&bufqueues[queue], bp, sentinel,
2671			    b_freelist);
2672		} else {
2673			mtx_unlock(&bqdirty);
2674			break;
2675		}
2676		/*
2677		 * Skip sentinels inserted by other invocations of the
2678		 * flushbufqueues(), taking care to not reorder them.
2679		 *
2680		 * Only flush the buffers that belong to the
2681		 * vnode locked by the curthread.
2682		 */
2683		if (bp->b_qindex == QUEUE_SENTINEL || (lvp != NULL &&
2684		    bp->b_vp != lvp)) {
2685			mtx_unlock(&bqdirty);
2686 			continue;
2687		}
2688		error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL);
2689		mtx_unlock(&bqdirty);
2690		if (error != 0)
2691			continue;
2692		if (bp->b_pin_count > 0) {
2693			BUF_UNLOCK(bp);
2694			continue;
2695		}
2696		/*
2697		 * BKGRDINPROG can only be set with the buf and bufobj
2698		 * locks both held.  We tolerate a race to clear it here.
2699		 */
2700		if ((bp->b_vflags & BV_BKGRDINPROG) != 0 ||
2701		    (bp->b_flags & B_DELWRI) == 0) {
2702			BUF_UNLOCK(bp);
2703			continue;
2704		}
2705		if (bp->b_flags & B_INVAL) {
2706			bremfreef(bp);
2707			brelse(bp);
2708			flushed++;
2709			continue;
2710		}
2711
2712		if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) {
2713			if (flushdeps == 0) {
2714				BUF_UNLOCK(bp);
2715				continue;
2716			}
2717			hasdeps = 1;
2718		} else
2719			hasdeps = 0;
2720		/*
2721		 * We must hold the lock on a vnode before writing
2722		 * one of its buffers. Otherwise we may confuse, or
2723		 * in the case of a snapshot vnode, deadlock the
2724		 * system.
2725		 *
2726		 * The lock order here is the reverse of the normal
2727		 * order of vnode lock followed by buf lock.  This is ok because
2728		 * the NOWAIT will prevent deadlock.
2729		 */
2730		vp = bp->b_vp;
2731		if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
2732			BUF_UNLOCK(bp);
2733			continue;
2734		}
2735		if (lvp == NULL) {
2736			unlock = true;
2737			error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT);
2738		} else {
2739			ASSERT_VOP_LOCKED(vp, "getbuf");
2740			unlock = false;
2741			error = VOP_ISLOCKED(vp) == LK_EXCLUSIVE ? 0 :
2742			    vn_lock(vp, LK_TRYUPGRADE);
2743		}
2744		if (error == 0) {
2745			CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X",
2746			    bp, bp->b_vp, bp->b_flags);
2747			if (curproc == bufdaemonproc) {
2748				vfs_bio_awrite(bp);
2749			} else {
2750				bremfree(bp);
2751				bwrite(bp);
2752				notbufdflushes++;
2753			}
2754			vn_finished_write(mp);
2755			if (unlock)
2756				VOP_UNLOCK(vp, 0);
2757			flushwithdeps += hasdeps;
2758			flushed++;
2759
2760			/*
2761			 * Sleeping on runningbufspace while holding
2762			 * vnode lock leads to deadlock.
2763			 */
2764			if (curproc == bufdaemonproc &&
2765			    runningbufspace > hirunningspace)
2766				waitrunningbufspace();
2767			continue;
2768		}
2769		vn_finished_write(mp);
2770		BUF_UNLOCK(bp);
2771	}
2772	mtx_lock(&bqdirty);
2773	TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
2774	mtx_unlock(&bqdirty);
2775	free(sentinel, M_TEMP);
2776	return (flushed);
2777}
2778
2779/*
2780 * Check to see if a block is currently memory resident.
2781 */
2782struct buf *
2783incore(struct bufobj *bo, daddr_t blkno)
2784{
2785	struct buf *bp;
2786
2787	BO_RLOCK(bo);
2788	bp = gbincore(bo, blkno);
2789	BO_RUNLOCK(bo);
2790	return (bp);
2791}
2792
2793/*
2794 * Returns true if no I/O is needed to access the
2795 * associated VM object.  This is like incore except
2796 * it also hunts around in the VM system for the data.
2797 */
2798
2799static int
2800inmem(struct vnode * vp, daddr_t blkno)
2801{
2802	vm_object_t obj;
2803	vm_offset_t toff, tinc, size;
2804	vm_page_t m;
2805	vm_ooffset_t off;
2806
2807	ASSERT_VOP_LOCKED(vp, "inmem");
2808
2809	if (incore(&vp->v_bufobj, blkno))
2810		return 1;
2811	if (vp->v_mount == NULL)
2812		return 0;
2813	obj = vp->v_object;
2814	if (obj == NULL)
2815		return (0);
2816
2817	size = PAGE_SIZE;
2818	if (size > vp->v_mount->mnt_stat.f_iosize)
2819		size = vp->v_mount->mnt_stat.f_iosize;
2820	off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;
2821
2822	VM_OBJECT_RLOCK(obj);
2823	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
2824		m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
2825		if (!m)
2826			goto notinmem;
2827		tinc = size;
2828		if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK))
2829			tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK);
2830		if (vm_page_is_valid(m,
2831		    (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0)
2832			goto notinmem;
2833	}
2834	VM_OBJECT_RUNLOCK(obj);
2835	return 1;
2836
2837notinmem:
2838	VM_OBJECT_RUNLOCK(obj);
2839	return (0);
2840}
2841
2842/*
2843 * Set the dirty range for a buffer based on the status of the dirty
2844 * bits in the pages comprising the buffer.  The range is limited
2845 * to the size of the buffer.
2846 *
2847 * Tell the VM system that the pages associated with this buffer
2848 * are clean.  This is used for delayed writes where the data is
2849 * going to go to disk eventually without additional VM intervention.
2850 *
2851 * Note that while we only really need to clean through to b_bcount, we
2852 * just go ahead and clean through to b_bufsize.
2853 */
2854static void
2855vfs_clean_pages_dirty_buf(struct buf *bp)
2856{
2857	vm_ooffset_t foff, noff, eoff;
2858	vm_page_t m;
2859	int i;
2860
2861	if ((bp->b_flags & B_VMIO) == 0 || bp->b_bufsize == 0)
2862		return;
2863
2864	foff = bp->b_offset;
2865	KASSERT(bp->b_offset != NOOFFSET,
2866	    ("vfs_clean_pages_dirty_buf: no buffer offset"));
2867
2868	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
2869	vfs_drain_busy_pages(bp);
2870	vfs_setdirty_locked_object(bp);
2871	for (i = 0; i < bp->b_npages; i++) {
2872		noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
2873		eoff = noff;
2874		if (eoff > bp->b_offset + bp->b_bufsize)
2875			eoff = bp->b_offset + bp->b_bufsize;
2876		m = bp->b_pages[i];
2877		vfs_page_set_validclean(bp, foff, m);
2878		/* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */
2879		foff = noff;
2880	}
2881	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
2882}
2883
2884static void
2885vfs_setdirty_locked_object(struct buf *bp)
2886{
2887	vm_object_t object;
2888	int i;
2889
2890	object = bp->b_bufobj->bo_object;
2891	VM_OBJECT_ASSERT_WLOCKED(object);
2892
2893	/*
2894	 * We qualify the scan for modified pages on whether the
2895	 * object has been flushed yet.
2896	 */
2897	if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) {
2898		vm_offset_t boffset;
2899		vm_offset_t eoffset;
2900
2901		/*
2902		 * test the pages to see if they have been modified directly
2903		 * by users through the VM system.
2904		 */
2905		for (i = 0; i < bp->b_npages; i++)
2906			vm_page_test_dirty(bp->b_pages[i]);
2907
2908		/*
2909		 * Calculate the encompassing dirty range, boffset and eoffset,
2910		 * (eoffset - boffset) bytes.
2911		 */
2912
2913		for (i = 0; i < bp->b_npages; i++) {
2914			if (bp->b_pages[i]->dirty)
2915				break;
2916		}
2917		boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
2918
2919		for (i = bp->b_npages - 1; i >= 0; --i) {
2920			if (bp->b_pages[i]->dirty) {
2921				break;
2922			}
2923		}
2924		eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
2925
2926		/*
2927		 * Fit it to the buffer.
2928		 */
2929
2930		if (eoffset > bp->b_bcount)
2931			eoffset = bp->b_bcount;
2932
2933		/*
2934		 * If we have a good dirty range, merge with the existing
2935		 * dirty range.
2936		 */
2937
2938		if (boffset < eoffset) {
2939			if (bp->b_dirtyoff > boffset)
2940				bp->b_dirtyoff = boffset;
2941			if (bp->b_dirtyend < eoffset)
2942				bp->b_dirtyend = eoffset;
2943		}
2944	}
2945}
2946
2947/*
2948 * Allocate the KVA mapping for an existing buffer. It handles the
2949 * cases of both B_UNMAPPED buffer, and buffer with the preallocated
2950 * KVA which is not mapped (B_KVAALLOC).
2951 */
2952static void
2953bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags)
2954{
2955	struct buf *scratch_bp;
2956	int bsize, maxsize, need_mapping, need_kva;
2957	off_t offset;
2958
2959	need_mapping = (bp->b_flags & B_UNMAPPED) != 0 &&
2960	    (gbflags & GB_UNMAPPED) == 0;
2961	need_kva = (bp->b_flags & (B_KVAALLOC | B_UNMAPPED)) == B_UNMAPPED &&
2962	    (gbflags & GB_KVAALLOC) != 0;
2963	if (!need_mapping && !need_kva)
2964		return;
2965
2966	BUF_CHECK_UNMAPPED(bp);
2967
2968	if (need_mapping && (bp->b_flags & B_KVAALLOC) != 0) {
2969		/*
2970		 * Buffer is not mapped, but the KVA was already
2971		 * reserved at the time of the instantiation.  Use the
2972		 * allocated space.
2973		 */
2974		bp->b_flags &= ~B_KVAALLOC;
2975		KASSERT(bp->b_kvaalloc != 0, ("kvaalloc == 0"));
2976		bp->b_kvabase = bp->b_kvaalloc;
2977		atomic_subtract_long(&unmapped_bufspace, bp->b_kvasize);
2978		goto has_addr;
2979	}
2980
2981	/*
2982	 * Calculate the amount of the address space we would reserve
2983	 * if the buffer was mapped.
2984	 */
2985	bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize;
2986	offset = blkno * bsize;
2987	maxsize = size + (offset & PAGE_MASK);
2988	maxsize = imax(maxsize, bsize);
2989
2990mapping_loop:
2991	if (allocbufkva(bp, maxsize, gbflags)) {
2992		/*
2993		 * Request defragmentation.  getnewbuf() returns the
2994		 * allocated space to us via the scratch buffer KVA.
2995		 */
2996		scratch_bp = getnewbuf(bp->b_vp, 0, 0, size, maxsize, gbflags |
2997		    (GB_UNMAPPED | GB_KVAALLOC));
2998		if (scratch_bp == NULL) {
2999			if ((gbflags & GB_NOWAIT_BD) != 0) {
3000				/*
3001				 * XXXKIB: defragmentation cannot
3002				 * succeed, not sure what else to do.
3003				 */
3004				panic("GB_NOWAIT_BD and B_UNMAPPED %p", bp);
3005			}
3006			atomic_add_int(&mappingrestarts, 1);
3007			goto mapping_loop;
3008		}
3009		KASSERT((scratch_bp->b_flags & B_KVAALLOC) != 0,
3010		    ("scratch bp !B_KVAALLOC %p", scratch_bp));
3011		setbufkva(bp, (vm_offset_t)scratch_bp->b_kvaalloc,
3012		    scratch_bp->b_kvasize, gbflags);
3013
3014		/* Get rid of the scratch buffer. */
3015		scratch_bp->b_kvasize = 0;
3016		scratch_bp->b_flags |= B_INVAL;
3017		scratch_bp->b_flags &= ~(B_UNMAPPED | B_KVAALLOC);
3018		brelse(scratch_bp);
3019	}
3020	if (!need_mapping)
3021		return;
3022
3023has_addr:
3024	bp->b_saveaddr = bp->b_kvabase;
3025	bp->b_data = bp->b_saveaddr; /* b_offset is handled by bpmap_qenter */
3026	bp->b_flags &= ~B_UNMAPPED;
3027	BUF_CHECK_MAPPED(bp);
3028	bpmap_qenter(bp);
3029}
3030
3031/*
3032 *	getblk:
3033 *
3034 *	Get a block given a specified block and offset into a file/device.
3035 *	The buffers B_DONE bit will be cleared on return, making it almost
3036 *	The buffer's B_DONE bit will be cleared on return, making it almost
3037 *	return.  The caller should clear B_INVAL prior to initiating a
3038 *	READ.
3039 *
3040 *	For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
3041 *	an existing buffer.
3042 *
3043 *	For a VMIO buffer, B_CACHE is modified according to the backing VM.
3044 *	If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
3045 *	and then cleared based on the backing VM.  If the previous buffer is
3046 *	non-0-sized but invalid, B_CACHE will be cleared.
3047 *
3048 *	If getblk() must create a new buffer, the new buffer is returned with
3049 *	both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
3050 *	case it is returned with B_INVAL clear and B_CACHE set based on the
3051 *	backing VM.
3052 *
3053 *	getblk() also forces a bwrite() for any B_DELWRI buffer whose
3054 *	B_CACHE bit is clear.
3055 *
3056 *	What this means, basically, is that the caller should use B_CACHE to
3057 *	determine whether the buffer is fully valid or not and should clear
3058 *	B_INVAL prior to issuing a read.  If the caller intends to validate
3059 *	the buffer by loading its data area with something, the caller needs
3060 *	to clear B_INVAL.  If the caller does this without issuing an I/O,
3061 *	the caller should set B_CACHE ( as an optimization ), else the caller
3062 *	should issue the I/O and biodone() will set B_CACHE if the I/O was
3063 *	a write attempt or if it was a successful read.  If the caller
3064 *	intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR
3065 *	prior to issuing the READ.  biodone() will *not* clear B_INVAL.
3066 */
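/*
 * Illustrative sketch of the B_CACHE protocol described above, roughly
 * what bread(9) does (accounting and error handling trimmed; "vp",
 * "lbn", "size" and "cred" are hypothetical):
 *
 *	bp = getblk(vp, lbn, size, 0, 0, 0);
 *	if ((bp->b_flags & B_CACHE) == 0) {
 *		bp->b_iocmd = BIO_READ;
 *		bp->b_flags &= ~B_INVAL;
 *		bp->b_ioflags &= ~BIO_ERROR;
 *		if (bp->b_rcred == NOCRED && cred != NOCRED)
 *			bp->b_rcred = crhold(cred);
 *		vfs_busy_pages(bp, 0);
 *		bp->b_iooffset = dbtob(bp->b_blkno);
 *		bstrategy(bp);
 *		error = bufwait(bp);
 *	}
 */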
3067struct buf *
3068getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo,
3069    int flags)
3070{
3071	struct buf *bp;
3072	struct bufobj *bo;
3073	int bsize, error, maxsize, vmio;
3074	off_t offset;
3075
3076	CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size);
3077	KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
3078	    ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
3079	ASSERT_VOP_LOCKED(vp, "getblk");
3080	if (size > MAXBCACHEBUF)
3081		panic("getblk: size(%d) > MAXBCACHEBUF(%d)\n", size,
3082		    MAXBCACHEBUF);
3083	if (!unmapped_buf_allowed)
3084		flags &= ~(GB_UNMAPPED | GB_KVAALLOC);
3085
3086	bo = &vp->v_bufobj;
3087loop:
3088	BO_RLOCK(bo);
3089	bp = gbincore(bo, blkno);
3090	if (bp != NULL) {
3091		int lockflags;
3092		/*
3093		 * Buffer is in-core.  If the buffer is neither busy nor managed,
3094		 * it must be on a queue.
3095		 */
3096		lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK;
3097
3098		if (flags & GB_LOCK_NOWAIT)
3099			lockflags |= LK_NOWAIT;
3100
3101		error = BUF_TIMELOCK(bp, lockflags,
3102		    BO_LOCKPTR(bo), "getblk", slpflag, slptimeo);
3103
3104		/*
3105		 * If we slept and got the lock we have to restart in case
3106		 * the buffer changed identities.
3107		 */
3108		if (error == ENOLCK)
3109			goto loop;
3110		/* We timed out or were interrupted. */
3111		else if (error)
3112			return (NULL);
3113		/* If recursed, assume caller knows the rules. */
3114		else if (BUF_LOCKRECURSED(bp))
3115			goto end;
3116
3117		/*
3118		 * The buffer is locked.  B_CACHE is cleared if the buffer is
3119		 * invalid.  Otherwise, for a non-VMIO buffer, B_CACHE is set
3120		 * and for a VMIO buffer B_CACHE is adjusted according to the
3121		 * backing VM cache.
3122		 */
3123		if (bp->b_flags & B_INVAL)
3124			bp->b_flags &= ~B_CACHE;
3125		else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0)
3126			bp->b_flags |= B_CACHE;
3127		if (bp->b_flags & B_MANAGED)
3128			MPASS(bp->b_qindex == QUEUE_NONE);
3129		else
3130			bremfree(bp);
3131
3132		/*
3133		 * check for size inconsistencies for non-VMIO case.
3134		 */
3135		if (bp->b_bcount != size) {
3136			if ((bp->b_flags & B_VMIO) == 0 ||
3137			    (size > bp->b_kvasize)) {
3138				if (bp->b_flags & B_DELWRI) {
3139					/*
3140					 * If the buffer is pinned and the caller
3141					 * does not want to sleep waiting for it
3142					 * to be unpinned, bail out.
3143					 */
3144					if (bp->b_pin_count > 0) {
3145						if (flags & GB_LOCK_NOWAIT) {
3146							bqrelse(bp);
3147							return (NULL);
3148						} else {
3149							bunpin_wait(bp);
3150						}
3151					}
3152					bp->b_flags |= B_NOCACHE;
3153					bwrite(bp);
3154				} else {
3155					if (LIST_EMPTY(&bp->b_dep)) {
3156						bp->b_flags |= B_RELBUF;
3157						brelse(bp);
3158					} else {
3159						bp->b_flags |= B_NOCACHE;
3160						bwrite(bp);
3161					}
3162				}
3163				goto loop;
3164			}
3165		}
3166
3167		/*
3168		 * Handle the case of unmapped buffer which should
3169		 * become mapped, or the buffer for which KVA
3170		 * reservation is requested.
3171		 */
3172		bp_unmapped_get_kva(bp, blkno, size, flags);
3173
3174		/*
3175		 * If the size is inconsistent in the VMIO case, we can resize
3176		 * the buffer.  This might lead to B_CACHE getting set or
3177		 * cleared.  If the size has not changed, B_CACHE remains
3178		 * unchanged from its previous state.
3179		 */
3180		if (bp->b_bcount != size)
3181			allocbuf(bp, size);
3182
3183		KASSERT(bp->b_offset != NOOFFSET,
3184		    ("getblk: no buffer offset"));
3185
3186		/*
3187		 * A buffer with B_DELWRI set and B_CACHE clear must
3188		 * be committed before we can return the buffer in
3189		 * order to prevent the caller from issuing a read
3190		 * ( due to B_CACHE not being set ) and overwriting
3191		 * it.
3192		 *
3193		 * Most callers, including NFS and FFS, need this to
3194		 * operate properly either because they assume they
3195		 * can issue a read if B_CACHE is not set, or because
3196		 * ( for example ) an uncached B_DELWRI might loop due
3197		 * to softupdates re-dirtying the buffer.  In the latter
3198		 * case, B_CACHE is set after the first write completes,
3199		 * preventing further loops.
3200		 * NOTE!  b*write() sets B_CACHE.  If we cleared B_CACHE
3201		 * above while extending the buffer, we cannot allow the
3202		 * buffer to remain with B_CACHE set after the write
3203		 * completes or it will represent a corrupt state.  To
3204		 * deal with this we set B_NOCACHE to scrap the buffer
3205		 * after the write.
3206		 *
3207		 * We might be able to do something fancy, like setting
3208		 * B_CACHE in bwrite() except if B_DELWRI is already set,
3209		 * so the below call doesn't set B_CACHE, but that gets real
3210		 * confusing.  This is much easier.
3211		 */
3212
3213		if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
3214			bp->b_flags |= B_NOCACHE;
3215			bwrite(bp);
3216			goto loop;
3217		}
3218		bp->b_flags &= ~B_DONE;
3219	} else {
3220		/*
3221		 * Buffer is not in-core, create new buffer.  The buffer
3222		 * returned by getnewbuf() is locked.  Note that the returned
3223		 * buffer is also considered valid (not marked B_INVAL).
3224		 */
3225		BO_RUNLOCK(bo);
3226		/*
3227		 * If the user does not want us to create the buffer, bail out
3228		 * here.
3229		 */
3230		if (flags & GB_NOCREAT)
3231			return (NULL);
3232		if (numfreebuffers == 0 && TD_IS_IDLETHREAD(curthread))
3233			return (NULL);
3234
3235		bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize;
3236		offset = blkno * bsize;
3237		vmio = vp->v_object != NULL;
3238		if (vmio) {
3239			maxsize = size + (offset & PAGE_MASK);
3240		} else {
3241			maxsize = size;
3242			/* Do not allow non-VMIO unmapped buffers. */
3243			flags &= ~GB_UNMAPPED;
3244		}
3245		maxsize = imax(maxsize, bsize);
3246
3247		bp = getnewbuf(vp, slpflag, slptimeo, size, maxsize, flags);
3248		if (bp == NULL) {
3249			if (slpflag || slptimeo)
3250				return (NULL);
3251			goto loop;
3252		}
3253
3254		/*
3255		 * This code is used to make sure that a buffer is not
3256		 * created while the getnewbuf routine is blocked.
3257		 * This can be a problem whether the vnode is locked or not.
3258		 * If the buffer is created out from under us, we have to
3259		 * throw away the one we just created.
3260		 *
3261		 * Note: this must occur before we associate the buffer
3262		 * with the vp especially considering limitations in
3263		 * the splay tree implementation when dealing with duplicate
3264		 * lblkno's.
3265		 */
3266		BO_LOCK(bo);
3267		if (gbincore(bo, blkno)) {
3268			BO_UNLOCK(bo);
3269			bp->b_flags |= B_INVAL;
3270			brelse(bp);
3271			goto loop;
3272		}
3273
3274		/*
3275		 * Insert the buffer into the hash, so that it can
3276		 * be found by incore.
3277		 */
3278		bp->b_blkno = bp->b_lblkno = blkno;
3279		bp->b_offset = offset;
3280		bgetvp(vp, bp);
3281		BO_UNLOCK(bo);
3282
3283		/*
3284		 * Set the B_VMIO bit.  allocbuf() will grow the buffer.  Since the
3285		 * buffer size starts out as 0, B_CACHE will be set by
3286		 * allocbuf() for the VMIO case prior to it testing the
3287		 * backing store for validity.
3288		 */
3289
3290		if (vmio) {
3291			bp->b_flags |= B_VMIO;
3292			KASSERT(vp->v_object == bp->b_bufobj->bo_object,
3293			    ("ARGH! different b_bufobj->bo_object %p %p %p\n",
3294			    bp, vp->v_object, bp->b_bufobj->bo_object));
3295		} else {
3296			bp->b_flags &= ~B_VMIO;
3297			KASSERT(bp->b_bufobj->bo_object == NULL,
3298			    ("ARGH! has b_bufobj->bo_object %p %p\n",
3299			    bp, bp->b_bufobj->bo_object));
3300			BUF_CHECK_MAPPED(bp);
3301		}
3302
3303		allocbuf(bp, size);
3304		bp->b_flags &= ~B_DONE;
3305	}
3306	CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp);
3307	BUF_ASSERT_HELD(bp);
3308end:
3309	KASSERT(bp->b_bufobj == bo,
3310	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
3311	return (bp);
3312}
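
/*
 * A minimal usage sketch for getblk(): a hypothetical filesystem looks up
 * one logical block of a vnode, fills it if it was not cached, and then
 * releases it.  The names vp, lblkno and bsize, the defensive NULL check
 * and the fill step are illustrative assumptions, not part of this file:
 *
 *	struct buf *bp;
 *
 *	bp = getblk(vp, lblkno, bsize, 0, 0, 0);
 *	if (bp == NULL)
 *		return (EBUSY);
 *	if ((bp->b_flags & B_CACHE) == 0) {
 *		(fill or read bp->b_data here)
 *	}
 *	bqrelse(bp);
 *
 * The buffer comes back locked; bqrelse() drops the lock but leaves the
 * block cached for the next lookup.
 */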
3313
3314/*
3315 * Get an empty, disassociated buffer of given size.  The buffer is initially
3316 * set to B_INVAL.
3317 */
3318struct buf *
3319geteblk(int size, int flags)
3320{
3321	struct buf *bp;
3322	int maxsize;
3323
3324	maxsize = (size + BKVAMASK) & ~BKVAMASK;
3325	while ((bp = getnewbuf(NULL, 0, 0, size, maxsize, flags)) == NULL) {
3326		if ((flags & GB_NOWAIT_BD) &&
3327		    (curthread->td_pflags & TDP_BUFNEED) != 0)
3328			return (NULL);
3329	}
3330	allocbuf(bp, size);
3331	bp->b_flags |= B_INVAL;	/* b_dep cleared by getnewbuf() */
3332	BUF_ASSERT_HELD(bp);
3333	return (bp);
3334}
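
/*
 * A short sketch of a geteblk() consumer, e.g. a driver needing scratch
 * space that is not tied to any vnode; the size and the use made of the
 * data area are assumptions:
 *
 *	struct buf *bp;
 *
 *	bp = geteblk(MAXBSIZE, 0);
 *	(use bp->b_data as a MAXBSIZE-byte scratch area)
 *	brelse(bp);
 *
 * Because geteblk() leaves B_INVAL set, brelse() simply discards the
 * buffer instead of caching it.
 */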
3335
3336
3337/*
3338 * This code constructs the buffer memory from either anonymous system
3339 * memory (in the case of non-VMIO operations) or from an associated
3340 * VM object (in the case of VMIO operations).  This code is able to
3341 * resize a buffer up or down.
3342 *
3343 * Note that this code is tricky, and has many complications to resolve
3344 * deadlock or inconsistent data situations.  Tread lightly!!!
3345 * There are B_CACHE and B_DELWRI interactions that must be dealt with by
3346 * the caller.  Calling this code willy nilly can result in the loss of data.
3347 *
3348 * allocbuf() only adjusts B_CACHE for VMIO buffers.  getblk() deals with
3349 * B_CACHE for the non-VMIO case.
3350 */
3351
3352int
3353allocbuf(struct buf *bp, int size)
3354{
3355	int newbsize, mbsize;
3356	int i;
3357
3358	BUF_ASSERT_HELD(bp);
3359
3360	if (bp->b_kvasize < size)
3361		panic("allocbuf: buffer too small");
3362
3363	if ((bp->b_flags & B_VMIO) == 0) {
3364		caddr_t origbuf;
3365		int origbufsize;
3366		/*
3367		 * Just get anonymous memory from the kernel.  Don't
3368		 * mess with B_CACHE.
3369		 */
3370		mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
3371		if (bp->b_flags & B_MALLOC)
3372			newbsize = mbsize;
3373		else
3374			newbsize = round_page(size);
3375
3376		if (newbsize < bp->b_bufsize) {
3377			/*
3378			 * malloced buffers are not shrunk
3379			 */
3380			if (bp->b_flags & B_MALLOC) {
3381				if (newbsize) {
3382					bp->b_bcount = size;
3383				} else {
3384					free(bp->b_data, M_BIOBUF);
3385					if (bp->b_bufsize) {
3386						atomic_subtract_long(
3387						    &bufmallocspace,
3388						    bp->b_bufsize);
3389						bufspacewakeup();
3390						bp->b_bufsize = 0;
3391					}
3392					bp->b_saveaddr = bp->b_kvabase;
3393					bp->b_data = bp->b_saveaddr;
3394					bp->b_bcount = 0;
3395					bp->b_flags &= ~B_MALLOC;
3396				}
3397				return 1;
3398			}
3399			vm_hold_free_pages(bp, newbsize);
3400		} else if (newbsize > bp->b_bufsize) {
3401			/*
3402			 * We only use malloced memory on the first allocation,
3403			 * and revert to page-allocated memory when the buffer
3404			 * grows.
3405			 */
3406			/*
3407			 * There is a potential smp race here that could lead
3408			 * There is a potential SMP race here that could lead
3409			 * to bufmallocspace slightly exceeding the max.  It
3410			 * over.
3411			 */
3412			if (bufmallocspace < maxbufmallocspace &&
3413			    bp->b_bufsize == 0 && mbsize <= PAGE_SIZE / 2) {
3416				bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
3417				bp->b_bufsize = mbsize;
3418				bp->b_bcount = size;
3419				bp->b_flags |= B_MALLOC;
3420				atomic_add_long(&bufmallocspace, mbsize);
3421				return 1;
3422			}
3423			origbuf = NULL;
3424			origbufsize = 0;
3425			/*
3426			 * If the buffer is growing on its other-than-first allocation,
3427			 * then we revert to the page-allocation scheme.
3428			 */
3429			if (bp->b_flags & B_MALLOC) {
3430				origbuf = bp->b_data;
3431				origbufsize = bp->b_bufsize;
3432				bp->b_data = bp->b_kvabase;
3433				if (bp->b_bufsize) {
3434					atomic_subtract_long(&bufmallocspace,
3435					    bp->b_bufsize);
3436					bufspacewakeup();
3437					bp->b_bufsize = 0;
3438				}
3439				bp->b_flags &= ~B_MALLOC;
3440				newbsize = round_page(newbsize);
3441			}
3442			vm_hold_load_pages(
3443			    bp,
3444			    (vm_offset_t) bp->b_data + bp->b_bufsize,
3445			    (vm_offset_t) bp->b_data + newbsize);
3446			if (origbuf) {
3447				bcopy(origbuf, bp->b_data, origbufsize);
3448				free(origbuf, M_BIOBUF);
3449			}
3450		}
3451	} else {
3452		int desiredpages;
3453
3454		newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
3455		desiredpages = (size == 0) ? 0 :
3456			num_pages((bp->b_offset & PAGE_MASK) + newbsize);
3457
3458		if (bp->b_flags & B_MALLOC)
3459			panic("allocbuf: VMIO buffer can't be malloced");
3460		/*
3461		 * Set B_CACHE initially if buffer is 0 length or will become
3462		 * 0-length.
3463		 */
3464		if (size == 0 || bp->b_bufsize == 0)
3465			bp->b_flags |= B_CACHE;
3466
3467		if (newbsize < bp->b_bufsize) {
3468			/*
3469			 * DEV_BSIZE aligned new buffer size is less than the
3470			 * DEV_BSIZE aligned existing buffer size.  Figure out
3471			 * if we have to remove any pages.
3472			 */
3473			if (desiredpages < bp->b_npages) {
3474				vm_page_t m;
3475
3476				if ((bp->b_flags & B_UNMAPPED) == 0) {
3477					BUF_CHECK_MAPPED(bp);
3478					pmap_qremove((vm_offset_t)trunc_page(
3479					    (vm_offset_t)bp->b_data) +
3480					    (desiredpages << PAGE_SHIFT),
3481					    (bp->b_npages - desiredpages));
3482				} else
3483					BUF_CHECK_UNMAPPED(bp);
3484				VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
3485				for (i = desiredpages; i < bp->b_npages; i++) {
3486					/*
3487					 * the page is not freed here -- it
3488					 * is the responsibility of
3489					 * vnode_pager_setsize
3490					 */
3491					m = bp->b_pages[i];
3492					KASSERT(m != bogus_page,
3493					    ("allocbuf: bogus page found"));
3494					while (vm_page_sleep_if_busy(m,
3495					    "biodep"))
3496						continue;
3497
3498					bp->b_pages[i] = NULL;
3499					vm_page_lock(m);
3500					vm_page_unwire(m, 0);
3501					vm_page_unlock(m);
3502				}
3503				VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
3504				bp->b_npages = desiredpages;
3505			}
3506		} else if (size > bp->b_bcount) {
3507			/*
3508			 * We are growing the buffer, possibly in a
3509			 * byte-granular fashion.
3510			 */
3511			vm_object_t obj;
3512			vm_offset_t toff;
3513			vm_offset_t tinc;
3514
3515			/*
3516			 * Step 1, bring in the VM pages from the object,
3517			 * allocating them if necessary.  We must clear
3518			 * B_CACHE if these pages are not valid for the
3519			 * range covered by the buffer.
3520			 */
3521
3522			obj = bp->b_bufobj->bo_object;
3523
3524			VM_OBJECT_WLOCK(obj);
3525			while (bp->b_npages < desiredpages) {
3526				vm_page_t m;
3527
3528				/*
3529				 * We must allocate system pages since blocking
3530				 * here could interfere with paging I/O, no
3531				 * matter which process we are.
3532				 *
3533				 * Only exclusive busy can be tested here.
3534				 * Blocking on shared busy might lead to
3535				 * deadlocks once allocbuf() is called after
3536				 * the pages have been busied by vfs_busy_pages().
3537				 */
3538				m = vm_page_grab(obj, OFF_TO_IDX(bp->b_offset) +
3539				    bp->b_npages, VM_ALLOC_NOBUSY |
3540				    VM_ALLOC_SYSTEM | VM_ALLOC_WIRED |
3541				    VM_ALLOC_IGN_SBUSY |
3542				    VM_ALLOC_COUNT(desiredpages - bp->b_npages));
3543				if (m->valid == 0)
3544					bp->b_flags &= ~B_CACHE;
3545				bp->b_pages[bp->b_npages] = m;
3546				++bp->b_npages;
3547			}
3548
3549			/*
3550			 * Step 2.  We've loaded the pages into the buffer,
3551			 * we have to figure out if we can still have B_CACHE
3552			 * set.  Note that B_CACHE is set according to the
3553			 * byte-granular range ( bcount and size ), not the
3554			 * aligned range ( newbsize ).
3555			 *
3556			 * The VM test is against m->valid, which is DEV_BSIZE
3557			 * aligned.  Needless to say, the validity of the data
3558			 * needs to also be DEV_BSIZE aligned.  Note that this
3559			 * fails with NFS if the server or some other client
3560			 * extends the file's EOF.  If our buffer is resized,
3561			 * B_CACHE may remain set! XXX
3562			 */
3563
3564			toff = bp->b_bcount;
3565			tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK);
3566
3567			while ((bp->b_flags & B_CACHE) && toff < size) {
3568				vm_pindex_t pi;
3569
3570				if (tinc > (size - toff))
3571					tinc = size - toff;
3572
3573				pi = ((bp->b_offset & PAGE_MASK) + toff) >>
3574				    PAGE_SHIFT;
3575
3576				vfs_buf_test_cache(
3577				    bp,
3578				    bp->b_offset,
3579				    toff,
3580				    tinc,
3581				    bp->b_pages[pi]
3582				);
3583				toff += tinc;
3584				tinc = PAGE_SIZE;
3585			}
3586			VM_OBJECT_WUNLOCK(obj);
3587
3588			/*
3589			 * Step 3, fixup the KVM pmap.
3590			 */
3591			if ((bp->b_flags & B_UNMAPPED) == 0)
3592				bpmap_qenter(bp);
3593			else
3594				BUF_CHECK_UNMAPPED(bp);
3595		}
3596	}
3597	if (newbsize < bp->b_bufsize)
3598		bufspacewakeup();
3599	bp->b_bufsize = newbsize;	/* actual buffer allocation	*/
3600	bp->b_bcount = size;		/* requested buffer size	*/
3601	return 1;
3602}
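
/*
 * A small sketch of resizing a buffer with allocbuf() directly; the
 * growth from osize to nsize and the surrounding filesystem logic are
 * hypothetical.  The caller must hold the buffer lock, which getblk()
 * already provides:
 *
 *	bp = getblk(vp, lblkno, osize, 0, 0, 0);
 *	allocbuf(bp, nsize);
 *	if ((bp->b_flags & B_CACHE) == 0) {
 *		(the added range is not valid; zero or read it)
 *	}
 *	bdwrite(bp);
 */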
3603
3604extern int inflight_transient_maps;
3605
3606void
3607biodone(struct bio *bp)
3608{
3609	struct mtx *mtxp;
3610	void (*done)(struct bio *);
3611	vm_offset_t start, end;
3612
3613	if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) {
3614		bp->bio_flags &= ~BIO_TRANSIENT_MAPPING;
3615		bp->bio_flags |= BIO_UNMAPPED;
3616		start = trunc_page((vm_offset_t)bp->bio_data);
3617		end = round_page((vm_offset_t)bp->bio_data + bp->bio_length);
3618		pmap_qremove(start, OFF_TO_IDX(end - start));
3619		vmem_free(transient_arena, start, end - start);
3620		atomic_add_int(&inflight_transient_maps, -1);
3621	}
3622	done = bp->bio_done;
3623	if (done == NULL) {
3624		mtxp = mtx_pool_find(mtxpool_sleep, bp);
3625		mtx_lock(mtxp);
3626		bp->bio_flags |= BIO_DONE;
3627		wakeup(bp);
3628		mtx_unlock(mtxp);
3629	} else {
3630		bp->bio_flags |= BIO_DONE;
3631		done(bp);
3632	}
3633}
3634
3635/*
3636 * Wait for a BIO to finish.
3637 */
3638int
3639biowait(struct bio *bp, const char *wchan)
3640{
3641	struct mtx *mtxp;
3642
3643	mtxp = mtx_pool_find(mtxpool_sleep, bp);
3644	mtx_lock(mtxp);
3645	while ((bp->bio_flags & BIO_DONE) == 0)
3646		msleep(bp, mtxp, PRIBIO, wchan, 0);
3647	mtx_unlock(mtxp);
3648	if (bp->bio_error != 0)
3649		return (bp->bio_error);
3650	if (!(bp->bio_flags & BIO_ERROR))
3651		return (0);
3652	return (EIO);
3653}
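
/*
 * A condensed sketch of the struct bio completion protocol implemented
 * by biodone() and biowait(): leaving bio_done NULL makes biodone() set
 * BIO_DONE and wake the thread sleeping in biowait().  dev_strategy_csw()
 * below shows the callback variant, with bufdonebio() as bio_done.  The
 * offset, length, data pointer and wait channel here are assumptions:
 *
 *	struct bio *bip;
 *
 *	bip = g_new_bio();		(NULL check omitted)
 *	bip->bio_cmd = BIO_READ;
 *	bip->bio_offset = offset;
 *	bip->bio_length = length;
 *	bip->bio_data = buffer;
 *	bip->bio_done = NULL;
 *	(*csw->d_strategy)(bip);
 *	error = biowait(bip, "exbio");
 *	g_destroy_bio(bip);
 */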
3654
3655void
3656biofinish(struct bio *bp, struct devstat *stat, int error)
3657{
3658
3659	if (error) {
3660		bp->bio_error = error;
3661		bp->bio_flags |= BIO_ERROR;
3662	}
3663	if (stat != NULL)
3664		devstat_end_transaction_bio(stat, bp);
3665	biodone(bp);
3666}
3667
3668/*
3669 *	bufwait:
3670 *
3671 *	Wait for buffer I/O completion, returning error status.  The buffer
3672 *	is left locked and B_DONE on return.  B_EINTR is converted into an EINTR
3673 *	error and cleared.
3674 */
3675int
3676bufwait(struct buf *bp)
3677{
3678	if (bp->b_iocmd == BIO_READ)
3679		bwait(bp, PRIBIO, "biord");
3680	else
3681		bwait(bp, PRIBIO, "biowr");
3682	if (bp->b_flags & B_EINTR) {
3683		bp->b_flags &= ~B_EINTR;
3684		return (EINTR);
3685	}
3686	if (bp->b_ioflags & BIO_ERROR) {
3687		return (bp->b_error ? bp->b_error : EIO);
3688	} else {
3689		return (0);
3690	}
3691}
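
/*
 * A condensed sketch of the synchronous read pattern built around
 * bufwait(), similar to what bread() does earlier in this file; the
 * credential and read-ahead handling are omitted:
 *
 *	bp = getblk(vp, blkno, size, 0, 0, 0);
 *	if ((bp->b_flags & B_CACHE) == 0) {
 *		bp->b_iocmd = BIO_READ;
 *		bp->b_flags &= ~B_INVAL;
 *		bp->b_ioflags &= ~BIO_ERROR;
 *		vfs_busy_pages(bp, 0);
 *		bp->b_iooffset = dbtob(bp->b_blkno);
 *		bstrategy(bp);
 *		error = bufwait(bp);
 *		if (error != 0)
 *			brelse(bp);
 *	}
 */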
3692
3693/*
3694 * Callback function from struct bio back up to struct buf.
3695 */
3696static void
3697bufdonebio(struct bio *bip)
3698{
3699	struct buf *bp;
3700
3701	bp = bip->bio_caller2;
3702	bp->b_resid = bp->b_bcount - bip->bio_completed;
3703	bp->b_resid = bip->bio_resid;	/* XXX: remove */
3704	bp->b_ioflags = bip->bio_flags;
3705	bp->b_error = bip->bio_error;
3706	if (bp->b_error)
3707		bp->b_ioflags |= BIO_ERROR;
3708	bufdone(bp);
3709	g_destroy_bio(bip);
3710}
3711
3712void
3713dev_strategy(struct cdev *dev, struct buf *bp)
3714{
3715	struct cdevsw *csw;
3716	int ref;
3717
3718	KASSERT(dev->si_refcount > 0,
3719	    ("dev_strategy on un-referenced struct cdev *(%s) %p",
3720	    devtoname(dev), dev));
3721
3722	csw = dev_refthread(dev, &ref);
3723	dev_strategy_csw(dev, csw, bp);
3724	dev_relthread(dev, ref);
3725}
3726
3727void
3728dev_strategy_csw(struct cdev *dev, struct cdevsw *csw, struct buf *bp)
3729{
3730	struct bio *bip;
3731
3732	KASSERT(bp->b_iocmd == BIO_READ || bp->b_iocmd == BIO_WRITE,
3733	    ("b_iocmd botch"));
3734	KASSERT(((dev->si_flags & SI_ETERNAL) != 0 && csw != NULL) ||
3735	    dev->si_threadcount > 0,
3736	    ("dev_strategy_csw threadcount cdev *(%s) %p", devtoname(dev),
3737	    dev));
3738	if (csw == NULL) {
3739		bp->b_error = ENXIO;
3740		bp->b_ioflags = BIO_ERROR;
3741		bufdone(bp);
3742		return;
3743	}
3744	for (;;) {
3745		bip = g_new_bio();
3746		if (bip != NULL)
3747			break;
3748		/* Try again later */
3749		tsleep(&bp, PRIBIO, "dev_strat", hz/10);
3750	}
3751	bip->bio_cmd = bp->b_iocmd;
3752	bip->bio_offset = bp->b_iooffset;
3753	bip->bio_length = bp->b_bcount;
3754	bip->bio_bcount = bp->b_bcount;	/* XXX: remove */
3755	bdata2bio(bp, bip);
3756	bip->bio_done = bufdonebio;
3757	bip->bio_caller2 = bp;
3758	bip->bio_dev = dev;
3759	(*csw->d_strategy)(bip);
3760}
3761
3762/*
3763 *	bufdone:
3764 *
3765 *	Finish I/O on a buffer, optionally calling a completion function.
3766 *	This is usually called from an interrupt so process blocking is
3767 *	not allowed.
3768 *
3769 *	biodone is also responsible for setting B_CACHE in a B_VMIO bp.
3770 *	In a non-VMIO bp, B_CACHE will be set on the next getblk()
3771 *	assuming B_INVAL is clear.
3772 *
3773 *	For the VMIO case, we set B_CACHE if the op was a read and no
3774 *	read error occurred, or if the op was a write.  B_CACHE is never
3775 *	set if the buffer is invalid or otherwise uncacheable.
3776 *
3777 *	biodone does not mess with B_INVAL, allowing the I/O routine or the
3778 *	initiator to leave B_INVAL set to brelse the buffer out of existence
3779 *	in the biodone routine.
3780 */
3781void
3782bufdone(struct buf *bp)
3783{
3784	struct bufobj *dropobj;
3785	void    (*biodone)(struct buf *);
3786
3787	CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
3788	dropobj = NULL;
3789
3790	KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
3791	BUF_ASSERT_HELD(bp);
3792
3793	runningbufwakeup(bp);
3794	if (bp->b_iocmd == BIO_WRITE)
3795		dropobj = bp->b_bufobj;
3796	/* call optional completion function if requested */
3797	if (bp->b_iodone != NULL) {
3798		biodone = bp->b_iodone;
3799		bp->b_iodone = NULL;
3800		(*biodone) (bp);
3801		if (dropobj)
3802			bufobj_wdrop(dropobj);
3803		return;
3804	}
3805
3806	bufdone_finish(bp);
3807
3808	if (dropobj)
3809		bufobj_wdrop(dropobj);
3810}
3811
3812void
3813bufdone_finish(struct buf *bp)
3814{
3815	BUF_ASSERT_HELD(bp);
3816
3817	if (!LIST_EMPTY(&bp->b_dep))
3818		buf_complete(bp);
3819
3820	if (bp->b_flags & B_VMIO) {
3821		vm_ooffset_t foff;
3822		vm_page_t m;
3823		vm_object_t obj;
3824		struct vnode *vp;
3825		int bogus, i, iosize;
3826
3827		obj = bp->b_bufobj->bo_object;
3828		KASSERT(obj->paging_in_progress >= bp->b_npages,
3829		    ("biodone_finish: paging in progress(%d) < b_npages(%d)",
3830		    obj->paging_in_progress, bp->b_npages));
3831
3832		vp = bp->b_vp;
3833		KASSERT(vp->v_holdcnt > 0,
3834		    ("biodone_finish: vnode %p has zero hold count", vp));
3835		KASSERT(vp->v_object != NULL,
3836		    ("biodone_finish: vnode %p has no vm_object", vp));
3837
3838		foff = bp->b_offset;
3839		KASSERT(bp->b_offset != NOOFFSET,
3840		    ("biodone_finish: bp %p has no buffer offset", bp));
3841
3842		/*
3843		 * Set B_CACHE if the op was a normal read and no error
3844		 * occurred.  B_CACHE is set for writes in the b*write()
3845		 * routines.
3846		 */
3847		iosize = bp->b_bcount - bp->b_resid;
3848		if (bp->b_iocmd == BIO_READ &&
3849		    !(bp->b_flags & (B_INVAL|B_NOCACHE)) &&
3850		    !(bp->b_ioflags & BIO_ERROR)) {
3851			bp->b_flags |= B_CACHE;
3852		}
3853		bogus = 0;
3854		VM_OBJECT_WLOCK(obj);
3855		for (i = 0; i < bp->b_npages; i++) {
3856			int bogusflag = 0;
3857			int resid;
3858
3859			resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
3860			if (resid > iosize)
3861				resid = iosize;
3862
3863			/*
3864			 * cleanup bogus pages, restoring the originals
3865			 */
3866			m = bp->b_pages[i];
3867			if (m == bogus_page) {
3868				bogus = bogusflag = 1;
3869				m = vm_page_lookup(obj, OFF_TO_IDX(foff));
3870				if (m == NULL)
3871					panic("biodone: page disappeared!");
3872				bp->b_pages[i] = m;
3873			}
3874			KASSERT(OFF_TO_IDX(foff) == m->pindex,
3875			    ("biodone_finish: foff(%jd)/pindex(%ju) mismatch",
3876			    (intmax_t)foff, (uintmax_t)m->pindex));
3877
3878			/*
3879			 * In the write case, the valid and clean bits are
3880			 * already changed correctly ( see bdwrite() ), so we
3881			 * only need to do this here in the read case.
3882			 */
3883			if ((bp->b_iocmd == BIO_READ) && !bogusflag && resid > 0) {
3884				KASSERT((m->dirty & vm_page_bits(foff &
3885				    PAGE_MASK, resid)) == 0, ("bufdone_finish:"
3886				    " page %p has unexpected dirty bits", m));
3887				vfs_page_set_valid(bp, foff, m);
3888			}
3889
3890			vm_page_sunbusy(m);
3891			vm_object_pip_subtract(obj, 1);
3892			foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
3893			iosize -= resid;
3894		}
3895		vm_object_pip_wakeupn(obj, 0);
3896		VM_OBJECT_WUNLOCK(obj);
3897		if (bogus && (bp->b_flags & B_UNMAPPED) == 0) {
3898			BUF_CHECK_MAPPED(bp);
3899			pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
3900			    bp->b_pages, bp->b_npages);
3901		}
3902	}
3903
3904	/*
3905	 * For asynchronous completions, release the buffer now. The brelse
3906	 * will do a wakeup there if necessary - so no need to do a wakeup
3907	 * here in the async case. The sync case always needs to do a wakeup.
3908	 */
3909
3910	if (bp->b_flags & B_ASYNC) {
3911		if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR))
3912			brelse(bp);
3913		else
3914			bqrelse(bp);
3915	} else
3916		bdone(bp);
3917}
3918
3919/*
3920 * This routine is called in lieu of iodone in the case of
3921 * incomplete I/O.  This keeps the busy status for pages
3922 * consistent.
3923 */
3924void
3925vfs_unbusy_pages(struct buf *bp)
3926{
3927	int i;
3928	vm_object_t obj;
3929	vm_page_t m;
3930
3931	runningbufwakeup(bp);
3932	if (!(bp->b_flags & B_VMIO))
3933		return;
3934
3935	obj = bp->b_bufobj->bo_object;
3936	VM_OBJECT_WLOCK(obj);
3937	for (i = 0; i < bp->b_npages; i++) {
3938		m = bp->b_pages[i];
3939		if (m == bogus_page) {
3940			m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
3941			if (!m)
3942				panic("vfs_unbusy_pages: page missing\n");
3943			bp->b_pages[i] = m;
3944			if ((bp->b_flags & B_UNMAPPED) == 0) {
3945				BUF_CHECK_MAPPED(bp);
3946				pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
3947				    bp->b_pages, bp->b_npages);
3948			} else
3949				BUF_CHECK_UNMAPPED(bp);
3950		}
3951		vm_object_pip_subtract(obj, 1);
3952		vm_page_sunbusy(m);
3953	}
3954	vm_object_pip_wakeupn(obj, 0);
3955	VM_OBJECT_WUNLOCK(obj);
3956}
3957
3958/*
3959 * vfs_page_set_valid:
3960 *
3961 *	Set the valid bits in a page based on the supplied offset.   The
3962 *	range is restricted to the buffer's size.
3963 *
3964 *	This routine is typically called after a read completes.
3965 */
3966static void
3967vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m)
3968{
3969	vm_ooffset_t eoff;
3970
3971	/*
3972	 * Compute the end offset, eoff, such that [off, eoff) does not span a
3973	 * page boundary and eoff is not greater than the end of the buffer.
3974	 * The end of the buffer, in this case, is our file EOF, not the
3975	 * allocation size of the buffer.
3976	 */
3977	eoff = (off + PAGE_SIZE) & ~(vm_ooffset_t)PAGE_MASK;
3978	if (eoff > bp->b_offset + bp->b_bcount)
3979		eoff = bp->b_offset + bp->b_bcount;
3980
3981	/*
3982	 * Set valid range.  This is typically the entire buffer and thus the
3983	 * entire page.
3984	 */
3985	if (eoff > off)
3986		vm_page_set_valid_range(m, off & PAGE_MASK, eoff - off);
3987}
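
/*
 * Worked example of the clipping above, assuming PAGE_SIZE is 4096 and
 * DEV_BSIZE is 512: for off = 0x1e00, eoff rounds up to the next page
 * boundary, 0x2000.  With b_offset = 0x1000 and b_bcount = 0x1800 the
 * buffer ends at 0x2800, so eoff is not clipped, and the call becomes
 * vm_page_set_valid_range(m, 0xe00, 0x200), marking the last 512-byte
 * block of that page valid.
 */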
3988
3989/*
3990 * vfs_page_set_validclean:
3991 *
3992 *	Set the valid bits and clear the dirty bits in a page based on the
3993 *	supplied offset.   The range is restricted to the buffer's size.
3994 */
3995static void
3996vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m)
3997{
3998	vm_ooffset_t soff, eoff;
3999
4000	/*
4001	 * Start and end offsets in buffer.  eoff - soff may not cross a
4002	 * page boundary or cross the end of the buffer.  The end of the
4003	 * buffer, in this case, is our file EOF, not the allocation size
4004	 * of the buffer.
4005	 */
4006	soff = off;
4007	eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK;
4008	if (eoff > bp->b_offset + bp->b_bcount)
4009		eoff = bp->b_offset + bp->b_bcount;
4010
4011	/*
4012	 * Set valid range.  This is typically the entire buffer and thus the
4013	 * entire page.
4014	 */
4015	if (eoff > soff) {
4016		vm_page_set_validclean(
4017		    m,
4018		   (vm_offset_t) (soff & PAGE_MASK),
4019		   (vm_offset_t) (eoff - soff)
4020		);
4021	}
4022}
4023
4024/*
4025 * Ensure that all buffer pages are not exclusive busied.  If any page is
4026 * exclusive busy, drain it.
4027 */
4028void
4029vfs_drain_busy_pages(struct buf *bp)
4030{
4031	vm_page_t m;
4032	int i, last_busied;
4033
4034	VM_OBJECT_ASSERT_WLOCKED(bp->b_bufobj->bo_object);
4035	last_busied = 0;
4036	for (i = 0; i < bp->b_npages; i++) {
4037		m = bp->b_pages[i];
4038		if (vm_page_xbusied(m)) {
4039			for (; last_busied < i; last_busied++)
4040				vm_page_sbusy(bp->b_pages[last_busied]);
4041			while (vm_page_xbusied(m)) {
4042				vm_page_lock(m);
4043				VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
4044				vm_page_busy_sleep(m, "vbpage");
4045				VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
4046			}
4047		}
4048	}
4049	for (i = 0; i < last_busied; i++)
4050		vm_page_sunbusy(bp->b_pages[i]);
4051}
4052
4053/*
4054 * This routine is called before a device strategy routine.
4055 * It is used to tell the VM system that paging I/O is in
4056 * progress, and treat the pages associated with the buffer
4057 * almost as being exclusive busy.  Also the object paging_in_progress
4058 * flag is handled to make sure that the object doesn't become
4059 * inconsistent.
4060 *
4061 * Since I/O has not been initiated yet, certain buffer flags
4062 * such as BIO_ERROR or B_INVAL may be in an inconsistent state
4063 * and should be ignored.
4064 */
4065void
4066vfs_busy_pages(struct buf *bp, int clear_modify)
4067{
4068	int i, bogus;
4069	vm_object_t obj;
4070	vm_ooffset_t foff;
4071	vm_page_t m;
4072
4073	if (!(bp->b_flags & B_VMIO))
4074		return;
4075
4076	obj = bp->b_bufobj->bo_object;
4077	foff = bp->b_offset;
4078	KASSERT(bp->b_offset != NOOFFSET,
4079	    ("vfs_busy_pages: no buffer offset"));
4080	VM_OBJECT_WLOCK(obj);
4081	vfs_drain_busy_pages(bp);
4082	if (bp->b_bufsize != 0)
4083		vfs_setdirty_locked_object(bp);
4084	bogus = 0;
4085	for (i = 0; i < bp->b_npages; i++) {
4086		m = bp->b_pages[i];
4087
4088		if ((bp->b_flags & B_CLUSTER) == 0) {
4089			vm_object_pip_add(obj, 1);
4090			vm_page_sbusy(m);
4091		}
4092		/*
4093		 * When readying a buffer for a read ( i.e.
4094		 * clear_modify == 0 ), it is important to do
4095		 * bogus_page replacement for valid pages in
4096		 * partially instantiated buffers.  Partially
4097		 * instantiated buffers can, in turn, occur when
4098		 * reconstituting a buffer from its VM backing store
4099		 * base.  We only have to do this if B_CACHE is
4100		 * clear ( which causes the I/O to occur in the
4101		 * first place ).  The replacement prevents the read
4102		 * I/O from overwriting potentially dirty VM-backed
4103		 * pages.  XXX bogus page replacement is, uh, bogus.
4104		 * It may not work properly with small-block devices.
4105		 * We need to find a better way.
4106		 */
4107		if (clear_modify) {
4108			pmap_remove_write(m);
4109			vfs_page_set_validclean(bp, foff, m);
4110		} else if (m->valid == VM_PAGE_BITS_ALL &&
4111		    (bp->b_flags & B_CACHE) == 0) {
4112			bp->b_pages[i] = bogus_page;
4113			bogus++;
4114		}
4115		foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
4116	}
4117	VM_OBJECT_WUNLOCK(obj);
4118	if (bogus && (bp->b_flags & B_UNMAPPED) == 0) {
4119		BUF_CHECK_MAPPED(bp);
4120		pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
4121		    bp->b_pages, bp->b_npages);
4122	}
4123}
4124
4125/*
4126 *	vfs_bio_set_valid:
4127 *
4128 *	Set the range within the buffer to valid.  The range is
4129 *	relative to the beginning of the buffer, b_offset.  Note that
4130 *	b_offset itself may be offset from the beginning of the first
4131 *	page.
4132 */
4133void
4134vfs_bio_set_valid(struct buf *bp, int base, int size)
4135{
4136	int i, n;
4137	vm_page_t m;
4138
4139	if (!(bp->b_flags & B_VMIO))
4140		return;
4141
4142	/*
4143	 * Fixup base to be relative to beginning of first page.
4144	 * Set initial n to be the maximum number of bytes in the
4145	 * first page that can be validated.
4146	 */
4147	base += (bp->b_offset & PAGE_MASK);
4148	n = PAGE_SIZE - (base & PAGE_MASK);
4149
4150	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
4151	for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
4152		m = bp->b_pages[i];
4153		if (n > size)
4154			n = size;
4155		vm_page_set_valid_range(m, base & PAGE_MASK, n);
4156		base += n;
4157		size -= n;
4158		n = PAGE_SIZE;
4159	}
4160	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
4161}
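
/*
 * A small usage sketch, assuming a filesystem that has just grown a
 * block in place from osize to nsize bytes and wants the new region to
 * read back as zeroes without touching the disk:
 *
 *	allocbuf(bp, nsize);
 *	vfs_bio_bzero_buf(bp, osize, nsize - osize);
 *	vfs_bio_set_valid(bp, osize, nsize - osize);
 *
 * Both calls take offsets relative to the start of the buffer.
 */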
4162
4163/*
4164 *	vfs_bio_clrbuf:
4165 *
4166 *	If the specified buffer is a non-VMIO buffer, clear the entire
4167 *	buffer.  If the specified buffer is a VMIO buffer, clear and
4168 *	validate only the previously invalid portions of the buffer.
4169 *	This routine essentially fakes an I/O, so we need to clear
4170 *	BIO_ERROR and B_INVAL.
4171 *
4172 *	Note that while we only theoretically need to clear through b_bcount,
4173 *	we go ahead and clear through b_bufsize.
4174 */
4175void
4176vfs_bio_clrbuf(struct buf *bp)
4177{
4178	int i, j, mask, sa, ea, slide;
4179
4180	if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) {
4181		clrbuf(bp);
4182		return;
4183	}
4184	bp->b_flags &= ~B_INVAL;
4185	bp->b_ioflags &= ~BIO_ERROR;
4186	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
4187	if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
4188	    (bp->b_offset & PAGE_MASK) == 0) {
4189		if (bp->b_pages[0] == bogus_page)
4190			goto unlock;
4191		mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
4192		VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[0]->object);
4193		if ((bp->b_pages[0]->valid & mask) == mask)
4194			goto unlock;
4195		if ((bp->b_pages[0]->valid & mask) == 0) {
4196			pmap_zero_page_area(bp->b_pages[0], 0, bp->b_bufsize);
4197			bp->b_pages[0]->valid |= mask;
4198			goto unlock;
4199		}
4200	}
4201	sa = bp->b_offset & PAGE_MASK;
4202	slide = 0;
4203	for (i = 0; i < bp->b_npages; i++, sa = 0) {
4204		slide = imin(slide + PAGE_SIZE, bp->b_offset + bp->b_bufsize);
4205		ea = slide & PAGE_MASK;
4206		if (ea == 0)
4207			ea = PAGE_SIZE;
4208		if (bp->b_pages[i] == bogus_page)
4209			continue;
4210		j = sa / DEV_BSIZE;
4211		mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
4212		VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[i]->object);
4213		if ((bp->b_pages[i]->valid & mask) == mask)
4214			continue;
4215		if ((bp->b_pages[i]->valid & mask) == 0)
4216			pmap_zero_page_area(bp->b_pages[i], sa, ea - sa);
4217		else {
4218			for (; sa < ea; sa += DEV_BSIZE, j++) {
4219				if ((bp->b_pages[i]->valid & (1 << j)) == 0) {
4220					pmap_zero_page_area(bp->b_pages[i],
4221					    sa, DEV_BSIZE);
4222				}
4223			}
4224		}
4225		bp->b_pages[i]->valid |= mask;
4226	}
4227unlock:
4228	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
4229	bp->b_resid = 0;
4230}
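
/*
 * Typical use, assuming a filesystem that has just allocated backing
 * store for a block and wants any not-yet-valid portion zeroed rather
 * than read from disk:
 *
 *	bp = getblk(vp, lblkno, bsize, 0, 0, 0);
 *	(point bp->b_blkno at the newly allocated disk block)
 *	vfs_bio_clrbuf(bp);
 *	bdwrite(bp);
 */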
4231
4232void
4233vfs_bio_bzero_buf(struct buf *bp, int base, int size)
4234{
4235	vm_page_t m;
4236	int i, n;
4237
4238	if ((bp->b_flags & B_UNMAPPED) == 0) {
4239		BUF_CHECK_MAPPED(bp);
4240		bzero(bp->b_data + base, size);
4241	} else {
4242		BUF_CHECK_UNMAPPED(bp);
4243		n = PAGE_SIZE - (base & PAGE_MASK);
4244		for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
4245			m = bp->b_pages[i];
4246			if (n > size)
4247				n = size;
4248			pmap_zero_page_area(m, base & PAGE_MASK, n);
4249			base += n;
4250			size -= n;
4251			n = PAGE_SIZE;
4252		}
4253	}
4254}
4255
4256/*
4257 * vm_hold_load_pages and vm_hold_free_pages get pages into
4258 * a buffer's address space.  The pages are anonymous and are
4259 * not associated with a file object.
4260 */
4261static void
4262vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to)
4263{
4264	vm_offset_t pg;
4265	vm_page_t p;
4266	int index;
4267
4268	BUF_CHECK_MAPPED(bp);
4269
4270	to = round_page(to);
4271	from = round_page(from);
4272	index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
4273
4274	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
4275tryagain:
4276		/*
4277		 * note: must allocate system pages since blocking here
4278		 * could interfere with paging I/O, no matter which
4279		 * process we are.
4280		 */
4281		p = vm_page_alloc(NULL, 0, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ |
4282		    VM_ALLOC_WIRED | VM_ALLOC_COUNT((to - pg) >> PAGE_SHIFT));
4283		if (p == NULL) {
4284			VM_WAIT;
4285			goto tryagain;
4286		}
4287		pmap_qenter(pg, &p, 1);
4288		bp->b_pages[index] = p;
4289	}
4290	bp->b_npages = index;
4291}
4292
4293/* Return pages associated with this buf to the vm system */
4294static void
4295vm_hold_free_pages(struct buf *bp, int newbsize)
4296{
4297	vm_offset_t from;
4298	vm_page_t p;
4299	int index, newnpages;
4300
4301	BUF_CHECK_MAPPED(bp);
4302
4303	from = round_page((vm_offset_t)bp->b_data + newbsize);
4304	newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
4305	if (bp->b_npages > newnpages)
4306		pmap_qremove(from, bp->b_npages - newnpages);
4307	for (index = newnpages; index < bp->b_npages; index++) {
4308		p = bp->b_pages[index];
4309		bp->b_pages[index] = NULL;
4310		if (vm_page_sbusied(p))
4311			printf("vm_hold_free_pages: blkno: %jd, lblkno: %jd\n",
4312			    (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno);
4313		p->wire_count--;
4314		vm_page_free(p);
4315		atomic_subtract_int(&cnt.v_wire_count, 1);
4316	}
4317	bp->b_npages = newnpages;
4318}
4319
4320/*
4321 * Map an IO request into kernel virtual address space.
4322 *
4323 * All requests are (re)mapped into kernel VA space.
4324 * Notice that we use b_bufsize for the size of the buffer
4325 * to be mapped.  b_bcount might be modified by the driver.
4326 *
4327 * Note that even if the caller determines that the address space should
4328 * be valid, a race or a smaller file mapped into a larger space may
4329 * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST
4330 * check the return value.
4331 */
4332int
4333vmapbuf(struct buf *bp, int mapbuf)
4334{
4335	caddr_t kva;
4336	vm_prot_t prot;
4337	int pidx;
4338
4339	if (bp->b_bufsize < 0)
4340		return (-1);
4341	prot = VM_PROT_READ;
4342	if (bp->b_iocmd == BIO_READ)
4343		prot |= VM_PROT_WRITE;	/* Less backwards than it looks */
4344	if ((pidx = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
4345	    (vm_offset_t)bp->b_data, bp->b_bufsize, prot, bp->b_pages,
4346	    btoc(MAXPHYS))) < 0)
4347		return (-1);
4348	bp->b_npages = pidx;
4349	if (mapbuf || !unmapped_buf_allowed) {
4350		pmap_qenter((vm_offset_t)bp->b_saveaddr, bp->b_pages, pidx);
4351		kva = bp->b_saveaddr;
4352		bp->b_saveaddr = bp->b_data;
4353		bp->b_data = kva + (((vm_offset_t)bp->b_data) & PAGE_MASK);
4354		bp->b_flags &= ~B_UNMAPPED;
4355	} else {
4356		bp->b_flags |= B_UNMAPPED;
4357		bp->b_offset = ((vm_offset_t)bp->b_data) & PAGE_MASK;
4358		bp->b_saveaddr = bp->b_data;
4359		bp->b_data = unmapped_buf;
4360	}
4361	return (0);
4362}
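
/*
 * A condensed sketch of a raw-I/O path in the style of physio(): the
 * buffer setup is abbreviated and the field values are assumptions, but
 * the mandatory return value check is shown:
 *
 *	bp->b_iocmd = BIO_READ;
 *	bp->b_data = iov->iov_base;		(user address)
 *	bp->b_bufsize = bp->b_bcount = iov->iov_len;
 *	bp->b_iooffset = uio->uio_offset;
 *	if (vmapbuf(bp, 1) < 0)
 *		return (EFAULT);
 *	dev_strategy(dev, bp);
 *	error = bufwait(bp);
 *	vunmapbuf(bp);
 */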
4363
4364/*
4365 * Free the I/O map PTEs associated with this I/O operation.
4366 * We also invalidate the TLB entries and restore the original b_data.
4367 */
4368void
4369vunmapbuf(struct buf *bp)
4370{
4371	int npages;
4372
4373	npages = bp->b_npages;
4374	if (bp->b_flags & B_UNMAPPED)
4375		bp->b_flags &= ~B_UNMAPPED;
4376	else
4377		pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages);
4378	vm_page_unhold_pages(bp->b_pages, npages);
4379
4380	bp->b_data = bp->b_saveaddr;
4381}
4382
4383void
4384bdone(struct buf *bp)
4385{
4386	struct mtx *mtxp;
4387
4388	mtxp = mtx_pool_find(mtxpool_sleep, bp);
4389	mtx_lock(mtxp);
4390	bp->b_flags |= B_DONE;
4391	wakeup(bp);
4392	mtx_unlock(mtxp);
4393}
4394
4395void
4396bwait(struct buf *bp, u_char pri, const char *wchan)
4397{
4398	struct mtx *mtxp;
4399
4400	mtxp = mtx_pool_find(mtxpool_sleep, bp);
4401	mtx_lock(mtxp);
4402	while ((bp->b_flags & B_DONE) == 0)
4403		msleep(bp, mtxp, pri, wchan, 0);
4404	mtx_unlock(mtxp);
4405}
4406
4407int
4408bufsync(struct bufobj *bo, int waitfor)
4409{
4410
4411	return (VOP_FSYNC(bo->__bo_vnode, waitfor, curthread));
4412}
4413
4414void
4415bufstrategy(struct bufobj *bo, struct buf *bp)
4416{
4417	int i = 0;
4418	struct vnode *vp;
4419
4420	vp = bp->b_vp;
4421	KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy"));
4422	KASSERT(vp->v_type != VCHR && vp->v_type != VBLK,
4423	    ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp));
4424	i = VOP_STRATEGY(vp, bp);
4425	KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp));
4426}
4427
4428void
4429bufobj_wrefl(struct bufobj *bo)
4430{
4431
4432	KASSERT(bo != NULL, ("NULL bo in bufobj_wref"));
4433	ASSERT_BO_WLOCKED(bo);
4434	bo->bo_numoutput++;
4435}
4436
4437void
4438bufobj_wref(struct bufobj *bo)
4439{
4440
4441	KASSERT(bo != NULL, ("NULL bo in bufobj_wref"));
4442	BO_LOCK(bo);
4443	bo->bo_numoutput++;
4444	BO_UNLOCK(bo);
4445}
4446
4447void
4448bufobj_wdrop(struct bufobj *bo)
4449{
4450
4451	KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop"));
4452	BO_LOCK(bo);
4453	KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count"));
4454	if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) {
4455		bo->bo_flag &= ~BO_WWAIT;
4456		wakeup(&bo->bo_numoutput);
4457	}
4458	BO_UNLOCK(bo);
4459}
4460
4461int
4462bufobj_wwait(struct bufobj *bo, int slpflag, int timeo)
4463{
4464	int error;
4465
4466	KASSERT(bo != NULL, ("NULL bo in bufobj_wwait"));
4467	ASSERT_BO_WLOCKED(bo);
4468	error = 0;
4469	while (bo->bo_numoutput) {
4470		bo->bo_flag |= BO_WWAIT;
4471		error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo),
4472		    slpflag | (PRIBIO + 1), "bo_wwait", timeo);
4473		if (error)
4474			break;
4475	}
4476	return (error);
4477}
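
/*
 * A minimal sketch of draining the writes counted by bufobj_wref() and
 * bufobj_wdrop(), as a vnode-syncing path might do; handling of an
 * interrupted sleep is omitted:
 *
 *	BO_LOCK(bo);
 *	error = bufobj_wwait(bo, 0, 0);
 *	BO_UNLOCK(bo);
 */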
4478
4479void
4480bpin(struct buf *bp)
4481{
4482	struct mtx *mtxp;
4483
4484	mtxp = mtx_pool_find(mtxpool_sleep, bp);
4485	mtx_lock(mtxp);
4486	bp->b_pin_count++;
4487	mtx_unlock(mtxp);
4488}
4489
4490void
4491bunpin(struct buf *bp)
4492{
4493	struct mtx *mtxp;
4494
4495	mtxp = mtx_pool_find(mtxpool_sleep, bp);
4496	mtx_lock(mtxp);
4497	if (--bp->b_pin_count == 0)
4498		wakeup(bp);
4499	mtx_unlock(mtxp);
4500}
4501
4502void
4503bunpin_wait(struct buf *bp)
4504{
4505	struct mtx *mtxp;
4506
4507	mtxp = mtx_pool_find(mtxpool_sleep, bp);
4508	mtx_lock(mtxp);
4509	while (bp->b_pin_count > 0)
4510		msleep(bp, mtxp, PRIBIO, "bwunpin", 0);
4511	mtx_unlock(mtxp);
4512}
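
/*
 * Sketch of the pin protocol: a producer pins a delayed-write buffer
 * while its contents are handed to some asynchronous consumer, and a
 * later writer uses bunpin_wait(), as getblk() does above for pinned
 * B_DELWRI buffers.  The work between the calls is a placeholder:
 *
 *	bpin(bp);
 *	(queue bp->b_data to the consumer, drop the buffer lock)
 *	...
 *	bunpin(bp);
 */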
4513
4514/*
4515 * Set bio_data or bio_ma for struct bio from the struct buf.
4516 */
4517void
4518bdata2bio(struct buf *bp, struct bio *bip)
4519{
4520
4521	if ((bp->b_flags & B_UNMAPPED) != 0) {
4522		KASSERT(unmapped_buf_allowed, ("unmapped"));
4523		bip->bio_ma = bp->b_pages;
4524		bip->bio_ma_n = bp->b_npages;
4525		bip->bio_data = unmapped_buf;
4526		bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK;
4527		bip->bio_flags |= BIO_UNMAPPED;
4528		KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) /
4529		    PAGE_SIZE == bp->b_npages,
4530		    ("Buffer %p too short: %d %lld %d", bp, bip->bio_ma_offset,
4531		    (long long)bip->bio_length, bip->bio_ma_n));
4532	} else {
4533		bip->bio_data = bp->b_data;
4534		bip->bio_ma = NULL;
4535	}
4536}
4537
4538#include "opt_ddb.h"
4539#ifdef DDB
4540#include <ddb/ddb.h>
4541
4542/* DDB command to show buffer data */
4543DB_SHOW_COMMAND(buffer, db_show_buffer)
4544{
4545	/* get args */
4546	struct buf *bp = (struct buf *)addr;
4547
4548	if (!have_addr) {
4549		db_printf("usage: show buffer <addr>\n");
4550		return;
4551	}
4552
4553	db_printf("buf at %p\n", bp);
4554	db_printf("b_flags = 0x%b, b_xflags=0x%b, b_vflags=0x%b\n",
4555	    (u_int)bp->b_flags, PRINT_BUF_FLAGS, (u_int)bp->b_xflags,
4556	    PRINT_BUF_XFLAGS, (u_int)bp->b_vflags, PRINT_BUF_VFLAGS);
4557	db_printf(
4558	    "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n"
4559	    "b_bufobj = (%p), b_data = %p, b_blkno = %jd, b_lblkno = %jd, "
4560	    "b_dep = %p\n",
4561	    bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
4562	    bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno,
4563	    (intmax_t)bp->b_lblkno, bp->b_dep.lh_first);
4564	if (bp->b_npages) {
4565		int i;
4566		db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
4567		for (i = 0; i < bp->b_npages; i++) {
4568			vm_page_t m;
4569			m = bp->b_pages[i];
4570			db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object,
4571			    (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m));
4572			if ((i + 1) < bp->b_npages)
4573				db_printf(",");
4574		}
4575		db_printf("\n");
4576	}
4577	db_printf(" ");
4578	BUF_LOCKPRINTINFO(bp);
4579}
4580
4581DB_SHOW_COMMAND(lockedbufs, lockedbufs)
4582{
4583	struct buf *bp;
4584	int i;
4585
4586	for (i = 0; i < nbuf; i++) {
4587		bp = &buf[i];
4588		if (BUF_ISLOCKED(bp)) {
4589			db_show_buffer((uintptr_t)bp, 1, 0, NULL);
4590			db_printf("\n");
4591		}
4592	}
4593}
4594
4595DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs)
4596{
4597	struct vnode *vp;
4598	struct buf *bp;
4599
4600	if (!have_addr) {
4601		db_printf("usage: show vnodebufs <addr>\n");
4602		return;
4603	}
4604	vp = (struct vnode *)addr;
4605	db_printf("Clean buffers:\n");
4606	TAILQ_FOREACH(bp, &vp->v_bufobj.bo_clean.bv_hd, b_bobufs) {
4607		db_show_buffer((uintptr_t)bp, 1, 0, NULL);
4608		db_printf("\n");
4609	}
4610	db_printf("Dirty buffers:\n");
4611	TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) {
4612		db_show_buffer((uintptr_t)bp, 1, 0, NULL);
4613		db_printf("\n");
4614	}
4615}
4616
4617DB_COMMAND(countfreebufs, db_countfreebufs)
4618{
4619	struct buf *bp;
4620	int i, used = 0, nfree = 0;
4621
4622	if (have_addr) {
4623		db_printf("usage: countfreebufs\n");
4624		return;
4625	}
4626
4627	for (i = 0; i < nbuf; i++) {
4628		bp = &buf[i];
4629		if ((bp->b_flags & B_INFREECNT) != 0)
4630			nfree++;
4631		else
4632			used++;
4633	}
4634
4635	db_printf("Counted %d free, %d used (%d tot)\n", nfree, used,
4636	    nfree + used);
4637	db_printf("numfreebuffers is %d\n", numfreebuffers);
4638}
4639#endif /* DDB */
4640