vfs_bio.c revision 67365
1/*
2 * Copyright (c) 1994,1997 John S. Dyson
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice immediately at the beginning of the file, without modification,
10 *    this list of conditions, and the following disclaimer.
11 * 2. Absolutely no warranty of function or purpose is made by the author
12 *		John S. Dyson.
13 *
14 * $FreeBSD: head/sys/kern/vfs_bio.c 67365 2000-10-20 07:58:15Z jhb $
15 */
16
17/*
18 * This file contains a new buffer I/O scheme implementing a coherent
19 * VM object and buffer cache scheme.  Pains have been taken to make
20 * sure that the performance degradation associated with schemes such
21 * as this is not realized.
22 *
23 * Author:  John S. Dyson
24 * Significant help during the development and debugging phases
25 * was provided by David Greenman, also of the FreeBSD core team.
26 *
27 * see man buf(9) for more info.
28 */
29
30#include <sys/param.h>
31#include <sys/systm.h>
32#include <sys/bio.h>
33#include <sys/buf.h>
34#include <sys/eventhandler.h>
35#include <sys/lock.h>
36#include <sys/malloc.h>
37#include <sys/mount.h>
38#include <sys/mutex.h>
39#include <sys/kernel.h>
40#include <sys/kthread.h>
41#include <sys/ktr.h>
42#include <sys/proc.h>
43#include <sys/reboot.h>
44#include <sys/resourcevar.h>
45#include <sys/sysctl.h>
46#include <sys/vmmeter.h>
47#include <sys/vnode.h>
48#include <vm/vm.h>
49#include <vm/vm_param.h>
50#include <vm/vm_kern.h>
51#include <vm/vm_pageout.h>
52#include <vm/vm_page.h>
53#include <vm/vm_object.h>
54#include <vm/vm_extern.h>
55#include <vm/vm_map.h>
56
57static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer");
58
59struct	bio_ops bioops;		/* I/O operation notification */
60
61struct buf *buf;		/* buffer header pool */
62struct swqueue bswlist;
63struct mtx buftimelock;		/* Interlock on setting prio and timo */
64
65static void vm_hold_free_pages(struct buf * bp, vm_offset_t from,
66		vm_offset_t to);
67static void vm_hold_load_pages(struct buf * bp, vm_offset_t from,
68		vm_offset_t to);
69static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off,
70			       int pageno, vm_page_t m);
71static void vfs_clean_pages(struct buf * bp);
72static void vfs_setdirty(struct buf *bp);
73static void vfs_vmio_release(struct buf *bp);
74static void vfs_backgroundwritedone(struct buf *bp);
75static int flushbufqueues(void);
76
77static int bd_request;
78
79static void buf_daemon __P((void));
80/*
81 * bogus page -- for I/O to/from partially complete buffers
82 * this is a temporary solution to the problem, but it is not
83 * really that bad.  it would be better to split the buffer
84 * for input in the case of buffers partially already in memory,
85 * but the code is intricate enough already.
86 */
87vm_page_t bogus_page;
88int runningbufspace;
89int vmiodirenable = FALSE;
90static vm_offset_t bogus_offset;
91
92static int bufspace, maxbufspace,
93	bufmallocspace, maxbufmallocspace, lobufspace, hibufspace;
94static int bufreusecnt, bufdefragcnt, buffreekvacnt;
95static int maxbdrun;
96static int needsbuffer;
97static int numdirtybuffers, hidirtybuffers;
98static int numfreebuffers, lofreebuffers, hifreebuffers;
99static int getnewbufcalls;
100static int getnewbufrestarts;
101
102SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD,
103	&numdirtybuffers, 0, "");
104SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW,
105	&hidirtybuffers, 0, "");
106SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD,
107	&numfreebuffers, 0, "");
108SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW,
109	&lofreebuffers, 0, "");
110SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW,
111	&hifreebuffers, 0, "");
112SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD,
113	&runningbufspace, 0, "");
114SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD,
115	&maxbufspace, 0, "");
116SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD,
117	&hibufspace, 0, "");
118SYSCTL_INT(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD,
119	&lobufspace, 0, "");
120SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD,
121	&bufspace, 0, "");
122SYSCTL_INT(_vfs, OID_AUTO, maxbdrun, CTLFLAG_RW,
123	&maxbdrun, 0, "");
124SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW,
125	&maxbufmallocspace, 0, "");
126SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD,
127	&bufmallocspace, 0, "");
128SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW,
129	&getnewbufcalls, 0, "");
130SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW,
131	&getnewbufrestarts, 0, "");
132SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW,
133	&vmiodirenable, 0, "");
134SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW,
135	&bufdefragcnt, 0, "");
136SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW,
137	&buffreekvacnt, 0, "");
138SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW,
139	&bufreusecnt, 0, "");
140
141static int bufhashmask;
142static LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
143struct bqueues bufqueues[BUFFER_QUEUES] = { { 0 } };
144char *buf_wmesg = BUF_WMESG;
145
146extern int vm_swap_size;
147
148#define VFS_BIO_NEED_ANY	0x01	/* any freeable buffer */
149#define VFS_BIO_NEED_DIRTYFLUSH	0x02	/* waiting for dirty buffer flush */
150#define VFS_BIO_NEED_FREE	0x04	/* wait for free bufs, hi hysteresis */
151#define VFS_BIO_NEED_BUFSPACE	0x08	/* wait for buf space, lo hysteresis */
152
153/*
154 * Buffer hash table code.  Note that consecutive logical blocks hash to
155 * adjacent buckets, which gives us some L1 cache locality.
156 */
157
158static __inline
159struct bufhashhdr *
160bufhash(struct vnode *vnp, daddr_t bn)
161{
162	return(&bufhashtbl[(((uintptr_t)(vnp) >> 7) + (int)bn) & bufhashmask]);
163}
164
165/*
166 *	numdirtywakeup:
167 *
168 *	If someone is blocked due to there being too many dirty buffers,
169 *	and numdirtybuffers is now reasonable, wake them up.
170 */
171
172static __inline void
173numdirtywakeup(void)
174{
175	if (numdirtybuffers < hidirtybuffers) {
176		if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) {
177			needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH;
178			wakeup(&needsbuffer);
179		}
180	}
181}
182
183/*
184 *	bufspacewakeup:
185 *
186 *	Called when buffer space is potentially available for recovery.
187 *	getnewbuf() will block on this flag when it is unable to free
188 *	sufficient buffer space.  Buffer space becomes recoverable when
189 *	bp's get placed back in the queues.
190 */
191
192static __inline void
193bufspacewakeup(void)
194{
195	/*
196	 * If someone is waiting for BUF space, wake them up.  Even
197	 * though we haven't freed the kva space yet, the waiting
198	 * process will be able to now.
199	 */
200	if (needsbuffer & VFS_BIO_NEED_BUFSPACE) {
201		needsbuffer &= ~VFS_BIO_NEED_BUFSPACE;
202		wakeup(&needsbuffer);
203	}
204}
205
206/*
207 *	bufcountwakeup:
208 *
209 *	Called when a buffer has been added to one of the free queues to
210 *	account for the buffer and to wakeup anyone waiting for free buffers.
211 *	This typically occurs when large amounts of metadata are being handled
212 *	by the buffer cache ( else buffer space runs out first, usually ).
213 */
214
215static __inline void
216bufcountwakeup(void)
217{
218	++numfreebuffers;
219	if (needsbuffer) {
220		needsbuffer &= ~VFS_BIO_NEED_ANY;
221		if (numfreebuffers >= hifreebuffers)
222			needsbuffer &= ~VFS_BIO_NEED_FREE;
223		wakeup(&needsbuffer);
224	}
225}
226
227/*
228 *	vfs_buf_test_cache:
229 *
230 *	Called when a buffer is extended.  This function clears the B_CACHE
231 *	bit if the newly extended portion of the buffer does not contain
232 *	valid data.
233 */
234static __inline__
235void
236vfs_buf_test_cache(struct buf *bp,
237		  vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
238		  vm_page_t m)
239{
240	if (bp->b_flags & B_CACHE) {
241		int base = (foff + off) & PAGE_MASK;
242		if (vm_page_is_valid(m, base, size) == 0)
243			bp->b_flags &= ~B_CACHE;
244	}
245}
246
247static __inline__
248void
249bd_wakeup(int dirtybuflevel)
250{
251	if (numdirtybuffers >= dirtybuflevel && bd_request == 0) {
252		bd_request = 1;
253		wakeup(&bd_request);
254	}
255}
256
257/*
258 * bd_speedup - speedup the buffer cache flushing code
259 */
260
261static __inline__
262void
263bd_speedup(void)
264{
265	bd_wakeup(1);
266}
267
268/*
269 * Initialize buffer headers and related structures.
270 */
271
272caddr_t
273bufhashinit(caddr_t vaddr)
274{
275	/* first, make a null hash table */
276	for (bufhashmask = 8; bufhashmask < nbuf / 4; bufhashmask <<= 1)
277		;
278	bufhashtbl = (void *)vaddr;
279	vaddr = vaddr + sizeof(*bufhashtbl) * bufhashmask;
280	--bufhashmask;
281	return(vaddr);
282}
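/*
 * Worked example of the sizing loop above (illustrative numbers only):
 * assuming nbuf = 4096, bufhashmask doubles from 8 until it reaches
 * nbuf / 4:
 *
 *	8 -> 16 -> 32 -> 64 -> 128 -> 256 -> 512 -> 1024
 *
 * so 1024 list heads are carved out of the supplied KVA and the final
 * decrement leaves bufhashmask = 1023, keeping the table a power of two
 * so bufhash() can mask instead of divide.
 */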
283
284void
285bufinit(void)
286{
287	struct buf *bp;
288	int i;
289
290	TAILQ_INIT(&bswlist);
291	LIST_INIT(&invalhash);
292	mtx_init(&buftimelock, "buftime lock", MTX_DEF);
293
294	for (i = 0; i <= bufhashmask; i++)
295		LIST_INIT(&bufhashtbl[i]);
296
297	/* next, make a null set of free lists */
298	for (i = 0; i < BUFFER_QUEUES; i++)
299		TAILQ_INIT(&bufqueues[i]);
300
301	/* finally, initialize each buffer header and stick on empty q */
302	for (i = 0; i < nbuf; i++) {
303		bp = &buf[i];
304		bzero(bp, sizeof *bp);
305		bp->b_flags = B_INVAL;	/* we're just an empty header */
306		bp->b_dev = NODEV;
307		bp->b_rcred = NOCRED;
308		bp->b_wcred = NOCRED;
309		bp->b_qindex = QUEUE_EMPTY;
310		bp->b_xflags = 0;
311		LIST_INIT(&bp->b_dep);
312		BUF_LOCKINIT(bp);
313		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
314		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
315	}
316
317	/*
318	 * maxbufspace is the absolute maximum amount of buffer space we are
319	 * allowed to reserve in KVM and in real terms.  The absolute maximum
320	 * is nominally used by buf_daemon.  hibufspace is the nominal maximum
321	 * used by most other processes.  The differential is required to
322	 * ensure that buf_daemon is able to run when other processes might
323	 * be blocked waiting for buffer space.
324	 *
325 * maxbufspace is based on BKVASIZE.  Allocating buffers larger than
326	 * this may result in KVM fragmentation which is not handled optimally
327	 * by the system.
328	 */
329	maxbufspace = nbuf * BKVASIZE;
330	hibufspace = imax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10);
331	lobufspace = hibufspace - MAXBSIZE;
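/*
 * Worked example of the sizing above (illustrative, assuming nbuf = 4096,
 * BKVASIZE = 8K and MAXBSIZE = 64K):
 *
 *	maxbufspace = 4096 * 8K            = 32M
 *	hibufspace  = max(24M, 32M - 640K) = 32M - 640K
 *	lobufspace  = hibufspace - 64K
 *
 * i.e. hibufspace trails maxbufspace by ten maximal-sized buffers, which
 * is the headroom reserved for buf_daemon.
 */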
332
333/*
334 * Limit the amount of malloc memory since it is wired permanently into
335 * the kernel space.  Even though this is accounted for in the buffer
336 * allocation, we don't want the malloced region to grow uncontrolled.
337 * The malloc scheme improves memory utilization significantly on average
338 * (small) directories.
339 */
340	maxbufmallocspace = hibufspace / 20;
341
342/*
343 * Reduce the chance of a deadlock occurring by limiting the number
344 * of delayed-write dirty buffers we allow to stack up.
345 */
346	hidirtybuffers = nbuf / 4 + 20;
347	numdirtybuffers = 0;
348/*
349 * To support extreme low-memory systems, make sure hidirtybuffers cannot
350 * eat up all available buffer space.  This occurs when our minimum cannot
351 * be met.  We try to size hidirtybuffers to 3/4 our buffer space assuming
352 * BKVASIZE'd (8K) buffers.
353 */
354	while (hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
355		hidirtybuffers >>= 1;
356	}
357
358/*
359 * Try to keep the number of free buffers in the specified range,
360 * and give special processes (e.g. buf_daemon) access to an
361 * emergency reserve.
362 */
363	lofreebuffers = nbuf / 18 + 5;
364	hifreebuffers = 2 * lofreebuffers;
365	numfreebuffers = nbuf;
366
367/*
368 * Maximum number of async ops initiated per buf_daemon loop.  This is
369 * somewhat of a hack at the moment, we really need to limit ourselves
370 * based on the number of bytes of I/O in-transit that were initiated
371 * from buf_daemon.
372 */
373	if ((maxbdrun = nswbuf / 4) < 4)
374		maxbdrun = 4;
375
376	bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
377	bogus_page = vm_page_alloc(kernel_object,
378			((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
379			VM_ALLOC_NORMAL);
380	cnt.v_wire_count++;
381
382}
383
384/*
385 * bfreekva() - free the kva allocation for a buffer.
386 *
387 *	Must be called at splbio() or higher as this is the only locking for
388 *	buffer_map.
389 *
390 *	Since this call frees up buffer space, we call bufspacewakeup().
391 */
392static void
393bfreekva(struct buf * bp)
394{
395	if (bp->b_kvasize) {
396		++buffreekvacnt;
397		bufspace -= bp->b_kvasize;
398		vm_map_delete(buffer_map,
399		    (vm_offset_t) bp->b_kvabase,
400		    (vm_offset_t) bp->b_kvabase + bp->b_kvasize
401		);
402		bp->b_kvasize = 0;
403		bufspacewakeup();
404	}
405}
406
407/*
408 *	bremfree:
409 *
410 *	Remove the buffer from the appropriate free list.
411 */
412void
413bremfree(struct buf * bp)
414{
415	int s = splbio();
416	int old_qindex = bp->b_qindex;
417
418	if (bp->b_qindex != QUEUE_NONE) {
419		KASSERT(BUF_REFCNT(bp) == 1, ("bremfree: bp %p not locked",bp));
420		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
421		bp->b_qindex = QUEUE_NONE;
422		runningbufspace += bp->b_bufsize;
423	} else {
424		if (BUF_REFCNT(bp) <= 1)
425			panic("bremfree: removing a buffer not on a queue");
426	}
427
428	/*
429	 * Fixup numfreebuffers count.  If the buffer is invalid or not
430	 * delayed-write, and it was on the DIRTY, CLEAN, EMPTY, or EMPTYKVA
431	 * queues, the buffer was free and we must decrement numfreebuffers.
432	 */
433	if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
434		switch(old_qindex) {
435		case QUEUE_DIRTY:
436		case QUEUE_CLEAN:
437		case QUEUE_EMPTY:
438		case QUEUE_EMPTYKVA:
439			--numfreebuffers;
440			break;
441		default:
442			break;
443		}
444	}
445	splx(s);
446}
447
448
449/*
450 * Get a buffer with the specified data.  Look in the cache first.  We
451 * must clear BIO_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE
452 * is set, the buffer is valid and we do not have to do anything ( see
453 * getblk() ).
454 */
455int
456bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
457    struct buf ** bpp)
458{
459	struct buf *bp;
460
461	bp = getblk(vp, blkno, size, 0, 0);
462	*bpp = bp;
463
464	/* if not found in cache, do some I/O */
465	if ((bp->b_flags & B_CACHE) == 0) {
466		if (curproc != idleproc)
467			curproc->p_stats->p_ru.ru_inblock++;
468		KASSERT(!(bp->b_flags & B_ASYNC), ("bread: illegal async bp %p", bp));
469		bp->b_iocmd = BIO_READ;
470		bp->b_flags &= ~B_INVAL;
471		bp->b_ioflags &= ~BIO_ERROR;
472		if (bp->b_rcred == NOCRED) {
473			if (cred != NOCRED)
474				crhold(cred);
475			bp->b_rcred = cred;
476		}
477		vfs_busy_pages(bp, 0);
478		VOP_STRATEGY(vp, bp);
479		return (bufwait(bp));
480	}
481	return (0);
482}
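/*
 * Usage sketch (hypothetical caller, not part of this file): a filesystem
 * read path typically pulls a logical block in through bread() and gives
 * it back with brelse()/bqrelse().  The names vp, lbn and bsize below are
 * placeholders.
 *
 *	struct buf *bp;
 *	int error;
 *
 *	if ((error = bread(vp, lbn, bsize, NOCRED, &bp)) != 0) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	... copy data out of bp->b_data ...
 *	bqrelse(bp);	release, but keep it cached for reuse
 */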
483
484/*
485 * Operates like bread, but also starts asynchronous I/O on
486 * read-ahead blocks.  We must clear BIO_ERROR and B_INVAL prior
487 * to initiating I/O.  If B_CACHE is set, the buffer is valid
488 * and we do not have to do anything.
489 */
490int
491breadn(struct vnode * vp, daddr_t blkno, int size,
492    daddr_t * rablkno, int *rabsize,
493    int cnt, struct ucred * cred, struct buf ** bpp)
494{
495	struct buf *bp, *rabp;
496	int i;
497	int rv = 0, readwait = 0;
498
499	*bpp = bp = getblk(vp, blkno, size, 0, 0);
500
501	/* if not found in cache, do some I/O */
502	if ((bp->b_flags & B_CACHE) == 0) {
503		if (curproc != idleproc)
504			curproc->p_stats->p_ru.ru_inblock++;
505		bp->b_iocmd = BIO_READ;
506		bp->b_flags &= ~B_INVAL;
507		bp->b_ioflags &= ~BIO_ERROR;
508		if (bp->b_rcred == NOCRED) {
509			if (cred != NOCRED)
510				crhold(cred);
511			bp->b_rcred = cred;
512		}
513		vfs_busy_pages(bp, 0);
514		VOP_STRATEGY(vp, bp);
515		++readwait;
516	}
517
518	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
519		if (inmem(vp, *rablkno))
520			continue;
521		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
522
523		if ((rabp->b_flags & B_CACHE) == 0) {
524			if (curproc != idleproc)
525				curproc->p_stats->p_ru.ru_inblock++;
526			rabp->b_flags |= B_ASYNC;
527			rabp->b_flags &= ~B_INVAL;
528			rabp->b_ioflags &= ~BIO_ERROR;
529			rabp->b_iocmd = BIO_READ;
530			if (rabp->b_rcred == NOCRED) {
531				if (cred != NOCRED)
532					crhold(cred);
533				rabp->b_rcred = cred;
534			}
535			vfs_busy_pages(rabp, 0);
536			BUF_KERNPROC(rabp);
537			VOP_STRATEGY(vp, rabp);
538		} else {
539			brelse(rabp);
540		}
541	}
542
543	if (readwait) {
544		rv = bufwait(bp);
545	}
546	return (rv);
547}
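/*
 * Usage sketch (hypothetical): a sequential reader can prime the next
 * block while only waiting for the current one.  rablkno/rabsize are
 * parallel arrays of length cnt; here a single read-ahead block is used
 * and the names are placeholders.
 *
 *	daddr_t rablk = lbn + 1;
 *	int rasize = bsize;
 *
 *	error = breadn(vp, lbn, bsize, &rablk, &rasize, 1, NOCRED, &bp);
 */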
548
549/*
550 * Write, release buffer on completion.  (Done by iodone
551 * if async).  Do not bother writing anything if the buffer
552 * is invalid.
553 *
554 * Note that we set B_CACHE here, indicating that buffer is
555 * fully valid and thus cacheable.  This is true even of NFS
556 * now so we set it generally.  This could be set either here
557 * or in biodone() since the I/O is synchronous.  We put it
558 * here.
559 */
560int
561bwrite(struct buf * bp)
562{
563	int oldflags, s;
564	struct buf *newbp;
565
566	if (bp->b_flags & B_INVAL) {
567		brelse(bp);
568		return (0);
569	}
570
571	oldflags = bp->b_flags;
572
573	if (BUF_REFCNT(bp) == 0)
574		panic("bwrite: buffer is not busy???");
575	s = splbio();
576	/*
577	 * If a background write is already in progress, delay
578	 * writing this block if it is asynchronous. Otherwise
579	 * wait for the background write to complete.
580	 */
581	if (bp->b_xflags & BX_BKGRDINPROG) {
582		if (bp->b_flags & B_ASYNC) {
583			splx(s);
584			bdwrite(bp);
585			return (0);
586		}
587		bp->b_xflags |= BX_BKGRDWAIT;
588		tsleep(&bp->b_xflags, PRIBIO, "biord", 0);
589		if (bp->b_xflags & BX_BKGRDINPROG)
590			panic("bwrite: still writing");
591	}
592
593	/* Mark the buffer clean */
594	bundirty(bp);
595
596	/*
597	 * If this buffer is marked for background writing and we
598	 * do not have to wait for it, make a copy and write the
599	 * copy so as to leave this buffer ready for further use.
600	 */
601	if ((bp->b_xflags & BX_BKGRDWRITE) && (bp->b_flags & B_ASYNC)) {
602		if (bp->b_iodone != NULL) {
603			printf("bp->b_iodone = %p\n", bp->b_iodone);
604			panic("bwrite: need chained iodone");
605		}
606
607		/* get a new block */
608		newbp = geteblk(bp->b_bufsize);
609
610		/* set it to be identical to the old block */
611		memcpy(newbp->b_data, bp->b_data, bp->b_bufsize);
612		bgetvp(bp->b_vp, newbp);
613		newbp->b_lblkno = bp->b_lblkno;
614		newbp->b_blkno = bp->b_blkno;
615		newbp->b_offset = bp->b_offset;
616		newbp->b_iodone = vfs_backgroundwritedone;
617		newbp->b_flags |= B_ASYNC;
618		newbp->b_flags &= ~B_INVAL;
619
620		/* move over the dependencies */
621		if (LIST_FIRST(&bp->b_dep) != NULL)
622			buf_movedeps(bp, newbp);
623
624		/*
625		 * Initiate write on the copy, release the original to
626		 * the B_LOCKED queue so that it cannot go away until
627		 * the background write completes. If not locked it could go
628		 * away and then be reconstituted while it was being written.
629		 * If the reconstituted buffer were written, we could end up
630		 * with two background copies being written at the same time.
631		 */
632		bp->b_xflags |= BX_BKGRDINPROG;
633		bp->b_flags |= B_LOCKED;
634		bqrelse(bp);
635		bp = newbp;
636	}
637
638	bp->b_flags &= ~B_DONE;
639	bp->b_ioflags &= ~BIO_ERROR;
640	bp->b_flags |= B_WRITEINPROG | B_CACHE;
641	bp->b_iocmd = BIO_WRITE;
642
643	bp->b_vp->v_numoutput++;
644	vfs_busy_pages(bp, 1);
645	if (curproc != idleproc)
646		curproc->p_stats->p_ru.ru_oublock++;
647	splx(s);
648	if (oldflags & B_ASYNC)
649		BUF_KERNPROC(bp);
650	BUF_STRATEGY(bp);
651
652	if ((oldflags & B_ASYNC) == 0) {
653		int rtval = bufwait(bp);
654		brelse(bp);
655		return (rtval);
656	}
657
658	return (0);
659}
660
661/*
662 * Complete a background write started from bwrite.
663 */
664static void
665vfs_backgroundwritedone(bp)
666	struct buf *bp;
667{
668	struct buf *origbp;
669
670	/*
671	 * Find the original buffer that we are writing.
672	 */
673	if ((origbp = gbincore(bp->b_vp, bp->b_lblkno)) == NULL)
674		panic("backgroundwritedone: lost buffer");
675	/*
676	 * Process dependencies then return any unfinished ones.
677	 */
678	if (LIST_FIRST(&bp->b_dep) != NULL)
679		buf_complete(bp);
680	if (LIST_FIRST(&bp->b_dep) != NULL)
681		buf_movedeps(bp, origbp);
682	/*
683	 * Clear the BX_BKGRDINPROG flag in the original buffer
684	 * and awaken it if it is waiting for the write to complete.
685	 */
686	origbp->b_xflags &= ~BX_BKGRDINPROG;
687	if (origbp->b_xflags & BX_BKGRDWAIT) {
688		origbp->b_xflags &= ~BX_BKGRDWAIT;
689		wakeup(&origbp->b_xflags);
690	}
691	/*
692	 * Clear the B_LOCKED flag and remove it from the locked
693	 * queue if it currently resides there.
694	 */
695	origbp->b_flags &= ~B_LOCKED;
696	if (BUF_LOCK(origbp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
697		bremfree(origbp);
698		bqrelse(origbp);
699	}
700	/*
701	 * This buffer is marked B_NOCACHE, so when it is released
702	 * by biodone, it will be tossed. We mark it with BIO_READ
703	 * to avoid biodone doing a second vwakeup.
704	 */
705	bp->b_flags |= B_NOCACHE;
706	bp->b_iocmd = BIO_READ;
707	bp->b_flags &= ~(B_CACHE | B_DONE);
708	bp->b_iodone = 0;
709	bufdone(bp);
710}
711
712/*
713 * Delayed write. (Buffer is marked dirty).  Do not bother writing
714 * anything if the buffer is marked invalid.
715 *
716 * Note that since the buffer must be completely valid, we can safely
717 * set B_CACHE.  In fact, we have to set B_CACHE here rather than in
718 * biodone() in order to prevent getblk from writing the buffer
719 * out synchronously.
720 */
721void
722bdwrite(struct buf * bp)
723{
724	if (BUF_REFCNT(bp) == 0)
725		panic("bdwrite: buffer is not busy");
726
727	if (bp->b_flags & B_INVAL) {
728		brelse(bp);
729		return;
730	}
731	bdirty(bp);
732
733	/*
734	 * Set B_CACHE, indicating that the buffer is fully valid.  This is
735	 * true even of NFS now.
736	 */
737	bp->b_flags |= B_CACHE;
738
739	/*
740	 * This bmap keeps the system from needing to do the bmap later,
741	 * perhaps when the system is attempting to do a sync.  Since it
742	 * is likely that the indirect block -- or whatever other data structure
743	 * the filesystem needs -- is still in memory now, it is a good
744	 * thing to do this.  Note also that if the pageout daemon is
745	 * requesting a sync -- there might not be enough memory to do
746	 * the bmap then...  So, this is important to do.
747	 */
748	if (bp->b_lblkno == bp->b_blkno) {
749		VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
750	}
751
752	/*
753	 * Set the *dirty* buffer range based upon the VM system dirty pages.
754	 */
755	vfs_setdirty(bp);
756
757	/*
758	 * We need to do this here to satisfy the vnode_pager and the
759	 * pageout daemon, so that it thinks that the pages have been
760	 * "cleaned".  Note that since the pages are in a delayed write
761	 * buffer -- the VFS layer "will" see that the pages get written
762	 * out on the next sync, or perhaps the cluster will be completed.
763	 */
764	vfs_clean_pages(bp);
765	bqrelse(bp);
766
767	/*
768	 * Wakeup the buffer flushing daemon if we have saturated the
769	 * buffer cache.
770	 */
771
772	bd_wakeup(hidirtybuffers);
773
774	/*
775	 * note: we cannot initiate I/O from a bdwrite even if we wanted to,
776	 * due to the softdep code.
777	 */
778}
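/*
 * Usage sketch (hypothetical): a caller updating metadata in place reads
 * the block, modifies b_data and then calls bdwrite() so the write is
 * batched instead of issued synchronously.  vp, lbn and bsize are
 * placeholders.
 *
 *	if ((error = bread(vp, lbn, bsize, NOCRED, &bp)) != 0) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	... modify bp->b_data ...
 *	bdwrite(bp);	marks the buffer B_DELWRI and releases it
 */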
779
780/*
781 *	bdirty:
782 *
783 *	Turn buffer into delayed write request.  We must clear BIO_READ and
784 *	B_RELBUF, and we must set B_DELWRI.  We reassign the buffer to
785 *	itself to properly update it in the dirty/clean lists.  We mark it
786 *	B_DONE to ensure that any asynchronization of the buffer properly
787 *	clears B_DONE ( else a panic will occur later ).
788 *
789 *	bdirty() is kinda like bdwrite() - we have to clear B_INVAL which
790 *	might have been set pre-getblk().  Unlike bwrite/bdwrite, bdirty()
791 *	should only be called if the buffer is known-good.
792 *
793 *	Since the buffer is not on a queue, we do not update the numfreebuffers
794 *	count.
795 *
796 *	Must be called at splbio().
797 *	The buffer must be on QUEUE_NONE.
798 */
799void
800bdirty(bp)
801	struct buf *bp;
802{
803	KASSERT(bp->b_qindex == QUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex));
804	bp->b_flags &= ~(B_RELBUF);
805	bp->b_iocmd = BIO_WRITE;
806
807	if ((bp->b_flags & B_DELWRI) == 0) {
808		bp->b_flags |= B_DONE | B_DELWRI;
809		reassignbuf(bp, bp->b_vp);
810		++numdirtybuffers;
811		bd_wakeup(hidirtybuffers);
812	}
813}
814
815/*
816 *	bundirty:
817 *
818 *	Clear B_DELWRI for buffer.
819 *
820 *	Since the buffer is not on a queue, we do not update the numfreebuffers
821 *	count.
822 *
823 *	Must be called at splbio().
824 *	The buffer must be on QUEUE_NONE.
825 */
826
827void
828bundirty(bp)
829	struct buf *bp;
830{
831	KASSERT(bp->b_qindex == QUEUE_NONE, ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex));
832
833	if (bp->b_flags & B_DELWRI) {
834		bp->b_flags &= ~B_DELWRI;
835		reassignbuf(bp, bp->b_vp);
836		--numdirtybuffers;
837		numdirtywakeup();
838	}
839	/*
840	 * Since it is now being written, we can clear its deferred write flag.
841	 */
842	bp->b_flags &= ~B_DEFERRED;
843}
844
845/*
846 *	bawrite:
847 *
848 *	Asynchronous write.  Start output on a buffer, but do not wait for
849 *	it to complete.  The buffer is released when the output completes.
850 *
851 *	bwrite() ( or the VOP routine anyway ) is responsible for handling
852 *	B_INVAL buffers.  Not us.
853 */
854void
855bawrite(struct buf * bp)
856{
857	bp->b_flags |= B_ASYNC;
858	(void) BUF_WRITE(bp);
859}
860
861/*
862 *	bowrite:
863 *
864 *	Ordered write.  Start output on a buffer, and flag it so that the
865 *	device will write it in the order it was queued.  The buffer is
866 *	released when the output completes.  bwrite() ( or the VOP routine
867 *	anyway ) is responsible for handling B_INVAL buffers.
868 */
869int
870bowrite(struct buf * bp)
871{
872	bp->b_ioflags |= BIO_ORDERED;
873	bp->b_flags |= B_ASYNC;
874	return (BUF_WRITE(bp));
875}
876
877/*
878 *	bwillwrite:
879 *
880 *	Called prior to the locking of any vnodes when we are expecting to
881 *	write.  We do not want to starve the buffer cache with too many
882 *	dirty buffers so we block here.  By blocking prior to the locking
883 *	of any vnodes we attempt to avoid the situation where a locked vnode
884 *	prevents the various system daemons from flushing related buffers.
885 */
886
887void
888bwillwrite(void)
889{
890	int slop = hidirtybuffers / 10;
891
892	if (numdirtybuffers > hidirtybuffers + slop) {
893		int s;
894
895		s = splbio();
896		while (numdirtybuffers > hidirtybuffers) {
897			bd_wakeup(hidirtybuffers);
898			needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH;
899			tsleep(&needsbuffer, (PRIBIO + 4), "flswai", 0);
900		}
901		splx(s);
902	}
903}
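/*
 * Call pattern sketch (illustrative, not part of this file): a write path
 * throttles on dirty buffers before it takes any vnode locks, roughly:
 *
 *	bwillwrite();
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
 *	error = VOP_WRITE(vp, uio, ioflag, cred);
 *	VOP_UNLOCK(vp, 0, p);
 *
 * so a process sleeping in the loop above never does so while holding a
 * vnode lock that the flushing daemons might need.
 */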
904
905/*
906 *	brelse:
907 *
908 *	Release a busy buffer and, if requested, free its resources.  The
909 *	buffer will be stashed in the appropriate bufqueue[] allowing it
910 *	to be accessed later as a cache entity or reused for other purposes.
911 */
912void
913brelse(struct buf * bp)
914{
915	int s;
916
917	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
918
919	s = splbio();
920
921	if (bp->b_flags & B_LOCKED)
922		bp->b_ioflags &= ~BIO_ERROR;
923
924	if (bp->b_iocmd == BIO_WRITE &&
925	    (bp->b_ioflags & BIO_ERROR) &&
926	    !(bp->b_flags & B_INVAL)) {
927		/*
928		 * Failed write, redirty.  Must clear BIO_ERROR to prevent
929		 * pages from being scrapped.  If B_INVAL is set then
930		 * this case is not run and the next case is run to
931		 * destroy the buffer.  B_INVAL can occur if the buffer
932		 * is outside the range supported by the underlying device.
933		 */
934		bp->b_ioflags &= ~BIO_ERROR;
935		bdirty(bp);
936	} else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) ||
937	    (bp->b_ioflags & BIO_ERROR) ||
938	    bp->b_iocmd == BIO_DELETE || (bp->b_bufsize <= 0)) {
939		/*
940		 * Either a failed I/O or we were asked to free or not
941		 * cache the buffer.
942		 */
943		bp->b_flags |= B_INVAL;
944		if (LIST_FIRST(&bp->b_dep) != NULL)
945			buf_deallocate(bp);
946		if (bp->b_flags & B_DELWRI) {
947			--numdirtybuffers;
948			numdirtywakeup();
949		}
950		bp->b_flags &= ~(B_DELWRI | B_CACHE);
951		if ((bp->b_flags & B_VMIO) == 0) {
952			if (bp->b_bufsize)
953				allocbuf(bp, 0);
954			if (bp->b_vp)
955				brelvp(bp);
956		}
957	}
958
959	/*
960	 * We must clear B_RELBUF if B_DELWRI is set.  If vfs_vmio_release()
961	 * is called with B_DELWRI set, the underlying pages may wind up
962	 * getting freed causing a previous write (bdwrite()) to get 'lost'
963	 * because pages associated with a B_DELWRI bp are marked clean.
964	 *
965	 * We still allow the B_INVAL case to call vfs_vmio_release(), even
966	 * if B_DELWRI is set.
967	 */
968
969	if (bp->b_flags & B_DELWRI)
970		bp->b_flags &= ~B_RELBUF;
971
972	/*
973	 * VMIO buffer rundown.  It is not very necessary to keep a VMIO buffer
974	 * constituted, not even NFS buffers now.  Two flags affect this.  If
975	 * B_INVAL, the struct buf is invalidated but the VM object is kept
976	 * around ( i.e. so it is trivial to reconstitute the buffer later ).
977	 *
978	 * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be
979	 * invalidated.  BIO_ERROR cannot be set for a failed write unless the
980	 * buffer is also B_INVAL because it hits the re-dirtying code above.
981	 *
982	 * Normally we can do this whether a buffer is B_DELWRI or not.  If
983	 * the buffer is an NFS buffer, it is tracking piecemeal writes or
984	 * the commit state and we cannot afford to lose the buffer. If the
985	 * buffer has a background write in progress, we need to keep it
986	 * around to prevent it from being reconstituted and starting a second
987	 * background write.
988	 */
989	if ((bp->b_flags & B_VMIO)
990	    && !(bp->b_vp->v_tag == VT_NFS &&
991		 !vn_isdisk(bp->b_vp, NULL) &&
992		 (bp->b_flags & B_DELWRI) &&
993		 (bp->b_xflags & BX_BKGRDINPROG))
994	    ) {
995
996		int i, j, resid;
997		vm_page_t m;
998		off_t foff;
999		vm_pindex_t poff;
1000		vm_object_t obj;
1001		struct vnode *vp;
1002
1003		vp = bp->b_vp;
1004
1005		/*
1006		 * Get the base offset and length of the buffer.  Note that
1007		 * for block sizes that are less than PAGE_SIZE, the b_data
1008		 * base of the buffer does not represent exactly b_offset and
1009		 * neither b_offset nor b_size are necessarily page aligned.
1010		 * Instead, the starting position of b_offset is:
1011		 *
1012		 * 	b_data + (b_offset & PAGE_MASK)
1013		 *
1014		 * block sizes less than DEV_BSIZE (usually 512) are not
1015		 * supported due to the page granularity bits (m->valid,
1016		 * m->dirty, etc...).
1017		 *
1018		 * See man buf(9) for more information
1019		 */
1020
1021		resid = bp->b_bufsize;
1022		foff = bp->b_offset;
1023
1024		for (i = 0; i < bp->b_npages; i++) {
1025			m = bp->b_pages[i];
1026			vm_page_flag_clear(m, PG_ZERO);
1027			if (m == bogus_page) {
1028
1029				VOP_GETVOBJECT(vp, &obj);
1030				poff = OFF_TO_IDX(bp->b_offset);
1031
1032				for (j = i; j < bp->b_npages; j++) {
1033					m = bp->b_pages[j];
1034					if (m == bogus_page) {
1035						m = vm_page_lookup(obj, poff + j);
1036						if (!m) {
1037							panic("brelse: page missing\n");
1038						}
1039						bp->b_pages[j] = m;
1040					}
1041				}
1042
1043				if ((bp->b_flags & B_INVAL) == 0) {
1044					pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
1045				}
1046			}
1047			if ((bp->b_flags & B_NOCACHE) || (bp->b_ioflags & BIO_ERROR)) {
1048				int poffset = foff & PAGE_MASK;
1049				int presid = resid > (PAGE_SIZE - poffset) ?
1050					(PAGE_SIZE - poffset) : resid;
1051
1052				KASSERT(presid >= 0, ("brelse: extra page"));
1053				vm_page_set_invalid(m, poffset, presid);
1054			}
1055			resid -= PAGE_SIZE - (foff & PAGE_MASK);
1056			foff = (foff + PAGE_SIZE) & ~PAGE_MASK;
1057		}
1058
1059		if (bp->b_flags & (B_INVAL | B_RELBUF))
1060			vfs_vmio_release(bp);
1061
1062	} else if (bp->b_flags & B_VMIO) {
1063
1064		if (bp->b_flags & (B_INVAL | B_RELBUF))
1065			vfs_vmio_release(bp);
1066
1067	}
1068
1069	if (bp->b_qindex != QUEUE_NONE)
1070		panic("brelse: free buffer onto another queue???");
1071	if (BUF_REFCNT(bp) > 1) {
1072		/* do not release to free list */
1073		BUF_UNLOCK(bp);
1074		splx(s);
1075		return;
1076	}
1077
1078	/* enqueue */
1079
1080	/* buffers with no memory */
1081	if (bp->b_bufsize == 0) {
1082		bp->b_flags |= B_INVAL;
1083		bp->b_xflags &= ~BX_BKGRDWRITE;
1084		if (bp->b_xflags & BX_BKGRDINPROG)
1085			panic("losing buffer 1");
1086		if (bp->b_kvasize) {
1087			bp->b_qindex = QUEUE_EMPTYKVA;
1088		} else {
1089			bp->b_qindex = QUEUE_EMPTY;
1090		}
1091		TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
1092		LIST_REMOVE(bp, b_hash);
1093		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
1094		bp->b_dev = NODEV;
1095	/* buffers with junk contents */
1096	} else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) || (bp->b_ioflags & BIO_ERROR)) {
1097		bp->b_flags |= B_INVAL;
1098		bp->b_xflags &= ~BX_BKGRDWRITE;
1099		if (bp->b_xflags & BX_BKGRDINPROG)
1100			panic("losing buffer 2");
1101		bp->b_qindex = QUEUE_CLEAN;
1102		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
1103		LIST_REMOVE(bp, b_hash);
1104		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
1105		bp->b_dev = NODEV;
1106
1107	/* buffers that are locked */
1108	} else if (bp->b_flags & B_LOCKED) {
1109		bp->b_qindex = QUEUE_LOCKED;
1110		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
1111
1112	/* remaining buffers */
1113	} else {
1114		switch(bp->b_flags & (B_DELWRI|B_AGE)) {
1115		case B_DELWRI | B_AGE:
1116		    bp->b_qindex = QUEUE_DIRTY;
1117		    TAILQ_INSERT_HEAD(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
1118		    break;
1119		case B_DELWRI:
1120		    bp->b_qindex = QUEUE_DIRTY;
1121		    TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
1122		    break;
1123		case B_AGE:
1124		    bp->b_qindex = QUEUE_CLEAN;
1125		    TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
1126		    break;
1127		default:
1128		    bp->b_qindex = QUEUE_CLEAN;
1129		    TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
1130		    break;
1131		}
1132	}
1133
1134	/*
1135	 * If B_INVAL, clear B_DELWRI.  We've already placed the buffer
1136	 * on the correct queue.
1137	 */
1138	if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI)) {
1139		bp->b_flags &= ~B_DELWRI;
1140		--numdirtybuffers;
1141		numdirtywakeup();
1142	}
1143
1144	runningbufspace -= bp->b_bufsize;
1145
1146	/*
1147	 * Fixup numfreebuffers count.  The bp is on an appropriate queue
1148	 * unless locked.  We then bump numfreebuffers if it is not B_DELWRI.
1149	 * We've already handled the B_INVAL case ( B_DELWRI will be clear
1150	 * if B_INVAL is set ).
1151	 */
1152
1153	if ((bp->b_flags & B_LOCKED) == 0 && !(bp->b_flags & B_DELWRI))
1154		bufcountwakeup();
1155
1156	/*
1157	 * Something we can maybe free.
1158	 */
1159
1160	if (bp->b_bufsize || bp->b_kvasize)
1161		bufspacewakeup();
1162
1163	/* unlock */
1164	BUF_UNLOCK(bp);
1165	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
1166	bp->b_ioflags &= ~BIO_ORDERED;
1167	if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
1168		panic("brelse: not dirty");
1169	splx(s);
1170}
1171
1172/*
1173 * Release a buffer back to the appropriate queue but do not try to free
1174 * it.
1175 *
1176 * bqrelse() is used by bdwrite() to requeue a delayed write, and used by
1177 * biodone() to requeue an async I/O on completion.  It is also used when
1178 * known good buffers need to be requeued but we think we may need the data
1179 * again soon.
1180 */
1181void
1182bqrelse(struct buf * bp)
1183{
1184	int s;
1185
1186	s = splbio();
1187
1188	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
1189
1190	if (bp->b_qindex != QUEUE_NONE)
1191		panic("bqrelse: free buffer onto another queue???");
1192	if (BUF_REFCNT(bp) > 1) {
1193		/* do not release to free list */
1194		BUF_UNLOCK(bp);
1195		splx(s);
1196		return;
1197	}
1198	if (bp->b_flags & B_LOCKED) {
1199		bp->b_ioflags &= ~BIO_ERROR;
1200		bp->b_qindex = QUEUE_LOCKED;
1201		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
1202		/* buffers with stale but valid contents */
1203	} else if (bp->b_flags & B_DELWRI) {
1204		bp->b_qindex = QUEUE_DIRTY;
1205		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
1206	} else {
1207		bp->b_qindex = QUEUE_CLEAN;
1208		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
1209	}
1210
1211	runningbufspace -= bp->b_bufsize;
1212
1213	if ((bp->b_flags & B_LOCKED) == 0 &&
1214	    ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))) {
1215		bufcountwakeup();
1216	}
1217
1218	/*
1219	 * Something we can maybe wakeup
1220	 */
1221	if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
1222		bufspacewakeup();
1223
1224	/* unlock */
1225	BUF_UNLOCK(bp);
1226	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
1227	bp->b_ioflags &= ~BIO_ORDERED;
1228	if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
1229		panic("bqrelse: not dirty");
1230	splx(s);
1231}
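/*
 * Rule of thumb (illustrative, not from the original source): callers that
 * are done with the contents tag the buffer and use brelse(), while callers
 * that expect to need the block again soon use bqrelse():
 *
 *	bp->b_flags |= B_INVAL;		contents are junk, free resources
 *	brelse(bp);
 *
 *	bqrelse(bp);			keep the data cached on its queue
 */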
1232
1233static void
1234vfs_vmio_release(bp)
1235	struct buf *bp;
1236{
1237	int i, s;
1238	vm_page_t m;
1239
1240	s = splvm();
1241	for (i = 0; i < bp->b_npages; i++) {
1242		m = bp->b_pages[i];
1243		bp->b_pages[i] = NULL;
1244		/*
1245		 * In order to keep page LRU ordering consistent, put
1246		 * everything on the inactive queue.
1247		 */
1248		vm_page_unwire(m, 0);
1249		/*
1250		 * We don't mess with busy pages, it is
1251		 * the responsibility of the process that
1252		 * busied the pages to deal with them.
1253		 */
1254		if ((m->flags & PG_BUSY) || (m->busy != 0))
1255			continue;
1256
1257		if (m->wire_count == 0) {
1258			vm_page_flag_clear(m, PG_ZERO);
1259			/*
1260			 * Might as well free the page if we can and it has
1261			 * no valid data.
1262			 */
1263			if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && m->hold_count == 0) {
1264				vm_page_busy(m);
1265				vm_page_protect(m, VM_PROT_NONE);
1266				vm_page_free(m);
1267			}
1268		}
1269	}
1270	runningbufspace -= bp->b_bufsize;
1271	splx(s);
1272	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
1273	if (bp->b_bufsize)
1274		bufspacewakeup();
1275	bp->b_npages = 0;
1276	bp->b_bufsize = 0;
1277	bp->b_flags &= ~B_VMIO;
1278	if (bp->b_vp)
1279		brelvp(bp);
1280}
1281
1282/*
1283 * Check to see if a block is currently memory resident.
1284 */
1285struct buf *
1286gbincore(struct vnode * vp, daddr_t blkno)
1287{
1288	struct buf *bp;
1289	struct bufhashhdr *bh;
1290
1291	bh = bufhash(vp, blkno);
1292
1293	/* Search hash chain */
1294	LIST_FOREACH(bp, bh, b_hash) {
1295		/* hit */
1296		if (bp->b_vp == vp && bp->b_lblkno == blkno &&
1297		    (bp->b_flags & B_INVAL) == 0) {
1298			break;
1299		}
1300	}
1301	return (bp);
1302}
1303
1304/*
1305 *	vfs_bio_awrite:
1306 *
1307 *	Implement clustered async writes for clearing out B_DELWRI buffers.
1308 *	This is much better than the old way of writing only one buffer at
1309 *	a time.  Note that we may not be presented with the buffers in the
1310 *	correct order, so we search for the cluster in both directions.
1311 */
1312int
1313vfs_bio_awrite(struct buf * bp)
1314{
1315	int i;
1316	int j;
1317	daddr_t lblkno = bp->b_lblkno;
1318	struct vnode *vp = bp->b_vp;
1319	int s;
1320	int ncl;
1321	struct buf *bpa;
1322	int nwritten;
1323	int size;
1324	int maxcl;
1325
1326	s = splbio();
1327	/*
1328	 * right now we support clustered writing only to regular files.  If
1329	 * we find a clusterable block we could be in the middle of a cluster
1330	 * rather then at the beginning.
1331	 * rather than at the beginning.
1332	if ((vp->v_type == VREG) &&
1333	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
1334	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
1335
1336		size = vp->v_mount->mnt_stat.f_iosize;
1337		maxcl = MAXPHYS / size;
1338
1339		for (i = 1; i < maxcl; i++) {
1340			if ((bpa = gbincore(vp, lblkno + i)) &&
1341			    BUF_REFCNT(bpa) == 0 &&
1342			    ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
1343			    (B_DELWRI | B_CLUSTEROK)) &&
1344			    (bpa->b_bufsize == size)) {
1345				if ((bpa->b_blkno == bpa->b_lblkno) ||
1346				    (bpa->b_blkno !=
1347				     bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
1348					break;
1349			} else {
1350				break;
1351			}
1352		}
1353		for (j = 1; i + j <= maxcl && j <= lblkno; j++) {
1354			if ((bpa = gbincore(vp, lblkno - j)) &&
1355			    BUF_REFCNT(bpa) == 0 &&
1356			    ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
1357			    (B_DELWRI | B_CLUSTEROK)) &&
1358			    (bpa->b_bufsize == size)) {
1359				if ((bpa->b_blkno == bpa->b_lblkno) ||
1360				    (bpa->b_blkno !=
1361				     bp->b_blkno - ((j * size) >> DEV_BSHIFT)))
1362					break;
1363			} else {
1364				break;
1365			}
1366		}
1367		--j;
1368		ncl = i + j;
1369		/*
1370		 * this is a possible cluster write
1371		 */
1372		if (ncl != 1) {
1373			nwritten = cluster_wbuild(vp, size, lblkno - j, ncl);
1374			splx(s);
1375			return nwritten;
1376		}
1377	}
1378
1379	BUF_LOCK(bp, LK_EXCLUSIVE);
1380	bremfree(bp);
1381	bp->b_flags |= B_ASYNC;
1382
1383	splx(s);
1384	/*
1385	 * default (old) behavior, writing out only one block
1386	 *
1387	 * XXX returns b_bufsize instead of b_bcount for nwritten?
1388	 */
1389	nwritten = bp->b_bufsize;
1390	(void) BUF_WRITE(bp);
1391
1392	return nwritten;
1393}
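/*
 * Example of the cluster scan above (illustrative, assuming MAXPHYS = 128K
 * and an 8K f_iosize): maxcl is 16, so up to 16 contiguous delayed-write
 * buffers found by scanning forward from lblkno and then backward are
 * handed to cluster_wbuild() as a single write.
 */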
1394
1395/*
1396 *	getnewbuf:
1397 *
1398 *	Find and initialize a new buffer header, freeing up existing buffers
1399 *	in the bufqueues as necessary.  The new buffer is returned locked.
1400 *
1401 *	Important:  B_INVAL is not set.  If the caller wishes to throw the
1402 *	buffer away, the caller must set B_INVAL prior to calling brelse().
1403 *
1404 *	We block if:
1405 *		We have insufficient buffer headers
1406 *		We have insufficient buffer space
1407 *		buffer_map is too fragmented ( space reservation fails )
1408 *		If we have to flush dirty buffers ( but we try to avoid this )
1409 *
1410 *	To avoid VFS layer recursion we do not flush dirty buffers ourselves.
1411 *	Instead we ask the buf daemon to do it for us.  We attempt to
1412 *	avoid piecemeal wakeups of the pageout daemon.
1413 */
1414
1415static struct buf *
1416getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
1417{
1418	struct buf *bp;
1419	struct buf *nbp;
1420	int defrag = 0;
1421	int nqindex;
1422	int isspecial;
1423	static int flushingbufs;
1424
1425	if (curproc != idleproc &&
1426	    (curproc->p_flag & (P_COWINPROGRESS|P_BUFEXHAUST)) == 0)
1427		isspecial = 0;
1428	else
1429		isspecial = 1;
1430
1431	++getnewbufcalls;
1432	--getnewbufrestarts;
1433restart:
1434	++getnewbufrestarts;
1435
1436	/*
1437	 * Setup for scan.  If we do not have enough free buffers,
1438	 * we setup a degenerate case that immediately fails.  Note
1439	 * we set up a degenerate case that immediately fails.  Note
1440	 * that if we are a specially marked process, we are allowed to
1441	 *
1442	 * The scanning sequence is nominally:  EMPTY->EMPTYKVA->CLEAN
1443	 *
1444	 * We start with EMPTYKVA.  If the list is empty we back up to EMPTY.
1445	 * However, there are a number of cases (defragging, reusing, ...)
1446	 * where we cannot backup.
1447	 * where we cannot back up.
1448
1449	if (isspecial == 0 && numfreebuffers < lofreebuffers) {
1450		/*
1451		 * This will cause an immediate failure
1452		 */
1453		nqindex = QUEUE_CLEAN;
1454		nbp = NULL;
1455	} else {
1456		/*
1457		 * Locate a buffer which already has KVA assigned.  First
1458		 * try EMPTYKVA buffers.
1459		 */
1460		nqindex = QUEUE_EMPTYKVA;
1461		nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
1462
1463		if (nbp == NULL) {
1464			/*
1465			 * If no EMPTYKVA buffers and we are either
1466			 * defragging or reusing, locate a CLEAN buffer
1467			 * to free or reuse.  If bufspace usage is low
1468			 * skip this step so we can allocate a new buffer.
1469			 */
1470			if (defrag || bufspace >= lobufspace) {
1471				nqindex = QUEUE_CLEAN;
1472				nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
1473			}
1474
1475			/*
1476			 * Nada.  If we are allowed to allocate an EMPTY
1477			 * buffer, go get one.
1478			 */
1479			if (nbp == NULL && defrag == 0 &&
1480			    (isspecial || bufspace < hibufspace)) {
1481				nqindex = QUEUE_EMPTY;
1482				nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
1483			}
1484		}
1485	}
1486
1487	/*
1488	 * Run scan, possibly freeing data and/or kva mappings on the fly
1489	 * depending.
1490	 */
1491
1492	while ((bp = nbp) != NULL) {
1493		int qindex = nqindex;
1494
1495		/*
1496		 * Calculate next bp ( we can only use it if we do not block
1497		 * or do other fancy things ).
1498		 */
1499		if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) {
1500			switch(qindex) {
1501			case QUEUE_EMPTY:
1502				nqindex = QUEUE_EMPTYKVA;
1503				if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA])))
1504					break;
1505				/* fall through */
1506			case QUEUE_EMPTYKVA:
1507				nqindex = QUEUE_CLEAN;
1508				if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN])))
1509					break;
1510				/* fall through */
1511			case QUEUE_CLEAN:
1512				/*
1513				 * nbp is NULL.
1514				 */
1515				break;
1516			}
1517		}
1518
1519		/*
1520		 * Sanity Checks
1521		 */
1522		KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
1523
1524		/*
1525		 * Note: we no longer distinguish between VMIO and non-VMIO
1526		 * buffers.
1527		 */
1528
1529		KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex));
1530
1531		/*
1532		 * If we are defragging then we need a buffer with
1533		 * b_kvasize != 0.  XXX this situation should no longer
1534		 * occur; if defrag is non-zero the buffer's b_kvasize
1535		 * should also be non-zero at this point.  XXX
1536		 */
1537		if (defrag && bp->b_kvasize == 0) {
1538			printf("Warning: defrag empty buffer %p\n", bp);
1539			continue;
1540		}
1541
1542		/*
1543		 * Start freeing the bp.  This is somewhat involved.  nbp
1544		 * remains valid only for QUEUE_EMPTY[KVA] bp's.
1545		 */
1546
1547		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
1548			panic("getnewbuf: locked buf");
1549		bremfree(bp);
1550
1551		if (qindex == QUEUE_CLEAN) {
1552			if (bp->b_flags & B_VMIO) {
1553				bp->b_flags &= ~B_ASYNC;
1554				vfs_vmio_release(bp);
1555			}
1556			if (bp->b_vp)
1557				brelvp(bp);
1558		}
1559
1560		/*
1561		 * NOTE:  nbp is now entirely invalid.  We can only restart
1562		 * the scan from this point on.
1563		 *
1564		 * Get the rest of the buffer freed up.  b_kva* is still
1565		 * valid after this operation.
1566		 */
1567
1568		if (bp->b_rcred != NOCRED) {
1569			crfree(bp->b_rcred);
1570			bp->b_rcred = NOCRED;
1571		}
1572		if (bp->b_wcred != NOCRED) {
1573			crfree(bp->b_wcred);
1574			bp->b_wcred = NOCRED;
1575		}
1576		if (LIST_FIRST(&bp->b_dep) != NULL)
1577			buf_deallocate(bp);
1578		if (bp->b_xflags & BX_BKGRDINPROG)
1579			panic("losing buffer 3");
1580		LIST_REMOVE(bp, b_hash);
1581		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
1582
1583		if (bp->b_bufsize)
1584			allocbuf(bp, 0);
1585
1586		bp->b_flags = 0;
1587		bp->b_ioflags = 0;
1588		bp->b_xflags = 0;
1589		bp->b_dev = NODEV;
1590		bp->b_vp = NULL;
1591		bp->b_blkno = bp->b_lblkno = 0;
1592		bp->b_offset = NOOFFSET;
1593		bp->b_iodone = 0;
1594		bp->b_error = 0;
1595		bp->b_resid = 0;
1596		bp->b_bcount = 0;
1597		bp->b_npages = 0;
1598		bp->b_dirtyoff = bp->b_dirtyend = 0;
1599
1600		LIST_INIT(&bp->b_dep);
1601
1602		/*
1603		 * If we are defragging then free the buffer.
1604		 */
1605		if (defrag) {
1606			bp->b_flags |= B_INVAL;
1607			bfreekva(bp);
1608			brelse(bp);
1609			defrag = 0;
1610			goto restart;
1611		}
1612
1613		/*
1614		 * If we are a normal process then deal with bufspace
1615		 * hysteresis.  A normal process tries to keep bufspace
1616		 * between lobufspace and hibufspace.  Note: if we encounter
1617		 * a buffer with b_kvasize == 0 then it means we started
1618		 * our scan on the EMPTY list and should allocate a new
1619		 * buffer.
1620		 */
1621		if (isspecial == 0) {
1622			if (bufspace > hibufspace)
1623				flushingbufs = 1;
1624			if (flushingbufs && bp->b_kvasize != 0) {
1625				bp->b_flags |= B_INVAL;
1626				bfreekva(bp);
1627				brelse(bp);
1628				goto restart;
1629			}
1630			if (bufspace < lobufspace)
1631				flushingbufs = 0;
1632		}
1633		break;
1634	}
1635
1636	/*
1637	 * If we exhausted our list, sleep as appropriate.  We may have to
1638	 * wakeup various daemons and write out some dirty buffers.
1639	 *
1640	 * Generally we are sleeping due to insufficient buffer space.
1641	 */
1642
1643	if (bp == NULL) {
1644		int flags;
1645		char *waitmsg;
1646
1647		if (defrag) {
1648			flags = VFS_BIO_NEED_BUFSPACE;
1649			waitmsg = "nbufkv";
1650		} else if (bufspace >= hibufspace) {
1651			waitmsg = "nbufbs";
1652			flags = VFS_BIO_NEED_BUFSPACE;
1653		} else {
1654			waitmsg = "newbuf";
1655			flags = VFS_BIO_NEED_ANY;
1656		}
1657
1658		bd_speedup();	/* heeeelp */
1659
1660		needsbuffer |= flags;
1661		while (needsbuffer & flags) {
1662			if (tsleep(&needsbuffer, (PRIBIO + 4) | slpflag,
1663			    waitmsg, slptimeo))
1664				return (NULL);
1665		}
1666	} else {
1667		/*
1668		 * We finally have a valid bp.  We aren't quite out of the
1669		 * woods, we still have to reserve kva space.  In order
1670		 * to keep fragmentation sane we only allocate kva in
1671		 * BKVASIZE chunks.
1672		 */
1673		maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;
1674
1675		if (maxsize != bp->b_kvasize) {
1676			vm_offset_t addr = 0;
1677
1678			bfreekva(bp);
1679
1680			if (vm_map_findspace(buffer_map,
1681				vm_map_min(buffer_map), maxsize, &addr)) {
1682				/*
1683				 * Uh oh.  Buffer map is too fragmented.  We
1684				 * must defragment the map.
1685				 */
1686				++bufdefragcnt;
1687				defrag = 1;
1688				bp->b_flags |= B_INVAL;
1689				brelse(bp);
1690				goto restart;
1691			}
1692			if (addr) {
1693				vm_map_insert(buffer_map, NULL, 0,
1694					addr, addr + maxsize,
1695					VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
1696
1697				bp->b_kvabase = (caddr_t) addr;
1698				bp->b_kvasize = maxsize;
1699				bufspace += bp->b_kvasize;
1700				++bufreusecnt;
1701			}
1702		}
1703		bp->b_data = bp->b_kvabase;
1704	}
1705	return(bp);
1706}
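/*
 * Note on the kva reservation above (illustrative numbers): maxsize is
 * rounded up to a BKVASIZE boundary before buffer_map space is reserved,
 * so assuming BKVASIZE = 8K a request for a 12K buffer reserves 16K:
 *
 *	(12288 + BKVAMASK) & ~BKVAMASK = 16384
 *
 * Keeping reservations in BKVASIZE chunks is what bounds fragmentation
 * of buffer_map.
 */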
1707
1708/*
1709 *	waitfreebuffers:
1710 *
1711 *	Wait for sufficient free buffers.  Only called from normal processes.
1712 */
1713
1714static void
1715waitfreebuffers(int slpflag, int slptimeo)
1716{
1717	while (numfreebuffers < hifreebuffers) {
1718		if (numfreebuffers >= hifreebuffers)
1719			break;
1720		needsbuffer |= VFS_BIO_NEED_FREE;
1721		if (tsleep(&needsbuffer, (PRIBIO + 4)|slpflag, "biofre", slptimeo))
1722			break;
1723	}
1724}
1725
1726/*
1727 *	buf_daemon:
1728 *
1729 *	buffer flushing daemon.  Buffers are normally flushed by the
1730 *	update daemon but if it cannot keep up this process starts to
1731 *	take the load in an attempt to prevent getnewbuf() from blocking.
1732 */
1733
1734static struct proc *bufdaemonproc;
1735static int bd_interval;
1736static int bd_flushto;
1737static int bd_flushinc;
1738
1739static struct kproc_desc buf_kp = {
1740	"bufdaemon",
1741	buf_daemon,
1742	&bufdaemonproc
1743};
1744SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp)
1745
1746static void
1747buf_daemon()
1748{
1749	int s;
1750
1751	mtx_enter(&Giant, MTX_DEF);
1752
1753	/*
1754	 * This process needs to be suspended prior to shutdown sync.
1755	 */
1756	EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, bufdaemonproc,
1757	    SHUTDOWN_PRI_LAST);
1758
1759	/*
1760	 * This process is allowed to take the buffer cache to the limit
1761	 */
1762	curproc->p_flag |= P_BUFEXHAUST;
1763	s = splbio();
1764
1765	bd_interval = 5 * hz;	/* dynamically adjusted */
1766	bd_flushto = hidirtybuffers;	/* dynamically adjusted */
1767	bd_flushinc = 1;
1768
1769	for (;;) {
1770		kproc_suspend_loop(bufdaemonproc);
1771
1772		bd_request = 0;
1773
1774		/*
1775		 * Do the flush.  Limit the number of buffers we flush in one
1776		 * go.  The failure condition occurs when processes are writing
1777		 * buffers faster than we can dispose of them.  In this case
1778		 * we may be flushing so often that the previous set of flushes
1779		 * has not had time to complete, causing us to run out of
1780		 * physical buffers and block.
1781		 */
1782		{
1783			int runcount = maxbdrun;
1784
1785			while (numdirtybuffers > bd_flushto && runcount) {
1786				--runcount;
1787				if (flushbufqueues() == 0)
1788					break;
1789			}
1790		}
1791
1792		if (bd_request ||
1793		    tsleep(&bd_request, PVM, "psleep", bd_interval) == 0) {
1794			/*
1795			 * Another request is pending or we were woken up
1796			 * without timing out.  Flush more.
1797			 */
1798			--bd_flushto;
1799			if (bd_flushto >= numdirtybuffers - 5) {
1800				bd_flushto = numdirtybuffers - 10;
1801				bd_flushinc = 1;
1802			}
1803			if (bd_flushto < 2)
1804				bd_flushto = 2;
1805		} else {
1806			/*
1807			 * We slept and timed out, we can slow down.
1808			 */
1809			bd_flushto += bd_flushinc;
1810			if (bd_flushto > hidirtybuffers)
1811				bd_flushto = hidirtybuffers;
1812			++bd_flushinc;
1813			if (bd_flushinc > hidirtybuffers / 20 + 1)
1814				bd_flushinc = hidirtybuffers / 20 + 1;
1815		}
1816
1817		/*
1818		 * Set the interval on a linear scale based on hidirtybuffers
1819		 * with a maximum frequency of 1/10 second.
1820		 */
1821		bd_interval = bd_flushto * 5 * hz / hidirtybuffers;
1822		if (bd_interval < hz / 10)
1823			bd_interval = hz / 10;
1824	}
1825}
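/*
 * Worked example of the interval computation above (illustrative): with
 * hidirtybuffers = 1000 and bd_flushto backed off to 200,
 *
 *	bd_interval = 200 * 5 * hz / 1000 = hz
 *
 * so the daemon polls once per second; as bd_flushto drops toward 2 the
 * interval is clamped at hz / 10.
 */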
1826
1827/*
1828 *	flushbufqueues:
1829 *
1830 *	Try to flush a buffer in the dirty queue.  We must be careful to
1831 *	free up B_INVAL buffers instead of write them, which NFS is
1832 *	free up B_INVAL buffers instead of writing them, which NFS is
1833 */
1834
1835static int
1836flushbufqueues(void)
1837{
1838	struct buf *bp;
1839	int r = 0;
1840
1841	bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]);
1842
1843	while (bp) {
1844		KASSERT((bp->b_flags & B_DELWRI), ("unexpected clean buffer %p", bp));
1845		if ((bp->b_flags & B_DELWRI) != 0 &&
1846		    (bp->b_xflags & BX_BKGRDINPROG) == 0) {
1847			if (bp->b_flags & B_INVAL) {
1848				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
1849					panic("flushbufqueues: locked buf");
1850				bremfree(bp);
1851				brelse(bp);
1852				++r;
1853				break;
1854			}
1855			if (LIST_FIRST(&bp->b_dep) != NULL &&
1856			    (bp->b_flags & B_DEFERRED) == 0 &&
1857			    buf_countdeps(bp, 0)) {
1858				TAILQ_REMOVE(&bufqueues[QUEUE_DIRTY],
1859				    bp, b_freelist);
1860				TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY],
1861				    bp, b_freelist);
1862				bp->b_flags |= B_DEFERRED;
1863				bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]);
1864				continue;
1865			}
1866			vfs_bio_awrite(bp);
1867			++r;
1868			break;
1869		}
1870		bp = TAILQ_NEXT(bp, b_freelist);
1871	}
1872	return (r);
1873}
1874
1875/*
1876 * Check to see if a block is currently memory resident.
1877 */
1878struct buf *
1879incore(struct vnode * vp, daddr_t blkno)
1880{
1881	struct buf *bp;
1882
1883	int s = splbio();
1884	bp = gbincore(vp, blkno);
1885	splx(s);
1886	return (bp);
1887}
1888
1889/*
1890 * Returns true if no I/O is needed to access the
1891 * associated VM object.  This is like incore except
1892 * it also hunts around in the VM system for the data.
1893 */
1894
1895int
1896inmem(struct vnode * vp, daddr_t blkno)
1897{
1898	vm_object_t obj;
1899	vm_offset_t toff, tinc, size;
1900	vm_page_t m;
1901	vm_ooffset_t off;
1902
1903	if (incore(vp, blkno))
1904		return 1;
1905	if (vp->v_mount == NULL)
1906		return 0;
1907	if (VOP_GETVOBJECT(vp, &obj) != 0 || (vp->v_flag & VOBJBUF) == 0)
1908		return 0;
1909
1910	size = PAGE_SIZE;
1911	if (size > vp->v_mount->mnt_stat.f_iosize)
1912		size = vp->v_mount->mnt_stat.f_iosize;
1913	off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;
1914
1915	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
1916		m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
1917		if (!m)
1918			return 0;
1919		tinc = size;
1920		if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK))
1921			tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK);
1922		if (vm_page_is_valid(m,
1923		    (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0)
1924			return 0;
1925	}
1926	return 1;
1927}
1928
1929/*
1930 *	vfs_setdirty:
1931 *
1932 *	Sets the dirty range for a buffer based on the status of the dirty
1933 *	bits in the pages comprising the buffer.
1934 *
1935 *	The range is limited to the size of the buffer.
1936 *
1937 *	This routine is primarily used by NFS, but is generalized for the
1938 *	B_VMIO case.
1939 */
1940static void
1941vfs_setdirty(struct buf *bp)
1942{
1943	int i;
1944	vm_object_t object;
1945
1946	/*
1947	 * Degenerate case - empty buffer
1948	 */
1949
1950	if (bp->b_bufsize == 0)
1951		return;
1952
1953	/*
1954	 * We qualify the scan for modified pages on whether the
1955	 * object has been flushed yet.  The OBJ_WRITEABLE flag
1956	 * is not cleared simply by protecting pages off.
1957	 */
1958
1959	if ((bp->b_flags & B_VMIO) == 0)
1960		return;
1961
1962	object = bp->b_pages[0]->object;
1963
1964	if ((object->flags & OBJ_WRITEABLE) && !(object->flags & OBJ_MIGHTBEDIRTY))
1965		printf("Warning: object %p writeable but not mightbedirty\n", object);
1966	if (!(object->flags & OBJ_WRITEABLE) && (object->flags & OBJ_MIGHTBEDIRTY))
1967		printf("Warning: object %p mightbedirty but not writeable\n", object);
1968
1969	if (object->flags & (OBJ_MIGHTBEDIRTY|OBJ_CLEANING)) {
1970		vm_offset_t boffset;
1971		vm_offset_t eoffset;
1972
1973		/*
1974		 * test the pages to see if they have been modified directly
1975		 * by users through the VM system.
1976		 */
1977		for (i = 0; i < bp->b_npages; i++) {
1978			vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
1979			vm_page_test_dirty(bp->b_pages[i]);
1980		}
1981
1982		/*
1983		 * Calculate the encompassing dirty range, boffset and eoffset,
1984		 * (eoffset - boffset) bytes.
1985		 */
1986
1987		for (i = 0; i < bp->b_npages; i++) {
1988			if (bp->b_pages[i]->dirty)
1989				break;
1990		}
1991		boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
1992
1993		for (i = bp->b_npages - 1; i >= 0; --i) {
1994			if (bp->b_pages[i]->dirty) {
1995				break;
1996			}
1997		}
1998		eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
1999
2000		/*
2001		 * Fit it to the buffer.
2002		 */
2003
2004		if (eoffset > bp->b_bcount)
2005			eoffset = bp->b_bcount;
2006
2007		/*
2008		 * If we have a good dirty range, merge with the existing
2009		 * dirty range.
2010		 */
2011
2012		if (boffset < eoffset) {
2013			if (bp->b_dirtyoff > boffset)
2014				bp->b_dirtyoff = boffset;
2015			if (bp->b_dirtyend < eoffset)
2016				bp->b_dirtyend = eoffset;
2017		}
2018	}
2019}
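
/*
 * Worked example of the range computation above (assumed values, 4K pages):
 * take a buffer with b_offset page aligned, b_npages == 4, b_bcount == 16384,
 * and only page 2 dirty.  The forward scan stops at i == 2, so
 * boffset == 2 << PAGE_SHIFT == 8192; the backward scan also stops at i == 2,
 * so eoffset == 3 << PAGE_SHIFT == 12288.  The buffer's dirty range is then
 * merged with [8192, 12288), clipped to b_bcount.
 */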
2020
2021/*
2022 *	getblk:
2023 *
2024 *	Get a block given a specified block and offset into a file/device.
2025 *	The buffers B_DONE bit will be cleared on return, making it almost
2026 *	The buffer's B_DONE bit will be cleared on return, making it almost
2027 *	return.  The caller should clear B_INVAL prior to initiating a
2028 *	READ.
2029 *
2030 *	For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
2031 *	an existing buffer.
2032 *
2033 *	For a VMIO buffer, B_CACHE is modified according to the backing VM.
2034 *	If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
2035 *	and then cleared based on the backing VM.  If the previous buffer is
2036 *	non-0-sized but invalid, B_CACHE will be cleared.
2037 *
2038 *	If getblk() must create a new buffer, the new buffer is returned with
2039 *	both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
2040 *	case it is returned with B_INVAL clear and B_CACHE set based on the
2041 *	backing VM.
2042 *
2043 *	getblk() also forces a VOP_BWRITE() for any B_DELWRI buffer whose
2044 *	B_CACHE bit is clear.
2045 *
2046 *	What this means, basically, is that the caller should use B_CACHE to
2047 *	determine whether the buffer is fully valid or not and should clear
2048 *	B_INVAL prior to issuing a read.  If the caller intends to validate
2049 *	the buffer by loading its data area with something, the caller needs
2050 *	to clear B_INVAL.  If the caller does this without issuing an I/O,
2051 *	the caller should set B_CACHE ( as an optimization ), else the caller
2052 *	should issue the I/O and biodone() will set B_CACHE if the I/O was
2053 *	a write attempt or if it was a successful read.  If the caller
2054 *	intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR
2055 *	prior to issuing the READ.  biodone() will *not* clear B_INVAL.
2056 */
2057struct buf *
2058getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
2059{
2060	struct buf *bp;
2061	int s;
2062	struct bufhashhdr *bh;
2063
2064	if (size > MAXBSIZE)
2065		panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
2066
2067	s = splbio();
2068loop:
2069	/*
2070	 * Block if we are low on buffers.   Certain processes are allowed
2071	 * to completely exhaust the buffer cache.
2072         *
2073         * If this check ever becomes a bottleneck it may be better to
2074         * move it into the else, when gbincore() fails.  At the moment
2075         * it isn't a problem.
2076         */
2077	if (curproc == idleproc || (curproc->p_flag & P_BUFEXHAUST)) {
2078		if (numfreebuffers == 0) {
2079			if (curproc == idleproc)
2080				return NULL;
2081			needsbuffer |= VFS_BIO_NEED_ANY;
2082			tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf",
2083			    slptimeo);
2084		}
2085	} else if (numfreebuffers < lofreebuffers) {
2086		waitfreebuffers(slpflag, slptimeo);
2087	}
2088
2089	if ((bp = gbincore(vp, blkno))) {
2090		/*
2091		 * Buffer is in-core.  If the buffer is not busy, it must
2092		 * be on a queue.
2093		 */
2094
2095		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
2096			if (BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL,
2097			    "getblk", slpflag, slptimeo) == ENOLCK)
2098				goto loop;
2099			splx(s);
2100			return (struct buf *) NULL;
2101		}
2102
2103		/*
2104		 * The buffer is locked.  B_CACHE is cleared if the buffer is
2105		 * invalid.  Otherwise, for a non-VMIO buffer, B_CACHE is set
2106		 * and for a VMIO buffer B_CACHE is adjusted according to the
2107		 * backing VM cache.
2108		 */
2109		if (bp->b_flags & B_INVAL)
2110			bp->b_flags &= ~B_CACHE;
2111		else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0)
2112			bp->b_flags |= B_CACHE;
2113		bremfree(bp);
2114
2115		/*
2116		 * check for size inconsistencies for the non-VMIO case.
2117		 */
2118
2119		if (bp->b_bcount != size) {
2120			if ((bp->b_flags & B_VMIO) == 0 ||
2121			    (size > bp->b_kvasize)) {
2122				if (bp->b_flags & B_DELWRI) {
2123					bp->b_flags |= B_NOCACHE;
2124					BUF_WRITE(bp);
2125				} else {
2126					if ((bp->b_flags & B_VMIO) &&
2127					   (LIST_FIRST(&bp->b_dep) == NULL)) {
2128						bp->b_flags |= B_RELBUF;
2129						brelse(bp);
2130					} else {
2131						bp->b_flags |= B_NOCACHE;
2132						BUF_WRITE(bp);
2133					}
2134				}
2135				goto loop;
2136			}
2137		}
2138
2139		/*
2140		 * If the size is inconsistent in the VMIO case, we can resize
2141		 * the buffer.  This might lead to B_CACHE getting set or
2142		 * cleared.  If the size has not changed, B_CACHE remains
2143		 * unchanged from its previous state.
2144		 */
2145
2146		if (bp->b_bcount != size)
2147			allocbuf(bp, size);
2148
2149		KASSERT(bp->b_offset != NOOFFSET,
2150		    ("getblk: no buffer offset"));
2151
2152		/*
2153		 * A buffer with B_DELWRI set and B_CACHE clear must
2154		 * be committed before we can return the buffer in
2155		 * order to prevent the caller from issuing a read
2156		 * ( due to B_CACHE not being set ) and overwriting
2157		 * it.
2158		 *
2159		 * Most callers, including NFS and FFS, need this to
2160		 * operate properly either because they assume they
2161		 * can issue a read if B_CACHE is not set, or because
2162		 * ( for example ) an uncached B_DELWRI might loop due
2163		 * to softupdates re-dirtying the buffer.  In the latter
2164		 * case, B_CACHE is set after the first write completes,
2165		 * preventing further loops.
2166		 */
2167
2168		if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
2169			BUF_WRITE(bp);
2170			goto loop;
2171		}
2172
2173		splx(s);
2174		bp->b_flags &= ~B_DONE;
2175	} else {
2176		/*
2177		 * Buffer is not in-core, create new buffer.  The buffer
2178		 * returned by getnewbuf() is locked.  Note that the returned
2179		 * buffer is also considered valid (not marked B_INVAL).
2180		 */
2181		int bsize, maxsize, vmio;
2182		off_t offset;
2183
2184		if (vn_isdisk(vp, NULL))
2185			bsize = DEV_BSIZE;
2186		else if (vp->v_mountedhere)
2187			bsize = vp->v_mountedhere->mnt_stat.f_iosize;
2188		else if (vp->v_mount)
2189			bsize = vp->v_mount->mnt_stat.f_iosize;
2190		else
2191			bsize = size;
2192
2193		offset = (off_t)blkno * bsize;
2194		vmio = (VOP_GETVOBJECT(vp, NULL) == 0) && (vp->v_flag & VOBJBUF);
2195		maxsize = vmio ? size + (offset & PAGE_MASK) : size;
2196		maxsize = imax(maxsize, bsize);
2197
2198		if ((bp = getnewbuf(slpflag, slptimeo, size, maxsize)) == NULL) {
2199			if (slpflag || slptimeo) {
2200				splx(s);
2201				return NULL;
2202			}
2203			goto loop;
2204		}
2205
2206		/*
2207		 * This code is used to make sure that a buffer is not
2208		 * created while the getnewbuf routine is blocked.
2209		 * This can be a problem whether the vnode is locked or not.
2210		 * If the buffer is created out from under us, we have to
2211		 * throw away the one we just created.  There is no race
2212		 * window because we are safely running at splbio() from the
2213		 * point of the duplicate buffer creation through to here,
2214		 * and we've locked the buffer.
2215		 */
2216		if (gbincore(vp, blkno)) {
2217			bp->b_flags |= B_INVAL;
2218			brelse(bp);
2219			goto loop;
2220		}
2221
2222		/*
2223		 * Insert the buffer into the hash, so that it can
2224		 * be found by incore.
2225		 */
2226		bp->b_blkno = bp->b_lblkno = blkno;
2227		bp->b_offset = offset;
2228
2229		bgetvp(vp, bp);
2230		LIST_REMOVE(bp, b_hash);
2231		bh = bufhash(vp, blkno);
2232		LIST_INSERT_HEAD(bh, bp, b_hash);
2233
2234		/*
2235		 * Set the B_VMIO bit.  allocbuf() will grow the buffer.  Since the
2236		 * buffer size starts out as 0, B_CACHE will be set by
2237		 * allocbuf() for the VMIO case prior to it testing the
2238		 * backing store for validity.
2239		 */
2240
2241		if (vmio) {
2242			bp->b_flags |= B_VMIO;
2243#if defined(VFS_BIO_DEBUG)
2244			if (vp->v_type != VREG && vp->v_type != VBLK)
2245				printf("getblk: vmioing file type %d???\n", vp->v_type);
2246#endif
2247		} else {
2248			bp->b_flags &= ~B_VMIO;
2249		}
2250
2251		allocbuf(bp, size);
2252
2253		splx(s);
2254		bp->b_flags &= ~B_DONE;
2255	}
2256	return (bp);
2257}
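
/*
 * Condensed usage sketch of the B_CACHE contract documented above; this is
 * roughly what a bread()-style caller does, with error handling and
 * accounting omitted:
 *
 *	bp = getblk(vp, blkno, size, 0, 0);
 *	if ((bp->b_flags & B_CACHE) == 0) {
 *		bp->b_flags &= ~B_INVAL;
 *		bp->b_ioflags &= ~BIO_ERROR;
 *		bp->b_iocmd = BIO_READ;
 *		vfs_busy_pages(bp, 0);
 *		VOP_STRATEGY(vp, bp);
 *		error = bufwait(bp);
 *	}
 */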
2258
2259/*
2260 * Get an empty, disassociated buffer of given size.  The buffer is initially
2261 * set to B_INVAL.
2262 */
2263struct buf *
2264geteblk(int size)
2265{
2266	struct buf *bp;
2267	int s;
2268	int maxsize;
2269
2270	maxsize = (size + BKVAMASK) & ~BKVAMASK;
2271
2272	s = splbio();
2273	while ((bp = getnewbuf(0, 0, size, maxsize)) == 0);
2274	splx(s);
2275	allocbuf(bp, size);
2276	bp->b_flags |= B_INVAL;	/* b_dep cleared by getnewbuf() */
2277	return (bp);
2278}
2279
2280
2281/*
2282 * This code constitutes the buffer memory from either anonymous system
2283 * memory (in the case of non-VMIO operations) or from an associated
2284 * VM object (in the case of VMIO operations).  This code is able to
2285 * resize a buffer up or down.
2286 *
2287 * Note that this code is tricky, and has many complications to resolve
2288 * deadlock or inconsistent data situations.  Tread lightly!!!
2289 * There are B_CACHE and B_DELWRI interactions that must be dealt with by
2290 * the caller.  Calling this code willy nilly can result in the loss of data.
2291 *
2292 * allocbuf() only adjusts B_CACHE for VMIO buffers.  getblk() deals with
2293 * B_CACHE for the non-VMIO case.
2294 */
2295
2296int
2297allocbuf(struct buf *bp, int size)
2298{
2299	int newbsize, mbsize;
2300	int i;
2301
2302	if (BUF_REFCNT(bp) == 0)
2303		panic("allocbuf: buffer not busy");
2304
2305	if (bp->b_kvasize < size)
2306		panic("allocbuf: buffer too small");
2307
2308	if ((bp->b_flags & B_VMIO) == 0) {
2309		caddr_t origbuf;
2310		int origbufsize;
2311		/*
2312		 * Just get anonymous memory from the kernel.  Don't
2313		 * mess with B_CACHE.
2314		 */
2315		mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
2316#if !defined(NO_B_MALLOC)
2317		if (bp->b_flags & B_MALLOC)
2318			newbsize = mbsize;
2319		else
2320#endif
2321			newbsize = round_page(size);
2322
2323		if (newbsize < bp->b_bufsize) {
2324#if !defined(NO_B_MALLOC)
2325			/*
2326			 * malloced buffers are not shrunk
2327			 */
2328			if (bp->b_flags & B_MALLOC) {
2329				if (newbsize) {
2330					bp->b_bcount = size;
2331				} else {
2332					free(bp->b_data, M_BIOBUF);
2333					bufmallocspace -= bp->b_bufsize;
2334					runningbufspace -= bp->b_bufsize;
2335					if (bp->b_bufsize)
2336						bufspacewakeup();
2337					bp->b_data = bp->b_kvabase;
2338					bp->b_bufsize = 0;
2339					bp->b_bcount = 0;
2340					bp->b_flags &= ~B_MALLOC;
2341				}
2342				return 1;
2343			}
2344#endif
2345			vm_hold_free_pages(
2346			    bp,
2347			    (vm_offset_t) bp->b_data + newbsize,
2348			    (vm_offset_t) bp->b_data + bp->b_bufsize);
2349		} else if (newbsize > bp->b_bufsize) {
2350#if !defined(NO_B_MALLOC)
2351			/*
2352			 * We only use malloced memory on the first allocation,
2353			 * and revert to page-allocated memory when the buffer
2354			 * grows.
2355			 */
2356			if ( (bufmallocspace < maxbufmallocspace) &&
2357				(bp->b_bufsize == 0) &&
2358				(mbsize <= PAGE_SIZE/2)) {
2359
2360				bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
2361				bp->b_bufsize = mbsize;
2362				bp->b_bcount = size;
2363				bp->b_flags |= B_MALLOC;
2364				bufmallocspace += mbsize;
2365				runningbufspace += bp->b_bufsize;
2366				return 1;
2367			}
2368#endif
2369			origbuf = NULL;
2370			origbufsize = 0;
2371#if !defined(NO_B_MALLOC)
2372			/*
2373			 * If the buffer is growing on its other-than-first allocation,
2374			 * then we revert to the page-allocation scheme.
2375			 */
2376			if (bp->b_flags & B_MALLOC) {
2377				origbuf = bp->b_data;
2378				origbufsize = bp->b_bufsize;
2379				bp->b_data = bp->b_kvabase;
2380				bufmallocspace -= bp->b_bufsize;
2381				runningbufspace -= bp->b_bufsize;
2382				if (bp->b_bufsize)
2383					bufspacewakeup();
2384				bp->b_bufsize = 0;
2385				bp->b_flags &= ~B_MALLOC;
2386				newbsize = round_page(newbsize);
2387			}
2388#endif
2389			vm_hold_load_pages(
2390			    bp,
2391			    (vm_offset_t) bp->b_data + bp->b_bufsize,
2392			    (vm_offset_t) bp->b_data + newbsize);
2393#if !defined(NO_B_MALLOC)
2394			if (origbuf) {
2395				bcopy(origbuf, bp->b_data, origbufsize);
2396				free(origbuf, M_BIOBUF);
2397			}
2398#endif
2399		}
2400	} else {
2401		vm_page_t m;
2402		int desiredpages;
2403
2404		newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
2405		desiredpages = (size == 0) ? 0 :
2406			num_pages((bp->b_offset & PAGE_MASK) + newbsize);
2407
2408#if !defined(NO_B_MALLOC)
2409		if (bp->b_flags & B_MALLOC)
2410			panic("allocbuf: VMIO buffer can't be malloced");
2411#endif
2412		/*
2413		 * Set B_CACHE initially if buffer is 0 length or will become
2414		 * 0-length.
2415		 */
2416		if (size == 0 || bp->b_bufsize == 0)
2417			bp->b_flags |= B_CACHE;
2418
2419		if (newbsize < bp->b_bufsize) {
2420			/*
2421			 * DEV_BSIZE aligned new buffer size is less than the
2422			 * DEV_BSIZE aligned existing buffer size.  Figure out
2423			 * if we have to remove any pages.
2424			 */
2425			if (desiredpages < bp->b_npages) {
2426				for (i = desiredpages; i < bp->b_npages; i++) {
2427					/*
2428					 * the page is not freed here -- it
2429					 * is the responsibility of
2430					 * vnode_pager_setsize
2431					 */
2432					m = bp->b_pages[i];
2433					KASSERT(m != bogus_page,
2434					    ("allocbuf: bogus page found"));
2435					while (vm_page_sleep_busy(m, TRUE, "biodep"))
2436						;
2437
2438					bp->b_pages[i] = NULL;
2439					vm_page_unwire(m, 0);
2440				}
2441				pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) +
2442				    (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
2443				bp->b_npages = desiredpages;
2444			}
2445		} else if (size > bp->b_bcount) {
2446			/*
2447			 * We are growing the buffer, possibly in a
2448			 * byte-granular fashion.
2449			 */
2450			struct vnode *vp;
2451			vm_object_t obj;
2452			vm_offset_t toff;
2453			vm_offset_t tinc;
2454
2455			/*
2456			 * Step 1, bring in the VM pages from the object,
2457			 * allocating them if necessary.  We must clear
2458			 * B_CACHE if these pages are not valid for the
2459			 * range covered by the buffer.
2460			 */
2461
2462			vp = bp->b_vp;
2463			VOP_GETVOBJECT(vp, &obj);
2464
2465			while (bp->b_npages < desiredpages) {
2466				vm_page_t m;
2467				vm_pindex_t pi;
2468
2469				pi = OFF_TO_IDX(bp->b_offset) + bp->b_npages;
2470				if ((m = vm_page_lookup(obj, pi)) == NULL) {
2471					m = vm_page_alloc(obj, pi, VM_ALLOC_NORMAL);
2472					if (m == NULL) {
2473						VM_WAIT;
2474						vm_pageout_deficit += desiredpages - bp->b_npages;
2475					} else {
2476						vm_page_wire(m);
2477						vm_page_wakeup(m);
2478						bp->b_flags &= ~B_CACHE;
2479						bp->b_pages[bp->b_npages] = m;
2480						++bp->b_npages;
2481					}
2482					continue;
2483				}
2484
2485				/*
2486				 * We found a page.  If we have to sleep on it,
2487				 * retry because it might have gotten freed out
2488				 * from under us.
2489				 *
2490				 * We can only test PG_BUSY here.  Blocking on
2491				 * m->busy might lead to a deadlock:
2492				 *
2493				 *  vm_fault->getpages->cluster_read->allocbuf
2494				 *
2495				 */
2496
2497				if (vm_page_sleep_busy(m, FALSE, "pgtblk"))
2498					continue;
2499
2500				/*
2501				 * We have a good page.  Should we wakeup the
2502				 * page daemon?
2503				 */
2504				if ((curproc != pageproc) &&
2505				    ((m->queue - m->pc) == PQ_CACHE) &&
2506				    ((cnt.v_free_count + cnt.v_cache_count) <
2507					(cnt.v_free_min + cnt.v_cache_min))) {
2508					pagedaemon_wakeup();
2509				}
2510				vm_page_flag_clear(m, PG_ZERO);
2511				vm_page_wire(m);
2512				bp->b_pages[bp->b_npages] = m;
2513				++bp->b_npages;
2514			}
2515
2516			/*
2517			 * Step 2.  We've loaded the pages into the buffer,
2518			 * we have to figure out if we can still have B_CACHE
2519			 * set.  Note that B_CACHE is set according to the
2520			 * byte-granular range ( bcount and size ), not the
2521			 * aligned range ( newbsize ).
2522			 *
2523			 * The VM test is against m->valid, which is DEV_BSIZE
2524			 * aligned.  Needless to say, the validity of the data
2525			 * needs to also be DEV_BSIZE aligned.  Note that this
2526			 * fails with NFS if the server or some other client
2527			 * extends the file's EOF.  If our buffer is resized,
2528			 * B_CACHE may remain set! XXX
2529			 */
2530
2531			toff = bp->b_bcount;
2532			tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK);
2533
2534			while ((bp->b_flags & B_CACHE) && toff < size) {
2535				vm_pindex_t pi;
2536
2537				if (tinc > (size - toff))
2538					tinc = size - toff;
2539
2540				pi = ((bp->b_offset & PAGE_MASK) + toff) >>
2541				    PAGE_SHIFT;
2542
2543				vfs_buf_test_cache(
2544				    bp,
2545				    bp->b_offset,
2546				    toff,
2547				    tinc,
2548				    bp->b_pages[pi]
2549				);
2550				toff += tinc;
2551				tinc = PAGE_SIZE;
2552			}
2553
2554			/*
2555			 * Step 3, fixup the KVM pmap.  Remember that
2556			 * bp->b_data is relative to bp->b_offset, but
2557			 * bp->b_offset may be offset into the first page.
2558			 */
2559
2560			bp->b_data = (caddr_t)
2561			    trunc_page((vm_offset_t)bp->b_data);
2562			pmap_qenter(
2563			    (vm_offset_t)bp->b_data,
2564			    bp->b_pages,
2565			    bp->b_npages
2566			);
2567			bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
2568			    (vm_offset_t)(bp->b_offset & PAGE_MASK));
2569		}
2570	}
2571	runningbufspace += (newbsize - bp->b_bufsize);
2572	if (newbsize < bp->b_bufsize)
2573		bufspacewakeup();
2574	bp->b_bufsize = newbsize;	/* actual buffer allocation	*/
2575	bp->b_bcount = size;		/* requested buffer size	*/
2576	return 1;
2577}
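
/*
 * Worked example of the non-VMIO sizing above (assumed DEV_BSIZE 512,
 * PAGE_SIZE 4096): a request for size 1000 on an empty buffer gives
 * mbsize == 1024, which is <= PAGE_SIZE/2, so the data area may be
 * malloc()ed and B_MALLOC set.  Growing the same buffer to size 3000
 * gives mbsize == 3072 > PAGE_SIZE/2, so the buffer reverts to wired
 * pages: newbsize == round_page(3072) == 4096 is backed via
 * vm_hold_load_pages(), the old contents are bcopy()ed over, and the
 * malloc()ed area is freed.
 */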
2578
2579/*
2580 *	bufwait:
2581 *
2582 *	Wait for buffer I/O completion, returning error status.  The buffer
2583 *	is left locked and B_DONE on return.  B_EINTR is converted into a EINTR
2584 *	error and cleared.
2585 */
2586int
2587bufwait(register struct buf * bp)
2588{
2589	int s;
2590
2591	s = splbio();
2592	while ((bp->b_flags & B_DONE) == 0) {
2593		if (bp->b_iocmd == BIO_READ)
2594			tsleep(bp, PRIBIO, "biord", 0);
2595		else
2596			tsleep(bp, PRIBIO, "biowr", 0);
2597	}
2598	splx(s);
2599	if (bp->b_flags & B_EINTR) {
2600		bp->b_flags &= ~B_EINTR;
2601		return (EINTR);
2602	}
2603	if (bp->b_ioflags & BIO_ERROR) {
2604		return (bp->b_error ? bp->b_error : EIO);
2605	} else {
2606		return (0);
2607	}
2608}
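
/*
 * Typical synchronous caller pattern (sketch, not code from this file):
 * after the strategy call, wait for completion and release the buffer,
 * propagating any error:
 *
 *	error = bufwait(bp);
 *	brelse(bp);
 *	return (error);
 */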
2609
2610 /*
2611  * Call back function from struct bio back up to struct buf.
2612  * The corresponding initialization lives in sys/conf.h:DEV_STRATEGY().
2613  */
2614void
2615bufdonebio(struct bio *bp)
2616{
2617	bufdone(bp->bio_caller2);
2618}
2619
2620/*
2621 *	bufdone:
2622 *
2623 *	Finish I/O on a buffer, optionally calling a completion function.
2624 *	This is usually called from an interrupt so process blocking is
2625 *	not allowed.
2626 *
2627 *	biodone is also responsible for setting B_CACHE in a B_VMIO bp.
2628 *	In a non-VMIO bp, B_CACHE will be set on the next getblk()
2629 *	assuming B_INVAL is clear.
2630 *
2631 *	For the VMIO case, we set B_CACHE if the op was a read and no
2632 *	read error occurred, or if the op was a write.  B_CACHE is never
2633 *	set if the buffer is invalid or otherwise uncacheable.
2634 *
2635 *	biodone does not mess with B_INVAL, allowing the I/O routine or the
2636 *	initiator to leave B_INVAL set to brelse the buffer out of existence
2637 *	in the biodone routine.
2638 */
2639void
2640bufdone(struct buf *bp)
2641{
2642	int s, error;
2643	void    (*biodone) __P((struct buf *));
2644
2645	s = splbio();
2646
2647	KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp, BUF_REFCNT(bp)));
2648	KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
2649
2650	bp->b_flags |= B_DONE;
2651
2652	if (bp->b_iocmd == BIO_DELETE) {
2653		brelse(bp);
2654		splx(s);
2655		return;
2656	}
2657
2658	if (bp->b_iocmd == BIO_WRITE) {
2659		vwakeup(bp);
2660	}
2661
2662	/* call optional completion function if requested */
2663	if (bp->b_iodone != NULL) {
2664		biodone = bp->b_iodone;
2665		bp->b_iodone = NULL;
2666		(*biodone) (bp);
2667		splx(s);
2668		return;
2669	}
2670	if (LIST_FIRST(&bp->b_dep) != NULL)
2671		buf_complete(bp);
2672
2673	if (bp->b_flags & B_VMIO) {
2674		int i, resid;
2675		vm_ooffset_t foff;
2676		vm_page_t m;
2677		vm_object_t obj;
2678		int iosize;
2679		struct vnode *vp = bp->b_vp;
2680
2681		error = VOP_GETVOBJECT(vp, &obj);
2682
2683#if defined(VFS_BIO_DEBUG)
2684		if (vp->v_usecount == 0) {
2685			panic("biodone: zero vnode ref count");
2686		}
2687
2688		if (error) {
2689			panic("biodone: missing VM object");
2690		}
2691
2692		if ((vp->v_flag & VOBJBUF) == 0) {
2693			panic("biodone: vnode is not setup for merged cache");
2694		}
2695#endif
2696
2697		foff = bp->b_offset;
2698		KASSERT(bp->b_offset != NOOFFSET,
2699		    ("biodone: no buffer offset"));
2700
2701		if (error) {
2702			panic("biodone: no object");
2703		}
2704#if defined(VFS_BIO_DEBUG)
2705		if (obj->paging_in_progress < bp->b_npages) {
2706			printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
2707			    obj->paging_in_progress, bp->b_npages);
2708		}
2709#endif
2710
2711		/*
2712		 * Set B_CACHE if the op was a normal read and no error
2713		 * occurred.  B_CACHE is set for writes in the b*write()
2714		 * routines.
2715		 */
2716		iosize = bp->b_bcount - bp->b_resid;
2717		if (bp->b_iocmd == BIO_READ &&
2718		    !(bp->b_flags & (B_INVAL|B_NOCACHE)) &&
2719		    !(bp->b_ioflags & BIO_ERROR)) {
2720			bp->b_flags |= B_CACHE;
2721		}
2722
2723		for (i = 0; i < bp->b_npages; i++) {
2724			int bogusflag = 0;
2725			m = bp->b_pages[i];
2726			if (m == bogus_page) {
2727				bogusflag = 1;
2728				m = vm_page_lookup(obj, OFF_TO_IDX(foff));
2729				if (!m) {
2730#if defined(VFS_BIO_DEBUG)
2731					printf("biodone: page disappeared\n");
2732#endif
2733					vm_object_pip_subtract(obj, 1);
2734					bp->b_flags &= ~B_CACHE;
2735					continue;
2736				}
2737				bp->b_pages[i] = m;
2738				pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
2739			}
2740#if defined(VFS_BIO_DEBUG)
2741			if (OFF_TO_IDX(foff) != m->pindex) {
2742				printf(
2743"biodone: foff(%lu)/m->pindex(%d) mismatch\n",
2744				    (unsigned long)foff, m->pindex);
2745			}
2746#endif
2747			resid = IDX_TO_OFF(m->pindex + 1) - foff;
2748			if (resid > iosize)
2749				resid = iosize;
2750
2751			/*
2752			 * In the write case, the valid and clean bits are
2753			 * already changed correctly ( see bdwrite() ), so we
2754			 * only need to do this here in the read case.
2755			 */
2756			if ((bp->b_iocmd == BIO_READ) && !bogusflag && resid > 0) {
2757				vfs_page_set_valid(bp, foff, i, m);
2758			}
2759			vm_page_flag_clear(m, PG_ZERO);
2760
2761			/*
2762			 * when debugging new filesystems or buffer I/O methods, this
2763			 * is the most common error that pops up.  if you see this, you
2764			 * have not set the page busy flag correctly!!!
2765			 */
2766			if (m->busy == 0) {
2767				printf("biodone: page busy < 0, "
2768				    "pindex: %d, foff: 0x(%x,%x), "
2769				    "resid: %d, index: %d\n",
2770				    (int) m->pindex, (int)(foff >> 32),
2771						(int) foff & 0xffffffff, resid, i);
2772				if (!vn_isdisk(vp, NULL))
2773					printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n",
2774					    bp->b_vp->v_mount->mnt_stat.f_iosize,
2775					    (int) bp->b_lblkno,
2776					    bp->b_flags, bp->b_npages);
2777				else
2778					printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n",
2779					    (int) bp->b_lblkno,
2780					    bp->b_flags, bp->b_npages);
2781				printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n",
2782				    m->valid, m->dirty, m->wire_count);
2783				panic("biodone: page busy < 0\n");
2784			}
2785			vm_page_io_finish(m);
2786			vm_object_pip_subtract(obj, 1);
2787			foff += resid;
2788			iosize -= resid;
2789		}
2790		if (obj)
2791			vm_object_pip_wakeupn(obj, 0);
2792	}
2793	/*
2794	 * For asynchronous completions, release the buffer now. The brelse
2795	 * will do a wakeup there if necessary - so no need to do a wakeup
2796	 * here in the async case. The sync case always needs to do a wakeup.
2797	 */
2798
2799	if (bp->b_flags & B_ASYNC) {
2800		if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR))
2801			brelse(bp);
2802		else
2803			bqrelse(bp);
2804	} else {
2805		wakeup(bp);
2806	}
2807	splx(s);
2808}
2809
2810/*
2811 * This routine is called in lieu of iodone in the case of
2812 * incomplete I/O.  This keeps the busy status for pages
2813 * consistent.
2814 */
2815void
2816vfs_unbusy_pages(struct buf * bp)
2817{
2818	int i;
2819
2820	if (bp->b_flags & B_VMIO) {
2821		struct vnode *vp = bp->b_vp;
2822		vm_object_t obj;
2823
2824		VOP_GETVOBJECT(vp, &obj);
2825
2826		for (i = 0; i < bp->b_npages; i++) {
2827			vm_page_t m = bp->b_pages[i];
2828
2829			if (m == bogus_page) {
2830				m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
2831				if (!m) {
2832					panic("vfs_unbusy_pages: page missing\n");
2833				}
2834				bp->b_pages[i] = m;
2835				pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
2836			}
2837			vm_object_pip_subtract(obj, 1);
2838			vm_page_flag_clear(m, PG_ZERO);
2839			vm_page_io_finish(m);
2840		}
2841		vm_object_pip_wakeupn(obj, 0);
2842	}
2843}
2844
2845/*
2846 * vfs_page_set_valid:
2847 *
2848 *	Set the valid bits in a page based on the supplied offset.   The
2849 *	range is restricted to the buffer's size.
2850 *
2851 *	This routine is typically called after a read completes.
2852 */
2853static void
2854vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
2855{
2856	vm_ooffset_t soff, eoff;
2857
2858	/*
2859	 * Start and end offsets in buffer.  eoff - soff may not cross a
2860	 * page boundry or cross the end of the buffer.  The end of the
2861	 * page boundary or cross the end of the buffer.  The end of the
2862	 * of the buffer.
2863	 */
2864	soff = off;
2865	eoff = (off + PAGE_SIZE) & ~PAGE_MASK;
2866	if (eoff > bp->b_offset + bp->b_bcount)
2867		eoff = bp->b_offset + bp->b_bcount;
2868
2869	/*
2870	 * Set valid range.  This is typically the entire buffer and thus the
2871	 * entire page.
2872	 */
2873	if (eoff > soff) {
2874		vm_page_set_validclean(
2875		    m,
2876		   (vm_offset_t) (soff & PAGE_MASK),
2877		   (vm_offset_t) (eoff - soff)
2878		);
2879	}
2880}
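
/*
 * Worked example of the clamping above (assumed 4K pages): with
 * bp->b_offset == 4096, bp->b_bcount == 3072 and off == 6144, eoff is
 * first rounded up to 8192 and then clamped to 7168 (buffer EOF).
 * vm_page_set_validclean() is then called with offset
 * (soff & PAGE_MASK) == 2048 and length (eoff - soff) == 1024, validating
 * bytes 2048..3071 of that page.
 */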
2881
2882/*
2883 * This routine is called before a device strategy routine.
2884 * It is used to tell the VM system that paging I/O is in
2885 * progress, and treat the pages associated with the buffer
2886 * almost as being PG_BUSY.  Also the object paging_in_progress
2887 * flag is handled to make sure that the object doesn't become
2888 * inconsistent.
2889 *
2890 * Since I/O has not been initiated yet, certain buffer flags
2891 * such as BIO_ERROR or B_INVAL may be in an inconsistent state
2892 * and should be ignored.
2893 */
2894void
2895vfs_busy_pages(struct buf * bp, int clear_modify)
2896{
2897	int i, bogus;
2898
2899	if (bp->b_flags & B_VMIO) {
2900		struct vnode *vp = bp->b_vp;
2901		vm_object_t obj;
2902		vm_ooffset_t foff;
2903
2904		VOP_GETVOBJECT(vp, &obj);
2905		foff = bp->b_offset;
2906		KASSERT(bp->b_offset != NOOFFSET,
2907		    ("vfs_busy_pages: no buffer offset"));
2908		vfs_setdirty(bp);
2909
2910retry:
2911		for (i = 0; i < bp->b_npages; i++) {
2912			vm_page_t m = bp->b_pages[i];
2913			if (vm_page_sleep_busy(m, FALSE, "vbpage"))
2914				goto retry;
2915		}
2916
2917		bogus = 0;
2918		for (i = 0; i < bp->b_npages; i++) {
2919			vm_page_t m = bp->b_pages[i];
2920
2921			vm_page_flag_clear(m, PG_ZERO);
2922			if ((bp->b_flags & B_CLUSTER) == 0) {
2923				vm_object_pip_add(obj, 1);
2924				vm_page_io_start(m);
2925			}
2926
2927			/*
2928			 * When readying a buffer for a read ( i.e
2929			 * clear_modify == 0 ), it is important to do
2930			 * bogus_page replacement for valid pages in
2931			 * partially instantiated buffers.  Partially
2932			 * instantiated buffers can, in turn, occur when
2933			 * reconstituting a buffer from its VM backing store
2934			 * base.  We only have to do this if B_CACHE is
2935			 * clear ( which causes the I/O to occur in the
2936			 * first place ).  The replacement prevents the read
2937			 * I/O from overwriting potentially dirty VM-backed
2938			 * pages.  XXX bogus page replacement is, uh, bogus.
2939			 * It may not work properly with small-block devices.
2940			 * We need to find a better way.
2941			 */
2942
2943			vm_page_protect(m, VM_PROT_NONE);
2944			if (clear_modify)
2945				vfs_page_set_valid(bp, foff, i, m);
2946			else if (m->valid == VM_PAGE_BITS_ALL &&
2947				(bp->b_flags & B_CACHE) == 0) {
2948				bp->b_pages[i] = bogus_page;
2949				bogus++;
2950			}
2951			foff = (foff + PAGE_SIZE) & ~PAGE_MASK;
2952		}
2953		if (bogus)
2954			pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
2955	}
2956}
2957
2958/*
2959 * Tell the VM system that the pages associated with this buffer
2960 * are clean.  This is used for delayed writes where the data is
2961 * going to go to disk eventually without additional VM intevention.
2962 * going to go to disk eventually without additional VM intervention.
2963 * Note that while we only really need to clean through to b_bcount, we
2964 * just go ahead and clean through to b_bufsize.
2965 */
2966static void
2967vfs_clean_pages(struct buf * bp)
2968{
2969	int i;
2970
2971	if (bp->b_flags & B_VMIO) {
2972		vm_ooffset_t foff;
2973
2974		foff = bp->b_offset;
2975		KASSERT(bp->b_offset != NOOFFSET,
2976		    ("vfs_clean_pages: no buffer offset"));
2977		for (i = 0; i < bp->b_npages; i++) {
2978			vm_page_t m = bp->b_pages[i];
2979			vm_ooffset_t noff = (foff + PAGE_SIZE) & ~PAGE_MASK;
2980			vm_ooffset_t eoff = noff;
2981
2982			if (eoff > bp->b_offset + bp->b_bufsize)
2983				eoff = bp->b_offset + bp->b_bufsize;
2984			vfs_page_set_valid(bp, foff, i, m);
2985			/* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */
2986			foff = noff;
2987		}
2988	}
2989}
2990
2991/*
2992 *	vfs_bio_set_validclean:
2993 *
2994 *	Set the range within the buffer to valid and clean.  The range is
2995 *	relative to the beginning of the buffer, b_offset.  Note that b_offset
2996 *	itself may be offset from the beginning of the first page.
2997 */
2998
2999void
3000vfs_bio_set_validclean(struct buf *bp, int base, int size)
3001{
3002	if (bp->b_flags & B_VMIO) {
3003		int i;
3004		int n;
3005
3006		/*
3007		 * Fixup base to be relative to beginning of first page.
3008		 * Set initial n to be the maximum number of bytes in the
3009		 * first page that can be validated.
3010		 */
3011
3012		base += (bp->b_offset & PAGE_MASK);
3013		n = PAGE_SIZE - (base & PAGE_MASK);
3014
3015		for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
3016			vm_page_t m = bp->b_pages[i];
3017
3018			if (n > size)
3019				n = size;
3020
3021			vm_page_set_validclean(m, base & PAGE_MASK, n);
3022			base += n;
3023			size -= n;
3024			n = PAGE_SIZE;
3025		}
3026	}
3027}
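
/*
 * Worked example (assumed 4K pages): with (bp->b_offset & PAGE_MASK) == 512,
 * base == 100 and size == 5000, base becomes 612 and the first page gets
 * vm_page_set_validclean(m, 612, 3484); the loop then continues with
 * base == 4096 and size == 1516, marking bytes 0..1515 of the second page.
 */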
3028
3029/*
3030 *	vfs_bio_clrbuf:
3031 *
3032 *	clear a buffer.  This routine essentially fakes an I/O, so we need
3033 *	to clear BIO_ERROR and B_INVAL.
3034 *
3035 *	Note that while we only theoretically need to clear through b_bcount,
3036 *	we go ahead and clear through b_bufsize.
3037 */
3038
3039void
3040vfs_bio_clrbuf(struct buf *bp) {
3041	int i, mask = 0;
3042	caddr_t sa, ea;
3043	if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) {
3044		bp->b_flags &= ~B_INVAL;
3045		bp->b_ioflags &= ~BIO_ERROR;
3046		if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
3047		    (bp->b_offset & PAGE_MASK) == 0) {
3048			mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
3049			if (((bp->b_pages[0]->flags & PG_ZERO) == 0) &&
3050			    ((bp->b_pages[0]->valid & mask) != mask)) {
3051				bzero(bp->b_data, bp->b_bufsize);
3052			}
3053			bp->b_pages[0]->valid |= mask;
3054			bp->b_resid = 0;
3055			return;
3056		}
3057		ea = sa = bp->b_data;
3058		for(i=0;i<bp->b_npages;i++,sa=ea) {
3059			int j = ((vm_offset_t)sa & PAGE_MASK) / DEV_BSIZE;
3060			ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE);
3061			ea = (caddr_t)(vm_offset_t)ulmin(
3062			    (u_long)(vm_offset_t)ea,
3063			    (u_long)(vm_offset_t)bp->b_data + bp->b_bufsize);
3064			mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
3065			if ((bp->b_pages[i]->valid & mask) == mask)
3066				continue;
3067			if ((bp->b_pages[i]->valid & mask) == 0) {
3068				if ((bp->b_pages[i]->flags & PG_ZERO) == 0) {
3069					bzero(sa, ea - sa);
3070				}
3071			} else {
3072				for (; sa < ea; sa += DEV_BSIZE, j++) {
3073					if (((bp->b_pages[i]->flags & PG_ZERO) == 0) &&
3074						(bp->b_pages[i]->valid & (1<<j)) == 0)
3075						bzero(sa, DEV_BSIZE);
3076				}
3077			}
3078			bp->b_pages[i]->valid |= mask;
3079			vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
3080		}
3081		bp->b_resid = 0;
3082	} else {
3083		clrbuf(bp);
3084	}
3085}
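
/*
 * Worked example of the single-page fast path above (assumed DEV_BSIZE 512):
 * for a 2048 byte, page-aligned buffer, mask == (1 << 4) - 1 == 0xf, i.e.
 * the four DEV_BSIZE chunks covered by the buffer.  If those valid bits are
 * not all set and the page is not PG_ZERO, the data area is bzero()ed and
 * the bits are then marked valid.
 */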
3086
3087/*
3088 * vm_hold_load_pages and vm_hold_free_pages get pages into
3089 * a buffer's address space.  The pages are anonymous and are
3090 * not associated with a file object.
3091 */
3092void
3093vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
3094{
3095	vm_offset_t pg;
3096	vm_page_t p;
3097	int index;
3098
3099	to = round_page(to);
3100	from = round_page(from);
3101	index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
3102
3103	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
3104
3105tryagain:
3106
3107		p = vm_page_alloc(kernel_object,
3108			((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
3109		    VM_ALLOC_NORMAL);
3110		if (!p) {
3111			vm_pageout_deficit += (to - from) >> PAGE_SHIFT;
3112			VM_WAIT;
3113			goto tryagain;
3114		}
3115		vm_page_wire(p);
3116		p->valid = VM_PAGE_BITS_ALL;
3117		vm_page_flag_clear(p, PG_ZERO);
3118		pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
3119		bp->b_pages[index] = p;
3120		vm_page_wakeup(p);
3121	}
3122	bp->b_npages = index;
3123}
3124
3125void
3126vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
3127{
3128	vm_offset_t pg;
3129	vm_page_t p;
3130	int index, newnpages;
3131
3132	from = round_page(from);
3133	to = round_page(to);
3134	newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
3135
3136	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
3137		p = bp->b_pages[index];
3138		if (p && (index < bp->b_npages)) {
3139			if (p->busy) {
3140				printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n",
3141					bp->b_blkno, bp->b_lblkno);
3142			}
3143			bp->b_pages[index] = NULL;
3144			pmap_kremove(pg);
3145			vm_page_busy(p);
3146			vm_page_unwire(p, 0);
3147			vm_page_free(p);
3148		}
3149	}
3150	bp->b_npages = newnpages;
3151}
3152
3153
3154#include "opt_ddb.h"
3155#ifdef DDB
3156#include <ddb/ddb.h>
3157
3158DB_SHOW_COMMAND(buffer, db_show_buffer)
3159{
3160	/* get args */
3161	struct buf *bp = (struct buf *)addr;
3162
3163	if (!have_addr) {
3164		db_printf("usage: show buffer <addr>\n");
3165		return;
3166	}
3167
3168	db_printf("b_flags = 0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS);
3169	db_printf("b_error = %d, b_bufsize = %ld, b_bcount = %ld, "
3170		  "b_resid = %ld\nb_dev = (%d,%d), b_data = %p, "
3171		  "b_blkno = %d, b_pblkno = %d\n",
3172		  bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
3173		  major(bp->b_dev), minor(bp->b_dev),
3174		  bp->b_data, bp->b_blkno, bp->b_pblkno);
3175	if (bp->b_npages) {
3176		int i;
3177		db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
3178		for (i = 0; i < bp->b_npages; i++) {
3179			vm_page_t m;
3180			m = bp->b_pages[i];
3181			db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object,
3182			    (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m));
3183			if ((i + 1) < bp->b_npages)
3184				db_printf(",");
3185		}
3186		db_printf("\n");
3187	}
3188}
3189#endif /* DDB */
3190