vfs_bio.c revision 40764
1/*
2 * Copyright (c) 1994,1997 John S. Dyson
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice immediately at the beginning of the file, without modification,
10 *    this list of conditions, and the following disclaimer.
11 * 2. Absolutely no warranty of function or purpose is made by the author
12 *		John S. Dyson.
13 *
14 * $Id: vfs_bio.c,v 1.182 1998/10/29 11:04:22 dg Exp $
15 */
16
17/*
18 * this file contains a new buffer I/O scheme implementing a coherent
19 * VM object and buffer cache scheme.  Pains have been taken to make
20 * sure that the performance degradation associated with schemes such
21 * as this is not realized.
22 *
23 * Author:  John S. Dyson
24 * Significant help during the development and debugging phases
25 * had been provided by David Greenman, also of the FreeBSD core team.
26 */
27
28#define VMIO
29#include <sys/param.h>
30#include <sys/systm.h>
31#include <sys/sysproto.h>
32#include <sys/kernel.h>
33#include <sys/sysctl.h>
34#include <sys/proc.h>
35#include <sys/vnode.h>
36#include <sys/vmmeter.h>
37#include <sys/lock.h>
38#include <miscfs/specfs/specdev.h>
39#include <vm/vm.h>
40#include <vm/vm_param.h>
41#include <vm/vm_prot.h>
42#include <vm/vm_kern.h>
43#include <vm/vm_pageout.h>
44#include <vm/vm_page.h>
45#include <vm/vm_object.h>
46#include <vm/vm_extern.h>
47#include <vm/vm_map.h>
48#include <sys/buf.h>
49#include <sys/mount.h>
50#include <sys/malloc.h>
51#include <sys/resourcevar.h>
52
53static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer");
54
55struct	bio_ops bioops;		/* I/O operation notification */
56
57#if 0 	/* replaced by sched_sync */
58static void vfs_update __P((void));
59static struct	proc *updateproc;
60static struct kproc_desc up_kp = {
61	"update",
62	vfs_update,
63	&updateproc
64};
65SYSINIT_KT(update, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
66#endif
67
68struct buf *buf;		/* buffer header pool */
69struct swqueue bswlist;
70
71static void vm_hold_free_pages(struct buf * bp, vm_offset_t from,
72		vm_offset_t to);
73static void vm_hold_load_pages(struct buf * bp, vm_offset_t from,
74		vm_offset_t to);
75static void vfs_buf_set_valid(struct buf *bp, vm_ooffset_t foff,
76			      vm_offset_t off, vm_offset_t size,
77			      vm_page_t m);
78static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off,
79			       int pageno, vm_page_t m);
80static void vfs_clean_pages(struct buf * bp);
81static void vfs_setdirty(struct buf *bp);
82static void vfs_vmio_release(struct buf *bp);
83static void flushdirtybuffers(int slpflag, int slptimeo);
84
85int needsbuffer;
86
87/*
88 * Internal update daemon, process 3
89 *	The variable vfs_update_wakeup allows for internal syncs.
90 */
91int vfs_update_wakeup;
92
93
94/*
95 * buffers base kva
96 */
97
98/*
99 * bogus page -- for I/O to/from partially complete buffers
100 * this is a temporary solution to the problem, but it is not
101 * really that bad.  it would be better to split the buffer
102 * for input in the case of buffers partially already in memory,
103 * but the code is intricate enough already.
104 */
105vm_page_t bogus_page;
106static vm_offset_t bogus_offset;
107
108static int bufspace, maxbufspace, vmiospace, maxvmiobufspace,
109	bufmallocspace, maxbufmallocspace;
110int numdirtybuffers;
111static int lodirtybuffers, hidirtybuffers;
112static int numfreebuffers, lofreebuffers, hifreebuffers;
113static int kvafreespace;
114
115SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD,
116	&numdirtybuffers, 0, "");
117SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW,
118	&lodirtybuffers, 0, "");
119SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW,
120	&hidirtybuffers, 0, "");
121SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD,
122	&numfreebuffers, 0, "");
123SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW,
124	&lofreebuffers, 0, "");
125SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW,
126	&hifreebuffers, 0, "");
127SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW,
128	&maxbufspace, 0, "");
129SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD,
130	&bufspace, 0, "");
131SYSCTL_INT(_vfs, OID_AUTO, maxvmiobufspace, CTLFLAG_RW,
132	&maxvmiobufspace, 0, "");
133SYSCTL_INT(_vfs, OID_AUTO, vmiospace, CTLFLAG_RD,
134	&vmiospace, 0, "");
135SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW,
136	&maxbufmallocspace, 0, "");
137SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD,
138	&bufmallocspace, 0, "");
139SYSCTL_INT(_vfs, OID_AUTO, kvafreespace, CTLFLAG_RD,
140	&kvafreespace, 0, "");
141
142static LIST_HEAD(bufhashhdr, buf) bufhashtbl[BUFHSZ], invalhash;
143struct bqueues bufqueues[BUFFER_QUEUES] = {0};
144
145extern int vm_swap_size;
146
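/*
 * BUF_MAXUSE caps the b_usecount aging counter: getblk() bumps a buffer's
 * use count (up to this limit) each time the buffer is found in the cache,
 * and getnewbuf() tends to skip over LRU buffers that still have a nonzero
 * use count, so recently re-used buffers are recycled last.
 */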
147#define BUF_MAXUSE 24
148
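/*
 * Bits for the needsbuffer flag word.  VFS_BIO_NEED_ANY means a thread is
 * sleeping for any buffer at all, VFS_BIO_NEED_LOWLIMIT means a thread is
 * waiting for the dirty-buffer count to drop below lodirtybuffers, and
 * VFS_BIO_NEED_FREE means a thread is waiting for the free-buffer count to
 * recover (see vfs_bio_need_satisfy() and waitfreebuffers()).
 */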
149#define VFS_BIO_NEED_ANY 1
150#define VFS_BIO_NEED_LOWLIMIT 2
151#define VFS_BIO_NEED_FREE 4
152
153/*
154 * Initialize buffer headers and related structures.
155 */
156void
157bufinit()
158{
159	struct buf *bp;
160	int i;
161
162	TAILQ_INIT(&bswlist);
163	LIST_INIT(&invalhash);
164
165	/* first, make a null hash table */
166	for (i = 0; i < BUFHSZ; i++)
167		LIST_INIT(&bufhashtbl[i]);
168
169	/* next, make a null set of free lists */
170	for (i = 0; i < BUFFER_QUEUES; i++)
171		TAILQ_INIT(&bufqueues[i]);
172
173	/* finally, initialize each buffer header and stick on empty q */
174	for (i = 0; i < nbuf; i++) {
175		bp = &buf[i];
176		bzero(bp, sizeof *bp);
177		bp->b_flags = B_INVAL;	/* we're just an empty header */
178		bp->b_dev = NODEV;
179		bp->b_rcred = NOCRED;
180		bp->b_wcred = NOCRED;
181		bp->b_qindex = QUEUE_EMPTY;
182		bp->b_vnbufs.le_next = NOLIST;
183		LIST_INIT(&bp->b_dep);
184		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
185		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
186	}
187/*
188 * maxbufspace is currently calculated to support all filesystem blocks
189 * to be 8K.  If you happen to use a 16K filesystem, the size of the buffer
190 * cache is still the same as it would be for 8K filesystems.  This
191 * keeps the size of the buffer cache "in check" for big block filesystems.
192 */
193	maxbufspace = (nbuf + 8) * DFLTBSIZE;
194/*
195 * reserve 1/3 of the buffers for metadata (VDIR) which might not be VMIO'ed
196 */
197	maxvmiobufspace = 2 * maxbufspace / 3;
198/*
199 * Limit the amount of malloc memory since it is wired permanently into
200 * the kernel space.  Even though this is accounted for in the buffer
201 * allocation, we don't want the malloced region to grow uncontrolled.
202 * The malloc scheme improves memory utilization significantly on average
203 * (small) directories.
204 */
205	maxbufmallocspace = maxbufspace / 20;
206
207/*
208 * Remove the probability of deadlock conditions by limiting the
209 * number of dirty buffers.
210 */
211	hidirtybuffers = nbuf / 8 + 20;
212	lodirtybuffers = nbuf / 16 + 10;
213	numdirtybuffers = 0;
214	lofreebuffers = nbuf / 18 + 5;
215	hifreebuffers = 2 * lofreebuffers;
216	numfreebuffers = nbuf;
217	kvafreespace = 0;
218
219	bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
220	bogus_page = vm_page_alloc(kernel_object,
221			((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
222			VM_ALLOC_NORMAL);
223
224}
225
226/*
227 * Free the kva allocation for a buffer
228 * Must be called only at splbio or higher,
229 *  as this is the only locking for buffer_map.
230 */
231static void
232bfreekva(struct buf * bp)
233{
234	if (bp->b_kvasize == 0)
235		return;
236
237	vm_map_delete(buffer_map,
238		(vm_offset_t) bp->b_kvabase,
239		(vm_offset_t) bp->b_kvabase + bp->b_kvasize);
240
241	bp->b_kvasize = 0;
242
243}
244
245/*
246 * remove the buffer from the appropriate free list
247 */
248void
249bremfree(struct buf * bp)
250{
251	int s = splbio();
252
253	if (bp->b_qindex != QUEUE_NONE) {
254		if (bp->b_qindex == QUEUE_EMPTY) {
255			kvafreespace -= bp->b_kvasize;
256		}
257		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
258		bp->b_qindex = QUEUE_NONE;
259	} else {
260#if !defined(MAX_PERF)
261		panic("bremfree: removing a buffer when not on a queue");
262#endif
263	}
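	/*
	 * Leaving a queue only counts against the free-buffer pool if the
	 * buffer was counted as free in the first place: invalid buffers
	 * always count, while dirty or locked buffers never did.
	 */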
264	if ((bp->b_flags & B_INVAL) ||
265		(bp->b_flags & (B_DELWRI|B_LOCKED)) == 0)
266		--numfreebuffers;
267	splx(s);
268}
269
270
271/*
272 * Get a buffer with the specified data.  Look in the cache first.
273 */
274int
275bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
276    struct buf ** bpp)
277{
278	struct buf *bp;
279
280	bp = getblk(vp, blkno, size, 0, 0);
281	*bpp = bp;
282
283	/* if not found in cache, do some I/O */
284	if ((bp->b_flags & B_CACHE) == 0) {
285		if (curproc != NULL)
286			curproc->p_stats->p_ru.ru_inblock++;
287		bp->b_flags |= B_READ;
288		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
289		if (bp->b_rcred == NOCRED) {
290			if (cred != NOCRED)
291				crhold(cred);
292			bp->b_rcred = cred;
293		}
294		vfs_busy_pages(bp, 0);
295		VOP_STRATEGY(vp, bp);
296		return (biowait(bp));
297	}
298	return (0);
299}
300
301/*
302 * Operates like bread, but also starts asynchronous I/O on
303 * read-ahead blocks.
304 */
305int
306breadn(struct vnode * vp, daddr_t blkno, int size,
307    daddr_t * rablkno, int *rabsize,
308    int cnt, struct ucred * cred, struct buf ** bpp)
309{
310	struct buf *bp, *rabp;
311	int i;
312	int rv = 0, readwait = 0;
313
314	*bpp = bp = getblk(vp, blkno, size, 0, 0);
315
316	/* if not found in cache, do some I/O */
317	if ((bp->b_flags & B_CACHE) == 0) {
318		if (curproc != NULL)
319			curproc->p_stats->p_ru.ru_inblock++;
320		bp->b_flags |= B_READ;
321		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
322		if (bp->b_rcred == NOCRED) {
323			if (cred != NOCRED)
324				crhold(cred);
325			bp->b_rcred = cred;
326		}
327		vfs_busy_pages(bp, 0);
328		VOP_STRATEGY(vp, bp);
329		++readwait;
330	}
331	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
332		if (inmem(vp, *rablkno))
333			continue;
334		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
335
336		if ((rabp->b_flags & B_CACHE) == 0) {
337			if (curproc != NULL)
338				curproc->p_stats->p_ru.ru_inblock++;
339			rabp->b_flags |= B_READ | B_ASYNC;
340			rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
341			if (rabp->b_rcred == NOCRED) {
342				if (cred != NOCRED)
343					crhold(cred);
344				rabp->b_rcred = cred;
345			}
346			vfs_busy_pages(rabp, 0);
347			VOP_STRATEGY(vp, rabp);
348		} else {
349			brelse(rabp);
350		}
351	}
352
353	if (readwait) {
354		rv = biowait(bp);
355	}
356	return (rv);
357}
358
359/*
360 * Write, release buffer on completion.  (Done by iodone
361 * if async.)
362 */
363int
364bwrite(struct buf * bp)
365{
366	int oldflags, s;
367	struct vnode *vp;
368	struct mount *mp;
369
370
371	if (bp->b_flags & B_INVAL) {
372		brelse(bp);
373		return (0);
374	}
375
376	oldflags = bp->b_flags;
377
378#if !defined(MAX_PERF)
379	if ((bp->b_flags & B_BUSY) == 0)
380		panic("bwrite: buffer is not busy???");
381#endif
382
383	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
384	bp->b_flags |= B_WRITEINPROG;
385
386	s = splbio();
387	if ((oldflags & B_DELWRI) == B_DELWRI) {
388		--numdirtybuffers;
389		reassignbuf(bp, bp->b_vp);
390	}
391
392	bp->b_vp->v_numoutput++;
393	vfs_busy_pages(bp, 1);
394	if (curproc != NULL)
395		curproc->p_stats->p_ru.ru_oublock++;
396	splx(s);
397	VOP_STRATEGY(bp->b_vp, bp);
398
399	/*
400	 * Collect statistics on synchronous and asynchronous writes.
401	 * Writes to block devices are charged to their associated
402	 * filesystem (if any).
403	 */
404	if ((vp = bp->b_vp) != NULL) {
405		if (vp->v_type == VBLK)
406			mp = vp->v_specmountpoint;
407		else
408			mp = vp->v_mount;
409		if (mp != NULL)
410			if ((oldflags & B_ASYNC) == 0)
411				mp->mnt_stat.f_syncwrites++;
412			else
413				mp->mnt_stat.f_asyncwrites++;
414	}
415
416	if ((oldflags & B_ASYNC) == 0) {
417		int rtval = biowait(bp);
418		brelse(bp);
419		return (rtval);
420	}
421	return (0);
422}
423
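/*
 * A buffer has just been returned to the free pool.  Bump the free-buffer
 * count and, if anyone is sleeping on needsbuffer, clear whichever of the
 * VFS_BIO_NEED_* conditions are now satisfied and wake the sleepers.
 */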
424__inline void
425vfs_bio_need_satisfy(void) {
426	++numfreebuffers;
427	if (!needsbuffer)
428		return;
429	if (numdirtybuffers < lodirtybuffers) {
430		needsbuffer &= ~(VFS_BIO_NEED_ANY | VFS_BIO_NEED_LOWLIMIT);
431	} else {
432		needsbuffer &= ~VFS_BIO_NEED_ANY;
433	}
434	if (numfreebuffers >= hifreebuffers) {
435		needsbuffer &= ~VFS_BIO_NEED_FREE;
436	}
437	wakeup(&needsbuffer);
438}
439
440/*
441 * Delayed write. (Buffer is marked dirty).
442 */
443void
444bdwrite(struct buf * bp)
445{
446	struct vnode *vp;
447
448#if !defined(MAX_PERF)
449	if ((bp->b_flags & B_BUSY) == 0) {
450		panic("bdwrite: buffer is not busy");
451	}
452#endif
453
454	if (bp->b_flags & B_INVAL) {
455		brelse(bp);
456		return;
457	}
458	bp->b_flags &= ~(B_READ|B_RELBUF);
459	if ((bp->b_flags & B_DELWRI) == 0) {
460		bp->b_flags |= B_DONE | B_DELWRI;
461		reassignbuf(bp, bp->b_vp);
462		++numdirtybuffers;
463	}
464
465	/*
466	 * This bmap keeps the system from needing to do the bmap later,
467	 * perhaps when the system is attempting to do a sync.  Since it
468	 * is likely that the indirect block -- or whatever other data structure
469	 * the filesystem needs -- is still in memory now, it is a good
470	 * thing to do this.  Note also, that if the pageout daemon is
471	 * requesting a sync -- there might not be enough memory to do
472	 * the bmap then...  So, this is important to do.
473	 */
474	if (bp->b_lblkno == bp->b_blkno) {
475		VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
476	}
477
478	/*
479	 * Set the *dirty* buffer range based upon the VM system dirty pages.
480	 */
481	vfs_setdirty(bp);
482
483	/*
484	 * We need to do this here to satisfy the vnode_pager and the
485	 * pageout daemon, so that it thinks that the pages have been
486	 * "cleaned".  Note that since the pages are in a delayed write
487	 * buffer -- the VFS layer "will" see that the pages get written
488	 * out on the next sync, or perhaps the cluster will be completed.
489	 */
490	vfs_clean_pages(bp);
491	bqrelse(bp);
492
493	/*
494	 * XXX The soft dependency code is not prepared to
495	 * have I/O done when a bdwrite is requested. For
496	 * now we just let the write be delayed if it is
497	 * requested by the soft dependency code.
498	 */
499	if ((vp = bp->b_vp) &&
500	    (vp->v_type == VBLK && vp->v_specmountpoint &&
501	    (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP)) ||
502	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP)))
503		return;
504
505	if (numdirtybuffers >= hidirtybuffers)
506		flushdirtybuffers(0, 0);
507
508	return;
509}
510
511
512/*
513 * Same as first half of bdwrite, mark buffer dirty, but do not release it.
514 * Check how this compares with vfs_setdirty(); XXX [JRE]
515 */
516void
517bdirty(bp)
518      struct buf *bp;
519{
520
521	bp->b_flags &= ~(B_READ|B_RELBUF); /* XXX ??? check this */
522	if ((bp->b_flags & B_DELWRI) == 0) {
523		bp->b_flags |= B_DONE | B_DELWRI; /* why done? XXX JRE */
524		reassignbuf(bp, bp->b_vp);
525		++numdirtybuffers;
526	}
527}
528
529/*
530 * Asynchronous write.
531 * Start output on a buffer, but do not wait for it to complete.
532 * The buffer is released when the output completes.
533 */
534void
535bawrite(struct buf * bp)
536{
537	bp->b_flags |= B_ASYNC;
538	(void) VOP_BWRITE(bp);
539}
540
541/*
542 * Ordered write.
543 * Start output on a buffer, and flag it so that the device will write
544 * it in the order it was queued.  The buffer is released when the output
545 * completes.
546 */
547int
548bowrite(struct buf * bp)
549{
550	bp->b_flags |= B_ORDERED|B_ASYNC;
551	return (VOP_BWRITE(bp));
552}
553
554/*
555 * Release a buffer.
556 */
557void
558brelse(struct buf * bp)
559{
560	int s;
561
562	if (bp->b_flags & B_CLUSTER) {
563		relpbuf(bp);
564		return;
565	}
566
567	s = splbio();
568
569	/* anyone need this block? */
570	if (bp->b_flags & B_WANTED) {
571		bp->b_flags &= ~(B_WANTED | B_AGE);
572		wakeup(bp);
573	}
574
575	if (bp->b_flags & B_LOCKED)
576		bp->b_flags &= ~B_ERROR;
577
578	if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_FREEBUF)) ||
579	    (bp->b_bufsize <= 0)) {
580		bp->b_flags |= B_INVAL;
581		if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
582			(*bioops.io_deallocate)(bp);
583		if (bp->b_flags & B_DELWRI)
584			--numdirtybuffers;
585		bp->b_flags &= ~(B_DELWRI | B_CACHE | B_FREEBUF);
586		if ((bp->b_flags & B_VMIO) == 0) {
587			if (bp->b_bufsize)
588				allocbuf(bp, 0);
589			if (bp->b_vp)
590				brelvp(bp);
591		}
592	}
593
594	/*
595	 * We must clear B_RELBUF if B_DELWRI is set.  If vfs_vmio_release()
596	 * is called with B_DELWRI set, the underlying pages may wind up
597	 * getting freed causing a previous write (bdwrite()) to get 'lost'
598	 * because pages associated with a B_DELWRI bp are marked clean.
599	 *
600	 * We still allow the B_INVAL case to call vfs_vmio_release(), even
601	 * if B_DELWRI is set.
602	 */
603
604	if (bp->b_flags & B_DELWRI)
605		bp->b_flags &= ~B_RELBUF;
606
607	/*
608	 * VMIO buffer rundown.  It is not strictly necessary to keep a VMIO buffer
609	 * constituted, so the B_INVAL flag is used to *invalidate* the buffer,
610	 * but the VM object is kept around.  The B_NOCACHE flag is used to
611	 * invalidate the pages in the VM object.
612	 *
613	 * If the buffer is a partially filled NFS buffer, keep it
614	 * since invalidating it now will lose information.  The valid
615	 * flags in the vm_pages have only DEV_BSIZE resolution but
616	 * the b_validoff, b_validend fields have byte resolution.
617	 * This can avoid unnecessary re-reads of the buffer.
618	 * XXX this seems to cause performance problems.
619	 */
620	if ((bp->b_flags & B_VMIO)
621	    && !(bp->b_vp->v_tag == VT_NFS &&
622		 bp->b_vp->v_type != VBLK &&
623		 (bp->b_flags & B_DELWRI) != 0)
624#ifdef notdef
625	    && (bp->b_vp->v_tag != VT_NFS
626		|| bp->b_vp->v_type == VBLK
627		|| (bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR))
628		|| bp->b_validend == 0
629		|| (bp->b_validoff == 0
630		    && bp->b_validend == bp->b_bufsize))
631#endif
632	    ) {
633
634		int i, j, resid;
635		vm_page_t m;
636		off_t foff;
637		vm_pindex_t poff;
638		vm_object_t obj;
639		struct vnode *vp;
640
641		vp = bp->b_vp;
642
643		resid = bp->b_bufsize;
644		foff = bp->b_offset;
645
646		for (i = 0; i < bp->b_npages; i++) {
647			m = bp->b_pages[i];
648			vm_page_flag_clear(m, PG_ZERO);
649			if (m == bogus_page) {
650
651				obj = (vm_object_t) vp->v_object;
652				poff = OFF_TO_IDX(bp->b_offset);
653
654				for (j = i; j < bp->b_npages; j++) {
655					m = bp->b_pages[j];
656					if (m == bogus_page) {
657						m = vm_page_lookup(obj, poff + j);
658#if !defined(MAX_PERF)
659						if (!m) {
660							panic("brelse: page missing\n");
661						}
662#endif
663						bp->b_pages[j] = m;
664					}
665				}
666
667				if ((bp->b_flags & B_INVAL) == 0) {
668					pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
669				}
670			}
671			if (bp->b_flags & (B_NOCACHE|B_ERROR)) {
672				int poffset = foff & PAGE_MASK;
673				int presid = resid > (PAGE_SIZE - poffset) ?
674					(PAGE_SIZE - poffset) : resid;
675				vm_page_set_invalid(m, poffset, presid);
676			}
677			resid -= PAGE_SIZE;
678		}
679
680		if (bp->b_flags & (B_INVAL | B_RELBUF))
681			vfs_vmio_release(bp);
682
683	} else if (bp->b_flags & B_VMIO) {
684
685		if (bp->b_flags & (B_INVAL | B_RELBUF))
686			vfs_vmio_release(bp);
687
688	}
689
690#if !defined(MAX_PERF)
691	if (bp->b_qindex != QUEUE_NONE)
692		panic("brelse: free buffer onto another queue???");
693#endif
694
695	/* enqueue */
696	/* buffers with no memory */
697	if (bp->b_bufsize == 0) {
698		bp->b_flags |= B_INVAL;
699		bp->b_qindex = QUEUE_EMPTY;
700		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
701		LIST_REMOVE(bp, b_hash);
702		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
703		bp->b_dev = NODEV;
704		kvafreespace += bp->b_kvasize;
705
706	/* buffers with junk contents */
707	} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
708		bp->b_flags |= B_INVAL;
709		bp->b_qindex = QUEUE_AGE;
710		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
711		LIST_REMOVE(bp, b_hash);
712		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
713		bp->b_dev = NODEV;
714
715	/* buffers that are locked */
716	} else if (bp->b_flags & B_LOCKED) {
717		bp->b_qindex = QUEUE_LOCKED;
718		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
719
720	/* buffers with stale but valid contents */
721	} else if (bp->b_flags & B_AGE) {
722		bp->b_qindex = QUEUE_AGE;
723		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
724
725	/* buffers with valid and potentially reusable contents */
726	} else {
727		bp->b_qindex = QUEUE_LRU;
728		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
729	}
730
731	if ((bp->b_flags & B_INVAL) ||
732		(bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) {
733		if (bp->b_flags & B_DELWRI) {
734			--numdirtybuffers;
735			bp->b_flags &= ~B_DELWRI;
736		}
737		vfs_bio_need_satisfy();
738	}
739
740	/* unlock */
741	bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
742		B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
743	splx(s);
744}
745
746/*
747 * Release a buffer, without invalidating its contents (quick brelse).
748 */
749void
750bqrelse(struct buf * bp)
751{
752	int s;
753
754	s = splbio();
755
756	/* anyone need this block? */
757	if (bp->b_flags & B_WANTED) {
758		bp->b_flags &= ~(B_WANTED | B_AGE);
759		wakeup(bp);
760	}
761
762#if !defined(MAX_PERF)
763	if (bp->b_qindex != QUEUE_NONE)
764		panic("bqrelse: free buffer onto another queue???");
765#endif
766
767	if (bp->b_flags & B_LOCKED) {
768		bp->b_flags &= ~B_ERROR;
769		bp->b_qindex = QUEUE_LOCKED;
770		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
771		/* buffers with stale but valid contents */
772	} else {
773		bp->b_qindex = QUEUE_LRU;
774		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
775	}
776
777	if ((bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) {
778		vfs_bio_need_satisfy();
779	}
780
781	/* unlock */
782	bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
783		B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
784	splx(s);
785}
786
787static void
788vfs_vmio_release(bp)
789	struct buf *bp;
790{
791	int i;
792	vm_page_t m;
793
794	for (i = 0; i < bp->b_npages; i++) {
795		m = bp->b_pages[i];
796		bp->b_pages[i] = NULL;
797		/*
798		 * In order to keep page LRU ordering consistent, put
799		 * everything on the inactive queue.
800		 */
801		vm_page_unwire(m, 0);
802		/*
803		 * We don't mess with busy pages, it is
804		 * the responsibility of the process that
805		 * busied the pages to deal with them.
806		 */
807		if ((m->flags & PG_BUSY) || (m->busy != 0))
808			continue;
809
810		if (m->wire_count == 0) {
811			vm_page_flag_clear(m, PG_ZERO);
812			/*
813			 * Might as well free the page if we can and it has
814			 * no valid data.
815			 */
816			if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && m->hold_count == 0) {
817				vm_page_busy(m);
818				vm_page_protect(m, VM_PROT_NONE);
819				vm_page_free(m);
820			}
821		}
822	}
823	bufspace -= bp->b_bufsize;
824	vmiospace -= bp->b_bufsize;
825	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
826	bp->b_npages = 0;
827	bp->b_bufsize = 0;
828	bp->b_flags &= ~B_VMIO;
829	if (bp->b_vp)
830		brelvp(bp);
831}
832
833/*
834 * Check to see if a block is currently memory resident.
835 */
836struct buf *
837gbincore(struct vnode * vp, daddr_t blkno)
838{
839	struct buf *bp;
840	struct bufhashhdr *bh;
841
842	bh = BUFHASH(vp, blkno);
843	bp = bh->lh_first;
844
845	/* Search hash chain */
846	while (bp != NULL) {
847		/* hit */
848		if (bp->b_vp == vp && bp->b_lblkno == blkno &&
849		    (bp->b_flags & B_INVAL) == 0) {
850			break;
851		}
852		bp = bp->b_hash.le_next;
853	}
854	return (bp);
855}
856
857/*
858 * This routine implements clustered asynchronous writes for
859 * clearing out B_DELWRI buffers.  Writing a cluster at once is much
860 * better than the old way of writing only one buffer at a time.
861 */
862int
863vfs_bio_awrite(struct buf * bp)
864{
865	int i;
866	daddr_t lblkno = bp->b_lblkno;
867	struct vnode *vp = bp->b_vp;
868	int s;
869	int ncl;
870	struct buf *bpa;
871	int nwritten;
872	int size;
873	int maxcl;
874
875	s = splbio();
876	/*
877	 * right now we support clustered writing only to regular files
878	 */
879	if ((vp->v_type == VREG) &&
880	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
881	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
882
883		size = vp->v_mount->mnt_stat.f_iosize;
884		maxcl = MAXPHYS / size;
885
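		/*
		 * Scan forward from this buffer for logically contiguous,
		 * delayed-write, cluster-eligible buffers of the same size
		 * whose physical blocks are also contiguous; the run found
		 * here (ncl buffers) is handed to cluster_wbuild() below.
		 */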
886		for (i = 1; i < maxcl; i++) {
887			if ((bpa = gbincore(vp, lblkno + i)) &&
888			    ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
889			    (B_DELWRI | B_CLUSTEROK)) &&
890			    (bpa->b_bufsize == size)) {
891				if ((bpa->b_blkno == bpa->b_lblkno) ||
892				    (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
893					break;
894			} else {
895				break;
896			}
897		}
898		ncl = i;
899		/*
900		 * this is a possible cluster write
901		 */
902		if (ncl != 1) {
903			nwritten = cluster_wbuild(vp, size, lblkno, ncl);
904			splx(s);
905			return nwritten;
906		}
907	}
908
909	bremfree(bp);
910	bp->b_flags |= B_BUSY | B_ASYNC;
911
912	splx(s);
913	/*
914	 * default (old) behavior, writing out only one block
915	 */
916	nwritten = bp->b_bufsize;
917	(void) VOP_BWRITE(bp);
918	return nwritten;
919}
920
921
922/*
923 * Find a buffer header which is available for use.
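 *
 * The search order is, roughly: take a header from QUEUE_EMPTY while the
 * buffer cache is under maxbufspace, otherwise recycle a victim from
 * QUEUE_AGE and then QUEUE_LRU.  A delayed-write victim is first pushed
 * out (asynchronously where possible) and the allocation retried; if no
 * buffer is available at all, the routine sleeps on needsbuffer and
 * returns NULL so the caller can retry.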
924 */
925static struct buf *
926getnewbuf(struct vnode *vp, daddr_t blkno,
927	int slpflag, int slptimeo, int size, int maxsize)
928{
929	struct buf *bp, *bp1;
930	int nbyteswritten = 0;
931	vm_offset_t addr;
932	static int writerecursion = 0;
933
934start:
935	if (bufspace >= maxbufspace)
936		goto trytofreespace;
937
938	/* can we constitute a new buffer? */
939	if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]))) {
940#if !defined(MAX_PERF)
941		if (bp->b_qindex != QUEUE_EMPTY)
942			panic("getnewbuf: inconsistent EMPTY queue, qindex=%d",
943			    bp->b_qindex);
944#endif
945		bp->b_flags |= B_BUSY;
946		bremfree(bp);
947		goto fillbuf;
948	}
949trytofreespace:
950	/*
951	 * We keep the file I/O from hogging metadata I/O
952	 * This is desirable because file data is cached in the
953	 * VM/Buffer cache even if a buffer is freed.
954	 */
955	if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]))) {
956#if !defined(MAX_PERF)
957		if (bp->b_qindex != QUEUE_AGE)
958			panic("getnewbuf: inconsistent AGE queue, qindex=%d",
959			    bp->b_qindex);
960#endif
961	} else if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]))) {
962#if !defined(MAX_PERF)
963		if (bp->b_qindex != QUEUE_LRU)
964			panic("getnewbuf: inconsistent LRU queue, qindex=%d",
965			    bp->b_qindex);
966#endif
967	}
968	if (!bp) {
969		/* wait for a free buffer of any kind */
970		needsbuffer |= VFS_BIO_NEED_ANY;
971		do
972			tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf",
973			    slptimeo);
974		while (needsbuffer & VFS_BIO_NEED_ANY);
975		return (0);
976	}
977
978#if defined(DIAGNOSTIC)
979	if (bp->b_flags & B_BUSY) {
980		panic("getnewbuf: busy buffer on free list\n");
981	}
982#endif
983
984	/*
985	 * We are fairly aggressive about freeing VMIO buffers, but since
986	 * the cached data remains intact in VM pages without buffer headers, there is not
987	 * much loss.  We gain by maintaining non-VMIOed metadata in buffers.
988	 */
989	if ((bp->b_qindex == QUEUE_LRU) && (bp->b_usecount > 0)) {
990		if ((bp->b_flags & B_VMIO) == 0 ||
991			(vmiospace < maxvmiobufspace)) {
992			--bp->b_usecount;
993			TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist);
994			if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) {
995				TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
996				goto start;
997			}
998			TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
999		}
1000	}
1001
1002
1003	/* if we are a delayed write, convert to an async write */
1004	if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
1005
1006		/*
1007		 * If our delayed write is likely to be used soon, then
1008		 * recycle back onto the LRU queue.
1009		 */
1010		if (vp && (bp->b_vp == vp) && (bp->b_qindex == QUEUE_LRU) &&
1011			(bp->b_lblkno >= blkno) && (maxsize > 0)) {
1012
1013			if (bp->b_usecount > 0) {
1014				if (bp->b_lblkno < blkno + (MAXPHYS / maxsize)) {
1015
1016					TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist);
1017
1018					if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) {
1019						TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
1020						bp->b_usecount--;
1021						goto start;
1022					}
1023					TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
1024				}
1025			}
1026		}
1027
1028		/*
1029		 * Certain layered filesystems can recursively re-enter the vfs_bio
1030		 * code, due to delayed writes.  This helps keep the system from
1031		 * deadlocking.
1032		 */
1033		if (writerecursion > 0) {
1034			if (writerecursion > 5) {
1035				bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]);
1036				while (bp) {
1037					if ((bp->b_flags & B_DELWRI) == 0)
1038						break;
1039					bp = TAILQ_NEXT(bp, b_freelist);
1040				}
1041				if (bp == NULL) {
1042					bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]);
1043					while (bp) {
1044						if ((bp->b_flags & B_DELWRI) == 0)
1045							break;
1046						bp = TAILQ_NEXT(bp, b_freelist);
1047					}
1048				}
1049				if (bp == NULL)
1050					panic("getnewbuf: cannot get buffer, infinite recursion failure");
1051			} else {
1052				bremfree(bp);
1053				bp->b_flags |= B_BUSY | B_AGE | B_ASYNC;
1054				nbyteswritten += bp->b_bufsize;
1055				++writerecursion;
1056				VOP_BWRITE(bp);
1057				--writerecursion;
1058				if (!slpflag && !slptimeo) {
1059					return (0);
1060				}
1061				goto start;
1062			}
1063		} else {
1064			++writerecursion;
1065			nbyteswritten += vfs_bio_awrite(bp);
1066			--writerecursion;
1067			if (!slpflag && !slptimeo) {
1068				return (0);
1069			}
1070			goto start;
1071		}
1072	}
1073
1074	if (bp->b_flags & B_WANTED) {
1075		bp->b_flags &= ~B_WANTED;
1076		wakeup(bp);
1077	}
1078	bremfree(bp);
1079	bp->b_flags |= B_BUSY;
1080
1081	if (bp->b_flags & B_VMIO) {
1082		bp->b_flags &= ~B_ASYNC;
1083		vfs_vmio_release(bp);
1084	}
1085
1086	if (bp->b_vp)
1087		brelvp(bp);
1088
1089fillbuf:
1090
1091	/* we are not free, nor do we contain interesting data */
1092	if (bp->b_rcred != NOCRED) {
1093		crfree(bp->b_rcred);
1094		bp->b_rcred = NOCRED;
1095	}
1096	if (bp->b_wcred != NOCRED) {
1097		crfree(bp->b_wcred);
1098		bp->b_wcred = NOCRED;
1099	}
1100	if (LIST_FIRST(&bp->b_dep) != NULL &&
1101	    bioops.io_deallocate)
1102		(*bioops.io_deallocate)(bp);
1103
1104	LIST_REMOVE(bp, b_hash);
1105	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
1106	if (bp->b_bufsize) {
1107		allocbuf(bp, 0);
1108	}
1109	bp->b_flags = B_BUSY;
1110	bp->b_dev = NODEV;
1111	bp->b_vp = NULL;
1112	bp->b_blkno = bp->b_lblkno = 0;
1113	bp->b_offset = NOOFFSET;
1114	bp->b_iodone = 0;
1115	bp->b_error = 0;
1116	bp->b_resid = 0;
1117	bp->b_bcount = 0;
1118	bp->b_npages = 0;
1119	bp->b_dirtyoff = bp->b_dirtyend = 0;
1120	bp->b_validoff = bp->b_validend = 0;
1121	bp->b_usecount = 5;
1122	/* Here, not kern_physio.c, is where this should be done */
1123	LIST_INIT(&bp->b_dep);
1124
1125	maxsize = (maxsize + PAGE_MASK) & ~PAGE_MASK;
1126
1127	/*
1128	 * we assume that buffer_map is not at address 0
1129	 */
1130	addr = 0;
1131	if (maxsize != bp->b_kvasize) {
1132		bfreekva(bp);
1133
1134findkvaspace:
1135		/*
1136		 * See if we have buffer kva space
1137		 */
1138		if (vm_map_findspace(buffer_map,
1139			vm_map_min(buffer_map), maxsize, &addr)) {
1140			if (kvafreespace > 0) {
1141				int totfree = 0, freed;
1142				do {
1143					freed = 0;
1144					for (bp1 = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
1145						bp1 != NULL; bp1 = TAILQ_NEXT(bp1, b_freelist)) {
1146						if (bp1->b_kvasize != 0) {
1147							totfree += bp1->b_kvasize;
1148							freed = bp1->b_kvasize;
1149							bremfree(bp1);
1150							bfreekva(bp1);
1151							brelse(bp1);
1152							break;
1153						}
1154					}
1155				} while (freed);
1156				/*
1157				 * if we found free space, then retry with the same buffer.
1158				 */
1159				if (totfree)
1160					goto findkvaspace;
1161			}
1162			bp->b_flags |= B_INVAL;
1163			brelse(bp);
1164			goto trytofreespace;
1165		}
1166	}
1167
1168	/*
1169	 * See if buffer space is over our allocated limit
1170	 */
1171	if (bufspace >= (maxbufspace + nbyteswritten)) {
1172		bp->b_flags |= B_INVAL;
1173		brelse(bp);
1174		goto trytofreespace;
1175	}
1176
1177	/*
1178	 * create a map entry for the buffer -- in essence
1179	 * reserving the kva space.
1180	 */
1181	if (addr) {
1182		vm_map_insert(buffer_map, NULL, 0,
1183			addr, addr + maxsize,
1184			VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
1185
1186		bp->b_kvabase = (caddr_t) addr;
1187		bp->b_kvasize = maxsize;
1188	}
1189	bp->b_data = bp->b_kvabase;
1190
1191	return (bp);
1192}
1193
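/*
 * Called (from getblk()) when the free-buffer count has dropped below
 * lofreebuffers: flush dirty buffers and sleep on needsbuffer with
 * VFS_BIO_NEED_FREE set, waiting for the free pool to be replenished.
 */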
1194static void
1195waitfreebuffers(int slpflag, int slptimeo) {
1196	while (numfreebuffers < hifreebuffers) {
1197		flushdirtybuffers(slpflag, slptimeo);
1198		if (numfreebuffers < hifreebuffers)
1199			break;
1200		needsbuffer |= VFS_BIO_NEED_FREE;
1201		if (tsleep(&needsbuffer, (PRIBIO + 4)|slpflag, "biofre", slptimeo))
1202			break;
1203	}
1204}
1205
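/*
 * Push out delayed-write buffers until numdirtybuffers falls to
 * lodirtybuffers.  The static 'flushing' pid serves as a crude lock so
 * that only one process flushes at a time; other callers either return
 * immediately (recursive call) or sleep until the flush completes.
 */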
1206static void
1207flushdirtybuffers(int slpflag, int slptimeo) {
1208	int s;
1209	static pid_t flushing = 0;
1210
1211	s = splbio();
1212
1213	if (flushing) {
1214		if (flushing == curproc->p_pid) {
1215			splx(s);
1216			return;
1217		}
1218		while (flushing) {
1219			if (tsleep(&flushing, (PRIBIO + 4)|slpflag, "biofls", slptimeo)) {
1220				splx(s);
1221				return;
1222			}
1223		}
1224	}
1225	flushing = curproc->p_pid;
1226
1227	while (numdirtybuffers > lodirtybuffers) {
1228		struct buf *bp;
1229		needsbuffer |= VFS_BIO_NEED_LOWLIMIT;
1230		bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]);
1231		if (bp == NULL)
1232			bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]);
1233
1234		while (bp && ((bp->b_flags & B_DELWRI) == 0)) {
1235			bp = TAILQ_NEXT(bp, b_freelist);
1236		}
1237
1238		if (bp) {
1239			vfs_bio_awrite(bp);
1240			continue;
1241		}
1242		break;
1243	}
1244
1245	flushing = 0;
1246	wakeup(&flushing);
1247	splx(s);
1248}
1249
1250/*
1251 * Check to see if a block is currently memory resident.
1252 */
1253struct buf *
1254incore(struct vnode * vp, daddr_t blkno)
1255{
1256	struct buf *bp;
1257
1258	int s = splbio();
1259	bp = gbincore(vp, blkno);
1260	splx(s);
1261	return (bp);
1262}
1263
1264/*
1265 * Returns true if no I/O is needed to access the
1266 * associated VM object.  This is like incore except
1267 * it also hunts around in the VM system for the data.
1268 */
1269
1270int
1271inmem(struct vnode * vp, daddr_t blkno)
1272{
1273	vm_object_t obj;
1274	vm_offset_t toff, tinc;
1275	vm_page_t m;
1276	vm_ooffset_t off;
1277
1278	if (incore(vp, blkno))
1279		return 1;
1280	if (vp->v_mount == NULL)
1281		return 0;
1282	if ((vp->v_object == NULL) || (vp->v_flag & VOBJBUF) == 0)
1283		return 0;
1284
1285	obj = vp->v_object;
1286	tinc = PAGE_SIZE;
1287	if (tinc > vp->v_mount->mnt_stat.f_iosize)
1288		tinc = vp->v_mount->mnt_stat.f_iosize;
1289	off = blkno * vp->v_mount->mnt_stat.f_iosize;
1290
1291	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
1292
1293		m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
1294		if (!m)
1295			return 0;
1296		if (vm_page_is_valid(m,
1297		    (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0)
1298			return 0;
1299	}
1300	return 1;
1301}
1302
1303/*
1304 * now we set the dirty range for the buffer --
1305 * for NFS -- if the file is mapped and pages have
1306 * been written to, let it know.  We want the
1307 * entire range of the buffer to be marked dirty if
1308 * any of the pages have been written to for consistency
1309 * with the b_validoff, b_validend set in the nfs write
1310 * code, and used by the nfs read code.
1311 */
1312static void
1313vfs_setdirty(struct buf *bp) {
1314	int i;
1315	vm_object_t object;
1316	vm_offset_t boffset, offset;
1317	/*
1318	 * We qualify the scan for modified pages on whether the
1319	 * object has been flushed yet.  The OBJ_WRITEABLE flag
1320	 * is not cleared simply by protecting pages off.
1321	 */
1322	if ((bp->b_flags & B_VMIO) &&
1323		((object = bp->b_pages[0]->object)->flags & (OBJ_WRITEABLE|OBJ_CLEANING))) {
1324		/*
1325		 * test the pages to see if they have been modified directly
1326		 * by users through the VM system.
1327		 */
1328		for (i = 0; i < bp->b_npages; i++) {
1329			vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
1330			vm_page_test_dirty(bp->b_pages[i]);
1331		}
1332
1333		/*
1334		 * scan forwards for the first page modified
1335		 */
1336		for (i = 0; i < bp->b_npages; i++) {
1337			if (bp->b_pages[i]->dirty) {
1338				break;
1339			}
1340		}
1341		boffset = (i << PAGE_SHIFT);
1342		if (boffset < bp->b_dirtyoff) {
1343			bp->b_dirtyoff = boffset;
1344		}
1345
1346		/*
1347		 * scan backwards for the last page modified
1348		 */
1349		for (i = bp->b_npages - 1; i >= 0; --i) {
1350			if (bp->b_pages[i]->dirty) {
1351				break;
1352			}
1353		}
1354		boffset = (i + 1);
1355		offset = boffset + bp->b_pages[0]->pindex;
1356		if (offset >= object->size)
1357			boffset = object->size - bp->b_pages[0]->pindex;
1358		if (bp->b_dirtyend < (boffset << PAGE_SHIFT))
1359			bp->b_dirtyend = (boffset << PAGE_SHIFT);
1360	}
1361}
1362
1363/*
1364 * Get a block given a specified block and offset into a file/device.
1365 */
1366struct buf *
1367getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
1368{
1369	struct buf *bp;
1370	int i, s;
1371	struct bufhashhdr *bh;
1372	int maxsize;
1373	int checksize;
1374
1375	if (vp->v_mount) {
1376		maxsize = vp->v_mount->mnt_stat.f_iosize;
1377		/*
1378		 * This happens on mount points.
1379		 */
1380		if (maxsize < size)
1381			maxsize = size;
1382	} else {
1383		maxsize = size;
1384	}
1385
1386#if !defined(MAX_PERF)
1387	if (size > MAXBSIZE)
1388		panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
1389#endif
1390
1391	s = splbio();
1392loop:
1393	if (numfreebuffers < lofreebuffers) {
1394		waitfreebuffers(slpflag, slptimeo);
1395	}
1396
1397	if ((bp = gbincore(vp, blkno))) {
1398		if (bp->b_flags & B_BUSY) {
1399
1400			bp->b_flags |= B_WANTED;
1401			if (bp->b_usecount < BUF_MAXUSE)
1402				++bp->b_usecount;
1403
1404			if (!tsleep(bp,
1405				(PRIBIO + 4) | slpflag, "getblk", slptimeo)) {
1406				goto loop;
1407			}
1408
1409			splx(s);
1410			return (struct buf *) NULL;
1411		}
1412		bp->b_flags |= B_BUSY | B_CACHE;
1413		bremfree(bp);
1414
1415		/*
1416		 * check for size inconsistencies (note that they shouldn't
1417		 * happen but do when filesystems don't handle the size changes
1418		 * correctly.) We are conservative on metadata and don't just
1419		 * extend the buffer but write (if needed) and re-constitute it.
1420		 */
1421
1422		if (bp->b_bcount != size) {
1423			if ((bp->b_flags & B_VMIO) && (size <= bp->b_kvasize)) {
1424				allocbuf(bp, size);
1425			} else {
1426				if (bp->b_flags & B_DELWRI) {
1427					bp->b_flags |= B_NOCACHE;
1428					VOP_BWRITE(bp);
1429				} else {
1430					if ((bp->b_flags & B_VMIO) &&
1431					   (LIST_FIRST(&bp->b_dep) == NULL)) {
1432						bp->b_flags |= B_RELBUF;
1433						brelse(bp);
1434					} else {
1435						bp->b_flags |= B_NOCACHE;
1436						VOP_BWRITE(bp);
1437					}
1438				}
1439				goto loop;
1440			}
1441		}
1442
1443#ifdef DIAGNOSTIC
1444		if (bp->b_offset == NOOFFSET)
1445			panic("getblk: no buffer offset");
1446#endif
1447
1448		/*
1449		 * Check that the constituted buffer really deserves to have the
1450		 * B_CACHE bit set.  B_VMIO type buffers might not
1451		 * contain fully valid pages.  Normal (old-style) buffers
1452		 * should be fully valid.
1453		 */
1454		if (bp->b_flags & B_VMIO) {
1455			checksize = bp->b_bufsize;
1456			for (i = 0; i < bp->b_npages; i++) {
1457				int resid;
1458				int poffset;
1459				poffset = bp->b_offset & PAGE_MASK;
1460				resid = (checksize > (PAGE_SIZE - poffset)) ?
1461					(PAGE_SIZE - poffset) : checksize;
1462				if (!vm_page_is_valid(bp->b_pages[i], poffset, resid)) {
1463					bp->b_flags &= ~(B_CACHE | B_DONE);
1464					break;
1465				}
1466				checksize -= resid;
1467			}
1468		}
1469
1470		if (bp->b_usecount < BUF_MAXUSE)
1471			++bp->b_usecount;
1472		splx(s);
1473		return (bp);
1474	} else {
1475		vm_object_t obj;
1476
1477		if ((bp = getnewbuf(vp, blkno,
1478			slpflag, slptimeo, size, maxsize)) == 0) {
1479			if (slpflag || slptimeo) {
1480				splx(s);
1481				return NULL;
1482			}
1483			goto loop;
1484		}
1485
1486		/*
1487		 * This code is used to make sure that a buffer is not
1488		 * created while the getnewbuf routine is blocked.
1489		 * Normally the vnode is locked so this isn't a problem.
1490		 * VBLK type I/O requests, however, don't lock the vnode.
1491		 */
1492		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE && gbincore(vp, blkno)) {
1493			bp->b_flags |= B_INVAL;
1494			brelse(bp);
1495			goto loop;
1496		}
1497
1498		/*
1499		 * Insert the buffer into the hash, so that it can
1500		 * be found by incore.
1501		 */
1502		bp->b_blkno = bp->b_lblkno = blkno;
1503
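		/*
		 * b_offset is the byte offset of the buffer within the file:
		 * the logical block number times the filesystem block size
		 * (maxsize), or times DEV_BSIZE for device (VBLK) buffers.
		 */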
1504		if (vp->v_type != VBLK)
1505			bp->b_offset = (off_t) blkno * maxsize;
1506		else
1507			bp->b_offset = (off_t) blkno * DEV_BSIZE;
1508
1509		bgetvp(vp, bp);
1510		LIST_REMOVE(bp, b_hash);
1511		bh = BUFHASH(vp, blkno);
1512		LIST_INSERT_HEAD(bh, bp, b_hash);
1513
1514		if ((obj = vp->v_object) && (vp->v_flag & VOBJBUF)) {
1515			bp->b_flags |= (B_VMIO | B_CACHE);
1516#if defined(VFS_BIO_DEBUG)
1517			if (vp->v_type != VREG && vp->v_type != VBLK)
1518				printf("getblk: vmioing file type %d???\n", vp->v_type);
1519#endif
1520		} else {
1521			bp->b_flags &= ~B_VMIO;
1522		}
1523
1524		allocbuf(bp, size);
1525
1526		splx(s);
1527		return (bp);
1528	}
1529}
1530
1531/*
1532 * Get an empty, disassociated buffer of given size.
1533 */
1534struct buf *
1535geteblk(int size)
1536{
1537	struct buf *bp;
1538	int s;
1539
1540	s = splbio();
1541	while ((bp = getnewbuf(0, (daddr_t) 0, 0, 0, size, MAXBSIZE)) == 0);
1542	splx(s);
1543	allocbuf(bp, size);
1544	bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */
1545	return (bp);
1546}
1547
1548
1549/*
1550 * This code constitutes the buffer memory from either anonymous system
1551 * memory (in the case of non-VMIO operations) or from an associated
1552 * VM object (in the case of VMIO operations).
1553 *
1554 * Note that this code is tricky, and has many complications to resolve
1555 * deadlock or inconsistent data situations.  Tread lightly!!!
1556 *
1557 * Modify the length of a buffer's underlying buffer storage without
1558 * destroying information (unless, of course the buffer is shrinking).
1559 */
1560int
1561allocbuf(struct buf * bp, int size)
1562{
1563
1564	int s;
1565	int newbsize, mbsize;
1566	int i;
1567
1568#if !defined(MAX_PERF)
1569	if (!(bp->b_flags & B_BUSY))
1570		panic("allocbuf: buffer not busy");
1571
1572	if (bp->b_kvasize < size)
1573		panic("allocbuf: buffer too small");
1574#endif
1575
1576	if ((bp->b_flags & B_VMIO) == 0) {
1577		caddr_t origbuf;
1578		int origbufsize;
1579		/*
1580		 * Just get anonymous memory from the kernel
1581		 */
1582		mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
1583#if !defined(NO_B_MALLOC)
1584		if (bp->b_flags & B_MALLOC)
1585			newbsize = mbsize;
1586		else
1587#endif
1588			newbsize = round_page(size);
1589
1590		if (newbsize < bp->b_bufsize) {
1591#if !defined(NO_B_MALLOC)
1592			/*
1593			 * malloced buffers are not shrunk
1594			 */
1595			if (bp->b_flags & B_MALLOC) {
1596				if (newbsize) {
1597					bp->b_bcount = size;
1598				} else {
1599					free(bp->b_data, M_BIOBUF);
1600					bufspace -= bp->b_bufsize;
1601					bufmallocspace -= bp->b_bufsize;
1602					bp->b_data = bp->b_kvabase;
1603					bp->b_bufsize = 0;
1604					bp->b_bcount = 0;
1605					bp->b_flags &= ~B_MALLOC;
1606				}
1607				return 1;
1608			}
1609#endif
1610			vm_hold_free_pages(
1611			    bp,
1612			    (vm_offset_t) bp->b_data + newbsize,
1613			    (vm_offset_t) bp->b_data + bp->b_bufsize);
1614		} else if (newbsize > bp->b_bufsize) {
1615#if !defined(NO_B_MALLOC)
1616			/*
1617			 * We only use malloced memory on the first allocation,
1618			 * and revert to page-allocated memory when the buffer grows.
1619			 */
1620			if ( (bufmallocspace < maxbufmallocspace) &&
1621				(bp->b_bufsize == 0) &&
1622				(mbsize <= PAGE_SIZE/2)) {
1623
1624				bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
1625				bp->b_bufsize = mbsize;
1626				bp->b_bcount = size;
1627				bp->b_flags |= B_MALLOC;
1628				bufspace += mbsize;
1629				bufmallocspace += mbsize;
1630				return 1;
1631			}
1632#endif
1633			origbuf = NULL;
1634			origbufsize = 0;
1635#if !defined(NO_B_MALLOC)
1636			/*
1637			 * If the buffer is growing on its other-than-first allocation,
1638			 * then we revert to the page-allocation scheme.
1639			 */
1640			if (bp->b_flags & B_MALLOC) {
1641				origbuf = bp->b_data;
1642				origbufsize = bp->b_bufsize;
1643				bp->b_data = bp->b_kvabase;
1644				bufspace -= bp->b_bufsize;
1645				bufmallocspace -= bp->b_bufsize;
1646				bp->b_bufsize = 0;
1647				bp->b_flags &= ~B_MALLOC;
1648				newbsize = round_page(newbsize);
1649			}
1650#endif
1651			vm_hold_load_pages(
1652			    bp,
1653			    (vm_offset_t) bp->b_data + bp->b_bufsize,
1654			    (vm_offset_t) bp->b_data + newbsize);
1655#if !defined(NO_B_MALLOC)
1656			if (origbuf) {
1657				bcopy(origbuf, bp->b_data, origbufsize);
1658				free(origbuf, M_BIOBUF);
1659			}
1660#endif
1661		}
1662	} else {
1663		vm_page_t m;
1664		int desiredpages;
1665
1666		newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
1667		desiredpages = (round_page(newbsize) >> PAGE_SHIFT);
1668
1669#if !defined(NO_B_MALLOC)
1670		if (bp->b_flags & B_MALLOC)
1671			panic("allocbuf: VMIO buffer can't be malloced");
1672#endif
1673
1674		if (newbsize < bp->b_bufsize) {
1675			if (desiredpages < bp->b_npages) {
1676				for (i = desiredpages; i < bp->b_npages; i++) {
1677					/*
1678					 * the page is not freed here -- it
1679					 * is the responsibility of vnode_pager_setsize
1680					 */
1681					m = bp->b_pages[i];
1682#if defined(DIAGNOSTIC)
1683					if (m == bogus_page)
1684						panic("allocbuf: bogus page found");
1685#endif
1686					vm_page_sleep(m, "biodep", &m->busy);
1687
1688					bp->b_pages[i] = NULL;
1689					vm_page_unwire(m, 0);
1690				}
1691				pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) +
1692				    (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
1693				bp->b_npages = desiredpages;
1694			}
1695		} else if (newbsize > bp->b_bufsize) {
1696			vm_object_t obj;
1697			vm_offset_t tinc, toff;
1698			vm_ooffset_t off;
1699			vm_pindex_t objoff;
1700			int pageindex, curbpnpages;
1701			struct vnode *vp;
1702			int bsize;
1703			int orig_validoff = bp->b_validoff;
1704			int orig_validend = bp->b_validend;
1705
1706			vp = bp->b_vp;
1707
1708			if (vp->v_type == VBLK)
1709				bsize = DEV_BSIZE;
1710			else
1711				bsize = vp->v_mount->mnt_stat.f_iosize;
1712
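			/*
			 * Growing a VMIO buffer: walk the new range a page at
			 * a time, re-using pages already attached to the
			 * buffer, looking the rest up in (or allocating them
			 * into) the backing VM object, and wiring each one.
			 * B_CACHE is cleared if any page had to be allocated
			 * fresh or is not fully valid for its range.
			 */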
1713			if (bp->b_npages < desiredpages) {
1714				obj = vp->v_object;
1715				tinc = PAGE_SIZE;
1716				if (tinc > bsize)
1717					tinc = bsize;
1718
1719				off = bp->b_offset;
1720#ifdef DIAGNOSTIC
1721				if (bp->b_offset == NOOFFSET)
1722					panic("allocbuf: no buffer offset");
1723#endif
1724
1725				curbpnpages = bp->b_npages;
1726		doretry:
1727				bp->b_validoff = orig_validoff;
1728				bp->b_validend = orig_validend;
1729				bp->b_flags |= B_CACHE;
1730				for (toff = 0; toff < newbsize; toff += tinc) {
1731					int bytesinpage;
1732
1733					pageindex = toff >> PAGE_SHIFT;
1734					objoff = OFF_TO_IDX(off + toff);
1735					if (pageindex < curbpnpages) {
1736
1737						m = bp->b_pages[pageindex];
1738#ifdef VFS_BIO_DIAG
1739						if (m->pindex != objoff)
1740							panic("allocbuf: page changed offset??!!!?");
1741#endif
1742						bytesinpage = tinc;
1743						if (tinc > (newbsize - toff))
1744							bytesinpage = newbsize - toff;
1745						if (bp->b_flags & B_CACHE)
1746							vfs_buf_set_valid(bp, off, toff, bytesinpage, m);
1747						continue;
1748					}
1749					m = vm_page_lookup(obj, objoff);
1750					if (!m) {
1751						m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
1752						if (!m) {
1753							VM_WAIT;
1754							vm_pageout_deficit += (desiredpages - bp->b_npages);
1755							goto doretry;
1756						}
1757
1758						vm_page_wire(m);
1759						vm_page_flag_clear(m, PG_BUSY);
1760						bp->b_flags &= ~B_CACHE;
1761
1762					} else if (m->flags & PG_BUSY) {
1763						s = splvm();
1764						if (m->flags & PG_BUSY) {
1765							vm_page_flag_set(m, PG_WANTED);
1766							tsleep(m, PVM, "pgtblk", 0);
1767						}
1768						splx(s);
1769						goto doretry;
1770					} else {
1771						if ((curproc != pageproc) &&
1772							((m->queue - m->pc) == PQ_CACHE) &&
1773						    ((cnt.v_free_count + cnt.v_cache_count) <
1774								(cnt.v_free_min + cnt.v_cache_min))) {
1775							pagedaemon_wakeup();
1776						}
1777						bytesinpage = tinc;
1778						if (tinc > (newbsize - toff))
1779							bytesinpage = newbsize - toff;
1780						if (bp->b_flags & B_CACHE)
1781							vfs_buf_set_valid(bp, off, toff, bytesinpage, m);
1782						vm_page_flag_clear(m, PG_ZERO);
1783						vm_page_wire(m);
1784					}
1785					bp->b_pages[pageindex] = m;
1786					curbpnpages = pageindex + 1;
1787				}
1788				if (vp->v_tag == VT_NFS &&
1789				    vp->v_type != VBLK) {
1790					if (bp->b_dirtyend > 0) {
1791						bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
1792						bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
1793					}
1794					if (bp->b_validend == 0)
1795						bp->b_flags &= ~B_CACHE;
1796				}
1797				bp->b_data = (caddr_t) trunc_page((vm_offset_t)bp->b_data);
1798				bp->b_npages = curbpnpages;
1799				pmap_qenter((vm_offset_t) bp->b_data,
1800					bp->b_pages, bp->b_npages);
1801				((vm_offset_t) bp->b_data) |= off & PAGE_MASK;
1802			}
1803		}
1804	}
1805	if (bp->b_flags & B_VMIO)
1806		vmiospace += (newbsize - bp->b_bufsize);
1807	bufspace += (newbsize - bp->b_bufsize);
1808	bp->b_bufsize = newbsize;
1809	bp->b_bcount = size;
1810	return 1;
1811}
1812
1813/*
1814 * Wait for buffer I/O completion, returning error status.
1815 */
1816int
1817biowait(register struct buf * bp)
1818{
1819	int s;
1820
1821	s = splbio();
1822	while ((bp->b_flags & B_DONE) == 0)
1823#if defined(NO_SCHEDULE_MODS)
1824		tsleep(bp, PRIBIO, "biowait", 0);
1825#else
1826		if (bp->b_flags & B_READ)
1827			tsleep(bp, PRIBIO, "biord", 0);
1828		else
1829			tsleep(bp, PRIBIO, "biowr", 0);
1830#endif
1831	splx(s);
1832	if (bp->b_flags & B_EINTR) {
1833		bp->b_flags &= ~B_EINTR;
1834		return (EINTR);
1835	}
1836	if (bp->b_flags & B_ERROR) {
1837		return (bp->b_error ? bp->b_error : EIO);
1838	} else {
1839		return (0);
1840	}
1841}
1842
1843/*
1844 * Finish I/O on a buffer, calling an optional function.
1845 * This is usually called from interrupt level, so process blocking
1846 * is not *a good idea*.
1847 */
1848void
1849biodone(register struct buf * bp)
1850{
1851	int s;
1852
1853	s = splbio();
1854
1855#if !defined(MAX_PERF)
1856	if (!(bp->b_flags & B_BUSY))
1857		panic("biodone: buffer not busy");
1858#endif
1859
1860	if (bp->b_flags & B_DONE) {
1861		splx(s);
1862#if !defined(MAX_PERF)
1863		printf("biodone: buffer already done\n");
1864#endif
1865		return;
1866	}
1867	bp->b_flags |= B_DONE;
1868
1869	if (bp->b_flags & B_FREEBUF) {
1870		brelse(bp);
1871		splx(s);
1872		return;
1873	}
1874
1875	if ((bp->b_flags & B_READ) == 0) {
1876		vwakeup(bp);
1877	}
1878
1879	/* call optional completion function if requested */
1880	if (bp->b_flags & B_CALL) {
1881		bp->b_flags &= ~B_CALL;
1882		(*bp->b_iodone) (bp);
1883		splx(s);
1884		return;
1885	}
1886	if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_complete)
1887		(*bioops.io_complete)(bp);
1888
1889	if (bp->b_flags & B_VMIO) {
1890		int i, resid;
1891		vm_ooffset_t foff;
1892		vm_page_t m;
1893		vm_object_t obj;
1894		int iosize;
1895		struct vnode *vp = bp->b_vp;
1896
1897		obj = vp->v_object;
1898
1899#if defined(VFS_BIO_DEBUG)
1900		if (vp->v_usecount == 0) {
1901			panic("biodone: zero vnode ref count");
1902		}
1903
1904		if (vp->v_object == NULL) {
1905			panic("biodone: missing VM object");
1906		}
1907
1908		if ((vp->v_flag & VOBJBUF) == 0) {
1909			panic("biodone: vnode is not setup for merged cache");
1910		}
1911#endif
1912
1913		foff = bp->b_offset;
1914#ifdef DIAGNOSTIC
1915		if (bp->b_offset == NOOFFSET)
1916			panic("biodone: no buffer offset");
1917#endif
1918
1919#if !defined(MAX_PERF)
1920		if (!obj) {
1921			panic("biodone: no object");
1922		}
1923#endif
1924#if defined(VFS_BIO_DEBUG)
1925		if (obj->paging_in_progress < bp->b_npages) {
1926			printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
1927			    obj->paging_in_progress, bp->b_npages);
1928		}
1929#endif
1930		iosize = bp->b_bufsize;
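		/*
		 * Walk the buffer's pages: restore any bogus_page entries to
		 * the real pages from the VM object, mark the covered range
		 * valid for reads, and finish the per-page I/O accounting
		 * (busy count and object paging_in_progress).
		 */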
1931		for (i = 0; i < bp->b_npages; i++) {
1932			int bogusflag = 0;
1933			m = bp->b_pages[i];
1934			if (m == bogus_page) {
1935				bogusflag = 1;
1936				m = vm_page_lookup(obj, OFF_TO_IDX(foff));
1937				if (!m) {
1938#if defined(VFS_BIO_DEBUG)
1939					printf("biodone: page disappeared\n");
1940#endif
1941					vm_object_pip_subtract(obj, 1);
1942					continue;
1943				}
1944				bp->b_pages[i] = m;
1945				pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
1946			}
1947#if defined(VFS_BIO_DEBUG)
1948			if (OFF_TO_IDX(foff) != m->pindex) {
1949				printf("biodone: foff(%d)/m->pindex(%d) mismatch\n", foff, m->pindex);
1950			}
1951#endif
1952			resid = IDX_TO_OFF(m->pindex + 1) - foff;
1953			if (resid > iosize)
1954				resid = iosize;
1955
1956			/*
1957			 * In the write case, the valid and clean bits are
1958			 * already changed correctly, so we only need to do this
1959			 * here in the read case.
1960			 */
1961			if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
1962				vfs_page_set_valid(bp, foff, i, m);
1963			}
1964			vm_page_flag_clear(m, PG_ZERO);
1965
1966			/*
1967			 * when debugging new filesystems or buffer I/O methods, this
1968			 * is the most common error that pops up.  if you see this, you
1969			 * have not set the page busy flag correctly!!!
1970			 */
1971			if (m->busy == 0) {
1972#if !defined(MAX_PERF)
1973				printf("biodone: page busy < 0, "
1974				    "pindex: %d, foff: 0x(%x,%x), "
1975				    "resid: %d, index: %d\n",
1976				    (int) m->pindex, (int)(foff >> 32),
1977						(int) foff & 0xffffffff, resid, i);
1978#endif
1979				if (vp->v_type != VBLK)
1980#if !defined(MAX_PERF)
1981					printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n",
1982					    bp->b_vp->v_mount->mnt_stat.f_iosize,
1983					    (int) bp->b_lblkno,
1984					    bp->b_flags, bp->b_npages);
1985				else
1986					printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n",
1987					    (int) bp->b_lblkno,
1988					    bp->b_flags, bp->b_npages);
1989				printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n",
1990				    m->valid, m->dirty, m->wire_count);
1991#endif
1992				panic("biodone: page busy < 0\n");
1993			}
1994			vm_page_io_finish(m);
1995			vm_object_pip_subtract(obj, 1);
1996			foff += resid;
1997			iosize -= resid;
1998		}
1999		if (obj &&
2000			(obj->paging_in_progress == 0) &&
2001		    (obj->flags & OBJ_PIPWNT)) {
2002			vm_object_clear_flag(obj, OBJ_PIPWNT);
2003			wakeup(obj);
2004		}
2005	}
2006	/*
2007	 * For asynchronous completions, release the buffer now. The brelse
2008	 * checks for B_WANTED and will do the wakeup there if necessary - so
2009	 * no need to do a wakeup here in the async case.
2010	 */
2011
2012	if (bp->b_flags & B_ASYNC) {
2013		if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0)
2014			brelse(bp);
2015		else
2016			bqrelse(bp);
2017	} else {
2018		bp->b_flags &= ~B_WANTED;
2019		wakeup(bp);
2020	}
2021	splx(s);
2022}
2023
2024#if 0	/* not with kirks code */
2025static int vfs_update_interval = 30;
2026
2027static void
2028vfs_update()
2029{
2030	while (1) {
2031		tsleep(&vfs_update_wakeup, PUSER, "update",
2032		    hz * vfs_update_interval);
2033		vfs_update_wakeup = 0;
2034		sync(curproc, NULL);
2035	}
2036}
2037
2038static int
2039sysctl_kern_updateinterval SYSCTL_HANDLER_ARGS
2040{
2041	int error = sysctl_handle_int(oidp,
2042		oidp->oid_arg1, oidp->oid_arg2, req);
2043	if (!error)
2044		wakeup(&vfs_update_wakeup);
2045	return error;
2046}
2047
2048SYSCTL_PROC(_kern, KERN_UPDATEINTERVAL, update, CTLTYPE_INT|CTLFLAG_RW,
2049	&vfs_update_interval, 0, sysctl_kern_updateinterval, "I", "");
2050
2051#endif
2052
2053
2054/*
2055 * This routine is called in lieu of iodone in the case of
2056 * incomplete I/O.  This keeps the busy status for pages
2057 * consistent.
2058 */
2059void
2060vfs_unbusy_pages(struct buf * bp)
2061{
2062	int i;
2063
2064	if (bp->b_flags & B_VMIO) {
2065		struct vnode *vp = bp->b_vp;
2066		vm_object_t obj = vp->v_object;
2067
2068		for (i = 0; i < bp->b_npages; i++) {
2069			vm_page_t m = bp->b_pages[i];
2070
2071			if (m == bogus_page) {
2072				m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
2073#if !defined(MAX_PERF)
2074				if (!m) {
2075					panic("vfs_unbusy_pages: page missing\n");
2076				}
2077#endif
2078				bp->b_pages[i] = m;
2079				pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
2080			}
2081			vm_object_pip_subtract(obj, 1);
2082			vm_page_flag_clear(m, PG_ZERO);
2083			vm_page_io_finish(m);
2084		}
2085		if (obj->paging_in_progress == 0 &&
2086		    (obj->flags & OBJ_PIPWNT)) {
2087			vm_object_clear_flag(obj, OBJ_PIPWNT);
2088			wakeup(obj);
2089		}
2090	}
2091}
2092
2093/*
2094 * Set NFS' b_validoff and b_validend fields from the valid bits
2095 * of a page.  If the consumer is not NFS, and the page is not
2096 * valid for the entire range, clear the B_CACHE flag to force
2097 * the consumer to re-read the page.
2098 * the consumer to re-read the page.
 */
2099static void
2100vfs_buf_set_valid(struct buf *bp,
2101		  vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
2102		  vm_page_t m)
2103{
2104	if (bp->b_vp->v_tag == VT_NFS && bp->b_vp->v_type != VBLK) {
2105		vm_offset_t svalid, evalid;
2106		int validbits = m->valid;
2107
2108		/*
2109		 * Only the first contiguous range of valid bits in the
2110		 * page is considered here.
2111		 */
2112		svalid = off;
2113		while (validbits && !(validbits & 1)) {
2114			svalid += DEV_BSIZE;
2115			validbits >>= 1;
2116		}
2117		evalid = svalid;
2118		while (validbits & 1) {
2119			evalid += DEV_BSIZE;
2120			validbits >>= 1;
2121		}
2122		/*
2123		 * Make sure this range is contiguous with the range
2124		 * built up from previous pages.  If not, then we will
2125		 * just use the range from the previous pages.
2126		 */
2127		if (svalid == bp->b_validend) {
2128			bp->b_validoff = min(bp->b_validoff, svalid);
2129			bp->b_validend = max(bp->b_validend, evalid);
2130		}
2131	} else if (!vm_page_is_valid(m,
2132				     (vm_offset_t) ((foff + off) & PAGE_MASK),
2133				     size)) {
2134		bp->b_flags &= ~B_CACHE;
2135	}
2136}
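
/*
 * Worked example of the scan above (hypothetical values): with
 * DEV_BSIZE = 512 and m->valid = 0x3c, bits 0-1 are skipped and bits
 * 2-5 are consumed, giving svalid = off + 1024 and evalid = off + 3072;
 * that range is merged into b_validoff/b_validend only when svalid
 * equals the b_validend accumulated from the previous pages.
 */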
2137
2138/*
2139 * Set the valid bits in a page, taking care of the b_validoff,
2140 * b_validend fields which NFS uses to optimise small reads.  Off is
2141 * the offset within the file and pageno is the page index within the buf.
2142 */
2143static void
2144vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
2145{
2146	struct vnode *vp = bp->b_vp;
2147	vm_ooffset_t soff, eoff;
2148
2149	soff = off;
2150	eoff = off + min(PAGE_SIZE, bp->b_bufsize);
2151	if (vp->v_tag == VT_NFS && vp->v_type != VBLK) {
2152		vm_ooffset_t sv, ev;
2153		vm_page_set_invalid(m,
2154		    (vm_offset_t) (soff & PAGE_MASK),
2155		    (vm_offset_t) (eoff - soff));
2156		off = off - pageno * PAGE_SIZE;
2157		sv = off + ((bp->b_validoff + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1));
2158		ev = off + ((bp->b_validend + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1));
2159		soff = qmax(sv, soff);
2160		eoff = qmin(ev, eoff);
2161	}
2162	if (eoff > soff)
2163		vm_page_set_validclean(m,
2164	       (vm_offset_t) (soff & PAGE_MASK),
2165	       (vm_offset_t) (eoff - soff));
2166}
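
/*
 * Sketch of the NFS clamping above (hypothetical numbers, assuming a
 * page-aligned buffer and PAGE_SIZE = 4096): with b_validoff = 0 and
 * b_validend = 6144, the page at pageno 1 gets sv = off and
 * ev = off + 6144, so after clamping only the first 2048 bytes of that
 * page are handed to vm_page_set_validclean().
 */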
2167
2168/*
2169 * This routine is called before a device strategy routine.
2170 * It is used to tell the VM system that paging I/O is in
2171 * progress, and to treat the pages associated with the buffer
2172 * almost as if they were PG_BUSY.  The object's paging_in_progress
2173 * count is also bumped so that the object does not become
2174 * inconsistent.
2175 */
2176void
2177vfs_busy_pages(struct buf * bp, int clear_modify)
2178{
2179	int i;
2180
2181	if (bp->b_flags & B_VMIO) {
2182		struct vnode *vp = bp->b_vp;
2183		vm_object_t obj = vp->v_object;
2184		vm_ooffset_t foff;
2185
2186		foff = bp->b_offset;
2187#ifdef DIAGNOSTIC
2188		if (bp->b_offset == NOOFFSET)
2189			panic("vfs_busy_pages: no buffer offset");
2190#endif
2191
2192		vfs_setdirty(bp);
2193
2194retry:
2195		for (i = 0; i < bp->b_npages; i++) {
2196			vm_page_t m = bp->b_pages[i];
2197			if (vm_page_sleep(m, "vbpage", NULL))
2198				goto retry;
2199		}
2200
2201		for (i = 0; i < bp->b_npages; i++, foff += PAGE_SIZE) {
2202			vm_page_t m = bp->b_pages[i];
2203
2204			vm_page_flag_clear(m, PG_ZERO);
2205			if ((bp->b_flags & B_CLUSTER) == 0) {
2206				vm_object_pip_add(obj, 1);
2207				vm_page_io_start(m);
2208			}
2209
2210			vm_page_protect(m, VM_PROT_NONE);
2211			if (clear_modify)
2212				vfs_page_set_valid(bp, foff, i, m);
2213			else if (bp->b_bcount >= PAGE_SIZE) {
2214				if (m->valid && (bp->b_flags & B_CACHE) == 0) {
2215					bp->b_pages[i] = bogus_page;
2216					pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
2217				}
2218			}
2219		}
2220	}
2221}
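
/*
 * Illustrative pairing (assumed caller, not compiled): a synchronous
 * VMIO write busies the pages before starting the transfer and relies
 * on biodone(), or on vfs_unbusy_pages() for a failed attempt, to
 * release them:
 *
 *	vfs_busy_pages(bp, 1);
 *	VOP_STRATEGY(bp->b_vp, bp);
 *	error = biowait(bp);
 */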
2222
2223/*
2224 * Tell the VM system that the pages associated with this buffer
2225 * are clean.  This is used for delayed writes where the data
2226 * will eventually go to disk without additional VM intervention.
2227 */
2228void
2229vfs_clean_pages(struct buf * bp)
2230{
2231	int i;
2232
2233	if (bp->b_flags & B_VMIO) {
2234		vm_ooffset_t foff;
2235		foff = bp->b_offset;
2236
2237#ifdef DIAGNOSTIC
2238		if (bp->b_offset == NOOFFSET)
2239			panic("vfs_clean_pages: no buffer offset");
2240#endif
2241
2242		for (i = 0; i < bp->b_npages; i++, foff += PAGE_SIZE) {
2243			vm_page_t m = bp->b_pages[i];
2244			vfs_page_set_valid(bp, foff, i, m);
2245		}
2246	}
2247}
2248
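/*
 * vfs_bio_clrbuf:
 *
 *	Zero the still-invalid portions of a VMIO buffer's pages and mark
 *	them valid, skipping the bzero for pages already flagged PG_ZERO.
 *	Non-VMIO and malloc'ed buffers simply fall back to clrbuf().
 */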
2249void
2250vfs_bio_clrbuf(struct buf *bp) {
2251	int i;
2252	if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) {
2253		if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE)) {
2254			int mask;
2255			mask = 0;
2256			for (i = 0; i < bp->b_bufsize; i += DEV_BSIZE)
2257				mask |= (1 << (i / DEV_BSIZE));
2258			if (((bp->b_pages[0]->flags & PG_ZERO) == 0) &&
2259			    (bp->b_pages[0]->valid != mask)) {
2260				bzero(bp->b_data, bp->b_bufsize);
2261			}
2262			bp->b_pages[0]->valid = mask;
2263			bp->b_resid = 0;
2264			return;
2265		}
2266		for (i = 0; i < bp->b_npages; i++) {
2267			if (bp->b_pages[i]->valid == VM_PAGE_BITS_ALL)
2268				continue;
2269			if (bp->b_pages[i]->valid == 0) {
2270				if ((bp->b_pages[i]->flags & PG_ZERO) == 0) {
2271					bzero(bp->b_data + (i << PAGE_SHIFT), PAGE_SIZE);
2272				}
2273			} else {
2274				int j;
2275				for (j = 0; j < PAGE_SIZE / DEV_BSIZE; j++) {
2276					if (((bp->b_pages[i]->flags & PG_ZERO) == 0) &&
2277					    (bp->b_pages[i]->valid & (1 << j)) == 0)
2278						bzero(bp->b_data + (i << PAGE_SHIFT) + j * DEV_BSIZE, DEV_BSIZE);
2279				}
2280			}
2281			bp->b_pages[i]->valid = VM_PAGE_BITS_ALL;
2282			vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
2283		}
2284		bp->b_resid = 0;
2285	} else {
2286		clrbuf(bp);
2287	}
2288}
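
/*
 * Example of the single-page mask above (hypothetical sizes): with
 * b_bufsize = 2048 and DEV_BSIZE = 512 the loop builds mask = 0x0f, so
 * only the four DEV_BSIZE chunks actually covered by the buffer are
 * marked valid in bp->b_pages[0].
 */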
2289
2290/*
2291 * vm_hold_load_pages() and vm_hold_free_pages() get pages into and
2292 * out of a buffer's address space.  The pages are anonymous and are
2293 * not associated with a file object.
2294 */
2295void
2296vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
2297{
2298	vm_offset_t pg;
2299	vm_page_t p;
2300	int index;
2301
2302	to = round_page(to);
2303	from = round_page(from);
2304	index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
2305
2306	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
2307
2308tryagain:
2309
2310		p = vm_page_alloc(kernel_object,
2311			((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
2312		    VM_ALLOC_NORMAL);
2313		if (!p) {
2314			vm_pageout_deficit += (to - from) >> PAGE_SHIFT;
2315			VM_WAIT;
2316			goto tryagain;
2317		}
2318		vm_page_wire(p);
2319		p->valid = VM_PAGE_BITS_ALL;
2320		vm_page_flag_clear(p, PG_ZERO);
2321		pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
2322		bp->b_pages[index] = p;
2323		vm_page_wakeup(p);
2324	}
2325	bp->b_npages = index;
2326}
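
/*
 * Example of the index arithmetic above (hypothetical, page-aligned
 * b_data and PAGE_SIZE = 4096): growing a buffer from 8192 to 16384
 * bytes gives from = b_data + 8192 and index = 2, and the loop wires
 * and maps two fresh kernel_object pages at b_data + 8192 and
 * b_data + 12288, leaving b_npages = 4.
 */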
2327
2328void
2329vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
2330{
2331	vm_offset_t pg;
2332	vm_page_t p;
2333	int index, newnpages;
2334
2335	from = round_page(from);
2336	to = round_page(to);
2337	newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
2338
2339	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
2340		p = bp->b_pages[index];
2341		if (p && (index < bp->b_npages)) {
2342#if !defined(MAX_PERF)
2343			if (p->busy) {
2344				printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n",
2345					bp->b_blkno, bp->b_lblkno);
2346			}
2347#endif
2348			bp->b_pages[index] = NULL;
2349			pmap_kremove(pg);
2350			vm_page_busy(p);
2351			vm_page_unwire(p, 0);
2352			vm_page_free(p);
2353		}
2354	}
2355	bp->b_npages = newnpages;
2356}
2357
2358
2359#include "opt_ddb.h"
2360#ifdef DDB
2361#include <ddb/ddb.h>
2362
2363DB_SHOW_COMMAND(buffer, db_show_buffer)
2364{
2365	/* get args */
2366	struct buf *bp = (struct buf *)addr;
2367
2368	if (!have_addr) {
2369		db_printf("usage: show buffer <addr>\n");
2370		return;
2371	}
2372
2373	db_printf("b_proc = %p,\nb_flags = 0x%b\n", (void *)bp->b_proc,
2374		  (u_int)bp->b_flags, PRINT_BUF_FLAGS);
2375	db_printf("b_error = %d, b_bufsize = %ld, b_bcount = %ld, "
2376		  "b_resid = %ld\nb_dev = 0x%x, b_data = %p, "
2377		  "b_blkno = %d, b_pblkno = %d\n",
2378		  bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
2379		  bp->b_dev, bp->b_data, bp->b_blkno, bp->b_pblkno);
2380	if (bp->b_npages) {
2381		int i;
2382		db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
2383		for (i = 0; i < bp->b_npages; i++) {
2384			vm_page_t m;
2385			m = bp->b_pages[i];
2386			db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object,
2387			    (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m));
2388			if ((i + 1) < bp->b_npages)
2389				db_printf(",");
2390		}
2391		db_printf("\n");
2392	}
2393}
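
/*
 * Example use from DDB (the address is hypothetical):
 *
 *	db> show buffer 0xc12345a0
 *
 * which dumps the flags, sizes, block numbers and backing pages of the
 * given struct buf.
 */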
2394#endif /* DDB */
2395