vfs_bio.c revision 41589
1/*
2 * Copyright (c) 1994,1997 John S. Dyson
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice immediately at the beginning of the file, without modification,
10 *    this list of conditions, and the following disclaimer.
11 * 2. Absolutely no warranty of function or purpose is made by the author
12 *		John S. Dyson.
13 *
14 * $Id: vfs_bio.c,v 1.185 1998/11/18 09:00:47 dg Exp $
15 */
16
17/*
18 * this file contains a new buffer I/O scheme implementing a coherent
19 * VM object and buffer cache scheme.  Pains have been taken to make
20 * sure that the performance degradation associated with schemes such
21 * as this is not realized.
22 *
23 * Author:  John S. Dyson
24 * Significant help during the development and debugging phases
25 * had been provided by David Greenman, also of the FreeBSD core team.
26 */
27
28#define VMIO
29#include <sys/param.h>
30#include <sys/systm.h>
31#include <sys/sysproto.h>
32#include <sys/kernel.h>
33#include <sys/sysctl.h>
34#include <sys/proc.h>
35#include <sys/vnode.h>
36#include <sys/vmmeter.h>
37#include <sys/lock.h>
38#include <miscfs/specfs/specdev.h>
39#include <vm/vm.h>
40#include <vm/vm_param.h>
41#include <vm/vm_prot.h>
42#include <vm/vm_kern.h>
43#include <vm/vm_pageout.h>
44#include <vm/vm_page.h>
45#include <vm/vm_object.h>
46#include <vm/vm_extern.h>
47#include <vm/vm_map.h>
48#include <sys/buf.h>
49#include <sys/mount.h>
50#include <sys/malloc.h>
51#include <sys/resourcevar.h>
52
53static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer");
54
55struct	bio_ops bioops;		/* I/O operation notification */
56
57#if 0 	/* replaced by sched_sync */
58static void vfs_update __P((void));
59static struct	proc *updateproc;
60static struct kproc_desc up_kp = {
61	"update",
62	vfs_update,
63	&updateproc
64};
65SYSINIT_KT(update, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
66#endif
67
68struct buf *buf;		/* buffer header pool */
69struct swqueue bswlist;
70
71static void vm_hold_free_pages(struct buf * bp, vm_offset_t from,
72		vm_offset_t to);
73static void vm_hold_load_pages(struct buf * bp, vm_offset_t from,
74		vm_offset_t to);
75static void vfs_buf_set_valid(struct buf *bp, vm_ooffset_t foff,
76			      vm_offset_t off, vm_offset_t size,
77			      vm_page_t m);
78static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off,
79			       int pageno, vm_page_t m);
80static void vfs_clean_pages(struct buf * bp);
81static void vfs_setdirty(struct buf *bp);
82static void vfs_vmio_release(struct buf *bp);
83static void flushdirtybuffers(int slpflag, int slptimeo);
84
85int needsbuffer;
86
87/*
88 * Internal update daemon, process 3
89 *	The variable vfs_update_wakeup allows for internal syncs.
90 */
91int vfs_update_wakeup;
92
93
94/*
95 * buffers base kva
96 */
97
98/*
99 * bogus page -- for I/O to/from partially complete buffers
100 * this is a temporary solution to the problem, but it is not
101 * really that bad.  it would be better to split the buffer
102 * for input in the case of buffers partially already in memory,
103 * but the code is intricate enough already.
104 */
105vm_page_t bogus_page;
106static vm_offset_t bogus_offset;
107
108static int bufspace, maxbufspace, vmiospace, maxvmiobufspace,
109	bufmallocspace, maxbufmallocspace;
110int numdirtybuffers;
111static int lodirtybuffers, hidirtybuffers;
112static int numfreebuffers, lofreebuffers, hifreebuffers;
113static int kvafreespace;
114
115SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD,
116	&numdirtybuffers, 0, "");
117SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW,
118	&lodirtybuffers, 0, "");
119SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW,
120	&hidirtybuffers, 0, "");
121SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD,
122	&numfreebuffers, 0, "");
123SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW,
124	&lofreebuffers, 0, "");
125SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW,
126	&hifreebuffers, 0, "");
127SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW,
128	&maxbufspace, 0, "");
129SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD,
130	&bufspace, 0, "");
131SYSCTL_INT(_vfs, OID_AUTO, maxvmiobufspace, CTLFLAG_RW,
132	&maxvmiobufspace, 0, "");
133SYSCTL_INT(_vfs, OID_AUTO, vmiospace, CTLFLAG_RD,
134	&vmiospace, 0, "");
135SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW,
136	&maxbufmallocspace, 0, "");
137SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD,
138	&bufmallocspace, 0, "");
139SYSCTL_INT(_vfs, OID_AUTO, kvafreespace, CTLFLAG_RD,
140	&kvafreespace, 0, "");
141
142static LIST_HEAD(bufhashhdr, buf) bufhashtbl[BUFHSZ], invalhash;
143struct bqueues bufqueues[BUFFER_QUEUES] = {0};
144
145extern int vm_swap_size;
146
147#define BUF_MAXUSE 24
148
149#define VFS_BIO_NEED_ANY 1
150#define VFS_BIO_NEED_LOWLIMIT 2
151#define VFS_BIO_NEED_FREE 4
152
153/*
154 * Initialize buffer headers and related structures.
155 */
156void
157bufinit()
158{
159	struct buf *bp;
160	int i;
161
162	TAILQ_INIT(&bswlist);
163	LIST_INIT(&invalhash);
164
165	/* first, make a null hash table */
166	for (i = 0; i < BUFHSZ; i++)
167		LIST_INIT(&bufhashtbl[i]);
168
169	/* next, make a null set of free lists */
170	for (i = 0; i < BUFFER_QUEUES; i++)
171		TAILQ_INIT(&bufqueues[i]);
172
173	/* finally, initialize each buffer header and stick on empty q */
174	for (i = 0; i < nbuf; i++) {
175		bp = &buf[i];
176		bzero(bp, sizeof *bp);
177		bp->b_flags = B_INVAL;	/* we're just an empty header */
178		bp->b_dev = NODEV;
179		bp->b_rcred = NOCRED;
180		bp->b_wcred = NOCRED;
181		bp->b_qindex = QUEUE_EMPTY;
182		bp->b_xflags = 0;
183		LIST_INIT(&bp->b_dep);
184		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
185		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
186	}
187/*
188 * maxbufspace is currently calculated assuming that all filesystem blocks
189 * are 8K.  If you happen to use a 16K filesystem, the size of the buffer
190 * cache is still the same as it would be for 8K filesystems.  This
191 * keeps the size of the buffer cache "in check" for big block filesystems.
192 */
193	maxbufspace = (nbuf + 8) * DFLTBSIZE;
194/*
195 * reserve 1/3 of the buffers for metadata (VDIR) which might not be VMIO'ed
196 */
197	maxvmiobufspace = 2 * maxbufspace / 3;
198/*
199 * Limit the amount of malloc memory since it is wired permanently into
200 * the kernel space.  Even though this is accounted for in the buffer
201 * allocation, we don't want the malloced region to grow uncontrolled.
202 * The malloc scheme improves memory utilization significantly on average
203 * (small) directories.
204 */
205	maxbufmallocspace = maxbufspace / 20;
206
207/*
208 * Remove the probability of deadlock conditions by limiting the
209 * number of dirty buffers.
210 */
211	hidirtybuffers = nbuf / 8 + 20;
212	lodirtybuffers = nbuf / 16 + 10;
213	numdirtybuffers = 0;
214	lofreebuffers = nbuf / 18 + 5;
215	hifreebuffers = 2 * lofreebuffers;
216	numfreebuffers = nbuf;
217	kvafreespace = 0;
218
219	bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
220	bogus_page = vm_page_alloc(kernel_object,
221			((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
222			VM_ALLOC_NORMAL);
223
224}
225
226/*
227 * Free the kva allocation for a buffer
228 * Must be called only at splbio or higher,
229 *  as this is the only locking for buffer_map.
230 */
231static void
232bfreekva(struct buf * bp)
233{
234	if (bp->b_kvasize == 0)
235		return;
236
237	vm_map_delete(buffer_map,
238		(vm_offset_t) bp->b_kvabase,
239		(vm_offset_t) bp->b_kvabase + bp->b_kvasize);
240
241	bp->b_kvasize = 0;
242
243}
244
245/*
246 * remove the buffer from the appropriate free list
247 */
248void
249bremfree(struct buf * bp)
250{
251	int s = splbio();
252
253	if (bp->b_qindex != QUEUE_NONE) {
254		if (bp->b_qindex == QUEUE_EMPTY) {
255			kvafreespace -= bp->b_kvasize;
256		}
257		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
258		bp->b_qindex = QUEUE_NONE;
259	} else {
260#if !defined(MAX_PERF)
261		panic("bremfree: removing a buffer when not on a queue");
262#endif
263	}
264	if ((bp->b_flags & B_INVAL) ||
265		(bp->b_flags & (B_DELWRI|B_LOCKED)) == 0)
266		--numfreebuffers;
267	splx(s);
268}
269
270
271/*
272 * Get a buffer with the specified data.  Look in the cache first.
273 */
274int
275bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
276    struct buf ** bpp)
277{
278	struct buf *bp;
279
280	bp = getblk(vp, blkno, size, 0, 0);
281	*bpp = bp;
282
283	/* if not found in cache, do some I/O */
284	if ((bp->b_flags & B_CACHE) == 0) {
285		if (curproc != NULL)
286			curproc->p_stats->p_ru.ru_inblock++;
287		bp->b_flags |= B_READ;
288		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
289		if (bp->b_rcred == NOCRED) {
290			if (cred != NOCRED)
291				crhold(cred);
292			bp->b_rcred = cred;
293		}
294		vfs_busy_pages(bp, 0);
295		VOP_STRATEGY(vp, bp);
296		return (biowait(bp));
297	}
298	return (0);
299}
300
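/*
 * A minimal usage sketch for bread(): the caller supplies the vnode, the
 * logical block number and the block size, and releases the buffer when
 * finished with the data.  lblkno and bsize below are placeholders for
 * the caller's own values:
 *
 *	struct buf *bp;
 *	int error;
 *
 *	error = bread(vp, lblkno, bsize, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	... use bp->b_data ...
 *	bqrelse(bp);
 */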
301/*
302 * Operates like bread, but also starts asynchronous I/O on
303 * read-ahead blocks.
304 */
305int
306breadn(struct vnode * vp, daddr_t blkno, int size,
307    daddr_t * rablkno, int *rabsize,
308    int cnt, struct ucred * cred, struct buf ** bpp)
309{
310	struct buf *bp, *rabp;
311	int i;
312	int rv = 0, readwait = 0;
313
314	*bpp = bp = getblk(vp, blkno, size, 0, 0);
315
316	/* if not found in cache, do some I/O */
317	if ((bp->b_flags & B_CACHE) == 0) {
318		if (curproc != NULL)
319			curproc->p_stats->p_ru.ru_inblock++;
320		bp->b_flags |= B_READ;
321		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
322		if (bp->b_rcred == NOCRED) {
323			if (cred != NOCRED)
324				crhold(cred);
325			bp->b_rcred = cred;
326		}
327		vfs_busy_pages(bp, 0);
328		VOP_STRATEGY(vp, bp);
329		++readwait;
330	}
331	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
332		if (inmem(vp, *rablkno))
333			continue;
334		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
335
336		if ((rabp->b_flags & B_CACHE) == 0) {
337			if (curproc != NULL)
338				curproc->p_stats->p_ru.ru_inblock++;
339			rabp->b_flags |= B_READ | B_ASYNC;
340			rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
341			if (rabp->b_rcred == NOCRED) {
342				if (cred != NOCRED)
343					crhold(cred);
344				rabp->b_rcred = cred;
345			}
346			vfs_busy_pages(rabp, 0);
347			VOP_STRATEGY(vp, rabp);
348		} else {
349			brelse(rabp);
350		}
351	}
352
353	if (readwait) {
354		rv = biowait(bp);
355	}
356	return (rv);
357}
358
359/*
360 * Write, release buffer on completion.  (Done by iodone
361 * if async.)
362 */
363int
364bwrite(struct buf * bp)
365{
366	int oldflags, s;
367	struct vnode *vp;
368	struct mount *mp;
369
370
371	if (bp->b_flags & B_INVAL) {
372		brelse(bp);
373		return (0);
374	}
375
376	oldflags = bp->b_flags;
377
378#if !defined(MAX_PERF)
379	if ((bp->b_flags & B_BUSY) == 0)
380		panic("bwrite: buffer is not busy???");
381#endif
382
383	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
384	bp->b_flags |= B_WRITEINPROG;
385
386	s = splbio();
387	if ((oldflags & B_DELWRI) == B_DELWRI) {
388		--numdirtybuffers;
389		reassignbuf(bp, bp->b_vp);
390	}
391
392	bp->b_vp->v_numoutput++;
393	vfs_busy_pages(bp, 1);
394	if (curproc != NULL)
395		curproc->p_stats->p_ru.ru_oublock++;
396	splx(s);
397	VOP_STRATEGY(bp->b_vp, bp);
398
399	/*
400	 * Collect statistics on synchronous and asynchronous writes.
401	 * Writes to block devices are charged to their associated
402	 * filesystem (if any).
403	 */
404	if ((vp = bp->b_vp) != NULL) {
405		if (vp->v_type == VBLK)
406			mp = vp->v_specmountpoint;
407		else
408			mp = vp->v_mount;
409		if (mp != NULL)
410			if ((oldflags & B_ASYNC) == 0)
411				mp->mnt_stat.f_syncwrites++;
412			else
413				mp->mnt_stat.f_asyncwrites++;
414	}
415
416	if ((oldflags & B_ASYNC) == 0) {
417		int rtval = biowait(bp);
418		brelse(bp);
419		return (rtval);
420	}
421	return (0);
422}
423
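/*
 * A buffer has become available for reuse: bump numfreebuffers and, if
 * anyone is sleeping on needsbuffer, clear the need bits that are now
 * satisfied and wake the sleepers.
 */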
424void
425vfs_bio_need_satisfy(void) {
426	++numfreebuffers;
427	if (!needsbuffer)
428		return;
429	if (numdirtybuffers < lodirtybuffers) {
430		needsbuffer &= ~(VFS_BIO_NEED_ANY | VFS_BIO_NEED_LOWLIMIT);
431	} else {
432		needsbuffer &= ~VFS_BIO_NEED_ANY;
433	}
434	if (numfreebuffers >= hifreebuffers) {
435		needsbuffer &= ~VFS_BIO_NEED_FREE;
436	}
437	wakeup(&needsbuffer);
438}
439
440/*
441 * Delayed write. (Buffer is marked dirty).
442 */
443void
444bdwrite(struct buf * bp)
445{
446	struct vnode *vp;
447
448#if !defined(MAX_PERF)
449	if ((bp->b_flags & B_BUSY) == 0) {
450		panic("bdwrite: buffer is not busy");
451	}
452#endif
453
454	if (bp->b_flags & B_INVAL) {
455		brelse(bp);
456		return;
457	}
458	bp->b_flags &= ~(B_READ|B_RELBUF);
459	if ((bp->b_flags & B_DELWRI) == 0) {
460		bp->b_flags |= B_DONE | B_DELWRI;
461		reassignbuf(bp, bp->b_vp);
462		++numdirtybuffers;
463	}
464
465	/*
466	 * This bmap keeps the system from needing to do the bmap later,
467	 * perhaps when the system is attempting to do a sync.  Since it
468	 * is likely that the indirect block -- or whatever other datastructure
469	 * that the filesystem needs is still in memory now, it is a good
470	 * thing to do this.  Note also, that if the pageout daemon is
471	 * requesting a sync -- there might not be enough memory to do
472	 * the bmap then...  So, this is important to do.
473	 */
474	if (bp->b_lblkno == bp->b_blkno) {
475		VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
476	}
477
478	/*
479	 * Set the *dirty* buffer range based upon the VM system dirty pages.
480	 */
481	vfs_setdirty(bp);
482
483	/*
484	 * We need to do this here to satisfy the vnode_pager and the
485	 * pageout daemon, so that it thinks that the pages have been
486	 * "cleaned".  Note that since the pages are in a delayed write
487	 * buffer -- the VFS layer "will" see that the pages get written
488	 * out on the next sync, or perhaps the cluster will be completed.
489	 */
490	vfs_clean_pages(bp);
491	bqrelse(bp);
492
493	/*
494	 * XXX The soft dependency code is not prepared to
495	 * have I/O done when a bdwrite is requested. For
496	 * now we just let the write be delayed if it is
497	 * requested by the soft dependency code.
498	 */
499	if ((vp = bp->b_vp) &&
500	    ((vp->v_type == VBLK && vp->v_specmountpoint &&
501		  (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP)) ||
502		 (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP))))
503		return;
504
505	if (numdirtybuffers >= hidirtybuffers)
506		flushdirtybuffers(0, 0);
507
508	return;
509}
510
511
512/*
513 * Same as first half of bdwrite, mark buffer dirty, but do not release it.
514 * Check how this compares with vfs_setdirty(); XXX [JRE]
515 */
516void
517bdirty(bp)
518      struct buf *bp;
519{
520
521	bp->b_flags &= ~(B_READ|B_RELBUF); /* XXX ??? check this */
522	if ((bp->b_flags & B_DELWRI) == 0) {
523		bp->b_flags |= B_DONE | B_DELWRI; /* why done? XXX JRE */
524		reassignbuf(bp, bp->b_vp);
525		++numdirtybuffers;
526	}
527}
528
529/*
530 * Asynchronous write.
531 * Start output on a buffer, but do not wait for it to complete.
532 * The buffer is released when the output completes.
533 */
534void
535bawrite(struct buf * bp)
536{
537	bp->b_flags |= B_ASYNC;
538	(void) VOP_BWRITE(bp);
539}
540
541/*
542 * Ordered write.
543 * Start output on a buffer, and flag it so that the device will write
544 * it in the order it was queued.  The buffer is released when the output
545 * completes.
546 */
547int
548bowrite(struct buf * bp)
549{
550	bp->b_flags |= B_ORDERED|B_ASYNC;
551	return (VOP_BWRITE(bp));
552}
553
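/*
 * A rough sketch of how callers choose among the write interfaces above:
 * bwrite() writes synchronously and waits for completion, bawrite()
 * starts the write and returns, and bdwrite() only marks the buffer
 * dirty so it can be flushed later.  wantsync and wantasync are
 * illustrative placeholders for the caller's own policy:
 *
 *	if (wantsync)
 *		error = bwrite(bp);
 *	else if (wantasync)
 *		bawrite(bp);
 *	else
 *		bdwrite(bp);
 */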
554/*
555 * Release a buffer.
556 */
557void
558brelse(struct buf * bp)
559{
560	int s;
561
562	if (bp->b_flags & B_CLUSTER) {
563		relpbuf(bp);
564		return;
565	}
566
567	s = splbio();
568
569	/* anyone need this block? */
570	if (bp->b_flags & B_WANTED) {
571		bp->b_flags &= ~(B_WANTED | B_AGE);
572		wakeup(bp);
573	}
574
575	if (bp->b_flags & B_LOCKED)
576		bp->b_flags &= ~B_ERROR;
577
578	if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_FREEBUF)) ||
579	    (bp->b_bufsize <= 0)) {
580		bp->b_flags |= B_INVAL;
581		if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
582			(*bioops.io_deallocate)(bp);
583		if (bp->b_flags & B_DELWRI)
584			--numdirtybuffers;
585		bp->b_flags &= ~(B_DELWRI | B_CACHE | B_FREEBUF);
586		if ((bp->b_flags & B_VMIO) == 0) {
587			if (bp->b_bufsize)
588				allocbuf(bp, 0);
589			if (bp->b_vp)
590				brelvp(bp);
591		}
592	}
593
594	/*
595	 * We must clear B_RELBUF if B_DELWRI is set.  If vfs_vmio_release()
596	 * is called with B_DELWRI set, the underlying pages may wind up
597	 * getting freed causing a previous write (bdwrite()) to get 'lost'
598	 * because pages associated with a B_DELWRI bp are marked clean.
599	 *
600	 * We still allow the B_INVAL case to call vfs_vmio_release(), even
601	 * if B_DELWRI is set.
602	 */
603
604	if (bp->b_flags & B_DELWRI)
605		bp->b_flags &= ~B_RELBUF;
606
607	/*
608	 * VMIO buffer rundown.  It is not very necessary to keep a VMIO buffer
609	 * constituted, so the B_INVAL flag is used to *invalidate* the buffer,
610	 * but the VM object is kept around.  The B_NOCACHE flag is used to
611	 * invalidate the pages in the VM object.
612	 *
613	 * If the buffer is a partially filled NFS buffer, keep it
614 * since invalidating it now will lose information.  The valid
615	 * flags in the vm_pages have only DEV_BSIZE resolution but
616	 * the b_validoff, b_validend fields have byte resolution.
617	 * This can avoid unnecessary re-reads of the buffer.
618	 * XXX this seems to cause performance problems.
619	 */
620	if ((bp->b_flags & B_VMIO)
621	    && !(bp->b_vp->v_tag == VT_NFS &&
622		 bp->b_vp->v_type != VBLK &&
623		 (bp->b_flags & B_DELWRI) != 0)
624#ifdef notdef
625	    && (bp->b_vp->v_tag != VT_NFS
626		|| bp->b_vp->v_type == VBLK
627		|| (bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR))
628		|| bp->b_validend == 0
629		|| (bp->b_validoff == 0
630		    && bp->b_validend == bp->b_bufsize))
631#endif
632	    ) {
633
634		int i, j, resid;
635		vm_page_t m;
636		off_t foff;
637		vm_pindex_t poff;
638		vm_object_t obj;
639		struct vnode *vp;
640
641		vp = bp->b_vp;
642
643		resid = bp->b_bufsize;
644		foff = bp->b_offset;
645
646		for (i = 0; i < bp->b_npages; i++) {
647			m = bp->b_pages[i];
648			vm_page_flag_clear(m, PG_ZERO);
649			if (m == bogus_page) {
650
651				obj = (vm_object_t) vp->v_object;
652				poff = OFF_TO_IDX(bp->b_offset);
653
654				for (j = i; j < bp->b_npages; j++) {
655					m = bp->b_pages[j];
656					if (m == bogus_page) {
657						m = vm_page_lookup(obj, poff + j);
658#if !defined(MAX_PERF)
659						if (!m) {
660							panic("brelse: page missing\n");
661						}
662#endif
663						bp->b_pages[j] = m;
664					}
665				}
666
667				if ((bp->b_flags & B_INVAL) == 0) {
668					pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
669				}
670			}
671			if (bp->b_flags & (B_NOCACHE|B_ERROR)) {
672				int poffset = foff & PAGE_MASK;
673				int presid = resid > (PAGE_SIZE - poffset) ?
674					(PAGE_SIZE - poffset) : resid;
675				vm_page_set_invalid(m, poffset, presid);
676			}
677			resid -= PAGE_SIZE;
678		}
679
680		if (bp->b_flags & (B_INVAL | B_RELBUF))
681			vfs_vmio_release(bp);
682
683	} else if (bp->b_flags & B_VMIO) {
684
685		if (bp->b_flags & (B_INVAL | B_RELBUF))
686			vfs_vmio_release(bp);
687
688	}
689
690#if !defined(MAX_PERF)
691	if (bp->b_qindex != QUEUE_NONE)
692		panic("brelse: free buffer onto another queue???");
693#endif
694
695	/* enqueue */
696	/* buffers with no memory */
697	if (bp->b_bufsize == 0) {
698		bp->b_flags |= B_INVAL;
699		bp->b_qindex = QUEUE_EMPTY;
700		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
701		LIST_REMOVE(bp, b_hash);
702		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
703		bp->b_dev = NODEV;
704		kvafreespace += bp->b_kvasize;
705
706	/* buffers with junk contents */
707	} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
708		bp->b_flags |= B_INVAL;
709		bp->b_qindex = QUEUE_AGE;
710		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
711		LIST_REMOVE(bp, b_hash);
712		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
713		bp->b_dev = NODEV;
714
715	/* buffers that are locked */
716	} else if (bp->b_flags & B_LOCKED) {
717		bp->b_qindex = QUEUE_LOCKED;
718		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
719
720	/* buffers with stale but valid contents */
721	} else if (bp->b_flags & B_AGE) {
722		bp->b_qindex = QUEUE_AGE;
723		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
724
725	/* buffers with valid and quite potentially reusable contents */
726	} else {
727		bp->b_qindex = QUEUE_LRU;
728		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
729	}
730
731	if ((bp->b_flags & B_INVAL) ||
732		(bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) {
733		if (bp->b_flags & B_DELWRI) {
734			--numdirtybuffers;
735			bp->b_flags &= ~B_DELWRI;
736		}
737		vfs_bio_need_satisfy();
738	}
739
740	/* unlock */
741	bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
742		B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
743	splx(s);
744}
745
746/*
747 * Release a buffer.
748 */
749void
750bqrelse(struct buf * bp)
751{
752	int s;
753
754	s = splbio();
755
756	/* anyone need this block? */
757	if (bp->b_flags & B_WANTED) {
758		bp->b_flags &= ~(B_WANTED | B_AGE);
759		wakeup(bp);
760	}
761
762#if !defined(MAX_PERF)
763	if (bp->b_qindex != QUEUE_NONE)
764		panic("bqrelse: free buffer onto another queue???");
765#endif
766
767	if (bp->b_flags & B_LOCKED) {
768		bp->b_flags &= ~B_ERROR;
769		bp->b_qindex = QUEUE_LOCKED;
770		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
771		/* buffers with stale but valid contents */
772	} else {
773		bp->b_qindex = QUEUE_LRU;
774		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
775	}
776
777	if ((bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) {
778		vfs_bio_need_satisfy();
779	}
780
781	/* unlock */
782	bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
783		B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
784	splx(s);
785}
786
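/*
 * Release the VM pages backing a VMIO buffer: unwire each page, free
 * those that are unbusy and hold no valid data, tear down the buffer's
 * kva mapping, and disassociate the buffer from its vnode.
 */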
787static void
788vfs_vmio_release(bp)
789	struct buf *bp;
790{
791	int i, s;
792	vm_page_t m;
793
794	s = splvm();
795	for (i = 0; i < bp->b_npages; i++) {
796		m = bp->b_pages[i];
797		bp->b_pages[i] = NULL;
798		/*
799		 * In order to keep page LRU ordering consistent, put
800		 * everything on the inactive queue.
801		 */
802		vm_page_unwire(m, 0);
803		/*
804		 * We don't mess with busy pages, it is
805		 * the responsibility of the process that
806		 * busied the pages to deal with them.
807		 */
808		if ((m->flags & PG_BUSY) || (m->busy != 0))
809			continue;
810
811		if (m->wire_count == 0) {
812			vm_page_flag_clear(m, PG_ZERO);
813			/*
814			 * Might as well free the page if we can and it has
815			 * no valid data.
816			 */
817			if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && m->hold_count == 0) {
818				vm_page_busy(m);
819				vm_page_protect(m, VM_PROT_NONE);
820				vm_page_free(m);
821			}
822		}
823	}
824	splx(s);
825	bufspace -= bp->b_bufsize;
826	vmiospace -= bp->b_bufsize;
827	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
828	bp->b_npages = 0;
829	bp->b_bufsize = 0;
830	bp->b_flags &= ~B_VMIO;
831	if (bp->b_vp)
832		brelvp(bp);
833}
834
835/*
836 * Check to see if a block is currently memory resident.
837 */
838struct buf *
839gbincore(struct vnode * vp, daddr_t blkno)
840{
841	struct buf *bp;
842	struct bufhashhdr *bh;
843
844	bh = BUFHASH(vp, blkno);
845	bp = bh->lh_first;
846
847	/* Search hash chain */
848	while (bp != NULL) {
849		/* hit */
850		if (bp->b_vp == vp && bp->b_lblkno == blkno &&
851		    (bp->b_flags & B_INVAL) == 0) {
852			break;
853		}
854		bp = bp->b_hash.le_next;
855	}
856	return (bp);
857}
858
859/*
860 * this routine implements clustered async writes for
861 * clearing out B_DELWRI buffers...  This is much better
862 * than the old way of writing only one buffer at a time.
863 */
864int
865vfs_bio_awrite(struct buf * bp)
866{
867	int i;
868	daddr_t lblkno = bp->b_lblkno;
869	struct vnode *vp = bp->b_vp;
870	int s;
871	int ncl;
872	struct buf *bpa;
873	int nwritten;
874	int size;
875	int maxcl;
876
877	s = splbio();
878	/*
879	 * right now we support clustered writing only to regular files
880	 */
881	if ((vp->v_type == VREG) &&
882	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
883	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
884
885		size = vp->v_mount->mnt_stat.f_iosize;
886		maxcl = MAXPHYS / size;
887
888		for (i = 1; i < maxcl; i++) {
889			if ((bpa = gbincore(vp, lblkno + i)) &&
890			    ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
891			    (B_DELWRI | B_CLUSTEROK)) &&
892			    (bpa->b_bufsize == size)) {
893				if ((bpa->b_blkno == bpa->b_lblkno) ||
894				    (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
895					break;
896			} else {
897				break;
898			}
899		}
900		ncl = i;
901		/*
902		 * this is a possible cluster write
903		 */
904		if (ncl != 1) {
905			nwritten = cluster_wbuild(vp, size, lblkno, ncl);
906			splx(s);
907			return nwritten;
908		}
909	}
910
911	bremfree(bp);
912	bp->b_flags |= B_BUSY | B_ASYNC;
913
914	splx(s);
915	/*
916	 * default (old) behavior, writing out only one block
917	 */
918	nwritten = bp->b_bufsize;
919	(void) VOP_BWRITE(bp);
920	return nwritten;
921}
922
923
924/*
925 * Find a buffer header which is available for use.
926 */
927static struct buf *
928getnewbuf(struct vnode *vp, daddr_t blkno,
929	int slpflag, int slptimeo, int size, int maxsize)
930{
931	struct buf *bp, *bp1;
932	int nbyteswritten = 0;
933	vm_offset_t addr;
934	static int writerecursion = 0;
935
936start:
937	if (bufspace >= maxbufspace)
938		goto trytofreespace;
939
940	/* can we constitute a new buffer? */
941	if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]))) {
942#if !defined(MAX_PERF)
943		if (bp->b_qindex != QUEUE_EMPTY)
944			panic("getnewbuf: inconsistent EMPTY queue, qindex=%d",
945			    bp->b_qindex);
946#endif
947		bp->b_flags |= B_BUSY;
948		bremfree(bp);
949		goto fillbuf;
950	}
951trytofreespace:
952	/*
953	 * We keep the file I/O from hogging metadata I/O
954	 * This is desirable because file data is cached in the
955	 * VM/Buffer cache even if a buffer is freed.
956	 */
957	if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]))) {
958#if !defined(MAX_PERF)
959		if (bp->b_qindex != QUEUE_AGE)
960			panic("getnewbuf: inconsistent AGE queue, qindex=%d",
961			    bp->b_qindex);
962#endif
963	} else if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]))) {
964#if !defined(MAX_PERF)
965		if (bp->b_qindex != QUEUE_LRU)
966			panic("getnewbuf: inconsistent LRU queue, qindex=%d",
967			    bp->b_qindex);
968#endif
969	}
970	if (!bp) {
971		/* wait for a free buffer of any kind */
972		needsbuffer |= VFS_BIO_NEED_ANY;
973		do
974			tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf",
975			    slptimeo);
976		while (needsbuffer & VFS_BIO_NEED_ANY);
977		return (0);
978	}
979
980#if defined(DIAGNOSTIC)
981	if (bp->b_flags & B_BUSY) {
982		panic("getnewbuf: busy buffer on free list\n");
983	}
984#endif
985
986	/*
987	 * We are fairly aggressive about freeing VMIO buffers, but since
988	 * the buffering is intact without buffer headers, there is not
989	 * much loss.  We gain by maintaining non-VMIOed metadata in buffers.
990	 */
991	if ((bp->b_qindex == QUEUE_LRU) && (bp->b_usecount > 0)) {
992		if ((bp->b_flags & B_VMIO) == 0 ||
993			(vmiospace < maxvmiobufspace)) {
994			--bp->b_usecount;
995			TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist);
996			if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) {
997				TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
998				goto start;
999			}
1000			TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
1001		}
1002	}
1003
1004
1005	/* if we are a delayed write, convert to an async write */
1006	if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
1007
1008		/*
1009		 * If our delayed write is likely to be used soon, then
1010		 * recycle back onto the LRU queue.
1011		 */
1012		if (vp && (bp->b_vp == vp) && (bp->b_qindex == QUEUE_LRU) &&
1013			(bp->b_lblkno >= blkno) && (maxsize > 0)) {
1014
1015			if (bp->b_usecount > 0) {
1016				if (bp->b_lblkno < blkno + (MAXPHYS / maxsize)) {
1017
1018					TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist);
1019
1020					if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) {
1021						TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
1022						bp->b_usecount--;
1023						goto start;
1024					}
1025					TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
1026				}
1027			}
1028		}
1029
1030		/*
1031		 * Certain layered filesystems can recursively re-enter the vfs_bio
1032		 * code, due to delayed writes.  This helps keep the system from
1033		 * deadlocking.
1034		 */
1035		if (writerecursion > 0) {
1036			if (writerecursion > 5) {
1037				bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]);
1038				while (bp) {
1039					if ((bp->b_flags & B_DELWRI) == 0)
1040						break;
1041					bp = TAILQ_NEXT(bp, b_freelist);
1042				}
1043				if (bp == NULL) {
1044					bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]);
1045					while (bp) {
1046						if ((bp->b_flags & B_DELWRI) == 0)
1047							break;
1048						bp = TAILQ_NEXT(bp, b_freelist);
1049					}
1050				}
1051				if (bp == NULL)
1052					panic("getnewbuf: cannot get buffer, infinite recursion failure");
1053			} else {
1054				bremfree(bp);
1055				bp->b_flags |= B_BUSY | B_AGE | B_ASYNC;
1056				nbyteswritten += bp->b_bufsize;
1057				++writerecursion;
1058				VOP_BWRITE(bp);
1059				--writerecursion;
1060				if (!slpflag && !slptimeo) {
1061					return (0);
1062				}
1063				goto start;
1064			}
1065		} else {
1066			++writerecursion;
1067			nbyteswritten += vfs_bio_awrite(bp);
1068			--writerecursion;
1069			if (!slpflag && !slptimeo) {
1070				return (0);
1071			}
1072			goto start;
1073		}
1074	}
1075
1076	if (bp->b_flags & B_WANTED) {
1077		bp->b_flags &= ~B_WANTED;
1078		wakeup(bp);
1079	}
1080	bremfree(bp);
1081	bp->b_flags |= B_BUSY;
1082
1083	if (bp->b_flags & B_VMIO) {
1084		bp->b_flags &= ~B_ASYNC;
1085		vfs_vmio_release(bp);
1086	}
1087
1088	if (bp->b_vp)
1089		brelvp(bp);
1090
1091fillbuf:
1092
1093	/* we are not free, nor do we contain interesting data */
1094	if (bp->b_rcred != NOCRED) {
1095		crfree(bp->b_rcred);
1096		bp->b_rcred = NOCRED;
1097	}
1098	if (bp->b_wcred != NOCRED) {
1099		crfree(bp->b_wcred);
1100		bp->b_wcred = NOCRED;
1101	}
1102	if (LIST_FIRST(&bp->b_dep) != NULL &&
1103	    bioops.io_deallocate)
1104		(*bioops.io_deallocate)(bp);
1105
1106	LIST_REMOVE(bp, b_hash);
1107	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
1108	if (bp->b_bufsize) {
1109		allocbuf(bp, 0);
1110	}
1111	bp->b_flags = B_BUSY;
1112	bp->b_dev = NODEV;
1113	bp->b_vp = NULL;
1114	bp->b_blkno = bp->b_lblkno = 0;
1115	bp->b_offset = NOOFFSET;
1116	bp->b_iodone = 0;
1117	bp->b_error = 0;
1118	bp->b_resid = 0;
1119	bp->b_bcount = 0;
1120	bp->b_npages = 0;
1121	bp->b_dirtyoff = bp->b_dirtyend = 0;
1122	bp->b_validoff = bp->b_validend = 0;
1123	bp->b_usecount = 5;
1124	/* Here, not kern_physio.c, is where this should be done */
1125	LIST_INIT(&bp->b_dep);
1126
1127	maxsize = (maxsize + PAGE_MASK) & ~PAGE_MASK;
1128
1129	/*
1130	 * we assume that buffer_map is not at address 0
1131	 */
1132	addr = 0;
1133	if (maxsize != bp->b_kvasize) {
1134		bfreekva(bp);
1135
1136findkvaspace:
1137		/*
1138		 * See if we have buffer kva space
1139		 */
1140		if (vm_map_findspace(buffer_map,
1141			vm_map_min(buffer_map), maxsize, &addr)) {
1142			if (kvafreespace > 0) {
1143				int totfree = 0, freed;
1144				do {
1145					freed = 0;
1146					for (bp1 = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
1147						bp1 != NULL; bp1 = TAILQ_NEXT(bp1, b_freelist)) {
1148						if (bp1->b_kvasize != 0) {
1149							totfree += bp1->b_kvasize;
1150							freed = bp1->b_kvasize;
1151							bremfree(bp1);
1152							bfreekva(bp1);
1153							brelse(bp1);
1154							break;
1155						}
1156					}
1157				} while (freed);
1158				/*
1159				 * if we found free space, then retry with the same buffer.
1160				 */
1161				if (totfree)
1162					goto findkvaspace;
1163			}
1164			bp->b_flags |= B_INVAL;
1165			brelse(bp);
1166			goto trytofreespace;
1167		}
1168	}
1169
1170	/*
1171	 * See if we are below are allocated minimum
1172	 */
1173	if (bufspace >= (maxbufspace + nbyteswritten)) {
1174		bp->b_flags |= B_INVAL;
1175		brelse(bp);
1176		goto trytofreespace;
1177	}
1178
1179	/*
1180	 * create a map entry for the buffer -- in essence
1181	 * reserving the kva space.
1182	 */
1183	if (addr) {
1184		vm_map_insert(buffer_map, NULL, 0,
1185			addr, addr + maxsize,
1186			VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
1187
1188		bp->b_kvabase = (caddr_t) addr;
1189		bp->b_kvasize = maxsize;
1190	}
1191	bp->b_data = bp->b_kvabase;
1192
1193	return (bp);
1194}
1195
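/*
 * Called (from getblk) when numfreebuffers has dropped below
 * lofreebuffers: flush dirty buffers and wait for the free buffer
 * count to recover.
 */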
1196static void
1197waitfreebuffers(int slpflag, int slptimeo) {
1198	while (numfreebuffers < hifreebuffers) {
1199		flushdirtybuffers(slpflag, slptimeo);
1200		if (numfreebuffers < hifreebuffers)
1201			break;
1202		needsbuffer |= VFS_BIO_NEED_FREE;
1203		if (tsleep(&needsbuffer, (PRIBIO + 4)|slpflag, "biofre", slptimeo))
1204			break;
1205	}
1206}
1207
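/*
 * Write out delayed-write buffers until numdirtybuffers drops back to
 * lodirtybuffers.  Only one process flushes at a time: a recursive call
 * by the flusher returns immediately, and other callers wait for the
 * current flush to complete.
 */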
1208static void
1209flushdirtybuffers(int slpflag, int slptimeo) {
1210	int s;
1211	static pid_t flushing = 0;
1212
1213	s = splbio();
1214
1215	if (flushing) {
1216		if (flushing == curproc->p_pid) {
1217			splx(s);
1218			return;
1219		}
1220		while (flushing) {
1221			if (tsleep(&flushing, (PRIBIO + 4)|slpflag, "biofls", slptimeo)) {
1222				splx(s);
1223				return;
1224			}
1225		}
1226	}
1227	flushing = curproc->p_pid;
1228
1229	while (numdirtybuffers > lodirtybuffers) {
1230		struct buf *bp;
1231		needsbuffer |= VFS_BIO_NEED_LOWLIMIT;
1232		bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]);
1233		if (bp == NULL)
1234			bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]);
1235
1236		while (bp && ((bp->b_flags & B_DELWRI) == 0)) {
1237			bp = TAILQ_NEXT(bp, b_freelist);
1238		}
1239
1240		if (bp) {
1241			vfs_bio_awrite(bp);
1242			continue;
1243		}
1244		break;
1245	}
1246
1247	flushing = 0;
1248	wakeup(&flushing);
1249	splx(s);
1250}
1251
1252/*
1253 * Check to see if a block is currently memory resident.
1254 */
1255struct buf *
1256incore(struct vnode * vp, daddr_t blkno)
1257{
1258	struct buf *bp;
1259
1260	int s = splbio();
1261	bp = gbincore(vp, blkno);
1262	splx(s);
1263	return (bp);
1264}
1265
1266/*
1267 * Returns true if no I/O is needed to access the
1268 * associated VM object.  This is like incore except
1269 * it also hunts around in the VM system for the data.
1270 */
1271
1272int
1273inmem(struct vnode * vp, daddr_t blkno)
1274{
1275	vm_object_t obj;
1276	vm_offset_t toff, tinc;
1277	vm_page_t m;
1278	vm_ooffset_t off;
1279
1280	if (incore(vp, blkno))
1281		return 1;
1282	if (vp->v_mount == NULL)
1283		return 0;
1284	if ((vp->v_object == NULL) || (vp->v_flag & VOBJBUF) == 0)
1285		return 0;
1286
1287	obj = vp->v_object;
1288	tinc = PAGE_SIZE;
1289	if (tinc > vp->v_mount->mnt_stat.f_iosize)
1290		tinc = vp->v_mount->mnt_stat.f_iosize;
1291	off = blkno * vp->v_mount->mnt_stat.f_iosize;
1292
1293	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
1294
1295		m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
1296		if (!m)
1297			return 0;
1298		if (vm_page_is_valid(m,
1299		    (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0)
1300			return 0;
1301	}
1302	return 1;
1303}
1304
1305/*
1306 * now we set the dirty range for the buffer --
1307 * for NFS -- if the file is mapped and pages have
1308 * been written to, let it know.  We want the
1309 * entire range of the buffer to be marked dirty if
1310 * any of the pages have been written to for consistency
1311 * with the b_validoff, b_validend set in the nfs write
1312 * code, and used by the nfs read code.
1313 */
1314static void
1315vfs_setdirty(struct buf *bp) {
1316	int i;
1317	vm_object_t object;
1318	vm_offset_t boffset, offset;
1319	/*
1320	 * We qualify the scan for modified pages on whether the
1321	 * object has been flushed yet.  The OBJ_WRITEABLE flag
1322	 * is not cleared simply by protecting pages off.
1323	 */
1324	if ((bp->b_flags & B_VMIO) &&
1325		((object = bp->b_pages[0]->object)->flags & (OBJ_WRITEABLE|OBJ_CLEANING))) {
1326		/*
1327		 * test the pages to see if they have been modified directly
1328		 * by users through the VM system.
1329		 */
1330		for (i = 0; i < bp->b_npages; i++) {
1331			vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
1332			vm_page_test_dirty(bp->b_pages[i]);
1333		}
1334
1335		/*
1336		 * scan forwards for the first page modified
1337		 */
1338		for (i = 0; i < bp->b_npages; i++) {
1339			if (bp->b_pages[i]->dirty) {
1340				break;
1341			}
1342		}
1343		boffset = (i << PAGE_SHIFT);
1344		if (boffset < bp->b_dirtyoff) {
1345			bp->b_dirtyoff = boffset;
1346		}
1347
1348		/*
1349		 * scan backwards for the last page modified
1350		 */
1351		for (i = bp->b_npages - 1; i >= 0; --i) {
1352			if (bp->b_pages[i]->dirty) {
1353				break;
1354			}
1355		}
1356		boffset = (i + 1);
1357		offset = boffset + bp->b_pages[0]->pindex;
1358		if (offset >= object->size)
1359			boffset = object->size - bp->b_pages[0]->pindex;
1360		if (bp->b_dirtyend < (boffset << PAGE_SHIFT))
1361			bp->b_dirtyend = (boffset << PAGE_SHIFT);
1362	}
1363}
1364
1365/*
1366 * Get a block given a specified block and offset into a file/device.
1367 */
1368struct buf *
1369getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
1370{
1371	struct buf *bp;
1372	int i, s;
1373	struct bufhashhdr *bh;
1374	int maxsize;
1375	int checksize;
1376
1377	if (vp->v_mount) {
1378		maxsize = vp->v_mount->mnt_stat.f_iosize;
1379		/*
1380		 * This happens on mount points.
1381		 */
1382		if (maxsize < size)
1383			maxsize = size;
1384	} else {
1385		maxsize = size;
1386	}
1387
1388#if !defined(MAX_PERF)
1389	if (size > MAXBSIZE)
1390		panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
1391#endif
1392
1393	s = splbio();
1394loop:
1395	if (numfreebuffers < lofreebuffers) {
1396		waitfreebuffers(slpflag, slptimeo);
1397	}
1398
1399	if ((bp = gbincore(vp, blkno))) {
1400		if (bp->b_flags & B_BUSY) {
1401
1402			bp->b_flags |= B_WANTED;
1403			if (bp->b_usecount < BUF_MAXUSE)
1404				++bp->b_usecount;
1405
1406			if (!tsleep(bp,
1407				(PRIBIO + 4) | slpflag, "getblk", slptimeo)) {
1408				goto loop;
1409			}
1410
1411			splx(s);
1412			return (struct buf *) NULL;
1413		}
1414		bp->b_flags |= B_BUSY | B_CACHE;
1415		bremfree(bp);
1416
1417		/*
1418		 * check for size inconsistencies (note that they shouldn't
1419		 * happen but do when filesystems don't handle the size changes
1420		 * correctly.) We are conservative on metadata and don't just
1421		 * extend the buffer but write (if needed) and re-constitute it.
1422		 */
1423
1424		if (bp->b_bcount != size) {
1425			if ((bp->b_flags & B_VMIO) && (size <= bp->b_kvasize)) {
1426				allocbuf(bp, size);
1427			} else {
1428				if (bp->b_flags & B_DELWRI) {
1429					bp->b_flags |= B_NOCACHE;
1430					VOP_BWRITE(bp);
1431				} else {
1432					if ((bp->b_flags & B_VMIO) &&
1433					   (LIST_FIRST(&bp->b_dep) == NULL)) {
1434						bp->b_flags |= B_RELBUF;
1435						brelse(bp);
1436					} else {
1437						bp->b_flags |= B_NOCACHE;
1438						VOP_BWRITE(bp);
1439					}
1440				}
1441				goto loop;
1442			}
1443		}
1444
1445#ifdef DIAGNOSTIC
1446		if (bp->b_offset == NOOFFSET)
1447			panic("getblk: no buffer offset");
1448#endif
1449
1450		/*
1451		 * Check that the constituted buffer really deserves the
1452		 * B_CACHE bit to be set.  B_VMIO type buffers might not
1453		 * contain fully valid pages.  Normal (old-style) buffers
1454		 * should be fully valid.
1455		 */
1456		if (bp->b_flags & B_VMIO) {
1457			checksize = bp->b_bufsize;
1458			for (i = 0; i < bp->b_npages; i++) {
1459				int resid;
1460				int poffset;
1461				poffset = bp->b_offset & PAGE_MASK;
1462				resid = (checksize > (PAGE_SIZE - poffset)) ?
1463					(PAGE_SIZE - poffset) : checksize;
1464				if (!vm_page_is_valid(bp->b_pages[i], poffset, resid)) {
1465					bp->b_flags &= ~(B_CACHE | B_DONE);
1466					break;
1467				}
1468				checksize -= resid;
1469			}
1470		}
1471
1472		if (bp->b_usecount < BUF_MAXUSE)
1473			++bp->b_usecount;
1474		splx(s);
1475		return (bp);
1476	} else {
1477		vm_object_t obj;
1478
1479		if ((bp = getnewbuf(vp, blkno,
1480			slpflag, slptimeo, size, maxsize)) == 0) {
1481			if (slpflag || slptimeo) {
1482				splx(s);
1483				return NULL;
1484			}
1485			goto loop;
1486		}
1487
1488		/*
1489		 * This code is used to make sure that a buffer is not
1490		 * created while the getnewbuf routine is blocked.
1491		 * Normally the vnode is locked so this isn't a problem.
1492		 * VBLK type I/O requests, however, don't lock the vnode.
1493		 */
1494		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE && gbincore(vp, blkno)) {
1495			bp->b_flags |= B_INVAL;
1496			brelse(bp);
1497			goto loop;
1498		}
1499
1500		/*
1501		 * Insert the buffer into the hash, so that it can
1502		 * be found by incore.
1503		 */
1504		bp->b_blkno = bp->b_lblkno = blkno;
1505
1506		if (vp->v_type != VBLK)
1507			bp->b_offset = (off_t) blkno * maxsize;
1508		else
1509			bp->b_offset = (off_t) blkno * DEV_BSIZE;
1510
1511		bgetvp(vp, bp);
1512		LIST_REMOVE(bp, b_hash);
1513		bh = BUFHASH(vp, blkno);
1514		LIST_INSERT_HEAD(bh, bp, b_hash);
1515
1516		if ((obj = vp->v_object) && (vp->v_flag & VOBJBUF)) {
1517			bp->b_flags |= (B_VMIO | B_CACHE);
1518#if defined(VFS_BIO_DEBUG)
1519			if (vp->v_type != VREG && vp->v_type != VBLK)
1520				printf("getblk: vmioing file type %d???\n", vp->v_type);
1521#endif
1522		} else {
1523			bp->b_flags &= ~B_VMIO;
1524		}
1525
1526		allocbuf(bp, size);
1527
1528		splx(s);
1529		return (bp);
1530	}
1531}
1532
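/*
 * A minimal sketch of the common getblk() pattern for a block the caller
 * intends to overwrite completely, so no read is needed.  lblkno and
 * bsize are placeholders for the caller's own values:
 *
 *	bp = getblk(vp, lblkno, bsize, 0, 0);
 *	bzero(bp->b_data, bsize);
 *	... fill in bp->b_data ...
 *	bdwrite(bp);
 */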
1533/*
1534 * Get an empty, disassociated buffer of given size.
1535 */
1536struct buf *
1537geteblk(int size)
1538{
1539	struct buf *bp;
1540	int s;
1541
1542	s = splbio();
1543	while ((bp = getnewbuf(0, (daddr_t) 0, 0, 0, size, MAXBSIZE)) == 0);
1544	splx(s);
1545	allocbuf(bp, size);
1546	bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */
1547	return (bp);
1548}
1549
1550
1551/*
1552 * This code constitutes the buffer memory from either anonymous system
1553 * memory (in the case of non-VMIO operations) or from an associated
1554 * VM object (in the case of VMIO operations).
1555 *
1556 * Note that this code is tricky, and has many complications to resolve
1557 * deadlock or inconsistent data situations.  Tread lightly!!!
1558 *
1559 * Modify the length of a buffer's underlying buffer storage without
1560 * destroying information (unless, of course the buffer is shrinking).
1561 */
1562int
1563allocbuf(struct buf * bp, int size)
1564{
1565
1566	int s;
1567	int newbsize, mbsize;
1568	int i;
1569
1570#if !defined(MAX_PERF)
1571	if (!(bp->b_flags & B_BUSY))
1572		panic("allocbuf: buffer not busy");
1573
1574	if (bp->b_kvasize < size)
1575		panic("allocbuf: buffer too small");
1576#endif
1577
1578	if ((bp->b_flags & B_VMIO) == 0) {
1579		caddr_t origbuf;
1580		int origbufsize;
1581		/*
1582		 * Just get anonymous memory from the kernel
1583		 */
1584		mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
1585#if !defined(NO_B_MALLOC)
1586		if (bp->b_flags & B_MALLOC)
1587			newbsize = mbsize;
1588		else
1589#endif
1590			newbsize = round_page(size);
1591
1592		if (newbsize < bp->b_bufsize) {
1593#if !defined(NO_B_MALLOC)
1594			/*
1595			 * malloced buffers are not shrunk
1596			 */
1597			if (bp->b_flags & B_MALLOC) {
1598				if (newbsize) {
1599					bp->b_bcount = size;
1600				} else {
1601					free(bp->b_data, M_BIOBUF);
1602					bufspace -= bp->b_bufsize;
1603					bufmallocspace -= bp->b_bufsize;
1604					bp->b_data = bp->b_kvabase;
1605					bp->b_bufsize = 0;
1606					bp->b_bcount = 0;
1607					bp->b_flags &= ~B_MALLOC;
1608				}
1609				return 1;
1610			}
1611#endif
1612			vm_hold_free_pages(
1613			    bp,
1614			    (vm_offset_t) bp->b_data + newbsize,
1615			    (vm_offset_t) bp->b_data + bp->b_bufsize);
1616		} else if (newbsize > bp->b_bufsize) {
1617#if !defined(NO_B_MALLOC)
1618			/*
1619			 * We only use malloced memory on the first allocation.
1620			 * and revert to page-allocated memory when the buffer grows.
1621			 */
1622			if ( (bufmallocspace < maxbufmallocspace) &&
1623				(bp->b_bufsize == 0) &&
1624				(mbsize <= PAGE_SIZE/2)) {
1625
1626				bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
1627				bp->b_bufsize = mbsize;
1628				bp->b_bcount = size;
1629				bp->b_flags |= B_MALLOC;
1630				bufspace += mbsize;
1631				bufmallocspace += mbsize;
1632				return 1;
1633			}
1634#endif
1635			origbuf = NULL;
1636			origbufsize = 0;
1637#if !defined(NO_B_MALLOC)
1638			/*
1639			 * If the buffer is growing on its other-than-first allocation,
1640			 * then we revert to the page-allocation scheme.
1641			 */
1642			if (bp->b_flags & B_MALLOC) {
1643				origbuf = bp->b_data;
1644				origbufsize = bp->b_bufsize;
1645				bp->b_data = bp->b_kvabase;
1646				bufspace -= bp->b_bufsize;
1647				bufmallocspace -= bp->b_bufsize;
1648				bp->b_bufsize = 0;
1649				bp->b_flags &= ~B_MALLOC;
1650				newbsize = round_page(newbsize);
1651			}
1652#endif
1653			vm_hold_load_pages(
1654			    bp,
1655			    (vm_offset_t) bp->b_data + bp->b_bufsize,
1656			    (vm_offset_t) bp->b_data + newbsize);
1657#if !defined(NO_B_MALLOC)
1658			if (origbuf) {
1659				bcopy(origbuf, bp->b_data, origbufsize);
1660				free(origbuf, M_BIOBUF);
1661			}
1662#endif
1663		}
1664	} else {
1665		vm_page_t m;
1666		int desiredpages;
1667
1668		newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
1669		desiredpages = (round_page(newbsize) >> PAGE_SHIFT);
1670
1671#if !defined(NO_B_MALLOC)
1672		if (bp->b_flags & B_MALLOC)
1673			panic("allocbuf: VMIO buffer can't be malloced");
1674#endif
1675
1676		if (newbsize < bp->b_bufsize) {
1677			if (desiredpages < bp->b_npages) {
1678				for (i = desiredpages; i < bp->b_npages; i++) {
1679					/*
1680					 * the page is not freed here -- it
1681					 * is the responsibility of vnode_pager_setsize
1682					 */
1683					m = bp->b_pages[i];
1684#if defined(DIAGNOSTIC)
1685					if (m == bogus_page)
1686						panic("allocbuf: bogus page found");
1687#endif
1688					vm_page_sleep(m, "biodep", &m->busy);
1689
1690					bp->b_pages[i] = NULL;
1691					vm_page_unwire(m, 0);
1692				}
1693				pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) +
1694				    (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
1695				bp->b_npages = desiredpages;
1696			}
1697		} else if (newbsize > bp->b_bufsize) {
1698			vm_object_t obj;
1699			vm_offset_t tinc, toff;
1700			vm_ooffset_t off;
1701			vm_pindex_t objoff;
1702			int pageindex, curbpnpages;
1703			struct vnode *vp;
1704			int bsize;
1705			int orig_validoff = bp->b_validoff;
1706			int orig_validend = bp->b_validend;
1707
1708			vp = bp->b_vp;
1709
1710			if (vp->v_type == VBLK)
1711				bsize = DEV_BSIZE;
1712			else
1713				bsize = vp->v_mount->mnt_stat.f_iosize;
1714
1715			if (bp->b_npages < desiredpages) {
1716				obj = vp->v_object;
1717				tinc = PAGE_SIZE;
1718				if (tinc > bsize)
1719					tinc = bsize;
1720
1721				off = bp->b_offset;
1722#ifdef DIAGNOSTIC
1723				if (bp->b_offset == NOOFFSET)
1724					panic("allocbuf: no buffer offset");
1725#endif
1726
1727				curbpnpages = bp->b_npages;
1728		doretry:
1729				bp->b_validoff = orig_validoff;
1730				bp->b_validend = orig_validend;
1731				bp->b_flags |= B_CACHE;
1732				for (toff = 0; toff < newbsize; toff += tinc) {
1733					int bytesinpage;
1734
1735					pageindex = toff >> PAGE_SHIFT;
1736					objoff = OFF_TO_IDX(off + toff);
1737					if (pageindex < curbpnpages) {
1738
1739						m = bp->b_pages[pageindex];
1740#ifdef VFS_BIO_DIAG
1741						if (m->pindex != objoff)
1742							panic("allocbuf: page changed offset?!!!?");
1743#endif
1744						bytesinpage = tinc;
1745						if (tinc > (newbsize - toff))
1746							bytesinpage = newbsize - toff;
1747						if (bp->b_flags & B_CACHE)
1748							vfs_buf_set_valid(bp, off, toff, bytesinpage, m);
1749						continue;
1750					}
1751					m = vm_page_lookup(obj, objoff);
1752					if (!m) {
1753						m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
1754						if (!m) {
1755							VM_WAIT;
1756							vm_pageout_deficit += (desiredpages - bp->b_npages);
1757							goto doretry;
1758						}
1759
1760						vm_page_wire(m);
1761						vm_page_flag_clear(m, PG_BUSY);
1762						bp->b_flags &= ~B_CACHE;
1763
1764					} else if (m->flags & PG_BUSY) {
1765						s = splvm();
1766						if (m->flags & PG_BUSY) {
1767							vm_page_flag_set(m, PG_WANTED);
1768							tsleep(m, PVM, "pgtblk", 0);
1769						}
1770						splx(s);
1771						goto doretry;
1772					} else {
1773						if ((curproc != pageproc) &&
1774							((m->queue - m->pc) == PQ_CACHE) &&
1775						    ((cnt.v_free_count + cnt.v_cache_count) <
1776								(cnt.v_free_min + cnt.v_cache_min))) {
1777							pagedaemon_wakeup();
1778						}
1779						bytesinpage = tinc;
1780						if (tinc > (newbsize - toff))
1781							bytesinpage = newbsize - toff;
1782						if (bp->b_flags & B_CACHE)
1783							vfs_buf_set_valid(bp, off, toff, bytesinpage, m);
1784						vm_page_flag_clear(m, PG_ZERO);
1785						vm_page_wire(m);
1786					}
1787					bp->b_pages[pageindex] = m;
1788					curbpnpages = pageindex + 1;
1789				}
1790				if (vp->v_tag == VT_NFS &&
1791				    vp->v_type != VBLK) {
1792					if (bp->b_dirtyend > 0) {
1793						bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
1794						bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
1795					}
1796					if (bp->b_validend == 0)
1797						bp->b_flags &= ~B_CACHE;
1798				}
1799				bp->b_data = (caddr_t) trunc_page((vm_offset_t)bp->b_data);
1800				bp->b_npages = curbpnpages;
1801				pmap_qenter((vm_offset_t) bp->b_data,
1802					bp->b_pages, bp->b_npages);
1803				((vm_offset_t) bp->b_data) |= off & PAGE_MASK;
1804			}
1805		}
1806	}
1807	if (bp->b_flags & B_VMIO)
1808		vmiospace += (newbsize - bp->b_bufsize);
1809	bufspace += (newbsize - bp->b_bufsize);
1810	bp->b_bufsize = newbsize;
1811	bp->b_bcount = size;
1812	return 1;
1813}
1814
1815/*
1816 * Wait for buffer I/O completion, returning error status.
1817 */
1818int
1819biowait(register struct buf * bp)
1820{
1821	int s;
1822
1823	s = splbio();
1824	while ((bp->b_flags & B_DONE) == 0)
1825#if defined(NO_SCHEDULE_MODS)
1826		tsleep(bp, PRIBIO, "biowait", 0);
1827#else
1828		if (bp->b_flags & B_READ)
1829			tsleep(bp, PRIBIO, "biord", 0);
1830		else
1831			tsleep(bp, PRIBIO, "biowr", 0);
1832#endif
1833	splx(s);
1834	if (bp->b_flags & B_EINTR) {
1835		bp->b_flags &= ~B_EINTR;
1836		return (EINTR);
1837	}
1838	if (bp->b_flags & B_ERROR) {
1839		return (bp->b_error ? bp->b_error : EIO);
1840	} else {
1841		return (0);
1842	}
1843}
1844
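/*
 * The synchronous I/O handshake around biowait()/biodone(), as used by
 * bread() above: the caller queues the buffer with VOP_STRATEGY() and
 * sleeps in biowait(); the driver's completion path calls biodone(),
 * which wakes the sleeper:
 *
 *	bp->b_flags |= B_READ;
 *	bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
 *	vfs_busy_pages(bp, 0);
 *	VOP_STRATEGY(vp, bp);
 *	error = biowait(bp);
 */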
1845/*
1846 * Finish I/O on a buffer, calling an optional function.
1847 * This is usually called from interrupt level, so process blocking
1848 * is not *a good idea*.
1849 */
1850void
1851biodone(register struct buf * bp)
1852{
1853	int s;
1854
1855	s = splbio();
1856
1857#if !defined(MAX_PERF)
1858	if (!(bp->b_flags & B_BUSY))
1859		panic("biodone: buffer not busy");
1860#endif
1861
1862	if (bp->b_flags & B_DONE) {
1863		splx(s);
1864#if !defined(MAX_PERF)
1865		printf("biodone: buffer already done\n");
1866#endif
1867		return;
1868	}
1869	bp->b_flags |= B_DONE;
1870
1871	if (bp->b_flags & B_FREEBUF) {
1872		brelse(bp);
1873		splx(s);
1874		return;
1875	}
1876
1877	if ((bp->b_flags & B_READ) == 0) {
1878		vwakeup(bp);
1879	}
1880
1881	/* call optional completion function if requested */
1882	if (bp->b_flags & B_CALL) {
1883		bp->b_flags &= ~B_CALL;
1884		(*bp->b_iodone) (bp);
1885		splx(s);
1886		return;
1887	}
1888	if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_complete)
1889		(*bioops.io_complete)(bp);
1890
1891	if (bp->b_flags & B_VMIO) {
1892		int i, resid;
1893		vm_ooffset_t foff;
1894		vm_page_t m;
1895		vm_object_t obj;
1896		int iosize;
1897		struct vnode *vp = bp->b_vp;
1898
1899		obj = vp->v_object;
1900
1901#if defined(VFS_BIO_DEBUG)
1902		if (vp->v_usecount == 0) {
1903			panic("biodone: zero vnode ref count");
1904		}
1905
1906		if (vp->v_object == NULL) {
1907			panic("biodone: missing VM object");
1908		}
1909
1910		if ((vp->v_flag & VOBJBUF) == 0) {
1911			panic("biodone: vnode is not setup for merged cache");
1912		}
1913#endif
1914
1915		foff = bp->b_offset;
1916#ifdef DIAGNOSTIC
1917		if (bp->b_offset == NOOFFSET)
1918			panic("biodone: no buffer offset");
1919#endif
1920
1921#if !defined(MAX_PERF)
1922		if (!obj) {
1923			panic("biodone: no object");
1924		}
1925#endif
1926#if defined(VFS_BIO_DEBUG)
1927		if (obj->paging_in_progress < bp->b_npages) {
1928			printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
1929			    obj->paging_in_progress, bp->b_npages);
1930		}
1931#endif
1932		iosize = bp->b_bufsize;
1933		for (i = 0; i < bp->b_npages; i++) {
1934			int bogusflag = 0;
1935			m = bp->b_pages[i];
1936			if (m == bogus_page) {
1937				bogusflag = 1;
1938				m = vm_page_lookup(obj, OFF_TO_IDX(foff));
1939				if (!m) {
1940#if defined(VFS_BIO_DEBUG)
1941					printf("biodone: page disappeared\n");
1942#endif
1943					vm_object_pip_subtract(obj, 1);
1944					continue;
1945				}
1946				bp->b_pages[i] = m;
1947				pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
1948			}
1949#if defined(VFS_BIO_DEBUG)
1950			if (OFF_TO_IDX(foff) != m->pindex) {
1951				printf("biodone: foff(%d)/m->pindex(%d) mismatch\n", foff, m->pindex);
1952			}
1953#endif
1954			resid = IDX_TO_OFF(m->pindex + 1) - foff;
1955			if (resid > iosize)
1956				resid = iosize;
1957
1958			/*
1959			 * In the write case, the valid and clean bits are
1960			 * already changed correctly, so we only need to do this
1961			 * here in the read case.
1962			 */
1963			if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
1964				vfs_page_set_valid(bp, foff, i, m);
1965			}
1966			vm_page_flag_clear(m, PG_ZERO);
1967
1968			/*
1969			 * when debugging new filesystems or buffer I/O methods, this
1970			 * is the most common error that pops up.  if you see this, you
1971			 * have not set the page busy flag correctly!!!
1972			 */
1973			if (m->busy == 0) {
1974#if !defined(MAX_PERF)
1975				printf("biodone: page busy < 0, "
1976				    "pindex: %d, foff: 0x(%x,%x), "
1977				    "resid: %d, index: %d\n",
1978				    (int) m->pindex, (int)(foff >> 32),
1979						(int) foff & 0xffffffff, resid, i);
1980#endif
1981				if (vp->v_type != VBLK)
1982#if !defined(MAX_PERF)
1983					printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n",
1984					    bp->b_vp->v_mount->mnt_stat.f_iosize,
1985					    (int) bp->b_lblkno,
1986					    bp->b_flags, bp->b_npages);
1987				else
1988					printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n",
1989					    (int) bp->b_lblkno,
1990					    bp->b_flags, bp->b_npages);
1991				printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n",
1992				    m->valid, m->dirty, m->wire_count);
1993#endif
1994				panic("biodone: page busy < 0\n");
1995			}
1996			vm_page_io_finish(m);
1997			vm_object_pip_subtract(obj, 1);
1998			foff += resid;
1999			iosize -= resid;
2000		}
2001		if (obj &&
2002			(obj->paging_in_progress == 0) &&
2003		    (obj->flags & OBJ_PIPWNT)) {
2004			vm_object_clear_flag(obj, OBJ_PIPWNT);
2005			wakeup(obj);
2006		}
2007	}
2008	/*
2009	 * For asynchronous completions, release the buffer now. The brelse
2010	 * checks for B_WANTED and will do the wakeup there if necessary - so
2011	 * no need to do a wakeup here in the async case.
2012	 */
2013
2014	if (bp->b_flags & B_ASYNC) {
2015		if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0)
2016			brelse(bp);
2017		else
2018			bqrelse(bp);
2019	} else {
2020		bp->b_flags &= ~B_WANTED;
2021		wakeup(bp);
2022	}
2023	splx(s);
2024}
2025
2026#if 0	/* not with kirks code */
2027static int vfs_update_interval = 30;
2028
2029static void
2030vfs_update()
2031{
2032	while (1) {
2033		tsleep(&vfs_update_wakeup, PUSER, "update",
2034		    hz * vfs_update_interval);
2035		vfs_update_wakeup = 0;
2036		sync(curproc, NULL);
2037	}
2038}
2039
2040static int
2041sysctl_kern_updateinterval SYSCTL_HANDLER_ARGS
2042{
2043	int error = sysctl_handle_int(oidp,
2044		oidp->oid_arg1, oidp->oid_arg2, req);
2045	if (!error)
2046		wakeup(&vfs_update_wakeup);
2047	return error;
2048}
2049
2050SYSCTL_PROC(_kern, KERN_UPDATEINTERVAL, update, CTLTYPE_INT|CTLFLAG_RW,
2051	&vfs_update_interval, 0, sysctl_kern_updateinterval, "I", "");
2052
2053#endif
2054
2055
2056/*
2057 * This routine is called in lieu of iodone in the case of
2058 * incomplete I/O.  This keeps the busy status for pages
2059 * consistent.
2060 */
2061void
2062vfs_unbusy_pages(struct buf * bp)
2063{
2064	int i;
2065
2066	if (bp->b_flags & B_VMIO) {
2067		struct vnode *vp = bp->b_vp;
2068		vm_object_t obj = vp->v_object;
2069
2070		for (i = 0; i < bp->b_npages; i++) {
2071			vm_page_t m = bp->b_pages[i];
2072
2073			if (m == bogus_page) {
2074				m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
2075#if !defined(MAX_PERF)
2076				if (!m) {
2077					panic("vfs_unbusy_pages: page missing\n");
2078				}
2079#endif
2080				bp->b_pages[i] = m;
2081				pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
2082			}
2083			vm_object_pip_subtract(obj, 1);
2084			vm_page_flag_clear(m, PG_ZERO);
2085			vm_page_io_finish(m);
2086		}
2087		if (obj->paging_in_progress == 0 &&
2088		    (obj->flags & OBJ_PIPWNT)) {
2089			vm_object_clear_flag(obj, OBJ_PIPWNT);
2090			wakeup(obj);
2091		}
2092	}
2093}
2094
2095/*
2096 * Set NFS' b_validoff and b_validend fields from the valid bits
2097 * of a page.  If the consumer is not NFS, and the page is not
2098 * valid for the entire range, clear the B_CACHE flag to force
2099 * the consumer to re-read the page.
2100 */
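/*
 * A rough worked example, assuming DEV_BSIZE is 512: with m->valid ==
 * 0x3c (bits 2-5 set), the first scan below skips the two clear low
 * bits giving svalid = off + 1024, and the second scan consumes the
 * four set bits giving evalid = off + 3072, so blocks 2-5 form the
 * range that may be merged into b_validoff/b_validend.
 */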
2101static void
2102vfs_buf_set_valid(struct buf *bp,
2103		  vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
2104		  vm_page_t m)
2105{
2106	if (bp->b_vp->v_tag == VT_NFS && bp->b_vp->v_type != VBLK) {
2107		vm_offset_t svalid, evalid;
2108		int validbits = m->valid;
2109
2110		/*
2111		 * This only bothers with the first valid range in the
2112		 * Only the first contiguous range of valid blocks in the
2113		 * page is considered.
2114		svalid = off;
2115		while (validbits && !(validbits & 1)) {
2116			svalid += DEV_BSIZE;
2117			validbits >>= 1;
2118		}
2119		evalid = svalid;
2120		while (validbits & 1) {
2121			evalid += DEV_BSIZE;
2122			validbits >>= 1;
2123		}
2124		/*
2125		 * Make sure this range is contiguous with the range
2126		 * built up from previous pages.  If not, then we will
2127		 * just use the range from the previous pages.
2128		 */
2129		if (svalid == bp->b_validend) {
2130			bp->b_validoff = min(bp->b_validoff, svalid);
2131			bp->b_validend = max(bp->b_validend, evalid);
2132		}
2133	} else if (!vm_page_is_valid(m,
2134				     (vm_offset_t) ((foff + off) & PAGE_MASK),
2135				     size)) {
2136		bp->b_flags &= ~B_CACHE;
2137	}
2138}
2139
2140/*
2141 * Set the valid bits in a page, taking care of the b_validoff,
2142 * b_validend fields which NFS uses to optimise small reads.  Off is
2143 * the offset within the file and pageno is the page index within the buf.
2144 */
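/*
 * A small worked example, assuming 4K pages and a DEV_BSIZE of 512:
 * for an NFS buffer with b_offset 8192 and b_bufsize 8192, page 1
 * covers file offsets 12288-16383.  With b_validoff 0 and b_validend
 * 6144, sv = 8192 and ev = 14336, so after clamping only offsets
 * 12288-14335 (the first 2048 bytes of the page) are marked valid
 * and clean.
 */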
2145static void
2146vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
2147{
2148	struct vnode *vp = bp->b_vp;
2149	vm_ooffset_t soff, eoff;
2150
2151	soff = off;
2152	eoff = off + min(PAGE_SIZE, bp->b_bufsize);
2153	if (vp->v_tag == VT_NFS && vp->v_type != VBLK) {
2154		vm_ooffset_t sv, ev;
2155		vm_page_set_invalid(m,
2156		    (vm_offset_t) (soff & PAGE_MASK),
2157		    (vm_offset_t) (eoff - soff));
2158		off = off - pageno * PAGE_SIZE;
2159		sv = off + ((bp->b_validoff + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1));
2160		ev = off + ((bp->b_validend + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1));
2161		soff = qmax(sv, soff);
2162		eoff = qmin(ev, eoff);
2163	}
2164	if (eoff > soff)
2165		vm_page_set_validclean(m,
2166	       (vm_offset_t) (soff & PAGE_MASK),
2167	       (vm_offset_t) (eoff - soff));
2168}
2169
2170/*
2171 * This routine is called before a device strategy routine.
2172 * It is used to tell the VM system that paging I/O is in
2173 * progress, and to treat the pages associated with the buffer
2174 * almost as if they were PG_BUSY.  The object's paging_in_progress
2175 * count is also bumped to make sure that the object doesn't become
2176 * inconsistent.
2177 */
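/*
 * Note that in the read case (clear_modify == 0), a page that already
 * contains some valid data, in a buffer of at least a page that is not
 * fully cached, is replaced by bogus_page for the duration of the
 * transfer so the device cannot overwrite the valid portion; biodone()
 * and vfs_unbusy_pages() look the real page back up and restore the
 * mapping afterwards.
 */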
2178void
2179vfs_busy_pages(struct buf * bp, int clear_modify)
2180{
2181	int i;
2182
2183	if (bp->b_flags & B_VMIO) {
2184		struct vnode *vp = bp->b_vp;
2185		vm_object_t obj = vp->v_object;
2186		vm_ooffset_t foff;
2187
2188		foff = bp->b_offset;
2189#ifdef DIAGNOSTIC
2190		if (bp->b_offset == NOOFFSET)
2191			panic("vfs_busy_pages: no buffer offset");
2192#endif
2193
2194		vfs_setdirty(bp);
2195
2196retry:
2197		for (i = 0; i < bp->b_npages; i++) {
2198			vm_page_t m = bp->b_pages[i];
2199			if (vm_page_sleep(m, "vbpage", NULL))
2200				goto retry;
2201		}
2202
2203		for (i = 0; i < bp->b_npages; i++, foff += PAGE_SIZE) {
2204			vm_page_t m = bp->b_pages[i];
2205
2206			vm_page_flag_clear(m, PG_ZERO);
2207			if ((bp->b_flags & B_CLUSTER) == 0) {
2208				vm_object_pip_add(obj, 1);
2209				vm_page_io_start(m);
2210			}
2211
2212			vm_page_protect(m, VM_PROT_NONE);
2213			if (clear_modify)
2214				vfs_page_set_valid(bp, foff, i, m);
2215			else if (bp->b_bcount >= PAGE_SIZE) {
2216				if (m->valid && (bp->b_flags & B_CACHE) == 0) {
2217					bp->b_pages[i] = bogus_page;
2218					pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
2219				}
2220			}
2221		}
2222	}
2223}
2224
2225/*
2226 * Tell the VM system that the pages associated with this buffer
2227 * are clean.  This is used for delayed writes where the data is
2228 * going to disk eventually without additional VM intervention.
2229 */
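/*
 * In practice this just runs vfs_page_set_valid() over each page, so
 * the affected ranges are marked valid and their dirty bits cleared;
 * the pageout daemon then has no reason to write the pages itself
 * while the delayed write is pending.
 */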
2230void
2231vfs_clean_pages(struct buf * bp)
2232{
2233	int i;
2234
2235	if (bp->b_flags & B_VMIO) {
2236		vm_ooffset_t foff;
2237		foff = bp->b_offset;
2238
2239#ifdef DIAGNOSTIC
2240		if (bp->b_offset == NOOFFSET)
2241			panic("vfs_clean_pages: no buffer offset");
2242#endif
2243
2244		for (i = 0; i < bp->b_npages; i++, foff += PAGE_SIZE) {
2245			vm_page_t m = bp->b_pages[i];
2246			vfs_page_set_valid(bp, foff, i, m);
2247		}
2248	}
2249}
2250
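/*
 * Zero out the portions of a VMIO buffer that are not already valid
 * (or backed by a pre-zeroed PG_ZERO page) and mark the corresponding
 * page ranges valid.  For a single-page buffer smaller than PAGE_SIZE
 * only the DEV_BSIZE blocks the buffer actually covers are considered;
 * e.g. assuming a DEV_BSIZE of 512, a 2048 byte buffer yields a valid
 * mask of 0x0f.  Non-VMIO or malloced buffers simply fall through to
 * clrbuf().
 */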
2251void
2252vfs_bio_clrbuf(struct buf *bp) {
2253	int i;
2254	if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) {
2255		if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE)) {
2256			int mask;
2257			mask = 0;
2258			for (i = 0; i < bp->b_bufsize; i += DEV_BSIZE)
2259				mask |= (1 << (i / DEV_BSIZE));
2260			if (((bp->b_pages[0]->flags & PG_ZERO) == 0) &&
2261			    (bp->b_pages[0]->valid != mask)) {
2262				bzero(bp->b_data, bp->b_bufsize);
2263			}
2264			bp->b_pages[0]->valid = mask;
2265			bp->b_resid = 0;
2266			return;
2267		}
2268		for (i = 0; i < bp->b_npages; i++) {
2269			if (bp->b_pages[i]->valid == VM_PAGE_BITS_ALL)
2270				continue;
2271			if (bp->b_pages[i]->valid == 0) {
2272				if ((bp->b_pages[i]->flags & PG_ZERO) == 0) {
2273					bzero(bp->b_data + (i << PAGE_SHIFT), PAGE_SIZE);
2274				}
2275			} else {
2276				int j;
2277				for (j = 0; j < PAGE_SIZE / DEV_BSIZE; j++) {
2278					if (((bp->b_pages[i]->flags & PG_ZERO) == 0) &&
2279					    (bp->b_pages[i]->valid & (1 << j)) == 0)
2280						bzero(bp->b_data + (i << PAGE_SHIFT) + j * DEV_BSIZE, DEV_BSIZE);
2281				}
2282			}
2283			bp->b_pages[i]->valid = VM_PAGE_BITS_ALL;
2284			vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
2285		}
2286		bp->b_resid = 0;
2287	} else {
2288		clrbuf(bp);
2289	}
2290}
2291
2292/*
2293 * vm_hold_load_pages() and vm_hold_free_pages() get pages into and
2294 * out of a buffer's address space.  The pages are anonymous and are
2295 * not associated with a file object.
2296 */
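/*
 * The pages come from kernel_object, are wired, and are entered into
 * the kernel map with pmap_kenter(); if the allocation fails the
 * pageout deficit is bumped and the thread waits in VM_WAIT before
 * retrying.  allocbuf() uses these routines to grow and shrink the
 * data area of non-VMIO, non-malloced buffers.
 */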
2297void
2298vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
2299{
2300	vm_offset_t pg;
2301	vm_page_t p;
2302	int index;
2303
2304	to = round_page(to);
2305	from = round_page(from);
2306	index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
2307
2308	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
2309
2310tryagain:
2311
2312		p = vm_page_alloc(kernel_object,
2313			((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
2314		    VM_ALLOC_NORMAL);
2315		if (!p) {
2316			vm_pageout_deficit += (to - from) >> PAGE_SHIFT;
2317			VM_WAIT;
2318			goto tryagain;
2319		}
2320		vm_page_wire(p);
2321		p->valid = VM_PAGE_BITS_ALL;
2322		vm_page_flag_clear(p, PG_ZERO);
2323		pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
2324		bp->b_pages[index] = p;
2325		vm_page_wakeup(p);
2326	}
2327	bp->b_npages = index;
2328}
2329
2330void
2331vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
2332{
2333	vm_offset_t pg;
2334	vm_page_t p;
2335	int index, newnpages;
2336
2337	from = round_page(from);
2338	to = round_page(to);
2339	newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
2340
2341	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
2342		p = bp->b_pages[index];
2343		if (p && (index < bp->b_npages)) {
2344#if !defined(MAX_PERF)
2345			if (p->busy) {
2346				printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n",
2347					bp->b_blkno, bp->b_lblkno);
2348			}
2349#endif
2350			bp->b_pages[index] = NULL;
2351			pmap_kremove(pg);
2352			vm_page_busy(p);
2353			vm_page_unwire(p, 0);
2354			vm_page_free(p);
2355		}
2356	}
2357	bp->b_npages = newnpages;
2358}
2359
2360
2361#include "opt_ddb.h"
2362#ifdef DDB
2363#include <ddb/ddb.h>
2364
2365DB_SHOW_COMMAND(buffer, db_show_buffer)
2366{
2367	/* get args */
2368	struct buf *bp = (struct buf *)addr;
2369
2370	if (!have_addr) {
2371		db_printf("usage: show buffer <addr>\n");
2372		return;
2373	}
2374
2375	db_printf("b_proc = %p,\nb_flags = 0x%b\n", (void *)bp->b_proc,
2376		  (u_int)bp->b_flags, PRINT_BUF_FLAGS);
2377	db_printf("b_error = %d, b_bufsize = %ld, b_bcount = %ld, "
2378		  "b_resid = %ld\nb_dev = 0x%x, b_data = %p, "
2379		  "b_blkno = %d, b_pblkno = %d\n",
2380		  bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
2381		  bp->b_dev, bp->b_data, bp->b_blkno, bp->b_pblkno);
2382	if (bp->b_npages) {
2383		int i;
2384		db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
2385		for (i = 0; i < bp->b_npages; i++) {
2386			vm_page_t m;
2387			m = bp->b_pages[i];
2388			db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object,
2389			    (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m));
2390			if ((i + 1) < bp->b_npages)
2391				db_printf(",");
2392		}
2393		db_printf("\n");
2394	}
2395}
2396#endif /* DDB */
2397