vfs_bio.c revision 32454
1/*
2 * Copyright (c) 1994,1997 John S. Dyson
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice immediately at the beginning of the file, without modification,
10 *    this list of conditions, and the following disclaimer.
11 * 2. Absolutely no warranty of function or purpose is made by the author
12 *		John S. Dyson.
13 *
14 * $Id: vfs_bio.c,v 1.141 1998/01/06 05:15:55 dyson Exp $
15 */
16
17/*
18 * this file contains a new buffer I/O scheme implementing a coherent
19 * VM object and buffer cache scheme.  Pains have been taken to make
20 * sure that the performance degradation associated with schemes such
21 * as this is not realized.
22 *
23 * Author:  John S. Dyson
24 * Significant help during the development and debugging phases
25 * had been provided by David Greenman, also of the FreeBSD core team.
26 */
27
28#include "opt_bounce.h"
29
30#define VMIO
31#include <sys/param.h>
32#include <sys/systm.h>
33#include <sys/sysproto.h>
34#include <sys/kernel.h>
35#include <sys/sysctl.h>
36#include <sys/proc.h>
37#include <sys/vnode.h>
38#include <sys/vmmeter.h>
39#include <sys/lock.h>
40#include <vm/vm.h>
41#include <vm/vm_param.h>
42#include <vm/vm_prot.h>
43#include <vm/vm_kern.h>
44#include <vm/vm_pageout.h>
45#include <vm/vm_page.h>
46#include <vm/vm_object.h>
47#include <vm/vm_extern.h>
48#include <vm/vm_map.h>
49#include <sys/buf.h>
50#include <sys/mount.h>
51#include <sys/malloc.h>
52#include <sys/resourcevar.h>
53
54static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer");
55
56static void vfs_update __P((void));
57static struct	proc *updateproc;
58static struct kproc_desc up_kp = {
59	"update",
60	vfs_update,
61	&updateproc
62};
63SYSINIT_KT(update, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
64
65struct buf *buf;		/* buffer header pool */
66struct swqueue bswlist;
67
68int count_lock_queue __P((void));
69static void vm_hold_free_pages(struct buf * bp, vm_offset_t from,
70		vm_offset_t to);
71static void vm_hold_load_pages(struct buf * bp, vm_offset_t from,
72		vm_offset_t to);
73static void vfs_buf_set_valid(struct buf *bp, vm_ooffset_t foff,
74			      vm_offset_t off, vm_offset_t size,
75			      vm_page_t m);
76static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off,
77			       int pageno, vm_page_t m);
78static void vfs_clean_pages(struct buf * bp);
79static void vfs_setdirty(struct buf *bp);
80static void vfs_vmio_release(struct buf *bp);
81static void flushdirtybuffers(int slpflag, int slptimeo);
82
83int needsbuffer;
84
85/*
86 * Internal update daemon, process 3
87 *	The variable vfs_update_wakeup allows for internal syncs.
88 */
89int vfs_update_wakeup;
90
91
92/*
93 * buffers base kva
94 */
95
96/*
97 * bogus page -- for I/O to/from partially complete buffers
98 * this is a temporary solution to the problem, but it is not
99 * really that bad.  it would be better to split the buffer
100 * for input in the case of buffers partially already in memory,
101 * but the code is intricate enough already.
102 */
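/*
 * The substitution is done in vfs_busy_pages() below: already-valid pages
 * of a buffer that is not fully cached are replaced by bogus_page before
 * the I/O is started, and the real pages are looked up again in brelse(),
 * biodone() and vfs_unbusy_pages().
 */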
103vm_page_t bogus_page;
104static vm_offset_t bogus_offset;
105
106static int bufspace, maxbufspace, vmiospace, maxvmiobufspace,
107	bufmallocspace, maxbufmallocspace;
108int numdirtybuffers, lodirtybuffers, hidirtybuffers;
109static int numfreebuffers, lofreebuffers, hifreebuffers;
110static int kvafreespace;
111
112SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD,
113	&numdirtybuffers, 0, "");
114SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW,
115	&lodirtybuffers, 0, "");
116SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW,
117	&hidirtybuffers, 0, "");
118SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD,
119	&numfreebuffers, 0, "");
120SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW,
121	&lofreebuffers, 0, "");
122SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW,
123	&hifreebuffers, 0, "");
124SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW,
125	&maxbufspace, 0, "");
126SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD,
127	&bufspace, 0, "");
128SYSCTL_INT(_vfs, OID_AUTO, maxvmiobufspace, CTLFLAG_RW,
129	&maxvmiobufspace, 0, "");
130SYSCTL_INT(_vfs, OID_AUTO, vmiospace, CTLFLAG_RD,
131	&vmiospace, 0, "");
132SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW,
133	&maxbufmallocspace, 0, "");
134SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD,
135	&bufmallocspace, 0, "");
136SYSCTL_INT(_vfs, OID_AUTO, kvafreespace, CTLFLAG_RD,
137	&kvafreespace, 0, "");
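/*
 * The knobs above are exported under the "vfs" sysctl tree; the CTLFLAG_RW
 * entries (e.g. vfs.hidirtybuffers, vfs.maxbufspace) may be tuned at run
 * time with sysctl(8), while the CTLFLAG_RD entries are read-only counters.
 */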
138
139static LIST_HEAD(bufhashhdr, buf) bufhashtbl[BUFHSZ], invalhash;
140static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES];
141
142extern int vm_swap_size;
143
144#define BUF_MAXUSE 24
145
146#define VFS_BIO_NEED_ANY 1
147#define VFS_BIO_NEED_LOWLIMIT 2
148#define VFS_BIO_NEED_FREE 4
149
150/*
151 * Initialize buffer headers and related structures.
152 */
153void
154bufinit()
155{
156	struct buf *bp;
157	int i;
158
159	TAILQ_INIT(&bswlist);
160	LIST_INIT(&invalhash);
161
162	/* first, make a null hash table */
163	for (i = 0; i < BUFHSZ; i++)
164		LIST_INIT(&bufhashtbl[i]);
165
166	/* next, make a null set of free lists */
167	for (i = 0; i < BUFFER_QUEUES; i++)
168		TAILQ_INIT(&bufqueues[i]);
169
170	/* finally, initialize each buffer header and stick on empty q */
171	for (i = 0; i < nbuf; i++) {
172		bp = &buf[i];
173		bzero(bp, sizeof *bp);
174		bp->b_flags = B_INVAL;	/* we're just an empty header */
175		bp->b_dev = NODEV;
176		bp->b_rcred = NOCRED;
177		bp->b_wcred = NOCRED;
178		bp->b_qindex = QUEUE_EMPTY;
179		bp->b_vnbufs.le_next = NOLIST;
180		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
181		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
182	}
183/*
184 * maxbufspace is currently calculated to support all filesystem blocks
185 * to be 8K.  If you happen to use a 16K filesystem, the size of the buffer
186 * cache is still the same as it would be for 8K filesystems.  This
187 * keeps the size of the buffer cache "in check" for big block filesystems.
188 */
189	maxbufspace = (nbuf + 8) * DFLTBSIZE;
190/*
191 * reserve 1/3 of the buffers for metadata (VDIR) which might not be VMIO'ed
192 */
193	maxvmiobufspace = 2 * maxbufspace / 3;
194/*
195 * Limit the amount of malloc memory since it is wired permanently into
196 * the kernel space.  Even though this is accounted for in the buffer
197 * allocation, we don't want the malloced region to grow uncontrolled.
198 * The malloc scheme improves memory utilization significantly for average
199 * (small) directories.
200 */
201	maxbufmallocspace = maxbufspace / 20;
202
203/*
204 * Reduce the probability of deadlock conditions by limiting the
205 * number of dirty buffers.
206 */
207	hidirtybuffers = nbuf / 8 + 20;
208	lodirtybuffers = nbuf / 16 + 10;
209	numdirtybuffers = 0;
210	lofreebuffers = nbuf / 18 + 5;
211	hifreebuffers = 2 * lofreebuffers;
212	numfreebuffers = nbuf;
213	kvafreespace = 0;
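/*
 * Illustrative arithmetic only (nbuf and DFLTBSIZE below are assumed
 * values, not taken from any particular configuration): with nbuf = 1024
 * and DFLTBSIZE = 8192, the settings above work out to roughly
 *
 *	maxbufspace       = (1024 + 8) * 8192	~= 8.1MB
 *	maxvmiobufspace   = 2 * maxbufspace / 3	~= 5.4MB
 *	maxbufmallocspace = maxbufspace / 20	~= 413KB
 *	hidirtybuffers    = 1024 / 8 + 20	=  148
 *	lodirtybuffers    = 1024 / 16 + 10	=   74
 *	lofreebuffers     = 1024 / 18 + 5	=   61
 *	hifreebuffers     = 2 * 61		=  122
 */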
214
215	bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
216	bogus_page = vm_page_alloc(kernel_object,
217			((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
218			VM_ALLOC_NORMAL);
219
220}
221
222/*
223 * Free the kva allocation for a buffer
224 * Must be called only at splbio or higher,
225 *  as this is the only locking for buffer_map.
226 */
227static void
228bfreekva(struct buf * bp)
229{
230	if (bp->b_kvasize == 0)
231		return;
232
233	vm_map_delete(buffer_map,
234		(vm_offset_t) bp->b_kvabase,
235		(vm_offset_t) bp->b_kvabase + bp->b_kvasize);
236
237	bp->b_kvasize = 0;
238
239}
240
241/*
242 * remove the buffer from the appropriate free list
243 */
244void
245bremfree(struct buf * bp)
246{
247	int s = splbio();
248
249	if (bp->b_qindex != QUEUE_NONE) {
250		if (bp->b_qindex == QUEUE_EMPTY) {
251			kvafreespace -= bp->b_kvasize;
252		}
253		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
254		bp->b_qindex = QUEUE_NONE;
255	} else {
256#if !defined(MAX_PERF)
257		panic("bremfree: removing a buffer when not on a queue");
258#endif
259	}
260	if ((bp->b_flags & B_INVAL) ||
261		(bp->b_flags & (B_DELWRI|B_LOCKED)) == 0)
262		--numfreebuffers;
263	splx(s);
264}
265
266
267/*
268 * Get a buffer with the specified data.  Look in the cache first.
269 */
270int
271bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
272    struct buf ** bpp)
273{
274	struct buf *bp;
275
276	bp = getblk(vp, blkno, size, 0, 0);
277	*bpp = bp;
278
279	/* if not found in cache, do some I/O */
280	if ((bp->b_flags & B_CACHE) == 0) {
281		if (curproc != NULL)
282			curproc->p_stats->p_ru.ru_inblock++;
283		bp->b_flags |= B_READ;
284		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
285		if (bp->b_rcred == NOCRED) {
286			if (cred != NOCRED)
287				crhold(cred);
288			bp->b_rcred = cred;
289		}
290		vfs_busy_pages(bp, 0);
291		VOP_STRATEGY(bp);
292		return (biowait(bp));
293	}
294	return (0);
295}
296
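/*
 * Sketch of the typical calling pattern for bread() (illustrative only;
 * "myfs_read_block" and its arguments are made up and not part of this
 * file):
 */
#ifdef notdef
static int
myfs_read_block(struct vnode *vp, daddr_t lblkno, int bsize,
	struct ucred *cred, struct buf **bpp)
{
	struct buf *bp;
	int error;

	error = bread(vp, lblkno, bsize, cred, &bp);
	if (error) {
		brelse(bp);		/* drop the buffer on error */
		*bpp = NULL;
		return (error);
	}
	*bpp = bp;			/* caller reads bp->b_data, then */
	return (0);			/* bqrelse()s or bwrite()s it */
}
#endif /* notdef */
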
297/*
298 * Operates like bread, but also starts asynchronous I/O on
299 * read-ahead blocks.
300 */
301int
302breadn(struct vnode * vp, daddr_t blkno, int size,
303    daddr_t * rablkno, int *rabsize,
304    int cnt, struct ucred * cred, struct buf ** bpp)
305{
306	struct buf *bp, *rabp;
307	int i;
308	int rv = 0, readwait = 0;
309
310	*bpp = bp = getblk(vp, blkno, size, 0, 0);
311
312	/* if not found in cache, do some I/O */
313	if ((bp->b_flags & B_CACHE) == 0) {
314		if (curproc != NULL)
315			curproc->p_stats->p_ru.ru_inblock++;
316		bp->b_flags |= B_READ;
317		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
318		if (bp->b_rcred == NOCRED) {
319			if (cred != NOCRED)
320				crhold(cred);
321			bp->b_rcred = cred;
322		}
323		vfs_busy_pages(bp, 0);
324		VOP_STRATEGY(bp);
325		++readwait;
326	}
327	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
328		if (inmem(vp, *rablkno))
329			continue;
330		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
331
332		if ((rabp->b_flags & B_CACHE) == 0) {
333			if (curproc != NULL)
334				curproc->p_stats->p_ru.ru_inblock++;
335			rabp->b_flags |= B_READ | B_ASYNC;
336			rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
337			if (rabp->b_rcred == NOCRED) {
338				if (cred != NOCRED)
339					crhold(cred);
340				rabp->b_rcred = cred;
341			}
342			vfs_busy_pages(rabp, 0);
343			VOP_STRATEGY(rabp);
344		} else {
345			brelse(rabp);
346		}
347	}
348
349	if (readwait) {
350		rv = biowait(bp);
351	}
352	return (rv);
353}
354
355/*
356 * Write, release buffer on completion.  (Done by iodone
357 * if async.)
358 */
359int
360bwrite(struct buf * bp)
361{
362	int oldflags = bp->b_flags;
363
364	if (bp->b_flags & B_INVAL) {
365		brelse(bp);
366		return (0);
367	}
368#if !defined(MAX_PERF)
369	if (!(bp->b_flags & B_BUSY))
370		panic("bwrite: buffer is not busy???");
371#endif
372
373	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
374	bp->b_flags |= B_WRITEINPROG;
375
376	if ((oldflags & B_DELWRI) == B_DELWRI) {
377		--numdirtybuffers;
378		reassignbuf(bp, bp->b_vp);
379	}
380
381	bp->b_vp->v_numoutput++;
382	vfs_busy_pages(bp, 1);
383	if (curproc != NULL)
384		curproc->p_stats->p_ru.ru_oublock++;
385	VOP_STRATEGY(bp);
386
387	if ((oldflags & B_ASYNC) == 0) {
388		int rtval = biowait(bp);
389
390		if (oldflags & B_DELWRI) {
391			reassignbuf(bp, bp->b_vp);
392		}
393		brelse(bp);
394		return (rtval);
395	}
396	return (0);
397}
398
399inline void
400vfs_bio_need_satisfy(void) {
401	++numfreebuffers;
402	if (!needsbuffer)
403		return;
404	if (numdirtybuffers < lodirtybuffers) {
405		needsbuffer &= ~(VFS_BIO_NEED_ANY | VFS_BIO_NEED_LOWLIMIT);
406	} else {
407		needsbuffer &= ~VFS_BIO_NEED_ANY;
408	}
409	if (numfreebuffers >= hifreebuffers) {
410		needsbuffer &= ~VFS_BIO_NEED_FREE;
411	}
412	wakeup(&needsbuffer);
413}
414
415/*
416 * Delayed write. (Buffer is marked dirty).
417 */
418void
419bdwrite(struct buf * bp)
420{
421
422#if !defined(MAX_PERF)
423	if ((bp->b_flags & B_BUSY) == 0) {
424		panic("bdwrite: buffer is not busy");
425	}
426#endif
427
428	if (bp->b_flags & B_INVAL) {
429		brelse(bp);
430		return;
431	}
432	if (bp->b_flags & B_TAPE) {
433		bawrite(bp);
434		return;
435	}
436	bp->b_flags &= ~(B_READ|B_RELBUF);
437	if ((bp->b_flags & B_DELWRI) == 0) {
438		bp->b_flags |= B_DONE | B_DELWRI;
439		reassignbuf(bp, bp->b_vp);
440		++numdirtybuffers;
441	}
442
443	/*
444	 * This bmap keeps the system from needing to do the bmap later,
445	 * perhaps when the system is attempting to do a sync.  Since the
446	 * indirect block -- or whatever other data structure the filesystem
447	 * needs -- is likely still in memory now, this is a good time to do
448	 * it.  Note also that if the pageout daemon is requesting a sync,
449	 * there might not be enough memory to do the bmap then, so it is
450	 * important to do it here.
451	 */
452	if (bp->b_lblkno == bp->b_blkno) {
453		VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
454	}
455
456	/*
457	 * Set the *dirty* buffer range based upon the VM system dirty pages.
458	 */
459	vfs_setdirty(bp);
460
461	/*
462	 * We need to do this here to satisfy the vnode_pager and the
463	 * pageout daemon, so that they think that the pages have been
464	 * "cleaned".  Note that since the pages are in a delayed write
465	 * buffer -- the VFS layer "will" see that the pages get written
466	 * out on the next sync, or perhaps the cluster will be completed.
467	 */
468	vfs_clean_pages(bp);
469	bqrelse(bp);
470
471	if (numdirtybuffers >= hidirtybuffers)
472		flushdirtybuffers(0, 0);
473
474	return;
475}
476
477/*
478 * Asynchronous write.
479 * Start output on a buffer, but do not wait for it to complete.
480 * The buffer is released when the output completes.
481 */
482void
483bawrite(struct buf * bp)
484{
485	bp->b_flags |= B_ASYNC;
486	(void) VOP_BWRITE(bp);
487}
488
489/*
490 * Ordered write.
491 * Start output on a buffer, but only wait for it to complete if the
492 * output device cannot guarantee ordering in some other way.  Devices
493 * that can perform asynchronous ordered writes will set the B_ASYNC
494 * flag in their strategy routine.
495 * The buffer is released when the output completes.
496 */
497int
498bowrite(struct buf * bp)
499{
500	/*
501	 * XXX Add in B_ASYNC once the SCSI
502	 *     layer can deal with ordered
503	 *     writes properly.
504	 */
505	bp->b_flags |= B_ORDERED;
506	return (VOP_BWRITE(bp));
507}
508
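/*
 * Sketch of how a caller typically chooses among the write flavours above
 * (illustrative only; the "dosync" and "delay" policy flags are made up):
 */
#ifdef notdef
static int
myfs_write_block(struct buf *bp, int dosync, int delay)
{
	if (dosync)
		return (bwrite(bp));	/* synchronous: wait for completion */
	if (delay) {
		bdwrite(bp);		/* mark dirty, let update/flush write it */
		return (0);
	}
	bawrite(bp);			/* start the write, do not wait */
	return (0);
}
#endif /* notdef */
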
509/*
510 * Release a buffer.
511 */
512void
513brelse(struct buf * bp)
514{
515	int s;
516
517	if (bp->b_flags & B_CLUSTER) {
518		relpbuf(bp);
519		return;
520	}
521	/* anyone need a "free" block? */
522	s = splbio();
523
524	/* anyone need this block? */
525	if (bp->b_flags & B_WANTED) {
526		bp->b_flags &= ~(B_WANTED | B_AGE);
527		wakeup(bp);
528	}
529
530	if (bp->b_flags & B_LOCKED)
531		bp->b_flags &= ~B_ERROR;
532
533	if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
534	    (bp->b_bufsize <= 0)) {
535		bp->b_flags |= B_INVAL;
536		if (bp->b_flags & B_DELWRI)
537			--numdirtybuffers;
538		bp->b_flags &= ~(B_DELWRI | B_CACHE);
539		if (((bp->b_flags & B_VMIO) == 0) && bp->b_vp) {
540			if (bp->b_bufsize)
541				allocbuf(bp, 0);
542			brelvp(bp);
543		}
544	}
545
546	/*
547	 * VMIO buffer rundown.  It is not strictly necessary to keep a VMIO buffer
548	 * constituted, so the B_INVAL flag is used to *invalidate* the buffer,
549	 * but the VM object is kept around.  The B_NOCACHE flag is used to
550	 * invalidate the pages in the VM object.
551	 *
552	 * If the buffer is a partially filled NFS buffer, keep it
553 * since invalidating it now will lose information.  The valid
554	 * flags in the vm_pages have only DEV_BSIZE resolution but
555	 * the b_validoff, b_validend fields have byte resolution.
556	 * This can avoid unnecessary re-reads of the buffer.
557	 * XXX this seems to cause performance problems.
558	 */
559	if ((bp->b_flags & B_VMIO)
560	    && !(bp->b_vp->v_tag == VT_NFS &&
561		 bp->b_vp->v_type != VBLK &&
562		 (bp->b_flags & B_DELWRI) != 0)
563#ifdef notdef
564	    && (bp->b_vp->v_tag != VT_NFS
565		|| bp->b_vp->v_type == VBLK
566		|| (bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR))
567		|| bp->b_validend == 0
568		|| (bp->b_validoff == 0
569		    && bp->b_validend == bp->b_bufsize))
570#endif
571	    ) {
572		vm_ooffset_t foff;
573		vm_object_t obj;
574		int i, resid;
575		vm_page_t m;
576		struct vnode *vp;
577		int iototal = bp->b_bufsize;
578
579		vp = bp->b_vp;
580
581#if !defined(MAX_PERF)
582		if (!vp)
583			panic("brelse: missing vp");
584#endif
585
586		if (bp->b_npages) {
587			vm_pindex_t poff;
588			obj = (vm_object_t) vp->v_object;
589			if (vp->v_type == VBLK)
590				foff = ((vm_ooffset_t) bp->b_lblkno) << DEV_BSHIFT;
591			else
592				foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
593			poff = OFF_TO_IDX(foff);
594			for (i = 0; i < bp->b_npages; i++) {
595				m = bp->b_pages[i];
596				if (m == bogus_page) {
597					m = vm_page_lookup(obj, poff + i);
598#if !defined(MAX_PERF)
599					if (!m) {
600						panic("brelse: page missing\n");
601					}
602#endif
603					bp->b_pages[i] = m;
604					pmap_qenter(trunc_page(bp->b_data),
605						bp->b_pages, bp->b_npages);
606				}
607				resid = IDX_TO_OFF(m->pindex+1) - foff;
608				if (resid > iototal)
609					resid = iototal;
610				if (resid > 0) {
611					/*
612					 * Don't invalidate the page if the local machine has already
613					 * modified it.  This is the lesser of two evils, and should
614					 * be fixed.
615					 */
616					if (bp->b_flags & (B_NOCACHE | B_ERROR)) {
617						vm_page_test_dirty(m);
618						if (m->dirty == 0) {
619							vm_page_set_invalid(m, (vm_offset_t) foff, resid);
620							if (m->valid == 0)
621								vm_page_protect(m, VM_PROT_NONE);
622						}
623					}
624					if (resid >= PAGE_SIZE) {
625						if ((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) {
626							bp->b_flags |= B_INVAL;
627						}
628					} else {
629						if (!vm_page_is_valid(m,
630							(((vm_offset_t) bp->b_data) & PAGE_MASK), resid)) {
631							bp->b_flags |= B_INVAL;
632						}
633					}
634				}
635				foff += resid;
636				iototal -= resid;
637			}
638		}
639		if (bp->b_flags & (B_INVAL | B_RELBUF))
640			vfs_vmio_release(bp);
641	}
642#if !defined(MAX_PERF)
643	if (bp->b_qindex != QUEUE_NONE)
644		panic("brelse: free buffer onto another queue???");
645#endif
646
647	/* enqueue */
648	/* buffers with no memory */
649	if (bp->b_bufsize == 0) {
650		bp->b_flags |= B_INVAL;
651		bp->b_qindex = QUEUE_EMPTY;
652		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
653		LIST_REMOVE(bp, b_hash);
654		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
655		bp->b_dev = NODEV;
656		kvafreespace += bp->b_kvasize;
657
658	/* buffers with junk contents */
659	} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
660		bp->b_flags |= B_INVAL;
661		bp->b_qindex = QUEUE_AGE;
662		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
663		LIST_REMOVE(bp, b_hash);
664		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
665		bp->b_dev = NODEV;
666
667	/* buffers that are locked */
668	} else if (bp->b_flags & B_LOCKED) {
669		bp->b_qindex = QUEUE_LOCKED;
670		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
671
672	/* buffers with stale but valid contents */
673	} else if (bp->b_flags & B_AGE) {
674		bp->b_qindex = QUEUE_AGE;
675		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
676
677	/* buffers with valid and quite potentially reusable contents */
678	} else {
679		bp->b_qindex = QUEUE_LRU;
680		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
681	}
682
683	if ((bp->b_flags & B_INVAL) ||
684		(bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) {
685		if (bp->b_flags & B_DELWRI) {
686			--numdirtybuffers;
687			bp->b_flags &= ~B_DELWRI;
688		}
689		vfs_bio_need_satisfy();
690	}
691
692	/* unlock */
693	bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
694				B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
695	splx(s);
696}
697
698/*
699 * Release a buffer.
700 */
701void
702bqrelse(struct buf * bp)
703{
704	int s;
705
706	s = splbio();
707
708	/* anyone need this block? */
709	if (bp->b_flags & B_WANTED) {
710		bp->b_flags &= ~(B_WANTED | B_AGE);
711		wakeup(bp);
712	}
713
714#if !defined(MAX_PERF)
715	if (bp->b_qindex != QUEUE_NONE)
716		panic("bqrelse: free buffer onto another queue???");
717#endif
718
719	if (bp->b_flags & B_LOCKED) {
720		bp->b_flags &= ~B_ERROR;
721		bp->b_qindex = QUEUE_LOCKED;
722		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
723		/* buffers with stale but valid contents */
724	} else {
725		bp->b_qindex = QUEUE_LRU;
726		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
727	}
728
729	if ((bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) {
730		vfs_bio_need_satisfy();
731	}
732
733	/* unlock */
734	bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
735		B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
736	splx(s);
737}
738
739static void
740vfs_vmio_release(bp)
741	struct buf *bp;
742{
743	int i;
744	vm_page_t m;
745
746	for (i = 0; i < bp->b_npages; i++) {
747		m = bp->b_pages[i];
748		bp->b_pages[i] = NULL;
749		vm_page_unwire(m);
750		/*
751		 * We don't mess with busy pages, it is
752		 * the responsibility of the process that
753		 * busied the pages to deal with them.
754		 */
755		if ((m->flags & PG_BUSY) || (m->busy != 0))
756			continue;
757
758		if (m->wire_count == 0) {
759
760			if (m->flags & PG_WANTED) {
761				m->flags &= ~PG_WANTED;
762				wakeup(m);
763			}
764
765			/*
766			 * If this is an async free, we cannot place
767			 * pages onto the cache queue, so we do not
768			 * modify any queues at all.
769			 * This is probably in error (for perf reasons),
770			 * and we will eventually need to build
771			 * a more complete infrastructure to support I/O
772			 * rundown.
773			 */
774			if ((bp->b_flags & B_ASYNC) == 0) {
775
776			/*
777			 * In the case of sync buffer frees, we can do pretty much
778			 * anything to any of the memory queues.  Specifically,
779			 * the cache queue is okay to be modified.
780			 */
781				if (m->valid) {
782					if(m->dirty == 0)
783						vm_page_test_dirty(m);
784					/*
785					 * this keeps pressure off of the process memory
786					 */
787					if (m->dirty == 0 && m->hold_count == 0)
788						vm_page_cache(m);
789					else
790						vm_page_deactivate(m);
791				} else if (m->hold_count == 0) {
792					struct vnode *vp;
793					vp = bp->b_vp;
794					vm_page_protect(m, VM_PROT_NONE);
795					vm_page_free(m);
796					if (vp && VSHOULDFREE(vp) &&
797						(vp->v_flag & (VFREE|VTBFREE)) == 0) {
798						TAILQ_INSERT_TAIL(&vnode_tobefree_list, vp, v_freelist);
799						vp->v_flag |= VTBFREE;
800					}
801				}
802			} else {
803				/*
804				 * If async, then at least we clear the
805				 * act_count.
806				 */
807				m->act_count = 0;
808			}
809		}
810	}
811	bufspace -= bp->b_bufsize;
812	vmiospace -= bp->b_bufsize;
813	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
814	bp->b_npages = 0;
815	bp->b_bufsize = 0;
816	bp->b_flags &= ~B_VMIO;
817	if (bp->b_vp)
818		brelvp(bp);
819}
820
821/*
822 * Check to see if a block is currently memory resident.
823 */
824struct buf *
825gbincore(struct vnode * vp, daddr_t blkno)
826{
827	struct buf *bp;
828	struct bufhashhdr *bh;
829
830	bh = BUFHASH(vp, blkno);
831	bp = bh->lh_first;
832
833	/* Search hash chain */
834	while (bp != NULL) {
835		/* hit */
836		if (bp->b_vp == vp && bp->b_lblkno == blkno &&
837		    (bp->b_flags & B_INVAL) == 0) {
838			break;
839		}
840		bp = bp->b_hash.le_next;
841	}
842	return (bp);
843}
844
845/*
846 * this routine implements clustered async writes for
847 * clearing out B_DELWRI buffers...  This is much better
848 * than the old way of writing only one buffer at a time.
849 */
850int
851vfs_bio_awrite(struct buf * bp)
852{
853	int i;
854	daddr_t lblkno = bp->b_lblkno;
855	struct vnode *vp = bp->b_vp;
856	int s;
857	int ncl;
858	struct buf *bpa;
859	int nwritten;
860	int size;
861	int maxcl;
862
863	s = splbio();
864	/*
865	 * right now we support clustered writing only to regular files
866	 */
867	if ((vp->v_type == VREG) &&
868	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
869	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
870
871		size = vp->v_mount->mnt_stat.f_iosize;
872		maxcl = MAXPHYS / size;
873
874		for (i = 1; i < maxcl; i++) {
875			if ((bpa = gbincore(vp, lblkno + i)) &&
876			    ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
877			    (B_DELWRI | B_CLUSTEROK)) &&
878			    (bpa->b_bufsize == size)) {
879				if ((bpa->b_blkno == bpa->b_lblkno) ||
880				    (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
881					break;
882			} else {
883				break;
884			}
885		}
886		ncl = i;
887		/*
888		 * this is a possible cluster write
889		 */
890		if (ncl != 1) {
891			nwritten = cluster_wbuild(vp, size, lblkno, ncl);
892			splx(s);
893			return nwritten;
894		}
895	} else if ((vp->v_flag & VOBJBUF) && (vp->v_type == VBLK) &&
896		((size = bp->b_bufsize) >= PAGE_SIZE)) {
897		maxcl = MAXPHYS / size;
898		for (i = 1; i < maxcl; i++) {
899			if ((bpa = gbincore(vp, lblkno + i)) &&
900			    ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
901			    (B_DELWRI | B_CLUSTEROK)) &&
902			    (bpa->b_bufsize == size)) {
903				    if (bpa->b_blkno !=
904						bp->b_blkno + ((i * size) >> DEV_BSHIFT))
905							break;
906			} else {
907				break;
908			}
909		}
910		ncl = i;
911		/*
912		 * this is a possible cluster write
913		 */
914		if (ncl != 1) {
915			nwritten = cluster_wbuild(vp, size, lblkno, ncl);
916			printf("Block cluster: (%d, %d)\n", lblkno, nwritten);
917			splx(s);
918			return nwritten;
919		}
920	}
921
922	bremfree(bp);
923	splx(s);
924	/*
925	 * default (old) behavior, writing out only one block
926	 */
927	bp->b_flags |= B_BUSY | B_ASYNC;
928	nwritten = bp->b_bufsize;
929	(void) VOP_BWRITE(bp);
930	return nwritten;
931}
932
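/*
 * The contiguity check above, worked through with assumed numbers: with
 * f_iosize = 8192 and DEV_BSHIFT = 9, logical block lblkno + i can only
 * join the cluster if it sits at bp->b_blkno + ((i * 8192) >> 9), i.e.
 * exactly 16 * i device sectors past the starting block.
 */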
933
934/*
935 * Find a buffer header which is available for use.
936 */
937static struct buf *
938getnewbuf(struct vnode *vp, daddr_t blkno,
939	int slpflag, int slptimeo, int size, int maxsize)
940{
941	struct buf *bp, *bp1;
942	int nbyteswritten = 0;
943	vm_offset_t addr;
944	static int writerecursion = 0;
945
946start:
947	if (bufspace >= maxbufspace)
948		goto trytofreespace;
949
950	/* can we constitute a new buffer? */
951	if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]))) {
952#if !defined(MAX_PERF)
953		if (bp->b_qindex != QUEUE_EMPTY)
954			panic("getnewbuf: inconsistent EMPTY queue, qindex=%d",
955			    bp->b_qindex);
956#endif
957		bp->b_flags |= B_BUSY;
958		bremfree(bp);
959		goto fillbuf;
960	}
961trytofreespace:
962	/*
963	 * We keep file I/O from hogging metadata I/O.
964	 * This is desirable because file data is cached in the
965	 * VM/Buffer cache even if a buffer is freed.
966	 */
967	if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]))) {
968#if !defined(MAX_PERF)
969		if (bp->b_qindex != QUEUE_AGE)
970			panic("getnewbuf: inconsistent AGE queue, qindex=%d",
971			    bp->b_qindex);
972#endif
973	} else if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]))) {
974#if !defined(MAX_PERF)
975		if (bp->b_qindex != QUEUE_LRU)
976			panic("getnewbuf: inconsistent LRU queue, qindex=%d",
977			    bp->b_qindex);
978#endif
979	}
980	if (!bp) {
981		/* wait for a free buffer of any kind */
982		needsbuffer |= VFS_BIO_NEED_ANY;
983		do
984			tsleep(&needsbuffer, (PRIBIO + 1) | slpflag, "newbuf",
985			    slptimeo);
986		while (needsbuffer & VFS_BIO_NEED_ANY);
987		return (0);
988	}
989
990#if defined(DIAGNOSTIC)
991	if (bp->b_flags & B_BUSY) {
992		panic("getnewbuf: busy buffer on free list\n");
993	}
994#endif
995
996	/*
997	 * We are fairly aggressive about freeing VMIO buffers, but since
998	 * the buffering is intact without buffer headers, there is not
999	 * much loss.  We gain by maintaining non-VMIOed metadata in buffers.
1000	 */
1001	if ((bp->b_qindex == QUEUE_LRU) && (bp->b_usecount > 0)) {
1002		if ((bp->b_flags & B_VMIO) == 0 ||
1003			(vmiospace < maxvmiobufspace)) {
1004			--bp->b_usecount;
1005			TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist);
1006			if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) {
1007				TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
1008				goto start;
1009			}
1010			TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
1011		}
1012	}
1013
1014
1015	/* if we are a delayed write, convert to an async write */
1016	if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
1017
1018		/*
1019		 * If our delayed write is likely to be used soon, then
1020		 * recycle back onto the LRU queue.
1021		 */
1022		if (vp && (bp->b_vp == vp) && (bp->b_qindex == QUEUE_LRU) &&
1023			(bp->b_lblkno >= blkno) && (maxsize > 0)) {
1024
1025			if (bp->b_usecount > 0) {
1026				if (bp->b_lblkno < blkno + (MAXPHYS / maxsize)) {
1027
1028					TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist);
1029
1030					if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) {
1031						TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
1032						bp->b_usecount--;
1033						goto start;
1034					}
1035					TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
1036				}
1037			}
1038		}
1039
1040		/*
1041		 * Certain layered filesystems can recursively re-enter the vfs_bio
1042		 * code, due to delayed writes.  This helps keep the system from
1043		 * deadlocking.
1044		 */
1045		if (writerecursion > 0) {
1046			bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]);
1047			while (bp) {
1048				if ((bp->b_flags & B_DELWRI) == 0)
1049					break;
1050				bp = TAILQ_NEXT(bp, b_freelist);
1051			}
1052			if (bp == NULL) {
1053				bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]);
1054				while (bp) {
1055					if ((bp->b_flags & B_DELWRI) == 0)
1056						break;
1057					bp = TAILQ_NEXT(bp, b_freelist);
1058				}
1059			}
1060			if (bp == NULL)
1061				panic("getnewbuf: cannot get buffer, infinite recursion failure");
1062		} else {
1063			++writerecursion;
1064			nbyteswritten += vfs_bio_awrite(bp);
1065			--writerecursion;
1066			if (!slpflag && !slptimeo) {
1067				return (0);
1068			}
1069			goto start;
1070		}
1071	}
1072
1073	if (bp->b_flags & B_WANTED) {
1074		bp->b_flags &= ~B_WANTED;
1075		wakeup(bp);
1076	}
1077	bremfree(bp);
1078	bp->b_flags |= B_BUSY;
1079
1080	if (bp->b_flags & B_VMIO) {
1081		bp->b_flags &= ~B_ASYNC;
1082		vfs_vmio_release(bp);
1083	}
1084
1085	if (bp->b_vp)
1086		brelvp(bp);
1087
1088fillbuf:
1089	/* we are not free, nor do we contain interesting data */
1090	if (bp->b_rcred != NOCRED) {
1091		crfree(bp->b_rcred);
1092		bp->b_rcred = NOCRED;
1093	}
1094	if (bp->b_wcred != NOCRED) {
1095		crfree(bp->b_wcred);
1096		bp->b_wcred = NOCRED;
1097	}
1098
1099	LIST_REMOVE(bp, b_hash);
1100	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
1101	if (bp->b_bufsize) {
1102		allocbuf(bp, 0);
1103	}
1104	bp->b_flags = B_BUSY;
1105	bp->b_dev = NODEV;
1106	bp->b_vp = NULL;
1107	bp->b_blkno = bp->b_lblkno = 0;
1108	bp->b_iodone = 0;
1109	bp->b_error = 0;
1110	bp->b_resid = 0;
1111	bp->b_bcount = 0;
1112	bp->b_npages = 0;
1113	bp->b_dirtyoff = bp->b_dirtyend = 0;
1114	bp->b_validoff = bp->b_validend = 0;
1115	bp->b_usecount = 4;
1116
1117	maxsize = (maxsize + PAGE_MASK) & ~PAGE_MASK;
1118
1119	/*
1120	 * we assume that buffer_map is not at address 0
1121	 */
1122	addr = 0;
1123	if (maxsize != bp->b_kvasize) {
1124		bfreekva(bp);
1125
1126findkvaspace:
1127		/*
1128		 * See if we have buffer kva space
1129		 */
1130		if (vm_map_findspace(buffer_map,
1131			vm_map_min(buffer_map), maxsize, &addr)) {
1132			if (kvafreespace > 0) {
1133				int tfree = 0;
1134				for (bp1 = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
1135					bp1 != NULL; bp1 = TAILQ_NEXT(bp1, b_freelist))
1136					if (bp1->b_kvasize != 0) {
1137						tfree += bp1->b_kvasize;
1138						bremfree(bp1);
1139						bfreekva(bp1);
1140						brelse(bp1);
1141						if (tfree >= maxsize)
1142							goto findkvaspace;
1143					}
1144			}
1145			bp->b_flags |= B_INVAL;
1146			brelse(bp);
1147			goto trytofreespace;
1148		}
1149	}
1150
1151	/*
1152	 * See if we are below our allocated minimum
1153	 */
1154	if (bufspace >= (maxbufspace + nbyteswritten)) {
1155		bp->b_flags |= B_INVAL;
1156		brelse(bp);
1157		goto trytofreespace;
1158	}
1159
1160	/*
1161	 * create a map entry for the buffer -- in essence
1162	 * reserving the kva space.
1163	 */
1164	if (addr) {
1165		vm_map_insert(buffer_map, NULL, 0,
1166			addr, addr + maxsize,
1167			VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
1168
1169		bp->b_kvabase = (caddr_t) addr;
1170		bp->b_kvasize = maxsize;
1171	}
1172	bp->b_data = bp->b_kvabase;
1173
1174	return (bp);
1175}
1176
1177static void
1178waitfreebuffers(int slpflag, int slptimeo) {
1179	while (numfreebuffers < hifreebuffers) {
1180		flushdirtybuffers(slpflag, slptimeo);
1181		if (numfreebuffers < hifreebuffers)
1182			break;
1183		needsbuffer |= VFS_BIO_NEED_FREE;
1184		if (tsleep(&needsbuffer, PRIBIO|slpflag, "biofre", slptimeo))
1185			break;
1186	}
1187}
1188
1189static void
1190flushdirtybuffers(int slpflag, int slptimeo) {
1191	int s;
1192	static pid_t flushing = 0;
1193
1194	s = splbio();
1195
1196	if (flushing) {
1197		if (flushing == curproc->p_pid) {
1198			splx(s);
1199			return;
1200		}
1201		while (flushing) {
1202			if (tsleep(&flushing, PRIBIO|slpflag, "biofls", slptimeo)) {
1203				splx(s);
1204				return;
1205			}
1206		}
1207	}
1208	flushing = curproc->p_pid;
1209
1210	while (numdirtybuffers > lodirtybuffers) {
1211		struct buf *bp;
1212		needsbuffer |= VFS_BIO_NEED_LOWLIMIT;
1213		bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]);
1214		if (bp == NULL)
1215			bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]);
1216
1217		while (bp && ((bp->b_flags & B_DELWRI) == 0)) {
1218			bp = TAILQ_NEXT(bp, b_freelist);
1219		}
1220
1221		if (bp) {
1222			vfs_bio_awrite(bp);
1223			continue;
1224		}
1225		break;
1226	}
1227
1228	flushing = 0;
1229	wakeup(&flushing);
1230	splx(s);
1231}
1232
1233/*
1234 * Check to see if a block is currently memory resident.
1235 */
1236struct buf *
1237incore(struct vnode * vp, daddr_t blkno)
1238{
1239	struct buf *bp;
1240
1241	int s = splbio();
1242	bp = gbincore(vp, blkno);
1243	splx(s);
1244	return (bp);
1245}
1246
1247/*
1248 * Returns true if no I/O is needed to access the
1249 * associated VM object.  This is like incore except
1250 * it also hunts around in the VM system for the data.
1251 */
1252
1253int
1254inmem(struct vnode * vp, daddr_t blkno)
1255{
1256	vm_object_t obj;
1257	vm_offset_t toff, tinc;
1258	vm_page_t m;
1259	vm_ooffset_t off;
1260
1261	if (incore(vp, blkno))
1262		return 1;
1263	if (vp->v_mount == NULL)
1264		return 0;
1265	if ((vp->v_object == NULL) || (vp->v_flag & VOBJBUF) == 0)
1266		return 0;
1267
1268	obj = vp->v_object;
1269	tinc = PAGE_SIZE;
1270	if (tinc > vp->v_mount->mnt_stat.f_iosize)
1271		tinc = vp->v_mount->mnt_stat.f_iosize;
1272	off = blkno * vp->v_mount->mnt_stat.f_iosize;
1273
1274	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
1275
1276		m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
1277		if (!m)
1278			return 0;
1279		if (vm_page_is_valid(m, (vm_offset_t) (toff + off), tinc) == 0)
1280			return 0;
1281	}
1282	return 1;
1283}
1284
1285/*
1286 * now we set the dirty range for the buffer --
1287 * for NFS -- if the file is mapped and pages have
1288 * been written to, let it know.  We want the
1289 * entire range of the buffer to be marked dirty if
1290 * any of the pages have been written to for consistency
1291 * with the b_validoff, b_validend set in the nfs write
1292 * code, and used by the nfs read code.
1293 */
1294static void
1295vfs_setdirty(struct buf *bp) {
1296	int i;
1297	vm_object_t object;
1298	vm_offset_t boffset, offset;
1299	/*
1300	 * We qualify the scan for modified pages on whether the
1301	 * object has been flushed yet.  The OBJ_WRITEABLE flag
1302	 * is not cleared simply by protecting pages off.
1303	 */
1304	if ((bp->b_flags & B_VMIO) &&
1305		((object = bp->b_pages[0]->object)->flags & (OBJ_WRITEABLE|OBJ_CLEANING))) {
1306		/*
1307		 * test the pages to see if they have been modified directly
1308		 * by users through the VM system.
1309		 */
1310		for (i = 0; i < bp->b_npages; i++)
1311			vm_page_test_dirty(bp->b_pages[i]);
1312
1313		/*
1314		 * scan forwards for the first page modified
1315		 */
1316		for (i = 0; i < bp->b_npages; i++) {
1317			if (bp->b_pages[i]->dirty) {
1318				break;
1319			}
1320		}
1321		boffset = (i << PAGE_SHIFT);
1322		if (boffset < bp->b_dirtyoff) {
1323			bp->b_dirtyoff = boffset;
1324		}
1325
1326		/*
1327		 * scan backwards for the last page modified
1328		 */
1329		for (i = bp->b_npages - 1; i >= 0; --i) {
1330			if (bp->b_pages[i]->dirty) {
1331				break;
1332			}
1333		}
1334		boffset = (i + 1);
1335		offset = boffset + bp->b_pages[0]->pindex;
1336		if (offset >= object->size)
1337			boffset = object->size - bp->b_pages[0]->pindex;
1338		if (bp->b_dirtyend < (boffset << PAGE_SHIFT))
1339			bp->b_dirtyend = (boffset << PAGE_SHIFT);
1340	}
1341}
1342
1343/*
1344 * Get a block given a specified block and offset into a file/device.
1345 */
1346struct buf *
1347getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
1348{
1349	struct buf *bp;
1350	int s;
1351	struct bufhashhdr *bh;
1352	int maxsize;
1353
1354	if (vp->v_mount) {
1355		maxsize = vp->v_mount->mnt_stat.f_iosize;
1356		/*
1357		 * This happens on mount points.
1358		 */
1359		if (maxsize < size)
1360			maxsize = size;
1361	} else {
1362		maxsize = size;
1363	}
1364
1365#if !defined(MAX_PERF)
1366	if (size > MAXBSIZE)
1367		panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
1368#endif
1369
1370	s = splbio();
1371loop:
1372	if (numfreebuffers < lofreebuffers) {
1373		waitfreebuffers(slpflag, slptimeo);
1374	}
1375
1376	if ((bp = gbincore(vp, blkno))) {
1377		if (bp->b_flags & B_BUSY) {
1378			bp->b_flags |= B_WANTED;
1379			if (bp->b_usecount < BUF_MAXUSE)
1380				++bp->b_usecount;
1381			if (!tsleep(bp,
1382				(PRIBIO + 1) | slpflag, "getblk", slptimeo))
1383				goto loop;
1384
1385			splx(s);
1386			return (struct buf *) NULL;
1387		}
1388		bp->b_flags |= B_BUSY | B_CACHE;
1389		bremfree(bp);
1390
1391		/*
1392		 * check for size inconsistencies (note that they shouldn't
1393		 * happen but do when filesystems don't handle the size changes
1394		 * correctly.) We are conservative on metadata and don't just
1395		 * extend the buffer but write and re-constitute it.
1396		 */
1397
1398		if (bp->b_bcount != size) {
1399			if ((bp->b_flags & B_VMIO) && (size <= bp->b_kvasize)) {
1400				allocbuf(bp, size);
1401			} else {
1402				bp->b_flags |= B_NOCACHE;
1403				VOP_BWRITE(bp);
1404				goto loop;
1405			}
1406		}
1407
1408		if (bp->b_usecount < BUF_MAXUSE)
1409			++bp->b_usecount;
1410		splx(s);
1411		return (bp);
1412	} else {
1413		vm_object_t obj;
1414
1415		if ((bp = getnewbuf(vp, blkno,
1416			slpflag, slptimeo, size, maxsize)) == 0) {
1417			if (slpflag || slptimeo) {
1418				splx(s);
1419				return NULL;
1420			}
1421			goto loop;
1422		}
1423
1424		/*
1425		 * This code is used to make sure that a buffer is not
1426		 * created while the getnewbuf routine is blocked.
1427		 * Normally the vnode is locked so this isn't a problem.
1428		 * VBLK type I/O requests, however, don't lock the vnode.
1429		 */
1430		if (!VOP_ISLOCKED(vp) && gbincore(vp, blkno)) {
1431			bp->b_flags |= B_INVAL;
1432			brelse(bp);
1433			goto loop;
1434		}
1435
1436		/*
1437		 * Insert the buffer into the hash, so that it can
1438		 * be found by incore.
1439		 */
1440		bp->b_blkno = bp->b_lblkno = blkno;
1441		bgetvp(vp, bp);
1442		LIST_REMOVE(bp, b_hash);
1443		bh = BUFHASH(vp, blkno);
1444		LIST_INSERT_HEAD(bh, bp, b_hash);
1445
1446		if ((obj = vp->v_object) && (vp->v_flag & VOBJBUF)) {
1447			bp->b_flags |= (B_VMIO | B_CACHE);
1448#if defined(VFS_BIO_DEBUG)
1449			if (vp->v_type != VREG && vp->v_type != VBLK)
1450				printf("getblk: vmioing file type %d???\n", vp->v_type);
1451#endif
1452		} else {
1453			bp->b_flags &= ~B_VMIO;
1454		}
1455		splx(s);
1456
1457		allocbuf(bp, size);
1458#ifdef	PC98
1459		/*
1460		 * 1024byte/sector support
1461		 */
1462#define B_XXX2 0x8000000
1463		if (vp->v_flag & 0x10000) bp->b_flags |= B_XXX2;
1464#endif
1465		return (bp);
1466	}
1467}
1468
1469/*
1470 * Get an empty, disassociated buffer of given size.
1471 */
1472struct buf *
1473geteblk(int size)
1474{
1475	struct buf *bp;
1476	int s;
1477
1478	s = splbio();
1479	while ((bp = getnewbuf(0, (daddr_t) 0, 0, 0, size, MAXBSIZE)) == 0);
1480	splx(s);
1481	allocbuf(bp, size);
1482	bp->b_flags |= B_INVAL;
1483	return (bp);
1484}
1485
1486
1487/*
1488 * This code constitutes the buffer memory from either anonymous system
1489 * memory (in the case of non-VMIO operations) or from an associated
1490 * VM object (in the case of VMIO operations).
1491 *
1492 * Note that this code is tricky, and has many complications to resolve
1493 * deadlock or inconsistent data situations.  Tread lightly!!!
1494 *
1495 * Modify the length of a buffer's underlying buffer storage without
1496 * destroying information (unless, of course the buffer is shrinking).
1497 */
1498int
1499allocbuf(struct buf * bp, int size)
1500{
1501
1502	int s;
1503	int newbsize, mbsize;
1504	int i;
1505
1506#if !defined(MAX_PERF)
1507	if (!(bp->b_flags & B_BUSY))
1508		panic("allocbuf: buffer not busy");
1509
1510	if (bp->b_kvasize < size)
1511		panic("allocbuf: buffer too small");
1512#endif
1513
1514	if ((bp->b_flags & B_VMIO) == 0) {
1515		caddr_t origbuf;
1516		int origbufsize;
1517		/*
1518		 * Just get anonymous memory from the kernel
1519		 */
1520		mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
1521#if !defined(NO_B_MALLOC)
1522		if (bp->b_flags & B_MALLOC)
1523			newbsize = mbsize;
1524		else
1525#endif
1526			newbsize = round_page(size);
1527
1528		if (newbsize < bp->b_bufsize) {
1529#if !defined(NO_B_MALLOC)
1530			/*
1531			 * malloced buffers are not shrunk
1532			 */
1533			if (bp->b_flags & B_MALLOC) {
1534				if (newbsize) {
1535					bp->b_bcount = size;
1536				} else {
1537					free(bp->b_data, M_BIOBUF);
1538					bufspace -= bp->b_bufsize;
1539					bufmallocspace -= bp->b_bufsize;
1540					bp->b_data = bp->b_kvabase;
1541					bp->b_bufsize = 0;
1542					bp->b_bcount = 0;
1543					bp->b_flags &= ~B_MALLOC;
1544				}
1545				return 1;
1546			}
1547#endif
1548			vm_hold_free_pages(
1549			    bp,
1550			    (vm_offset_t) bp->b_data + newbsize,
1551			    (vm_offset_t) bp->b_data + bp->b_bufsize);
1552		} else if (newbsize > bp->b_bufsize) {
1553#if !defined(NO_B_MALLOC)
1554			/*
1555			 * We only use malloced memory on the first allocation,
1556			 * and revert to page-allocated memory when the buffer grows.
1557			 */
1558			if ( (bufmallocspace < maxbufmallocspace) &&
1559				(bp->b_bufsize == 0) &&
1560				(mbsize <= PAGE_SIZE/2)) {
1561
1562				bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
1563				bp->b_bufsize = mbsize;
1564				bp->b_bcount = size;
1565				bp->b_flags |= B_MALLOC;
1566				bufspace += mbsize;
1567				bufmallocspace += mbsize;
1568				return 1;
1569			}
1570#endif
1571			origbuf = NULL;
1572			origbufsize = 0;
1573#if !defined(NO_B_MALLOC)
1574			/*
1575			 * If the buffer is growing on its other-than-first allocation,
1576			 * then we revert to the page-allocation scheme.
1577			 */
1578			if (bp->b_flags & B_MALLOC) {
1579				origbuf = bp->b_data;
1580				origbufsize = bp->b_bufsize;
1581				bp->b_data = bp->b_kvabase;
1582				bufspace -= bp->b_bufsize;
1583				bufmallocspace -= bp->b_bufsize;
1584				bp->b_bufsize = 0;
1585				bp->b_flags &= ~B_MALLOC;
1586				newbsize = round_page(newbsize);
1587			}
1588#endif
1589			vm_hold_load_pages(
1590			    bp,
1591			    (vm_offset_t) bp->b_data + bp->b_bufsize,
1592			    (vm_offset_t) bp->b_data + newbsize);
1593#if !defined(NO_B_MALLOC)
1594			if (origbuf) {
1595				bcopy(origbuf, bp->b_data, origbufsize);
1596				free(origbuf, M_BIOBUF);
1597			}
1598#endif
1599		}
1600	} else {
1601		vm_page_t m;
1602		int desiredpages;
1603
1604		newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
1605		desiredpages = (round_page(newbsize) >> PAGE_SHIFT);
1606
1607#if !defined(NO_B_MALLOC)
1608		if (bp->b_flags & B_MALLOC)
1609			panic("allocbuf: VMIO buffer can't be malloced");
1610#endif
1611
1612		if (newbsize < bp->b_bufsize) {
1613			if (desiredpages < bp->b_npages) {
1614				for (i = desiredpages; i < bp->b_npages; i++) {
1615					/*
1616					 * the page is not freed here -- it
1617					 * is the responsibility of vnode_pager_setsize
1618					 */
1619					m = bp->b_pages[i];
1620#if defined(DIAGNOSTIC)
1621					if (m == bogus_page)
1622						panic("allocbuf: bogus page found");
1623#endif
1624					s = splvm();
1625					while ((m->flags & PG_BUSY) || (m->busy != 0)) {
1626						m->flags |= PG_WANTED;
1627						tsleep(m, PVM, "biodep", 0);
1628					}
1629					splx(s);
1630
1631					bp->b_pages[i] = NULL;
1632					vm_page_unwire(m);
1633				}
1634				pmap_qremove((vm_offset_t) trunc_page(bp->b_data) +
1635				    (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
1636				bp->b_npages = desiredpages;
1637			}
1638		} else if (newbsize > bp->b_bufsize) {
1639			vm_object_t obj;
1640			vm_offset_t tinc, toff;
1641			vm_ooffset_t off;
1642			vm_pindex_t objoff;
1643			int pageindex, curbpnpages;
1644			struct vnode *vp;
1645			int bsize;
1646
1647			vp = bp->b_vp;
1648
1649			if (vp->v_type == VBLK)
1650				bsize = DEV_BSIZE;
1651			else
1652				bsize = vp->v_mount->mnt_stat.f_iosize;
1653
1654			if (bp->b_npages < desiredpages) {
1655				obj = vp->v_object;
1656				tinc = PAGE_SIZE;
1657				if (tinc > bsize)
1658					tinc = bsize;
1659				off = (vm_ooffset_t) bp->b_lblkno * bsize;
1660				curbpnpages = bp->b_npages;
1661		doretry:
1662				bp->b_flags |= B_CACHE;
1663				bp->b_validoff = bp->b_validend = 0;
1664				for (toff = 0; toff < newbsize; toff += tinc) {
1665					int bytesinpage;
1666
1667					pageindex = toff >> PAGE_SHIFT;
1668					objoff = OFF_TO_IDX(off + toff);
1669					if (pageindex < curbpnpages) {
1670
1671						m = bp->b_pages[pageindex];
1672#ifdef VFS_BIO_DIAG
1673						if (m->pindex != objoff)
1674							panic("allocbuf: page changed offset??!!!?");
1675#endif
1676						bytesinpage = tinc;
1677						if (tinc > (newbsize - toff))
1678							bytesinpage = newbsize - toff;
1679						if (bp->b_flags & B_CACHE)
1680							vfs_buf_set_valid(bp, off, toff, bytesinpage, m);
1681						continue;
1682					}
1683					m = vm_page_lookup(obj, objoff);
1684					if (!m) {
1685						m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
1686						if (!m) {
1687							VM_WAIT;
1688							goto doretry;
1689						}
1690						/*
1691						 * Normally it is unwise to clear PG_BUSY without
1692						 * PAGE_WAKEUP -- but it is okay here, as there is
1693						 * no chance for blocking between here and vm_page_alloc
1694						 */
1695						m->flags &= ~PG_BUSY;
1696						vm_page_wire(m);
1697						bp->b_flags &= ~B_CACHE;
1698					} else if (m->flags & PG_BUSY) {
1699						s = splvm();
1700						if (m->flags & PG_BUSY) {
1701							m->flags |= PG_WANTED;
1702							tsleep(m, PVM, "pgtblk", 0);
1703						}
1704						splx(s);
1705						goto doretry;
1706					} else {
1707						if ((curproc != pageproc) &&
1708							((m->queue - m->pc) == PQ_CACHE) &&
1709						    ((cnt.v_free_count + cnt.v_cache_count) <
1710								(cnt.v_free_min + cnt.v_cache_min))) {
1711							pagedaemon_wakeup();
1712						}
1713						bytesinpage = tinc;
1714						if (tinc > (newbsize - toff))
1715							bytesinpage = newbsize - toff;
1716						if (bp->b_flags & B_CACHE)
1717							vfs_buf_set_valid(bp, off, toff, bytesinpage, m);
1718						vm_page_wire(m);
1719					}
1720					bp->b_pages[pageindex] = m;
1721					curbpnpages = pageindex + 1;
1722				}
1723				if (vp->v_tag == VT_NFS &&
1724				    vp->v_type != VBLK) {
1725					if (bp->b_dirtyend > 0) {
1726						bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
1727						bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
1728					}
1729					if (bp->b_validend == 0)
1730						bp->b_flags &= ~B_CACHE;
1731				}
1732				bp->b_data = (caddr_t) trunc_page(bp->b_data);
1733				bp->b_npages = curbpnpages;
1734				pmap_qenter((vm_offset_t) bp->b_data,
1735					bp->b_pages, bp->b_npages);
1736				((vm_offset_t) bp->b_data) |= off & PAGE_MASK;
1737			}
1738		}
1739	}
1740	if (bp->b_flags & B_VMIO)
1741		vmiospace += (newbsize - bp->b_bufsize);
1742	bufspace += (newbsize - bp->b_bufsize);
1743	bp->b_bufsize = newbsize;
1744	bp->b_bcount = size;
1745	return 1;
1746}
1747
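/*
 * Sketch of a caller growing a buffer in place with allocbuf()
 * (illustrative only; "myfs_grow_block" and its sizes are made up --
 * getblk() already calls allocbuf() for the common case):
 */
#ifdef notdef
static void
myfs_grow_block(struct vnode *vp, daddr_t lblkno, int osize, int nsize)
{
	struct buf *bp;

	bp = getblk(vp, lblkno, osize, 0, 0);
	/* nsize must not exceed bp->b_kvasize (see the panic above) */
	allocbuf(bp, nsize);
	bdwrite(bp);
}
#endif /* notdef */
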
1748/*
1749 * Wait for buffer I/O completion, returning error status.
1750 */
1751int
1752biowait(register struct buf * bp)
1753{
1754	int s;
1755
1756	s = splbio();
1757	while ((bp->b_flags & B_DONE) == 0)
1758#if defined(NO_SCHEDULE_MODS)
1759		tsleep(bp, PRIBIO, "biowait", 0);
1760#else
1761		if (bp->b_flags & B_READ)
1762			tsleep(bp, PRIBIO, "biord", 0);
1763		else
1764			tsleep(bp, curproc->p_usrpri, "biowr", 0);
1765#endif
1766	splx(s);
1767	if (bp->b_flags & B_EINTR) {
1768		bp->b_flags &= ~B_EINTR;
1769		return (EINTR);
1770	}
1771	if (bp->b_flags & B_ERROR) {
1772		return (bp->b_error ? bp->b_error : EIO);
1773	} else {
1774		return (0);
1775	}
1776}
1777
1778/*
1779 * Finish I/O on a buffer, calling an optional function.
1780 * This is usually called from interrupt level, so process blocking
1781 * is not *a good idea*.
1782 */
1783void
1784biodone(register struct buf * bp)
1785{
1786	int s;
1787
1788	s = splbio();
1789
1790#if !defined(MAX_PERF)
1791	if (!(bp->b_flags & B_BUSY))
1792		panic("biodone: buffer not busy");
1793#endif
1794
1795	if (bp->b_flags & B_DONE) {
1796		splx(s);
1797#if !defined(MAX_PERF)
1798		printf("biodone: buffer already done\n");
1799#endif
1800		return;
1801	}
1802	bp->b_flags |= B_DONE;
1803
1804	if ((bp->b_flags & B_READ) == 0) {
1805		vwakeup(bp);
1806	}
1807#ifdef BOUNCE_BUFFERS
1808	if (bp->b_flags & B_BOUNCE)
1809		vm_bounce_free(bp);
1810#endif
1811
1812	/* call optional completion function if requested */
1813	if (bp->b_flags & B_CALL) {
1814		bp->b_flags &= ~B_CALL;
1815		(*bp->b_iodone) (bp);
1816		splx(s);
1817		return;
1818	}
1819	if (bp->b_flags & B_VMIO) {
1820		int i, resid;
1821		vm_ooffset_t foff;
1822		vm_page_t m;
1823		vm_object_t obj;
1824		int iosize;
1825		struct vnode *vp = bp->b_vp;
1826
1827		obj = vp->v_object;
1828
1829#if defined(VFS_BIO_DEBUG)
1830		if (vp->v_usecount == 0) {
1831			panic("biodone: zero vnode ref count");
1832		}
1833
1834		if (vp->v_object == NULL) {
1835			panic("biodone: missing VM object");
1836		}
1837
1838		if ((vp->v_flag & VOBJBUF) == 0) {
1839			panic("biodone: vnode is not setup for merged cache");
1840		}
1841#endif
1842
1843		if (vp->v_type == VBLK)
1844			foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
1845		else
1846			foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1847#if !defined(MAX_PERF)
1848		if (!obj) {
1849			panic("biodone: no object");
1850		}
1851#endif
1852#if defined(VFS_BIO_DEBUG)
1853		if (obj->paging_in_progress < bp->b_npages) {
1854			printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
1855			    obj->paging_in_progress, bp->b_npages);
1856		}
1857#endif
1858		iosize = bp->b_bufsize;
1859		for (i = 0; i < bp->b_npages; i++) {
1860			int bogusflag = 0;
1861			m = bp->b_pages[i];
1862			if (m == bogus_page) {
1863				bogusflag = 1;
1864				m = vm_page_lookup(obj, OFF_TO_IDX(foff));
1865				if (!m) {
1866#if defined(VFS_BIO_DEBUG)
1867					printf("biodone: page disappeared\n");
1868#endif
1869					--obj->paging_in_progress;
1870					continue;
1871				}
1872				bp->b_pages[i] = m;
1873				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1874			}
1875#if defined(VFS_BIO_DEBUG)
1876			if (OFF_TO_IDX(foff) != m->pindex) {
1877				printf("biodone: foff(%d)/m->pindex(%d) mismatch\n", foff, m->pindex);
1878			}
1879#endif
1880			resid = IDX_TO_OFF(m->pindex + 1) - foff;
1881			if (resid > iosize)
1882				resid = iosize;
1883			/*
1884			 * In the write case, the valid and clean bits are
1885			 * already changed correctly, so we only need to do this
1886			 * here in the read case.
1887			 */
1888			if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
1889				vfs_page_set_valid(bp, foff, i, m);
1890			}
1891
1892			/*
1893			 * when debugging new filesystems or buffer I/O methods, this
1894			 * is the most common error that pops up.  if you see this, you
1895			 * have not set the page busy flag correctly!!!
1896			 */
1897			if (m->busy == 0) {
1898#if !defined(MAX_PERF)
1899				printf("biodone: page busy < 0, "
1900				    "pindex: %d, foff: 0x(%x,%x), "
1901				    "resid: %d, index: %d\n",
1902				    (int) m->pindex, (int)(foff >> 32),
1903						(int) foff & 0xffffffff, resid, i);
1904#endif
1905				if (vp->v_type != VBLK)
1906#if !defined(MAX_PERF)
1907					printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n",
1908					    bp->b_vp->v_mount->mnt_stat.f_iosize,
1909					    (int) bp->b_lblkno,
1910					    bp->b_flags, bp->b_npages);
1911				else
1912					printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n",
1913					    (int) bp->b_lblkno,
1914					    bp->b_flags, bp->b_npages);
1915				printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n",
1916				    m->valid, m->dirty, m->wire_count);
1917#endif
1918				panic("biodone: page busy < 0\n");
1919			}
1920			--m->busy;
1921			if ((m->busy == 0) && (m->flags & PG_WANTED)) {
1922				m->flags &= ~PG_WANTED;
1923				wakeup(m);
1924			}
1925			--obj->paging_in_progress;
1926			foff += resid;
1927			iosize -= resid;
1928		}
1929		if (obj && obj->paging_in_progress == 0 &&
1930		    (obj->flags & OBJ_PIPWNT)) {
1931			obj->flags &= ~OBJ_PIPWNT;
1932			wakeup(obj);
1933		}
1934	}
1935	/*
1936	 * For asynchronous completions, release the buffer now. The brelse
1937	 * checks for B_WANTED and will do the wakeup there if necessary - so
1938	 * no need to do a wakeup here in the async case.
1939	 */
1940
1941	if (bp->b_flags & B_ASYNC) {
1942		if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0)
1943			brelse(bp);
1944		else
1945			bqrelse(bp);
1946	} else {
1947		bp->b_flags &= ~B_WANTED;
1948		wakeup(bp);
1949	}
1950	splx(s);
1951}
1952
1953int
1954count_lock_queue()
1955{
1956	int count;
1957	struct buf *bp;
1958
1959	count = 0;
1960	for (bp = TAILQ_FIRST(&bufqueues[QUEUE_LOCKED]);
1961	    bp != NULL;
1962	    bp = TAILQ_NEXT(bp, b_freelist))
1963		count++;
1964	return (count);
1965}
1966
1967int vfs_update_interval = 30;
1968
1969static void
1970vfs_update()
1971{
1972	while (1) {
1973		tsleep(&vfs_update_wakeup, PUSER, "update",
1974		    hz * vfs_update_interval);
1975		vfs_update_wakeup = 0;
1976		sync(curproc, NULL);
1977	}
1978}
1979
1980static int
1981sysctl_kern_updateinterval SYSCTL_HANDLER_ARGS
1982{
1983	int error = sysctl_handle_int(oidp,
1984		oidp->oid_arg1, oidp->oid_arg2, req);
1985	if (!error)
1986		wakeup(&vfs_update_wakeup);
1987	return error;
1988}
1989
1990SYSCTL_PROC(_kern, KERN_UPDATEINTERVAL, update, CTLTYPE_INT|CTLFLAG_RW,
1991	&vfs_update_interval, 0, sysctl_kern_updateinterval, "I", "");
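/*
 * The handler above calls wakeup() on the update daemon's sleep channel
 * after any successful sysctl access, so a newly written (shorter)
 * interval takes effect immediately instead of after the old interval
 * expires.
 */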
1992
1993
1994/*
1995 * This routine is called in lieu of iodone in the case of
1996 * incomplete I/O.  This keeps the busy status for pages
1997 * consistent.
1998 */
1999void
2000vfs_unbusy_pages(struct buf * bp)
2001{
2002	int i;
2003
2004	if (bp->b_flags & B_VMIO) {
2005		struct vnode *vp = bp->b_vp;
2006		vm_object_t obj = vp->v_object;
2007		vm_ooffset_t foff;
2008
2009		foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
2010
2011		for (i = 0; i < bp->b_npages; i++) {
2012			vm_page_t m = bp->b_pages[i];
2013
2014			if (m == bogus_page) {
2015				m = vm_page_lookup(obj, OFF_TO_IDX(foff) + i);
2016#if !defined(MAX_PERF)
2017				if (!m) {
2018					panic("vfs_unbusy_pages: page missing\n");
2019				}
2020#endif
2021				bp->b_pages[i] = m;
2022				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
2023			}
2024			--obj->paging_in_progress;
2025			--m->busy;
2026			if ((m->busy == 0) && (m->flags & PG_WANTED)) {
2027				m->flags &= ~PG_WANTED;
2028				wakeup(m);
2029			}
2030		}
2031		if (obj->paging_in_progress == 0 &&
2032		    (obj->flags & OBJ_PIPWNT)) {
2033			obj->flags &= ~OBJ_PIPWNT;
2034			wakeup(obj);
2035		}
2036	}
2037}
2038
2039/*
2040 * Set NFS' b_validoff and b_validend fields from the valid bits
2041 * of a page.  If the consumer is not NFS, and the page is not
2042 * valid for the entire range, clear the B_CACHE flag to force
2043 * the consumer to re-read the page.
2044 */
2045static void
2046vfs_buf_set_valid(struct buf *bp,
2047		  vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
2048		  vm_page_t m)
2049{
2050	if (bp->b_vp->v_tag == VT_NFS && bp->b_vp->v_type != VBLK) {
2051		vm_offset_t svalid, evalid;
2052		int validbits = m->valid;
2053
2054		/*
2055		 * This only bothers with the first valid range in the
2056		 * page.
2057		 */
2058		svalid = off;
2059		while (validbits && !(validbits & 1)) {
2060			svalid += DEV_BSIZE;
2061			validbits >>= 1;
2062		}
2063		evalid = svalid;
2064		while (validbits & 1) {
2065			evalid += DEV_BSIZE;
2066			validbits >>= 1;
2067		}
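		/*
		 * Example (assuming DEV_BSIZE is 512): m->valid == 0x3c
		 * leaves svalid == off + 1024 and evalid == off + 3072,
		 * i.e. the first run of valid 512-byte blocks in the page.
		 */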
2068		/*
2069		 * Make sure this range is contiguous with the range
2070		 * built up from previous pages.  If not, then we will
2071		 * just use the range from the previous pages.
2072		 */
2073		if (svalid == bp->b_validend) {
2074			bp->b_validoff = min(bp->b_validoff, svalid);
2075			bp->b_validend = max(bp->b_validend, evalid);
2076		}
2077	} else if (!vm_page_is_valid(m,
2078				     (vm_offset_t) ((foff + off) & PAGE_MASK),
2079				     size)) {
2080		bp->b_flags &= ~B_CACHE;
2081	}
2082}
2083
2084/*
2085 * Set the valid bits in a page, taking care of the b_validoff,
2086 * b_validend fields which NFS uses to optimise small reads.  Off is
2087 * the offset within the file and pageno is the page index within the buf.
2088 */
2089static void
2090vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
2091{
2092	struct vnode *vp = bp->b_vp;
2093	vm_ooffset_t soff, eoff;
2094
2095	soff = off;
2096	eoff = off + min(PAGE_SIZE, bp->b_bufsize);
2097	vm_page_set_invalid(m,
2098			    (vm_offset_t) (soff & PAGE_MASK),
2099			    (vm_offset_t) (eoff - soff));
2100	if (vp->v_tag == VT_NFS && vp->v_type != VBLK) {
2101		vm_ooffset_t sv, ev;
2102		off = off - pageno * PAGE_SIZE;
2103		sv = off + ((bp->b_validoff + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1));
2104		ev = off + (bp->b_validend & ~(DEV_BSIZE - 1));
2105		soff = max(sv, soff);
2106		eoff = min(ev, eoff);
2107	}
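	/*
	 * For NFS the range has just been clipped to whole DEV_BSIZE
	 * blocks: b_validoff is rounded up and b_validend rounded down,
	 * so a block that is only partially valid is never marked valid
	 * and clean.  Example (assuming DEV_BSIZE is 512): b_validoff ==
	 * 100 and b_validend == 1536 reduce to the byte range [512, 1536)
	 * within the buffer.
	 */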
2108	if (eoff > soff)
2109		vm_page_set_validclean(m,
2110				       (vm_offset_t) (soff & PAGE_MASK),
2111				       (vm_offset_t) (eoff - soff));
2112}
2113
2114/*
2115 * This routine is called before a device strategy routine.
2116 * It is used to tell the VM system that paging I/O is in
2117 * progress and to treat the pages associated with the buffer
2118 * almost as if they were PG_BUSY.  The object's paging_in_progress
2119 * counter is also maintained so that the object does not become
2120 * inconsistent.
2121 */
2122void
2123vfs_busy_pages(struct buf * bp, int clear_modify)
2124{
2125	int i;
2126
2127	if (bp->b_flags & B_VMIO) {
2128		struct vnode *vp = bp->b_vp;
2129		vm_object_t obj = vp->v_object;
2130		vm_ooffset_t foff;
2131
2132		if (vp->v_type == VBLK)
2133			foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
2134		else
2135			foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
2136		vfs_setdirty(bp);
2137		for (i = 0; i < bp->b_npages; i++, foff += PAGE_SIZE) {
2138			vm_page_t m = bp->b_pages[i];
2139
2140			if ((bp->b_flags & B_CLUSTER) == 0) {
2141				obj->paging_in_progress++;
2142				m->busy++;
2143			}
2144			vm_page_protect(m, VM_PROT_NONE);
2145			if (clear_modify)
2146				vfs_page_set_valid(bp, foff, i, m);
2147			else if (bp->b_bcount >= PAGE_SIZE) {
2148				if (m->valid && (bp->b_flags & B_CACHE) == 0) {
2149					bp->b_pages[i] = bogus_page;
2150					pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
2151				}
2152			}
2153		}
2154	}
2155}
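
/*
 * Illustrative sketch (not lifted from any particular driver or file
 * system): a VMIO transfer is normally bracketed as
 *
 *	vfs_busy_pages(bp, clear_modify);
 *	... hand bp to the device strategy routine ...
 *	biodone(bp);			on normal completion, or
 *	vfs_unbusy_pages(bp);		if the I/O is backed out
 *
 * so that the per-page busy counts and the object's paging_in_progress
 * counter stay balanced.
 */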
2156
2157/*
2158 * Tell the VM system that the pages associated with this buffer
2159 * are clean.  This is used for delayed writes, where the data will
2160 * eventually go to disk without additional VM intervention.
2161 */
2162void
2163vfs_clean_pages(struct buf * bp)
2164{
2165	int i;
2166
2167	if (bp->b_flags & B_VMIO) {
2168		struct vnode *vp = bp->b_vp;
2169		vm_ooffset_t foff;
2170
2171		if (vp->v_type == VBLK)
2172			foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
2173		else
2174			foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
2175		for (i = 0; i < bp->b_npages; i++, foff += PAGE_SIZE) {
2176			vm_page_t m = bp->b_pages[i];
2177
2178			vfs_page_set_valid(bp, foff, i, m);
2179		}
2180	}
2181}
2182
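/*
 * Clear a buffer's data area.  For a VMIO buffer the pages' valid bits
 * are consulted so that, in the multi-page case, only DEV_BSIZE blocks
 * that are not already valid get zeroed; a non-VMIO buffer is simply
 * cleared with clrbuf().
 */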
2183void
2184vfs_bio_clrbuf(struct buf *bp) {
2185	int i;
2186	if (bp->b_flags & B_VMIO) {
2187		if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE)) {
2188			int mask;
2189			mask = 0;
2190			for (i = 0; i < bp->b_bufsize; i += DEV_BSIZE)
2191				mask |= (1 << (i / DEV_BSIZE));
2192			if (bp->b_pages[0]->valid != mask) {
2193				bzero(bp->b_data, bp->b_bufsize);
2194			}
2195			bp->b_pages[0]->valid = mask;
2196			bp->b_resid = 0;
2197			return;
2198		}
2199		for (i = 0; i < bp->b_npages; i++) {
2200			if (bp->b_pages[i]->valid == VM_PAGE_BITS_ALL)
2201				continue;
2202			if (bp->b_pages[i]->valid == 0) {
2203				if ((bp->b_pages[i]->flags & PG_ZERO) == 0) {
2204					bzero(bp->b_data + (i << PAGE_SHIFT), PAGE_SIZE);
2205				}
2206			} else {
2207				int j;
2208				for (j = 0; j < PAGE_SIZE / DEV_BSIZE; j++) {
2209					if ((bp->b_pages[i]->valid & (1 << j)) == 0)
2210						bzero(bp->b_data + (i << PAGE_SHIFT) + j * DEV_BSIZE, DEV_BSIZE);
2211				}
2212			}
2213			/* bp->b_pages[i]->valid = VM_PAGE_BITS_ALL; */
2214		}
2215		bp->b_resid = 0;
2216	} else {
2217		clrbuf(bp);
2218	}
2219}
2220
2221/*
2222 * vm_hold_load_pages and vm_hold_free_pages get pages into and out of
2223 * a buffer's address space.  The pages are anonymous and are
2224 * not associated with a file object.
2225 */
2226void
2227vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
2228{
2229	vm_offset_t pg;
2230	vm_page_t p;
2231	int index;
2232
2233	to = round_page(to);
2234	from = round_page(from);
2235	index = (from - trunc_page(bp->b_data)) >> PAGE_SHIFT;
2236
2237	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
2238
2239tryagain:
2240
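		/*
		 * Allocate an anonymous page in kernel_object at the index
		 * corresponding to this kernel virtual address; if no page
		 * is available, block in VM_WAIT and retry.
		 */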
2241		p = vm_page_alloc(kernel_object,
2242			((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
2243		    VM_ALLOC_NORMAL);
2244		if (!p) {
2245			VM_WAIT;
2246			goto tryagain;
2247		}
2248		vm_page_wire(p);
2249		pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
2250		bp->b_pages[index] = p;
2251		PAGE_WAKEUP(p);
2252	}
2253	bp->b_npages = index;
2254}
2255
2256void
2257vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
2258{
2259	vm_offset_t pg;
2260	vm_page_t p;
2261	int index, newnpages;
2262
2263	from = round_page(from);
2264	to = round_page(to);
2265	newnpages = index = (from - trunc_page(bp->b_data)) >> PAGE_SHIFT;
2266
2267	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
2268		p = bp->b_pages[index];
2269		if (p && (index < bp->b_npages)) {
2270#if !defined(MAX_PERF)
2271			if (p->busy) {
2272				printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n",
2273					bp->b_blkno, bp->b_lblkno);
2274			}
2275#endif
2276			bp->b_pages[index] = NULL;
2277			pmap_kremove(pg);
2278			vm_page_unwire(p);
2279			vm_page_free(p);
2280		}
2281	}
2282	bp->b_npages = newnpages;
2283}
2284
2285
2286#include "opt_ddb.h"
2287#ifdef DDB
2288#include <ddb/ddb.h>
2289
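/*
 * DDB "show buffer <addr>" command: print the interesting fields of the
 * struct buf at the given address, decoding b_flags with the %b format
 * and listing the buffer's VM pages.
 */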
2290DB_SHOW_COMMAND(buffer, db_show_buffer)
2291{
2292	/* get args */
2293	struct buf *bp = (struct buf *)addr;
2294
2295	if (!have_addr) {
2296		db_printf("usage: show buffer <addr>\n");
2297		return;
2298	}
2299
2300	db_printf("b_proc = %p,\nb_flags = 0x%b\n", (void *)bp->b_proc,
2301		  bp->b_flags, "\20\40bounce\37cluster\36vmio\35ram\34ordered"
2302		  "\33paging\32xxx\31writeinprog\30wanted\27relbuf\26tape"
2303		  "\25read\24raw\23phys\22clusterok\21malloc\20nocache"
2304		  "\17locked\16inval\15gathered\14error\13eintr\12done\11dirty"
2305		  "\10delwri\7call\6cache\5busy\4bad\3async\2needcommit\1age");
2306	db_printf("b_error = %d, b_bufsize = %ld, b_bcount = %ld, "
2307		  "b_resid = %ld\nb_dev = 0x%x, b_data = %p, "
2308		  "b_blkno = %d, b_pblkno = %d\n",
2309		  bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
2310		  bp->b_dev, bp->b_data, bp->b_blkno, bp->b_pblkno);
2311	if (bp->b_npages) {
2312		int i;
2313		db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
2314		for (i = 0; i < bp->b_npages; i++) {
2315			vm_page_t m;
2316			m = bp->b_pages[i];
2317			db_printf("(0x%x, 0x%x, 0x%x)", m->object, m->pindex,
2318				VM_PAGE_TO_PHYS(m));
2319			if ((i + 1) < bp->b_npages)
2320				db_printf(",");
2321		}
2322		db_printf("\n");
2323	}
2324}
2325#endif /* DDB */
2326