vfs_bio.c revision 32702
1/*
2 * Copyright (c) 1994,1997 John S. Dyson
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice immediately at the beginning of the file, without modification,
10 *    this list of conditions, and the following disclaimer.
11 * 2. Absolutely no warranty of function or purpose is made by the author
12 *		John S. Dyson.
13 *
14 * $Id: vfs_bio.c,v 1.143 1998/01/17 09:16:26 dyson Exp $
15 */
16
17/*
18 * this file contains a new buffer I/O scheme implementing a coherent
19 * VM object and buffer cache scheme.  Pains have been taken to make
20 * sure that the performance degradation associated with schemes such
21 * as this is not realized.
22 *
23 * Author:  John S. Dyson
24 * Significant help during the development and debugging phases
25 * had been provided by David Greenman, also of the FreeBSD core team.
26 */
27
28#include "opt_bounce.h"
29
30#define VMIO
31#include <sys/param.h>
32#include <sys/systm.h>
33#include <sys/sysproto.h>
34#include <sys/kernel.h>
35#include <sys/sysctl.h>
36#include <sys/proc.h>
37#include <sys/vnode.h>
38#include <sys/vmmeter.h>
39#include <sys/lock.h>
40#include <vm/vm.h>
41#include <vm/vm_param.h>
42#include <vm/vm_prot.h>
43#include <vm/vm_kern.h>
44#include <vm/vm_pageout.h>
45#include <vm/vm_page.h>
46#include <vm/vm_object.h>
47#include <vm/vm_extern.h>
48#include <vm/vm_map.h>
49#include <sys/buf.h>
50#include <sys/mount.h>
51#include <sys/malloc.h>
52#include <sys/resourcevar.h>
53
54static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer");
55
56static void vfs_update __P((void));
57static struct	proc *updateproc;
58static struct kproc_desc up_kp = {
59	"update",
60	vfs_update,
61	&updateproc
62};
63SYSINIT_KT(update, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
64
65struct buf *buf;		/* buffer header pool */
66struct swqueue bswlist;
67
68int count_lock_queue __P((void));
69static void vm_hold_free_pages(struct buf * bp, vm_offset_t from,
70		vm_offset_t to);
71static void vm_hold_load_pages(struct buf * bp, vm_offset_t from,
72		vm_offset_t to);
73static void vfs_buf_set_valid(struct buf *bp, vm_ooffset_t foff,
74			      vm_offset_t off, vm_offset_t size,
75			      vm_page_t m);
76static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off,
77			       int pageno, vm_page_t m);
78static void vfs_clean_pages(struct buf * bp);
79static void vfs_setdirty(struct buf *bp);
80static void vfs_vmio_release(struct buf *bp);
81static void flushdirtybuffers(int slpflag, int slptimeo);
82
83int needsbuffer;
84
85/*
86 * Internal update daemon, process 3
87 *	The variable vfs_update_wakeup allows for internal syncs.
88 */
89int vfs_update_wakeup;
90
91
92/*
93 * buffers base kva
94 */
95
96/*
97 * bogus page -- for I/O to/from partially complete buffers
98 * this is a temporary solution to the problem, but it is not
99 * really that bad.  it would be better to split the buffer
100 * for input in the case of buffers that are already partially in memory,
101 * but the code is intricate enough already.
102 */
103vm_page_t bogus_page;
104static vm_offset_t bogus_offset;
105
106static int bufspace, maxbufspace, vmiospace, maxvmiobufspace,
107	bufmallocspace, maxbufmallocspace;
108int numdirtybuffers, lodirtybuffers, hidirtybuffers;
109static int numfreebuffers, lofreebuffers, hifreebuffers;
110static int kvafreespace;
111
112SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD,
113	&numdirtybuffers, 0, "");
114SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW,
115	&lodirtybuffers, 0, "");
116SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW,
117	&hidirtybuffers, 0, "");
118SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD,
119	&numfreebuffers, 0, "");
120SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW,
121	&lofreebuffers, 0, "");
122SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW,
123	&hifreebuffers, 0, "");
124SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW,
125	&maxbufspace, 0, "");
126SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD,
127	&bufspace, 0, "");
128SYSCTL_INT(_vfs, OID_AUTO, maxvmiobufspace, CTLFLAG_RW,
129	&maxvmiobufspace, 0, "");
130SYSCTL_INT(_vfs, OID_AUTO, vmiospace, CTLFLAG_RD,
131	&vmiospace, 0, "");
132SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW,
133	&maxbufmallocspace, 0, "");
134SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD,
135	&bufmallocspace, 0, "");
136SYSCTL_INT(_vfs, OID_AUTO, kvafreespace, CTLFLAG_RD,
137	&kvafreespace, 0, "");
138
139static LIST_HEAD(bufhashhdr, buf) bufhashtbl[BUFHSZ], invalhash;
140static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES];
141
142extern int vm_swap_size;
143
144#define BUF_MAXUSE 24
145
146#define VFS_BIO_NEED_ANY 1
147#define VFS_BIO_NEED_LOWLIMIT 2
148#define VFS_BIO_NEED_FREE 4
149
150/*
151 * Initialize buffer headers and related structures.
152 */
153void
154bufinit()
155{
156	struct buf *bp;
157	int i;
158
159	TAILQ_INIT(&bswlist);
160	LIST_INIT(&invalhash);
161
162	/* first, make a null hash table */
163	for (i = 0; i < BUFHSZ; i++)
164		LIST_INIT(&bufhashtbl[i]);
165
166	/* next, make a null set of free lists */
167	for (i = 0; i < BUFFER_QUEUES; i++)
168		TAILQ_INIT(&bufqueues[i]);
169
170	/* finally, initialize each buffer header and stick on empty q */
171	for (i = 0; i < nbuf; i++) {
172		bp = &buf[i];
173		bzero(bp, sizeof *bp);
174		bp->b_flags = B_INVAL;	/* we're just an empty header */
175		bp->b_dev = NODEV;
176		bp->b_rcred = NOCRED;
177		bp->b_wcred = NOCRED;
178		bp->b_qindex = QUEUE_EMPTY;
179		bp->b_vnbufs.le_next = NOLIST;
180		bp->b_generation = 0;
181		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
182		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
183	}
184/*
185 * maxbufspace is currently calculated on the assumption that all filesystem
186 * blocks are 8K.  If you happen to use a 16K filesystem, the size of the buffer
187 * cache is still the same as it would be for 8K filesystems.  This
188 * keeps the size of the buffer cache "in check" for big block filesystems.
189 */
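/*
 * Illustrative figure only: with nbuf = 1024 the line below limits total
 * buffer space to 1032 default-sized (DFLTBSIZE) blocks; nbuf itself is
 * tuned at boot time.
 */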
190	maxbufspace = (nbuf + 8) * DFLTBSIZE;
191/*
192 * reserve 1/3 of the buffers for metadata (VDIR) which might not be VMIO'ed
193 */
194	maxvmiobufspace = 2 * maxbufspace / 3;
195/*
196 * Limit the amount of malloc memory since it is wired permanently into
197 * the kernel space.  Even though this is accounted for in the buffer
198 * allocation, we don't want the malloced region to grow uncontrolled.
199 * The malloc scheme improves memory utilization significantly on average
200 * (small) directories.
201 */
202	maxbufmallocspace = maxbufspace / 20;
203
204/*
205 * Remove the probability of deadlock conditions by limiting the
206 * number of dirty buffers.
207 */
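/*
 * Illustrative values only: with nbuf = 1024 the assignments below give
 * hidirtybuffers = 148, lodirtybuffers = 74, lofreebuffers = 61 and
 * hifreebuffers = 122; the real figures scale with nbuf as tuned at boot.
 */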
208	hidirtybuffers = nbuf / 8 + 20;
209	lodirtybuffers = nbuf / 16 + 10;
210	numdirtybuffers = 0;
211	lofreebuffers = nbuf / 18 + 5;
212	hifreebuffers = 2 * lofreebuffers;
213	numfreebuffers = nbuf;
214	kvafreespace = 0;
215
216	bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
217	bogus_page = vm_page_alloc(kernel_object,
218			((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
219			VM_ALLOC_NORMAL);
220
221}
222
223/*
224 * Free the kva allocation for a buffer
225 * Must be called only at splbio or higher,
226 *  as this is the only locking for buffer_map.
227 */
228static void
229bfreekva(struct buf * bp)
230{
231	if (bp->b_kvasize == 0)
232		return;
233
234	vm_map_delete(buffer_map,
235		(vm_offset_t) bp->b_kvabase,
236		(vm_offset_t) bp->b_kvabase + bp->b_kvasize);
237
238	bp->b_kvasize = 0;
239
240}
241
242/*
243 * remove the buffer from the appropriate free list
244 */
245void
246bremfree(struct buf * bp)
247{
248	int s = splbio();
249
250	if (bp->b_qindex != QUEUE_NONE) {
251		if (bp->b_qindex == QUEUE_EMPTY) {
252			kvafreespace -= bp->b_kvasize;
253		}
254		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
255		bp->b_qindex = QUEUE_NONE;
256	} else {
257#if !defined(MAX_PERF)
258		panic("bremfree: removing a buffer when not on a queue");
259#endif
260	}
261	if ((bp->b_flags & B_INVAL) ||
262		(bp->b_flags & (B_DELWRI|B_LOCKED)) == 0)
263		--numfreebuffers;
264	splx(s);
265}
266
267
268/*
269 * Get a buffer with the specified data.  Look in the cache first.
270 */
271int
272bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
273    struct buf ** bpp)
274{
275	struct buf *bp;
276
277	bp = getblk(vp, blkno, size, 0, 0);
278	*bpp = bp;
279
280	/* if not found in cache, do some I/O */
281	if ((bp->b_flags & B_CACHE) == 0) {
282		if (curproc != NULL)
283			curproc->p_stats->p_ru.ru_inblock++;
284		bp->b_flags |= B_READ;
285		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
286		if (bp->b_rcred == NOCRED) {
287			if (cred != NOCRED)
288				crhold(cred);
289			bp->b_rcred = cred;
290		}
291		vfs_busy_pages(bp, 0);
292		VOP_STRATEGY(bp);
293		return (biowait(bp));
294	}
295	return (0);
296}
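/*
 * Sketch of typical bread()/brelse() use by a caller (illustrative only,
 * not taken from this file):
 *
 *	struct buf *bp;
 *	int error;
 *
 *	error = bread(vp, lblkno, size, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	... examine or copy bp->b_data ...
 *	brelse(bp);	(or bdwrite(bp)/bawrite(bp) if the caller dirtied it)
 */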
297
298/*
299 * Operates like bread, but also starts asynchronous I/O on
300 * read-ahead blocks.
301 */
302int
303breadn(struct vnode * vp, daddr_t blkno, int size,
304    daddr_t * rablkno, int *rabsize,
305    int cnt, struct ucred * cred, struct buf ** bpp)
306{
307	struct buf *bp, *rabp;
308	int i;
309	int rv = 0, readwait = 0;
310
311	*bpp = bp = getblk(vp, blkno, size, 0, 0);
312
313	/* if not found in cache, do some I/O */
314	if ((bp->b_flags & B_CACHE) == 0) {
315		if (curproc != NULL)
316			curproc->p_stats->p_ru.ru_inblock++;
317		bp->b_flags |= B_READ;
318		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
319		if (bp->b_rcred == NOCRED) {
320			if (cred != NOCRED)
321				crhold(cred);
322			bp->b_rcred = cred;
323		}
324		vfs_busy_pages(bp, 0);
325		VOP_STRATEGY(bp);
326		++readwait;
327	}
328	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
329		if (inmem(vp, *rablkno))
330			continue;
331		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
332
333		if ((rabp->b_flags & B_CACHE) == 0) {
334			if (curproc != NULL)
335				curproc->p_stats->p_ru.ru_inblock++;
336			rabp->b_flags |= B_READ | B_ASYNC;
337			rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
338			if (rabp->b_rcred == NOCRED) {
339				if (cred != NOCRED)
340					crhold(cred);
341				rabp->b_rcred = cred;
342			}
343			vfs_busy_pages(rabp, 0);
344			VOP_STRATEGY(rabp);
345		} else {
346			brelse(rabp);
347		}
348	}
349
350	if (readwait) {
351		rv = biowait(bp);
352	}
353	return (rv);
354}
355
356/*
357 * Write, release buffer on completion.  (Done by iodone
358 * if async.)
359 */
360int
361bwrite(struct buf * bp)
362{
363	int oldflags = bp->b_flags;
364
365	if (bp->b_flags & B_INVAL) {
366		brelse(bp);
367		return (0);
368	}
369#if !defined(MAX_PERF)
370	if (!(bp->b_flags & B_BUSY))
371		panic("bwrite: buffer is not busy???");
372#endif
373
374	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
375	bp->b_flags |= B_WRITEINPROG;
376
377	if ((oldflags & B_DELWRI) == B_DELWRI) {
378		--numdirtybuffers;
379		reassignbuf(bp, bp->b_vp);
380	}
381
382	bp->b_vp->v_numoutput++;
383	vfs_busy_pages(bp, 1);
384	if (curproc != NULL)
385		curproc->p_stats->p_ru.ru_oublock++;
386	VOP_STRATEGY(bp);
387
388	if ((oldflags & B_ASYNC) == 0) {
389		int rtval = biowait(bp);
390
391		if (oldflags & B_DELWRI) {
392			reassignbuf(bp, bp->b_vp);
393		}
394		brelse(bp);
395		return (rtval);
396	}
397	return (0);
398}
399
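/*
 * Account for one more free buffer and wake up any thread sleeping on
 * needsbuffer whose particular shortage (any buffer, the dirty-buffer
 * low-water mark, or the free-buffer count) is now satisfied.
 */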
400inline void
401vfs_bio_need_satisfy(void) {
402	++numfreebuffers;
403	if (!needsbuffer)
404		return;
405	if (numdirtybuffers < lodirtybuffers) {
406		needsbuffer &= ~(VFS_BIO_NEED_ANY | VFS_BIO_NEED_LOWLIMIT);
407	} else {
408		needsbuffer &= ~VFS_BIO_NEED_ANY;
409	}
410	if (numfreebuffers >= hifreebuffers) {
411		needsbuffer &= ~VFS_BIO_NEED_FREE;
412	}
413	wakeup(&needsbuffer);
414}
415
416/*
417 * Delayed write. (Buffer is marked dirty).
418 */
419void
420bdwrite(struct buf * bp)
421{
422
423#if !defined(MAX_PERF)
424	if ((bp->b_flags & B_BUSY) == 0) {
425		panic("bdwrite: buffer is not busy");
426	}
427#endif
428
429	if (bp->b_flags & B_INVAL) {
430		brelse(bp);
431		return;
432	}
433	if (bp->b_flags & B_TAPE) {
434		bawrite(bp);
435		return;
436	}
437	bp->b_flags &= ~(B_READ|B_RELBUF);
438	if ((bp->b_flags & B_DELWRI) == 0) {
439		bp->b_flags |= B_DONE | B_DELWRI;
440		reassignbuf(bp, bp->b_vp);
441		++numdirtybuffers;
442	}
443
444	/*
445	 * This bmap keeps the system from needing to do the bmap later,
446	 * perhaps when the system is attempting to do a sync.  Since it
447	 * is likely that the indirect block -- or whatever other datastructure
448	 * that the filesystem needs is still in memory now, it is a good
449	 * thing to do this.  Note also, that if the pageout daemon is
450	 * requesting a sync -- there might not be enough memory to do
451	 * the bmap then...  So, this is important to do.
452	 */
453	if (bp->b_lblkno == bp->b_blkno) {
454		VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
455	}
456
457	/*
458	 * Set the *dirty* buffer range based upon the VM system dirty pages.
459	 */
460	vfs_setdirty(bp);
461
462	/*
463	 * We need to do this here to satisfy the vnode_pager and the
464	 * pageout daemon, so that it thinks that the pages have been
465	 * "cleaned".  Note that since the pages are in a delayed write
466	 * buffer -- the VFS layer "will" see that the pages get written
467	 * out on the next sync, or perhaps the cluster will be completed.
468	 */
469	vfs_clean_pages(bp);
470	bqrelse(bp);
471
472	if (numdirtybuffers >= hidirtybuffers)
473		flushdirtybuffers(0, 0);
474
475	return;
476}
477
478/*
479 * Asynchronous write.
480 * Start output on a buffer, but do not wait for it to complete.
481 * The buffer is released when the output completes.
482 */
483void
484bawrite(struct buf * bp)
485{
486	bp->b_flags |= B_ASYNC;
487	(void) VOP_BWRITE(bp);
488}
489
490/*
491 * Ordered write.
492 * Start output on a buffer, but only wait for it to complete if the
493 * output device cannot guarantee ordering in some other way.  Devices
494 * that can perform asynchronous ordered writes will set the B_ASYNC
495 * flag in their strategy routine.
496 * The buffer is released when the output completes.
497 */
498int
499bowrite(struct buf * bp)
500{
501	/*
502	 * XXX Add in B_ASYNC once the SCSI
503	 *     layer can deal with ordered
504	 *     writes properly.
505	 */
506	bp->b_flags |= B_ORDERED;
507	return (VOP_BWRITE(bp));
508}
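/*
 * Descriptive summary of the write interfaces above: bwrite() starts the
 * I/O immediately and, unless B_ASYNC is set, waits for completion;
 * bdwrite() merely marks the buffer B_DELWRI and requeues it so the data
 * goes out later; bawrite() is bwrite() with B_ASYNC set; bowrite() tags
 * the buffer B_ORDERED before writing it.
 */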
509
510/*
511 * Release a buffer.
512 */
513void
514brelse(struct buf * bp)
515{
516	int s;
517
518	if (bp->b_flags & B_CLUSTER) {
519		relpbuf(bp);
520		return;
521	}
522	/* anyone need a "free" block? */
523	s = splbio();
524
525	/* anyone need this block? */
526	if (bp->b_flags & B_WANTED) {
527		bp->b_flags &= ~(B_WANTED | B_AGE);
528		wakeup(bp);
529	}
530
531	if (bp->b_flags & B_LOCKED)
532		bp->b_flags &= ~B_ERROR;
533
534	if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
535	    (bp->b_bufsize <= 0)) {
536		bp->b_flags |= B_INVAL;
537		if (bp->b_flags & B_DELWRI)
538			--numdirtybuffers;
539		bp->b_flags &= ~(B_DELWRI | B_CACHE);
540		if (((bp->b_flags & B_VMIO) == 0) && bp->b_vp) {
541			if (bp->b_bufsize)
542				allocbuf(bp, 0);
543			brelvp(bp);
544		}
545	}
546
547	/*
548 * VMIO buffer rundown.  It is not strictly necessary to keep a VMIO buffer
549	 * constituted, so the B_INVAL flag is used to *invalidate* the buffer,
550	 * but the VM object is kept around.  The B_NOCACHE flag is used to
551	 * invalidate the pages in the VM object.
552	 *
553	 * If the buffer is a partially filled NFS buffer, keep it
554 * since invalidating it now will lose information.  The valid
555	 * flags in the vm_pages have only DEV_BSIZE resolution but
556	 * the b_validoff, b_validend fields have byte resolution.
557	 * This can avoid unnecessary re-reads of the buffer.
558	 * XXX this seems to cause performance problems.
559	 */
560	if ((bp->b_flags & B_VMIO)
561	    && !(bp->b_vp->v_tag == VT_NFS &&
562		 bp->b_vp->v_type != VBLK &&
563		 (bp->b_flags & B_DELWRI) != 0)
564#ifdef notdef
565	    && (bp->b_vp->v_tag != VT_NFS
566		|| bp->b_vp->v_type == VBLK
567		|| (bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR))
568		|| bp->b_validend == 0
569		|| (bp->b_validoff == 0
570		    && bp->b_validend == bp->b_bufsize))
571#endif
572	    ) {
573		vm_ooffset_t foff;
574		vm_object_t obj;
575		int i, resid;
576		vm_page_t m;
577		struct vnode *vp;
578		int iototal = bp->b_bufsize;
579
580		vp = bp->b_vp;
581
582#if !defined(MAX_PERF)
583		if (!vp)
584			panic("brelse: missing vp");
585#endif
586
587		if (bp->b_npages) {
588			vm_pindex_t poff;
589			obj = (vm_object_t) vp->v_object;
590			if (vp->v_type == VBLK)
591				foff = ((vm_ooffset_t) bp->b_lblkno) << DEV_BSHIFT;
592			else
593				foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
594			poff = OFF_TO_IDX(foff);
595			for (i = 0; i < bp->b_npages; i++) {
596				m = bp->b_pages[i];
597				if (m == bogus_page) {
598					m = vm_page_lookup(obj, poff + i);
599#if !defined(MAX_PERF)
600					if (!m) {
601						panic("brelse: page missing\n");
602					}
603#endif
604					bp->b_pages[i] = m;
605					pmap_qenter(trunc_page(bp->b_data),
606						bp->b_pages, bp->b_npages);
607				}
608				resid = IDX_TO_OFF(m->pindex+1) - foff;
609				if (resid > iototal)
610					resid = iototal;
611				if (resid > 0) {
612					/*
613					 * Don't invalidate the page if the local machine has already
614					 * modified it.  This is the lesser of two evils, and should
615					 * be fixed.
616					 */
617					if (bp->b_flags & (B_NOCACHE | B_ERROR)) {
618						vm_page_test_dirty(m);
619						if (m->dirty == 0) {
620							vm_page_set_invalid(m, (vm_offset_t) foff, resid);
621							if (m->valid == 0)
622								vm_page_protect(m, VM_PROT_NONE);
623						}
624					}
625					if (resid >= PAGE_SIZE) {
626						if ((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) {
627							bp->b_flags |= B_INVAL;
628						}
629					} else {
630						if (!vm_page_is_valid(m,
631							(((vm_offset_t) bp->b_data) & PAGE_MASK), resid)) {
632							bp->b_flags |= B_INVAL;
633						}
634					}
635				}
636				foff += resid;
637				iototal -= resid;
638			}
639		}
640		if (bp->b_flags & (B_INVAL | B_RELBUF))
641			vfs_vmio_release(bp);
642	}
643#if !defined(MAX_PERF)
644	if (bp->b_qindex != QUEUE_NONE)
645		panic("brelse: free buffer onto another queue???");
646#endif
647
648	/* enqueue */
649	/* buffers with no memory */
650	if (bp->b_bufsize == 0) {
651		bp->b_flags |= B_INVAL;
652		bp->b_qindex = QUEUE_EMPTY;
653		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
654		LIST_REMOVE(bp, b_hash);
655		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
656		bp->b_dev = NODEV;
657		kvafreespace += bp->b_kvasize;
658		bp->b_generation++;
659
660	/* buffers with junk contents */
661	} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
662		bp->b_flags |= B_INVAL;
663		bp->b_qindex = QUEUE_AGE;
664		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
665		LIST_REMOVE(bp, b_hash);
666		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
667		bp->b_dev = NODEV;
668		bp->b_generation++;
669
670	/* buffers that are locked */
671	} else if (bp->b_flags & B_LOCKED) {
672		bp->b_qindex = QUEUE_LOCKED;
673		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
674
675	/* buffers with stale but valid contents */
676	} else if (bp->b_flags & B_AGE) {
677		bp->b_qindex = QUEUE_AGE;
678		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
679
680	/* buffers with valid and quite potentially reusable contents */
681	} else {
682		bp->b_qindex = QUEUE_LRU;
683		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
684	}
685
686	if ((bp->b_flags & B_INVAL) ||
687		(bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) {
688		if (bp->b_flags & B_DELWRI) {
689			--numdirtybuffers;
690			bp->b_flags &= ~B_DELWRI;
691		}
692		vfs_bio_need_satisfy();
693	}
694
695	/* unlock */
696	bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
697				B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
698	splx(s);
699}
700
701/*
702 * Release a buffer without invalidating its contents (a cheaper variant of brelse).
703 */
704void
705bqrelse(struct buf * bp)
706{
707	int s;
708
709	s = splbio();
710
711	/* anyone need this block? */
712	if (bp->b_flags & B_WANTED) {
713		bp->b_flags &= ~(B_WANTED | B_AGE);
714		wakeup(bp);
715	}
716
717#if !defined(MAX_PERF)
718	if (bp->b_qindex != QUEUE_NONE)
719		panic("bqrelse: free buffer onto another queue???");
720#endif
721
722	if (bp->b_flags & B_LOCKED) {
723		bp->b_flags &= ~B_ERROR;
724		bp->b_qindex = QUEUE_LOCKED;
725		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
726		/* buffers with stale but valid contents */
727	} else {
728		bp->b_qindex = QUEUE_LRU;
729		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
730	}
731
732	if ((bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) {
733		vfs_bio_need_satisfy();
734	}
735
736	/* unlock */
737	bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
738		B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
739	splx(s);
740}
741
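/*
 * Release the VM pages backing a VMIO buffer: unwire each page and, for
 * synchronous releases, cache, deactivate or free it depending on whether
 * it is still valid or held; then unmap the pages from the buffer kva and
 * drop the buffer's association with its vnode.
 */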
742static void
743vfs_vmio_release(bp)
744	struct buf *bp;
745{
746	int i;
747	vm_page_t m;
748
749	for (i = 0; i < bp->b_npages; i++) {
750		m = bp->b_pages[i];
751		bp->b_pages[i] = NULL;
752		vm_page_unwire(m);
753		/*
754		 * We don't mess with busy pages, it is
755		 * the responsibility of the process that
756		 * busied the pages to deal with them.
757		 */
758		if ((m->flags & PG_BUSY) || (m->busy != 0))
759			continue;
760
761		if (m->wire_count == 0) {
762
763			if (m->flags & PG_WANTED) {
764				m->flags &= ~PG_WANTED;
765				wakeup(m);
766			}
767
768			/*
769			 * If this is an async free -- we cannot place
770			 * pages onto the cache queue.  If it is an
771			 * async free, then we don't modify any queues.
772			 * This is probably in error (for perf reasons),
773			 * and we will eventually need to build
774			 * a more complete infrastructure to support I/O
775			 * rundown.
776			 */
777			if ((bp->b_flags & B_ASYNC) == 0) {
778
779			/*
780			 * In the case of sync buffer frees, we can do pretty much
781			 * anything to any of the memory queues.  Specifically,
782			 * the cache queue is okay to be modified.
783			 */
784				if (m->valid) {
785					if(m->dirty == 0)
786						vm_page_test_dirty(m);
787					/*
788					 * this keeps pressure off of the process memory
789					 */
790					if (m->dirty == 0 && m->hold_count == 0)
791						vm_page_cache(m);
792					else
793						vm_page_deactivate(m);
794				} else if (m->hold_count == 0) {
795					struct vnode *vp;
796					vp = bp->b_vp;
797					vm_page_protect(m, VM_PROT_NONE);
798					vm_page_free(m);
799				}
800			} else {
801				/*
802				 * If async, then at least we clear the
803				 * act_count.
804				 */
805				m->act_count = 0;
806			}
807		}
808	}
809	bufspace -= bp->b_bufsize;
810	vmiospace -= bp->b_bufsize;
811	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
812	bp->b_npages = 0;
813	bp->b_bufsize = 0;
814	bp->b_flags &= ~B_VMIO;
815	if (bp->b_vp)
816		brelvp(bp);
817}
818
819/*
820 * Check to see if a block is currently memory resident (internal hashed lookup; see incore()).
821 */
822struct buf *
823gbincore(struct vnode * vp, daddr_t blkno)
824{
825	struct buf *bp;
826	struct bufhashhdr *bh;
827
828	bh = BUFHASH(vp, blkno);
829	bp = bh->lh_first;
830
831	/* Search hash chain */
832	while (bp != NULL) {
833		/* hit */
834		if (bp->b_vp == vp && bp->b_lblkno == blkno &&
835		    (bp->b_flags & B_INVAL) == 0) {
836			break;
837		}
838		bp = bp->b_hash.le_next;
839	}
840	return (bp);
841}
842
843/*
844 * this routine implements clustered async writes for
845 * clearing out B_DELWRI buffers...  This is much better
846 * than the old way of writing only one buffer at a time.
847 */
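/*
 * The routine scans forward from bp's logical block for adjacent buffers
 * that are also delayed-write, cluster-eligible and of the same size, and
 * hands any such run to cluster_wbuild(); otherwise it falls back to an
 * ordinary asynchronous write of the single buffer.  It returns the number
 * of bytes queued for writing.
 */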
848int
849vfs_bio_awrite(struct buf * bp)
850{
851	int i;
852	daddr_t lblkno = bp->b_lblkno;
853	struct vnode *vp = bp->b_vp;
854	int s;
855	int ncl;
856	struct buf *bpa;
857	int nwritten;
858	int size;
859	int maxcl;
860
861	s = splbio();
862	/*
863	 * right now we support clustered writing only to regular files
864	 */
865	if ((vp->v_type == VREG) &&
866	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
867	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
868
869		size = vp->v_mount->mnt_stat.f_iosize;
870		maxcl = MAXPHYS / size;
871
872		for (i = 1; i < maxcl; i++) {
873			if ((bpa = gbincore(vp, lblkno + i)) &&
874			    ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
875			    (B_DELWRI | B_CLUSTEROK)) &&
876			    (bpa->b_bufsize == size)) {
877				if ((bpa->b_blkno == bpa->b_lblkno) ||
878				    (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
879					break;
880			} else {
881				break;
882			}
883		}
884		ncl = i;
885		/*
886		 * this is a possible cluster write
887		 */
888		if (ncl != 1) {
889			nwritten = cluster_wbuild(vp, size, lblkno, ncl);
890			splx(s);
891			return nwritten;
892		}
893	}
894#if 0
895   	else if ((vp->v_flag & VOBJBUF) && (vp->v_type == VBLK) &&
896		((size = bp->b_bufsize) >= PAGE_SIZE)) {
897		maxcl = MAXPHYS / size;
898		for (i = 1; i < maxcl; i++) {
899			if ((bpa = gbincore(vp, lblkno + i)) &&
900			    ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
901			    (B_DELWRI | B_CLUSTEROK)) &&
902			    (bpa->b_bufsize == size)) {
903				    if (bpa->b_blkno !=
904						bp->b_blkno + ((i * size) >> DEV_BSHIFT))
905							break;
906			} else {
907				break;
908			}
909		}
910		ncl = i;
911		/*
912		 * this is a possible cluster write
913		 */
914		if (ncl != 1) {
915			nwritten = cluster_wbuild(vp, size, lblkno, ncl);
916			splx(s);
917			return nwritten;
918		}
919	}
920#endif
921
922	bremfree(bp);
923	splx(s);
924	/*
925	 * default (old) behavior, writing out only one block
926	 */
927	bp->b_flags |= B_BUSY | B_ASYNC;
928	nwritten = bp->b_bufsize;
929	(void) VOP_BWRITE(bp);
930	return nwritten;
931}
932
933
934/*
935 * Find a buffer header which is available for use.
936 */
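/*
 * The search below prefers a completely empty header (QUEUE_EMPTY) while
 * buffer space remains, then falls back to reclaiming from QUEUE_AGE and
 * QUEUE_LRU; delayed-write victims are pushed out with vfs_bio_awrite()
 * rather than being reused directly.
 */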
937static struct buf *
938getnewbuf(struct vnode *vp, daddr_t blkno,
939	int slpflag, int slptimeo, int size, int maxsize)
940{
941	struct buf *bp, *bp1;
942	int nbyteswritten = 0;
943	vm_offset_t addr;
944	static int writerecursion = 0;
945
946start:
947	if (bufspace >= maxbufspace)
948		goto trytofreespace;
949
950	/* can we constitute a new buffer? */
951	if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]))) {
952#if !defined(MAX_PERF)
953		if (bp->b_qindex != QUEUE_EMPTY)
954			panic("getnewbuf: inconsistent EMPTY queue, qindex=%d",
955			    bp->b_qindex);
956#endif
957		bp->b_flags |= B_BUSY;
958		bremfree(bp);
959		goto fillbuf;
960	}
961trytofreespace:
962	/*
963	 * We keep the file I/O from hogging metadata I/O.
964	 * This is desirable because file data is cached in the
965	 * VM/Buffer cache even if a buffer is freed.
966	 */
967	if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]))) {
968#if !defined(MAX_PERF)
969		if (bp->b_qindex != QUEUE_AGE)
970			panic("getnewbuf: inconsistent AGE queue, qindex=%d",
971			    bp->b_qindex);
972#endif
973	} else if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]))) {
974#if !defined(MAX_PERF)
975		if (bp->b_qindex != QUEUE_LRU)
976			panic("getnewbuf: inconsistent LRU queue, qindex=%d",
977			    bp->b_qindex);
978#endif
979	}
980	if (!bp) {
981		/* wait for a free buffer of any kind */
982		needsbuffer |= VFS_BIO_NEED_ANY;
983		do
984			tsleep(&needsbuffer, (PRIBIO + 1) | slpflag, "newbuf",
985			    slptimeo);
986		while (needsbuffer & VFS_BIO_NEED_ANY);
987		return (0);
988	}
989
990#if defined(DIAGNOSTIC)
991	if (bp->b_flags & B_BUSY) {
992		panic("getnewbuf: busy buffer on free list\n");
993	}
994#endif
995
996	/*
997	 * We are fairly aggressive about freeing VMIO buffers, but since
998	 * the buffering is intact without buffer headers, there is not
999	 * much loss.  We gain by maintaining non-VMIOed metadata in buffers.
1000	 */
1001	if ((bp->b_qindex == QUEUE_LRU) && (bp->b_usecount > 0)) {
1002		if ((bp->b_flags & B_VMIO) == 0 ||
1003			(vmiospace < maxvmiobufspace)) {
1004			--bp->b_usecount;
1005			TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist);
1006			if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) {
1007				TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
1008				goto start;
1009			}
1010			TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
1011		}
1012	}
1013
1014
1015	/* if we are a delayed write, convert to an async write */
1016	if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
1017
1018		/*
1019		 * If our delayed write is likely to be used soon, then
1020		 * recycle back onto the LRU queue.
1021		 */
1022		if (vp && (bp->b_vp == vp) && (bp->b_qindex == QUEUE_LRU) &&
1023			(bp->b_lblkno >= blkno) && (maxsize > 0)) {
1024
1025			if (bp->b_usecount > 0) {
1026				if (bp->b_lblkno < blkno + (MAXPHYS / maxsize)) {
1027
1028					TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist);
1029
1030					if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) {
1031						TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
1032						bp->b_usecount--;
1033						goto start;
1034					}
1035					TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
1036				}
1037			}
1038		}
1039
1040		/*
1041		 * Certain layered filesystems can recursively re-enter the vfs_bio
1042		 * code, due to delayed writes.  This helps keep the system from
1043		 * deadlocking.
1044		 */
1045		if (writerecursion > 0) {
1046			bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]);
1047			while (bp) {
1048				if ((bp->b_flags & B_DELWRI) == 0)
1049					break;
1050				bp = TAILQ_NEXT(bp, b_freelist);
1051			}
1052			if (bp == NULL) {
1053				bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]);
1054				while (bp) {
1055					if ((bp->b_flags & B_DELWRI) == 0)
1056						break;
1057					bp = TAILQ_NEXT(bp, b_freelist);
1058				}
1059			}
1060			if (bp == NULL)
1061				panic("getnewbuf: cannot get buffer, infinite recursion failure");
1062		} else {
1063			++writerecursion;
1064			nbyteswritten += vfs_bio_awrite(bp);
1065			--writerecursion;
1066			if (!slpflag && !slptimeo) {
1067				return (0);
1068			}
1069			goto start;
1070		}
1071	}
1072
1073	if (bp->b_flags & B_WANTED) {
1074		bp->b_flags &= ~B_WANTED;
1075		wakeup(bp);
1076	}
1077	bremfree(bp);
1078	bp->b_flags |= B_BUSY;
1079
1080	if (bp->b_flags & B_VMIO) {
1081		bp->b_flags &= ~B_ASYNC;
1082		vfs_vmio_release(bp);
1083	}
1084
1085	if (bp->b_vp)
1086		brelvp(bp);
1087
1088fillbuf:
1089	bp->b_generation++;
1090
1091	/* we are not free, nor do we contain interesting data */
1092	if (bp->b_rcred != NOCRED) {
1093		crfree(bp->b_rcred);
1094		bp->b_rcred = NOCRED;
1095	}
1096	if (bp->b_wcred != NOCRED) {
1097		crfree(bp->b_wcred);
1098		bp->b_wcred = NOCRED;
1099	}
1100
1101	LIST_REMOVE(bp, b_hash);
1102	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
1103	if (bp->b_bufsize) {
1104		allocbuf(bp, 0);
1105	}
1106	bp->b_flags = B_BUSY;
1107	bp->b_dev = NODEV;
1108	bp->b_vp = NULL;
1109	bp->b_blkno = bp->b_lblkno = 0;
1110	bp->b_iodone = 0;
1111	bp->b_error = 0;
1112	bp->b_resid = 0;
1113	bp->b_bcount = 0;
1114	bp->b_npages = 0;
1115	bp->b_dirtyoff = bp->b_dirtyend = 0;
1116	bp->b_validoff = bp->b_validend = 0;
1117	bp->b_usecount = 5;
1118
1119	maxsize = (maxsize + PAGE_MASK) & ~PAGE_MASK;
1120
1121	/*
1122	 * we assume that buffer_map is not at address 0
1123	 */
1124	addr = 0;
1125	if (maxsize != bp->b_kvasize) {
1126		bfreekva(bp);
1127
1128findkvaspace:
1129		/*
1130		 * See if we have buffer kva space
1131		 */
1132		if (vm_map_findspace(buffer_map,
1133			vm_map_min(buffer_map), maxsize, &addr)) {
1134			if (kvafreespace > 0) {
1135				int tfree = 0;
1136				for (bp1 = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
1137					bp1 != NULL; bp1 = TAILQ_NEXT(bp1, b_freelist))
1138					if (bp1->b_kvasize != 0) {
1139						tfree += bp1->b_kvasize;
1140						bremfree(bp1);
1141						bfreekva(bp1);
1142						brelse(bp1);
1143						if (tfree >= maxsize)
1144							goto findkvaspace;
1145					}
1146			}
1147			bp->b_flags |= B_INVAL;
1148			brelse(bp);
1149			goto trytofreespace;
1150		}
1151	}
1152
1153	/*
1154	 * See if we have exceeded our buffer space allocation
1155	 */
1156	if (bufspace >= (maxbufspace + nbyteswritten)) {
1157		bp->b_flags |= B_INVAL;
1158		brelse(bp);
1159		goto trytofreespace;
1160	}
1161
1162	/*
1163	 * create a map entry for the buffer -- in essence
1164	 * reserving the kva space.
1165	 */
1166	if (addr) {
1167		vm_map_insert(buffer_map, NULL, 0,
1168			addr, addr + maxsize,
1169			VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
1170
1171		bp->b_kvabase = (caddr_t) addr;
1172		bp->b_kvasize = maxsize;
1173	}
1174	bp->b_data = bp->b_kvabase;
1175
1176	return (bp);
1177}
1178
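/*
 * Called from getblk() when free buffers run low: push dirty buffers out
 * and sleep on needsbuffer with VFS_BIO_NEED_FREE set until awakened or
 * the timeout expires.
 */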
1179static void
1180waitfreebuffers(int slpflag, int slptimeo) {
1181	while (numfreebuffers < hifreebuffers) {
1182		flushdirtybuffers(slpflag, slptimeo);
1183		if (numfreebuffers < hifreebuffers)
1184			break;
1185		needsbuffer |= VFS_BIO_NEED_FREE;
1186		if (tsleep(&needsbuffer, PRIBIO|slpflag, "biofre", slptimeo))
1187			break;
1188	}
1189}
1190
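/*
 * Write out delayed-write buffers until numdirtybuffers drops to
 * lodirtybuffers.  The static pid guards against concurrent and recursive
 * flushes; other processes wait for the current flusher to finish.
 */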
1191static void
1192flushdirtybuffers(int slpflag, int slptimeo) {
1193	int s;
1194	static pid_t flushing = 0;
1195
1196	s = splbio();
1197
1198	if (flushing) {
1199		if (flushing == curproc->p_pid) {
1200			splx(s);
1201			return;
1202		}
1203		while (flushing) {
1204			if (tsleep(&flushing, PRIBIO|slpflag, "biofls", slptimeo)) {
1205				splx(s);
1206				return;
1207			}
1208		}
1209	}
1210	flushing = curproc->p_pid;
1211
1212	while (numdirtybuffers > lodirtybuffers) {
1213		struct buf *bp;
1214		needsbuffer |= VFS_BIO_NEED_LOWLIMIT;
1215		bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]);
1216		if (bp == NULL)
1217			bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]);
1218
1219		while (bp && ((bp->b_flags & B_DELWRI) == 0)) {
1220			bp = TAILQ_NEXT(bp, b_freelist);
1221		}
1222
1223		if (bp) {
1224			vfs_bio_awrite(bp);
1225			continue;
1226		}
1227		break;
1228	}
1229
1230	flushing = 0;
1231	wakeup(&flushing);
1232	splx(s);
1233}
1234
1235/*
1236 * Check to see if a block is currently memory resident.
1237 */
1238struct buf *
1239incore(struct vnode * vp, daddr_t blkno)
1240{
1241	struct buf *bp;
1242
1243	int s = splbio();
1244	bp = gbincore(vp, blkno);
1245	splx(s);
1246	return (bp);
1247}
1248
1249/*
1250 * Returns true if no I/O is needed to access the
1251 * associated VM object.  This is like incore except
1252 * it also hunts around in the VM system for the data.
1253 */
1254
1255int
1256inmem(struct vnode * vp, daddr_t blkno)
1257{
1258	vm_object_t obj;
1259	vm_offset_t toff, tinc;
1260	vm_page_t m;
1261	vm_ooffset_t off;
1262
1263	if (incore(vp, blkno))
1264		return 1;
1265	if (vp->v_mount == NULL)
1266		return 0;
1267	if ((vp->v_object == NULL) || (vp->v_flag & VOBJBUF) == 0)
1268		return 0;
1269
1270	obj = vp->v_object;
1271	tinc = PAGE_SIZE;
1272	if (tinc > vp->v_mount->mnt_stat.f_iosize)
1273		tinc = vp->v_mount->mnt_stat.f_iosize;
1274	off = blkno * vp->v_mount->mnt_stat.f_iosize;
1275
1276	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
1277
1278		m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
1279		if (!m)
1280			return 0;
1281		if (vm_page_is_valid(m, (vm_offset_t) (toff + off), tinc) == 0)
1282			return 0;
1283	}
1284	return 1;
1285}
1286
1287/*
1288 * now we set the dirty range for the buffer --
1289 * for NFS -- if the file is mapped and pages have
1290 * been written to, let it know.  We want the
1291 * entire range of the buffer to be marked dirty if
1292 * any of the pages have been written to for consistency
1293 * with the b_validoff, b_validend set in the nfs write
1294 * code, and used by the nfs read code.
1295 */
1296static void
1297vfs_setdirty(struct buf *bp) {
1298	int i;
1299	vm_object_t object;
1300	vm_offset_t boffset, offset;
1301	/*
1302	 * We qualify the scan for modified pages on whether the
1303	 * object has been flushed yet.  The OBJ_WRITEABLE flag
1304	 * is not cleared simply by protecting pages off.
1305	 */
1306	if ((bp->b_flags & B_VMIO) &&
1307		((object = bp->b_pages[0]->object)->flags & (OBJ_WRITEABLE|OBJ_CLEANING))) {
1308		/*
1309		 * test the pages to see if they have been modified directly
1310		 * by users through the VM system.
1311		 */
1312		for (i = 0; i < bp->b_npages; i++)
1313			vm_page_test_dirty(bp->b_pages[i]);
1314
1315		/*
1316		 * scan forwards for the first page modified
1317		 */
1318		for (i = 0; i < bp->b_npages; i++) {
1319			if (bp->b_pages[i]->dirty) {
1320				break;
1321			}
1322		}
1323		boffset = (i << PAGE_SHIFT);
1324		if (boffset < bp->b_dirtyoff) {
1325			bp->b_dirtyoff = boffset;
1326		}
1327
1328		/*
1329		 * scan backwards for the last page modified
1330		 */
1331		for (i = bp->b_npages - 1; i >= 0; --i) {
1332			if (bp->b_pages[i]->dirty) {
1333				break;
1334			}
1335		}
1336		boffset = (i + 1);
1337		offset = boffset + bp->b_pages[0]->pindex;
1338		if (offset >= object->size)
1339			boffset = object->size - bp->b_pages[0]->pindex;
1340		if (bp->b_dirtyend < (boffset << PAGE_SHIFT))
1341			bp->b_dirtyend = (boffset << PAGE_SHIFT);
1342	}
1343}
1344
1345/*
1346 * Get a block given a specified block and offset into a file/device.
1347 */
1348struct buf *
1349getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
1350{
1351	struct buf *bp;
1352	int s;
1353	struct bufhashhdr *bh;
1354	int maxsize;
1355	int generation;
1356
1357	if (vp->v_mount) {
1358		maxsize = vp->v_mount->mnt_stat.f_iosize;
1359		/*
1360		 * This happens on mount points.
1361		 */
1362		if (maxsize < size)
1363			maxsize = size;
1364	} else {
1365		maxsize = size;
1366	}
1367
1368#if !defined(MAX_PERF)
1369	if (size > MAXBSIZE)
1370		panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
1371#endif
1372
1373	s = splbio();
1374loop:
1375	if (numfreebuffers < lofreebuffers) {
1376		waitfreebuffers(slpflag, slptimeo);
1377	}
1378
1379	if ((bp = gbincore(vp, blkno))) {
1380loop1:
1381		generation = bp->b_generation;
1382		if (bp->b_flags & B_BUSY) {
1383			bp->b_flags |= B_WANTED;
1384			if (bp->b_usecount < BUF_MAXUSE)
1385				++bp->b_usecount;
1386			if (!tsleep(bp,
1387				(PRIBIO + 1) | slpflag, "getblk", slptimeo)) {
1388				if (bp->b_generation != generation)
1389					goto loop;
1390				goto loop1;
1391			} else {
1392				splx(s);
1393				return (struct buf *) NULL;
1394			}
1395		}
1396		bp->b_flags |= B_BUSY | B_CACHE;
1397		bremfree(bp);
1398
1399		/*
1400		 * check for size inconsistencies (note that they shouldn't
1401		 * happen but do when filesystems don't handle the size changes
1402		 * correctly.) We are conservative on metadata and don't just
1403		 * extend the buffer but write and re-constitute it.
1404		 */
1405
1406		if (bp->b_bcount != size) {
1407			bp->b_generation++;
1408			if ((bp->b_flags & B_VMIO) && (size <= bp->b_kvasize)) {
1409				allocbuf(bp, size);
1410			} else {
1411				bp->b_flags |= B_NOCACHE;
1412				VOP_BWRITE(bp);
1413				goto loop;
1414			}
1415		}
1416
1417		if (bp->b_usecount < BUF_MAXUSE)
1418			++bp->b_usecount;
1419		splx(s);
1420		return (bp);
1421	} else {
1422		vm_object_t obj;
1423
1424		if ((bp = getnewbuf(vp, blkno,
1425			slpflag, slptimeo, size, maxsize)) == 0) {
1426			if (slpflag || slptimeo) {
1427				splx(s);
1428				return NULL;
1429			}
1430			goto loop;
1431		}
1432
1433		/*
1434		 * This code is used to make sure that a buffer is not
1435		 * created while the getnewbuf routine is blocked.
1436		 * Normally the vnode is locked so this isn't a problem.
1437		 * VBLK type I/O requests, however, don't lock the vnode.
1438		 */
1439		if (!VOP_ISLOCKED(vp) && gbincore(vp, blkno)) {
1440			bp->b_flags |= B_INVAL;
1441			brelse(bp);
1442			goto loop;
1443		}
1444
1445		/*
1446		 * Insert the buffer into the hash, so that it can
1447		 * be found by incore.
1448		 */
1449		bp->b_blkno = bp->b_lblkno = blkno;
1450		bgetvp(vp, bp);
1451		LIST_REMOVE(bp, b_hash);
1452		bh = BUFHASH(vp, blkno);
1453		LIST_INSERT_HEAD(bh, bp, b_hash);
1454
1455		if ((obj = vp->v_object) && (vp->v_flag & VOBJBUF)) {
1456			bp->b_flags |= (B_VMIO | B_CACHE);
1457#if defined(VFS_BIO_DEBUG)
1458			if (vp->v_type != VREG && vp->v_type != VBLK)
1459				printf("getblk: vmioing file type %d???\n", vp->v_type);
1460#endif
1461		} else {
1462			bp->b_flags &= ~B_VMIO;
1463		}
1464		splx(s);
1465
1466		allocbuf(bp, size);
1467#ifdef	PC98
1468		/*
1469		 * 1024byte/sector support
1470		 */
1471#define B_XXX2 0x8000000
1472		if (vp->v_flag & 0x10000) bp->b_flags |= B_XXX2;
1473#endif
1474		return (bp);
1475	}
1476}
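/*
 * Note for getblk() callers (descriptive): the buffer returned above is
 * marked B_BUSY and must eventually be released with brelse(), bqrelse()
 * or one of the write interfaces; B_CACHE tells the caller whether the
 * contents are already valid or must be read in (see bread() above).
 */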
1477
1478/*
1479 * Get an empty, disassociated buffer of given size.
1480 */
1481struct buf *
1482geteblk(int size)
1483{
1484	struct buf *bp;
1485	int s;
1486
1487	s = splbio();
1488	while ((bp = getnewbuf(0, (daddr_t) 0, 0, 0, size, MAXBSIZE)) == 0);
1489	splx(s);
1490	allocbuf(bp, size);
1491	bp->b_flags |= B_INVAL;
1492	return (bp);
1493}
1494
1495
1496/*
1497 * This code constitutes the buffer memory from either anonymous system
1498 * memory (in the case of non-VMIO operations) or from an associated
1499 * VM object (in the case of VMIO operations).
1500 *
1501 * Note that this code is tricky, and has many complications to resolve
1502 * deadlock or inconsistent data situations.  Tread lightly!!!
1503 *
1504 * Modify the length of a buffer's underlying buffer storage without
1505 * destroying information (unless, of course the buffer is shrinking).
1506 */
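/*
 * A small example of the non-VMIO path below (illustrative only): the
 * first allocation for a 512 byte buffer is satisfied from malloc, since
 * 512 <= PAGE_SIZE/2, provided buffer malloc space is available, and
 * B_MALLOC is set; if the buffer later grows, the malloc'ed copy is
 * released, page-backed kva is wired in its place and the old contents
 * are bcopy'd over.
 */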
1507int
1508allocbuf(struct buf * bp, int size)
1509{
1510
1511	int s;
1512	int newbsize, mbsize;
1513	int i;
1514
1515#if !defined(MAX_PERF)
1516	if (!(bp->b_flags & B_BUSY))
1517		panic("allocbuf: buffer not busy");
1518
1519	if (bp->b_kvasize < size)
1520		panic("allocbuf: buffer too small");
1521#endif
1522
1523	if ((bp->b_flags & B_VMIO) == 0) {
1524		caddr_t origbuf;
1525		int origbufsize;
1526		/*
1527		 * Just get anonymous memory from the kernel
1528		 */
1529		mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
1530#if !defined(NO_B_MALLOC)
1531		if (bp->b_flags & B_MALLOC)
1532			newbsize = mbsize;
1533		else
1534#endif
1535			newbsize = round_page(size);
1536
1537		if (newbsize < bp->b_bufsize) {
1538#if !defined(NO_B_MALLOC)
1539			/*
1540			 * malloced buffers are not shrunk
1541			 */
1542			if (bp->b_flags & B_MALLOC) {
1543				if (newbsize) {
1544					bp->b_bcount = size;
1545				} else {
1546					free(bp->b_data, M_BIOBUF);
1547					bufspace -= bp->b_bufsize;
1548					bufmallocspace -= bp->b_bufsize;
1549					bp->b_data = bp->b_kvabase;
1550					bp->b_bufsize = 0;
1551					bp->b_bcount = 0;
1552					bp->b_flags &= ~B_MALLOC;
1553				}
1554				return 1;
1555			}
1556#endif
1557			vm_hold_free_pages(
1558			    bp,
1559			    (vm_offset_t) bp->b_data + newbsize,
1560			    (vm_offset_t) bp->b_data + bp->b_bufsize);
1561		} else if (newbsize > bp->b_bufsize) {
1562#if !defined(NO_B_MALLOC)
1563			/*
1564			 * We only use malloced memory on the first allocation.
1565			 * and revert to page-allocated memory when the buffer grows.
1566			 */
1567			if ( (bufmallocspace < maxbufmallocspace) &&
1568				(bp->b_bufsize == 0) &&
1569				(mbsize <= PAGE_SIZE/2)) {
1570
1571				bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
1572				bp->b_bufsize = mbsize;
1573				bp->b_bcount = size;
1574				bp->b_flags |= B_MALLOC;
1575				bufspace += mbsize;
1576				bufmallocspace += mbsize;
1577				return 1;
1578			}
1579#endif
1580			origbuf = NULL;
1581			origbufsize = 0;
1582#if !defined(NO_B_MALLOC)
1583			/*
1584			 * If the buffer is growing on its other-than-first allocation,
1585			 * then we revert to the page-allocation scheme.
1586			 */
1587			if (bp->b_flags & B_MALLOC) {
1588				origbuf = bp->b_data;
1589				origbufsize = bp->b_bufsize;
1590				bp->b_data = bp->b_kvabase;
1591				bufspace -= bp->b_bufsize;
1592				bufmallocspace -= bp->b_bufsize;
1593				bp->b_bufsize = 0;
1594				bp->b_flags &= ~B_MALLOC;
1595				newbsize = round_page(newbsize);
1596			}
1597#endif
1598			vm_hold_load_pages(
1599			    bp,
1600			    (vm_offset_t) bp->b_data + bp->b_bufsize,
1601			    (vm_offset_t) bp->b_data + newbsize);
1602#if !defined(NO_B_MALLOC)
1603			if (origbuf) {
1604				bcopy(origbuf, bp->b_data, origbufsize);
1605				free(origbuf, M_BIOBUF);
1606			}
1607#endif
1608		}
1609	} else {
1610		vm_page_t m;
1611		int desiredpages;
1612
1613		newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
1614		desiredpages = (round_page(newbsize) >> PAGE_SHIFT);
1615
1616#if !defined(NO_B_MALLOC)
1617		if (bp->b_flags & B_MALLOC)
1618			panic("allocbuf: VMIO buffer can't be malloced");
1619#endif
1620
1621		if (newbsize < bp->b_bufsize) {
1622			if (desiredpages < bp->b_npages) {
1623				for (i = desiredpages; i < bp->b_npages; i++) {
1624					/*
1625					 * the page is not freed here -- it
1626					 * is the responsibility of vnode_pager_setsize
1627					 */
1628					m = bp->b_pages[i];
1629#if defined(DIAGNOSTIC)
1630					if (m == bogus_page)
1631						panic("allocbuf: bogus page found");
1632#endif
1633					s = splvm();
1634					while ((m->flags & PG_BUSY) || (m->busy != 0)) {
1635						m->flags |= PG_WANTED;
1636						tsleep(m, PVM, "biodep", 0);
1637					}
1638					splx(s);
1639
1640					bp->b_pages[i] = NULL;
1641					vm_page_unwire(m);
1642				}
1643				pmap_qremove((vm_offset_t) trunc_page(bp->b_data) +
1644				    (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
1645				bp->b_npages = desiredpages;
1646			}
1647		} else if (newbsize > bp->b_bufsize) {
1648			vm_object_t obj;
1649			vm_offset_t tinc, toff;
1650			vm_ooffset_t off;
1651			vm_pindex_t objoff;
1652			int pageindex, curbpnpages;
1653			struct vnode *vp;
1654			int bsize;
1655
1656			vp = bp->b_vp;
1657
1658			if (vp->v_type == VBLK)
1659				bsize = DEV_BSIZE;
1660			else
1661				bsize = vp->v_mount->mnt_stat.f_iosize;
1662
1663			if (bp->b_npages < desiredpages) {
1664				obj = vp->v_object;
1665				tinc = PAGE_SIZE;
1666				if (tinc > bsize)
1667					tinc = bsize;
1668				off = (vm_ooffset_t) bp->b_lblkno * bsize;
1669				curbpnpages = bp->b_npages;
1670		doretry:
1671				bp->b_flags |= B_CACHE;
1672				bp->b_validoff = bp->b_validend = 0;
1673				for (toff = 0; toff < newbsize; toff += tinc) {
1674					int bytesinpage;
1675
1676					pageindex = toff >> PAGE_SHIFT;
1677					objoff = OFF_TO_IDX(off + toff);
1678					if (pageindex < curbpnpages) {
1679
1680						m = bp->b_pages[pageindex];
1681#ifdef VFS_BIO_DIAG
1682						if (m->pindex != objoff)
1683							panic("allocbuf: page changed offset??!!!?");
1684#endif
1685						bytesinpage = tinc;
1686						if (tinc > (newbsize - toff))
1687							bytesinpage = newbsize - toff;
1688						if (bp->b_flags & B_CACHE)
1689							vfs_buf_set_valid(bp, off, toff, bytesinpage, m);
1690						continue;
1691					}
1692					m = vm_page_lookup(obj, objoff);
1693					if (!m) {
1694						m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
1695						if (!m) {
1696							VM_WAIT;
1697							vm_pageout_deficit += (desiredpages - bp->b_npages);
1698							goto doretry;
1699						}
1700						/*
1701						 * Normally it is unwise to clear PG_BUSY without
1702						 * PAGE_WAKEUP -- but it is okay here, as there is
1703						 * no chance for blocking between here and vm_page_alloc
1704						 */
1705						m->flags &= ~PG_BUSY;
1706						vm_page_wire(m);
1707						bp->b_flags &= ~B_CACHE;
1708					} else if (m->flags & PG_BUSY) {
1709						s = splvm();
1710						if (m->flags & PG_BUSY) {
1711							m->flags |= PG_WANTED;
1712							tsleep(m, PVM, "pgtblk", 0);
1713						}
1714						splx(s);
1715						goto doretry;
1716					} else {
1717						if ((curproc != pageproc) &&
1718							((m->queue - m->pc) == PQ_CACHE) &&
1719						    ((cnt.v_free_count + cnt.v_cache_count) <
1720								(cnt.v_free_min + cnt.v_cache_min))) {
1721							pagedaemon_wakeup();
1722						}
1723						bytesinpage = tinc;
1724						if (tinc > (newbsize - toff))
1725							bytesinpage = newbsize - toff;
1726						if (bp->b_flags & B_CACHE)
1727							vfs_buf_set_valid(bp, off, toff, bytesinpage, m);
1728						vm_page_wire(m);
1729					}
1730					bp->b_pages[pageindex] = m;
1731					curbpnpages = pageindex + 1;
1732				}
1733				if (vp->v_tag == VT_NFS &&
1734				    vp->v_type != VBLK) {
1735					if (bp->b_dirtyend > 0) {
1736						bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
1737						bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
1738					}
1739					if (bp->b_validend == 0)
1740						bp->b_flags &= ~B_CACHE;
1741				}
1742				bp->b_data = (caddr_t) trunc_page(bp->b_data);
1743				bp->b_npages = curbpnpages;
1744				pmap_qenter((vm_offset_t) bp->b_data,
1745					bp->b_pages, bp->b_npages);
1746				((vm_offset_t) bp->b_data) |= off & PAGE_MASK;
1747			}
1748		}
1749	}
1750	if (bp->b_flags & B_VMIO)
1751		vmiospace += (newbsize - bp->b_bufsize);
1752	bufspace += (newbsize - bp->b_bufsize);
1753	bp->b_bufsize = newbsize;
1754	bp->b_bcount = size;
1755	return 1;
1756}
1757
1758/*
1759 * Wait for buffer I/O completion, returning error status.
1760 */
1761int
1762biowait(register struct buf * bp)
1763{
1764	int s;
1765
1766	s = splbio();
1767	while ((bp->b_flags & B_DONE) == 0)
1768#if defined(NO_SCHEDULE_MODS)
1769		tsleep(bp, PRIBIO, "biowait", 0);
1770#else
1771		if (bp->b_flags & B_READ)
1772			tsleep(bp, PRIBIO, "biord", 0);
1773		else
1774			tsleep(bp, curproc->p_usrpri, "biowr", 0);
1775#endif
1776	splx(s);
1777	if (bp->b_flags & B_EINTR) {
1778		bp->b_flags &= ~B_EINTR;
1779		return (EINTR);
1780	}
1781	if (bp->b_flags & B_ERROR) {
1782		return (bp->b_error ? bp->b_error : EIO);
1783	} else {
1784		return (0);
1785	}
1786}
1787
1788/*
1789 * Finish I/O on a buffer, calling an optional function.
1790 * This is usually called from interrupt level, so process blocking
1791 * is not *a good idea*.
1792 */
1793void
1794biodone(register struct buf * bp)
1795{
1796	int s;
1797
1798	s = splbio();
1799
1800#if !defined(MAX_PERF)
1801	if (!(bp->b_flags & B_BUSY))
1802		panic("biodone: buffer not busy");
1803#endif
1804
1805	if (bp->b_flags & B_DONE) {
1806		splx(s);
1807#if !defined(MAX_PERF)
1808		printf("biodone: buffer already done\n");
1809#endif
1810		return;
1811	}
1812	bp->b_flags |= B_DONE;
1813
1814	if ((bp->b_flags & B_READ) == 0) {
1815		vwakeup(bp);
1816	}
1817#ifdef BOUNCE_BUFFERS
1818	if (bp->b_flags & B_BOUNCE)
1819		vm_bounce_free(bp);
1820#endif
1821
1822	/* call optional completion function if requested */
1823	if (bp->b_flags & B_CALL) {
1824		bp->b_flags &= ~B_CALL;
1825		(*bp->b_iodone) (bp);
1826		splx(s);
1827		return;
1828	}
1829	if (bp->b_flags & B_VMIO) {
1830		int i, resid;
1831		vm_ooffset_t foff;
1832		vm_page_t m;
1833		vm_object_t obj;
1834		int iosize;
1835		struct vnode *vp = bp->b_vp;
1836
1837		obj = vp->v_object;
1838
1839#if defined(VFS_BIO_DEBUG)
1840		if (vp->v_usecount == 0) {
1841			panic("biodone: zero vnode ref count");
1842		}
1843
1844		if (vp->v_object == NULL) {
1845			panic("biodone: missing VM object");
1846		}
1847
1848		if ((vp->v_flag & VOBJBUF) == 0) {
1849			panic("biodone: vnode is not setup for merged cache");
1850		}
1851#endif
1852
1853		if (vp->v_type == VBLK)
1854			foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
1855		else
1856			foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1857#if !defined(MAX_PERF)
1858		if (!obj) {
1859			panic("biodone: no object");
1860		}
1861#endif
1862#if defined(VFS_BIO_DEBUG)
1863		if (obj->paging_in_progress < bp->b_npages) {
1864			printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
1865			    obj->paging_in_progress, bp->b_npages);
1866		}
1867#endif
1868		iosize = bp->b_bufsize;
1869		for (i = 0; i < bp->b_npages; i++) {
1870			int bogusflag = 0;
1871			m = bp->b_pages[i];
1872			if (m == bogus_page) {
1873				bogusflag = 1;
1874				m = vm_page_lookup(obj, OFF_TO_IDX(foff));
1875				if (!m) {
1876#if defined(VFS_BIO_DEBUG)
1877					printf("biodone: page disappeared\n");
1878#endif
1879					--obj->paging_in_progress;
1880					continue;
1881				}
1882				bp->b_pages[i] = m;
1883				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1884			}
1885#if defined(VFS_BIO_DEBUG)
1886			if (OFF_TO_IDX(foff) != m->pindex) {
1887				printf("biodone: foff(%d)/m->pindex(%d) mismatch\n", foff, m->pindex);
1888			}
1889#endif
1890			resid = IDX_TO_OFF(m->pindex + 1) - foff;
1891			if (resid > iosize)
1892				resid = iosize;
1893			/*
1894			 * In the write case, the valid and clean bits are
1895			 * already changed correctly, so we only need to do this
1896			 * here in the read case.
1897			 */
1898			if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
1899				vfs_page_set_valid(bp, foff, i, m);
1900			}
1901
1902			/*
1903			 * when debugging new filesystems or buffer I/O methods, this
1904			 * is the most common error that pops up.  if you see this, you
1905			 * have not set the page busy flag correctly!!!
1906			 */
1907			if (m->busy == 0) {
1908#if !defined(MAX_PERF)
1909				printf("biodone: page busy < 0, "
1910				    "pindex: %d, foff: 0x(%x,%x), "
1911				    "resid: %d, index: %d\n",
1912				    (int) m->pindex, (int)(foff >> 32),
1913						(int) foff & 0xffffffff, resid, i);
1914#endif
1915				if (vp->v_type != VBLK)
1916#if !defined(MAX_PERF)
1917					printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n",
1918					    bp->b_vp->v_mount->mnt_stat.f_iosize,
1919					    (int) bp->b_lblkno,
1920					    bp->b_flags, bp->b_npages);
1921				else
1922					printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n",
1923					    (int) bp->b_lblkno,
1924					    bp->b_flags, bp->b_npages);
1925				printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n",
1926				    m->valid, m->dirty, m->wire_count);
1927#endif
1928				panic("biodone: page busy < 0\n");
1929			}
1930			--m->busy;
1931			if ((m->busy == 0) && (m->flags & PG_WANTED)) {
1932				m->flags &= ~PG_WANTED;
1933				wakeup(m);
1934			}
1935			--obj->paging_in_progress;
1936			foff += resid;
1937			iosize -= resid;
1938		}
1939		if (obj && obj->paging_in_progress == 0 &&
1940		    (obj->flags & OBJ_PIPWNT)) {
1941			obj->flags &= ~OBJ_PIPWNT;
1942			wakeup(obj);
1943		}
1944	}
1945	/*
1946	 * For asynchronous completions, release the buffer now. The brelse
1947	 * checks for B_WANTED and will do the wakeup there if necessary - so
1948	 * no need to do a wakeup here in the async case.
1949	 */
1950
1951	if (bp->b_flags & B_ASYNC) {
1952		if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0)
1953			brelse(bp);
1954		else
1955			bqrelse(bp);
1956	} else {
1957		bp->b_flags &= ~B_WANTED;
1958		wakeup(bp);
1959	}
1960	splx(s);
1961}
1962
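/*
 * Return the number of buffers currently on the locked queue.
 */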
1963int
1964count_lock_queue()
1965{
1966	int count;
1967	struct buf *bp;
1968
1969	count = 0;
1970	for (bp = TAILQ_FIRST(&bufqueues[QUEUE_LOCKED]);
1971	    bp != NULL;
1972	    bp = TAILQ_NEXT(bp, b_freelist))
1973		count++;
1974	return (count);
1975}
1976
1977int vfs_update_interval = 30;
1978
1979static void
1980vfs_update()
1981{
1982	while (1) {
1983		tsleep(&vfs_update_wakeup, PUSER, "update",
1984		    hz * vfs_update_interval);
1985		vfs_update_wakeup = 0;
1986		sync(curproc, NULL);
1987	}
1988}
1989
1990static int
1991sysctl_kern_updateinterval SYSCTL_HANDLER_ARGS
1992{
1993	int error = sysctl_handle_int(oidp,
1994		oidp->oid_arg1, oidp->oid_arg2, req);
1995	if (!error)
1996		wakeup(&vfs_update_wakeup);
1997	return error;
1998}
1999
2000SYSCTL_PROC(_kern, KERN_UPDATEINTERVAL, update, CTLTYPE_INT|CTLFLAG_RW,
2001	&vfs_update_interval, 0, sysctl_kern_updateinterval, "I", "");
2002
2003
2004/*
2005 * This routine is called in lieu of iodone in the case of
2006 * incomplete I/O.  This keeps the busy status for pages
2007 * consistant.
2008 */
2009void
2010vfs_unbusy_pages(struct buf * bp)
2011{
2012	int i;
2013
2014	if (bp->b_flags & B_VMIO) {
2015		struct vnode *vp = bp->b_vp;
2016		vm_object_t obj = vp->v_object;
2017		vm_ooffset_t foff;
2018
2019		foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
2020
2021		for (i = 0; i < bp->b_npages; i++) {
2022			vm_page_t m = bp->b_pages[i];
2023
2024			if (m == bogus_page) {
2025				m = vm_page_lookup(obj, OFF_TO_IDX(foff) + i);
2026#if !defined(MAX_PERF)
2027				if (!m) {
2028					panic("vfs_unbusy_pages: page missing\n");
2029				}
2030#endif
2031				bp->b_pages[i] = m;
2032				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
2033			}
2034			--obj->paging_in_progress;
2035			--m->busy;
2036			if ((m->busy == 0) && (m->flags & PG_WANTED)) {
2037				m->flags &= ~PG_WANTED;
2038				wakeup(m);
2039			}
2040		}
2041		if (obj->paging_in_progress == 0 &&
2042		    (obj->flags & OBJ_PIPWNT)) {
2043			obj->flags &= ~OBJ_PIPWNT;
2044			wakeup(obj);
2045		}
2046	}
2047}
2048
2049/*
2050 * Set NFS' b_validoff and b_validend fields from the valid bits
2051 * of a page.  If the consumer is not NFS, and the page is not
2052 * valid for the entire range, clear the B_CACHE flag to force
2053 * the consumer to re-read the page.
2054 */
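/*
 * Example of the valid-bit scan below (illustrative, assuming DEV_BSIZE is
 * 512): a page with m->valid == 0x0c and off == 0 yields svalid = 1024 and
 * evalid = 2048, i.e. only the first contiguous run of valid DEV_BSIZE
 * chunks, [1024, 2048), is considered when extending b_validoff/b_validend.
 */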
2055static void
2056vfs_buf_set_valid(struct buf *bp,
2057		  vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
2058		  vm_page_t m)
2059{
2060	if (bp->b_vp->v_tag == VT_NFS && bp->b_vp->v_type != VBLK) {
2061		vm_offset_t svalid, evalid;
2062		int validbits = m->valid;
2063
2064		/*
2065		 * This only bothers with the first valid range in the
2066		 * page.
2067		 */
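		/*
		 * For example, with DEV_BSIZE of 512 and m->valid equal to
		 * 0x3c (blocks 2-5 valid), the scan below yields
		 * svalid = off + 1024 and evalid = off + 3072.
		 */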
2068		svalid = off;
2069		while (validbits && !(validbits & 1)) {
2070			svalid += DEV_BSIZE;
2071			validbits >>= 1;
2072		}
2073		evalid = svalid;
2074		while (validbits & 1) {
2075			evalid += DEV_BSIZE;
2076			validbits >>= 1;
2077		}
2078		/*
2079		 * Make sure this range is contiguous with the range
2080		 * built up from previous pages.  If not, then we will
2081		 * just use the range from the previous pages.
2082		 */
2083		if (svalid == bp->b_validend) {
2084			bp->b_validoff = min(bp->b_validoff, svalid);
2085			bp->b_validend = max(bp->b_validend, evalid);
2086		}
2087	} else if (!vm_page_is_valid(m,
2088				     (vm_offset_t) ((foff + off) & PAGE_MASK),
2089				     size)) {
2090		bp->b_flags &= ~B_CACHE;
2091	}
2092}
2093
2094/*
2095 * Set the valid bits in a page, taking care of the b_validoff,
2096 * b_validend fields which NFS uses to optimise small reads.  Off is
2097 * the offset within the file and pageno is the page index within the buf.
2098 */
2099static void
2100vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
2101{
2102	struct vnode *vp = bp->b_vp;
2103	vm_ooffset_t soff, eoff;
2104
2105	soff = off;
2106	eoff = off + min(PAGE_SIZE, bp->b_bufsize);
2107	vm_page_set_invalid(m,
2108			    (vm_offset_t) (soff & PAGE_MASK),
2109			    (vm_offset_t) (eoff - soff));
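	/*
	 * For NFS buffers, clip the range to the buffer's valid region,
	 * rounding b_validoff up and b_validend down to DEV_BSIZE
	 * boundaries so that only fully valid blocks are marked clean.
	 */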
2110	if (vp->v_tag == VT_NFS && vp->v_type != VBLK) {
2111		vm_ooffset_t sv, ev;
2112		off = off - pageno * PAGE_SIZE;
2113		sv = off + ((bp->b_validoff + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1));
2114		ev = off + (bp->b_validend & ~(DEV_BSIZE - 1));
2115		soff = max(sv, soff);
2116		eoff = min(ev, eoff);
2117	}
2118	if (eoff > soff)
2119		vm_page_set_validclean(m,
2120				       (vm_offset_t) (soff & PAGE_MASK),
2121				       (vm_offset_t) (eoff - soff));
2122}
2123
2124/*
2125 * This routine is called before a device strategy routine.
2126 * It is used to tell the VM system that paging I/O is in
2127 * progress, and treat the pages associated with the buffer
2128 * almost as being PG_BUSY.  The object's paging_in_progress count
2129 * is also maintained to make sure that the object does not become
2130 * inconsistent.
2131 */
2132void
2133vfs_busy_pages(struct buf * bp, int clear_modify)
2134{
2135	int i;
2136
2137	if (bp->b_flags & B_VMIO) {
2138		struct vnode *vp = bp->b_vp;
2139		vm_object_t obj = vp->v_object;
2140		vm_ooffset_t foff;
2141
2142		if (vp->v_type == VBLK)
2143			foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
2144		else
2145			foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
2146		vfs_setdirty(bp);
2147		for (i = 0; i < bp->b_npages; i++, foff += PAGE_SIZE) {
2148			vm_page_t m = bp->b_pages[i];
2149
2150			if ((bp->b_flags & B_CLUSTER) == 0) {
2151				obj->paging_in_progress++;
2152				m->busy++;
2153			}
2154			vm_page_protect(m, VM_PROT_NONE);
2155			if (clear_modify)
2156				vfs_page_set_valid(bp, foff, i, m);
2157			else if (bp->b_bcount >= PAGE_SIZE) {
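				/*
				 * The page holds some valid data but the
				 * buffer is not marked B_CACHE; substitute
				 * bogus_page so the device transfer cannot
				 * overwrite the valid data.  The real page is
				 * reinstated when the I/O finishes (see
				 * vfs_unbusy_pages()).
				 */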
2158				if (m->valid && (bp->b_flags & B_CACHE) == 0) {
2159					bp->b_pages[i] = bogus_page;
2160					pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
2161				}
2162			}
2163		}
2164	}
2165}
2166
2167/*
2168 * Tell the VM system that the pages associated with this buffer
2169 * are clean.  This is used for delayed writes where the data is
2170 * going to go to disk eventually without additional VM intervention.
2171 */
2172void
2173vfs_clean_pages(struct buf * bp)
2174{
2175	int i;
2176
2177	if (bp->b_flags & B_VMIO) {
2178		struct vnode *vp = bp->b_vp;
2179		vm_ooffset_t foff;
2180
2181		if (vp->v_type == VBLK)
2182			foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
2183		else
2184			foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
2185		for (i = 0; i < bp->b_npages; i++, foff += PAGE_SIZE) {
2186			vm_page_t m = bp->b_pages[i];
2187
2188			vfs_page_set_valid(bp, foff, i, m);
2189		}
2190	}
2191}
2192
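/*
 * Clear a buffer's data area, using the VM page valid bits to avoid
 * zeroing data that is already valid where possible; non-VMIO buffers
 * are simply cleared with clrbuf().  For a small single-page buffer the
 * expected valid mask is computed directly, e.g. 0x0f for a 2048-byte
 * buffer with DEV_BSIZE of 512.
 */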
2193void
2194vfs_bio_clrbuf(struct buf *bp) {
2195	int i;
2196	if (bp->b_flags & B_VMIO) {
2197		if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE)) {
2198			int mask;
2199			mask = 0;
2200			for (i = 0; i < bp->b_bufsize; i += DEV_BSIZE)
2201				mask |= (1 << (i / DEV_BSIZE));
2202			if (bp->b_pages[0]->valid != mask) {
2203				bzero(bp->b_data, bp->b_bufsize);
2204			}
2205			bp->b_pages[0]->valid = mask;
2206			bp->b_resid = 0;
2207			return;
2208		}
2209		for (i = 0; i < bp->b_npages; i++) {
2210			if (bp->b_pages[i]->valid == VM_PAGE_BITS_ALL)
2211				continue;
2212			if (bp->b_pages[i]->valid == 0) {
2213				if ((bp->b_pages[i]->flags & PG_ZERO) == 0) {
2214					bzero(bp->b_data + (i << PAGE_SHIFT), PAGE_SIZE);
2215				}
2216			} else {
2217				int j;
2218				for (j = 0; j < PAGE_SIZE / DEV_BSIZE; j++) {
2219					if ((bp->b_pages[i]->valid & (1 << j)) == 0)
2220						bzero(bp->b_data + (i << PAGE_SHIFT) + j * DEV_BSIZE, DEV_BSIZE);
2221				}
2222			}
2223			/* bp->b_pages[i]->valid = VM_PAGE_BITS_ALL; */
2224		}
2225		bp->b_resid = 0;
2226	} else {
2227		clrbuf(bp);
2228	}
2229}
2230
2231/*
2232 * vm_hold_load_pages and vm_hold_free_pages get pages into and out of
2233 * a buffer's address space.  The pages are anonymous and are
2234 * not associated with a file object.
2235 */
2236void
2237vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
2238{
2239	vm_offset_t pg;
2240	vm_page_t p;
2241	int index;
2242
2243	to = round_page(to);
2244	from = round_page(from);
2245	index = (from - trunc_page(bp->b_data)) >> PAGE_SHIFT;
2246
2247	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
2248
2249tryagain:
2250
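		/*
		 * Allocate a page backing this kva address; if none is
		 * available, record the shortfall in vm_pageout_deficit and
		 * wait for the pageout daemon to free some memory, then
		 * retry.
		 */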
2251		p = vm_page_alloc(kernel_object,
2252			((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
2253		    VM_ALLOC_NORMAL);
2254		if (!p) {
2255			vm_pageout_deficit += (to - from) >> PAGE_SHIFT;
2256			VM_WAIT;
2257			goto tryagain;
2258		}
2259		vm_page_wire(p);
2260		pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
2261		bp->b_pages[index] = p;
2262		PAGE_WAKEUP(p);
2263	}
2264	bp->b_npages = index;
2265}
2266
2267void
2268vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
2269{
2270	vm_offset_t pg;
2271	vm_page_t p;
2272	int index, newnpages;
2273
2274	from = round_page(from);
2275	to = round_page(to);
2276	newnpages = index = (from - trunc_page(bp->b_data)) >> PAGE_SHIFT;
2277
2278	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
2279		p = bp->b_pages[index];
2280		if (p && (index < bp->b_npages)) {
2281#if !defined(MAX_PERF)
2282			if (p->busy) {
2283				printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n",
2284					bp->b_blkno, bp->b_lblkno);
2285			}
2286#endif
2287			bp->b_pages[index] = NULL;
2288			pmap_kremove(pg);
2289			vm_page_unwire(p);
2290			vm_page_free(p);
2291		}
2292	}
2293	bp->b_npages = newnpages;
2294}
2295
2296
2297#include "opt_ddb.h"
2298#ifdef DDB
2299#include <ddb/ddb.h>
2300
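/*
 * ddb "show buffer <addr>" command: print the interesting fields of a
 * struct buf, including its VM pages.
 */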
2301DB_SHOW_COMMAND(buffer, db_show_buffer)
2302{
2303	/* get args */
2304	struct buf *bp = (struct buf *)addr;
2305
2306	if (!have_addr) {
2307		db_printf("usage: show buffer <addr>\n");
2308		return;
2309	}
2310
2311	db_printf("b_proc = %p,\nb_flags = 0x%b\n", (void *)bp->b_proc,
2312		  bp->b_flags, "\20\40bounce\37cluster\36vmio\35ram\34ordered"
2313		  "\33paging\32xxx\31writeinprog\30wanted\27relbuf\26tape"
2314		  "\25read\24raw\23phys\22clusterok\21malloc\20nocache"
2315		  "\17locked\16inval\15gathered\14error\13eintr\12done\11dirty"
2316		  "\10delwri\7call\6cache\5busy\4bad\3async\2needcommit\1age");
2317	db_printf("b_error = %d, b_bufsize = %ld, b_bcount = %ld, "
2318		  "b_resid = %ld\nb_dev = 0x%x, b_data = %p, "
2319		  "b_blkno = %d, b_pblkno = %d\n",
2320		  bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
2321		  bp->b_dev, bp->b_data, bp->b_blkno, bp->b_pblkno);
2322	if (bp->b_npages) {
2323		int i;
2324		db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
2325		for (i = 0; i < bp->b_npages; i++) {
2326			vm_page_t m;
2327			m = bp->b_pages[i];
2328			db_printf("(0x%x, 0x%x, 0x%x)", m->object, m->pindex,
2329				VM_PAGE_TO_PHYS(m));
2330			if ((i + 1) < bp->b_npages)
2331				db_printf(",");
2332		}
2333		db_printf("\n");
2334	}
2335}
2336#endif /* DDB */
2337