vfs_bio.c revision 5466
1/*
2 * Copyright (c) 1994 John S. Dyson
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice immediately at the beginning of the file, without modification,
10 *    this list of conditions, and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. Absolutely no warranty of function or purpose is made by the author
15 *    John S. Dyson.
16 * 4. This work was done expressly for inclusion into FreeBSD.  Other use
17 *    is allowed if this notation is included.
18 * 5. Modifications may be freely made to this file if the above conditions
19 *    are met.
20 *
21 * $Id: vfs_bio.c,v 1.18 1995/01/10 07:32:35 davidg Exp $
22 */
23
24/*
25 * this file contains a new buffer I/O scheme implementing a coherent
26 * VM object and buffer cache scheme.  Pains have been taken to make
27 * sure that the performance degradation associated with schemes such
28 * as this is not realized.
29 *
30 * Author:  John S. Dyson
31 * Significant help during the development and debugging phases
32 * had been provided by David Greenman, also of the FreeBSD core team.
33 */
34
35#define VMIO
36#include <sys/param.h>
37#include <sys/systm.h>
38#include <sys/kernel.h>
39#include <sys/proc.h>
40#include <sys/vnode.h>
41#include <vm/vm.h>
42#include <vm/vm_pageout.h>
43#include <vm/vm_page.h>
44#include <vm/vm_object.h>
45#include <sys/buf.h>
46#include <sys/mount.h>
47#include <sys/malloc.h>
48#include <sys/resourcevar.h>
49#include <sys/proc.h>
50
51#include <miscfs/specfs/specdev.h>
52
53struct buf *buf;		/* buffer header pool */
54int nbuf;			/* number of buffer headers calculated
55				 * elsewhere */
56struct swqueue bswlist;
57int nvmio, nlru;
58
59extern vm_map_t buffer_map, io_map, kernel_map, pager_map;
60
61void vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to);
62void vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to);
63void vfs_dirty_pages(struct buf * bp);
64void vfs_busy_pages(struct buf *, int clear_modify);
65
66int needsbuffer;
67
68/*
69 * Internal update daemon, process 3
70 *	The variable vfs_update_wakeup allows for internal syncs.
71 */
72int vfs_update_wakeup;
73
74
75/*
76 * buffers base kva
77 */
78caddr_t buffers_kva;
79
80/*
81 * bogus page -- for I/O to/from partially complete buffers
82 */
83vm_page_t bogus_page;
84vm_offset_t bogus_offset;
85
86/*
87 * Initialize buffer headers and related structures.
88 */
89void
90bufinit()
91{
92	struct buf *bp;
93	int i;
94
95	TAILQ_INIT(&bswlist);
96	LIST_INIT(&invalhash);
97
98	/* first, make a null hash table */
99	for (i = 0; i < BUFHSZ; i++)
100		LIST_INIT(&bufhashtbl[i]);
101
102	/* next, make a null set of free lists */
103	for (i = 0; i < BUFFER_QUEUES; i++)
104		TAILQ_INIT(&bufqueues[i]);
105
106	buffers_kva = (caddr_t) kmem_alloc_pageable(buffer_map, MAXBSIZE * nbuf);
107	/* finally, initialize each buffer header and stick on empty q */
108	for (i = 0; i < nbuf; i++) {
109		bp = &buf[i];
110		bzero(bp, sizeof *bp);
111		bp->b_flags = B_INVAL;	/* we're just an empty header */
112		bp->b_dev = NODEV;
113		bp->b_vp = NULL;
114		bp->b_rcred = NOCRED;
115		bp->b_wcred = NOCRED;
116		bp->b_qindex = QUEUE_EMPTY;
117		bp->b_vnbufs.le_next = NOLIST;
118		bp->b_data = buffers_kva + i * MAXBSIZE;
119		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
120		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
121	}
122
123	bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
124	bogus_page = vm_page_alloc(kernel_object, bogus_offset - VM_MIN_KERNEL_ADDRESS, 0);
125
126}
127
128/*
129 * remove the buffer from the appropriate free list
130 */
131void
132bremfree(struct buf * bp)
133{
134	int s = splbio();
135
136	if (bp->b_qindex != QUEUE_NONE) {
137		if (bp->b_qindex == QUEUE_LRU)
138			--nlru;
139		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
140		bp->b_qindex = QUEUE_NONE;
141	} else {
142		panic("bremfree: removing a buffer when not on a queue");
143	}
144	splx(s);
145}
146
147/*
148 * Get a buffer with the specified data.  Look in the cache first.
149 */
150int
151bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
152    struct buf ** bpp)
153{
154	struct buf *bp;
155
156	bp = getblk(vp, blkno, size, 0, 0);
157	*bpp = bp;
158
159	/* if not found in cache, do some I/O */
160	if ((bp->b_flags & B_CACHE) == 0) {
161		if (curproc && curproc->p_stats)	/* count block I/O */
162			curproc->p_stats->p_ru.ru_inblock++;
163		bp->b_flags |= B_READ;
164		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
165		if (bp->b_rcred == NOCRED) {
166			if (cred != NOCRED)
167				crhold(cred);
168			bp->b_rcred = cred;
169		}
170		vfs_busy_pages(bp, 0);
171		VOP_STRATEGY(bp);
172		return (biowait(bp));
173	} else if (bp->b_lblkno == bp->b_blkno) {
174		VOP_BMAP(vp, bp->b_lblkno, (struct vnode **) 0,
175		    &bp->b_blkno, (int *) 0);
176	}
177	return (0);
178}
179
180/*
181 * Operates like bread, but also starts asynchronous I/O on
182 * read-ahead blocks.
183 */
184int
185breadn(struct vnode * vp, daddr_t blkno, int size,
186    daddr_t * rablkno, int *rabsize,
187    int cnt, struct ucred * cred, struct buf ** bpp)
188{
189	struct buf *bp, *rabp;
190	int i;
191	int rv = 0, readwait = 0;
192
193	*bpp = bp = getblk(vp, blkno, size, 0, 0);
194
195	/* if not found in cache, do some I/O */
196	if ((bp->b_flags & B_CACHE) == 0) {
197		if (curproc && curproc->p_stats)	/* count block I/O */
198			curproc->p_stats->p_ru.ru_inblock++;
199		bp->b_flags |= B_READ;
200		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
201		if (bp->b_rcred == NOCRED) {
202			if (cred != NOCRED)
203				crhold(cred);
204			bp->b_rcred = cred;
205		}
206		vfs_busy_pages(bp, 0);
207		VOP_STRATEGY(bp);
208		++readwait;
209	} else if (bp->b_lblkno == bp->b_blkno) {
210		VOP_BMAP(vp, bp->b_lblkno, (struct vnode **) 0,
211		    &bp->b_blkno, (int *) 0);
212	}
213	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
214		if (inmem(vp, *rablkno))
215			continue;
216		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
217
218		if ((rabp->b_flags & B_CACHE) == 0) {
219			if (curproc && curproc->p_stats)
220				curproc->p_stats->p_ru.ru_inblock++;
221			rabp->b_flags |= B_READ | B_ASYNC;
222			rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
223			if (rabp->b_rcred == NOCRED) {
224				if (cred != NOCRED)
225					crhold(cred);
226				rabp->b_rcred = cred;
227			}
228			vfs_busy_pages(rabp, 0);
229			VOP_STRATEGY(rabp);
230		} else {
231			brelse(rabp);
232		}
233	}
234
235	if (readwait) {
236		rv = biowait(bp);
237	}
238	return (rv);
239}
240
241/*
242 * this routine is used by filesystems to get at pages in the PG_CACHE
243 * queue.  also, it is used to read pages that are currently being
244 * written out by the file i/o routines.
245 */
246int
247vfs_read_bypass(struct vnode * vp, struct uio * uio, int maxread, daddr_t lbn)
248{
249	vm_page_t m;
250	vm_offset_t kv;
251	int nread;
252	int error;
253	struct buf *bp, *bpa;
254	vm_object_t obj;
255	int off;
256	int nrest;
257	int flags;
258	int s;
259
260	return 0;
261	/*
262	 * don't use the bypass mechanism for non-vmio vnodes
263	 */
264	if ((vp->v_flag & VVMIO) == 0)
265		return 0;
266	/*
267	 * get the VM object (it has the pages)
268	 */
269	obj = (vm_object_t) vp->v_vmdata;
270	if (obj == NULL)
271		return 0;
272
273	/*
274	 * if there is a buffer that is not busy, it is faster to use it.
275	 * This like read-ahead, etc work better
276	 */
277
278	s = splbio();
279	if ((bp = incore(vp, lbn)) &&
280	    (((bp->b_flags & B_READ) && (bp->b_flags & B_BUSY))
281		|| (bp->b_flags & B_BUSY) == 0)) {
282		splx(s);
283		return 0;
284	}
285	splx(s);
286
287	/*
288	 * get a pbuf --> we just use the kva
289	 */
290	kv = kmem_alloc_wait(pager_map, PAGE_SIZE);
291	nread = 0;
292	error = 0;
293
294	while (!error && uio->uio_resid && maxread > 0) {
295		int po;
296		int count;
297		int s;
298
299relookup:
300		/*
301		 * lookup the page
302		 */
303		m = vm_page_lookup(obj, trunc_page(uio->uio_offset));
304		if (!m)
305			break;
306		/*
307		 * get the offset into the page, and the amount to read in the
308		 * page
309		 */
310		nrest = round_page(uio->uio_offset) - uio->uio_offset;
311		if (nrest > uio->uio_resid)
312			nrest = uio->uio_resid;
313
314		/*
315		 * check the valid bits for the page (DEV_BSIZE chunks)
316		 */
317		if (!vm_page_is_valid(m, uio->uio_offset, nrest))
318			break;
319
320		/*
321		 * if the page is busy, wait for it
322		 */
323		s = splhigh();
324		if (!m->valid || (m->flags & PG_BUSY)) {
325			m->flags |= PG_WANTED;
326			tsleep((caddr_t) m, PVM, "vnibyp", 0);
327			splx(s);
328			goto relookup;
329		}
330		/*
331		 * if the page is on the cache queue, remove it -- cache queue
332		 * pages should be freeable by vm_page_alloc anytime.
333		 */
334		if (m->flags & PG_CACHE) {
335			if (cnt.v_free_count + cnt.v_cache_count < cnt.v_free_reserved) {
336				VM_WAIT;
337				goto relookup;
338			}
339			vm_page_unqueue(m);
340		}
341		/*
342		 * add a buffer mapping (essentially wires the page too).
343		 */
344		m->bmapped++;
345		splx(s);
346
347		/*
348		 * enter it into the kva
349		 */
350		pmap_qenter(kv, &m, 1);
351
352		/*
353		 * do the copy
354		 */
355		po = uio->uio_offset & (PAGE_SIZE - 1);
356		count = PAGE_SIZE - po;
357		if (count > maxread)
358			count = maxread;
359		if (count > uio->uio_resid)
360			count = uio->uio_resid;
361
362		error = uiomove((caddr_t) kv + po, count, uio);
363		if (!error) {
364			nread += count;
365			maxread -= count;
366		}
367		/*
368		 * remove from kva
369		 */
370		pmap_qremove(kv, 1);
371		PAGE_WAKEUP(m);	/* XXX probably unnecessary */
372		/*
373		 * If the page was on the cache queue, then by definition
374		 * bmapped was 0. Thus the following case will also take care
375		 * of the page being removed from the cache queue above.
376		 * Also, it is possible that the page was already entered onto
377		 * another queue (or was already there), so we don't put it
378		 * onto the cache queue...
379		 */
380		m->bmapped--;
381		if (m->bmapped == 0 &&
382		    (m->flags & (PG_CACHE | PG_ACTIVE | PG_INACTIVE)) == 0 &&
383		    m->wire_count == 0) {
384			vm_page_test_dirty(m);
385
386			/*
387			 * make sure that the darned page is on a queue
388			 * somewhere...
389			 */
390			if ((m->dirty & m->valid) == 0) {
391				vm_page_cache(m);
392			} else if (m->hold_count == 0) {
393				vm_page_deactivate(m);
394			} else {
395				vm_page_activate(m);
396			}
397		}
398	}
399	/*
400	 * release our buffer(kva).
401	 */
402	kmem_free_wakeup(pager_map, kv, PAGE_SIZE);
403	return nread;
404}
405
406
407/*
408 * Write, release buffer on completion.  (Done by iodone
409 * if async.)
410 */
411int
412bwrite(struct buf * bp)
413{
414	int oldflags = bp->b_flags;
415
416	if (bp->b_flags & B_INVAL) {
417		brelse(bp);
418		return (0);
419	}
420	if (!(bp->b_flags & B_BUSY))
421		panic("bwrite: buffer is not busy???");
422
423	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
424	bp->b_flags |= B_WRITEINPROG;
425
426	if (oldflags & B_ASYNC) {
427		if (oldflags & B_DELWRI) {
428			reassignbuf(bp, bp->b_vp);
429		} else if (curproc) {
430			++curproc->p_stats->p_ru.ru_oublock;
431		}
432	}
433	bp->b_vp->v_numoutput++;
434	vfs_busy_pages(bp, 1);
435	VOP_STRATEGY(bp);
436
437	if ((oldflags & B_ASYNC) == 0) {
438		int rtval = biowait(bp);
439
440		if (oldflags & B_DELWRI) {
441			reassignbuf(bp, bp->b_vp);
442		} else if (curproc) {
443			++curproc->p_stats->p_ru.ru_oublock;
444		}
445		brelse(bp);
446		return (rtval);
447	}
448	return (0);
449}
450
451int
452vn_bwrite(ap)
453	struct vop_bwrite_args *ap;
454{
455	return (bwrite(ap->a_bp));
456}
457
458/*
459 * Delayed write. (Buffer is marked dirty).
460 */
461void
462bdwrite(struct buf * bp)
463{
464
465	if ((bp->b_flags & B_BUSY) == 0) {
466		panic("bdwrite: buffer is not busy");
467	}
468	if (bp->b_flags & B_INVAL) {
469		brelse(bp);
470		return;
471	}
472	if (bp->b_flags & B_TAPE) {
473		bawrite(bp);
474		return;
475	}
476	bp->b_flags &= ~B_READ;
477	vfs_dirty_pages(bp);
478	if ((bp->b_flags & B_DELWRI) == 0) {
479		if (curproc)
480			++curproc->p_stats->p_ru.ru_oublock;
481		bp->b_flags |= B_DONE | B_DELWRI;
482		reassignbuf(bp, bp->b_vp);
483	}
484	brelse(bp);
485	return;
486}
487
488/*
489 * Asynchronous write.
490 * Start output on a buffer, but do not wait for it to complete.
491 * The buffer is released when the output completes.
492 */
493void
494bawrite(struct buf * bp)
495{
496	if (((bp->b_flags & B_DELWRI) == 0) && (bp->b_vp->v_numoutput > 24)) {
497		int s = splbio();
498
499		while (bp->b_vp->v_numoutput > 16) {
500			bp->b_vp->v_flag |= VBWAIT;
501			tsleep((caddr_t) &bp->b_vp->v_numoutput, PRIBIO, "bawnmo", 0);
502		}
503		splx(s);
504	}
505	bp->b_flags |= B_ASYNC;
506	(void) bwrite(bp);
507}
508
509/*
510 * Release a buffer.
511 */
512void
513brelse(struct buf * bp)
514{
515	int s;
516
517	if (bp->b_flags & B_CLUSTER) {
518		relpbuf(bp);
519		return;
520	}
521	/* anyone need a "free" block? */
522	s = splbio();
523
524	if (needsbuffer) {
525		needsbuffer = 0;
526		wakeup((caddr_t) &needsbuffer);
527	}
528	/* anyone need this block? */
529	if (bp->b_flags & B_WANTED) {
530		bp->b_flags &= ~(B_PDWANTED | B_WANTED | B_AGE);
531		wakeup((caddr_t) bp);
532	} else if (bp->b_flags & B_VMIO) {
533		bp->b_flags &= ~(B_WANTED | B_PDWANTED);
534		wakeup((caddr_t) bp);
535	}
536	if (bp->b_flags & B_LOCKED)
537		bp->b_flags &= ~B_ERROR;
538
539	if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
540	    (bp->b_bufsize <= 0)) {
541		bp->b_flags |= B_INVAL;
542		bp->b_flags &= ~(B_DELWRI | B_CACHE);
543		if (((bp->b_flags & B_VMIO) == 0) && bp->b_vp)
544			brelvp(bp);
545	}
546	if (bp->b_flags & B_VMIO) {
547		vm_offset_t foff;
548		vm_object_t obj;
549		int i, resid;
550		vm_page_t m;
551		int iototal = bp->b_bufsize;
552
553		foff = 0;
554		obj = 0;
555		if (bp->b_npages) {
556			if (bp->b_vp && bp->b_vp->v_mount) {
557				foff = bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
558			} else {
559				/*
560				 * vnode pointer has been ripped away --
561				 * probably file gone...
562				 */
563				foff = bp->b_pages[0]->offset;
564			}
565		}
566		for (i = 0; i < bp->b_npages; i++) {
567			m = bp->b_pages[i];
568			if (m == bogus_page) {
569				panic("brelse: bogus page found");
570			}
571			resid = (m->offset + PAGE_SIZE) - foff;
572			if (resid > iototal)
573				resid = iototal;
574			if (resid > 0) {
575				if (bp->b_flags & (B_ERROR | B_NOCACHE)) {
576					vm_page_set_invalid(m, foff, resid);
577				} else if ((bp->b_flags & B_DELWRI) == 0) {
578					vm_page_set_clean(m, foff, resid);
579					vm_page_set_valid(m, foff, resid);
580				}
581			} else {
582				vm_page_test_dirty(m);
583			}
584			if (bp->b_flags & B_INVAL) {
585				if (m->bmapped == 0) {
586					panic("brelse: bmapped is zero for page\n");
587				}
588				--m->bmapped;
589				if (m->bmapped == 0) {
590					PAGE_WAKEUP(m);
591					if ((m->dirty & m->valid) == 0)
592						vm_page_cache(m);
593				}
594			}
595			foff += resid;
596			iototal -= resid;
597		}
598
599		if (bp->b_flags & B_INVAL) {
600			pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
601			bp->b_npages = 0;
602			bp->b_bufsize = 0;
603			bp->b_flags &= ~B_VMIO;
604			if (bp->b_vp)
605				brelvp(bp);
606			--nvmio;
607		}
608	}
609	if (bp->b_qindex != QUEUE_NONE)
610		panic("brelse: free buffer onto another queue???");
611
612	/* enqueue */
613	/* buffers with no memory */
614	if (bp->b_bufsize == 0) {
615		bp->b_qindex = QUEUE_EMPTY;
616		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
617		LIST_REMOVE(bp, b_hash);
618		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
619		bp->b_dev = NODEV;
620		/* buffers with junk contents */
621	} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE)) {
622		bp->b_qindex = QUEUE_AGE;
623		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
624		LIST_REMOVE(bp, b_hash);
625		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
626		bp->b_dev = NODEV;
627		/* buffers that are locked */
628	} else if (bp->b_flags & B_LOCKED) {
629		bp->b_qindex = QUEUE_LOCKED;
630		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
631		/* buffers with stale but valid contents */
632	} else if (bp->b_flags & B_AGE) {
633		bp->b_qindex = QUEUE_AGE;
634		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
635		/* buffers with valid and quite potentially reuseable contents */
636	} else {
637		if (bp->b_flags & B_VMIO)
638			bp->b_qindex = QUEUE_VMIO;
639		else {
640			bp->b_qindex = QUEUE_LRU;
641			++nlru;
642		}
643		TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
644	}
645
646	/* unlock */
647	bp->b_flags &= ~(B_PDWANTED | B_WANTED | B_BUSY | B_ASYNC | B_NOCACHE | B_AGE);
648	splx(s);
649}
650
651/*
652 * this routine implements clustered async writes for
653 * clearing out B_DELWRI buffers...
654 */
655void
656vfs_bio_awrite(struct buf * bp)
657{
658	int i;
659	daddr_t lblkno = bp->b_lblkno;
660	struct vnode *vp = bp->b_vp;
661	int size = vp->v_mount->mnt_stat.f_iosize;
662	int s;
663	int ncl;
664	struct buf *bpa;
665
666	s = splbio();
667	ncl = 1;
668	if (vp->v_flag & VVMIO) {
669		for (i = 1; i < MAXPHYS / size; i++) {
670			if ((bpa = incore(vp, lblkno + i)) &&
671			    ((bpa->b_flags & (B_BUSY | B_DELWRI | B_BUSY | B_CLUSTEROK | B_INVAL)) == B_DELWRI | B_CLUSTEROK) &&
672			    (bpa->b_bufsize == size)) {
673				if ((bpa->b_blkno == bpa->b_lblkno) ||
674				    (bpa->b_blkno != bp->b_blkno + (i * size) / DEV_BSIZE))
675					break;
676			} else {
677				break;
678			}
679		}
680		ncl = i;
681	}
682	/*
683	 * we don't attempt to cluster meta-data or INVALID??? buffers
684	 */
685	if ((ncl != 1) &&
686	    (bp->b_flags & (B_INVAL | B_CLUSTEROK)) == B_CLUSTEROK) {
687		cluster_wbuild(vp, NULL, size, lblkno, ncl, -1);
688	} else {
689		bremfree(bp);
690		bp->b_flags |= B_BUSY | B_ASYNC;
691		bwrite(bp);
692	}
693	splx(s);
694}
695
696int freebufspace;
697int allocbufspace;
698
699/*
700 * Find a buffer header which is available for use.
701 */
702struct buf *
703getnewbuf(int slpflag, int slptimeo, int doingvmio)
704{
705	struct buf *bp;
706	int s;
707	int firstbp = 1;
708
709	s = splbio();
710start:
711	/* can we constitute a new buffer? */
712	if ((bp = bufqueues[QUEUE_EMPTY].tqh_first)) {
713		if (bp->b_qindex != QUEUE_EMPTY)
714			panic("getnewbuf: inconsistent EMPTY queue");
715		bremfree(bp);
716		goto fillbuf;
717	}
718	/*
719	 * we keep the file I/O from hogging metadata I/O
720	 */
721	if (bp = bufqueues[QUEUE_AGE].tqh_first) {
722		if (bp->b_qindex != QUEUE_AGE)
723			panic("getnewbuf: inconsistent AGE queue");
724	} else if ((nvmio > (2 * nbuf / 3))
725	    && (bp = bufqueues[QUEUE_VMIO].tqh_first)) {
726		if (bp->b_qindex != QUEUE_VMIO)
727			panic("getnewbuf: inconsistent VMIO queue");
728	} else if ((!doingvmio || (nlru > (2 * nbuf / 3))) &&
729	    (bp = bufqueues[QUEUE_LRU].tqh_first)) {
730		if (bp->b_qindex != QUEUE_LRU)
731			panic("getnewbuf: inconsistent LRU queue");
732	}
733	if (!bp) {
734		if (doingvmio) {
735			if (bp = bufqueues[QUEUE_VMIO].tqh_first) {
736				if (bp->b_qindex != QUEUE_VMIO)
737					panic("getnewbuf: inconsistent VMIO queue");
738			} else if (bp = bufqueues[QUEUE_LRU].tqh_first) {
739				if (bp->b_qindex != QUEUE_LRU)
740					panic("getnewbuf: inconsistent LRU queue");
741			}
742		} else {
743			if (bp = bufqueues[QUEUE_LRU].tqh_first) {
744				if (bp->b_qindex != QUEUE_LRU)
745					panic("getnewbuf: inconsistent LRU queue");
746			} else if (bp = bufqueues[QUEUE_VMIO].tqh_first) {
747				if (bp->b_qindex != QUEUE_VMIO)
748					panic("getnewbuf: inconsistent VMIO queue");
749			}
750		}
751	}
752	if (!bp) {
753		/* wait for a free buffer of any kind */
754		needsbuffer = 1;
755		tsleep((caddr_t) &needsbuffer, PRIBIO | slpflag, "newbuf", slptimeo);
756		splx(s);
757		return (0);
758	}
759	/* if we are a delayed write, convert to an async write */
760	if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
761		vfs_bio_awrite(bp);
762		if (!slpflag && !slptimeo) {
763			splx(s);
764			return (0);
765		}
766		goto start;
767	}
768	bremfree(bp);
769
770	if (bp->b_flags & B_VMIO) {
771		bp->b_flags |= B_INVAL | B_BUSY;
772		brelse(bp);
773		bremfree(bp);
774	}
775	if (bp->b_vp)
776		brelvp(bp);
777
778	/* we are not free, nor do we contain interesting data */
779	if (bp->b_rcred != NOCRED)
780		crfree(bp->b_rcred);
781	if (bp->b_wcred != NOCRED)
782		crfree(bp->b_wcred);
783fillbuf:
784	bp->b_flags = B_BUSY;
785	LIST_REMOVE(bp, b_hash);
786	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
787	splx(s);
788	if (bp->b_bufsize) {
789		allocbuf(bp, 0, 0);
790	}
791	bp->b_dev = NODEV;
792	bp->b_vp = NULL;
793	bp->b_blkno = bp->b_lblkno = 0;
794	bp->b_iodone = 0;
795	bp->b_error = 0;
796	bp->b_resid = 0;
797	bp->b_bcount = 0;
798	bp->b_npages = 0;
799	bp->b_wcred = bp->b_rcred = NOCRED;
800	bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
801	bp->b_dirtyoff = bp->b_dirtyend = 0;
802	bp->b_validoff = bp->b_validend = 0;
803	return (bp);
804}
805
806/*
807 * Check to see if a block is currently memory resident.
808 */
809struct buf *
810incore(struct vnode * vp, daddr_t blkno)
811{
812	struct buf *bp;
813	struct bufhashhdr *bh;
814
815	int s = splbio();
816
817	bh = BUFHASH(vp, blkno);
818	bp = bh->lh_first;
819
820	/* Search hash chain */
821	while (bp) {
822		/* hit */
823		if (bp->b_lblkno == blkno && bp->b_vp == vp
824		    && (bp->b_flags & B_INVAL) == 0) {
825			splx(s);
826			return (bp);
827		}
828		bp = bp->b_hash.le_next;
829	}
830	splx(s);
831
832	return (0);
833}
834
835/*
836 * returns true if no I/O is needed to access the
837 * associated VM object.
838 */
839
840int
841inmem(struct vnode * vp, daddr_t blkno)
842{
843	vm_object_t obj;
844	vm_offset_t off, toff, tinc;
845	vm_page_t m;
846
847	if (incore(vp, blkno))
848		return 1;
849	if (vp->v_mount == 0)
850		return 0;
851	if (vp->v_vmdata == 0)
852		return 0;
853
854	obj = (vm_object_t) vp->v_vmdata;
855	tinc = PAGE_SIZE;
856	if (tinc > vp->v_mount->mnt_stat.f_iosize)
857		tinc = vp->v_mount->mnt_stat.f_iosize;
858	off = blkno * vp->v_mount->mnt_stat.f_iosize;
859
860	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
861		int mask;
862
863		m = vm_page_lookup(obj, trunc_page(toff + off));
864		if (!m)
865			return 0;
866		if (vm_page_is_valid(m, toff + off, tinc) == 0)
867			return 0;
868	}
869	return 1;
870}
871
872/*
873 * Get a block given a specified block and offset into a file/device.
874 */
875struct buf *
876getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
877{
878	struct buf *bp;
879	int s;
880	struct bufhashhdr *bh;
881	vm_offset_t off;
882	int bsize;
883	int nleft;
884
885	bsize = DEV_BSIZE;
886	if (vp->v_mount) {
887		bsize = vp->v_mount->mnt_stat.f_iosize;
888	}
889	s = splbio();
890loop:
891	if ((cnt.v_free_count + cnt.v_cache_count) <
892	    cnt.v_free_reserved + MAXBSIZE / PAGE_SIZE)
893		wakeup((caddr_t) &vm_pages_needed);
894	if (bp = incore(vp, blkno)) {
895		if (bp->b_flags & B_BUSY) {
896			bp->b_flags |= B_WANTED;
897			if (curproc == pageproc) {
898				bp->b_flags |= B_PDWANTED;
899				wakeup((caddr_t) &cnt.v_free_count);
900			}
901			if (!tsleep((caddr_t) bp, PRIBIO | slpflag, "getblk", slptimeo))
902				goto loop;
903			splx(s);
904			return (struct buf *) NULL;
905		}
906		bp->b_flags |= B_BUSY | B_CACHE;
907		bremfree(bp);
908		/*
909		 * check for size inconsistancies
910		 */
911		if (bp->b_bcount != size) {
912#if defined(VFS_BIO_DEBUG)
913			printf("getblk: invalid buffer size: %ld\n", bp->b_bcount);
914#endif
915			bp->b_flags |= B_INVAL;
916			bwrite(bp);
917			goto loop;
918		}
919		splx(s);
920		return (bp);
921	} else {
922		vm_object_t obj;
923		int doingvmio;
924
925		if ((obj = (vm_object_t) vp->v_vmdata) &&
926		    (vp->v_flag & VVMIO) /* && (blkno >= 0) */ ) {
927			doingvmio = 1;
928		} else {
929			doingvmio = 0;
930		}
931		if ((bp = getnewbuf(slpflag, slptimeo, doingvmio)) == 0) {
932			if (slpflag || slptimeo)
933				return NULL;
934			goto loop;
935		}
936		if (incore(vp, blkno)) {
937			bp->b_flags |= B_INVAL;
938			brelse(bp);
939			goto loop;
940		}
941		bp->b_blkno = bp->b_lblkno = blkno;
942		bgetvp(vp, bp);
943		LIST_REMOVE(bp, b_hash);
944		bh = BUFHASH(vp, blkno);
945		LIST_INSERT_HEAD(bh, bp, b_hash);
946		if (doingvmio) {
947			bp->b_flags |= (B_VMIO | B_CACHE);
948#if defined(VFS_BIO_DEBUG)
949			if (vp->v_type != VREG)
950				printf("getblk: vmioing file type %d???\n", vp->v_type);
951#endif
952			++nvmio;
953		} else {
954			if (bp->b_flags & B_VMIO)
955				--nvmio;
956			bp->b_flags &= ~B_VMIO;
957		}
958		splx(s);
959		if (!allocbuf(bp, size, 1)) {
960			s = splbio();
961			goto loop;
962		}
963		return (bp);
964	}
965}
966
967/*
968 * Get an empty, disassociated buffer of given size.
969 */
970struct buf *
971geteblk(int size)
972{
973	struct buf *bp;
974
975	while ((bp = getnewbuf(0, 0, 0)) == 0);
976	allocbuf(bp, size, 0);
977	bp->b_flags |= B_INVAL;
978	return (bp);
979}
980
981/*
982 * Modify the length of a buffer's underlying buffer storage without
983 * destroying information (unless, of course the buffer is shrinking).
984 */
985int
986allocbuf(struct buf * bp, int size, int vmio)
987{
988
989	int s;
990	int newbsize;
991	int i;
992
993	if ((bp->b_flags & B_VMIO) == 0) {
994		newbsize = round_page(size);
995		if (newbsize == bp->b_bufsize) {
996			bp->b_bcount = size;
997			return 1;
998		} else if (newbsize < bp->b_bufsize) {
999			if (bp->b_flags & B_MALLOC) {
1000				bp->b_bcount = size;
1001				return 1;
1002			}
1003			vm_hold_free_pages(
1004			    bp,
1005			    (vm_offset_t) bp->b_data + newbsize,
1006			    (vm_offset_t) bp->b_data + bp->b_bufsize);
1007		} else if (newbsize > bp->b_bufsize) {
1008			if (bp->b_flags & B_MALLOC) {
1009				vm_offset_t bufaddr;
1010
1011				bufaddr = (vm_offset_t) bp->b_data;
1012				bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
1013				vm_hold_load_pages(
1014				    bp,
1015				    (vm_offset_t) bp->b_data,
1016				    (vm_offset_t) bp->b_data + newbsize);
1017				bcopy((caddr_t) bufaddr, bp->b_data, bp->b_bcount);
1018				free((caddr_t) bufaddr, M_TEMP);
1019			} else if ((newbsize <= PAGE_SIZE / 2) && (bp->b_bufsize == 0)) {
1020				bp->b_flags |= B_MALLOC;
1021				bp->b_data = malloc(newbsize, M_TEMP, M_WAITOK);
1022				bp->b_npages = 0;
1023			} else {
1024				vm_hold_load_pages(
1025				    bp,
1026				    (vm_offset_t) bp->b_data + bp->b_bufsize,
1027				    (vm_offset_t) bp->b_data + newbsize);
1028			}
1029		}
1030		/*
1031		 * adjust buffer cache's idea of memory allocated to buffer
1032		 * contents
1033		 */
1034		freebufspace -= newbsize - bp->b_bufsize;
1035		allocbufspace += newbsize - bp->b_bufsize;
1036	} else {
1037		vm_page_t m;
1038		int desiredpages;
1039
1040		newbsize = ((size + DEV_BSIZE - 1) / DEV_BSIZE) * DEV_BSIZE;
1041		desiredpages = round_page(newbsize) / PAGE_SIZE;
1042
1043		if (newbsize == bp->b_bufsize) {
1044			bp->b_bcount = size;
1045			return 1;
1046		} else if (newbsize < bp->b_bufsize) {
1047			if (desiredpages < bp->b_npages) {
1048				pmap_qremove((vm_offset_t) trunc_page(bp->b_data) +
1049				    desiredpages * PAGE_SIZE, (bp->b_npages - desiredpages));
1050				for (i = desiredpages; i < bp->b_npages; i++) {
1051					m = bp->b_pages[i];
1052					s = splhigh();
1053					if ((m->flags & PG_BUSY) || (m->busy != 0)) {
1054						m->flags |= PG_WANTED;
1055						tsleep(m, PVM, "biodep", 0);
1056					}
1057					splx(s);
1058
1059					if (m->bmapped == 0) {
1060						printf("allocbuf: bmapped is zero for page %d\n", i);
1061						panic("allocbuf: error");
1062					}
1063					--m->bmapped;
1064					if (m->bmapped == 0) {
1065						PAGE_WAKEUP(m);
1066						pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_NONE);
1067						vm_page_free(m);
1068					}
1069					bp->b_pages[i] = NULL;
1070				}
1071				bp->b_npages = desiredpages;
1072			}
1073		} else {
1074			vm_object_t obj;
1075			vm_offset_t tinc, off, toff, objoff;
1076			int pageindex, curbpnpages;
1077			struct vnode *vp;
1078			int bsize;
1079
1080			vp = bp->b_vp;
1081			bsize = vp->v_mount->mnt_stat.f_iosize;
1082
1083			if (bp->b_npages < desiredpages) {
1084				obj = (vm_object_t) vp->v_vmdata;
1085				tinc = PAGE_SIZE;
1086				if (tinc > bsize)
1087					tinc = bsize;
1088				off = bp->b_lblkno * bsize;
1089				curbpnpages = bp->b_npages;
1090		doretry:
1091				for (toff = 0; toff < newbsize; toff += tinc) {
1092					int mask;
1093					int bytesinpage;
1094
1095					pageindex = toff / PAGE_SIZE;
1096					objoff = trunc_page(toff + off);
1097					if (pageindex < curbpnpages) {
1098						int pb;
1099
1100						m = bp->b_pages[pageindex];
1101						if (m->offset != objoff)
1102							panic("allocbuf: page changed offset??!!!?");
1103						bytesinpage = tinc;
1104						if (tinc > (newbsize - toff))
1105							bytesinpage = newbsize - toff;
1106						if (!vm_page_is_valid(m, toff + off, bytesinpage)) {
1107							bp->b_flags &= ~B_CACHE;
1108						}
1109						if ((m->flags & PG_ACTIVE) == 0)
1110							vm_page_activate(m);
1111						continue;
1112					}
1113					m = vm_page_lookup(obj, objoff);
1114					if (!m) {
1115						m = vm_page_alloc(obj, objoff, 0);
1116						if (!m) {
1117							int j;
1118
1119							for (j = bp->b_npages; j < pageindex; j++) {
1120								vm_page_t mt = bp->b_pages[j];
1121
1122								PAGE_WAKEUP(mt);
1123								if (!mt->valid) {
1124									vm_page_free(mt);
1125								}
1126							}
1127							VM_WAIT;
1128							if (vmio && (bp->b_flags & B_PDWANTED)) {
1129								--nvmio;
1130								bp->b_flags &= ~B_VMIO;
1131								bp->b_flags |= B_INVAL;
1132								brelse(bp);
1133								return 0;
1134							}
1135							curbpnpages = bp->b_npages;
1136							goto doretry;
1137						}
1138						m->valid = 0;
1139						vm_page_activate(m);
1140					} else if ((m->valid == 0) || (m->flags & PG_BUSY)) {
1141						int j;
1142						int bufferdestroyed = 0;
1143
1144						for (j = bp->b_npages; j < pageindex; j++) {
1145							vm_page_t mt = bp->b_pages[j];
1146
1147							PAGE_WAKEUP(mt);
1148							if (mt->valid == 0) {
1149								vm_page_free(mt);
1150							}
1151						}
1152						if (vmio && (bp->b_flags & B_PDWANTED)) {
1153							--nvmio;
1154							bp->b_flags &= ~B_VMIO;
1155							bp->b_flags |= B_INVAL;
1156							brelse(bp);
1157							VM_WAIT;
1158							bufferdestroyed = 1;
1159						}
1160						s = splbio();
1161						if (m) {
1162							m->flags |= PG_WANTED;
1163							tsleep(m, PRIBIO, "pgtblk", 0);
1164						}
1165						splx(s);
1166						if (bufferdestroyed)
1167							return 0;
1168						curbpnpages = bp->b_npages;
1169						goto doretry;
1170					} else {
1171						int pb;
1172
1173						if ((m->flags & PG_CACHE) &&
1174						    (cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_reserved) {
1175							int j;
1176
1177							for (j = bp->b_npages; j < pageindex; j++) {
1178								vm_page_t mt = bp->b_pages[j];
1179
1180								PAGE_WAKEUP(mt);
1181								if (mt->valid == 0) {
1182									vm_page_free(mt);
1183								}
1184							}
1185							VM_WAIT;
1186							if (vmio && (bp->b_flags & B_PDWANTED)) {
1187								--nvmio;
1188								bp->b_flags &= ~B_VMIO;
1189								bp->b_flags |= B_INVAL;
1190								brelse(bp);
1191								return 0;
1192							}
1193							curbpnpages = bp->b_npages;
1194							goto doretry;
1195						}
1196						bytesinpage = tinc;
1197						if (tinc > (newbsize - toff))
1198							bytesinpage = newbsize - toff;
1199						if (!vm_page_is_valid(m, toff + off, bytesinpage)) {
1200							bp->b_flags &= ~B_CACHE;
1201						}
1202						if ((m->flags & PG_ACTIVE) == 0)
1203							vm_page_activate(m);
1204						m->flags |= PG_BUSY;
1205					}
1206					bp->b_pages[pageindex] = m;
1207					curbpnpages = pageindex + 1;
1208				}
1209				if (bsize >= PAGE_SIZE) {
1210					for (i = bp->b_npages; i < curbpnpages; i++) {
1211						m = bp->b_pages[i];
1212						if (m->valid == 0) {
1213							bp->b_flags &= ~B_CACHE;
1214						}
1215						m->bmapped++;
1216						PAGE_WAKEUP(m);
1217					}
1218				} else {
1219					if (!vm_page_is_valid(bp->b_pages[0], off, bsize))
1220						bp->b_flags &= ~B_CACHE;
1221					bp->b_pages[0]->bmapped++;
1222					PAGE_WAKEUP(bp->b_pages[0]);
1223				}
1224				bp->b_npages = curbpnpages;
1225				bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
1226				pmap_qenter((vm_offset_t) bp->b_data, bp->b_pages, bp->b_npages);
1227				bp->b_data += off % PAGE_SIZE;
1228			}
1229		}
1230	}
1231	bp->b_bufsize = newbsize;
1232	bp->b_bcount = size;
1233	return 1;
1234}
1235
1236/*
1237 * Wait for buffer I/O completion, returning error status.
1238 */
1239int
1240biowait(register struct buf * bp)
1241{
1242	int s;
1243
1244	s = splbio();
1245	while ((bp->b_flags & B_DONE) == 0)
1246		tsleep((caddr_t) bp, PRIBIO, "biowait", 0);
1247	if ((bp->b_flags & B_ERROR) || bp->b_error) {
1248		if ((bp->b_flags & B_INVAL) == 0) {
1249			bp->b_flags |= B_INVAL;
1250			bp->b_dev = NODEV;
1251			LIST_REMOVE(bp, b_hash);
1252			LIST_INSERT_HEAD(&invalhash, bp, b_hash);
1253			wakeup((caddr_t) bp);
1254		}
1255		if (!bp->b_error)
1256			bp->b_error = EIO;
1257		else
1258			bp->b_flags |= B_ERROR;
1259		splx(s);
1260		return (bp->b_error);
1261	} else {
1262		splx(s);
1263		return (0);
1264	}
1265}
1266
1267/*
1268 * Finish I/O on a buffer, calling an optional function.
1269 * This is usually called from interrupt level, so process blocking
1270 * is not *a good idea*.
1271 */
1272void
1273biodone(register struct buf * bp)
1274{
1275	int s;
1276
1277	s = splbio();
1278	if (bp->b_flags & B_DONE)
1279		printf("biodone: buffer already done\n");
1280	bp->b_flags |= B_DONE;
1281
1282	if ((bp->b_flags & B_READ) == 0) {
1283		vwakeup(bp);
1284	}
1285#ifdef BOUNCE_BUFFERS
1286	if (bp->b_flags & B_BOUNCE)
1287		vm_bounce_free(bp);
1288#endif
1289
1290	/* call optional completion function if requested */
1291	if (bp->b_flags & B_CALL) {
1292		bp->b_flags &= ~B_CALL;
1293		(*bp->b_iodone) (bp);
1294		splx(s);
1295		return;
1296	}
1297	if (bp->b_flags & B_VMIO) {
1298		int i, resid;
1299		vm_offset_t foff;
1300		vm_page_t m;
1301		vm_object_t obj;
1302		int iosize;
1303		struct vnode *vp = bp->b_vp;
1304
1305		foff = vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1306		obj = (vm_object_t) vp->v_vmdata;
1307		if (!obj) {
1308			return;
1309		}
1310#if defined(VFS_BIO_DEBUG)
1311		if (obj->paging_in_progress < bp->b_npages) {
1312			printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
1313			    obj->paging_in_progress, bp->b_npages);
1314		}
1315#endif
1316		iosize = bp->b_bufsize;
1317		for (i = 0; i < bp->b_npages; i++) {
1318			m = bp->b_pages[i];
1319			if (m == bogus_page) {
1320				m = vm_page_lookup(obj, foff);
1321				if (!m) {
1322#if defined(VFS_BIO_DEBUG)
1323					printf("biodone: page disappeared\n");
1324#endif
1325					--obj->paging_in_progress;
1326					continue;
1327				}
1328				bp->b_pages[i] = m;
1329				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1330			}
1331#if defined(VFS_BIO_DEBUG)
1332			if (trunc_page(foff) != m->offset) {
1333				printf("biodone: foff(%d)/m->offset(%d) mismatch\n", foff, m->offset);
1334			}
1335#endif
1336			resid = (m->offset + PAGE_SIZE) - foff;
1337			if (resid > iosize)
1338				resid = iosize;
1339			if (resid > 0) {
1340				vm_page_set_valid(m, foff, resid);
1341				vm_page_set_clean(m, foff, resid);
1342			}
1343			if (m->busy == 0) {
1344				printf("biodone: page busy < 0, off: %d, foff: %d, resid: %d, index: %d\n",
1345				    m->offset, foff, resid, i);
1346				printf(" iosize: %d, lblkno: %d\n",
1347				    bp->b_vp->v_mount->mnt_stat.f_iosize, bp->b_lblkno);
1348				printf(" valid: 0x%x, dirty: 0x%x, mapped: %d\n",
1349				    m->valid, m->dirty, m->bmapped);
1350				panic("biodone: page busy < 0\n");
1351			}
1352			--m->busy;
1353			PAGE_WAKEUP(m);
1354			--obj->paging_in_progress;
1355			foff += resid;
1356			iosize -= resid;
1357		}
1358		if (obj && obj->paging_in_progress == 0)
1359			wakeup((caddr_t) obj);
1360	}
1361	/*
1362	 * For asynchronous completions, release the buffer now. The brelse
1363	 * checks for B_WANTED and will do the wakeup there if necessary - so
1364	 * no need to do a wakeup here in the async case.
1365	 */
1366
1367	if (bp->b_flags & B_ASYNC) {
1368		brelse(bp);
1369	} else {
1370		bp->b_flags &= ~(B_WANTED | B_PDWANTED);
1371		wakeup((caddr_t) bp);
1372	}
1373	splx(s);
1374}
1375
1376int
1377count_lock_queue()
1378{
1379	int count;
1380	struct buf *bp;
1381
1382	count = 0;
1383	for (bp = bufqueues[QUEUE_LOCKED].tqh_first;
1384	    bp != NULL;
1385	    bp = bp->b_freelist.tqe_next)
1386		count++;
1387	return (count);
1388}
1389
1390int vfs_update_interval = 30;
1391
1392void
1393vfs_update()
1394{
1395	(void) spl0();
1396	while (1) {
1397		tsleep((caddr_t) &vfs_update_wakeup, PRIBIO, "update",
1398		    hz * vfs_update_interval);
1399		vfs_update_wakeup = 0;
1400		sync(curproc, NULL, NULL);
1401	}
1402}
1403
1404void
1405vfs_unbusy_pages(struct buf * bp)
1406{
1407	int i;
1408
1409	if (bp->b_flags & B_VMIO) {
1410		struct vnode *vp = bp->b_vp;
1411		vm_object_t obj = (vm_object_t) vp->v_vmdata;
1412		vm_offset_t foff;
1413
1414		foff = vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1415
1416		for (i = 0; i < bp->b_npages; i++) {
1417			vm_page_t m = bp->b_pages[i];
1418
1419			if (m == bogus_page) {
1420				m = vm_page_lookup(obj, foff);
1421				if (!m) {
1422					panic("vfs_unbusy_pages: page missing\n");
1423				}
1424				bp->b_pages[i] = m;
1425				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1426			}
1427			--obj->paging_in_progress;
1428			--m->busy;
1429			PAGE_WAKEUP(m);
1430		}
1431		if (obj->paging_in_progress == 0)
1432			wakeup((caddr_t) obj);
1433	}
1434}
1435
1436void
1437vfs_busy_pages(struct buf * bp, int clear_modify)
1438{
1439	int i;
1440
1441	if (bp->b_flags & B_VMIO) {
1442		vm_object_t obj = (vm_object_t) bp->b_vp->v_vmdata;
1443		vm_offset_t foff = bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1444		int iocount = bp->b_bufsize;
1445
1446		for (i = 0; i < bp->b_npages; i++) {
1447			vm_page_t m = bp->b_pages[i];
1448			int resid = (m->offset + PAGE_SIZE) - foff;
1449
1450			if (resid > iocount)
1451				resid = iocount;
1452			obj->paging_in_progress++;
1453			m->busy++;
1454			if (clear_modify) {
1455				vm_page_test_dirty(m);
1456				pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_READ);
1457			} else if (bp->b_bcount >= PAGE_SIZE) {
1458				if (m->valid && (bp->b_flags & B_CACHE) == 0) {
1459					bp->b_pages[i] = bogus_page;
1460					pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1461				}
1462			}
1463			foff += resid;
1464			iocount -= resid;
1465		}
1466	}
1467}
1468
1469void
1470vfs_dirty_pages(struct buf * bp)
1471{
1472	int i;
1473
1474	if (bp->b_flags & B_VMIO) {
1475		vm_offset_t foff = bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1476		int iocount = bp->b_bufsize;
1477
1478		for (i = 0; i < bp->b_npages; i++) {
1479			vm_page_t m = bp->b_pages[i];
1480			int resid = (m->offset + PAGE_SIZE) - foff;
1481
1482			if (resid > iocount)
1483				resid = iocount;
1484			if (resid > 0) {
1485				vm_page_set_valid(m, foff, resid);
1486				vm_page_set_dirty(m, foff, resid);
1487			}
1488			PAGE_WAKEUP(m);
1489			foff += resid;
1490			iocount -= resid;
1491		}
1492	}
1493}
1494/*
1495 * these routines are not in the correct place (yet)
1496 * also they work *ONLY* for kernel_pmap!!!
1497 */
1498void
1499vm_hold_load_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
1500{
1501	vm_offset_t pg;
1502	vm_page_t p;
1503	vm_offset_t from = round_page(froma);
1504	vm_offset_t to = round_page(toa);
1505
1506tryagain0:
1507	if ((curproc != pageproc) && ((cnt.v_free_count + cnt.v_cache_count) <=
1508		cnt.v_free_reserved + (toa - froma) / PAGE_SIZE)) {
1509		VM_WAIT;
1510		goto tryagain0;
1511	}
1512	for (pg = from; pg < to; pg += PAGE_SIZE) {
1513
1514tryagain:
1515
1516		p = vm_page_alloc(kernel_object, pg - VM_MIN_KERNEL_ADDRESS, 0);
1517		if (!p) {
1518			VM_WAIT;
1519			goto tryagain;
1520		}
1521		vm_page_wire(p);
1522		pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
1523		bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE] = p;
1524		PAGE_WAKEUP(p);
1525		bp->b_npages++;
1526	}
1527}
1528
1529void
1530vm_hold_free_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
1531{
1532	vm_offset_t pg;
1533	vm_page_t p;
1534	vm_offset_t from = round_page(froma);
1535	vm_offset_t to = round_page(toa);
1536
1537	for (pg = from; pg < to; pg += PAGE_SIZE) {
1538		p = bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE];
1539		bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE] = 0;
1540		pmap_kremove(pg);
1541		vm_page_free(p);
1542		--bp->b_npages;
1543	}
1544}
1545
1546void
1547bufstats()
1548{
1549}
1550