vfs_bio.c revision 5484
1/*
2 * Copyright (c) 1994 John S. Dyson
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice immediately at the beginning of the file, without modification,
10 *    this list of conditions, and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. Absolutely no warranty of function or purpose is made by the author
15 *    John S. Dyson.
16 * 4. This work was done expressly for inclusion into FreeBSD.  Other use
17 *    is allowed if this notation is included.
18 * 5. Modifications may be freely made to this file if the above conditions
19 *    are met.
20 *
21 * $Id: vfs_bio.c,v 1.19 1995/01/10 09:20:34 davidg Exp $
22 */
23
24/*
25 * This file implements a new buffer I/O scheme that keeps the buffer
26 * cache coherent with the VM object system.  Pains have been taken to
27 * make sure that the performance degradation usually associated with
28 * such schemes is not realized.
29 *
30 * Author:  John S. Dyson
31 * Significant help during the development and debugging phases
32 * was provided by David Greenman, also of the FreeBSD core team.
33 */
34
35#define VMIO
36#include <sys/param.h>
37#include <sys/systm.h>
38#include <sys/kernel.h>
39#include <sys/proc.h>
40#include <sys/vnode.h>
41#include <vm/vm.h>
42#include <vm/vm_pageout.h>
43#include <vm/vm_page.h>
44#include <vm/vm_object.h>
45#include <sys/buf.h>
46#include <sys/mount.h>
47#include <sys/malloc.h>
48#include <sys/resourcevar.h>
50
51#include <miscfs/specfs/specdev.h>
52
53struct buf *buf;		/* buffer header pool */
54int nbuf;			/* number of buffer headers calculated
55				 * elsewhere */
56struct swqueue bswlist;
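/*
 * nvmio counts buffers currently backed by VMIO; nlru counts buffers on
 * the LRU free queue.
 */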
57int nvmio, nlru;
58
59extern vm_map_t buffer_map, io_map, kernel_map, pager_map;
60
61void vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to);
62void vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to);
63void vfs_dirty_pages(struct buf * bp);
64void vfs_busy_pages(struct buf *, int clear_modify);
65
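/*
 * Set by getnewbuf() when a process is waiting for a free buffer;
 * cleared (and the sleeper awakened) in brelse().
 */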
66int needsbuffer;
67
68/*
69 * Internal update daemon, process 3
70 *	The variable vfs_update_wakeup allows for internal syncs.
71 */
72int vfs_update_wakeup;
73
74
75/*
76 * buffers base kva
77 */
78caddr_t buffers_kva;
79
80/*
81 * bogus page -- for I/O to/from partially complete buffers
82 */
83vm_page_t bogus_page;
84vm_offset_t bogus_offset;
85
86/*
87 * Initialize buffer headers and related structures.
88 */
89void
90bufinit()
91{
92	struct buf *bp;
93	int i;
94
95	TAILQ_INIT(&bswlist);
96	LIST_INIT(&invalhash);
97
98	/* first, make a null hash table */
99	for (i = 0; i < BUFHSZ; i++)
100		LIST_INIT(&bufhashtbl[i]);
101
102	/* next, make a null set of free lists */
103	for (i = 0; i < BUFFER_QUEUES; i++)
104		TAILQ_INIT(&bufqueues[i]);
105
106	buffers_kva = (caddr_t) kmem_alloc_pageable(buffer_map, MAXBSIZE * nbuf);
107	/* finally, initialize each buffer header and stick on empty q */
108	for (i = 0; i < nbuf; i++) {
109		bp = &buf[i];
110		bzero(bp, sizeof *bp);
111		bp->b_flags = B_INVAL;	/* we're just an empty header */
112		bp->b_dev = NODEV;
113		bp->b_vp = NULL;
114		bp->b_rcred = NOCRED;
115		bp->b_wcred = NOCRED;
116		bp->b_qindex = QUEUE_EMPTY;
117		bp->b_vnbufs.le_next = NOLIST;
118		bp->b_data = buffers_kva + i * MAXBSIZE;
119		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
120		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
121	}
122
123	bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
124	bogus_page = vm_page_alloc(kernel_object, bogus_offset - VM_MIN_KERNEL_ADDRESS, 0);
125
126}
127
128/*
129 * remove the buffer from the appropriate free list
130 */
131void
132bremfree(struct buf * bp)
133{
134	int s = splbio();
135
136	if (bp->b_qindex != QUEUE_NONE) {
137		if (bp->b_qindex == QUEUE_LRU)
138			--nlru;
139		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
140		bp->b_qindex = QUEUE_NONE;
141	} else {
142		panic("bremfree: removing a buffer when not on a queue");
143	}
144	splx(s);
145}
146
147/*
148 * Get a buffer with the specified data.  Look in the cache first.
149 */
150int
151bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
152    struct buf ** bpp)
153{
154	struct buf *bp;
155
156	bp = getblk(vp, blkno, size, 0, 0);
157	*bpp = bp;
158
159	/* if not found in cache, do some I/O */
160	if ((bp->b_flags & B_CACHE) == 0) {
161		if (curproc && curproc->p_stats)	/* count block I/O */
162			curproc->p_stats->p_ru.ru_inblock++;
163		bp->b_flags |= B_READ;
164		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
165		if (bp->b_rcred == NOCRED) {
166			if (cred != NOCRED)
167				crhold(cred);
168			bp->b_rcred = cred;
169		}
170		vfs_busy_pages(bp, 0);
171		VOP_STRATEGY(bp);
172		return (biowait(bp));
173	} else if (bp->b_lblkno == bp->b_blkno) {
174		VOP_BMAP(vp, bp->b_lblkno, (struct vnode **) 0,
175		    &bp->b_blkno, (int *) 0);
176	}
177	return (0);
178}
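
/*
 * Typical bread() usage sketch (hypothetical caller, for illustration
 * only):
 *
 *	struct buf *bp;
 *	int error;
 *
 *	error = bread(vp, lbn, size, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	... examine bp->b_data ...
 *	brelse(bp);
 */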
179
180/*
181 * Operates like bread, but also starts asynchronous I/O on
182 * read-ahead blocks.
183 */
184int
185breadn(struct vnode * vp, daddr_t blkno, int size,
186    daddr_t * rablkno, int *rabsize,
187    int cnt, struct ucred * cred, struct buf ** bpp)
188{
189	struct buf *bp, *rabp;
190	int i;
191	int rv = 0, readwait = 0;
192
193	*bpp = bp = getblk(vp, blkno, size, 0, 0);
194
195	/* if not found in cache, do some I/O */
196	if ((bp->b_flags & B_CACHE) == 0) {
197		if (curproc && curproc->p_stats)	/* count block I/O */
198			curproc->p_stats->p_ru.ru_inblock++;
199		bp->b_flags |= B_READ;
200		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
201		if (bp->b_rcred == NOCRED) {
202			if (cred != NOCRED)
203				crhold(cred);
204			bp->b_rcred = cred;
205		}
206		vfs_busy_pages(bp, 0);
207		VOP_STRATEGY(bp);
208		++readwait;
209	} else if (bp->b_lblkno == bp->b_blkno) {
210		VOP_BMAP(vp, bp->b_lblkno, (struct vnode **) 0,
211		    &bp->b_blkno, (int *) 0);
212	}
213	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
214		if (inmem(vp, *rablkno))
215			continue;
216		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
217
218		if ((rabp->b_flags & B_CACHE) == 0) {
219			if (curproc && curproc->p_stats)
220				curproc->p_stats->p_ru.ru_inblock++;
221			rabp->b_flags |= B_READ | B_ASYNC;
222			rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
223			if (rabp->b_rcred == NOCRED) {
224				if (cred != NOCRED)
225					crhold(cred);
226				rabp->b_rcred = cred;
227			}
228			vfs_busy_pages(rabp, 0);
229			VOP_STRATEGY(rabp);
230		} else {
231			brelse(rabp);
232		}
233	}
234
235	if (readwait) {
236		rv = biowait(bp);
237	}
238	return (rv);
239}
240
241/*
242 * This routine is used by filesystems to get at pages in the PG_CACHE
243 * queue.  It is also used to read pages that are currently being
244 * written out by the file I/O routines.
245 */
246int
247vfs_read_bypass(struct vnode * vp, struct uio * uio, int maxread, daddr_t lbn)
248{
249	vm_page_t m;
250	vm_offset_t kv;
251	int nread;
252	int error;
253	struct buf *bp, *bpa;
254	vm_object_t obj;
255	int off;
256	int nrest;
257	int flags;
258	int s;
259
260	return 0;		/* XXX bypass is disabled; the code below is unreachable */
261	/*
262	 * don't use the bypass mechanism for non-vmio vnodes
263	 */
264	if ((vp->v_flag & VVMIO) == 0)
265		return 0;
266	/*
267	 * get the VM object (it has the pages)
268	 */
269	obj = (vm_object_t) vp->v_vmdata;
270	if (obj == NULL)
271		return 0;
272
273	/*
274	 * if there is a buffer that is not busy, it is faster to use it.
275	 * This way read-ahead, etc., work better.
276	 */
277
278	s = splbio();
279	if ((bp = incore(vp, lbn)) &&
280	    (((bp->b_flags & B_READ) && (bp->b_flags & B_BUSY))
281		|| (bp->b_flags & B_BUSY) == 0)) {
282		splx(s);
283		return 0;
284	}
285	splx(s);
286
287	/*
288	 * get a pbuf --> we just use the kva
289	 * instead of a pbuf, just grab some pager kva to map the page
290	kv = kmem_alloc_wait(pager_map, PAGE_SIZE);
291	nread = 0;
292	error = 0;
293
294	while (!error && uio->uio_resid && maxread > 0) {
295		int po;
296		int count;
297		int s;
298
299relookup:
300		/*
301		 * lookup the page
302		 */
303		m = vm_page_lookup(obj, trunc_page(uio->uio_offset));
304		if (!m)
305			break;
306		/*
307		 * get the offset into the page, and the amount to read in the
308		 * page
309		 */
310		nrest = round_page(uio->uio_offset) - uio->uio_offset;
311		if (nrest > uio->uio_resid)
312			nrest = uio->uio_resid;
313
314		/*
315		 * check the valid bits for the page (DEV_BSIZE chunks)
316		 */
317		if (!vm_page_is_valid(m, uio->uio_offset, nrest))
318			break;
319
320		/*
321		 * if the page is busy, wait for it
322		 */
323		s = splhigh();
324		if (!m->valid || (m->flags & PG_BUSY)) {
325			m->flags |= PG_WANTED;
326			tsleep((caddr_t) m, PVM, "vnibyp", 0);
327			splx(s);
328			goto relookup;
329		}
330		/*
331		 * if the page is on the cache queue, remove it -- cache queue
332		 * pages should be freeable by vm_page_alloc anytime.
333		 */
334		if (m->flags & PG_CACHE) {
335			if (cnt.v_free_count + cnt.v_cache_count < cnt.v_free_reserved) {
336				VM_WAIT;
337				goto relookup;
338			}
339			vm_page_unqueue(m);
340		}
341		/*
342		 * add a buffer mapping (essentially wires the page too).
343		 */
344		m->bmapped++;
345		splx(s);
346
347		/*
348		 * enter it into the kva
349		 */
350		pmap_qenter(kv, &m, 1);
351
352		/*
353		 * do the copy
354		 */
355		po = uio->uio_offset & (PAGE_SIZE - 1);
356		count = PAGE_SIZE - po;
357		if (count > maxread)
358			count = maxread;
359		if (count > uio->uio_resid)
360			count = uio->uio_resid;
361
362		error = uiomove((caddr_t) kv + po, count, uio);
363		if (!error) {
364			nread += count;
365			maxread -= count;
366		}
367		/*
368		 * remove from kva
369		 */
370		pmap_qremove(kv, 1);
371		PAGE_WAKEUP(m);	/* XXX probably unnecessary */
372		/*
373		 * If the page was on the cache queue, then by definition
374		 * bmapped was 0. Thus the following case will also take care
375		 * of the page being removed from the cache queue above.
376		 * Also, it is possible that the page was already entered onto
377		 * another queue (or was already there), so we don't put it
378		 * onto the cache queue...
379		 */
380		m->bmapped--;
381		if (m->bmapped == 0 &&
382		    (m->flags & (PG_CACHE | PG_ACTIVE | PG_INACTIVE)) == 0 &&
383		    m->wire_count == 0) {
384			vm_page_test_dirty(m);
385
386			/*
387			 * make sure that the darned page is on a queue
388			 * somewhere...
389			 */
390			if ((m->dirty & m->valid) == 0) {
391				vm_page_cache(m);
392			} else if (m->hold_count == 0) {
393				vm_page_deactivate(m);
394			} else {
395				vm_page_activate(m);
396			}
397		}
398	}
399	/*
400	 * release our buffer(kva).
401	 */
402	kmem_free_wakeup(pager_map, kv, PAGE_SIZE);
403	return nread;
404}
405
406
407/*
408 * Write, release buffer on completion.  (Done by iodone
409 * if async.)
410 */
411int
412bwrite(struct buf * bp)
413{
414	int oldflags = bp->b_flags;
415
416	if (bp->b_flags & B_INVAL) {
417		brelse(bp);
418		return (0);
419	}
420	if (!(bp->b_flags & B_BUSY))
421		panic("bwrite: buffer is not busy???");
422
423	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
424	bp->b_flags |= B_WRITEINPROG;
425
426	if (oldflags & B_ASYNC) {
427		if (oldflags & B_DELWRI) {
428			reassignbuf(bp, bp->b_vp);
429		} else if (curproc) {
430			++curproc->p_stats->p_ru.ru_oublock;
431		}
432	}
433	bp->b_vp->v_numoutput++;
434	vfs_busy_pages(bp, 1);
435	VOP_STRATEGY(bp);
436
437	if ((oldflags & B_ASYNC) == 0) {
438		int rtval = biowait(bp);
439
440		if (oldflags & B_DELWRI) {
441			reassignbuf(bp, bp->b_vp);
442		} else if (curproc) {
443			++curproc->p_stats->p_ru.ru_oublock;
444		}
445		brelse(bp);
446		return (rtval);
447	}
448	return (0);
449}
450
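/*
 * Implement the VOP_BWRITE vnode operation by handing the buffer to
 * bwrite().
 */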
451int
452vn_bwrite(ap)
453	struct vop_bwrite_args *ap;
454{
455	return (bwrite(ap->a_bp));
456}
457
458/*
459 * Delayed write. (Buffer is marked dirty).
460 */
461void
462bdwrite(struct buf * bp)
463{
464
465	if ((bp->b_flags & B_BUSY) == 0) {
466		panic("bdwrite: buffer is not busy");
467	}
468	if (bp->b_flags & B_INVAL) {
469		brelse(bp);
470		return;
471	}
472	if (bp->b_flags & B_TAPE) {
473		bawrite(bp);
474		return;
475	}
476	bp->b_flags &= ~B_READ;
477	vfs_dirty_pages(bp);
478	if ((bp->b_flags & B_DELWRI) == 0) {
479		if (curproc)
480			++curproc->p_stats->p_ru.ru_oublock;
481		bp->b_flags |= B_DONE | B_DELWRI;
482		reassignbuf(bp, bp->b_vp);
483	}
484	brelse(bp);
485	return;
486}
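
/*
 * Typical delayed-write usage sketch (hypothetical caller, for
 * illustration only):
 *
 *	bp = getblk(vp, lbn, size, 0, 0);
 *	... modify bp->b_data ...
 *	bdwrite(bp);		(marks the buffer B_DELWRI and releases it)
 */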
487
488/*
489 * Asynchronous write.
490 * Start output on a buffer, but do not wait for it to complete.
491 * The buffer is released when the output completes.
492 */
493void
494bawrite(struct buf * bp)
495{
496	if (((bp->b_flags & B_DELWRI) == 0) && (bp->b_vp->v_numoutput > 24)) {
497		int s = splbio();
498
499		while (bp->b_vp->v_numoutput > 16) {
500			bp->b_vp->v_flag |= VBWAIT;
501			tsleep((caddr_t) &bp->b_vp->v_numoutput, PRIBIO, "bawnmo", 0);
502		}
503		splx(s);
504	}
505	bp->b_flags |= B_ASYNC;
506	(void) bwrite(bp);
507}
508
509/*
510 * Release a buffer.
511 */
512void
513brelse(struct buf * bp)
514{
515	int s;
516
517	if (bp->b_flags & B_CLUSTER) {
518		relpbuf(bp);
519		return;
520	}
521	/* anyone need a "free" block? */
522	s = splbio();
523
524	if (needsbuffer) {
525		needsbuffer = 0;
526		wakeup((caddr_t) &needsbuffer);
527	}
528	/* anyone need this block? */
529	if (bp->b_flags & B_WANTED) {
530		bp->b_flags &= ~(B_PDWANTED | B_WANTED | B_AGE);
531		wakeup((caddr_t) bp);
532	} else if (bp->b_flags & B_VMIO) {
533		bp->b_flags &= ~(B_WANTED | B_PDWANTED);
534		wakeup((caddr_t) bp);
535	}
536	if (bp->b_flags & B_LOCKED)
537		bp->b_flags &= ~B_ERROR;
538
539	if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
540	    (bp->b_bufsize <= 0)) {
541		bp->b_flags |= B_INVAL;
542		bp->b_flags &= ~(B_DELWRI | B_CACHE);
543		if (((bp->b_flags & B_VMIO) == 0) && bp->b_vp)
544			brelvp(bp);
545	}
546	if (bp->b_flags & B_VMIO) {
547		vm_offset_t foff;
548		vm_object_t obj;
549		int i, resid;
550		vm_page_t m;
551		int iototal = bp->b_bufsize;
552
553		foff = 0;
554		obj = 0;
555		if (bp->b_npages) {
556			if (bp->b_vp && bp->b_vp->v_mount) {
557				foff = bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
558			} else {
559				/*
560				 * vnode pointer has been ripped away --
561				 * probably file gone...
562				 */
563				foff = bp->b_pages[0]->offset;
564			}
565		}
566		for (i = 0; i < bp->b_npages; i++) {
567			m = bp->b_pages[i];
568			if (m == bogus_page) {
569				panic("brelse: bogus page found");
570			}
571			resid = (m->offset + PAGE_SIZE) - foff;
572			if (resid > iototal)
573				resid = iototal;
574			if (resid > 0) {
575				if (bp->b_flags & (B_ERROR | B_NOCACHE)) {
576					vm_page_set_invalid(m, foff, resid);
577				} else if ((bp->b_flags & B_DELWRI) == 0) {
578					vm_page_set_clean(m, foff, resid);
579					vm_page_set_valid(m, foff, resid);
580				}
581			} else {
582				vm_page_test_dirty(m);
583			}
584			if (bp->b_flags & B_INVAL) {
585				if (m->bmapped == 0) {
586					panic("brelse: bmapped is zero for page\n");
587				}
588				--m->bmapped;
589				if (m->bmapped == 0) {
590					PAGE_WAKEUP(m);
591					if ((m->dirty & m->valid) == 0)
592						vm_page_cache(m);
593				}
594			}
595			foff += resid;
596			iototal -= resid;
597		}
598
599		if (bp->b_flags & B_INVAL) {
600			pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
601			bp->b_npages = 0;
602			bp->b_bufsize = 0;
603			bp->b_flags &= ~B_VMIO;
604			if (bp->b_vp)
605				brelvp(bp);
606			--nvmio;
607		}
608	}
609	if (bp->b_qindex != QUEUE_NONE)
610		panic("brelse: free buffer onto another queue???");
611
612	/* enqueue */
613	/* buffers with no memory */
614	if (bp->b_bufsize == 0) {
615		bp->b_qindex = QUEUE_EMPTY;
616		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
617		LIST_REMOVE(bp, b_hash);
618		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
619		bp->b_dev = NODEV;
620		/* buffers with junk contents */
621	} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE)) {
622		bp->b_qindex = QUEUE_AGE;
623		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
624		LIST_REMOVE(bp, b_hash);
625		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
626		bp->b_dev = NODEV;
627		/* buffers that are locked */
628	} else if (bp->b_flags & B_LOCKED) {
629		bp->b_qindex = QUEUE_LOCKED;
630		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
631		/* buffers with stale but valid contents */
632	} else if (bp->b_flags & B_AGE) {
633		bp->b_qindex = QUEUE_AGE;
634		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
635		/* buffers with valid and quite potentially reusable contents */
636	} else {
637		if (bp->b_flags & B_VMIO)
638			bp->b_qindex = QUEUE_VMIO;
639		else {
640			bp->b_qindex = QUEUE_LRU;
641			++nlru;
642		}
643		TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
644	}
645
646	/* unlock */
647	bp->b_flags &= ~(B_PDWANTED | B_WANTED | B_BUSY | B_ASYNC | B_NOCACHE | B_AGE);
648	splx(s);
649}
650
651/*
652 * this routine implements clustered async writes for
653 * clearing out B_DELWRI buffers...
654 */
655void
656vfs_bio_awrite(struct buf * bp)
657{
658	int i;
659	daddr_t lblkno = bp->b_lblkno;
660	struct vnode *vp = bp->b_vp;
661	int s;
662	int ncl;
663	struct buf *bpa;
664
665	s = splbio();
666	if (vp->v_mount && (vp->v_flag & VVMIO) &&
667		(bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
668		int size = vp->v_mount->mnt_stat.f_iosize;
669		for (i = 1; i < MAXPHYS / size; i++) {
670			if ((bpa = incore(vp, lblkno + i)) &&
671			    ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) == (B_DELWRI | B_CLUSTEROK)) &&
672			    (bpa->b_bufsize == size)) {
673				if ((bpa->b_blkno == bpa->b_lblkno) ||
674				    (bpa->b_blkno != bp->b_blkno + (i * size) / DEV_BSIZE))
675					break;
676			} else {
677				break;
678			}
679		}
680		ncl = i;
681		/*
682		 * this is a possible cluster write
683		 */
684		if (ncl != 1) {
685			cluster_wbuild(vp, NULL, size, lblkno, ncl, -1);
686			splx(s);
687			return;
688		}
689	}
690	/*
691	 * default (old) behavior, writing out only one block
692	 */
693	bremfree(bp);
694	bp->b_flags |= B_BUSY | B_ASYNC;
695	bwrite(bp);
696	splx(s);
697}
698
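/*
 * Accounting of the memory used for buffer contents; adjusted by
 * allocbuf() for non-VMIO buffers.
 */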
699int freebufspace;
700int allocbufspace;
701
702/*
703 * Find a buffer header which is available for use.
704 */
705struct buf *
706getnewbuf(int slpflag, int slptimeo, int doingvmio)
707{
708	struct buf *bp;
709	int s;
710	int firstbp = 1;
711
712	s = splbio();
713start:
714	/* can we constitute a new buffer? */
715	if ((bp = bufqueues[QUEUE_EMPTY].tqh_first)) {
716		if (bp->b_qindex != QUEUE_EMPTY)
717			panic("getnewbuf: inconsistent EMPTY queue");
718		bremfree(bp);
719		goto fillbuf;
720	}
721	/*
722	 * we keep the file I/O from hogging metadata I/O
723	 */
724	if ((bp = bufqueues[QUEUE_AGE].tqh_first)) {
725		if (bp->b_qindex != QUEUE_AGE)
726			panic("getnewbuf: inconsistent AGE queue");
727	} else if ((nvmio > (2 * nbuf / 3))
728	    && (bp = bufqueues[QUEUE_VMIO].tqh_first)) {
729		if (bp->b_qindex != QUEUE_VMIO)
730			panic("getnewbuf: inconsistent VMIO queue");
731	} else if ((!doingvmio || (nlru > (2 * nbuf / 3))) &&
732	    (bp = bufqueues[QUEUE_LRU].tqh_first)) {
733		if (bp->b_qindex != QUEUE_LRU)
734			panic("getnewbuf: inconsistent LRU queue");
735	}
736	if (!bp) {
737		if (doingvmio) {
738			if ((bp = bufqueues[QUEUE_VMIO].tqh_first)) {
739				if (bp->b_qindex != QUEUE_VMIO)
740					panic("getnewbuf: inconsistent VMIO queue");
741			} else if ((bp = bufqueues[QUEUE_LRU].tqh_first)) {
742				if (bp->b_qindex != QUEUE_LRU)
743					panic("getnewbuf: inconsistent LRU queue");
744			}
745		} else {
746			if ((bp = bufqueues[QUEUE_LRU].tqh_first)) {
747				if (bp->b_qindex != QUEUE_LRU)
748					panic("getnewbuf: inconsistent LRU queue");
749			} else if ((bp = bufqueues[QUEUE_VMIO].tqh_first)) {
750				if (bp->b_qindex != QUEUE_VMIO)
751					panic("getnewbuf: inconsistent VMIO queue");
752			}
753		}
754	}
755	if (!bp) {
756		/* wait for a free buffer of any kind */
757		needsbuffer = 1;
758		tsleep((caddr_t) &needsbuffer, PRIBIO | slpflag, "newbuf", slptimeo);
759		splx(s);
760		return (0);
761	}
762	/* if we are a delayed write, convert to an async write */
763	if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
764		vfs_bio_awrite(bp);
765		if (!slpflag && !slptimeo) {
766			splx(s);
767			return (0);
768		}
769		goto start;
770	}
771	bremfree(bp);
772
773	if (bp->b_flags & B_VMIO) {
774		bp->b_flags |= B_INVAL | B_BUSY;
775		brelse(bp);
776		bremfree(bp);
777	}
778	if (bp->b_vp)
779		brelvp(bp);
780
781	/* we are not free, nor do we contain interesting data */
782	if (bp->b_rcred != NOCRED)
783		crfree(bp->b_rcred);
784	if (bp->b_wcred != NOCRED)
785		crfree(bp->b_wcred);
786fillbuf:
787	bp->b_flags = B_BUSY;
788	LIST_REMOVE(bp, b_hash);
789	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
790	splx(s);
791	if (bp->b_bufsize) {
792		allocbuf(bp, 0, 0);
793	}
794	bp->b_dev = NODEV;
795	bp->b_vp = NULL;
796	bp->b_blkno = bp->b_lblkno = 0;
797	bp->b_iodone = 0;
798	bp->b_error = 0;
799	bp->b_resid = 0;
800	bp->b_bcount = 0;
801	bp->b_npages = 0;
802	bp->b_wcred = bp->b_rcred = NOCRED;
803	bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
804	bp->b_dirtyoff = bp->b_dirtyend = 0;
805	bp->b_validoff = bp->b_validend = 0;
806	return (bp);
807}
808
809/*
810 * Check to see if a block is currently memory resident.
811 */
812struct buf *
813incore(struct vnode * vp, daddr_t blkno)
814{
815	struct buf *bp;
816	struct bufhashhdr *bh;
817
818	int s = splbio();
819
820	bh = BUFHASH(vp, blkno);
821	bp = bh->lh_first;
822
823	/* Search hash chain */
824	while (bp) {
825		/* hit */
826		if (bp->b_lblkno == blkno && bp->b_vp == vp
827		    && (bp->b_flags & B_INVAL) == 0) {
828			splx(s);
829			return (bp);
830		}
831		bp = bp->b_hash.le_next;
832	}
833	splx(s);
834
835	return (0);
836}
837
838/*
839 * returns true if no I/O is needed to access the block, i.e. the data
840 * is already resident in a buffer or in the associated VM object.
841 */
842
843int
844inmem(struct vnode * vp, daddr_t blkno)
845{
846	vm_object_t obj;
847	vm_offset_t off, toff, tinc;
848	vm_page_t m;
849
850	if (incore(vp, blkno))
851		return 1;
852	if (vp->v_mount == 0)
853		return 0;
854	if (vp->v_vmdata == 0)
855		return 0;
856
857	obj = (vm_object_t) vp->v_vmdata;
858	tinc = PAGE_SIZE;
859	if (tinc > vp->v_mount->mnt_stat.f_iosize)
860		tinc = vp->v_mount->mnt_stat.f_iosize;
861	off = blkno * vp->v_mount->mnt_stat.f_iosize;
862
863	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
864		int mask;
865
866		m = vm_page_lookup(obj, trunc_page(toff + off));
867		if (!m)
868			return 0;
869		if (vm_page_is_valid(m, toff + off, tinc) == 0)
870			return 0;
871	}
872	return 1;
873}
874
875/*
876 * Get a block given a specified block and offset into a file/device.
877 */
878struct buf *
879getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
880{
881	struct buf *bp;
882	int s;
883	struct bufhashhdr *bh;
884	vm_offset_t off;
885	int nleft;
886
887	s = splbio();
888loop:
889	if ((cnt.v_free_count + cnt.v_cache_count) <
890	    cnt.v_free_reserved + MAXBSIZE / PAGE_SIZE)
891		wakeup((caddr_t) &vm_pages_needed);
892	if ((bp = incore(vp, blkno))) {
893		if (bp->b_flags & B_BUSY) {
894			bp->b_flags |= B_WANTED;
895			if (curproc == pageproc) {
896				bp->b_flags |= B_PDWANTED;
897				wakeup((caddr_t) &cnt.v_free_count);
898			}
899			if (!tsleep((caddr_t) bp, PRIBIO | slpflag, "getblk", slptimeo))
900				goto loop;
901			splx(s);
902			return (struct buf *) NULL;
903		}
904		bp->b_flags |= B_BUSY | B_CACHE;
905		bremfree(bp);
906		/*
907		 * check for size inconsistencies
908		 */
909		if (bp->b_bcount != size) {
910#if defined(VFS_BIO_DEBUG)
911			printf("getblk: invalid buffer size: %ld\n", bp->b_bcount);
912#endif
913			bp->b_flags |= B_INVAL;
914			bwrite(bp);
915			goto loop;
916		}
917		splx(s);
918		return (bp);
919	} else {
920		vm_object_t obj;
921		int doingvmio;
922
923		if ((obj = (vm_object_t) vp->v_vmdata) &&
924		    (vp->v_flag & VVMIO) /* && (blkno >= 0) */ ) {
925			doingvmio = 1;
926		} else {
927			doingvmio = 0;
928		}
929		if ((bp = getnewbuf(slpflag, slptimeo, doingvmio)) == 0) {
930			if (slpflag || slptimeo)
931				return NULL;
932			goto loop;
933		}
934		if (incore(vp, blkno)) {
935			bp->b_flags |= B_INVAL;
936			brelse(bp);
937			goto loop;
938		}
939		bp->b_blkno = bp->b_lblkno = blkno;
940		bgetvp(vp, bp);
941		LIST_REMOVE(bp, b_hash);
942		bh = BUFHASH(vp, blkno);
943		LIST_INSERT_HEAD(bh, bp, b_hash);
944		if (doingvmio) {
945			bp->b_flags |= (B_VMIO | B_CACHE);
946#if defined(VFS_BIO_DEBUG)
947			if (vp->v_type != VREG)
948				printf("getblk: vmioing file type %d???\n", vp->v_type);
949#endif
950			++nvmio;
951		} else {
952			if (bp->b_flags & B_VMIO)
953				--nvmio;
954			bp->b_flags &= ~B_VMIO;
955		}
956		splx(s);
957		if (!allocbuf(bp, size, 1)) {
958			s = splbio();
959			goto loop;
960		}
961		return (bp);
962	}
963}
964
965/*
966 * Get an empty, disassociated buffer of given size.
967 */
968struct buf *
969geteblk(int size)
970{
971	struct buf *bp;
972
973	while ((bp = getnewbuf(0, 0, 0)) == 0);
974	allocbuf(bp, size, 0);
975	bp->b_flags |= B_INVAL;
976	return (bp);
977}
978
979/*
980 * Modify the length of a buffer's underlying buffer storage without
981 * destroying information (unless, of course, the buffer is shrinking).
982 */
983int
984allocbuf(struct buf * bp, int size, int vmio)
985{
986
987	int s;
988	int newbsize;
989	int i;
990
991	if ((bp->b_flags & B_VMIO) == 0) {
992		newbsize = round_page(size);
993		if (newbsize == bp->b_bufsize) {
994			bp->b_bcount = size;
995			return 1;
996		} else if (newbsize < bp->b_bufsize) {
997			if (bp->b_flags & B_MALLOC) {
998				bp->b_bcount = size;
999				return 1;
1000			}
1001			vm_hold_free_pages(
1002			    bp,
1003			    (vm_offset_t) bp->b_data + newbsize,
1004			    (vm_offset_t) bp->b_data + bp->b_bufsize);
1005		} else if (newbsize > bp->b_bufsize) {
1006			if (bp->b_flags & B_MALLOC) {
1007				vm_offset_t bufaddr;
1008
1009				bufaddr = (vm_offset_t) bp->b_data;
1010				bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
1011				vm_hold_load_pages(
1012				    bp,
1013				    (vm_offset_t) bp->b_data,
1014				    (vm_offset_t) bp->b_data + newbsize);
1015				bcopy((caddr_t) bufaddr, bp->b_data, bp->b_bcount);
1016				free((caddr_t) bufaddr, M_TEMP);
1017			} else if ((newbsize <= PAGE_SIZE / 2) && (bp->b_bufsize == 0)) {
1018				bp->b_flags |= B_MALLOC;
1019				bp->b_data = malloc(newbsize, M_TEMP, M_WAITOK);
1020				bp->b_npages = 0;
1021			} else {
1022				vm_hold_load_pages(
1023				    bp,
1024				    (vm_offset_t) bp->b_data + bp->b_bufsize,
1025				    (vm_offset_t) bp->b_data + newbsize);
1026			}
1027		}
1028		/*
1029		 * adjust buffer cache's idea of memory allocated to buffer
1030		 * contents
1031		 */
1032		freebufspace -= newbsize - bp->b_bufsize;
1033		allocbufspace += newbsize - bp->b_bufsize;
1034	} else {
1035		vm_page_t m;
1036		int desiredpages;
1037
1038		newbsize = ((size + DEV_BSIZE - 1) / DEV_BSIZE) * DEV_BSIZE;
1039		desiredpages = round_page(newbsize) / PAGE_SIZE;
1040
1041		if (newbsize == bp->b_bufsize) {
1042			bp->b_bcount = size;
1043			return 1;
1044		} else if (newbsize < bp->b_bufsize) {
1045			if (desiredpages < bp->b_npages) {
1046				pmap_qremove((vm_offset_t) trunc_page(bp->b_data) +
1047				    desiredpages * PAGE_SIZE, (bp->b_npages - desiredpages));
1048				for (i = desiredpages; i < bp->b_npages; i++) {
1049					m = bp->b_pages[i];
1050					s = splhigh();
1051					if ((m->flags & PG_BUSY) || (m->busy != 0)) {
1052						m->flags |= PG_WANTED;
1053						tsleep(m, PVM, "biodep", 0);
1054					}
1055					splx(s);
1056
1057					if (m->bmapped == 0) {
1058						printf("allocbuf: bmapped is zero for page %d\n", i);
1059						panic("allocbuf: error");
1060					}
1061					--m->bmapped;
1062					if (m->bmapped == 0) {
1063						PAGE_WAKEUP(m);
1064						pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_NONE);
1065						vm_page_free(m);
1066					}
1067					bp->b_pages[i] = NULL;
1068				}
1069				bp->b_npages = desiredpages;
1070			}
1071		} else {
1072			vm_object_t obj;
1073			vm_offset_t tinc, off, toff, objoff;
1074			int pageindex, curbpnpages;
1075			struct vnode *vp;
1076			int bsize;
1077
1078			vp = bp->b_vp;
1079			bsize = vp->v_mount->mnt_stat.f_iosize;
1080
1081			if (bp->b_npages < desiredpages) {
1082				obj = (vm_object_t) vp->v_vmdata;
1083				tinc = PAGE_SIZE;
1084				if (tinc > bsize)
1085					tinc = bsize;
1086				off = bp->b_lblkno * bsize;
1087				curbpnpages = bp->b_npages;
1088		doretry:
1089				for (toff = 0; toff < newbsize; toff += tinc) {
1090					int mask;
1091					int bytesinpage;
1092
1093					pageindex = toff / PAGE_SIZE;
1094					objoff = trunc_page(toff + off);
1095					if (pageindex < curbpnpages) {
1096						int pb;
1097
1098						m = bp->b_pages[pageindex];
1099						if (m->offset != objoff)
1100							panic("allocbuf: page changed offset??!!!?");
1101						bytesinpage = tinc;
1102						if (tinc > (newbsize - toff))
1103							bytesinpage = newbsize - toff;
1104						if (!vm_page_is_valid(m, toff + off, bytesinpage)) {
1105							bp->b_flags &= ~B_CACHE;
1106						}
1107						if ((m->flags & PG_ACTIVE) == 0)
1108							vm_page_activate(m);
1109						continue;
1110					}
1111					m = vm_page_lookup(obj, objoff);
1112					if (!m) {
1113						m = vm_page_alloc(obj, objoff, 0);
1114						if (!m) {
1115							int j;
1116
1117							for (j = bp->b_npages; j < pageindex; j++) {
1118								vm_page_t mt = bp->b_pages[j];
1119
1120								PAGE_WAKEUP(mt);
1121								if (!mt->valid) {
1122									vm_page_free(mt);
1123								}
1124							}
1125							VM_WAIT;
1126							if (vmio && (bp->b_flags & B_PDWANTED)) {
1127								--nvmio;
1128								bp->b_flags &= ~B_VMIO;
1129								bp->b_flags |= B_INVAL;
1130								brelse(bp);
1131								return 0;
1132							}
1133							curbpnpages = bp->b_npages;
1134							goto doretry;
1135						}
1136						m->valid = 0;
1137						vm_page_activate(m);
1138					} else if ((m->valid == 0) || (m->flags & PG_BUSY)) {
1139						int j;
1140						int bufferdestroyed = 0;
1141
1142						for (j = bp->b_npages; j < pageindex; j++) {
1143							vm_page_t mt = bp->b_pages[j];
1144
1145							PAGE_WAKEUP(mt);
1146							if (mt->valid == 0) {
1147								vm_page_free(mt);
1148							}
1149						}
1150						if (vmio && (bp->b_flags & B_PDWANTED)) {
1151							--nvmio;
1152							bp->b_flags &= ~B_VMIO;
1153							bp->b_flags |= B_INVAL;
1154							brelse(bp);
1155							VM_WAIT;
1156							bufferdestroyed = 1;
1157						}
1158						s = splbio();
1159						if (m) {
1160							m->flags |= PG_WANTED;
1161							tsleep(m, PRIBIO, "pgtblk", 0);
1162						}
1163						splx(s);
1164						if (bufferdestroyed)
1165							return 0;
1166						curbpnpages = bp->b_npages;
1167						goto doretry;
1168					} else {
1169						int pb;
1170
1171						if ((m->flags & PG_CACHE) &&
1172						    (cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_reserved) {
1173							int j;
1174
1175							for (j = bp->b_npages; j < pageindex; j++) {
1176								vm_page_t mt = bp->b_pages[j];
1177
1178								PAGE_WAKEUP(mt);
1179								if (mt->valid == 0) {
1180									vm_page_free(mt);
1181								}
1182							}
1183							VM_WAIT;
1184							if (vmio && (bp->b_flags & B_PDWANTED)) {
1185								--nvmio;
1186								bp->b_flags &= ~B_VMIO;
1187								bp->b_flags |= B_INVAL;
1188								brelse(bp);
1189								return 0;
1190							}
1191							curbpnpages = bp->b_npages;
1192							goto doretry;
1193						}
1194						bytesinpage = tinc;
1195						if (tinc > (newbsize - toff))
1196							bytesinpage = newbsize - toff;
1197						if (!vm_page_is_valid(m, toff + off, bytesinpage)) {
1198							bp->b_flags &= ~B_CACHE;
1199						}
1200						if ((m->flags & PG_ACTIVE) == 0)
1201							vm_page_activate(m);
1202						m->flags |= PG_BUSY;
1203					}
1204					bp->b_pages[pageindex] = m;
1205					curbpnpages = pageindex + 1;
1206				}
1207				if (bsize >= PAGE_SIZE) {
1208					for (i = bp->b_npages; i < curbpnpages; i++) {
1209						m = bp->b_pages[i];
1210						if (m->valid == 0) {
1211							bp->b_flags &= ~B_CACHE;
1212						}
1213						m->bmapped++;
1214						PAGE_WAKEUP(m);
1215					}
1216				} else {
1217					if (!vm_page_is_valid(bp->b_pages[0], off, bsize))
1218						bp->b_flags &= ~B_CACHE;
1219					bp->b_pages[0]->bmapped++;
1220					PAGE_WAKEUP(bp->b_pages[0]);
1221				}
1222				bp->b_npages = curbpnpages;
1223				bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
1224				pmap_qenter((vm_offset_t) bp->b_data, bp->b_pages, bp->b_npages);
1225				bp->b_data += off % PAGE_SIZE;
1226			}
1227		}
1228	}
1229	bp->b_bufsize = newbsize;
1230	bp->b_bcount = size;
1231	return 1;
1232}
1233
1234/*
1235 * Wait for buffer I/O completion, returning error status.
1236 */
1237int
1238biowait(register struct buf * bp)
1239{
1240	int s;
1241
1242	s = splbio();
1243	while ((bp->b_flags & B_DONE) == 0)
1244		tsleep((caddr_t) bp, PRIBIO, "biowait", 0);
1245	if ((bp->b_flags & B_ERROR) || bp->b_error) {
1246		if ((bp->b_flags & B_INVAL) == 0) {
1247			bp->b_flags |= B_INVAL;
1248			bp->b_dev = NODEV;
1249			LIST_REMOVE(bp, b_hash);
1250			LIST_INSERT_HEAD(&invalhash, bp, b_hash);
1251			wakeup((caddr_t) bp);
1252		}
1253		if (!bp->b_error)
1254			bp->b_error = EIO;
1255		else
1256			bp->b_flags |= B_ERROR;
1257		splx(s);
1258		return (bp->b_error);
1259	} else {
1260		splx(s);
1261		return (0);
1262	}
1263}
1264
1265/*
1266 * Finish I/O on a buffer, calling an optional function.
1267 * This is usually called from interrupt level, so process blocking
1268 * is not *a good idea*.
1269 */
1270void
1271biodone(register struct buf * bp)
1272{
1273	int s;
1274
1275	s = splbio();
1276	if (bp->b_flags & B_DONE)
1277		printf("biodone: buffer already done\n");
1278	bp->b_flags |= B_DONE;
1279
1280	if ((bp->b_flags & B_READ) == 0) {
1281		vwakeup(bp);
1282	}
1283#ifdef BOUNCE_BUFFERS
1284	if (bp->b_flags & B_BOUNCE)
1285		vm_bounce_free(bp);
1286#endif
1287
1288	/* call optional completion function if requested */
1289	if (bp->b_flags & B_CALL) {
1290		bp->b_flags &= ~B_CALL;
1291		(*bp->b_iodone) (bp);
1292		splx(s);
1293		return;
1294	}
1295	if (bp->b_flags & B_VMIO) {
1296		int i, resid;
1297		vm_offset_t foff;
1298		vm_page_t m;
1299		vm_object_t obj;
1300		int iosize;
1301		struct vnode *vp = bp->b_vp;
1302
1303		foff = vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1304		obj = (vm_object_t) vp->v_vmdata;
1305		if (!obj) {
1306			splx(s);
			return;
1307		}
1308#if defined(VFS_BIO_DEBUG)
1309		if (obj->paging_in_progress < bp->b_npages) {
1310			printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
1311			    obj->paging_in_progress, bp->b_npages);
1312		}
1313#endif
1314		iosize = bp->b_bufsize;
1315		for (i = 0; i < bp->b_npages; i++) {
1316			m = bp->b_pages[i];
1317			if (m == bogus_page) {
1318				m = vm_page_lookup(obj, foff);
1319				if (!m) {
1320#if defined(VFS_BIO_DEBUG)
1321					printf("biodone: page disappeared\n");
1322#endif
1323					--obj->paging_in_progress;
1324					continue;
1325				}
1326				bp->b_pages[i] = m;
1327				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1328			}
1329#if defined(VFS_BIO_DEBUG)
1330			if (trunc_page(foff) != m->offset) {
1331				printf("biodone: foff(%d)/m->offset(%d) mismatch\n", foff, m->offset);
1332			}
1333#endif
1334			resid = (m->offset + PAGE_SIZE) - foff;
1335			if (resid > iosize)
1336				resid = iosize;
1337			if (resid > 0) {
1338				vm_page_set_valid(m, foff, resid);
1339				vm_page_set_clean(m, foff, resid);
1340			}
1341			if (m->busy == 0) {
1342				printf("biodone: page busy < 0, off: %d, foff: %d, resid: %d, index: %d\n",
1343				    m->offset, foff, resid, i);
1344				printf(" iosize: %d, lblkno: %d\n",
1345				    bp->b_vp->v_mount->mnt_stat.f_iosize, bp->b_lblkno);
1346				printf(" valid: 0x%x, dirty: 0x%x, mapped: %d\n",
1347				    m->valid, m->dirty, m->bmapped);
1348				panic("biodone: page busy < 0\n");
1349			}
1350			--m->busy;
1351			PAGE_WAKEUP(m);
1352			--obj->paging_in_progress;
1353			foff += resid;
1354			iosize -= resid;
1355		}
1356		if (obj && obj->paging_in_progress == 0)
1357			wakeup((caddr_t) obj);
1358	}
1359	/*
1360	 * For asynchronous completions, release the buffer now. The brelse
1361	 * checks for B_WANTED and will do the wakeup there if necessary - so
1362	 * no need to do a wakeup here in the async case.
1363	 */
1364
1365	if (bp->b_flags & B_ASYNC) {
1366		brelse(bp);
1367	} else {
1368		bp->b_flags &= ~(B_WANTED | B_PDWANTED);
1369		wakeup((caddr_t) bp);
1370	}
1371	splx(s);
1372}
1373
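/*
 * Return the number of buffers currently on the locked queue.
 */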
1374int
1375count_lock_queue()
1376{
1377	int count;
1378	struct buf *bp;
1379
1380	count = 0;
1381	for (bp = bufqueues[QUEUE_LOCKED].tqh_first;
1382	    bp != NULL;
1383	    bp = bp->b_freelist.tqe_next)
1384		count++;
1385	return (count);
1386}
1387
1388int vfs_update_interval = 30;
1389
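/*
 * Body of the update daemon: sync the filesystems every
 * vfs_update_interval seconds, or sooner when vfs_update_wakeup is
 * signalled.
 */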
1390void
1391vfs_update()
1392{
1393	(void) spl0();
1394	while (1) {
1395		tsleep((caddr_t) &vfs_update_wakeup, PRIBIO, "update",
1396		    hz * vfs_update_interval);
1397		vfs_update_wakeup = 0;
1398		sync(curproc, NULL, NULL);
1399	}
1400}
1401
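/*
 * Undo the effect of vfs_busy_pages() on a VMIO buffer: restore any
 * bogus_page substitutions, drop each page's busy count and the object's
 * paging_in_progress count, and wake up any waiters.
 */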
1402void
1403vfs_unbusy_pages(struct buf * bp)
1404{
1405	int i;
1406
1407	if (bp->b_flags & B_VMIO) {
1408		struct vnode *vp = bp->b_vp;
1409		vm_object_t obj = (vm_object_t) vp->v_vmdata;
1410		vm_offset_t foff;
1411
1412		foff = vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1413
1414		for (i = 0; i < bp->b_npages; i++) {
1415			vm_page_t m = bp->b_pages[i];
1416
1417			if (m == bogus_page) {
1418				m = vm_page_lookup(obj, foff);
1419				if (!m) {
1420					panic("vfs_unbusy_pages: page missing\n");
1421				}
1422				bp->b_pages[i] = m;
1423				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1424			}
1425			--obj->paging_in_progress;
1426			--m->busy;
1427			PAGE_WAKEUP(m);
1428		}
1429		if (obj->paging_in_progress == 0)
1430			wakeup((caddr_t) obj);
1431	}
1432}
1433
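/*
 * Prepare the pages underlying a VMIO buffer for I/O: bump the object's
 * paging_in_progress and each page's busy count.  For writes
 * (clear_modify), sample each page's dirty state and write-protect it;
 * for reads, substitute bogus_page for pages that are already valid so
 * that the I/O does not overwrite good data.
 */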
1434void
1435vfs_busy_pages(struct buf * bp, int clear_modify)
1436{
1437	int i;
1438
1439	if (bp->b_flags & B_VMIO) {
1440		vm_object_t obj = (vm_object_t) bp->b_vp->v_vmdata;
1441		vm_offset_t foff = bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1442		int iocount = bp->b_bufsize;
1443
1444		for (i = 0; i < bp->b_npages; i++) {
1445			vm_page_t m = bp->b_pages[i];
1446			int resid = (m->offset + PAGE_SIZE) - foff;
1447
1448			if (resid > iocount)
1449				resid = iocount;
1450			obj->paging_in_progress++;
1451			m->busy++;
1452			if (clear_modify) {
1453				vm_page_test_dirty(m);
1454				pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_READ);
1455			} else if (bp->b_bcount >= PAGE_SIZE) {
1456				if (m->valid && (bp->b_flags & B_CACHE) == 0) {
1457					bp->b_pages[i] = bogus_page;
1458					pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1459				}
1460			}
1461			foff += resid;
1462			iocount -= resid;
1463		}
1464	}
1465}
1466
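/*
 * Mark the pages underlying a VMIO buffer valid and dirty; called from
 * bdwrite() so that delayed-write data is reflected in the VM pages.
 */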
1467void
1468vfs_dirty_pages(struct buf * bp)
1469{
1470	int i;
1471
1472	if (bp->b_flags & B_VMIO) {
1473		vm_offset_t foff = bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1474		int iocount = bp->b_bufsize;
1475
1476		for (i = 0; i < bp->b_npages; i++) {
1477			vm_page_t m = bp->b_pages[i];
1478			int resid = (m->offset + PAGE_SIZE) - foff;
1479
1480			if (resid > iocount)
1481				resid = iocount;
1482			if (resid > 0) {
1483				vm_page_set_valid(m, foff, resid);
1484				vm_page_set_dirty(m, foff, resid);
1485			}
1486			PAGE_WAKEUP(m);
1487			foff += resid;
1488			iocount -= resid;
1489		}
1490	}
1491}
1492/*
1493 * these routines are not in the correct place (yet);
1494 * also, they work *ONLY* for the kernel_pmap!!!
1495 */
1496void
1497vm_hold_load_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
1498{
1499	vm_offset_t pg;
1500	vm_page_t p;
1501	vm_offset_t from = round_page(froma);
1502	vm_offset_t to = round_page(toa);
1503
1504tryagain0:
1505	if ((curproc != pageproc) && ((cnt.v_free_count + cnt.v_cache_count) <=
1506		cnt.v_free_reserved + (toa - froma) / PAGE_SIZE)) {
1507		VM_WAIT;
1508		goto tryagain0;
1509	}
1510	for (pg = from; pg < to; pg += PAGE_SIZE) {
1511
1512tryagain:
1513
1514		p = vm_page_alloc(kernel_object, pg - VM_MIN_KERNEL_ADDRESS, 0);
1515		if (!p) {
1516			VM_WAIT;
1517			goto tryagain;
1518		}
1519		vm_page_wire(p);
1520		pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
1521		bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE] = p;
1522		PAGE_WAKEUP(p);
1523		bp->b_npages++;
1524	}
1525}
1526
1527void
1528vm_hold_free_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
1529{
1530	vm_offset_t pg;
1531	vm_page_t p;
1532	vm_offset_t from = round_page(froma);
1533	vm_offset_t to = round_page(toa);
1534
1535	for (pg = from; pg < to; pg += PAGE_SIZE) {
1536		p = bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE];
1537		bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE] = 0;
1538		pmap_kremove(pg);
1539		vm_page_free(p);
1540		--bp->b_npages;
1541	}
1542}
1543
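/*
 * Placeholder for buffer cache statistics reporting; currently a no-op.
 */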
1544void
1545bufstats()
1546{
1547}
1548