vfs_bio.c revision 12404
1/*
2 * Copyright (c) 1994 John S. Dyson
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice immediately at the beginning of the file, without modification,
10 *    this list of conditions, and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. Absolutely no warranty of function or purpose is made by the author
15 *    John S. Dyson.
16 * 4. This work was done expressly for inclusion into FreeBSD.  Other use
17 *    is allowed if this notation is included.
18 * 5. Modifications may be freely made to this file if the above conditions
19 *    are met.
20 *
21 * $Id: vfs_bio.c,v 1.70 1995/11/18 23:33:48 dyson Exp $
22 */
23
24/*
25 * This file implements a new buffer I/O scheme providing a coherent
26 * VM object and buffer cache.  Pains have been taken to make sure
27 * that the performance degradation usually associated with schemes
28 * such as this one is not realized.
29 *
30 * Author:  John S. Dyson
31 * Significant help during the development and debugging phases
32 * was provided by David Greenman, also of the FreeBSD core team.
33 */
34
35#define VMIO
36#include <sys/param.h>
37#include <sys/systm.h>
38#include <sys/sysproto.h>
39#include <sys/kernel.h>
40#include <sys/proc.h>
41#include <sys/vnode.h>
42#include <vm/vm.h>
43#include <vm/vm_kern.h>
44#include <vm/vm_pageout.h>
45#include <vm/vm_page.h>
46#include <vm/vm_object.h>
47#include <sys/buf.h>
48#include <sys/mount.h>
49#include <sys/malloc.h>
50#include <sys/resourcevar.h>
52
53#include <miscfs/specfs/specdev.h>
54
55/*
56 * System initialization
57 */
58
59static void vfs_update __P((void));
60struct	proc *updateproc;
61
62static struct kproc_desc up_kp = {
63	"update",
64	vfs_update,
65	&updateproc
66};
67SYSINIT_KT(update, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
68
69
70struct buf *buf;		/* buffer header pool */
71struct swqueue bswlist;
72
73void vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to);
74void vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to);
75void vfs_clean_pages(struct buf * bp);
76static void vfs_setdirty(struct buf *bp);
77static __inline struct buf * gbincore(struct vnode * vp, daddr_t blkno);
78
79int needsbuffer;
80
81/*
82 * Internal update daemon, process 3
83 *	The variable vfs_update_wakeup allows for internal syncs.
84 */
85int vfs_update_wakeup;
86
87
88/*
89 * buffers base kva
90 */
91caddr_t buffers_kva;
92
93/*
94 * bogus page -- used for I/O to/from partially complete buffers.
95 * This is a temporary solution to the problem, but it is not
96 * really that bad.  It would be better to split the buffer
97 * for input in the case of buffers already partially in memory,
98 * but the code is intricate enough already.
99 */
100vm_page_t bogus_page;
101vm_offset_t bogus_offset;
102
103int bufspace, maxbufspace;
104
105struct bufhashhdr bufhashtbl[BUFHSZ], invalhash;
106struct bqueues bufqueues[BUFFER_QUEUES];
107
108/*
109 * Initialize buffer headers and related structures.
110 */
111void
112bufinit()
113{
114	struct buf *bp;
115	int i;
116
117	TAILQ_INIT(&bswlist);
118	LIST_INIT(&invalhash);
119
120	/* first, make a null hash table */
121	for (i = 0; i < BUFHSZ; i++)
122		LIST_INIT(&bufhashtbl[i]);
123
124	/* next, make a null set of free lists */
125	for (i = 0; i < BUFFER_QUEUES; i++)
126		TAILQ_INIT(&bufqueues[i]);
127
128	buffers_kva = (caddr_t) kmem_alloc_pageable(buffer_map, MAXBSIZE * nbuf);
129	/* finally, initialize each buffer header and stick on empty q */
130	for (i = 0; i < nbuf; i++) {
131		bp = &buf[i];
132		bzero(bp, sizeof *bp);
133		bp->b_flags = B_INVAL;	/* we're just an empty header */
134		bp->b_dev = NODEV;
135		bp->b_rcred = NOCRED;
136		bp->b_wcred = NOCRED;
137		bp->b_qindex = QUEUE_EMPTY;
138		bp->b_vnbufs.le_next = NOLIST;
139		bp->b_data = buffers_kva + i * MAXBSIZE;
140		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
141		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
142	}
143/*
144 * maxbufspace is currently calculated assuming that all filesystem
145 * blocks are 8K.  If you happen to use a 16K filesystem, the size of the
146 * buffer cache is still the same as it would be for 8K filesystems.  This
147 * keeps the size of the buffer cache "in check" for big block filesystems.
148 */
149	maxbufspace = 2 * (nbuf + 8) * PAGE_SIZE;
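	/*
	 * Worked example (illustrative, not part of the original source):
	 * with nbuf == 1024 and 4K pages, maxbufspace works out to
	 * 2 * (1024 + 8) * 4096 == 8454144 bytes, a little over 8MB of
	 * buffer kva that may actually be backed by pages at any one time.
	 */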
150
151	bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
152	bogus_page = vm_page_alloc(kernel_object,
153			bogus_offset - VM_MIN_KERNEL_ADDRESS, VM_ALLOC_NORMAL);
154
155}
156
157/*
158 * remove the buffer from the appropriate free list
159 */
160void
161bremfree(struct buf * bp)
162{
163	int s = splbio();
164
165	if (bp->b_qindex != QUEUE_NONE) {
166		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
167		bp->b_qindex = QUEUE_NONE;
168	} else {
169		panic("bremfree: removing a buffer when not on a queue");
170	}
171	splx(s);
172}
173
174/*
175 * Get a buffer with the specified data.  Look in the cache first.
176 */
177int
178bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
179    struct buf ** bpp)
180{
181	struct buf *bp;
182
183	bp = getblk(vp, blkno, size, 0, 0);
184	*bpp = bp;
185
186	/* if not found in cache, do some I/O */
187	if ((bp->b_flags & B_CACHE) == 0) {
188		if (curproc != NULL)
189			curproc->p_stats->p_ru.ru_inblock++;
190		bp->b_flags |= B_READ;
191		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
192		if (bp->b_rcred == NOCRED) {
193			if (cred != NOCRED)
194				crhold(cred);
195			bp->b_rcred = cred;
196		}
197		vfs_busy_pages(bp, 0);
198		VOP_STRATEGY(bp);
199		return (biowait(bp));
200	}
201	return (0);
202}
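/*
 * Illustrative bread() usage (a sketch, not part of the original source):
 * a typical filesystem read path pairs bread() with brelse(), e.g.
 *
 *	struct buf *bp;
 *	int error;
 *
 *	error = bread(vp, lbn, bsize, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	bcopy(bp->b_data, dest, bsize);
 *	brelse(bp);
 *
 * Here vp, lbn, bsize and dest are hypothetical caller variables.
 */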
203
204/*
205 * Operates like bread, but also starts asynchronous I/O on
206 * read-ahead blocks.
207 */
208int
209breadn(struct vnode * vp, daddr_t blkno, int size,
210    daddr_t * rablkno, int *rabsize,
211    int cnt, struct ucred * cred, struct buf ** bpp)
212{
213	struct buf *bp, *rabp;
214	int i;
215	int rv = 0, readwait = 0;
216
217	*bpp = bp = getblk(vp, blkno, size, 0, 0);
218
219	/* if not found in cache, do some I/O */
220	if ((bp->b_flags & B_CACHE) == 0) {
221		if (curproc != NULL)
222			curproc->p_stats->p_ru.ru_inblock++;
223		bp->b_flags |= B_READ;
224		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
225		if (bp->b_rcred == NOCRED) {
226			if (cred != NOCRED)
227				crhold(cred);
228			bp->b_rcred = cred;
229		}
230		vfs_busy_pages(bp, 0);
231		VOP_STRATEGY(bp);
232		++readwait;
233	}
234	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
235		if (inmem(vp, *rablkno))
236			continue;
237		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
238
239		if ((rabp->b_flags & B_CACHE) == 0) {
240			if (curproc != NULL)
241				curproc->p_stats->p_ru.ru_inblock++;
242			rabp->b_flags |= B_READ | B_ASYNC;
243			rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
244			if (rabp->b_rcred == NOCRED) {
245				if (cred != NOCRED)
246					crhold(cred);
247				rabp->b_rcred = cred;
248			}
249			vfs_busy_pages(rabp, 0);
250			VOP_STRATEGY(rabp);
251		} else {
252			brelse(rabp);
253		}
254	}
255
256	if (readwait) {
257		rv = biowait(bp);
258	}
259	return (rv);
260}
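/*
 * Illustrative breadn() usage (a sketch, not part of the original source):
 * read logical block lbn and hint one read-ahead block after it.
 *
 *	daddr_t rablk = lbn + 1;
 *	int rasize = bsize;
 *
 *	error = breadn(vp, lbn, bsize, &rablk, &rasize, 1, NOCRED, &bp);
 *
 * The read-ahead buffers are started B_ASYNC and released by biodone(),
 * so only the primary buffer is returned to the caller.
 */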
261
262/*
263 * Write, release buffer on completion.  (Done by iodone
264 * if async.)
265 */
266int
267bwrite(struct buf * bp)
268{
269	int oldflags = bp->b_flags;
270
271	if (bp->b_flags & B_INVAL) {
272		brelse(bp);
273		return (0);
274	}
275	if (!(bp->b_flags & B_BUSY))
276		panic("bwrite: buffer is not busy???");
277
278	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
279	bp->b_flags |= B_WRITEINPROG;
280
281	if ((oldflags & (B_ASYNC|B_DELWRI)) == (B_ASYNC|B_DELWRI)) {
282		reassignbuf(bp, bp->b_vp);
283	}
284
285	bp->b_vp->v_numoutput++;
286	vfs_busy_pages(bp, 1);
287	if (curproc != NULL)
288		curproc->p_stats->p_ru.ru_oublock++;
289	VOP_STRATEGY(bp);
290
291	if ((oldflags & B_ASYNC) == 0) {
292		int rtval = biowait(bp);
293
294		if (oldflags & B_DELWRI) {
295			reassignbuf(bp, bp->b_vp);
296		}
297		brelse(bp);
298		return (rtval);
299	}
300	return (0);
301}
302
303int
304vn_bwrite(ap)
305	struct vop_bwrite_args *ap;
306{
307	return (bwrite(ap->a_bp));
308}
309
310/*
311 * Delayed write. (Buffer is marked dirty).
312 */
313void
314bdwrite(struct buf * bp)
315{
316
317	if ((bp->b_flags & B_BUSY) == 0) {
318		panic("bdwrite: buffer is not busy");
319	}
320	if (bp->b_flags & B_INVAL) {
321		brelse(bp);
322		return;
323	}
324	if (bp->b_flags & B_TAPE) {
325		bawrite(bp);
326		return;
327	}
328	bp->b_flags &= ~(B_READ|B_RELBUF);
329	if ((bp->b_flags & B_DELWRI) == 0) {
330		bp->b_flags |= B_DONE | B_DELWRI;
331		reassignbuf(bp, bp->b_vp);
332	}
333
334	/*
335	 * Doing the bmap here keeps the system from needing to do it later,
336	 * perhaps when the system is attempting to do a sync.  Since it
337	 * is likely that the indirect block -- or whatever other data structure
338	 * the filesystem needs -- is still in memory now, this is a good
339	 * time to do it.  Note also that if the pageout daemon is
340	 * requesting a sync, there might not be enough memory to do
341	 * the bmap then, so it is important to do it now.
342	 */
343	if (bp->b_lblkno == bp->b_blkno) {
344		VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
345	}
346
347	/*
348	 * Set the *dirty* buffer range based upon the VM system dirty pages.
349	 */
350	vfs_setdirty(bp);
351
352	/*
353	 * We need to do this here to satisfy the vnode_pager and the
354	 * pageout daemon, so that they think that the pages have been
355	 * "cleaned".  Note that since the pages are in a delayed-write
356	 * buffer, the VFS layer will see that the pages get written
357	 * out on the next sync, or perhaps the cluster will be completed.
358	 */
359	vfs_clean_pages(bp);
360	brelse(bp);
361	return;
362}
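/*
 * Summary of the write flavors above and below (editor's sketch, derived
 * from this file):
 *
 *	bwrite(bp);	waits in biowait() and returns the error, unless the
 *			caller had already set B_ASYNC
 *	bdwrite(bp);	delayed write: marks B_DELWRI, the data goes out on a
 *			later sync or cluster write
 *	bawrite(bp);	asynchronous: sets B_ASYNC and starts the write,
 *			biodone()/brelse() finish it
 */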
363
364/*
365 * Asynchronous write.
366 * Start output on a buffer, but do not wait for it to complete.
367 * The buffer is released when the output completes.
368 */
369void
370bawrite(struct buf * bp)
371{
372	bp->b_flags |= B_ASYNC;
373	(void) VOP_BWRITE(bp);
374}
375
376/*
377 * Release a buffer.
378 */
379void
380brelse(struct buf * bp)
381{
382	int s;
383
384	if (bp->b_flags & B_CLUSTER) {
385		relpbuf(bp);
386		return;
387	}
388	/* anyone need a "free" block? */
389	s = splbio();
390
391	if (needsbuffer) {
392		needsbuffer = 0;
393		wakeup(&needsbuffer);
394	}
395
396	/* anyone need this block? */
397	if (bp->b_flags & B_WANTED) {
398		bp->b_flags &= ~(B_WANTED | B_AGE);
399		wakeup(bp);
400	} else if (bp->b_flags & B_VMIO) {
401		wakeup(bp);
402	}
403	if (bp->b_flags & B_LOCKED)
404		bp->b_flags &= ~B_ERROR;
405
406	if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
407	    (bp->b_bufsize <= 0)) {
408		bp->b_flags |= B_INVAL;
409		bp->b_flags &= ~(B_DELWRI | B_CACHE);
410		if (((bp->b_flags & B_VMIO) == 0) && bp->b_vp)
411			brelvp(bp);
412	}
413
414	/*
415	 * VMIO buffer rundown.  It is not strictly necessary to keep a VMIO
416	 * buffer constituted, so the B_INVAL flag is used to *invalidate* the
417	 * buffer, but the VM object is kept around.  The B_NOCACHE flag is
418	 * used to invalidate the pages in the VM object.
419	 */
420	if (bp->b_flags & B_VMIO) {
421		vm_offset_t foff;
422		vm_object_t obj;
423		int i, resid;
424		vm_page_t m;
425		struct vnode *vp;
426		int iototal = bp->b_bufsize;
427
428		vp = bp->b_vp;
429		if (!vp)
430			panic("brelse: missing vp");
431		if (!vp->v_mount)
432			panic("brelse: missing mount info");
433
434		if (bp->b_npages) {
435			obj = (vm_object_t) vp->v_object;
436			foff = trunc_page(vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno);
437			for (i = 0; i < bp->b_npages; i++) {
438				m = bp->b_pages[i];
439				if (m == bogus_page) {
440					m = vm_page_lookup(obj, foff);
441					if (!m) {
442						panic("brelse: page missing");
443					}
444					bp->b_pages[i] = m;
445					pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
446				}
447				resid = (m->offset + PAGE_SIZE) - foff;
448				if (resid > iototal)
449					resid = iototal;
450				if (resid > 0) {
451					/*
452					 * Don't invalidate the page if the local machine has already
453					 * modified it.  This is the lesser of two evils, and should
454					 * be fixed.
455					 */
456					if (bp->b_flags & (B_NOCACHE | B_ERROR)) {
457						vm_page_test_dirty(m);
458						if (m->dirty == 0) {
459							vm_page_set_invalid(m, foff, resid);
460							if (m->valid == 0)
461								vm_page_protect(m, VM_PROT_NONE);
462						}
463					}
464				}
465				foff += resid;
466				iototal -= resid;
467			}
468		}
469
470		if (bp->b_flags & (B_INVAL | B_RELBUF)) {
471			for(i = 0; i < bp->b_npages; i++) {
472				m = bp->b_pages[i];
473				--m->bmapped;
474				if (m->bmapped == 0) {
475					if (m->flags & PG_WANTED) {
476						wakeup(m);
477						m->flags &= ~PG_WANTED;
478					}
479					if ((m->busy == 0) && ((m->flags & PG_BUSY) == 0)) {
480						vm_page_test_dirty(m);
481						/*
482						 * if page isn't valid, no sense in keeping it around
483						 */
484						if (m->valid == 0) {
485							vm_page_protect(m, VM_PROT_NONE);
486							vm_page_free(m);
487						/*
488						 * if page isn't dirty and hasn't been referenced by
489						 * a process, then cache it
490						 */
491						} else if ((m->dirty & m->valid) == 0 &&
492						    (m->flags & PG_REFERENCED) == 0 &&
493						    !pmap_is_referenced(VM_PAGE_TO_PHYS(m))) {
494							vm_page_cache(m);
495						/*
496						 * otherwise activate it
497						 */
498						} else if ((m->flags & PG_ACTIVE) == 0) {
499							vm_page_activate(m);
500							m->act_count = 0;
501						}
502					}
503				}
504			}
505			bufspace -= bp->b_bufsize;
506			pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
507			bp->b_npages = 0;
508			bp->b_bufsize = 0;
509			bp->b_flags &= ~B_VMIO;
510			if (bp->b_vp)
511				brelvp(bp);
512		}
513	}
514	if (bp->b_qindex != QUEUE_NONE)
515		panic("brelse: free buffer onto another queue???");
516
517	/* enqueue */
518	/* buffers with no memory */
519	if (bp->b_bufsize == 0) {
520		bp->b_qindex = QUEUE_EMPTY;
521		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
522		LIST_REMOVE(bp, b_hash);
523		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
524		bp->b_dev = NODEV;
525		/* buffers with junk contents */
526	} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
527		bp->b_qindex = QUEUE_AGE;
528		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
529		LIST_REMOVE(bp, b_hash);
530		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
531		bp->b_dev = NODEV;
532		/* buffers that are locked */
533	} else if (bp->b_flags & B_LOCKED) {
534		bp->b_qindex = QUEUE_LOCKED;
535		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
536		/* buffers with stale but valid contents */
537	} else if (bp->b_flags & B_AGE) {
538		bp->b_qindex = QUEUE_AGE;
539		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
540		/* buffers with valid and quite potentially reusable contents */
541	} else {
542		bp->b_qindex = QUEUE_LRU;
543		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
544	}
545
546	/* unlock */
547	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
548	splx(s);
549}
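/*
 * Free-list placement summary for brelse() (editor's note, derived from
 * the code above):
 *
 *	b_bufsize == 0                            -> QUEUE_EMPTY
 *	B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF  -> head of QUEUE_AGE
 *	B_LOCKED                                  -> QUEUE_LOCKED
 *	B_AGE                                     -> tail of QUEUE_AGE
 *	otherwise                                 -> QUEUE_LRU
 */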
550
551/*
552 * Check to see if a block is currently resident in a buffer.  Unlike
 * incore, B_INVAL buffers are not skipped and no splbio protection is
 * taken; the caller is expected to handle both.
553 */
554static __inline struct buf *
555gbincore(struct vnode * vp, daddr_t blkno)
556{
557	struct buf *bp;
558	struct bufhashhdr *bh;
559
560	bh = BUFHASH(vp, blkno);
561	bp = bh->lh_first;
562
563	/* Search hash chain */
564	while (bp != NULL) {
565		/* hit */
566		if (bp->b_vp == vp && bp->b_lblkno == blkno) {
567			break;
568		}
569		bp = bp->b_hash.le_next;
570	}
571	return (bp);
572}
573
574/*
575 * this routine implements clustered async writes for
576 * clearing out B_DELWRI buffers...  This is much better
577 * than the old way of writing only one buffer at a time.
578 */
579void
580vfs_bio_awrite(struct buf * bp)
581{
582	int i;
583	daddr_t lblkno = bp->b_lblkno;
584	struct vnode *vp = bp->b_vp;
585	int s;
586	int ncl;
587	struct buf *bpa;
588
589	s = splbio();
590	if (vp->v_mount && (vp->v_flag & VVMIO) &&
591	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
592		int size = vp->v_mount->mnt_stat.f_iosize;
593		int maxcl = MAXPHYS / size;
594
595		for (i = 1; i < maxcl; i++) {
596			if ((bpa = gbincore(vp, lblkno + i)) &&
597			    ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
598			    (B_DELWRI | B_CLUSTEROK)) &&
599			    (bpa->b_bufsize == size)) {
600				if ((bpa->b_blkno == bpa->b_lblkno) ||
601				    (bpa->b_blkno != bp->b_blkno + (i * size) / DEV_BSIZE))
602					break;
603			} else {
604				break;
605			}
606		}
607		ncl = i;
608		/*
609		 * this is a possible cluster write
610		 */
611		if (ncl != 1) {
612			cluster_wbuild(vp, size, lblkno, ncl);
613			splx(s);
614			return;
615		}
616	}
617	bremfree(bp);
618	splx(s);
619	/*
620	 * default (old) behavior, writing out only one block
621	 */
622	bp->b_flags |= B_BUSY | B_ASYNC;
623	(void) VOP_BWRITE(bp);
624}
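/*
 * Editor's note on the clustering check above: with an 8K filesystem
 * block size, for example, maxcl == MAXPHYS / 8192, so up to that many
 * contiguous, dirty, clusterable buffers of the same size can be handed
 * to cluster_wbuild() in a single call instead of being written one at
 * a time.
 */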
625
626
627/*
628 * Find a buffer header which is available for use.
629 */
630static struct buf *
631getnewbuf(int slpflag, int slptimeo, int doingvmio)
632{
633	struct buf *bp;
634	int s;
635
636	s = splbio();
637start:
638	if (bufspace >= maxbufspace)
639		goto trytofreespace;
640
641	/* can we constitute a new buffer? */
642	if ((bp = bufqueues[QUEUE_EMPTY].tqh_first)) {
643		if (bp->b_qindex != QUEUE_EMPTY)
644			panic("getnewbuf: inconsistent EMPTY queue");
645		bremfree(bp);
646		goto fillbuf;
647	}
648trytofreespace:
649	/*
650	 * We keep file I/O from hogging metadata I/O.
651	 * This is desirable because file data is cached in the
652	 * VM/buffer cache even after a buffer is freed.
653	 */
654	if ((bp = bufqueues[QUEUE_AGE].tqh_first)) {
655		if (bp->b_qindex != QUEUE_AGE)
656			panic("getnewbuf: inconsistent AGE queue");
657	} else if ((bp = bufqueues[QUEUE_LRU].tqh_first)) {
658		if (bp->b_qindex != QUEUE_LRU)
659			panic("getnewbuf: inconsistent LRU queue");
660	}
661	if (!bp) {
662		/* wait for a free buffer of any kind */
663		needsbuffer = 1;
664		tsleep(&needsbuffer, PRIBIO | slpflag, "newbuf", slptimeo);
665		splx(s);
666		return (0);
667	}
668
669	/* if we are a delayed write, convert to an async write */
670	if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
671		vfs_bio_awrite(bp);
672		if (!slpflag && !slptimeo) {
673			splx(s);
674			return (0);
675		}
676		goto start;
677	}
678
679	if (bp->b_flags & B_WANTED) {
680		bp->b_flags &= ~B_WANTED;
681		wakeup(bp);
682	}
683	bremfree(bp);
684
685	if (bp->b_flags & B_VMIO) {
686		bp->b_flags |= B_RELBUF | B_BUSY | B_DONE;
687		brelse(bp);
688		bremfree(bp);
689	}
690
691	if (bp->b_vp)
692		brelvp(bp);
693
694	/* we are not free, nor do we contain interesting data */
695	if (bp->b_rcred != NOCRED)
696		crfree(bp->b_rcred);
697	if (bp->b_wcred != NOCRED)
698		crfree(bp->b_wcred);
699fillbuf:
700	bp->b_flags |= B_BUSY;
701	LIST_REMOVE(bp, b_hash);
702	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
703	splx(s);
704	if (bp->b_bufsize) {
705		allocbuf(bp, 0);
706	}
707	bp->b_flags = B_BUSY;
708	bp->b_dev = NODEV;
709	bp->b_vp = NULL;
710	bp->b_blkno = bp->b_lblkno = 0;
711	bp->b_iodone = 0;
712	bp->b_error = 0;
713	bp->b_resid = 0;
714	bp->b_bcount = 0;
715	bp->b_npages = 0;
716	bp->b_wcred = bp->b_rcred = NOCRED;
717	bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
718	bp->b_dirtyoff = bp->b_dirtyend = 0;
719	bp->b_validoff = bp->b_validend = 0;
720	if (bufspace >= maxbufspace) {
721		s = splbio();
722		bp->b_flags |= B_INVAL;
723		brelse(bp);
724		goto trytofreespace;
725	}
726	return (bp);
727}
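/*
 * getnewbuf() victim selection, summarized (editor's note):
 *	1. If bufspace is below maxbufspace, take an unconstituted header
 *	   from QUEUE_EMPTY.
 *	2. Otherwise reclaim from QUEUE_AGE first, then QUEUE_LRU.
 *	3. A B_DELWRI victim is first pushed out via vfs_bio_awrite(); the
 *	   scan restarts when a sleep flag or timeout was given, otherwise
 *	   NULL is returned so the caller can retry.
 */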
728
729/*
730 * Check to see if a block is currently memory resident.
731 */
732struct buf *
733incore(struct vnode * vp, daddr_t blkno)
734{
735	struct buf *bp;
736	struct bufhashhdr *bh;
737
738	int s = splbio();
739
740	bh = BUFHASH(vp, blkno);
741	bp = bh->lh_first;
742
743	/* Search hash chain */
744	while (bp != NULL) {
745		/* hit */
746		if (bp->b_vp == vp && bp->b_lblkno == blkno &&
747		    (bp->b_flags & B_INVAL) == 0) {
748			break;
749		}
750		bp = bp->b_hash.le_next;
751	}
752	splx(s);
753	return (bp);
754}
755
756/*
757 * Returns true if no I/O is needed to access the
758 * associated VM object.  This is like incore except
759 * it also hunts around in the VM system for the data.
760 */
761
762int
763inmem(struct vnode * vp, daddr_t blkno)
764{
765	vm_object_t obj;
766	vm_offset_t off, toff, tinc;
767	vm_page_t m;
768
769	if (incore(vp, blkno))
770		return 1;
771	if (vp->v_mount == NULL)
772		return 0;
773	if ((vp->v_object == NULL) || (vp->v_flag & VVMIO) == 0)
774		return 0;
775
776	obj = vp->v_object;
777	tinc = PAGE_SIZE;
778	if (tinc > vp->v_mount->mnt_stat.f_iosize)
779		tinc = vp->v_mount->mnt_stat.f_iosize;
780	off = blkno * vp->v_mount->mnt_stat.f_iosize;
781
782	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
783
784		m = vm_page_lookup(obj, trunc_page(toff + off));
785		if (!m)
786			return 0;
787		if (vm_page_is_valid(m, toff + off, tinc) == 0)
788			return 0;
789	}
790	return 1;
791}
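/*
 * Editor's note: inmem() is what lets breadn() above skip read-ahead
 * blocks whose data is already resident, either in a cached buffer
 * (incore) or directly in the vnode's VM object for VMIO vnodes.
 */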
792
793/*
794 * Set the dirty range for the buffer -- this matters for NFS: if the
795 * file is mapped and pages have been written to, let the filesystem
796 * know.  We want the entire range of the buffer to be marked dirty if
797 * any of the pages have been written to, for consistency with the
798 * b_validoff and b_validend set in the NFS write code and used by the
799 * NFS read code.
801 */
802static void
803vfs_setdirty(struct buf *bp) {
804	int i;
805	vm_object_t object;
806	vm_offset_t boffset, offset;
807	/*
808	 * We qualify the scan for modified pages on whether the
809	 * object has been flushed yet.  The OBJ_WRITEABLE flag
810	 * is not cleared simply by removing the pages' mappings.
811	 */
812	if ((bp->b_flags & B_VMIO) &&
813		((object = bp->b_pages[0]->object)->flags & (OBJ_WRITEABLE|OBJ_CLEANING))) {
814		/*
815		 * test the pages to see if they have been modified directly
816		 * by users through the VM system.
817		 */
818		for (i = 0; i < bp->b_npages; i++)
819			vm_page_test_dirty(bp->b_pages[i]);
820
821		/*
822		 * scan forwards for the first page modified
823		 */
824		for (i = 0; i < bp->b_npages; i++) {
825			if (bp->b_pages[i]->dirty) {
826				break;
827			}
828		}
829		boffset = i * PAGE_SIZE;
830		if (boffset < bp->b_dirtyoff) {
831			bp->b_dirtyoff = boffset;
832		}
833
834		/*
835		 * scan backwards for the last page modified
836		 */
837		for (i = bp->b_npages - 1; i >= 0; --i) {
838			if (bp->b_pages[i]->dirty) {
839				break;
840			}
841		}
842		boffset = (i + 1) * PAGE_SIZE;
843		offset = boffset + bp->b_pages[0]->offset;
844		if (offset >= object->size) {
845			boffset = object->size - bp->b_pages[0]->offset;
846		}
847		if (bp->b_dirtyend < boffset) {
848			bp->b_dirtyend = boffset;
849		}
850	}
851}
852
853/*
854 * Get a block given a specified block and offset into a file/device.
855 */
856struct buf *
857getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
858{
859	struct buf *bp;
860	int s;
861	struct bufhashhdr *bh;
862
863	s = splbio();
864loop:
865	if ((bp = gbincore(vp, blkno)) != NULL) {
866		if (bp->b_flags & (B_BUSY|B_INVAL)) {
867			bp->b_flags |= B_WANTED;
868			if (!tsleep(bp, PRIBIO | slpflag, "getblk", slptimeo))
869				goto loop;
870
871			splx(s);
872			return (struct buf *) NULL;
873		}
874		bp->b_flags |= B_BUSY | B_CACHE;
875		bremfree(bp);
876
877		/*
878		 * check for size inconsistencies (note that they shouldn't happen,
879		 * but they do when filesystems don't handle size changes correctly).
880		 * We are conservative on metadata and don't just extend the buffer
881		 * but write and re-constitute it.
882		 */
883
884		if (bp->b_bcount != size) {
885			if (bp->b_flags & B_VMIO) {
886				allocbuf(bp, size);
887			} else {
888				bp->b_flags |= B_NOCACHE;
889				VOP_BWRITE(bp);
890				goto loop;
891			}
892		}
893
894		/*
895		 * make sure that all pages in the buffer are valid; if they
896		 * aren't, clear the cache flag.
897		 * ASSUMPTION:
898		 *  if the buffer is greater than 1 page in size, it is assumed
899		 *  that the buffer address starts on a page boundary...
900		 */
901		if (bp->b_flags & B_VMIO) {
902			int szleft, i;
903			szleft = size;
904			for (i = 0; i < bp->b_npages; i++) {
905				if (szleft > PAGE_SIZE) {
906					if ((bp->b_pages[i]->valid & VM_PAGE_BITS_ALL) !=
907						VM_PAGE_BITS_ALL) {
908						bp->b_flags &= ~(B_CACHE|B_DONE);
909						break;
910					}
911					szleft -= PAGE_SIZE;
912				} else {
913					if (!vm_page_is_valid(bp->b_pages[i],
914						(((vm_offset_t) bp->b_data) & PAGE_MASK),
915						szleft)) {
916						bp->b_flags &= ~(B_CACHE|B_DONE);
917						break;
918					}
919					szleft = 0;
920				}
921			}
922		}
923		splx(s);
924		return (bp);
925	} else {
926		vm_object_t obj;
927		int doingvmio;
928
929		if ((obj = vp->v_object) && (vp->v_flag & VVMIO)) {
930			doingvmio = 1;
931		} else {
932			doingvmio = 0;
933		}
934		if ((bp = getnewbuf(slpflag, slptimeo, doingvmio)) == 0) {
935			if (slpflag || slptimeo) {
936				splx(s);
937				return NULL;
938			}
939			goto loop;
940		}
941
942		/*
943		 * This code is used to make sure that a buffer is not
944		 * created while the getnewbuf routine is blocked.
945		 * Normally the vnode is locked so this isn't a problem.
946		 * VBLK type I/O requests, however, don't lock the vnode.
947		 */
948		if (!VOP_ISLOCKED(vp) && gbincore(vp, blkno)) {
949			bp->b_flags |= B_INVAL;
950			brelse(bp);
951			goto loop;
952		}
953
954		/*
955		 * Insert the buffer into the hash, so that it can
956		 * be found by incore.
957		 */
958		bp->b_blkno = bp->b_lblkno = blkno;
959		bgetvp(vp, bp);
960		LIST_REMOVE(bp, b_hash);
961		bh = BUFHASH(vp, blkno);
962		LIST_INSERT_HEAD(bh, bp, b_hash);
963
964		if (doingvmio) {
965			bp->b_flags |= (B_VMIO | B_CACHE);
966#if defined(VFS_BIO_DEBUG)
967			if (vp->v_type != VREG)
968				printf("getblk: vmioing file type %d???\n", vp->v_type);
969#endif
970		} else {
971			bp->b_flags &= ~B_VMIO;
972		}
973		splx(s);
974
975		allocbuf(bp, size);
976		return (bp);
977	}
978}
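/*
 * Editor's note on the getblk() protocol above: a buffer found B_BUSY is
 * waited for by setting B_WANTED and sleeping on the buffer's address;
 * brelse() and biodone() wake such sleepers.  A buffer returned from
 * getblk() is always B_BUSY, so the caller must eventually release it
 * via brelse(), bdwrite(), bawrite() or bwrite().
 */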
979
980/*
981 * Get an empty, disassociated buffer of given size.
982 */
983struct buf *
984geteblk(int size)
985{
986	struct buf *bp;
987
988	while ((bp = getnewbuf(0, 0, 0)) == 0);
989	allocbuf(bp, size);
990	bp->b_flags |= B_INVAL;
991	return (bp);
992}
993
994/*
995 * This code constitutes the buffer memory from either anonymous system
996 * memory (in the case of non-VMIO operations) or from an associated
997 * VM object (in the case of VMIO operations).
998 *
999 * Note that this code is tricky, and has many complications to resolve
1000 * deadlock or inconsistent data situations.  Tread lightly!!!
1001 *
1002 * Modify the length of a buffer's underlying buffer storage without
1003 * destroying information (unless, of course the buffer is shrinking).
1004 */
1005int
1006allocbuf(struct buf * bp, int size)
1007{
1008
1009	int s;
1010	int newbsize, mbsize;
1011	int i;
1012
1013	if (!(bp->b_flags & B_BUSY))
1014		panic("allocbuf: buffer not busy");
1015
1016	if ((bp->b_flags & B_VMIO) == 0) {
1017		/*
1018		 * Just get anonymous memory from the kernel
1019		 */
1020		mbsize = ((size + DEV_BSIZE - 1) / DEV_BSIZE) * DEV_BSIZE;
1021		newbsize = round_page(size);
1022
1023		if (newbsize < bp->b_bufsize) {
1024			vm_hold_free_pages(
1025			    bp,
1026			    (vm_offset_t) bp->b_data + newbsize,
1027			    (vm_offset_t) bp->b_data + bp->b_bufsize);
1028		} else if (newbsize > bp->b_bufsize) {
1029			vm_hold_load_pages(
1030			    bp,
1031			    (vm_offset_t) bp->b_data + bp->b_bufsize,
1032			    (vm_offset_t) bp->b_data + newbsize);
1033		}
1034	} else {
1035		vm_page_t m;
1036		int desiredpages;
1037
1038		newbsize = ((size + DEV_BSIZE - 1) / DEV_BSIZE) * DEV_BSIZE;
1039		desiredpages = round_page(newbsize) / PAGE_SIZE;
1040
1041		if (newbsize < bp->b_bufsize) {
1042			if (desiredpages < bp->b_npages) {
1043				pmap_qremove((vm_offset_t) trunc_page(bp->b_data) +
1044				    desiredpages * PAGE_SIZE, (bp->b_npages - desiredpages));
1045				for (i = desiredpages; i < bp->b_npages; i++) {
1046					m = bp->b_pages[i];
1047					s = splhigh();
1048					while ((m->flags & PG_BUSY) || (m->busy != 0)) {
1049						m->flags |= PG_WANTED;
1050						tsleep(m, PVM, "biodep", 0);
1051					}
1052					splx(s);
1053
1054					if (m->bmapped == 0) {
1055						printf("allocbuf: bmapped is zero for page %d\n", i);
1056						panic("allocbuf: error");
1057					}
1058					--m->bmapped;
1059					if (m->bmapped == 0) {
1060						vm_page_protect(m, VM_PROT_NONE);
1061						vm_page_free(m);
1062					}
1063					bp->b_pages[i] = NULL;
1064				}
1065				bp->b_npages = desiredpages;
1066			}
1067		} else if (newbsize > bp->b_bufsize) {
1068			vm_object_t obj;
1069			vm_offset_t tinc, off, toff, objoff;
1070			int pageindex, curbpnpages;
1071			struct vnode *vp;
1072			int bsize;
1073
1074			vp = bp->b_vp;
1075			bsize = vp->v_mount->mnt_stat.f_iosize;
1076
1077			if (bp->b_npages < desiredpages) {
1078				obj = vp->v_object;
1079				tinc = PAGE_SIZE;
1080				if (tinc > bsize)
1081					tinc = bsize;
1082				off = bp->b_lblkno * bsize;
1083		doretry:
1084				curbpnpages = bp->b_npages;
1085				bp->b_flags |= B_CACHE;
1086				for (toff = 0; toff < newbsize; toff += tinc) {
1087					int bytesinpage;
1088
1089					pageindex = toff / PAGE_SIZE;
1090					objoff = trunc_page(toff + off);
1091					if (pageindex < curbpnpages) {
1092
1093						m = bp->b_pages[pageindex];
1094						if (m->offset != objoff)
1095							panic("allocbuf: page changed offset??!!!?");
1096						bytesinpage = tinc;
1097						if (tinc > (newbsize - toff))
1098							bytesinpage = newbsize - toff;
1099						if (!vm_page_is_valid(m, toff + off, bytesinpage)) {
1100							bp->b_flags &= ~B_CACHE;
1101						}
1102						if ((m->flags & PG_ACTIVE) == 0) {
1103							vm_page_activate(m);
1104							m->act_count = 0;
1105						}
1106						continue;
1107					}
1108					m = vm_page_lookup(obj, objoff);
1109					if (!m) {
1110						m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
1111						if (!m) {
1112							int j;
1113
1114							for (j = bp->b_npages; j < pageindex; j++) {
1115								PAGE_WAKEUP(bp->b_pages[j]);
1116							}
1117							VM_WAIT;
1118							goto doretry;
1119						}
1120						vm_page_activate(m);
1121						m->act_count = 0;
1122						m->valid = 0;
1123						bp->b_flags &= ~B_CACHE;
1124					} else if (m->flags & PG_BUSY) {
1125						int j;
1126
1127						for (j = bp->b_npages; j < pageindex; j++) {
1128							PAGE_WAKEUP(bp->b_pages[j]);
1129						}
1130
1131						s = splbio();
1132						m->flags |= PG_WANTED;
1133						tsleep(m, PRIBIO, "pgtblk", 0);
1134						splx(s);
1135
1136						goto doretry;
1137					} else {
1138						if ((curproc != pageproc) &&
1139							(m->flags & PG_CACHE) &&
1140						    (cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) {
1141							pagedaemon_wakeup();
1142						}
1143						bytesinpage = tinc;
1144						if (tinc > (newbsize - toff))
1145							bytesinpage = newbsize - toff;
1146						if (!vm_page_is_valid(m, toff + off, bytesinpage)) {
1147							bp->b_flags &= ~B_CACHE;
1148						}
1149						if ((m->flags & PG_ACTIVE) == 0) {
1150							vm_page_activate(m);
1151							m->act_count = 0;
1152						}
1153						m->flags |= PG_BUSY;
1154					}
1155					bp->b_pages[pageindex] = m;
1156					curbpnpages = pageindex + 1;
1157				}
1158				for (i = bp->b_npages; i < curbpnpages; i++) {
1159					m = bp->b_pages[i];
1160					m->bmapped++;
1161					PAGE_WAKEUP(m);
1162				}
1163				bp->b_npages = curbpnpages;
1164				bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE;
1165				pmap_qenter((vm_offset_t) bp->b_data, bp->b_pages, bp->b_npages);
1166				bp->b_data += off % PAGE_SIZE;
1167			}
1168		}
1169	}
1170	bufspace += (newbsize - bp->b_bufsize);
1171	bp->b_bufsize = newbsize;
1172	bp->b_bcount = size;
1173	return 1;
1174}
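/*
 * allocbuf() summary (editor's note, derived from the code above): for
 * non-VMIO buffers the storage is anonymous wired pages obtained and
 * released with vm_hold_load_pages()/vm_hold_free_pages(); for VMIO
 * buffers the pages come from the vnode's VM object, each page's bmapped
 * count is bumped, and the pages are mapped into the buffer's fixed
 * MAXBSIZE kva slot.  In both cases b_bufsize reflects the kva actually
 * backed and b_bcount the size requested by the caller.
 */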
1175
1176/*
1177 * Wait for buffer I/O completion, returning error status.
1178 */
1179int
1180biowait(register struct buf * bp)
1181{
1182	int s;
1183
1184	s = splbio();
1185	while ((bp->b_flags & B_DONE) == 0)
1186		tsleep(bp, PRIBIO, "biowait", 0);
1187	splx(s);
1188	if (bp->b_flags & B_EINTR) {
1189		bp->b_flags &= ~B_EINTR;
1190		return (EINTR);
1191	}
1192	if (bp->b_flags & B_ERROR) {
1193		return (bp->b_error ? bp->b_error : EIO);
1194	} else {
1195		return (0);
1196	}
1197}
1198
1199/*
1200 * Finish I/O on a buffer, calling an optional function.
1201 * This is usually called from interrupt level, so process blocking
1202 * is not *a good idea*.
1203 */
1204void
1205biodone(register struct buf * bp)
1206{
1207	int s;
1208
1209	s = splbio();
1210	if (!(bp->b_flags & B_BUSY))
1211		panic("biodone: buffer not busy");
1212
1213	if (bp->b_flags & B_DONE) {
1214		splx(s);
1215		printf("biodone: buffer already done\n");
1216		return;
1217	}
1218	bp->b_flags |= B_DONE;
1219
1220	if ((bp->b_flags & B_READ) == 0) {
1221		vwakeup(bp);
1222	}
1223#ifdef BOUNCE_BUFFERS
1224	if (bp->b_flags & B_BOUNCE)
1225		vm_bounce_free(bp);
1226#endif
1227
1228	/* call optional completion function if requested */
1229	if (bp->b_flags & B_CALL) {
1230		bp->b_flags &= ~B_CALL;
1231		(*bp->b_iodone) (bp);
1232		splx(s);
1233		return;
1234	}
1235	if (bp->b_flags & B_VMIO) {
1236		int i, resid;
1237		vm_offset_t foff;
1238		vm_page_t m;
1239		vm_object_t obj;
1240		int iosize;
1241		struct vnode *vp = bp->b_vp;
1242
1243		foff = vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1244		obj = vp->v_object;
1245		if (!obj) {
1246			panic("biodone: no object");
1247		}
1248#if defined(VFS_BIO_DEBUG)
1249		if (obj->paging_in_progress < bp->b_npages) {
1250			printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
1251			    obj->paging_in_progress, bp->b_npages);
1252		}
1253#endif
1254		iosize = bp->b_bufsize;
1255		for (i = 0; i < bp->b_npages; i++) {
1256			int bogusflag = 0;
1257			m = bp->b_pages[i];
1258			if (m == bogus_page) {
1259				bogusflag = 1;
1260				m = vm_page_lookup(obj, foff);
1261				if (!m) {
1262#if defined(VFS_BIO_DEBUG)
1263					printf("biodone: page disappeared\n");
1264#endif
1265					--obj->paging_in_progress;
1266					continue;
1267				}
1268				bp->b_pages[i] = m;
1269				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1270			}
1271#if defined(VFS_BIO_DEBUG)
1272			if (trunc_page(foff) != m->offset) {
1273				printf("biodone: foff(%d)/m->offset(%d) mismatch\n", foff, m->offset);
1274			}
1275#endif
1276			resid = (m->offset + PAGE_SIZE) - foff;
1277			if (resid > iosize)
1278				resid = iosize;
1279			/*
1280			 * In the write case, the valid and clean bits are
1281			 * already changed correctly, so we only need to do this
1282			 * here in the read case.
1283			 */
1284			if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
1285				vm_page_set_validclean(m, foff & (PAGE_SIZE-1), resid);
1286			}
1287
1288			/*
1289			 * when debugging new filesystems or buffer I/O methods, this
1290			 * is the most common error that pops up.  If you see this, you
1291			 * have not set the page busy flag correctly!!!
1292			 */
1293			if (m->busy == 0) {
1294				printf("biodone: page busy < 0, "
1295				    "off: %ld, foff: %ld, "
1296				    "resid: %d, index: %d\n",
1297				    m->offset, foff, resid, i);
1298				printf(" iosize: %ld, lblkno: %ld, flags: 0x%x, npages: %d\n",
1299				    bp->b_vp->v_mount->mnt_stat.f_iosize,
1300				    bp->b_lblkno, bp->b_flags, bp->b_npages);
1301				printf(" valid: 0x%x, dirty: 0x%x, mapped: %d\n",
1302				    m->valid, m->dirty, m->bmapped);
1303				panic("biodone: page busy < 0");
1304			}
1305			--m->busy;
1306			if ((m->busy == 0) && (m->flags & PG_WANTED)) {
1307				m->flags &= ~PG_WANTED;
1308				wakeup(m);
1309			}
1310			--obj->paging_in_progress;
1311			foff += resid;
1312			iosize -= resid;
1313		}
1314		if (obj && obj->paging_in_progress == 0 &&
1315		    (obj->flags & OBJ_PIPWNT)) {
1316			obj->flags &= ~OBJ_PIPWNT;
1317			wakeup(obj);
1318		}
1319	}
1320	/*
1321	 * For asynchronous completions, release the buffer now.  brelse()
1322	 * checks for B_WANTED and will do the wakeup there if necessary, so
1323	 * no need to do a wakeup here in the async case.
1324	 */
1325
1326	if (bp->b_flags & B_ASYNC) {
1327		brelse(bp);
1328	} else {
1329		bp->b_flags &= ~B_WANTED;
1330		wakeup(bp);
1331	}
1332	splx(s);
1333}
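/*
 * Illustrative B_CALL usage (a sketch, not from this file): a caller that
 * wants a completion callback instead of sleeping in biowait() might do
 *
 *	bp->b_flags |= B_CALL | B_ASYNC;
 *	bp->b_iodone = my_iodone;	(my_iodone is a hypothetical handler)
 *	VOP_STRATEGY(bp);
 *
 * biodone() then clears B_CALL and invokes the handler at splbio.
 */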
1334
1335int
1336count_lock_queue()
1337{
1338	int count;
1339	struct buf *bp;
1340
1341	count = 0;
1342	for (bp = bufqueues[QUEUE_LOCKED].tqh_first;
1343	    bp != NULL;
1344	    bp = bp->b_freelist.tqe_next)
1345		count++;
1346	return (count);
1347}
1348
1349int vfs_update_interval = 30;
1350
1351void
1352vfs_update()
1353{
1354	(void) spl0();
1355	while (1) {
1356		tsleep(&vfs_update_wakeup, PRIBIO, "update",
1357		    hz * vfs_update_interval);
1358		vfs_update_wakeup = 0;
1359		sync(curproc, NULL, NULL);
1360	}
1361}
1362
1363/*
1364 * This routine is called in lieu of iodone in the case of
1365 * incomplete I/O.  This keeps the busy status for pages
1366 * consistent.
1367 */
1368void
1369vfs_unbusy_pages(struct buf * bp)
1370{
1371	int i;
1372
1373	if (bp->b_flags & B_VMIO) {
1374		struct vnode *vp = bp->b_vp;
1375		vm_object_t obj = vp->v_object;
1376		vm_offset_t foff;
1377
1378		foff = trunc_page(vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno);
1379
1380		for (i = 0; i < bp->b_npages; i++) {
1381			vm_page_t m = bp->b_pages[i];
1382
1383			if (m == bogus_page) {
1384				m = vm_page_lookup(obj, foff + i * PAGE_SIZE);
1385				if (!m) {
1386					panic("vfs_unbusy_pages: page missing");
1387				}
1388				bp->b_pages[i] = m;
1389				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1390			}
1391			--obj->paging_in_progress;
1392			--m->busy;
1393			if ((m->busy == 0) && (m->flags & PG_WANTED)) {
1394				m->flags &= ~PG_WANTED;
1395				wakeup(m);
1396			}
1397		}
1398		if (obj->paging_in_progress == 0 &&
1399		    (obj->flags & OBJ_PIPWNT)) {
1400			obj->flags &= ~OBJ_PIPWNT;
1401			wakeup(obj);
1402		}
1403	}
1404}
1405
1406/*
1407 * This routine is called before a device strategy routine.
1408 * It is used to tell the VM system that paging I/O is in
1409 * progress, and treat the pages associated with the buffer
1410 * almost as if they were PG_BUSY.  Also, the object's paging_in_progress
1411 * count is maintained to make sure that the object doesn't become
1412 * inconsistent.
1413 */
1414void
1415vfs_busy_pages(struct buf * bp, int clear_modify)
1416{
1417	int i;
1418
1419	if (bp->b_flags & B_VMIO) {
1420		vm_object_t obj = bp->b_vp->v_object;
1421		vm_offset_t foff = bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1422		int iocount = bp->b_bufsize;
1423
1424		vfs_setdirty(bp);
1425		for (i = 0; i < bp->b_npages; i++) {
1426			vm_page_t m = bp->b_pages[i];
1427			int resid = (m->offset + PAGE_SIZE) - foff;
1428
1429			if (resid > iocount)
1430				resid = iocount;
1431			if ((bp->b_flags & B_CLUSTER) == 0) {
1432				obj->paging_in_progress++;
1433				m->busy++;
1434			}
1435			if (clear_modify) {
1436				vm_page_protect(m, VM_PROT_READ);
1437				vm_page_set_validclean(m,
1438					foff & (PAGE_SIZE-1), resid);
1439			} else if (bp->b_bcount >= PAGE_SIZE) {
1440				if (m->valid && (bp->b_flags & B_CACHE) == 0) {
1441					bp->b_pages[i] = bogus_page;
1442					pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1443				}
1444			}
1445			foff += resid;
1446			iocount -= resid;
1447		}
1448	}
1449}
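/*
 * Editor's note on the bogus_page substitution above: for a read into a
 * buffer that is not fully B_CACHE, any page that already has valid data
 * is temporarily replaced by bogus_page in the buffer's map so the device
 * I/O cannot overwrite it; brelse() and biodone() look the real page up
 * again and restore the mapping when the I/O completes.
 */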
1450
1451/*
1452 * Tell the VM system that the pages associated with this buffer
1453 * are clean.  This is used for delayed writes where the data is
1454 * going to go to disk eventually without additional VM intervention.
1455 */
1456void
1457vfs_clean_pages(struct buf * bp)
1458{
1459	int i;
1460
1461	if (bp->b_flags & B_VMIO) {
1462		vm_offset_t foff =
1463			bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1464		int iocount = bp->b_bufsize;
1465
1466		for (i = 0; i < bp->b_npages; i++) {
1467			vm_page_t m = bp->b_pages[i];
1468			int resid = (m->offset + PAGE_SIZE) - foff;
1469
1470			if (resid > iocount)
1471				resid = iocount;
1472			if (resid > 0) {
1473				vm_page_set_validclean(m,
1474					foff & (PAGE_SIZE-1), resid);
1475			}
1476			foff += resid;
1477			iocount -= resid;
1478		}
1479	}
1480}
1481
1482void
1483vfs_bio_clrbuf(struct buf * bp)
1484{
1485	int i;
1486	if (bp->b_flags & B_VMIO) {
1487		if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE)) {
1488			if (bp->b_pages[0]->valid != VM_PAGE_BITS_ALL) {
1489				bzero(bp->b_data, bp->b_bufsize);
1490			}
1491			bp->b_resid = 0;
1492			return;
1493		}
1494		for (i = 0; i < bp->b_npages; i++) {
1495			if (bp->b_pages[i]->valid == VM_PAGE_BITS_ALL)
1496				continue;
1497			if (bp->b_pages[i]->valid == 0) {
1498				if ((bp->b_pages[i]->flags & PG_ZERO) == 0)
1499					bzero(bp->b_data + i * PAGE_SIZE, PAGE_SIZE);
1500			} else {
1501				int j;
1502				for (j = 0; j < PAGE_SIZE / DEV_BSIZE; j++) {
1503					if ((bp->b_pages[i]->valid & (1 << j)) == 0)
1504						bzero(bp->b_data + i * PAGE_SIZE + j * DEV_BSIZE, DEV_BSIZE);
1505				}
1506			}
1507			bp->b_pages[i]->valid = VM_PAGE_BITS_ALL;
1508		}
1509		bp->b_resid = 0;
1510	} else {
1511		clrbuf(bp);
1512	}
1513}
1514
1515/*
1516 * vm_hold_load_pages and vm_hold_free_pages get pages into and out of
1517 * a buffer's address space.  The pages are anonymous and are
1518 * not associated with a file object.
1519 */
1520void
1521vm_hold_load_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
1522{
1523	vm_offset_t pg;
1524	vm_page_t p;
1525	vm_offset_t from = round_page(froma);
1526	vm_offset_t to = round_page(toa);
1527
1528	for (pg = from; pg < to; pg += PAGE_SIZE) {
1529
1530tryagain:
1531
1532		p = vm_page_alloc(kernel_object, pg - VM_MIN_KERNEL_ADDRESS,
1533		    VM_ALLOC_NORMAL);
1534		if (!p) {
1535			VM_WAIT;
1536			goto tryagain;
1537		}
1538		vm_page_wire(p);
1539		pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
1540		bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE] = p;
1541		PAGE_WAKEUP(p);
1542		bp->b_npages++;
1543	}
1544}
1545
1546void
1547vm_hold_free_pages(struct buf * bp, vm_offset_t froma, vm_offset_t toa)
1548{
1549	vm_offset_t pg;
1550	vm_page_t p;
1551	vm_offset_t from = round_page(froma);
1552	vm_offset_t to = round_page(toa);
1553
1554	for (pg = from; pg < to; pg += PAGE_SIZE) {
1555		p = bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE];
1556		bp->b_pages[((caddr_t) pg - bp->b_data) / PAGE_SIZE] = 0;
1557		pmap_kremove(pg);
1558		vm_page_free(p);
1559		--bp->b_npages;
1560	}
1561}
1562