vfs_bio.c revision 24850
1/*
2 * Copyright (c) 1994 John S. Dyson
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice immediately at the beginning of the file, without modification,
10 *    this list of conditions, and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 * 3. Absolutely no warranty of function or purpose is made by the author
15 *    John S. Dyson.
16 * 4. This work was done expressly for inclusion into FreeBSD.  Other use
17 *    is allowed if this notation is included.
18 * 5. Modifications may be freely made to this file if the above conditions
19 *    are met.
20 *
21 * $Id: vfs_bio.c,v 1.113 1997/04/01 08:38:53 bde Exp $
22 */
23
24/*
25 * This file contains a new buffer I/O scheme implementing a coherent
26 * VM object and buffer cache scheme.  Pains have been taken to make
27 * sure that the performance degradation associated with such schemes
28 * is not realized.
29 *
30 * Author:  John S. Dyson
31 * Significant help during the development and debugging phases
32 * was provided by David Greenman, also of the FreeBSD core team.
33 */
34
35#include "opt_bounce.h"
36
37#define VMIO
38#include <sys/param.h>
39#include <sys/systm.h>
40#include <sys/sysproto.h>
41#include <sys/kernel.h>
42#include <sys/sysctl.h>
43#include <sys/proc.h>
44#include <sys/vnode.h>
45#include <sys/vmmeter.h>
46#include <vm/vm.h>
47#include <vm/vm_param.h>
48#include <vm/vm_prot.h>
49#include <vm/vm_kern.h>
50#include <vm/vm_pageout.h>
51#include <vm/vm_page.h>
52#include <vm/vm_object.h>
53#include <vm/vm_extern.h>
54#include <vm/vm_map.h>
55#include <sys/buf.h>
56#include <sys/mount.h>
57#include <sys/malloc.h>
58#include <sys/resourcevar.h>
59#include <sys/proc.h>
60
61#include <miscfs/specfs/specdev.h>
62
63static void vfs_update __P((void));
64static struct	proc *updateproc;
65static struct kproc_desc up_kp = {
66	"update",
67	vfs_update,
68	&updateproc
69};
70SYSINIT_KT(update, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
71
72struct buf *buf;		/* buffer header pool */
73struct swqueue bswlist;
74
75int count_lock_queue __P((void));
76static void vm_hold_free_pages(struct buf * bp, vm_offset_t from,
77		vm_offset_t to);
78static void vm_hold_load_pages(struct buf * bp, vm_offset_t from,
79		vm_offset_t to);
80static void vfs_clean_pages(struct buf * bp);
81static void vfs_setdirty(struct buf *bp);
82static void vfs_vmio_release(struct buf *bp);
83
84int needsbuffer;
85
86/*
87 * Internal update daemon, process 3
88 *	The variable vfs_update_wakeup allows for internal syncs.
89 */
90int vfs_update_wakeup;
91
92
93/*
94 * buffers base kva
95 */
96
97/*
98 * bogus page -- for I/O to/from partially complete buffers
99 * This is a temporary solution to the problem, but it is not
100 * really that bad.  It would be better to split the buffer
101 * for input in the case of buffers already partially in memory,
102 * but the code is intricate enough already.
103 */
104vm_page_t bogus_page;
105static vm_offset_t bogus_offset;
106
107static int bufspace, maxbufspace, vmiospace, maxvmiobufspace,
108	bufmallocspace, maxbufmallocspace;
109
110static struct bufhashhdr bufhashtbl[BUFHSZ], invalhash;
111static struct bqueues bufqueues[BUFFER_QUEUES];
112
113extern int vm_swap_size;
114
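/*
 * Cap on a buffer's b_usecount: getblk() bumps the count on each hit,
 * and getnewbuf() gives LRU buffers with a nonzero count extra passes
 * through the queue before recycling them.
 */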
115#define BUF_MAXUSE 16
116
117/*
118 * Initialize buffer headers and related structures.
119 */
120void
121bufinit()
122{
123	struct buf *bp;
124	int i;
125
126	TAILQ_INIT(&bswlist);
127	LIST_INIT(&invalhash);
128
129	/* first, make a null hash table */
130	for (i = 0; i < BUFHSZ; i++)
131		LIST_INIT(&bufhashtbl[i]);
132
133	/* next, make a null set of free lists */
134	for (i = 0; i < BUFFER_QUEUES; i++)
135		TAILQ_INIT(&bufqueues[i]);
136
137	/* finally, initialize each buffer header and stick on empty q */
138	for (i = 0; i < nbuf; i++) {
139		bp = &buf[i];
140		bzero(bp, sizeof *bp);
141		bp->b_flags = B_INVAL;	/* we're just an empty header */
142		bp->b_dev = NODEV;
143		bp->b_rcred = NOCRED;
144		bp->b_wcred = NOCRED;
145		bp->b_qindex = QUEUE_EMPTY;
146		bp->b_vnbufs.le_next = NOLIST;
147		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
148		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
149	}
150/*
151 * maxbufspace is currently calculated assuming that all filesystem
152 * blocks are 8K.  If you happen to use a 16K filesystem, the size of the buffer
153 * cache is still the same as it would be for 8K filesystems.  This
154 * keeps the size of the buffer cache "in check" for big block filesystems.
155 */
156	maxbufspace = (nbuf + 8) * DFLTBSIZE;
157/*
158 * reserve 1/3 of the buffers for metadata (VDIR) which might not be VMIO'ed
159 */
160	maxvmiobufspace = 2 * maxbufspace / 3;
161/*
162 * Limit the amount of malloc memory since it is wired permanently into
163 * the kernel space.  Even though this is accounted for in the buffer
164 * allocation, we don't want the malloced region to grow uncontrolled.
165 * The malloc scheme improves memory utilization significantly for the
166 * average (small) directory.
167 */
168	maxbufmallocspace = maxbufspace / 20;
169
170	bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
171	bogus_page = vm_page_alloc(kernel_object,
172			((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
173			VM_ALLOC_NORMAL);
174
175}
176
177/*
178 * Free the kva allocation for a buffer.
179 * Must be called only at splbio or higher,
180 * as this is the only locking for buffer_map.
181 */
182static void
183bfreekva(struct buf * bp)
184{
185	if (bp->b_kvasize == 0)
186		return;
187
188	vm_map_delete(buffer_map,
189		(vm_offset_t) bp->b_kvabase,
190		(vm_offset_t) bp->b_kvabase + bp->b_kvasize);
191
192	bp->b_kvasize = 0;
193
194}
195
196/*
197 * remove the buffer from the appropriate free list
198 */
199void
200bremfree(struct buf * bp)
201{
202	int s = splbio();
203
204	if (bp->b_qindex != QUEUE_NONE) {
205		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
206		bp->b_qindex = QUEUE_NONE;
207	} else {
208		panic("bremfree: removing a buffer when not on a queue");
209	}
210	splx(s);
211}
212
213/*
214 * Get a buffer with the specified data.  Look in the cache first.
215 */
216int
217bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
218    struct buf ** bpp)
219{
220	struct buf *bp;
221
222	bp = getblk(vp, blkno, size, 0, 0);
223	*bpp = bp;
224
225	/* if not found in cache, do some I/O */
226	if ((bp->b_flags & B_CACHE) == 0) {
227		if (curproc != NULL)
228			curproc->p_stats->p_ru.ru_inblock++;
229		bp->b_flags |= B_READ;
230		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
231		if (bp->b_rcred == NOCRED) {
232			if (cred != NOCRED)
233				crhold(cred);
234			bp->b_rcred = cred;
235		}
236		vfs_busy_pages(bp, 0);
237		VOP_STRATEGY(bp);
238		return (biowait(bp));
239	}
240	return (0);
241}
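/*
 * Typical use (sketch; variable names illustrative): a caller reads a
 * block, examines bp->b_data, and releases the buffer with brelse() --
 * including on error, since *bpp is set even when the I/O fails:
 *
 *	if ((error = bread(vp, lbn, bsize, NOCRED, &bp)) != 0) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	...use bp->b_data...
 *	brelse(bp);
 */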
242
243/*
244 * Operates like bread, but also starts asynchronous I/O on
245 * read-ahead blocks.
246 */
247int
248breadn(struct vnode * vp, daddr_t blkno, int size,
249    daddr_t * rablkno, int *rabsize,
250    int cnt, struct ucred * cred, struct buf ** bpp)
251{
252	struct buf *bp, *rabp;
253	int i;
254	int rv = 0, readwait = 0;
255
256	*bpp = bp = getblk(vp, blkno, size, 0, 0);
257
258	/* if not found in cache, do some I/O */
259	if ((bp->b_flags & B_CACHE) == 0) {
260		if (curproc != NULL)
261			curproc->p_stats->p_ru.ru_inblock++;
262		bp->b_flags |= B_READ;
263		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
264		if (bp->b_rcred == NOCRED) {
265			if (cred != NOCRED)
266				crhold(cred);
267			bp->b_rcred = cred;
268		}
269		vfs_busy_pages(bp, 0);
270		VOP_STRATEGY(bp);
271		++readwait;
272	}
273	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
274		if (inmem(vp, *rablkno))
275			continue;
276		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
277
278		if ((rabp->b_flags & B_CACHE) == 0) {
279			if (curproc != NULL)
280				curproc->p_stats->p_ru.ru_inblock++;
281			rabp->b_flags |= B_READ | B_ASYNC;
282			rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
283			if (rabp->b_rcred == NOCRED) {
284				if (cred != NOCRED)
285					crhold(cred);
286				rabp->b_rcred = cred;
287			}
288			vfs_busy_pages(rabp, 0);
289			VOP_STRATEGY(rabp);
290		} else {
291			brelse(rabp);
292		}
293	}
294
295	if (readwait) {
296		rv = biowait(bp);
297	}
298	return (rv);
299}
300
301/*
302 * Write, release buffer on completion.  (Done by iodone
303 * if async.)
304 */
305int
306bwrite(struct buf * bp)
307{
308	int oldflags = bp->b_flags;
309
310	if (bp->b_flags & B_INVAL) {
311		brelse(bp);
312		return (0);
313	}
314	if (!(bp->b_flags & B_BUSY))
315		panic("bwrite: buffer is not busy???");
316
317	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
318	bp->b_flags |= B_WRITEINPROG;
319
320	if ((oldflags & (B_ASYNC|B_DELWRI)) == (B_ASYNC|B_DELWRI)) {
321		reassignbuf(bp, bp->b_vp);
322	}
323
324	bp->b_vp->v_numoutput++;
325	vfs_busy_pages(bp, 1);
326	if (curproc != NULL)
327		curproc->p_stats->p_ru.ru_oublock++;
328	VOP_STRATEGY(bp);
329
330	/*
331	 * Handle ordered writes here.
332	 * If the write was originally flagged as ordered,
333	 * then we check to see if it was converted to async.
334	 * If it was converted to async, and is done now, then
335	 * we release the buffer.  Otherwise we clear the
336	 * ordered flag because it is not needed anymore.
337	 *
338 	 * Note that biodone has been modified so that it does
339	 * not release ordered buffers.  This allows us to have
340	 * a chance to determine whether or not the driver
341	 * has set the async flag in the strategy routine.  Otherwise
342	 * if biodone was not modified, then the buffer may have been
343	 * reused before we have had a chance to check the flag.
344	 */
345
346	if ((oldflags & B_ORDERED) == B_ORDERED) {
347		int s;
348		s = splbio();
349		if (bp->b_flags & B_ASYNC)  {
350			if ((bp->b_flags & B_DONE)) {
351				if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0)
352					brelse(bp);
353				else
354					bqrelse(bp);
355			}
356			splx(s);
357			return (0);
358		} else {
359			bp->b_flags &= ~B_ORDERED;
360		}
361		splx(s);
362	}
363
364	if ((oldflags & B_ASYNC) == 0) {
365		int rtval = biowait(bp);
366
367		if (oldflags & B_DELWRI) {
368			reassignbuf(bp, bp->b_vp);
369		}
370		brelse(bp);
371		return (rtval);
372	}
373	return (0);
374}
375
376int
377vn_bwrite(ap)
378	struct vop_bwrite_args *ap;
379{
380	return (bwrite(ap->a_bp));
381}
382
383/*
384 * Delayed write. (Buffer is marked dirty).
385 */
386void
387bdwrite(struct buf * bp)
388{
389
390	if ((bp->b_flags & B_BUSY) == 0) {
391		panic("bdwrite: buffer is not busy");
392	}
393	if (bp->b_flags & B_INVAL) {
394		brelse(bp);
395		return;
396	}
397	if (bp->b_flags & B_TAPE) {
398		bawrite(bp);
399		return;
400	}
401	bp->b_flags &= ~(B_READ|B_RELBUF);
402	if ((bp->b_flags & B_DELWRI) == 0) {
403		bp->b_flags |= B_DONE | B_DELWRI;
404		reassignbuf(bp, bp->b_vp);
405	}
406
407	/*
408	 * This bmap keeps the system from needing to do the bmap later,
409	 * perhaps when the system is attempting to do a sync.  Since it
410	 * is likely that the indirect block -- or whatever other data structure
411	 * the filesystem needs -- is still in memory now, it is a good
412	 * thing to do this.  Note also that if the pageout daemon is
413	 * requesting a sync, there might not be enough memory to do
414	 * the bmap then...  So, this is important to do.
415	 */
416	if( bp->b_lblkno == bp->b_blkno) {
417		VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
418	}
419
420	/*
421	 * Set the *dirty* buffer range based upon the VM system dirty pages.
422	 */
423	vfs_setdirty(bp);
424
425	/*
426	 * We need to do this here to satisfy the vnode_pager and the
427	 * pageout daemon, so that they think that the pages have been
428	 * "cleaned".  Note that since the pages are in a delayed write
429	 * buffer, the VFS layer will see that the pages get written
430	 * out on the next sync, or perhaps the cluster will be completed.
431	 */
432	vfs_clean_pages(bp);
433	bqrelse(bp);
434	return;
435}
436
437/*
438 * Asynchronous write.
439 * Start output on a buffer, but do not wait for it to complete.
440 * The buffer is released when the output completes.
441 */
442void
443bawrite(struct buf * bp)
444{
445	bp->b_flags |= B_ASYNC;
446	(void) VOP_BWRITE(bp);
447}
448
449/*
450 * Ordered write.
451 * Start output on a buffer, but only wait for it to complete if the
452 * output device cannot guarantee ordering in some other way.  Devices
453 * that can perform asynchronous ordered writes will set the B_ASYNC
454 * flag in their strategy routine.
455 * The buffer is released when the output completes.
456 */
457int
458bowrite(struct buf * bp)
459{
460	bp->b_flags |= B_ORDERED;
461	return (VOP_BWRITE(bp));
462}
463
464/*
465 * Release a buffer.
466 */
467void
468brelse(struct buf * bp)
469{
470	int s;
471
472	if (bp->b_flags & B_CLUSTER) {
473		relpbuf(bp);
474		return;
475	}
476	/* anyone need a "free" block? */
477	s = splbio();
478
479	/* anyone need this block? */
480	if (bp->b_flags & B_WANTED) {
481		bp->b_flags &= ~(B_WANTED | B_AGE);
482		wakeup(bp);
483	}
484
485	if (bp->b_flags & B_LOCKED)
486		bp->b_flags &= ~B_ERROR;
487
488	if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
489	    (bp->b_bufsize <= 0)) {
490		bp->b_flags |= B_INVAL;
491		bp->b_flags &= ~(B_DELWRI | B_CACHE);
492		if (((bp->b_flags & B_VMIO) == 0) && bp->b_vp) {
493			if (bp->b_bufsize)
494				allocbuf(bp, 0);
495			brelvp(bp);
496		}
497	}
498
499	/*
500	 * VMIO buffer rundown.  It is not strictly necessary to keep a VMIO buffer
501	 * constituted, so the B_INVAL flag is used to *invalidate* the buffer,
502	 * but the VM object is kept around.  The B_NOCACHE flag is used to
503	 * invalidate the pages in the VM object.
504	 */
505	if (bp->b_flags & B_VMIO) {
506		vm_ooffset_t foff;
507		vm_object_t obj;
508		int i, resid;
509		vm_page_t m;
510		struct vnode *vp;
511		int iototal = bp->b_bufsize;
512
513		vp = bp->b_vp;
514		if (!vp)
515			panic("brelse: missing vp");
516
517		if (bp->b_npages) {
518			vm_pindex_t poff;
519			obj = (vm_object_t) vp->v_object;
520			if (vp->v_type == VBLK)
521				foff = ((vm_ooffset_t) bp->b_lblkno) << DEV_BSHIFT;
522			else
523				foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
524			poff = OFF_TO_IDX(foff);
525			for (i = 0; i < bp->b_npages; i++) {
526				m = bp->b_pages[i];
527				if (m == bogus_page) {
528					m = vm_page_lookup(obj, poff + i);
529					if (!m) {
530						panic("brelse: page missing\n");
531					}
532					bp->b_pages[i] = m;
533					pmap_qenter(trunc_page(bp->b_data),
534						bp->b_pages, bp->b_npages);
535				}
536				resid = IDX_TO_OFF(m->pindex+1) - foff;
537				if (resid > iototal)
538					resid = iototal;
539				if (resid > 0) {
540					/*
541					 * Don't invalidate the page if the local machine has already
542					 * modified it.  This is the lesser of two evils, and should
543					 * be fixed.
544					 */
545					if (bp->b_flags & (B_NOCACHE | B_ERROR)) {
546						vm_page_test_dirty(m);
547						if (m->dirty == 0) {
548							vm_page_set_invalid(m, (vm_offset_t) foff, resid);
549							if (m->valid == 0)
550								vm_page_protect(m, VM_PROT_NONE);
551						}
552					}
553					if (resid >= PAGE_SIZE) {
554						if ((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) {
555							bp->b_flags |= B_INVAL;
556						}
557					} else {
558						if (!vm_page_is_valid(m,
559							(((vm_offset_t) bp->b_data) & PAGE_MASK), resid)) {
560							bp->b_flags |= B_INVAL;
561						}
562					}
563				}
564				foff += resid;
565				iototal -= resid;
566			}
567		}
568		if (bp->b_flags & (B_INVAL | B_RELBUF))
569			vfs_vmio_release(bp);
570	}
571	if (bp->b_qindex != QUEUE_NONE)
572		panic("brelse: free buffer onto another queue???");
573
574	/* enqueue */
575	/* buffers with no memory */
576	if (bp->b_bufsize == 0) {
577		bp->b_qindex = QUEUE_EMPTY;
578		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
579		LIST_REMOVE(bp, b_hash);
580		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
581		bp->b_dev = NODEV;
582		/*
583		 * Get rid of the kva allocation *now*
584		 */
585		bfreekva(bp);
586		if (needsbuffer) {
587			wakeup(&needsbuffer);
588			needsbuffer=0;
589		}
590		/* buffers with junk contents */
591	} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
592		bp->b_qindex = QUEUE_AGE;
593		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
594		LIST_REMOVE(bp, b_hash);
595		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
596		bp->b_dev = NODEV;
597		if (needsbuffer) {
598			wakeup(&needsbuffer);
599			needsbuffer=0;
600		}
601		/* buffers that are locked */
602	} else if (bp->b_flags & B_LOCKED) {
603		bp->b_qindex = QUEUE_LOCKED;
604		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
605		/* buffers with stale but valid contents */
606	} else if (bp->b_flags & B_AGE) {
607		bp->b_qindex = QUEUE_AGE;
608		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
609		if (needsbuffer) {
610			wakeup(&needsbuffer);
611			needsbuffer=0;
612		}
613		/* buffers with valid and quite potentially reusable contents */
614	} else {
615		bp->b_qindex = QUEUE_LRU;
616		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
617		if (needsbuffer) {
618			wakeup(&needsbuffer);
619			needsbuffer=0;
620		}
621	}
622
623	/* unlock */
624	bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
625				B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
626	splx(s);
627}
628
629/*
630 * Release a buffer without invalidation, requeueing it on the appropriate free list.
631 */
632void
633bqrelse(struct buf * bp)
634{
635	int s;
636
637	s = splbio();
638
639
640	/* anyone need this block? */
641	if (bp->b_flags & B_WANTED) {
642		bp->b_flags &= ~(B_WANTED | B_AGE);
643		wakeup(bp);
644	}
645
646	if (bp->b_qindex != QUEUE_NONE)
647		panic("bqrelse: free buffer onto another queue???");
648
649	if (bp->b_flags & B_LOCKED) {
650		bp->b_flags &= ~B_ERROR;
651		bp->b_qindex = QUEUE_LOCKED;
652		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
653		/* buffers with stale but valid contents */
654	} else {
655		bp->b_qindex = QUEUE_LRU;
656		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
657		if (needsbuffer) {
658			wakeup(&needsbuffer);
659			needsbuffer=0;
660		}
661	}
662
663	/* unlock */
664	bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
665		B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
666	splx(s);
667}
668
669static void
670vfs_vmio_release(bp)
671	struct buf *bp;
672{
673	int i;
674	vm_page_t m;
675
676	for (i = 0; i < bp->b_npages; i++) {
677		m = bp->b_pages[i];
678		bp->b_pages[i] = NULL;
679		vm_page_unwire(m);
680		/*
681		 * We don't mess with busy pages; it is
682		 * the responsibility of the process that
683		 * busied the pages to deal with them.
684		 */
685		if ((m->flags & PG_BUSY) || (m->busy != 0))
686			continue;
687
688		if (m->wire_count == 0) {
689
690			if (m->flags & PG_WANTED) {
691				m->flags &= ~PG_WANTED;
692				wakeup(m);
693			}
694
695			/*
696			 * If this is an async free, we cannot place
697			 * pages onto the cache queue, so we don't
698			 * modify any queues at all in that case.
699			 * This is probably in error (for perf reasons),
700			 * and we will eventually need to build
701			 * a more complete infrastructure to support I/O
702			 * rundown.
703			 */
704			if ((bp->b_flags & B_ASYNC) == 0) {
705
706			/*
707			 * In the case of sync buffer frees, we can do pretty much
708			 * anything to any of the memory queues.  Specifically,
709			 * it is okay to modify the cache queue.
710			 */
711				if (m->valid) {
712					if(m->dirty == 0)
713						vm_page_test_dirty(m);
714					/*
715					 * this keeps pressure off of the process memory
716					 */
717					if (m->dirty == 0 && m->hold_count == 0)
718						vm_page_cache(m);
719					else
720						vm_page_deactivate(m);
721				} else if (m->hold_count == 0) {
722					vm_page_protect(m, VM_PROT_NONE);
723					vm_page_free(m);
724				}
725			} else {
726				/*
727				 * If async, then at least we clear the
728				 * act_count.
729				 */
730				m->act_count = 0;
731			}
732		}
733	}
734	bufspace -= bp->b_bufsize;
735	vmiospace -= bp->b_bufsize;
736	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
737	bp->b_npages = 0;
738	bp->b_bufsize = 0;
739	bp->b_flags &= ~B_VMIO;
740	if (bp->b_vp)
741		brelvp(bp);
742}
743
744/*
745 * Check to see if a block is currently memory resident.
746 */
747struct buf *
748gbincore(struct vnode * vp, daddr_t blkno)
749{
750	struct buf *bp;
751	struct bufhashhdr *bh;
752
753	bh = BUFHASH(vp, blkno);
754	bp = bh->lh_first;
755
756	/* Search hash chain */
757	while (bp != NULL) {
758		/* hit */
759		if (bp->b_vp == vp && bp->b_lblkno == blkno &&
760		    (bp->b_flags & B_INVAL) == 0) {
761			break;
762		}
763		bp = bp->b_hash.le_next;
764	}
765	return (bp);
766}
767
768/*
769 * this routine implements clustered async writes for
770 * clearing out B_DELWRI buffers...  This is much better
771 * than the old way of writing only one buffer at a time.
772 */
773int
774vfs_bio_awrite(struct buf * bp)
775{
776	int i;
777	daddr_t lblkno = bp->b_lblkno;
778	struct vnode *vp = bp->b_vp;
779	int s;
780	int ncl;
781	struct buf *bpa;
782	int nwritten;
783
784	s = splbio();
785	/*
786	 * right now we support clustered writing only to regular files
787	 */
788	if ((vp->v_type == VREG) &&
789	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
790	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
791		int size;
792		int maxcl;
793
794		size = vp->v_mount->mnt_stat.f_iosize;
795		maxcl = MAXPHYS / size;
796
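		/*
		 * Scan forward from the next logical block, collecting buffers
		 * that are delayed-write, cluster-eligible, not busy, the same
		 * size, and physically contiguous on disk; stop at the first gap.
		 */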
797		for (i = 1; i < maxcl; i++) {
798			if ((bpa = gbincore(vp, lblkno + i)) &&
799			    ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
800			    (B_DELWRI | B_CLUSTEROK)) &&
801			    (bpa->b_bufsize == size)) {
802				if ((bpa->b_blkno == bpa->b_lblkno) ||
803				    (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
804					break;
805			} else {
806				break;
807			}
808		}
809		ncl = i;
810		/*
811		 * this is a possible cluster write
812		 */
813		if (ncl != 1) {
814			nwritten = cluster_wbuild(vp, size, lblkno, ncl);
815			splx(s);
816			return nwritten;
817		}
818	}
819	bremfree(bp);
820	splx(s);
821	/*
822	 * default (old) behavior, writing out only one block
823	 */
824	bp->b_flags |= B_BUSY | B_ASYNC;
825	nwritten = bp->b_bufsize;
826	(void) VOP_BWRITE(bp);
827	return nwritten;
828}
829
830
831/*
832 * Find a buffer header which is available for use.
833 */
834static struct buf *
835getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
836{
837	struct buf *bp;
838	int nbyteswritten = 0;
839	vm_offset_t addr;
840
841start:
842	if (bufspace >= maxbufspace)
843		goto trytofreespace;
844
845	/* can we constitute a new buffer? */
846	if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]))) {
847		if (bp->b_qindex != QUEUE_EMPTY)
848			panic("getnewbuf: inconsistent EMPTY queue, qindex=%d",
849			    bp->b_qindex);
850		bp->b_flags |= B_BUSY;
851		bremfree(bp);
852		goto fillbuf;
853	}
854trytofreespace:
855	/*
856	 * We keep the file I/O from hogging metadata I/O.
857	 * This is desirable because file data is cached in the
858	 * VM/Buffer cache even if a buffer is freed.
859	 */
860	if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]))) {
861		if (bp->b_qindex != QUEUE_AGE)
862			panic("getnewbuf: inconsistent AGE queue, qindex=%d",
863			    bp->b_qindex);
864	} else if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]))) {
865		if (bp->b_qindex != QUEUE_LRU)
866			panic("getnewbuf: inconsistent LRU queue, qindex=%d",
867			    bp->b_qindex);
868	}
869	if (!bp) {
870		/* wait for a free buffer of any kind */
871		needsbuffer = 1;
872		tsleep(&needsbuffer,
873			(PRIBIO + 1) | slpflag, "newbuf", slptimeo);
874		return (0);
875	}
876
877#if defined(DIAGNOSTIC)
878	if (bp->b_flags & B_BUSY) {
879		panic("getnewbuf: busy buffer on free list\n");
880	}
881#endif
882
883	/*
884	 * We are fairly aggressive about freeing VMIO buffers, but since the
885	 * data remains cached in the VM object without buffer headers, there
886	 * is not much loss.  We gain by keeping non-VMIOed metadata in buffers.
887	 */
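	/*
	 * If the LRU candidate still has uses left, and either is not a
	 * VMIO buffer or VMIO space is not yet exhausted, give it another
	 * chance: decrement its use count, rotate it to the tail of the
	 * LRU queue, and start over (unless it was the only LRU buffer).
	 */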
888	if ((bp->b_qindex == QUEUE_LRU) && (bp->b_usecount > 0)) {
889		if ((bp->b_flags & B_VMIO) == 0 ||
890			(vmiospace < maxvmiobufspace)) {
891			--bp->b_usecount;
892			TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist);
893			if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) {
894				TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
895				goto start;
896			}
897			TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
898		}
899	}
900
901	/* if we are a delayed write, convert to an async write */
902	if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
903		nbyteswritten += vfs_bio_awrite(bp);
904		if (!slpflag && !slptimeo) {
905			return (0);
906		}
907		goto start;
908	}
909
910	if (bp->b_flags & B_WANTED) {
911		bp->b_flags &= ~B_WANTED;
912		wakeup(bp);
913	}
914	bremfree(bp);
915	bp->b_flags |= B_BUSY;
916
917	if (bp->b_flags & B_VMIO) {
918		bp->b_flags &= ~B_ASYNC;
919		vfs_vmio_release(bp);
920	}
921
922	if (bp->b_vp)
923		brelvp(bp);
924
925fillbuf:
926	/* we are not free, nor do we contain interesting data */
927	if (bp->b_rcred != NOCRED) {
928		crfree(bp->b_rcred);
929		bp->b_rcred = NOCRED;
930	}
931	if (bp->b_wcred != NOCRED) {
932		crfree(bp->b_wcred);
933		bp->b_wcred = NOCRED;
934	}
935
936	LIST_REMOVE(bp, b_hash);
937	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
938	if (bp->b_bufsize) {
939		allocbuf(bp, 0);
940	}
941	bp->b_flags = B_BUSY;
942	bp->b_dev = NODEV;
943	bp->b_vp = NULL;
944	bp->b_blkno = bp->b_lblkno = 0;
945	bp->b_iodone = 0;
946	bp->b_error = 0;
947	bp->b_resid = 0;
948	bp->b_bcount = 0;
949	bp->b_npages = 0;
950	bp->b_dirtyoff = bp->b_dirtyend = 0;
951	bp->b_validoff = bp->b_validend = 0;
952	bp->b_usecount = 4;
953
954	maxsize = (maxsize + PAGE_MASK) & ~PAGE_MASK;
955
956	/*
957	 * we assume that buffer_map is not at address 0
958	 */
959	addr = 0;
960	if (maxsize != bp->b_kvasize) {
961		bfreekva(bp);
962
963		/*
964		 * See if we have buffer kva space
965		 */
966		if (vm_map_findspace(buffer_map,
967			vm_map_min(buffer_map), maxsize, &addr)) {
968			bp->b_flags |= B_INVAL;
969			brelse(bp);
970			goto trytofreespace;
971		}
972	}
973
974	/*
975	 * See if we have exceeded our allocated buffer space
976	 */
977	if (bufspace >= (maxbufspace + nbyteswritten)) {
978		bp->b_flags |= B_INVAL;
979		brelse(bp);
980		goto trytofreespace;
981	}
982
983	/*
984	 * create a map entry for the buffer -- in essence
985	 * reserving the kva space.
986	 */
987	if (addr) {
988		vm_map_insert(buffer_map, NULL, 0,
989			addr, addr + maxsize,
990			VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
991
992		bp->b_kvabase = (caddr_t) addr;
993		bp->b_kvasize = maxsize;
994	}
995	bp->b_data = bp->b_kvabase;
996
997	return (bp);
998}
999
1000/*
1001 * Check to see if a block is currently memory resident.
1002 */
1003struct buf *
1004incore(struct vnode * vp, daddr_t blkno)
1005{
1006	struct buf *bp;
1007
1008	int s = splbio();
1009	bp = gbincore(vp, blkno);
1010	splx(s);
1011	return (bp);
1012}
1013
1014/*
1015 * Returns true if no I/O is needed to access the
1016 * associated VM object.  This is like incore except
1017 * it also hunts around in the VM system for the data.
1018 */
1019
1020int
1021inmem(struct vnode * vp, daddr_t blkno)
1022{
1023	vm_object_t obj;
1024	vm_offset_t toff, tinc;
1025	vm_page_t m;
1026	vm_ooffset_t off;
1027
1028	if (incore(vp, blkno))
1029		return 1;
1030	if (vp->v_mount == NULL)
1031		return 0;
1032	if ((vp->v_object == NULL) || (vp->v_flag & VVMIO) == 0)
1033		return 0;
1034
1035	obj = vp->v_object;
1036	tinc = PAGE_SIZE;
1037	if (tinc > vp->v_mount->mnt_stat.f_iosize)
1038		tinc = vp->v_mount->mnt_stat.f_iosize;
1039	off = blkno * vp->v_mount->mnt_stat.f_iosize;
1040
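	/*
	 * Every tinc-sized piece of the block must be resident and valid
	 * in the VM object for the block to be considered "in memory".
	 */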
1041	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
1042
1043		m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
1044		if (!m)
1045			return 0;
1046		if (vm_page_is_valid(m, (vm_offset_t) (toff + off), tinc) == 0)
1047			return 0;
1048	}
1049	return 1;
1050}
1051
1052/*
1053 * now we set the dirty range for the buffer --
1054 * for NFS -- if the file is mapped and pages have
1055 * been written to, let it know.  We want the
1056 * entire range of the buffer to be marked dirty if
1057 * any of the pages have been written to, for consistency
1058 * with the b_validoff, b_validend set in the nfs write
1059 * code, and used by the nfs read code.
1060 */
1061static void
1062vfs_setdirty(struct buf *bp) {
1063	int i;
1064	vm_object_t object;
1065	vm_offset_t boffset, offset;
1066	/*
1067	 * We qualify the scan for modified pages on whether the
1068	 * object has been flushed yet.  The OBJ_WRITEABLE flag
1069	 * is not cleared simply by protecting pages off.
1070	 */
1071	if ((bp->b_flags & B_VMIO) &&
1072		((object = bp->b_pages[0]->object)->flags & (OBJ_WRITEABLE|OBJ_CLEANING))) {
1073		/*
1074		 * test the pages to see if they have been modified directly
1075		 * by users through the VM system.
1076		 */
1077		for (i = 0; i < bp->b_npages; i++)
1078			vm_page_test_dirty(bp->b_pages[i]);
1079
1080		/*
1081		 * scan forwards for the first page modified
1082		 */
1083		for (i = 0; i < bp->b_npages; i++) {
1084			if (bp->b_pages[i]->dirty) {
1085				break;
1086			}
1087		}
1088		boffset = (i << PAGE_SHIFT);
1089		if (boffset < bp->b_dirtyoff) {
1090			bp->b_dirtyoff = boffset;
1091		}
1092
1093		/*
1094		 * scan backwards for the last page modified
1095		 */
1096		for (i = bp->b_npages - 1; i >= 0; --i) {
1097			if (bp->b_pages[i]->dirty) {
1098				break;
1099			}
1100		}
1101		boffset = (i + 1);
1102		offset = boffset + bp->b_pages[0]->pindex;
1103		if (offset >= object->size)
1104			boffset = object->size - bp->b_pages[0]->pindex;
1105		if (bp->b_dirtyend < (boffset << PAGE_SHIFT))
1106			bp->b_dirtyend = (boffset << PAGE_SHIFT);
1107	}
1108}
1109
1110/*
1111 * Get a block given a specified block and offset into a file/device.
1112 */
1113struct buf *
1114getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
1115{
1116	struct buf *bp;
1117	int s;
1118	struct bufhashhdr *bh;
1119	int maxsize;
1120
1121	if (vp->v_mount) {
1122		maxsize = vp->v_mount->mnt_stat.f_iosize;
1123		/*
1124		 * This happens on mount points.
1125		 */
1126		if (maxsize < size)
1127			maxsize = size;
1128	} else {
1129		maxsize = size;
1130	}
1131
1132	if (size > MAXBSIZE)
1133		panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
1134
1135	s = splbio();
1136loop:
1137	if ((bp = gbincore(vp, blkno))) {
1138		if (bp->b_flags & B_BUSY) {
1139			bp->b_flags |= B_WANTED;
1140			if (bp->b_usecount < BUF_MAXUSE)
1141				++bp->b_usecount;
1142			if (!tsleep(bp,
1143				(PRIBIO + 1) | slpflag, "getblk", slptimeo))
1144				goto loop;
1145
1146			splx(s);
1147			return (struct buf *) NULL;
1148		}
1149		bp->b_flags |= B_BUSY | B_CACHE;
1150		bremfree(bp);
1151
1152		/*
1153		 * check for size inconsistencies (note that they shouldn't happen,
1154		 * but do when filesystems don't handle size changes correctly).
1155		 * We are conservative on metadata and don't just extend the buffer
1156		 * but write and re-constitute it.
1157		 */
1158
1159		if (bp->b_bcount != size) {
1160			if ((bp->b_flags & B_VMIO) && (size <= bp->b_kvasize)) {
1161				allocbuf(bp, size);
1162			} else {
1163				bp->b_flags |= B_NOCACHE;
1164				VOP_BWRITE(bp);
1165				goto loop;
1166			}
1167		}
1168
1169		if (bp->b_usecount < BUF_MAXUSE)
1170			++bp->b_usecount;
1171		splx(s);
1172		return (bp);
1173	} else {
1174		vm_object_t obj;
1175
1176		if ((bp = getnewbuf(slpflag, slptimeo, size, maxsize)) == 0) {
1177			if (slpflag || slptimeo) {
1178				splx(s);
1179				return NULL;
1180			}
1181			goto loop;
1182		}
1183
1184		/*
1185		 * This code is used to make sure that a buffer is not
1186		 * created while the getnewbuf routine is blocked.
1187		 * Normally the vnode is locked so this isn't a problem.
1188		 * VBLK type I/O requests, however, don't lock the vnode.
1189		 */
1190		if (!VOP_ISLOCKED(vp) && gbincore(vp, blkno)) {
1191			bp->b_flags |= B_INVAL;
1192			brelse(bp);
1193			goto loop;
1194		}
1195
1196		/*
1197		 * Insert the buffer into the hash, so that it can
1198		 * be found by incore.
1199		 */
1200		bp->b_blkno = bp->b_lblkno = blkno;
1201		bgetvp(vp, bp);
1202		LIST_REMOVE(bp, b_hash);
1203		bh = BUFHASH(vp, blkno);
1204		LIST_INSERT_HEAD(bh, bp, b_hash);
1205
1206		if ((obj = vp->v_object) && (vp->v_flag & VVMIO)) {
1207			bp->b_flags |= (B_VMIO | B_CACHE);
1208#if defined(VFS_BIO_DEBUG)
1209			if (vp->v_type != VREG && vp->v_type != VBLK)
1210				printf("getblk: vmioing file type %d???\n", vp->v_type);
1211#endif
1212		} else {
1213			bp->b_flags &= ~B_VMIO;
1214		}
1215		splx(s);
1216
1217		allocbuf(bp, size);
1218#ifdef	PC98
1219		/*
1220		 * 1024byte/sector support
1221		 */
1222#define B_XXX2 0x8000000
1223		if (vp->v_flag & 0x10000) bp->b_flags |= B_XXX2;
1224#endif
1225		return (bp);
1226	}
1227}
1228
1229/*
1230 * Get an empty, disassociated buffer of given size.
1231 */
1232struct buf *
1233geteblk(int size)
1234{
1235	struct buf *bp;
1236	int s;
1237
1238	s = splbio();
1239	while ((bp = getnewbuf(0, 0, size, MAXBSIZE)) == 0);
1240	splx(s);
1241	allocbuf(bp, size);
1242	bp->b_flags |= B_INVAL;
1243	return (bp);
1244}
1245
1246
1247/*
1248 * This code constitutes the buffer memory from either anonymous system
1249 * memory (in the case of non-VMIO operations) or from an associated
1250 * VM object (in the case of VMIO operations).
1251 *
1252 * Note that this code is tricky, and has many complications to resolve
1253 * deadlock or inconsistent data situations.  Tread lightly!!!
1254 *
1255 * Modify the length of a buffer's underlying buffer storage without
1256 * destroying information (unless, of course the buffer is shrinking).
1257 * destroying information (unless, of course, the buffer is shrinking).
1258int
1259allocbuf(struct buf * bp, int size)
1260{
1261
1262	int s;
1263	int newbsize, mbsize;
1264	int i;
1265
1266	if (!(bp->b_flags & B_BUSY))
1267		panic("allocbuf: buffer not busy");
1268
1269	if (bp->b_kvasize < size)
1270		panic("allocbuf: buffer too small");
1271
1272	if ((bp->b_flags & B_VMIO) == 0) {
1273		caddr_t origbuf;
1274		int origbufsize;
1275		/*
1276		 * Just get anonymous memory from the kernel
1277		 */
1278		mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
1279#if !defined(NO_B_MALLOC)
1280		if (bp->b_flags & B_MALLOC)
1281			newbsize = mbsize;
1282		else
1283#endif
1284			newbsize = round_page(size);
1285
1286		if (newbsize < bp->b_bufsize) {
1287#if !defined(NO_B_MALLOC)
1288			/*
1289			 * malloced buffers are not shrunk
1290			 */
1291			if (bp->b_flags & B_MALLOC) {
1292				if (newbsize) {
1293					bp->b_bcount = size;
1294				} else {
1295					free(bp->b_data, M_BIOBUF);
1296					bufspace -= bp->b_bufsize;
1297					bufmallocspace -= bp->b_bufsize;
1298					bp->b_data = bp->b_kvabase;
1299					bp->b_bufsize = 0;
1300					bp->b_bcount = 0;
1301					bp->b_flags &= ~B_MALLOC;
1302				}
1303				return 1;
1304			}
1305#endif
1306			vm_hold_free_pages(
1307			    bp,
1308			    (vm_offset_t) bp->b_data + newbsize,
1309			    (vm_offset_t) bp->b_data + bp->b_bufsize);
1310		} else if (newbsize > bp->b_bufsize) {
1311#if !defined(NO_B_MALLOC)
1312			/*
1313			 * We only use malloced memory on the first allocation,
1314			 * and revert to page-allocated memory when the buffer grows.
1315			 */
1316			if ( (bufmallocspace < maxbufmallocspace) &&
1317				(bp->b_bufsize == 0) &&
1318				(mbsize <= PAGE_SIZE/2)) {
1319
1320				bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
1321				bp->b_bufsize = mbsize;
1322				bp->b_bcount = size;
1323				bp->b_flags |= B_MALLOC;
1324				bufspace += mbsize;
1325				bufmallocspace += mbsize;
1326				return 1;
1327			}
1328#endif
1329			origbuf = NULL;
1330			origbufsize = 0;
1331#if !defined(NO_B_MALLOC)
1332			/*
1333			 * If the buffer is growing on its other-than-first allocation,
1334			 * then we revert to the page-allocation scheme.
1335			 */
1336			if (bp->b_flags & B_MALLOC) {
1337				origbuf = bp->b_data;
1338				origbufsize = bp->b_bufsize;
1339				bp->b_data = bp->b_kvabase;
1340				bufspace -= bp->b_bufsize;
1341				bufmallocspace -= bp->b_bufsize;
1342				bp->b_bufsize = 0;
1343				bp->b_flags &= ~B_MALLOC;
1344				newbsize = round_page(newbsize);
1345			}
1346#endif
1347			vm_hold_load_pages(
1348			    bp,
1349			    (vm_offset_t) bp->b_data + bp->b_bufsize,
1350			    (vm_offset_t) bp->b_data + newbsize);
1351#if !defined(NO_B_MALLOC)
1352			if (origbuf) {
1353				bcopy(origbuf, bp->b_data, origbufsize);
1354				free(origbuf, M_BIOBUF);
1355			}
1356#endif
1357		}
1358	} else {
1359		vm_page_t m;
1360		int desiredpages;
1361
1362		newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
1363		desiredpages = (round_page(newbsize) >> PAGE_SHIFT);
1364
1365#if !defined(NO_B_MALLOC)
1366		if (bp->b_flags & B_MALLOC)
1367			panic("allocbuf: VMIO buffer can't be malloced");
1368#endif
1369
1370		if (newbsize < bp->b_bufsize) {
1371			if (desiredpages < bp->b_npages) {
1372				for (i = desiredpages; i < bp->b_npages; i++) {
1373					/*
1374					 * the page is not freed here -- it
1375					 * is the responsibility of vnode_pager_setsize
1376					 */
1377					m = bp->b_pages[i];
1378#if defined(DIAGNOSTIC)
1379					if (m == bogus_page)
1380						panic("allocbuf: bogus page found");
1381#endif
1382					s = splvm();
1383					while ((m->flags & PG_BUSY) || (m->busy != 0)) {
1384						m->flags |= PG_WANTED;
1385						tsleep(m, PVM, "biodep", 0);
1386					}
1387					splx(s);
1388
1389					bp->b_pages[i] = NULL;
1390					vm_page_unwire(m);
1391				}
1392				pmap_qremove((vm_offset_t) trunc_page(bp->b_data) +
1393				    (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
1394				bp->b_npages = desiredpages;
1395			}
1396		} else if (newbsize > bp->b_bufsize) {
1397			vm_object_t obj;
1398			vm_offset_t tinc, toff;
1399			vm_ooffset_t off;
1400			vm_pindex_t objoff;
1401			int pageindex, curbpnpages;
1402			struct vnode *vp;
1403			int bsize;
1404
1405			vp = bp->b_vp;
1406
1407			if (vp->v_type == VBLK)
1408				bsize = DEV_BSIZE;
1409			else
1410				bsize = vp->v_mount->mnt_stat.f_iosize;
1411
1412			if (bp->b_npages < desiredpages) {
1413				obj = vp->v_object;
1414				tinc = PAGE_SIZE;
1415				if (tinc > bsize)
1416					tinc = bsize;
1417				off = (vm_ooffset_t) bp->b_lblkno * bsize;
1418				curbpnpages = bp->b_npages;
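				/*
				 * Fill in b_pages[] for the grown range: keep pages already
				 * in the buffer, otherwise look them up in (or allocate them
				 * into) the backing VM object, wiring each one.  If we have
				 * to sleep for memory or for a busy page, retry from doretry
				 * since the situation may have changed.
				 */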
1419		doretry:
1420				bp->b_flags |= B_CACHE;
1421				for (toff = 0; toff < newbsize; toff += tinc) {
1422					int bytesinpage;
1423
1424					pageindex = toff >> PAGE_SHIFT;
1425					objoff = OFF_TO_IDX(off + toff);
1426					if (pageindex < curbpnpages) {
1427
1428						m = bp->b_pages[pageindex];
1429#ifdef VFS_BIO_DIAG
1430						if (m->pindex != objoff)
1431							panic("allocbuf: page changed offset??!!!?");
1432#endif
1433						bytesinpage = tinc;
1434						if (tinc > (newbsize - toff))
1435							bytesinpage = newbsize - toff;
1436						if ((bp->b_flags & B_CACHE) &&
1437							!vm_page_is_valid(m,
1438							(vm_offset_t) ((toff + off) & PAGE_MASK),
1439							bytesinpage)) {
1440							bp->b_flags &= ~B_CACHE;
1441						}
1442						continue;
1443					}
1444					m = vm_page_lookup(obj, objoff);
1445					if (!m) {
1446						m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
1447						if (!m) {
1448							VM_WAIT;
1449							goto doretry;
1450						}
1451						/*
1452						 * Normally it is unwise to clear PG_BUSY without
1453						 * PAGE_WAKEUP -- but it is okay here, as there is
1454						 * no chance for blocking between here and vm_page_alloc
1455						 */
1456						m->flags &= ~PG_BUSY;
1457						vm_page_wire(m);
1458						bp->b_flags &= ~B_CACHE;
1459					} else if (m->flags & PG_BUSY) {
1460						s = splvm();
1461						if (m->flags & PG_BUSY) {
1462							m->flags |= PG_WANTED;
1463							tsleep(m, PVM, "pgtblk", 0);
1464						}
1465						splx(s);
1466						goto doretry;
1467					} else {
1468						if ((curproc != pageproc) &&
1469							((m->queue - m->pc) == PQ_CACHE) &&
1470						    ((cnt.v_free_count + cnt.v_cache_count) <
1471								(cnt.v_free_min + cnt.v_cache_min))) {
1472							pagedaemon_wakeup();
1473						}
1474						bytesinpage = tinc;
1475						if (tinc > (newbsize - toff))
1476							bytesinpage = newbsize - toff;
1477						if ((bp->b_flags & B_CACHE) &&
1478							!vm_page_is_valid(m,
1479							(vm_offset_t) ((toff + off) & PAGE_MASK),
1480							bytesinpage)) {
1481							bp->b_flags &= ~B_CACHE;
1482						}
1483						vm_page_wire(m);
1484					}
1485					bp->b_pages[pageindex] = m;
1486					curbpnpages = pageindex + 1;
1487				}
1488				bp->b_data = (caddr_t) trunc_page(bp->b_data);
1489				bp->b_npages = curbpnpages;
1490				pmap_qenter((vm_offset_t) bp->b_data,
1491					bp->b_pages, bp->b_npages);
1492				((vm_offset_t) bp->b_data) |= off & PAGE_MASK;
1493			}
1494		}
1495	}
1496	if (bp->b_flags & B_VMIO)
1497		vmiospace += bp->b_bufsize;
1498	bufspace += (newbsize - bp->b_bufsize);
1499	bp->b_bufsize = newbsize;
1500	bp->b_bcount = size;
1501	return 1;
1502}
1503
1504/*
1505 * Wait for buffer I/O completion, returning error status.
1506 */
1507int
1508biowait(register struct buf * bp)
1509{
1510	int s;
1511
1512	s = splbio();
1513	while ((bp->b_flags & B_DONE) == 0)
1514		tsleep(bp, PRIBIO, "biowait", 0);
1515	splx(s);
1516	if (bp->b_flags & B_EINTR) {
1517		bp->b_flags &= ~B_EINTR;
1518		return (EINTR);
1519	}
1520	if (bp->b_flags & B_ERROR) {
1521		return (bp->b_error ? bp->b_error : EIO);
1522	} else {
1523		return (0);
1524	}
1525}
1526
1527/*
1528 * Finish I/O on a buffer, calling an optional function.
1529 * This is usually called from interrupt level, so process blocking
1530 * is not *a good idea*.
1531 */
1532void
1533biodone(register struct buf * bp)
1534{
1535	int s;
1536
1537	s = splbio();
1538	if (!(bp->b_flags & B_BUSY))
1539		panic("biodone: buffer not busy");
1540
1541	if (bp->b_flags & B_DONE) {
1542		splx(s);
1543		printf("biodone: buffer already done\n");
1544		return;
1545	}
1546	bp->b_flags |= B_DONE;
1547
1548	if ((bp->b_flags & B_READ) == 0) {
1549		vwakeup(bp);
1550	}
1551#ifdef BOUNCE_BUFFERS
1552	if (bp->b_flags & B_BOUNCE)
1553		vm_bounce_free(bp);
1554#endif
1555
1556	/* call optional completion function if requested */
1557	if (bp->b_flags & B_CALL) {
1558		bp->b_flags &= ~B_CALL;
1559		(*bp->b_iodone) (bp);
1560		splx(s);
1561		return;
1562	}
1563	if (bp->b_flags & B_VMIO) {
1564		int i, resid;
1565		vm_ooffset_t foff;
1566		vm_page_t m;
1567		vm_object_t obj;
1568		int iosize;
1569		struct vnode *vp = bp->b_vp;
1570
1571		if (vp->v_type == VBLK)
1572			foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
1573		else
1574			foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1575		obj = vp->v_object;
1576		if (!obj) {
1577			panic("biodone: no object");
1578		}
1579#if defined(VFS_BIO_DEBUG)
1580		if (obj->paging_in_progress < bp->b_npages) {
1581			printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
1582			    obj->paging_in_progress, bp->b_npages);
1583		}
1584#endif
1585		iosize = bp->b_bufsize;
1586		for (i = 0; i < bp->b_npages; i++) {
1587			int bogusflag = 0;
1588			m = bp->b_pages[i];
1589			if (m == bogus_page) {
1590				bogusflag = 1;
1591				m = vm_page_lookup(obj, OFF_TO_IDX(foff));
1592				if (!m) {
1593#if defined(VFS_BIO_DEBUG)
1594					printf("biodone: page disappeared\n");
1595#endif
1596					--obj->paging_in_progress;
1597					continue;
1598				}
1599				bp->b_pages[i] = m;
1600				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1601			}
1602#if defined(VFS_BIO_DEBUG)
1603			if (OFF_TO_IDX(foff) != m->pindex) {
1604				printf("biodone: foff(%d)/m->pindex(%d) mismatch\n", foff, m->pindex);
1605			}
1606#endif
1607			resid = IDX_TO_OFF(m->pindex + 1) - foff;
1608			if (resid > iosize)
1609				resid = iosize;
1610			/*
1611			 * In the write case, the valid and clean bits are
1612			 * already changed correctly, so we only need to do this
1613			 * here in the read case.
1614			 */
1615			if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
1616				vm_page_set_validclean(m,
1617					(vm_offset_t) (foff & PAGE_MASK), resid);
1618			}
1619
1620			/*
1621			 * when debugging new filesystems or buffer I/O methods, this
1622			 * is the most common error that pops up.  if you see this, you
1623			 * have not set the page busy flag correctly!!!
1624			 */
1625			if (m->busy == 0) {
1626				printf("biodone: page busy < 0, "
1627				    "pindex: %d, foff: 0x(%x,%x), "
1628				    "resid: %d, index: %d\n",
1629				    (int) m->pindex, (int)(foff >> 32),
1630						(int) foff & 0xffffffff, resid, i);
1631				if (vp->v_type != VBLK)
1632					printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n",
1633					    bp->b_vp->v_mount->mnt_stat.f_iosize,
1634					    (int) bp->b_lblkno,
1635					    bp->b_flags, bp->b_npages);
1636				else
1637					printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n",
1638					    (int) bp->b_lblkno,
1639					    bp->b_flags, bp->b_npages);
1640				printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n",
1641				    m->valid, m->dirty, m->wire_count);
1642				panic("biodone: page busy < 0\n");
1643			}
1644			--m->busy;
1645			if ((m->busy == 0) && (m->flags & PG_WANTED)) {
1646				m->flags &= ~PG_WANTED;
1647				wakeup(m);
1648			}
1649			--obj->paging_in_progress;
1650			foff += resid;
1651			iosize -= resid;
1652		}
1653		if (obj && obj->paging_in_progress == 0 &&
1654		    (obj->flags & OBJ_PIPWNT)) {
1655			obj->flags &= ~OBJ_PIPWNT;
1656			wakeup(obj);
1657		}
1658	}
1659	/*
1660	 * For asynchronous completions, release the buffer now.  brelse
1661	 * checks for B_WANTED and will do the wakeup there if necessary, so
1662	 * there is no need to do a wakeup here in the async case.
1663	 */
1664
1665	if (bp->b_flags & B_ASYNC) {
1666		if ((bp->b_flags & B_ORDERED) == 0) {
1667			if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0)
1668				brelse(bp);
1669			else
1670				bqrelse(bp);
1671		}
1672	} else {
1673		bp->b_flags &= ~B_WANTED;
1674		wakeup(bp);
1675	}
1676	splx(s);
1677}
1678
1679int
1680count_lock_queue()
1681{
1682	int count;
1683	struct buf *bp;
1684
1685	count = 0;
1686	for (bp = TAILQ_FIRST(&bufqueues[QUEUE_LOCKED]);
1687	    bp != NULL;
1688	    bp = TAILQ_NEXT(bp, b_freelist))
1689		count++;
1690	return (count);
1691}
1692
1693int vfs_update_interval = 30;
1694
1695static void
1696vfs_update()
1697{
1698	while (1) {
1699		tsleep(&vfs_update_wakeup, PUSER, "update",
1700		    hz * vfs_update_interval);
1701		vfs_update_wakeup = 0;
1702		sync(curproc, NULL, NULL);
1703	}
1704}
1705
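/*
 * Sysctl handler for the update interval: after the value is changed,
 * poke the update daemon so the new interval takes effect immediately.
 */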
1706static int
1707sysctl_kern_updateinterval SYSCTL_HANDLER_ARGS
1708{
1709	int error = sysctl_handle_int(oidp,
1710		oidp->oid_arg1, oidp->oid_arg2, req);
1711	if (!error)
1712		wakeup(&vfs_update_wakeup);
1713	return error;
1714}
1715
1716SYSCTL_PROC(_kern, KERN_UPDATEINTERVAL, update, CTLTYPE_INT|CTLFLAG_RW,
1717	&vfs_update_interval, 0, sysctl_kern_updateinterval, "I", "");
1718
1719
1720/*
1721 * This routine is called in lieu of iodone in the case of
1722 * incomplete I/O.  This keeps the busy status for pages
1723 * consistent.
1724 */
1725void
1726vfs_unbusy_pages(struct buf * bp)
1727{
1728	int i;
1729
1730	if (bp->b_flags & B_VMIO) {
1731		struct vnode *vp = bp->b_vp;
1732		vm_object_t obj = vp->v_object;
1733		vm_ooffset_t foff;
1734
1735		foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1736
1737		for (i = 0; i < bp->b_npages; i++) {
1738			vm_page_t m = bp->b_pages[i];
1739
1740			if (m == bogus_page) {
1741				m = vm_page_lookup(obj, OFF_TO_IDX(foff) + i);
1742				if (!m) {
1743					panic("vfs_unbusy_pages: page missing\n");
1744				}
1745				bp->b_pages[i] = m;
1746				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1747			}
1748			--obj->paging_in_progress;
1749			--m->busy;
1750			if ((m->busy == 0) && (m->flags & PG_WANTED)) {
1751				m->flags &= ~PG_WANTED;
1752				wakeup(m);
1753			}
1754		}
1755		if (obj->paging_in_progress == 0 &&
1756		    (obj->flags & OBJ_PIPWNT)) {
1757			obj->flags &= ~OBJ_PIPWNT;
1758			wakeup(obj);
1759		}
1760	}
1761}
1762
1763/*
1764 * This routine is called before a device strategy routine.
1765 * It is used to tell the VM system that paging I/O is in
1766 * progress, and treat the pages associated with the buffer
1767 * almost as being PG_BUSY.  Also the object paging_in_progress
1768 * flag is handled to make sure that the object doesn't become
1769 * inconsistent.
1770 */
1771void
1772vfs_busy_pages(struct buf * bp, int clear_modify)
1773{
1774	int i;
1775
1776	if (bp->b_flags & B_VMIO) {
1777		vm_object_t obj = bp->b_vp->v_object;
1778		vm_ooffset_t foff;
1779		int iocount = bp->b_bufsize;
1780
1781		if (bp->b_vp->v_type == VBLK)
1782			foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
1783		else
1784			foff = (vm_ooffset_t) bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1785		vfs_setdirty(bp);
1786		for (i = 0; i < bp->b_npages; i++) {
1787			vm_page_t m = bp->b_pages[i];
1788			int resid = IDX_TO_OFF(m->pindex + 1) - foff;
1789
1790			if (resid > iocount)
1791				resid = iocount;
1792			if ((bp->b_flags & B_CLUSTER) == 0) {
1793				obj->paging_in_progress++;
1794				m->busy++;
1795			}
1796			vm_page_protect(m, VM_PROT_NONE);
1797			if (clear_modify) {
1798				vm_page_set_validclean(m,
1799					(vm_offset_t) (foff & PAGE_MASK), resid);
1800			} else if (bp->b_bcount >= PAGE_SIZE) {
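				/*
				 * For reads, map bogus_page in place of pages that already
				 * contain valid data so the device transfer cannot clobber
				 * them; biodone()/brelse() look the real pages back up and
				 * restore the mapping afterward.
				 */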
1801				if (m->valid && (bp->b_flags & B_CACHE) == 0) {
1802					bp->b_pages[i] = bogus_page;
1803					pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1804				}
1805			}
1806			foff += resid;
1807			iocount -= resid;
1808		}
1809	}
1810}
1811
1812/*
1813 * Tell the VM system that the pages associated with this buffer
1814 * are clean.  This is used for delayed writes where the data is
1815 * going to go to disk eventually without additional VM intervention.
1816 */
1817void
1818vfs_clean_pages(struct buf * bp)
1819{
1820	int i;
1821
1822	if (bp->b_flags & B_VMIO) {
1823		vm_ooffset_t foff;
1824		int iocount = bp->b_bufsize;
1825
1826		if (bp->b_vp->v_type == VBLK)
1827			foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
1828		else
1829			foff = (vm_ooffset_t) bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1830
1831		for (i = 0; i < bp->b_npages; i++) {
1832			vm_page_t m = bp->b_pages[i];
1833			int resid = IDX_TO_OFF(m->pindex + 1) - foff;
1834
1835			if (resid > iocount)
1836				resid = iocount;
1837			if (resid > 0) {
1838				vm_page_set_validclean(m,
1839					((vm_offset_t) foff & PAGE_MASK), resid);
1840			}
1841			foff += resid;
1842			iocount -= resid;
1843		}
1844	}
1845}
1846
1847void
1848vfs_bio_clrbuf(struct buf *bp) {
1849	int i;
1850	if( bp->b_flags & B_VMIO) {
1851		if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE)) {
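			/*
			 * Single, partially-filled page: build a valid-bits mask with
			 * one bit per DEV_BSIZE chunk covered by the buffer, zero the
			 * buffer unless the page's valid bits already match, and then
			 * record the mask as the page's valid bits.
			 */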
1852			int mask;
1853			mask = 0;
1854			for(i=0;i<bp->b_bufsize;i+=DEV_BSIZE)
1855				mask |= (1 << (i/DEV_BSIZE));
1856			if( bp->b_pages[0]->valid != mask) {
1857				bzero(bp->b_data, bp->b_bufsize);
1858			}
1859			bp->b_pages[0]->valid = mask;
1860			bp->b_resid = 0;
1861			return;
1862		}
1863		for(i=0;i<bp->b_npages;i++) {
1864			if( bp->b_pages[i]->valid == VM_PAGE_BITS_ALL)
1865				continue;
1866			if( bp->b_pages[i]->valid == 0) {
1867				if ((bp->b_pages[i]->flags & PG_ZERO) == 0) {
1868					bzero(bp->b_data + (i << PAGE_SHIFT), PAGE_SIZE);
1869				}
1870			} else {
1871				int j;
1872				for(j=0;j<PAGE_SIZE/DEV_BSIZE;j++) {
1873					if( (bp->b_pages[i]->valid & (1<<j)) == 0)
1874						bzero(bp->b_data + (i << PAGE_SHIFT) + j * DEV_BSIZE, DEV_BSIZE);
1875				}
1876			}
1877			/* bp->b_pages[i]->valid = VM_PAGE_BITS_ALL; */
1878		}
1879		bp->b_resid = 0;
1880	} else {
1881		clrbuf(bp);
1882	}
1883}
1884
1885/*
1886 * vm_hold_load_pages and vm_hold_free_pages get pages into
1887 * a buffer's address space.  The pages are anonymous and are
1888 * not associated with a file object.
1889 */
1890void
1891vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
1892{
1893	vm_offset_t pg;
1894	vm_page_t p;
1895	int index;
1896
1897	to = round_page(to);
1898	from = round_page(from);
1899	index = (from - trunc_page(bp->b_data)) >> PAGE_SHIFT;
1900
1901	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
1902
1903tryagain:
1904
1905		p = vm_page_alloc(kernel_object, ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
1906		    VM_ALLOC_NORMAL);
1907		if (!p) {
1908			VM_WAIT;
1909			goto tryagain;
1910		}
1911		vm_page_wire(p);
1912		pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
1913		bp->b_pages[index] = p;
1914		PAGE_WAKEUP(p);
1915	}
1916	bp->b_npages = to >> PAGE_SHIFT;
1917}
1918
1919void
1920vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
1921{
1922	vm_offset_t pg;
1923	vm_page_t p;
1924	int index;
1925
1926	from = round_page(from);
1927	to = round_page(to);
1928	index = (from - trunc_page(bp->b_data)) >> PAGE_SHIFT;
1929
1930	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
1931		p = bp->b_pages[index];
1932		if (p && (index < bp->b_npages)) {
1933			if (p->busy) {
1934				printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n",
1935					bp->b_blkno, bp->b_lblkno);
1936			}
1937			bp->b_pages[index] = NULL;
1938			pmap_kremove(pg);
1939			vm_page_unwire(p);
1940			vm_page_free(p);
1941		}
1942	}
1943	bp->b_npages = from >> PAGE_SHIFT;
1944}
1945