vfs_bio.c revision 39648
1/*
2 * Copyright (c) 1994,1997 John S. Dyson
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice immediately at the beginning of the file, without modification,
10 *    this list of conditions, and the following disclaimer.
11 * 2. Absolutely no warranty of function or purpose is made by the author
12 *		John S. Dyson.
13 *
14 * $Id: vfs_bio.c,v 1.176 1998/09/15 10:05:18 gibbs Exp $
15 */
16
17/*
18 * this file contains a new buffer I/O scheme implementing a coherent
19 * VM object and buffer cache scheme.  Pains have been taken to make
20 * sure that the performance degradation associated with schemes such
21 * as this is not realized.
22 *
23 * Author:  John S. Dyson
24 * Significant help during the development and debugging phases
25 * was provided by David Greenman, also of the FreeBSD core team.
26 */
27
28#define VMIO
29#include <sys/param.h>
30#include <sys/systm.h>
31#include <sys/sysproto.h>
32#include <sys/kernel.h>
33#include <sys/sysctl.h>
34#include <sys/proc.h>
35#include <sys/vnode.h>
36#include <sys/vmmeter.h>
37#include <sys/lock.h>
38#include <miscfs/specfs/specdev.h>
39#include <vm/vm.h>
40#include <vm/vm_param.h>
41#include <vm/vm_prot.h>
42#include <vm/vm_kern.h>
43#include <vm/vm_pageout.h>
44#include <vm/vm_page.h>
45#include <vm/vm_object.h>
46#include <vm/vm_extern.h>
47#include <vm/vm_map.h>
48#include <sys/buf.h>
49#include <sys/mount.h>
50#include <sys/malloc.h>
51#include <sys/resourcevar.h>
52
53static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer");
54
55struct	bio_ops bioops;		/* I/O operation notification */
56
57#if 0 	/* replaced by sched_sync */
58static void vfs_update __P((void));
59static struct	proc *updateproc;
60static struct kproc_desc up_kp = {
61	"update",
62	vfs_update,
63	&updateproc
64};
65SYSINIT_KT(update, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
66#endif
67
68struct buf *buf;		/* buffer header pool */
69struct swqueue bswlist;
70
71static int count_lock_queue __P((void));
72static void vm_hold_free_pages(struct buf * bp, vm_offset_t from,
73		vm_offset_t to);
74static void vm_hold_load_pages(struct buf * bp, vm_offset_t from,
75		vm_offset_t to);
76static void vfs_buf_set_valid(struct buf *bp, vm_ooffset_t foff,
77			      vm_offset_t off, vm_offset_t size,
78			      vm_page_t m);
79static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off,
80			       int pageno, vm_page_t m);
81static void vfs_clean_pages(struct buf * bp);
82static void vfs_setdirty(struct buf *bp);
83static void vfs_vmio_release(struct buf *bp);
84static void flushdirtybuffers(int slpflag, int slptimeo);
85
86int needsbuffer;
87
88/*
89 * Internal update daemon, process 3
90 *	The variable vfs_update_wakeup allows for internal syncs.
91 */
92int vfs_update_wakeup;
93
94
95/*
96 * buffers base kva
97 */
98
99/*
100 * bogus page -- for I/O to/from partially complete buffers
101 * this is a temporary solution to the problem, but it is not
102 * really that bad.  it would be better to split the buffer
103 * for input when the buffer is already partially in memory,
104 * but the code is intricate enough already.
105 */
106vm_page_t bogus_page;
107static vm_offset_t bogus_offset;
108
109static int bufspace, maxbufspace, vmiospace, maxvmiobufspace,
110	bufmallocspace, maxbufmallocspace;
111int numdirtybuffers;
112static int lodirtybuffers, hidirtybuffers;
113static int numfreebuffers, lofreebuffers, hifreebuffers;
114static int kvafreespace;
115
116SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD,
117	&numdirtybuffers, 0, "");
118SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW,
119	&lodirtybuffers, 0, "");
120SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW,
121	&hidirtybuffers, 0, "");
122SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD,
123	&numfreebuffers, 0, "");
124SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW,
125	&lofreebuffers, 0, "");
126SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW,
127	&hifreebuffers, 0, "");
128SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW,
129	&maxbufspace, 0, "");
130SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD,
131	&bufspace, 0, "");
132SYSCTL_INT(_vfs, OID_AUTO, maxvmiobufspace, CTLFLAG_RW,
133	&maxvmiobufspace, 0, "");
134SYSCTL_INT(_vfs, OID_AUTO, vmiospace, CTLFLAG_RD,
135	&vmiospace, 0, "");
136SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW,
137	&maxbufmallocspace, 0, "");
138SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD,
139	&bufmallocspace, 0, "");
140SYSCTL_INT(_vfs, OID_AUTO, kvafreespace, CTLFLAG_RD,
141	&kvafreespace, 0, "");
142
143static LIST_HEAD(bufhashhdr, buf) bufhashtbl[BUFHSZ], invalhash;
144struct bqueues bufqueues[BUFFER_QUEUES] = {0};
145
146extern int vm_swap_size;
147
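/* cap on a buffer's b_usecount, the reuse credit used when aging buffers on the LRU queue */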
148#define BUF_MAXUSE 24
149
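/* conditions recorded in needsbuffer; cleared by vfs_bio_need_satisfy() when they are met */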
150#define VFS_BIO_NEED_ANY 1
151#define VFS_BIO_NEED_LOWLIMIT 2
152#define VFS_BIO_NEED_FREE 4
153
154/*
155 * Initialize buffer headers and related structures.
156 */
157void
158bufinit()
159{
160	struct buf *bp;
161	int i;
162
163	TAILQ_INIT(&bswlist);
164	LIST_INIT(&invalhash);
165
166	/* first, make a null hash table */
167	for (i = 0; i < BUFHSZ; i++)
168		LIST_INIT(&bufhashtbl[i]);
169
170	/* next, make a null set of free lists */
171	for (i = 0; i < BUFFER_QUEUES; i++)
172		TAILQ_INIT(&bufqueues[i]);
173
174	/* finally, initialize each buffer header and stick on empty q */
175	for (i = 0; i < nbuf; i++) {
176		bp = &buf[i];
177		bzero(bp, sizeof *bp);
178		bp->b_flags = B_INVAL;	/* we're just an empty header */
179		bp->b_dev = NODEV;
180		bp->b_rcred = NOCRED;
181		bp->b_wcred = NOCRED;
182		bp->b_qindex = QUEUE_EMPTY;
183		bp->b_vnbufs.le_next = NOLIST;
184		LIST_INIT(&bp->b_dep);
185		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
186		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
187	}
188/*
189 * maxbufspace is currently calculated to support all filesystem blocks
190 * to be 8K.  If you happen to use a 16K filesystem, the size of the buffer
191 * cache is still the same as it would be for 8K filesystems.  This
192 * keeps the size of the buffer cache "in check" for big block filesystems.
193 */
194	maxbufspace = (nbuf + 8) * DFLTBSIZE;
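	/* i.e. roughly one DFLTBSIZE block of buffer space per buffer header, plus a little slack */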
195/*
196 * reserve 1/3 of the buffers for metadata (VDIR) which might not be VMIO'ed
197 */
198	maxvmiobufspace = 2 * maxbufspace / 3;
199/*
200 * Limit the amount of malloc memory since it is wired permanently into
201 * the kernel space.  Even though this is accounted for in the buffer
202 * allocation, we don't want the malloced region to grow uncontrolled.
203 * The malloc scheme improves memory utilization significantly on average
204 * (small) directories.
205 */
206	maxbufmallocspace = maxbufspace / 20;
207
208/*
209 * Reduce the probability of deadlock conditions by limiting the
210 * number of dirty buffers.
211 */
212	hidirtybuffers = nbuf / 8 + 20;
213	lodirtybuffers = nbuf / 16 + 10;
214	numdirtybuffers = 0;
215	lofreebuffers = nbuf / 18 + 5;
216	hifreebuffers = 2 * lofreebuffers;
217	numfreebuffers = nbuf;
218	kvafreespace = 0;
219
220	bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
221	bogus_page = vm_page_alloc(kernel_object,
222			((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
223			VM_ALLOC_NORMAL);
224
225}
226
227/*
228 * Free the kva allocation for a buffer
229 * Must be called only at splbio or higher,
230 *  as this is the only locking for buffer_map.
231 */
232static void
233bfreekva(struct buf * bp)
234{
235	if (bp->b_kvasize == 0)
236		return;
237
238	vm_map_delete(buffer_map,
239		(vm_offset_t) bp->b_kvabase,
240		(vm_offset_t) bp->b_kvabase + bp->b_kvasize);
241
242	bp->b_kvasize = 0;
243
244}
245
246/*
247 * remove the buffer from the appropriate free list
248 */
249void
250bremfree(struct buf * bp)
251{
252	int s = splbio();
253
254	if (bp->b_qindex != QUEUE_NONE) {
255		if (bp->b_qindex == QUEUE_EMPTY) {
256			kvafreespace -= bp->b_kvasize;
257		}
258		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
259		bp->b_qindex = QUEUE_NONE;
260	} else {
261#if !defined(MAX_PERF)
262		panic("bremfree: removing a buffer when not on a queue");
263#endif
264	}
265	if ((bp->b_flags & B_INVAL) ||
266		(bp->b_flags & (B_DELWRI|B_LOCKED)) == 0)
267		--numfreebuffers;
268	splx(s);
269}
270
271
272/*
273 * Get a buffer with the specified data.  Look in the cache first.
274 */
275int
276bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
277    struct buf ** bpp)
278{
279	struct buf *bp;
280
281	bp = getblk(vp, blkno, size, 0, 0);
282	*bpp = bp;
283
284	/* if not found in cache, do some I/O */
285	if ((bp->b_flags & B_CACHE) == 0) {
286		if (curproc != NULL)
287			curproc->p_stats->p_ru.ru_inblock++;
288		bp->b_flags |= B_READ;
289		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
290		if (bp->b_rcred == NOCRED) {
291			if (cred != NOCRED)
292				crhold(cred);
293			bp->b_rcred = cred;
294		}
295		vfs_busy_pages(bp, 0);
296		VOP_STRATEGY(vp, bp);
297		return (biowait(bp));
298	}
299	return (0);
300}
301
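/*
 * Typical caller pattern (sketch only; error handling and the credential
 * passed vary by filesystem, and vp, lblkno and size are caller-supplied):
 *
 *	struct buf *bp;
 *	int error;
 *
 *	error = bread(vp, lblkno, size, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	... examine bp->b_data ...
 *	brelse(bp);		(or bdwrite(bp) after modifying the data)
 */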
302/*
303 * Operates like bread, but also starts asynchronous I/O on
304 * read-ahead blocks.
305 */
306int
307breadn(struct vnode * vp, daddr_t blkno, int size,
308    daddr_t * rablkno, int *rabsize,
309    int cnt, struct ucred * cred, struct buf ** bpp)
310{
311	struct buf *bp, *rabp;
312	int i;
313	int rv = 0, readwait = 0;
314
315	*bpp = bp = getblk(vp, blkno, size, 0, 0);
316
317	/* if not found in cache, do some I/O */
318	if ((bp->b_flags & B_CACHE) == 0) {
319		if (curproc != NULL)
320			curproc->p_stats->p_ru.ru_inblock++;
321		bp->b_flags |= B_READ;
322		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
323		if (bp->b_rcred == NOCRED) {
324			if (cred != NOCRED)
325				crhold(cred);
326			bp->b_rcred = cred;
327		}
328		vfs_busy_pages(bp, 0);
329		VOP_STRATEGY(vp, bp);
330		++readwait;
331	}
332	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
333		if (inmem(vp, *rablkno))
334			continue;
335		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
336
337		if ((rabp->b_flags & B_CACHE) == 0) {
338			if (curproc != NULL)
339				curproc->p_stats->p_ru.ru_inblock++;
340			rabp->b_flags |= B_READ | B_ASYNC;
341			rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
342			if (rabp->b_rcred == NOCRED) {
343				if (cred != NOCRED)
344					crhold(cred);
345				rabp->b_rcred = cred;
346			}
347			vfs_busy_pages(rabp, 0);
348			VOP_STRATEGY(vp, rabp);
349		} else {
350			brelse(rabp);
351		}
352	}
353
354	if (readwait) {
355		rv = biowait(bp);
356	}
357	return (rv);
358}
359
360/*
361 * Write, release buffer on completion.  (Done by iodone
362 * if async.)
363 */
364int
365bwrite(struct buf * bp)
366{
367	int oldflags, s;
368	struct vnode *vp;
369	struct mount *mp;
370
371
372	if (bp->b_flags & B_INVAL) {
373		brelse(bp);
374		return (0);
375	}
376
377	oldflags = bp->b_flags;
378
379#if !defined(MAX_PERF)
380	if ((bp->b_flags & B_BUSY) == 0)
381		panic("bwrite: buffer is not busy???");
382#endif
383
384	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
385	bp->b_flags |= B_WRITEINPROG;
386
387	s = splbio();
388	if ((oldflags & B_DELWRI) == B_DELWRI) {
389		--numdirtybuffers;
390		reassignbuf(bp, bp->b_vp);
391	}
392
393	bp->b_vp->v_numoutput++;
394	vfs_busy_pages(bp, 1);
395	if (curproc != NULL)
396		curproc->p_stats->p_ru.ru_oublock++;
397	splx(s);
398	VOP_STRATEGY(bp->b_vp, bp);
399
400	/*
401	 * Collect statistics on synchronous and asynchronous writes.
402	 * Writes to block devices are charged to their associated
403	 * filesystem (if any).
404	 */
405	if ((vp = bp->b_vp) != NULL) {
406		if (vp->v_type == VBLK)
407			mp = vp->v_specmountpoint;
408		else
409			mp = vp->v_mount;
410		if (mp != NULL)
411			if ((oldflags & B_ASYNC) == 0)
412				mp->mnt_stat.f_syncwrites++;
413			else
414				mp->mnt_stat.f_asyncwrites++;
415	}
416
417	if ((oldflags & B_ASYNC) == 0) {
418		int rtval = biowait(bp);
419		brelse(bp);
420		return (rtval);
421	}
422	return (0);
423}
424
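/*
 * Note that a buffer has become available: bump numfreebuffers and, if
 * anyone is sleeping on needsbuffer, clear the need flags that are now
 * satisfied and wake the sleepers.
 */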
425__inline void
426vfs_bio_need_satisfy(void) {
427	++numfreebuffers;
428	if (!needsbuffer)
429		return;
430	if (numdirtybuffers < lodirtybuffers) {
431		needsbuffer &= ~(VFS_BIO_NEED_ANY | VFS_BIO_NEED_LOWLIMIT);
432	} else {
433		needsbuffer &= ~VFS_BIO_NEED_ANY;
434	}
435	if (numfreebuffers >= hifreebuffers) {
436		needsbuffer &= ~VFS_BIO_NEED_FREE;
437	}
438	wakeup(&needsbuffer);
439}
440
441/*
442 * Delayed write. (Buffer is marked dirty).
443 */
444void
445bdwrite(struct buf * bp)
446{
447	int s;
448	struct vnode *vp;
449
450#if !defined(MAX_PERF)
451	if ((bp->b_flags & B_BUSY) == 0) {
452		panic("bdwrite: buffer is not busy");
453	}
454#endif
455
456	if (bp->b_flags & B_INVAL) {
457		brelse(bp);
458		return;
459	}
460	bp->b_flags &= ~(B_READ|B_RELBUF);
461	if ((bp->b_flags & B_DELWRI) == 0) {
462		bp->b_flags |= B_DONE | B_DELWRI;
463		reassignbuf(bp, bp->b_vp);
464		++numdirtybuffers;
465	}
466
467	/*
468	 * This bmap keeps the system from needing to do the bmap later,
469	 * perhaps when the system is attempting to do a sync.  Since it
470	 * is likely that the indirect block -- or whatever other datastructure
471	 * is likely that the indirect block -- or whatever other data structure
472	 * the filesystem needs -- is still in memory now, it is a good
473	 * requesting a sync -- there might not be enough memory to do
474	 * the bmap then...  So, this is important to do.
475	 */
476	if (bp->b_lblkno == bp->b_blkno) {
477		VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
478	}
479
480	/*
481	 * Set the *dirty* buffer range based upon the VM system dirty pages.
482	 */
483	vfs_setdirty(bp);
484
485	/*
486	 * We need to do this here to satisfy the vnode_pager and the
487	 * pageout daemon, so that they think that the pages have been
488	 * "cleaned".  Note that since the pages are in a delayed write
489	 * buffer -- the VFS layer "will" see that the pages get written
490	 * out on the next sync, or perhaps the cluster will be completed.
491	 */
492	vfs_clean_pages(bp);
493	bqrelse(bp);
494
495	/*
496	 * XXX The soft dependency code is not prepared to
497	 * have I/O done when a bdwrite is requested. For
498	 * now we just let the write be delayed if it is
499	 * requested by the soft dependency code.
500	 */
501	if ((vp = bp->b_vp) &&
502	    (vp->v_type == VBLK && vp->v_specmountpoint &&
503	    (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP)) ||
504	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP)))
505		return;
506
507	if (numdirtybuffers >= hidirtybuffers)
508		flushdirtybuffers(0, 0);
509
510	return;
511}
512
513
514/*
515 * Same as first half of bdwrite, mark buffer dirty, but do not release it.
516 * Check how this compares with vfs_setdirty(); XXX [JRE]
517 */
518void
519bdirty(bp)
520      struct buf *bp;
521{
522	int s;
523
524	bp->b_flags &= ~(B_READ|B_RELBUF); /* XXX ??? check this */
525	if ((bp->b_flags & B_DELWRI) == 0) {
526		bp->b_flags |= B_DONE | B_DELWRI; /* why done? XXX JRE */
527		reassignbuf(bp, bp->b_vp);
528		++numdirtybuffers;
529	}
530}
531
532/*
533 * Asynchronous write.
534 * Start output on a buffer, but do not wait for it to complete.
535 * The buffer is released when the output completes.
536 */
537void
538bawrite(struct buf * bp)
539{
540	bp->b_flags |= B_ASYNC;
541	(void) VOP_BWRITE(bp);
542}
543
544/*
545 * Ordered write.
546 * Start output on a buffer, and flag it so that the device will write
547 * it in the order it was queued.  The buffer is released when the output
548 * completes.
549 */
550int
551bowrite(struct buf * bp)
552{
553	bp->b_flags |= B_ORDERED|B_ASYNC;
554	return (VOP_BWRITE(bp));
555}
556
557/*
558 * Release a buffer.
559 */
560void
561brelse(struct buf * bp)
562{
563	int s;
564
565	if (bp->b_flags & B_CLUSTER) {
566		relpbuf(bp);
567		return;
568	}
569
570	s = splbio();
571
572	/* anyone need this block? */
573	if (bp->b_flags & B_WANTED) {
574		bp->b_flags &= ~(B_WANTED | B_AGE);
575		wakeup(bp);
576	}
577
578	if (bp->b_flags & B_LOCKED)
579		bp->b_flags &= ~B_ERROR;
580
581	if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_FREEBUF)) ||
582	    (bp->b_bufsize <= 0)) {
583		bp->b_flags |= B_INVAL;
584		if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
585			(*bioops.io_deallocate)(bp);
586		if (bp->b_flags & B_DELWRI)
587			--numdirtybuffers;
588		bp->b_flags &= ~(B_DELWRI | B_CACHE | B_FREEBUF);
589		if ((bp->b_flags & B_VMIO) == 0) {
590			if (bp->b_bufsize)
591				allocbuf(bp, 0);
592			if (bp->b_vp)
593				brelvp(bp);
594		}
595	}
596
597	/*
598	 * VMIO buffer rundown.  It is not strictly necessary to keep a VMIO buffer
599	 * constituted, so the B_INVAL flag is used to *invalidate* the buffer,
600	 * but the VM object is kept around.  The B_NOCACHE flag is used to
601	 * invalidate the pages in the VM object.
602	 *
603	 * If the buffer is a partially filled NFS buffer, keep it
604	 * since invalidating it now will lose information.  The valid
605	 * flags in the vm_pages have only DEV_BSIZE resolution but
606	 * the b_validoff, b_validend fields have byte resolution.
607	 * This can avoid unnecessary re-reads of the buffer.
608	 * XXX this seems to cause performance problems.
609	 */
610	if ((bp->b_flags & B_VMIO)
611	    && !(bp->b_vp->v_tag == VT_NFS &&
612		 bp->b_vp->v_type != VBLK &&
613		 (bp->b_flags & B_DELWRI) != 0)
614#ifdef notdef
615	    && (bp->b_vp->v_tag != VT_NFS
616		|| bp->b_vp->v_type == VBLK
617		|| (bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR))
618		|| bp->b_validend == 0
619		|| (bp->b_validoff == 0
620		    && bp->b_validend == bp->b_bufsize))
621#endif
622	    ) {
623
624		int i, j, resid;
625		vm_page_t m;
626		off_t foff;
627		vm_pindex_t poff;
628		vm_object_t obj;
629		struct vnode *vp;
630
631		vp = bp->b_vp;
632
633		resid = bp->b_bufsize;
634		foff = bp->b_offset;
635
636		for (i = 0; i < bp->b_npages; i++) {
637			m = bp->b_pages[i];
638			vm_page_flag_clear(m, PG_ZERO);
639			if (m == bogus_page) {
640
641				obj = (vm_object_t) vp->v_object;
642				poff = OFF_TO_IDX(bp->b_offset);
643
644				for (j = i; j < bp->b_npages; j++) {
645					m = bp->b_pages[j];
646					if (m == bogus_page) {
647						m = vm_page_lookup(obj, poff + j);
648#if !defined(MAX_PERF)
649						if (!m) {
650							panic("brelse: page missing\n");
651						}
652#endif
653						bp->b_pages[j] = m;
654					}
655				}
656
657				if ((bp->b_flags & B_INVAL) == 0) {
658					pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
659				}
660			}
661			if (bp->b_flags & (B_NOCACHE|B_ERROR)) {
662				int poffset = foff & PAGE_MASK;
663				int presid = resid > (PAGE_SIZE - poffset) ?
664					(PAGE_SIZE - poffset) : resid;
665				vm_page_set_invalid(m, poffset, presid);
666			}
667			resid -= PAGE_SIZE;
668		}
669
670		if (bp->b_flags & (B_INVAL | B_RELBUF))
671			vfs_vmio_release(bp);
672
673	} else if (bp->b_flags & B_VMIO) {
674
675		if (bp->b_flags & (B_INVAL | B_RELBUF))
676			vfs_vmio_release(bp);
677
678	}
679
680#if !defined(MAX_PERF)
681	if (bp->b_qindex != QUEUE_NONE)
682		panic("brelse: free buffer onto another queue???");
683#endif
684
685	/* enqueue */
686	/* buffers with no memory */
687	if (bp->b_bufsize == 0) {
688		bp->b_flags |= B_INVAL;
689		bp->b_qindex = QUEUE_EMPTY;
690		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
691		LIST_REMOVE(bp, b_hash);
692		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
693		bp->b_dev = NODEV;
694		kvafreespace += bp->b_kvasize;
695
696	/* buffers with junk contents */
697	} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
698		bp->b_flags |= B_INVAL;
699		bp->b_qindex = QUEUE_AGE;
700		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
701		LIST_REMOVE(bp, b_hash);
702		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
703		bp->b_dev = NODEV;
704
705	/* buffers that are locked */
706	} else if (bp->b_flags & B_LOCKED) {
707		bp->b_qindex = QUEUE_LOCKED;
708		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
709
710	/* buffers with stale but valid contents */
711	} else if (bp->b_flags & B_AGE) {
712		bp->b_qindex = QUEUE_AGE;
713		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
714
715	/* buffers with valid and quite potentially reusable contents */
716	} else {
717		bp->b_qindex = QUEUE_LRU;
718		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
719	}
720
721	if ((bp->b_flags & B_INVAL) ||
722		(bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) {
723		if (bp->b_flags & B_DELWRI) {
724			--numdirtybuffers;
725			bp->b_flags &= ~B_DELWRI;
726		}
727		vfs_bio_need_satisfy();
728	}
729
730	/* unlock */
731	bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
732		B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
733	splx(s);
734}
735
736/*
737 * Release a buffer, without the invalidation or VMIO page rundown done by brelse().
738 */
739void
740bqrelse(struct buf * bp)
741{
742	int s;
743
744	s = splbio();
745
746	/* anyone need this block? */
747	if (bp->b_flags & B_WANTED) {
748		bp->b_flags &= ~(B_WANTED | B_AGE);
749		wakeup(bp);
750	}
751
752#if !defined(MAX_PERF)
753	if (bp->b_qindex != QUEUE_NONE)
754		panic("bqrelse: free buffer onto another queue???");
755#endif
756
757	if (bp->b_flags & B_LOCKED) {
758		bp->b_flags &= ~B_ERROR;
759		bp->b_qindex = QUEUE_LOCKED;
760		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
761		/* buffers with stale but valid contents */
762	} else {
763		bp->b_qindex = QUEUE_LRU;
764		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
765	}
766
767	if ((bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) {
768		vfs_bio_need_satisfy();
769	}
770
771	/* unlock */
772	bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
773		B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
774	splx(s);
775}
776
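/*
 * Release the VM pages backing a VMIO buffer: unwire each page and,
 * unless the release is asynchronous, cache, deactivate or free pages
 * that are no longer wired; then strip the buffer of its VMIO state.
 */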
777static void
778vfs_vmio_release(bp)
779	struct buf *bp;
780{
781	int i;
782	vm_page_t m;
783
784	for (i = 0; i < bp->b_npages; i++) {
785		m = bp->b_pages[i];
786		bp->b_pages[i] = NULL;
787		vm_page_unwire(m);
788
789		/*
790		 * We don't mess with busy pages; it is
791		 * the responsibility of the process that
792		 * busied the pages to deal with them.
793		 */
794		if ((m->flags & PG_BUSY) || (m->busy != 0))
795			continue;
796
797		if (m->wire_count == 0) {
798
799			/*
800			 * If this is an async free, we cannot place
801			 * pages onto the cache queue, so we do not
802			 * modify any queues at all.  This is probably
803			 * in error (for performance reasons),
804			 * and we will eventually need to build
805			 * a more complete infrastructure to support I/O
806			 * rundown.
807			 */
808			if ((bp->b_flags & B_ASYNC) == 0) {
809
810			/*
811			 * In the case of sync buffer frees, we can do pretty much
812			 * anything to any of the memory queues.  Specifically,
813			 * the cache queue is okay to be modified.
814			 */
815				if (m->valid) {
816					if(m->dirty == 0)
817						vm_page_test_dirty(m);
818					/*
819					 * this keeps pressure off of the process memory
820					 */
821					if (m->dirty == 0 && m->hold_count == 0)
822						vm_page_cache(m);
823					else
824						vm_page_deactivate(m);
825					vm_page_flag_clear(m, PG_ZERO);
826				} else if (m->hold_count == 0) {
827					vm_page_busy(m);
828					vm_page_protect(m, VM_PROT_NONE);
829					vm_page_free(m);
830				}
831			} else {
832				/*
833				 * If async, then at least we clear the
834				 * act_count.
835				 */
836				m->act_count = 0;
837				vm_page_flag_clear(m, PG_ZERO);
838			}
839		}
840	}
841	bufspace -= bp->b_bufsize;
842	vmiospace -= bp->b_bufsize;
843	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
844	bp->b_npages = 0;
845	bp->b_bufsize = 0;
846	bp->b_flags &= ~B_VMIO;
847	if (bp->b_vp)
848		brelvp(bp);
849}
850
851/*
852 * Check to see if a block is currently memory resident.  Internal version of incore(); callers must be at splbio.
853 */
854struct buf *
855gbincore(struct vnode * vp, daddr_t blkno)
856{
857	struct buf *bp;
858	struct bufhashhdr *bh;
859
860	bh = BUFHASH(vp, blkno);
861	bp = bh->lh_first;
862
863	/* Search hash chain */
864	while (bp != NULL) {
865		/* hit */
866		if (bp->b_vp == vp && bp->b_lblkno == blkno &&
867		    (bp->b_flags & B_INVAL) == 0) {
868			break;
869		}
870		bp = bp->b_hash.le_next;
871	}
872	return (bp);
873}
874
875/*
876 * this routine implements clustered async writes for
877 * clearing out B_DELWRI buffers...  This is much better
878 * than the old way of writing only one buffer at a time.
879 */
880int
881vfs_bio_awrite(struct buf * bp)
882{
883	int i;
884	daddr_t lblkno = bp->b_lblkno;
885	struct vnode *vp = bp->b_vp;
886	int s;
887	int ncl;
888	struct buf *bpa;
889	int nwritten;
890	int size;
891	int maxcl;
892
893	s = splbio();
894	/*
895	 * right now we support clustered writing only to regular files
896	 */
897	if ((vp->v_type == VREG) &&
898	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
899	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
900
901		size = vp->v_mount->mnt_stat.f_iosize;
902		maxcl = MAXPHYS / size;
903
904		for (i = 1; i < maxcl; i++) {
905			if ((bpa = gbincore(vp, lblkno + i)) &&
906			    ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
907			    (B_DELWRI | B_CLUSTEROK)) &&
908			    (bpa->b_bufsize == size)) {
909				if ((bpa->b_blkno == bpa->b_lblkno) ||
910				    (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
911					break;
912			} else {
913				break;
914			}
915		}
916		ncl = i;
917		/*
918		 * this is a possible cluster write
919		 */
920		if (ncl != 1) {
921			nwritten = cluster_wbuild(vp, size, lblkno, ncl);
922			splx(s);
923			return nwritten;
924		}
925	}
926
927	bremfree(bp);
928	bp->b_flags |= B_BUSY | B_ASYNC;
929
930	splx(s);
931	/*
932	 * default (old) behavior, writing out only one block
933	 */
934	nwritten = bp->b_bufsize;
935	(void) VOP_BWRITE(bp);
936	return nwritten;
937}
938
939
940/*
941 * Find a buffer header which is available for use.
942 */
943static struct buf *
944getnewbuf(struct vnode *vp, daddr_t blkno,
945	int slpflag, int slptimeo, int size, int maxsize)
946{
947	struct buf *bp, *bp1;
948	int nbyteswritten = 0;
949	vm_offset_t addr;
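	/* tracks re-entry into getnewbuf via delayed-write flushing (see the layered-filesystem note below) */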
950	static int writerecursion = 0;
951
952start:
953	if (bufspace >= maxbufspace)
954		goto trytofreespace;
955
956	/* can we constitute a new buffer? */
957	if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]))) {
958#if !defined(MAX_PERF)
959		if (bp->b_qindex != QUEUE_EMPTY)
960			panic("getnewbuf: inconsistent EMPTY queue, qindex=%d",
961			    bp->b_qindex);
962#endif
963		bp->b_flags |= B_BUSY;
964		bremfree(bp);
965		goto fillbuf;
966	}
967trytofreespace:
968	/*
969	 * We keep the file I/O from hogging metadata I/O.
970	 * This is desirable because file data is cached in the
971	 * VM/Buffer cache even if a buffer is freed.
972	 */
973	if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]))) {
974#if !defined(MAX_PERF)
975		if (bp->b_qindex != QUEUE_AGE)
976			panic("getnewbuf: inconsistent AGE queue, qindex=%d",
977			    bp->b_qindex);
978#endif
979	} else if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]))) {
980#if !defined(MAX_PERF)
981		if (bp->b_qindex != QUEUE_LRU)
982			panic("getnewbuf: inconsistent LRU queue, qindex=%d",
983			    bp->b_qindex);
984#endif
985	}
986	if (!bp) {
987		/* wait for a free buffer of any kind */
988		needsbuffer |= VFS_BIO_NEED_ANY;
989		do
990			tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf",
991			    slptimeo);
992		while (needsbuffer & VFS_BIO_NEED_ANY);
993		return (0);
994	}
995
996#if defined(DIAGNOSTIC)
997	if (bp->b_flags & B_BUSY) {
998		panic("getnewbuf: busy buffer on free list\n");
999	}
1000#endif
1001
1002	/*
1003	 * We are fairly aggressive about freeing VMIO buffers, but since
1004	 * the buffering is intact without buffer headers, there is not
1005	 * much loss.  We gain by maintaining non-VMIOed metadata in buffers.
1006	 */
1007	if ((bp->b_qindex == QUEUE_LRU) && (bp->b_usecount > 0)) {
1008		if ((bp->b_flags & B_VMIO) == 0 ||
1009			(vmiospace < maxvmiobufspace)) {
1010			--bp->b_usecount;
1011			TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist);
1012			if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) {
1013				TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
1014				goto start;
1015			}
1016			TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
1017		}
1018	}
1019
1020
1021	/* if we are a delayed write, convert to an async write */
1022	if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
1023
1024		/*
1025		 * If our delayed write is likely to be used soon, then
1026		 * recycle back onto the LRU queue.
1027		 */
1028		if (vp && (bp->b_vp == vp) && (bp->b_qindex == QUEUE_LRU) &&
1029			(bp->b_lblkno >= blkno) && (maxsize > 0)) {
1030
1031			if (bp->b_usecount > 0) {
1032				if (bp->b_lblkno < blkno + (MAXPHYS / maxsize)) {
1033
1034					TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist);
1035
1036					if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) {
1037						TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
1038						bp->b_usecount--;
1039						goto start;
1040					}
1041					TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
1042				}
1043			}
1044		}
1045
1046		/*
1047		 * Certain layered filesystems can recursively re-enter the vfs_bio
1048		 * code, due to delayed writes.  This helps keep the system from
1049		 * deadlocking.
1050		 */
1051		if (writerecursion > 0) {
1052			if (writerecursion > 5) {
1053				bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]);
1054				while (bp) {
1055					if ((bp->b_flags & B_DELWRI) == 0)
1056						break;
1057					bp = TAILQ_NEXT(bp, b_freelist);
1058				}
1059				if (bp == NULL) {
1060					bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]);
1061					while (bp) {
1062						if ((bp->b_flags & B_DELWRI) == 0)
1063							break;
1064						bp = TAILQ_NEXT(bp, b_freelist);
1065					}
1066				}
1067				if (bp == NULL)
1068					panic("getnewbuf: cannot get buffer, infinite recursion failure");
1069			} else {
1070				bremfree(bp);
1071				bp->b_flags |= B_BUSY | B_AGE | B_ASYNC;
1072				nbyteswritten += bp->b_bufsize;
1073				++writerecursion;
1074				VOP_BWRITE(bp);
1075				--writerecursion;
1076				if (!slpflag && !slptimeo) {
1077					return (0);
1078				}
1079				goto start;
1080			}
1081		} else {
1082			++writerecursion;
1083			nbyteswritten += vfs_bio_awrite(bp);
1084			--writerecursion;
1085			if (!slpflag && !slptimeo) {
1086				return (0);
1087			}
1088			goto start;
1089		}
1090	}
1091
1092	if (bp->b_flags & B_WANTED) {
1093		bp->b_flags &= ~B_WANTED;
1094		wakeup(bp);
1095	}
1096	bremfree(bp);
1097	bp->b_flags |= B_BUSY;
1098
1099	if (bp->b_flags & B_VMIO) {
1100		bp->b_flags &= ~B_ASYNC;
1101		vfs_vmio_release(bp);
1102	}
1103
1104	if (bp->b_vp)
1105		brelvp(bp);
1106
1107fillbuf:
1108
1109	/* we are not free, nor do we contain interesting data */
1110	if (bp->b_rcred != NOCRED) {
1111		crfree(bp->b_rcred);
1112		bp->b_rcred = NOCRED;
1113	}
1114	if (bp->b_wcred != NOCRED) {
1115		crfree(bp->b_wcred);
1116		bp->b_wcred = NOCRED;
1117	}
1118	if (LIST_FIRST(&bp->b_dep) != NULL &&
1119	    bioops.io_deallocate)
1120		(*bioops.io_deallocate)(bp);
1121
1122	LIST_REMOVE(bp, b_hash);
1123	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
1124	if (bp->b_bufsize) {
1125		allocbuf(bp, 0);
1126	}
1127	bp->b_flags = B_BUSY;
1128	bp->b_dev = NODEV;
1129	bp->b_vp = NULL;
1130	bp->b_blkno = bp->b_lblkno = 0;
1131	bp->b_offset = NOOFFSET;
1132	bp->b_iodone = 0;
1133	bp->b_error = 0;
1134	bp->b_resid = 0;
1135	bp->b_bcount = 0;
1136	bp->b_npages = 0;
1137	bp->b_dirtyoff = bp->b_dirtyend = 0;
1138	bp->b_validoff = bp->b_validend = 0;
1139	bp->b_usecount = 5;
1140	/* Here, not kern_physio.c, is where this should be done */
1141	LIST_INIT(&bp->b_dep);
1142
1143	maxsize = (maxsize + PAGE_MASK) & ~PAGE_MASK;
1144
1145	/*
1146	 * we assume that buffer_map is not at address 0
1147	 */
1148	addr = 0;
1149	if (maxsize != bp->b_kvasize) {
1150		bfreekva(bp);
1151
1152findkvaspace:
1153		/*
1154		 * See if we have buffer kva space
1155		 */
1156		if (vm_map_findspace(buffer_map,
1157			vm_map_min(buffer_map), maxsize, &addr)) {
1158			if (kvafreespace > 0) {
1159				int totfree = 0, freed;
1160				do {
1161					freed = 0;
1162					for (bp1 = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
1163						bp1 != NULL; bp1 = TAILQ_NEXT(bp1, b_freelist)) {
1164						if (bp1->b_kvasize != 0) {
1165							totfree += bp1->b_kvasize;
1166							freed = bp1->b_kvasize;
1167							bremfree(bp1);
1168							bfreekva(bp1);
1169							brelse(bp1);
1170							break;
1171						}
1172					}
1173				} while (freed);
1174				/*
1175				 * if we found free space, then retry with the same buffer.
1176				 */
1177				if (totfree)
1178					goto findkvaspace;
1179			}
1180			bp->b_flags |= B_INVAL;
1181			brelse(bp);
1182			goto trytofreespace;
1183		}
1184	}
1185
1186	/*
1187	 * See if we have exceeded our allocated buffer space
1188	 */
1189	if (bufspace >= (maxbufspace + nbyteswritten)) {
1190		bp->b_flags |= B_INVAL;
1191		brelse(bp);
1192		goto trytofreespace;
1193	}
1194
1195	/*
1196	 * create a map entry for the buffer -- in essence
1197	 * reserving the kva space.
1198	 */
1199	if (addr) {
1200		vm_map_insert(buffer_map, NULL, 0,
1201			addr, addr + maxsize,
1202			VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
1203
1204		bp->b_kvabase = (caddr_t) addr;
1205		bp->b_kvasize = maxsize;
1206	}
1207	bp->b_data = bp->b_kvabase;
1208
1209	return (bp);
1210}
1211
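/*
 * Try to bring the free buffer count back up by flushing dirty
 * buffers, possibly sleeping until more buffers are released.
 */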
1212static void
1213waitfreebuffers(int slpflag, int slptimeo) {
1214	while (numfreebuffers < hifreebuffers) {
1215		flushdirtybuffers(slpflag, slptimeo);
1216		if (numfreebuffers < hifreebuffers)
1217			break;
1218		needsbuffer |= VFS_BIO_NEED_FREE;
1219		if (tsleep(&needsbuffer, (PRIBIO + 4)|slpflag, "biofre", slptimeo))
1220			break;
1221	}
1222}
1223
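/*
 * Issue asynchronous writes for delayed-write buffers until
 * numdirtybuffers drops to lodirtybuffers.  Only one process flushes
 * at a time; a recursive call by the flusher returns immediately,
 * while other callers wait their turn.
 */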
1224static void
1225flushdirtybuffers(int slpflag, int slptimeo) {
1226	int s;
1227	static pid_t flushing = 0;
1228
1229	s = splbio();
1230
1231	if (flushing) {
1232		if (flushing == curproc->p_pid) {
1233			splx(s);
1234			return;
1235		}
1236		while (flushing) {
1237			if (tsleep(&flushing, (PRIBIO + 4)|slpflag, "biofls", slptimeo)) {
1238				splx(s);
1239				return;
1240			}
1241		}
1242	}
1243	flushing = curproc->p_pid;
1244
1245	while (numdirtybuffers > lodirtybuffers) {
1246		struct buf *bp;
1247		needsbuffer |= VFS_BIO_NEED_LOWLIMIT;
1248		bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]);
1249		if (bp == NULL)
1250			bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]);
1251
1252		while (bp && ((bp->b_flags & B_DELWRI) == 0)) {
1253			bp = TAILQ_NEXT(bp, b_freelist);
1254		}
1255
1256		if (bp) {
1257			vfs_bio_awrite(bp);
1258			continue;
1259		}
1260		break;
1261	}
1262
1263	flushing = 0;
1264	wakeup(&flushing);
1265	splx(s);
1266}
1267
1268/*
1269 * Check to see if a block is currently memory resident.
1270 */
1271struct buf *
1272incore(struct vnode * vp, daddr_t blkno)
1273{
1274	struct buf *bp;
1275
1276	int s = splbio();
1277	bp = gbincore(vp, blkno);
1278	splx(s);
1279	return (bp);
1280}
1281
1282/*
1283 * Returns true if no I/O is needed to access the
1284 * associated VM object.  This is like incore except
1285 * it also hunts around in the VM system for the data.
1286 */
1287
1288int
1289inmem(struct vnode * vp, daddr_t blkno)
1290{
1291	vm_object_t obj;
1292	vm_offset_t toff, tinc;
1293	vm_page_t m;
1294	vm_ooffset_t off;
1295
1296	if (incore(vp, blkno))
1297		return 1;
1298	if (vp->v_mount == NULL)
1299		return 0;
1300	if ((vp->v_object == NULL) || (vp->v_flag & VOBJBUF) == 0)
1301		return 0;
1302
1303	obj = vp->v_object;
1304	tinc = PAGE_SIZE;
1305	if (tinc > vp->v_mount->mnt_stat.f_iosize)
1306		tinc = vp->v_mount->mnt_stat.f_iosize;
1307	off = blkno * vp->v_mount->mnt_stat.f_iosize;
1308
1309	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
1310
1311		m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
1312		if (!m)
1313			return 0;
1314		if (vm_page_is_valid(m,
1315		    (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0)
1316			return 0;
1317	}
1318	return 1;
1319}
1320
1321/*
1322 * Set the dirty range for the buffer.  For NFS, if the
1323 * file is mapped and pages have been written to, let the
1324 * buffer know.  We want the entire range of the buffer
1325 * to be marked dirty if any of the pages have been
1326 * written to, for consistency with the b_validoff and
1327 * b_validend set in the nfs write code and used by the
1328 * nfs read code.
1329 */
1330static void
1331vfs_setdirty(struct buf *bp) {
1332	int i;
1333	vm_object_t object;
1334	vm_offset_t boffset, offset;
1335	/*
1336	 * We qualify the scan for modified pages on whether the
1337	 * object has been flushed yet.  The OBJ_WRITEABLE flag
1338	 * is not cleared simply by protecting pages off.
1339	 */
1340	if ((bp->b_flags & B_VMIO) &&
1341		((object = bp->b_pages[0]->object)->flags & (OBJ_WRITEABLE|OBJ_CLEANING))) {
1342		/*
1343		 * test the pages to see if they have been modified directly
1344		 * by users through the VM system.
1345		 */
1346		for (i = 0; i < bp->b_npages; i++) {
1347			vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
1348			vm_page_test_dirty(bp->b_pages[i]);
1349		}
1350
1351		/*
1352		 * scan forwards for the first page modified
1353		 */
1354		for (i = 0; i < bp->b_npages; i++) {
1355			if (bp->b_pages[i]->dirty) {
1356				break;
1357			}
1358		}
1359		boffset = (i << PAGE_SHIFT);
1360		if (boffset < bp->b_dirtyoff) {
1361			bp->b_dirtyoff = boffset;
1362		}
1363
1364		/*
1365		 * scan backwards for the last page modified
1366		 */
1367		for (i = bp->b_npages - 1; i >= 0; --i) {
1368			if (bp->b_pages[i]->dirty) {
1369				break;
1370			}
1371		}
1372		boffset = (i + 1);
1373		offset = boffset + bp->b_pages[0]->pindex;
1374		if (offset >= object->size)
1375			boffset = object->size - bp->b_pages[0]->pindex;
1376		if (bp->b_dirtyend < (boffset << PAGE_SHIFT))
1377			bp->b_dirtyend = (boffset << PAGE_SHIFT);
1378	}
1379}
1380
1381/*
1382 * Get a block given a specified block and offset into a file/device.
1383 */
1384struct buf *
1385getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
1386{
1387	struct buf *bp;
1388	int i, s;
1389	struct bufhashhdr *bh;
1390	int maxsize;
1391	int generation;
1392	int checksize;
1393
1394	if (vp->v_mount) {
1395		maxsize = vp->v_mount->mnt_stat.f_iosize;
1396		/*
1397		 * This happens on mount points.
1398		 */
1399		if (maxsize < size)
1400			maxsize = size;
1401	} else {
1402		maxsize = size;
1403	}
1404
1405#if !defined(MAX_PERF)
1406	if (size > MAXBSIZE)
1407		panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
1408#endif
1409
1410	s = splbio();
1411loop:
1412	if (numfreebuffers < lofreebuffers) {
1413		waitfreebuffers(slpflag, slptimeo);
1414	}
1415
1416	if ((bp = gbincore(vp, blkno))) {
1417loop1:
1418		if (bp->b_flags & B_BUSY) {
1419
1420			bp->b_flags |= B_WANTED;
1421			if (bp->b_usecount < BUF_MAXUSE)
1422				++bp->b_usecount;
1423
1424			if (!tsleep(bp,
1425				(PRIBIO + 4) | slpflag, "getblk", slptimeo)) {
1426				goto loop;
1427			}
1428
1429			splx(s);
1430			return (struct buf *) NULL;
1431		}
1432		bp->b_flags |= B_BUSY | B_CACHE;
1433		bremfree(bp);
1434
1435		/*
1436		 * check for size inconsistencies (note that they shouldn't
1437		 * happen but do when filesystems don't handle the size changes
1438		 * correctly.) We are conservative on metadata and don't just
1439		 * extend the buffer but write (if needed) and re-constitute it.
1440		 */
1441
1442		if (bp->b_bcount != size) {
1443			if ((bp->b_flags & B_VMIO) && (size <= bp->b_kvasize)) {
1444				allocbuf(bp, size);
1445			} else {
1446				if (bp->b_flags & B_DELWRI) {
1447					bp->b_flags |= B_NOCACHE;
1448					VOP_BWRITE(bp);
1449				} else {
1450					if ((bp->b_flags & B_VMIO) &&
1451					   (LIST_FIRST(&bp->b_dep) == NULL)) {
1452						bp->b_flags |= B_RELBUF;
1453						brelse(bp);
1454					} else {
1455						bp->b_flags |= B_NOCACHE;
1456						VOP_BWRITE(bp);
1457					}
1458				}
1459				goto loop;
1460			}
1461		}
1462
1463#ifdef DIAGNOSTIC
1464		if (bp->b_offset == NOOFFSET)
1465			panic("getblk: no buffer offset");
1466#endif
1467
1468		/*
1469		 * Check that the constituted buffer really deserves to have
1470		 * the B_CACHE bit set.  B_VMIO type buffers might not
1471		 * contain fully valid pages.  Normal (old-style) buffers
1472		 * should be fully valid.
1473		 */
1474		if (bp->b_flags & B_VMIO) {
1475			checksize = bp->b_bufsize;
1476			for (i = 0; i < bp->b_npages; i++) {
1477				int resid;
1478				int poffset;
1479				poffset = bp->b_offset & PAGE_MASK;
1480				resid = (checksize > (PAGE_SIZE - poffset)) ?
1481					(PAGE_SIZE - poffset) : checksize;
1482				if (!vm_page_is_valid(bp->b_pages[i], poffset, resid)) {
1483					bp->b_flags &= ~(B_CACHE | B_DONE);
1484					break;
1485				}
1486				checksize -= resid;
1487			}
1488		}
1489
1490		if (bp->b_usecount < BUF_MAXUSE)
1491			++bp->b_usecount;
1492		splx(s);
1493		return (bp);
1494	} else {
1495		vm_object_t obj;
1496
1497		if ((bp = getnewbuf(vp, blkno,
1498			slpflag, slptimeo, size, maxsize)) == 0) {
1499			if (slpflag || slptimeo) {
1500				splx(s);
1501				return NULL;
1502			}
1503			goto loop;
1504		}
1505
1506		/*
1507		 * This code is used to make sure that a buffer is not
1508		 * created while the getnewbuf routine is blocked.
1509		 * Normally the vnode is locked so this isn't a problem.
1510		 * VBLK type I/O requests, however, don't lock the vnode.
1511		 */
1512		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE && gbincore(vp, blkno)) {
1513			bp->b_flags |= B_INVAL;
1514			brelse(bp);
1515			goto loop;
1516		}
1517
1518		/*
1519		 * Insert the buffer into the hash, so that it can
1520		 * be found by incore.
1521		 */
1522		bp->b_blkno = bp->b_lblkno = blkno;
1523
1524		if (vp->v_type != VBLK)
1525			bp->b_offset = (off_t) blkno * maxsize;
1526		else
1527			bp->b_offset = (off_t) blkno * DEV_BSIZE;
1528
1529		bgetvp(vp, bp);
1530		LIST_REMOVE(bp, b_hash);
1531		bh = BUFHASH(vp, blkno);
1532		LIST_INSERT_HEAD(bh, bp, b_hash);
1533
1534		if ((obj = vp->v_object) && (vp->v_flag & VOBJBUF)) {
1535			bp->b_flags |= (B_VMIO | B_CACHE);
1536#if defined(VFS_BIO_DEBUG)
1537			if (vp->v_type != VREG && vp->v_type != VBLK)
1538				printf("getblk: vmioing file type %d???\n", vp->v_type);
1539#endif
1540		} else {
1541			bp->b_flags &= ~B_VMIO;
1542		}
1543
1544		allocbuf(bp, size);
1545
1546		splx(s);
1547		return (bp);
1548	}
1549}
1550
1551/*
1552 * Get an empty, disassociated buffer of given size.
1553 */
1554struct buf *
1555geteblk(int size)
1556{
1557	struct buf *bp;
1558	int s;
1559
1560	s = splbio();
1561	while ((bp = getnewbuf(0, (daddr_t) 0, 0, 0, size, MAXBSIZE)) == 0);
1562	splx(s);
1563	allocbuf(bp, size);
1564	bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */
1565	return (bp);
1566}
1567
1568
1569/*
1570 * This code constitutes the buffer memory from either anonymous system
1571 * memory (in the case of non-VMIO operations) or from an associated
1572 * VM object (in the case of VMIO operations).
1573 *
1574 * Note that this code is tricky, and has many complications to resolve
1575 * deadlock or inconsistent data situations.  Tread lightly!!!
1576 *
1577 * Modify the length of a buffer's underlying buffer storage without
1578 * destroying information (unless, of course the buffer is shrinking).
1579 */
1580int
1581allocbuf(struct buf * bp, int size)
1582{
1583
1584	int s;
1585	int newbsize, mbsize;
1586	int i;
1587
1588#if !defined(MAX_PERF)
1589	if (!(bp->b_flags & B_BUSY))
1590		panic("allocbuf: buffer not busy");
1591
1592	if (bp->b_kvasize < size)
1593		panic("allocbuf: buffer too small");
1594#endif
1595
1596	if ((bp->b_flags & B_VMIO) == 0) {
1597		caddr_t origbuf;
1598		int origbufsize;
1599		/*
1600		 * Just get anonymous memory from the kernel
1601		 */
1602		mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
1603#if !defined(NO_B_MALLOC)
1604		if (bp->b_flags & B_MALLOC)
1605			newbsize = mbsize;
1606		else
1607#endif
1608			newbsize = round_page(size);
1609
1610		if (newbsize < bp->b_bufsize) {
1611#if !defined(NO_B_MALLOC)
1612			/*
1613			 * malloced buffers are not shrunk
1614			 */
1615			if (bp->b_flags & B_MALLOC) {
1616				if (newbsize) {
1617					bp->b_bcount = size;
1618				} else {
1619					free(bp->b_data, M_BIOBUF);
1620					bufspace -= bp->b_bufsize;
1621					bufmallocspace -= bp->b_bufsize;
1622					bp->b_data = bp->b_kvabase;
1623					bp->b_bufsize = 0;
1624					bp->b_bcount = 0;
1625					bp->b_flags &= ~B_MALLOC;
1626				}
1627				return 1;
1628			}
1629#endif
1630			vm_hold_free_pages(
1631			    bp,
1632			    (vm_offset_t) bp->b_data + newbsize,
1633			    (vm_offset_t) bp->b_data + bp->b_bufsize);
1634		} else if (newbsize > bp->b_bufsize) {
1635#if !defined(NO_B_MALLOC)
1636			/*
1637			 * We only use malloced memory on the first allocation,
1638			 * and revert to page-allocated memory when the buffer grows.
1639			 */
1640			if ( (bufmallocspace < maxbufmallocspace) &&
1641				(bp->b_bufsize == 0) &&
1642				(mbsize <= PAGE_SIZE/2)) {
1643
1644				bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
1645				bp->b_bufsize = mbsize;
1646				bp->b_bcount = size;
1647				bp->b_flags |= B_MALLOC;
1648				bufspace += mbsize;
1649				bufmallocspace += mbsize;
1650				return 1;
1651			}
1652#endif
1653			origbuf = NULL;
1654			origbufsize = 0;
1655#if !defined(NO_B_MALLOC)
1656			/*
1657			 * If the buffer is growing on an allocation other than its first,
1658			 * then we revert to the page-allocation scheme.
1659			 */
1660			if (bp->b_flags & B_MALLOC) {
1661				origbuf = bp->b_data;
1662				origbufsize = bp->b_bufsize;
1663				bp->b_data = bp->b_kvabase;
1664				bufspace -= bp->b_bufsize;
1665				bufmallocspace -= bp->b_bufsize;
1666				bp->b_bufsize = 0;
1667				bp->b_flags &= ~B_MALLOC;
1668				newbsize = round_page(newbsize);
1669			}
1670#endif
1671			vm_hold_load_pages(
1672			    bp,
1673			    (vm_offset_t) bp->b_data + bp->b_bufsize,
1674			    (vm_offset_t) bp->b_data + newbsize);
1675#if !defined(NO_B_MALLOC)
1676			if (origbuf) {
1677				bcopy(origbuf, bp->b_data, origbufsize);
1678				free(origbuf, M_BIOBUF);
1679			}
1680#endif
1681		}
1682	} else {
1683		vm_page_t m;
1684		int desiredpages;
1685
1686		newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
1687		desiredpages = (round_page(newbsize) >> PAGE_SHIFT);
1688
1689#if !defined(NO_B_MALLOC)
1690		if (bp->b_flags & B_MALLOC)
1691			panic("allocbuf: VMIO buffer can't be malloced");
1692#endif
1693
1694		if (newbsize < bp->b_bufsize) {
1695			if (desiredpages < bp->b_npages) {
1696				for (i = desiredpages; i < bp->b_npages; i++) {
1697					/*
1698					 * the page is not freed here -- it
1699					 * is the responsibility of vnode_pager_setsize
1700					 */
1701					m = bp->b_pages[i];
1702#if defined(DIAGNOSTIC)
1703					if (m == bogus_page)
1704						panic("allocbuf: bogus page found");
1705#endif
1706					vm_page_sleep(m, "biodep", &m->busy);
1707
1708					bp->b_pages[i] = NULL;
1709					vm_page_unwire(m);
1710				}
1711				pmap_qremove((vm_offset_t) trunc_page(bp->b_data) +
1712				    (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
1713				bp->b_npages = desiredpages;
1714			}
1715		} else if (newbsize > bp->b_bufsize) {
1716			vm_object_t obj;
1717			vm_offset_t tinc, toff;
1718			vm_ooffset_t off;
1719			vm_pindex_t objoff;
1720			int pageindex, curbpnpages;
1721			struct vnode *vp;
1722			int bsize;
1723			int orig_validoff = bp->b_validoff;
1724			int orig_validend = bp->b_validend;
1725
1726			vp = bp->b_vp;
1727
1728			if (vp->v_type == VBLK)
1729				bsize = DEV_BSIZE;
1730			else
1731				bsize = vp->v_mount->mnt_stat.f_iosize;
1732
1733			if (bp->b_npages < desiredpages) {
1734				obj = vp->v_object;
1735				tinc = PAGE_SIZE;
1736				if (tinc > bsize)
1737					tinc = bsize;
1738
1739				off = bp->b_offset;
1740#ifdef DIAGNOSTIC
1741				if (bp->b_offset == NOOFFSET)
1742					panic("allocbuf: no buffer offset");
1743#endif
1744
1745				curbpnpages = bp->b_npages;
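				/*
				 * Build up the page array for the new size; if a
				 * needed page is busy or cannot be allocated yet,
				 * wait and restart from here.
				 */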
1746		doretry:
1747				bp->b_validoff = orig_validoff;
1748				bp->b_validend = orig_validend;
1749				bp->b_flags |= B_CACHE;
1750				for (toff = 0; toff < newbsize; toff += tinc) {
1751					int bytesinpage;
1752
1753					pageindex = toff >> PAGE_SHIFT;
1754					objoff = OFF_TO_IDX(off + toff);
1755					if (pageindex < curbpnpages) {
1756
1757						m = bp->b_pages[pageindex];
1758#ifdef VFS_BIO_DIAG
1759						if (m->pindex != objoff)
1760							panic("allocbuf: page changed offset??!!!?");
1761#endif
1762						bytesinpage = tinc;
1763						if (tinc > (newbsize - toff))
1764							bytesinpage = newbsize - toff;
1765						if (bp->b_flags & B_CACHE)
1766							vfs_buf_set_valid(bp, off, toff, bytesinpage, m);
1767						continue;
1768					}
1769					m = vm_page_lookup(obj, objoff);
1770					if (!m) {
1771						m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
1772						if (!m) {
1773							VM_WAIT;
1774							vm_pageout_deficit += (desiredpages - bp->b_npages);
1775							goto doretry;
1776						}
1777
1778						vm_page_wire(m);
1779						vm_page_flag_clear(m, PG_BUSY);
1780						bp->b_flags &= ~B_CACHE;
1781
1782					} else if (m->flags & PG_BUSY) {
1783						s = splvm();
1784						if (m->flags & PG_BUSY) {
1785							vm_page_flag_set(m, PG_WANTED);
1786							tsleep(m, PVM, "pgtblk", 0);
1787						}
1788						splx(s);
1789						goto doretry;
1790					} else {
1791						if ((curproc != pageproc) &&
1792							((m->queue - m->pc) == PQ_CACHE) &&
1793						    ((cnt.v_free_count + cnt.v_cache_count) <
1794								(cnt.v_free_min + cnt.v_cache_min))) {
1795							pagedaemon_wakeup();
1796						}
1797						bytesinpage = tinc;
1798						if (tinc > (newbsize - toff))
1799							bytesinpage = newbsize - toff;
1800						if (bp->b_flags & B_CACHE)
1801							vfs_buf_set_valid(bp, off, toff, bytesinpage, m);
1802						vm_page_flag_clear(m, PG_ZERO);
1803						vm_page_wire(m);
1804					}
1805					bp->b_pages[pageindex] = m;
1806					curbpnpages = pageindex + 1;
1807				}
1808				if (vp->v_tag == VT_NFS &&
1809				    vp->v_type != VBLK) {
1810					if (bp->b_dirtyend > 0) {
1811						bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
1812						bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
1813					}
1814					if (bp->b_validend == 0)
1815						bp->b_flags &= ~B_CACHE;
1816				}
1817				bp->b_data = (caddr_t) trunc_page(bp->b_data);
1818				bp->b_npages = curbpnpages;
1819				pmap_qenter((vm_offset_t) bp->b_data,
1820					bp->b_pages, bp->b_npages);
1821				((vm_offset_t) bp->b_data) |= off & PAGE_MASK;
1822			}
1823		}
1824	}
1825	if (bp->b_flags & B_VMIO)
1826		vmiospace += (newbsize - bp->b_bufsize);
1827	bufspace += (newbsize - bp->b_bufsize);
1828	bp->b_bufsize = newbsize;
1829	bp->b_bcount = size;
1830	return 1;
1831}
1832
1833/*
1834 * Wait for buffer I/O completion, returning error status.
1835 */
1836int
1837biowait(register struct buf * bp)
1838{
1839	int s;
1840
1841	s = splbio();
1842	while ((bp->b_flags & B_DONE) == 0)
1843#if defined(NO_SCHEDULE_MODS)
1844		tsleep(bp, PRIBIO, "biowait", 0);
1845#else
1846		if (bp->b_flags & B_READ)
1847			tsleep(bp, PRIBIO, "biord", 0);
1848		else
1849			tsleep(bp, PRIBIO, "biowr", 0);
1850#endif
1851	splx(s);
1852	if (bp->b_flags & B_EINTR) {
1853		bp->b_flags &= ~B_EINTR;
1854		return (EINTR);
1855	}
1856	if (bp->b_flags & B_ERROR) {
1857		return (bp->b_error ? bp->b_error : EIO);
1858	} else {
1859		return (0);
1860	}
1861}
1862
1863/*
1864 * Finish I/O on a buffer, calling an optional function.
1865 * This is usually called from interrupt level, so process blocking
1866 * is not *a good idea*.
1867 */
1868void
1869biodone(register struct buf * bp)
1870{
1871	int s;
1872
1873	s = splbio();
1874
1875#if !defined(MAX_PERF)
1876	if (!(bp->b_flags & B_BUSY))
1877		panic("biodone: buffer not busy");
1878#endif
1879
1880	if (bp->b_flags & B_DONE) {
1881		splx(s);
1882#if !defined(MAX_PERF)
1883		printf("biodone: buffer already done\n");
1884#endif
1885		return;
1886	}
1887	bp->b_flags |= B_DONE;
1888
1889	if (bp->b_flags & B_FREEBUF) {
1890		brelse(bp);
1891		splx(s);
1892		return;
1893	}
1894
1895	if ((bp->b_flags & B_READ) == 0) {
1896		vwakeup(bp);
1897	}
1898
1899	/* call optional completion function if requested */
1900	if (bp->b_flags & B_CALL) {
1901		bp->b_flags &= ~B_CALL;
1902		(*bp->b_iodone) (bp);
1903		splx(s);
1904		return;
1905	}
1906	if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_complete)
1907		(*bioops.io_complete)(bp);
1908
1909	if (bp->b_flags & B_VMIO) {
1910		int i, resid;
1911		vm_ooffset_t foff;
1912		vm_page_t m;
1913		vm_object_t obj;
1914		int iosize;
1915		struct vnode *vp = bp->b_vp;
1916
1917		obj = vp->v_object;
1918
1919#if defined(VFS_BIO_DEBUG)
1920		if (vp->v_usecount == 0) {
1921			panic("biodone: zero vnode ref count");
1922		}
1923
1924		if (vp->v_object == NULL) {
1925			panic("biodone: missing VM object");
1926		}
1927
1928		if ((vp->v_flag & VOBJBUF) == 0) {
1929			panic("biodone: vnode is not setup for merged cache");
1930		}
1931#endif
1932
1933		foff = bp->b_offset;
1934#ifdef DIAGNOSTIC
1935		if (bp->b_offset == NOOFFSET)
1936			panic("biodone: no buffer offset");
1937#endif
1938
1939#if !defined(MAX_PERF)
1940		if (!obj) {
1941			panic("biodone: no object");
1942		}
1943#endif
1944#if defined(VFS_BIO_DEBUG)
1945		if (obj->paging_in_progress < bp->b_npages) {
1946			printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
1947			    obj->paging_in_progress, bp->b_npages);
1948		}
1949#endif
1950		iosize = bp->b_bufsize;
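		/*
		 * Walk the pages underlying the buffer, replacing any
		 * bogus_page entries with the real pages and, for reads,
		 * marking the covered ranges valid; I/O is then finished
		 * on each page.
		 */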
1951		for (i = 0; i < bp->b_npages; i++) {
1952			int bogusflag = 0;
1953			m = bp->b_pages[i];
1954			if (m == bogus_page) {
1955				bogusflag = 1;
1956				m = vm_page_lookup(obj, OFF_TO_IDX(foff));
1957				if (!m) {
1958#if defined(VFS_BIO_DEBUG)
1959					printf("biodone: page disappeared\n");
1960#endif
1961					vm_object_pip_subtract(obj, 1);
1962					continue;
1963				}
1964				bp->b_pages[i] = m;
1965				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1966			}
1967#if defined(VFS_BIO_DEBUG)
1968			if (OFF_TO_IDX(foff) != m->pindex) {
1969				printf("biodone: foff(%d)/m->pindex(%d) mismatch\n", foff, m->pindex);
1970			}
1971#endif
1972			resid = IDX_TO_OFF(m->pindex + 1) - foff;
1973			if (resid > iosize)
1974				resid = iosize;
1975
1976			/*
1977			 * In the write case, the valid and clean bits are
1978			 * already changed correctly, so we only need to do this
1979			 * here in the read case.
1980			 */
1981			if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
1982				vfs_page_set_valid(bp, foff, i, m);
1983			}
1984			vm_page_flag_clear(m, PG_ZERO);
1985
1986			/*
1987			 * when debugging new filesystems or buffer I/O methods, this
1988			 * is the most common error that pops up.  if you see this, you
1989			 * have not set the page busy flag correctly!!!
1990			 */
1991			if (m->busy == 0) {
1992#if !defined(MAX_PERF)
1993				printf("biodone: page busy < 0, "
1994				    "pindex: %d, foff: 0x(%x,%x), "
1995				    "resid: %d, index: %d\n",
1996				    (int) m->pindex, (int)(foff >> 32),
1997						(int) foff & 0xffffffff, resid, i);
1998#endif
1999				if (vp->v_type != VBLK)
2000#if !defined(MAX_PERF)
2001					printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n",
2002					    bp->b_vp->v_mount->mnt_stat.f_iosize,
2003					    (int) bp->b_lblkno,
2004					    bp->b_flags, bp->b_npages);
2005				else
2006					printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n",
2007					    (int) bp->b_lblkno,
2008					    bp->b_flags, bp->b_npages);
2009				printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n",
2010				    m->valid, m->dirty, m->wire_count);
2011#endif
2012				panic("biodone: page busy < 0\n");
2013			}
2014			vm_page_io_finish(m);
2015			vm_object_pip_subtract(obj, 1);
2016			foff += resid;
2017			iosize -= resid;
2018		}
2019		if (obj &&
2020			(obj->paging_in_progress == 0) &&
2021		    (obj->flags & OBJ_PIPWNT)) {
2022			vm_object_clear_flag(obj, OBJ_PIPWNT);
2023			wakeup(obj);
2024		}
2025	}
2026	/*
2027	 * For asynchronous completions, release the buffer now. The brelse
2028	 * checks for B_WANTED and will do the wakeup there if necessary - so
2029	 * no need to do a wakeup here in the async case.
2030	 */
2031
2032	if (bp->b_flags & B_ASYNC) {
2033		if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0)
2034			brelse(bp);
2035		else
2036			bqrelse(bp);
2037	} else {
2038		bp->b_flags &= ~B_WANTED;
2039		wakeup(bp);
2040	}
2041	splx(s);
2042}
2043
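/*
 * Count the buffers currently sitting on the locked queue.
 */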
2044static int
2045count_lock_queue()
2046{
2047	int count;
2048	struct buf *bp;
2049
2050	count = 0;
2051	for (bp = TAILQ_FIRST(&bufqueues[QUEUE_LOCKED]);
2052	    bp != NULL;
2053	    bp = TAILQ_NEXT(bp, b_freelist))
2054		count++;
2055	return (count);
2056}
2057
2058#if 0	/* not with Kirk's code */
2059static int vfs_update_interval = 30;
2060
2061static void
2062vfs_update()
2063{
2064	while (1) {
2065		tsleep(&vfs_update_wakeup, PUSER, "update",
2066		    hz * vfs_update_interval);
2067		vfs_update_wakeup = 0;
2068		sync(curproc, NULL);
2069	}
2070}
2071
2072static int
2073sysctl_kern_updateinterval SYSCTL_HANDLER_ARGS
2074{
2075	int error = sysctl_handle_int(oidp,
2076		oidp->oid_arg1, oidp->oid_arg2, req);
2077	if (!error)
2078		wakeup(&vfs_update_wakeup);
2079	return error;
2080}
2081
2082SYSCTL_PROC(_kern, KERN_UPDATEINTERVAL, update, CTLTYPE_INT|CTLFLAG_RW,
2083	&vfs_update_interval, 0, sysctl_kern_updateinterval, "I", "");
2084
2085#endif
2086
2087
2088/*
2089 * This routine is called in lieu of iodone in the case of
2090 * incomplete I/O.  This keeps the busy status for pages
2091 * consistent.
2092 */
2093void
2094vfs_unbusy_pages(struct buf * bp)
2095{
2096	int i;
2097
2098	if (bp->b_flags & B_VMIO) {
2099		struct vnode *vp = bp->b_vp;
2100		vm_object_t obj = vp->v_object;
2101
2102		for (i = 0; i < bp->b_npages; i++) {
2103			vm_page_t m = bp->b_pages[i];
2104
2105			if (m == bogus_page) {
2106				m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
2107#if !defined(MAX_PERF)
2108				if (!m) {
2109					panic("vfs_unbusy_pages: page missing\n");
2110				}
2111#endif
2112				bp->b_pages[i] = m;
2113				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
2114			}
2115			vm_object_pip_subtract(obj, 1);
2116			vm_page_flag_clear(m, PG_ZERO);
2117			vm_page_io_finish(m);
2118		}
2119		if (obj->paging_in_progress == 0 &&
2120		    (obj->flags & OBJ_PIPWNT)) {
2121			vm_object_clear_flag(obj, OBJ_PIPWNT);
2122			wakeup(obj);
2123		}
2124	}
2125}
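
/*
 * Illustrative sketch only, not compiled: the intended pairing of
 * vfs_busy_pages() with either biodone() or vfs_unbusy_pages().  The
 * issue_read() routine below is hypothetical and stands in for handing
 * the buffer to a driver's strategy code.
 */
#if 0
static void
example_vmio_read(struct buf *bp)
{
	vfs_busy_pages(bp, 0);		/* read: do not pre-validate the pages */
	if (issue_read(bp) != 0) {
		/*
		 * The transfer never started, so biodone() will not run;
		 * undo the page busying by hand instead.
		 */
		vfs_unbusy_pages(bp);
		bp->b_flags |= B_ERROR;
	}
	/* On the normal path the completion interrupt ends in biodone(). */
}
#endif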
2126
2127/*
2128 * Set NFS' b_validoff and b_validend fields from the valid bits
2129 * of a page.  If the consumer is not NFS, and the page is not
2130 * valid for the entire range, clear the B_CACHE flag to force
2131 * the consumer to re-read the page.
2132 */
2133static void
2134vfs_buf_set_valid(struct buf *bp,
2135		  vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
2136		  vm_page_t m)
2137{
2138	if (bp->b_vp->v_tag == VT_NFS && bp->b_vp->v_type != VBLK) {
2139		vm_offset_t svalid, evalid;
2140		int validbits = m->valid;
2141
2142		/*
2143		 * This only bothers with the first valid range in the
2144		 * page.
2145		 */
2146		svalid = off;
2147		while (validbits && !(validbits & 1)) {
2148			svalid += DEV_BSIZE;
2149			validbits >>= 1;
2150		}
2151		evalid = svalid;
2152		while (validbits & 1) {
2153			evalid += DEV_BSIZE;
2154			validbits >>= 1;
2155		}
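		/*
		 * Worked example (hypothetical values): with DEV_BSIZE 512,
		 * off 0 and m->valid == 0x3c, the two scans above leave
		 * svalid == 1024 and evalid == 3072, covering DEV_BSIZE
		 * chunks 2 through 5 of the page.
		 */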
2156		/*
2157		 * Make sure this range is contiguous with the range
2158		 * built up from previous pages.  If not, then we will
2159		 * just use the range from the previous pages.
2160		 */
2161		if (svalid == bp->b_validend) {
2162			bp->b_validoff = min(bp->b_validoff, svalid);
2163			bp->b_validend = max(bp->b_validend, evalid);
2164		}
2165	} else if (!vm_page_is_valid(m,
2166				     (vm_offset_t) ((foff + off) & PAGE_MASK),
2167				     size)) {
2168		bp->b_flags &= ~B_CACHE;
2169	}
2170}
2171
2172/*
2173 * Set the valid bits in a page, taking care of the b_validoff,
2174 * b_validend fields which NFS uses to optimise small reads.  Off is
2175 * the offset within the file and pageno is the page index within the buf.
2176 */
2177static void
2178vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
2179{
2180	struct vnode *vp = bp->b_vp;
2181	vm_ooffset_t soff, eoff;
2182
2183	soff = off;
2184	eoff = off + min(PAGE_SIZE, bp->b_bufsize);
2185	if (vp->v_tag == VT_NFS && vp->v_type != VBLK) {
2186		vm_ooffset_t sv, ev;
2187		vm_page_set_invalid(m,
2188		    (vm_offset_t) (soff & PAGE_MASK),
2189		    (vm_offset_t) (eoff - soff));
2190		off = off - pageno * PAGE_SIZE;
2191		sv = off + ((bp->b_validoff + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1));
2192		ev = off + ((bp->b_validend + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1));
2193		soff = qmax(sv, soff);
2194		eoff = qmin(ev, eoff);
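		/*
		 * Worked example (hypothetical values): with DEV_BSIZE 512,
		 * pageno 0, off 0, b_validoff 100 and b_validend 1500, the
		 * round-ups above give sv == 512 and ev == 1536, so only
		 * that DEV_BSIZE-aligned subrange is marked valid and clean
		 * below.
		 */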
2195	}
2196	if (eoff > soff)
2197		vm_page_set_validclean(m,
2198	       (vm_offset_t) (soff & PAGE_MASK),
2199	       (vm_offset_t) (eoff - soff));
2200}
2201
2202/*
2203 * This routine is called before a device strategy routine.
2204 * It is used to tell the VM system that paging I/O is in
2205 * progress and to treat the pages associated with the buffer
2206 * almost as if they were PG_BUSY.  The object's paging_in_progress
2207 * count is also maintained so that the object does not become
2208 * inconsistent.
2209 */
2210void
2211vfs_busy_pages(struct buf * bp, int clear_modify)
2212{
2213	int i;
2214
2215	if (bp->b_flags & B_VMIO) {
2216		struct vnode *vp = bp->b_vp;
2217		vm_object_t obj = vp->v_object;
2218		vm_ooffset_t foff;
2219
2220		foff = bp->b_offset;
2221#ifdef DIAGNOSTIC
2222		if (bp->b_offset == NOOFFSET)
2223			panic("vfs_busy_pages: no buffer offset");
2224#endif
2225
2226		vfs_setdirty(bp);
2227
2228retry:
2229		for (i = 0; i < bp->b_npages; i++) {
2230			vm_page_t m = bp->b_pages[i];
2231			if (vm_page_sleep(m, "vbpage", NULL))
2232				goto retry;
2233		}
2234
2235		for (i = 0; i < bp->b_npages; i++, foff += PAGE_SIZE) {
2236			vm_page_t m = bp->b_pages[i];
2237
2238			vm_page_flag_clear(m, PG_ZERO);
2239			if ((bp->b_flags & B_CLUSTER) == 0) {
2240				vm_object_pip_add(obj, 1);
2241				vm_page_io_start(m);
2242			}
2243
2244			vm_page_protect(m, VM_PROT_NONE);
2245			if (clear_modify)
2246				vfs_page_set_valid(bp, foff, i, m);
2247			else if (bp->b_bcount >= PAGE_SIZE) {
2248				if (m->valid && (bp->b_flags & B_CACHE) == 0) {
2249					bp->b_pages[i] = bogus_page;
2250					pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
2251				}
2252			}
2253		}
2254	}
2255}
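
/*
 * Note on the bogus_page substitution above: when clear_modify is not set
 * (the read case), b_bcount is at least a page, and a page already holds
 * valid data while B_CACHE is clear, that page is temporarily replaced by
 * bogus_page in the buffer's page list so the device transfer cannot
 * overwrite the valid copy.  biodone() looks the real page up again and
 * restores the mapping when the I/O completes.
 */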
2256
2257/*
2258 * Tell the VM system that the pages associated with this buffer
2259 * are clean.  This is used for delayed writes where the data is
2260 * going to go to disk eventually without additional VM intervention.
2261 */
2262void
2263vfs_clean_pages(struct buf * bp)
2264{
2265	int i;
2266
2267	if (bp->b_flags & B_VMIO) {
2268		struct vnode *vp = bp->b_vp;
2269		vm_ooffset_t foff;
2270		foff = bp->b_offset;
2271
2272#ifdef DIAGNOSTIC
2273		if (bp->b_offset == NOOFFSET)
2274			panic("vfs_clean_pages: no buffer offset");
2275#endif
2276
2277		for (i = 0; i < bp->b_npages; i++, foff += PAGE_SIZE) {
2278			vm_page_t m = bp->b_pages[i];
2279			vfs_page_set_valid(bp, foff, i, m);
2280		}
2281	}
2282}
2283
2284void
2285vfs_bio_clrbuf(struct buf *bp) {
2286	int i;
2287	if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) {
2288		if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE)) {
2289			int mask;
2290			mask = 0;
2291			for (i = 0; i < bp->b_bufsize; i += DEV_BSIZE)
2292				mask |= (1 << (i/DEV_BSIZE));
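			/*
			 * Worked example (hypothetical values): with DEV_BSIZE
			 * 512 and b_bufsize 2048 the loop above sets bits 0-3,
			 * so mask == 0x0f.
			 */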
2293			if (((bp->b_pages[0]->flags & PG_ZERO) == 0) &&
2294				(bp->b_pages[0]->valid != mask)) {
2295				bzero(bp->b_data, bp->b_bufsize);
2296			}
2297			bp->b_pages[0]->valid = mask;
2298			bp->b_resid = 0;
2299			return;
2300		}
2301		for (i = 0; i < bp->b_npages; i++) {
2302			if (bp->b_pages[i]->valid == VM_PAGE_BITS_ALL)
2303				continue;
2304			if (bp->b_pages[i]->valid == 0) {
2305				if ((bp->b_pages[i]->flags & PG_ZERO) == 0) {
2306					bzero(bp->b_data + (i << PAGE_SHIFT), PAGE_SIZE);
2307				}
2308			} else {
2309				int j;
2310				for (j = 0; j < PAGE_SIZE / DEV_BSIZE; j++) {
2311					if (((bp->b_pages[i]->flags & PG_ZERO) == 0) &&
2312						(bp->b_pages[i]->valid & (1 << j)) == 0)
2313						bzero(bp->b_data + (i << PAGE_SHIFT) + j * DEV_BSIZE, DEV_BSIZE);
2314				}
2315			}
2316			bp->b_pages[i]->valid = VM_PAGE_BITS_ALL;
2317			vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
2318		}
2319		bp->b_resid = 0;
2320	} else {
2321		clrbuf(bp);
2322	}
2323}
2324
2325/*
2326 * vm_hold_load_pages and vm_hold_free_pages get pages into and out of
2327 * a buffer's address space.  The pages are anonymous and are
2328 * not associated with a file object.
2329 */
2330void
2331vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
2332{
2333	vm_offset_t pg;
2334	vm_page_t p;
2335	int index;
2336
2337	to = round_page(to);
2338	from = round_page(from);
2339	index = (from - trunc_page(bp->b_data)) >> PAGE_SHIFT;
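	/*
	 * Illustrative note (hypothetical addresses): with PAGE_SIZE 4096,
	 * b_data at 0xc1802000 and from rounded up to 0xc1804000, index
	 * starts at 2, i.e. the third page slot of the buffer.
	 */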
2340
2341	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
2342
2343tryagain:
2344
2345		p = vm_page_alloc(kernel_object,
2346			((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
2347		    VM_ALLOC_NORMAL);
2348		if (!p) {
2349			vm_pageout_deficit += (to - from) >> PAGE_SHIFT;
2350			VM_WAIT;
2351			goto tryagain;
2352		}
2353		vm_page_wire(p);
2354		p->valid = VM_PAGE_BITS_ALL;
2355		vm_page_flag_clear(p, PG_ZERO);
2356		pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
2357		bp->b_pages[index] = p;
2358		vm_page_wakeup(p);
2359	}
2360	bp->b_npages = index;
2361}
2362
2363void
2364vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
2365{
2366	vm_offset_t pg;
2367	vm_page_t p;
2368	int index, newnpages;
2369
2370	from = round_page(from);
2371	to = round_page(to);
2372	newnpages = index = (from - trunc_page(bp->b_data)) >> PAGE_SHIFT;
2373
2374	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
2375		p = bp->b_pages[index];
2376		if (p && (index < bp->b_npages)) {
2377#if !defined(MAX_PERF)
2378			if (p->busy) {
2379				printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n",
2380					bp->b_blkno, bp->b_lblkno);
2381			}
2382#endif
2383			bp->b_pages[index] = NULL;
2384			pmap_kremove(pg);
2385			vm_page_busy(p);
2386			vm_page_unwire(p);
2387			vm_page_free(p);
2388		}
2389	}
2390	bp->b_npages = newnpages;
2391}
2392
2393
2394#include "opt_ddb.h"
2395#ifdef DDB
2396#include <ddb/ddb.h>
2397
2398DB_SHOW_COMMAND(buffer, db_show_buffer)
2399{
2400	/* get args */
2401	struct buf *bp = (struct buf *)addr;
2402
2403	if (!have_addr) {
2404		db_printf("usage: show buffer <addr>\n");
2405		return;
2406	}
2407
2408	db_printf("b_proc = %p,\nb_flags = 0x%b\n", (void *)bp->b_proc,
2409		  (u_int)bp->b_flags, PRINT_BUF_FLAGS);
2410	db_printf("b_error = %d, b_bufsize = %ld, b_bcount = %ld, "
2411		  "b_resid = %ld\nb_dev = 0x%x, b_data = %p, "
2412		  "b_blkno = %d, b_pblkno = %d\n",
2413		  bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
2414		  bp->b_dev, bp->b_data, bp->b_blkno, bp->b_pblkno);
2415	if (bp->b_npages) {
2416		int i;
2417		db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
2418		for (i = 0; i < bp->b_npages; i++) {
2419			vm_page_t m;
2420			m = bp->b_pages[i];
2421			db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object,
2422			    (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m));
2423			if ((i + 1) < bp->b_npages)
2424				db_printf(",");
2425		}
2426		db_printf("\n");
2427	}
2428}
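
/*
 * Usage sketch from the in-kernel debugger (the address is hypothetical):
 *
 *	db> show buffer 0xc12345a0
 *
 * where the argument is the address of a struct buf.
 */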
2429#endif /* DDB */
2430