vfs_bio.c revision 34611
1/*
2 * Copyright (c) 1994,1997 John S. Dyson
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice immediately at the beginning of the file, without modification,
10 *    this list of conditions, and the following disclaimer.
11 * 2. Absolutely no warranty of function or purpose is made by the author
12 *		John S. Dyson.
13 *
14 * $Id: vfs_bio.c,v 1.155 1998/03/08 09:57:04 julian Exp $
15 */
16
17/*
18 * this file contains a new buffer I/O scheme implementing a coherent
19 * VM object and buffer cache scheme.  Pains have been taken to make
20 * sure that the performance degradation associated with schemes such
21 * as this is not realized.
22 *
23 * Author:  John S. Dyson
24 * Significant help during the development and debugging phases
25 * had been provided by David Greenman, also of the FreeBSD core team.
26 */
27
28#include "opt_bounce.h"
29
30#define VMIO
31#include <sys/param.h>
32#include <sys/systm.h>
33#include <sys/sysproto.h>
34#include <sys/kernel.h>
35#include <sys/sysctl.h>
36#include <sys/proc.h>
37#include <sys/vnode.h>
38#include <sys/vmmeter.h>
39#include <sys/lock.h>
40#include <miscfs/specfs/specdev.h>
41#include <vm/vm.h>
42#include <vm/vm_param.h>
43#include <vm/vm_prot.h>
44#include <vm/vm_kern.h>
45#include <vm/vm_pageout.h>
46#include <vm/vm_page.h>
47#include <vm/vm_object.h>
48#include <vm/vm_extern.h>
49#include <vm/vm_map.h>
50#include <sys/buf.h>
51#include <sys/mount.h>
52#include <sys/malloc.h>
53#include <sys/resourcevar.h>
54
55static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer");
56
57struct	bio_ops bioops;		/* I/O operation notification */
58
59#if 0 	/* replaced by sched_sync */
60static void vfs_update __P((void));
61static struct	proc *updateproc;
62static struct kproc_desc up_kp = {
63	"update",
64	vfs_update,
65	&updateproc
66};
67SYSINIT_KT(update, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
68#endif
69
70struct buf *buf;		/* buffer header pool */
71struct swqueue bswlist;
72
73static int count_lock_queue __P((void));
74static void vm_hold_free_pages(struct buf * bp, vm_offset_t from,
75		vm_offset_t to);
76static void vm_hold_load_pages(struct buf * bp, vm_offset_t from,
77		vm_offset_t to);
78static void vfs_buf_set_valid(struct buf *bp, vm_ooffset_t foff,
79			      vm_offset_t off, vm_offset_t size,
80			      vm_page_t m);
81static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off,
82			       int pageno, vm_page_t m);
83static void vfs_clean_pages(struct buf * bp);
84static void vfs_setdirty(struct buf *bp);
85static void vfs_vmio_release(struct buf *bp);
86static void flushdirtybuffers(int slpflag, int slptimeo);
87
88int needsbuffer;
89
90/*
91 * Internal update daemon, process 3
92 *	The variable vfs_update_wakeup allows for internal syncs.
93 */
94int vfs_update_wakeup;
95
96
97/*
98 * buffers base kva
99 */
100
101/*
102 * bogus page -- for I/O to/from partially complete buffers
103 * this is a temporary solution to the problem, but it is not
104 * really that bad.  it would be better to split the buffer
105 * for input in the case of buffers partially already in memory,
106 * but the code is intricate enough already.
107 */
108vm_page_t bogus_page;
109static vm_offset_t bogus_offset;
110
111static int bufspace, maxbufspace, vmiospace, maxvmiobufspace,
112	bufmallocspace, maxbufmallocspace;
113int numdirtybuffers;
114static int lodirtybuffers, hidirtybuffers;
115static int numfreebuffers, lofreebuffers, hifreebuffers;
116static int kvafreespace;
117
118SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD,
119	&numdirtybuffers, 0, "");
120SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW,
121	&lodirtybuffers, 0, "");
122SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW,
123	&hidirtybuffers, 0, "");
124SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD,
125	&numfreebuffers, 0, "");
126SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW,
127	&lofreebuffers, 0, "");
128SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW,
129	&hifreebuffers, 0, "");
130SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW,
131	&maxbufspace, 0, "");
132SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD,
133	&bufspace, 0, "");
134SYSCTL_INT(_vfs, OID_AUTO, maxvmiobufspace, CTLFLAG_RW,
135	&maxvmiobufspace, 0, "");
136SYSCTL_INT(_vfs, OID_AUTO, vmiospace, CTLFLAG_RD,
137	&vmiospace, 0, "");
138SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW,
139	&maxbufmallocspace, 0, "");
140SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD,
141	&bufmallocspace, 0, "");
142SYSCTL_INT(_vfs, OID_AUTO, kvafreespace, CTLFLAG_RD,
143	&kvafreespace, 0, "");
144
145static LIST_HEAD(bufhashhdr, buf) bufhashtbl[BUFHSZ], invalhash;
146struct bqueues bufqueues[BUFFER_QUEUES] = {0};
147
148extern int vm_swap_size;
149
150#define BUF_MAXUSE 24
151
152#define VFS_BIO_NEED_ANY 1
153#define VFS_BIO_NEED_LOWLIMIT 2
154#define VFS_BIO_NEED_FREE 4
155
156/*
157 * Initialize buffer headers and related structures.
158 */
159void
160bufinit()
161{
162	struct buf *bp;
163	int i;
164
165	TAILQ_INIT(&bswlist);
166	LIST_INIT(&invalhash);
167
168	/* first, make a null hash table */
169	for (i = 0; i < BUFHSZ; i++)
170		LIST_INIT(&bufhashtbl[i]);
171
172	/* next, make a null set of free lists */
173	for (i = 0; i < BUFFER_QUEUES; i++)
174		TAILQ_INIT(&bufqueues[i]);
175
176	/* finally, initialize each buffer header and stick on empty q */
177	for (i = 0; i < nbuf; i++) {
178		bp = &buf[i];
179		bzero(bp, sizeof *bp);
180		bp->b_flags = B_INVAL;	/* we're just an empty header */
181		bp->b_dev = NODEV;
182		bp->b_rcred = NOCRED;
183		bp->b_wcred = NOCRED;
184		bp->b_qindex = QUEUE_EMPTY;
185		bp->b_vnbufs.le_next = NOLIST;
186		bp->b_generation = 0;
187		LIST_INIT(&bp->b_dep);
188		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
189		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
190	}
191/*
192 * maxbufspace is currently calculated to support all filesystem blocks
193 * to be 8K.  If you happen to use a 16K filesystem, the size of the buffer
194 * cache is still the same as it would be for 8K filesystems.  This
195 * keeps the size of the buffer cache "in check" for big block filesystems.
196 */
197	maxbufspace = (nbuf + 8) * DFLTBSIZE;
198/*
199 * reserve 1/3 of the buffers for metadata (VDIR) which might not be VMIO'ed
200 */
201	maxvmiobufspace = 2 * maxbufspace / 3;
202/*
203 * Limit the amount of malloc memory since it is wired permanently into
204 * the kernel space.  Even though this is accounted for in the buffer
205 * allocation, we don't want the malloced region to grow uncontrolled.
206 * The malloc scheme improves memory utilization significantly on average
207 * (small) directories.
208 */
209	maxbufmallocspace = maxbufspace / 20;
210
211/*
212 * Reduce the probability of deadlock conditions by limiting the
213 * number of dirty buffers.
214 */
215	hidirtybuffers = nbuf / 8 + 20;
216	lodirtybuffers = nbuf / 16 + 10;
217	numdirtybuffers = 0;
218	lofreebuffers = nbuf / 18 + 5;
219	hifreebuffers = 2 * lofreebuffers;
220	numfreebuffers = nbuf;
221	kvafreespace = 0;
222
223	bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
224	bogus_page = vm_page_alloc(kernel_object,
225			((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
226			VM_ALLOC_NORMAL);
227
228}
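
/*
 * Sizing sketch (illustrative, since nbuf is machine dependent): with
 * nbuf == 1024 and the 8K default block size referred to above,
 *	maxbufspace       = (1024 + 8) * 8K	(about 8MB)
 *	maxvmiobufspace   = 2/3 of that		(about 5.4MB)
 *	maxbufmallocspace = 1/20 of that	(about 400KB)
 *	hidirtybuffers    = 1024/8 + 20		(148)
 *	lodirtybuffers    = 1024/16 + 10	(74)
 *	lofreebuffers     = 1024/18 + 5		(61)
 *	hifreebuffers     = 2 * 61		(122)
 * Only the relative proportions matter; the absolute numbers scale
 * with nbuf.
 */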
229
230/*
231 * Free the kva allocation for a buffer
232 * Must be called only at splbio or higher,
233 *  as this is the only locking for buffer_map.
234 */
235static void
236bfreekva(struct buf * bp)
237{
238	if (bp->b_kvasize == 0)
239		return;
240
241	vm_map_delete(buffer_map,
242		(vm_offset_t) bp->b_kvabase,
243		(vm_offset_t) bp->b_kvabase + bp->b_kvasize);
244
245	bp->b_kvasize = 0;
246
247}
248
249/*
250 * remove the buffer from the appropriate free list
251 */
252void
253bremfree(struct buf * bp)
254{
255	int s = splbio();
256
257	if (bp->b_qindex != QUEUE_NONE) {
258		if (bp->b_qindex == QUEUE_EMPTY) {
259			kvafreespace -= bp->b_kvasize;
260		}
261		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
262		bp->b_qindex = QUEUE_NONE;
263	} else {
264#if !defined(MAX_PERF)
265		panic("bremfree: removing a buffer when not on a queue");
266#endif
267	}
268	if ((bp->b_flags & B_INVAL) ||
269		(bp->b_flags & (B_DELWRI|B_LOCKED)) == 0)
270		--numfreebuffers;
271	splx(s);
272}
273
274
275/*
276 * Get a buffer with the specified data.  Look in the cache first.
277 */
278int
279bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
280    struct buf ** bpp)
281{
282	struct buf *bp;
283
284	bp = getblk(vp, blkno, size, 0, 0);
285	*bpp = bp;
286
287	/* if not found in cache, do some I/O */
288	if ((bp->b_flags & B_CACHE) == 0) {
289		if (curproc != NULL)
290			curproc->p_stats->p_ru.ru_inblock++;
291		bp->b_flags |= B_READ;
292		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
293		if (bp->b_rcred == NOCRED) {
294			if (cred != NOCRED)
295				crhold(cred);
296			bp->b_rcred = cred;
297		}
298		vfs_busy_pages(bp, 0);
299		VOP_STRATEGY(bp);
300		return (biowait(bp));
301	}
302	return (0);
303}
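
/*
 * Usage sketch (illustrative only; "lbn" and "bsize" stand for a
 * caller-supplied logical block number and block size):
 *
 *	error = bread(vp, lbn, bsize, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	...examine or copy out bp->b_data...
 *	brelse(bp);
 *
 * On a cache hit the buffer comes back with B_CACHE set and no I/O is
 * issued; on a miss the caller sleeps in biowait() until the strategy
 * routine completes.
 */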
304
305/*
306 * Operates like bread, but also starts asynchronous I/O on
307 * read-ahead blocks.
308 */
309int
310breadn(struct vnode * vp, daddr_t blkno, int size,
311    daddr_t * rablkno, int *rabsize,
312    int cnt, struct ucred * cred, struct buf ** bpp)
313{
314	struct buf *bp, *rabp;
315	int i;
316	int rv = 0, readwait = 0;
317
318	*bpp = bp = getblk(vp, blkno, size, 0, 0);
319
320	/* if not found in cache, do some I/O */
321	if ((bp->b_flags & B_CACHE) == 0) {
322		if (curproc != NULL)
323			curproc->p_stats->p_ru.ru_inblock++;
324		bp->b_flags |= B_READ;
325		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
326		if (bp->b_rcred == NOCRED) {
327			if (cred != NOCRED)
328				crhold(cred);
329			bp->b_rcred = cred;
330		}
331		vfs_busy_pages(bp, 0);
332		VOP_STRATEGY(bp);
333		++readwait;
334	}
335	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
336		if (inmem(vp, *rablkno))
337			continue;
338		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
339
340		if ((rabp->b_flags & B_CACHE) == 0) {
341			if (curproc != NULL)
342				curproc->p_stats->p_ru.ru_inblock++;
343			rabp->b_flags |= B_READ | B_ASYNC;
344			rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
345			if (rabp->b_rcred == NOCRED) {
346				if (cred != NOCRED)
347					crhold(cred);
348				rabp->b_rcred = cred;
349			}
350			vfs_busy_pages(rabp, 0);
351			VOP_STRATEGY(rabp);
352		} else {
353			brelse(rabp);
354		}
355	}
356
357	if (readwait) {
358		rv = biowait(bp);
359	}
360	return (rv);
361}
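
/*
 * Read-ahead sketch: a filesystem read path might, for example, issue
 *	breadn(vp, lbn, bsize, &nextlbn, &bsize, 1, cred, &bp);
 * so that block "lbn" is read synchronously while "nextlbn" is started
 * with B_ASYNC and simply released by biodone() when it finishes.
 * The names are illustrative; note that only read-ahead blocks not
 * already in memory (per inmem()) are actually scheduled.
 */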
362
363/*
364 * Write, release buffer on completion.  (Done by iodone
365 * if async.)
366 */
367int
368bwrite(struct buf * bp)
369{
370	int oldflags = bp->b_flags;
371	struct vnode *vp;
372	struct mount *mp;
373
374
375	if (bp->b_flags & B_INVAL) {
376		brelse(bp);
377		return (0);
378	}
379#if !defined(MAX_PERF)
380	if (!(bp->b_flags & B_BUSY))
381		panic("bwrite: buffer is not busy???");
382#endif
383
384	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
385	bp->b_flags |= B_WRITEINPROG;
386
387	if ((oldflags & B_DELWRI) == B_DELWRI) {
388		--numdirtybuffers;
389		reassignbuf(bp, bp->b_vp);
390	}
391
392	bp->b_vp->v_numoutput++;
393	vfs_busy_pages(bp, 1);
394	if (curproc != NULL)
395		curproc->p_stats->p_ru.ru_oublock++;
396	VOP_STRATEGY(bp);
397
398	/*
399	 * Collect statistics on synchronous and asynchronous writes.
400	 * Writes to block devices are charged to their associated
401	 * filesystem (if any).
402	 */
403	if ((vp = bp->b_vp) != NULL) {
404		if (vp->v_type == VBLK)
405			mp = vp->v_specmountpoint;
406		else
407			mp = vp->v_mount;
408		if (mp != NULL)
409			if ((oldflags & B_ASYNC) == 0)
410				mp->mnt_stat.f_syncwrites++;
411			else
412				mp->mnt_stat.f_asyncwrites++;
413	}
414
415	if ((oldflags & B_ASYNC) == 0) {
416		int rtval = biowait(bp);
417
418		if (oldflags & B_DELWRI) {
419			reassignbuf(bp, bp->b_vp);
420		}
421		brelse(bp);
422		return (rtval);
423	}
424	return (0);
425}
426
427inline void
428vfs_bio_need_satisfy(void) {
429	++numfreebuffers;
430	if (!needsbuffer)
431		return;
432	if (numdirtybuffers < lodirtybuffers) {
433		needsbuffer &= ~(VFS_BIO_NEED_ANY | VFS_BIO_NEED_LOWLIMIT);
434	} else {
435		needsbuffer &= ~VFS_BIO_NEED_ANY;
436	}
437	if (numfreebuffers >= hifreebuffers) {
438		needsbuffer &= ~VFS_BIO_NEED_FREE;
439	}
440	wakeup(&needsbuffer);
441}
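
/*
 * The needsbuffer word is a small protocol between buffer consumers and
 * this routine: getnewbuf() sets VFS_BIO_NEED_ANY and waitfreebuffers()
 * sets VFS_BIO_NEED_FREE before sleeping on &needsbuffer, while
 * flushdirtybuffers() marks VFS_BIO_NEED_LOWLIMIT while the dirty count
 * is high.  As buffers are freed, the bits that have been satisfied are
 * cleared here and the sleepers are woken to re-check their conditions.
 */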
442
443/*
444 * Delayed write. (Buffer is marked dirty).
445 */
446void
447bdwrite(struct buf * bp)
448{
449	int s;
450	struct vnode *vp;
451
452#if !defined(MAX_PERF)
453	if ((bp->b_flags & B_BUSY) == 0) {
454		panic("bdwrite: buffer is not busy");
455	}
456#endif
457
458	if (bp->b_flags & B_INVAL) {
459		brelse(bp);
460		return;
461	}
462	if (bp->b_flags & B_TAPE) {
463		bawrite(bp);
464		return;
465	}
466	bp->b_flags &= ~(B_READ|B_RELBUF);
467	if ((bp->b_flags & B_DELWRI) == 0) {
468		bp->b_flags |= B_DONE | B_DELWRI;
469		s = splbio();
470		reassignbuf(bp, bp->b_vp);
471		splx(s);
472		++numdirtybuffers;
473	}
474
475	/*
476	 * Doing the bmap here keeps the system from needing to do it later,
477	 * perhaps when it is attempting to do a sync.  Since the indirect
478	 * block -- or whatever other data structure the filesystem needs --
479	 * is likely still in memory now, this is a good time to do it.
480	 * Note also that if the pageout daemon is requesting a sync, there
481	 * might not be enough memory to do the bmap then, so doing it
482	 * now is important.
483	 */
484	if (bp->b_lblkno == bp->b_blkno) {
485		VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
486	}
487
488	/*
489	 * Set the *dirty* buffer range based upon the VM system dirty pages.
490	 */
491	vfs_setdirty(bp);
492
493	/*
494	 * We need to do this here to satisfy the vnode_pager and the
495	 * pageout daemon, so that it thinks that the pages have been
496	 * "cleaned".  Note that since the pages are in a delayed write
497	 * buffer -- the VFS layer "will" see that the pages get written
498	 * out on the next sync, or perhaps the cluster will be completed.
499	 */
500	vfs_clean_pages(bp);
501	bqrelse(bp);
502
503	/*
504	 * XXX The soft dependency code is not prepared to
505	 * have I/O done when a bdwrite is requested. For
506	 * now we just let the write be delayed if it is
507	 * requested by the soft dependency code.
508	 */
509	if ((vp = bp->b_vp) &&
510	    (vp->v_type == VBLK && vp->v_specmountpoint &&
511	    (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP)) ||
512	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP)))
513		return;
514
515	if (numdirtybuffers >= hidirtybuffers)
516		flushdirtybuffers(0, 0);
517
518	return;
519}
520
521
522/*
523 * Same as first half of bdwrite, mark buffer dirty, but do not release it.
524 * Check how this compares with vfs_setdirty(); XXX [JRE]
525 */
526void
527bdirty(bp)
528      struct buf *bp;
529{
530	int s;
531
532	bp->b_flags &= ~(B_READ|B_RELBUF); /* XXX ??? check this */
533	if ((bp->b_flags & B_DELWRI) == 0) {
534		bp->b_flags |= B_DONE | B_DELWRI; /* why done? XXX JRE */
535		s = splbio();
536		reassignbuf(bp, bp->b_vp);
537		splx(s);
538		++numdirtybuffers;
539	}
540}
541
542/*
543 * Asynchronous write.
544 * Start output on a buffer, but do not wait for it to complete.
545 * The buffer is released when the output completes.
546 */
547void
548bawrite(struct buf * bp)
549{
550	bp->b_flags |= B_ASYNC;
551	(void) VOP_BWRITE(bp);
552}
553
554/*
555 * Ordered write.
556 * Start output on a buffer, but only wait for it to complete if the
557 * output device cannot guarantee ordering in some other way.  Devices
558 * that can perform asynchronous ordered writes will set the B_ASYNC
559 * flag in their strategy routine.
560 * The buffer is released when the output completes.
561 */
562int
563bowrite(struct buf * bp)
564{
565	/*
566	 * XXX Add in B_ASYNC once the SCSI
567	 *     layer can deal with ordered
568	 *     writes properly.
569	 */
570	bp->b_flags |= B_ORDERED;
571	return (VOP_BWRITE(bp));
572}
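
/*
 * Quick reference for the write entry points above:
 *	bwrite()  - synchronous write (unless B_ASYNC was already set);
 *		    a synchronous caller gets the error from biowait().
 *	bdwrite() - delayed write; the buffer is marked B_DELWRI and
 *		    requeued, the actual I/O happens later (sync, cluster
 *		    write, or flushdirtybuffers()).
 *	bawrite() - asynchronous write; the buffer is released by
 *		    biodone() when the I/O completes.
 *	bowrite() - ordered write, currently still synchronous pending
 *		    B_ASYNC support in the lower layers (see above).
 */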
573
574/*
575 * Release a buffer.
576 */
577void
578brelse(struct buf * bp)
579{
580	int s;
581
582	if (bp->b_flags & B_CLUSTER) {
583		relpbuf(bp);
584		return;
585	}
586
587	s = splbio();
588
589	/* anyone need this block? */
590	if (bp->b_flags & B_WANTED) {
591		bp->b_flags &= ~(B_WANTED | B_AGE);
592		wakeup(bp);
593	}
594
595	if (bp->b_flags & B_LOCKED)
596		bp->b_flags &= ~B_ERROR;
597
598	if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
599	    (bp->b_bufsize <= 0)) {
600		bp->b_flags |= B_INVAL;
601		if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
602			(*bioops.io_deallocate)(bp);
603		if (bp->b_flags & B_DELWRI)
604			--numdirtybuffers;
605		bp->b_flags &= ~(B_DELWRI | B_CACHE);
606		if ((bp->b_flags & B_VMIO) == 0) {
607			if (bp->b_bufsize)
608				allocbuf(bp, 0);
609			if (bp->b_vp)
610				brelvp(bp);
611		}
612	}
613
614	/*
615	 * VMIO buffer rundown.  It is not strictly necessary to keep a VMIO buffer
616	 * constituted, so the B_INVAL flag is used to *invalidate* the buffer,
617	 * but the VM object is kept around.  The B_NOCACHE flag is used to
618	 * invalidate the pages in the VM object.
619	 *
620	 * If the buffer is a partially filled NFS buffer, keep it
621	 * since invalidating it now will lose information.  The valid
622	 * flags in the vm_pages have only DEV_BSIZE resolution but
623	 * the b_validoff, b_validend fields have byte resolution.
624	 * This can avoid unnecessary re-reads of the buffer.
625	 * XXX this seems to cause performance problems.
626	 */
627	if ((bp->b_flags & B_VMIO)
628	    && !(bp->b_vp->v_tag == VT_NFS &&
629		 bp->b_vp->v_type != VBLK &&
630		 (bp->b_flags & B_DELWRI) != 0)
631#ifdef notdef
632	    && (bp->b_vp->v_tag != VT_NFS
633		|| bp->b_vp->v_type == VBLK
634		|| (bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR))
635		|| bp->b_validend == 0
636		|| (bp->b_validoff == 0
637		    && bp->b_validend == bp->b_bufsize))
638#endif
639	    ) {
640
641		int i, j, resid;
642		vm_page_t m;
643		off_t foff;
644		vm_pindex_t poff;
645		vm_object_t obj;
646		struct vnode *vp;
647
648		vp = bp->b_vp;
649
650		resid = bp->b_bufsize;
651		foff = bp->b_offset;
652
653		for (i = 0; i < bp->b_npages; i++) {
654			m = bp->b_pages[i];
655			if (m == bogus_page) {
656
657				obj = (vm_object_t) vp->v_object;
658				poff = OFF_TO_IDX(bp->b_offset);
659
660				for (j = i; j < bp->b_npages; j++) {
661					m = bp->b_pages[j];
662					if (m == bogus_page) {
663						m = vm_page_lookup(obj, poff + j);
664#if !defined(MAX_PERF)
665						if (!m) {
666							panic("brelse: page missing\n");
667						}
668#endif
669						bp->b_pages[j] = m;
670					}
671				}
672
673				if ((bp->b_flags & B_INVAL) == 0) {
674					pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
675				}
676				break;
677			}
678			if (bp->b_flags & (B_NOCACHE|B_ERROR)) {
679				int poffset = foff & PAGE_MASK;
680				int presid = resid > (PAGE_SIZE - poffset) ?
681					(PAGE_SIZE - poffset) : resid;
682				vm_page_set_invalid(m, poffset, presid);
683			}
684			resid -= PAGE_SIZE;
685		}
686
687		if (bp->b_flags & (B_INVAL | B_RELBUF))
688			vfs_vmio_release(bp);
689
690	} else if (bp->b_flags & B_VMIO) {
691
692		if (bp->b_flags & (B_INVAL | B_RELBUF))
693			vfs_vmio_release(bp);
694
695	}
696
697#if !defined(MAX_PERF)
698	if (bp->b_qindex != QUEUE_NONE)
699		panic("brelse: free buffer onto another queue???");
700#endif
701
702	/* enqueue */
703	/* buffers with no memory */
704	if (bp->b_bufsize == 0) {
705		bp->b_flags |= B_INVAL;
706		bp->b_qindex = QUEUE_EMPTY;
707		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
708		LIST_REMOVE(bp, b_hash);
709		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
710		bp->b_dev = NODEV;
711		kvafreespace += bp->b_kvasize;
712		bp->b_generation++;
713
714	/* buffers with junk contents */
715	} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
716		bp->b_flags |= B_INVAL;
717		bp->b_qindex = QUEUE_AGE;
718		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
719		LIST_REMOVE(bp, b_hash);
720		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
721		bp->b_dev = NODEV;
722		bp->b_generation++;
723
724	/* buffers that are locked */
725	} else if (bp->b_flags & B_LOCKED) {
726		bp->b_qindex = QUEUE_LOCKED;
727		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
728
729	/* buffers with stale but valid contents */
730	} else if (bp->b_flags & B_AGE) {
731		bp->b_qindex = QUEUE_AGE;
732		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
733
734	/* buffers with valid and quite potentially reusable contents */
735	} else {
736		bp->b_qindex = QUEUE_LRU;
737		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
738	}
739
740	if ((bp->b_flags & B_INVAL) ||
741		(bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) {
742		if (bp->b_flags & B_DELWRI) {
743			--numdirtybuffers;
744			bp->b_flags &= ~B_DELWRI;
745		}
746		vfs_bio_need_satisfy();
747	}
748
749	/* unlock */
750	bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
751				B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
752	splx(s);
753}
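
/*
 * Summary of the queue selection above: buffers with no backing memory
 * go to QUEUE_EMPTY, buffers whose contents are junk (B_ERROR, B_INVAL,
 * B_NOCACHE, B_RELBUF) go to the head of QUEUE_AGE, B_LOCKED buffers go
 * to QUEUE_LOCKED, B_AGE buffers go to the tail of QUEUE_AGE, and
 * everything else goes to QUEUE_LRU.  getnewbuf() scavenges the EMPTY,
 * AGE and LRU queues in that order.
 */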
754
755/*
756 * Release a buffer.
757 */
758void
759bqrelse(struct buf * bp)
760{
761	int s;
762
763	s = splbio();
764
765	/* anyone need this block? */
766	if (bp->b_flags & B_WANTED) {
767		bp->b_flags &= ~(B_WANTED | B_AGE);
768		wakeup(bp);
769	}
770
771#if !defined(MAX_PERF)
772	if (bp->b_qindex != QUEUE_NONE)
773		panic("bqrelse: free buffer onto another queue???");
774#endif
775
776	if (bp->b_flags & B_LOCKED) {
777		bp->b_flags &= ~B_ERROR;
778		bp->b_qindex = QUEUE_LOCKED;
779		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
780		/* buffers with stale but valid contents */
781	} else {
782		bp->b_qindex = QUEUE_LRU;
783		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
784	}
785
786	if ((bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) {
787		vfs_bio_need_satisfy();
788	}
789
790	/* unlock */
791	bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
792		B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
793	splx(s);
794}
795
796static void
797vfs_vmio_release(bp)
798	struct buf *bp;
799{
800	int i;
801	vm_page_t m;
802
803	for (i = 0; i < bp->b_npages; i++) {
804		m = bp->b_pages[i];
805		bp->b_pages[i] = NULL;
806		vm_page_unwire(m);
807
808		/*
809		 * We don't mess with busy pages, it is
810		 * the responsibility of the process that
811		 * busied the pages to deal with them.
812		 */
813		if ((m->flags & PG_BUSY) || (m->busy != 0))
814			continue;
815
816		if (m->wire_count == 0) {
817
818			/*
819			 * If this is an async free, we cannot place pages
820			 * onto the cache queue, so we do not modify any
821			 * queues at all.
822			 * This is probably in error (for perf reasons),
823			 * and we will eventually need to build
824			 * a more complete infrastructure to support I/O
825			 * rundown.
826			 */
827			if ((bp->b_flags & B_ASYNC) == 0) {
828
829			/*
830			 * In the case of sync buffer frees, we can do pretty much
831			 * anything to any of the memory queues.  Specifically,
832			 * the cache queue is okay to be modified.
833			 */
834				if (m->valid) {
835					if(m->dirty == 0)
836						vm_page_test_dirty(m);
837					/*
838					 * this keeps pressure off of the process memory
839					 */
840					if (m->dirty == 0 && m->hold_count == 0)
841						vm_page_cache(m);
842					else
843						vm_page_deactivate(m);
844				} else if (m->hold_count == 0) {
845					m->flags |= PG_BUSY;
846					vm_page_protect(m, VM_PROT_NONE);
847					vm_page_free(m);
848				}
849			} else {
850				/*
851				 * If async, then at least we clear the
852				 * act_count.
853				 */
854				m->act_count = 0;
855			}
856		}
857	}
858	bufspace -= bp->b_bufsize;
859	vmiospace -= bp->b_bufsize;
860	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
861	bp->b_npages = 0;
862	bp->b_bufsize = 0;
863	bp->b_flags &= ~B_VMIO;
864	if (bp->b_vp)
865		brelvp(bp);
866}
867
868/*
869 * Check to see if a block is currently memory resident.
870 */
871struct buf *
872gbincore(struct vnode * vp, daddr_t blkno)
873{
874	struct buf *bp;
875	struct bufhashhdr *bh;
876
877	bh = BUFHASH(vp, blkno);
878	bp = bh->lh_first;
879
880	/* Search hash chain */
881	while (bp != NULL) {
882		/* hit */
883		if (bp->b_vp == vp && bp->b_lblkno == blkno &&
884		    (bp->b_flags & B_INVAL) == 0) {
885			break;
886		}
887		bp = bp->b_hash.le_next;
888	}
889	return (bp);
890}
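
/*
 * gbincore() only walks the hash chain; it does not set B_BUSY or raise
 * the ipl itself.  Callers such as incore(), getblk() and
 * vfs_bio_awrite() are expected to be at splbio() while they examine
 * the result.
 */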
891
892/*
893 * this routine implements clustered async writes for
894 * clearing out B_DELWRI buffers...  This is much better
895 * than the old way of writing only one buffer at a time.
896 */
897int
898vfs_bio_awrite(struct buf * bp)
899{
900	int i;
901	daddr_t lblkno = bp->b_lblkno;
902	struct vnode *vp = bp->b_vp;
903	int s;
904	int ncl;
905	struct buf *bpa;
906	int nwritten;
907	int size;
908	int maxcl;
909
910	s = splbio();
911	/*
912	 * right now we support clustered writing only to regular files
913	 */
914	if ((vp->v_type == VREG) &&
915	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
916	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
917
918		size = vp->v_mount->mnt_stat.f_iosize;
919		maxcl = MAXPHYS / size;
920
921		for (i = 1; i < maxcl; i++) {
922			if ((bpa = gbincore(vp, lblkno + i)) &&
923			    ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
924			    (B_DELWRI | B_CLUSTEROK)) &&
925			    (bpa->b_bufsize == size)) {
926				if ((bpa->b_blkno == bpa->b_lblkno) ||
927				    (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
928					break;
929			} else {
930				break;
931			}
932		}
933		ncl = i;
934		/*
935		 * this is a possible cluster write
936		 */
937		if (ncl != 1) {
938			nwritten = cluster_wbuild(vp, size, lblkno, ncl);
939			splx(s);
940			return nwritten;
941		}
942	}
943
944	bremfree(bp);
945	splx(s);
946	/*
947	 * default (old) behavior, writing out only one block
948	 */
949	bp->b_flags |= B_BUSY | B_ASYNC;
950	nwritten = bp->b_bufsize;
951	(void) VOP_BWRITE(bp);
952	return nwritten;
953}
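
/*
 * Cluster sizing sketch: with an 8K filesystem block size and the usual
 * 64K MAXPHYS, maxcl is 8, so up to 8 contiguous dirty B_CLUSTEROK
 * buffers can be pushed with a single cluster_wbuild() call instead of
 * 8 separate writes.  The 64K figure is only the common value of the
 * era; the code relies solely on MAXPHYS / f_iosize.
 */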
954
955
956/*
957 * Find a buffer header which is available for use.
958 */
959static struct buf *
960getnewbuf(struct vnode *vp, daddr_t blkno,
961	int slpflag, int slptimeo, int size, int maxsize)
962{
963	struct buf *bp, *bp1;
964	int nbyteswritten = 0;
965	vm_offset_t addr;
966	static int writerecursion = 0;
967
968start:
969	if (bufspace >= maxbufspace)
970		goto trytofreespace;
971
972	/* can we constitute a new buffer? */
973	if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]))) {
974#if !defined(MAX_PERF)
975		if (bp->b_qindex != QUEUE_EMPTY)
976			panic("getnewbuf: inconsistent EMPTY queue, qindex=%d",
977			    bp->b_qindex);
978#endif
979		bp->b_flags |= B_BUSY;
980		bremfree(bp);
981		goto fillbuf;
982	}
983trytofreespace:
984	/*
985	 * We keep the file I/O from hogging metadata I/O.
986	 * This is desirable because file data is cached in the
987	 * VM/Buffer cache even if a buffer is freed.
988	 */
989	if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]))) {
990#if !defined(MAX_PERF)
991		if (bp->b_qindex != QUEUE_AGE)
992			panic("getnewbuf: inconsistent AGE queue, qindex=%d",
993			    bp->b_qindex);
994#endif
995	} else if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]))) {
996#if !defined(MAX_PERF)
997		if (bp->b_qindex != QUEUE_LRU)
998			panic("getnewbuf: inconsistent LRU queue, qindex=%d",
999			    bp->b_qindex);
1000#endif
1001	}
1002	if (!bp) {
1003		/* wait for a free buffer of any kind */
1004		needsbuffer |= VFS_BIO_NEED_ANY;
1005		do
1006			tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf",
1007			    slptimeo);
1008		while (needsbuffer & VFS_BIO_NEED_ANY);
1009		return (0);
1010	}
1011
1012#if defined(DIAGNOSTIC)
1013	if (bp->b_flags & B_BUSY) {
1014		panic("getnewbuf: busy buffer on free list\n");
1015	}
1016#endif
1017
1018	/*
1019	 * We are fairly aggressive about freeing VMIO buffers, but since
1020	 * the buffering is intact without buffer headers, there is not
1021	 * much loss.  We gain by maintaining non-VMIOed metadata in buffers.
1022	 */
1023	if ((bp->b_qindex == QUEUE_LRU) && (bp->b_usecount > 0)) {
1024		if ((bp->b_flags & B_VMIO) == 0 ||
1025			(vmiospace < maxvmiobufspace)) {
1026			--bp->b_usecount;
1027			TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist);
1028			if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) {
1029				TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
1030				goto start;
1031			}
1032			TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
1033		}
1034	}
1035
1036
1037	/* if we are a delayed write, convert to an async write */
1038	if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
1039
1040		/*
1041		 * If our delayed write is likely to be used soon, then
1042		 * recycle back onto the LRU queue.
1043		 */
1044		if (vp && (bp->b_vp == vp) && (bp->b_qindex == QUEUE_LRU) &&
1045			(bp->b_lblkno >= blkno) && (maxsize > 0)) {
1046
1047			if (bp->b_usecount > 0) {
1048				if (bp->b_lblkno < blkno + (MAXPHYS / maxsize)) {
1049
1050					TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist);
1051
1052					if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) {
1053						TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
1054						bp->b_usecount--;
1055						goto start;
1056					}
1057					TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
1058				}
1059			}
1060		}
1061
1062		/*
1063		 * Certain layered filesystems can recursively re-enter the vfs_bio
1064		 * code, due to delayed writes.  This helps keep the system from
1065		 * deadlocking.
1066		 */
1067		if (writerecursion > 0) {
1068			if (writerecursion > 5) {
1069				bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]);
1070				while (bp) {
1071					if ((bp->b_flags & B_DELWRI) == 0)
1072						break;
1073					bp = TAILQ_NEXT(bp, b_freelist);
1074				}
1075				if (bp == NULL) {
1076					bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]);
1077					while (bp) {
1078						if ((bp->b_flags & B_DELWRI) == 0)
1079							break;
1080						bp = TAILQ_NEXT(bp, b_freelist);
1081					}
1082				}
1083				if (bp == NULL)
1084					panic("getnewbuf: cannot get buffer, infinite recursion failure");
1085			} else {
1086				bremfree(bp);
1087				bp->b_flags |= B_BUSY | B_AGE | B_ASYNC;
1088				nbyteswritten += bp->b_bufsize;
1089				++writerecursion;
1090				VOP_BWRITE(bp);
1091				--writerecursion;
1092				if (!slpflag && !slptimeo) {
1093					return (0);
1094				}
1095				goto start;
1096			}
1097		} else {
1098			++writerecursion;
1099			nbyteswritten += vfs_bio_awrite(bp);
1100			--writerecursion;
1101			if (!slpflag && !slptimeo) {
1102				return (0);
1103			}
1104			goto start;
1105		}
1106	}
1107
1108	if (bp->b_flags & B_WANTED) {
1109		bp->b_flags &= ~B_WANTED;
1110		wakeup(bp);
1111	}
1112	bremfree(bp);
1113	bp->b_flags |= B_BUSY;
1114
1115	if (bp->b_flags & B_VMIO) {
1116		bp->b_flags &= ~B_ASYNC;
1117		vfs_vmio_release(bp);
1118	}
1119
1120	if (bp->b_vp)
1121		brelvp(bp);
1122
1123fillbuf:
1124	bp->b_generation++;
1125
1126	/* we are not free, nor do we contain interesting data */
1127	if (bp->b_rcred != NOCRED) {
1128		crfree(bp->b_rcred);
1129		bp->b_rcred = NOCRED;
1130	}
1131	if (bp->b_wcred != NOCRED) {
1132		crfree(bp->b_wcred);
1133		bp->b_wcred = NOCRED;
1134	}
1135	if (LIST_FIRST(&bp->b_dep) != NULL &&
1136	    bioops.io_deallocate)
1137		(*bioops.io_deallocate)(bp);
1138
1139	LIST_REMOVE(bp, b_hash);
1140	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
1141	if (bp->b_bufsize) {
1142		allocbuf(bp, 0);
1143	}
1144	bp->b_flags = B_BUSY;
1145	bp->b_dev = NODEV;
1146	bp->b_vp = NULL;
1147	bp->b_blkno = bp->b_lblkno = 0;
1148	bp->b_offset = 0;
1149	bp->b_iodone = 0;
1150	bp->b_error = 0;
1151	bp->b_resid = 0;
1152	bp->b_bcount = 0;
1153	bp->b_npages = 0;
1154	bp->b_dirtyoff = bp->b_dirtyend = 0;
1155	bp->b_validoff = bp->b_validend = 0;
1156	bp->b_usecount = 5;
1157	/* Here, not kern_physio.c, is where this should be done */
1158	LIST_INIT(&bp->b_dep);
1159
1160	maxsize = (maxsize + PAGE_MASK) & ~PAGE_MASK;
1161
1162	/*
1163	 * we assume that buffer_map is not at address 0
1164	 */
1165	addr = 0;
1166	if (maxsize != bp->b_kvasize) {
1167		bfreekva(bp);
1168
1169findkvaspace:
1170		/*
1171		 * See if we have buffer kva space
1172		 */
1173		if (vm_map_findspace(buffer_map,
1174			vm_map_min(buffer_map), maxsize, &addr)) {
1175			if (kvafreespace > 0) {
1176				int totfree = 0, freed;
1177				do {
1178					freed = 0;
1179					for (bp1 = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
1180						bp1 != NULL; bp1 = TAILQ_NEXT(bp1, b_freelist)) {
1181						if (bp1->b_kvasize != 0) {
1182							totfree += bp1->b_kvasize;
1183							freed = bp1->b_kvasize;
1184							bremfree(bp1);
1185							bfreekva(bp1);
1186							brelse(bp1);
1187							break;
1188						}
1189					}
1190				} while (freed);
1191				/*
1192				 * if we found free space, then retry with the same buffer.
1193				 */
1194				if (totfree)
1195					goto findkvaspace;
1196			}
1197			bp->b_flags |= B_INVAL;
1198			brelse(bp);
1199			goto trytofreespace;
1200		}
1201	}
1202
1203	/*
1204	 * See if we have exceeded our allocated buffer space
1205	 */
1206	if (bufspace >= (maxbufspace + nbyteswritten)) {
1207		bp->b_flags |= B_INVAL;
1208		brelse(bp);
1209		goto trytofreespace;
1210	}
1211
1212	/*
1213	 * create a map entry for the buffer -- in essence
1214	 * reserving the kva space.
1215	 */
1216	if (addr) {
1217		vm_map_insert(buffer_map, NULL, 0,
1218			addr, addr + maxsize,
1219			VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
1220
1221		bp->b_kvabase = (caddr_t) addr;
1222		bp->b_kvasize = maxsize;
1223	}
1224	bp->b_data = bp->b_kvabase;
1225
1226	return (bp);
1227}
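
/*
 * In outline, getnewbuf() tries the following, retrying from "start"
 * (or returning NULL so that the caller retries) whenever it has to
 * give something up:
 *	1. grab a header from QUEUE_EMPTY if buffer space permits;
 *	2. otherwise take the first buffer on QUEUE_AGE, then QUEUE_LRU;
 *	3. delayed-write victims are pushed out (vfs_bio_awrite()) rather
 *	   than discarded, with the writerecursion counter guarding
 *	   against layered filesystems re-entering this path;
 *	4. finally, kva for the header is (re)reserved from buffer_map,
 *	   reclaiming kva from empty headers if the map has no room.
 */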
1228
1229static void
1230waitfreebuffers(int slpflag, int slptimeo) {
1231	while (numfreebuffers < hifreebuffers) {
1232		flushdirtybuffers(slpflag, slptimeo);
1233		if (numfreebuffers < hifreebuffers)
1234			break;
1235		needsbuffer |= VFS_BIO_NEED_FREE;
1236		if (tsleep(&needsbuffer, (PRIBIO + 4)|slpflag, "biofre", slptimeo))
1237			break;
1238	}
1239}
1240
1241static void
1242flushdirtybuffers(int slpflag, int slptimeo) {
1243	int s;
1244	static pid_t flushing = 0;
1245
1246	s = splbio();
1247
1248	if (flushing) {
1249		if (flushing == curproc->p_pid) {
1250			splx(s);
1251			return;
1252		}
1253		while (flushing) {
1254			if (tsleep(&flushing, (PRIBIO + 4)|slpflag, "biofls", slptimeo)) {
1255				splx(s);
1256				return;
1257			}
1258		}
1259	}
1260	flushing = curproc->p_pid;
1261
1262	while (numdirtybuffers > lodirtybuffers) {
1263		struct buf *bp;
1264		needsbuffer |= VFS_BIO_NEED_LOWLIMIT;
1265		bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]);
1266		if (bp == NULL)
1267			bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]);
1268
1269		while (bp && ((bp->b_flags & B_DELWRI) == 0)) {
1270			bp = TAILQ_NEXT(bp, b_freelist);
1271		}
1272
1273		if (bp) {
1274			vfs_bio_awrite(bp);
1275			continue;
1276		}
1277		break;
1278	}
1279
1280	flushing = 0;
1281	wakeup(&flushing);
1282	splx(s);
1283}
1284
1285/*
1286 * Check to see if a block is currently memory resident.
1287 */
1288struct buf *
1289incore(struct vnode * vp, daddr_t blkno)
1290{
1291	struct buf *bp;
1292
1293	int s = splbio();
1294	bp = gbincore(vp, blkno);
1295	splx(s);
1296	return (bp);
1297}
1298
1299/*
1300 * Returns true if no I/O is needed to access the
1301 * associated VM object.  This is like incore except
1302 * it also hunts around in the VM system for the data.
1303 */
1304
1305int
1306inmem(struct vnode * vp, daddr_t blkno)
1307{
1308	vm_object_t obj;
1309	vm_offset_t toff, tinc;
1310	vm_page_t m;
1311	vm_ooffset_t off;
1312
1313	if (incore(vp, blkno))
1314		return 1;
1315	if (vp->v_mount == NULL)
1316		return 0;
1317	if ((vp->v_object == NULL) || (vp->v_flag & VOBJBUF) == 0)
1318		return 0;
1319
1320	obj = vp->v_object;
1321	tinc = PAGE_SIZE;
1322	if (tinc > vp->v_mount->mnt_stat.f_iosize)
1323		tinc = vp->v_mount->mnt_stat.f_iosize;
1324	off = blkno * vp->v_mount->mnt_stat.f_iosize;
1325
1326	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
1327
1328		m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
1329		if (!m)
1330			return 0;
1331		if (vm_page_is_valid(m, (vm_offset_t) (toff + off), tinc) == 0)
1332			return 0;
1333	}
1334	return 1;
1335}
1336
1337/*
1338 * now we set the dirty range for the buffer --
1339 * for NFS -- if the file is mapped and pages have
1340 * been written to, let it know.  We want the
1341 * entire range of the buffer to be marked dirty if
1342 * any of the pages have been written to, for consistency
1343 * with the b_validoff, b_validend set in the nfs write
1344 * code, and used by the nfs read code.
1345 */
1346static void
1347vfs_setdirty(struct buf *bp) {
1348	int i;
1349	vm_object_t object;
1350	vm_offset_t boffset, offset;
1351	/*
1352	 * We qualify the scan for modified pages on whether the
1353	 * object has been flushed yet.  The OBJ_WRITEABLE flag
1354	 * is not cleared simply by protecting pages off.
1355	 */
1356	if ((bp->b_flags & B_VMIO) &&
1357		((object = bp->b_pages[0]->object)->flags & (OBJ_WRITEABLE|OBJ_CLEANING))) {
1358		/*
1359		 * test the pages to see if they have been modified directly
1360		 * by users through the VM system.
1361		 */
1362		for (i = 0; i < bp->b_npages; i++)
1363			vm_page_test_dirty(bp->b_pages[i]);
1364
1365		/*
1366		 * scan forwards for the first page modified
1367		 */
1368		for (i = 0; i < bp->b_npages; i++) {
1369			if (bp->b_pages[i]->dirty) {
1370				break;
1371			}
1372		}
1373		boffset = (i << PAGE_SHIFT);
1374		if (boffset < bp->b_dirtyoff) {
1375			bp->b_dirtyoff = boffset;
1376		}
1377
1378		/*
1379		 * scan backwards for the last page modified
1380		 */
1381		for (i = bp->b_npages - 1; i >= 0; --i) {
1382			if (bp->b_pages[i]->dirty) {
1383				break;
1384			}
1385		}
1386		boffset = (i + 1);
1387		offset = boffset + bp->b_pages[0]->pindex;
1388		if (offset >= object->size)
1389			boffset = object->size - bp->b_pages[0]->pindex;
1390		if (bp->b_dirtyend < (boffset << PAGE_SHIFT))
1391			bp->b_dirtyend = (boffset << PAGE_SHIFT);
1392	}
1393}
1394
1395/*
1396 * Get a block given a specified block and offset into a file/device.
1397 */
1398struct buf *
1399getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
1400{
1401	struct buf *bp;
1402	int i, s;
1403	struct bufhashhdr *bh;
1404	int maxsize;
1405	int generation;
1406	int checksize;
1407
1408	if (vp->v_mount) {
1409		maxsize = vp->v_mount->mnt_stat.f_iosize;
1410		/*
1411		 * This happens on mount points.
1412		 */
1413		if (maxsize < size)
1414			maxsize = size;
1415	} else {
1416		maxsize = size;
1417	}
1418
1419#if !defined(MAX_PERF)
1420	if (size > MAXBSIZE)
1421		panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
1422#endif
1423
1424	s = splbio();
1425loop:
1426	if (numfreebuffers < lofreebuffers) {
1427		waitfreebuffers(slpflag, slptimeo);
1428	}
1429
1430	if ((bp = gbincore(vp, blkno))) {
1431		generation = bp->b_generation;
1432loop1:
1433		if (bp->b_flags & B_BUSY) {
1434
1435			bp->b_flags |= B_WANTED;
1436			if (bp->b_usecount < BUF_MAXUSE)
1437				++bp->b_usecount;
1438
1439			if (!tsleep(bp,
1440				(PRIBIO + 4) | slpflag, "getblk", slptimeo)) {
1441				if (bp->b_generation != generation)
1442					goto loop;
1443				goto loop1;
1444			}
1445
1446			splx(s);
1447			return (struct buf *) NULL;
1448		}
1449		bp->b_flags |= B_BUSY | B_CACHE;
1450		bremfree(bp);
1451
1452		/*
1453		 * check for size inconsistencies (note that they shouldn't
1454		 * happen but do when filesystems don't handle the size changes
1455		 * correctly).  We are conservative on metadata and don't just
1456		 * extend the buffer but write (if needed) and re-constitute it.
1457		 */
1458
1459		if (bp->b_bcount != size) {
1460			bp->b_generation++;
1461			if ((bp->b_flags & B_VMIO) && (size <= bp->b_kvasize)) {
1462				allocbuf(bp, size);
1463			} else {
1464				bp->b_flags |= B_NOCACHE;
1465				if (bp->b_flags & B_DELWRI) {
1466					VOP_BWRITE(bp);
1467				} else {
1468					brelse(bp);
1469				}
1470				goto loop;
1471			}
1472		}
1473
1474		/*
1475		 * Check that the constituted buffer really deserves to have the
1476		 * B_CACHE bit set.
1477		 */
1478		checksize = bp->b_bufsize;
1479		for (i = 0; i < bp->b_npages; i++) {
1480			int resid;
1481			int poffset;
1482			poffset = bp->b_offset & PAGE_MASK;
1483			resid = (checksize > (PAGE_SIZE - poffset)) ?
1484				(PAGE_SIZE - poffset) : checksize;
1485			if (!vm_page_is_valid(bp->b_pages[i], poffset, resid)) {
1486				bp->b_flags &= ~(B_CACHE | B_DONE);
1487				break;
1488			}
1489			checksize -= resid;
1490		}
1491
1492		if (bp->b_usecount < BUF_MAXUSE)
1493			++bp->b_usecount;
1494		splx(s);
1495		return (bp);
1496	} else {
1497		vm_object_t obj;
1498
1499		if ((bp = getnewbuf(vp, blkno,
1500			slpflag, slptimeo, size, maxsize)) == 0) {
1501			if (slpflag || slptimeo) {
1502				splx(s);
1503				return NULL;
1504			}
1505			goto loop;
1506		}
1507
1508		/*
1509		 * This code is used to make sure that a buffer is not
1510		 * created while the getnewbuf routine is blocked.
1511		 * Normally the vnode is locked so this isn't a problem.
1512		 * VBLK type I/O requests, however, don't lock the vnode.
1513		 */
1514		if (!VOP_ISLOCKED(vp) && gbincore(vp, blkno)) {
1515			bp->b_flags |= B_INVAL;
1516			brelse(bp);
1517			goto loop;
1518		}
1519
1520		/*
1521		 * Insert the buffer into the hash, so that it can
1522		 * be found by incore.
1523		 */
1524		bp->b_blkno = bp->b_lblkno = blkno;
1525		if (vp->v_type != VBLK)
1526			bp->b_offset = (off_t) blkno * maxsize;
1527		else
1528			bp->b_offset = (off_t) blkno * DEV_BSIZE;
1529
1530		bgetvp(vp, bp);
1531		LIST_REMOVE(bp, b_hash);
1532		bh = BUFHASH(vp, blkno);
1533		LIST_INSERT_HEAD(bh, bp, b_hash);
1534
1535		if ((obj = vp->v_object) && (vp->v_flag & VOBJBUF)) {
1536			bp->b_flags |= (B_VMIO | B_CACHE);
1537#if defined(VFS_BIO_DEBUG)
1538			if (vp->v_type != VREG && vp->v_type != VBLK)
1539				printf("getblk: vmioing file type %d???\n", vp->v_type);
1540#endif
1541		} else {
1542			bp->b_flags &= ~B_VMIO;
1543		}
1544
1545		allocbuf(bp, size);
1546
1547		splx(s);
1548#ifdef	PC98
1549		/*
1550		 * 1024byte/sector support
1551		 */
1552#define B_XXX2 0x8000000
1553		if (vp->v_flag & 0x10000) bp->b_flags |= B_XXX2;
1554#endif
1555		return (bp);
1556	}
1557}
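
/*
 * Usage sketch for the common "allocate or overwrite a block" case,
 * where the caller does not care about the previous contents:
 *
 *	bp = getblk(vp, lbn, bsize, 0, 0);
 *	...fill in bp->b_data...
 *	bdwrite(bp);		(or bwrite(bp) for synchronous metadata)
 *
 * If the previous contents are needed and B_CACHE is clear, the caller
 * must read the block (as bread() does) before using b_data.
 */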
1558
1559/*
1560 * Get an empty, disassociated buffer of given size.
1561 */
1562struct buf *
1563geteblk(int size)
1564{
1565	struct buf *bp;
1566	int s;
1567
1568	s = splbio();
1569	while ((bp = getnewbuf(0, (daddr_t) 0, 0, 0, size, MAXBSIZE)) == 0);
1570	splx(s);
1571	allocbuf(bp, size);
1572	bp->b_flags |= B_INVAL;
1573	return (bp);
1574}
1575
1576
1577/*
1578 * This code constitutes the buffer memory from either anonymous system
1579 * memory (in the case of non-VMIO operations) or from an associated
1580 * VM object (in the case of VMIO operations).
1581 *
1582 * Note that this code is tricky, and has many complications to resolve
1583 * deadlock or inconsistent data situations.  Tread lightly!!!
1584 *
1585 * Modify the length of a buffer's underlying buffer storage without
1586 * destroying information (unless, of course the buffer is shrinking).
1587 */
1588int
1589allocbuf(struct buf * bp, int size)
1590{
1591
1592	int s;
1593	int newbsize, mbsize;
1594	int i;
1595
1596#if !defined(MAX_PERF)
1597	if (!(bp->b_flags & B_BUSY))
1598		panic("allocbuf: buffer not busy");
1599
1600	if (bp->b_kvasize < size)
1601		panic("allocbuf: buffer too small");
1602#endif
1603
1604	if ((bp->b_flags & B_VMIO) == 0) {
1605		caddr_t origbuf;
1606		int origbufsize;
1607		/*
1608		 * Just get anonymous memory from the kernel
1609		 */
1610		mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
1611#if !defined(NO_B_MALLOC)
1612		if (bp->b_flags & B_MALLOC)
1613			newbsize = mbsize;
1614		else
1615#endif
1616			newbsize = round_page(size);
1617
1618		if (newbsize < bp->b_bufsize) {
1619#if !defined(NO_B_MALLOC)
1620			/*
1621			 * malloced buffers are not shrunk
1622			 */
1623			if (bp->b_flags & B_MALLOC) {
1624				if (newbsize) {
1625					bp->b_bcount = size;
1626				} else {
1627					free(bp->b_data, M_BIOBUF);
1628					bufspace -= bp->b_bufsize;
1629					bufmallocspace -= bp->b_bufsize;
1630					bp->b_data = bp->b_kvabase;
1631					bp->b_bufsize = 0;
1632					bp->b_bcount = 0;
1633					bp->b_flags &= ~B_MALLOC;
1634				}
1635				return 1;
1636			}
1637#endif
1638			vm_hold_free_pages(
1639			    bp,
1640			    (vm_offset_t) bp->b_data + newbsize,
1641			    (vm_offset_t) bp->b_data + bp->b_bufsize);
1642		} else if (newbsize > bp->b_bufsize) {
1643#if !defined(NO_B_MALLOC)
1644			/*
1645			 * We only use malloced memory on the first allocation,
1646			 * and revert to page-allocated memory when the buffer grows.
1647			 */
1648			if ( (bufmallocspace < maxbufmallocspace) &&
1649				(bp->b_bufsize == 0) &&
1650				(mbsize <= PAGE_SIZE/2)) {
1651
1652				bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
1653				bp->b_bufsize = mbsize;
1654				bp->b_bcount = size;
1655				bp->b_flags |= B_MALLOC;
1656				bufspace += mbsize;
1657				bufmallocspace += mbsize;
1658				return 1;
1659			}
1660#endif
1661			origbuf = NULL;
1662			origbufsize = 0;
1663#if !defined(NO_B_MALLOC)
1664			/*
1665			 * If the buffer is growing on its other-than-first allocation,
1666			 * then we revert to the page-allocation scheme.
1667			 */
1668			if (bp->b_flags & B_MALLOC) {
1669				origbuf = bp->b_data;
1670				origbufsize = bp->b_bufsize;
1671				bp->b_data = bp->b_kvabase;
1672				bufspace -= bp->b_bufsize;
1673				bufmallocspace -= bp->b_bufsize;
1674				bp->b_bufsize = 0;
1675				bp->b_flags &= ~B_MALLOC;
1676				newbsize = round_page(newbsize);
1677			}
1678#endif
1679			vm_hold_load_pages(
1680			    bp,
1681			    (vm_offset_t) bp->b_data + bp->b_bufsize,
1682			    (vm_offset_t) bp->b_data + newbsize);
1683#if !defined(NO_B_MALLOC)
1684			if (origbuf) {
1685				bcopy(origbuf, bp->b_data, origbufsize);
1686				free(origbuf, M_BIOBUF);
1687			}
1688#endif
1689		}
1690	} else {
1691		vm_page_t m;
1692		int desiredpages;
1693
1694		newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
1695		desiredpages = (round_page(newbsize) >> PAGE_SHIFT);
1696
1697#if !defined(NO_B_MALLOC)
1698		if (bp->b_flags & B_MALLOC)
1699			panic("allocbuf: VMIO buffer can't be malloced");
1700#endif
1701
1702		if (newbsize < bp->b_bufsize) {
1703			if (desiredpages < bp->b_npages) {
1704				for (i = desiredpages; i < bp->b_npages; i++) {
1705					/*
1706					 * the page is not freed here -- it
1707					 * is the responsibility of vnode_pager_setsize
1708					 */
1709					m = bp->b_pages[i];
1710#if defined(DIAGNOSTIC)
1711					if (m == bogus_page)
1712						panic("allocbuf: bogus page found");
1713#endif
1714					vm_page_sleep(m, "biodep", &m->busy);
1715
1716					bp->b_pages[i] = NULL;
1717					vm_page_unwire(m);
1718				}
1719				pmap_qremove((vm_offset_t) trunc_page(bp->b_data) +
1720				    (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
1721				bp->b_npages = desiredpages;
1722			}
1723		} else if (newbsize > bp->b_bufsize) {
1724			vm_object_t obj;
1725			vm_offset_t tinc, toff;
1726			vm_ooffset_t off;
1727			vm_pindex_t objoff;
1728			int pageindex, curbpnpages;
1729			struct vnode *vp;
1730			int bsize;
1731			int orig_validoff = bp->b_validoff;
1732			int orig_validend = bp->b_validend;
1733
1734			vp = bp->b_vp;
1735
1736			if (vp->v_type == VBLK)
1737				bsize = DEV_BSIZE;
1738			else
1739				bsize = vp->v_mount->mnt_stat.f_iosize;
1740
1741			if (bp->b_npages < desiredpages) {
1742				obj = vp->v_object;
1743				tinc = PAGE_SIZE;
1744				if (tinc > bsize)
1745					tinc = bsize;
1746
1747				off = bp->b_offset;
1748				curbpnpages = bp->b_npages;
1749		doretry:
1750				bp->b_validoff = orig_validoff;
1751				bp->b_validend = orig_validend;
1752				bp->b_flags |= B_CACHE;
1753				for (toff = 0; toff < newbsize; toff += tinc) {
1754					int bytesinpage;
1755
1756					pageindex = toff >> PAGE_SHIFT;
1757					objoff = OFF_TO_IDX(off + toff);
1758					if (pageindex < curbpnpages) {
1759
1760						m = bp->b_pages[pageindex];
1761#ifdef VFS_BIO_DIAG
1762						if (m->pindex != objoff)
1763							panic("allocbuf: page changed offset??!!!?");
1764#endif
1765						bytesinpage = tinc;
1766						if (tinc > (newbsize - toff))
1767							bytesinpage = newbsize - toff;
1768						if (bp->b_flags & B_CACHE)
1769							vfs_buf_set_valid(bp, off, toff, bytesinpage, m);
1770						continue;
1771					}
1772					m = vm_page_lookup(obj, objoff);
1773					if (!m) {
1774						m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
1775						if (!m) {
1776							VM_WAIT;
1777							vm_pageout_deficit += (desiredpages - bp->b_npages);
1778							goto doretry;
1779						}
1780
1781						vm_page_wire(m);
1782						m->flags &= ~PG_BUSY;
1783						bp->b_flags &= ~B_CACHE;
1784
1785					} else if (m->flags & PG_BUSY) {
1786						s = splvm();
1787						if (m->flags & PG_BUSY) {
1788							m->flags |= PG_WANTED;
1789							tsleep(m, PVM, "pgtblk", 0);
1790						}
1791						splx(s);
1792						goto doretry;
1793					} else {
1794						if ((curproc != pageproc) &&
1795							((m->queue - m->pc) == PQ_CACHE) &&
1796						    ((cnt.v_free_count + cnt.v_cache_count) <
1797								(cnt.v_free_min + cnt.v_cache_min))) {
1798							pagedaemon_wakeup();
1799						}
1800						bytesinpage = tinc;
1801						if (tinc > (newbsize - toff))
1802							bytesinpage = newbsize - toff;
1803						if (bp->b_flags & B_CACHE)
1804							vfs_buf_set_valid(bp, off, toff, bytesinpage, m);
1805						vm_page_wire(m);
1806					}
1807					bp->b_pages[pageindex] = m;
1808					curbpnpages = pageindex + 1;
1809				}
1810				if (vp->v_tag == VT_NFS &&
1811				    vp->v_type != VBLK) {
1812					if (bp->b_dirtyend > 0) {
1813						bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
1814						bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
1815					}
1816					if (bp->b_validend == 0)
1817						bp->b_flags &= ~B_CACHE;
1818				}
1819				bp->b_data = (caddr_t) trunc_page(bp->b_data);
1820				bp->b_npages = curbpnpages;
1821				pmap_qenter((vm_offset_t) bp->b_data,
1822					bp->b_pages, bp->b_npages);
1823				((vm_offset_t) bp->b_data) |= off & PAGE_MASK;
1824			}
1825		}
1826	}
1827	if (bp->b_flags & B_VMIO)
1828		vmiospace += (newbsize - bp->b_bufsize);
1829	bufspace += (newbsize - bp->b_bufsize);
1830	bp->b_bufsize = newbsize;
1831	bp->b_bcount = size;
1832	return 1;
1833}
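
/*
 * Note on the two backing schemes above: a non-VMIO buffer that starts
 * out at half a page or less (while bufmallocspace is still under
 * maxbufmallocspace) is backed by malloc()ed memory, which helps small
 * directory blocks; as soon as such a buffer grows it is converted to
 * ordinary page-backed memory and the old contents are copied over.
 * VMIO buffers never use the malloc path and are always mapped from
 * the pages of the associated VM object.
 */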
1834
1835/*
1836 * Wait for buffer I/O completion, returning error status.
1837 */
1838int
1839biowait(register struct buf * bp)
1840{
1841	int s;
1842
1843	s = splbio();
1844	while ((bp->b_flags & B_DONE) == 0)
1845#if defined(NO_SCHEDULE_MODS)
1846		tsleep(bp, PRIBIO, "biowait", 0);
1847#else
1848		if (bp->b_flags & B_READ)
1849			tsleep(bp, PRIBIO, "biord", 0);
1850		else
1851			tsleep(bp, PRIBIO, "biowr", 0);
1852#endif
1853	splx(s);
1854	if (bp->b_flags & B_EINTR) {
1855		bp->b_flags &= ~B_EINTR;
1856		return (EINTR);
1857	}
1858	if (bp->b_flags & B_ERROR) {
1859		return (bp->b_error ? bp->b_error : EIO);
1860	} else {
1861		return (0);
1862	}
1863}
1864
1865/*
1866 * Finish I/O on a buffer, calling an optional function.
1867 * This is usually called from interrupt level, so process blocking
1868 * is not *a good idea*.
1869 */
1870void
1871biodone(register struct buf * bp)
1872{
1873	int s;
1874
1875	s = splbio();
1876
1877#if !defined(MAX_PERF)
1878	if (!(bp->b_flags & B_BUSY))
1879		panic("biodone: buffer not busy");
1880#endif
1881
1882	if (bp->b_flags & B_DONE) {
1883		splx(s);
1884#if !defined(MAX_PERF)
1885		printf("biodone: buffer already done\n");
1886#endif
1887		return;
1888	}
1889	bp->b_flags |= B_DONE;
1890
1891	if ((bp->b_flags & B_READ) == 0) {
1892		vwakeup(bp);
1893	}
1894#ifdef BOUNCE_BUFFERS
1895	if (bp->b_flags & B_BOUNCE)
1896		vm_bounce_free(bp);
1897#endif
1898
1899	/* call optional completion function if requested */
1900	if (bp->b_flags & B_CALL) {
1901		bp->b_flags &= ~B_CALL;
1902		(*bp->b_iodone) (bp);
1903		splx(s);
1904		return;
1905	}
1906	if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_complete)
1907		(*bioops.io_complete)(bp);
1908
1909	if (bp->b_flags & B_VMIO) {
1910		int i, resid;
1911		vm_ooffset_t foff;
1912		vm_page_t m;
1913		vm_object_t obj;
1914		int iosize;
1915		struct vnode *vp = bp->b_vp;
1916
1917		obj = vp->v_object;
1918
1919#if defined(VFS_BIO_DEBUG)
1920		if (vp->v_usecount == 0) {
1921			panic("biodone: zero vnode ref count");
1922		}
1923
1924		if (vp->v_object == NULL) {
1925			panic("biodone: missing VM object");
1926		}
1927
1928		if ((vp->v_flag & VOBJBUF) == 0) {
1929			panic("biodone: vnode is not setup for merged cache");
1930		}
1931#endif
1932
1933		foff = bp->b_offset;
1934
1935#if !defined(MAX_PERF)
1936		if (!obj) {
1937			panic("biodone: no object");
1938		}
1939#endif
1940#if defined(VFS_BIO_DEBUG)
1941		if (obj->paging_in_progress < bp->b_npages) {
1942			printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
1943			    obj->paging_in_progress, bp->b_npages);
1944		}
1945#endif
1946		iosize = bp->b_bufsize;
1947		for (i = 0; i < bp->b_npages; i++) {
1948			int bogusflag = 0;
1949			m = bp->b_pages[i];
1950			if (m == bogus_page) {
1951				bogusflag = 1;
1952				m = vm_page_lookup(obj, OFF_TO_IDX(foff));
1953				if (!m) {
1954#if defined(VFS_BIO_DEBUG)
1955					printf("biodone: page disappeared\n");
1956#endif
1957					--obj->paging_in_progress;
1958					continue;
1959				}
1960				bp->b_pages[i] = m;
1961				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1962			}
1963#if defined(VFS_BIO_DEBUG)
1964			if (OFF_TO_IDX(foff) != m->pindex) {
1965				printf("biodone: foff(%d)/m->pindex(%d) mismatch\n", foff, m->pindex);
1966			}
1967#endif
1968			resid = IDX_TO_OFF(m->pindex + 1) - foff;
1969			if (resid > iosize)
1970				resid = iosize;
1971
1972			/*
1973			 * In the write case, the valid and clean bits are
1974			 * already changed correctly, so we only need to do this
1975			 * here in the read case.
1976			 */
1977			if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
1978				vfs_page_set_valid(bp, foff, i, m);
1979			}
1980
1981			/*
1982			 * when debugging new filesystems or buffer I/O methods, this
1983			 * is the most common error that pops up.  if you see this, you
1984			 * have not set the page busy flag correctly!!!
1985			 */
1986			if (m->busy == 0) {
1987#if !defined(MAX_PERF)
1988				printf("biodone: page busy < 0, "
1989				    "pindex: %d, foff: 0x(%x,%x), "
1990				    "resid: %d, index: %d\n",
1991				    (int) m->pindex, (int)(foff >> 32),
1992						(int) foff & 0xffffffff, resid, i);
1993#endif
1994				if (vp->v_type != VBLK)
1995#if !defined(MAX_PERF)
1996					printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n",
1997					    bp->b_vp->v_mount->mnt_stat.f_iosize,
1998					    (int) bp->b_lblkno,
1999					    bp->b_flags, bp->b_npages);
2000				else
2001					printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n",
2002					    (int) bp->b_lblkno,
2003					    bp->b_flags, bp->b_npages);
2004				printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n",
2005				    m->valid, m->dirty, m->wire_count);
2006#endif
2007				panic("biodone: page busy < 0\n");
2008			}
2009			PAGE_BWAKEUP(m);
2010			--obj->paging_in_progress;
2011			foff += resid;
2012			iosize -= resid;
2013		}
2014		if (obj &&
2015			(obj->paging_in_progress == 0) &&
2016		    (obj->flags & OBJ_PIPWNT)) {
2017			obj->flags &= ~OBJ_PIPWNT;
2018			wakeup(obj);
2019		}
2020	}
2021	/*
2022	 * For asynchronous completions, release the buffer now. The brelse
2023	 * checks for B_WANTED and will do the wakeup there if necessary - so
2024	 * no need to do a wakeup here in the async case.
2025	 */
2026
2027	if (bp->b_flags & B_ASYNC) {
2028		if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0)
2029			brelse(bp);
2030		else
2031			bqrelse(bp);
2032	} else {
2033		bp->b_flags &= ~B_WANTED;
2034		wakeup(bp);
2035	}
2036	splx(s);
2037}
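
/*
 * Completion-callback sketch: a caller that wants a callback instead of
 * sleeping in biowait() can set (typically together with B_ASYNC)
 *
 *	bp->b_flags |= B_CALL;
 *	bp->b_iodone = my_done_routine;		(name is illustrative)
 *
 * before VOP_STRATEGY(); biodone() then invokes the routine at splbio()
 * and returns without the usual brelse()/wakeup() handling, leaving the
 * buffer's disposition to the callback.
 */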
2038
2039static int
2040count_lock_queue()
2041{
2042	int count;
2043	struct buf *bp;
2044
2045	count = 0;
2046	for (bp = TAILQ_FIRST(&bufqueues[QUEUE_LOCKED]);
2047	    bp != NULL;
2048	    bp = TAILQ_NEXT(bp, b_freelist))
2049		count++;
2050	return (count);
2051}
2052
2053#if 0	/* not with kirks code */
2054static int vfs_update_interval = 30;
2055
2056static void
2057vfs_update()
2058{
2059	while (1) {
2060		tsleep(&vfs_update_wakeup, PUSER, "update",
2061		    hz * vfs_update_interval);
2062		vfs_update_wakeup = 0;
2063		sync(curproc, NULL);
2064	}
2065}
2066
2067static int
2068sysctl_kern_updateinterval SYSCTL_HANDLER_ARGS
2069{
2070	int error = sysctl_handle_int(oidp,
2071		oidp->oid_arg1, oidp->oid_arg2, req);
2072	if (!error)
2073		wakeup(&vfs_update_wakeup);
2074	return error;
2075}
2076
2077SYSCTL_PROC(_kern, KERN_UPDATEINTERVAL, update, CTLTYPE_INT|CTLFLAG_RW,
2078	&vfs_update_interval, 0, sysctl_kern_updateinterval, "I", "");
2079
2080#endif
2081
2082
2083/*
2084 * This routine is called in lieu of biodone in the case of
2085 * incomplete I/O.  This keeps the busy status for pages
2086 * consistent.
2087 */
2088void
2089vfs_unbusy_pages(struct buf * bp)
2090{
2091	int i;
2092
2093	if (bp->b_flags & B_VMIO) {
2094		struct vnode *vp = bp->b_vp;
2095		vm_object_t obj = vp->v_object;
2096
2097		for (i = 0; i < bp->b_npages; i++) {
2098			vm_page_t m = bp->b_pages[i];
2099
2100			if (m == bogus_page) {
2101				m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
2102#if !defined(MAX_PERF)
2103				if (!m) {
2104					panic("vfs_unbusy_pages: page missing\n");
2105				}
2106#endif
2107				bp->b_pages[i] = m;
2108				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
2109			}
2110			--obj->paging_in_progress;
2111			PAGE_BWAKEUP(m);
2112		}
2113		if (obj->paging_in_progress == 0 &&
2114		    (obj->flags & OBJ_PIPWNT)) {
2115			obj->flags &= ~OBJ_PIPWNT;
2116			wakeup(obj);
2117		}
2118	}
2119}
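
/*
 * Illustrative sketch only (hypothetical caller and condition names): if
 * I/O is backed out after vfs_busy_pages() but before the transfer ever
 * starts, the pages must be released here rather than via biodone(), e.g.:
 *
 *	vfs_busy_pages(bp, 1);
 *	if (device_rejects_request) {
 *		vfs_unbusy_pages(bp);
 *		bp->b_flags |= B_ERROR;
 *		... report the error back to the caller ...
 *	}
 */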
2120
2121/*
2122 * Set NFS' b_validoff and b_validend fields from the valid bits
2123 * of a page.  If the consumer is not NFS, and the page is not
2124 * valid for the entire range, clear the B_CACHE flag to force
2125 * the consumer to re-read the page.
2126 */
2127static void
2128vfs_buf_set_valid(struct buf *bp,
2129		  vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
2130		  vm_page_t m)
2131{
2132	if (bp->b_vp->v_tag == VT_NFS && bp->b_vp->v_type != VBLK) {
2133		vm_offset_t svalid, evalid;
2134		int validbits = m->valid;
2135
2136		/*
2137		 * This only bothers with the first valid range in the
2138		 * page.
2139		 */
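		/*
		 * For example, with DEV_BSIZE == 512 and m->valid == 0x0f0
		 * (i.e. DEV_BSIZE blocks 4-7 valid), the two loops below
		 * leave svalid == off + 4 * DEV_BSIZE and
		 * evalid == off + 8 * DEV_BSIZE.
		 */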
2140		svalid = off;
2141		while (validbits && !(validbits & 1)) {
2142			svalid += DEV_BSIZE;
2143			validbits >>= 1;
2144		}
2145		evalid = svalid;
2146		while (validbits & 1) {
2147			evalid += DEV_BSIZE;
2148			validbits >>= 1;
2149		}
2150		/*
2151		 * Make sure this range is contiguous with the range
2152		 * built up from previous pages.  If not, then we will
2153		 * just use the range from the previous pages.
2154		 */
2155		if (svalid == bp->b_validend) {
2156			bp->b_validoff = min(bp->b_validoff, svalid);
2157			bp->b_validend = max(bp->b_validend, evalid);
2158		}
2159	} else if (!vm_page_is_valid(m,
2160				     (vm_offset_t) ((foff + off) & PAGE_MASK),
2161				     size)) {
2162		bp->b_flags &= ~B_CACHE;
2163	}
2164}
2165
2166/*
2167 * Set the valid bits in a page, taking care of the b_validoff,
2168 * b_validend fields which NFS uses to optimise small reads.  Off is
2169 * the offset within the file and pageno is the page index within the buf.
2170 */
2171static void
2172vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
2173{
2174	struct vnode *vp = bp->b_vp;
2175	vm_ooffset_t soff, eoff;
2176
2177	soff = off;
2178	eoff = off + min(PAGE_SIZE, bp->b_bufsize);
2179	if (vp->v_tag == VT_NFS && vp->v_type != VBLK) {
2180		vm_ooffset_t sv, ev;
2181		vm_page_set_invalid(m,
2182		    (vm_offset_t) (soff & PAGE_MASK),
2183		    (vm_offset_t) (eoff - soff));
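		/*
		 * b_validoff/b_validend are offsets within the buffer, so
		 * pull off back to the file offset of the buffer's first
		 * page before converting them to file offsets below.
		 */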
2184		off = off - pageno * PAGE_SIZE;
2185		sv = off + ((bp->b_validoff + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1));
2186		ev = off + ((bp->b_validend + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1));
2187		soff = max(sv, soff);
2188		eoff = min(ev, eoff);
2189	}
2190	if (eoff > soff)
2191		vm_page_set_validclean(m,
2192	       (vm_offset_t) (soff & PAGE_MASK),
2193	       (vm_offset_t) (eoff - soff));
2194}
2195
2196/*
2197 * This routine is called before a device strategy routine.
2198 * It is used to tell the VM system that paging I/O is in
2199 * progress, and to treat the pages associated with the buffer
2200 * almost as being PG_BUSY.  The object's paging_in_progress
2201 * count is also bumped so that the object does not become
2202 * inconsistent.
2203 */
2204void
2205vfs_busy_pages(struct buf * bp, int clear_modify)
2206{
2207	int i,s;
2208
2209	if (bp->b_flags & B_VMIO) {
2210		struct vnode *vp = bp->b_vp;
2211		vm_object_t obj = vp->v_object;
2212		vm_ooffset_t foff;
2213
2214		foff = bp->b_offset;
2215
2216		vfs_setdirty(bp);
2217
2218retry:
2219		for (i = 0; i < bp->b_npages; i++) {
2220			vm_page_t m = bp->b_pages[i];
2221			if (vm_page_sleep(m, "vbpage", NULL))
2222				goto retry;
2223		}
2224
2225		for (i = 0; i < bp->b_npages; i++, foff += PAGE_SIZE) {
2226			vm_page_t m = bp->b_pages[i];
2227
2228			if ((bp->b_flags & B_CLUSTER) == 0) {
2229				obj->paging_in_progress++;
2230				m->busy++;
2231			}
2232
2233			vm_page_protect(m, VM_PROT_NONE);
2234			if (clear_modify)
2235				vfs_page_set_valid(bp, foff, i, m);
2236			else if (bp->b_bcount >= PAGE_SIZE) {
2237				if (m->valid && (bp->b_flags & B_CACHE) == 0) {
2238					bp->b_pages[i] = bogus_page;
2239					pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
2240				}
2241			}
2242		}
2243	}
2244}
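
/*
 * Illustrative sketch only (no particular call site is implied): the
 * usual sequence for starting I/O on a VMIO buffer.  clear_modify is
 * nonzero for writes (the pages become clean) and zero for reads:
 *
 *	vfs_busy_pages(bp, (bp->b_flags & B_READ) ? 0 : 1);
 *	... hand bp to the device/vnode strategy routine ...
 *	... biodone() (or vfs_unbusy_pages() on a backout) unbusies them ...
 */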
2245
2246/*
2247 * Tell the VM system that the pages associated with this buffer
2248 * are clean.  This is used for delayed writes where the data is
2249 * going to go to disk eventually without additional VM intervention.
2250 */
2251void
2252vfs_clean_pages(struct buf * bp)
2253{
2254	int i;
2255
2256	if (bp->b_flags & B_VMIO) {
2257		struct vnode *vp = bp->b_vp;
2258		vm_ooffset_t foff;
2259		foff = bp->b_offset;
2260
2261		for (i = 0; i < bp->b_npages; i++, foff += PAGE_SIZE) {
2262			vm_page_t m = bp->b_pages[i];
2263			vfs_page_set_valid(bp, foff, i, m);
2264		}
2265	}
2266}
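
/*
 * Illustrative sketch only (hypothetical delayed-write style caller, not
 * copied from this file): the pages are marked clean so the VM system will
 * not redundantly write out data that the delayed-write buffer is already
 * going to push to disk:
 *
 *	bp->b_flags |= B_DELWRI;
 *	vfs_clean_pages(bp);
 *	bqrelse(bp);
 */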
2267
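/*
 * Zero the portions of a VMIO buffer that are not already backed by
 * valid page contents and mark the affected page ranges valid; for
 * non-VMIO buffers simply clrbuf() the whole thing.
 */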
2268void
2269vfs_bio_clrbuf(struct buf *bp) {
2270	int i;
2271	if (bp->b_flags & B_VMIO) {
2272		if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE)) {
2273			int mask;
2274			mask = 0;
2275			for (i = 0; i < bp->b_bufsize; i += DEV_BSIZE)
2276				mask |= (1 << (i / DEV_BSIZE));
2277			if (bp->b_pages[0]->valid != mask) {
2278				bzero(bp->b_data, bp->b_bufsize);
2279			}
2280			bp->b_pages[0]->valid = mask;
2281			bp->b_resid = 0;
2282			return;
2283		}
2284		for (i = 0; i < bp->b_npages; i++) {
2285			if (bp->b_pages[i]->valid == VM_PAGE_BITS_ALL)
2286				continue;
2287			if (bp->b_pages[i]->valid == 0) {
2288				if ((bp->b_pages[i]->flags & PG_ZERO) == 0) {
2289					bzero(bp->b_data + (i << PAGE_SHIFT), PAGE_SIZE);
2290				}
2291			} else {
2292				int j;
2293				for (j = 0; j < PAGE_SIZE / DEV_BSIZE; j++) {
2294					if ((bp->b_pages[i]->valid & (1 << j)) == 0)
2295						bzero(bp->b_data + (i << PAGE_SHIFT) + j * DEV_BSIZE, DEV_BSIZE);
2296				}
2297			}
2298			bp->b_pages[i]->valid = VM_PAGE_BITS_ALL;
2299		}
2300		bp->b_resid = 0;
2301	} else {
2302		clrbuf(bp);
2303	}
2304}
2305
2306/*
2307 * vm_hold_load_pages and vm_hold_free_pages get pages into
2308 * a buffer's address space.  The pages are anonymous and are
2309 * not associated with a file object.
2310 */
2311void
2312vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
2313{
2314	vm_offset_t pg;
2315	vm_page_t p;
2316	int index;
2317
2318	to = round_page(to);
2319	from = round_page(from);
2320	index = (from - trunc_page(bp->b_data)) >> PAGE_SHIFT;
2321
2322	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
2323
2324tryagain:
2325
2326		p = vm_page_alloc(kernel_object,
2327			((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
2328		    VM_ALLOC_NORMAL);
2329		if (!p) {
2330			vm_pageout_deficit += (to - from) >> PAGE_SHIFT;
2331			VM_WAIT;
2332			goto tryagain;
2333		}
2334		vm_page_wire(p);
2335		p->valid = VM_PAGE_BITS_ALL;
2336		pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
2337		bp->b_pages[index] = p;
2338		PAGE_WAKEUP(p);
2339	}
2340	bp->b_npages = index;
2341}
2342
2343void
2344vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
2345{
2346	vm_offset_t pg;
2347	vm_page_t p;
2348	int index, newnpages;
2349
2350	from = round_page(from);
2351	to = round_page(to);
2352	newnpages = index = (from - trunc_page(bp->b_data)) >> PAGE_SHIFT;
2353
2354	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
2355		p = bp->b_pages[index];
2356		if (p && (index < bp->b_npages)) {
2357#if !defined(MAX_PERF)
2358			if (p->busy) {
2359				printf("vm_hold_free_pages: freeing busy page, blkno: %d, lblkno: %d\n",
2360					bp->b_blkno, bp->b_lblkno);
2361			}
2362#endif
2363			bp->b_pages[index] = NULL;
2364			pmap_kremove(pg);
2365			p->flags |= PG_BUSY;
2366			vm_page_unwire(p);
2367			vm_page_free(p);
2368		}
2369	}
2370	bp->b_npages = newnpages;
2371}
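
/*
 * Illustrative sketch only (newbsize is a placeholder): allocbuf() is the
 * expected consumer, growing or shrinking the anonymous pages backing a
 * non-VMIO buffer's kva:
 *
 *	vm_hold_load_pages(bp, (vm_offset_t)bp->b_data + bp->b_bufsize,
 *	    (vm_offset_t)bp->b_data + newbsize);
 *	...
 *	vm_hold_free_pages(bp, (vm_offset_t)bp->b_data + newbsize,
 *	    (vm_offset_t)bp->b_data + bp->b_bufsize);
 */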
2372
2373
2374#include "opt_ddb.h"
2375#ifdef DDB
2376#include <ddb/ddb.h>
2377
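/*
 * "show buffer <addr>" from DDB: dump the interesting fields of the
 * struct buf at <addr>.
 */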
2378DB_SHOW_COMMAND(buffer, db_show_buffer)
2379{
2380	/* get args */
2381	struct buf *bp = (struct buf *)addr;
2382
2383	if (!have_addr) {
2384		db_printf("usage: show buffer <addr>\n");
2385		return;
2386	}
2387
2388	db_printf("b_proc = %p,\nb_flags = 0x%b\n", (void *)bp->b_proc,
2389		  bp->b_flags, "\20\40bounce\37cluster\36vmio\35ram\34ordered"
2390		  "\33paging\32xxx\31writeinprog\30wanted\27relbuf\26tape"
2391		  "\25read\24raw\23phys\22clusterok\21malloc\20nocache"
2392		  "\17locked\16inval\15gathered\14error\13eintr\12done\11dirty"
2393		  "\10delwri\7call\6cache\5busy\4bad\3async\2needcommit\1age");
2394	db_printf("b_error = %d, b_bufsize = %ld, b_bcount = %ld, "
2395		  "b_resid = %ld\nb_dev = 0x%x, b_data = %p, "
2396		  "b_blkno = %d, b_pblkno = %d\n",
2397		  bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
2398		  bp->b_dev, bp->b_data, bp->b_blkno, bp->b_pblkno);
2399	if (bp->b_npages) {
2400		int i;
2401		db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
2402		for (i = 0; i < bp->b_npages; i++) {
2403			vm_page_t m;
2404			m = bp->b_pages[i];
2405			db_printf("(0x%x, 0x%x, 0x%x)", m->object, m->pindex,
2406				VM_PAGE_TO_PHYS(m));
2407			if ((i + 1) < bp->b_npages)
2408				db_printf(",");
2409		}
2410		db_printf("\n");
2411	}
2412}
2413#endif /* DDB */
2414