vfs_bio.c revision 42007
1/*
2 * Copyright (c) 1994,1997 John S. Dyson
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice immediately at the beginning of the file, without modification,
10 *    this list of conditions, and the following disclaimer.
11 * 2. Absolutely no warranty of function or purpose is made by the author
12 *		John S. Dyson.
13 *
14 * $Id: vfs_bio.c,v 1.187 1998/12/14 21:17:37 dillon Exp $
15 */
16
17/*
18 * this file contains a new buffer I/O scheme implementing a coherent
19 * VM object and buffer cache scheme.  Pains have been taken to make
20 * sure that the performance degradation associated with schemes such
21 * as this is not realized.
22 *
23 * Author:  John S. Dyson
24 * Significant help during the development and debugging phases
25 * was provided by David Greenman, also of the FreeBSD core team.
26 */
27
28#define VMIO
29#include <sys/param.h>
30#include <sys/systm.h>
31#include <sys/sysproto.h>
32#include <sys/kernel.h>
33#include <sys/sysctl.h>
34#include <sys/proc.h>
35#include <sys/vnode.h>
36#include <sys/vmmeter.h>
37#include <sys/lock.h>
38#include <miscfs/specfs/specdev.h>
39#include <vm/vm.h>
40#include <vm/vm_param.h>
41#include <vm/vm_prot.h>
42#include <vm/vm_kern.h>
43#include <vm/vm_pageout.h>
44#include <vm/vm_page.h>
45#include <vm/vm_object.h>
46#include <vm/vm_extern.h>
47#include <vm/vm_map.h>
48#include <sys/buf.h>
49#include <sys/mount.h>
50#include <sys/malloc.h>
51#include <sys/resourcevar.h>
52
53static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer");
54
55struct	bio_ops bioops;		/* I/O operation notification */
56
57#if 0 	/* replaced by sched_sync */
58static void vfs_update __P((void));
59static struct	proc *updateproc;
60static struct kproc_desc up_kp = {
61	"update",
62	vfs_update,
63	&updateproc
64};
65SYSINIT_KT(update, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
66#endif
67
68struct buf *buf;		/* buffer header pool */
69struct swqueue bswlist;
70
71static void vm_hold_free_pages(struct buf * bp, vm_offset_t from,
72		vm_offset_t to);
73static void vm_hold_load_pages(struct buf * bp, vm_offset_t from,
74		vm_offset_t to);
75static void vfs_buf_set_valid(struct buf *bp, vm_ooffset_t foff,
76			      vm_offset_t off, vm_offset_t size,
77			      vm_page_t m);
78static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off,
79			       int pageno, vm_page_t m);
80static void vfs_clean_pages(struct buf * bp);
81static void vfs_setdirty(struct buf *bp);
82static void vfs_vmio_release(struct buf *bp);
83static void flushdirtybuffers(int slpflag, int slptimeo);
84
85int needsbuffer;
86
87/*
88 * Internal update daemon, process 3
89 *	The variable vfs_update_wakeup allows for internal syncs.
90 */
91int vfs_update_wakeup;
92
93
94/*
95 * buffers base kva
96 */
97
98/*
99 * bogus page -- for I/O to/from partially complete buffers.
100 * This is a temporary solution to the problem, but it is not
101 * really that bad.  It would be better to split the buffer
102 * for input in the case of buffers partially already in memory,
103 * but the code is intricate enough already.
104 */
105vm_page_t bogus_page;
106static vm_offset_t bogus_offset;
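/*
 * Rough sketch of how bogus_page is used (descriptive note; the
 * substitution is set up elsewhere, e.g. in vfs_busy_pages()): pages of
 * a buffer that are already valid may be temporarily replaced by
 * bogus_page in b_pages[] so that device I/O cannot overwrite their
 * contents; brelse(), biodone() and vfs_unbusy_pages() later undo the
 * substitution by looking the real page up again with vm_page_lookup()
 * and re-entering the mappings with pmap_qenter().
 */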
107
108static int bufspace, maxbufspace, vmiospace, maxvmiobufspace,
109	bufmallocspace, maxbufmallocspace;
110int numdirtybuffers;
111static int lodirtybuffers, hidirtybuffers;
112static int numfreebuffers, lofreebuffers, hifreebuffers;
113static int kvafreespace;
114
115SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD,
116	&numdirtybuffers, 0, "");
117SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW,
118	&lodirtybuffers, 0, "");
119SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW,
120	&hidirtybuffers, 0, "");
121SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD,
122	&numfreebuffers, 0, "");
123SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW,
124	&lofreebuffers, 0, "");
125SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW,
126	&hifreebuffers, 0, "");
127SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW,
128	&maxbufspace, 0, "");
129SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD,
130	&bufspace, 0, "");
131SYSCTL_INT(_vfs, OID_AUTO, maxvmiobufspace, CTLFLAG_RW,
132	&maxvmiobufspace, 0, "");
133SYSCTL_INT(_vfs, OID_AUTO, vmiospace, CTLFLAG_RD,
134	&vmiospace, 0, "");
135SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW,
136	&maxbufmallocspace, 0, "");
137SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD,
138	&bufmallocspace, 0, "");
139SYSCTL_INT(_vfs, OID_AUTO, kvafreespace, CTLFLAG_RD,
140	&kvafreespace, 0, "");
141
142static LIST_HEAD(bufhashhdr, buf) bufhashtbl[BUFHSZ], invalhash;
143struct bqueues bufqueues[BUFFER_QUEUES] = {0};
144
145extern int vm_swap_size;
146
147#define BUF_MAXUSE 24
148
149#define VFS_BIO_NEED_ANY 1
150#define VFS_BIO_NEED_LOWLIMIT 2
151#define VFS_BIO_NEED_FREE 4
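/*
 * Descriptive note on the needsbuffer bits above: getnewbuf() sets
 * VFS_BIO_NEED_ANY while sleeping for any kind of buffer,
 * waitfreebuffers() sets VFS_BIO_NEED_FREE, and flushdirtybuffers()
 * sets VFS_BIO_NEED_LOWLIMIT; vfs_bio_need_satisfy() clears the bits
 * and wakes the sleepers as the corresponding thresholds are reached.
 */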
152
153/*
154 * Initialize buffer headers and related structures.
155 */
156void
157bufinit()
158{
159	struct buf *bp;
160	int i;
161
162	TAILQ_INIT(&bswlist);
163	LIST_INIT(&invalhash);
164
165	/* first, make a null hash table */
166	for (i = 0; i < BUFHSZ; i++)
167		LIST_INIT(&bufhashtbl[i]);
168
169	/* next, make a null set of free lists */
170	for (i = 0; i < BUFFER_QUEUES; i++)
171		TAILQ_INIT(&bufqueues[i]);
172
173	/* finally, initialize each buffer header and stick on empty q */
174	for (i = 0; i < nbuf; i++) {
175		bp = &buf[i];
176		bzero(bp, sizeof *bp);
177		bp->b_flags = B_INVAL;	/* we're just an empty header */
178		bp->b_dev = NODEV;
179		bp->b_rcred = NOCRED;
180		bp->b_wcred = NOCRED;
181		bp->b_qindex = QUEUE_EMPTY;
182		bp->b_xflags = 0;
183		LIST_INIT(&bp->b_dep);
184		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
185		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
186	}
187/*
188 * maxbufspace is currently calculated assuming that all filesystem blocks
189 * are 8K.  If you happen to use a 16K filesystem, the size of the buffer
190 * cache is still the same as it would be for an 8K filesystem.  This
191 * keeps the size of the buffer cache "in check" for big-block filesystems.
192 */
193	maxbufspace = (nbuf + 8) * DFLTBSIZE;
194/*
195 * reserve 1/3 of the buffers for metadata (VDIR) which might not be VMIO'ed
196 */
197	maxvmiobufspace = 2 * maxbufspace / 3;
198/*
199 * Limit the amount of malloc memory since it is wired permanently into
200 * the kernel space.  Even though this is accounted for in the buffer
201 * allocation, we don't want the malloced region to grow uncontrolled.
202 * The malloc scheme improves memory utilization significantly for
203 * average (small) directories.
204 */
205	maxbufmallocspace = maxbufspace / 20;
206
207/*
208 * Reduce the probability of deadlock conditions by limiting the
209 * number of dirty buffers.
210 */
211	hidirtybuffers = nbuf / 8 + 20;
212	lodirtybuffers = nbuf / 16 + 10;
213	numdirtybuffers = 0;
214	lofreebuffers = nbuf / 18 + 5;
215	hifreebuffers = 2 * lofreebuffers;
216	numfreebuffers = nbuf;
217	kvafreespace = 0;
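	/*
	 * Worked example of the sizing policy above, with hypothetical
	 * numbers (assuming the 8K DFLTBSIZE mentioned in the comment):
	 * for nbuf = 1000, maxbufspace = 1008 * 8K ~= 7.9MB,
	 * maxvmiobufspace ~= 5.3MB, maxbufmallocspace ~= 400KB,
	 * hidirtybuffers = 145, lodirtybuffers = 72, lofreebuffers = 60
	 * and hifreebuffers = 120.
	 */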
218
219	bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
220	bogus_page = vm_page_alloc(kernel_object,
221			((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
222			VM_ALLOC_NORMAL);
223
224}
225
226/*
227 * Free the kva allocation for a buffer.
228 * Must be called only at splbio or higher,
229 * as this is the only locking for buffer_map.
230 */
231static void
232bfreekva(struct buf * bp)
233{
234	if (bp->b_kvasize == 0)
235		return;
236
237	vm_map_delete(buffer_map,
238		(vm_offset_t) bp->b_kvabase,
239		(vm_offset_t) bp->b_kvabase + bp->b_kvasize);
240
241	bp->b_kvasize = 0;
242
243}
244
245/*
246 * remove the buffer from the appropriate free list
247 */
248void
249bremfree(struct buf * bp)
250{
251	int s = splbio();
252
253	if (bp->b_qindex != QUEUE_NONE) {
254		if (bp->b_qindex == QUEUE_EMPTY) {
255			kvafreespace -= bp->b_kvasize;
256		}
257		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
258		bp->b_qindex = QUEUE_NONE;
259	} else {
260#if !defined(MAX_PERF)
261		panic("bremfree: removing a buffer when not on a queue");
262#endif
263	}
264	if ((bp->b_flags & B_INVAL) ||
265		(bp->b_flags & (B_DELWRI|B_LOCKED)) == 0)
266		--numfreebuffers;
267	splx(s);
268}
269
270
271/*
272 * Get a buffer with the specified data.  Look in the cache first.
273 */
274int
275bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
276    struct buf ** bpp)
277{
278	struct buf *bp;
279
280	bp = getblk(vp, blkno, size, 0, 0);
281	*bpp = bp;
282
283	/* if not found in cache, do some I/O */
284	if ((bp->b_flags & B_CACHE) == 0) {
285		if (curproc != NULL)
286			curproc->p_stats->p_ru.ru_inblock++;
287		bp->b_flags |= B_READ;
288		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
289		if (bp->b_rcred == NOCRED) {
290			if (cred != NOCRED)
291				crhold(cred);
292			bp->b_rcred = cred;
293		}
294		vfs_busy_pages(bp, 0);
295		VOP_STRATEGY(vp, bp);
296		return (biowait(bp));
297	}
298	return (0);
299}
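/*
 * A minimal, hypothetical sketch of the usual calling convention for
 * bread() (not taken from any particular filesystem): the caller gets
 * a busy buffer back in *bpp even on error and must release it.
 *
 *	if ((error = bread(vp, lbn, bsize, NOCRED, &bp)) != 0) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	...use bp->b_data...
 *	bqrelse(bp);
 */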
300
301/*
302 * Operates like bread, but also starts asynchronous I/O on
303 * read-ahead blocks.
304 */
305int
306breadn(struct vnode * vp, daddr_t blkno, int size,
307    daddr_t * rablkno, int *rabsize,
308    int cnt, struct ucred * cred, struct buf ** bpp)
309{
310	struct buf *bp, *rabp;
311	int i;
312	int rv = 0, readwait = 0;
313
314	*bpp = bp = getblk(vp, blkno, size, 0, 0);
315
316	/* if not found in cache, do some I/O */
317	if ((bp->b_flags & B_CACHE) == 0) {
318		if (curproc != NULL)
319			curproc->p_stats->p_ru.ru_inblock++;
320		bp->b_flags |= B_READ;
321		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
322		if (bp->b_rcred == NOCRED) {
323			if (cred != NOCRED)
324				crhold(cred);
325			bp->b_rcred = cred;
326		}
327		vfs_busy_pages(bp, 0);
328		VOP_STRATEGY(vp, bp);
329		++readwait;
330	}
331	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
332		if (inmem(vp, *rablkno))
333			continue;
334		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
335
336		if ((rabp->b_flags & B_CACHE) == 0) {
337			if (curproc != NULL)
338				curproc->p_stats->p_ru.ru_inblock++;
339			rabp->b_flags |= B_READ | B_ASYNC;
340			rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
341			if (rabp->b_rcred == NOCRED) {
342				if (cred != NOCRED)
343					crhold(cred);
344				rabp->b_rcred = cred;
345			}
346			vfs_busy_pages(rabp, 0);
347			VOP_STRATEGY(vp, rabp);
348		} else {
349			brelse(rabp);
350		}
351	}
352
353	if (readwait) {
354		rv = biowait(bp);
355	}
356	return (rv);
357}
358
359/*
360 * Write, release buffer on completion.  (Done by iodone
361 * if async.)
362 */
363int
364bwrite(struct buf * bp)
365{
366	int oldflags, s;
367	struct vnode *vp;
368	struct mount *mp;
369
370
371	if (bp->b_flags & B_INVAL) {
372		brelse(bp);
373		return (0);
374	}
375
376	oldflags = bp->b_flags;
377
378#if !defined(MAX_PERF)
379	if ((bp->b_flags & B_BUSY) == 0)
380		panic("bwrite: buffer is not busy???");
381#endif
382
383	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
384	bp->b_flags |= B_WRITEINPROG;
385
386	s = splbio();
387	if ((oldflags & B_DELWRI) == B_DELWRI) {
388		--numdirtybuffers;
389		reassignbuf(bp, bp->b_vp);
390	}
391
392	bp->b_vp->v_numoutput++;
393	vfs_busy_pages(bp, 1);
394	if (curproc != NULL)
395		curproc->p_stats->p_ru.ru_oublock++;
396	splx(s);
397	VOP_STRATEGY(bp->b_vp, bp);
398
399	/*
400	 * Collect statistics on synchronous and asynchronous writes.
401	 * Writes to block devices are charged to their associated
402	 * filesystem (if any).
403	 */
404	if ((vp = bp->b_vp) != NULL) {
405		if (vp->v_type == VBLK)
406			mp = vp->v_specmountpoint;
407		else
408			mp = vp->v_mount;
409		if (mp != NULL)
410			if ((oldflags & B_ASYNC) == 0)
411				mp->mnt_stat.f_syncwrites++;
412			else
413				mp->mnt_stat.f_asyncwrites++;
414	}
415
416	if ((oldflags & B_ASYNC) == 0) {
417		int rtval = biowait(bp);
418		brelse(bp);
419		return (rtval);
420	}
421	return (0);
422}
423
424void
425vfs_bio_need_satisfy(void) {
426	++numfreebuffers;
427	if (!needsbuffer)
428		return;
429	if (numdirtybuffers < lodirtybuffers) {
430		needsbuffer &= ~(VFS_BIO_NEED_ANY | VFS_BIO_NEED_LOWLIMIT);
431	} else {
432		needsbuffer &= ~VFS_BIO_NEED_ANY;
433	}
434	if (numfreebuffers >= hifreebuffers) {
435		needsbuffer &= ~VFS_BIO_NEED_FREE;
436	}
437	wakeup(&needsbuffer);
438}
439
440/*
441 * Delayed write. (Buffer is marked dirty).
442 */
443void
444bdwrite(struct buf * bp)
445{
446	struct vnode *vp;
447
448#if !defined(MAX_PERF)
449	if ((bp->b_flags & B_BUSY) == 0) {
450		panic("bdwrite: buffer is not busy");
451	}
452#endif
453
454	if (bp->b_flags & B_INVAL) {
455		brelse(bp);
456		return;
457	}
458	bp->b_flags &= ~(B_READ|B_RELBUF);
459	if ((bp->b_flags & B_DELWRI) == 0) {
460		bp->b_flags |= B_DONE | B_DELWRI;
461		reassignbuf(bp, bp->b_vp);
462		++numdirtybuffers;
463	}
464
465	/*
466	 * Doing the bmap here keeps the system from needing to do it later,
467	 * perhaps while attempting a sync.  Since the indirect block -- or
468	 * whatever other data structure the filesystem needs -- is likely
469	 * still in memory now, this is a good time to do the translation.
470	 * Note also that if the pageout daemon is requesting a sync, there
471	 * might not be enough memory to do the bmap then, so doing it now
472	 * is important.
473	 */
474	if (bp->b_lblkno == bp->b_blkno) {
475		VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
476	}
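	/*
	 * (Descriptive note: b_lblkno == b_blkno is the conventional
	 * "not yet translated" state; after VOP_BMAP, b_blkno holds the
	 * underlying device block address.)
	 */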
477
478	/*
479	 * Set the *dirty* buffer range based upon the VM system dirty pages.
480	 */
481	vfs_setdirty(bp);
482
483	/*
484	 * We need to do this here to satisfy the vnode_pager and the
485	 * pageout daemon, so that they consider the pages to have been
486	 * "cleaned".  Since the pages are in a delayed-write buffer, the
487	 * VFS layer will see that they get written out on the next sync,
488	 * or perhaps the cluster will be completed.
489	 */
490	vfs_clean_pages(bp);
491	bqrelse(bp);
492
493	/*
494	 * XXX The soft dependency code is not prepared to
495	 * have I/O done when a bdwrite is requested. For
496	 * now we just let the write be delayed if it is
497	 * requested by the soft dependency code.
498	 */
499	if ((vp = bp->b_vp) &&
500	    ((vp->v_type == VBLK && vp->v_specmountpoint &&
501		  (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP)) ||
502		 (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP))))
503		return;
504
505	if (numdirtybuffers >= hidirtybuffers)
506		flushdirtybuffers(0, 0);
507
508	return;
509}
510
511
512/*
513 * Same as first half of bdwrite, mark buffer dirty, but do not release it.
514 * Check how this compares with vfs_setdirty(); XXX [JRE]
515 */
516void
517bdirty(bp)
518      struct buf *bp;
519{
520
521	bp->b_flags &= ~(B_READ|B_RELBUF); /* XXX ??? check this */
522	if ((bp->b_flags & B_DELWRI) == 0) {
523		bp->b_flags |= B_DONE | B_DELWRI; /* why done? XXX JRE */
524		reassignbuf(bp, bp->b_vp);
525		++numdirtybuffers;
526	}
527}
528
529/*
530 * Asynchronous write.
531 * Start output on a buffer, but do not wait for it to complete.
532 * The buffer is released when the output completes.
533 */
534void
535bawrite(struct buf * bp)
536{
537	bp->b_flags |= B_ASYNC;
538	(void) VOP_BWRITE(bp);
539}
540
541/*
542 * Ordered write.
543 * Start output on a buffer, and flag it so that the device will write
544 * it in the order it was queued.  The buffer is released when the output
545 * completes.
546 */
547int
548bowrite(struct buf * bp)
549{
550	bp->b_flags |= B_ORDERED|B_ASYNC;
551	return (VOP_BWRITE(bp));
552}
553
554/*
555 * Release a buffer.
556 */
557void
558brelse(struct buf * bp)
559{
560	int s;
561
562	if (bp->b_flags & B_CLUSTER) {
563		relpbuf(bp);
564		return;
565	}
566
567	s = splbio();
568
569	/* anyone need this block? */
570	if (bp->b_flags & B_WANTED) {
571		bp->b_flags &= ~(B_WANTED | B_AGE);
572		wakeup(bp);
573	}
574
575	if (bp->b_flags & B_LOCKED)
576		bp->b_flags &= ~B_ERROR;
577
578	if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_FREEBUF)) ||
579	    (bp->b_bufsize <= 0)) {
580		bp->b_flags |= B_INVAL;
581		if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
582			(*bioops.io_deallocate)(bp);
583		if (bp->b_flags & B_DELWRI)
584			--numdirtybuffers;
585		bp->b_flags &= ~(B_DELWRI | B_CACHE | B_FREEBUF);
586		if ((bp->b_flags & B_VMIO) == 0) {
587			if (bp->b_bufsize)
588				allocbuf(bp, 0);
589			if (bp->b_vp)
590				brelvp(bp);
591		}
592	}
593
594	/*
595	 * We must clear B_RELBUF if B_DELWRI is set.  If vfs_vmio_release()
596	 * is called with B_DELWRI set, the underlying pages may wind up
597	 * getting freed causing a previous write (bdwrite()) to get 'lost'
598	 * because pages associated with a B_DELWRI bp are marked clean.
599	 *
600	 * We still allow the B_INVAL case to call vfs_vmio_release(), even
601	 * if B_DELWRI is set.
602	 */
603
604	if (bp->b_flags & B_DELWRI)
605		bp->b_flags &= ~B_RELBUF;
606
607	/*
608	 * VMIO buffer rundown.  It is not strictly necessary to keep a VMIO buffer
609	 * constituted, so the B_INVAL flag is used to *invalidate* the buffer,
610	 * but the VM object is kept around.  The B_NOCACHE flag is used to
611	 * invalidate the pages in the VM object.
612	 *
613	 * If the buffer is a partially filled NFS buffer, keep it
614	 * since invalidating it now will lose information.  The valid
615	 * flags in the vm_pages have only DEV_BSIZE resolution but
616	 * the b_validoff, b_validend fields have byte resolution.
617	 * This can avoid unnecessary re-reads of the buffer.
618	 * XXX this seems to cause performance problems.
619	 */
620	if ((bp->b_flags & B_VMIO)
621	    && !(bp->b_vp->v_tag == VT_NFS &&
622		 bp->b_vp->v_type != VBLK &&
623		 (bp->b_flags & B_DELWRI) != 0)
624#ifdef notdef
625	    && (bp->b_vp->v_tag != VT_NFS
626		|| bp->b_vp->v_type == VBLK
627		|| (bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR))
628		|| bp->b_validend == 0
629		|| (bp->b_validoff == 0
630		    && bp->b_validend == bp->b_bufsize))
631#endif
632	    ) {
633
634		int i, j, resid;
635		vm_page_t m;
636		off_t foff;
637		vm_pindex_t poff;
638		vm_object_t obj;
639		struct vnode *vp;
640
641		vp = bp->b_vp;
642
643		resid = bp->b_bufsize;
644		foff = bp->b_offset;
645
646		for (i = 0; i < bp->b_npages; i++) {
647			m = bp->b_pages[i];
648			vm_page_flag_clear(m, PG_ZERO);
649			if (m == bogus_page) {
650
651				obj = (vm_object_t) vp->v_object;
652				poff = OFF_TO_IDX(bp->b_offset);
653
654				for (j = i; j < bp->b_npages; j++) {
655					m = bp->b_pages[j];
656					if (m == bogus_page) {
657						m = vm_page_lookup(obj, poff + j);
658#if !defined(MAX_PERF)
659						if (!m) {
660							panic("brelse: page missing\n");
661						}
662#endif
663						bp->b_pages[j] = m;
664					}
665				}
666
667				if ((bp->b_flags & B_INVAL) == 0) {
668					pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
669				}
670			}
671			if (bp->b_flags & (B_NOCACHE|B_ERROR)) {
672				int poffset = foff & PAGE_MASK;
673				int presid = resid > (PAGE_SIZE - poffset) ?
674					(PAGE_SIZE - poffset) : resid;
675#ifdef DIAGNOSTIC
676				if (presid < 0)
677					panic("brelse: extra page");
678#endif
679				vm_page_set_invalid(m, poffset, presid);
680			}
681			resid -= PAGE_SIZE - (foff & PAGE_MASK);
682			foff = (foff + PAGE_SIZE) & ~PAGE_MASK;
683		}
684
685		if (bp->b_flags & (B_INVAL | B_RELBUF))
686			vfs_vmio_release(bp);
687
688	} else if (bp->b_flags & B_VMIO) {
689
690		if (bp->b_flags & (B_INVAL | B_RELBUF))
691			vfs_vmio_release(bp);
692
693	}
694
695#if !defined(MAX_PERF)
696	if (bp->b_qindex != QUEUE_NONE)
697		panic("brelse: free buffer onto another queue???");
698#endif
699
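	/*
	 * Summary of the queue selection below (descriptive note):
	 * buffers with no memory go to QUEUE_EMPTY, invalid/error/
	 * nocache/relbuf buffers to the head of QUEUE_AGE, locked
	 * buffers to QUEUE_LOCKED, B_AGE buffers to the tail of
	 * QUEUE_AGE, and everything else to QUEUE_LRU.
	 */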
700	/* enqueue */
701	/* buffers with no memory */
702	if (bp->b_bufsize == 0) {
703		bp->b_flags |= B_INVAL;
704		bp->b_qindex = QUEUE_EMPTY;
705		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
706		LIST_REMOVE(bp, b_hash);
707		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
708		bp->b_dev = NODEV;
709		kvafreespace += bp->b_kvasize;
710
711	/* buffers with junk contents */
712	} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
713		bp->b_flags |= B_INVAL;
714		bp->b_qindex = QUEUE_AGE;
715		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
716		LIST_REMOVE(bp, b_hash);
717		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
718		bp->b_dev = NODEV;
719
720	/* buffers that are locked */
721	} else if (bp->b_flags & B_LOCKED) {
722		bp->b_qindex = QUEUE_LOCKED;
723		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
724
725	/* buffers with stale but valid contents */
726	} else if (bp->b_flags & B_AGE) {
727		bp->b_qindex = QUEUE_AGE;
728		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
729
730	/* buffers with valid and quite potentially reusable contents */
731	} else {
732		bp->b_qindex = QUEUE_LRU;
733		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
734	}
735
736	if ((bp->b_flags & B_INVAL) ||
737		(bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) {
738		if (bp->b_flags & B_DELWRI) {
739			--numdirtybuffers;
740			bp->b_flags &= ~B_DELWRI;
741		}
742		vfs_bio_need_satisfy();
743	}
744
745	/* unlock */
746	bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
747		B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
748	splx(s);
749}
750
751/*
752 * Release a buffer.
753 */
754void
755bqrelse(struct buf * bp)
756{
757	int s;
758
759	s = splbio();
760
761	/* anyone need this block? */
762	if (bp->b_flags & B_WANTED) {
763		bp->b_flags &= ~(B_WANTED | B_AGE);
764		wakeup(bp);
765	}
766
767#if !defined(MAX_PERF)
768	if (bp->b_qindex != QUEUE_NONE)
769		panic("bqrelse: free buffer onto another queue???");
770#endif
771
772	if (bp->b_flags & B_LOCKED) {
773		bp->b_flags &= ~B_ERROR;
774		bp->b_qindex = QUEUE_LOCKED;
775		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
776		/* buffers with stale but valid contents */
777	} else {
778		bp->b_qindex = QUEUE_LRU;
779		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
780	}
781
782	if ((bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) {
783		vfs_bio_need_satisfy();
784	}
785
786	/* unlock */
787	bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
788		B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
789	splx(s);
790}
791
792static void
793vfs_vmio_release(bp)
794	struct buf *bp;
795{
796	int i, s;
797	vm_page_t m;
798
799	s = splvm();
800	for (i = 0; i < bp->b_npages; i++) {
801		m = bp->b_pages[i];
802		bp->b_pages[i] = NULL;
803		/*
804		 * In order to keep page LRU ordering consistent, put
805		 * everything on the inactive queue.
806		 */
807		vm_page_unwire(m, 0);
808		/*
809		 * We don't mess with busy pages, it is
810		 * the responsibility of the process that
811		 * busied the pages to deal with them.
812		 */
813		if ((m->flags & PG_BUSY) || (m->busy != 0))
814			continue;
815
816		if (m->wire_count == 0) {
817			vm_page_flag_clear(m, PG_ZERO);
818			/*
819			 * Might as well free the page if we can and it has
820			 * no valid data.
821			 */
822			if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && m->hold_count == 0) {
823				vm_page_busy(m);
824				vm_page_protect(m, VM_PROT_NONE);
825				vm_page_free(m);
826			}
827		}
828	}
829	splx(s);
830	bufspace -= bp->b_bufsize;
831	vmiospace -= bp->b_bufsize;
832	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
833	bp->b_npages = 0;
834	bp->b_bufsize = 0;
835	bp->b_flags &= ~B_VMIO;
836	if (bp->b_vp)
837		brelvp(bp);
838}
839
840/*
841 * Check to see if a block is currently memory resident.
842 */
843struct buf *
844gbincore(struct vnode * vp, daddr_t blkno)
845{
846	struct buf *bp;
847	struct bufhashhdr *bh;
848
849	bh = BUFHASH(vp, blkno);
850	bp = bh->lh_first;
851
852	/* Search hash chain */
853	while (bp != NULL) {
854		/* hit */
855		if (bp->b_vp == vp && bp->b_lblkno == blkno &&
856		    (bp->b_flags & B_INVAL) == 0) {
857			break;
858		}
859		bp = bp->b_hash.le_next;
860	}
861	return (bp);
862}
863
864/*
865 * this routine implements clustered async writes for
866 * clearing out B_DELWRI buffers...  This is much better
867 * than the old way of writing only one buffer at a time.
868 */
869int
870vfs_bio_awrite(struct buf * bp)
871{
872	int i;
873	daddr_t lblkno = bp->b_lblkno;
874	struct vnode *vp = bp->b_vp;
875	int s;
876	int ncl;
877	struct buf *bpa;
878	int nwritten;
879	int size;
880	int maxcl;
881
882	s = splbio();
883	/*
884	 * right now we support clustered writing only to regular files
885	 */
886	if ((vp->v_type == VREG) &&
887	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
888	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
889
890		size = vp->v_mount->mnt_stat.f_iosize;
891		maxcl = MAXPHYS / size;
892
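		/*
		 * Descriptive note: maxcl bounds the scan below, since at
		 * most MAXPHYS bytes worth of blocks can go into a single
		 * physical transfer, so there is no point looking further
		 * ahead than that.
		 */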
893		for (i = 1; i < maxcl; i++) {
894			if ((bpa = gbincore(vp, lblkno + i)) &&
895			    ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
896			    (B_DELWRI | B_CLUSTEROK)) &&
897			    (bpa->b_bufsize == size)) {
898				if ((bpa->b_blkno == bpa->b_lblkno) ||
899				    (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
900					break;
901			} else {
902				break;
903			}
904		}
905		ncl = i;
906		/*
907		 * this is a possible cluster write
908		 */
909		if (ncl != 1) {
910			nwritten = cluster_wbuild(vp, size, lblkno, ncl);
911			splx(s);
912			return nwritten;
913		}
914	}
915
916	bremfree(bp);
917	bp->b_flags |= B_BUSY | B_ASYNC;
918
919	splx(s);
920	/*
921	 * default (old) behavior, writing out only one block
922	 */
923	nwritten = bp->b_bufsize;
924	(void) VOP_BWRITE(bp);
925	return nwritten;
926}
927
928
929/*
930 * Find a buffer header which is available for use.
931 */
932static struct buf *
933getnewbuf(struct vnode *vp, daddr_t blkno,
934	int slpflag, int slptimeo, int size, int maxsize)
935{
936	struct buf *bp, *bp1;
937	int nbyteswritten = 0;
938	vm_offset_t addr;
939	static int writerecursion = 0;
940
941start:
942	if (bufspace >= maxbufspace)
943		goto trytofreespace;
944
945	/* can we constitute a new buffer? */
946	if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]))) {
947#if !defined(MAX_PERF)
948		if (bp->b_qindex != QUEUE_EMPTY)
949			panic("getnewbuf: inconsistent EMPTY queue, qindex=%d",
950			    bp->b_qindex);
951#endif
952		bp->b_flags |= B_BUSY;
953		bremfree(bp);
954		goto fillbuf;
955	}
956trytofreespace:
957	/*
958	 * We keep the file I/O from hogging metadata I/O.
959	 * This is desirable because file data is cached in the
960	 * VM/Buffer cache even if a buffer is freed.
961	 */
962	if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]))) {
963#if !defined(MAX_PERF)
964		if (bp->b_qindex != QUEUE_AGE)
965			panic("getnewbuf: inconsistent AGE queue, qindex=%d",
966			    bp->b_qindex);
967#endif
968	} else if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]))) {
969#if !defined(MAX_PERF)
970		if (bp->b_qindex != QUEUE_LRU)
971			panic("getnewbuf: inconsistent LRU queue, qindex=%d",
972			    bp->b_qindex);
973#endif
974	}
975	if (!bp) {
976		/* wait for a free buffer of any kind */
977		needsbuffer |= VFS_BIO_NEED_ANY;
978		do
979			tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf",
980			    slptimeo);
981		while (needsbuffer & VFS_BIO_NEED_ANY);
982		return (0);
983	}
984
985#if defined(DIAGNOSTIC)
986	if (bp->b_flags & B_BUSY) {
987		panic("getnewbuf: busy buffer on free list\n");
988	}
989#endif
990
991	/*
992	 * We are fairly aggressive about freeing VMIO buffers, but since
993	 * the cached data remains intact in the VM object, there is not
994	 * much loss.  We gain by maintaining non-VMIOed metadata in buffers.
995	 */
996	if ((bp->b_qindex == QUEUE_LRU) && (bp->b_usecount > 0)) {
997		if ((bp->b_flags & B_VMIO) == 0 ||
998			(vmiospace < maxvmiobufspace)) {
999			--bp->b_usecount;
1000			TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist);
1001			if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) {
1002				TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
1003				goto start;
1004			}
1005			TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
1006		}
1007	}
1008
1009
1010	/* if we are a delayed write, convert to an async write */
1011	if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
1012
1013		/*
1014		 * If our delayed write is likely to be used soon, then
1015		 * recycle back onto the LRU queue.
1016		 */
1017		if (vp && (bp->b_vp == vp) && (bp->b_qindex == QUEUE_LRU) &&
1018			(bp->b_lblkno >= blkno) && (maxsize > 0)) {
1019
1020			if (bp->b_usecount > 0) {
1021				if (bp->b_lblkno < blkno + (MAXPHYS / maxsize)) {
1022
1023					TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist);
1024
1025					if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) {
1026						TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
1027						bp->b_usecount--;
1028						goto start;
1029					}
1030					TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
1031				}
1032			}
1033		}
1034
1035		/*
1036		 * Certain layered filesystems can recursively re-enter the vfs_bio
1037		 * code, due to delayed writes.  This helps keep the system from
1038		 * deadlocking.
1039		 */
1040		if (writerecursion > 0) {
1041			if (writerecursion > 5) {
1042				bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]);
1043				while (bp) {
1044					if ((bp->b_flags & B_DELWRI) == 0)
1045						break;
1046					bp = TAILQ_NEXT(bp, b_freelist);
1047				}
1048				if (bp == NULL) {
1049					bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]);
1050					while (bp) {
1051						if ((bp->b_flags & B_DELWRI) == 0)
1052							break;
1053						bp = TAILQ_NEXT(bp, b_freelist);
1054					}
1055				}
1056				if (bp == NULL)
1057					panic("getnewbuf: cannot get buffer, infinite recursion failure");
1058			} else {
1059				bremfree(bp);
1060				bp->b_flags |= B_BUSY | B_AGE | B_ASYNC;
1061				nbyteswritten += bp->b_bufsize;
1062				++writerecursion;
1063				VOP_BWRITE(bp);
1064				--writerecursion;
1065				if (!slpflag && !slptimeo) {
1066					return (0);
1067				}
1068				goto start;
1069			}
1070		} else {
1071			++writerecursion;
1072			nbyteswritten += vfs_bio_awrite(bp);
1073			--writerecursion;
1074			if (!slpflag && !slptimeo) {
1075				return (0);
1076			}
1077			goto start;
1078		}
1079	}
1080
1081	if (bp->b_flags & B_WANTED) {
1082		bp->b_flags &= ~B_WANTED;
1083		wakeup(bp);
1084	}
1085	bremfree(bp);
1086	bp->b_flags |= B_BUSY;
1087
1088	if (bp->b_flags & B_VMIO) {
1089		bp->b_flags &= ~B_ASYNC;
1090		vfs_vmio_release(bp);
1091	}
1092
1093	if (bp->b_vp)
1094		brelvp(bp);
1095
1096fillbuf:
1097
1098	/* we are not free, nor do we contain interesting data */
1099	if (bp->b_rcred != NOCRED) {
1100		crfree(bp->b_rcred);
1101		bp->b_rcred = NOCRED;
1102	}
1103	if (bp->b_wcred != NOCRED) {
1104		crfree(bp->b_wcred);
1105		bp->b_wcred = NOCRED;
1106	}
1107	if (LIST_FIRST(&bp->b_dep) != NULL &&
1108	    bioops.io_deallocate)
1109		(*bioops.io_deallocate)(bp);
1110
1111	LIST_REMOVE(bp, b_hash);
1112	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
1113	if (bp->b_bufsize) {
1114		allocbuf(bp, 0);
1115	}
1116	bp->b_flags = B_BUSY;
1117	bp->b_dev = NODEV;
1118	bp->b_vp = NULL;
1119	bp->b_blkno = bp->b_lblkno = 0;
1120	bp->b_offset = NOOFFSET;
1121	bp->b_iodone = 0;
1122	bp->b_error = 0;
1123	bp->b_resid = 0;
1124	bp->b_bcount = 0;
1125	bp->b_npages = 0;
1126	bp->b_dirtyoff = bp->b_dirtyend = 0;
1127	bp->b_validoff = bp->b_validend = 0;
1128	bp->b_usecount = 5;
1129	/* Here, not kern_physio.c, is where this should be done */
1130	LIST_INIT(&bp->b_dep);
1131
1132	maxsize = (maxsize + PAGE_MASK) & ~PAGE_MASK;
1133
1134	/*
1135	 * we assume that buffer_map is not at address 0
1136	 */
1137	addr = 0;
1138	if (maxsize != bp->b_kvasize) {
1139		bfreekva(bp);
1140
1141findkvaspace:
1142		/*
1143		 * See if we have buffer kva space
1144		 */
1145		if (vm_map_findspace(buffer_map,
1146			vm_map_min(buffer_map), maxsize, &addr)) {
1147			if (kvafreespace > 0) {
1148				int totfree = 0, freed;
1149				do {
1150					freed = 0;
1151					for (bp1 = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
1152						bp1 != NULL; bp1 = TAILQ_NEXT(bp1, b_freelist)) {
1153						if (bp1->b_kvasize != 0) {
1154							totfree += bp1->b_kvasize;
1155							freed = bp1->b_kvasize;
1156							bremfree(bp1);
1157							bfreekva(bp1);
1158							brelse(bp1);
1159							break;
1160						}
1161					}
1162				} while (freed);
1163				/*
1164				 * if we found free space, then retry with the same buffer.
1165				 */
1166				if (totfree)
1167					goto findkvaspace;
1168			}
1169			bp->b_flags |= B_INVAL;
1170			brelse(bp);
1171			goto trytofreespace;
1172		}
1173	}
1174
1175	/*
1176	 * See if we are over our allocated buffer space limit
1177	 */
1178	if (bufspace >= (maxbufspace + nbyteswritten)) {
1179		bp->b_flags |= B_INVAL;
1180		brelse(bp);
1181		goto trytofreespace;
1182	}
1183
1184	/*
1185	 * create a map entry for the buffer -- in essence
1186	 * reserving the kva space.
1187	 */
1188	if (addr) {
1189		vm_map_insert(buffer_map, NULL, 0,
1190			addr, addr + maxsize,
1191			VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
1192
1193		bp->b_kvabase = (caddr_t) addr;
1194		bp->b_kvasize = maxsize;
1195	}
1196	bp->b_data = bp->b_kvabase;
1197
1198	return (bp);
1199}
1200
1201static void
1202waitfreebuffers(int slpflag, int slptimeo) {
1203	while (numfreebuffers < hifreebuffers) {
1204		flushdirtybuffers(slpflag, slptimeo);
1205		if (numfreebuffers < hifreebuffers)
1206			break;
1207		needsbuffer |= VFS_BIO_NEED_FREE;
1208		if (tsleep(&needsbuffer, (PRIBIO + 4)|slpflag, "biofre", slptimeo))
1209			break;
1210	}
1211}
1212
1213static void
1214flushdirtybuffers(int slpflag, int slptimeo) {
1215	int s;
1216	static pid_t flushing = 0;
1217
1218	s = splbio();
1219
1220	if (flushing) {
1221		if (flushing == curproc->p_pid) {
1222			splx(s);
1223			return;
1224		}
1225		while (flushing) {
1226			if (tsleep(&flushing, (PRIBIO + 4)|slpflag, "biofls", slptimeo)) {
1227				splx(s);
1228				return;
1229			}
1230		}
1231	}
1232	flushing = curproc->p_pid;
1233
1234	while (numdirtybuffers > lodirtybuffers) {
1235		struct buf *bp;
1236		needsbuffer |= VFS_BIO_NEED_LOWLIMIT;
1237		bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]);
1238		if (bp == NULL)
1239			bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]);
1240
1241		while (bp && ((bp->b_flags & B_DELWRI) == 0)) {
1242			bp = TAILQ_NEXT(bp, b_freelist);
1243		}
1244
1245		if (bp) {
1246			vfs_bio_awrite(bp);
1247			continue;
1248		}
1249		break;
1250	}
1251
1252	flushing = 0;
1253	wakeup(&flushing);
1254	splx(s);
1255}
1256
1257/*
1258 * Check to see if a block is currently memory resident.
1259 */
1260struct buf *
1261incore(struct vnode * vp, daddr_t blkno)
1262{
1263	struct buf *bp;
1264
1265	int s = splbio();
1266	bp = gbincore(vp, blkno);
1267	splx(s);
1268	return (bp);
1269}
1270
1271/*
1272 * Returns true if no I/O is needed to access the
1273 * associated VM object.  This is like incore except
1274 * it also hunts around in the VM system for the data.
1275 */
1276
1277int
1278inmem(struct vnode * vp, daddr_t blkno)
1279{
1280	vm_object_t obj;
1281	vm_offset_t toff, tinc, size;
1282	vm_page_t m;
1283	vm_ooffset_t off;
1284
1285	if (incore(vp, blkno))
1286		return 1;
1287	if (vp->v_mount == NULL)
1288		return 0;
1289	if ((vp->v_object == NULL) || (vp->v_flag & VOBJBUF) == 0)
1290		return 0;
1291
1292	obj = vp->v_object;
1293	size = PAGE_SIZE;
1294	if (size > vp->v_mount->mnt_stat.f_iosize)
1295		size = vp->v_mount->mnt_stat.f_iosize;
1296	off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;
1297
1298	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
1299		m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
1300		if (!m)
1301			return 0;
1302		tinc = size;
1303		if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK))
1304			tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK);
1305		if (vm_page_is_valid(m,
1306		    (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0)
1307			return 0;
1308	}
1309	return 1;
1310}
1311
1312/*
1313 * now we set the dirty range for the buffer --
1314 * for NFS -- if the file is mapped and pages have
1315 * been written to, let it know.  We want the
1316 * entire range of the buffer to be marked dirty if
1317 * any of the pages have been written to for consistency
1318 * with the b_validoff, b_validend set in the nfs write
1319 * code, and used by the nfs read code.
1320 */
1321static void
1322vfs_setdirty(struct buf *bp) {
1323	int i;
1324	vm_object_t object;
1325	vm_offset_t boffset, offset;
1326	/*
1327	 * We qualify the scan for modified pages on whether the
1328	 * object has been flushed yet.  The OBJ_WRITEABLE flag
1329	 * is not cleared simply by protecting pages off.
1330	 */
1331	if ((bp->b_flags & B_VMIO) &&
1332		((object = bp->b_pages[0]->object)->flags & (OBJ_WRITEABLE|OBJ_CLEANING))) {
1333		/*
1334		 * test the pages to see if they have been modified directly
1335		 * by users through the VM system.
1336		 */
1337		for (i = 0; i < bp->b_npages; i++) {
1338			vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
1339			vm_page_test_dirty(bp->b_pages[i]);
1340		}
1341
1342		/*
1343		 * scan forwards for the first page modified
1344		 */
1345		for (i = 0; i < bp->b_npages; i++) {
1346			if (bp->b_pages[i]->dirty) {
1347				break;
1348			}
1349		}
1350		boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
1351		if (boffset < bp->b_dirtyoff) {
1352			bp->b_dirtyoff = max(boffset, 0);
1353		}
1354
1355		/*
1356		 * scan backwards for the last page modified
1357		 */
1358		for (i = bp->b_npages - 1; i >= 0; --i) {
1359			if (bp->b_pages[i]->dirty) {
1360				break;
1361			}
1362		}
1363		boffset = (i + 1);
1364#if 0
1365		offset = boffset + bp->b_pages[0]->pindex;
1366		if (offset >= object->size)
1367			boffset = object->size - bp->b_pages[0]->pindex;
1368#endif
1369		boffset = (boffset << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
1370		if (bp->b_dirtyend < boffset)
1371			bp->b_dirtyend = min(boffset, bp->b_bufsize);
1372	}
1373}
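/*
 * Worked example of the range-widening above, with hypothetical
 * numbers (assuming a 4K PAGE_SIZE and a page-aligned b_offset): if a
 * 16K buffer already has b_dirtyoff/b_dirtyend = 12K/16K from a write,
 * and a user mapping dirtied page 1 (offsets 4K-8K), both scans stop
 * at i == 1, so b_dirtyoff drops to 4K while b_dirtyend stays at 16K;
 * the dirty range only ever grows here.
 */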
1374
1375/*
1376 * Get a block given a specified block and offset into a file/device.
1377 */
1378struct buf *
1379getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
1380{
1381	struct buf *bp;
1382	int i, s;
1383	struct bufhashhdr *bh;
1384	int maxsize;
1385
1386#if !defined(MAX_PERF)
1387	if (size > MAXBSIZE)
1388		panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
1389#endif
1390
1391	s = splbio();
1392loop:
1393	if (numfreebuffers < lofreebuffers) {
1394		waitfreebuffers(slpflag, slptimeo);
1395	}
1396
1397	if ((bp = gbincore(vp, blkno))) {
1398		if (bp->b_flags & B_BUSY) {
1399
1400			bp->b_flags |= B_WANTED;
1401			if (bp->b_usecount < BUF_MAXUSE)
1402				++bp->b_usecount;
1403
1404			if (!tsleep(bp,
1405				(PRIBIO + 4) | slpflag, "getblk", slptimeo)) {
1406				goto loop;
1407			}
1408
1409			splx(s);
1410			return (struct buf *) NULL;
1411		}
1412		bp->b_flags |= B_BUSY | B_CACHE;
1413		bremfree(bp);
1414
1415		/*
1416		 * check for size inconsistencies (note that they shouldn't
1417		 * happen but do when filesystems don't handle the size changes
1418		 * correctly.) We are conservative on metadata and don't just
1419		 * extend the buffer but write (if needed) and re-constitute it.
1420		 */
1421
1422		if (bp->b_bcount != size) {
1423			if ((bp->b_flags & B_VMIO) && (size <= bp->b_kvasize)) {
1424				allocbuf(bp, size);
1425			} else {
1426				if (bp->b_flags & B_DELWRI) {
1427					bp->b_flags |= B_NOCACHE;
1428					VOP_BWRITE(bp);
1429				} else {
1430					if ((bp->b_flags & B_VMIO) &&
1431					   (LIST_FIRST(&bp->b_dep) == NULL)) {
1432						bp->b_flags |= B_RELBUF;
1433						brelse(bp);
1434					} else {
1435						bp->b_flags |= B_NOCACHE;
1436						VOP_BWRITE(bp);
1437					}
1438				}
1439				goto loop;
1440			}
1441		}
1442
1443#ifdef DIAGNOSTIC
1444		if (bp->b_offset == NOOFFSET)
1445			panic("getblk: no buffer offset");
1446#endif
1447
1448		/*
1449		 * Check that the constituted buffer really deserves to have
1450		 * the B_CACHE bit set.  B_VMIO type buffers might not
1451		 * contain fully valid pages.  Normal (old-style) buffers
1452		 * should be fully valid.
1453		 */
1454		if (bp->b_flags & B_VMIO) {
1455			int checksize = bp->b_bufsize;
1456			int poffset = bp->b_offset & PAGE_MASK;
1457			int resid;
1458			for (i = 0; i < bp->b_npages; i++) {
1459				resid = (checksize > (PAGE_SIZE - poffset)) ?
1460					(PAGE_SIZE - poffset) : checksize;
1461				if (!vm_page_is_valid(bp->b_pages[i], poffset, resid)) {
1462					bp->b_flags &= ~(B_CACHE | B_DONE);
1463					break;
1464				}
1465				checksize -= resid;
1466				poffset = 0;
1467			}
1468		}
1469
1470		if (bp->b_usecount < BUF_MAXUSE)
1471			++bp->b_usecount;
1472		splx(s);
1473		return (bp);
1474	} else {
1475		int bsize, maxsize, vmio;
1476		off_t offset;
1477
1478		if (vp->v_type == VBLK)
1479			bsize = DEV_BSIZE;
1480		else if (vp->v_mountedhere)
1481			bsize = vp->v_mountedhere->mnt_stat.f_iosize;
1482		else if (vp->v_mount)
1483			bsize = vp->v_mount->mnt_stat.f_iosize;
1484		else
1485			bsize = size;
1486
1487		offset = (off_t)blkno * bsize;
1488		vmio = (vp->v_object != 0) && (vp->v_flag & VOBJBUF);
1489		maxsize = vmio ? size + (offset & PAGE_MASK) : size;
1490		maxsize = imax(maxsize, bsize);
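		/*
		 * Example with hypothetical numbers (assuming a 4K
		 * PAGE_SIZE): for a VMIO-capable vnode with an 8K
		 * f_iosize, blkno 3 yields offset 24K; that is page
		 * aligned, so maxsize stays at 'size' and is then rounded
		 * up to at least bsize by imax().
		 */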
1491
1492		if ((bp = getnewbuf(vp, blkno,
1493			slpflag, slptimeo, size, maxsize)) == 0) {
1494			if (slpflag || slptimeo) {
1495				splx(s);
1496				return NULL;
1497			}
1498			goto loop;
1499		}
1500
1501		/*
1502		 * This code is used to make sure that a buffer is not
1503		 * created while the getnewbuf routine is blocked.
1504		 * Normally the vnode is locked so this isn't a problem.
1505		 * VBLK type I/O requests, however, don't lock the vnode.
1506		 */
1507		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE && gbincore(vp, blkno)) {
1508			bp->b_flags |= B_INVAL;
1509			brelse(bp);
1510			goto loop;
1511		}
1512
1513		/*
1514		 * Insert the buffer into the hash, so that it can
1515		 * be found by incore.
1516		 */
1517		bp->b_blkno = bp->b_lblkno = blkno;
1518		bp->b_offset = offset;
1519
1520		bgetvp(vp, bp);
1521		LIST_REMOVE(bp, b_hash);
1522		bh = BUFHASH(vp, blkno);
1523		LIST_INSERT_HEAD(bh, bp, b_hash);
1524
1525		if (vmio) {
1526			bp->b_flags |= (B_VMIO | B_CACHE);
1527#if defined(VFS_BIO_DEBUG)
1528			if (vp->v_type != VREG && vp->v_type != VBLK)
1529				printf("getblk: vmioing file type %d???\n", vp->v_type);
1530#endif
1531		} else {
1532			bp->b_flags &= ~B_VMIO;
1533		}
1534
1535		allocbuf(bp, size);
1536
1537		splx(s);
1538		return (bp);
1539	}
1540}
1541
1542/*
1543 * Get an empty, disassociated buffer of given size.
1544 */
1545struct buf *
1546geteblk(int size)
1547{
1548	struct buf *bp;
1549	int s;
1550
1551	s = splbio();
1552	while ((bp = getnewbuf(0, (daddr_t) 0, 0, 0, size, MAXBSIZE)) == 0);
1553	splx(s);
1554	allocbuf(bp, size);
1555	bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */
1556	return (bp);
1557}
1558
1559
1560/*
1561 * This code constitutes the buffer memory from either anonymous system
1562 * memory (in the case of non-VMIO operations) or from an associated
1563 * VM object (in the case of VMIO operations).
1564 *
1565 * Note that this code is tricky, and has many complications to resolve
1566 * deadlock or inconsistent data situations.  Tread lightly!!!
1567 *
1568 * Modify the length of a buffer's underlying buffer storage without
1569 * destroying information (unless, of course the buffer is shrinking).
1570 */
1571int
1572allocbuf(struct buf * bp, int size)
1573{
1574
1575	int s;
1576	int newbsize, mbsize;
1577	int i;
1578
1579#if !defined(MAX_PERF)
1580	if (!(bp->b_flags & B_BUSY))
1581		panic("allocbuf: buffer not busy");
1582
1583	if (bp->b_kvasize < size)
1584		panic("allocbuf: buffer too small");
1585#endif
1586
1587	if ((bp->b_flags & B_VMIO) == 0) {
1588		caddr_t origbuf;
1589		int origbufsize;
1590		/*
1591		 * Just get anonymous memory from the kernel
1592		 */
1593		mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
1594#if !defined(NO_B_MALLOC)
1595		if (bp->b_flags & B_MALLOC)
1596			newbsize = mbsize;
1597		else
1598#endif
1599			newbsize = round_page(size);
1600
1601		if (newbsize < bp->b_bufsize) {
1602#if !defined(NO_B_MALLOC)
1603			/*
1604			 * malloced buffers are not shrunk
1605			 */
1606			if (bp->b_flags & B_MALLOC) {
1607				if (newbsize) {
1608					bp->b_bcount = size;
1609				} else {
1610					free(bp->b_data, M_BIOBUF);
1611					bufspace -= bp->b_bufsize;
1612					bufmallocspace -= bp->b_bufsize;
1613					bp->b_data = bp->b_kvabase;
1614					bp->b_bufsize = 0;
1615					bp->b_bcount = 0;
1616					bp->b_flags &= ~B_MALLOC;
1617				}
1618				return 1;
1619			}
1620#endif
1621			vm_hold_free_pages(
1622			    bp,
1623			    (vm_offset_t) bp->b_data + newbsize,
1624			    (vm_offset_t) bp->b_data + bp->b_bufsize);
1625		} else if (newbsize > bp->b_bufsize) {
1626#if !defined(NO_B_MALLOC)
1627			/*
1628			 * We only use malloced memory on the first allocation,
1629			 * and revert to page-allocated memory when the buffer grows.
1630			 */
1631			if ( (bufmallocspace < maxbufmallocspace) &&
1632				(bp->b_bufsize == 0) &&
1633				(mbsize <= PAGE_SIZE/2)) {
1634
1635				bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
1636				bp->b_bufsize = mbsize;
1637				bp->b_bcount = size;
1638				bp->b_flags |= B_MALLOC;
1639				bufspace += mbsize;
1640				bufmallocspace += mbsize;
1641				return 1;
1642			}
1643#endif
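			/*
			 * Illustrative note (assuming a 4K PAGE_SIZE): a
			 * 512-byte or 1K first-time allocation takes the
			 * malloc path above; if the buffer later grows,
			 * the B_MALLOC case below saves origbuf, wires
			 * fresh page-backed kva via vm_hold_load_pages(),
			 * copies the old contents over and reverts to the
			 * normal scheme.
			 */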
1644			origbuf = NULL;
1645			origbufsize = 0;
1646#if !defined(NO_B_MALLOC)
1647			/*
1648			 * If the buffer is growing on its other-than-first allocation,
1649			 * then we revert to the page-allocation scheme.
1650			 */
1651			if (bp->b_flags & B_MALLOC) {
1652				origbuf = bp->b_data;
1653				origbufsize = bp->b_bufsize;
1654				bp->b_data = bp->b_kvabase;
1655				bufspace -= bp->b_bufsize;
1656				bufmallocspace -= bp->b_bufsize;
1657				bp->b_bufsize = 0;
1658				bp->b_flags &= ~B_MALLOC;
1659				newbsize = round_page(newbsize);
1660			}
1661#endif
1662			vm_hold_load_pages(
1663			    bp,
1664			    (vm_offset_t) bp->b_data + bp->b_bufsize,
1665			    (vm_offset_t) bp->b_data + newbsize);
1666#if !defined(NO_B_MALLOC)
1667			if (origbuf) {
1668				bcopy(origbuf, bp->b_data, origbufsize);
1669				free(origbuf, M_BIOBUF);
1670			}
1671#endif
1672		}
1673	} else {
1674		vm_page_t m;
1675		int desiredpages;
1676
1677		newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
1678		desiredpages = (size == 0) ? 0 :
1679			num_pages((bp->b_offset & PAGE_MASK) + newbsize);
1680
1681#if !defined(NO_B_MALLOC)
1682		if (bp->b_flags & B_MALLOC)
1683			panic("allocbuf: VMIO buffer can't be malloced");
1684#endif
1685
1686		if (newbsize < bp->b_bufsize) {
1687			if (desiredpages < bp->b_npages) {
1688				for (i = desiredpages; i < bp->b_npages; i++) {
1689					/*
1690					 * the page is not freed here -- it
1691					 * is the responsibility of vnode_pager_setsize
1692					 */
1693					m = bp->b_pages[i];
1694#if defined(DIAGNOSTIC)
1695					if (m == bogus_page)
1696						panic("allocbuf: bogus page found");
1697#endif
1698					vm_page_sleep(m, "biodep", &m->busy);
1699
1700					bp->b_pages[i] = NULL;
1701					vm_page_unwire(m, 0);
1702				}
1703				pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) +
1704				    (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
1705				bp->b_npages = desiredpages;
1706			}
1707		} else if (newbsize > bp->b_bufsize) {
1708			vm_object_t obj;
1709			vm_offset_t tinc, toff;
1710			vm_ooffset_t off;
1711			vm_pindex_t objoff;
1712			int pageindex, curbpnpages;
1713			struct vnode *vp;
1714			int bsize;
1715			int orig_validoff = bp->b_validoff;
1716			int orig_validend = bp->b_validend;
1717
1718			vp = bp->b_vp;
1719
1720			if (vp->v_type == VBLK)
1721				bsize = DEV_BSIZE;
1722			else
1723				bsize = vp->v_mount->mnt_stat.f_iosize;
1724
1725			if (bp->b_npages < desiredpages) {
1726				obj = vp->v_object;
1727				tinc = PAGE_SIZE;
1728
1729				off = bp->b_offset;
1730#ifdef DIAGNOSTIC
1731				if (bp->b_offset == NOOFFSET)
1732					panic("allocbuf: no buffer offset");
1733#endif
1734
1735				curbpnpages = bp->b_npages;
1736		doretry:
1737				bp->b_validoff = orig_validoff;
1738				bp->b_validend = orig_validend;
1739				bp->b_flags |= B_CACHE;
1740				for (toff = 0; toff < newbsize; toff += tinc) {
1741					objoff = OFF_TO_IDX(off + toff);
1742					pageindex = objoff - OFF_TO_IDX(off);
1743					tinc = PAGE_SIZE - ((off + toff) & PAGE_MASK);
1744					if (pageindex < curbpnpages) {
1745
1746						m = bp->b_pages[pageindex];
1747#ifdef VFS_BIO_DIAG
1748						if (m->pindex != objoff)
1749							panic("allocbuf: page changed offset?!!!?");
1750#endif
1751						if (tinc > (newbsize - toff))
1752							tinc = newbsize - toff;
1753						if (bp->b_flags & B_CACHE)
1754							vfs_buf_set_valid(bp, off, toff, tinc, m);
1755						continue;
1756					}
1757					m = vm_page_lookup(obj, objoff);
1758					if (!m) {
1759						m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
1760						if (!m) {
1761							VM_WAIT;
1762							vm_pageout_deficit += (desiredpages - curbpnpages);
1763							goto doretry;
1764						}
1765
1766						vm_page_wire(m);
1767						vm_page_flag_clear(m, PG_BUSY);
1768						bp->b_flags &= ~B_CACHE;
1769
1770					} else if (m->flags & PG_BUSY) {
1771						s = splvm();
1772						if (m->flags & PG_BUSY) {
1773							vm_page_flag_set(m, PG_WANTED);
1774							tsleep(m, PVM, "pgtblk", 0);
1775						}
1776						splx(s);
1777						goto doretry;
1778					} else {
1779						if ((curproc != pageproc) &&
1780							((m->queue - m->pc) == PQ_CACHE) &&
1781						    ((cnt.v_free_count + cnt.v_cache_count) <
1782								(cnt.v_free_min + cnt.v_cache_min))) {
1783							pagedaemon_wakeup();
1784						}
1785						if (tinc > (newbsize - toff))
1786							tinc = newbsize - toff;
1787						if (bp->b_flags & B_CACHE)
1788							vfs_buf_set_valid(bp, off, toff, tinc, m);
1789						vm_page_flag_clear(m, PG_ZERO);
1790						vm_page_wire(m);
1791					}
1792					bp->b_pages[pageindex] = m;
1793					curbpnpages = pageindex + 1;
1794				}
1795				if (vp->v_tag == VT_NFS &&
1796				    vp->v_type != VBLK) {
1797					if (bp->b_dirtyend > 0) {
1798						bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
1799						bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
1800					}
1801					if (bp->b_validend == 0)
1802						bp->b_flags &= ~B_CACHE;
1803				}
1804				bp->b_data = (caddr_t) trunc_page((vm_offset_t)bp->b_data);
1805				bp->b_npages = curbpnpages;
1806				pmap_qenter((vm_offset_t) bp->b_data,
1807					bp->b_pages, bp->b_npages);
1808				((vm_offset_t) bp->b_data) |= off & PAGE_MASK;
1809			}
1810		}
1811	}
1812	if (bp->b_flags & B_VMIO)
1813		vmiospace += (newbsize - bp->b_bufsize);
1814	bufspace += (newbsize - bp->b_bufsize);
1815	bp->b_bufsize = newbsize;
1816	bp->b_bcount = size;
1817	return 1;
1818}
1819
1820/*
1821 * Wait for buffer I/O completion, returning error status.
1822 */
1823int
1824biowait(register struct buf * bp)
1825{
1826	int s;
1827
1828	s = splbio();
1829	while ((bp->b_flags & B_DONE) == 0)
1830#if defined(NO_SCHEDULE_MODS)
1831		tsleep(bp, PRIBIO, "biowait", 0);
1832#else
1833		if (bp->b_flags & B_READ)
1834			tsleep(bp, PRIBIO, "biord", 0);
1835		else
1836			tsleep(bp, PRIBIO, "biowr", 0);
1837#endif
1838	splx(s);
1839	if (bp->b_flags & B_EINTR) {
1840		bp->b_flags &= ~B_EINTR;
1841		return (EINTR);
1842	}
1843	if (bp->b_flags & B_ERROR) {
1844		return (bp->b_error ? bp->b_error : EIO);
1845	} else {
1846		return (0);
1847	}
1848}
1849
1850/*
1851 * Finish I/O on a buffer, calling an optional function.
1852 * This is usually called from interrupt level, so process blocking
1853 * is not *a good idea*.
1854 */
1855void
1856biodone(register struct buf * bp)
1857{
1858	int s;
1859
1860	s = splbio();
1861
1862#if !defined(MAX_PERF)
1863	if (!(bp->b_flags & B_BUSY))
1864		panic("biodone: buffer not busy");
1865#endif
1866
1867	if (bp->b_flags & B_DONE) {
1868		splx(s);
1869#if !defined(MAX_PERF)
1870		printf("biodone: buffer already done\n");
1871#endif
1872		return;
1873	}
1874	bp->b_flags |= B_DONE;
1875
1876	if (bp->b_flags & B_FREEBUF) {
1877		brelse(bp);
1878		splx(s);
1879		return;
1880	}
1881
1882	if ((bp->b_flags & B_READ) == 0) {
1883		vwakeup(bp);
1884	}
1885
1886	/* call optional completion function if requested */
1887	if (bp->b_flags & B_CALL) {
1888		bp->b_flags &= ~B_CALL;
1889		(*bp->b_iodone) (bp);
1890		splx(s);
1891		return;
1892	}
1893	if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_complete)
1894		(*bioops.io_complete)(bp);
1895
1896	if (bp->b_flags & B_VMIO) {
1897		int i, resid;
1898		vm_ooffset_t foff;
1899		vm_page_t m;
1900		vm_object_t obj;
1901		int iosize;
1902		struct vnode *vp = bp->b_vp;
1903
1904		obj = vp->v_object;
1905
1906#if defined(VFS_BIO_DEBUG)
1907		if (vp->v_usecount == 0) {
1908			panic("biodone: zero vnode ref count");
1909		}
1910
1911		if (vp->v_object == NULL) {
1912			panic("biodone: missing VM object");
1913		}
1914
1915		if ((vp->v_flag & VOBJBUF) == 0) {
1916			panic("biodone: vnode is not setup for merged cache");
1917		}
1918#endif
1919
1920		foff = bp->b_offset;
1921#ifdef DIAGNOSTIC
1922		if (bp->b_offset == NOOFFSET)
1923			panic("biodone: no buffer offset");
1924#endif
1925
1926#if !defined(MAX_PERF)
1927		if (!obj) {
1928			panic("biodone: no object");
1929		}
1930#endif
1931#if defined(VFS_BIO_DEBUG)
1932		if (obj->paging_in_progress < bp->b_npages) {
1933			printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
1934			    obj->paging_in_progress, bp->b_npages);
1935		}
1936#endif
1937		iosize = bp->b_bufsize;
1938		for (i = 0; i < bp->b_npages; i++) {
1939			int bogusflag = 0;
1940			m = bp->b_pages[i];
1941			if (m == bogus_page) {
1942				bogusflag = 1;
1943				m = vm_page_lookup(obj, OFF_TO_IDX(foff));
1944				if (!m) {
1945#if defined(VFS_BIO_DEBUG)
1946					printf("biodone: page disappeared\n");
1947#endif
1948					vm_object_pip_subtract(obj, 1);
1949					continue;
1950				}
1951				bp->b_pages[i] = m;
1952				pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
1953			}
1954#if defined(VFS_BIO_DEBUG)
1955			if (OFF_TO_IDX(foff) != m->pindex) {
1956				printf("biodone: foff(%d)/m->pindex(%d) mismatch\n", foff, m->pindex);
1957			}
1958#endif
1959			resid = IDX_TO_OFF(m->pindex + 1) - foff;
1960			if (resid > iosize)
1961				resid = iosize;
1962
1963			/*
1964			 * In the write case, the valid and clean bits are
1965			 * already changed correctly, so we only need to do this
1966			 * here in the read case.
1967			 */
1968			if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
1969				vfs_page_set_valid(bp, foff, i, m);
1970			}
1971			vm_page_flag_clear(m, PG_ZERO);
1972
1973			/*
1974			 * when debugging new filesystems or buffer I/O methods, this
1975			 * is the most common error that pops up.  if you see this, you
1976			 * have not set the page busy flag correctly!!!
1977			 */
1978			if (m->busy == 0) {
1979#if !defined(MAX_PERF)
1980				printf("biodone: page busy < 0, "
1981				    "pindex: %d, foff: 0x(%x,%x), "
1982				    "resid: %d, index: %d\n",
1983				    (int) m->pindex, (int)(foff >> 32),
1984						(int) foff & 0xffffffff, resid, i);
1985#endif
1986				if (vp->v_type != VBLK)
1987#if !defined(MAX_PERF)
1988					printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n",
1989					    bp->b_vp->v_mount->mnt_stat.f_iosize,
1990					    (int) bp->b_lblkno,
1991					    bp->b_flags, bp->b_npages);
1992				else
1993					printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n",
1994					    (int) bp->b_lblkno,
1995					    bp->b_flags, bp->b_npages);
1996				printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n",
1997				    m->valid, m->dirty, m->wire_count);
1998#endif
1999				panic("biodone: page busy < 0\n");
2000			}
2001			vm_page_io_finish(m);
2002			vm_object_pip_subtract(obj, 1);
2003			foff += resid;
2004			iosize -= resid;
2005		}
2006		if (obj &&
2007			(obj->paging_in_progress == 0) &&
2008		    (obj->flags & OBJ_PIPWNT)) {
2009			vm_object_clear_flag(obj, OBJ_PIPWNT);
2010			wakeup(obj);
2011		}
2012	}
2013	/*
2014	 * For asynchronous completions, release the buffer now. The brelse
2015	 * checks for B_WANTED and will do the wakeup there if necessary - so
2016	 * no need to do a wakeup here in the async case.
2017	 */
2018
2019	if (bp->b_flags & B_ASYNC) {
2020		if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0)
2021			brelse(bp);
2022		else
2023			bqrelse(bp);
2024	} else {
2025		bp->b_flags &= ~B_WANTED;
2026		wakeup(bp);
2027	}
2028	splx(s);
2029}
2030
2031#if 0	/* not with Kirk's code */
2032static int vfs_update_interval = 30;
2033
2034static void
2035vfs_update()
2036{
2037	while (1) {
2038		tsleep(&vfs_update_wakeup, PUSER, "update",
2039		    hz * vfs_update_interval);
2040		vfs_update_wakeup = 0;
2041		sync(curproc, NULL);
2042	}
2043}
2044
2045static int
2046sysctl_kern_updateinterval SYSCTL_HANDLER_ARGS
2047{
2048	int error = sysctl_handle_int(oidp,
2049		oidp->oid_arg1, oidp->oid_arg2, req);
2050	if (!error)
2051		wakeup(&vfs_update_wakeup);
2052	return error;
2053}
2054
2055SYSCTL_PROC(_kern, KERN_UPDATEINTERVAL, update, CTLTYPE_INT|CTLFLAG_RW,
2056	&vfs_update_interval, 0, sysctl_kern_updateinterval, "I", "");
2057
2058#endif
2059
2060
2061/*
2062 * This routine is called in lieu of biodone() in the case of
2063 * incomplete I/O.  It keeps the busy status of the pages
2064 * consistent.  (A usage sketch follows this function.)
2065 */
2066void
2067vfs_unbusy_pages(struct buf * bp)
2068{
2069	int i;
2070
2071	if (bp->b_flags & B_VMIO) {
2072		struct vnode *vp = bp->b_vp;
2073		vm_object_t obj = vp->v_object;
2074
2075		for (i = 0; i < bp->b_npages; i++) {
2076			vm_page_t m = bp->b_pages[i];
2077
2078			if (m == bogus_page) {
2079				m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
2080#if !defined(MAX_PERF)
2081				if (!m) {
2082					panic("vfs_unbusy_pages: page missing\n");
2083				}
2084#endif
2085				bp->b_pages[i] = m;
2086				pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
2087			}
2088			vm_object_pip_subtract(obj, 1);
2089			vm_page_flag_clear(m, PG_ZERO);
2090			vm_page_io_finish(m);
2091		}
2092		if (obj->paging_in_progress == 0 &&
2093		    (obj->flags & OBJ_PIPWNT)) {
2094			vm_object_clear_flag(obj, OBJ_PIPWNT);
2095			wakeup(obj);
2096		}
2097	}
2098}
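
/*
 * Editor's note: hypothetical sketch (not code from this file) of how
 * vfs_busy_pages() and vfs_unbusy_pages() pair up.  If the transfer
 * cannot be handed off after the pages have been busied, the busy and
 * paging_in_progress accounting is rolled back here instead of going
 * through biodone().  example_try_start() is a made-up driver hand-off.
 */
#if 0
static void
example_issue_vmio_read(struct buf *bp)
{
	bp->b_flags |= B_READ;
	vfs_busy_pages(bp, 0);
	if (example_try_start(bp) != 0) {
		/* undo what vfs_busy_pages() did */
		vfs_unbusy_pages(bp);
		bp->b_flags |= B_ERROR | B_INVAL;
		brelse(bp);
	}
}
#endif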
2099
2100/*
2101 * Set NFS' b_validoff and b_validend fields from the valid bits
2102 * of a page.  If the consumer is not NFS, and the page is not
2103 * valid for the entire range, clear the B_CACHE flag to force
2104 * the consumer to re-read the page.
2105 */
2106static void
2107vfs_buf_set_valid(struct buf *bp,
2108		  vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
2109		  vm_page_t m)
2110{
2111	if (bp->b_vp->v_tag == VT_NFS && bp->b_vp->v_type != VBLK) {
2112		vm_offset_t svalid, evalid;
2113		int validbits = m->valid >> (((foff+off)&PAGE_MASK)/DEV_BSIZE);
2114
2115		/*
2116		 * This only bothers with the first valid range in the
2117		 * page.
2118		 */
2119		svalid = off;
2120		while (validbits && !(validbits & 1)) {
2121			svalid += DEV_BSIZE;
2122			validbits >>= 1;
2123		}
2124		evalid = svalid;
2125		while (validbits & 1) {
2126			evalid += DEV_BSIZE;
2127			validbits >>= 1;
2128		}
2129		evalid = min(evalid, off + size);
2130		/*
2131		 * Make sure this range is contiguous with the range
2132		 * built up from previous pages.  If not, then we will
2133		 * just use the range from the previous pages.
2134		 */
2135		if (svalid == bp->b_validend) {
2136			bp->b_validoff = min(bp->b_validoff, svalid);
2137			bp->b_validend = max(bp->b_validend, evalid);
2138		}
2139	} else if (!vm_page_is_valid(m,
2140				     (vm_offset_t) ((foff + off) & PAGE_MASK),
2141				     size)) {
2142		bp->b_flags &= ~B_CACHE;
2143	}
2144}
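
/*
 * Editor's note: worked example of the arithmetic above, assuming
 * PAGE_SIZE 4096 and DEV_BSIZE 512, so m->valid carries one bit per
 * 512-byte chunk (eight bits per page).  With the page mapping the
 * start of the buffer (off == 0, foff page-aligned) and m->valid ==
 * 0x0f:
 *
 *	validbits = 0x0f	(no initial shift)
 *	svalid    = 0		(bit 0 is set, so the skip loop does nothing)
 *	evalid    = 4 * DEV_BSIZE = 2048, then min(2048, off + size)
 *
 * The range [svalid, evalid) is merged into b_validoff/b_validend only
 * if it abuts the range built up from the previous pages, i.e. when
 * svalid == bp->b_validend.
 */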
2145
2146/*
2147 * Set the valid bits in a page, taking care of the b_validoff,
2148 * b_validend fields which NFS uses to optimise small reads.  Off is
2149 * the offset within the file and pageno is the page index within the buf.
2150 */
2151static void
2152vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
2153{
2154	struct vnode *vp = bp->b_vp;
2155	vm_ooffset_t soff, eoff;
2156
2157	soff = off;
2158	eoff = (off + PAGE_SIZE) & ~PAGE_MASK;
2159	if (eoff > bp->b_offset + bp->b_bufsize)
2160		eoff = bp->b_offset + bp->b_bufsize;
2161	if (vp->v_tag == VT_NFS && vp->v_type != VBLK) {
2162		vm_ooffset_t sv, ev;
2163		vm_page_set_invalid(m,
2164		    (vm_offset_t) (soff & PAGE_MASK),
2165		    (vm_offset_t) (eoff - soff));
2166		sv = (bp->b_offset + bp->b_validoff + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
2167		ev = (bp->b_offset + bp->b_validend) & ~(DEV_BSIZE - 1);
2168		soff = qmax(sv, soff);
2169		eoff = qmin(ev, eoff);
2170	}
2171	if (eoff > soff)
2172		vm_page_set_validclean(m,
2173	       (vm_offset_t) (soff & PAGE_MASK),
2174	       (vm_offset_t) (eoff - soff));
2175}
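
/*
 * Editor's note: worked example of the clipping above, assuming
 * DEV_BSIZE 512.  For the NFS case with b_offset == 8192,
 * b_validoff == 100 and b_validend == 4000:
 *
 *	sv = (8192 + 100 + 511) & ~511 = 8704	(rounded up)
 *	ev = (8192 + 4000) & ~511      = 11776	(rounded down)
 *
 * so only the file range [max(sv, soff), min(ev, eoff)) of this page is
 * marked valid and clean; partially valid DEV_BSIZE chunks stay invalid
 * and will be fetched again on a later read.
 */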
2176
2177/*
2178 * This routine is called before a device strategy routine.
2179 * It is used to tell the VM system that paging I/O is in
2180 * progress and to treat the pages associated with the buffer
2181 * almost as if they were PG_BUSY.  The object's paging_in_progress
2182 * count is also adjusted to make sure that the object doesn't become
2183 * inconsistent.  (A usage sketch follows this function.)
2184 */
2185void
2186vfs_busy_pages(struct buf * bp, int clear_modify)
2187{
2188	int i, bogus;
2189
2190	if (bp->b_flags & B_VMIO) {
2191		struct vnode *vp = bp->b_vp;
2192		vm_object_t obj = vp->v_object;
2193		vm_ooffset_t foff;
2194
2195		foff = bp->b_offset;
2196#ifdef DIAGNOSTIC
2197		if (bp->b_offset == NOOFFSET)
2198			panic("vfs_busy_pages: no buffer offset");
2199#endif
2200
2201		vfs_setdirty(bp);
2202
2203retry:
2204		for (i = 0; i < bp->b_npages; i++) {
2205			vm_page_t m = bp->b_pages[i];
2206			if (vm_page_sleep(m, "vbpage", NULL))
2207				goto retry;
2208		}
2209
2210		bogus = 0;
2211		for (i = 0; i < bp->b_npages; i++) {
2212			vm_page_t m = bp->b_pages[i];
2213
2214			vm_page_flag_clear(m, PG_ZERO);
2215			if ((bp->b_flags & B_CLUSTER) == 0) {
2216				vm_object_pip_add(obj, 1);
2217				vm_page_io_start(m);
2218			}
2219
2220			vm_page_protect(m, VM_PROT_NONE);
2221			if (clear_modify)
2222				vfs_page_set_valid(bp, foff, i, m);
2223			else if (m->valid == VM_PAGE_BITS_ALL &&
2224				(bp->b_flags & B_CACHE) == 0) {
2225				bp->b_pages[i] = bogus_page;
2226				bogus++;
2227			}
2228			foff = (foff + PAGE_SIZE) & ~PAGE_MASK;
2229		}
2230		if (bogus)
2231			pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
2232	}
2233}
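
/*
 * Editor's note: hypothetical sketch (not code from this file) of the
 * usual calling pattern.  A VMIO consumer busies the pages just before
 * handing the buffer to the driver; clear_modify is non-zero for
 * writes, where the pages can be marked valid/clean up front, and zero
 * for reads, where fully valid pages are swapped for bogus_page so the
 * device cannot overwrite good data.
 */
#if 0
static void
example_start_io(struct buf *bp, int iswrite)
{
	if (iswrite) {
		bp->b_flags &= ~B_READ;
		vfs_busy_pages(bp, 1);
	} else {
		bp->b_flags |= B_READ;
		vfs_busy_pages(bp, 0);
	}
	/* hand bp to the driver's strategy routine here */
}
#endif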
2234
2235/*
2236 * Tell the VM system that the pages associated with this buffer
2237 * are clean.  This is used for delayed writes, where the data will
2238 * go to disk eventually without additional VM intervention.
2239 */
2240void
2241vfs_clean_pages(struct buf * bp)
2242{
2243	int i;
2244
2245	if (bp->b_flags & B_VMIO) {
2246		vm_ooffset_t foff;
2247		foff = bp->b_offset;
2248
2249#ifdef DIAGNOSTIC
2250		if (bp->b_offset == NOOFFSET)
2251			panic("vfs_clean_pages: no buffer offset");
2252#endif
2253
2254		for (i = 0; i < bp->b_npages; i++) {
2255			vm_page_t m = bp->b_pages[i];
2256			vfs_page_set_valid(bp, foff, i, m);
2257			foff = (foff + PAGE_SIZE) & ~PAGE_MASK;
2258		}
2259	}
2260}
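
/*
 * Editor's note: hypothetical sketch (not this file's bdwrite()) of a
 * delayed-write style caller.  The dirty data stays in the buffer, but
 * the pages no longer look dirty to the VM system, so the pageout
 * daemon will not try to write them itself.
 */
#if 0
static void
example_delayed_write(struct buf *bp)
{
	bp->b_flags |= B_DELWRI;	/* data goes to disk later */
	vfs_clean_pages(bp);		/* pages look clean to the VM system */
	bqrelse(bp);			/* keep the buffer cached */
}
#endif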
2261
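/*
 * Editor's note (added description; this function had no header comment):
 * zero any DEV_BSIZE chunks of a VMIO buffer that are not already valid
 * or known to be pre-zeroed (PG_ZERO), mark them valid, and clear
 * b_resid.  Malloc-backed and non-VMIO buffers are simply cleared with
 * clrbuf().
 */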
2262void
2263vfs_bio_clrbuf(struct buf *bp) {
2264	int i, size, mask = 0;
2265	caddr_t sa, ea;
2266	if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) {
2267		if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
2268		    (bp->b_offset & PAGE_MASK) == 0) {
2269			mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
2270			if (((bp->b_pages[0]->flags & PG_ZERO) == 0) &&
2271			    ((bp->b_pages[0]->valid & mask) != mask)) {
2272				bzero(bp->b_data, bp->b_bufsize);
2273			}
2274			bp->b_pages[0]->valid |= mask;
2275			bp->b_resid = 0;
2276			return;
2277		}
2278		ea = sa = bp->b_data;
2279		for (i = 0; i < bp->b_npages; i++, sa = ea) {
2280			int j = ((u_long)sa & PAGE_MASK) / DEV_BSIZE;
2281			ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE);
2282			ea = (caddr_t)ulmin((u_long)ea,
2283				(u_long)bp->b_data + bp->b_bufsize);
2284			mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
2285			if ((bp->b_pages[i]->valid & mask) == mask)
2286				continue;
2287			if ((bp->b_pages[i]->valid & mask) == 0) {
2288				if ((bp->b_pages[i]->flags & PG_ZERO) == 0) {
2289					bzero(sa, ea - sa);
2290				}
2291			} else {
2292				for (; sa < ea; sa += DEV_BSIZE, j++) {
2293					if (((bp->b_pages[i]->flags & PG_ZERO) == 0) &&
2294						(bp->b_pages[i]->valid & (1<<j)) == 0)
2295						bzero(sa, DEV_BSIZE);
2296				}
2297			}
2298			bp->b_pages[i]->valid |= mask;
2299			vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
2300		}
2301		bp->b_resid = 0;
2302	} else {
2303		clrbuf(bp);
2304	}
2305}
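
/*
 * Editor's note: worked example of the mask arithmetic above, assuming
 * PAGE_SIZE 4096 and DEV_BSIZE 512.  For the single-page special case
 * with a 2048-byte buffer starting on a page boundary:
 *
 *	mask = (1 << (2048 / 512)) - 1 = 0x0f
 *
 * so only the first four 512-byte chunks of the page are zeroed (unless
 * the page is PG_ZERO or those chunks are already valid) and then
 * marked valid; the rest of the page is left untouched.
 */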
2306
2307/*
2308 * vm_hold_load_pages() and vm_hold_free_pages() get pages into and
2309 * out of a buffer's address space.  The pages are anonymous and are
2310 * not associated with a file object; a usage sketch follows both functions.
2311 */
2312void
2313vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
2314{
2315	vm_offset_t pg;
2316	vm_page_t p;
2317	int index;
2318
2319	to = round_page(to);
2320	from = round_page(from);
2321	index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
2322
2323	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
2324
2325tryagain:
2326
2327		p = vm_page_alloc(kernel_object,
2328			((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
2329		    VM_ALLOC_NORMAL);
2330		if (!p) {
2331			vm_pageout_deficit += (to - from) >> PAGE_SHIFT;
2332			VM_WAIT;
2333			goto tryagain;
2334		}
2335		vm_page_wire(p);
2336		p->valid = VM_PAGE_BITS_ALL;
2337		vm_page_flag_clear(p, PG_ZERO);
2338		pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
2339		bp->b_pages[index] = p;
2340		vm_page_wakeup(p);
2341	}
2342	bp->b_npages = index;
2343}
2344
2345void
2346vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
2347{
2348	vm_offset_t pg;
2349	vm_page_t p;
2350	int index, newnpages;
2351
2352	from = round_page(from);
2353	to = round_page(to);
2354	newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
2355
2356	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
2357		p = bp->b_pages[index];
2358		if (p && (index < bp->b_npages)) {
2359#if !defined(MAX_PERF)
2360			if (p->busy) {
2361				printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n",
2362					bp->b_blkno, bp->b_lblkno);
2363			}
2364#endif
2365			bp->b_pages[index] = NULL;
2366			pmap_kremove(pg);
2367			vm_page_busy(p);
2368			vm_page_unwire(p, 0);
2369			vm_page_free(p);
2370		}
2371	}
2372	bp->b_npages = newnpages;
2373}
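
/*
 * Editor's note: hypothetical sketch (not code from this file) of how
 * the two routines above are used together, in the style of allocbuf()
 * resizing a non-VMIO buffer: grow or shrink the wired anonymous pages
 * backing bp->b_data as b_bufsize changes.
 */
#if 0
static void
example_resize_anon_buf(struct buf *bp, int newsize)
{
	if (newsize > bp->b_bufsize)
		vm_hold_load_pages(bp,
		    (vm_offset_t)bp->b_data + bp->b_bufsize,
		    (vm_offset_t)bp->b_data + newsize);
	else if (newsize < bp->b_bufsize)
		vm_hold_free_pages(bp,
		    (vm_offset_t)bp->b_data + newsize,
		    (vm_offset_t)bp->b_data + bp->b_bufsize);
	bp->b_bufsize = newsize;
}
#endif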
2374
2375
2376#include "opt_ddb.h"
2377#ifdef DDB
2378#include <ddb/ddb.h>
2379
2380DB_SHOW_COMMAND(buffer, db_show_buffer)
2381{
2382	/* get args */
2383	struct buf *bp = (struct buf *)addr;
2384
2385	if (!have_addr) {
2386		db_printf("usage: show buffer <addr>\n");
2387		return;
2388	}
2389
2390	db_printf("b_proc = %p,\nb_flags = 0x%b\n", (void *)bp->b_proc,
2391		  (u_int)bp->b_flags, PRINT_BUF_FLAGS);
2392	db_printf("b_error = %d, b_bufsize = %ld, b_bcount = %ld, "
2393		  "b_resid = %ld\nb_dev = 0x%x, b_data = %p, "
2394		  "b_blkno = %d, b_pblkno = %d\n",
2395		  bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
2396		  bp->b_dev, bp->b_data, bp->b_blkno, bp->b_pblkno);
2397	if (bp->b_npages) {
2398		int i;
2399		db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
2400		for (i = 0; i < bp->b_npages; i++) {
2401			vm_page_t m;
2402			m = bp->b_pages[i];
2403			db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object,
2404			    (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m));
2405			if ((i + 1) < bp->b_npages)
2406				db_printf(",");
2407		}
2408		db_printf("\n");
2409	}
2410}
2411#endif /* DDB */
2412