vfs_bio.c revision 28465
/*
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. This work was done expressly for inclusion into FreeBSD.  Other use
 *    is allowed if this notation is included.
 * 5. Modifications may be freely made to this file if the above conditions
 *    are met.
 *
 * $Id: vfs_bio.c,v 1.122 1997/08/09 10:13:12 dyson Exp $
 */

/*
 * this file contains a new buffer I/O scheme implementing a coherent
 * VM object and buffer cache scheme.  Pains have been taken to make
 * sure that the performance degradation associated with schemes such
 * as this is not realized.
 *
 * Author:  John S. Dyson
 * Significant help during the development and debugging phases
 * has been provided by David Greenman, also of the FreeBSD core team.
 */

#include "opt_bounce.h"

#define VMIO
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_prot.h>
#include <vm/vm_kern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>

#include <miscfs/specfs/specdev.h>

static void vfs_update __P((void));
static struct proc *updateproc;
static struct kproc_desc up_kp = {
	"update",
	vfs_update,
	&updateproc
};
SYSINIT_KT(update, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)

struct buf *buf;		/* buffer header pool */
struct swqueue bswlist;

int count_lock_queue __P((void));
static void vm_hold_free_pages(struct buf * bp, vm_offset_t from,
		vm_offset_t to);
static void vm_hold_load_pages(struct buf * bp, vm_offset_t from,
		vm_offset_t to);
static void vfs_buf_set_valid(struct buf *bp, vm_ooffset_t foff,
			      vm_offset_t off, vm_offset_t size,
			      vm_page_t m);
static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off,
			       int pageno, vm_page_t m);
static void vfs_clean_pages(struct buf * bp);
static void vfs_setdirty(struct buf *bp);
static void vfs_vmio_release(struct buf *bp);
static void flushdirtybuffers(int slpflag, int slptimeo);

int needsbuffer;

/*
 * Internal update daemon, process 3
 *	The variable vfs_update_wakeup allows for internal syncs.
 */
int vfs_update_wakeup;

/*
 * buffers base kva
 */

/*
 * bogus page -- for I/O to/from partially complete buffers
 * this is a temporary solution to the problem, but it is not
 * really that bad.  it would be better to split the buffer
 * for input in the case of buffers partially already in memory,
 * but the code is intricate enough already.
 */
vm_page_t bogus_page;
static vm_offset_t bogus_offset;

static int bufspace, maxbufspace, vmiospace, maxvmiobufspace,
	bufmallocspace, maxbufmallocspace;
int numdirtybuffers, lodirtybuffers, hidirtybuffers;
static int numfreebuffers, lofreebuffers, hifreebuffers;

SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD,
	&numdirtybuffers, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW,
	&lodirtybuffers, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW,
	&hidirtybuffers, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD,
	&numfreebuffers, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW,
	&lofreebuffers, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW,
	&hifreebuffers, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW,
	&maxbufspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD,
	&bufspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, maxvmiobufspace, CTLFLAG_RW,
	&maxvmiobufspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, vmiospace, CTLFLAG_RD,
	&vmiospace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW,
	&maxbufmallocspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD,
	&bufmallocspace, 0, "");

static struct bufhashhdr bufhashtbl[BUFHSZ], invalhash;
static struct bqueues bufqueues[BUFFER_QUEUES];

extern int vm_swap_size;

#define BUF_MAXUSE 24

#define VFS_BIO_NEED_ANY 1
#define VFS_BIO_NEED_LOWLIMIT 2
#define VFS_BIO_NEED_FREE 4

/*
 * Initialize buffer headers and related structures.
 */
void
bufinit()
{
	struct buf *bp;
	int i;

	TAILQ_INIT(&bswlist);
	LIST_INIT(&invalhash);

	/* first, make a null hash table */
	for (i = 0; i < BUFHSZ; i++)
		LIST_INIT(&bufhashtbl[i]);

	/* next, make a null set of free lists */
	for (i = 0; i < BUFFER_QUEUES; i++)
		TAILQ_INIT(&bufqueues[i]);

	/* finally, initialize each buffer header and stick on empty q */
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bzero(bp, sizeof *bp);
		bp->b_flags = B_INVAL;	/* we're just an empty header */
		bp->b_dev = NODEV;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_qindex = QUEUE_EMPTY;
		bp->b_vnbufs.le_next = NOLIST;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	}
/*
 * maxbufspace is currently calculated to support all filesystem blocks
 * to be 8K.  If you happen to use a 16K filesystem, the size of the buffer
 * cache is still the same as it would be for 8K filesystems.  This
 * keeps the size of the buffer cache "in check" for big block filesystems.
 */
	maxbufspace = (nbuf + 8) * DFLTBSIZE;
/*
 * reserve 1/3 of the buffers for metadata (VDIR) which might not be VMIO'ed
 */
	maxvmiobufspace = 2 * maxbufspace / 3;
/*
 * Limit the amount of malloc memory since it is wired permanently into
 * the kernel space.  Even though this is accounted for in the buffer
 * allocation, we don't want the malloced region to grow uncontrolled.
 * The malloc scheme improves memory utilization significantly on average
 * (small) directories.
 */
	maxbufmallocspace = maxbufspace / 20;

/*
 * Remove the probability of deadlock conditions by limiting the
 * number of dirty buffers.
 */
	hidirtybuffers = nbuf / 6 + 20;
	lodirtybuffers = nbuf / 12 + 10;
	numdirtybuffers = 0;
	lofreebuffers = nbuf / 18 + 5;
	hifreebuffers = 2 * lofreebuffers;
	numfreebuffers = nbuf;

	bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
	bogus_page = vm_page_alloc(kernel_object,
			((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
			VM_ALLOC_NORMAL);

}
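
/*
 * Worked example of the sizing policy above (a sketch only; the
 * numbers assume nbuf = 1024 and DFLTBSIZE = 8192, which are common
 * but not guaranteed values):
 *
 *	maxbufspace       = (1024 + 8) * 8192	= 8454144 bytes (~8MB)
 *	maxvmiobufspace   = 2 * 8454144 / 3	= 5636096 bytes
 *	maxbufmallocspace = 8454144 / 20	= 422707 bytes
 *	hidirtybuffers    = 1024 / 6 + 20	= 190
 *	lodirtybuffers    = 1024 / 12 + 10	= 95
 *	lofreebuffers     = 1024 / 18 + 5	= 61
 *	hifreebuffers     = 2 * 61		= 122
 */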

/*
 * Free the kva allocation for a buffer
 * Must be called only at splbio or higher,
 *  as this is the only locking for buffer_map.
 */
static void
bfreekva(struct buf * bp)
{
	if (bp->b_kvasize == 0)
		return;

	vm_map_delete(buffer_map,
		(vm_offset_t) bp->b_kvabase,
		(vm_offset_t) bp->b_kvabase + bp->b_kvasize);

	bp->b_kvasize = 0;

}

/*
 * remove the buffer from the appropriate free list
 */
void
bremfree(struct buf * bp)
{
	int s = splbio();

	if (bp->b_qindex != QUEUE_NONE) {
		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
		bp->b_qindex = QUEUE_NONE;
	} else {
#if !defined(MAX_PERF)
		panic("bremfree: removing a buffer when not on a queue");
#endif
	}
	if ((bp->b_flags & B_INVAL) ||
		(bp->b_flags & (B_DELWRI|B_LOCKED)) == 0)
		--numfreebuffers;
	splx(s);
}

/*
 * Get a buffer with the specified data.  Look in the cache first.
 */
int
bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
    struct buf ** bpp)
{
	struct buf *bp;

	bp = getblk(vp, blkno, size, 0, 0);
	*bpp = bp;

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc != NULL)
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
		if (bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		vfs_busy_pages(bp, 0);
		VOP_STRATEGY(bp);
		return (biowait(bp));
	}
	return (0);
}

/*
 * Operates like bread, but also starts asynchronous I/O on
 * read-ahead blocks.
 */
int
breadn(struct vnode * vp, daddr_t blkno, int size,
    daddr_t * rablkno, int *rabsize,
    int cnt, struct ucred * cred, struct buf ** bpp)
{
	struct buf *bp, *rabp;
	int i;
	int rv = 0, readwait = 0;

	*bpp = bp = getblk(vp, blkno, size, 0, 0);

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc != NULL)
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
		if (bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		vfs_busy_pages(bp, 0);
		VOP_STRATEGY(bp);
		++readwait;
	}
	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
		if (inmem(vp, *rablkno))
			continue;
		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);

		if ((rabp->b_flags & B_CACHE) == 0) {
			if (curproc != NULL)
				curproc->p_stats->p_ru.ru_inblock++;
			rabp->b_flags |= B_READ | B_ASYNC;
			rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
			if (rabp->b_rcred == NOCRED) {
				if (cred != NOCRED)
					crhold(cred);
				rabp->b_rcred = cred;
			}
			vfs_busy_pages(rabp, 0);
			VOP_STRATEGY(rabp);
		} else {
			brelse(rabp);
		}
	}

	if (readwait) {
		rv = biowait(bp);
	}
	return (rv);
}
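
/*
 * Usage sketch for the read entry points above (hypothetical caller;
 * the names and the bsize value are illustrative only, and error
 * handling is abbreviated):
 *
 *	struct buf *bp;
 *	int error;
 *
 *	error = bread(vp, lblkno, bsize, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	...examine bp->b_data...
 *	brelse(bp);		-- or bdwrite(bp) if it was modified
 *
 * breadn() is called the same way, with additional arrays of
 * read-ahead block numbers and sizes whose I/O is started B_ASYNC.
 */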

/*
 * Write, release buffer on completion.  (Done by iodone
 * if async.)
 */
int
bwrite(struct buf * bp)
{
	int oldflags = bp->b_flags;

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return (0);
	}
#if !defined(MAX_PERF)
	if (!(bp->b_flags & B_BUSY))
		panic("bwrite: buffer is not busy???");
#endif

	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	bp->b_flags |= B_WRITEINPROG;

	if ((oldflags & B_DELWRI) == B_DELWRI) {
		--numdirtybuffers;
		reassignbuf(bp, bp->b_vp);
	}

	bp->b_vp->v_numoutput++;
	vfs_busy_pages(bp, 1);
	if (curproc != NULL)
		curproc->p_stats->p_ru.ru_oublock++;
	VOP_STRATEGY(bp);

	/*
	 * Handle ordered writes here.
	 * If the write was originally flagged as ordered,
	 * then we check to see if it was converted to async.
	 * If it was converted to async, and is done now, then
	 * we release the buffer.  Otherwise we clear the
	 * ordered flag because it is not needed anymore.
	 *
	 * Note that biodone has been modified so that it does
	 * not release ordered buffers.  This allows us to have
	 * a chance to determine whether or not the driver
	 * has set the async flag in the strategy routine.  Otherwise
	 * if biodone was not modified, then the buffer may have been
	 * reused before we have had a chance to check the flag.
	 */

	if ((oldflags & B_ORDERED) == B_ORDERED) {
		int s;
		s = splbio();
		if (bp->b_flags & B_ASYNC) {
			if ((bp->b_flags & B_DONE)) {
				if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0)
					brelse(bp);
				else
					bqrelse(bp);
			}
			splx(s);
			return (0);
		} else {
			bp->b_flags &= ~B_ORDERED;
		}
		splx(s);
	}

	if ((oldflags & B_ASYNC) == 0) {
		int rtval = biowait(bp);

		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		}
		brelse(bp);
		return (rtval);
	}
	return (0);
}

int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return (bwrite(ap->a_bp));
}

void
vfs_bio_need_satisfy(void) {
	++numfreebuffers;
	if (!needsbuffer)
		return;
	if (numdirtybuffers < lodirtybuffers) {
		needsbuffer &= ~(VFS_BIO_NEED_ANY | VFS_BIO_NEED_LOWLIMIT);
	} else {
		needsbuffer &= ~VFS_BIO_NEED_ANY;
	}
	if (numfreebuffers >= hifreebuffers) {
		needsbuffer &= ~VFS_BIO_NEED_FREE;
	}
	wakeup(&needsbuffer);
}
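
/*
 * needsbuffer is a bitmask of the VFS_BIO_NEED_* flags defined above.
 * A sleeper sets a flag and waits on &needsbuffer, and the routine
 * above clears whichever conditions are now satisfied before waking
 * everyone.  Sketch of the waiting side (see waitfreebuffers() and
 * getnewbuf() for the real code):
 *
 *	needsbuffer |= VFS_BIO_NEED_FREE;
 *	tsleep(&needsbuffer, PRIBIO, "biofre", 0);
 */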

/*
 * Delayed write. (Buffer is marked dirty).
 */
void
bdwrite(struct buf * bp)
{

#if !defined(MAX_PERF)
	if ((bp->b_flags & B_BUSY) == 0) {
		panic("bdwrite: buffer is not busy");
	}
#endif

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return;
	}
	if (bp->b_flags & B_TAPE) {
		bawrite(bp);
		return;
	}
	bp->b_flags &= ~(B_READ|B_RELBUF);
	if ((bp->b_flags & B_DELWRI) == 0) {
		bp->b_flags |= B_DONE | B_DELWRI;
		reassignbuf(bp, bp->b_vp);
		++numdirtybuffers;
	}

	/*
	 * This bmap keeps the system from needing to do the bmap later,
	 * perhaps when the system is attempting to do a sync.  Since it
	 * is likely that the indirect block -- or whatever other data
	 * structure the filesystem needs -- is still in memory now, it
	 * is a good thing to do this.  Note also that if the pageout
	 * daemon is requesting a sync, there might not be enough memory
	 * to do the bmap then, so this is important to do now.
	 */
	if (bp->b_lblkno == bp->b_blkno) {
		VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
	}

	/*
	 * Set the *dirty* buffer range based upon the VM system dirty pages.
	 */
	vfs_setdirty(bp);

	/*
	 * We need to do this here to satisfy the vnode_pager and the
	 * pageout daemon, so that it thinks that the pages have been
	 * "cleaned".  Note that since the pages are in a delayed write
	 * buffer -- the VFS layer "will" see that the pages get written
	 * out on the next sync, or perhaps the cluster will be completed.
	 */
	vfs_clean_pages(bp);
	bqrelse(bp);

	if (numdirtybuffers >= hidirtybuffers)
		flushdirtybuffers(0, 0);

	return;
}

/*
 * Asynchronous write.
 * Start output on a buffer, but do not wait for it to complete.
 * The buffer is released when the output completes.
 */
void
bawrite(struct buf * bp)
{
	bp->b_flags |= B_ASYNC;
	(void) VOP_BWRITE(bp);
}

/*
 * Ordered write.
 * Start output on a buffer, but only wait for it to complete if the
 * output device cannot guarantee ordering in some other way.  Devices
 * that can perform asynchronous ordered writes will set the B_ASYNC
 * flag in their strategy routine.
 * The buffer is released when the output completes.
 */
int
bowrite(struct buf * bp)
{
	bp->b_flags |= B_ORDERED;
	return (VOP_BWRITE(bp));
}
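
/*
 * Informal summary of the write entry points above (descriptive only;
 * see each routine for the authoritative behavior):
 *
 *	bwrite(bp)	write now; waits in biowait() unless B_ASYNC
 *			was already set, then releases the buffer.
 *	bdwrite(bp)	mark B_DELWRI and requeue; the data is written
 *			later by the update daemon, a sync, or
 *			flushdirtybuffers().
 *	bawrite(bp)	set B_ASYNC and start the write; returns
 *			without waiting.
 *	bowrite(bp)	set B_ORDERED; waits only if the driver cannot
 *			guarantee ordering asynchronously.
 */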

/*
 * Release a buffer.
 */
void
brelse(struct buf * bp)
{
	int s;

	if (bp->b_flags & B_CLUSTER) {
		relpbuf(bp);
		return;
	}
	/* anyone need a "free" block? */
	s = splbio();

	/* anyone need this block? */
	if (bp->b_flags & B_WANTED) {
		bp->b_flags &= ~(B_WANTED | B_AGE);
		wakeup(bp);
	}

	if (bp->b_flags & B_LOCKED)
		bp->b_flags &= ~B_ERROR;

	if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
	    (bp->b_bufsize <= 0)) {
		bp->b_flags |= B_INVAL;
		if (bp->b_flags & B_DELWRI)
			--numdirtybuffers;
		bp->b_flags &= ~(B_DELWRI | B_CACHE);
		if (((bp->b_flags & B_VMIO) == 0) && bp->b_vp) {
			if (bp->b_bufsize)
				allocbuf(bp, 0);
			brelvp(bp);
		}
	}

	/*
	 * VMIO buffer rundown.  It is not very necessary to keep a VMIO buffer
	 * constituted, so the B_INVAL flag is used to *invalidate* the buffer,
	 * but the VM object is kept around.  The B_NOCACHE flag is used to
	 * invalidate the pages in the VM object.
	 *
	 * If the buffer is a partially filled NFS buffer, keep it
	 * since invalidating it now will lose information.  The valid
	 * flags in the vm_pages have only DEV_BSIZE resolution but
	 * the b_validoff, b_validend fields have byte resolution.
	 * This can avoid unnecessary re-reads of the buffer.
	 * XXX this seems to cause performance problems.
	 */
	if ((bp->b_flags & B_VMIO)
	    && !(bp->b_vp->v_tag == VT_NFS &&
		 (bp->b_flags & B_DELWRI) != 0)
#ifdef notdef
	    && (bp->b_vp->v_tag != VT_NFS
		|| (bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR))
		|| bp->b_validend == 0
		|| (bp->b_validoff == 0
		    && bp->b_validend == bp->b_bufsize))
#endif
	    ) {
		vm_ooffset_t foff;
		vm_object_t obj;
		int i, resid;
		vm_page_t m;
		struct vnode *vp;
		int iototal = bp->b_bufsize;

		vp = bp->b_vp;

#if !defined(MAX_PERF)
		if (!vp)
			panic("brelse: missing vp");
#endif

		if (bp->b_npages) {
			vm_pindex_t poff;
			obj = (vm_object_t) vp->v_object;
			if (vp->v_type == VBLK)
				foff = ((vm_ooffset_t) bp->b_lblkno) << DEV_BSHIFT;
			else
				foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
			poff = OFF_TO_IDX(foff);
			for (i = 0; i < bp->b_npages; i++) {
				m = bp->b_pages[i];
				if (m == bogus_page) {
					m = vm_page_lookup(obj, poff + i);
#if !defined(MAX_PERF)
					if (!m) {
						panic("brelse: page missing");
					}
#endif
					bp->b_pages[i] = m;
					pmap_qenter(trunc_page(bp->b_data),
						bp->b_pages, bp->b_npages);
				}
				resid = IDX_TO_OFF(m->pindex+1) - foff;
				if (resid > iototal)
					resid = iototal;
				if (resid > 0) {
					/*
					 * Don't invalidate the page if the local machine has already
					 * modified it.  This is the lesser of two evils, and should
					 * be fixed.
					 */
					if (bp->b_flags & (B_NOCACHE | B_ERROR)) {
						vm_page_test_dirty(m);
						if (m->dirty == 0) {
							vm_page_set_invalid(m, (vm_offset_t) foff, resid);
							if (m->valid == 0)
								vm_page_protect(m, VM_PROT_NONE);
						}
					}
					if (resid >= PAGE_SIZE) {
						if ((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) {
							bp->b_flags |= B_INVAL;
						}
					} else {
						if (!vm_page_is_valid(m,
							(((vm_offset_t) bp->b_data) & PAGE_MASK), resid)) {
							bp->b_flags |= B_INVAL;
						}
					}
				}
				foff += resid;
				iototal -= resid;
			}
		}
		if (bp->b_flags & (B_INVAL | B_RELBUF))
			vfs_vmio_release(bp);
	}
#if !defined(MAX_PERF)
	if (bp->b_qindex != QUEUE_NONE)
		panic("brelse: free buffer onto another queue???");
#endif

	/* enqueue */
	/* buffers with no memory */
	if (bp->b_bufsize == 0) {
		bp->b_flags |= B_INVAL;
		bp->b_qindex = QUEUE_EMPTY;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
		/*
		 * Get rid of the kva allocation *now*
		 */
		bfreekva(bp);

	/* buffers with junk contents */
	} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
		bp->b_flags |= B_INVAL;
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;

	/* buffers that are locked */
	} else if (bp->b_flags & B_LOCKED) {
		bp->b_qindex = QUEUE_LOCKED;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);

	/* buffers with stale but valid contents */
	} else if (bp->b_flags & B_AGE) {
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);

	/* buffers with valid and quite potentially reusable contents */
	} else {
		bp->b_qindex = QUEUE_LRU;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
	}

	if ((bp->b_flags & B_INVAL) ||
		(bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) {
		if (bp->b_flags & B_DELWRI) {
			--numdirtybuffers;
			bp->b_flags &= ~B_DELWRI;
		}
		vfs_bio_need_satisfy();
	}

	/* unlock */
	bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
				B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
	splx(s);
}

/*
 * Release a buffer.  This is the quick version of brelse() for buffers
 * whose contents are assumed to remain valid; it skips the VMIO page
 * rundown and simply requeues the buffer.
 */
void
bqrelse(struct buf * bp)
{
	int s;

	s = splbio();

	/* anyone need this block? */
	if (bp->b_flags & B_WANTED) {
		bp->b_flags &= ~(B_WANTED | B_AGE);
		wakeup(bp);
	}

#if !defined(MAX_PERF)
	if (bp->b_qindex != QUEUE_NONE)
		panic("bqrelse: free buffer onto another queue???");
#endif

	if (bp->b_flags & B_LOCKED) {
		bp->b_flags &= ~B_ERROR;
		bp->b_qindex = QUEUE_LOCKED;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
		/* buffers with stale but valid contents */
	} else {
		bp->b_qindex = QUEUE_LRU;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
	}

	if ((bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) {
		vfs_bio_need_satisfy();
	}

	/* unlock */
	bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
		B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
	splx(s);
}
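
/*
 * Rule of thumb implied by the two routines above: use brelse() when
 * the buffer may need invalidation or its backing pages may need to
 * be released (B_INVAL, B_NOCACHE, B_ERROR, B_RELBUF), and bqrelse()
 * on the fast path where the contents stay cached, as bdwrite() does.
 */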

static void
vfs_vmio_release(bp)
	struct buf *bp;
{
	int i;
	vm_page_t m;

	for (i = 0; i < bp->b_npages; i++) {
		m = bp->b_pages[i];
		bp->b_pages[i] = NULL;
		vm_page_unwire(m);
		/*
		 * We don't mess with busy pages; it is
		 * the responsibility of the process that
		 * busied the pages to deal with them.
		 */
		if ((m->flags & PG_BUSY) || (m->busy != 0))
			continue;

		if (m->wire_count == 0) {

			if (m->flags & PG_WANTED) {
				m->flags &= ~PG_WANTED;
				wakeup(m);
			}

			/*
			 * If this is an async free, we cannot place
			 * pages onto the cache queue, so we don't
			 * modify any queues at all.
			 * This is probably in error (for perf reasons),
			 * and we will eventually need to build
			 * a more complete infrastructure to support I/O
			 * rundown.
			 */
			if ((bp->b_flags & B_ASYNC) == 0) {

			/*
			 * In the case of sync buffer frees, we can do pretty much
			 * anything to any of the memory queues.  Specifically,
			 * the cache queue is okay to be modified.
			 */
				if (m->valid) {
					if (m->dirty == 0)
						vm_page_test_dirty(m);
					/*
					 * this keeps pressure off of the process memory
					 */
					if (m->dirty == 0 && m->hold_count == 0)
						vm_page_cache(m);
					else
						vm_page_deactivate(m);
				} else if (m->hold_count == 0) {
					vm_page_protect(m, VM_PROT_NONE);
					vm_page_free(m);
				}
			} else {
				/*
				 * If async, then at least we clear the
				 * act_count.
				 */
				m->act_count = 0;
			}
		}
	}
	bufspace -= bp->b_bufsize;
	vmiospace -= bp->b_bufsize;
	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
	bp->b_npages = 0;
	bp->b_bufsize = 0;
	bp->b_flags &= ~B_VMIO;
	if (bp->b_vp)
		brelvp(bp);
}

/*
 * Check to see if a block is currently memory resident.  This is the
 * internal lookup used by incore() and getblk(); the caller must
 * already be at splbio.
 */
struct buf *
gbincore(struct vnode * vp, daddr_t blkno)
{
	struct buf *bp;
	struct bufhashhdr *bh;

	bh = BUFHASH(vp, blkno);
	bp = bh->lh_first;

	/* Search hash chain */
	while (bp != NULL) {
		/* hit */
		if (bp->b_vp == vp && bp->b_lblkno == blkno &&
		    (bp->b_flags & B_INVAL) == 0) {
			break;
		}
		bp = bp->b_hash.le_next;
	}
	return (bp);
}

/*
 * this routine implements clustered async writes for
 * clearing out B_DELWRI buffers...  This is much better
 * than the old way of writing only one buffer at a time.
 */
int
vfs_bio_awrite(struct buf * bp)
{
	int i;
	daddr_t lblkno = bp->b_lblkno;
	struct vnode *vp = bp->b_vp;
	int s;
	int ncl;
	struct buf *bpa;
	int nwritten;

	s = splbio();
	/*
	 * right now we support clustered writing only to regular files
	 */
	if ((vp->v_type == VREG) &&
	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
		int size;
		int maxcl;

		size = vp->v_mount->mnt_stat.f_iosize;
		maxcl = MAXPHYS / size;

		for (i = 1; i < maxcl; i++) {
			if ((bpa = gbincore(vp, lblkno + i)) &&
			    ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
			    (B_DELWRI | B_CLUSTEROK)) &&
			    (bpa->b_bufsize == size)) {
				if ((bpa->b_blkno == bpa->b_lblkno) ||
				    (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
					break;
			} else {
				break;
			}
		}
		ncl = i;
		/*
		 * this is a possible cluster write
		 */
		if (ncl != 1) {
			nwritten = cluster_wbuild(vp, size, lblkno, ncl);
			splx(s);
			return nwritten;
		}
	}
	bremfree(bp);
	splx(s);
	/*
	 * default (old) behavior, writing out only one block
	 */
	bp->b_flags |= B_BUSY | B_ASYNC;
	nwritten = bp->b_bufsize;
	(void) VOP_BWRITE(bp);
	return nwritten;
}
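
/*
 * Cluster sizing sketch (illustrative numbers only): assuming
 * MAXPHYS = 128K and an 8K f_iosize, maxcl = 128K / 8K = 16, so up
 * to 16 logically and physically contiguous delayed-write buffers
 * can be handed to cluster_wbuild() as a single I/O.
 */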

/*
 * Find a buffer header which is available for use.
 */
static struct buf *
getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize)
{
	struct buf *bp;
	int nbyteswritten = 0;
	vm_offset_t addr;
	static int writerecursion = 0;

start:
	if (bufspace >= maxbufspace)
		goto trytofreespace;

	/* can we constitute a new buffer? */
	if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]))) {
#if !defined(MAX_PERF)
		if (bp->b_qindex != QUEUE_EMPTY)
			panic("getnewbuf: inconsistent EMPTY queue, qindex=%d",
			    bp->b_qindex);
#endif
		bp->b_flags |= B_BUSY;
		bremfree(bp);
		goto fillbuf;
	}
trytofreespace:
	/*
	 * We keep the file I/O from hogging metadata I/O
	 * This is desirable because file data is cached in the
	 * VM/Buffer cache even if a buffer is freed.
	 */
	if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]))) {
#if !defined(MAX_PERF)
		if (bp->b_qindex != QUEUE_AGE)
			panic("getnewbuf: inconsistent AGE queue, qindex=%d",
			    bp->b_qindex);
#endif
	} else if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]))) {
#if !defined(MAX_PERF)
		if (bp->b_qindex != QUEUE_LRU)
			panic("getnewbuf: inconsistent LRU queue, qindex=%d",
			    bp->b_qindex);
#endif
	}
	if (!bp) {
		/* wait for a free buffer of any kind */
		needsbuffer |= VFS_BIO_NEED_ANY;
		do
			tsleep(&needsbuffer, (PRIBIO + 1) | slpflag, "newbuf",
			    slptimeo);
		while (needsbuffer & VFS_BIO_NEED_ANY);
		return (0);
	}

#if defined(DIAGNOSTIC)
	if (bp->b_flags & B_BUSY) {
		panic("getnewbuf: busy buffer on free list");
	}
#endif

	/*
	 * We are fairly aggressive about freeing VMIO buffers, but since
	 * the buffering is intact without buffer headers, there is not
	 * much loss.  We gain by maintaining non-VMIOed metadata in buffers.
	 */
	if ((bp->b_qindex == QUEUE_LRU) && (bp->b_usecount > 0)) {
		if ((bp->b_flags & B_VMIO) == 0 ||
			(vmiospace < maxvmiobufspace)) {
			--bp->b_usecount;
			TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist);
			if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) {
				TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
				goto start;
			}
			TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
		}
	}

	/* if we are a delayed write, convert to an async write */
	if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {

		if (writerecursion > 0) {
			bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]);
			while (bp) {
				if ((bp->b_flags & B_DELWRI) == 0)
					break;
				bp = TAILQ_NEXT(bp, b_freelist);
			}
			if (bp == NULL) {
				bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]);
				while (bp) {
					if ((bp->b_flags & B_DELWRI) == 0)
						break;
					bp = TAILQ_NEXT(bp, b_freelist);
				}
			}
			if (bp == NULL)
				panic("getnewbuf: cannot get buffer, infinite recursion failure");
		} else {
			++writerecursion;
			nbyteswritten += vfs_bio_awrite(bp);
			--writerecursion;
			if (!slpflag && !slptimeo) {
				return (0);
			}
			goto start;
		}
	}

	if (bp->b_flags & B_WANTED) {
		bp->b_flags &= ~B_WANTED;
		wakeup(bp);
	}
	bremfree(bp);
	bp->b_flags |= B_BUSY;

	if (bp->b_flags & B_VMIO) {
		bp->b_flags &= ~B_ASYNC;
		vfs_vmio_release(bp);
	}

	if (bp->b_vp)
		brelvp(bp);

fillbuf:
	/* we are not free, nor do we contain interesting data */
	if (bp->b_rcred != NOCRED) {
		crfree(bp->b_rcred);
		bp->b_rcred = NOCRED;
	}
	if (bp->b_wcred != NOCRED) {
		crfree(bp->b_wcred);
		bp->b_wcred = NOCRED;
	}

	LIST_REMOVE(bp, b_hash);
	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	if (bp->b_bufsize) {
		allocbuf(bp, 0);
	}
	bp->b_flags = B_BUSY;
	bp->b_dev = NODEV;
	bp->b_vp = NULL;
	bp->b_blkno = bp->b_lblkno = 0;
	bp->b_iodone = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	bp->b_bcount = 0;
	bp->b_npages = 0;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	bp->b_usecount = 4;

	maxsize = (maxsize + PAGE_MASK) & ~PAGE_MASK;

	/*
	 * we assume that buffer_map is not at address 0
	 */
	addr = 0;
	if (maxsize != bp->b_kvasize) {
		bfreekva(bp);

		/*
		 * See if we have buffer kva space
		 */
		if (vm_map_findspace(buffer_map,
			vm_map_min(buffer_map), maxsize, &addr)) {
			bp->b_flags |= B_INVAL;
			brelse(bp);
			goto trytofreespace;
		}
	}

	/*
	 * See if we have exceeded our allocated buffer space
	 */
	if (bufspace >= (maxbufspace + nbyteswritten)) {
		bp->b_flags |= B_INVAL;
		brelse(bp);
		goto trytofreespace;
	}

	/*
	 * create a map entry for the buffer -- in essence
	 * reserving the kva space.
	 */
	if (addr) {
		vm_map_insert(buffer_map, NULL, 0,
			addr, addr + maxsize,
			VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);

		bp->b_kvabase = (caddr_t) addr;
		bp->b_kvasize = maxsize;
	}
	bp->b_data = bp->b_kvabase;

	return (bp);
}
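
/*
 * Descriptive note on the selection policy above (not an invariant):
 * an EMPTY header is reused directly while bufspace is under
 * maxbufspace; otherwise a victim is taken from AGE first, then LRU.
 * LRU buffers with a nonzero b_usecount get extra trips around the
 * queue, and delayed-write victims are pushed out via
 * vfs_bio_awrite() instead of being discarded.
 */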

static void
waitfreebuffers(int slpflag, int slptimeo) {
	while (numfreebuffers < hifreebuffers) {
		flushdirtybuffers(slpflag, slptimeo);
		if (numfreebuffers < hifreebuffers)
			break;
		needsbuffer |= VFS_BIO_NEED_FREE;
		if (tsleep(&needsbuffer, PRIBIO|slpflag, "biofre", slptimeo))
			break;
	}
}

static void
flushdirtybuffers(int slpflag, int slptimeo) {
	int s;
	static pid_t flushing = 0;

	s = splbio();

	if (flushing) {
		if (flushing == curproc->p_pid) {
			splx(s);
			return;
		}
		while (flushing) {
			if (tsleep(&flushing, PRIBIO|slpflag, "biofls", slptimeo)) {
				splx(s);
				return;
			}
		}
	}
	flushing = curproc->p_pid;

	while (numdirtybuffers > lodirtybuffers) {
		struct buf *bp;
		needsbuffer |= VFS_BIO_NEED_LOWLIMIT;
		bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]);
		if (bp == NULL)
			bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]);

		while (bp && ((bp->b_flags & B_DELWRI) == 0)) {
			bp = TAILQ_NEXT(bp, b_freelist);
		}

		if (bp) {
			splx(s);
			vfs_bio_awrite(bp);
			s = splbio();
			continue;
		}
		break;
	}

	flushing = 0;
	wakeup(&flushing);
	splx(s);
}

/*
 * Check to see if a block is currently memory resident.
 */
struct buf *
incore(struct vnode * vp, daddr_t blkno)
{
	struct buf *bp;

	int s = splbio();
	bp = gbincore(vp, blkno);
	splx(s);
	return (bp);
}

/*
 * Returns true if no I/O is needed to access the
 * associated VM object.  This is like incore except
 * it also hunts around in the VM system for the data.
 */

int
inmem(struct vnode * vp, daddr_t blkno)
{
	vm_object_t obj;
	vm_offset_t toff, tinc;
	vm_page_t m;
	vm_ooffset_t off;

	if (incore(vp, blkno))
		return 1;
	if (vp->v_mount == NULL)
		return 0;
	if ((vp->v_object == NULL) || (vp->v_flag & VVMIO) == 0)
		return 0;

	obj = vp->v_object;
	tinc = PAGE_SIZE;
	if (tinc > vp->v_mount->mnt_stat.f_iosize)
		tinc = vp->v_mount->mnt_stat.f_iosize;
	off = blkno * vp->v_mount->mnt_stat.f_iosize;

	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {

		m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
		if (!m)
			return 0;
		if (vm_page_is_valid(m, (vm_offset_t) (toff + off), tinc) == 0)
			return 0;
	}
	return 1;
}
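
/*
 * Example of the scan above (illustrative values): with an 8192-byte
 * f_iosize and a 4096-byte PAGE_SIZE, tinc stays at PAGE_SIZE and the
 * loop checks both pages backing the block; the block counts as
 * "inmem" only if every such page range is fully valid.
 */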

/*
 * now we set the dirty range for the buffer --
 * for NFS -- if the file is mapped and pages have
 * been written to, let it know.  We want the
 * entire range of the buffer to be marked dirty if
 * any of the pages have been written to for consistency
 * with the b_validoff, b_validend set in the nfs write
 * code, and used by the nfs read code.
 */
static void
vfs_setdirty(struct buf *bp) {
	int i;
	vm_object_t object;
	vm_offset_t boffset, offset;
	/*
	 * We qualify the scan for modified pages on whether the
	 * object has been flushed yet.  The OBJ_WRITEABLE flag
	 * is not cleared simply by protecting pages off.
	 */
	if ((bp->b_flags & B_VMIO) &&
		((object = bp->b_pages[0]->object)->flags & (OBJ_WRITEABLE|OBJ_CLEANING))) {
		/*
		 * test the pages to see if they have been modified directly
		 * by users through the VM system.
		 */
		for (i = 0; i < bp->b_npages; i++)
			vm_page_test_dirty(bp->b_pages[i]);

		/*
		 * scan forwards for the first page modified
		 */
		for (i = 0; i < bp->b_npages; i++) {
			if (bp->b_pages[i]->dirty) {
				break;
			}
		}
		boffset = (i << PAGE_SHIFT);
		if (boffset < bp->b_dirtyoff) {
			bp->b_dirtyoff = boffset;
		}

		/*
		 * scan backwards for the last page modified
		 */
		for (i = bp->b_npages - 1; i >= 0; --i) {
			if (bp->b_pages[i]->dirty) {
				break;
			}
		}
		boffset = (i + 1);
		offset = boffset + bp->b_pages[0]->pindex;
		if (offset >= object->size)
			boffset = object->size - bp->b_pages[0]->pindex;
		if (bp->b_dirtyend < (boffset << PAGE_SHIFT))
			bp->b_dirtyend = (boffset << PAGE_SHIFT);
	}
}

/*
 * Get a block given a specified block and offset into a file/device.
 */
struct buf *
getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
{
	struct buf *bp;
	int s;
	struct bufhashhdr *bh;
	int maxsize;
1317112015Sanholt
1318145132Sanholt	if (vp->v_mount) {
1319112015Sanholt		maxsize = vp->v_mount->mnt_stat.f_iosize;
1320112015Sanholt		/*
1321112015Sanholt		 * This happens on mount points.
1322112015Sanholt		 */
1323112015Sanholt		if (maxsize < size)
1324145132Sanholt			maxsize = size;
1325119098Sanholt	} else {
1326119098Sanholt		maxsize = size;
1327112015Sanholt	}
1328145132Sanholt
1329145132Sanholt#if !defined(MAX_PERF)
1330145132Sanholt	if (size > MAXBSIZE)
1331145132Sanholt		panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
1332145132Sanholt#endif
1333145132Sanholt
1334119098Sanholt	s = splbio();
1335162132Sanholtloop:
1336162132Sanholt	if (numfreebuffers < lofreebuffers) {
1337112015Sanholt		waitfreebuffers(slpflag, slptimeo);
1338112015Sanholt	}
1339112015Sanholt
1340112015Sanholt	if ((bp = gbincore(vp, blkno))) {
1341112015Sanholt		if (bp->b_flags & B_BUSY) {
1342112015Sanholt			bp->b_flags |= B_WANTED;
1343112015Sanholt			if (bp->b_usecount < BUF_MAXUSE)
1344112015Sanholt				++bp->b_usecount;
1345112015Sanholt			if (!tsleep(bp,
1346112015Sanholt				(PRIBIO + 1) | slpflag, "getblk", slptimeo))
1347145132Sanholt				goto loop;
1348112015Sanholt
1349130331Sanholt			splx(s);
1350112015Sanholt			return (struct buf *) NULL;
1351145132Sanholt		}
1352145132Sanholt		bp->b_flags |= B_BUSY | B_CACHE;
1353189499Srnoland		bremfree(bp);
1354145132Sanholt
1355152909Sanholt		/*
1356189499Srnoland		 * check for size inconsistancies (note that they shouldn't happen
1357152909Sanholt		 * but do when filesystems don't handle the size changes correctly.)
1358162132Sanholt		 * We are conservative on metadata and don't just extend the buffer
1359162132Sanholt		 * but write and re-constitute it.
1360189499Srnoland		 */
1361189499Srnoland
1362152909Sanholt		if (bp->b_bcount != size) {
1363182080Srnoland			if ((bp->b_flags & B_VMIO) && (size <= bp->b_kvasize)) {
1364182080Srnoland				allocbuf(bp, size);
1365182080Srnoland			} else {
1366152909Sanholt				bp->b_flags |= B_NOCACHE;
1367182080Srnoland				VOP_BWRITE(bp);
1368182080Srnoland				goto loop;
1369152909Sanholt			}
1370182080Srnoland		}
1371182080Srnoland
1372182080Srnoland		if (bp->b_usecount < BUF_MAXUSE)
1373182080Srnoland			++bp->b_usecount;
1374182080Srnoland		splx(s);
1375182080Srnoland		return (bp);
1376182080Srnoland	} else {
1377182080Srnoland		vm_object_t obj;
1378182080Srnoland
1379182080Srnoland		if ((bp = getnewbuf(vp, slpflag, slptimeo, size, maxsize)) == 0) {
1380182080Srnoland			if (slpflag || slptimeo) {
1381182080Srnoland				splx(s);
1382182080Srnoland				return NULL;
1383182080Srnoland			}
1384182080Srnoland			goto loop;
1385182080Srnoland		}
1386182080Srnoland
1387182080Srnoland		/*
1388189499Srnoland		 * This code is used to make sure that a buffer is not
1389189499Srnoland		 * created while the getnewbuf routine is blocked.
1390189499Srnoland		 * Normally the vnode is locked so this isn't a problem.
1391189499Srnoland		 * VBLK type I/O requests, however, don't lock the vnode.
1392189499Srnoland		 */
1393189499Srnoland		if (!VOP_ISLOCKED(vp) && gbincore(vp, blkno)) {
1394189499Srnoland			bp->b_flags |= B_INVAL;
1395189499Srnoland			brelse(bp);
1396189499Srnoland			goto loop;
1397189499Srnoland		}
1398189499Srnoland
1399189499Srnoland		/*
1400189499Srnoland		 * Insert the buffer into the hash, so that it can
1401189499Srnoland		 * be found by incore.
1402189499Srnoland		 */
1403189499Srnoland		bp->b_blkno = bp->b_lblkno = blkno;
1404189499Srnoland		bgetvp(vp, bp);
1405189499Srnoland		LIST_REMOVE(bp, b_hash);
1406189499Srnoland		bh = BUFHASH(vp, blkno);
1407189499Srnoland		LIST_INSERT_HEAD(bh, bp, b_hash);
1408189499Srnoland
1409189499Srnoland		if ((obj = vp->v_object) && (vp->v_flag & VVMIO)) {
1410189499Srnoland			bp->b_flags |= (B_VMIO | B_CACHE);
1411189499Srnoland#if defined(VFS_BIO_DEBUG)
1412189499Srnoland			if (vp->v_type != VREG && vp->v_type != VBLK)
1413189499Srnoland				printf("getblk: vmioing file type %d???\n", vp->v_type);
1414189499Srnoland#endif
1415189499Srnoland		} else {
1416189499Srnoland			bp->b_flags &= ~B_VMIO;
1417189499Srnoland		}
1418189499Srnoland		splx(s);
1419189499Srnoland
1420189499Srnoland		allocbuf(bp, size);
1421189499Srnoland#ifdef	PC98
1422189499Srnoland		/*
1423189499Srnoland		 * 1024byte/sector support
1424189499Srnoland		 */
1425189499Srnoland#define B_XXX2 0x8000000
1426189499Srnoland		if (vp->v_flag & 0x10000) bp->b_flags |= B_XXX2;
1427189499Srnoland#endif
1428189499Srnoland		return (bp);
1429189499Srnoland	}
1430189499Srnoland}
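
/*
 * Example (an illustrative sketch, not part of this file's code
 * paths): the classic consumer pattern for getblk(), essentially what
 * bread() does.  If B_CACHE is set the contents are already valid;
 * otherwise the caller schedules a read and sleeps in biowait().  The
 * helper name is hypothetical and the VOP_STRATEGY(bp) calling form
 * is assumed for this vintage of the tree.
 */
static int
example_read_block(struct vnode *vp, daddr_t blkno, int size, struct buf **bpp)
{
	struct buf *bp;
	int error;

	bp = getblk(vp, blkno, size, 0, 0);
	if ((bp->b_flags & B_CACHE) == 0) {
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
		vfs_busy_pages(bp, 0);		/* a no-op unless B_VMIO */
		VOP_STRATEGY(bp);
		error = biowait(bp);
		if (error) {
			brelse(bp);
			return (error);
		}
	}
	*bpp = bp;
	return (0);
}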
1431189499Srnoland
1432189499Srnoland/*
1433189499Srnoland * Get an empty, disassociated buffer of given size.
1434189499Srnoland */
1435189499Srnolandstruct buf *
1436189499Srnolandgeteblk(int size)
1437189499Srnoland{
1438189499Srnoland	struct buf *bp;
1439189499Srnoland	int s;
1440189499Srnoland
1441189499Srnoland	s = splbio();
1442189499Srnoland	while ((bp = getnewbuf(0, 0, 0, size, MAXBSIZE)) == 0);
1443189499Srnoland	splx(s);
1444189499Srnoland	allocbuf(bp, size);
1445189499Srnoland	bp->b_flags |= B_INVAL;
1446189499Srnoland	return (bp);
1447189499Srnoland}
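
/*
 * Example (sketch): a geteblk() buffer is anonymous scratch space with
 * no vnode behind it.  A hypothetical consumer fills b_data, uses it,
 * and releases it; since B_INVAL is set above, brelse() recycles the
 * buffer instead of caching it.
 */
static void
example_scratch(void)
{
	struct buf *bp;

	bp = geteblk(8192);
	bzero(bp->b_data, bp->b_bcount);
	/* ... use bp->b_data as temporary wired storage ... */
	brelse(bp);
}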
1448189499Srnoland
1449189499Srnoland
1450189499Srnoland/*
1451189499Srnoland * This code constructs the buffer memory from either anonymous system
1452189499Srnoland * memory (in the case of non-VMIO operations) or from an associated
1453189499Srnoland * VM object (in the case of VMIO operations).
1454189499Srnoland *
1455189499Srnoland * Note that this code is tricky, and has many complications to resolve
1456189499Srnoland * deadlock or inconsistent data situations.  Tread lightly!!!
1457189499Srnoland *
1458189499Srnoland * Modify the length of a buffer's underlying buffer storage without
1459189499Srnoland * destroying information (unless, of course the buffer is shrinking).
1460189499Srnoland */
1461189499Srnolandint
1462189499Srnolandallocbuf(struct buf * bp, int size)
1463189499Srnoland{
1464189499Srnoland
1465189499Srnoland	int s;
1466189499Srnoland	int newbsize, mbsize;
1467189499Srnoland	int i;
1468189499Srnoland
1469189499Srnoland#if !defined(MAX_PERF)
1470189499Srnoland	if (!(bp->b_flags & B_BUSY))
1471189499Srnoland		panic("allocbuf: buffer not busy");
1472189499Srnoland
1473189499Srnoland	if (bp->b_kvasize < size)
1474189499Srnoland		panic("allocbuf: buffer too small");
1475189499Srnoland#endif
1476189499Srnoland
1477189499Srnoland	if ((bp->b_flags & B_VMIO) == 0) {
1478189499Srnoland		caddr_t origbuf;
1479189499Srnoland		int origbufsize;
1480189499Srnoland		/*
1481189499Srnoland		 * Just get anonymous memory from the kernel
1482189499Srnoland		 */
1483189499Srnoland		mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
1484189499Srnoland#if !defined(NO_B_MALLOC)
1485189499Srnoland		if (bp->b_flags & B_MALLOC)
1486189499Srnoland			newbsize = mbsize;
1487189499Srnoland		else
1488189499Srnoland#endif
1489189499Srnoland			newbsize = round_page(size);
1490189499Srnoland
1491189499Srnoland		if (newbsize < bp->b_bufsize) {
1492189499Srnoland#if !defined(NO_B_MALLOC)
1493189499Srnoland			/*
1494189499Srnoland			 * malloced buffers are not shrunk
1495189499Srnoland			 */
1496189499Srnoland			if (bp->b_flags & B_MALLOC) {
1497189499Srnoland				if (newbsize) {
1498189499Srnoland					bp->b_bcount = size;
1499189499Srnoland				} else {
1500189499Srnoland					free(bp->b_data, M_BIOBUF);
1501189499Srnoland					bufspace -= bp->b_bufsize;
1502189499Srnoland					bufmallocspace -= bp->b_bufsize;
1503189499Srnoland					bp->b_data = bp->b_kvabase;
1504189499Srnoland					bp->b_bufsize = 0;
1505189499Srnoland					bp->b_bcount = 0;
1506189499Srnoland					bp->b_flags &= ~B_MALLOC;
1507189499Srnoland				}
1508189499Srnoland				return 1;
1509189499Srnoland			}
1510189499Srnoland#endif
1511189499Srnoland			vm_hold_free_pages(
1512189499Srnoland			    bp,
1513189499Srnoland			    (vm_offset_t) bp->b_data + newbsize,
1514189499Srnoland			    (vm_offset_t) bp->b_data + bp->b_bufsize);
1515189499Srnoland		} else if (newbsize > bp->b_bufsize) {
1516189499Srnoland#if !defined(NO_B_MALLOC)
1517189499Srnoland			/*
1518189499Srnoland			 * We only use malloced memory on the first allocation,
1519189499Srnoland			 * and revert to page-allocated memory when the buffer grows.
1520189499Srnoland			 */
1521189499Srnoland			if ( (bufmallocspace < maxbufmallocspace) &&
1522189499Srnoland				(bp->b_bufsize == 0) &&
1523189499Srnoland				(mbsize <= PAGE_SIZE/2)) {
1524189499Srnoland
1525189499Srnoland				bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
1526189499Srnoland				bp->b_bufsize = mbsize;
1527261455Seadler				bp->b_bcount = size;
1528189499Srnoland				bp->b_flags |= B_MALLOC;
1529189499Srnoland				bufspace += mbsize;
1530189499Srnoland				bufmallocspace += mbsize;
1531189499Srnoland				return 1;
1532189499Srnoland			}
1533189499Srnoland#endif
1534189499Srnoland			origbuf = NULL;
1535189499Srnoland			origbufsize = 0;
1536189499Srnoland#if !defined(NO_B_MALLOC)
1537189499Srnoland			/*
1538189499Srnoland			 * If the buffer is growing on its other-than-first allocation,
1539189499Srnoland			 * then we revert to the page-allocation scheme.
1540189499Srnoland			 */
1541189499Srnoland			if (bp->b_flags & B_MALLOC) {
1542189499Srnoland				origbuf = bp->b_data;
1543189499Srnoland				origbufsize = bp->b_bufsize;
1544189499Srnoland				bp->b_data = bp->b_kvabase;
1545189499Srnoland				bufspace -= bp->b_bufsize;
1546189499Srnoland				bufmallocspace -= bp->b_bufsize;
1547189499Srnoland				bp->b_bufsize = 0;
1548189499Srnoland				bp->b_flags &= ~B_MALLOC;
1549261455Seadler				newbsize = round_page(newbsize);
1550189499Srnoland			}
1551189499Srnoland#endif
1552189499Srnoland			vm_hold_load_pages(
1553189499Srnoland			    bp,
1554189499Srnoland			    (vm_offset_t) bp->b_data + bp->b_bufsize,
1555189499Srnoland			    (vm_offset_t) bp->b_data + newbsize);
1556189499Srnoland#if !defined(NO_B_MALLOC)
1557189499Srnoland			if (origbuf) {
1558189499Srnoland				bcopy(origbuf, bp->b_data, origbufsize);
1559189499Srnoland				free(origbuf, M_BIOBUF);
1560189499Srnoland			}
1561189499Srnoland#endif
1562189499Srnoland		}
1563189499Srnoland	} else {
1564189499Srnoland		vm_page_t m;
1565189499Srnoland		int desiredpages;
1566189499Srnoland
1567189499Srnoland		newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
1568189499Srnoland		desiredpages = (round_page(newbsize) >> PAGE_SHIFT);
1569189499Srnoland
1570189499Srnoland#if !defined(NO_B_MALLOC)
1571189499Srnoland		if (bp->b_flags & B_MALLOC)
1572189499Srnoland			panic("allocbuf: VMIO buffer can't be malloced");
1573189499Srnoland#endif
1574189499Srnoland
1575189499Srnoland		if (newbsize < bp->b_bufsize) {
1576189499Srnoland			if (desiredpages < bp->b_npages) {
1577189499Srnoland				for (i = desiredpages; i < bp->b_npages; i++) {
1578189499Srnoland					/*
1579189499Srnoland					 * the page is not freed here -- it
1580189499Srnoland					 * is the responsibility of vnode_pager_setsize
1581189499Srnoland					 */
1582189499Srnoland					m = bp->b_pages[i];
1583189499Srnoland#if defined(DIAGNOSTIC)
1584189499Srnoland					if (m == bogus_page)
1585189499Srnoland						panic("allocbuf: bogus page found");
1586189499Srnoland#endif
1587189499Srnoland					s = splvm();
1588189499Srnoland					while ((m->flags & PG_BUSY) || (m->busy != 0)) {
1589189499Srnoland						m->flags |= PG_WANTED;
1590189499Srnoland						tsleep(m, PVM, "biodep", 0);
1591189499Srnoland					}
1592189499Srnoland					splx(s);
1593189499Srnoland
1594189499Srnoland					bp->b_pages[i] = NULL;
1595189499Srnoland					vm_page_unwire(m);
1596189499Srnoland				}
1597189499Srnoland				pmap_qremove((vm_offset_t) trunc_page(bp->b_data) +
1598189499Srnoland				    (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
1599189499Srnoland				bp->b_npages = desiredpages;
1600189499Srnoland			}
1601189499Srnoland		} else if (newbsize > bp->b_bufsize) {
1602189499Srnoland			vm_object_t obj;
1603189499Srnoland			vm_offset_t tinc, toff;
1604189499Srnoland			vm_ooffset_t off;
1605189499Srnoland			vm_pindex_t objoff;
1606189499Srnoland			int pageindex, curbpnpages;
1607189499Srnoland			struct vnode *vp;
1608189499Srnoland			int bsize;
1609189499Srnoland
1610189499Srnoland			vp = bp->b_vp;
1611189499Srnoland
1612189499Srnoland			if (vp->v_type == VBLK)
1613189499Srnoland				bsize = DEV_BSIZE;
1614189499Srnoland			else
1615189499Srnoland				bsize = vp->v_mount->mnt_stat.f_iosize;
1616189499Srnoland
1617189499Srnoland			if (bp->b_npages < desiredpages) {
1618189499Srnoland				obj = vp->v_object;
1619189499Srnoland				tinc = PAGE_SIZE;
1620189499Srnoland				if (tinc > bsize)
1621189499Srnoland					tinc = bsize;
1622261455Seadler				off = (vm_ooffset_t) bp->b_lblkno * bsize;
1623189499Srnoland				curbpnpages = bp->b_npages;
1624189499Srnoland		doretry:
1625189499Srnoland				bp->b_flags |= B_CACHE;
1626189499Srnoland				bp->b_validoff = bp->b_validend = 0;
1627189499Srnoland				for (toff = 0; toff < newbsize; toff += tinc) {
1628189499Srnoland					int bytesinpage;
1629189499Srnoland
1630189499Srnoland					pageindex = toff >> PAGE_SHIFT;
1631189499Srnoland					objoff = OFF_TO_IDX(off + toff);
1632189499Srnoland					if (pageindex < curbpnpages) {
1633189499Srnoland
1634189499Srnoland						m = bp->b_pages[pageindex];
1635189499Srnoland#ifdef VFS_BIO_DIAG
1636189499Srnoland						if (m->pindex != objoff)
1637189499Srnoland							panic("allocbuf: page changed offset??!!!?");
1638189499Srnoland#endif
1639189499Srnoland						bytesinpage = tinc;
1640189499Srnoland						if (tinc > (newbsize - toff))
1641189499Srnoland							bytesinpage = newbsize - toff;
1642189499Srnoland						if (bp->b_flags & B_CACHE)
1643189499Srnoland							vfs_buf_set_valid(bp, off, toff, bytesinpage, m);
1644189499Srnoland						continue;
1645189499Srnoland					}
1646189499Srnoland					m = vm_page_lookup(obj, objoff);
1647189499Srnoland					if (!m) {
1648189499Srnoland						m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
1649189499Srnoland						if (!m) {
1650189499Srnoland							VM_WAIT;
1651189499Srnoland							goto doretry;
1652189499Srnoland						}
1653189499Srnoland						/*
1654189499Srnoland						 * Normally it is unwise to clear PG_BUSY without
1655189499Srnoland						 * PAGE_WAKEUP -- but it is okay here, as there is
1656189499Srnoland						 * no chance for blocking between here and vm_page_alloc().
1657189499Srnoland						 */
1658189499Srnoland						m->flags &= ~PG_BUSY;
1659189499Srnoland						vm_page_wire(m);
1660189499Srnoland						bp->b_flags &= ~B_CACHE;
1661189499Srnoland					} else if (m->flags & PG_BUSY) {
1662189499Srnoland						s = splvm();
1663189499Srnoland						if (m->flags & PG_BUSY) {
1664189499Srnoland							m->flags |= PG_WANTED;
1665189499Srnoland							tsleep(m, PVM, "pgtblk", 0);
1666189499Srnoland						}
1667189499Srnoland						splx(s);
1668189499Srnoland						goto doretry;
1669189499Srnoland					} else {
1670189499Srnoland						if ((curproc != pageproc) &&
1671189499Srnoland							((m->queue - m->pc) == PQ_CACHE) &&
1672189499Srnoland						    ((cnt.v_free_count + cnt.v_cache_count) <
1673189499Srnoland								(cnt.v_free_min + cnt.v_cache_min))) {
1674189499Srnoland							pagedaemon_wakeup();
1675189499Srnoland						}
1676189499Srnoland						bytesinpage = tinc;
1677189499Srnoland						if (tinc > (newbsize - toff))
1678189499Srnoland							bytesinpage = newbsize - toff;
1679189499Srnoland						if (bp->b_flags & B_CACHE)
1680189499Srnoland							vfs_buf_set_valid(bp, off, toff, bytesinpage, m);
1681189499Srnoland						vm_page_wire(m);
1682261455Seadler					}
1683189499Srnoland					bp->b_pages[pageindex] = m;
1684189499Srnoland					curbpnpages = pageindex + 1;
1685189499Srnoland				}
1686189499Srnoland				if (vp->v_tag == VT_NFS) {
1687189499Srnoland					if (bp->b_dirtyend > 0) {
1688189499Srnoland						bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
1689189499Srnoland						bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
1690189499Srnoland					}
1691189499Srnoland					if (bp->b_validend == 0)
1692189499Srnoland						bp->b_flags &= ~B_CACHE;
1693189499Srnoland				}
1694189499Srnoland				bp->b_data = (caddr_t) trunc_page(bp->b_data);
1695189499Srnoland				bp->b_npages = curbpnpages;
1696189499Srnoland				pmap_qenter((vm_offset_t) bp->b_data,
1697189499Srnoland					bp->b_pages, bp->b_npages);
1698189499Srnoland				((vm_offset_t) bp->b_data) |= off & PAGE_MASK;
1699189499Srnoland			}
1700189499Srnoland		}
1701189499Srnoland	}
1702189499Srnoland	if (bp->b_flags & B_VMIO)
1703189499Srnoland		vmiospace += (newbsize - bp->b_bufsize);
1704189499Srnoland	bufspace += (newbsize - bp->b_bufsize);
1705189499Srnoland	bp->b_bufsize = newbsize;
1706189499Srnoland	bp->b_bcount = size;
1707189499Srnoland	return 1;
1708189499Srnoland}
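
/*
 * Example (sketch): allocbuf() resizes only the backing store, so a
 * caller that grows a buffer must decide whether the newly exposed
 * bytes are valid.  For VMIO buffers the B_CACHE logic above decides;
 * for malloc- or page-backed buffers the new space is not zeroed.
 * The helper and its zeroing policy are hypothetical.
 */
static void
example_grow(struct buf *bp, int newsize)
{
	int oldbcount = bp->b_bcount;

	allocbuf(bp, newsize);		/* assumes newsize > oldbcount */
	if ((bp->b_flags & B_CACHE) == 0)
		bzero(bp->b_data + oldbcount, bp->b_bcount - oldbcount);
}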
1709189499Srnoland
1710189499Srnoland/*
1711189499Srnoland * Wait for buffer I/O completion, returning error status.
1712189499Srnoland */
1713189499Srnolandint
1714189499Srnolandbiowait(register struct buf * bp)
1715189499Srnoland{
1716189499Srnoland	int s;
1717189499Srnoland
1718189499Srnoland	s = splbio();
1719189499Srnoland	while ((bp->b_flags & B_DONE) == 0)
1720189499Srnoland#if defined(NO_SCHEDULE_MODS)
1721189499Srnoland		tsleep(bp, PRIBIO, "biowait", 0);
1722189499Srnoland#else
1723189499Srnoland		tsleep(bp, curproc->p_usrpri, "biowait", 0);
1724189499Srnoland#endif
1725189499Srnoland	splx(s);
1726189499Srnoland	if (bp->b_flags & B_EINTR) {
1727189499Srnoland		bp->b_flags &= ~B_EINTR;
1728189499Srnoland		return (EINTR);
1729189499Srnoland	}
1730189499Srnoland	if (bp->b_flags & B_ERROR) {
1731189499Srnoland		return (bp->b_error ? bp->b_error : EIO);
1732189499Srnoland	} else {
1733189499Srnoland		return (0);
1734189499Srnoland	}
1735189499Srnoland}
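
/*
 * Example (sketch): biowait() is the synchronous half of the I/O
 * protocol.  A much-simplified bwrite()-style sequence; delayed-write
 * bookkeeping and statistics are omitted, and the VOP_STRATEGY(bp)
 * calling form is assumed.
 */
static int
example_sync_write(struct buf *bp)
{
	int error;

	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	bp->b_flags |= B_WRITEINPROG;
	bp->b_vp->v_numoutput++;	/* biodone()->vwakeup() undoes this */
	vfs_busy_pages(bp, 1);		/* busy the pages, mark them clean */
	VOP_STRATEGY(bp);
	error = biowait(bp);		/* sleeps until biodone() sets B_DONE */
	brelse(bp);
	return (error);
}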
1736189499Srnoland
1737189499Srnoland/*
1738189499Srnoland * Finish I/O on a buffer, calling an optional function.
1739189499Srnoland * This is usually called from interrupt level, so process blocking
1740189499Srnoland * is not *a good idea*.
1741189499Srnoland */
1742189499Srnolandvoid
1743189499Srnolandbiodone(register struct buf * bp)
1744189499Srnoland{
1745189499Srnoland	int s;
1746189499Srnoland
1747189499Srnoland	s = splbio();
1748189499Srnoland
1749189499Srnoland#if !defined(MAX_PERF)
1750189499Srnoland	if (!(bp->b_flags & B_BUSY))
1751189499Srnoland		panic("biodone: buffer not busy");
1752189499Srnoland#endif
1753189499Srnoland
1754189499Srnoland	if (bp->b_flags & B_DONE) {
1755189499Srnoland		splx(s);
1756261455Seadler#if !defined(MAX_PERF)
1757189499Srnoland		printf("biodone: buffer already done\n");
1758189499Srnoland#endif
1759189499Srnoland		return;
1760189499Srnoland	}
1761189499Srnoland	bp->b_flags |= B_DONE;
1762189499Srnoland
1763189499Srnoland	if ((bp->b_flags & B_READ) == 0) {
1764189499Srnoland		vwakeup(bp);
1765189499Srnoland	}
1766189499Srnoland#ifdef BOUNCE_BUFFERS
1767189499Srnoland	if (bp->b_flags & B_BOUNCE)
1768189499Srnoland		vm_bounce_free(bp);
1769189499Srnoland#endif
1770189499Srnoland
1771189499Srnoland	/* call optional completion function if requested */
1772189499Srnoland	if (bp->b_flags & B_CALL) {
1773189499Srnoland		bp->b_flags &= ~B_CALL;
1774189499Srnoland		(*bp->b_iodone) (bp);
1775189499Srnoland		splx(s);
1776189499Srnoland		return;
1777189499Srnoland	}
1778189499Srnoland	if (bp->b_flags & B_VMIO) {
1779189499Srnoland		int i, resid;
1780189499Srnoland		vm_ooffset_t foff;
1781189499Srnoland		vm_page_t m;
1782189499Srnoland		vm_object_t obj;
1783189499Srnoland		int iosize;
1784189499Srnoland		struct vnode *vp = bp->b_vp;
1785189499Srnoland
1786189499Srnoland		if (vp->v_type == VBLK)
1787189499Srnoland			foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
1788189499Srnoland		else
1789189499Srnoland			foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1790189499Srnoland		obj = vp->v_object;
1791189499Srnoland#if !defined(MAX_PERF)
1792189499Srnoland		if (!obj) {
1793189499Srnoland			panic("biodone: no object");
1794189499Srnoland		}
1795189499Srnoland#endif
1796189499Srnoland#if defined(VFS_BIO_DEBUG)
1797189499Srnoland		if (obj->paging_in_progress < bp->b_npages) {
1798189499Srnoland			printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
1799189499Srnoland			    obj->paging_in_progress, bp->b_npages);
1800189499Srnoland		}
1801189499Srnoland#endif
1802189499Srnoland		iosize = bp->b_bufsize;
1803189499Srnoland		for (i = 0; i < bp->b_npages; i++) {
180495584Sanholt			int bogusflag = 0;
180595584Sanholt			m = bp->b_pages[i];
180695584Sanholt			if (m == bogus_page) {
180795584Sanholt				bogusflag = 1;
180895584Sanholt				m = vm_page_lookup(obj, OFF_TO_IDX(foff));
180995584Sanholt				if (!m) {
1810112015Sanholt#if defined(VFS_BIO_DEBUG)
181195584Sanholt					printf("biodone: page disappeared\n");
181295584Sanholt#endif
1813189499Srnoland					--obj->paging_in_progress;
1814189499Srnoland					continue;
1815189499Srnoland				}
1816189499Srnoland				bp->b_pages[i] = m;
1817189499Srnoland				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
181895584Sanholt			}
181995584Sanholt#if defined(VFS_BIO_DEBUG)
182095584Sanholt			if (OFF_TO_IDX(foff) != m->pindex) {
182195584Sanholt				printf("biodone: foff(%d)/m->pindex(%d) mismatch\n", foff, m->pindex);
182295584Sanholt			}
1823152909Sanholt#endif
1824152909Sanholt			resid = IDX_TO_OFF(m->pindex + 1) - foff;
1825189499Srnoland			if (resid > iosize)
1826189499Srnoland				resid = iosize;
1827189499Srnoland			/*
1828189499Srnoland			 * In the write case, the valid and clean bits are
1829189499Srnoland			 * already changed correctly, so we only need to do this
1830189499Srnoland			 * here in the read case.
1831189499Srnoland			 */
1832189499Srnoland			if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
1833189499Srnoland				vfs_page_set_valid(bp, foff, i, m);
1834189499Srnoland			}
1835112015Sanholt
1836112015Sanholt			/*
183795584Sanholt		 * When debugging new filesystems or buffer I/O methods, this
1838189499Srnoland		 * is the most common error that pops up.  If you see it, you
183995584Sanholt		 * have not set the page busy flag correctly.
1840189499Srnoland			 */
184195584Sanholt			if (m->busy == 0) {
1842189499Srnoland#if !defined(MAX_PERF)
184395584Sanholt				printf("biodone: page busy < 0, "
184495584Sanholt				    "pindex: %d, foff: 0x(%x,%x), "
1845189499Srnoland				    "resid: %d, index: %d\n",
1846148211Sanholt				    (int) m->pindex, (int)(foff >> 32),
1847189499Srnoland						(int) foff & 0xffffffff, resid, i);
1848148211Sanholt#endif
1849189499Srnoland				if (vp->v_type != VBLK)
1850148211Sanholt#if !defined(MAX_PERF)
1851148211Sanholt					printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n",
1852189499Srnoland					    bp->b_vp->v_mount->mnt_stat.f_iosize,
1853182080Srnoland					    (int) bp->b_lblkno,
1854182080Srnoland					    bp->b_flags, bp->b_npages);
1855182080Srnoland				else
1856182080Srnoland					printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n",
1857182080Srnoland					    (int) bp->b_lblkno,
1858182080Srnoland					    bp->b_flags, bp->b_npages);
1859189499Srnoland				printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n",
1860182080Srnoland				    m->valid, m->dirty, m->wire_count);
1861189499Srnoland#endif
1862182080Srnoland				panic("biodone: page busy < 0\n");
1863189499Srnoland			}
1864189499Srnoland			--m->busy;
1865182080Srnoland			if ((m->busy == 0) && (m->flags & PG_WANTED)) {
1866182080Srnoland				m->flags &= ~PG_WANTED;
1867189499Srnoland				wakeup(m);
1868182080Srnoland			}
1869182080Srnoland			--obj->paging_in_progress;
1870182080Srnoland			foff += resid;
1871182080Srnoland			iosize -= resid;
1872182080Srnoland		}
1873182080Srnoland		if (obj && obj->paging_in_progress == 0 &&
1874189499Srnoland		    (obj->flags & OBJ_PIPWNT)) {
1875189499Srnoland			obj->flags &= ~OBJ_PIPWNT;
1876189499Srnoland			wakeup(obj);
1877189499Srnoland		}
1878189499Srnoland	}
1879189499Srnoland	/*
1880189499Srnoland	 * For asynchronous completions, release the buffer now.  brelse()
1881182080Srnoland	 * checks for B_WANTED and does the wakeup there if necessary, so
1882189499Srnoland	 * no need to do a wakeup here in the async case.
1883189499Srnoland	 */
1884189499Srnoland
1885189499Srnoland	if (bp->b_flags & B_ASYNC) {
1886189499Srnoland		if ((bp->b_flags & B_ORDERED) == 0) {
1887189499Srnoland			if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0)
1888189499Srnoland				brelse(bp);
1889182080Srnoland			else
1890182080Srnoland				bqrelse(bp);
189195584Sanholt		}
189295584Sanholt	} else {
189395584Sanholt		bp->b_flags &= ~B_WANTED;
189495584Sanholt		wakeup(bp);
189595584Sanholt	}
189695584Sanholt	splx(s);
189795584Sanholt}
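
/*
 * Example (sketch): with B_CALL set, biodone() hands the buffer to
 * b_iodone instead of doing the brelse()/wakeup() itself, so the
 * handler must release the buffer.  It runs at splbio(), typically
 * from an interrupt, and must not sleep.  All names here are
 * hypothetical, and the VOP_STRATEGY(bp) form is assumed.
 */
static void
example_iodone(struct buf *bp)
{
	if (bp->b_flags & B_ERROR)
		printf("example: async I/O error %d\n", bp->b_error);
	brelse(bp);			/* our job, not biodone()'s */
}

static void
example_start_async(struct buf *bp)
{
	bp->b_flags |= B_ASYNC | B_CALL;
	bp->b_iodone = example_iodone;
	VOP_STRATEGY(bp);		/* example_iodone runs from biodone() */
}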
189895584Sanholt
189995584Sanholtint
190095584Sanholtcount_lock_queue()
190195584Sanholt{
190295584Sanholt	int count;
190395584Sanholt	struct buf *bp;
190495584Sanholt
190595584Sanholt	count = 0;
190695584Sanholt	for (bp = TAILQ_FIRST(&bufqueues[QUEUE_LOCKED]);
1907196470Srnoland	    bp != NULL;
1908196470Srnoland	    bp = TAILQ_NEXT(bp, b_freelist))
1909196470Srnoland		count++;
1910196470Srnoland	return (count);
191195584Sanholt}
191295584Sanholt
191395584Sanholtint vfs_update_interval = 30;
191495584Sanholt
191595584Sanholtstatic void
1916196470Srnolandvfs_update()
1917196470Srnoland{
1918196470Srnoland	while (1) {
1919196470Srnoland		tsleep(&vfs_update_wakeup, PUSER, "update",
192095584Sanholt		    hz * vfs_update_interval);
192195584Sanholt		vfs_update_wakeup = 0;
192295584Sanholt		sync(curproc, NULL, NULL);
192395584Sanholt	}
192495584Sanholt}
1925196470Srnoland
1926196470Srnolandstatic int
1927196470Srnolandsysctl_kern_updateinterval SYSCTL_HANDLER_ARGS
1928196470Srnoland{
192995584Sanholt	int error = sysctl_handle_int(oidp,
193095584Sanholt		oidp->oid_arg1, oidp->oid_arg2, req);
193195584Sanholt	if (!error)
193295584Sanholt		wakeup(&vfs_update_wakeup);
193395584Sanholt	return error;
193495584Sanholt}
1935196470Srnoland
1936196470SrnolandSYSCTL_PROC(_kern, KERN_UPDATEINTERVAL, update, CTLTYPE_INT|CTLFLAG_RW,
1937196470Srnoland	&vfs_update_interval, 0, sysctl_kern_updateinterval, "I", "");
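
/*
 * Example (sketch, userland): the interval is exported as
 * KERN_UPDATEINTERVAL under CTL_KERN, and writing it through
 * sysctl(3) also wakes the update daemon immediately via the handler
 * above.  Guarded out because this is user, not kernel, code.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>

int
set_update_interval(int secs)
{
	int mib[2] = { CTL_KERN, KERN_UPDATEINTERVAL };

	return (sysctl(mib, 2, NULL, NULL, &secs, sizeof(secs)));
}
#endif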
1938196470Srnoland
193995584Sanholt
194095584Sanholt/*
194195584Sanholt * This routine is called in lieu of iodone in the case of
194295584Sanholt * incomplete I/O.  This keeps the busy status for pages
1943182080Srnoland * consistent.
1944189499Srnoland */
1945189499Srnolandvoid
1946182080Srnolandvfs_unbusy_pages(struct buf * bp)
1947189499Srnoland{
1948189499Srnoland	int i;
1949189499Srnoland
195095584Sanholt	if (bp->b_flags & B_VMIO) {
195195584Sanholt		struct vnode *vp = bp->b_vp;
195295584Sanholt		vm_object_t obj = vp->v_object;
1953182080Srnoland		vm_ooffset_t foff;
1954189499Srnoland
1955189499Srnoland		foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1956182080Srnoland
1957189499Srnoland		for (i = 0; i < bp->b_npages; i++) {
1958189499Srnoland			vm_page_t m = bp->b_pages[i];
1959189499Srnoland
196095584Sanholt			if (m == bogus_page) {
196195584Sanholt				m = vm_page_lookup(obj, OFF_TO_IDX(foff) + i);
196295584Sanholt#if !defined(MAX_PERF)
1963182080Srnoland				if (!m) {
1964189499Srnoland					panic("vfs_unbusy_pages: page missing\n");
1965189499Srnoland				}
1966182080Srnoland#endif
1967189499Srnoland				bp->b_pages[i] = m;
1968189499Srnoland				pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1969189499Srnoland			}
197095584Sanholt			--obj->paging_in_progress;
197195584Sanholt			--m->busy;
197295584Sanholt			if ((m->busy == 0) && (m->flags & PG_WANTED)) {
1973182080Srnoland				m->flags &= ~PG_WANTED;
1974189499Srnoland				wakeup(m);
1975189499Srnoland			}
1976182080Srnoland		}
1977189499Srnoland		if (obj->paging_in_progress == 0 &&
1978189499Srnoland		    (obj->flags & OBJ_PIPWNT)) {
1979189499Srnoland			obj->flags &= ~OBJ_PIPWNT;
198095584Sanholt			wakeup(obj);
198195584Sanholt		}
198295584Sanholt	}
198395584Sanholt}
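
/*
 * Example (sketch): one plausible shape of a path that set up a
 * transfer (vfs_busy_pages() already ran) but abandons it before any
 * I/O is started, so biodone()'s per-page completion accounting must
 * be bypassed.  Assumes the caller owns the buffer and nothing is
 * sleeping in biowait(); real drivers differ in the details.
 */
static void
example_abandon_io(struct buf *bp, int error)
{
	vfs_unbusy_pages(bp);		/* undo the page/object busying */
	bp->b_flags |= B_ERROR | B_INVAL;
	bp->b_error = error;
	brelse(bp);
}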
198495584Sanholt
198595584Sanholt/*
1986145132Sanholt * Set NFS' b_validoff and b_validend fields from the valid bits
1987112015Sanholt * of a page.  If the consumer is not NFS, and the page is not
198895584Sanholt * valid for the entire range, clear the B_CACHE flag to force
198995584Sanholt * the consumer to re-read the page.
1990112015Sanholt */
1991113995Sanholtstatic void
1992112015Sanholtvfs_buf_set_valid(struct buf *bp,
1993112015Sanholt		  vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
199495584Sanholt		  vm_page_t m)
199595584Sanholt{
199695584Sanholt	if (bp->b_vp->v_tag == VT_NFS) {
199795584Sanholt		vm_offset_t svalid, evalid;
1998189499Srnoland		int validbits = m->valid;
1999189499Srnoland
200095584Sanholt		/*
2001189499Srnoland		 * This only bothers with the first valid range in the
2002189499Srnoland		 * page.
2003189499Srnoland		 */
2004189499Srnoland		svalid = off;
2005189499Srnoland		while (validbits && !(validbits & 1)) {
200695584Sanholt			svalid += DEV_BSIZE;
200795584Sanholt			validbits >>= 1;
200895584Sanholt		}
200995584Sanholt		evalid = svalid;
201095584Sanholt		while (validbits & 1) {
201195584Sanholt			evalid += DEV_BSIZE;
201295584Sanholt			validbits >>= 1;
201395584Sanholt		}
201495584Sanholt		/*
201595584Sanholt		 * Make sure this range is contiguous with the range
201695584Sanholt		 * built up from previous pages.  If not, then we will
201795584Sanholt		 * just use the range from the previous pages.
201895584Sanholt		 */
201995584Sanholt		if (svalid == bp->b_validend) {
202095584Sanholt			bp->b_validoff = min(bp->b_validoff, svalid);
202195584Sanholt			bp->b_validend = max(bp->b_validend, evalid);
202295584Sanholt		}
202395584Sanholt	} else if (!vm_page_is_valid(m,
202495584Sanholt				     (vm_offset_t) ((foff + off) & PAGE_MASK),
202595584Sanholt				     size)) {
202695584Sanholt		bp->b_flags &= ~B_CACHE;
2027189499Srnoland	}
2028189499Srnoland}
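
/*
 * Worked example (illustrative): with DEV_BSIZE = 512 and a page whose
 * valid bits are 0x0f (only the first four 512-byte chunks valid), the
 * scan above yields svalid = off and evalid = off + 2048.  If svalid
 * equals the buffer's current b_validend, the valid range simply grows
 * by those 2048 bytes; otherwise the range built from previous pages
 * is kept unchanged.
 */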
2029189499Srnoland
2030189499Srnoland/*
2031189499Srnoland * Set the valid bits in a page, taking care of the b_validoff,
2032189499Srnoland * b_validend fields which NFS uses to optimise small reads.  Off is
2033189499Srnoland * the offset within the file and pageno is the page index within the buf.
2034189499Srnoland */
2035189499Srnolandstatic void
2036189499Srnolandvfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
2037189499Srnoland{
2038189499Srnoland	struct vnode *vp = bp->b_vp;
2039189499Srnoland	vm_ooffset_t soff, eoff;
2040189499Srnoland
2041189499Srnoland	soff = off;
2042189499Srnoland	eoff = off + min(PAGE_SIZE, bp->b_bufsize);
2043189499Srnoland	vm_page_set_invalid(m,
2044189499Srnoland			    (vm_offset_t) (soff & PAGE_MASK),
204595584Sanholt			    (vm_offset_t) (eoff - soff));
204695584Sanholt	if (vp->v_tag == VT_NFS) {
204795584Sanholt		vm_ooffset_t sv, ev;
204895584Sanholt		off = off - pageno * PAGE_SIZE;
204995584Sanholt		sv = off + ((bp->b_validoff + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1));
205095584Sanholt		ev = off + (bp->b_validend & ~(DEV_BSIZE - 1));
2051189499Srnoland		soff = max(sv, soff);
205295584Sanholt		eoff = min(ev, eoff);
2053196470Srnoland	}
2054196470Srnoland	if (eoff > soff)
205595584Sanholt		vm_page_set_validclean(m,
205695584Sanholt				       (vm_offset_t) (soff & PAGE_MASK),
2057182080Srnoland				       (vm_offset_t) (eoff - soff));
205895584Sanholt}
2059196470Srnoland
2060196470Srnoland/*
2061196470Srnoland * This routine is called before a device strategy routine.
2062196470Srnoland * It is used to tell the VM system that paging I/O is in
2063196470Srnoland * progress, and to treat the pages associated with the buffer
206495584Sanholt * almost as being PG_BUSY.  The object's paging_in_progress
2065112015Sanholt * count is also maintained so that the object doesn't become
206695584Sanholt * inconsistent.
206795584Sanholt */
206895584Sanholtvoid
206995584Sanholtvfs_busy_pages(struct buf * bp, int clear_modify)
207095584Sanholt{
207195584Sanholt	int i;
207295584Sanholt
207395584Sanholt	if (bp->b_flags & B_VMIO) {
207495584Sanholt		struct vnode *vp = bp->b_vp;
207595584Sanholt		vm_object_t obj = vp->v_object;
2076112015Sanholt		vm_ooffset_t foff;
2077182080Srnoland
2078112015Sanholt		if (vp->v_type == VBLK)
2079112015Sanholt			foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
2080189499Srnoland		else
2081112015Sanholt			foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
2082112015Sanholt		vfs_setdirty(bp);
208395584Sanholt		for (i = 0; i < bp->b_npages; i++, foff += PAGE_SIZE) {
208495584Sanholt			vm_page_t m = bp->b_pages[i];
2085189499Srnoland
2086189499Srnoland			if ((bp->b_flags & B_CLUSTER) == 0) {
2087112015Sanholt				obj->paging_in_progress++;
2088189499Srnoland				m->busy++;
2089189499Srnoland			}
2090112015Sanholt			vm_page_protect(m, VM_PROT_NONE);
209195584Sanholt			if (clear_modify)
209295584Sanholt				vfs_page_set_valid(bp, foff, i, m);
209395584Sanholt			else if (bp->b_bcount >= PAGE_SIZE) {
209495584Sanholt				if (m->valid && (bp->b_flags & B_CACHE) == 0) {
209595584Sanholt					bp->b_pages[i] = bogus_page;
209695584Sanholt					pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
209795584Sanholt				}
209895584Sanholt			}
209995584Sanholt		}
2100112015Sanholt	}
2101112015Sanholt}
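
/*
 * Example (sketch): a VMIO read brackets the device transfer with
 * clear_modify == 0.  A page that is already valid while the buffer is
 * not B_CACHE is swapped for bogus_page above, so the device cannot
 * clobber good cached data; biodone() reinstates the real page.  The
 * VOP_STRATEGY(bp) calling form is assumed.
 */
static void
example_vmio_read(struct buf *bp)
{
	bp->b_flags |= B_READ;
	bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
	vfs_busy_pages(bp, 0);
	VOP_STRATEGY(bp);	/* the completion path unbusies the pages */
}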
2102112015Sanholt
2103112015Sanholt/*
210495584Sanholt * Tell the VM system that the pages associated with this buffer
2105189499Srnoland * are clean.  This is used for delayed writes where the data is
2106112015Sanholt * going to go to disk eventually without additional VM intervention.
2107145132Sanholt */
2108112015Sanholtvoid
2109112015Sanholtvfs_clean_pages(struct buf * bp)
2110145132Sanholt{
2111145132Sanholt	int i;
2112189499Srnoland
2113145132Sanholt	if (bp->b_flags & B_VMIO) {
2114145132Sanholt		struct vnode *vp = bp->b_vp;
2115145132Sanholt		vm_object_t obj = vp->v_object;
2116145132Sanholt		vm_ooffset_t foff;
2117112015Sanholt
2118145132Sanholt		if (vp->v_type == VBLK)
2119112015Sanholt			foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
2120145132Sanholt		else
2121145132Sanholt			foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
2122145132Sanholt		for (i = 0; i < bp->b_npages; i++, foff += PAGE_SIZE) {
2123145132Sanholt			vm_page_t m = bp->b_pages[i];
2124145132Sanholt
2125112015Sanholt			vfs_page_set_valid(bp, foff, i, m);
2126112015Sanholt		}
2127112015Sanholt	}
2128145132Sanholt}
2129
2130void
2131vfs_bio_clrbuf(struct buf *bp) {
2132	int i;
2133	if (bp->b_flags & B_VMIO) {
2134		if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE)) {
2135			int mask;
2136			mask = 0;
2137			for (i = 0; i < bp->b_bufsize; i += DEV_BSIZE)
2138				mask |= (1 << (i / DEV_BSIZE));
2139			if (bp->b_pages[0]->valid != mask) {
2140				bzero(bp->b_data, bp->b_bufsize);
2141			}
2142			bp->b_pages[0]->valid = mask;
2143			bp->b_resid = 0;
2144			return;
2145		}
2146		for (i = 0; i < bp->b_npages; i++) {
2147			if (bp->b_pages[i]->valid == VM_PAGE_BITS_ALL)
2148				continue;
2149			if (bp->b_pages[i]->valid == 0) {
2150				if ((bp->b_pages[i]->flags & PG_ZERO) == 0) {
2151					bzero(bp->b_data + (i << PAGE_SHIFT), PAGE_SIZE);
2152				}
2153			} else {
2154				int j;
2155				for (j = 0; j < PAGE_SIZE / DEV_BSIZE; j++) {
2156					if ((bp->b_pages[i]->valid & (1 << j)) == 0)
2157						bzero(bp->b_data + (i << PAGE_SHIFT) + j * DEV_BSIZE, DEV_BSIZE);
2158				}
2159			}
2160			/* bp->b_pages[i]->valid = VM_PAGE_BITS_ALL; */
2161		}
2162		bp->b_resid = 0;
2163	} else {
2164		clrbuf(bp);
2165	}
2166}
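
/*
 * Worked example (illustrative): for a one-page buffer with
 * b_bufsize = 2048 and DEV_BSIZE = 512, the first case above builds
 * mask = 0x0f.  If the page's valid bits already equal 0x0f the
 * bzero() is skipped; either way the page ends up valid for exactly
 * those four DEV_BSIZE chunks and b_resid is cleared.
 */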
2167
2168/*
2169 * vm_hold_load_pages and vm_hold_free_pages get pages into and out of
2170 * a buffer's address space.  The pages are anonymous and are
2171 * not associated with a file object.
2172 */
2173void
2174vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
2175{
2176	vm_offset_t pg;
2177	vm_page_t p;
2178	int index;
2179
2180	to = round_page(to);
2181	from = round_page(from);
2182	index = (from - trunc_page(bp->b_data)) >> PAGE_SHIFT;
2183
2184	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
2185
2186tryagain:
2187
2188		p = vm_page_alloc(kernel_object, ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
2189		    VM_ALLOC_NORMAL);
2190		if (!p) {
2191			VM_WAIT;
2192			goto tryagain;
2193		}
2194		vm_page_wire(p);
2195		pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
2196		bp->b_pages[index] = p;
2197		PAGE_WAKEUP(p);
2198	}
2199	bp->b_npages = index;
2200}
2201
2202void
2203vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
2204{
2205	vm_offset_t pg;
2206	vm_page_t p;
2207	int index, newnpages;
2208
2209	from = round_page(from);
2210	to = round_page(to);
2211	newnpages = index = (from - trunc_page(bp->b_data)) >> PAGE_SHIFT;
2212
2213	for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
2214		p = bp->b_pages[index];
2215		if (p && (index < bp->b_npages)) {
2216#if !defined(MAX_PERF)
2217			if (p->busy) {
2218				printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n",
2219					bp->b_blkno, bp->b_lblkno);
2220			}
2221#endif
2222			bp->b_pages[index] = NULL;
2223			pmap_kremove(pg);
2224			vm_page_unwire(p);
2225			vm_page_free(p);
2226		}
2227	}
2228	bp->b_npages = newnpages;
2229}
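
/*
 * Example (sketch): the two routines are symmetric.  A hypothetical
 * caller backing the first 8K of a buffer's KVA with wired anonymous
 * pages and later releasing them (two pages on a 4K-page machine):
 */
static void
example_anon_backing(struct buf *bp)
{
	vm_offset_t base = (vm_offset_t) bp->b_data;

	vm_hold_load_pages(bp, base, base + 8192);
	/* ... use bp->b_data ... */
	vm_hold_free_pages(bp, base, base + 8192);
}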
2230
2231
2232#include "opt_ddb.h"
2233#ifdef DDB
2234#include <ddb/ddb.h>
2235
2236DB_SHOW_COMMAND(buffer, db_show_buffer)
2237{
2238	/* get args */
2239	struct buf *bp = (struct buf *)addr;
2240
2241	if (!have_addr) {
2242		db_printf("usage: show buffer <addr>\n");
2243		return;
2244	}
2245
2246	db_printf("b_proc = %p,\nb_flags = 0x%b\n", (void *)bp->b_proc,
2247		  bp->b_flags, "\20\40bounce\37cluster\36vmio\35ram\34ordered"
2248		  "\33paging\32xxx\31writeinprog\30wanted\27relbuf\26tape"
2249		  "\25read\24raw\23phys\22clusterok\21malloc\20nocache"
2250		  "\17locked\16inval\15gathered\14error\13eintr\12done\11dirty"
2251		  "\10delwri\7call\6cache\5busy\4bad\3async\2needcommit\1age");
2252	db_printf("b_error = %d, b_bufsize = %ld, b_bcount = %ld, "
2253		  "b_resid = %ld\nb_dev = 0x%x, b_un.b_addr = %p, "
2254		  "b_blkno = %d, b_pblkno = %d\n",
2255		  bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
2256		  bp->b_dev, bp->b_un.b_addr, bp->b_blkno, bp->b_pblkno);
2257}
2258#endif /* DDB */
2259