vfs_bio.c revision 3374
/*
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 *
 * $Id: vfs_bio.c,v 1.13 1994/10/04 03:10:47 davidg Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <vm/vm.h>
#include <vm/vm_pageout.h>

#include <miscfs/specfs/specdev.h>

struct	buf *buf;		/* buffer header pool */
int	nbuf;			/* number of buffer headers calculated elsewhere */
struct swqueue bswlist;
struct	buf *bclnlist;		/* Head of cleaned page list. */

extern	vm_map_t buffer_map, io_map;

void vm_hold_free_pages(vm_offset_t from, vm_offset_t to);
void vm_hold_load_pages(vm_offset_t from, vm_offset_t to);

int needsbuffer;

/*
 * Internal update daemon, process 3
 *	The variable vfs_update_wakeup allows for internal syncs.
 */
int vfs_update_wakeup;

/*
 * Initialize buffer headers and related structures.
 */
void
bufinit()
{
	struct buf *bp;
	int i;
	caddr_t baddr;

	TAILQ_INIT(&bswlist);
	LIST_INIT(&invalhash);

	/* first, make a null hash table */
	for(i=0;i<BUFHSZ;i++)
		LIST_INIT(&bufhashtbl[i]);

	/* next, make a null set of free lists */
	for(i=0;i<BUFFER_QUEUES;i++)
		TAILQ_INIT(&bufqueues[i]);

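	/*
	 * Reserve pageable kernel VA for every buffer up front (MAXBSIZE
	 * bytes apiece); backing pages are wired into this range later by
	 * allocbuf() via vm_hold_load_pages().
	 */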
	baddr = (caddr_t)kmem_alloc_pageable(buffer_map, MAXBSIZE * nbuf);
	/* finally, initialize each buffer header and stick on empty q */
	for(i=0;i<nbuf;i++) {
		bp = &buf[i];
		bzero(bp, sizeof *bp);
		bp->b_flags = B_INVAL;	/* we're just an empty header */
		bp->b_dev = NODEV;
		bp->b_vp = NULL;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_qindex = QUEUE_EMPTY;
		bp->b_vnbufs.le_next = NOLIST;
		bp->b_data = baddr + i * MAXBSIZE;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	}
}

/*
 * Remove the buffer from the appropriate free list.
 */
void
bremfree(struct buf *bp)
{
	int s = splbio();
	if( bp->b_qindex != QUEUE_NONE) {
		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
		bp->b_qindex = QUEUE_NONE;
	} else {
		panic("bremfree: removing a buffer when not on a queue");
	}
	splx(s);
}

/*
 * Get a buffer with the specified data.  Look in the cache first.
 */
int
bread(struct vnode *vp, daddr_t blkno, int size, struct ucred *cred,
	struct buf **bpp)
{
	struct buf *bp;

	bp = getblk (vp, blkno, size, 0, 0);
	*bpp = bp;

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc && curproc->p_stats)	/* count block I/O */
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
		if( bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);
		return( biowait (bp));
	}

	return (0);
}

/*
 * Operates like bread, but also starts asynchronous I/O on
 * read-ahead blocks.
 */
int
breadn(struct vnode *vp, daddr_t blkno, int size,
	daddr_t *rablkno, int *rabsize,
	int cnt, struct ucred *cred, struct buf **bpp)
{
	struct buf *bp, *rabp;
	int i;
	int rv = 0, readwait = 0;

	*bpp = bp = getblk (vp, blkno, size, 0, 0);

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc && curproc->p_stats)	/* count block I/O */
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
		if( bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);
		++readwait;
	}

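	/*
	 * Start asynchronous reads on each requested read-ahead block that
	 * is not already in core; blocks already cached are simply released.
	 */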
	for(i=0;i<cnt;i++, rablkno++, rabsize++) {
		if( incore(vp, *rablkno)) {
			continue;
		}
		rabp = getblk (vp, *rablkno, *rabsize, 0, 0);

		if ((rabp->b_flags & B_CACHE) == 0) {
			if (curproc && curproc->p_stats)
				curproc->p_stats->p_ru.ru_inblock++;
			rabp->b_flags |= B_READ | B_ASYNC;
			rabp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
			if( rabp->b_rcred == NOCRED) {
				if (cred != NOCRED)
					crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
		} else {
			brelse(rabp);
		}
	}

	if( readwait) {
		rv = biowait (bp);
	}

	return (rv);
}

/*
 * Write, release buffer on completion.  (Done by iodone
 * if async.)
 */
int
bwrite(struct buf *bp)
{
	int oldflags = bp->b_flags;

	if(bp->b_flags & B_INVAL) {
		brelse(bp);
		return (0);
	}

	if(!(bp->b_flags & B_BUSY))
		panic("bwrite: buffer is not busy???");

	bp->b_flags &= ~(B_READ|B_DONE|B_ERROR|B_DELWRI);
	bp->b_flags |= B_WRITEINPROG;

	if (oldflags & B_ASYNC) {
		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		} else if( curproc) {
			++curproc->p_stats->p_ru.ru_oublock;
		}
	}

	bp->b_vp->v_numoutput++;
	VOP_STRATEGY(bp);

	if( (oldflags & B_ASYNC) == 0) {
		int rtval = biowait(bp);
		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		} else if( curproc) {
			++curproc->p_stats->p_ru.ru_oublock;
		}
		brelse(bp);
		return (rtval);
	}

	return(0);
}

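/*
 * vn_bwrite() is the vnode bwrite operation; it simply hands the buffer
 * to the generic bwrite() above.
 */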
int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return (bwrite(ap->a_bp));
}

/*
 * Delayed write.  (Buffer is marked dirty.)
 */
void
bdwrite(struct buf *bp)
{

	if((bp->b_flags & B_BUSY) == 0) {
		panic("bdwrite: buffer is not busy");
	}

	if(bp->b_flags & B_INVAL) {
		brelse(bp);
		return;
	}

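	/*
	 * Delayed writes are of no use on a sequential device such as tape,
	 * so start the write immediately (asynchronously) instead.
	 */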
	if(bp->b_flags & B_TAPE) {
		bawrite(bp);
		return;
	}

	bp->b_flags &= ~B_READ;
	if( (bp->b_flags & B_DELWRI) == 0) {
		if( curproc)
			++curproc->p_stats->p_ru.ru_oublock;
		bp->b_flags |= B_DONE|B_DELWRI;
		reassignbuf(bp, bp->b_vp);
	}
	brelse(bp);
	return;
}

/*
 * Asynchronous write.
 * Start output on a buffer, but do not wait for it to complete.
 * The buffer is released when the output completes.
 */
void
bawrite(struct buf *bp)
{
	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}

/*
 * Release a buffer.
 */
void
brelse(struct buf *bp)
{
	int x;

	/* anyone need a "free" block? */
	x=splbio();
	if (needsbuffer) {
		needsbuffer = 0;
		wakeup((caddr_t)&needsbuffer);
	}

	/* anyone need this block? */
	if (bp->b_flags & B_WANTED) {
		bp->b_flags &= ~(B_WANTED|B_AGE);
		wakeup((caddr_t)bp);
	}

	if (bp->b_flags & B_LOCKED)
		bp->b_flags &= ~B_ERROR;

	if ((bp->b_flags & (B_NOCACHE|B_INVAL|B_ERROR)) ||
		(bp->b_bufsize <= 0)) {
		bp->b_flags |= B_INVAL;
		bp->b_flags &= ~(B_DELWRI|B_CACHE);
		if(bp->b_vp)
			brelvp(bp);
	}

	if( bp->b_qindex != QUEUE_NONE)
		panic("brelse: free buffer onto another queue???");

	/* enqueue */
	/* buffers with no memory */
	if(bp->b_bufsize == 0) {
		bp->b_qindex = QUEUE_EMPTY;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	/* buffers with junk contents */
	} else if(bp->b_flags & (B_ERROR|B_INVAL|B_NOCACHE)) {
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	/* buffers that are locked */
	} else if(bp->b_flags & B_LOCKED) {
		bp->b_qindex = QUEUE_LOCKED;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
	/* buffers with stale but valid contents */
	} else if(bp->b_flags & B_AGE) {
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
	/* buffers with valid and quite potentially reusable contents */
	} else {
		bp->b_qindex = QUEUE_LRU;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
	}

	/* unlock */
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_NOCACHE|B_AGE);
	splx(x);
}

int freebufspace;
int allocbufspace;

/*
 * Find a buffer header which is available for use.
 */
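/*
 * Preference order: an empty header first, then an aged buffer, then the
 * least recently used buffer.  A delayed-write victim is pushed to disk
 * asynchronously and the search is restarted.
 */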
struct buf *
getnewbuf(int slpflag, int slptimeo)
{
	struct buf *bp;
	int s;
	s = splbio();
start:
	/* can we constitute a new buffer? */
	if ((bp = bufqueues[QUEUE_EMPTY].tqh_first)) {
		if( bp->b_qindex != QUEUE_EMPTY)
			panic("getnewbuf: inconsistent EMPTY queue");
		bremfree(bp);
		goto fillbuf;
	}

	if ((bp = bufqueues[QUEUE_AGE].tqh_first)) {
		if( bp->b_qindex != QUEUE_AGE)
			panic("getnewbuf: inconsistent AGE queue");
		bremfree(bp);
	} else if ((bp = bufqueues[QUEUE_LRU].tqh_first)) {
		if( bp->b_qindex != QUEUE_LRU)
			panic("getnewbuf: inconsistent LRU queue");
		bremfree(bp);
	} else	{
		/* wait for a free buffer of any kind */
		needsbuffer = 1;
		tsleep((caddr_t)&needsbuffer, PRIBIO, "newbuf", 0);
		splx(s);
		return (0);
	}

	/* if we are a delayed write, convert to an async write */
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_BUSY;
		bawrite (bp);
		goto start;
	}

	if(bp->b_vp)
		brelvp(bp);

	/* we are not free, nor do we contain interesting data */
	if (bp->b_rcred != NOCRED)
		crfree(bp->b_rcred);
	if (bp->b_wcred != NOCRED)
		crfree(bp->b_wcred);
fillbuf:
	bp->b_flags = B_BUSY;
	LIST_REMOVE(bp, b_hash);
	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	splx(s);
	bp->b_dev = NODEV;
	bp->b_vp = NULL;
	bp->b_blkno = bp->b_lblkno = 0;
	bp->b_iodone = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	bp->b_bcount = 0;
	bp->b_wcred = bp->b_rcred = NOCRED;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	return (bp);
}

/*
 * Check to see if a block is currently memory resident.
 */
struct buf *
incore(struct vnode *vp, daddr_t blkno)
{
	struct buf *bp;
	struct bufhashhdr *bh;

	int s = splbio();

	bh = BUFHASH(vp, blkno);
	bp = bh->lh_first;

	/* Search hash chain */
	while (bp) {
#ifdef DEBUG
		if( (bp < buf) || (bp >= buf + nbuf)) {
			printf("incore: buf out of range: %p, hash: %d\n",
				bp, bh - bufhashtbl);
			panic("incore: buf fault");
		}
#endif
		/* hit */
		if (bp->b_lblkno == blkno && bp->b_vp == vp
			&& (bp->b_flags & B_INVAL) == 0) {
			splx(s);
			return (bp);
		}
		bp = bp->b_hash.le_next;
	}
	splx(s);

	return(0);
}

/*
 * Get a block given a specified block and offset into a file/device.
 */
475getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo)
476{
477	struct buf *bp;
478	int s;
479	struct bufhashhdr *bh;
480
481	s = splbio();
482loop:
483	if ((bp = incore(vp, blkno))) {
484		if (bp->b_flags & B_BUSY) {
485			bp->b_flags |= B_WANTED;
486			tsleep ((caddr_t)bp, PRIBIO, "getblk", 0);
487			goto loop;
488		}
489		bp->b_flags |= B_BUSY | B_CACHE;
490		bremfree(bp);
491		/*
492		 * check for size inconsistancies
493		 */
494		if (bp->b_bcount != size) {
495			printf("getblk: invalid buffer size: %ld\n", bp->b_bcount);
496			bp->b_flags |= B_INVAL;
497			bwrite(bp);
498			goto loop;
499		}
500	} else {
501		if ((bp = getnewbuf(0, 0)) == 0)
502			goto loop;
503		bp->b_blkno = bp->b_lblkno = blkno;
504		bgetvp(vp, bp);
505		LIST_REMOVE(bp, b_hash);
506		bh = BUFHASH(vp, blkno);
507		LIST_INSERT_HEAD(bh, bp, b_hash);
508		allocbuf(bp, size);
509	}
510	splx(s);
511	return (bp);
512}
513
/*
 * Get an empty, disassociated buffer of given size.
 */
struct buf *
geteblk(int size)
{
	struct buf *bp;
	while ((bp = getnewbuf(0, 0)) == 0)
		;
	allocbuf(bp, size);
	bp->b_flags |= B_INVAL;
	return (bp);
}

/*
 * Modify the length of a buffer's underlying buffer storage without
 * destroying information (unless, of course, the buffer is shrinking).
 */
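/*
 * The backing store is always a whole number of pages: growing the buffer
 * wires new pages into its VA range with vm_hold_load_pages(), while
 * shrinking it releases pages with vm_hold_free_pages().
 */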
void
allocbuf(struct buf *bp, int size)
{

	int newbsize = round_page(size);

	if( newbsize == bp->b_bufsize) {
		bp->b_bcount = size;
		return;
	} else if( newbsize < bp->b_bufsize) {
		vm_hold_free_pages(
			(vm_offset_t) bp->b_data + newbsize,
			(vm_offset_t) bp->b_data + bp->b_bufsize);
	} else if( newbsize > bp->b_bufsize) {
		vm_hold_load_pages(
			(vm_offset_t) bp->b_data + bp->b_bufsize,
			(vm_offset_t) bp->b_data + newbsize);
	}

	/* adjust buffer cache's idea of memory allocated to buffer contents */
	freebufspace -= newbsize - bp->b_bufsize;
	allocbufspace += newbsize - bp->b_bufsize;

	bp->b_bufsize = newbsize;
	bp->b_bcount = size;
}

/*
 * Wait for buffer I/O completion, returning error status.
 */
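/*
 * On error the buffer is invalidated and moved onto the invalid hash list
 * so that its contents are never handed back out of the cache.
 */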
int
biowait(register struct buf *bp)
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		tsleep((caddr_t)bp, PRIBIO, "biowait", 0);
	if((bp->b_flags & B_ERROR) || bp->b_error) {
		if ((bp->b_flags & B_INVAL) == 0) {
			bp->b_flags |= B_INVAL;
			bp->b_dev = NODEV;
			LIST_REMOVE(bp, b_hash);
			LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		}
		if (!bp->b_error)
			bp->b_error = EIO;
		else
			bp->b_flags |= B_ERROR;
		splx(s);
		return (bp->b_error);
	} else {
		splx(s);
		return (0);
	}
}

/*
 * Finish I/O on a buffer, calling an optional function.
 * This is usually called from interrupt level, so process blocking
 * is not *a good idea*.
 */
void
biodone(register struct buf *bp)
{
	int s;
	s = splbio();
	bp->b_flags |= B_DONE;

	if ((bp->b_flags & B_READ) == 0)  {
		vwakeup(bp);
	}

#ifdef BOUNCE_BUFFERS
	if (bp->b_flags & B_BOUNCE)
		vm_bounce_free(bp);
#endif

	/* call optional completion function if requested */
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		splx(s);
		return;
	}

	/*
	 * For asynchronous completions, release the buffer now.  The brelse
	 * checks for B_WANTED and will do the wakeup there if necessary -
	 * so no need to do a wakeup here in the async case.
	 */
	if (bp->b_flags & B_ASYNC) {
		brelse(bp);
	} else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t) bp);
	}
	splx(s);
}

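/*
 * Return the number of buffers sitting on the locked queue.
 */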
int
count_lock_queue()
{
	int count;
	struct buf *bp;

	count = 0;
	for(bp = bufqueues[QUEUE_LOCKED].tqh_first;
	    bp != NULL;
	    bp = bp->b_freelist.tqe_next)
		count++;
	return(count);
}

int vfs_update_interval = 30;

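/*
 * Body of the update daemon: roughly every vfs_update_interval seconds,
 * or sooner when something wakes up vfs_update_wakeup, sync the file
 * systems.
 */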
void
vfs_update() {
	(void) spl0();
	while(1) {
		tsleep((caddr_t)&vfs_update_wakeup, PRIBIO, "update",
			hz * vfs_update_interval);
		vfs_update_wakeup = 0;
		sync(curproc, NULL, NULL);
	}
}

#if 0
#define MAXFREEBP 128
#define LDFREE_BUSY 1
#define LDFREE_WANT 2
int loadfreeing;
struct buf *freebp[MAXFREEBP];
#endif
/*
 * these routines are not in the correct place (yet)
 * also they work *ONLY* for kernel_pmap!!!
 */
void
vm_hold_load_pages(vm_offset_t froma, vm_offset_t toa) {
	vm_offset_t pg;
	vm_page_t p;
	vm_offset_t from = round_page(froma);
	vm_offset_t to = round_page(toa);

	for(pg = from ; pg < to ; pg += PAGE_SIZE) {

	tryagain:
#if 0
/*
 * don't allow buffer cache to cause VM paging
 */
		if ( cnt.v_free_count < cnt.v_free_min) {
			if( !loadfreeing ) {
				int n=0;
				struct buf *bp;
				loadfreeing = LDFREE_BUSY;
				while( (cnt.v_free_count <= cnt.v_free_min) &&
					(n < MAXFREEBP)) {
					bp = geteblk(0);
					if( bp)
						freebp[n++] = bp;
					else
						break;
				}
				while(--n >= 0) {
					brelse(freebp[n]);
				}
				if( loadfreeing & LDFREE_WANT)
					wakeup((caddr_t) &loadfreeing);
				loadfreeing = 0;
			} else {
				loadfreeing |= LDFREE_WANT;
				tsleep(&loadfreeing, PRIBIO, "biofree", 0);
			}
		}
#endif
		if (cnt.v_free_count <=
			cnt.v_free_reserved + (toa-froma) / PAGE_SIZE) {
			VM_WAIT;
			goto tryagain;
		}

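		/*
		 * Allocate a page for this kernel address and wire it in;
		 * if no page is available, wait for the pageout daemon and
		 * retry.
		 */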
		p = vm_page_alloc(kernel_object, pg - VM_MIN_KERNEL_ADDRESS);
		if( !p) {
			VM_WAIT;
			goto tryagain;
		}

		vm_page_wire(p);
		pmap_kenter( pg, VM_PAGE_TO_PHYS(p));
	}
}

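/*
 * Free the physical pages backing the kernel VA range [froma, toa) and
 * remove their mappings; the counterpart of vm_hold_load_pages() above.
 */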
void
vm_hold_free_pages(vm_offset_t froma, vm_offset_t toa)
{
	vm_offset_t pg;
	vm_page_t p;
	vm_offset_t from = round_page(froma);
	vm_offset_t to = round_page(toa);

	for(pg = from ; pg < to ; pg += PAGE_SIZE) {
		p = PHYS_TO_VM_PAGE( pmap_kextract( pg));
		pmap_kremove( pg);
		vm_page_free(p);
	}
}

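/*
 * Buffer statistics reporting; currently an empty stub.
 */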
void
bufstats()
{
}