/*
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 *
 * $Id: vfs_bio.c,v 1.8 1994/08/08 15:40:59 wollman Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <vm/vm.h>
#include <vm/vm_pageout.h>

#include <miscfs/specfs/specdev.h>

struct	buf *buf;		/* buffer header pool */
int	nbuf;			/* number of buffer headers calculated elsewhere */
struct swqueue bswlist;
struct	buf *bclnlist;		/* Head of cleaned page list. */

extern	vm_map_t buffer_map, io_map;

void vm_hold_free_pages(vm_offset_t from, vm_offset_t to);
void vm_hold_load_pages(vm_offset_t from, vm_offset_t to);

int needsbuffer;

/*
 * Internal update daemon, process 3
 *	The variable vfs_update_wakeup allows for internal syncs.
 */
int vfs_update_wakeup;

/*
 * Initialize buffer headers and related structures.
 */
void bufinit()
{
	struct buf *bp;
	int i;

	TAILQ_INIT(&bswlist);
	LIST_INIT(&invalhash);

	/* first, make a null hash table */
	for(i=0;i<BUFHSZ;i++)
		LIST_INIT(&bufhashtbl[i]);

	/* next, make a null set of free lists */
	for(i=0;i<BUFFER_QUEUES;i++)
		TAILQ_INIT(&bufqueues[i]);

	/* finally, initialize each buffer header and stick on empty q */
	for(i=0;i<nbuf;i++) {
		bp = &buf[i];
		bzero(bp, sizeof *bp);
		bp->b_flags = B_INVAL;	/* we're just an empty header */
		bp->b_dev = NODEV;
		bp->b_vp = NULL;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_qindex = QUEUE_EMPTY;
		bp->b_vnbufs.le_next = NOLIST;
		bp->b_data = (caddr_t)kmem_alloc_pageable(buffer_map, MAXBSIZE);
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	}
}

/*
 * remove the buffer from the appropriate free list
 */
void
bremfree(struct buf *bp)
{
	int s = splbio();
	if( bp->b_qindex != QUEUE_NONE) {
		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
		bp->b_qindex = QUEUE_NONE;
	} else {
		panic("bremfree: removing a buffer when not on a queue");
	}
	splx(s);
}

/*
 * Get a buffer with the specified data.  Look in the cache first.
 */
int
bread(struct vnode *vp, daddr_t blkno, int size, struct ucred *cred,
	struct buf **bpp)
{
	struct buf *bp;

	bp = getblk (vp, blkno, size, 0, 0);
	*bpp = bp;

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc && curproc->p_stats)	/* count block I/O */
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
		if( bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);
		return( biowait (bp));
	}

	return (0);
}
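
/*
 * Typical calling pattern (a sketch only; "vp", "lbn", "bsize" and "cred"
 * stand in for whatever the caller already has at hand):
 *
 *	struct buf *bp;
 *	int error;
 *
 *	error = bread(vp, lbn, bsize, cred, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	... examine bp->b_data ...
 *	brelse(bp);
 */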

/*
 * Operates like bread, but also starts asynchronous I/O on
 * read-ahead blocks.
 */
int
breadn(struct vnode *vp, daddr_t blkno, int size,
	daddr_t *rablkno, int *rabsize,
	int cnt, struct ucred *cred, struct buf **bpp)
{
	struct buf *bp, *rabp;
	int i;
	int rv = 0, readwait = 0;

	*bpp = bp = getblk (vp, blkno, size, 0, 0);

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc && curproc->p_stats)	/* count block I/O */
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
		if( bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);
		++readwait;
	}

	for(i=0;i<cnt;i++, rablkno++, rabsize++) {
		if( incore(vp, *rablkno)) {
			continue;
		}
		rabp = getblk (vp, *rablkno, *rabsize, 0, 0);

		if ((rabp->b_flags & B_CACHE) == 0) {
			if (curproc && curproc->p_stats)
				curproc->p_stats->p_ru.ru_inblock++;
			rabp->b_flags |= B_READ | B_ASYNC;
			rabp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
			if( rabp->b_rcred == NOCRED) {
				if (cred != NOCRED)
					crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
		} else {
			brelse(rabp);
		}
	}

	if( readwait) {
		rv = biowait (bp);
	}

	return (rv);
}

/*
 * Write, release buffer on completion.  (Done by iodone
 * if async.)
 */
int
bwrite(struct buf *bp)
{
	int oldflags = bp->b_flags;

	if(bp->b_flags & B_INVAL) {
		brelse(bp);
		return (0);
	}

	if(!(bp->b_flags & B_BUSY))
		panic("bwrite: buffer is not busy???");

	bp->b_flags &= ~(B_READ|B_DONE|B_ERROR|B_DELWRI);
	bp->b_flags |= B_WRITEINPROG;

	if (oldflags & B_ASYNC) {
		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		} else if( curproc) {
			++curproc->p_stats->p_ru.ru_oublock;
		}
	}

	bp->b_vp->v_numoutput++;
	VOP_STRATEGY(bp);

	if( (oldflags & B_ASYNC) == 0) {
		int rtval = biowait(bp);
		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		} else if( curproc) {
			++curproc->p_stats->p_ru.ru_oublock;
		}
		brelse(bp);
		return (rtval);
	}

	return(0);
}

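/*
 * VOP_BWRITE implementation: hand the buffer to bwrite().
 */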
int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return (bwrite(ap->a_bp));
}

/*
 * Delayed write. (Buffer is marked dirty).
 */
void
bdwrite(struct buf *bp)
{

	if((bp->b_flags & B_BUSY) == 0) {
		panic("bdwrite: buffer is not busy");
	}

	if(bp->b_flags & B_INVAL) {
		brelse(bp);
		return;
	}

	if(bp->b_flags & B_TAPE) {
		bawrite(bp);
		return;
	}

	bp->b_flags &= ~B_READ;
	if( (bp->b_flags & B_DELWRI) == 0) {
		if( curproc)
			++curproc->p_stats->p_ru.ru_oublock;
		bp->b_flags |= B_DONE|B_DELWRI;
		reassignbuf(bp, bp->b_vp);
	}
	brelse(bp);
	return;
}
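
/*
 * Sketch of a read-modify-write sequence using the delayed-write path
 * (a hypothetical caller; "vp", "lbn", "bsize" and "cred" are placeholders):
 *
 *	struct buf *bp;
 *
 *	if (bread(vp, lbn, bsize, cred, &bp)) {
 *		brelse(bp);
 *		return;
 *	}
 *	... modify bp->b_data ...
 *	bdwrite(bp);	(marks the buffer dirty and releases it)
 */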

/*
 * Asynchronous write.
 * Start output on a buffer, but do not wait for it to complete.
 * The buffer is released when the output completes.
 */
void
bawrite(struct buf *bp)
{
	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}

/*
 * Release a buffer.
 */
void
brelse(struct buf *bp)
{
	int x;

	/* anyone need a "free" block? */
	x=splbio();
	if (needsbuffer) {
		needsbuffer = 0;
		wakeup((caddr_t)&needsbuffer);
	}

	/* anyone need this block? */
	if (bp->b_flags & B_WANTED) {
		bp->b_flags &= ~(B_WANTED|B_AGE);
		wakeup((caddr_t)bp);
	}

	if (bp->b_flags & B_LOCKED)
		bp->b_flags &= ~B_ERROR;

	if ((bp->b_flags & (B_NOCACHE|B_INVAL|B_ERROR)) ||
		(bp->b_bufsize <= 0)) {
		bp->b_flags |= B_INVAL;
		bp->b_flags &= ~(B_DELWRI|B_CACHE);
		if(bp->b_vp)
			brelvp(bp);
	}

	if( bp->b_qindex != QUEUE_NONE)
		panic("brelse: free buffer onto another queue???");

	/* enqueue */
	/* buffers with no memory */
	if(bp->b_bufsize == 0) {
		bp->b_qindex = QUEUE_EMPTY;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	/* buffers with junk contents */
	} else if(bp->b_flags & (B_ERROR|B_INVAL|B_NOCACHE)) {
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	/* buffers that are locked */
	} else if(bp->b_flags & B_LOCKED) {
		bp->b_qindex = QUEUE_LOCKED;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
	/* buffers with stale but valid contents */
	} else if(bp->b_flags & B_AGE) {
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
	/* buffers with valid and quite potentially reusable contents */
	} else {
		bp->b_qindex = QUEUE_LRU;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
	}

	/* unlock */
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_NOCACHE|B_AGE);
	splx(x);
}

int freebufspace;
int allocbufspace;

/*
 * Find a buffer header which is available for use.
 */
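/*
 * Note: if no free buffer is immediately available, this routine sleeps
 * on "needsbuffer" and then returns NULL, so callers must be prepared to
 * retry.  The slpflag and slptimeo arguments are currently unused.
 */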
struct buf *
getnewbuf(int slpflag, int slptimeo)
{
	struct buf *bp;
	int s;
	s = splbio();
start:
	/* can we constitute a new buffer? */
	if (bp = bufqueues[QUEUE_EMPTY].tqh_first) {
		if( bp->b_qindex != QUEUE_EMPTY)
			panic("getnewbuf: inconsistent EMPTY queue");
		bremfree(bp);
		goto fillbuf;
	}

tryfree:
	if (bp = bufqueues[QUEUE_AGE].tqh_first) {
		if( bp->b_qindex != QUEUE_AGE)
			panic("getnewbuf: inconsistent AGE queue");
		bremfree(bp);
	} else if (bp = bufqueues[QUEUE_LRU].tqh_first) {
		if( bp->b_qindex != QUEUE_LRU)
			panic("getnewbuf: inconsistent LRU queue");
		bremfree(bp);
	} else	{
		/* wait for a free buffer of any kind */
		needsbuffer = 1;
		tsleep((caddr_t)&needsbuffer, PRIBIO, "newbuf", 0);
		splx(s);
		return (0);
	}

	/* if we are a delayed write, convert to an async write */
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_BUSY;
		bawrite (bp);
		goto start;
	}

	if(bp->b_vp)
		brelvp(bp);

	/* we are not free, nor do we contain interesting data */
	if (bp->b_rcred != NOCRED)
		crfree(bp->b_rcred);
	if (bp->b_wcred != NOCRED)
		crfree(bp->b_wcred);
fillbuf:
	bp->b_flags = B_BUSY;
	LIST_REMOVE(bp, b_hash);
	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	splx(s);
	bp->b_dev = NODEV;
	bp->b_vp = NULL;
	bp->b_blkno = bp->b_lblkno = 0;
	bp->b_iodone = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	bp->b_bcount = 0;
	bp->b_wcred = bp->b_rcred = NOCRED;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	return (bp);
}

/*
 * Check to see if a block is currently memory resident.
 */
struct buf *
incore(struct vnode *vp, daddr_t blkno)
{
	struct buf *bp;
	struct bufhashhdr *bh;

	int s = splbio();

	bh = BUFHASH(vp, blkno);
	bp = bh->lh_first;

	/* Search hash chain */
	while (bp) {
		if( (bp < buf) || (bp >= buf + nbuf)) {
			printf("incore: buf out of range: %lx, hash: %d\n",
				bp, bh - bufhashtbl);
			panic("incore: buf fault");
		}
		/* hit */
		if (bp->b_lblkno == blkno && bp->b_vp == vp
			&& (bp->b_flags & B_INVAL) == 0) {
			splx(s);
			return (bp);
		}
		bp = bp->b_hash.le_next;
	}
	splx(s);

	return(0);
}

/*
 * Get a block given a specified block and offset into a file/device.
 */
struct buf *
getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo)
{
	struct buf *bp;
	int s;
	struct bufhashhdr *bh;

	s = splbio();
loop:
	if (bp = incore(vp, blkno)) {
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			tsleep ((caddr_t)bp, PRIBIO, "getblk", 0);
			goto loop;
		}
		bp->b_flags |= B_BUSY | B_CACHE;
		bremfree(bp);
		/*
		 * check for size inconsistencies
		 */
		if (bp->b_bcount != size) {
			printf("getblk: invalid buffer size: %d\n", bp->b_bcount);
			bp->b_flags |= B_INVAL;
			bwrite(bp);
			goto loop;
		}
	} else {

		if ((bp = getnewbuf(0, 0)) == 0)
			goto loop;
		allocbuf(bp, size);
		/*
		 * have to check again, because of a possible
		 * race condition.
		 */
		if (incore( vp, blkno)) {
			allocbuf(bp, 0);
			bp->b_flags |= B_INVAL;
			brelse(bp);
			goto loop;
		}
		bp->b_blkno = bp->b_lblkno = blkno;
		bgetvp(vp, bp);
		LIST_REMOVE(bp, b_hash);
		bh = BUFHASH(vp, blkno);
		LIST_INSERT_HEAD(bh, bp, b_hash);
	}
	splx(s);
	return (bp);
}

/*
 * Get an empty, disassociated buffer of given size.
 */
struct buf *
geteblk(int size)
{
	struct buf *bp;
	while ((bp = getnewbuf(0, 0)) == 0)
		;
	allocbuf(bp, size);
	bp->b_flags |= B_INVAL;
	return (bp);
}

/*
 * Modify the length of a buffer's underlying buffer storage without
 * destroying information (unless, of course, the buffer is shrinking).
 */
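/*
 * Note: the requested size is rounded up to a page boundary; the backing
 * pages are wired kernel memory obtained and released through
 * vm_hold_load_pages() and vm_hold_free_pages().
 */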
void
allocbuf(struct buf *bp, int size)
{

	int newbsize = round_page(size);

	if( newbsize == bp->b_bufsize) {
		bp->b_bcount = size;
		return;
	} else if( newbsize < bp->b_bufsize) {
		vm_hold_free_pages(
			(vm_offset_t) bp->b_data + newbsize,
			(vm_offset_t) bp->b_data + bp->b_bufsize);
	} else if( newbsize > bp->b_bufsize) {
		vm_hold_load_pages(
			(vm_offset_t) bp->b_data + bp->b_bufsize,
			(vm_offset_t) bp->b_data + newbsize);
	}

	/* adjust buffer cache's idea of memory allocated to buffer contents */
	freebufspace -= newbsize - bp->b_bufsize;
	allocbufspace += newbsize - bp->b_bufsize;

	bp->b_bufsize = newbsize;
	bp->b_bcount = size;
}

/*
 * Wait for buffer I/O completion, returning error status.
 */
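/*
 * Note: on error the buffer is also marked B_INVAL and moved onto the
 * invalid-buffer hash chain, so it will not be found again by incore().
 */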
int
biowait(register struct buf *bp)
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		tsleep((caddr_t)bp, PRIBIO, "biowait", 0);
	if((bp->b_flags & B_ERROR) || bp->b_error) {
		if ((bp->b_flags & B_INVAL) == 0) {
			bp->b_flags |= B_INVAL;
			bp->b_dev = NODEV;
			LIST_REMOVE(bp, b_hash);
			LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		}
		if (!bp->b_error)
			bp->b_error = EIO;
		else
			bp->b_flags |= B_ERROR;
		splx(s);
		return (bp->b_error);
	} else {
		splx(s);
		return (0);
	}
}

/*
 * Finish I/O on a buffer, calling an optional function.
 * This is usually called from interrupt level, so process blocking
 * is not *a good idea*.
 */
void
biodone(register struct buf *bp)
{
	int s;
	s = splbio();
	bp->b_flags |= B_DONE;

	if ((bp->b_flags & B_READ) == 0)  {
		vwakeup(bp);
	}

	if (bp->b_flags & B_BOUNCE)
		vm_bounce_free(bp);

	/* call optional completion function if requested */
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		splx(s);
		return;
	}

	/*
	 * For asynchronous completions, release the buffer now. The brelse
	 * checks for B_WANTED and will do the wakeup there if necessary -
	 * so no need to do a wakeup here in the async case.
	 */
	if (bp->b_flags & B_ASYNC) {
		brelse(bp);
	} else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t) bp);
	}
	splx(s);
}

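/*
 * Return the number of buffers currently on the locked-buffer queue.
 */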
int
count_lock_queue()
{
	int count;
	struct buf *bp;

	count = 0;
	for(bp = bufqueues[QUEUE_LOCKED].tqh_first;
	    bp != NULL;
	    bp = bp->b_freelist.tqe_next)
		count++;
	return(count);
}

int vfs_update_interval = 30;

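/*
 * Main loop of the update daemon: sleep for vfs_update_interval seconds
 * (or until vfs_update_wakeup is signalled) and then sync the file systems.
 */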
void
vfs_update() {
	(void) spl0();
	while(1) {
		tsleep((caddr_t)&vfs_update_wakeup, PRIBIO, "update",
			hz * vfs_update_interval);
		vfs_update_wakeup = 0;
		sync(curproc, NULL, NULL);
	}
}

/*
 * these routines are not in the correct place (yet)
 * also they work *ONLY* for kernel_pmap!!!
 */
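/*
 * vm_hold_load_pages() wires freshly allocated pages into the kernel
 * address range [from, to); vm_hold_free_pages() unmaps and frees the
 * pages backing that range.
 */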
void
vm_hold_load_pages(vm_offset_t froma, vm_offset_t toa) {
	vm_offset_t pg;
	vm_page_t p;
	vm_offset_t from = round_page(froma);
	vm_offset_t to = round_page(toa);

	for(pg = from ; pg < to ; pg += PAGE_SIZE) {
		vm_offset_t pa;

	tryagain:
		if (cnt.v_free_count <= cnt.v_free_reserved) {
			VM_WAIT;
			goto tryagain;
		}

		p =  vm_page_alloc(kernel_object, pg - VM_MIN_KERNEL_ADDRESS);
		if( !p) {
			VM_WAIT;
			goto tryagain;
		}

		vm_page_wire(p);
		pmap_kenter( pg, VM_PAGE_TO_PHYS(p));
	}
}

void
vm_hold_free_pages(vm_offset_t froma, vm_offset_t toa) {
	vm_offset_t pg;
	vm_page_t p;
	vm_offset_t from = round_page(froma);
	vm_offset_t to = round_page(toa);

	for(pg = from ; pg < to ; pg += PAGE_SIZE) {
		p = PHYS_TO_VM_PAGE( pmap_kextract( pg));
		pmap_kremove( pg);
		vm_page_free(p);
	}
}

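/*
 * Placeholder for buffer cache statistics reporting; currently a no-op.
 */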
void
bufstats()
{
}