vfs_bio.c revision 1549
/*
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <vm/vm.h>
#include <vm/vm_pageout.h>

#include <miscfs/specfs/specdev.h>

struct	buf *buf;		/* the buffer pool itself */
int	nbuf;			/* number of buffer headers */
int	bufpages;		/* number of memory pages in the buffer pool */
struct	buf *swbuf;		/* swap I/O headers */
int	nswbuf;			/* number of swap I/O headers */
#define BUFHSZ 512
int bufhash = BUFHSZ - 1;

struct buf *getnewbuf(int, int);
extern	vm_map_t buffer_map, io_map;
void vm_hold_free_pages(vm_offset_t from, vm_offset_t to);
void vm_hold_load_pages(vm_offset_t from, vm_offset_t to);
/*
 * Definitions for the buffer hash lists.
 */
#define	BUFHASH(dvp, lbn)	\
	(&bufhashtbl[((int)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
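
/*
 * Note: the hash index above is masked with bufhash (== BUFHSZ - 1), so
 * BUFHSZ must be a power of two for BUFHASH() to stay within bufhashtbl[].
 */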

/*
 * Definitions for the buffer free lists.
 */
#define	BQUEUES		5		/* number of free buffer queues */

LIST_HEAD(bufhashhdr, buf) bufhashtbl[BUFHSZ], invalhash;
TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];

#define	BQ_NONE		0	/* on no queue */
#define	BQ_LOCKED	1	/* locked buffers */
#define	BQ_LRU		2	/* useful buffers */
#define	BQ_AGE		3	/* less useful buffers */
#define	BQ_EMPTY	4	/* empty buffer headers */

int needsbuffer;	/* non-zero when a process is waiting for a free buffer */

/*
 * Internal update daemon, process 3.
 *	The variable vfs_update_wakeup allows for internal syncs.
 */
int vfs_update_wakeup;
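
/*
 * A sketch of how an internal sync can be requested (nothing in this file
 * does this; the snippet is illustrative only): another part of the kernel
 * wakes the update daemon on its sleep channel,
 *
 *	vfs_update_wakeup = 1;
 *	wakeup((caddr_t)&vfs_update_wakeup);
 *
 * and vfs_update() below then runs sync() without waiting for the full
 * update interval to elapse.
 */
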
/*
 * Initialize buffer headers and related structures.
 */
void
bufinit()
{
	struct buf *bp;
	int i;

	TAILQ_INIT(&bswlist);
	LIST_INIT(&invalhash);

	/* first, make a null hash table */
	for(i=0;i<BUFHSZ;i++)
		LIST_INIT(&bufhashtbl[i]);

	/* next, make a null set of free lists */
	for(i=0;i<BQUEUES;i++)
		TAILQ_INIT(&bufqueues[i]);

	/* finally, initialize each buffer header and stick on empty q */
	for(i=0;i<nbuf;i++) {
		bp = &buf[i];
		bzero(bp, sizeof *bp);
		bp->b_flags = B_INVAL;	/* we're just an empty header */
		bp->b_dev = NODEV;
		bp->b_vp = NULL;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_qindex = BQ_EMPTY;
		bp->b_vnbufs.le_next = NOLIST;
		bp->b_data = (caddr_t)kmem_alloc_pageable(buffer_map, MAXBSIZE);
		TAILQ_INSERT_TAIL(&bufqueues[BQ_EMPTY], bp, b_freelist);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	}
}
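
/*
 * Note that bufinit() only reserves MAXBSIZE of pageable kernel virtual
 * address space per buffer header (via kmem_alloc_pageable); physical
 * pages are attached and detached later by allocbuf() through
 * vm_hold_load_pages()/vm_hold_free_pages() as buffers are resized.
 */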

/*
 * Remove the buffer from the appropriate free list.
 */
void
bremfree(struct buf *bp)
{
	int s = splbio();
	if( bp->b_qindex != BQ_NONE) {
		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
		bp->b_qindex = BQ_NONE;
	} else {
		panic("bremfree: removing a buffer when not on a queue");
	}
	splx(s);
}

/*
 * Get a buffer with the specified data.  Look in the cache first.
 */
int
bread(struct vnode *vp, daddr_t blkno, int size, struct ucred *cred,
	struct buf **bpp)
{
	struct buf *bp;

	bp = getblk (vp, blkno, size, 0, 0);
	*bpp = bp;

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc && curproc->p_stats)	/* count block I/O */
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
		if( bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);
		return( biowait (bp));
	}

	return (0);
}
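
/*
 * Typical caller usage (a sketch only; vp, lbn, and bsize are illustrative
 * names, not variables defined in this file):
 *
 *	struct buf *bp;
 *	int error;
 *
 *	error = bread(vp, lbn, bsize, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	... examine or copy bp->b_data ...
 *	brelse(bp);
 *
 * bread() fills in *bpp even when the read fails, so the caller is
 * responsible for releasing the buffer in either case.
 */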

/*
 * Operates like bread, but also starts asynchronous I/O on
 * read-ahead blocks.
 */
int
breadn(struct vnode *vp, daddr_t blkno, int size,
	daddr_t *rablkno, int *rabsize,
	int cnt, struct ucred *cred, struct buf **bpp)
{
	struct buf *bp, *rabp;
	int i;
	int rv = 0, readwait = 0;

	*bpp = bp = getblk (vp, blkno, size, 0, 0);

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc && curproc->p_stats)	/* count block I/O */
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
		if( bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);
		++readwait;
	}

	/* start asynchronous reads on any read-ahead blocks not in core */
	for(i=0;i<cnt;i++, rablkno++, rabsize++) {
		if( incore(vp, *rablkno)) {
			continue;
		}
		rabp = getblk (vp, *rablkno, *rabsize, 0, 0);

		if ((rabp->b_flags & B_CACHE) == 0) {
			if (curproc && curproc->p_stats)
				curproc->p_stats->p_ru.ru_inblock++;
			rabp->b_flags |= B_READ | B_ASYNC;
			rabp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
			if( rabp->b_rcred == NOCRED) {
				if (cred != NOCRED)
					crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
		} else {
			brelse(rabp);
		}
	}

	if( readwait) {
		rv = biowait (bp);
	}

	return (rv);
}

/*
 * Write, release buffer on completion.  (Done by iodone
 * if async.)
 */
int
bwrite(struct buf *bp)
{
	int oldflags = bp->b_flags;

	if(bp->b_flags & B_INVAL) {
		brelse(bp);
		return (0);
	}

	if(!(bp->b_flags & B_BUSY))
		panic("bwrite: buffer is not busy???");

	bp->b_flags &= ~(B_READ|B_DONE|B_ERROR|B_DELWRI);
	bp->b_flags |= B_WRITEINPROG;

	if (oldflags & B_ASYNC) {
		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		} else if( curproc) {
			++curproc->p_stats->p_ru.ru_oublock;
		}
	}

	bp->b_vp->v_numoutput++;
	VOP_STRATEGY(bp);

	if( (oldflags & B_ASYNC) == 0) {
		int rtval = biowait(bp);
		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		} else if( curproc) {
			++curproc->p_stats->p_ru.ru_oublock;
		}
		brelse(bp);
		return (rtval);
	}

	return(0);
}

int
vn_bwrite(struct vop_bwrite_args *ap)
{
	return (bwrite(ap->a_bp));
}

/*
 * Delayed write. (Buffer is marked dirty).
 */
void
bdwrite(struct buf *bp)
{

	if((bp->b_flags & B_BUSY) == 0) {
		panic("bdwrite: buffer is not busy");
	}

	if(bp->b_flags & B_INVAL) {
		brelse(bp);
		return;
	}

	if(bp->b_flags & B_TAPE) {
		bawrite(bp);
		return;
	}

	bp->b_flags &= ~B_READ;
	if( (bp->b_flags & B_DELWRI) == 0) {
		if( curproc)
			++curproc->p_stats->p_ru.ru_oublock;
		bp->b_flags |= B_DONE|B_DELWRI;
		reassignbuf(bp, bp->b_vp);
	}
	brelse(bp);
	return;
}
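
/*
 * Typical delayed-write usage (a sketch only; the call sequence is
 * illustrative, not code from this file):
 *
 *	bp = getblk(vp, lbn, bsize, 0, 0);	(or bread())
 *	... modify bp->b_data ...
 *	bdwrite(bp);
 *
 * The buffer is marked B_DELWRI and released without starting I/O; the
 * write happens later, for example when getnewbuf() reclaims the buffer
 * or when the update daemon syncs.
 */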

/*
 * Asynchronous write.
 * Start output on a buffer, but do not wait for it to complete.
 * The buffer is released when the output completes.
 */
void
bawrite(struct buf *bp)
{
	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}

/*
 * Release a buffer.
 */
void
brelse(struct buf *bp)
{
	int x;

	/* anyone need a "free" block? */
	x=splbio();
	if (needsbuffer) {
		needsbuffer = 0;
		wakeup((caddr_t)&needsbuffer);
	}
	/* anyone need this very block? */
	if (bp->b_flags & B_WANTED) {
		bp->b_flags &= ~(B_WANTED|B_AGE);
		wakeup((caddr_t)bp);
	}

	if (bp->b_flags & B_LOCKED)
		bp->b_flags &= ~B_ERROR;

	if ((bp->b_flags & (B_NOCACHE|B_INVAL|B_ERROR)) ||
		(bp->b_bufsize <= 0)) {
		bp->b_flags |= B_INVAL;
		bp->b_flags &= ~(B_DELWRI|B_CACHE);
		if(bp->b_vp)
			brelvp(bp);
	}

	if( bp->b_qindex != BQ_NONE)
		panic("brelse: free buffer onto another queue???");

	/* enqueue */
	/* buffers with junk contents */
	if(bp->b_bufsize == 0) {
		bp->b_qindex = BQ_EMPTY;
		TAILQ_INSERT_HEAD(&bufqueues[BQ_EMPTY], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	/* buffers whose contents are no longer valid */
	} else if(bp->b_flags & (B_ERROR|B_INVAL|B_NOCACHE)) {
		bp->b_qindex = BQ_AGE;
		TAILQ_INSERT_HEAD(&bufqueues[BQ_AGE], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	/* buffers that are locked */
	} else if(bp->b_flags & B_LOCKED) {
		bp->b_qindex = BQ_LOCKED;
		TAILQ_INSERT_TAIL(&bufqueues[BQ_LOCKED], bp, b_freelist);
	/* buffers with stale but valid contents */
	} else if(bp->b_flags & B_AGE) {
		bp->b_qindex = BQ_AGE;
		TAILQ_INSERT_TAIL(&bufqueues[BQ_AGE], bp, b_freelist);
	/* buffers with valid and quite potentially reusable contents */
	} else {
		bp->b_qindex = BQ_LRU;
		TAILQ_INSERT_TAIL(&bufqueues[BQ_LRU], bp, b_freelist);
	}

	/* unlock */
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_NOCACHE|B_AGE);
	splx(x);
}

int freebufspace;		/* unallocated buffer space */
int allocbufspace;		/* space currently allocated to buffer contents */

/*
 * Find a buffer header which is available for use.
 */
struct buf *
getnewbuf(int slpflag, int slptimeo)
{
	struct buf *bp;
	int x;
	x = splbio();
start:
	/* can we constitute a new buffer? */
	if (bp = bufqueues[BQ_EMPTY].tqh_first) {
		if( bp->b_qindex != BQ_EMPTY)
			panic("getnewbuf: inconsistent EMPTY queue");
		bremfree(bp);
		goto fillbuf;
	}

	/* otherwise steal the least useful buffer: AGE first, then LRU */
tryfree:
	if (bp = bufqueues[BQ_AGE].tqh_first) {
		if( bp->b_qindex != BQ_AGE)
			panic("getnewbuf: inconsistent AGE queue");
		bremfree(bp);
	} else if (bp = bufqueues[BQ_LRU].tqh_first) {
		if( bp->b_qindex != BQ_LRU)
			panic("getnewbuf: inconsistent LRU queue");
		bremfree(bp);
	} else	{
		/* wait for a free buffer of any kind */
		needsbuffer = 1;
		tsleep((caddr_t)&needsbuffer, PRIBIO | slpflag, "newbuf", slptimeo);
		splx(x);
		return (0);
	}

	/* if we are a delayed write, convert to an async write */
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_BUSY;
		bawrite (bp);
		goto start;
	}

	if(bp->b_vp)
		brelvp(bp);

	/* we are not free, nor do we contain interesting data */
	if (bp->b_rcred != NOCRED)
		crfree(bp->b_rcred);
	if (bp->b_wcred != NOCRED)
		crfree(bp->b_wcred);
fillbuf:
	bp->b_flags = B_BUSY;
	LIST_REMOVE(bp, b_hash);
	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	splx(x);
	bp->b_dev = NODEV;
	bp->b_vp = NULL;
	bp->b_blkno = bp->b_lblkno = 0;
	bp->b_iodone = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	bp->b_bcount = 0;
	bp->b_wcred = bp->b_rcred = NOCRED;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	return (bp);
}

/*
 * Check to see if a block is currently memory resident.
 */
struct buf *
incore(struct vnode *vp, daddr_t blkno)
{
	struct buf *bp;
	struct bufhashhdr *bh;

	int s = splbio();

	bh = BUFHASH(vp, blkno);
	bp = bh->lh_first;

	/* Search hash chain */
	while (bp) {
		if( (bp < buf) || (bp >= buf + nbuf)) {
			printf("incore: buf out of range: %lx, hash: %d\n",
				(u_long)bp, (int)(bh - bufhashtbl));
			panic("incore: buf fault");
		}
		/* hit -- restore the spl before returning the buffer */
		if (bp->b_lblkno == blkno && bp->b_vp == vp
			&& (bp->b_flags & B_INVAL) == 0) {
			splx(s);
			return (bp);
		}
		bp = bp->b_hash.le_next;
	}
	splx(s);

	return(0);
}

/*
 * Get a block given a specified block and offset into a file/device.
 */
struct buf *
getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo)
{
	struct buf *bp;
	int x;
	struct bufhashhdr *bh;

	x = splbio();
loop:
	if (bp = incore(vp, blkno)) {
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			tsleep ((caddr_t)bp, PRIBIO, "getblk", 0);
			goto loop;
		}
		bp->b_flags |= B_BUSY | B_CACHE;
		bremfree(bp);
		/*
		 * check for size inconsistencies
		 */
		if (bp->b_bcount != size) {
			printf("getblk: invalid buffer size: %d\n", bp->b_bcount);
			bp->b_flags |= B_INVAL;
			bwrite(bp);
			goto loop;
		}
	} else {
		if ((bp = getnewbuf(0, 0)) == 0)
			goto loop;
		allocbuf(bp, size);
		/*
		 * have to check again, because of a possible
		 * race condition.
		 */
		if (incore( vp, blkno)) {
			allocbuf(bp, 0);
			bp->b_flags |= B_INVAL;
			brelse(bp);
			goto loop;
		}
		bp->b_blkno = bp->b_lblkno = blkno;
		bgetvp(vp, bp);
		LIST_REMOVE(bp, b_hash);
		bh = BUFHASH(vp, blkno);
		LIST_INSERT_HEAD(bh, bp, b_hash);
	}
	splx(x);
	return (bp);
}
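
/*
 * Note on getblk(): the buffer returned is busy.  B_CACHE is set only
 * when the block was already resident, so callers typically test B_CACHE
 * to decide whether the contents still need to be read or generated
 * before use (bread() above is one example of this pattern).
 */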

/*
 * Get an empty, disassociated buffer of given size.
 */
struct buf *
geteblk(int size)
{
	struct buf *bp;

	while ((bp = getnewbuf(0, 0)) == 0)
		;
	allocbuf(bp, size);
	bp->b_flags |= B_INVAL;
	return (bp);
}

/*
 * Modify the length of a buffer's underlying buffer storage without
 * destroying information (unless, of course, the buffer is shrinking).
 */
void
allocbuf(struct buf *bp, int size)
{
	int newbsize = round_page(size);

	if( newbsize == bp->b_bufsize) {
		bp->b_bcount = size;
		return;
	} else if( newbsize < bp->b_bufsize) {
		vm_hold_free_pages(
			(vm_offset_t) bp->b_data + newbsize,
			(vm_offset_t) bp->b_data + bp->b_bufsize);
	} else if( newbsize > bp->b_bufsize) {
		vm_hold_load_pages(
			(vm_offset_t) bp->b_data + bp->b_bufsize,
			(vm_offset_t) bp->b_data + newbsize);
	}

	/* adjust buffer cache's idea of memory allocated to buffer contents */
	freebufspace -= newbsize - bp->b_bufsize;
	allocbufspace += newbsize - bp->b_bufsize;

	bp->b_bufsize = newbsize;
	bp->b_bcount = size;
}
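
/*
 * Example of the rounding behavior (assuming 4K pages; that page size is
 * illustrative, not something this file requires): allocbuf(bp, 6144)
 * rounds the request up to 8192 bytes of backing storage, so b_bufsize
 * becomes 8192 while b_bcount remains 6144.  Shrinking works the same way
 * in reverse, with the surplus pages released through vm_hold_free_pages().
 */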

/*
 * Wait for buffer I/O completion, returning error status.
 */
int
biowait(register struct buf *bp)
{
	int x;

	x = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		tsleep((caddr_t)bp, PRIBIO, "biowait", 0);
	if((bp->b_flags & B_ERROR) || bp->b_error) {
		if ((bp->b_flags & B_INVAL) == 0) {
			bp->b_flags |= B_INVAL;
			bp->b_dev = NODEV;
			LIST_REMOVE(bp, b_hash);
			LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		}
		if (!bp->b_error)
			bp->b_error = EIO;
		else
			bp->b_flags |= B_ERROR;
		splx(x);
		return (bp->b_error);
	} else {
		splx(x);
		return (0);
	}
}

/*
 * Finish I/O on a buffer, calling an optional function.
 * This is usually called from interrupt level, so process blocking
 * is not *a good idea*.
 */
void
biodone(register struct buf *bp)
{
	int s;

	s = splbio();
	bp->b_flags |= B_DONE;

	if ((bp->b_flags & B_READ) == 0) {
		vwakeup(bp);
	}

	/* call optional completion function if requested */
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		splx(s);
		return;
	}

	/*
	 * For asynchronous completions, release the buffer now.  The brelse
	 * checks for B_WANTED and will do the wakeup there if necessary -
	 * so no need to do a wakeup here in the async case.
	 */
	if (bp->b_flags & B_ASYNC) {
		brelse(bp);
	} else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t) bp);
	}
	splx(s);
}

/*
 * Return the number of buffers on the LOCKED queue.
 */
int
count_lock_queue()
{
	int count;
	struct buf *bp;

	count = 0;
	for(bp = bufqueues[BQ_LOCKED].tqh_first;
	    bp != NULL;
	    bp = bp->b_freelist.tqe_next)
		count++;
	return(count);
}

#ifndef UPDATE_INTERVAL
int vfs_update_interval = 30;	/* seconds between periodic syncs */
#else
int vfs_update_interval = UPDATE_INTERVAL;
#endif

void
vfs_update()
{
	(void) spl0();
	while(1) {
		tsleep((caddr_t)&vfs_update_wakeup, PRIBIO, "update",
			hz * vfs_update_interval);
		vfs_update_wakeup = 0;
		sync(curproc, NULL, NULL);
	}
}

/*
 * These routines are not in the correct place (yet).
 * They also work *only* for the kernel_pmap.
 */
void
vm_hold_load_pages(vm_offset_t froma, vm_offset_t toa)
{
	vm_offset_t pg;
	vm_page_t p;
	vm_offset_t from = round_page(froma);
	vm_offset_t to = round_page(toa);

	for(pg = from ; pg < to ; pg += PAGE_SIZE) {
	tryagain:
		p = vm_page_alloc(kernel_object, pg - VM_MIN_KERNEL_ADDRESS);
		if( !p) {
			VM_WAIT;
			goto tryagain;
		}

		vm_page_wire(p);
		pmap_enter(kernel_pmap, pg, VM_PAGE_TO_PHYS(p),
			VM_PROT_READ|VM_PROT_WRITE, 1);
	}
}

void
vm_hold_free_pages(vm_offset_t froma, vm_offset_t toa)
{
	vm_offset_t pg;
	vm_page_t p;
	vm_offset_t from = round_page(froma);
	vm_offset_t to = round_page(toa);

	for(pg = from ; pg < to ; pg += PAGE_SIZE) {
		vm_offset_t pa;

		pa = pmap_kextract(pg);
		if( !pa) {
			printf("No pa for va: %x\n", pg);
		} else {
			p = PHYS_TO_VM_PAGE( pa);
			pmap_remove(kernel_pmap, pg, pg + PAGE_SIZE);
			vm_page_free(p);
		}
	}
}

void
bufstats()
{
}