vfs_bio.c revision 3098
/*
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 *
 * $Id: vfs_bio.c,v 1.11 1994/08/31 06:17:37 davidg Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <vm/vm.h>
#include <vm/vm_pageout.h>

#include <miscfs/specfs/specdev.h>

struct	buf *buf;		/* buffer header pool */
int	nbuf;			/* number of buffer headers calculated elsewhere */
struct swqueue bswlist;
struct	buf *bclnlist;		/* Head of cleaned page list. */

extern	vm_map_t buffer_map, io_map;

void vm_hold_free_pages(vm_offset_t from, vm_offset_t to);
void vm_hold_load_pages(vm_offset_t from, vm_offset_t to);

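/*
 * Set by getnewbuf() when a process is waiting for a free buffer;
 * cleared (and the sleeper awakened) in brelse().
 */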
int needsbuffer;

/*
 * Internal update daemon, process 3
 *	The variable vfs_update_wakeup allows for internal syncs.
 */
int vfs_update_wakeup;

/*
 * Initialize buffer headers and related structures.
 */
void
bufinit()
{
	struct buf *bp;
	int i;

	TAILQ_INIT(&bswlist);
	LIST_INIT(&invalhash);

	/* first, make a null hash table */
	for(i=0;i<BUFHSZ;i++)
		LIST_INIT(&bufhashtbl[i]);

	/* next, make a null set of free lists */
	for(i=0;i<BUFFER_QUEUES;i++)
		TAILQ_INIT(&bufqueues[i]);

	/* finally, initialize each buffer header and stick on empty q */
	for(i=0;i<nbuf;i++) {
		bp = &buf[i];
		bzero(bp, sizeof *bp);
		bp->b_flags = B_INVAL;	/* we're just an empty header */
		bp->b_dev = NODEV;
		bp->b_vp = NULL;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_qindex = QUEUE_EMPTY;
		bp->b_vnbufs.le_next = NOLIST;
		bp->b_data = (caddr_t)kmem_alloc_pageable(buffer_map, MAXBSIZE);
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	}
}

/*
 * remove the buffer from the appropriate free list
 */
void
bremfree(struct buf *bp)
{
	int s = splbio();
	if( bp->b_qindex != QUEUE_NONE) {
		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
		bp->b_qindex = QUEUE_NONE;
	} else {
		panic("bremfree: removing a buffer when not on a queue");
	}
	splx(s);
}

/*
 * Get a buffer with the specified data.  Look in the cache first.
 */
int
bread(struct vnode *vp, daddr_t blkno, int size, struct ucred *cred,
	struct buf **bpp)
{
	struct buf *bp;

	bp = getblk (vp, blkno, size, 0, 0);
	*bpp = bp;

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc && curproc->p_stats)	/* count block I/O */
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
		if( bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);
		return( biowait (bp));
	}

	return (0);
}
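
/*
 * Usage sketch (illustrative; "vp", "lbn" and "size" are caller-supplied):
 * read a logical block, inspect the data, then release the buffer.
 *
 *	struct buf *bp;
 *	int error;
 *
 *	error = bread(vp, lbn, size, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	... use bp->b_data ...
 *	brelse(bp);
 */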

/*
 * Operates like bread, but also starts asynchronous I/O on
 * read-ahead blocks.
 */
int
breadn(struct vnode *vp, daddr_t blkno, int size,
	daddr_t *rablkno, int *rabsize,
	int cnt, struct ucred *cred, struct buf **bpp)
{
	struct buf *bp, *rabp;
	int i;
	int rv = 0, readwait = 0;

	*bpp = bp = getblk (vp, blkno, size, 0, 0);

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc && curproc->p_stats)	/* count block I/O */
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
		if( bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);
		++readwait;
	}

	for(i=0;i<cnt;i++, rablkno++, rabsize++) {
		if( incore(vp, *rablkno)) {
			continue;
		}
		rabp = getblk (vp, *rablkno, *rabsize, 0, 0);

		if ((rabp->b_flags & B_CACHE) == 0) {
			if (curproc && curproc->p_stats)
				curproc->p_stats->p_ru.ru_inblock++;
			rabp->b_flags |= B_READ | B_ASYNC;
			rabp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
			if( rabp->b_rcred == NOCRED) {
				if (cred != NOCRED)
					crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
		} else {
			brelse(rabp);
		}
	}

	if( readwait) {
		rv = biowait (bp);
	}

	return (rv);
}

/*
 * Write, release buffer on completion.  (Done by iodone
 * if async.)
 */
int
bwrite(struct buf *bp)
{
	int oldflags = bp->b_flags;

	if(bp->b_flags & B_INVAL) {
		brelse(bp);
		return (0);
	}

	if(!(bp->b_flags & B_BUSY))
		panic("bwrite: buffer is not busy???");

	bp->b_flags &= ~(B_READ|B_DONE|B_ERROR|B_DELWRI);
	bp->b_flags |= B_WRITEINPROG;

	if (oldflags & B_ASYNC) {
		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		} else if( curproc) {
			++curproc->p_stats->p_ru.ru_oublock;
		}
	}

	bp->b_vp->v_numoutput++;
	VOP_STRATEGY(bp);

	if( (oldflags & B_ASYNC) == 0) {
		int rtval = biowait(bp);
		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		} else if( curproc) {
			++curproc->p_stats->p_ru.ru_oublock;
		}
		brelse(bp);
		return (rtval);
	}

	return(0);
}

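/*
 * Implement the VOP_BWRITE vnode operation in terms of bwrite().
 */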
int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return (bwrite(ap->a_bp));
}

/*
 * Delayed write. (Buffer is marked dirty).
 */
void
bdwrite(struct buf *bp)
{

	if((bp->b_flags & B_BUSY) == 0) {
		panic("bdwrite: buffer is not busy");
	}

	if(bp->b_flags & B_INVAL) {
		brelse(bp);
		return;
	}

	if(bp->b_flags & B_TAPE) {
		bawrite(bp);
		return;
	}

	bp->b_flags &= ~B_READ;
	if( (bp->b_flags & B_DELWRI) == 0) {
		if( curproc)
			++curproc->p_stats->p_ru.ru_oublock;
		bp->b_flags |= B_DONE|B_DELWRI;
		reassignbuf(bp, bp->b_vp);
	}
	brelse(bp);
	return;
}

/*
 * Asynchronous write.
 * Start output on a buffer, but do not wait for it to complete.
 * The buffer is released when the output completes.
 */
void
bawrite(struct buf *bp)
{
	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}

/*
 * Release a buffer.
 */
void
brelse(struct buf *bp)
{
	int x;

	/* anyone need a "free" block? */
	x=splbio();
	if (needsbuffer) {
		needsbuffer = 0;
		wakeup((caddr_t)&needsbuffer);
	}

	/* anyone need this block? */
	if (bp->b_flags & B_WANTED) {
		bp->b_flags &= ~(B_WANTED|B_AGE);
		wakeup((caddr_t)bp);
	}

	if (bp->b_flags & B_LOCKED)
		bp->b_flags &= ~B_ERROR;

	if ((bp->b_flags & (B_NOCACHE|B_INVAL|B_ERROR)) ||
		(bp->b_bufsize <= 0)) {
		bp->b_flags |= B_INVAL;
		bp->b_flags &= ~(B_DELWRI|B_CACHE);
		if(bp->b_vp)
			brelvp(bp);
	}

	if( bp->b_qindex != QUEUE_NONE)
		panic("brelse: free buffer onto another queue???");

	/* enqueue */
	/* buffers with no memory */
	if(bp->b_bufsize == 0) {
		bp->b_qindex = QUEUE_EMPTY;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	/* buffers with junk contents */
	} else if(bp->b_flags & (B_ERROR|B_INVAL|B_NOCACHE)) {
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	/* buffers that are locked */
	} else if(bp->b_flags & B_LOCKED) {
		bp->b_qindex = QUEUE_LOCKED;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
	/* buffers with stale but valid contents */
	} else if(bp->b_flags & B_AGE) {
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
	/* buffers with valid and quite potentially reusable contents */
	} else {
		bp->b_qindex = QUEUE_LRU;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
	}

	/* unlock */
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_NOCACHE|B_AGE);
	splx(x);
}

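/*
 * Accounting of the memory backing buffer contents; both counters are
 * adjusted in allocbuf() as buffers grow and shrink.
 */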
int freebufspace;
int allocbufspace;

/*
 * Find a buffer header which is available for use.
 */
struct buf *
getnewbuf(int slpflag, int slptimeo)
{
	struct buf *bp;
	int s;
	s = splbio();
start:
	/* can we constitute a new buffer? */
	if ((bp = bufqueues[QUEUE_EMPTY].tqh_first)) {
		if( bp->b_qindex != QUEUE_EMPTY)
			panic("getnewbuf: inconsistent EMPTY queue");
		bremfree(bp);
		goto fillbuf;
	}

	if ((bp = bufqueues[QUEUE_AGE].tqh_first)) {
		if( bp->b_qindex != QUEUE_AGE)
			panic("getnewbuf: inconsistent AGE queue");
		bremfree(bp);
	} else if ((bp = bufqueues[QUEUE_LRU].tqh_first)) {
		if( bp->b_qindex != QUEUE_LRU)
			panic("getnewbuf: inconsistent LRU queue");
		bremfree(bp);
	} else	{
		/* wait for a free buffer of any kind */
		needsbuffer = 1;
		tsleep((caddr_t)&needsbuffer, PRIBIO, "newbuf", 0);
		splx(s);
		return (0);
	}


	/* if we are a delayed write, convert to an async write */
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_BUSY;
		bawrite (bp);
		goto start;
	}

	if(bp->b_vp)
		brelvp(bp);

	/* we are not free, nor do we contain interesting data */
	if (bp->b_rcred != NOCRED)
		crfree(bp->b_rcred);
	if (bp->b_wcred != NOCRED)
		crfree(bp->b_wcred);
fillbuf:
	bp->b_flags = B_BUSY;
	LIST_REMOVE(bp, b_hash);
	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	splx(s);
	bp->b_dev = NODEV;
	bp->b_vp = NULL;
	bp->b_blkno = bp->b_lblkno = 0;
	bp->b_iodone = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	bp->b_bcount = 0;
	bp->b_wcred = bp->b_rcred = NOCRED;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	return (bp);
}

/*
 * Check to see if a block is currently memory resident.
 */
struct buf *
incore(struct vnode *vp, daddr_t blkno)
{
	struct buf *bp;
	struct bufhashhdr *bh;

	int s = splbio();

	bh = BUFHASH(vp, blkno);
	bp = bh->lh_first;

	/* Search hash chain */
	while (bp) {
		if( (bp < buf) || (bp >= buf + nbuf)) {
			printf("incore: buf out of range: %p, hash: %d\n",
				bp, bh - bufhashtbl);
			panic("incore: buf fault");
		}
		/* hit */
		if (bp->b_lblkno == blkno && bp->b_vp == vp
			&& (bp->b_flags & B_INVAL) == 0) {
			splx(s);
			return (bp);
		}
		bp = bp->b_hash.le_next;
	}
	splx(s);

	return(0);
}

/*
 * Get a block given a specified block and offset into a file/device.
 */
struct buf *
getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo)
{
	struct buf *bp;
	int s;
	struct bufhashhdr *bh;

	s = splbio();
loop:
	if ((bp = incore(vp, blkno))) {
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			tsleep ((caddr_t)bp, PRIBIO, "getblk", 0);
			goto loop;
		}
		bp->b_flags |= B_BUSY | B_CACHE;
		bremfree(bp);
		/*
		 * check for size inconsistencies
		 */
		if (bp->b_bcount != size) {
			printf("getblk: invalid buffer size: %ld\n", bp->b_bcount);
			bp->b_flags |= B_INVAL;
			bwrite(bp);
			goto loop;
		}
	} else {

		if ((bp = getnewbuf(0, 0)) == 0)
			goto loop;
		allocbuf(bp, size);
		/*
		 * have to check again, because of a possible
		 * race condition.
		 */
		if (incore( vp, blkno)) {
			allocbuf(bp, 0);
			bp->b_flags |= B_INVAL;
			brelse(bp);
			goto loop;
		}
		bp->b_blkno = bp->b_lblkno = blkno;
		bgetvp(vp, bp);
		LIST_REMOVE(bp, b_hash);
		bh = BUFHASH(vp, blkno);
		LIST_INSERT_HEAD(bh, bp, b_hash);
	}
	splx(s);
	return (bp);
}

/*
 * Get an empty, disassociated buffer of given size.
 */
struct buf *
geteblk(int size)
{
	struct buf *bp;
	while ((bp = getnewbuf(0, 0)) == 0)
		;
	allocbuf(bp, size);
	bp->b_flags |= B_INVAL;
	return (bp);
}
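
/*
 * Usage sketch (illustrative; "nbytes" is caller-supplied): obtain scratch
 * space not associated with any vnode, then release it with brelse().
 *
 *	struct buf *bp;
 *
 *	bp = geteblk(nbytes);
 *	... use bp->b_data as temporary storage ...
 *	brelse(bp);
 */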

/*
 * Modify the length of a buffer's underlying buffer storage without
 * destroying information (unless, of course, the buffer is shrinking).
 */
void
allocbuf(struct buf *bp, int size)
{

	int newbsize = round_page(size);

	if( newbsize == bp->b_bufsize) {
		bp->b_bcount = size;
		return;
	} else if( newbsize < bp->b_bufsize) {
		vm_hold_free_pages(
			(vm_offset_t) bp->b_data + newbsize,
			(vm_offset_t) bp->b_data + bp->b_bufsize);
	} else if( newbsize > bp->b_bufsize) {
		vm_hold_load_pages(
			(vm_offset_t) bp->b_data + bp->b_bufsize,
			(vm_offset_t) bp->b_data + newbsize);
	}

	/* adjust buffer cache's idea of memory allocated to buffer contents */
	freebufspace -= newbsize - bp->b_bufsize;
	allocbufspace += newbsize - bp->b_bufsize;

	bp->b_bufsize = newbsize;
	bp->b_bcount = size;
}

/*
 * Wait for buffer I/O completion, returning error status.
 */
int
biowait(register struct buf *bp)
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		tsleep((caddr_t)bp, PRIBIO, "biowait", 0);
	if((bp->b_flags & B_ERROR) || bp->b_error) {
		if ((bp->b_flags & B_INVAL) == 0) {
			bp->b_flags |= B_INVAL;
			bp->b_dev = NODEV;
			LIST_REMOVE(bp, b_hash);
			LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		}
		if (!bp->b_error)
			bp->b_error = EIO;
		else
			bp->b_flags |= B_ERROR;
		splx(s);
		return (bp->b_error);
	} else {
		splx(s);
		return (0);
	}
}

/*
 * Finish I/O on a buffer, calling an optional function.
 * This is usually called from interrupt level, so process blocking
 * is not *a good idea*.
 */
void
biodone(register struct buf *bp)
{
	int s;
	s = splbio();
	bp->b_flags |= B_DONE;

	if ((bp->b_flags & B_READ) == 0)  {
		vwakeup(bp);
	}

#ifdef BOUNCE_BUFFERS
	if (bp->b_flags & B_BOUNCE)
		vm_bounce_free(bp);
#endif

	/* call optional completion function if requested */
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		splx(s);
		return;
	}

/*
 * For asynchronous completions, release the buffer now. The brelse
 *	checks for B_WANTED and will do the wakeup there if necessary -
 *	so no need to do a wakeup here in the async case.
 */

	if (bp->b_flags & B_ASYNC) {
		brelse(bp);
	} else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t) bp);
	}
	splx(s);
}

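/*
 * Return the number of buffers on the locked queue.
 */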
int
count_lock_queue()
{
	int count;
	struct buf *bp;

	count = 0;
	for(bp = bufqueues[QUEUE_LOCKED].tqh_first;
	    bp != NULL;
	    bp = bp->b_freelist.tqe_next)
		count++;
	return(count);
}

int vfs_update_interval = 30;

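/*
 * Loop forever syncing the filesystems every vfs_update_interval seconds,
 * or sooner when awakened via vfs_update_wakeup.
 */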
void
vfs_update()
{
	(void) spl0();
	while(1) {
		tsleep((caddr_t)&vfs_update_wakeup, PRIBIO, "update",
			hz * vfs_update_interval);
		vfs_update_wakeup = 0;
		sync(curproc, NULL, NULL);
	}
}

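/*
 * State for reclaiming buffer-cache pages when free memory runs low:
 * vm_hold_load_pages() grabs up to MAXFREEBP empty buffers (recycling
 * them releases their pages) and then puts them back.  LDFREE_BUSY marks
 * a reclaim in progress; LDFREE_WANT marks sleepers to wake when it ends.
 */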
#define MAXFREEBP 128
#define LDFREE_BUSY 1
#define LDFREE_WANT 2
int loadfreeing;
struct buf *freebp[MAXFREEBP];
/*
 * these routines are not in the correct place (yet)
 * also they work *ONLY* for kernel_pmap!!!
 */
void
vm_hold_load_pages(vm_offset_t froma, vm_offset_t toa)
{
	vm_offset_t pg;
	vm_page_t p;
	vm_offset_t from = round_page(froma);
	vm_offset_t to = round_page(toa);

	for(pg = from ; pg < to ; pg += PAGE_SIZE) {

	tryagain:
/*
 * don't allow buffer cache to cause VM paging
 */
		if ( cnt.v_free_count < cnt.v_free_min) {
			if( !loadfreeing ) {
				int n=0;
				struct buf *bp;
				loadfreeing = LDFREE_BUSY;
				while( (cnt.v_free_count <= cnt.v_free_min) &&
					(n < MAXFREEBP)) {
					bp = geteblk(0);
					if( bp)
						freebp[n++] = bp;
					else
						break;
				}
				while(--n >= 0) {
					brelse(freebp[n]);
				}
				if( loadfreeing & LDFREE_WANT)
					wakeup((caddr_t) &loadfreeing);
				loadfreeing = 0;
			} else {
				loadfreeing |= LDFREE_WANT;
				tsleep(&loadfreeing, PRIBIO, "biofree", 0);
			}
		}


		if (cnt.v_free_count <=
			cnt.v_free_reserved + (toa-froma) / PAGE_SIZE) {
			VM_WAIT;
			goto tryagain;
		}

		p =  vm_page_alloc(kernel_object, pg - VM_MIN_KERNEL_ADDRESS);
		if( !p) {
			VM_WAIT;
			goto tryagain;
		}

		vm_page_wire(p);
		pmap_kenter( pg, VM_PAGE_TO_PHYS(p));
	}
}

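/*
 * Unmap and free the pages backing the given kernel address range
 * (kernel_pmap only, as noted above).
 */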
void
vm_hold_free_pages(vm_offset_t froma, vm_offset_t toa)
{
	vm_offset_t pg;
	vm_page_t p;
	vm_offset_t from = round_page(froma);
	vm_offset_t to = round_page(toa);

	for(pg = from ; pg < to ; pg += PAGE_SIZE) {
		p = PHYS_TO_VM_PAGE( pmap_kextract( pg));
		pmap_kremove( pg);
		vm_page_free(p);
	}
}

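/*
 * Buffer statistics reporting hook; currently empty.
 */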
void
bufstats()
{
}
