/*
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 *
 * $Id: vfs_bio.c,v 1.6 1994/08/06 09:15:28 davidg Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <vm/vm.h>
#include <vm/vm_pageout.h>

#include <miscfs/specfs/specdev.h>

struct	buf *buf;		/* buffer header pool */
int	nbuf;			/* number of buffer headers calculated elsewhere */

extern	vm_map_t buffer_map, io_map;

void vm_hold_free_pages(vm_offset_t from, vm_offset_t to);
void vm_hold_load_pages(vm_offset_t from, vm_offset_t to);

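/*
 * Flag used by getnewbuf(): set before sleeping when no free buffer of
 * any kind is available, cleared (and the sleeper awakened) by brelse().
 */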
int needsbuffer;

/*
 * Internal update daemon, process 3
 *	The variable vfs_update_wakeup allows for internal syncs.
 */
int vfs_update_wakeup;

/*
 * Initialize buffer headers and related structures.
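 * Each buffer header is given MAXBSIZE worth of pageable kernel VA from
 * buffer_map; physical pages are attached later, on demand, by allocbuf().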
 */
void bufinit()
{
	struct buf *bp;
	int i;

	TAILQ_INIT(&bswlist);
	LIST_INIT(&invalhash);

	/* first, make a null hash table */
	for(i=0;i<BUFHSZ;i++)
		LIST_INIT(&bufhashtbl[i]);

	/* next, make a null set of free lists */
	for(i=0;i<BUFFER_QUEUES;i++)
		TAILQ_INIT(&bufqueues[i]);

	/* finally, initialize each buffer header and stick on empty q */
	for(i=0;i<nbuf;i++) {
		bp = &buf[i];
		bzero(bp, sizeof *bp);
		bp->b_flags = B_INVAL;	/* we're just an empty header */
		bp->b_dev = NODEV;
		bp->b_vp = NULL;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_qindex = QUEUE_EMPTY;
		bp->b_vnbufs.le_next = NOLIST;
		bp->b_data = (caddr_t)kmem_alloc_pageable(buffer_map, MAXBSIZE);
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	}
}

/*
 * remove the buffer from the appropriate free list
 */
void
bremfree(struct buf *bp)
{
	int s = splbio();
	if( bp->b_qindex != QUEUE_NONE) {
		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
		bp->b_qindex = QUEUE_NONE;
	} else {
		panic("bremfree: removing a buffer when not on a queue");
	}
	splx(s);
}

/*
 * Get a buffer with the specified data.  Look in the cache first.
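 * The buffer is returned locked (B_BUSY) and must later be released with
 * brelse(), bdwrite(), or bwrite().  Returns zero, or the error from
 * biowait() if I/O was needed and failed.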
 */
int
bread(struct vnode *vp, daddr_t blkno, int size, struct ucred *cred,
	struct buf **bpp)
{
	struct buf *bp;

	bp = getblk (vp, blkno, size, 0, 0);
	*bpp = bp;

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc && curproc->p_stats)	/* count block I/O */
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
		if( bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);
		return( biowait (bp));
	}

	return (0);
}

/*
 * Operates like bread, but also starts asynchronous I/O on
 * read-ahead blocks.
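 * rablkno and rabsize are arrays of cnt read-ahead block numbers and sizes.
 * Read-ahead blocks already in core are released at once; the rest are
 * started with B_ASYNC, so biodone() releases them when the I/O completes.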
 */
int
breadn(struct vnode *vp, daddr_t blkno, int size,
	daddr_t *rablkno, int *rabsize,
	int cnt, struct ucred *cred, struct buf **bpp)
{
	struct buf *bp, *rabp;
	int i;
	int rv = 0, readwait = 0;

	*bpp = bp = getblk (vp, blkno, size, 0, 0);

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc && curproc->p_stats)	/* count block I/O */
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
		if( bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);
		++readwait;
	}

	for(i=0;i<cnt;i++, rablkno++, rabsize++) {
		if( incore(vp, *rablkno)) {
			continue;
		}
		rabp = getblk (vp, *rablkno, *rabsize, 0, 0);

		if ((rabp->b_flags & B_CACHE) == 0) {
			if (curproc && curproc->p_stats)
				curproc->p_stats->p_ru.ru_inblock++;
			rabp->b_flags |= B_READ | B_ASYNC;
			rabp->b_flags &= ~(B_DONE|B_ERROR|B_INVAL);
			if( rabp->b_rcred == NOCRED) {
				if (cred != NOCRED)
					crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
		} else {
			brelse(rabp);
		}
	}

	if( readwait) {
		rv = biowait (bp);
	}

	return (rv);
}

/*
 * Write, release buffer on completion.  (Done by iodone
 * if async.)
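 * For a synchronous write the error from biowait() is returned and the
 * buffer is released here.  A buffer that was a delayed write (B_DELWRI)
 * was already charged to the process when it was first dirtied, so it is
 * only reassigned rather than charged to ru_oublock again.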
 */
int
bwrite(struct buf *bp)
{
	int oldflags = bp->b_flags;

	if(bp->b_flags & B_INVAL) {
		brelse(bp);
		return (0);
	}

	if(!(bp->b_flags & B_BUSY))
		panic("bwrite: buffer is not busy???");

	bp->b_flags &= ~(B_READ|B_DONE|B_ERROR|B_DELWRI);
	bp->b_flags |= B_WRITEINPROG;

	if (oldflags & B_ASYNC) {
		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		} else if( curproc) {
			++curproc->p_stats->p_ru.ru_oublock;
		}
	}

	bp->b_vp->v_numoutput++;
	VOP_STRATEGY(bp);

	if( (oldflags & B_ASYNC) == 0) {
		int rtval = biowait(bp);
		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		} else if( curproc) {
			++curproc->p_stats->p_ru.ru_oublock;
		}
		brelse(bp);
		return (rtval);
	}

	return(0);
}

int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return (bwrite(ap->a_bp));
}

/*
 * Delayed write. (Buffer is marked dirty).
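 * The buffer is marked B_DELWRI and released without doing the I/O; the
 * data is written out later, either when the update daemon syncs or when
 * getnewbuf() reclaims the buffer and converts it to an asynchronous write.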
 */
void
bdwrite(struct buf *bp)
{

	if((bp->b_flags & B_BUSY) == 0) {
		panic("bdwrite: buffer is not busy");
	}

	if(bp->b_flags & B_INVAL) {
		brelse(bp);
		return;
	}

	if(bp->b_flags & B_TAPE) {
		bawrite(bp);
		return;
	}

	bp->b_flags &= ~B_READ;
	if( (bp->b_flags & B_DELWRI) == 0) {
		if( curproc)
			++curproc->p_stats->p_ru.ru_oublock;
		bp->b_flags |= B_DONE|B_DELWRI;
		reassignbuf(bp, bp->b_vp);
	}
	brelse(bp);
	return;
}

/*
 * Asynchronous write.
 * Start output on a buffer, but do not wait for it to complete.
 * The buffer is released when the output completes.
 */
void
bawrite(struct buf *bp)
{
	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}

/*
 * Release a buffer.
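 * Wakes up any process sleeping in getnewbuf() (needsbuffer) and any
 * process waiting for this particular buffer (B_WANTED), then puts the
 * buffer on the appropriate free queue.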
 */
void
brelse(struct buf *bp)
{
	int x;

	/* anyone need a "free" block? */
	x=splbio();
	if (needsbuffer) {
		needsbuffer = 0;
		wakeup((caddr_t)&needsbuffer);
	}

	/* anyone need this block? */
	if (bp->b_flags & B_WANTED) {
		bp->b_flags &= ~(B_WANTED|B_AGE);
		wakeup((caddr_t)bp);
	}

	if (bp->b_flags & B_LOCKED)
		bp->b_flags &= ~B_ERROR;

	if ((bp->b_flags & (B_NOCACHE|B_INVAL|B_ERROR)) ||
		(bp->b_bufsize <= 0)) {
		bp->b_flags |= B_INVAL;
		bp->b_flags &= ~(B_DELWRI|B_CACHE);
		if(bp->b_vp)
			brelvp(bp);
	}

	if( bp->b_qindex != QUEUE_NONE)
		panic("brelse: free buffer onto another queue???");

	/* enqueue */
	/* buffers with no memory */
	if(bp->b_bufsize == 0) {
		bp->b_qindex = QUEUE_EMPTY;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	/* buffers with junk contents */
	} else if(bp->b_flags & (B_ERROR|B_INVAL|B_NOCACHE)) {
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
	/* buffers that are locked */
	} else if(bp->b_flags & B_LOCKED) {
		bp->b_qindex = QUEUE_LOCKED;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
	/* buffers with stale but valid contents */
	} else if(bp->b_flags & B_AGE) {
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
	/* buffers with valid and quite potentially reusable contents */
	} else {
		bp->b_qindex = QUEUE_LRU;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
	}

	/* unlock */
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_NOCACHE|B_AGE);
	splx(x);
}

int freebufspace;
int allocbufspace;

/*
 * Find a buffer header which is available for use.
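 * The free queues are searched in order: EMPTY, then AGE, then LRU.  If
 * the chosen buffer is a delayed write, it is pushed out asynchronously
 * and the search restarts.  If nothing at all is free, the caller sleeps
 * on needsbuffer and NULL is returned; the caller must retry.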
 */
struct buf *
getnewbuf(int slpflag, int slptimeo)
{
	struct buf *bp;
	int s;
	s = splbio();
start:
	/* can we constitute a new buffer? */
	if (bp = bufqueues[QUEUE_EMPTY].tqh_first) {
		if( bp->b_qindex != QUEUE_EMPTY)
			panic("getnewbuf: inconsistent EMPTY queue");
		bremfree(bp);
		goto fillbuf;
	}

tryfree:
	if (bp = bufqueues[QUEUE_AGE].tqh_first) {
		if( bp->b_qindex != QUEUE_AGE)
			panic("getnewbuf: inconsistent AGE queue");
		bremfree(bp);
	} else if (bp = bufqueues[QUEUE_LRU].tqh_first) {
		if( bp->b_qindex != QUEUE_LRU)
			panic("getnewbuf: inconsistent LRU queue");
		bremfree(bp);
	} else	{
		/* wait for a free buffer of any kind */
		needsbuffer = 1;
		tsleep((caddr_t)&needsbuffer, PRIBIO, "newbuf", 0);
		splx(s);
		return (0);
	}


	/* if we are a delayed write, convert to an async write */
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_BUSY;
		bawrite (bp);
		goto start;
	}

	if(bp->b_vp)
		brelvp(bp);

	/* we are not free, nor do we contain interesting data */
	if (bp->b_rcred != NOCRED)
		crfree(bp->b_rcred);
	if (bp->b_wcred != NOCRED)
		crfree(bp->b_wcred);
fillbuf:
	bp->b_flags = B_BUSY;
	LIST_REMOVE(bp, b_hash);
	LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	splx(s);
	bp->b_dev = NODEV;
	bp->b_vp = NULL;
	bp->b_blkno = bp->b_lblkno = 0;
	bp->b_iodone = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	bp->b_bcount = 0;
	bp->b_wcred = bp->b_rcred = NOCRED;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	return (bp);
}

/*
 * Check to see if a block is currently memory resident.
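 * Returns the buffer header if the block is found on its hash chain and
 * has not been invalidated, otherwise NULL.  The buffer is not locked.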
 */
struct buf *
incore(struct vnode *vp, daddr_t blkno)
{
	struct buf *bp;
	struct bufhashhdr *bh;

	int s = splbio();

	bh = BUFHASH(vp, blkno);
	bp = bh->lh_first;

	/* Search hash chain */
	while (bp) {
		if( (bp < buf) || (bp >= buf + nbuf)) {
			printf("incore: buf out of range: %lx, hash: %d\n",
				bp, bh - bufhashtbl);
			panic("incore: buf fault");
		}
		/* hit */
		if (bp->b_lblkno == blkno && bp->b_vp == vp
			&& (bp->b_flags & B_INVAL) == 0) {
			splx(s);
			return (bp);
		}
		bp = bp->b_hash.le_next;
	}
	splx(s);

	return(0);
}

/*
 * Get a block given a specified block and offset into a file/device.
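 * The buffer is returned locked (B_BUSY).  If it is already busy we sleep
 * on it and retry.  After allocating a fresh buffer we must check incore()
 * again, because another process may have created the same block while we
 * slept in getnewbuf().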
 */
struct buf *
getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo)
{
	struct buf *bp;
	int s;
	struct bufhashhdr *bh;

	s = splbio();
loop:
	if (bp = incore(vp, blkno)) {
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			tsleep ((caddr_t)bp, PRIBIO, "getblk", 0);
			goto loop;
		}
		bp->b_flags |= B_BUSY | B_CACHE;
		bremfree(bp);
		/*
		 * check for size inconsistencies
		 */
		if (bp->b_bcount != size) {
			printf("getblk: invalid buffer size: %d\n", bp->b_bcount);
			bp->b_flags |= B_INVAL;
			bwrite(bp);
			goto loop;
		}
	} else {

		if ((bp = getnewbuf(0, 0)) == 0)
			goto loop;
		allocbuf(bp, size);
		/*
		 * have to check again, because of a possible
		 * race condition.
		 */
		if (incore( vp, blkno)) {
			allocbuf(bp, 0);
			bp->b_flags |= B_INVAL;
			brelse(bp);
			goto loop;
		}
		bp->b_blkno = bp->b_lblkno = blkno;
		bgetvp(vp, bp);
		LIST_REMOVE(bp, b_hash);
		bh = BUFHASH(vp, blkno);
		LIST_INSERT_HEAD(bh, bp, b_hash);
	}
	splx(s);
	return (bp);
}

/*
 * Get an empty, disassociated buffer of given size.
 */
struct buf *
geteblk(int size)
{
	struct buf *bp;
	while ((bp = getnewbuf(0, 0)) == 0)
		;
	allocbuf(bp, size);
	bp->b_flags |= B_INVAL;
	return (bp);
}

/*
 * Modify the length of a buffer's underlying buffer storage without
 * destroying information (unless, of course, the buffer is shrinking).
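 * The requested size is rounded up to a page boundary; pages are wired in
 * via vm_hold_load_pages() when growing and released via
 * vm_hold_free_pages() when shrinking, and the freebufspace/allocbufspace
 * counters are adjusted by the difference.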
 */
void
allocbuf(struct buf *bp, int size)
{

	int newbsize = round_page(size);

	if( newbsize == bp->b_bufsize) {
		bp->b_bcount = size;
		return;
	} else if( newbsize < bp->b_bufsize) {
		vm_hold_free_pages(
			(vm_offset_t) bp->b_data + newbsize,
			(vm_offset_t) bp->b_data + bp->b_bufsize);
	} else if( newbsize > bp->b_bufsize) {
		vm_hold_load_pages(
			(vm_offset_t) bp->b_data + bp->b_bufsize,
			(vm_offset_t) bp->b_data + newbsize);
	}

	/* adjust buffer cache's idea of memory allocated to buffer contents */
	freebufspace -= newbsize - bp->b_bufsize;
	allocbufspace += newbsize - bp->b_bufsize;

	bp->b_bufsize = newbsize;
	bp->b_bcount = size;
}

/*
 * Wait for buffer I/O completion, returning error status.
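 * If the I/O failed, the buffer is also marked invalid and moved onto the
 * invalid hash chain so its stale contents cannot be found in the cache.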
 */
int
biowait(register struct buf *bp)
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		tsleep((caddr_t)bp, PRIBIO, "biowait", 0);
	if((bp->b_flags & B_ERROR) || bp->b_error) {
		if ((bp->b_flags & B_INVAL) == 0) {
			bp->b_flags |= B_INVAL;
			bp->b_dev = NODEV;
			LIST_REMOVE(bp, b_hash);
			LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		}
		if (!bp->b_error)
			bp->b_error = EIO;
		else
			bp->b_flags |= B_ERROR;
		splx(s);
		return (bp->b_error);
	} else {
		splx(s);
		return (0);
	}
}

/*
 * Finish I/O on a buffer, calling an optional function.
 * This is usually called from interrupt level, so process blocking
 * is not *a good idea*.
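 * If B_CALL is set, the b_iodone function is invoked and nothing further
 * is done here; otherwise asynchronous buffers are released and any
 * synchronous waiter is awakened.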
 */
void
biodone(register struct buf *bp)
{
	int s;
	s = splbio();
	bp->b_flags |= B_DONE;

	if ((bp->b_flags & B_READ) == 0)  {
		vwakeup(bp);
	}

	if (bp->b_flags & B_BOUNCE)
		vm_bounce_free(bp);

	/* call optional completion function if requested */
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		splx(s);
		return;
	}

/*
 * For asynchronous completions, release the buffer now. The brelse
 *	checks for B_WANTED and will do the wakeup there if necessary -
 *	so no need to do a wakeup here in the async case.
 */

	if (bp->b_flags & B_ASYNC) {
		brelse(bp);
	} else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t) bp);
	}
	splx(s);
}

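/*
 * Return the number of buffers on the locked queue.
 */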
int
count_lock_queue()
{
	int count;
	struct buf *bp;

	count = 0;
	for(bp = bufqueues[QUEUE_LOCKED].tqh_first;
	    bp != NULL;
	    bp = bp->b_freelist.tqe_next)
		count++;
	return(count);
}

#ifndef UPDATE_INTERVAL
int vfs_update_interval = 30;
#else
int vfs_update_interval = UPDATE_INTERVAL;
#endif

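/*
 * The update daemon: sync the filesystems every vfs_update_interval
 * seconds, or sooner when vfs_update_wakeup is set and a wakeup is issued
 * on it.
 */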
void
vfs_update() {
	(void) spl0();
	while(1) {
		tsleep((caddr_t)&vfs_update_wakeup, PRIBIO, "update",
			hz * vfs_update_interval);
		vfs_update_wakeup = 0;
		sync(curproc, NULL, NULL);
	}
}

/*
 * these routines are not in the correct place (yet)
 * also they work *ONLY* for kernel_pmap!!!
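 * vm_hold_load_pages() allocates and wires pages from kernel_object to
 * back the kernel VA range [froma, toa) and enters them into the kernel
 * pmap; vm_hold_free_pages() unmaps and frees those pages again.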
 */
void
vm_hold_load_pages(vm_offset_t froma, vm_offset_t toa) {
	vm_offset_t pg;
	vm_page_t p;
	vm_offset_t from = round_page(froma);
	vm_offset_t to = round_page(toa);

	for(pg = from ; pg < to ; pg += PAGE_SIZE) {
		vm_offset_t pa;

	tryagain:
		if (cnt.v_free_count <= cnt.v_free_reserved) {
			VM_WAIT;
			goto tryagain;
		}

		p =  vm_page_alloc(kernel_object, pg - VM_MIN_KERNEL_ADDRESS);
		if( !p) {
			VM_WAIT;
			goto tryagain;
		}

		vm_page_wire(p);
		pmap_kenter( pg, VM_PAGE_TO_PHYS(p));
	}
}

void
vm_hold_free_pages(vm_offset_t froma, vm_offset_t toa) {
	vm_offset_t pg;
	vm_page_t p;
	vm_offset_t from = round_page(froma);
	vm_offset_t to = round_page(toa);

	for(pg = from ; pg < to ; pg += PAGE_SIZE) {
		p = PHYS_TO_VM_PAGE( pmap_kextract( pg));
		pmap_kremove( pg);
		vm_page_free(p);
	}
}

void
bufstats()
{
}