vfs_bio.c revision 51811
/*
 * Copyright (c) 1994,1997 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 *
 * $FreeBSD: head/sys/kern/vfs_bio.c 51811 1999-09-30 07:39:20Z dt $
 */

/*
 * This file contains a new buffer I/O scheme implementing a coherent
 * VM object and buffer cache scheme.  Pains have been taken to make
 * sure that the performance degradation associated with schemes such
 * as this is not realized.
 *
 * Author:  John S. Dyson
 * Significant help during the development and debugging phases
 * has been provided by David Greenman, also of the FreeBSD core team.
 *
 * see man buf(9) for more info.
 */

#define VMIO
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/lock.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_prot.h>
#include <vm/vm_kern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <sys/conf.h>

static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer");

struct	bio_ops bioops;		/* I/O operation notification */

struct buf *buf;		/* buffer header pool */
struct swqueue bswlist;

static void vm_hold_free_pages(struct buf * bp, vm_offset_t from,
		vm_offset_t to);
static void vm_hold_load_pages(struct buf * bp, vm_offset_t from,
		vm_offset_t to);
static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off,
			       int pageno, vm_page_t m);
static void vfs_clean_pages(struct buf * bp);
static void vfs_setdirty(struct buf *bp);
static void vfs_vmio_release(struct buf *bp);
static int flushbufqueues(void);

static int bd_request;

static void buf_daemon __P((void));
/*
 * bogus page -- for I/O to/from partially complete buffers.
 * This is a temporary solution to the problem, but it is not
 * really that bad.  It would be better to split the buffer
 * for input in the case of buffers partially already in memory,
 * but the code is intricate enough already.
 */
vm_page_t bogus_page;
int runningbufspace;
int vmiodirenable = FALSE;
static vm_offset_t bogus_offset;

static int bufspace, maxbufspace, vmiospace,
	bufmallocspace, maxbufmallocspace, hibufspace;
#if 0
static int maxvmiobufspace;
#endif
static int maxbdrun;
static int needsbuffer;
static int numdirtybuffers, lodirtybuffers, hidirtybuffers;
static int numfreebuffers, lofreebuffers, hifreebuffers;
static int getnewbufcalls;
static int getnewbufrestarts;
static int kvafreespace;

SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD,
	&numdirtybuffers, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW,
	&lodirtybuffers, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW,
	&hidirtybuffers, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD,
	&numfreebuffers, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW,
	&lofreebuffers, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW,
	&hifreebuffers, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD,
	&runningbufspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW,
	&maxbufspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD,
	&hibufspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD,
	&bufspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, maxbdrun, CTLFLAG_RW,
	&maxbdrun, 0, "");
#if 0
SYSCTL_INT(_vfs, OID_AUTO, maxvmiobufspace, CTLFLAG_RW,
	&maxvmiobufspace, 0, "");
#endif
SYSCTL_INT(_vfs, OID_AUTO, vmiospace, CTLFLAG_RD,
	&vmiospace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW,
	&maxbufmallocspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD,
	&bufmallocspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, kvafreespace, CTLFLAG_RD,
	&kvafreespace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW,
	&getnewbufcalls, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW,
	&getnewbufrestarts, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW,
	&vmiodirenable, 0, "");

static int bufhashmask;
static LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
struct bqueues bufqueues[BUFFER_QUEUES] = { { 0 } };
char *buf_wmesg = BUF_WMESG;
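/*
 * Usage note (illustrative): the counters and thresholds above are
 * exported under the "vfs" sysctl tree and can be inspected or tuned
 * from userland, e.g. (the value varies with nbuf):
 *
 *	# sysctl vfs.hidirtybuffers
 *	vfs.hidirtybuffers: 276
 */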
extern int vm_swap_size;

#define BUF_MAXUSE		24

#define VFS_BIO_NEED_ANY	0x01	/* any freeable buffer */
#define VFS_BIO_NEED_DIRTYFLUSH	0x02	/* waiting for dirty buffer flush */
#define VFS_BIO_NEED_FREE	0x04	/* wait for free bufs, hi hysteresis */
#define VFS_BIO_NEED_BUFSPACE	0x08	/* wait for buf space, lo hysteresis */
#define VFS_BIO_NEED_KVASPACE	0x10	/* wait for buffer_map space, emerg */

/*
 * Buffer hash table code.  Note that the logical block scans linearly, which
 * gives us some L1 cache locality.
 */

static __inline 
struct bufhashhdr *
bufhash(struct vnode *vnp, daddr_t bn)
{
	return(&bufhashtbl[(((uintptr_t)(vnp) >> 7) + (int)bn) & bufhashmask]);
}

/*
 *	kvaspacewakeup:
 *
 *	Called when kva space is potentially available for recovery or when
 *	kva space is recovered in the buffer_map.  This function wakes up
 *	anyone waiting for buffer_map kva space.  Even though the buffer_map
 *	is larger than maxbufspace, this situation will typically occur
 *	when the buffer_map gets fragmented.
 */

static __inline void
kvaspacewakeup(void)
{
	/*
	 * If someone is waiting for KVA space, wake them up.  Even
	 * though we haven't freed the kva space yet, the waiting
	 * process will be able to now.
	 */
	if (needsbuffer & VFS_BIO_NEED_KVASPACE) {
		needsbuffer &= ~VFS_BIO_NEED_KVASPACE;
		wakeup(&needsbuffer);
	}
}

/*
 *	numdirtywakeup:
 *
 *	If someone is blocked due to there being too many dirty buffers,
 *	and numdirtybuffers is now reasonable, wake them up.
 */

static __inline void
numdirtywakeup(void)
{
	if (numdirtybuffers < hidirtybuffers) {
		if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) {
			needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH;
			wakeup(&needsbuffer);
		}
	}
}

/*
 *	bufspacewakeup:
 *
 *	Called when buffer space is potentially available for recovery or when
 *	buffer space is recovered.  getnewbuf() will block on this flag when
 *	it is unable to free sufficient buffer space.  Buffer space becomes
 *	recoverable when bp's get placed back in the queues.
 */

static __inline void
bufspacewakeup(void)
{
	/*
	 * If someone is waiting for BUF space, wake them up.  Even
	 * though we haven't freed the kva space yet, the waiting
	 * process will be able to now.
	 */
	if (needsbuffer & VFS_BIO_NEED_BUFSPACE) {
		needsbuffer &= ~VFS_BIO_NEED_BUFSPACE;
		wakeup(&needsbuffer);
	}
}

/*
 *	bufcountwakeup:
 *
 *	Called when a buffer has been added to one of the free queues to
 *	account for the buffer and to wakeup anyone waiting for free buffers.
 *	This typically occurs when large amounts of metadata are being handled
 *	by the buffer cache ( else buffer space runs out first, usually ).
 */

static __inline void
bufcountwakeup(void) 
{
	++numfreebuffers;
	if (needsbuffer) {
		needsbuffer &= ~VFS_BIO_NEED_ANY;
		if (numfreebuffers >= hifreebuffers)
			needsbuffer &= ~VFS_BIO_NEED_FREE;
		wakeup(&needsbuffer);
	}
}

/*
 *	vfs_buf_test_cache:
 *
 *	Called when a buffer is extended.  This function clears the B_CACHE
 *	bit if the newly extended portion of the buffer does not contain
 *	valid data.
 */
static __inline__
void
vfs_buf_test_cache(struct buf *bp,
		  vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
		  vm_page_t m)
{
	if (bp->b_flags & B_CACHE) {
		int base = (foff + off) & PAGE_MASK;
		if (vm_page_is_valid(m, base, size) == 0)
			bp->b_flags &= ~B_CACHE;
	}
}

static __inline__
void
bd_wakeup(int dirtybuflevel)
{
	if (numdirtybuffers >= dirtybuflevel && bd_request == 0) {
		bd_request = 1;
		wakeup(&bd_request);
	}
}


/*
 * Initialize buffer headers and related structures. 
 */

caddr_t
bufhashinit(caddr_t vaddr)
{
	/* first, make a null hash table */
	for (bufhashmask = 8; bufhashmask < nbuf / 4; bufhashmask <<= 1)
		;
	bufhashtbl = (void *)vaddr;
	vaddr = vaddr + sizeof(*bufhashtbl) * bufhashmask;
	--bufhashmask;
	return(vaddr);
}
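/*
 * Sizing sketch (illustrative): with nbuf = 1024 the loop above grows
 * bufhashmask 8 -> 16 -> ... -> 256, the first power of two that is
 * >= nbuf / 4.  256 list heads are carved out of the supplied kva and
 * bufhashmask is then decremented to 255 so bufhash() can use it as a
 * bit mask to reduce (vnode, blkno) hash values to valid table indices.
 */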
void
bufinit(void)
{
	struct buf *bp;
	int i;

	TAILQ_INIT(&bswlist);
	LIST_INIT(&invalhash);
	simple_lock_init(&buftimelock);

	for (i = 0; i <= bufhashmask; i++)
		LIST_INIT(&bufhashtbl[i]);

	/* next, make a null set of free lists */
	for (i = 0; i < BUFFER_QUEUES; i++)
		TAILQ_INIT(&bufqueues[i]);

	/* finally, initialize each buffer header and stick on empty q */
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bzero(bp, sizeof *bp);
		bp->b_flags = B_INVAL;	/* we're just an empty header */
		bp->b_dev = NODEV;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_qindex = QUEUE_EMPTY;
		bp->b_xflags = 0;
		LIST_INIT(&bp->b_dep);
		BUF_LOCKINIT(bp);
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	}

	/*
	 * maxbufspace is currently calculated assuming all filesystem
	 * blocks are 8K.  If you happen to use a 16K filesystem, the size
	 * of the buffer cache is still the same as it would be for 8K
	 * filesystems.  This keeps the size of the buffer cache "in check"
	 * for big block filesystems.
	 *
	 * maxbufspace is calculated as around 50% of the KVA available in
	 * the buffer_map ( DFLTSIZE vs BKVASIZE ), I presume to reduce the
	 * effect of fragmentation.
	 */
	maxbufspace = (nbuf + 8) * DFLTBSIZE;
	if ((hibufspace = maxbufspace - MAXBSIZE * 5) <= MAXBSIZE)
		hibufspace = 3 * maxbufspace / 4;
#if 0
/*
 * reserve 1/3 of the buffers for metadata (VDIR) which might not be VMIO'ed
 */
	maxvmiobufspace = 2 * hibufspace / 3;
#endif
/*
 * Limit the amount of malloc memory since it is wired permanently into
 * the kernel space.  Even though this is accounted for in the buffer
 * allocation, we don't want the malloced region to grow uncontrolled.
 * The malloc scheme improves memory utilization significantly on average
 * (small) directories.
 */
	maxbufmallocspace = hibufspace / 20;

/*
 * Reduce the chance of a deadlock occurring by limiting the number
 * of delayed-write dirty buffers we allow to stack up.
 */
	lodirtybuffers = nbuf / 7 + 10;
	hidirtybuffers = nbuf / 4 + 20;
	numdirtybuffers = 0;

/*
 * Try to keep the number of free buffers in the specified range,
 * and give the syncer access to an emergency reserve.
 */
	lofreebuffers = nbuf / 18 + 5;
	hifreebuffers = 2 * lofreebuffers;
	numfreebuffers = nbuf;

/*
 * Maximum number of async ops initiated per buf_daemon loop.  This is
 * somewhat of a hack at the moment, we really need to limit ourselves
 * based on the number of bytes of I/O in-transit that were initiated
 * from buf_daemon.
 */
	if ((maxbdrun = nswbuf / 4) < 4)
		maxbdrun = 4;

	kvafreespace = 0;

	bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
	bogus_page = vm_page_alloc(kernel_object,
			((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
			VM_ALLOC_NORMAL);
	cnt.v_wire_count++;

}

/*
 * Free the kva allocation for a buffer.
 * Must be called only at splbio or higher,
 * as this is the only locking for buffer_map.
 */
static void
bfreekva(struct buf * bp)
{
	if (bp->b_kvasize) {
		vm_map_delete(buffer_map,
		    (vm_offset_t) bp->b_kvabase,
		    (vm_offset_t) bp->b_kvabase + bp->b_kvasize
		);
		bp->b_kvasize = 0;
		kvaspacewakeup();
	}
}

/*
 *	bremfree:
 *
 *	Remove the buffer from the appropriate free list.
 */
void
bremfree(struct buf * bp)
{
	int s = splbio();
	int old_qindex = bp->b_qindex;

	if (bp->b_qindex != QUEUE_NONE) {
		if (bp->b_qindex == QUEUE_EMPTYKVA) {
			kvafreespace -= bp->b_kvasize;
		}
		KASSERT(BUF_REFCNT(bp) == 1, ("bremfree: bp %p not locked",bp));
		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
		bp->b_qindex = QUEUE_NONE;
		runningbufspace += bp->b_bufsize;
	} else {
#if !defined(MAX_PERF)
		if (BUF_REFCNT(bp) <= 1)
			panic("bremfree: removing a buffer not on a queue");
#endif
	}

	/*
	 * Fixup numfreebuffers count.  If the buffer is invalid or not
	 * delayed-write, and it was on the EMPTY, LRU, or AGE queues,
	 * the buffer was free and we must decrement numfreebuffers.
	 */
	if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
		switch(old_qindex) {
		case QUEUE_DIRTY:
		case QUEUE_CLEAN:
		case QUEUE_EMPTY:
		case QUEUE_EMPTYKVA:
			--numfreebuffers;
			break;
		default:
			break;
		}
	}
	splx(s);
}


/*
 * Get a buffer with the specified data.  Look in the cache first.  We
 * must clear B_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE
 * is set, the buffer is valid and we do not have to do anything ( see
 * getblk() ).
 */
int
bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
    struct buf ** bpp)
{
	struct buf *bp;

	bp = getblk(vp, blkno, size, 0, 0);
	*bpp = bp;

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc != NULL)
			curproc->p_stats->p_ru.ru_inblock++;
		KASSERT(!(bp->b_flags & B_ASYNC), ("bread: illegal async bp %p", bp));
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_ERROR | B_INVAL);
		if (bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		vfs_busy_pages(bp, 0);
		VOP_STRATEGY(vp, bp);
		return (biowait(bp));
	}
	return (0);
}
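/*
 * Usage sketch (illustrative only, not a caller in this file): a
 * filesystem read path typically brings a block in with bread() and
 * releases it when done, e.g.
 *
 *	struct buf *bp;
 *	int error;
 *
 *	error = bread(vp, lblkno, bsize, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	...copy out of bp->b_data...
 *	bqrelse(bp);		(or brelse() if reuse is unlikely)
 */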
/*
 * Operates like bread, but also starts asynchronous I/O on
 * read-ahead blocks.  We must clear B_ERROR and B_INVAL prior
 * to initiating I/O.  If B_CACHE is set, the buffer is valid
 * and we do not have to do anything.
 */
int
breadn(struct vnode * vp, daddr_t blkno, int size,
    daddr_t * rablkno, int *rabsize,
    int cnt, struct ucred * cred, struct buf ** bpp)
{
	struct buf *bp, *rabp;
	int i;
	int rv = 0, readwait = 0;

	*bpp = bp = getblk(vp, blkno, size, 0, 0);

	/* if not found in cache, do some I/O */
	if ((bp->b_flags & B_CACHE) == 0) {
		if (curproc != NULL)
			curproc->p_stats->p_ru.ru_inblock++;
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_ERROR | B_INVAL);
		if (bp->b_rcred == NOCRED) {
			if (cred != NOCRED)
				crhold(cred);
			bp->b_rcred = cred;
		}
		vfs_busy_pages(bp, 0);
		VOP_STRATEGY(vp, bp);
		++readwait;
	}

	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
		if (inmem(vp, *rablkno))
			continue;
		rabp = getblk(vp, *rablkno, *rabsize, 0, 0);

		if ((rabp->b_flags & B_CACHE) == 0) {
			if (curproc != NULL)
				curproc->p_stats->p_ru.ru_inblock++;
			rabp->b_flags |= B_READ | B_ASYNC;
			rabp->b_flags &= ~(B_ERROR | B_INVAL);
			if (rabp->b_rcred == NOCRED) {
				if (cred != NOCRED)
					crhold(cred);
				rabp->b_rcred = cred;
			}
			vfs_busy_pages(rabp, 0);
			BUF_KERNPROC(rabp);
			VOP_STRATEGY(vp, rabp);
		} else {
			brelse(rabp);
		}
	}

	if (readwait) {
		rv = biowait(bp);
	}
	return (rv);
}

/*
 * Write, release buffer on completion.  (Done by iodone
 * if async).  Do not bother writing anything if the buffer
 * is invalid.
 *
 * Note that we set B_CACHE here, indicating that buffer is
 * fully valid and thus cacheable.  This is true even of NFS
 * now so we set it generally.  This could be set either here 
 * or in biodone() since the I/O is synchronous.  We put it
 * here.
 */
int
bwrite(struct buf * bp)
{
	int oldflags, s;
	struct vnode *vp;
	struct mount *mp;

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return (0);
	}

	oldflags = bp->b_flags;

#if !defined(MAX_PERF)
	if (BUF_REFCNT(bp) == 0)
		panic("bwrite: buffer is not busy???");
#endif
	s = splbio();
	bundirty(bp);

	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR);
	bp->b_flags |= B_WRITEINPROG | B_CACHE;

	bp->b_vp->v_numoutput++;
	vfs_busy_pages(bp, 1);
	if (curproc != NULL)
		curproc->p_stats->p_ru.ru_oublock++;
	splx(s);
	if (oldflags & B_ASYNC)
		BUF_KERNPROC(bp);
	VOP_STRATEGY(bp->b_vp, bp);

	/*
	 * Collect statistics on synchronous and asynchronous writes.
	 * Writes to block devices are charged to their associated
	 * filesystem (if any).
	 */
	if ((vp = bp->b_vp) != NULL) {
		if (vp->v_type == VBLK)
			mp = vp->v_specmountpoint;
		else
			mp = vp->v_mount;
		if (mp != NULL) {
			if ((oldflags & B_ASYNC) == 0)
				mp->mnt_stat.f_syncwrites++;
			else
				mp->mnt_stat.f_asyncwrites++;
		}
	}

	if ((oldflags & B_ASYNC) == 0) {
		int rtval = biowait(bp);
		brelse(bp);
		return (rtval);
	}

	return (0);
}

/*
 * Delayed write. (Buffer is marked dirty).  Do not bother writing
 * anything if the buffer is marked invalid.
 *
 * Note that since the buffer must be completely valid, we can safely
 * set B_CACHE.  In fact, we have to set B_CACHE here rather than in
 * biodone() in order to prevent getblk from writing the buffer
 * out synchronously.
 */
void
bdwrite(struct buf * bp)
{
#if 0
	struct vnode *vp;
#endif

#if !defined(MAX_PERF)
	if (BUF_REFCNT(bp) == 0)
		panic("bdwrite: buffer is not busy");
#endif

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return;
	}
	bdirty(bp);

	/*
	 * Set B_CACHE, indicating that the buffer is fully valid.  This is
	 * true even of NFS now.
	 */
	bp->b_flags |= B_CACHE;

	/*
	 * This bmap keeps the system from needing to do the bmap later,
	 * perhaps when the system is attempting to do a sync.  Since it
	 * is likely that the indirect block -- or whatever other data
	 * structure the filesystem needs -- is still in memory now, it
	 * is a good thing to do this.  Note also, that if the pageout
	 * daemon is requesting a sync -- there might not be enough
	 * memory to do the bmap then...  So, this is important to do.
	 */
	if (bp->b_lblkno == bp->b_blkno) {
		VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
	}

	/*
	 * Set the *dirty* buffer range based upon the VM system dirty pages.
	 */
	vfs_setdirty(bp);

	/*
	 * We need to do this here to satisfy the vnode_pager and the
	 * pageout daemon, so that it thinks that the pages have been
	 * "cleaned".  Note that since the pages are in a delayed write
	 * buffer -- the VFS layer "will" see that the pages get written
	 * out on the next sync, or perhaps the cluster will be completed.
	 */
	vfs_clean_pages(bp);
	bqrelse(bp);

	/*
	 * Wakeup the buffer flushing daemon if we have saturated the
	 * buffer cache.
	 */

	bd_wakeup(hidirtybuffers);

	/*
	 * note: we cannot initiate I/O from a bdwrite even if we wanted to,
	 * due to the softdep code.
	 */
#if 0
	/*
	 * XXX The soft dependency code is not prepared to
	 * have I/O done when a bdwrite is requested.  For
	 * now we just let the write be delayed if it is
	 * requested by the soft dependency code.
	 */
	if ((vp = bp->b_vp) &&
	    ((vp->v_type == VBLK && vp->v_specmountpoint &&
	      (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP)) ||
	     (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP))))
		return;
#endif
}

/*
 *	bdirty:
 *
 *	Turn buffer into delayed write request.  We must clear B_READ and
 *	B_RELBUF, and we must set B_DELWRI.  We reassign the buffer to 
 *	itself to properly update it in the dirty/clean lists.  We mark it
 *	B_DONE to ensure that any asynchronization of the buffer properly
 *	clears B_DONE ( else a panic will occur later ).
 *
 *	bdirty() is kinda like bdwrite() - we have to clear B_INVAL which
 *	might have been set pre-getblk().  Unlike bwrite/bdwrite, bdirty()
 *	should only be called if the buffer is known-good.
 *
 *	Since the buffer is not on a queue, we do not update the numfreebuffers
 *	count.
 *
 *	Must be called at splbio().
 *	The buffer must be on QUEUE_NONE.
 */
void
bdirty(bp)
	struct buf *bp;
{
	KASSERT(bp->b_qindex == QUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex));
	bp->b_flags &= ~(B_READ|B_RELBUF);

	if ((bp->b_flags & B_DELWRI) == 0) {
		bp->b_flags |= B_DONE | B_DELWRI;
		reassignbuf(bp, bp->b_vp);
		++numdirtybuffers;
		bd_wakeup(hidirtybuffers);
	}
}

/*
 *	bundirty:
 *
 *	Clear B_DELWRI for buffer.
 *
 *	Since the buffer is not on a queue, we do not update the numfreebuffers
 *	count.
 *
 *	Must be called at splbio().
 *	The buffer must be on QUEUE_NONE.
 */

void
bundirty(bp)
	struct buf *bp;
{
	KASSERT(bp->b_qindex == QUEUE_NONE, ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex));

	if (bp->b_flags & B_DELWRI) {
		bp->b_flags &= ~B_DELWRI;
		reassignbuf(bp, bp->b_vp);
		--numdirtybuffers;
		numdirtywakeup();
	}
}

/*
 *	bawrite:
 *
 *	Asynchronous write.  Start output on a buffer, but do not wait for
 *	it to complete.  The buffer is released when the output completes.
 *
 *	bwrite() ( or the VOP routine anyway ) is responsible for handling 
 *	B_INVAL buffers.  Not us.
 */
void
bawrite(struct buf * bp)
{
	bp->b_flags |= B_ASYNC;
	(void) VOP_BWRITE(bp->b_vp, bp);
}

/*
 *	bowrite:
 *
 *	Ordered write.  Start output on a buffer, and flag it so that the 
 *	device will write it in the order it was queued.  The buffer is 
 *	released when the output completes.  bwrite() ( or the VOP routine
 *	anyway ) is responsible for handling B_INVAL buffers.
 */
int
bowrite(struct buf * bp)
{
	bp->b_flags |= B_ORDERED | B_ASYNC;
	return (VOP_BWRITE(bp->b_vp, bp));
}
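/*
 * Choosing a write flavor (illustrative summary of the routines above):
 *
 *	bwrite(bp);	synchronous - sleeps until the I/O completes
 *	bawrite(bp);	asynchronous - starts the I/O and returns
 *	bowrite(bp);	ordered - async, the device preserves queue order
 *	bdwrite(bp);	delayed - only marks the buffer dirty and requeues
 *			it; the buf daemon or syncer flushes it later
 */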
/*
 *	bwillwrite:
 *
 *	Called prior to the locking of any vnodes when we are expecting to
 *	write.  We do not want to starve the buffer cache with too many
 *	dirty buffers so we block here.  By blocking prior to the locking
 *	of any vnodes we attempt to avoid the situation where a locked vnode
 *	prevents the various system daemons from flushing related buffers.
 */

void
bwillwrite(void)
{
	int twenty = (hidirtybuffers - lodirtybuffers) / 5;

	if (numdirtybuffers > hidirtybuffers + twenty) {
		int s;

		s = splbio();
		while (numdirtybuffers > hidirtybuffers) {
			bd_wakeup(hidirtybuffers);
			needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH;
			tsleep(&needsbuffer, (PRIBIO + 4), "flswai", 0);
		}
		splx(s);
	}
}
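/*
 * Usage sketch (illustrative): a write(2)-style path calls this before
 * acquiring any vnode locks, e.g.
 *
 *	bwillwrite();
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
 *	...dirty buffers with bdwrite()/bawrite()...
 *	VOP_UNLOCK(vp, 0, p);
 */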
/*
 *	brelse:
 *
 *	Release a busy buffer and, if requested, free its resources.  The
 *	buffer will be stashed in the appropriate bufqueue[] allowing it
 *	to be accessed later as a cache entity or reused for other purposes.
 */
void
brelse(struct buf * bp)
{
	int s;

	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));

#if 0
	if (bp->b_flags & B_CLUSTER) {
		relpbuf(bp, NULL);
		return;
	}
#endif

	s = splbio();

	if (bp->b_flags & B_LOCKED)
		bp->b_flags &= ~B_ERROR;

	if ((bp->b_flags & (B_READ | B_ERROR | B_INVAL)) == B_ERROR) {
		/*
		 * Failed write, redirty.  Must clear B_ERROR to prevent
		 * pages from being scrapped.  If B_INVAL is set then
		 * this case is not run and the next case is run to
		 * destroy the buffer.  B_INVAL can occur if the buffer
		 * is outside the range supported by the underlying device.
		 */
		bp->b_flags &= ~B_ERROR;
		bdirty(bp);
	} else if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_FREEBUF)) ||
	    (bp->b_bufsize <= 0)) {
		/*
		 * Either a failed I/O or we were asked to free or not
		 * cache the buffer.
		 */
		bp->b_flags |= B_INVAL;
		if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
			(*bioops.io_deallocate)(bp);
		if (bp->b_flags & B_DELWRI) {
			--numdirtybuffers;
			numdirtywakeup();
		}
		bp->b_flags &= ~(B_DELWRI | B_CACHE | B_FREEBUF);
		if ((bp->b_flags & B_VMIO) == 0) {
			if (bp->b_bufsize)
				allocbuf(bp, 0);
			if (bp->b_vp)
				brelvp(bp);
		}
	}

	/*
	 * We must clear B_RELBUF if B_DELWRI is set.  If vfs_vmio_release() 
	 * is called with B_DELWRI set, the underlying pages may wind up
	 * getting freed causing a previous write (bdwrite()) to get 'lost'
	 * because pages associated with a B_DELWRI bp are marked clean.
	 * 
	 * We still allow the B_INVAL case to call vfs_vmio_release(), even
	 * if B_DELWRI is set.
	 */

	if (bp->b_flags & B_DELWRI)
		bp->b_flags &= ~B_RELBUF;

	/*
	 * VMIO buffer rundown.  It is not very necessary to keep a VMIO
	 * buffer constituted, not even NFS buffers now.  Two flags affect
	 * this.  If B_INVAL, the struct buf is invalidated but the VM
	 * object is kept around ( i.e. so it is trivial to reconstitute
	 * the buffer later ).
	 *
	 * If B_ERROR or B_NOCACHE is set, pages in the VM object will be
	 * invalidated.  B_ERROR cannot be set for a failed write unless the
	 * buffer is also B_INVAL because it hits the re-dirtying code above.
	 *
	 * Normally we can do this whether a buffer is B_DELWRI or not.  If
	 * the buffer is an NFS buffer, it is tracking piecemeal writes or
	 * the commit state and we cannot afford to lose the buffer.
	 */
	if ((bp->b_flags & B_VMIO)
	    && !(bp->b_vp->v_tag == VT_NFS &&
		 bp->b_vp->v_type != VBLK &&
		 (bp->b_flags & B_DELWRI))
	    ) {

		int i, j, resid;
		vm_page_t m;
		off_t foff;
		vm_pindex_t poff;
		vm_object_t obj;
		struct vnode *vp;

		vp = bp->b_vp;

		/*
		 * Get the base offset and length of the buffer.  Note that 
		 * for block sizes that are less than PAGE_SIZE, the b_data
		 * base of the buffer does not represent exactly b_offset and
		 * neither b_offset nor b_size are necessarily page aligned.
		 * Instead, the starting position of b_offset is:
		 *
		 * 	b_data + (b_offset & PAGE_MASK)
		 *
		 * block sizes less than DEV_BSIZE (usually 512) are not 
		 * supported due to the page granularity bits (m->valid,
		 * m->dirty, etc...).
		 *
		 * See man buf(9) for more information
		 */

		resid = bp->b_bufsize;
		foff = bp->b_offset;

		for (i = 0; i < bp->b_npages; i++) {
			m = bp->b_pages[i];
			vm_page_flag_clear(m, PG_ZERO);
			if (m == bogus_page) {

				obj = (vm_object_t) vp->v_object;
				poff = OFF_TO_IDX(bp->b_offset);

				for (j = i; j < bp->b_npages; j++) {
					m = bp->b_pages[j];
					if (m == bogus_page) {
						m = vm_page_lookup(obj, poff + j);
#if !defined(MAX_PERF)
						if (!m) {
							panic("brelse: page missing\n");
						}
#endif
						bp->b_pages[j] = m;
					}
				}

				if ((bp->b_flags & B_INVAL) == 0) {
					pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
				}
			}
			if (bp->b_flags & (B_NOCACHE|B_ERROR)) {
				int poffset = foff & PAGE_MASK;
				int presid = resid > (PAGE_SIZE - poffset) ?
					(PAGE_SIZE - poffset) : resid;

				KASSERT(presid >= 0, ("brelse: extra page"));
				vm_page_set_invalid(m, poffset, presid);
			}
			resid -= PAGE_SIZE - (foff & PAGE_MASK);
			foff = (foff + PAGE_SIZE) & ~PAGE_MASK;
		}

		if (bp->b_flags & (B_INVAL | B_RELBUF))
			vfs_vmio_release(bp);

	} else if (bp->b_flags & B_VMIO) {

		if (bp->b_flags & (B_INVAL | B_RELBUF))
			vfs_vmio_release(bp);

	}

#if !defined(MAX_PERF)
	if (bp->b_qindex != QUEUE_NONE)
		panic("brelse: free buffer onto another queue???");
#endif
	if (BUF_REFCNT(bp) > 1) {
		/* Temporary panic to verify exclusive locking */
		/* This panic goes away when we allow shared refs */
		panic("brelse: multiple refs");
		/* do not release to free list */
		BUF_UNLOCK(bp);
		splx(s);
		return;
	}

	/* enqueue */

	/* buffers with no memory */
	if (bp->b_bufsize == 0) {
		bp->b_flags |= B_INVAL;
		if (bp->b_kvasize)
			bp->b_qindex = QUEUE_EMPTYKVA;
		else
			bp->b_qindex = QUEUE_EMPTY;
		TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
		kvafreespace += bp->b_kvasize;
		if (bp->b_kvasize)
			kvaspacewakeup();
	/* buffers with junk contents */
	} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
		bp->b_flags |= B_INVAL;
		bp->b_qindex = QUEUE_CLEAN;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;

	/* buffers that are locked */
	} else if (bp->b_flags & B_LOCKED) {
		bp->b_qindex = QUEUE_LOCKED;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);

	/* remaining buffers */
	} else {
		switch(bp->b_flags & (B_DELWRI|B_AGE)) {
		case B_DELWRI | B_AGE:
			bp->b_qindex = QUEUE_DIRTY;
			TAILQ_INSERT_HEAD(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
			break;
		case B_DELWRI:
			bp->b_qindex = QUEUE_DIRTY;
			TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
			break;
		case B_AGE:
			bp->b_qindex = QUEUE_CLEAN;
			TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
			break;
		default:
			bp->b_qindex = QUEUE_CLEAN;
			TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
			break;
		}
	}

	/*
	 * If B_INVAL, clear B_DELWRI.  We've already placed the buffer
	 * on the correct queue.
	 */
	if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI)) {
		bp->b_flags &= ~B_DELWRI;
		--numdirtybuffers;
		numdirtywakeup();
	}

	runningbufspace -= bp->b_bufsize;

	/*
	 * Fixup numfreebuffers count.  The bp is on an appropriate queue
	 * unless locked.  We then bump numfreebuffers if it is not B_DELWRI.
	 * We've already handled the B_INVAL case ( B_DELWRI will be clear
	 * if B_INVAL is set ).
	 */

	if ((bp->b_flags & B_LOCKED) == 0 && !(bp->b_flags & B_DELWRI))
		bufcountwakeup();

	/*
	 * Something we can maybe free.
	 */

	if (bp->b_bufsize)
		bufspacewakeup();

	/* unlock */
	BUF_UNLOCK(bp);
	bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
	splx(s);
}
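/*
 * Usage note (illustrative): a caller that wants the contents thrown
 * away sets B_INVAL (and possibly B_NOCACHE) before releasing:
 *
 *	bp->b_flags |= B_INVAL | B_NOCACHE;
 *	brelse(bp);
 */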
/*
 * Release a buffer back to the appropriate queue but do not try to free
 * it.
 *
 * bqrelse() is used by bdwrite() to requeue a delayed write, and used by
 * biodone() to requeue an async I/O on completion.  It is also used when
 * known good buffers need to be requeued but we think we may need the data
 * again soon.
 */
void
bqrelse(struct buf * bp)
{
	int s;

	s = splbio();

	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));

#if !defined(MAX_PERF)
	if (bp->b_qindex != QUEUE_NONE)
		panic("bqrelse: free buffer onto another queue???");
#endif
	if (BUF_REFCNT(bp) > 1) {
		/* do not release to free list */
		panic("bqrelse: multiple refs");
		BUF_UNLOCK(bp);
		splx(s);
		return;
	}
	if (bp->b_flags & B_LOCKED) {
		bp->b_flags &= ~B_ERROR;
		bp->b_qindex = QUEUE_LOCKED;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
	/* buffers with stale but valid contents */
	} else if (bp->b_flags & B_DELWRI) {
		bp->b_qindex = QUEUE_DIRTY;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
	} else {
		bp->b_qindex = QUEUE_CLEAN;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
	}

	runningbufspace -= bp->b_bufsize;

	if ((bp->b_flags & B_LOCKED) == 0 &&
	    ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))) {
		bufcountwakeup();
	}

	/*
	 * Something we can maybe wakeup
	 */
	if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
		bufspacewakeup();

	/* unlock */
	BUF_UNLOCK(bp);
	bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
	splx(s);
}

static void
vfs_vmio_release(bp)
	struct buf *bp;
{
	int i, s;
	vm_page_t m;

	s = splvm();
	for (i = 0; i < bp->b_npages; i++) {
		m = bp->b_pages[i];
		bp->b_pages[i] = NULL;
		/*
		 * In order to keep page LRU ordering consistent, put
		 * everything on the inactive queue.
		 */
		vm_page_unwire(m, 0);
		/*
		 * We don't mess with busy pages, it is
		 * the responsibility of the process that
		 * busied the pages to deal with them.
		 */
		if ((m->flags & PG_BUSY) || (m->busy != 0))
			continue;

		if (m->wire_count == 0) {
			vm_page_flag_clear(m, PG_ZERO);
			/*
			 * Might as well free the page if we can and it has
			 * no valid data.
			 */
			if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && m->hold_count == 0) {
				vm_page_busy(m);
				vm_page_protect(m, VM_PROT_NONE);
				vm_page_free(m);
			}
		}
	}
	bufspace -= bp->b_bufsize;
	vmiospace -= bp->b_bufsize;
	runningbufspace -= bp->b_bufsize;
	splx(s);
	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
	if (bp->b_bufsize)
		bufspacewakeup();
	bp->b_npages = 0;
	bp->b_bufsize = 0;
	bp->b_flags &= ~B_VMIO;
	if (bp->b_vp)
		brelvp(bp);
}

/*
 * Check to see if a block is currently memory resident.
 */
struct buf *
gbincore(struct vnode * vp, daddr_t blkno)
{
	struct buf *bp;
	struct bufhashhdr *bh;

	bh = bufhash(vp, blkno);
	bp = bh->lh_first;

	/* Search hash chain */
	while (bp != NULL) {
		/* hit */
		if (bp->b_vp == vp && bp->b_lblkno == blkno &&
		    (bp->b_flags & B_INVAL) == 0) {
			break;
		}
		bp = bp->b_hash.le_next;
	}
	return (bp);
}
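/*
 * Usage sketch (illustrative): vfs_bio_awrite() below probes with
 * gbincore() for clusterable neighbors of a delayed-write buffer:
 *
 *	if ((bpa = gbincore(vp, lblkno + 1)) != NULL &&
 *	    (bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
 *	    (B_DELWRI | B_CLUSTEROK))
 *		...candidate for cluster_wbuild()...
 */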
126954359Sroberto */ 127054359Sroberto if ((vp->v_type == VREG) && 127154359Sroberto (vp->v_mount != 0) && /* Only on nodes that have the size info */ 127254359Sroberto (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { 127354359Sroberto 127454359Sroberto size = vp->v_mount->mnt_stat.f_iosize; 127554359Sroberto maxcl = MAXPHYS / size; 127654359Sroberto 127754359Sroberto for (i = 1; i < maxcl; i++) { 127854359Sroberto if ((bpa = gbincore(vp, lblkno + i)) && 127954359Sroberto BUF_REFCNT(bpa) == 0 && 128054359Sroberto ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) == 128154359Sroberto (B_DELWRI | B_CLUSTEROK)) && 128254359Sroberto (bpa->b_bufsize == size)) { 128354359Sroberto if ((bpa->b_blkno == bpa->b_lblkno) || 128454359Sroberto (bpa->b_blkno != 128554359Sroberto bp->b_blkno + ((i * size) >> DEV_BSHIFT))) 128654359Sroberto break; 128754359Sroberto } else { 1288132451Sroberto break; 128954359Sroberto } 129054359Sroberto } 129154359Sroberto for (j = 1; i + j <= maxcl && j <= lblkno; j++) { 129254359Sroberto if ((bpa = gbincore(vp, lblkno - j)) && 129354359Sroberto BUF_REFCNT(bpa) == 0 && 129454359Sroberto ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) == 129554359Sroberto (B_DELWRI | B_CLUSTEROK)) && 129654359Sroberto (bpa->b_bufsize == size)) { 129754359Sroberto if ((bpa->b_blkno == bpa->b_lblkno) || 129854359Sroberto (bpa->b_blkno != 129954359Sroberto bp->b_blkno - ((j * size) >> DEV_BSHIFT))) 130054359Sroberto break; 130154359Sroberto } else { 1302182007Sroberto break; 130354359Sroberto } 130454359Sroberto } 130554359Sroberto --j; 130654359Sroberto ncl = i + j; 130754359Sroberto /* 130854359Sroberto * this is a possible cluster write 130954359Sroberto */ 131054359Sroberto if (ncl != 1) { 131154359Sroberto nwritten = cluster_wbuild(vp, size, lblkno - j, ncl); 1312182007Sroberto splx(s); 131354359Sroberto return nwritten; 131454359Sroberto } 131554359Sroberto } 131654359Sroberto 131754359Sroberto BUF_LOCK(bp, LK_EXCLUSIVE); 131854359Sroberto bremfree(bp); 131954359Sroberto bp->b_flags |= B_ASYNC; 132054359Sroberto 132154359Sroberto splx(s); 132254359Sroberto /* 132354359Sroberto * default (old) behavior, writing out only one block 1324132451Sroberto * 132554359Sroberto * XXX returns b_bufsize instead of b_bcount for nwritten? 132654359Sroberto */ 132754359Sroberto nwritten = bp->b_bufsize; 132854359Sroberto (void) VOP_BWRITE(bp->b_vp, bp); 1329200576Sroberto 1330132451Sroberto return nwritten; 133182498Sroberto} 1332132451Sroberto 1333132451Sroberto/* 1334132451Sroberto * getnewbuf: 1335132451Sroberto * 133654359Sroberto * Find and initialize a new buffer header, freeing up existing buffers 133754359Sroberto * in the bufqueues as necessary. The new buffer is returned locked. 133854359Sroberto * 133954359Sroberto * Important: B_INVAL is not set. If the caller wishes to throw the 134054359Sroberto * buffer away, the caller must set B_INVAL prior to calling brelse(). 134154359Sroberto * 134254359Sroberto * We block if: 134354359Sroberto * We have insufficient buffer headers 1344132451Sroberto * We have insufficient buffer space 1345132451Sroberto * buffer_map is too fragmented ( space reservation fails ) 134654359Sroberto * If we have to flush dirty buffers ( but we try to avoid this ) 134754359Sroberto * 1348132451Sroberto * To avoid VFS layer recursion we do not flush dirty buffers ourselves. 1349132451Sroberto * Instead we ask the buf daemon to do it for us. We attempt to 135054359Sroberto * avoid piecemeal wakeups of the pageout daemon. 

/*
 * getnewbuf:
 *
 *	Find and initialize a new buffer header, freeing up existing buffers
 *	in the bufqueues as necessary.  The new buffer is returned locked.
 *
 *	Important:  B_INVAL is not set.  If the caller wishes to throw the
 *	buffer away, the caller must set B_INVAL prior to calling brelse().
 *
 *	We block if:
 *		We have insufficient buffer headers
 *		We have insufficient buffer space
 *		buffer_map is too fragmented ( space reservation fails )
 *		If we have to flush dirty buffers ( but we try to avoid this )
 *
 *	To avoid VFS layer recursion we do not flush dirty buffers ourselves.
 *	Instead we ask the buf daemon to do it for us.  We attempt to
 *	avoid piecemeal wakeups of the pageout daemon.
 */
static struct buf *
getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
{
	struct buf *bp;
	struct buf *nbp;
	struct buf *dbp;
	int outofspace;
	int nqindex;
	int defrag = 0;

	++getnewbufcalls;
	--getnewbufrestarts;
restart:
	++getnewbufrestarts;

	/*
	 * Calculate whether we are out of buffer space.  This state is
	 * recalculated on every restart.  If we are out of space, we
	 * have to turn off defragmentation.  Setting defrag to -1 when
	 * outofspace is positive means "defrag while freeing buffers".
	 * The looping conditional will be muffed up if defrag is left
	 * positive when outofspace is positive.
	 */

	dbp = NULL;
	outofspace = 0;
	if (bufspace >= hibufspace) {
		if ((curproc && (curproc->p_flag & P_BUFEXHAUST) == 0) ||
		    bufspace >= maxbufspace) {
			outofspace = 1;
			if (defrag > 0)
				defrag = -1;
		}
	}

	/*
	 * defrag state is semi-persistent.  1 means we are flagged for
	 * defragging.  -1 means we actually defragged something.
	 */
	/* nop */

	/*
	 * Setup for scan.  If we do not have enough free buffers,
	 * we setup a degenerate case that immediately fails.  Note
	 * that if we are a specially marked process, we are allowed to
	 * dip into our reserves.
	 *
	 * Normally we want to find an EMPTYKVA buffer.  That is, a
	 * buffer with kva already allocated.  If there are no EMPTYKVA
	 * buffers we back up to the truly EMPTY buffers.  When defragging
	 * we do not bother backing up since we have to locate buffers with
	 * kva to defrag.  If we are out of space we skip both EMPTY and
	 * EMPTYKVA and dig right into the CLEAN queue.
	 *
	 * In this manner we avoid scanning unnecessary buffers.  It is very
	 * important for us to do this because the buffer cache is almost
	 * constantly out of space or in need of defragmentation.
	 */

	if (curproc && (curproc->p_flag & P_BUFEXHAUST) == 0 &&
	    numfreebuffers < lofreebuffers) {
		nqindex = QUEUE_CLEAN;
		nbp = NULL;
	} else {
		nqindex = QUEUE_EMPTYKVA;
		nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
		if (nbp == NULL) {
			if (defrag <= 0) {
				nqindex = QUEUE_EMPTY;
				nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
			}
		}
		if (outofspace || nbp == NULL) {
			nqindex = QUEUE_CLEAN;
			nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
		}
	}

	/*
	 * Run scan, possibly freeing data and/or kva mappings on the fly,
	 * depending on the queue.
	 */

	while ((bp = nbp) != NULL) {
		int qindex = nqindex;

		/*
		 * Calculate next bp ( we can only use it if we do not block
		 * or do other fancy things ).
		 */
		if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) {
			switch(qindex) {
			case QUEUE_EMPTY:
				nqindex = QUEUE_EMPTYKVA;
				if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA])))
					break;
				/* fall through */
			case QUEUE_EMPTYKVA:
				nqindex = QUEUE_CLEAN;
				if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN])))
					break;
				/* fall through */
			case QUEUE_CLEAN:
				/*
				 * nbp is NULL.
				 */
				break;
			}
		}

		/*
		 * Sanity Checks
		 */
		KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));

		/*
		 * Note: we no longer distinguish between VMIO and non-VMIO
		 * buffers.
		 */

		KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex));

		/*
		 * If we are defragging and the buffer isn't useful for fixing
		 * that problem we continue.  If we are out of space and the
		 * buffer isn't useful for fixing that problem we continue.
		 */

		if (defrag > 0 && bp->b_kvasize == 0)
			continue;
		if (outofspace > 0 && bp->b_bufsize == 0)
			continue;

		/*
		 * Start freeing the bp.  This is somewhat involved.  nbp
		 * remains valid only for QUEUE_EMPTY[KVA] bp's.
		 */

		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
			panic("getnewbuf: locked buf");
		bremfree(bp);

		if (qindex == QUEUE_CLEAN) {
			if (bp->b_flags & B_VMIO) {
				bp->b_flags &= ~B_ASYNC;
				vfs_vmio_release(bp);
			}
			if (bp->b_vp)
				brelvp(bp);
		}

		/*
		 * NOTE:  nbp is now entirely invalid.  We can only restart
		 * the scan from this point on.
		 *
		 * Get the rest of the buffer freed up.  b_kva* is still
		 * valid after this operation.
		 */

		if (bp->b_rcred != NOCRED) {
			crfree(bp->b_rcred);
			bp->b_rcred = NOCRED;
		}
		if (bp->b_wcred != NOCRED) {
			crfree(bp->b_wcred);
			bp->b_wcred = NOCRED;
		}
		if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
			(*bioops.io_deallocate)(bp);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);

		if (bp->b_bufsize)
			allocbuf(bp, 0);

		bp->b_flags = 0;
		bp->b_dev = NODEV;
		bp->b_vp = NULL;
		bp->b_blkno = bp->b_lblkno = 0;
		bp->b_offset = NOOFFSET;
		bp->b_iodone = 0;
		bp->b_error = 0;
		bp->b_resid = 0;
		bp->b_bcount = 0;
		bp->b_npages = 0;
		bp->b_dirtyoff = bp->b_dirtyend = 0;

		LIST_INIT(&bp->b_dep);

		/*
		 * Ok, now that we have a free buffer, if we are defragging
		 * we have to recover the kvaspace.  If we are out of space
		 * we have to free the buffer (which we just did), but we
		 * do not have to recover kva space unless we hit a defrag
		 * hiccup.  Being able to avoid freeing the kva space leads
		 * to a significant reduction in overhead.
		 */

		if (defrag > 0) {
			defrag = -1;
			bp->b_flags |= B_INVAL;
			bfreekva(bp);
			brelse(bp);
			goto restart;
		}

		if (outofspace > 0) {
			outofspace = -1;
			bp->b_flags |= B_INVAL;
			if (defrag < 0)
				bfreekva(bp);
			brelse(bp);
			goto restart;
		}

		/*
		 * We are done
		 */
		break;
	}

	/*
	 * If we exhausted our list, sleep as appropriate.  We may have to
	 * wakeup various daemons and write out some dirty buffers.
	 *
	 * Generally we are sleeping due to insufficient buffer space.
	 */

	if (bp == NULL) {
		int flags;
		char *waitmsg;

dosleep:
		if (defrag > 0) {
			flags = VFS_BIO_NEED_KVASPACE;
			waitmsg = "nbufkv";
		} else if (outofspace > 0) {
			waitmsg = "nbufbs";
			flags = VFS_BIO_NEED_BUFSPACE;
		} else {
			waitmsg = "newbuf";
			flags = VFS_BIO_NEED_ANY;
		}

		/* XXX */

		(void) speedup_syncer();
		needsbuffer |= flags;
		while (needsbuffer & flags) {
			if (tsleep(&needsbuffer, (PRIBIO + 4) | slpflag,
			    waitmsg, slptimeo))
				return (NULL);
		}
	} else {
		/*
		 * We finally have a valid bp.  We aren't quite out of the
		 * woods, we still have to reserve kva space.
		 */
		vm_offset_t addr = 0;

		maxsize = (maxsize + PAGE_MASK) & ~PAGE_MASK;

		if (maxsize != bp->b_kvasize) {
			bfreekva(bp);

			if (vm_map_findspace(buffer_map,
			    vm_map_min(buffer_map), maxsize, &addr)) {
				/*
				 * Uh oh.  Buffer map is too fragmented.  Try
				 * to defragment.
				 */
				if (defrag <= 0) {
					defrag = 1;
					bp->b_flags |= B_INVAL;
					brelse(bp);
					goto restart;
				}
				/*
				 * Uh oh.  We couldn't seem to defragment
				 */
				bp = NULL;
				goto dosleep;
			}
		}
		if (addr) {
			vm_map_insert(buffer_map, NULL, 0,
			    addr, addr + maxsize,
			    VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);

			bp->b_kvabase = (caddr_t) addr;
			bp->b_kvasize = maxsize;
		}
		bp->b_data = bp->b_kvabase;
	}
	return(bp);
}

/*
 * waitfreebuffers:
 *
 *	Wait for sufficient free buffers.  Only called from normal processes.
 */

static void
waitfreebuffers(int slpflag, int slptimeo)
{
	while (numfreebuffers < hifreebuffers) {
		if (numfreebuffers >= hifreebuffers)
			break;
		needsbuffer |= VFS_BIO_NEED_FREE;
		if (tsleep(&needsbuffer, (PRIBIO + 4)|slpflag, "biofre", slptimeo))
			break;
	}
}
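
/*
 * Sketch of the producer side (assumed; the actual waker lives
 * elsewhere in the kernel): code that wants the buffer daemon to run
 * promptly sets bd_request and wakes the daemon, which otherwise
 * polls every bd_interval ticks:
 */
#if 0
	if (bd_request == 0) {
		bd_request = 1;
		wakeup(&bd_request);
	}
#endif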

/*
 * buf_daemon:
 *
 *	buffer flushing daemon.  Buffers are normally flushed by the
 *	update daemon but if it cannot keep up this process starts to
 *	take the load in an attempt to prevent getnewbuf() from blocking.
 */

static struct proc *bufdaemonproc;
static int bd_interval;
static int bd_flushto;

static struct kproc_desc buf_kp = {
	"bufdaemon",
	buf_daemon,
	&bufdaemonproc
};
SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp)

static void
buf_daemon()
{
	int s;
	/*
	 * This process is allowed to take the buffer cache to the limit
	 */
	curproc->p_flag |= P_BUFEXHAUST;
	s = splbio();

	bd_interval = 5 * hz;	/* dynamically adjusted */
	bd_flushto = hidirtybuffers;	/* dynamically adjusted */

	while (TRUE) {
		bd_request = 0;

		/*
		 * Do the flush.  Limit the number of buffers we flush in one
		 * go.  The failure condition occurs when processes are writing
		 * buffers faster than we can dispose of them.  In this case
		 * we may be flushing so often that the previous set of flushes
		 * have not had time to complete, causing us to run out of
		 * physical buffers and block.
		 */
		{
			int runcount = maxbdrun;

			while (numdirtybuffers > bd_flushto && runcount) {
				--runcount;
				if (flushbufqueues() == 0)
					break;
			}
		}

		/*
		 * If nobody is requesting anything we sleep
		 */
		if (bd_request == 0)
			tsleep(&bd_request, PVM, "psleep", bd_interval);

		/*
		 * We calculate how much to add or subtract from bd_flushto
		 * and bd_interval based on how far off we are from the
		 * optimal number of dirty buffers, which is 20% below the
		 * hidirtybuffers mark.  We cannot use hidirtybuffers straight
		 * because being right on the mark will cause getnewbuf()
		 * to oscillate our wakeup.
		 *
		 * The larger the error in either direction, the more we adjust
		 * bd_flushto and bd_interval.  The time interval is adjusted
		 * by 2 seconds per whole-buffer-range of error.  This is an
		 * exponential convergence algorithm, with large errors
		 * producing large changes and small errors producing small
		 * changes.
		 */
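
		/*
		 * Worked example with hypothetical tuning values: if
		 * lodirtybuffers = 100 and hidirtybuffers = 500, then
		 * brange = 400 and middb = 500 - 400 / 5 = 420.  With
		 * numdirtybuffers = 520, deltabuf = -100, so bd_flushto
		 * drops by 100 / 20 = 5 buffers and bd_interval drops
		 * by 100 * 2 * hz / 400 = hz / 2 (half a second).
		 */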
		{
			int brange = hidirtybuffers - lodirtybuffers;
			int middb = hidirtybuffers - brange / 5;
			int deltabuf = middb - numdirtybuffers;

			bd_flushto += deltabuf / 20;
			bd_interval += deltabuf * (2 * hz) / (brange * 1);
		}
		if (bd_flushto < lodirtybuffers)
			bd_flushto = lodirtybuffers;
		if (bd_flushto > hidirtybuffers)
			bd_flushto = hidirtybuffers;
		if (bd_interval < hz / 10)
			bd_interval = hz / 10;
		if (bd_interval > 5 * hz)
			bd_interval = 5 * hz;
	}
}

/*
 * flushbufqueues:
 *
 *	Try to flush a buffer in the dirty queue.  We must be careful to
 *	free up B_INVAL buffers instead of writing them, which NFS is
 *	particularly sensitive to.
 */

static int
flushbufqueues(void)
{
	struct buf *bp;
	int r = 0;

	bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]);

	while (bp) {
		KASSERT((bp->b_flags & B_DELWRI), ("unexpected clean buffer %p", bp));
		if ((bp->b_flags & B_DELWRI) != 0) {
			if (bp->b_flags & B_INVAL) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
					panic("flushbufqueues: locked buf");
				bremfree(bp);
				brelse(bp);
				++r;
				break;
			}
			vfs_bio_awrite(bp);
			++r;
			break;
		}
		bp = TAILQ_NEXT(bp, b_freelist);
	}
	return(r);
}

/*
 * Check to see if a block is currently memory resident.
 */
struct buf *
incore(struct vnode * vp, daddr_t blkno)
{
	struct buf *bp;

	int s = splbio();
	bp = gbincore(vp, blkno);
	splx(s);
	return (bp);
}
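
/*
 * Sketch (illustrative only): incore() only answers for buffer
 * headers, while inmem() below also consults the backing VM object,
 * so the two can disagree:
 */
#if 0
	if (incore(vp, blkno) == NULL && inmem(vp, blkno)) {
		/* no buffer header, but the data is resident in VM pages */
	}
#endif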

/*
 * Returns true if no I/O is needed to access the
 * associated VM object.  This is like incore except
 * it also hunts around in the VM system for the data.
 */

int
inmem(struct vnode * vp, daddr_t blkno)
{
	vm_object_t obj;
	vm_offset_t toff, tinc, size;
	vm_page_t m;
	vm_ooffset_t off;

	if (incore(vp, blkno))
		return 1;
	if (vp->v_mount == NULL)
		return 0;
	if ((vp->v_object == NULL) || (vp->v_flag & VOBJBUF) == 0)
		return 0;

	obj = vp->v_object;
	size = PAGE_SIZE;
	if (size > vp->v_mount->mnt_stat.f_iosize)
		size = vp->v_mount->mnt_stat.f_iosize;
	off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;

	for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
		m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
		if (!m)
			return 0;
		tinc = size;
		if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK))
			tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK);
		if (vm_page_is_valid(m,
		    (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0)
			return 0;
	}
	return 1;
}

/*
 * vfs_setdirty:
 *
 *	Sets the dirty range for a buffer based on the status of the dirty
 *	bits in the pages comprising the buffer.
 *
 *	The range is limited to the size of the buffer.
 *
 *	This routine is primarily used by NFS, but is generalized for the
 *	B_VMIO case.
 */
static void
vfs_setdirty(struct buf *bp)
{
	int i;
	vm_object_t object;

	/*
	 * Degenerate case - empty buffer
	 */

	if (bp->b_bufsize == 0)
		return;

	/*
	 * We qualify the scan for modified pages on whether the
	 * object has been flushed yet.  The OBJ_WRITEABLE flag
	 * is not cleared simply by protecting pages off.
	 */

	if ((bp->b_flags & B_VMIO) == 0)
		return;

	object = bp->b_pages[0]->object;

	if ((object->flags & OBJ_WRITEABLE) && !(object->flags & OBJ_MIGHTBEDIRTY))
		printf("Warning: object %p writeable but not mightbedirty\n", object);
	if (!(object->flags & OBJ_WRITEABLE) && (object->flags & OBJ_MIGHTBEDIRTY))
		printf("Warning: object %p mightbedirty but not writeable\n", object);

	if (object->flags & (OBJ_MIGHTBEDIRTY|OBJ_CLEANING)) {
		vm_offset_t boffset;
		vm_offset_t eoffset;

		/*
		 * test the pages to see if they have been modified directly
		 * by users through the VM system.
		 */
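
		/*
		 * Worked example for the range computation below
		 * (hypothetical numbers, 4K pages): if the first dirty
		 * page is i = 1 and (b_offset & PAGE_MASK) is 512,
		 * boffset = (1 << PAGE_SHIFT) - 512 = 3584.  If the
		 * last dirty page is i = 2, eoffset =
		 * (3 << PAGE_SHIFT) - 512 = 11776, later clipped to
		 * b_bcount.
		 */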
		for (i = 0; i < bp->b_npages; i++) {
			vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
			vm_page_test_dirty(bp->b_pages[i]);
		}

		/*
		 * Calculate the encompassing dirty range, boffset and eoffset,
		 * (eoffset - boffset) bytes.
		 */

		for (i = 0; i < bp->b_npages; i++) {
			if (bp->b_pages[i]->dirty)
				break;
		}
		boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);

		for (i = bp->b_npages - 1; i >= 0; --i) {
			if (bp->b_pages[i]->dirty) {
				break;
			}
		}
		eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);

		/*
		 * Fit it to the buffer.
		 */

		if (eoffset > bp->b_bcount)
			eoffset = bp->b_bcount;

		/*
		 * If we have a good dirty range, merge with the existing
		 * dirty range.
		 */

		if (boffset < eoffset) {
			if (bp->b_dirtyoff > boffset)
				bp->b_dirtyoff = boffset;
			if (bp->b_dirtyend < eoffset)
				bp->b_dirtyend = eoffset;
		}
	}
}

/*
 * getblk:
 *
 *	Get a block given a specified block and offset into a file/device.
 *	The buffer's B_DONE bit will be cleared on return, making it almost
 *	ready for an I/O initiation.  B_INVAL may or may not be set on
 *	return.  The caller should clear B_INVAL prior to initiating a
 *	READ.
 *
 *	For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
 *	an existing buffer.
 *
 *	For a VMIO buffer, B_CACHE is modified according to the backing VM.
 *	If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
 *	and then cleared based on the backing VM.  If the previous buffer is
 *	non-0-sized but invalid, B_CACHE will be cleared.
 *
 *	If getblk() must create a new buffer, the new buffer is returned with
 *	both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
 *	case it is returned with B_INVAL clear and B_CACHE set based on the
 *	backing VM.
 *
 *	getblk() also forces a VOP_BWRITE() for any B_DELWRI buffer whose
 *	B_CACHE bit is clear.
 *
 *	What this means, basically, is that the caller should use B_CACHE to
 *	determine whether the buffer is fully valid or not and should clear
 *	B_INVAL prior to issuing a read.  If the caller intends to validate
 *	the buffer by loading its data area with something, the caller needs
 *	to clear B_INVAL.
 *	If the caller does this without issuing an I/O,
 *	the caller should set B_CACHE ( as an optimization ), else the caller
 *	should issue the I/O and biodone() will set B_CACHE if the I/O was
 *	a write attempt or if it was a successful read.  If the caller
 *	intends to issue a READ, the caller must clear B_INVAL and B_ERROR
 *	prior to issuing the READ.  biodone() will *not* clear B_INVAL.
 */
struct buf *
getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
{
	struct buf *bp;
	int s;
	struct bufhashhdr *bh;

#if !defined(MAX_PERF)
	if (size > MAXBSIZE)
		panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
#endif

	s = splbio();
loop:
	/*
	 * Block if we are low on buffers.  Certain processes are allowed
	 * to completely exhaust the buffer cache.
	 *
	 * If this check ever becomes a bottleneck it may be better to
	 * move it into the else, when gbincore() fails.  At the moment
	 * it isn't a problem.
	 */
	if (!curproc || (curproc->p_flag & P_BUFEXHAUST)) {
		if (numfreebuffers == 0) {
			if (!curproc)
				return NULL;
			needsbuffer |= VFS_BIO_NEED_ANY;
			tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf",
			    slptimeo);
		}
	} else if (numfreebuffers < lofreebuffers) {
		waitfreebuffers(slpflag, slptimeo);
	}

	if ((bp = gbincore(vp, blkno))) {
		/*
		 * Buffer is in-core.  If the buffer is not busy, it must
		 * be on a queue.
		 */

		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
			if (BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL,
			    "getblk", slpflag, slptimeo) == ENOLCK)
				goto loop;
			splx(s);
			return (struct buf *) NULL;
		}

		/*
		 * The buffer is locked.  B_CACHE is cleared if the buffer is
		 * invalid.  Otherwise, for a non-VMIO buffer, B_CACHE is set
		 * and for a VMIO buffer B_CACHE is adjusted according to the
		 * backing VM cache.
		 */
		if (bp->b_flags & B_INVAL)
			bp->b_flags &= ~B_CACHE;
		else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0)
			bp->b_flags |= B_CACHE;
		bremfree(bp);

		/*
		 * check for size inconsistencies for non-VMIO case.
		 */

		if (bp->b_bcount != size) {
			if ((bp->b_flags & B_VMIO) == 0 ||
			    (size > bp->b_kvasize)) {
				if (bp->b_flags & B_DELWRI) {
					bp->b_flags |= B_NOCACHE;
					VOP_BWRITE(bp->b_vp, bp);
				} else {
					if ((bp->b_flags & B_VMIO) &&
					    (LIST_FIRST(&bp->b_dep) == NULL)) {
						bp->b_flags |= B_RELBUF;
						brelse(bp);
					} else {
						bp->b_flags |= B_NOCACHE;
						VOP_BWRITE(bp->b_vp, bp);
					}
				}
				goto loop;
			}
		}

		/*
		 * If the size is inconsistent in the VMIO case, we can resize
		 * the buffer.  This might lead to B_CACHE getting set or
		 * cleared.  If the size has not changed, B_CACHE remains
		 * unchanged from its previous state.
		 */

		if (bp->b_bcount != size)
			allocbuf(bp, size);

		KASSERT(bp->b_offset != NOOFFSET,
		    ("getblk: no buffer offset"));

		/*
		 * A buffer with B_DELWRI set and B_CACHE clear must
		 * be committed before we can return the buffer in
		 * order to prevent the caller from issuing a read
		 * ( due to B_CACHE not being set ) and overwriting
		 * it.
		 *
		 * Most callers, including NFS and FFS, need this to
		 * operate properly either because they assume they
		 * can issue a read if B_CACHE is not set, or because
		 * ( for example ) an uncached B_DELWRI might loop due
		 * to softupdates re-dirtying the buffer.  In the latter
		 * case, B_CACHE is set after the first write completes,
		 * preventing further loops.
		 */

		if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
			VOP_BWRITE(bp->b_vp, bp);
			goto loop;
		}

		splx(s);
		bp->b_flags &= ~B_DONE;
	} else {
		/*
		 * Buffer is not in-core, create new buffer.  The buffer
		 * returned by getnewbuf() is locked.  Note that the returned
		 * buffer is also considered valid (not marked B_INVAL).
		 */
		int bsize, maxsize, vmio;
		off_t offset;

		if (vp->v_type == VBLK)
			bsize = DEV_BSIZE;
		else if (vp->v_mountedhere)
			bsize = vp->v_mountedhere->mnt_stat.f_iosize;
		else if (vp->v_mount)
			bsize = vp->v_mount->mnt_stat.f_iosize;
		else
			bsize = size;

		offset = (off_t)blkno * bsize;
		vmio = (vp->v_object != 0) && (vp->v_flag & VOBJBUF);
		maxsize = vmio ? size + (offset & PAGE_MASK) : size;
		maxsize = imax(maxsize, bsize);

		if ((bp = getnewbuf(slpflag, slptimeo, size, maxsize)) == NULL) {
			if (slpflag || slptimeo) {
				splx(s);
				return NULL;
			}
			goto loop;
		}

		/*
		 * This code is used to make sure that a buffer is not
		 * created while the getnewbuf routine is blocked.
		 * This can be a problem whether the vnode is locked or not.
		 * If the buffer is created out from under us, we have to
		 * throw away the one we just created.  There is no window
		 * race because we are safely running at splbio() from the
		 * point of the duplicate buffer creation through to here,
		 * and we've locked the buffer.
		 */
		if (gbincore(vp, blkno)) {
			bp->b_flags |= B_INVAL;
			brelse(bp);
			goto loop;
		}

		/*
		 * Insert the buffer into the hash, so that it can
		 * be found by incore.
		 */
		bp->b_blkno = bp->b_lblkno = blkno;
		bp->b_offset = offset;

		bgetvp(vp, bp);
		LIST_REMOVE(bp, b_hash);
		bh = bufhash(vp, blkno);
		LIST_INSERT_HEAD(bh, bp, b_hash);

		/*
		 * set B_VMIO bit.  allocbuf() the buffer bigger.  Since the
		 * buffer size starts out as 0, B_CACHE will be set by
		 * allocbuf() for the VMIO case prior to it testing the
		 * backing store for validity.
		 */

		if (vmio) {
			bp->b_flags |= B_VMIO;
#if defined(VFS_BIO_DEBUG)
			if (vp->v_type != VREG && vp->v_type != VBLK)
				printf("getblk: vmioing file type %d???\n", vp->v_type);
#endif
		} else {
			bp->b_flags &= ~B_VMIO;
		}

		allocbuf(bp, size);

		splx(s);
		bp->b_flags &= ~B_DONE;
	}
	return (bp);
}

/*
 * Get an empty, disassociated buffer of given size.  The buffer is initially
 * set to B_INVAL.
 */
struct buf *
geteblk(int size)
{
	struct buf *bp;
	int s;

	s = splbio();
	while ((bp = getnewbuf(0, 0, size, MAXBSIZE)) == 0);
	splx(s);
	allocbuf(bp, size);
	bp->b_flags |= B_INVAL;	/* b_dep cleared by getnewbuf() */
	return (bp);
}
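
/*
 * Usage sketch (assumed caller pattern, per the getblk() comments
 * above; not a routine in this file): a read path tests B_CACHE to
 * decide whether device I/O is actually required.
 */
#if 0
	bp = getblk(vp, blkno, size, 0, 0);
	if ((bp->b_flags & B_CACHE) == 0) {
		bp->b_flags |= B_READ;
		bp->b_flags &= ~(B_INVAL | B_ERROR);
		vfs_busy_pages(bp, 0);		/* for VMIO buffers */
		VOP_STRATEGY(vp, bp);
		error = biowait(bp);
	}
#endif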

/*
 * This code constitutes the buffer memory from either anonymous system
 * memory (in the case of non-VMIO operations) or from an associated
 * VM object (in the case of VMIO operations).  This code is able to
 * resize a buffer up or down.
 *
 * Note that this code is tricky, and has many complications to resolve
 * deadlock or inconsistent data situations.  Tread lightly!!!
 * There are B_CACHE and B_DELWRI interactions that must be dealt with by
 * the caller.  Calling this code willy nilly can result in the loss of data.
 *
 * allocbuf() only adjusts B_CACHE for VMIO buffers.  getblk() deals with
 * B_CACHE for the non-VMIO case.
 */

int
allocbuf(struct buf *bp, int size)
{
	int newbsize, mbsize;
	int i;

#if !defined(MAX_PERF)
	if (BUF_REFCNT(bp) == 0)
		panic("allocbuf: buffer not busy");

	if (bp->b_kvasize < size)
		panic("allocbuf: buffer too small");
#endif

	if ((bp->b_flags & B_VMIO) == 0) {
		caddr_t origbuf;
		int origbufsize;
		/*
		 * Just get anonymous memory from the kernel.  Don't
		 * mess with B_CACHE.
		 */
		mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
#if !defined(NO_B_MALLOC)
		if (bp->b_flags & B_MALLOC)
			newbsize = mbsize;
		else
#endif
			newbsize = round_page(size);

		if (newbsize < bp->b_bufsize) {
#if !defined(NO_B_MALLOC)
			/*
			 * malloced buffers are not shrunk
			 */
			if (bp->b_flags & B_MALLOC) {
				if (newbsize) {
					bp->b_bcount = size;
				} else {
					free(bp->b_data, M_BIOBUF);
					bufspace -= bp->b_bufsize;
					bufmallocspace -= bp->b_bufsize;
					runningbufspace -= bp->b_bufsize;
					if (bp->b_bufsize)
						bufspacewakeup();
					bp->b_data = bp->b_kvabase;
					bp->b_bufsize = 0;
					bp->b_bcount = 0;
					bp->b_flags &= ~B_MALLOC;
				}
				return 1;
			}
#endif
			vm_hold_free_pages(
			    bp,
			    (vm_offset_t) bp->b_data + newbsize,
			    (vm_offset_t) bp->b_data + bp->b_bufsize);
		} else if (newbsize > bp->b_bufsize) {
#if !defined(NO_B_MALLOC)
			/*
			 * We only use malloced memory on the first allocation,
			 * and revert to page-allocated memory when the buffer
			 * grows.
			 */
			if ( (bufmallocspace < maxbufmallocspace) &&
			    (bp->b_bufsize == 0) &&
			    (mbsize <= PAGE_SIZE/2)) {

				bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
				bp->b_bufsize = mbsize;
				bp->b_bcount = size;
				bp->b_flags |= B_MALLOC;
				bufspace += mbsize;
				bufmallocspace += mbsize;
				runningbufspace += bp->b_bufsize;
				return 1;
			}
#endif
			origbuf = NULL;
			origbufsize = 0;
#if !defined(NO_B_MALLOC)
			/*
			 * If the buffer is growing on its other-than-first allocation,
			 * then we revert to the page-allocation scheme.
			 */
			if (bp->b_flags & B_MALLOC) {
				origbuf = bp->b_data;
				origbufsize = bp->b_bufsize;
				bp->b_data = bp->b_kvabase;
				bufspace -= bp->b_bufsize;
				bufmallocspace -= bp->b_bufsize;
				runningbufspace -= bp->b_bufsize;
				if (bp->b_bufsize)
					bufspacewakeup();
				bp->b_bufsize = 0;
				bp->b_flags &= ~B_MALLOC;
				newbsize = round_page(newbsize);
			}
#endif
			vm_hold_load_pages(
			    bp,
			    (vm_offset_t) bp->b_data + bp->b_bufsize,
			    (vm_offset_t) bp->b_data + newbsize);
#if !defined(NO_B_MALLOC)
			if (origbuf) {
				bcopy(origbuf, bp->b_data, origbufsize);
				free(origbuf, M_BIOBUF);
			}
#endif
		}
	} else {
		vm_page_t m;
		int desiredpages;

		newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		desiredpages = (size == 0) ? 0 :
			num_pages((bp->b_offset & PAGE_MASK) + newbsize);

#if !defined(NO_B_MALLOC)
		if (bp->b_flags & B_MALLOC)
			panic("allocbuf: VMIO buffer can't be malloced");
#endif
		/*
		 * Set B_CACHE initially if buffer is 0 length or will become
		 * 0-length.
		 */
		if (size == 0 || bp->b_bufsize == 0)
			bp->b_flags |= B_CACHE;

		if (newbsize < bp->b_bufsize) {
			/*
			 * DEV_BSIZE aligned new buffer size is less than the
			 * DEV_BSIZE aligned existing buffer size.  Figure out
			 * if we have to remove any pages.
			 */
			if (desiredpages < bp->b_npages) {
				for (i = desiredpages; i < bp->b_npages; i++) {
					/*
					 * the page is not freed here -- it
					 * is the responsibility of
					 * vnode_pager_setsize
					 */
					m = bp->b_pages[i];
					KASSERT(m != bogus_page,
					    ("allocbuf: bogus page found"));
					while (vm_page_sleep_busy(m, TRUE, "biodep"))
						;

					bp->b_pages[i] = NULL;
					vm_page_unwire(m, 0);
				}
				pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) +
				    (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
				bp->b_npages = desiredpages;
			}
		} else if (size > bp->b_bcount) {
			/*
			 * We are growing the buffer, possibly in a
			 * byte-granular fashion.
			 */
			struct vnode *vp;
			vm_object_t obj;
			vm_offset_t toff;
			vm_offset_t tinc;

			/*
			 * Step 1, bring in the VM pages from the object,
			 * allocating them if necessary.  We must clear
			 * B_CACHE if these pages are not valid for the
			 * range covered by the buffer.
			 */

			vp = bp->b_vp;
			obj = vp->v_object;

			while (bp->b_npages < desiredpages) {
				vm_page_t m;
				vm_pindex_t pi;

				pi = OFF_TO_IDX(bp->b_offset) + bp->b_npages;
				if ((m = vm_page_lookup(obj, pi)) == NULL) {
					m = vm_page_alloc(obj, pi, VM_ALLOC_NORMAL);
					if (m == NULL) {
						VM_WAIT;
						vm_pageout_deficit += desiredpages - bp->b_npages;
					} else {
						vm_page_wire(m);
						vm_page_wakeup(m);
						bp->b_flags &= ~B_CACHE;
						bp->b_pages[bp->b_npages] = m;
						++bp->b_npages;
					}
					continue;
				}

				/*
				 * We found a page.  If we have to sleep on it,
				 * retry because it might have gotten freed out
				 * from under us.
				 *
				 * We can only test PG_BUSY here.  Blocking on
				 * m->busy might lead to a deadlock:
				 *
				 *	vm_fault->getpages->cluster_read->allocbuf
				 *
				 */

				if (vm_page_sleep_busy(m, FALSE, "pgtblk"))
					continue;

				/*
				 * We have a good page.  Should we wakeup the
				 * page daemon?
				 */
				if ((curproc != pageproc) &&
				    ((m->queue - m->pc) == PQ_CACHE) &&
				    ((cnt.v_free_count + cnt.v_cache_count) <
					(cnt.v_free_min + cnt.v_cache_min))) {
					pagedaemon_wakeup();
				}
				vm_page_flag_clear(m, PG_ZERO);
				vm_page_wire(m);
				bp->b_pages[bp->b_npages] = m;
				++bp->b_npages;
			}

			/*
			 * Step 2.  We've loaded the pages into the buffer,
			 * we have to figure out if we can still have B_CACHE
			 * set.  Note that B_CACHE is set according to the
			 * byte-granular range ( bcount and size ), not the
			 * aligned range ( newbsize ).
			 *
			 * The VM test is against m->valid, which is DEV_BSIZE
			 * aligned.  Needless to say, the validity of the data
			 * needs to also be DEV_BSIZE aligned.  Note that this
			 * fails with NFS if the server or some other client
			 * extends the file's EOF.  If our buffer is resized,
			 * B_CACHE may remain set! XXX
			 */

			toff = bp->b_bcount;
			tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK);

			while ((bp->b_flags & B_CACHE) && toff < size) {
				vm_pindex_t pi;

				if (tinc > (size - toff))
					tinc = size - toff;

				pi = ((bp->b_offset & PAGE_MASK) + toff) >>
				    PAGE_SHIFT;

				vfs_buf_test_cache(
				    bp,
				    bp->b_offset,
				    toff,
				    tinc,
				    bp->b_pages[pi]
				);
				toff += tinc;
				tinc = PAGE_SIZE;
			}

			/*
			 * Step 3, fixup the KVM pmap.  Remember that
			 * bp->b_data is relative to bp->b_offset, but
			 * bp->b_offset may be offset into the first page.
			 */

			bp->b_data = (caddr_t)
			    trunc_page((vm_offset_t)bp->b_data);
			pmap_qenter(
			    (vm_offset_t)bp->b_data,
			    bp->b_pages,
			    bp->b_npages
			);
			bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
			    (vm_offset_t)(bp->b_offset & PAGE_MASK));
		}
	}
	if (bp->b_flags & B_VMIO)
		vmiospace += (newbsize - bp->b_bufsize);
	bufspace += (newbsize - bp->b_bufsize);
	runningbufspace += (newbsize - bp->b_bufsize);
	if (newbsize < bp->b_bufsize)
		bufspacewakeup();
	bp->b_bufsize = newbsize;	/* actual buffer allocation */
	bp->b_bcount = size;		/* requested buffer size */
	return 1;
}
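
/*
 * Sketch (mirrors the actual use in getblk() above): a locked buffer
 * is resized simply by calling allocbuf() with the new byte count.
 */
#if 0
	if (bp->b_bcount != size)
		allocbuf(bp, size);	/* may set or clear B_CACHE */
#endif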

/*
 * biowait:
 *
 *	Wait for buffer I/O completion, returning error status.  The buffer
 *	is left locked and B_DONE on return.  B_EINTR is converted into an
 *	EINTR error and cleared.
 */
int
biowait(register struct buf * bp)
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0) {
#if defined(NO_SCHEDULE_MODS)
		tsleep(bp, PRIBIO, "biowait", 0);
#else
		if (bp->b_flags & B_READ)
			tsleep(bp, PRIBIO, "biord", 0);
		else
			tsleep(bp, PRIBIO, "biowr", 0);
#endif
	}
	splx(s);
	if (bp->b_flags & B_EINTR) {
		bp->b_flags &= ~B_EINTR;
		return (EINTR);
	}
	if (bp->b_flags & B_ERROR) {
		return (bp->b_error ? bp->b_error : EIO);
	} else {
		return (0);
	}
}
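
/*
 * Sketch of an asynchronous completion (assumed caller pattern;
 * my_done is a hypothetical handler): instead of biowait()ing, the
 * initiator can have biodone() below invoke a callback.
 */
#if 0
	bp->b_flags |= B_CALL | B_ASYNC;
	bp->b_iodone = my_done;		/* hypothetical completion handler */
	VOP_STRATEGY(vp, bp);		/* biodone() will call my_done(bp) */
#endif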

/*
 * biodone:
 *
 *	Finish I/O on a buffer, optionally calling a completion function.
 *	This is usually called from an interrupt so process blocking is
 *	not allowed.
 *
 *	biodone is also responsible for setting B_CACHE in a B_VMIO bp.
 *	In a non-VMIO bp, B_CACHE will be set on the next getblk()
 *	assuming B_INVAL is clear.
 *
 *	For the VMIO case, we set B_CACHE if the op was a read and no
 *	read error occurred, or if the op was a write.  B_CACHE is never
 *	set if the buffer is invalid or otherwise uncacheable.
 *
 *	biodone does not mess with B_INVAL, allowing the I/O routine or the
 *	initiator to leave B_INVAL set to brelse the buffer out of existence
 *	in the biodone routine.
 */
void
biodone(register struct buf * bp)
{
	int s;

	s = splbio();

	KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp, BUF_REFCNT(bp)));
	KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));

	bp->b_flags |= B_DONE;

	if (bp->b_flags & B_FREEBUF) {
		brelse(bp);
		splx(s);
		return;
	}

	if ((bp->b_flags & B_READ) == 0) {
		vwakeup(bp);
	}

	/* call optional completion function if requested */
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone) (bp);
		splx(s);
		return;
	}
	if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_complete)
		(*bioops.io_complete)(bp);

	if (bp->b_flags & B_VMIO) {
		int i, resid;
		vm_ooffset_t foff;
		vm_page_t m;
		vm_object_t obj;
		int iosize;
		struct vnode *vp = bp->b_vp;

		obj = vp->v_object;

#if defined(VFS_BIO_DEBUG)
		if (vp->v_usecount == 0) {
			panic("biodone: zero vnode ref count");
		}

		if (vp->v_object == NULL) {
			panic("biodone: missing VM object");
		}

		if ((vp->v_flag & VOBJBUF) == 0) {
			panic("biodone: vnode is not setup for merged cache");
		}
#endif

		foff = bp->b_offset;
		KASSERT(bp->b_offset != NOOFFSET,
		    ("biodone: no buffer offset"));

#if !defined(MAX_PERF)
		if (!obj) {
			panic("biodone: no object");
		}
#endif
#if defined(VFS_BIO_DEBUG)
		if (obj->paging_in_progress < bp->b_npages) {
			printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
			    obj->paging_in_progress, bp->b_npages);
		}
#endif

		/*
		 * Set B_CACHE if the op was a normal read and no error
		 * occurred.  B_CACHE is set for writes in the b*write()
		 * routines.
		 */
		iosize = bp->b_bcount;
		if ((bp->b_flags & (B_READ|B_FREEBUF|B_INVAL|B_NOCACHE|B_ERROR)) == B_READ) {
			bp->b_flags |= B_CACHE;
		}

		for (i = 0; i < bp->b_npages; i++) {
			int bogusflag = 0;
			m = bp->b_pages[i];
			if (m == bogus_page) {
				bogusflag = 1;
				m = vm_page_lookup(obj, OFF_TO_IDX(foff));
				if (!m) {
#if defined(VFS_BIO_DEBUG)
					printf("biodone: page disappeared\n");
#endif
					vm_object_pip_subtract(obj, 1);
					bp->b_flags &= ~B_CACHE;
					continue;
				}
				bp->b_pages[i] = m;
				pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
			}
#if defined(VFS_BIO_DEBUG)
			if (OFF_TO_IDX(foff) != m->pindex) {
				printf(
"biodone: foff(%lu)/m->pindex(%d) mismatch\n",
				    (unsigned long)foff, m->pindex);
			}
#endif
			resid = IDX_TO_OFF(m->pindex + 1) - foff;
			if (resid > iosize)
				resid = iosize;

			/*
			 * In the write case, the valid and clean bits are
			 * already changed correctly ( see bdwrite() ), so we
			 * only need to do this here in the read case.
			 */
			if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
				vfs_page_set_valid(bp, foff, i, m);
			}
			vm_page_flag_clear(m, PG_ZERO);

			/*
			 * when debugging new filesystems or buffer I/O methods, this
			 * is the most common error that pops up.  if you see this, you
			 * have not set the page busy flag correctly!!!
			 */
			if (m->busy == 0) {
#if !defined(MAX_PERF)
				printf("biodone: page busy < 0, "
				    "pindex: %d, foff: 0x(%x,%x), "
				    "resid: %d, index: %d\n",
				    (int) m->pindex, (int)(foff >> 32),
				    (int) foff & 0xffffffff, resid, i);
#endif
				if (vp->v_type != VBLK)
#if !defined(MAX_PERF)
					printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n",
					    bp->b_vp->v_mount->mnt_stat.f_iosize,
					    (int) bp->b_lblkno,
					    bp->b_flags, bp->b_npages);
				else
					printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n",
					    (int) bp->b_lblkno,
					    bp->b_flags, bp->b_npages);
				printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n",
				    m->valid, m->dirty, m->wire_count);
#endif
				panic("biodone: page busy < 0\n");
			}
			vm_page_io_finish(m);
			vm_object_pip_subtract(obj, 1);
			foff += resid;
			iosize -= resid;
		}
		if (obj)
			vm_object_pip_wakeupn(obj, 0);
	}

	/*
	 * For asynchronous completions, release the buffer now.  The brelse
	 * will do a wakeup there if necessary - so no need to do a wakeup
	 * here in the async case.  The sync case always needs to do a wakeup.
	 */

	if (bp->b_flags & B_ASYNC) {
		if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0)
			brelse(bp);
		else
			bqrelse(bp);
	} else {
		wakeup(bp);
	}
	splx(s);
}

/*
 * This routine is called in lieu of iodone in the case of
 * incomplete I/O.  This keeps the busy status for pages
 * consistent.
 */
void
vfs_unbusy_pages(struct buf * bp)
{
	int i;

	if (bp->b_flags & B_VMIO) {
		struct vnode *vp = bp->b_vp;
		vm_object_t obj = vp->v_object;

		for (i = 0; i < bp->b_npages; i++) {
			vm_page_t m = bp->b_pages[i];

			if (m == bogus_page) {
				m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
#if !defined(MAX_PERF)
				if (!m) {
					panic("vfs_unbusy_pages: page missing\n");
				}
#endif
				bp->b_pages[i] = m;
				pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
			}
			vm_object_pip_subtract(obj, 1);
			vm_page_flag_clear(m, PG_ZERO);
			vm_page_io_finish(m);
		}
		vm_object_pip_wakeupn(obj, 0);
	}
}

/*
 * vfs_page_set_valid:
 *
 *	Set the valid bits in a page based on the supplied offset.   The
 *	range is restricted to the buffer's size.
 *
 *	This routine is typically called after a read completes.
 */
static void
vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
{
	vm_ooffset_t soff, eoff;

	/*
	 * Start and end offsets in buffer.  eoff - soff may not cross a
	 * page boundary or cross the end of the buffer.  The end of the
	 * buffer, in this case, is our file EOF, not the allocation size
	 * of the buffer.
	 */
	soff = off;
	eoff = (off + PAGE_SIZE) & ~PAGE_MASK;
	if (eoff > bp->b_offset + bp->b_bcount)
		eoff = bp->b_offset + bp->b_bcount;

	/*
	 * Set valid range.  This is typically the entire buffer and thus the
	 * entire page.
	 */
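
	/*
	 * Worked example (hypothetical numbers, 4K pages): off = 0x1200
	 * yields eoff = (0x1200 + 0x1000) & ~PAGE_MASK = 0x2000, i.e.
	 * the rest of that page.  If the buffer ends at b_offset +
	 * b_bcount = 0x1e00, eoff is clipped to 0x1e00 and the call
	 * below validates bytes [0x200, 0xe00) of the page.
	 */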

/*
 *	vfs_page_set_valid:
 *
 *	Set the valid bits in a page based on the supplied offset.  The
 *	range is restricted to the buffer's size.
 *
 *	This routine is typically called after a read completes.
 */
static void
vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
{
    vm_ooffset_t soff, eoff;

    /*
     * Start and end offsets in buffer.  eoff - soff may not cross a
     * page boundary or cross the end of the buffer.  The end of the
     * buffer, in this case, is our file EOF, not the allocation size
     * of the buffer.
     */
    soff = off;
    eoff = (off + PAGE_SIZE) & ~PAGE_MASK;
    if (eoff > bp->b_offset + bp->b_bcount)
        eoff = bp->b_offset + bp->b_bcount;

    /*
     * Set valid range.  This is typically the entire buffer and thus the
     * entire page.
     */
    if (eoff > soff) {
        vm_page_set_validclean(
            m,
            (vm_offset_t) (soff & PAGE_MASK),
            (vm_offset_t) (eoff - soff)
        );
    }
}
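
/*
 * A worked example of the arithmetic above, assuming PAGE_SIZE is
 * 4096 (PAGE_MASK 0xfff): for off = 0x1200 on a buffer that extends
 * past that page, soff = 0x1200 and
 * eoff = (0x1200 + 0x1000) & ~PAGE_MASK = 0x2000, so the call
 * collapses to
 *
 *	vm_page_set_validclean(m, 0x200, 0xe00);
 *
 * marking bytes 0x200 through 0xfff of the page valid and clean.
 */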

/*
 * This routine is called before a device strategy routine.
 * It is used to tell the VM system that paging I/O is in
 * progress, and treat the pages associated with the buffer
 * almost as being PG_BUSY.  Also the object paging_in_progress
 * flag is handled to make sure that the object doesn't become
 * inconsistent.
 *
 * Since I/O has not been initiated yet, certain buffer flags
 * such as B_ERROR or B_INVAL may be in an inconsistent state
 * and should be ignored.
 */
void
vfs_busy_pages(struct buf * bp, int clear_modify)
{
    int i, bogus;

    if (bp->b_flags & B_VMIO) {
        struct vnode *vp = bp->b_vp;
        vm_object_t obj = vp->v_object;
        vm_ooffset_t foff;

        foff = bp->b_offset;
        KASSERT(bp->b_offset != NOOFFSET,
            ("vfs_busy_pages: no buffer offset"));
        vfs_setdirty(bp);

retry:
        for (i = 0; i < bp->b_npages; i++) {
            vm_page_t m = bp->b_pages[i];
            if (vm_page_sleep_busy(m, FALSE, "vbpage"))
                goto retry;
        }

        bogus = 0;
        for (i = 0; i < bp->b_npages; i++) {
            vm_page_t m = bp->b_pages[i];

            vm_page_flag_clear(m, PG_ZERO);
            if ((bp->b_flags & B_CLUSTER) == 0) {
                vm_object_pip_add(obj, 1);
                vm_page_io_start(m);
            }

            /*
             * When readying a buffer for a read (i.e.
             * clear_modify == 0), it is important to do
             * bogus_page replacement for valid pages in
             * partially instantiated buffers.  Partially
             * instantiated buffers can, in turn, occur when
             * reconstituting a buffer from its VM backing store
             * base.  We only have to do this if B_CACHE is
             * clear (which causes the I/O to occur in the
             * first place).  The replacement prevents the read
             * I/O from overwriting potentially dirty VM-backed
             * pages.  XXX bogus page replacement is, uh, bogus.
             * It may not work properly with small-block devices.
             * We need to find a better way.
             */
            vm_page_protect(m, VM_PROT_NONE);
            if (clear_modify)
                vfs_page_set_valid(bp, foff, i, m);
            else if (m->valid == VM_PAGE_BITS_ALL &&
                (bp->b_flags & B_CACHE) == 0) {
                bp->b_pages[i] = bogus_page;
                bogus++;
            }
            foff = (foff + PAGE_SIZE) & ~PAGE_MASK;
        }
        if (bogus)
            pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
    }
}
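
/*
 * Usage note: the clear_modify argument distinguishes the two I/O
 * directions.  Write paths are expected to call vfs_busy_pages(bp, 1),
 * which marks the pages valid and clean via vfs_page_set_valid();
 * read paths call vfs_busy_pages(bp, 0), which is what enables the
 * bogus_page substitution above.  Illustrative only:
 *
 *	vfs_busy_pages(bp, 0);		before a read strategy call
 *	vfs_busy_pages(bp, 1);		before a write strategy call
 */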

/*
 * Tell the VM system that the pages associated with this buffer
 * are clean.  This is used for delayed writes where the data is
 * going to go to disk eventually without additional VM intervention.
 *
 * Note that while we only really need to clean through to b_bcount, we
 * just go ahead and clean through to b_bufsize.
 */
static void
vfs_clean_pages(struct buf * bp)
{
    int i;

    if (bp->b_flags & B_VMIO) {
        vm_ooffset_t foff;

        foff = bp->b_offset;
        KASSERT(bp->b_offset != NOOFFSET,
            ("vfs_clean_pages: no buffer offset"));
        for (i = 0; i < bp->b_npages; i++) {
            vm_page_t m = bp->b_pages[i];
            vm_ooffset_t noff = (foff + PAGE_SIZE) & ~PAGE_MASK;
            vm_ooffset_t eoff = noff;

            if (eoff > bp->b_offset + bp->b_bufsize)
                eoff = bp->b_offset + bp->b_bufsize;
            vfs_page_set_valid(bp, foff, i, m);
            /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */
            foff = noff;
        }
    }
}

/*
 *	vfs_bio_set_validclean:
 *
 *	Set the range within the buffer to valid and clean.  The range is
 *	relative to the beginning of the buffer, b_offset.  Note that b_offset
 *	itself may be offset from the beginning of the first page.
 */
void
vfs_bio_set_validclean(struct buf *bp, int base, int size)
{
    if (bp->b_flags & B_VMIO) {
        int i;
        int n;

        /*
         * Fixup base to be relative to beginning of first page.
         * Set initial n to be the maximum number of bytes in the
         * first page that can be validated.
         */
        base += (bp->b_offset & PAGE_MASK);
        n = PAGE_SIZE - (base & PAGE_MASK);

        for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
            vm_page_t m = bp->b_pages[i];

            if (n > size)
                n = size;

            vm_page_set_validclean(m, base & PAGE_MASK, n);
            base += n;
            size -= n;
            n = PAGE_SIZE;
        }
    }
}

/*
 *	vfs_bio_clrbuf:
 *
 *	Clear a buffer.  This routine essentially fakes an I/O, so we need
 *	to clear B_ERROR and B_INVAL.
 *
 *	Note that while we only theoretically need to clear through b_bcount,
 *	we go ahead and clear through b_bufsize.
 */
void
vfs_bio_clrbuf(struct buf *bp)
{
    int i, mask = 0;
    caddr_t sa, ea;

    if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) {
        bp->b_flags &= ~(B_INVAL | B_ERROR);
        if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
            (bp->b_offset & PAGE_MASK) == 0) {
            mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
            if (((bp->b_pages[0]->flags & PG_ZERO) == 0) &&
                ((bp->b_pages[0]->valid & mask) != mask)) {
                bzero(bp->b_data, bp->b_bufsize);
            }
            bp->b_pages[0]->valid |= mask;
            bp->b_resid = 0;
            return;
        }
        ea = sa = bp->b_data;
        for (i = 0; i < bp->b_npages; i++, sa = ea) {
            int j = ((vm_offset_t)sa & PAGE_MASK) / DEV_BSIZE;

            ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE);
            ea = (caddr_t)(vm_offset_t)ulmin(
                (u_long)(vm_offset_t)ea,
                (u_long)(vm_offset_t)bp->b_data + bp->b_bufsize);
            mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
            if ((bp->b_pages[i]->valid & mask) == mask)
                continue;
            if ((bp->b_pages[i]->valid & mask) == 0) {
                if ((bp->b_pages[i]->flags & PG_ZERO) == 0) {
                    bzero(sa, ea - sa);
                }
            } else {
                for (; sa < ea; sa += DEV_BSIZE, j++) {
                    if (((bp->b_pages[i]->flags & PG_ZERO) == 0) &&
                        (bp->b_pages[i]->valid & (1 << j)) == 0)
                        bzero(sa, DEV_BSIZE);
                }
            }
            bp->b_pages[i]->valid |= mask;
            vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
        }
        bp->b_resid = 0;
    } else {
        clrbuf(bp);
    }
}
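
/*
 * The masks in vfs_bio_clrbuf() work because the page valid bits have
 * DEV_BSIZE granularity: bit j of m->valid covers bytes
 * [j * DEV_BSIZE, (j + 1) * DEV_BSIZE) of the page.  Assuming
 * DEV_BSIZE = 512 and PAGE_SIZE = 4096 (eight bits per page), a
 * 2048-byte range starting 1024 bytes into a page gives j = 2 and
 * ea - sa = 2048, so
 *
 *	mask = ((1 << (2048 / 512)) - 1) << 2 = 0xf << 2 = 0x3c
 *
 * i.e. bits 2 through 5, covering bytes 1024 through 3071.
 */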

/*
 * vm_hold_load_pages and vm_hold_free_pages get pages into
 * a buffer's address space.  The pages are anonymous and are
 * not associated with a file object.
 */
void
vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
{
    vm_offset_t pg;
    vm_page_t p;
    int index;

    to = round_page(to);
    from = round_page(from);
    index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;

    for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
tryagain:
        p = vm_page_alloc(kernel_object,
            ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
            VM_ALLOC_NORMAL);
        if (!p) {
            vm_pageout_deficit += (to - from) >> PAGE_SHIFT;
            VM_WAIT;
            goto tryagain;
        }
        vm_page_wire(p);
        p->valid = VM_PAGE_BITS_ALL;
        vm_page_flag_clear(p, PG_ZERO);
        pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
        bp->b_pages[index] = p;
        vm_page_wakeup(p);
    }
    bp->b_npages = index;
}

void
vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
{
    vm_offset_t pg;
    vm_page_t p;
    int index, newnpages;

    from = round_page(from);
    to = round_page(to);
    newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;

    for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
        p = bp->b_pages[index];
        if (p && (index < bp->b_npages)) {
#if !defined(MAX_PERF)
            if (p->busy) {
                printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n",
                    bp->b_blkno, bp->b_lblkno);
            }
#endif
            bp->b_pages[index] = NULL;
            pmap_kremove(pg);
            vm_page_busy(p);
            vm_page_unwire(p, 0);
            vm_page_free(p);
        }
    }
    bp->b_npages = newnpages;
}


#include "opt_ddb.h"
#ifdef DDB
#include <ddb/ddb.h>

DB_SHOW_COMMAND(buffer, db_show_buffer)
{
    /* get args */
    struct buf *bp = (struct buf *)addr;

    if (!have_addr) {
        db_printf("usage: show buffer <addr>\n");
        return;
    }

    db_printf("b_flags = 0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS);
    db_printf("b_error = %d, b_bufsize = %ld, b_bcount = %ld, "
        "b_resid = %ld\nb_dev = (%d,%d), b_data = %p, "
        "b_blkno = %d, b_pblkno = %d\n",
        bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
        major(bp->b_dev), minor(bp->b_dev),
        bp->b_data, bp->b_blkno, bp->b_pblkno);
    if (bp->b_npages) {
        int i;

        db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
        for (i = 0; i < bp->b_npages; i++) {
            vm_page_t m;

            m = bp->b_pages[i];
            db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object,
                (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m));
            if ((i + 1) < bp->b_npages)
                db_printf(",");
        }
        db_printf("\n");
    }
}
#endif /* DDB */