/* vfs_bio.c revision 28465 */
/*
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. This work was done expressly for inclusion into FreeBSD.  Other use
 *    is allowed if this notation is included.
 * 5. Modifications may be freely made to this file if the above conditions
 *    are met.
 *
 * $Id: vfs_bio.c,v 1.122 1997/08/09 10:13:12 dyson Exp $
 */

/*
 * this file contains a new buffer I/O scheme implementing a coherent
 * VM object and buffer cache scheme.  Pains have been taken to make
 * sure that the performance degradation associated with schemes such
 * as this is not realized.
 *
 * Author:  John S. Dyson
 * Significant help during the development and debugging phases
 * had been provided by David Greenman, also of the FreeBSD core team.
 */

#include "opt_bounce.h"

#define VMIO
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_prot.h>
#include <vm/vm_kern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <sys/proc.h>	/* XXX duplicate of <sys/proc.h> above; harmless (include guards) */

#include <miscfs/specfs/specdev.h>

/*
 * The vfs_update kernel thread ("update") periodically flushes dirty
 * buffers; it is started at boot via the SYSINIT_KT entry below.
 */
static void vfs_update __P((void));
static struct proc *updateproc;
static struct kproc_desc up_kp = {
	"update",
	vfs_update,
	&updateproc
};
SYSINIT_KT(update, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)

struct buf *buf;		/* buffer header pool */
struct swqueue bswlist;		/* free list for swap buffer headers */

int count_lock_queue __P((void));

/* Forward declarations for the static helpers defined later in this file. */
static void vm_hold_free_pages(struct buf * bp, vm_offset_t from,
		vm_offset_t to);
static void vm_hold_load_pages(struct buf * bp, vm_offset_t from,
		vm_offset_t to);
static void vfs_buf_set_valid(struct buf *bp, vm_ooffset_t foff,
			      vm_offset_t off, vm_offset_t size,
			      vm_page_t m);
static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off,
			       int pageno, vm_page_t m);
static void vfs_clean_pages(struct buf * bp);
static void vfs_setdirty(struct buf *bp);
static void vfs_vmio_release(struct buf *bp);
static void flushdirtybuffers(int slpflag, int slptimeo);

/* Bitmask of VFS_BIO_NEED_* conditions that sleepers are waiting on. */
int needsbuffer;

/*
 * Internal update daemon, process 3
 *	The variable vfs_update_wakeup allows for internal syncs.
 */
int vfs_update_wakeup;


/*
 * buffers base kva
 */

/*
 * bogus page -- for I/O to/from partially complete buffers
 * this is a temporary solution to the problem, but it is not
 * really that bad.  it would be better to split the buffer
 * for input in the case of buffers partially already in memory,
 * but the code is intricate enough already.
 */
vm_page_t bogus_page;
static vm_offset_t bogus_offset;

/* Space accounting and tunable limits for the buffer cache. */
static int bufspace, maxbufspace, vmiospace, maxvmiobufspace,
	bufmallocspace, maxbufmallocspace;
/* Dirty/free buffer counts and their low/high water marks. */
int numdirtybuffers, lodirtybuffers, hidirtybuffers;
static int numfreebuffers, lofreebuffers, hifreebuffers;

/* Export the counters (read-only) and the water marks (tunable) via sysctl. */
SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD,
	&numdirtybuffers, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW,
	&lodirtybuffers, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW,
	&hidirtybuffers, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD,
	&numfreebuffers, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW,
	&lofreebuffers, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW,
	&hifreebuffers, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW,
	&maxbufspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD,
	&bufspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, maxvmiobufspace, CTLFLAG_RW,
	&maxvmiobufspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, vmiospace, CTLFLAG_RD,
	&vmiospace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW,
	&maxbufmallocspace, 0, "");
SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD,
	&bufmallocspace, 0, "");

/* Buffer hash table (by vnode/blkno) and the chain for invalid buffers. */
static struct bufhashhdr bufhashtbl[BUFHSZ], invalhash;
/* The free-list queues: EMPTY, AGE, LRU, LOCKED, etc. */
static struct bqueues bufqueues[BUFFER_QUEUES];

extern int vm_swap_size;
#define BUF_MAXUSE 24		/* uses before an LRU buffer is aged out */

/* Reasons a thread may be sleeping on `needsbuffer' (see vfs_bio_need_satisfy). */
#define VFS_BIO_NEED_ANY 1
#define VFS_BIO_NEED_LOWLIMIT 2
#define VFS_BIO_NEED_FREE 4

/*
 * Initialize buffer headers and related structures.
 *
 * Called once at boot.  Builds the (initially empty) buffer hash table
 * and free-list queues, places every buffer header on the EMPTY queue,
 * computes the buffer-cache tuning limits from `nbuf', and allocates
 * the shared `bogus_page' used for partially-valid buffer I/O.
 */
void
bufinit()
{
	struct buf *bp;
	int i;

	TAILQ_INIT(&bswlist);
	LIST_INIT(&invalhash);

	/* first, make a null hash table */
	for (i = 0; i < BUFHSZ; i++)
		LIST_INIT(&bufhashtbl[i]);

	/* next, make a null set of free lists */
	for (i = 0; i < BUFFER_QUEUES; i++)
		TAILQ_INIT(&bufqueues[i]);

	/* finally, initialize each buffer header and stick on empty q */
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bzero(bp, sizeof *bp);
		bp->b_flags = B_INVAL;	/* we're just an empty header */
		bp->b_dev = NODEV;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_qindex = QUEUE_EMPTY;
		bp->b_vnbufs.le_next = NOLIST;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
	}
/*
 * maxbufspace is currently calculated to support all filesystem blocks
 * to be 8K.  If you happen to use a 16K filesystem, the size of the buffer
 * cache is still the same as it would be for 8K filesystems.  This
 * keeps the size of the buffer cache "in check" for big block filesystems.
 */
	maxbufspace = (nbuf + 8) * DFLTBSIZE;
/*
 * reserve 1/3 of the buffers for metadata (VDIR) which might not be VMIO'ed
 */
	maxvmiobufspace = 2 * maxbufspace / 3;
/*
 * Limit the amount of malloc memory since it is wired permanently into
 * the kernel space.  Even though this is accounted for in the buffer
 * allocation, we don't want the malloced region to grow uncontrolled.
 * The malloc scheme improves memory utilization significantly on average
 * (small) directories.
 */
	maxbufmallocspace = maxbufspace / 20;

/*
 * Remove the probability of deadlock conditions by limiting the
 * number of dirty buffers.
 */
	hidirtybuffers = nbuf / 6 + 20;
	lodirtybuffers = nbuf / 12 + 10;
	numdirtybuffers = 0;
	lofreebuffers = nbuf / 18 + 5;
	hifreebuffers = 2 * lofreebuffers;
	numfreebuffers = nbuf;

	/* One pageable kva slot backed by the shared "bogus" page. */
	bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
	bogus_page = vm_page_alloc(kernel_object,
			((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
			VM_ALLOC_NORMAL);

}

/*
 * Free the kva allocation for a buffer.
 * Must be called only at splbio or higher,
 *  as this is the only locking for buffer_map.
 */
static void
bfreekva(struct buf * bp)
{
	/* Nothing mapped -- nothing to release. */
	if (bp->b_kvasize == 0)
		return;

	vm_map_delete(buffer_map,
		(vm_offset_t) bp->b_kvabase,
		(vm_offset_t) bp->b_kvabase + bp->b_kvasize);

	bp->b_kvasize = 0;

}

/*
 * remove the buffer from the appropriate free list
 */
void
bremfree(struct buf * bp)
{
	int s = splbio();

	if (bp->b_qindex != QUEUE_NONE) {
		TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
		bp->b_qindex = QUEUE_NONE;
	} else {
#if !defined(MAX_PERF)
		panic("bremfree: removing a buffer when not on a queue");
#endif
	}
	/*
	 * Invalid buffers, and buffers that are neither delayed-write nor
	 * locked, counted as "free"; taking one off a queue uses it up.
	 */
	if ((bp->b_flags & B_INVAL) ||
		(bp->b_flags & (B_DELWRI|B_LOCKED)) == 0)
		--numfreebuffers;
	splx(s);
}

/*
 * Get a buffer with the specified data.  Look in the cache first.
268196470Srnoland */ 269196470Srnolandint 270196470Srnolandbread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred, 271196470Srnoland struct buf ** bpp) 272196470Srnoland{ 273196470Srnoland struct buf *bp; 274196470Srnoland 275196470Srnoland bp = getblk(vp, blkno, size, 0, 0); 276196470Srnoland *bpp = bp; 277196470Srnoland 278196470Srnoland /* if not found in cache, do some I/O */ 279196470Srnoland if ((bp->b_flags & B_CACHE) == 0) { 280196470Srnoland if (curproc != NULL) 281189499Srnoland curproc->p_stats->p_ru.ru_inblock++; 282189499Srnoland bp->b_flags |= B_READ; 283182080Srnoland bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL); 28495584Sanholt if (bp->b_rcred == NOCRED) { 28595584Sanholt if (cred != NOCRED) 28695584Sanholt crhold(cred); 28795584Sanholt bp->b_rcred = cred; 288122580Sanholt } 289157617Sanholt vfs_busy_pages(bp, 0); 290157617Sanholt VOP_STRATEGY(bp); 291122580Sanholt return (biowait(bp)); 292119895Sanholt } 293119895Sanholt return (0); 294119895Sanholt} 29595584Sanholt 29695584Sanholt/* 29795584Sanholt * Operates like bread, but also starts asynchronous I/O on 29895584Sanholt * read-ahead blocks. 
299145132Sanholt */ 300145132Sanholtint 30195584Sanholtbreadn(struct vnode * vp, daddr_t blkno, int size, 302112015Sanholt daddr_t * rablkno, int *rabsize, 30395584Sanholt int cnt, struct ucred * cred, struct buf ** bpp) 30495584Sanholt{ 305112015Sanholt struct buf *bp, *rabp; 306189499Srnoland int i; 307189499Srnoland int rv = 0, readwait = 0; 308112015Sanholt 309112015Sanholt *bpp = bp = getblk(vp, blkno, size, 0, 0); 310112015Sanholt 311112015Sanholt /* if not found in cache, do some I/O */ 312112015Sanholt if ((bp->b_flags & B_CACHE) == 0) { 313112015Sanholt if (curproc != NULL) 314112015Sanholt curproc->p_stats->p_ru.ru_inblock++; 315112015Sanholt bp->b_flags |= B_READ; 316112015Sanholt bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL); 317112015Sanholt if (bp->b_rcred == NOCRED) { 31895584Sanholt if (cred != NOCRED) 319112015Sanholt crhold(cred); 32095584Sanholt bp->b_rcred = cred; 32195584Sanholt } 32295584Sanholt vfs_busy_pages(bp, 0); 32395584Sanholt VOP_STRATEGY(bp); 32495584Sanholt ++readwait; 32595584Sanholt } 32695584Sanholt for (i = 0; i < cnt; i++, rablkno++, rabsize++) { 32795584Sanholt if (inmem(vp, *rablkno)) 32895584Sanholt continue; 32995584Sanholt rabp = getblk(vp, *rablkno, *rabsize, 0, 0); 33095584Sanholt 33195584Sanholt if ((rabp->b_flags & B_CACHE) == 0) { 33295584Sanholt if (curproc != NULL) 33395584Sanholt curproc->p_stats->p_ru.ru_inblock++; 33495584Sanholt rabp->b_flags |= B_READ | B_ASYNC; 33595584Sanholt rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL); 33695584Sanholt if (rabp->b_rcred == NOCRED) { 337145132Sanholt if (cred != NOCRED) 338113995Sanholt crhold(cred); 339113995Sanholt rabp->b_rcred = cred; 340113995Sanholt } 341119895Sanholt vfs_busy_pages(rabp, 0); 34295584Sanholt VOP_STRATEGY(rabp); 343112015Sanholt } else { 344112015Sanholt brelse(rabp); 345112015Sanholt } 346119895Sanholt } 347112015Sanholt 348119895Sanholt if (readwait) { 349112015Sanholt rv = biowait(bp); 350112015Sanholt } 351112015Sanholt return (rv); 352145132Sanholt} 

/*
 * Write, release buffer on completion.  (Done by iodone
 * if async.)
 *
 * Returns 0, or the error status from biowait() for a synchronous
 * write.  The caller must own the buffer (B_BUSY set).
 */
int
bwrite(struct buf * bp)
{
	/* Snapshot the flags; VOP_STRATEGY/biodone may change bp->b_flags. */
	int oldflags = bp->b_flags;

	/* Invalid buffers are simply discarded, never written. */
	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return (0);
	}
#if !defined(MAX_PERF)
	if (!(bp->b_flags & B_BUSY))
		panic("bwrite: buffer is not busy???");
#endif

	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	bp->b_flags |= B_WRITEINPROG;

	/* The write resolves a delayed-write; update the dirty count. */
	if ((oldflags & B_DELWRI) == B_DELWRI) {
		--numdirtybuffers;
		reassignbuf(bp, bp->b_vp);
	}

	bp->b_vp->v_numoutput++;
	vfs_busy_pages(bp, 1);
	if (curproc != NULL)
		curproc->p_stats->p_ru.ru_oublock++;
	VOP_STRATEGY(bp);

	/*
	 * Handle ordered writes here.
	 * If the write was originally flagged as ordered,
	 * then we check to see if it was converted to async.
	 * If it was converted to async, and is done now, then
	 * we release the buffer.  Otherwise we clear the
	 * ordered flag because it is not needed anymore.
	 *
	 * Note that biodone has been modified so that it does
	 * not release ordered buffers.  This allows us to have
	 * a chance to determine whether or not the driver
	 * has set the async flag in the strategy routine.  Otherwise
	 * if biodone was not modified, then the buffer may have been
	 * reused before we have had a chance to check the flag.
	 */

	if ((oldflags & B_ORDERED) == B_ORDERED) {
		int s;
		s = splbio();
		if (bp->b_flags & B_ASYNC) {
			/* Driver converted it to async; release if finished. */
			if ((bp->b_flags & B_DONE)) {
				if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0)
					brelse(bp);
				else
					bqrelse(bp);
			}
			splx(s);
			return (0);
		} else {
			bp->b_flags &= ~B_ORDERED;
		}
		splx(s);
	}

	/* Synchronous write: wait for I/O completion and release. */
	if ((oldflags & B_ASYNC) == 0) {
		int rtval = biowait(bp);

		if (oldflags & B_DELWRI) {
			reassignbuf(bp, bp->b_vp);
		}
		brelse(bp);
		return (rtval);
	}
	return (0);
}

/*
 * Default VOP_BWRITE implementation: just bwrite() the buffer.
 */
int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return (bwrite(ap->a_bp));
}

/*
 * A buffer has become available; wake up anyone sleeping on `needsbuffer'
 * and clear whichever VFS_BIO_NEED_* conditions are now satisfied.
 */
void
vfs_bio_need_satisfy(void) {
	++numfreebuffers;
	if (!needsbuffer)
		return;
	if (numdirtybuffers < lodirtybuffers) {
		needsbuffer &= ~(VFS_BIO_NEED_ANY | VFS_BIO_NEED_LOWLIMIT);
	} else {
		needsbuffer &= ~VFS_BIO_NEED_ANY;
	}
	if (numfreebuffers >= hifreebuffers) {
		needsbuffer &= ~VFS_BIO_NEED_FREE;
	}
	wakeup(&needsbuffer);
}

/*
456182080Srnoland * Delayed write. (Buffer is marked dirty). 45795584Sanholt */ 458145132Sanholtvoid 45995584Sanholtbdwrite(struct buf * bp) 460145132Sanholt{ 46195584Sanholt 462189499Srnoland#if !defined(MAX_PERF) 463189499Srnoland if ((bp->b_flags & B_BUSY) == 0) { 464189499Srnoland panic("bdwrite: buffer is not busy"); 465189499Srnoland } 466182080Srnoland#endif 467182080Srnoland 468182080Srnoland if (bp->b_flags & B_INVAL) { 469145132Sanholt brelse(bp); 470182080Srnoland return; 471182080Srnoland } 47295584Sanholt if (bp->b_flags & B_TAPE) { 473189499Srnoland bawrite(bp); 474189499Srnoland return; 475189499Srnoland } 476189499Srnoland bp->b_flags &= ~(B_READ|B_RELBUF); 477112015Sanholt if ((bp->b_flags & B_DELWRI) == 0) { 478182080Srnoland bp->b_flags |= B_DONE | B_DELWRI; 479182080Srnoland reassignbuf(bp, bp->b_vp); 480182080Srnoland ++numdirtybuffers; 481112015Sanholt } 482182080Srnoland 483182080Srnoland /* 484182080Srnoland * This bmap keeps the system from needing to do the bmap later, 485182080Srnoland * perhaps when the system is attempting to do a sync. Since it 486145132Sanholt * is likely that the indirect block -- or whatever other datastructure 487182080Srnoland * that the filesystem needs is still in memory now, it is a good 488189499Srnoland * thing to do this. Note also, that if the pageout daemon is 489182080Srnoland * requesting a sync -- there might not be enough memory to do 490189499Srnoland * the bmap then... So, this is important to do. 491182080Srnoland */ 492182080Srnoland if( bp->b_lblkno == bp->b_blkno) { 493112015Sanholt VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); 494152909Sanholt } 495152909Sanholt 496152909Sanholt /* 497189499Srnoland * Set the *dirty* buffer range based upon the VM system dirty pages. 
498182080Srnoland */ 499189499Srnoland vfs_setdirty(bp); 500182080Srnoland 501182080Srnoland /* 502189499Srnoland * We need to do this here to satisfy the vnode_pager and the 503189499Srnoland * pageout daemon, so that it thinks that the pages have been 504152909Sanholt * "cleaned". Note that since the pages are in a delayed write 505189499Srnoland * buffer -- the VFS layer "will" see that the pages get written 506152909Sanholt * out on the next sync, or perhaps the cluster will be completed. 507148211Sanholt */ 508182080Srnoland vfs_clean_pages(bp); 509148211Sanholt bqrelse(bp); 510182080Srnoland 511182080Srnoland if (numdirtybuffers >= hidirtybuffers) 512182080Srnoland flushdirtybuffers(0, 0); 513148211Sanholt 514189499Srnoland return; 515189499Srnoland} 516189499Srnoland 517189499Srnoland/* 518189499Srnoland * Asynchronous write. 519189499Srnoland * Start output on a buffer, but do not wait for it to complete. 520189499Srnoland * The buffer is released when the output completes. 521189499Srnoland */ 522189499Srnolandvoid 523189499Srnolandbawrite(struct buf * bp) 524189499Srnoland{ 525189499Srnoland bp->b_flags |= B_ASYNC; 526189499Srnoland (void) VOP_BWRITE(bp); 527189499Srnoland} 528196470Srnoland 529196470Srnoland/* 530196470Srnoland * Ordered write. 531196470Srnoland * Start output on a buffer, but only wait for it to complete if the 532196470Srnoland * output device cannot guarantee ordering in some other way. Devices 533189499Srnoland * that can perform asynchronous ordered writes will set the B_ASYNC 534196470Srnoland * flag in their strategy routine. 535196470Srnoland * The buffer is released when the output completes. 536196470Srnoland */ 537196470Srnolandint 538196470Srnolandbowrite(struct buf * bp) 539196470Srnoland{ 540196470Srnoland bp->b_flags |= B_ORDERED; 541196470Srnoland return (VOP_BWRITE(bp)); 542196470Srnoland} 543196470Srnoland 544196470Srnoland/* 545196470Srnoland * Release a buffer. 
 */
void
brelse(struct buf * bp)
{
	int s;

	/* Cluster buffers have their own lightweight release path. */
	if (bp->b_flags & B_CLUSTER) {
		relpbuf(bp);
		return;
	}
	/* anyone need a "free" block? */
	s = splbio();

	/* anyone need this block? */
	if (bp->b_flags & B_WANTED) {
		bp->b_flags &= ~(B_WANTED | B_AGE);
		wakeup(bp);
	}

	/* Locked buffers persist; forget any error so they stay usable. */
	if (bp->b_flags & B_LOCKED)
		bp->b_flags &= ~B_ERROR;

	/*
	 * Errored, no-cache, or contentless buffers are invalidated:
	 * drop the dirty accounting and (for non-VMIO buffers) detach
	 * from the vnode.
	 */
	if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
	    (bp->b_bufsize <= 0)) {
		bp->b_flags |= B_INVAL;
		if (bp->b_flags & B_DELWRI)
			--numdirtybuffers;
		bp->b_flags &= ~(B_DELWRI | B_CACHE);
		if (((bp->b_flags & B_VMIO) == 0) && bp->b_vp) {
			if (bp->b_bufsize)
				allocbuf(bp, 0);
			brelvp(bp);
		}
	}

	/*
	 * VMIO buffer rundown.  It is not very necessary to keep a VMIO buffer
	 * constituted, so the B_INVAL flag is used to *invalidate* the buffer,
	 * but the VM object is kept around.  The B_NOCACHE flag is used to
	 * invalidate the pages in the VM object.
	 *
	 * If the buffer is a partially filled NFS buffer, keep it
	 * since invalidating it now will lose information.  The valid
	 * flags in the vm_pages have only DEV_BSIZE resolution but
	 * the b_validoff, b_validend fields have byte resolution.
	 * This can avoid unnecessary re-reads of the buffer.
	 * XXX this seems to cause performance problems.
	 */
	if ((bp->b_flags & B_VMIO)
	    && !(bp->b_vp->v_tag == VT_NFS &&
		 (bp->b_flags & B_DELWRI) != 0)
#ifdef notdef
	    && (bp->b_vp->v_tag != VT_NFS
		|| (bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR))
		|| bp->b_validend == 0
		|| (bp->b_validoff == 0
		    && bp->b_validend == bp->b_bufsize))
#endif
	    ) {
		vm_ooffset_t foff;
		vm_object_t obj;
		int i, resid;
		vm_page_t m;
		struct vnode *vp;
		int iototal = bp->b_bufsize;

		vp = bp->b_vp;

#if !defined(MAX_PERF)
		if (!vp)
			panic("brelse: missing vp");
#endif

		if (bp->b_npages) {
			vm_pindex_t poff;
			obj = (vm_object_t) vp->v_object;
			/*
			 * Compute the buffer's file offset: device vnodes
			 * address by DEV_BSIZE blocks, others by the
			 * filesystem I/O size.
			 */
			if (vp->v_type == VBLK)
				foff = ((vm_ooffset_t) bp->b_lblkno) << DEV_BSHIFT;
			else
				foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
			poff = OFF_TO_IDX(foff);
			for (i = 0; i < bp->b_npages; i++) {
				m = bp->b_pages[i];
				/*
				 * Replace any bogus_page placeholders with
				 * the real page and re-enter the mappings.
				 */
				if (m == bogus_page) {
					m = vm_page_lookup(obj, poff + i);
#if !defined(MAX_PERF)
					if (!m) {
						panic("brelse: page missing\n");
					}
#endif
					bp->b_pages[i] = m;
					pmap_qenter(trunc_page(bp->b_data),
						bp->b_pages, bp->b_npages);
				}
				/* Bytes of this buffer covered by page i. */
				resid = IDX_TO_OFF(m->pindex+1) - foff;
				if (resid > iototal)
					resid = iototal;
				if (resid > 0) {
					/*
					 * Don't invalidate the page if the local machine has already
					 * modified it.  This is the lesser of two evils, and should
					 * be fixed.
					 */
					if (bp->b_flags & (B_NOCACHE | B_ERROR)) {
						vm_page_test_dirty(m);
						if (m->dirty == 0) {
							vm_page_set_invalid(m, (vm_offset_t) foff, resid);
							if (m->valid == 0)
								vm_page_protect(m, VM_PROT_NONE);
						}
					}
					/*
					 * A partially-valid page makes the
					 * whole buffer unusable for caching.
					 */
					if (resid >= PAGE_SIZE) {
						if ((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) {
							bp->b_flags |= B_INVAL;
						}
					} else {
						if (!vm_page_is_valid(m,
							(((vm_offset_t) bp->b_data) & PAGE_MASK), resid)) {
							bp->b_flags |= B_INVAL;
						}
					}
				}
				foff += resid;
				iototal -= resid;
			}
		}
		if (bp->b_flags & (B_INVAL | B_RELBUF))
			vfs_vmio_release(bp);
	}
#if !defined(MAX_PERF)
	if (bp->b_qindex != QUEUE_NONE)
		panic("brelse: free buffer onto another queue???");
#endif

	/* enqueue */
	/* buffers with no memory */
	if (bp->b_bufsize == 0) {
		bp->b_flags |= B_INVAL;
		bp->b_qindex = QUEUE_EMPTY;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;
		/*
		 * Get rid of the kva allocation *now*
		 */
		bfreekva(bp);

	/* buffers with junk contents */
	} else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
		bp->b_flags |= B_INVAL;
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
		LIST_REMOVE(bp, b_hash);
		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
		bp->b_dev = NODEV;

	/* buffers that are locked */
	} else if (bp->b_flags & B_LOCKED) {
		bp->b_qindex = QUEUE_LOCKED;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);

	/* buffers with stale but valid contents */
	} else if (bp->b_flags & B_AGE) {
		bp->b_qindex = QUEUE_AGE;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);

	/* buffers with valid and quite potentially reuseable contents */
	} else {
		bp->b_qindex = QUEUE_LRU;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
	}

	/* The buffer became reusable -- update counts and wake sleepers. */
	if ((bp->b_flags & B_INVAL) ||
	    (bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) {
		if (bp->b_flags & B_DELWRI) {
			--numdirtybuffers;
			bp->b_flags &= ~B_DELWRI;
		}
		vfs_bio_need_satisfy();
	}

	/* unlock */
	bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
		B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
	splx(s);
}

/*
 * Release a buffer.
 */
void
bqrelse(struct buf * bp)
{
	int s;

	s = splbio();

	/* anyone need this block? */
	if (bp->b_flags & B_WANTED) {
		bp->b_flags &= ~(B_WANTED | B_AGE);
		wakeup(bp);
	}

#if !defined(MAX_PERF)
	if (bp->b_qindex != QUEUE_NONE)
		panic("bqrelse: free buffer onto another queue???");
#endif

	/* Locked buffers go on the LOCKED queue; everything else to LRU. */
	if (bp->b_flags & B_LOCKED) {
		bp->b_flags &= ~B_ERROR;
		bp->b_qindex = QUEUE_LOCKED;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
		/* buffers with stale but valid contents */
	} else {
		bp->b_qindex = QUEUE_LRU;
		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
	}

	if ((bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) {
		vfs_bio_need_satisfy();
	}

	/* unlock */
	bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
		B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
	splx(s);
}

/*
 * Tear down the VM backing of a VMIO buffer: unwire its pages (caching,
 * deactivating, or freeing them as appropriate), drop the mappings, and
 * detach the buffer from its vnode.
 */
static void
vfs_vmio_release(bp)
	struct buf *bp;
{
	int i;
	vm_page_t m;

	for (i = 0; i < bp->b_npages; i++) {
		m = bp->b_pages[i];
		bp->b_pages[i] = NULL;
		vm_page_unwire(m);
		/*
		 * We don't mess with busy pages, it is
		 * the responsibility of the process that
		 * busied the pages to deal with them.
		 */
		if ((m->flags & PG_BUSY) || (m->busy != 0))
			continue;

		if (m->wire_count == 0) {

			if (m->flags & PG_WANTED) {
				m->flags &= ~PG_WANTED;
				wakeup(m);
			}

			/*
			 * If this is an async free -- we cannot place
			 * pages onto the cache queue.  If it is an
			 * async free, then we don't modify any queues.
			 * This is probably in error (for perf reasons),
			 * and we will eventually need to build
			 * a more complete infrastructure to support I/O
			 * rundown.
			 */
			if ((bp->b_flags & B_ASYNC) == 0) {

				/*
				 * In the case of sync buffer frees, we can do pretty much
				 * anything to any of the memory queues.  Specifically,
				 * the cache queue is okay to be modified.
				 */
				if (m->valid) {
					if(m->dirty == 0)
						vm_page_test_dirty(m);
					/*
					 * this keeps pressure off of the process memory
					 */
					if (m->dirty == 0 && m->hold_count == 0)
						vm_page_cache(m);
					else
						vm_page_deactivate(m);
				} else if (m->hold_count == 0) {
					/* No valid data and unheld: free it. */
					vm_page_protect(m, VM_PROT_NONE);
					vm_page_free(m);
				}
			} else {
				/*
				 * If async, then at least we clear the
				 * act_count.
				 */
				m->act_count = 0;
			}
		}
	}
	/* Return the buffer's space accounting and unmap its kva. */
	bufspace -= bp->b_bufsize;
	vmiospace -= bp->b_bufsize;
	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
	bp->b_npages = 0;
	bp->b_bufsize = 0;
	bp->b_flags &= ~B_VMIO;
	if (bp->b_vp)
		brelvp(bp);
}

/*
 * Check to see if a block is currently memory resident.
 *
 * Returns the buffer for (vp, blkno) if present and not B_INVAL,
 * otherwise NULL.  Only looks in the buffer hash; does no I/O.
 */
struct buf *
gbincore(struct vnode * vp, daddr_t blkno)
{
	struct buf *bp;
	struct bufhashhdr *bh;

	bh = BUFHASH(vp, blkno);
	bp = bh->lh_first;

	/* Search hash chain */
	while (bp != NULL) {
		/* hit */
		if (bp->b_vp == vp && bp->b_lblkno == blkno &&
		    (bp->b_flags & B_INVAL) == 0) {
			break;
		}
		bp = bp->b_hash.le_next;
	}
	return (bp);
}

/*
 * this routine implements clustered async writes for
 * clearing out B_DELWRI buffers...  This is much better
 * than the old way of writing only one buffer at a time.
87895584Sanholt */ 87995584Sanholtint 88095584Sanholtvfs_bio_awrite(struct buf * bp) 88195584Sanholt{ 88295584Sanholt int i; 88395584Sanholt daddr_t lblkno = bp->b_lblkno; 88495584Sanholt struct vnode *vp = bp->b_vp; 88595584Sanholt int s; 88695584Sanholt int ncl; 88795584Sanholt struct buf *bpa; 88895584Sanholt int nwritten; 88995584Sanholt 89095584Sanholt s = splbio(); 89195584Sanholt /* 89295584Sanholt * right now we support clustered writing only to regular files 89395584Sanholt */ 89495584Sanholt if ((vp->v_type == VREG) && 89595584Sanholt (vp->v_mount != 0) && /* Only on nodes that have the size info */ 896122580Sanholt (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { 89795584Sanholt int size; 89895584Sanholt int maxcl; 89995584Sanholt 900182080Srnoland size = vp->v_mount->mnt_stat.f_iosize; 901182080Srnoland maxcl = MAXPHYS / size; 902182080Srnoland 903182080Srnoland for (i = 1; i < maxcl; i++) { 904182080Srnoland if ((bpa = gbincore(vp, lblkno + i)) && 905261455Seadler ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) == 90695584Sanholt (B_DELWRI | B_CLUSTEROK)) && 90795584Sanholt (bpa->b_bufsize == size)) { 90895584Sanholt if ((bpa->b_blkno == bpa->b_lblkno) || 90995584Sanholt (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT))) 91095584Sanholt break; 91195584Sanholt } else { 91295584Sanholt break; 91395584Sanholt } 91495584Sanholt } 91595584Sanholt ncl = i; 916145132Sanholt /* 91795584Sanholt * this is a possible cluster write 918145132Sanholt */ 919112015Sanholt if (ncl != 1) { 92095584Sanholt nwritten = cluster_wbuild(vp, size, lblkno, ncl); 92195584Sanholt splx(s); 92295584Sanholt return nwritten; 92395584Sanholt } 92495584Sanholt } 92595584Sanholt bremfree(bp); 92695584Sanholt splx(s); 927261455Seadler /* 928182080Srnoland * default (old) behavior, writing out only one block 929182080Srnoland */ 930182080Srnoland bp->b_flags |= B_BUSY | B_ASYNC; 931261455Seadler nwritten = bp->b_bufsize; 932189499Srnoland (void) 
VOP_BWRITE(bp); 933162132Sanholt return nwritten; 934162132Sanholt} 935162132Sanholt 936261455Seadler 937182080Srnoland/* 938182080Srnoland * Find a buffer header which is available for use. 939182080Srnoland */ 940182080Srnolandstatic struct buf * 94195584Sanholtgetnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize) 94295584Sanholt{ 94395584Sanholt struct buf *bp; 944189499Srnoland int nbyteswritten = 0; 94595584Sanholt vm_offset_t addr; 946112015Sanholt static int writerecursion = 0; 947112015Sanholt 948112015Sanholtstart: 949189499Srnoland if (bufspace >= maxbufspace) 950189499Srnoland goto trytofreespace; 95195584Sanholt 952261455Seadler /* can we constitute a new buffer? */ 95395584Sanholt if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]))) { 95495584Sanholt#if !defined(MAX_PERF) 95595584Sanholt if (bp->b_qindex != QUEUE_EMPTY) 95695584Sanholt panic("getnewbuf: inconsistent EMPTY queue, qindex=%d", 95795584Sanholt bp->b_qindex); 95895584Sanholt#endif 95995584Sanholt bp->b_flags |= B_BUSY; 96095584Sanholt bremfree(bp); 96195584Sanholt goto fillbuf; 962182080Srnoland } 963182080Srnolandtrytofreespace: 964182080Srnoland /* 965182080Srnoland * We keep the file I/O from hogging metadata I/O 966182080Srnoland * This is desirable because file data is cached in the 967182080Srnoland * VM/Buffer cache even if a buffer is freed. 
968182080Srnoland */ 969182080Srnoland if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]))) { 970182080Srnoland#if !defined(MAX_PERF) 971182080Srnoland if (bp->b_qindex != QUEUE_AGE) 972182080Srnoland panic("getnewbuf: inconsistent AGE queue, qindex=%d", 973182080Srnoland bp->b_qindex); 974182080Srnoland#endif 975182080Srnoland } else if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]))) { 976182080Srnoland#if !defined(MAX_PERF) 977182080Srnoland if (bp->b_qindex != QUEUE_LRU) 978182080Srnoland panic("getnewbuf: inconsistent LRU queue, qindex=%d", 979182080Srnoland bp->b_qindex); 980182080Srnoland#endif 98195584Sanholt } 982182080Srnoland if (!bp) { 983182080Srnoland /* wait for a free buffer of any kind */ 984182080Srnoland needsbuffer |= VFS_BIO_NEED_ANY; 98595584Sanholt do 986182080Srnoland tsleep(&needsbuffer, (PRIBIO + 1) | slpflag, "newbuf", 987182080Srnoland slptimeo); 988182080Srnoland while (needsbuffer & VFS_BIO_NEED_ANY); 989182080Srnoland return (0); 990182080Srnoland } 991182080Srnoland 992182080Srnoland#if defined(DIAGNOSTIC) 993182080Srnoland if (bp->b_flags & B_BUSY) { 994182080Srnoland panic("getnewbuf: busy buffer on free list\n"); 995182080Srnoland } 996182080Srnoland#endif 997182080Srnoland 998182080Srnoland /* 999182080Srnoland * We are fairly aggressive about freeing VMIO buffers, but since 1000182080Srnoland * the buffering is intact without buffer headers, there is not 1001182080Srnoland * much loss. We gain by maintaining non-VMIOed metadata in buffers. 
1002182080Srnoland */ 1003182080Srnoland if ((bp->b_qindex == QUEUE_LRU) && (bp->b_usecount > 0)) { 1004182080Srnoland if ((bp->b_flags & B_VMIO) == 0 || 1005182080Srnoland (vmiospace < maxvmiobufspace)) { 1006261455Seadler --bp->b_usecount; 100795584Sanholt TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist); 100895584Sanholt if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) { 100995584Sanholt TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); 101095584Sanholt goto start; 101195584Sanholt } 101295584Sanholt TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); 101395584Sanholt } 101495584Sanholt } 101595584Sanholt 101695584Sanholt 101795584Sanholt /* if we are a delayed write, convert to an async write */ 101895584Sanholt if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) { 101995584Sanholt 102095584Sanholt if (writerecursion > 0) { 102195584Sanholt bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]); 102295584Sanholt while (bp) { 102395584Sanholt if ((bp->b_flags & B_DELWRI) == 0) 102495584Sanholt break; 102595584Sanholt bp = TAILQ_NEXT(bp, b_freelist); 102695584Sanholt } 102795584Sanholt if (bp == NULL) { 102895584Sanholt bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]); 102995584Sanholt while (bp) { 103095584Sanholt if ((bp->b_flags & B_DELWRI) == 0) 103195584Sanholt break; 103295584Sanholt bp = TAILQ_NEXT(bp, b_freelist); 103395584Sanholt } 103495584Sanholt } 103595584Sanholt if (bp == NULL) 103695584Sanholt panic("getnewbuf: cannot get buffer, infinite recursion failure"); 103795584Sanholt } else { 103895584Sanholt ++writerecursion; 103995584Sanholt nbyteswritten += vfs_bio_awrite(bp); 104095584Sanholt --writerecursion; 104195584Sanholt if (!slpflag && !slptimeo) { 1042112015Sanholt return (0); 1043112015Sanholt } 1044112015Sanholt goto start; 1045112015Sanholt } 1046112015Sanholt } 1047112015Sanholt 1048112015Sanholt if (bp->b_flags & B_WANTED) { 1049112015Sanholt bp->b_flags &= ~B_WANTED; 1050112015Sanholt wakeup(bp); 1051112015Sanholt } 105295584Sanholt 
bremfree(bp); 105395584Sanholt bp->b_flags |= B_BUSY; 105495584Sanholt 105595584Sanholt if (bp->b_flags & B_VMIO) { 105695584Sanholt bp->b_flags &= ~B_ASYNC; 105795584Sanholt vfs_vmio_release(bp); 105895584Sanholt } 105995584Sanholt 106095584Sanholt if (bp->b_vp) 106195584Sanholt brelvp(bp); 106295584Sanholt 106395584Sanholtfillbuf: 106495584Sanholt /* we are not free, nor do we contain interesting data */ 106595584Sanholt if (bp->b_rcred != NOCRED) { 106695584Sanholt crfree(bp->b_rcred); 106795584Sanholt bp->b_rcred = NOCRED; 106895584Sanholt } 106995584Sanholt if (bp->b_wcred != NOCRED) { 107095584Sanholt crfree(bp->b_wcred); 107195584Sanholt bp->b_wcred = NOCRED; 107295584Sanholt } 1073145132Sanholt 107495584Sanholt LIST_REMOVE(bp, b_hash); 107595584Sanholt LIST_INSERT_HEAD(&invalhash, bp, b_hash); 107695584Sanholt if (bp->b_bufsize) { 107795584Sanholt allocbuf(bp, 0); 107895584Sanholt } 107995584Sanholt bp->b_flags = B_BUSY; 108095584Sanholt bp->b_dev = NODEV; 108195584Sanholt bp->b_vp = NULL; 108295584Sanholt bp->b_blkno = bp->b_lblkno = 0; 108395584Sanholt bp->b_iodone = 0; 108495584Sanholt bp->b_error = 0; 108595584Sanholt bp->b_resid = 0; 108695584Sanholt bp->b_bcount = 0; 108795584Sanholt bp->b_npages = 0; 108895584Sanholt bp->b_dirtyoff = bp->b_dirtyend = 0; 108995584Sanholt bp->b_validoff = bp->b_validend = 0; 109095584Sanholt bp->b_usecount = 4; 109195584Sanholt 109295584Sanholt maxsize = (maxsize + PAGE_MASK) & ~PAGE_MASK; 109395584Sanholt 109495584Sanholt /* 109595584Sanholt * we assume that buffer_map is not at address 0 109695584Sanholt */ 109795584Sanholt addr = 0; 109895584Sanholt if (maxsize != bp->b_kvasize) { 1099148211Sanholt bfreekva(bp); 1100148211Sanholt 110195584Sanholt /* 110295584Sanholt * See if we have buffer kva space 110395584Sanholt */ 110495584Sanholt if (vm_map_findspace(buffer_map, 1105145132Sanholt vm_map_min(buffer_map), maxsize, &addr)) { 110695584Sanholt bp->b_flags |= B_INVAL; 110795584Sanholt brelse(bp); 110895584Sanholt 
goto trytofreespace; 110995584Sanholt } 111095584Sanholt } 111195584Sanholt 111295584Sanholt /* 111395584Sanholt * See if we are below are allocated minimum 111495584Sanholt */ 111595584Sanholt if (bufspace >= (maxbufspace + nbyteswritten)) { 111695584Sanholt bp->b_flags |= B_INVAL; 111795584Sanholt brelse(bp); 1118112015Sanholt goto trytofreespace; 1119162132Sanholt } 1120261455Seadler 112195584Sanholt /* 112295584Sanholt * create a map entry for the buffer -- in essence 112395584Sanholt * reserving the kva space. 112495584Sanholt */ 112595584Sanholt if (addr) { 112695584Sanholt vm_map_insert(buffer_map, NULL, 0, 112795584Sanholt addr, addr + maxsize, 112895584Sanholt VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); 112995584Sanholt 113095584Sanholt bp->b_kvabase = (caddr_t) addr; 113195584Sanholt bp->b_kvasize = maxsize; 113295584Sanholt } 113395584Sanholt bp->b_data = bp->b_kvabase; 113495584Sanholt 113595584Sanholt return (bp); 113695584Sanholt} 113795584Sanholt 113895584Sanholtstatic void 113995584Sanholtwaitfreebuffers(int slpflag, int slptimeo) { 114095584Sanholt while (numfreebuffers < hifreebuffers) { 114195584Sanholt flushdirtybuffers(slpflag, slptimeo); 1142184374Srnoland if (numfreebuffers < hifreebuffers) 114395584Sanholt break; 114495584Sanholt needsbuffer |= VFS_BIO_NEED_FREE; 114595584Sanholt if (tsleep(&needsbuffer, PRIBIO|slpflag, "biofre", slptimeo)) 114695584Sanholt break; 114795584Sanholt } 114895584Sanholt} 114995584Sanholt 115095584Sanholtstatic void 115195584Sanholtflushdirtybuffers(int slpflag, int slptimeo) { 115295584Sanholt int s; 115395584Sanholt static pid_t flushing = 0; 115495584Sanholt 115595584Sanholt s = splbio(); 1156148211Sanholt 1157148211Sanholt if (flushing) { 1158148211Sanholt if (flushing == curproc->p_pid) { 1159148211Sanholt splx(s); 1160189499Srnoland return; 116195584Sanholt } 116295584Sanholt while (flushing) { 1163112015Sanholt if (tsleep(&flushing, PRIBIO|slpflag, "biofls", slptimeo)) { 116495584Sanholt splx(s); 
1165112015Sanholt return; 1166148211Sanholt } 1167112015Sanholt } 1168145132Sanholt } 1169145132Sanholt flushing = curproc->p_pid; 1170145132Sanholt 1171148211Sanholt while (numdirtybuffers > lodirtybuffers) { 1172148211Sanholt struct buf *bp; 1173148211Sanholt needsbuffer |= VFS_BIO_NEED_LOWLIMIT; 1174148211Sanholt bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]); 1175145132Sanholt if (bp == NULL) 1176148211Sanholt bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]); 117795584Sanholt 117895584Sanholt while (bp && ((bp->b_flags & B_DELWRI) == 0)) { 117995584Sanholt bp = TAILQ_NEXT(bp, b_freelist); 1180112015Sanholt } 118195584Sanholt 1182189499Srnoland if (bp) { 1183189499Srnoland splx(s); 1184189499Srnoland vfs_bio_awrite(bp); 1185189499Srnoland s = splbio(); 1186189499Srnoland continue; 1187189499Srnoland } 1188189499Srnoland break; 1189189499Srnoland } 119095584Sanholt 119195584Sanholt flushing = 0; 119295584Sanholt wakeup(&flushing); 119395584Sanholt splx(s); 119495584Sanholt} 119595584Sanholt 1196261455Seadler/* 1197112015Sanholt * Check to see if a block is currently memory resident. 119895584Sanholt */ 119995584Sanholtstruct buf * 120095584Sanholtincore(struct vnode * vp, daddr_t blkno) 120195584Sanholt{ 120295584Sanholt struct buf *bp; 120395584Sanholt 120495584Sanholt int s = splbio(); 120595584Sanholt bp = gbincore(vp, blkno); 120695584Sanholt splx(s); 120795584Sanholt return (bp); 120895584Sanholt} 120995584Sanholt 1210112015Sanholt/* 121195584Sanholt * Returns true if no I/O is needed to access the 121295584Sanholt * associated VM object. This is like incore except 121395584Sanholt * it also hunts around in the VM system for the data. 
121495584Sanholt */ 121595584Sanholt 121695584Sanholtint 121795584Sanholtinmem(struct vnode * vp, daddr_t blkno) 121895584Sanholt{ 121995584Sanholt vm_object_t obj; 122095584Sanholt vm_offset_t toff, tinc; 122195584Sanholt vm_page_t m; 122295584Sanholt vm_ooffset_t off; 122395584Sanholt 122495584Sanholt if (incore(vp, blkno)) 122595584Sanholt return 1; 122695584Sanholt if (vp->v_mount == NULL) 122795584Sanholt return 0; 122895584Sanholt if ((vp->v_object == NULL) || (vp->v_flag & VVMIO) == 0) 122995584Sanholt return 0; 123095584Sanholt 123195584Sanholt obj = vp->v_object; 123295584Sanholt tinc = PAGE_SIZE; 123395584Sanholt if (tinc > vp->v_mount->mnt_stat.f_iosize) 123495584Sanholt tinc = vp->v_mount->mnt_stat.f_iosize; 123595584Sanholt off = blkno * vp->v_mount->mnt_stat.f_iosize; 123695584Sanholt 1237119098Sanholt for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { 1238112015Sanholt 1239112015Sanholt m = vm_page_lookup(obj, OFF_TO_IDX(off + toff)); 1240112015Sanholt if (!m) 1241112015Sanholt return 0; 1242112015Sanholt if (vm_page_is_valid(m, (vm_offset_t) (toff + off), tinc) == 0) 124395584Sanholt return 0; 1244112015Sanholt } 1245112015Sanholt return 1; 1246112015Sanholt} 1247112015Sanholt 1248112015Sanholt/* 1249112015Sanholt * now we set the dirty range for the buffer -- 1250112015Sanholt * for NFS -- if the file is mapped and pages have 1251112015Sanholt * been written to, let it know. We want the 1252145132Sanholt * entire range of the buffer to be marked dirty if 1253112015Sanholt * any of the pages have been written to for consistancy 1254112015Sanholt * with the b_validoff, b_validend set in the nfs write 1255112015Sanholt * code, and used by the nfs read code. 
1256112015Sanholt */ 1257145132Sanholtstatic void 1258145132Sanholtvfs_setdirty(struct buf *bp) { 1259145132Sanholt int i; 1260145132Sanholt vm_object_t object; 1261145132Sanholt vm_offset_t boffset, offset; 1262145132Sanholt /* 1263145132Sanholt * We qualify the scan for modified pages on whether the 1264145132Sanholt * object has been flushed yet. The OBJ_WRITEABLE flag 1265112015Sanholt * is not cleared simply by protecting pages off. 1266112015Sanholt */ 1267112015Sanholt if ((bp->b_flags & B_VMIO) && 1268112015Sanholt ((object = bp->b_pages[0]->object)->flags & (OBJ_WRITEABLE|OBJ_CLEANING))) { 1269112015Sanholt /* 1270112015Sanholt * test the pages to see if they have been modified directly 1271112015Sanholt * by users through the VM system. 1272112015Sanholt */ 1273112015Sanholt for (i = 0; i < bp->b_npages; i++) 1274112015Sanholt vm_page_test_dirty(bp->b_pages[i]); 1275112015Sanholt 1276112015Sanholt /* 1277112015Sanholt * scan forwards for the first page modified 1278112015Sanholt */ 1279112015Sanholt for (i = 0; i < bp->b_npages; i++) { 1280112015Sanholt if (bp->b_pages[i]->dirty) { 1281112015Sanholt break; 1282112015Sanholt } 1283112015Sanholt } 1284112015Sanholt boffset = (i << PAGE_SHIFT); 1285112015Sanholt if (boffset < bp->b_dirtyoff) { 1286112015Sanholt bp->b_dirtyoff = boffset; 1287112015Sanholt } 1288112015Sanholt 1289112015Sanholt /* 1290112015Sanholt * scan backwards for the last page modified 1291112015Sanholt */ 1292112015Sanholt for (i = bp->b_npages - 1; i >= 0; --i) { 1293112015Sanholt if (bp->b_pages[i]->dirty) { 1294112015Sanholt break; 1295112015Sanholt } 1296112015Sanholt } 1297112015Sanholt boffset = (i + 1); 1298112015Sanholt offset = boffset + bp->b_pages[0]->pindex; 1299112015Sanholt if (offset >= object->size) 1300112015Sanholt boffset = object->size - bp->b_pages[0]->pindex; 1301112015Sanholt if (bp->b_dirtyend < (boffset << PAGE_SHIFT)) 1302112015Sanholt bp->b_dirtyend = (boffset << PAGE_SHIFT); 1303112015Sanholt } 
1304112015Sanholt} 1305112015Sanholt 1306112015Sanholt/* 1307112015Sanholt * Get a block given a specified block and offset into a file/device. 1308112015Sanholt */ 1309112015Sanholtstruct buf * 1310112015Sanholtgetblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo) 1311112015Sanholt{ 1312112015Sanholt struct buf *bp; 1313112015Sanholt int s; 1314112015Sanholt struct bufhashhdr *bh; 1315112015Sanholt int maxsize; 1316112015Sanholt static pid_t flushing = 0; 1317112015Sanholt 1318145132Sanholt if (vp->v_mount) { 1319112015Sanholt maxsize = vp->v_mount->mnt_stat.f_iosize; 1320112015Sanholt /* 1321112015Sanholt * This happens on mount points. 1322112015Sanholt */ 1323112015Sanholt if (maxsize < size) 1324145132Sanholt maxsize = size; 1325119098Sanholt } else { 1326119098Sanholt maxsize = size; 1327112015Sanholt } 1328145132Sanholt 1329145132Sanholt#if !defined(MAX_PERF) 1330145132Sanholt if (size > MAXBSIZE) 1331145132Sanholt panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE); 1332145132Sanholt#endif 1333145132Sanholt 1334119098Sanholt s = splbio(); 1335162132Sanholtloop: 1336162132Sanholt if (numfreebuffers < lofreebuffers) { 1337112015Sanholt waitfreebuffers(slpflag, slptimeo); 1338112015Sanholt } 1339112015Sanholt 1340112015Sanholt if ((bp = gbincore(vp, blkno))) { 1341112015Sanholt if (bp->b_flags & B_BUSY) { 1342112015Sanholt bp->b_flags |= B_WANTED; 1343112015Sanholt if (bp->b_usecount < BUF_MAXUSE) 1344112015Sanholt ++bp->b_usecount; 1345112015Sanholt if (!tsleep(bp, 1346112015Sanholt (PRIBIO + 1) | slpflag, "getblk", slptimeo)) 1347145132Sanholt goto loop; 1348112015Sanholt 1349130331Sanholt splx(s); 1350112015Sanholt return (struct buf *) NULL; 1351145132Sanholt } 1352145132Sanholt bp->b_flags |= B_BUSY | B_CACHE; 1353189499Srnoland bremfree(bp); 1354145132Sanholt 1355152909Sanholt /* 1356189499Srnoland * check for size inconsistancies (note that they shouldn't happen 1357152909Sanholt * but do when filesystems don't handle the 
size changes correctly.) 1358162132Sanholt * We are conservative on metadata and don't just extend the buffer 1359162132Sanholt * but write and re-constitute it. 1360189499Srnoland */ 1361189499Srnoland 1362152909Sanholt if (bp->b_bcount != size) { 1363182080Srnoland if ((bp->b_flags & B_VMIO) && (size <= bp->b_kvasize)) { 1364182080Srnoland allocbuf(bp, size); 1365182080Srnoland } else { 1366152909Sanholt bp->b_flags |= B_NOCACHE; 1367182080Srnoland VOP_BWRITE(bp); 1368182080Srnoland goto loop; 1369152909Sanholt } 1370182080Srnoland } 1371182080Srnoland 1372182080Srnoland if (bp->b_usecount < BUF_MAXUSE) 1373182080Srnoland ++bp->b_usecount; 1374182080Srnoland splx(s); 1375182080Srnoland return (bp); 1376182080Srnoland } else { 1377182080Srnoland vm_object_t obj; 1378182080Srnoland 1379182080Srnoland if ((bp = getnewbuf(vp, slpflag, slptimeo, size, maxsize)) == 0) { 1380182080Srnoland if (slpflag || slptimeo) { 1381182080Srnoland splx(s); 1382182080Srnoland return NULL; 1383182080Srnoland } 1384182080Srnoland goto loop; 1385182080Srnoland } 1386182080Srnoland 1387182080Srnoland /* 1388189499Srnoland * This code is used to make sure that a buffer is not 1389189499Srnoland * created while the getnewbuf routine is blocked. 1390189499Srnoland * Normally the vnode is locked so this isn't a problem. 1391189499Srnoland * VBLK type I/O requests, however, don't lock the vnode. 1392189499Srnoland */ 1393189499Srnoland if (!VOP_ISLOCKED(vp) && gbincore(vp, blkno)) { 1394189499Srnoland bp->b_flags |= B_INVAL; 1395189499Srnoland brelse(bp); 1396189499Srnoland goto loop; 1397189499Srnoland } 1398189499Srnoland 1399189499Srnoland /* 1400189499Srnoland * Insert the buffer into the hash, so that it can 1401189499Srnoland * be found by incore. 
1402189499Srnoland */ 1403189499Srnoland bp->b_blkno = bp->b_lblkno = blkno; 1404189499Srnoland bgetvp(vp, bp); 1405189499Srnoland LIST_REMOVE(bp, b_hash); 1406189499Srnoland bh = BUFHASH(vp, blkno); 1407189499Srnoland LIST_INSERT_HEAD(bh, bp, b_hash); 1408189499Srnoland 1409189499Srnoland if ((obj = vp->v_object) && (vp->v_flag & VVMIO)) { 1410189499Srnoland bp->b_flags |= (B_VMIO | B_CACHE); 1411189499Srnoland#if defined(VFS_BIO_DEBUG) 1412189499Srnoland if (vp->v_type != VREG && vp->v_type != VBLK) 1413189499Srnoland printf("getblk: vmioing file type %d???\n", vp->v_type); 1414189499Srnoland#endif 1415189499Srnoland } else { 1416189499Srnoland bp->b_flags &= ~B_VMIO; 1417189499Srnoland } 1418189499Srnoland splx(s); 1419189499Srnoland 1420189499Srnoland allocbuf(bp, size); 1421189499Srnoland#ifdef PC98 1422189499Srnoland /* 1423189499Srnoland * 1024byte/sector support 1424189499Srnoland */ 1425189499Srnoland#define B_XXX2 0x8000000 1426189499Srnoland if (vp->v_flag & 0x10000) bp->b_flags |= B_XXX2; 1427189499Srnoland#endif 1428189499Srnoland return (bp); 1429189499Srnoland } 1430189499Srnoland} 1431189499Srnoland 1432189499Srnoland/* 1433189499Srnoland * Get an empty, disassociated buffer of given size. 1434189499Srnoland */ 1435189499Srnolandstruct buf * 1436189499Srnolandgeteblk(int size) 1437189499Srnoland{ 1438189499Srnoland struct buf *bp; 1439189499Srnoland int s; 1440189499Srnoland 1441189499Srnoland s = splbio(); 1442189499Srnoland while ((bp = getnewbuf(0, 0, 0, size, MAXBSIZE)) == 0); 1443189499Srnoland splx(s); 1444189499Srnoland allocbuf(bp, size); 1445189499Srnoland bp->b_flags |= B_INVAL; 1446189499Srnoland return (bp); 1447189499Srnoland} 1448189499Srnoland 1449189499Srnoland 1450189499Srnoland/* 1451189499Srnoland * This code constitutes the buffer memory from either anonymous system 1452189499Srnoland * memory (in the case of non-VMIO operations) or from an associated 1453189499Srnoland * VM object (in the case of VMIO operations). 
1454189499Srnoland * 1455189499Srnoland * Note that this code is tricky, and has many complications to resolve 1456189499Srnoland * deadlock or inconsistant data situations. Tread lightly!!! 1457189499Srnoland * 1458189499Srnoland * Modify the length of a buffer's underlying buffer storage without 1459189499Srnoland * destroying information (unless, of course the buffer is shrinking). 1460189499Srnoland */ 1461189499Srnolandint 1462189499Srnolandallocbuf(struct buf * bp, int size) 1463189499Srnoland{ 1464189499Srnoland 1465189499Srnoland int s; 1466189499Srnoland int newbsize, mbsize; 1467189499Srnoland int i; 1468189499Srnoland 1469189499Srnoland#if !defined(MAX_PERF) 1470189499Srnoland if (!(bp->b_flags & B_BUSY)) 1471189499Srnoland panic("allocbuf: buffer not busy"); 1472189499Srnoland 1473189499Srnoland if (bp->b_kvasize < size) 1474189499Srnoland panic("allocbuf: buffer too small"); 1475189499Srnoland#endif 1476189499Srnoland 1477189499Srnoland if ((bp->b_flags & B_VMIO) == 0) { 1478189499Srnoland caddr_t origbuf; 1479189499Srnoland int origbufsize; 1480189499Srnoland /* 1481189499Srnoland * Just get anonymous memory from the kernel 1482189499Srnoland */ 1483189499Srnoland mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); 1484189499Srnoland#if !defined(NO_B_MALLOC) 1485189499Srnoland if (bp->b_flags & B_MALLOC) 1486189499Srnoland newbsize = mbsize; 1487189499Srnoland else 1488189499Srnoland#endif 1489189499Srnoland newbsize = round_page(size); 1490189499Srnoland 1491189499Srnoland if (newbsize < bp->b_bufsize) { 1492189499Srnoland#if !defined(NO_B_MALLOC) 1493189499Srnoland /* 1494189499Srnoland * malloced buffers are not shrunk 1495189499Srnoland */ 1496189499Srnoland if (bp->b_flags & B_MALLOC) { 1497189499Srnoland if (newbsize) { 1498189499Srnoland bp->b_bcount = size; 1499189499Srnoland } else { 1500189499Srnoland free(bp->b_data, M_BIOBUF); 1501189499Srnoland bufspace -= bp->b_bufsize; 1502189499Srnoland bufmallocspace -= bp->b_bufsize; 
1503189499Srnoland bp->b_data = bp->b_kvabase; 1504189499Srnoland bp->b_bufsize = 0; 1505189499Srnoland bp->b_bcount = 0; 1506189499Srnoland bp->b_flags &= ~B_MALLOC; 1507189499Srnoland } 1508189499Srnoland return 1; 1509189499Srnoland } 1510189499Srnoland#endif 1511189499Srnoland vm_hold_free_pages( 1512189499Srnoland bp, 1513189499Srnoland (vm_offset_t) bp->b_data + newbsize, 1514189499Srnoland (vm_offset_t) bp->b_data + bp->b_bufsize); 1515189499Srnoland } else if (newbsize > bp->b_bufsize) { 1516189499Srnoland#if !defined(NO_B_MALLOC) 1517189499Srnoland /* 1518189499Srnoland * We only use malloced memory on the first allocation. 1519189499Srnoland * and revert to page-allocated memory when the buffer grows. 1520189499Srnoland */ 1521189499Srnoland if ( (bufmallocspace < maxbufmallocspace) && 1522189499Srnoland (bp->b_bufsize == 0) && 1523189499Srnoland (mbsize <= PAGE_SIZE/2)) { 1524189499Srnoland 1525189499Srnoland bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK); 1526189499Srnoland bp->b_bufsize = mbsize; 1527261455Seadler bp->b_bcount = size; 1528189499Srnoland bp->b_flags |= B_MALLOC; 1529189499Srnoland bufspace += mbsize; 1530189499Srnoland bufmallocspace += mbsize; 1531189499Srnoland return 1; 1532189499Srnoland } 1533189499Srnoland#endif 1534189499Srnoland origbuf = NULL; 1535189499Srnoland origbufsize = 0; 1536189499Srnoland#if !defined(NO_B_MALLOC) 1537189499Srnoland /* 1538189499Srnoland * If the buffer is growing on it's other-than-first allocation, 1539189499Srnoland * then we revert to the page-allocation scheme. 
1540189499Srnoland */ 1541189499Srnoland if (bp->b_flags & B_MALLOC) { 1542189499Srnoland origbuf = bp->b_data; 1543189499Srnoland origbufsize = bp->b_bufsize; 1544189499Srnoland bp->b_data = bp->b_kvabase; 1545189499Srnoland bufspace -= bp->b_bufsize; 1546189499Srnoland bufmallocspace -= bp->b_bufsize; 1547189499Srnoland bp->b_bufsize = 0; 1548189499Srnoland bp->b_flags &= ~B_MALLOC; 1549261455Seadler newbsize = round_page(newbsize); 1550189499Srnoland } 1551189499Srnoland#endif 1552189499Srnoland vm_hold_load_pages( 1553189499Srnoland bp, 1554189499Srnoland (vm_offset_t) bp->b_data + bp->b_bufsize, 1555189499Srnoland (vm_offset_t) bp->b_data + newbsize); 1556189499Srnoland#if !defined(NO_B_MALLOC) 1557189499Srnoland if (origbuf) { 1558189499Srnoland bcopy(origbuf, bp->b_data, origbufsize); 1559189499Srnoland free(origbuf, M_BIOBUF); 1560189499Srnoland } 1561189499Srnoland#endif 1562189499Srnoland } 1563189499Srnoland } else { 1564189499Srnoland vm_page_t m; 1565189499Srnoland int desiredpages; 1566189499Srnoland 1567189499Srnoland newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); 1568189499Srnoland desiredpages = (round_page(newbsize) >> PAGE_SHIFT); 1569189499Srnoland 1570189499Srnoland#if !defined(NO_B_MALLOC) 1571189499Srnoland if (bp->b_flags & B_MALLOC) 1572189499Srnoland panic("allocbuf: VMIO buffer can't be malloced"); 1573189499Srnoland#endif 1574189499Srnoland 1575189499Srnoland if (newbsize < bp->b_bufsize) { 1576189499Srnoland if (desiredpages < bp->b_npages) { 1577189499Srnoland for (i = desiredpages; i < bp->b_npages; i++) { 1578189499Srnoland /* 1579189499Srnoland * the page is not freed here -- it 1580189499Srnoland * is the responsibility of vnode_pager_setsize 1581189499Srnoland */ 1582189499Srnoland m = bp->b_pages[i]; 1583189499Srnoland#if defined(DIAGNOSTIC) 1584189499Srnoland if (m == bogus_page) 1585189499Srnoland panic("allocbuf: bogus page found"); 1586189499Srnoland#endif 1587189499Srnoland s = splvm(); 1588189499Srnoland while 
((m->flags & PG_BUSY) || (m->busy != 0)) { 1589189499Srnoland m->flags |= PG_WANTED; 1590189499Srnoland tsleep(m, PVM, "biodep", 0); 1591189499Srnoland } 1592189499Srnoland splx(s); 1593189499Srnoland 1594189499Srnoland bp->b_pages[i] = NULL; 1595189499Srnoland vm_page_unwire(m); 1596189499Srnoland } 1597189499Srnoland pmap_qremove((vm_offset_t) trunc_page(bp->b_data) + 1598189499Srnoland (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages)); 1599189499Srnoland bp->b_npages = desiredpages; 1600189499Srnoland } 1601189499Srnoland } else if (newbsize > bp->b_bufsize) { 1602189499Srnoland vm_object_t obj; 1603189499Srnoland vm_offset_t tinc, toff; 1604189499Srnoland vm_ooffset_t off; 1605189499Srnoland vm_pindex_t objoff; 1606189499Srnoland int pageindex, curbpnpages; 1607189499Srnoland struct vnode *vp; 1608189499Srnoland int bsize; 1609189499Srnoland 1610189499Srnoland vp = bp->b_vp; 1611189499Srnoland 1612189499Srnoland if (vp->v_type == VBLK) 1613189499Srnoland bsize = DEV_BSIZE; 1614189499Srnoland else 1615189499Srnoland bsize = vp->v_mount->mnt_stat.f_iosize; 1616189499Srnoland 1617189499Srnoland if (bp->b_npages < desiredpages) { 1618189499Srnoland obj = vp->v_object; 1619189499Srnoland tinc = PAGE_SIZE; 1620189499Srnoland if (tinc > bsize) 1621189499Srnoland tinc = bsize; 1622261455Seadler off = (vm_ooffset_t) bp->b_lblkno * bsize; 1623189499Srnoland curbpnpages = bp->b_npages; 1624189499Srnoland doretry: 1625189499Srnoland bp->b_flags |= B_CACHE; 1626189499Srnoland bp->b_validoff = bp->b_validend = 0; 1627189499Srnoland for (toff = 0; toff < newbsize; toff += tinc) { 1628189499Srnoland int bytesinpage; 1629189499Srnoland 1630189499Srnoland pageindex = toff >> PAGE_SHIFT; 1631189499Srnoland objoff = OFF_TO_IDX(off + toff); 1632189499Srnoland if (pageindex < curbpnpages) { 1633189499Srnoland 1634189499Srnoland m = bp->b_pages[pageindex]; 1635189499Srnoland#ifdef VFS_BIO_DIAG 1636189499Srnoland if (m->pindex != objoff) 1637189499Srnoland panic("allocbuf: 
page changed offset??!!!?"); 1638189499Srnoland#endif 1639189499Srnoland bytesinpage = tinc; 1640189499Srnoland if (tinc > (newbsize - toff)) 1641189499Srnoland bytesinpage = newbsize - toff; 1642189499Srnoland if (bp->b_flags & B_CACHE) 1643189499Srnoland vfs_buf_set_valid(bp, off, toff, bytesinpage, m); 1644189499Srnoland continue; 1645189499Srnoland } 1646189499Srnoland m = vm_page_lookup(obj, objoff); 1647189499Srnoland if (!m) { 1648189499Srnoland m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL); 1649189499Srnoland if (!m) { 1650189499Srnoland VM_WAIT; 1651189499Srnoland goto doretry; 1652189499Srnoland } 1653189499Srnoland /* 1654189499Srnoland * Normally it is unwise to clear PG_BUSY without 1655189499Srnoland * PAGE_WAKEUP -- but it is okay here, as there is 1656189499Srnoland * no chance for blocking between here and vm_page_alloc 1657189499Srnoland */ 1658189499Srnoland m->flags &= ~PG_BUSY; 1659189499Srnoland vm_page_wire(m); 1660189499Srnoland bp->b_flags &= ~B_CACHE; 1661189499Srnoland } else if (m->flags & PG_BUSY) { 1662189499Srnoland s = splvm(); 1663189499Srnoland if (m->flags & PG_BUSY) { 1664189499Srnoland m->flags |= PG_WANTED; 1665189499Srnoland tsleep(m, PVM, "pgtblk", 0); 1666189499Srnoland } 1667189499Srnoland splx(s); 1668189499Srnoland goto doretry; 1669189499Srnoland } else { 1670189499Srnoland if ((curproc != pageproc) && 1671189499Srnoland ((m->queue - m->pc) == PQ_CACHE) && 1672189499Srnoland ((cnt.v_free_count + cnt.v_cache_count) < 1673189499Srnoland (cnt.v_free_min + cnt.v_cache_min))) { 1674189499Srnoland pagedaemon_wakeup(); 1675189499Srnoland } 1676189499Srnoland bytesinpage = tinc; 1677189499Srnoland if (tinc > (newbsize - toff)) 1678189499Srnoland bytesinpage = newbsize - toff; 1679189499Srnoland if (bp->b_flags & B_CACHE) 1680189499Srnoland vfs_buf_set_valid(bp, off, toff, bytesinpage, m); 1681189499Srnoland vm_page_wire(m); 1682261455Seadler } 1683189499Srnoland bp->b_pages[pageindex] = m; 1684189499Srnoland curbpnpages = 
pageindex + 1; 1685189499Srnoland } 1686189499Srnoland if (vp->v_tag == VT_NFS) { 1687189499Srnoland if (bp->b_dirtyend > 0) { 1688189499Srnoland bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff); 1689189499Srnoland bp->b_validend = max(bp->b_validend, bp->b_dirtyend); 1690189499Srnoland } 1691189499Srnoland if (bp->b_validend == 0) 1692189499Srnoland bp->b_flags &= ~B_CACHE; 1693189499Srnoland } 1694189499Srnoland bp->b_data = (caddr_t) trunc_page(bp->b_data); 1695189499Srnoland bp->b_npages = curbpnpages; 1696189499Srnoland pmap_qenter((vm_offset_t) bp->b_data, 1697189499Srnoland bp->b_pages, bp->b_npages); 1698189499Srnoland ((vm_offset_t) bp->b_data) |= off & PAGE_MASK; 1699189499Srnoland } 1700189499Srnoland } 1701189499Srnoland } 1702189499Srnoland if (bp->b_flags & B_VMIO) 1703189499Srnoland vmiospace += (newbsize - bp->b_bufsize); 1704189499Srnoland bufspace += (newbsize - bp->b_bufsize); 1705189499Srnoland bp->b_bufsize = newbsize; 1706189499Srnoland bp->b_bcount = size; 1707189499Srnoland return 1; 1708189499Srnoland} 1709189499Srnoland 1710189499Srnoland/* 1711189499Srnoland * Wait for buffer I/O completion, returning error status. 1712189499Srnoland */ 1713189499Srnolandint 1714189499Srnolandbiowait(register struct buf * bp) 1715189499Srnoland{ 1716189499Srnoland int s; 1717189499Srnoland 1718189499Srnoland s = splbio(); 1719189499Srnoland while ((bp->b_flags & B_DONE) == 0) 1720189499Srnoland#if defined(NO_SCHEDULE_MODS) 1721189499Srnoland tsleep(bp, PRIBIO, "biowait", 0); 1722189499Srnoland#else 1723189499Srnoland tsleep(bp, curproc->p_usrpri, "biowait", 0); 1724189499Srnoland#endif 1725189499Srnoland splx(s); 1726189499Srnoland if (bp->b_flags & B_EINTR) { 1727189499Srnoland bp->b_flags &= ~B_EINTR; 1728189499Srnoland return (EINTR); 1729189499Srnoland } 1730189499Srnoland if (bp->b_flags & B_ERROR) { 1731189499Srnoland return (bp->b_error ? 
bp->b_error : EIO); 1732189499Srnoland } else { 1733189499Srnoland return (0); 1734189499Srnoland } 1735189499Srnoland} 1736189499Srnoland 1737189499Srnoland/* 1738189499Srnoland * Finish I/O on a buffer, calling an optional function. 1739189499Srnoland * This is usually called from interrupt level, so process blocking 1740189499Srnoland * is not *a good idea*. 1741189499Srnoland */ 1742189499Srnolandvoid 1743189499Srnolandbiodone(register struct buf * bp) 1744189499Srnoland{ 1745189499Srnoland int s; 1746189499Srnoland 1747189499Srnoland s = splbio(); 1748189499Srnoland 1749189499Srnoland#if !defined(MAX_PERF) 1750189499Srnoland if (!(bp->b_flags & B_BUSY)) 1751189499Srnoland panic("biodone: buffer not busy"); 1752189499Srnoland#endif 1753189499Srnoland 1754189499Srnoland if (bp->b_flags & B_DONE) { 1755189499Srnoland splx(s); 1756261455Seadler#if !defined(MAX_PERF) 1757189499Srnoland printf("biodone: buffer already done\n"); 1758189499Srnoland#endif 1759189499Srnoland return; 1760189499Srnoland } 1761189499Srnoland bp->b_flags |= B_DONE; 1762189499Srnoland 1763189499Srnoland if ((bp->b_flags & B_READ) == 0) { 1764189499Srnoland vwakeup(bp); 1765189499Srnoland } 1766189499Srnoland#ifdef BOUNCE_BUFFERS 1767189499Srnoland if (bp->b_flags & B_BOUNCE) 1768189499Srnoland vm_bounce_free(bp); 1769189499Srnoland#endif 1770189499Srnoland 1771189499Srnoland /* call optional completion function if requested */ 1772189499Srnoland if (bp->b_flags & B_CALL) { 1773189499Srnoland bp->b_flags &= ~B_CALL; 1774189499Srnoland (*bp->b_iodone) (bp); 1775189499Srnoland splx(s); 1776189499Srnoland return; 1777189499Srnoland } 1778189499Srnoland if (bp->b_flags & B_VMIO) { 1779189499Srnoland int i, resid; 1780189499Srnoland vm_ooffset_t foff; 1781189499Srnoland vm_page_t m; 1782189499Srnoland vm_object_t obj; 1783189499Srnoland int iosize; 1784189499Srnoland struct vnode *vp = bp->b_vp; 1785189499Srnoland 1786189499Srnoland if (vp->v_type == VBLK) 1787189499Srnoland foff = (vm_ooffset_t) 
DEV_BSIZE * bp->b_lblkno; 1788189499Srnoland else 1789189499Srnoland foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno; 1790189499Srnoland obj = vp->v_object; 1791189499Srnoland#if !defined(MAX_PERF) 1792189499Srnoland if (!obj) { 1793189499Srnoland panic("biodone: no object"); 1794189499Srnoland } 1795189499Srnoland#endif 1796189499Srnoland#if defined(VFS_BIO_DEBUG) 1797189499Srnoland if (obj->paging_in_progress < bp->b_npages) { 1798189499Srnoland printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n", 1799189499Srnoland obj->paging_in_progress, bp->b_npages); 1800189499Srnoland } 1801189499Srnoland#endif 1802189499Srnoland iosize = bp->b_bufsize; 1803189499Srnoland for (i = 0; i < bp->b_npages; i++) { 180495584Sanholt int bogusflag = 0; 180595584Sanholt m = bp->b_pages[i]; 180695584Sanholt if (m == bogus_page) { 180795584Sanholt bogusflag = 1; 180895584Sanholt m = vm_page_lookup(obj, OFF_TO_IDX(foff)); 180995584Sanholt if (!m) { 1810112015Sanholt#if defined(VFS_BIO_DEBUG) 181195584Sanholt printf("biodone: page disappeared\n"); 181295584Sanholt#endif 1813189499Srnoland --obj->paging_in_progress; 1814189499Srnoland continue; 1815189499Srnoland } 1816189499Srnoland bp->b_pages[i] = m; 1817189499Srnoland pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages); 181895584Sanholt } 181995584Sanholt#if defined(VFS_BIO_DEBUG) 182095584Sanholt if (OFF_TO_IDX(foff) != m->pindex) { 182195584Sanholt printf("biodone: foff(%d)/m->pindex(%d) mismatch\n", foff, m->pindex); 182295584Sanholt } 1823152909Sanholt#endif 1824152909Sanholt resid = IDX_TO_OFF(m->pindex + 1) - foff; 1825189499Srnoland if (resid > iosize) 1826189499Srnoland resid = iosize; 1827189499Srnoland /* 1828189499Srnoland * In the write case, the valid and clean bits are 1829189499Srnoland * already changed correctly, so we only need to do this 1830189499Srnoland * here in the read case. 
1831189499Srnoland */ 1832189499Srnoland if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) { 1833189499Srnoland vfs_page_set_valid(bp, foff, i, m); 1834189499Srnoland } 1835112015Sanholt 1836112015Sanholt /* 183795584Sanholt * when debugging new filesystems or buffer I/O methods, this 1838189499Srnoland * is the most common error that pops up. if you see this, you 183995584Sanholt * have not set the page busy flag correctly!!! 1840189499Srnoland */ 184195584Sanholt if (m->busy == 0) { 1842189499Srnoland#if !defined(MAX_PERF) 184395584Sanholt printf("biodone: page busy < 0, " 184495584Sanholt "pindex: %d, foff: 0x(%x,%x), " 1845189499Srnoland "resid: %d, index: %d\n", 1846148211Sanholt (int) m->pindex, (int)(foff >> 32), 1847189499Srnoland (int) foff & 0xffffffff, resid, i); 1848148211Sanholt#endif 1849189499Srnoland if (vp->v_type != VBLK) 1850148211Sanholt#if !defined(MAX_PERF) 1851148211Sanholt printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n", 1852189499Srnoland bp->b_vp->v_mount->mnt_stat.f_iosize, 1853182080Srnoland (int) bp->b_lblkno, 1854182080Srnoland bp->b_flags, bp->b_npages); 1855182080Srnoland else 1856182080Srnoland printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n", 1857182080Srnoland (int) bp->b_lblkno, 1858182080Srnoland bp->b_flags, bp->b_npages); 1859189499Srnoland printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n", 1860182080Srnoland m->valid, m->dirty, m->wire_count); 1861189499Srnoland#endif 1862182080Srnoland panic("biodone: page busy < 0\n"); 1863189499Srnoland } 1864189499Srnoland --m->busy; 1865182080Srnoland if ((m->busy == 0) && (m->flags & PG_WANTED)) { 1866182080Srnoland m->flags &= ~PG_WANTED; 1867189499Srnoland wakeup(m); 1868182080Srnoland } 1869182080Srnoland --obj->paging_in_progress; 1870182080Srnoland foff += resid; 1871182080Srnoland iosize -= resid; 1872182080Srnoland } 1873182080Srnoland if (obj && obj->paging_in_progress == 0 && 1874189499Srnoland (obj->flags & OBJ_PIPWNT)) { 1875189499Srnoland obj->flags &= 
~OBJ_PIPWNT; 1876189499Srnoland wakeup(obj); 1877189499Srnoland } 1878189499Srnoland } 1879189499Srnoland /* 1880189499Srnoland * For asynchronous completions, release the buffer now. The brelse 1881182080Srnoland * checks for B_WANTED and will do the wakeup there if necessary - so 1882189499Srnoland * no need to do a wakeup here in the async case. 1883189499Srnoland */ 1884189499Srnoland 1885189499Srnoland if (bp->b_flags & B_ASYNC) { 1886189499Srnoland if ((bp->b_flags & B_ORDERED) == 0) { 1887189499Srnoland if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0) 1888189499Srnoland brelse(bp); 1889182080Srnoland else 1890182080Srnoland bqrelse(bp); 189195584Sanholt } 189295584Sanholt } else { 189395584Sanholt bp->b_flags &= ~B_WANTED; 189495584Sanholt wakeup(bp); 189595584Sanholt } 189695584Sanholt splx(s); 189795584Sanholt} 189895584Sanholt 189995584Sanholtint 190095584Sanholtcount_lock_queue() 190195584Sanholt{ 190295584Sanholt int count; 190395584Sanholt struct buf *bp; 190495584Sanholt 190595584Sanholt count = 0; 190695584Sanholt for (bp = TAILQ_FIRST(&bufqueues[QUEUE_LOCKED]); 1907196470Srnoland bp != NULL; 1908196470Srnoland bp = TAILQ_NEXT(bp, b_freelist)) 1909196470Srnoland count++; 1910196470Srnoland return (count); 191195584Sanholt} 191295584Sanholt 191395584Sanholtint vfs_update_interval = 30; 191495584Sanholt 191595584Sanholtstatic void 1916196470Srnolandvfs_update() 1917196470Srnoland{ 1918196470Srnoland while (1) { 1919196470Srnoland tsleep(&vfs_update_wakeup, PUSER, "update", 192095584Sanholt hz * vfs_update_interval); 192195584Sanholt vfs_update_wakeup = 0; 192295584Sanholt sync(curproc, NULL, NULL); 192395584Sanholt } 192495584Sanholt} 1925196470Srnoland 1926196470Srnolandstatic int 1927196470Srnolandsysctl_kern_updateinterval SYSCTL_HANDLER_ARGS 1928196470Srnoland{ 192995584Sanholt int error = sysctl_handle_int(oidp, 193095584Sanholt oidp->oid_arg1, oidp->oid_arg2, req); 193195584Sanholt if (!error) 193295584Sanholt 
wakeup(&vfs_update_wakeup); 193395584Sanholt return error; 193495584Sanholt} 1935196470Srnoland 1936196470SrnolandSYSCTL_PROC(_kern, KERN_UPDATEINTERVAL, update, CTLTYPE_INT|CTLFLAG_RW, 1937196470Srnoland &vfs_update_interval, 0, sysctl_kern_updateinterval, "I", ""); 1938196470Srnoland 193995584Sanholt 194095584Sanholt/* 194195584Sanholt * This routine is called in lieu of iodone in the case of 194295584Sanholt * incomplete I/O. This keeps the busy status for pages 1943182080Srnoland * consistant. 1944189499Srnoland */ 1945189499Srnolandvoid 1946182080Srnolandvfs_unbusy_pages(struct buf * bp) 1947189499Srnoland{ 1948189499Srnoland int i; 1949189499Srnoland 195095584Sanholt if (bp->b_flags & B_VMIO) { 195195584Sanholt struct vnode *vp = bp->b_vp; 195295584Sanholt vm_object_t obj = vp->v_object; 1953182080Srnoland vm_ooffset_t foff; 1954189499Srnoland 1955189499Srnoland foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno; 1956182080Srnoland 1957189499Srnoland for (i = 0; i < bp->b_npages; i++) { 1958189499Srnoland vm_page_t m = bp->b_pages[i]; 1959189499Srnoland 196095584Sanholt if (m == bogus_page) { 196195584Sanholt m = vm_page_lookup(obj, OFF_TO_IDX(foff) + i); 196295584Sanholt#if !defined(MAX_PERF) 1963182080Srnoland if (!m) { 1964189499Srnoland panic("vfs_unbusy_pages: page missing\n"); 1965189499Srnoland } 1966182080Srnoland#endif 1967189499Srnoland bp->b_pages[i] = m; 1968189499Srnoland pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages); 1969189499Srnoland } 197095584Sanholt --obj->paging_in_progress; 197195584Sanholt --m->busy; 197295584Sanholt if ((m->busy == 0) && (m->flags & PG_WANTED)) { 1973182080Srnoland m->flags &= ~PG_WANTED; 1974189499Srnoland wakeup(m); 1975189499Srnoland } 1976182080Srnoland } 1977189499Srnoland if (obj->paging_in_progress == 0 && 1978189499Srnoland (obj->flags & OBJ_PIPWNT)) { 1979189499Srnoland obj->flags &= ~OBJ_PIPWNT; 198095584Sanholt wakeup(obj); 198195584Sanholt } 198295584Sanholt } 
198395584Sanholt} 198495584Sanholt 198595584Sanholt/* 1986145132Sanholt * Set NFS' b_validoff and b_validend fields from the valid bits 1987112015Sanholt * of a page. If the consumer is not NFS, and the page is not 198895584Sanholt * valid for the entire range, clear the B_CACHE flag to force 198995584Sanholt * the consumer to re-read the page. 1990112015Sanholt */ 1991113995Sanholtstatic void 1992112015Sanholtvfs_buf_set_valid(struct buf *bp, 1993112015Sanholt vm_ooffset_t foff, vm_offset_t off, vm_offset_t size, 199495584Sanholt vm_page_t m) 199595584Sanholt{ 199695584Sanholt if (bp->b_vp->v_tag == VT_NFS) { 199795584Sanholt vm_offset_t svalid, evalid; 1998189499Srnoland int validbits = m->valid; 1999189499Srnoland 200095584Sanholt /* 2001189499Srnoland * This only bothers with the first valid range in the 2002189499Srnoland * page. 2003189499Srnoland */ 2004189499Srnoland svalid = off; 2005189499Srnoland while (validbits && !(validbits & 1)) { 200695584Sanholt svalid += DEV_BSIZE; 200795584Sanholt validbits >>= 1; 200895584Sanholt } 200995584Sanholt evalid = svalid; 201095584Sanholt while (validbits & 1) { 201195584Sanholt evalid += DEV_BSIZE; 201295584Sanholt validbits >>= 1; 201395584Sanholt } 201495584Sanholt /* 201595584Sanholt * Make sure this range is contiguous with the range 201695584Sanholt * built up from previous pages. If not, then we will 201795584Sanholt * just use the range from the previous pages. 
201895584Sanholt */ 201995584Sanholt if (svalid == bp->b_validend) { 202095584Sanholt bp->b_validoff = min(bp->b_validoff, svalid); 202195584Sanholt bp->b_validend = max(bp->b_validend, evalid); 202295584Sanholt } 202395584Sanholt } else if (!vm_page_is_valid(m, 202495584Sanholt (vm_offset_t) ((foff + off) & PAGE_MASK), 202595584Sanholt size)) { 202695584Sanholt bp->b_flags &= ~B_CACHE; 2027189499Srnoland } 2028189499Srnoland} 2029189499Srnoland 2030189499Srnoland/* 2031189499Srnoland * Set the valid bits in a page, taking care of the b_validoff, 2032189499Srnoland * b_validend fields which NFS uses to optimise small reads. Off is 2033189499Srnoland * the offset within the file and pageno is the page index within the buf. 2034189499Srnoland */ 2035189499Srnolandstatic void 2036189499Srnolandvfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m) 2037189499Srnoland{ 2038189499Srnoland struct vnode *vp = bp->b_vp; 2039189499Srnoland vm_ooffset_t soff, eoff; 2040189499Srnoland 2041189499Srnoland soff = off; 2042189499Srnoland eoff = off + min(PAGE_SIZE, bp->b_bufsize); 2043189499Srnoland vm_page_set_invalid(m, 2044189499Srnoland (vm_offset_t) (soff & PAGE_MASK), 204595584Sanholt (vm_offset_t) (eoff - soff)); 204695584Sanholt if (vp->v_tag == VT_NFS) { 204795584Sanholt vm_ooffset_t sv, ev; 204895584Sanholt off = off - pageno * PAGE_SIZE; 204995584Sanholt sv = off + ((bp->b_validoff + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1)); 205095584Sanholt ev = off + (bp->b_validend & ~(DEV_BSIZE - 1)); 2051189499Srnoland soff = max(sv, soff); 205295584Sanholt eoff = min(ev, eoff); 2053196470Srnoland } 2054196470Srnoland if (eoff > soff) 205595584Sanholt vm_page_set_validclean(m, 205695584Sanholt (vm_offset_t) (soff & PAGE_MASK), 2057182080Srnoland (vm_offset_t) (eoff - soff)); 205895584Sanholt} 2059196470Srnoland 2060196470Srnoland/* 2061196470Srnoland * This routine is called before a device strategy routine. 
2062196470Srnoland * It is used to tell the VM system that paging I/O is in 2063196470Srnoland * progress, and treat the pages associated with the buffer 206495584Sanholt * almost as being PG_BUSY. Also the object paging_in_progress 2065112015Sanholt * flag is handled to make sure that the object doesn't become 206695584Sanholt * inconsistant. 206795584Sanholt */ 206895584Sanholtvoid 206995584Sanholtvfs_busy_pages(struct buf * bp, int clear_modify) 207095584Sanholt{ 207195584Sanholt int i; 207295584Sanholt 207395584Sanholt if (bp->b_flags & B_VMIO) { 207495584Sanholt struct vnode *vp = bp->b_vp; 207595584Sanholt vm_object_t obj = vp->v_object; 2076112015Sanholt vm_ooffset_t foff; 2077182080Srnoland 2078112015Sanholt if (vp->v_type == VBLK) 2079112015Sanholt foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno; 2080189499Srnoland else 2081112015Sanholt foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno; 2082112015Sanholt vfs_setdirty(bp); 208395584Sanholt for (i = 0; i < bp->b_npages; i++, foff += PAGE_SIZE) { 208495584Sanholt vm_page_t m = bp->b_pages[i]; 2085189499Srnoland 2086189499Srnoland if ((bp->b_flags & B_CLUSTER) == 0) { 2087112015Sanholt obj->paging_in_progress++; 2088189499Srnoland m->busy++; 2089189499Srnoland } 2090112015Sanholt vm_page_protect(m, VM_PROT_NONE); 209195584Sanholt if (clear_modify) 209295584Sanholt vfs_page_set_valid(bp, foff, i, m); 209395584Sanholt else if (bp->b_bcount >= PAGE_SIZE) { 209495584Sanholt if (m->valid && (bp->b_flags & B_CACHE) == 0) { 209595584Sanholt bp->b_pages[i] = bogus_page; 209695584Sanholt pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages); 209795584Sanholt } 209895584Sanholt } 209995584Sanholt } 2100112015Sanholt } 2101112015Sanholt} 2102112015Sanholt 2103112015Sanholt/* 210495584Sanholt * Tell the VM system that the pages associated with this buffer 2105189499Srnoland * are clean. 
This is used for delayed writes where the data is 2106112015Sanholt * going to go to disk eventually without additional VM intevention. 2107145132Sanholt */ 2108112015Sanholtvoid 2109112015Sanholtvfs_clean_pages(struct buf * bp) 2110145132Sanholt{ 2111145132Sanholt int i; 2112189499Srnoland 2113145132Sanholt if (bp->b_flags & B_VMIO) { 2114145132Sanholt struct vnode *vp = bp->b_vp; 2115145132Sanholt vm_object_t obj = vp->v_object; 2116145132Sanholt vm_ooffset_t foff; 2117112015Sanholt 2118145132Sanholt if (vp->v_type == VBLK) 2119112015Sanholt foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno; 2120145132Sanholt else 2121145132Sanholt foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno; 2122145132Sanholt for (i = 0; i < bp->b_npages; i++, foff += PAGE_SIZE) { 2123145132Sanholt vm_page_t m = bp->b_pages[i]; 2124145132Sanholt 2125112015Sanholt vfs_page_set_valid(bp, foff, i, m); 2126112015Sanholt } 2127112015Sanholt } 2128145132Sanholt} 2129 2130void 2131vfs_bio_clrbuf(struct buf *bp) { 2132 int i; 2133 if( bp->b_flags & B_VMIO) { 2134 if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE)) { 2135 int mask; 2136 mask = 0; 2137 for(i=0;i<bp->b_bufsize;i+=DEV_BSIZE) 2138 mask |= (1 << (i/DEV_BSIZE)); 2139 if( bp->b_pages[0]->valid != mask) { 2140 bzero(bp->b_data, bp->b_bufsize); 2141 } 2142 bp->b_pages[0]->valid = mask; 2143 bp->b_resid = 0; 2144 return; 2145 } 2146 for(i=0;i<bp->b_npages;i++) { 2147 if( bp->b_pages[i]->valid == VM_PAGE_BITS_ALL) 2148 continue; 2149 if( bp->b_pages[i]->valid == 0) { 2150 if ((bp->b_pages[i]->flags & PG_ZERO) == 0) { 2151 bzero(bp->b_data + (i << PAGE_SHIFT), PAGE_SIZE); 2152 } 2153 } else { 2154 int j; 2155 for(j=0;j<PAGE_SIZE/DEV_BSIZE;j++) { 2156 if( (bp->b_pages[i]->valid & (1<<j)) == 0) 2157 bzero(bp->b_data + (i << PAGE_SHIFT) + j * DEV_BSIZE, DEV_BSIZE); 2158 } 2159 } 2160 /* bp->b_pages[i]->valid = VM_PAGE_BITS_ALL; */ 2161 } 2162 bp->b_resid = 0; 2163 } else { 2164 clrbuf(bp); 2165 } 2166} 2167 2168/* 2169 * 
vm_hold_load_pages and vm_hold_unload pages get pages into 2170 * a buffers address space. The pages are anonymous and are 2171 * not associated with a file object. 2172 */ 2173void 2174vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to) 2175{ 2176 vm_offset_t pg; 2177 vm_page_t p; 2178 int index; 2179 2180 to = round_page(to); 2181 from = round_page(from); 2182 index = (from - trunc_page(bp->b_data)) >> PAGE_SHIFT; 2183 2184 for (pg = from; pg < to; pg += PAGE_SIZE, index++) { 2185 2186tryagain: 2187 2188 p = vm_page_alloc(kernel_object, ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), 2189 VM_ALLOC_NORMAL); 2190 if (!p) { 2191 VM_WAIT; 2192 goto tryagain; 2193 } 2194 vm_page_wire(p); 2195 pmap_kenter(pg, VM_PAGE_TO_PHYS(p)); 2196 bp->b_pages[index] = p; 2197 PAGE_WAKEUP(p); 2198 } 2199 bp->b_npages = index; 2200} 2201 2202void 2203vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to) 2204{ 2205 vm_offset_t pg; 2206 vm_page_t p; 2207 int index, newnpages; 2208 2209 from = round_page(from); 2210 to = round_page(to); 2211 newnpages = index = (from - trunc_page(bp->b_data)) >> PAGE_SHIFT; 2212 2213 for (pg = from; pg < to; pg += PAGE_SIZE, index++) { 2214 p = bp->b_pages[index]; 2215 if (p && (index < bp->b_npages)) { 2216#if !defined(MAX_PERF) 2217 if (p->busy) { 2218 printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n", 2219 bp->b_blkno, bp->b_lblkno); 2220 } 2221#endif 2222 bp->b_pages[index] = NULL; 2223 pmap_kremove(pg); 2224 vm_page_unwire(p); 2225 vm_page_free(p); 2226 } 2227 } 2228 bp->b_npages = newnpages; 2229} 2230 2231 2232#include "opt_ddb.h" 2233#ifdef DDB 2234#include <ddb/ddb.h> 2235 2236DB_SHOW_COMMAND(buffer, db_show_buffer) 2237{ 2238 /* get args */ 2239 struct buf *bp = (struct buf *)addr; 2240 2241 if (!have_addr) { 2242 db_printf("usage: show buffer <addr>\n"); 2243 return; 2244 } 2245 2246 db_printf("b_proc = %p,\nb_flags = 0x%b\n", (void *)bp->b_proc, 2247 bp->b_flags, 
"\20\40bounce\37cluster\36vmio\35ram\34ordered" 2248 "\33paging\32xxx\31writeinprog\30wanted\27relbuf\26tape" 2249 "\25read\24raw\23phys\22clusterok\21malloc\20nocache" 2250 "\17locked\16inval\15gathered\14error\13eintr\12done\11dirty" 2251 "\10delwri\7call\6cache\5busy\4bad\3async\2needcommit\1age"); 2252 db_printf("b_error = %d, b_bufsize = %ld, b_bcount = %ld, " 2253 "b_resid = %ld\nb_dev = 0x%x, b_un.b_addr = %p, " 2254 "b_blkno = %d, b_pblkno = %d\n", 2255 bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, 2256 bp->b_dev, bp->b_un.b_addr, bp->b_blkno, bp->b_pblkno); 2257} 2258#endif /* DDB */ 2259