1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd/* 22209962Smm * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23168404Spjd * Use is subject to license terms. 24168404Spjd */ 25249195Smm/* 26249195Smm * Copyright (c) 2013 by Delphix. All rights reserved. 27249195Smm */ 28168404Spjd 29168404Spjd#include <sys/zfs_context.h> 30168404Spjd#include <sys/spa.h> 31168404Spjd#include <sys/vdev_impl.h> 32168404Spjd#include <sys/zio.h> 33185029Spjd#include <sys/kstat.h> 34168404Spjd 35168404Spjd/* 36168404Spjd * Virtual device read-ahead caching. 37168404Spjd * 38168404Spjd * This file implements a simple LRU read-ahead cache. When the DMU reads 39168404Spjd * a given block, it will often want other, nearby blocks soon thereafter. 40168404Spjd * We take advantage of this by reading a larger disk region and caching 41185029Spjd * the result. In the best case, this can turn 128 back-to-back 512-byte 42185029Spjd * reads into a single 64k read followed by 127 cache hits; this reduces 43168404Spjd * latency dramatically. In the worst case, it can turn an isolated 512-byte 44185029Spjd * read into a 64k read, which doesn't affect latency all that much but is 45168404Spjd * terribly wasteful of bandwidth. A more intelligent version of the cache 46168404Spjd * could keep track of access patterns and not do read-ahead unless it sees 47185029Spjd * at least two temporally close I/Os to the same region. Currently, only 48185029Spjd * metadata I/O is inflated. A futher enhancement could take advantage of 49185029Spjd * more semantic information about the I/O. And it could use something 50185029Spjd * faster than an AVL tree; that was chosen solely for convenience. 51168404Spjd * 52168404Spjd * There are five cache operations: allocate, fill, read, write, evict. 53168404Spjd * 54168404Spjd * (1) Allocate. This reserves a cache entry for the specified region. 55168404Spjd * We separate the allocate and fill operations so that multiple threads 56168404Spjd * don't generate I/O for the same cache miss. 57168404Spjd * 58168404Spjd * (2) Fill. When the I/O for a cache miss completes, the fill routine 59168404Spjd * places the data in the previously allocated cache entry. 60168404Spjd * 61168404Spjd * (3) Read. Read data from the cache. 62168404Spjd * 63168404Spjd * (4) Write. Update cache contents after write completion. 64168404Spjd * 65168404Spjd * (5) Evict. When allocating a new entry, we evict the oldest (LRU) entry 66168404Spjd * if the total cache size exceeds zfs_vdev_cache_size. 67168404Spjd */ 68168404Spjd 69168404Spjd/* 70168404Spjd * These tunables are for performance analysis. 71168404Spjd */ 72168404Spjd/* 73168404Spjd * All i/os smaller than zfs_vdev_cache_max will be turned into 74168404Spjd * 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software 75185029Spjd * track buffer). At most zfs_vdev_cache_size bytes will be kept in each 76168404Spjd * vdev's vdev_cache. 77223622Smm * 78223622Smm * TODO: Note that with the current ZFS code, it turns out that the 79223622Smm * vdev cache is not helpful, and in some cases actually harmful. It 80223622Smm * is better if we disable this. Once some time has passed, we should 81223622Smm * actually remove this to simplify the code. For now we just disable 82223622Smm * it by setting the zfs_vdev_cache_size to zero. Note that Solaris 11 83223622Smm * has made these same changes. 84168404Spjd */ 85185029Spjdint zfs_vdev_cache_max = 1<<14; /* 16KB */ 86223622Smmint zfs_vdev_cache_size = 0; 87168404Spjdint zfs_vdev_cache_bshift = 16; 88168404Spjd 89185029Spjd#define VCBS (1 << zfs_vdev_cache_bshift) /* 64KB */ 90185029Spjd 91168404SpjdSYSCTL_DECL(_vfs_zfs_vdev); 92168404SpjdSYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, cache, CTLFLAG_RW, 0, "ZFS VDEV Cache"); 93168404SpjdTUNABLE_INT("vfs.zfs.vdev.cache.max", &zfs_vdev_cache_max); 94168404SpjdSYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, max, CTLFLAG_RDTUN, 95168404Spjd &zfs_vdev_cache_max, 0, "Maximum I/O request size that increase read size"); 96168404SpjdTUNABLE_INT("vfs.zfs.vdev.cache.size", &zfs_vdev_cache_size); 97168404SpjdSYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, size, CTLFLAG_RDTUN, 98168404Spjd &zfs_vdev_cache_size, 0, "Size of VDEV cache"); 99185029SpjdTUNABLE_INT("vfs.zfs.vdev.cache.bshift", &zfs_vdev_cache_bshift); 100185029SpjdSYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, bshift, CTLFLAG_RDTUN, 101185029Spjd &zfs_vdev_cache_bshift, 0, "Turn too small requests into 1 << this value"); 102168404Spjd 103185029Spjdkstat_t *vdc_ksp = NULL; 104168404Spjd 105185029Spjdtypedef struct vdc_stats { 106185029Spjd kstat_named_t vdc_stat_delegations; 107185029Spjd kstat_named_t vdc_stat_hits; 108185029Spjd kstat_named_t vdc_stat_misses; 109185029Spjd} vdc_stats_t; 110185029Spjd 111185029Spjdstatic vdc_stats_t vdc_stats = { 112185029Spjd { "delegations", KSTAT_DATA_UINT64 }, 113185029Spjd { "hits", KSTAT_DATA_UINT64 }, 114185029Spjd { "misses", KSTAT_DATA_UINT64 } 115185029Spjd}; 116185029Spjd 117185029Spjd#define VDCSTAT_BUMP(stat) atomic_add_64(&vdc_stats.stat.value.ui64, 1); 118185029Spjd 119168404Spjdstatic int 120168404Spjdvdev_cache_offset_compare(const void *a1, const void *a2) 121168404Spjd{ 122168404Spjd const vdev_cache_entry_t *ve1 = a1; 123168404Spjd const vdev_cache_entry_t *ve2 = a2; 124168404Spjd 125168404Spjd if (ve1->ve_offset < ve2->ve_offset) 126168404Spjd return (-1); 127168404Spjd if (ve1->ve_offset > ve2->ve_offset) 128168404Spjd return (1); 129168404Spjd return (0); 130168404Spjd} 131168404Spjd 132168404Spjdstatic int 133168404Spjdvdev_cache_lastused_compare(const void *a1, const void *a2) 134168404Spjd{ 135168404Spjd const vdev_cache_entry_t *ve1 = a1; 136168404Spjd const vdev_cache_entry_t *ve2 = a2; 137168404Spjd 138168404Spjd if (ve1->ve_lastused < ve2->ve_lastused) 139168404Spjd return (-1); 140168404Spjd if (ve1->ve_lastused > ve2->ve_lastused) 141168404Spjd return (1); 142168404Spjd 143168404Spjd /* 144168404Spjd * Among equally old entries, sort by offset to ensure uniqueness. 145168404Spjd */ 146168404Spjd return (vdev_cache_offset_compare(a1, a2)); 147168404Spjd} 148168404Spjd 149168404Spjd/* 150168404Spjd * Evict the specified entry from the cache. 151168404Spjd */ 152168404Spjdstatic void 153168404Spjdvdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve) 154168404Spjd{ 155168404Spjd ASSERT(MUTEX_HELD(&vc->vc_lock)); 156168404Spjd ASSERT(ve->ve_fill_io == NULL); 157168404Spjd ASSERT(ve->ve_data != NULL); 158168404Spjd 159168404Spjd avl_remove(&vc->vc_lastused_tree, ve); 160168404Spjd avl_remove(&vc->vc_offset_tree, ve); 161168404Spjd zio_buf_free(ve->ve_data, VCBS); 162168404Spjd kmem_free(ve, sizeof (vdev_cache_entry_t)); 163168404Spjd} 164168404Spjd 165168404Spjd/* 166168404Spjd * Allocate an entry in the cache. At the point we don't have the data, 167168404Spjd * we're just creating a placeholder so that multiple threads don't all 168168404Spjd * go off and read the same blocks. 169168404Spjd */ 170168404Spjdstatic vdev_cache_entry_t * 171168404Spjdvdev_cache_allocate(zio_t *zio) 172168404Spjd{ 173168404Spjd vdev_cache_t *vc = &zio->io_vd->vdev_cache; 174168404Spjd uint64_t offset = P2ALIGN(zio->io_offset, VCBS); 175168404Spjd vdev_cache_entry_t *ve; 176168404Spjd 177168404Spjd ASSERT(MUTEX_HELD(&vc->vc_lock)); 178168404Spjd 179168404Spjd if (zfs_vdev_cache_size == 0) 180168404Spjd return (NULL); 181168404Spjd 182168404Spjd /* 183168404Spjd * If adding a new entry would exceed the cache size, 184168404Spjd * evict the oldest entry (LRU). 185168404Spjd */ 186168404Spjd if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) > 187168404Spjd zfs_vdev_cache_size) { 188168404Spjd ve = avl_first(&vc->vc_lastused_tree); 189185029Spjd if (ve->ve_fill_io != NULL) 190168404Spjd return (NULL); 191168404Spjd ASSERT(ve->ve_hits != 0); 192168404Spjd vdev_cache_evict(vc, ve); 193168404Spjd } 194168404Spjd 195168404Spjd ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP); 196168404Spjd ve->ve_offset = offset; 197219089Spjd ve->ve_lastused = ddi_get_lbolt(); 198168404Spjd ve->ve_data = zio_buf_alloc(VCBS); 199168404Spjd 200168404Spjd avl_add(&vc->vc_offset_tree, ve); 201168404Spjd avl_add(&vc->vc_lastused_tree, ve); 202168404Spjd 203168404Spjd return (ve); 204168404Spjd} 205168404Spjd 206168404Spjdstatic void 207168404Spjdvdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio) 208168404Spjd{ 209168404Spjd uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); 210168404Spjd 211168404Spjd ASSERT(MUTEX_HELD(&vc->vc_lock)); 212168404Spjd ASSERT(ve->ve_fill_io == NULL); 213168404Spjd 214219089Spjd if (ve->ve_lastused != ddi_get_lbolt()) { 215168404Spjd avl_remove(&vc->vc_lastused_tree, ve); 216219089Spjd ve->ve_lastused = ddi_get_lbolt(); 217168404Spjd avl_add(&vc->vc_lastused_tree, ve); 218168404Spjd } 219168404Spjd 220168404Spjd ve->ve_hits++; 221168404Spjd bcopy(ve->ve_data + cache_phase, zio->io_data, zio->io_size); 222168404Spjd} 223168404Spjd 224168404Spjd/* 225168404Spjd * Fill a previously allocated cache entry with data. 226168404Spjd */ 227168404Spjdstatic void 228209962Smmvdev_cache_fill(zio_t *fio) 229168404Spjd{ 230209962Smm vdev_t *vd = fio->io_vd; 231168404Spjd vdev_cache_t *vc = &vd->vdev_cache; 232209962Smm vdev_cache_entry_t *ve = fio->io_private; 233209962Smm zio_t *pio; 234168404Spjd 235209962Smm ASSERT(fio->io_size == VCBS); 236168404Spjd 237168404Spjd /* 238168404Spjd * Add data to the cache. 239168404Spjd */ 240168404Spjd mutex_enter(&vc->vc_lock); 241168404Spjd 242209962Smm ASSERT(ve->ve_fill_io == fio); 243209962Smm ASSERT(ve->ve_offset == fio->io_offset); 244209962Smm ASSERT(ve->ve_data == fio->io_data); 245168404Spjd 246168404Spjd ve->ve_fill_io = NULL; 247168404Spjd 248168404Spjd /* 249168404Spjd * Even if this cache line was invalidated by a missed write update, 250168404Spjd * any reads that were queued up before the missed update are still 251168404Spjd * valid, so we can satisfy them from this line before we evict it. 252168404Spjd */ 253209962Smm while ((pio = zio_walk_parents(fio)) != NULL) 254209962Smm vdev_cache_hit(vc, ve, pio); 255168404Spjd 256209962Smm if (fio->io_error || ve->ve_missed_update) 257168404Spjd vdev_cache_evict(vc, ve); 258168404Spjd 259168404Spjd mutex_exit(&vc->vc_lock); 260168404Spjd} 261168404Spjd 262168404Spjd/* 263168404Spjd * Read data from the cache. Returns 0 on cache hit, errno on a miss. 264168404Spjd */ 265168404Spjdint 266168404Spjdvdev_cache_read(zio_t *zio) 267168404Spjd{ 268168404Spjd vdev_cache_t *vc = &zio->io_vd->vdev_cache; 269168404Spjd vdev_cache_entry_t *ve, ve_search; 270168404Spjd uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS); 271168404Spjd uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); 272168404Spjd zio_t *fio; 273168404Spjd 274168404Spjd ASSERT(zio->io_type == ZIO_TYPE_READ); 275168404Spjd 276168404Spjd if (zio->io_flags & ZIO_FLAG_DONT_CACHE) 277249195Smm return (SET_ERROR(EINVAL)); 278168404Spjd 279168404Spjd if (zio->io_size > zfs_vdev_cache_max) 280249195Smm return (SET_ERROR(EOVERFLOW)); 281168404Spjd 282168404Spjd /* 283168404Spjd * If the I/O straddles two or more cache blocks, don't cache it. 284168404Spjd */ 285208047Smm if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS)) 286249195Smm return (SET_ERROR(EXDEV)); 287168404Spjd 288168404Spjd ASSERT(cache_phase + zio->io_size <= VCBS); 289168404Spjd 290168404Spjd mutex_enter(&vc->vc_lock); 291168404Spjd 292168404Spjd ve_search.ve_offset = cache_offset; 293168404Spjd ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL); 294168404Spjd 295168404Spjd if (ve != NULL) { 296168404Spjd if (ve->ve_missed_update) { 297168404Spjd mutex_exit(&vc->vc_lock); 298249195Smm return (SET_ERROR(ESTALE)); 299168404Spjd } 300168404Spjd 301168404Spjd if ((fio = ve->ve_fill_io) != NULL) { 302168404Spjd zio_vdev_io_bypass(zio); 303209962Smm zio_add_child(zio, fio); 304168404Spjd mutex_exit(&vc->vc_lock); 305185029Spjd VDCSTAT_BUMP(vdc_stat_delegations); 306168404Spjd return (0); 307168404Spjd } 308168404Spjd 309168404Spjd vdev_cache_hit(vc, ve, zio); 310168404Spjd zio_vdev_io_bypass(zio); 311168404Spjd 312168404Spjd mutex_exit(&vc->vc_lock); 313185029Spjd VDCSTAT_BUMP(vdc_stat_hits); 314168404Spjd return (0); 315168404Spjd } 316168404Spjd 317168404Spjd ve = vdev_cache_allocate(zio); 318168404Spjd 319168404Spjd if (ve == NULL) { 320168404Spjd mutex_exit(&vc->vc_lock); 321249195Smm return (SET_ERROR(ENOMEM)); 322168404Spjd } 323168404Spjd 324185029Spjd fio = zio_vdev_delegated_io(zio->io_vd, cache_offset, 325168404Spjd ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL, 326185029Spjd ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve); 327168404Spjd 328168404Spjd ve->ve_fill_io = fio; 329168404Spjd zio_vdev_io_bypass(zio); 330209962Smm zio_add_child(zio, fio); 331168404Spjd 332168404Spjd mutex_exit(&vc->vc_lock); 333168404Spjd zio_nowait(fio); 334185029Spjd VDCSTAT_BUMP(vdc_stat_misses); 335168404Spjd 336168404Spjd return (0); 337168404Spjd} 338168404Spjd 339168404Spjd/* 340168404Spjd * Update cache contents upon write completion. 341168404Spjd */ 342168404Spjdvoid 343168404Spjdvdev_cache_write(zio_t *zio) 344168404Spjd{ 345168404Spjd vdev_cache_t *vc = &zio->io_vd->vdev_cache; 346168404Spjd vdev_cache_entry_t *ve, ve_search; 347168404Spjd uint64_t io_start = zio->io_offset; 348168404Spjd uint64_t io_end = io_start + zio->io_size; 349168404Spjd uint64_t min_offset = P2ALIGN(io_start, VCBS); 350168404Spjd uint64_t max_offset = P2ROUNDUP(io_end, VCBS); 351168404Spjd avl_index_t where; 352168404Spjd 353168404Spjd ASSERT(zio->io_type == ZIO_TYPE_WRITE); 354168404Spjd 355168404Spjd mutex_enter(&vc->vc_lock); 356168404Spjd 357168404Spjd ve_search.ve_offset = min_offset; 358168404Spjd ve = avl_find(&vc->vc_offset_tree, &ve_search, &where); 359168404Spjd 360168404Spjd if (ve == NULL) 361168404Spjd ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER); 362168404Spjd 363168404Spjd while (ve != NULL && ve->ve_offset < max_offset) { 364168404Spjd uint64_t start = MAX(ve->ve_offset, io_start); 365168404Spjd uint64_t end = MIN(ve->ve_offset + VCBS, io_end); 366168404Spjd 367168404Spjd if (ve->ve_fill_io != NULL) { 368168404Spjd ve->ve_missed_update = 1; 369168404Spjd } else { 370168404Spjd bcopy((char *)zio->io_data + start - io_start, 371168404Spjd ve->ve_data + start - ve->ve_offset, end - start); 372168404Spjd } 373168404Spjd ve = AVL_NEXT(&vc->vc_offset_tree, ve); 374168404Spjd } 375168404Spjd mutex_exit(&vc->vc_lock); 376168404Spjd} 377168404Spjd 378168404Spjdvoid 379185029Spjdvdev_cache_purge(vdev_t *vd) 380185029Spjd{ 381185029Spjd vdev_cache_t *vc = &vd->vdev_cache; 382185029Spjd vdev_cache_entry_t *ve; 383185029Spjd 384185029Spjd mutex_enter(&vc->vc_lock); 385185029Spjd while ((ve = avl_first(&vc->vc_offset_tree)) != NULL) 386185029Spjd vdev_cache_evict(vc, ve); 387185029Spjd mutex_exit(&vc->vc_lock); 388185029Spjd} 389185029Spjd 390185029Spjdvoid 391168404Spjdvdev_cache_init(vdev_t *vd) 392168404Spjd{ 393168404Spjd vdev_cache_t *vc = &vd->vdev_cache; 394168404Spjd 395168404Spjd mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL); 396168404Spjd 397168404Spjd avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare, 398168404Spjd sizeof (vdev_cache_entry_t), 399168404Spjd offsetof(struct vdev_cache_entry, ve_offset_node)); 400168404Spjd 401168404Spjd avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare, 402168404Spjd sizeof (vdev_cache_entry_t), 403168404Spjd offsetof(struct vdev_cache_entry, ve_lastused_node)); 404168404Spjd} 405168404Spjd 406168404Spjdvoid 407168404Spjdvdev_cache_fini(vdev_t *vd) 408168404Spjd{ 409168404Spjd vdev_cache_t *vc = &vd->vdev_cache; 410168404Spjd 411185029Spjd vdev_cache_purge(vd); 412168404Spjd 413168404Spjd avl_destroy(&vc->vc_offset_tree); 414168404Spjd avl_destroy(&vc->vc_lastused_tree); 415168404Spjd 416168404Spjd mutex_destroy(&vc->vc_lock); 417168404Spjd} 418185029Spjd 419185029Spjdvoid 420185029Spjdvdev_cache_stat_init(void) 421185029Spjd{ 422185029Spjd vdc_ksp = kstat_create("zfs", 0, "vdev_cache_stats", "misc", 423185029Spjd KSTAT_TYPE_NAMED, sizeof (vdc_stats) / sizeof (kstat_named_t), 424185029Spjd KSTAT_FLAG_VIRTUAL); 425185029Spjd if (vdc_ksp != NULL) { 426185029Spjd vdc_ksp->ks_data = &vdc_stats; 427185029Spjd kstat_install(vdc_ksp); 428185029Spjd } 429185029Spjd} 430185029Spjd 431185029Spjdvoid 432185029Spjdvdev_cache_stat_fini(void) 433185029Spjd{ 434185029Spjd if (vdc_ksp != NULL) { 435185029Spjd kstat_delete(vdc_ksp); 436185029Spjd vdc_ksp = NULL; 437185029Spjd } 438185029Spjd} 439