dmu_zfetch.c revision 262167
1253883Ssjg/* 2246149Ssjg * CDDL HEADER START 3246149Ssjg * 4246149Ssjg * The contents of this file are subject to the terms of the 5246149Ssjg * Common Development and Distribution License (the "License"). 6246149Ssjg * You may not use this file except in compliance with the License. 7246149Ssjg * 8246149Ssjg * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9246149Ssjg * or http://www.opensolaris.org/os/licensing. 10246149Ssjg * See the License for the specific language governing permissions 11246149Ssjg * and limitations under the License. 12246149Ssjg * 13246149Ssjg * When distributing Covered Code, include this CDDL HEADER in each 14246149Ssjg * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15246149Ssjg * If applicable, add the following below this CDDL HEADER, with the 16246149Ssjg * fields enclosed by brackets "[]" replaced with your own identifying 17246149Ssjg * information: Portions Copyright [yyyy] [name of copyright owner] 18246149Ssjg * 19246149Ssjg * CDDL HEADER END 20246149Ssjg */ 21246149Ssjg/* 22246149Ssjg * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23246149Ssjg * Use is subject to license terms. 24246149Ssjg */ 25246149Ssjg 26246149Ssjg/* 27246149Ssjg * Copyright (c) 2013 by Delphix. All rights reserved. 28246149Ssjg */ 29246149Ssjg 30246149Ssjg#include <sys/zfs_context.h> 31246149Ssjg#include <sys/dnode.h> 32246149Ssjg#include <sys/dmu_objset.h> 33246149Ssjg#include <sys/dmu_zfetch.h> 34246149Ssjg#include <sys/dmu.h> 35246149Ssjg#include <sys/dbuf.h> 36246149Ssjg#include <sys/kstat.h> 37246149Ssjg 38246149Ssjg/* 39246149Ssjg * I'm against tune-ables, but these should probably exist as tweakable globals 40246149Ssjg * until we can get this working the way we want it to. 
41246149Ssjg */ 42246149Ssjg 43246149Ssjgint zfs_prefetch_disable = 0; 44246149Ssjg 45246149Ssjg/* max # of streams per zfetch */ 46246149Ssjguint32_t zfetch_max_streams = 8; 47246149Ssjg/* min time before stream reclaim */ 48246149Ssjguint32_t zfetch_min_sec_reap = 2; 49246149Ssjg/* max number of blocks to fetch at a time */ 50246149Ssjguint32_t zfetch_block_cap = 256; 51246149Ssjg/* number of bytes in a array_read at which we stop prefetching (1Mb) */ 52246149Ssjguint64_t zfetch_array_rd_sz = 1024 * 1024; 53246149Ssjg 54246149SsjgSYSCTL_DECL(_vfs_zfs); 55246149SsjgSYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RW, 56246149Ssjg &zfs_prefetch_disable, 0, "Disable prefetch"); 57246149SsjgSYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH"); 58246149SsjgTUNABLE_INT("vfs.zfs.zfetch.max_streams", &zfetch_max_streams); 59246149SsjgSYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_streams, CTLFLAG_RW, 60246149Ssjg &zfetch_max_streams, 0, "Max # of streams per zfetch"); 61246149SsjgTUNABLE_INT("vfs.zfs.zfetch.min_sec_reap", &zfetch_min_sec_reap); 62246149SsjgSYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, min_sec_reap, CTLFLAG_RDTUN, 63246149Ssjg &zfetch_min_sec_reap, 0, "Min time before stream reclaim"); 64246149SsjgTUNABLE_INT("vfs.zfs.zfetch.block_cap", &zfetch_block_cap); 65246149SsjgSYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, block_cap, CTLFLAG_RDTUN, 66246149Ssjg &zfetch_block_cap, 0, "Max number of blocks to fetch at a time"); 67246149SsjgTUNABLE_QUAD("vfs.zfs.zfetch.array_rd_sz", &zfetch_array_rd_sz); 68246149SsjgSYSCTL_UQUAD(_vfs_zfs_zfetch, OID_AUTO, array_rd_sz, CTLFLAG_RDTUN, 69246149Ssjg &zfetch_array_rd_sz, 0, 70246149Ssjg "Number of bytes in a array_read at which we stop prefetching"); 71246149Ssjg 72246149Ssjg/* forward decls for static routines */ 73246149Ssjgstatic boolean_t dmu_zfetch_colinear(zfetch_t *, zstream_t *); 74246149Ssjgstatic void dmu_zfetch_dofetch(zfetch_t *, zstream_t *); 75246149Ssjgstatic uint64_t dmu_zfetch_fetch(dnode_t *, 
uint64_t, uint64_t); 76246149Ssjgstatic uint64_t dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t); 77246149Ssjgstatic boolean_t dmu_zfetch_find(zfetch_t *, zstream_t *, int); 78246149Ssjgstatic int dmu_zfetch_stream_insert(zfetch_t *, zstream_t *); 79246149Ssjgstatic zstream_t *dmu_zfetch_stream_reclaim(zfetch_t *); 80246149Ssjgstatic void dmu_zfetch_stream_remove(zfetch_t *, zstream_t *); 81246149Ssjgstatic int dmu_zfetch_streams_equal(zstream_t *, zstream_t *); 82246149Ssjg 83246149Ssjgtypedef struct zfetch_stats { 84246149Ssjg kstat_named_t zfetchstat_hits; 85246149Ssjg kstat_named_t zfetchstat_misses; 86246149Ssjg kstat_named_t zfetchstat_colinear_hits; 87246149Ssjg kstat_named_t zfetchstat_colinear_misses; 88246149Ssjg kstat_named_t zfetchstat_stride_hits; 89246149Ssjg kstat_named_t zfetchstat_stride_misses; 90246149Ssjg kstat_named_t zfetchstat_reclaim_successes; 91246149Ssjg kstat_named_t zfetchstat_reclaim_failures; 92246149Ssjg kstat_named_t zfetchstat_stream_resets; 93246149Ssjg kstat_named_t zfetchstat_stream_noresets; 94246149Ssjg kstat_named_t zfetchstat_bogus_streams; 95246149Ssjg} zfetch_stats_t; 96246149Ssjg 97246149Ssjgstatic zfetch_stats_t zfetch_stats = { 98246149Ssjg { "hits", KSTAT_DATA_UINT64 }, 99246149Ssjg { "misses", KSTAT_DATA_UINT64 }, 100246149Ssjg { "colinear_hits", KSTAT_DATA_UINT64 }, 101246149Ssjg { "colinear_misses", KSTAT_DATA_UINT64 }, 102246149Ssjg { "stride_hits", KSTAT_DATA_UINT64 }, 103246149Ssjg { "stride_misses", KSTAT_DATA_UINT64 }, 104246149Ssjg { "reclaim_successes", KSTAT_DATA_UINT64 }, 105246149Ssjg { "reclaim_failures", KSTAT_DATA_UINT64 }, 106246149Ssjg { "streams_resets", KSTAT_DATA_UINT64 }, 107246149Ssjg { "streams_noresets", KSTAT_DATA_UINT64 }, 108246149Ssjg { "bogus_streams", KSTAT_DATA_UINT64 }, 109246149Ssjg}; 110246149Ssjg 111246149Ssjg#define ZFETCHSTAT_INCR(stat, val) \ 112246149Ssjg atomic_add_64(&zfetch_stats.stat.value.ui64, (val)); 113246149Ssjg 114246149Ssjg#define ZFETCHSTAT_BUMP(stat) 
ZFETCHSTAT_INCR(stat, 1); 115246149Ssjg 116246149Ssjgkstat_t *zfetch_ksp; 117246149Ssjg 118246149Ssjg/* 119246149Ssjg * Given a zfetch structure and a zstream structure, determine whether the 120246149Ssjg * blocks to be read are part of a co-linear pair of existing prefetch 121246149Ssjg * streams. If a set is found, coalesce the streams, removing one, and 122246149Ssjg * configure the prefetch so it looks for a strided access pattern. 123246149Ssjg * 124246149Ssjg * In other words: if we find two sequential access streams that are 125246149Ssjg * the same length and distance N appart, and this read is N from the 126246149Ssjg * last stream, then we are probably in a strided access pattern. So 127246149Ssjg * combine the two sequential streams into a single strided stream. 128246149Ssjg * 129246149Ssjg * Returns whether co-linear streams were found. 130246149Ssjg */ 131246149Ssjgstatic boolean_t 132246149Ssjgdmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh) 133246149Ssjg{ 134246149Ssjg zstream_t *z_walk; 135246149Ssjg zstream_t *z_comp; 136246149Ssjg 137246149Ssjg if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER)) 138246149Ssjg return (0); 139246149Ssjg 140246149Ssjg if (zh == NULL) { 141246149Ssjg rw_exit(&zf->zf_rwlock); 142246149Ssjg return (0); 143246149Ssjg } 144246149Ssjg 145246149Ssjg for (z_walk = list_head(&zf->zf_stream); z_walk; 146246149Ssjg z_walk = list_next(&zf->zf_stream, z_walk)) { 147246149Ssjg for (z_comp = list_next(&zf->zf_stream, z_walk); z_comp; 148246149Ssjg z_comp = list_next(&zf->zf_stream, z_comp)) { 149246149Ssjg int64_t diff; 150246149Ssjg 151246149Ssjg if (z_walk->zst_len != z_walk->zst_stride || 152246149Ssjg z_comp->zst_len != z_comp->zst_stride) { 153246149Ssjg continue; 154246149Ssjg } 155246149Ssjg 156246149Ssjg diff = z_comp->zst_offset - z_walk->zst_offset; 157246149Ssjg if (z_comp->zst_offset + diff == zh->zst_offset) { 158246149Ssjg z_walk->zst_offset = zh->zst_offset; 159246149Ssjg z_walk->zst_direction = diff < 0 ? 
-1 : 1; 160246149Ssjg z_walk->zst_stride = 161246149Ssjg diff * z_walk->zst_direction; 162246149Ssjg z_walk->zst_ph_offset = 163246149Ssjg zh->zst_offset + z_walk->zst_stride; 164246149Ssjg dmu_zfetch_stream_remove(zf, z_comp); 165246149Ssjg mutex_destroy(&z_comp->zst_lock); 166246149Ssjg kmem_free(z_comp, sizeof (zstream_t)); 167246149Ssjg 168246149Ssjg dmu_zfetch_dofetch(zf, z_walk); 169246149Ssjg 170246149Ssjg rw_exit(&zf->zf_rwlock); 171246149Ssjg return (1); 172246149Ssjg } 173246149Ssjg 174246149Ssjg diff = z_walk->zst_offset - z_comp->zst_offset; 175246149Ssjg if (z_walk->zst_offset + diff == zh->zst_offset) { 176246149Ssjg z_walk->zst_offset = zh->zst_offset; 177246149Ssjg z_walk->zst_direction = diff < 0 ? -1 : 1; 178246149Ssjg z_walk->zst_stride = 179246149Ssjg diff * z_walk->zst_direction; 180246149Ssjg z_walk->zst_ph_offset = 181246149Ssjg zh->zst_offset + z_walk->zst_stride; 182246149Ssjg dmu_zfetch_stream_remove(zf, z_comp); 183246149Ssjg mutex_destroy(&z_comp->zst_lock); 184246149Ssjg kmem_free(z_comp, sizeof (zstream_t)); 185246149Ssjg 186246149Ssjg dmu_zfetch_dofetch(zf, z_walk); 187246149Ssjg 188246149Ssjg rw_exit(&zf->zf_rwlock); 189246149Ssjg return (1); 190246149Ssjg } 191246149Ssjg } 192246149Ssjg } 193246149Ssjg 194246149Ssjg rw_exit(&zf->zf_rwlock); 195246149Ssjg return (0); 196246149Ssjg} 197246149Ssjg 198246149Ssjg/* 199246149Ssjg * Given a zstream_t, determine the bounds of the prefetch. Then call the 200246149Ssjg * routine that actually prefetches the individual blocks. 
 */
static void
dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs)
{
	uint64_t	prefetch_tail;
	uint64_t	prefetch_limit;
	uint64_t	prefetch_ofst;
	uint64_t	prefetch_len;
	uint64_t	blocks_fetched;

	/*
	 * A sequential stream has stride == len; grow the per-stream block
	 * budget geometrically, capped at the global zfetch_block_cap.
	 */
	zs->zst_stride = MAX((int64_t)zs->zst_stride, zs->zst_len);
	zs->zst_cap = MIN(zfetch_block_cap, 2 * zs->zst_cap);

	/*
	 * Resume from wherever the previous prefetch left off
	 * (zst_ph_offset), but never behind the next full stride.
	 */
	prefetch_tail = MAX((int64_t)zs->zst_ph_offset,
	    (int64_t)(zs->zst_offset + zs->zst_stride));
	/*
	 * XXX: use a faster division method?
	 */
	prefetch_limit = zs->zst_offset + zs->zst_len +
	    (zs->zst_cap * zs->zst_stride) / zs->zst_len;

	while (prefetch_tail < prefetch_limit) {
		/*
		 * For ZFETCH_BACKWARD streams zst_direction is -1, so this
		 * intentionally wraps the unsigned prefetch_ofst when the
		 * target lies before zst_offset; the wrap is detected by
		 * the (prefetch_ofst > prefetch_tail) test below.
		 */
		prefetch_ofst = zs->zst_offset + zs->zst_direction *
		    (prefetch_tail - zs->zst_offset);

		prefetch_len = zs->zst_len;

		/*
		 * Don't prefetch beyond the end of the file, if working
		 * backwards.
		 */
		if ((zs->zst_direction == ZFETCH_BACKWARD) &&
		    (prefetch_ofst > prefetch_tail)) {
			/* underflowed; clamp the fetch to start at block 0 */
			prefetch_len += prefetch_ofst;
			prefetch_ofst = 0;
		}

		/* don't prefetch more than we're supposed to */
		if (prefetch_len > zs->zst_len)
			break;

		blocks_fetched = dmu_zfetch_fetch(zf->zf_dnode,
		    prefetch_ofst, zs->zst_len);

		prefetch_tail += zs->zst_stride;
		/* stop if we've run out of stuff to prefetch */
		if (blocks_fetched < zs->zst_len)
			break;
	}
	/* remember where we stopped so the next call can resume there */
	zs->zst_ph_offset = prefetch_tail;
	zs->zst_last = ddi_get_lbolt();
}

/*
 * Register the zfetch statistics with the kstat framework so they are
 * visible (e.g. via kstat(1M) / sysctl).  Safe to call when kstat
 * creation fails: the stats simply go unreported.
 */
void
zfetch_init(void)
{

	zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc",
	    KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);

	if (zfetch_ksp != NULL) {
		zfetch_ksp->ks_data = &zfetch_stats;
		kstat_install(zfetch_ksp);
	}
}

/* Tear down the kstat registered by zfetch_init(); idempotent. */
void
zfetch_fini(void)
{
	if (zfetch_ksp != NULL) {
		kstat_delete(zfetch_ksp);
		zfetch_ksp = NULL;
	}
}

/*
 * This takes a pointer to a zfetch structure and a dnode.  It performs the
 * necessary setup for the zfetch structure, grokking data from the
 * associated dnode.
281246149Ssjg */ 282246149Ssjgvoid 283246149Ssjgdmu_zfetch_init(zfetch_t *zf, dnode_t *dno) 284253883Ssjg{ 285246149Ssjg if (zf == NULL) { 286246149Ssjg return; 287246149Ssjg } 288246149Ssjg 289246149Ssjg zf->zf_dnode = dno; 290246149Ssjg zf->zf_stream_cnt = 0; 291246149Ssjg zf->zf_alloc_fail = 0; 292246149Ssjg 293253883Ssjg list_create(&zf->zf_stream, sizeof (zstream_t), 294253883Ssjg offsetof(zstream_t, zst_node)); 295246149Ssjg 296246149Ssjg rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL); 297246149Ssjg} 298246149Ssjg 299253883Ssjg/* 300253883Ssjg * This function computes the actual size, in blocks, that can be prefetched, 301246149Ssjg * and fetches it. 302246149Ssjg */ 303246149Ssjgstatic uint64_t 304246149Ssjgdmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks) 305246149Ssjg{ 306246149Ssjg uint64_t fetchsz; 307246149Ssjg uint64_t i; 308246149Ssjg 309246149Ssjg fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks); 310246149Ssjg 311246149Ssjg for (i = 0; i < fetchsz; i++) { 312246149Ssjg dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_ASYNC_READ); 313246149Ssjg } 314246149Ssjg 315246149Ssjg return (fetchsz); 316246149Ssjg} 317246149Ssjg 318246149Ssjg/* 319253883Ssjg * this function returns the number of blocks that would be prefetched, based 320246149Ssjg * upon the supplied dnode, blockid, and nblks. This is used so that we can 321246149Ssjg * update streams in place, and then prefetch with their old value after the 322246149Ssjg * fact. This way, we can delay the prefetch, but subsequent accesses to the 323246149Ssjg * stream won't result in the same data being prefetched multiple times. 
 */
static uint64_t
dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks)
{
	uint64_t fetchsz;

	/* nothing to fetch past the last allocated block */
	if (blkid > dn->dn_maxblkid) {
		return (0);
	}

	/* compute fetch size */
	/*
	 * NOTE(review): the "+ 1" here looks one block conservative — a
	 * request whose last block is exactly dn_maxblkid takes the clamp
	 * path even though it would fit.  Harmless (the clamp yields the
	 * same count), but confirm before "fixing".
	 */
	if (blkid + nblks + 1 > dn->dn_maxblkid) {
		fetchsz = (dn->dn_maxblkid - blkid) + 1;
		ASSERT(blkid + fetchsz - 1 <= dn->dn_maxblkid);
	} else {
		fetchsz = nblks;
	}


	return (fetchsz);
}

/*
 * given a zfetch and a zstream structure, see if there is an associated zstream
 * for this block read.  If so, it starts a prefetch for the stream it
 * located and returns true, otherwise it returns false
 *
 * zh describes the current access (offset/len in blocks); prefetched says
 * whether the caller believes this data was already prefetched.  Returns
 * nonzero ("hit") when an existing stream matched, zero otherwise.
 */
static boolean_t
dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched)
{
	zstream_t *zs;
	int64_t diff;
	/* a non-prefetched access defaults to resetting a matched stream */
	int reset = !prefetched;
	int rc = 0;

	if (zh == NULL)
		return (0);

	/*
	 * XXX: This locking strategy is a bit coarse; however, it's impact has
	 * yet to be tested.  If this turns out to be an issue, it can be
	 * modified in a number of different ways.
	 */

	rw_enter(&zf->zf_rwlock, RW_READER);
top:
	/*
	 * Walk every stream looking for one this access extends.  Each
	 * match re-checks its condition after taking the per-stream mutex,
	 * and restarts the walk (goto top) if the stream changed under us.
	 */

	for (zs = list_head(&zf->zf_stream); zs;
	    zs = list_next(&zf->zf_stream, zs)) {

		/*
		 * XXX - should this be an assert?
		 */
		if (zs->zst_len == 0) {
			/* bogus stream */
			ZFETCHSTAT_BUMP(zfetchstat_bogus_streams);
			continue;
		}

		/*
		 * We hit this case when we are in a strided prefetch stream:
		 * we will read "len" blocks before "striding".
		 */
		if (zh->zst_offset >= zs->zst_offset &&
		    zh->zst_offset < zs->zst_offset + zs->zst_len) {
			if (prefetched) {
				/* already fetched */
				ZFETCHSTAT_BUMP(zfetchstat_stride_hits);
				rc = 1;
				goto out;
			} else {
				ZFETCHSTAT_BUMP(zfetchstat_stride_misses);
			}
		}

		/*
		 * This is the forward sequential read case: we increment
		 * len by one each time we hit here, so we will enter this
		 * case on every read.
		 */
		if (zh->zst_offset == zs->zst_offset + zs->zst_len) {

			reset = !prefetched && zs->zst_len > 1;

			/* busy stream: someone else is advancing it */
			if (mutex_tryenter(&zs->zst_lock) == 0) {
				rc = 1;
				goto out;
			}

			/* re-check under the mutex; restart if it moved */
			if (zh->zst_offset != zs->zst_offset + zs->zst_len) {
				mutex_exit(&zs->zst_lock);
				goto top;
			}
			zs->zst_len += zh->zst_len;
			/* keep the stream no longer than the block cap */
			diff = zs->zst_len - zfetch_block_cap;
			if (diff > 0) {
				zs->zst_offset += diff;
				zs->zst_len = zs->zst_len > diff ?
				    zs->zst_len - diff : 0;
			}
			zs->zst_direction = ZFETCH_FORWARD;

			break;

		/*
		 * Same as above, but reading backwards through the file.
		 */
		} else if (zh->zst_offset == zs->zst_offset - zh->zst_len) {
			/* backwards sequential access */

			reset = !prefetched && zs->zst_len > 1;

			if (mutex_tryenter(&zs->zst_lock) == 0) {
				rc = 1;
				goto out;
			}

			if (zh->zst_offset != zs->zst_offset - zh->zst_len) {
				mutex_exit(&zs->zst_lock);
				goto top;
			}

			/* slide the stream back, clamping at block 0 */
			zs->zst_offset = zs->zst_offset > zh->zst_len ?
			    zs->zst_offset - zh->zst_len : 0;
			zs->zst_ph_offset = zs->zst_ph_offset > zh->zst_len ?
			    zs->zst_ph_offset - zh->zst_len : 0;
			zs->zst_len += zh->zst_len;

			diff = zs->zst_len - zfetch_block_cap;
			if (diff > 0) {
				zs->zst_ph_offset = zs->zst_ph_offset > diff ?
				    zs->zst_ph_offset - diff : 0;
				zs->zst_len = zs->zst_len > diff ?
				    zs->zst_len - diff : zs->zst_len;
			}
			zs->zst_direction = ZFETCH_BACKWARD;

			break;

		} else if ((zh->zst_offset - zs->zst_offset - zs->zst_stride <
		    zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
			/* strided forward access */

			if (mutex_tryenter(&zs->zst_lock) == 0) {
				rc = 1;
				goto out;
			}

			if ((zh->zst_offset - zs->zst_offset - zs->zst_stride >=
			    zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
				mutex_exit(&zs->zst_lock);
				goto top;
			}

			/* advance the stream by one stride */
			zs->zst_offset += zs->zst_stride;
			zs->zst_direction = ZFETCH_FORWARD;

			break;

		} else if ((zh->zst_offset - zs->zst_offset + zs->zst_stride <
		    zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
			/* strided reverse access */

			if (mutex_tryenter(&zs->zst_lock) == 0) {
				rc = 1;
				goto out;
			}

			if ((zh->zst_offset - zs->zst_offset + zs->zst_stride >=
			    zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
				mutex_exit(&zs->zst_lock);
				goto top;
			}

			/* step the stream back one stride, clamping at 0 */
			zs->zst_offset = zs->zst_offset > zs->zst_stride ?
			    zs->zst_offset - zs->zst_stride : 0;
			zs->zst_ph_offset = (zs->zst_ph_offset >
			    (2 * zs->zst_stride)) ?
			    (zs->zst_ph_offset - (2 * zs->zst_stride)) : 0;
			zs->zst_direction = ZFETCH_BACKWARD;

			break;
		}
	}

	/* zs != NULL means we matched a stream and still hold zst_lock */
	if (zs) {
		if (reset) {
			zstream_t *remove = zs;

			ZFETCHSTAT_BUMP(zfetchstat_stream_resets);
			rc = 0;
			mutex_exit(&zs->zst_lock);
			/* upgrade to WRITER; must drop and re-take */
			rw_exit(&zf->zf_rwlock);
			rw_enter(&zf->zf_rwlock, RW_WRITER);
			/*
			 * Relocate the stream, in case someone removes
			 * it while we were acquiring the WRITER lock.
			 */
			for (zs = list_head(&zf->zf_stream); zs;
			    zs = list_next(&zf->zf_stream, zs)) {
				if (zs == remove) {
					dmu_zfetch_stream_remove(zf, zs);
					mutex_destroy(&zs->zst_lock);
					kmem_free(zs, sizeof (zstream_t));
					break;
				}
			}
		} else {
			ZFETCHSTAT_BUMP(zfetchstat_stream_noresets);
			rc = 1;
			dmu_zfetch_dofetch(zf, zs);
			mutex_exit(&zs->zst_lock);
		}
	}
out:
	rw_exit(&zf->zf_rwlock);
	return (rc);
}

/*
 * Clean-up state associated with a zfetch structure.  This frees allocated
 * structure members, empties the zf_stream tree, and generally makes things
 * nice.  This doesn't free the zfetch_t itself, that's left to the caller.
547246149Ssjg */ 548246149Ssjgvoid 549246149Ssjgdmu_zfetch_rele(zfetch_t *zf) 550246149Ssjg{ 551246149Ssjg zstream_t *zs; 552246149Ssjg zstream_t *zs_next; 553246149Ssjg 554246149Ssjg ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock)); 555246149Ssjg 556246149Ssjg for (zs = list_head(&zf->zf_stream); zs; zs = zs_next) { 557246149Ssjg zs_next = list_next(&zf->zf_stream, zs); 558246149Ssjg 559246149Ssjg list_remove(&zf->zf_stream, zs); 560246149Ssjg mutex_destroy(&zs->zst_lock); 561246149Ssjg kmem_free(zs, sizeof (zstream_t)); 562246149Ssjg } 563246149Ssjg list_destroy(&zf->zf_stream); 564246149Ssjg rw_destroy(&zf->zf_rwlock); 565246149Ssjg 566246149Ssjg zf->zf_dnode = NULL; 567246149Ssjg} 568246149Ssjg 569246149Ssjg/* 570246149Ssjg * Given a zfetch and zstream structure, insert the zstream structure into the 571246149Ssjg * AVL tree contained within the zfetch structure. Peform the appropriate 572246149Ssjg * book-keeping. It is possible that another thread has inserted a stream which 573246149Ssjg * matches one that we are about to insert, so we must be sure to check for this 574246149Ssjg * case. If one is found, return failure, and let the caller cleanup the 575246149Ssjg * duplicates. 
576246149Ssjg */ 577246149Ssjgstatic int 578246149Ssjgdmu_zfetch_stream_insert(zfetch_t *zf, zstream_t *zs) 579246149Ssjg{ 580246149Ssjg zstream_t *zs_walk; 581246149Ssjg zstream_t *zs_next; 582246149Ssjg 583246149Ssjg ASSERT(RW_WRITE_HELD(&zf->zf_rwlock)); 584246149Ssjg 585246149Ssjg for (zs_walk = list_head(&zf->zf_stream); zs_walk; zs_walk = zs_next) { 586246149Ssjg zs_next = list_next(&zf->zf_stream, zs_walk); 587246149Ssjg 588246149Ssjg if (dmu_zfetch_streams_equal(zs_walk, zs)) { 589246149Ssjg return (0); 590246149Ssjg } 591246149Ssjg } 592246149Ssjg 593246149Ssjg list_insert_head(&zf->zf_stream, zs); 594246149Ssjg zf->zf_stream_cnt++; 595246149Ssjg return (1); 596246149Ssjg} 597246149Ssjg 598246149Ssjg 599246149Ssjg/* 600246149Ssjg * Walk the list of zstreams in the given zfetch, find an old one (by time), and 601246149Ssjg * reclaim it for use by the caller. 602246149Ssjg */ 603static zstream_t * 604dmu_zfetch_stream_reclaim(zfetch_t *zf) 605{ 606 zstream_t *zs; 607 clock_t ticks; 608 609 ticks = zfetch_min_sec_reap * hz; 610 if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER)) 611 return (0); 612 613 for (zs = list_head(&zf->zf_stream); zs; 614 zs = list_next(&zf->zf_stream, zs)) { 615 616 if (ddi_get_lbolt() - zs->zst_last > ticks) 617 break; 618 } 619 620 if (zs) { 621 dmu_zfetch_stream_remove(zf, zs); 622 mutex_destroy(&zs->zst_lock); 623 bzero(zs, sizeof (zstream_t)); 624 } else { 625 zf->zf_alloc_fail++; 626 } 627 rw_exit(&zf->zf_rwlock); 628 629 return (zs); 630} 631 632/* 633 * Given a zfetch and zstream structure, remove the zstream structure from its 634 * container in the zfetch structure. Perform the appropriate book-keeping. 
635 */ 636static void 637dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs) 638{ 639 ASSERT(RW_WRITE_HELD(&zf->zf_rwlock)); 640 641 list_remove(&zf->zf_stream, zs); 642 zf->zf_stream_cnt--; 643} 644 645static int 646dmu_zfetch_streams_equal(zstream_t *zs1, zstream_t *zs2) 647{ 648 if (zs1->zst_offset != zs2->zst_offset) 649 return (0); 650 651 if (zs1->zst_len != zs2->zst_len) 652 return (0); 653 654 if (zs1->zst_stride != zs2->zst_stride) 655 return (0); 656 657 if (zs1->zst_ph_offset != zs2->zst_ph_offset) 658 return (0); 659 660 if (zs1->zst_cap != zs2->zst_cap) 661 return (0); 662 663 if (zs1->zst_direction != zs2->zst_direction) 664 return (0); 665 666 return (1); 667} 668 669/* 670 * This is the prefetch entry point. It calls all of the other dmu_zfetch 671 * routines to create, delete, find, or operate upon prefetch streams. 672 */ 673void 674dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched) 675{ 676 zstream_t zst; 677 zstream_t *newstream; 678 boolean_t fetched; 679 int inserted; 680 unsigned int blkshft; 681 uint64_t blksz; 682 683 if (zfs_prefetch_disable) 684 return; 685 686 /* files that aren't ln2 blocksz are only one block -- nothing to do */ 687 if (!zf->zf_dnode->dn_datablkshift) 688 return; 689 690 /* convert offset and size, into blockid and nblocks */ 691 blkshft = zf->zf_dnode->dn_datablkshift; 692 blksz = (1 << blkshft); 693 694 bzero(&zst, sizeof (zstream_t)); 695 zst.zst_offset = offset >> blkshft; 696 zst.zst_len = (P2ROUNDUP(offset + size, blksz) - 697 P2ALIGN(offset, blksz)) >> blkshft; 698 699 fetched = dmu_zfetch_find(zf, &zst, prefetched); 700 if (fetched) { 701 ZFETCHSTAT_BUMP(zfetchstat_hits); 702 } else { 703 ZFETCHSTAT_BUMP(zfetchstat_misses); 704 fetched = dmu_zfetch_colinear(zf, &zst); 705 if (fetched) { 706 ZFETCHSTAT_BUMP(zfetchstat_colinear_hits); 707 } else { 708 ZFETCHSTAT_BUMP(zfetchstat_colinear_misses); 709 } 710 } 711 712 if (!fetched) { 713 newstream = dmu_zfetch_stream_reclaim(zf); 714 715 /* 
716 * we still couldn't find a stream, drop the lock, and allocate 717 * one if possible. Otherwise, give up and go home. 718 */ 719 if (newstream) { 720 ZFETCHSTAT_BUMP(zfetchstat_reclaim_successes); 721 } else { 722 uint64_t maxblocks; 723 uint32_t max_streams; 724 uint32_t cur_streams; 725 726 ZFETCHSTAT_BUMP(zfetchstat_reclaim_failures); 727 cur_streams = zf->zf_stream_cnt; 728 maxblocks = zf->zf_dnode->dn_maxblkid; 729 730 max_streams = MIN(zfetch_max_streams, 731 (maxblocks / zfetch_block_cap)); 732 if (max_streams == 0) { 733 max_streams++; 734 } 735 736 if (cur_streams >= max_streams) { 737 return; 738 } 739 newstream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP); 740 } 741 742 newstream->zst_offset = zst.zst_offset; 743 newstream->zst_len = zst.zst_len; 744 newstream->zst_stride = zst.zst_len; 745 newstream->zst_ph_offset = zst.zst_len + zst.zst_offset; 746 newstream->zst_cap = zst.zst_len; 747 newstream->zst_direction = ZFETCH_FORWARD; 748 newstream->zst_last = ddi_get_lbolt(); 749 750 mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL); 751 752 rw_enter(&zf->zf_rwlock, RW_WRITER); 753 inserted = dmu_zfetch_stream_insert(zf, newstream); 754 rw_exit(&zf->zf_rwlock); 755 756 if (!inserted) { 757 mutex_destroy(&newstream->zst_lock); 758 kmem_free(newstream, sizeof (zstream_t)); 759 } 760 } 761} 762