arc.c revision 288580
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2012, Joyent, Inc. All rights reserved. 24 * Copyright (c) 2011, 2014 by Delphix. All rights reserved. 25 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. 26 * Copyright 2014 Nexenta Systems, Inc. All rights reserved. 27 */ 28 29/* 30 * DVA-based Adjustable Replacement Cache 31 * 32 * While much of the theory of operation used here is 33 * based on the self-tuning, low overhead replacement cache 34 * presented by Megiddo and Modha at FAST 2003, there are some 35 * significant differences: 36 * 37 * 1. The Megiddo and Modha model assumes any page is evictable. 38 * Pages in its cache cannot be "locked" into memory. This makes 39 * the eviction algorithm simple: evict the last page in the list. 40 * This also makes the performance characteristics easy to reason 41 * about. Our cache is not so simple. At any given moment, some 42 * subset of the blocks in the cache are un-evictable because we 43 * have handed out a reference to them. Blocks are only evictable 44 * when there are no external references active. This makes 45 * eviction far more problematic: we choose to evict the evictable 46 * blocks that are the "lowest" in the list. 47 * 48 * There are times when it is not possible to evict the requested 49 * space. In these circumstances we are unable to adjust the cache 50 * size. To prevent the cache growing unbounded at these times we 51 * implement a "cache throttle" that slows the flow of new data 52 * into the cache until we can make space available. 53 * 54 * 2. The Megiddo and Modha model assumes a fixed cache size. 55 * Pages are evicted when the cache is full and there is a cache 56 * miss. Our model has a variable sized cache. It grows with 57 * high use, but also tries to react to memory pressure from the 58 * operating system: decreasing its size when system memory is 59 * tight. 60 * 61 * 3. The Megiddo and Modha model assumes a fixed page size. All 62 * elements of the cache are therefore exactly the same size. So 63 * when adjusting the cache size following a cache miss, it's simply 64 * a matter of choosing a single page to evict. In our model, we 65 * have variable sized cache blocks (ranging from 512 bytes to 66 * 128K bytes). We therefore choose a set of blocks to evict to make 67 * space for a cache miss that approximates as closely as possible 68 * the space used by the new block. 69 * 70 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" 71 * by N. Megiddo & D. 
Modha, FAST 2003 72 */ 73 74/* 75 * The locking model: 76 * 77 * A new reference to a cache buffer can be obtained in two 78 * ways: 1) via a hash table lookup using the DVA as a key, 79 * or 2) via one of the ARC lists. The arc_read() interface 80 * uses method 1, while the internal arc algorithms for 81 * adjusting the cache use method 2. We therefore provide two 82 * types of locks: 1) the hash table lock array, and 2) the 83 * arc list locks. 84 * 85 * Buffers do not have their own mutexes; rather, they rely on the 86 * hash table mutexes for the bulk of their protection (i.e. most 87 * fields in the arc_buf_hdr_t are protected by these mutexes). 88 * 89 * buf_hash_find() returns the appropriate mutex (held) when it 90 * locates the requested buffer in the hash table. It returns 91 * NULL for the mutex if the buffer was not in the table. 92 * 93 * buf_hash_remove() expects the appropriate hash mutex to be 94 * already held before it is invoked. 95 * 96 * Each arc state also has a mutex which is used to protect the 97 * buffer list associated with the state. When attempting to 98 * obtain a hash table lock while holding an arc list lock you 99 * must use mutex_tryenter() to avoid deadlock. Also note that 100 * the active state mutex must be held before the ghost state mutex. 101 * 102 * Arc buffers may have an associated eviction callback function. 103 * This function will be invoked prior to removing the buffer (e.g. 104 * in arc_do_user_evicts()). Note however that the data associated 105 * with the buffer may be evicted prior to the callback. The callback 106 * must be made with *no locks held* (to prevent deadlock). Additionally, 107 * the users of callbacks must ensure that their private data is 108 * protected from simultaneous callbacks from arc_clear_callback() 109 * and arc_do_user_evicts(). 110 * 111 * Note that the majority of the performance stats are manipulated 112 * with atomic operations. 113 * 114 * The L2ARC uses the l2ad_mtx on each vdev for the following: 115 * 116 * - L2ARC buflist creation 117 * - L2ARC buflist eviction 118 * - L2ARC write completion, which walks L2ARC buflists 119 * - ARC header destruction, as it removes from L2ARC buflists 120 * - ARC header release, as it removes from L2ARC buflists 121 */ 122 123#include <sys/spa.h> 124#include <sys/zio.h> 125#include <sys/zio_compress.h> 126#include <sys/zfs_context.h> 127#include <sys/arc.h> 128#include <sys/refcount.h> 129#include <sys/vdev.h> 130#include <sys/vdev_impl.h> 131#include <sys/dsl_pool.h> 132#ifdef _KERNEL 133#include <sys/dnlc.h> 134#endif 135#include <sys/callb.h> 136#include <sys/kstat.h> 137#include <sys/trim_map.h> 138#include <zfs_fletcher.h> 139#include <sys/sdt.h> 140 141#include <vm/vm_pageout.h> 142#include <machine/vmparam.h> 143 144#ifdef illumos 145#ifndef _KERNEL 146/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ 147boolean_t arc_watch = B_FALSE; 148int arc_procfd; 149#endif 150#endif /* illumos */ 151 152static kmutex_t arc_reclaim_thr_lock; 153static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ 154static uint8_t arc_thread_exit; 155 156uint_t arc_reduce_dnlc_percent = 3; 157 158/* 159 * The number of iterations through arc_evict_*() before we 160 * drop & reacquire the lock. 
161 */ 162int arc_evict_iterations = 100; 163 164/* number of seconds before growing cache again */ 165static int arc_grow_retry = 60; 166 167/* shift of arc_c for calculating both min and max arc_p */ 168static int arc_p_min_shift = 4; 169 170/* log2(fraction of arc to reclaim) */ 171static int arc_shrink_shift = 7; 172 173/* 174 * log2(fraction of ARC which must be free to allow growing). 175 * I.e. If there is less than arc_c >> arc_no_grow_shift free memory, 176 * when reading a new block into the ARC, we will evict an equal-sized block 177 * from the ARC. 178 * 179 * This must be less than arc_shrink_shift, so that when we shrink the ARC, 180 * we will still not allow it to grow. 181 */ 182int arc_no_grow_shift = 5; 183 184 185/* 186 * minimum lifespan of a prefetch block in clock ticks 187 * (initialized in arc_init()) 188 */ 189static int arc_min_prefetch_lifespan; 190 191/* 192 * If this percent of memory is free, don't throttle. 193 */ 194int arc_lotsfree_percent = 10; 195 196static int arc_dead; 197extern int zfs_prefetch_disable; 198 199/* 200 * The arc has filled available memory and has now warmed up. 201 */ 202static boolean_t arc_warm; 203 204/* 205 * These tunables are for performance analysis. 206 */ 207uint64_t zfs_arc_max; 208uint64_t zfs_arc_min; 209uint64_t zfs_arc_meta_limit = 0; 210uint64_t zfs_arc_meta_min = 0; 211int zfs_arc_grow_retry = 0; 212int zfs_arc_shrink_shift = 0; 213int zfs_arc_p_min_shift = 0; 214int zfs_disable_dup_eviction = 0; 215uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ 216u_int zfs_arc_free_target = 0; 217 218static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS); 219static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS); 220 221#ifdef _KERNEL 222static void 223arc_free_target_init(void *unused __unused) 224{ 225 226 zfs_arc_free_target = vm_pageout_wakeup_thresh; 227} 228SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, 229 arc_free_target_init, NULL); 230 231TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max); 232TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min); 233TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit); 234TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min); 235TUNABLE_QUAD("vfs.zfs.arc_average_blocksize", &zfs_arc_average_blocksize); 236TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift); 237SYSCTL_DECL(_vfs_zfs); 238SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0, 239 "Maximum ARC size"); 240SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0, 241 "Minimum ARC size"); 242SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN, 243 &zfs_arc_average_blocksize, 0, 244 "ARC average blocksize"); 245SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW, 246 &arc_shrink_shift, 0, 247 "log2(fraction of arc to reclaim)"); 248 249/* 250 * We don't have a tunable for arc_free_target due to the dependency on 251 * pagedaemon initialisation. 
 252 */ 253SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target, 254 CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int), 255 sysctl_vfs_zfs_arc_free_target, "IU", 256 "Desired number of free pages below which ARC triggers reclaim"); 257 258static int 259sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS) 260{ 261 u_int val; 262 int err; 263 264 val = zfs_arc_free_target; 265 err = sysctl_handle_int(oidp, &val, 0, req); 266 if (err != 0 || req->newptr == NULL) 267 return (err); 268 269 if (val < minfree) 270 return (EINVAL); 271 if (val > cnt.v_page_count) 272 return (EINVAL); 273 274 zfs_arc_free_target = val; 275 276 return (0); 277} 278 279/* 280 * Must be declared here, before the definition of the corresponding kstat 281 * macro; otherwise using the same names would confuse the compiler. 282 */ 283SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit, 284 CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), 285 sysctl_vfs_zfs_arc_meta_limit, "QU", 286 "ARC metadata limit"); 287#endif 288 289/* 290 * Note that buffers can be in one of 6 states: 291 * ARC_anon - anonymous (discussed below) 292 * ARC_mru - recently used, currently cached 293 * ARC_mru_ghost - recently used, no longer in cache 294 * ARC_mfu - frequently used, currently cached 295 * ARC_mfu_ghost - frequently used, no longer in cache 296 * ARC_l2c_only - exists in L2ARC but not other states 297 * When there are no active references to the buffer, they are 298 * linked onto a list in one of these arc states. These are 299 * the only buffers that can be evicted or deleted. Within each 300 * state there are multiple lists, one for meta-data and one for 301 * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, 302 * etc.) is tracked separately so that it can be managed more 303 * explicitly: favored over data, limited explicitly. 304 * 305 * Anonymous buffers are buffers that are not associated with 306 * a DVA. These are buffers that hold dirty block copies 307 * before they are written to stable storage. By definition, 308 * they are "ref'd" and are considered part of arc_mru 309 * that cannot be freed. Generally, they will acquire a DVA 310 * as they are written and migrate onto the arc_mru list. 311 * 312 * The ARC_l2c_only state is for buffers that are in the second 313 * level ARC but no longer in any of the ARC_m* lists. The second 314 * level ARC itself may also contain buffers that are in any of 315 * the ARC_m* states - meaning that a buffer can exist in two 316 * places. The reason for the ARC_l2c_only state is to keep the 317 * buffer header in the hash table, so that reads that hit the 318 * second level ARC benefit from these fast lookups. 
 319 */ 320 321typedef struct arc_state { 322 list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */ 323 uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */ 324 uint64_t arcs_size; /* total amount of data in this state */ 325 kmutex_t arcs_mtx; 326} arc_state_t; 327 328/* The 6 states: */ 329static arc_state_t ARC_anon; 330static arc_state_t ARC_mru; 331static arc_state_t ARC_mru_ghost; 332static arc_state_t ARC_mfu; 333static arc_state_t ARC_mfu_ghost; 334static arc_state_t ARC_l2c_only; 335 336typedef struct arc_stats { 337 kstat_named_t arcstat_hits; 338 kstat_named_t arcstat_misses; 339 kstat_named_t arcstat_demand_data_hits; 340 kstat_named_t arcstat_demand_data_misses; 341 kstat_named_t arcstat_demand_metadata_hits; 342 kstat_named_t arcstat_demand_metadata_misses; 343 kstat_named_t arcstat_prefetch_data_hits; 344 kstat_named_t arcstat_prefetch_data_misses; 345 kstat_named_t arcstat_prefetch_metadata_hits; 346 kstat_named_t arcstat_prefetch_metadata_misses; 347 kstat_named_t arcstat_mru_hits; 348 kstat_named_t arcstat_mru_ghost_hits; 349 kstat_named_t arcstat_mfu_hits; 350 kstat_named_t arcstat_mfu_ghost_hits; 351 kstat_named_t arcstat_allocated; 352 kstat_named_t arcstat_deleted; 353 kstat_named_t arcstat_recycle_miss; 354 /* 355 * Number of buffers that could not be evicted because the hash lock 356 * was held by another thread. The lock may not necessarily be held 357 * by something using the same buffer, since hash locks are shared 358 * by multiple buffers. 359 */ 360 kstat_named_t arcstat_mutex_miss; 361 /* 362 * Number of buffers skipped because they have I/O in progress, are 363 * indirect prefetch buffers that have not lived long enough, or are 364 * not from the spa we're trying to evict from. 365 */ 366 kstat_named_t arcstat_evict_skip; 367 kstat_named_t arcstat_evict_l2_cached; 368 kstat_named_t arcstat_evict_l2_eligible; 369 kstat_named_t arcstat_evict_l2_ineligible; 370 kstat_named_t arcstat_hash_elements; 371 kstat_named_t arcstat_hash_elements_max; 372 kstat_named_t arcstat_hash_collisions; 373 kstat_named_t arcstat_hash_chains; 374 kstat_named_t arcstat_hash_chain_max; 375 kstat_named_t arcstat_p; 376 kstat_named_t arcstat_c; 377 kstat_named_t arcstat_c_min; 378 kstat_named_t arcstat_c_max; 379 kstat_named_t arcstat_size; 380 /* 381 * Number of bytes consumed by internal ARC structures necessary 382 * for tracking purposes; these structures are not actually 383 * backed by ARC buffers. This includes arc_buf_hdr_t structures 384 * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only 385 * caches), and arc_buf_t structures (allocated via arc_buf_t 386 * cache). 387 */ 388 kstat_named_t arcstat_hdr_size; 389 /* 390 * Number of bytes consumed by ARC buffers of type equal to 391 * ARC_BUFC_DATA. This is generally consumed by buffers backing 392 * on disk user data (e.g. plain file contents). 393 */ 394 kstat_named_t arcstat_data_size; 395 /* 396 * Number of bytes consumed by ARC buffers of type equal to 397 * ARC_BUFC_METADATA. This is generally consumed by buffers 398 * backing on disk data that is used for internal ZFS 399 * structures (e.g. ZAP, dnode, indirect blocks, etc). 400 */ 401 kstat_named_t arcstat_metadata_size; 402 /* 403 * Number of bytes consumed by various buffers and structures 404 * not actually backed with ARC buffers.
This includes bonus 405 * buffers (allocated directly via zio_buf_* functions), 406 * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t 407 * cache), and dnode_t structures (allocated via dnode_t cache). 408 */ 409 kstat_named_t arcstat_other_size; 410 /* 411 * Total number of bytes consumed by ARC buffers residing in the 412 * arc_anon state. This includes *all* buffers in the arc_anon 413 * state; e.g. data, metadata, evictable, and unevictable buffers 414 * are all included in this value. 415 */ 416 kstat_named_t arcstat_anon_size; 417 /* 418 * Number of bytes consumed by ARC buffers that meet the 419 * following criteria: backing buffers of type ARC_BUFC_DATA, 420 * residing in the arc_anon state, and are eligible for eviction 421 * (e.g. have no outstanding holds on the buffer). 422 */ 423 kstat_named_t arcstat_anon_evictable_data; 424 /* 425 * Number of bytes consumed by ARC buffers that meet the 426 * following criteria: backing buffers of type ARC_BUFC_METADATA, 427 * residing in the arc_anon state, and are eligible for eviction 428 * (e.g. have no outstanding holds on the buffer). 429 */ 430 kstat_named_t arcstat_anon_evictable_metadata; 431 /* 432 * Total number of bytes consumed by ARC buffers residing in the 433 * arc_mru state. This includes *all* buffers in the arc_mru 434 * state; e.g. data, metadata, evictable, and unevictable buffers 435 * are all included in this value. 436 */ 437 kstat_named_t arcstat_mru_size; 438 /* 439 * Number of bytes consumed by ARC buffers that meet the 440 * following criteria: backing buffers of type ARC_BUFC_DATA, 441 * residing in the arc_mru state, and are eligible for eviction 442 * (e.g. have no outstanding holds on the buffer). 443 */ 444 kstat_named_t arcstat_mru_evictable_data; 445 /* 446 * Number of bytes consumed by ARC buffers that meet the 447 * following criteria: backing buffers of type ARC_BUFC_METADATA, 448 * residing in the arc_mru state, and are eligible for eviction 449 * (e.g. have no outstanding holds on the buffer). 450 */ 451 kstat_named_t arcstat_mru_evictable_metadata; 452 /* 453 * Total number of bytes that *would have been* consumed by ARC 454 * buffers in the arc_mru_ghost state. The key thing to note 455 * here, is the fact that this size doesn't actually indicate 456 * RAM consumption. The ghost lists only consist of headers and 457 * don't actually have ARC buffers linked off of these headers. 458 * Thus, *if* the headers had associated ARC buffers, these 459 * buffers *would have* consumed this number of bytes. 460 */ 461 kstat_named_t arcstat_mru_ghost_size; 462 /* 463 * Number of bytes that *would have been* consumed by ARC 464 * buffers that are eligible for eviction, of type 465 * ARC_BUFC_DATA, and linked off the arc_mru_ghost state. 466 */ 467 kstat_named_t arcstat_mru_ghost_evictable_data; 468 /* 469 * Number of bytes that *would have been* consumed by ARC 470 * buffers that are eligible for eviction, of type 471 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. 472 */ 473 kstat_named_t arcstat_mru_ghost_evictable_metadata; 474 /* 475 * Total number of bytes consumed by ARC buffers residing in the 476 * arc_mfu state. This includes *all* buffers in the arc_mfu 477 * state; e.g. data, metadata, evictable, and unevictable buffers 478 * are all included in this value. 479 */ 480 kstat_named_t arcstat_mfu_size; 481 /* 482 * Number of bytes consumed by ARC buffers that are eligible for 483 * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu 484 * state. 
485 */ 486 kstat_named_t arcstat_mfu_evictable_data; 487 /* 488 * Number of bytes consumed by ARC buffers that are eligible for 489 * eviction, of type ARC_BUFC_METADATA, and reside in the 490 * arc_mfu state. 491 */ 492 kstat_named_t arcstat_mfu_evictable_metadata; 493 /* 494 * Total number of bytes that *would have been* consumed by ARC 495 * buffers in the arc_mfu_ghost state. See the comment above 496 * arcstat_mru_ghost_size for more details. 497 */ 498 kstat_named_t arcstat_mfu_ghost_size; 499 /* 500 * Number of bytes that *would have been* consumed by ARC 501 * buffers that are eligible for eviction, of type 502 * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state. 503 */ 504 kstat_named_t arcstat_mfu_ghost_evictable_data; 505 /* 506 * Number of bytes that *would have been* consumed by ARC 507 * buffers that are eligible for eviction, of type 508 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. 509 */ 510 kstat_named_t arcstat_mfu_ghost_evictable_metadata; 511 kstat_named_t arcstat_l2_hits; 512 kstat_named_t arcstat_l2_misses; 513 kstat_named_t arcstat_l2_feeds; 514 kstat_named_t arcstat_l2_rw_clash; 515 kstat_named_t arcstat_l2_read_bytes; 516 kstat_named_t arcstat_l2_write_bytes; 517 kstat_named_t arcstat_l2_writes_sent; 518 kstat_named_t arcstat_l2_writes_done; 519 kstat_named_t arcstat_l2_writes_error; 520 kstat_named_t arcstat_l2_writes_hdr_miss; 521 kstat_named_t arcstat_l2_evict_lock_retry; 522 kstat_named_t arcstat_l2_evict_reading; 523 kstat_named_t arcstat_l2_evict_l1cached; 524 kstat_named_t arcstat_l2_free_on_write; 525 kstat_named_t arcstat_l2_cdata_free_on_write; 526 kstat_named_t arcstat_l2_abort_lowmem; 527 kstat_named_t arcstat_l2_cksum_bad; 528 kstat_named_t arcstat_l2_io_error; 529 kstat_named_t arcstat_l2_size; 530 kstat_named_t arcstat_l2_asize; 531 kstat_named_t arcstat_l2_hdr_size; 532 kstat_named_t arcstat_l2_compress_successes; 533 kstat_named_t arcstat_l2_compress_zeros; 534 kstat_named_t arcstat_l2_compress_failures; 535 kstat_named_t arcstat_l2_write_trylock_fail; 536 kstat_named_t arcstat_l2_write_passed_headroom; 537 kstat_named_t arcstat_l2_write_spa_mismatch; 538 kstat_named_t arcstat_l2_write_in_l2; 539 kstat_named_t arcstat_l2_write_hdr_io_in_progress; 540 kstat_named_t arcstat_l2_write_not_cacheable; 541 kstat_named_t arcstat_l2_write_full; 542 kstat_named_t arcstat_l2_write_buffer_iter; 543 kstat_named_t arcstat_l2_write_pios; 544 kstat_named_t arcstat_l2_write_buffer_bytes_scanned; 545 kstat_named_t arcstat_l2_write_buffer_list_iter; 546 kstat_named_t arcstat_l2_write_buffer_list_null_iter; 547 kstat_named_t arcstat_memory_throttle_count; 548 kstat_named_t arcstat_duplicate_buffers; 549 kstat_named_t arcstat_duplicate_buffers_size; 550 kstat_named_t arcstat_duplicate_reads; 551 kstat_named_t arcstat_meta_used; 552 kstat_named_t arcstat_meta_limit; 553 kstat_named_t arcstat_meta_max; 554 kstat_named_t arcstat_meta_min; 555} arc_stats_t; 556 557static arc_stats_t arc_stats = { 558 { "hits", KSTAT_DATA_UINT64 }, 559 { "misses", KSTAT_DATA_UINT64 }, 560 { "demand_data_hits", KSTAT_DATA_UINT64 }, 561 { "demand_data_misses", KSTAT_DATA_UINT64 }, 562 { "demand_metadata_hits", KSTAT_DATA_UINT64 }, 563 { "demand_metadata_misses", KSTAT_DATA_UINT64 }, 564 { "prefetch_data_hits", KSTAT_DATA_UINT64 }, 565 { "prefetch_data_misses", KSTAT_DATA_UINT64 }, 566 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, 567 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, 568 { "mru_hits", KSTAT_DATA_UINT64 }, 569 { "mru_ghost_hits", KSTAT_DATA_UINT64 }, 570 
{ "mfu_hits", KSTAT_DATA_UINT64 }, 571 { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, 572 { "allocated", KSTAT_DATA_UINT64 }, 573 { "deleted", KSTAT_DATA_UINT64 }, 574 { "recycle_miss", KSTAT_DATA_UINT64 }, 575 { "mutex_miss", KSTAT_DATA_UINT64 }, 576 { "evict_skip", KSTAT_DATA_UINT64 }, 577 { "evict_l2_cached", KSTAT_DATA_UINT64 }, 578 { "evict_l2_eligible", KSTAT_DATA_UINT64 }, 579 { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, 580 { "hash_elements", KSTAT_DATA_UINT64 }, 581 { "hash_elements_max", KSTAT_DATA_UINT64 }, 582 { "hash_collisions", KSTAT_DATA_UINT64 }, 583 { "hash_chains", KSTAT_DATA_UINT64 }, 584 { "hash_chain_max", KSTAT_DATA_UINT64 }, 585 { "p", KSTAT_DATA_UINT64 }, 586 { "c", KSTAT_DATA_UINT64 }, 587 { "c_min", KSTAT_DATA_UINT64 }, 588 { "c_max", KSTAT_DATA_UINT64 }, 589 { "size", KSTAT_DATA_UINT64 }, 590 { "hdr_size", KSTAT_DATA_UINT64 }, 591 { "data_size", KSTAT_DATA_UINT64 }, 592 { "metadata_size", KSTAT_DATA_UINT64 }, 593 { "other_size", KSTAT_DATA_UINT64 }, 594 { "anon_size", KSTAT_DATA_UINT64 }, 595 { "anon_evictable_data", KSTAT_DATA_UINT64 }, 596 { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, 597 { "mru_size", KSTAT_DATA_UINT64 }, 598 { "mru_evictable_data", KSTAT_DATA_UINT64 }, 599 { "mru_evictable_metadata", KSTAT_DATA_UINT64 }, 600 { "mru_ghost_size", KSTAT_DATA_UINT64 }, 601 { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 }, 602 { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, 603 { "mfu_size", KSTAT_DATA_UINT64 }, 604 { "mfu_evictable_data", KSTAT_DATA_UINT64 }, 605 { "mfu_evictable_metadata", KSTAT_DATA_UINT64 }, 606 { "mfu_ghost_size", KSTAT_DATA_UINT64 }, 607 { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 }, 608 { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, 609 { "l2_hits", KSTAT_DATA_UINT64 }, 610 { "l2_misses", KSTAT_DATA_UINT64 }, 611 { "l2_feeds", KSTAT_DATA_UINT64 }, 612 { "l2_rw_clash", KSTAT_DATA_UINT64 }, 613 { "l2_read_bytes", KSTAT_DATA_UINT64 }, 614 { "l2_write_bytes", KSTAT_DATA_UINT64 }, 615 { "l2_writes_sent", KSTAT_DATA_UINT64 }, 616 { "l2_writes_done", KSTAT_DATA_UINT64 }, 617 { "l2_writes_error", KSTAT_DATA_UINT64 }, 618 { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, 619 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, 620 { "l2_evict_reading", KSTAT_DATA_UINT64 }, 621 { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, 622 { "l2_free_on_write", KSTAT_DATA_UINT64 }, 623 { "l2_cdata_free_on_write", KSTAT_DATA_UINT64 }, 624 { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, 625 { "l2_cksum_bad", KSTAT_DATA_UINT64 }, 626 { "l2_io_error", KSTAT_DATA_UINT64 }, 627 { "l2_size", KSTAT_DATA_UINT64 }, 628 { "l2_asize", KSTAT_DATA_UINT64 }, 629 { "l2_hdr_size", KSTAT_DATA_UINT64 }, 630 { "l2_compress_successes", KSTAT_DATA_UINT64 }, 631 { "l2_compress_zeros", KSTAT_DATA_UINT64 }, 632 { "l2_compress_failures", KSTAT_DATA_UINT64 }, 633 { "l2_write_trylock_fail", KSTAT_DATA_UINT64 }, 634 { "l2_write_passed_headroom", KSTAT_DATA_UINT64 }, 635 { "l2_write_spa_mismatch", KSTAT_DATA_UINT64 }, 636 { "l2_write_in_l2", KSTAT_DATA_UINT64 }, 637 { "l2_write_io_in_progress", KSTAT_DATA_UINT64 }, 638 { "l2_write_not_cacheable", KSTAT_DATA_UINT64 }, 639 { "l2_write_full", KSTAT_DATA_UINT64 }, 640 { "l2_write_buffer_iter", KSTAT_DATA_UINT64 }, 641 { "l2_write_pios", KSTAT_DATA_UINT64 }, 642 { "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 }, 643 { "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 }, 644 { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 }, 645 { "memory_throttle_count", KSTAT_DATA_UINT64 }, 646 { "duplicate_buffers", KSTAT_DATA_UINT64 }, 647 { 
"duplicate_buffers_size", KSTAT_DATA_UINT64 }, 648 { "duplicate_reads", KSTAT_DATA_UINT64 }, 649 { "arc_meta_used", KSTAT_DATA_UINT64 }, 650 { "arc_meta_limit", KSTAT_DATA_UINT64 }, 651 { "arc_meta_max", KSTAT_DATA_UINT64 }, 652 { "arc_meta_min", KSTAT_DATA_UINT64 } 653}; 654 655#define ARCSTAT(stat) (arc_stats.stat.value.ui64) 656 657#define ARCSTAT_INCR(stat, val) \ 658 atomic_add_64(&arc_stats.stat.value.ui64, (val)) 659 660#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) 661#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) 662 663#define ARCSTAT_MAX(stat, val) { \ 664 uint64_t m; \ 665 while ((val) > (m = arc_stats.stat.value.ui64) && \ 666 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ 667 continue; \ 668} 669 670#define ARCSTAT_MAXSTAT(stat) \ 671 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) 672 673/* 674 * We define a macro to allow ARC hits/misses to be easily broken down by 675 * two separate conditions, giving a total of four different subtypes for 676 * each of hits and misses (so eight statistics total). 677 */ 678#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ 679 if (cond1) { \ 680 if (cond2) { \ 681 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ 682 } else { \ 683 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ 684 } \ 685 } else { \ 686 if (cond2) { \ 687 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ 688 } else { \ 689 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ 690 } \ 691 } 692 693kstat_t *arc_ksp; 694static arc_state_t *arc_anon; 695static arc_state_t *arc_mru; 696static arc_state_t *arc_mru_ghost; 697static arc_state_t *arc_mfu; 698static arc_state_t *arc_mfu_ghost; 699static arc_state_t *arc_l2c_only; 700 701/* 702 * There are several ARC variables that are critical to export as kstats -- 703 * but we don't want to have to grovel around in the kstat whenever we wish to 704 * manipulate them. For these variables, we therefore define them to be in 705 * terms of the statistic variable. This assures that we are not introducing 706 * the possibility of inconsistency by having shadow copies of the variables, 707 * while still allowing the code to be readable. 
 708 */ 709#define arc_size ARCSTAT(arcstat_size) /* actual total arc size */ 710#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ 711#define arc_c ARCSTAT(arcstat_c) /* target size of cache */ 712#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ 713#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ 714#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ 715#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ 716#define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */ 717#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ 718 719#define L2ARC_IS_VALID_COMPRESS(_c_) \ 720 ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY) 721 722static int arc_no_grow; /* Don't try to grow cache size */ 723static uint64_t arc_tempreserve; 724static uint64_t arc_loaned_bytes; 725 726typedef struct arc_callback arc_callback_t; 727 728struct arc_callback { 729 void *acb_private; 730 arc_done_func_t *acb_done; 731 arc_buf_t *acb_buf; 732 zio_t *acb_zio_dummy; 733 arc_callback_t *acb_next; 734}; 735 736typedef struct arc_write_callback arc_write_callback_t; 737 738struct arc_write_callback { 739 void *awcb_private; 740 arc_done_func_t *awcb_ready; 741 arc_done_func_t *awcb_physdone; 742 arc_done_func_t *awcb_done; 743 arc_buf_t *awcb_buf; 744}; 745 746/* 747 * ARC buffers are separated into multiple structs as a memory saving measure: 748 * - Common fields struct, always defined, and embedded within it: 749 * - L2-only fields, always allocated but undefined when not in L2ARC 750 * - L1-only fields, only allocated when in L1ARC 751 * 752 * Buffer in L1 Buffer only in L2 753 * +------------------------+ +------------------------+ 754 * | arc_buf_hdr_t | | arc_buf_hdr_t | 755 * | | | | 756 * | | | | 757 * | | | | 758 * +------------------------+ +------------------------+ 759 * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t | 760 * | (undefined if L1-only) | | | 761 * +------------------------+ +------------------------+ 762 * | l1arc_buf_hdr_t | 763 * | | 764 * | | 765 * | | 766 * | | 767 * +------------------------+ 768 * 769 * Because it's possible for the L2ARC to become extremely large, we can wind 770 * up eating a lot of memory in L2ARC buffer headers, so the size of a header 771 * is minimized by only allocating the fields necessary for an L1-cached buffer 772 * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and 773 * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple 774 * words in pointers. arc_hdr_realloc() is used to switch a header between 775 * these two allocation states. 776 */ 777typedef struct l1arc_buf_hdr { 778 kmutex_t b_freeze_lock; 779#ifdef ZFS_DEBUG 780 /* 781 * used for debugging with kmem_flags - by allocating and freeing 782 * b_thawed when the buffer is thawed, we get a record of the stack 783 * trace that thawed it. 
784 */ 785 void *b_thawed; 786#endif 787 788 arc_buf_t *b_buf; 789 uint32_t b_datacnt; 790 /* for waiting on writes to complete */ 791 kcondvar_t b_cv; 792 793 /* protected by arc state mutex */ 794 arc_state_t *b_state; 795 list_node_t b_arc_node; 796 797 /* updated atomically */ 798 clock_t b_arc_access; 799 800 /* self protecting */ 801 refcount_t b_refcnt; 802 803 arc_callback_t *b_acb; 804 /* temporary buffer holder for in-flight compressed data */ 805 void *b_tmp_cdata; 806} l1arc_buf_hdr_t; 807 808typedef struct l2arc_dev l2arc_dev_t; 809 810typedef struct l2arc_buf_hdr { 811 /* protected by arc_buf_hdr mutex */ 812 l2arc_dev_t *b_dev; /* L2ARC device */ 813 uint64_t b_daddr; /* disk address, offset byte */ 814 /* real alloc'd buffer size depending on b_compress applied */ 815 int32_t b_asize; 816 817 list_node_t b_l2node; 818} l2arc_buf_hdr_t; 819 820struct arc_buf_hdr { 821 /* protected by hash lock */ 822 dva_t b_dva; 823 uint64_t b_birth; 824 /* 825 * Even though this checksum is only set/verified when a buffer is in 826 * the L1 cache, it needs to be in the set of common fields because it 827 * must be preserved from the time before a buffer is written out to 828 * L2ARC until after it is read back in. 829 */ 830 zio_cksum_t *b_freeze_cksum; 831 832 arc_buf_hdr_t *b_hash_next; 833 arc_flags_t b_flags; 834 835 /* immutable */ 836 int32_t b_size; 837 uint64_t b_spa; 838 839 /* L2ARC fields. Undefined when not in L2ARC. */ 840 l2arc_buf_hdr_t b_l2hdr; 841 /* L1ARC fields. Undefined when in l2arc_only state */ 842 l1arc_buf_hdr_t b_l1hdr; 843}; 844 845#ifdef _KERNEL 846static int 847sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS) 848{ 849 uint64_t val; 850 int err; 851 852 val = arc_meta_limit; 853 err = sysctl_handle_64(oidp, &val, 0, req); 854 if (err != 0 || req->newptr == NULL) 855 return (err); 856 857 if (val <= 0 || val > arc_c_max) 858 return (EINVAL); 859 860 arc_meta_limit = val; 861 return (0); 862} 863#endif 864 865static arc_buf_t *arc_eviction_list; 866static kmutex_t arc_eviction_mtx; 867static arc_buf_hdr_t arc_eviction_hdr; 868 869#define GHOST_STATE(state) \ 870 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ 871 (state) == arc_l2c_only) 872 873#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE) 874#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) 875#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) 876#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) 877#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ) 878#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE) 879 880#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) 881#define HDR_L2COMPRESS(hdr) ((hdr)->b_flags & ARC_FLAG_L2COMPRESS) 882#define HDR_L2_READING(hdr) \ 883 (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ 884 ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) 885#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) 886#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) 887#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) 888 889#define HDR_ISTYPE_METADATA(hdr) \ 890 ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) 891#define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) 892 893#define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) 894#define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) 895 896/* For storing compression mode in b_flags */ 897#define HDR_COMPRESS_OFFSET 24 898#define HDR_COMPRESS_NBITS 7 899 
 900#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET(hdr->b_flags, \ 901 HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS)) 902#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET(hdr->b_flags, \ 903 HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS, (cmp)) 904 905/* 906 * Other sizes 907 */ 908 909#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) 910#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) 911 912/* 913 * Hash table routines 914 */ 915 916#define HT_LOCK_PAD CACHE_LINE_SIZE 917 918struct ht_lock { 919 kmutex_t ht_lock; 920#ifdef _KERNEL 921 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; 922#endif 923}; 924 925#define BUF_LOCKS 256 926typedef struct buf_hash_table { 927 uint64_t ht_mask; 928 arc_buf_hdr_t **ht_table; 929 struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE); 930} buf_hash_table_t; 931 932static buf_hash_table_t buf_hash_table; 933 934#define BUF_HASH_INDEX(spa, dva, birth) \ 935 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) 936#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) 937#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) 938#define HDR_LOCK(hdr) \ 939 (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) 940 941uint64_t zfs_crc64_table[256]; 942 943/* 944 * Level 2 ARC 945 */ 946 947#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ 948#define L2ARC_HEADROOM 2 /* num of writes */ 949/* 950 * If we discover during ARC scan any buffers to be compressed, we boost 951 * our headroom for the next scanning cycle by this percentage multiple. 952 */ 953#define L2ARC_HEADROOM_BOOST 200 954#define L2ARC_FEED_SECS 1 /* caching interval secs */ 955#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ 956 957/* 958 * Used to distinguish headers that are being processed by 959 * l2arc_write_buffers(), but have yet to be assigned to an l2arc disk 960 * address. This can happen when the header is added to the l2arc's list 961 * of buffers to write in the first stage of l2arc_write_buffers(), but 962 * has not yet been written out, which happens in the second stage of 963 * l2arc_write_buffers(). 
964 */ 965#define L2ARC_ADDR_UNSET ((uint64_t)(-1)) 966 967#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) 968#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) 969 970/* L2ARC Performance Tunables */ 971uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ 972uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ 973uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ 974uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; 975uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ 976uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ 977boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ 978boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ 979boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ 980 981SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW, 982 &l2arc_write_max, 0, "max write size"); 983SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW, 984 &l2arc_write_boost, 0, "extra write during warmup"); 985SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW, 986 &l2arc_headroom, 0, "number of dev writes"); 987SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW, 988 &l2arc_feed_secs, 0, "interval seconds"); 989SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW, 990 &l2arc_feed_min_ms, 0, "min interval milliseconds"); 991 992SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW, 993 &l2arc_noprefetch, 0, "don't cache prefetch bufs"); 994SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW, 995 &l2arc_feed_again, 0, "turbo warmup"); 996SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW, 997 &l2arc_norw, 0, "no reads during writes"); 998 999SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD, 1000 &ARC_anon.arcs_size, 0, "size of anonymous state"); 1001SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD, 1002 &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous state"); 1003SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD, 1004 &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous state"); 1005 1006SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD, 1007 &ARC_mru.arcs_size, 0, "size of mru state"); 1008SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD, 1009 &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state"); 1010SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD, 1011 &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state"); 1012 1013SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD, 1014 &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state"); 1015SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD, 1016 &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0, 1017 "size of metadata in mru ghost state"); 1018SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD, 1019 &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0, 1020 "size of data in mru ghost state"); 1021 1022SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD, 1023 &ARC_mfu.arcs_size, 0, "size of mfu state"); 1024SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD, 1025 &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state"); 1026SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD, 1027 &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state"); 1028 1029SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD, 1030 
&ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state"); 1031SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD, 1032 &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0, 1033 "size of metadata in mfu ghost state"); 1034SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD, 1035 &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0, 1036 "size of data in mfu ghost state"); 1037 1038SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, 1039 &ARC_l2c_only.arcs_size, 0, "size of mru state"); 1040 1041/* 1042 * L2ARC Internals 1043 */ 1044struct l2arc_dev { 1045 vdev_t *l2ad_vdev; /* vdev */ 1046 spa_t *l2ad_spa; /* spa */ 1047 uint64_t l2ad_hand; /* next write location */ 1048 uint64_t l2ad_start; /* first addr on device */ 1049 uint64_t l2ad_end; /* last addr on device */ 1050 boolean_t l2ad_first; /* first sweep through */ 1051 boolean_t l2ad_writing; /* currently writing */ 1052 kmutex_t l2ad_mtx; /* lock for buffer list */ 1053 list_t l2ad_buflist; /* buffer list */ 1054 list_node_t l2ad_node; /* device list node */ 1055 refcount_t l2ad_alloc; /* allocated bytes */ 1056}; 1057 1058static list_t L2ARC_dev_list; /* device list */ 1059static list_t *l2arc_dev_list; /* device list pointer */ 1060static kmutex_t l2arc_dev_mtx; /* device list mutex */ 1061static l2arc_dev_t *l2arc_dev_last; /* last device used */ 1062static list_t L2ARC_free_on_write; /* free after write buf list */ 1063static list_t *l2arc_free_on_write; /* free after write list ptr */ 1064static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ 1065static uint64_t l2arc_ndev; /* number of devices */ 1066 1067typedef struct l2arc_read_callback { 1068 arc_buf_t *l2rcb_buf; /* read buffer */ 1069 spa_t *l2rcb_spa; /* spa */ 1070 blkptr_t l2rcb_bp; /* original blkptr */ 1071 zbookmark_phys_t l2rcb_zb; /* original bookmark */ 1072 int l2rcb_flags; /* original flags */ 1073 enum zio_compress l2rcb_compress; /* applied compress */ 1074} l2arc_read_callback_t; 1075 1076typedef struct l2arc_write_callback { 1077 l2arc_dev_t *l2wcb_dev; /* device info */ 1078 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ 1079} l2arc_write_callback_t; 1080 1081typedef struct l2arc_data_free { 1082 /* protected by l2arc_free_on_write_mtx */ 1083 void *l2df_data; 1084 size_t l2df_size; 1085 void (*l2df_func)(void *, size_t); 1086 list_node_t l2df_list_node; 1087} l2arc_data_free_t; 1088 1089static kmutex_t l2arc_feed_thr_lock; 1090static kcondvar_t l2arc_feed_thr_cv; 1091static uint8_t l2arc_thread_exit; 1092 1093static void arc_get_data_buf(arc_buf_t *); 1094static void arc_access(arc_buf_hdr_t *, kmutex_t *); 1095static int arc_evict_needed(arc_buf_contents_t); 1096static void arc_evict_ghost(arc_state_t *, uint64_t, int64_t); 1097static void arc_buf_watch(arc_buf_t *); 1098 1099static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); 1100static uint32_t arc_bufc_to_flags(arc_buf_contents_t); 1101 1102static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); 1103static void l2arc_read_done(zio_t *); 1104 1105static boolean_t l2arc_compress_buf(arc_buf_hdr_t *); 1106static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress); 1107static void l2arc_release_cdata_buf(arc_buf_hdr_t *); 1108 1109static uint64_t 1110buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) 1111{ 1112 uint8_t *vdva = (uint8_t *)dva; 1113 uint64_t crc = -1ULL; 1114 int i; 1115 1116 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 1117 1118 for (i = 0; i < sizeof (dva_t); i++) 1119 crc = (crc >> 8) ^ 
zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; 1120 1121 crc ^= (spa>>8) ^ birth; 1122 1123 return (crc); 1124} 1125 1126#define BUF_EMPTY(buf) \ 1127 ((buf)->b_dva.dva_word[0] == 0 && \ 1128 (buf)->b_dva.dva_word[1] == 0) 1129 1130#define BUF_EQUAL(spa, dva, birth, buf) \ 1131 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 1132 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 1133 ((buf)->b_birth == birth) && ((buf)->b_spa == spa) 1134 1135static void 1136buf_discard_identity(arc_buf_hdr_t *hdr) 1137{ 1138 hdr->b_dva.dva_word[0] = 0; 1139 hdr->b_dva.dva_word[1] = 0; 1140 hdr->b_birth = 0; 1141} 1142 1143static arc_buf_hdr_t * 1144buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) 1145{ 1146 const dva_t *dva = BP_IDENTITY(bp); 1147 uint64_t birth = BP_PHYSICAL_BIRTH(bp); 1148 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 1149 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 1150 arc_buf_hdr_t *hdr; 1151 1152 mutex_enter(hash_lock); 1153 for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; 1154 hdr = hdr->b_hash_next) { 1155 if (BUF_EQUAL(spa, dva, birth, hdr)) { 1156 *lockp = hash_lock; 1157 return (hdr); 1158 } 1159 } 1160 mutex_exit(hash_lock); 1161 *lockp = NULL; 1162 return (NULL); 1163} 1164 1165/* 1166 * Insert an entry into the hash table. If there is already an element 1167 * equal to elem in the hash table, then the already existing element 1168 * will be returned and the new element will not be inserted. 1169 * Otherwise returns NULL. 1170 * If lockp == NULL, the caller is assumed to already hold the hash lock. 1171 */ 1172static arc_buf_hdr_t * 1173buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) 1174{ 1175 uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1176 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 1177 arc_buf_hdr_t *fhdr; 1178 uint32_t i; 1179 1180 ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); 1181 ASSERT(hdr->b_birth != 0); 1182 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 1183 1184 if (lockp != NULL) { 1185 *lockp = hash_lock; 1186 mutex_enter(hash_lock); 1187 } else { 1188 ASSERT(MUTEX_HELD(hash_lock)); 1189 } 1190 1191 for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; 1192 fhdr = fhdr->b_hash_next, i++) { 1193 if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) 1194 return (fhdr); 1195 } 1196 1197 hdr->b_hash_next = buf_hash_table.ht_table[idx]; 1198 buf_hash_table.ht_table[idx] = hdr; 1199 hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; 1200 1201 /* collect some hash table performance data */ 1202 if (i > 0) { 1203 ARCSTAT_BUMP(arcstat_hash_collisions); 1204 if (i == 1) 1205 ARCSTAT_BUMP(arcstat_hash_chains); 1206 1207 ARCSTAT_MAX(arcstat_hash_chain_max, i); 1208 } 1209 1210 ARCSTAT_BUMP(arcstat_hash_elements); 1211 ARCSTAT_MAXSTAT(arcstat_hash_elements); 1212 1213 return (NULL); 1214} 1215 1216static void 1217buf_hash_remove(arc_buf_hdr_t *hdr) 1218{ 1219 arc_buf_hdr_t *fhdr, **hdrp; 1220 uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1221 1222 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 1223 ASSERT(HDR_IN_HASH_TABLE(hdr)); 1224 1225 hdrp = &buf_hash_table.ht_table[idx]; 1226 while ((fhdr = *hdrp) != hdr) { 1227 ASSERT(fhdr != NULL); 1228 hdrp = &fhdr->b_hash_next; 1229 } 1230 *hdrp = hdr->b_hash_next; 1231 hdr->b_hash_next = NULL; 1232 hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE; 1233 1234 /* collect some hash table performance data */ 1235 ARCSTAT_BUMPDOWN(arcstat_hash_elements); 1236 1237 if (buf_hash_table.ht_table[idx] && 1238 buf_hash_table.ht_table[idx]->b_hash_next == NULL) 1239 
ARCSTAT_BUMPDOWN(arcstat_hash_chains); 1240} 1241 1242/* 1243 * Global data structures and functions for the buf kmem cache. 1244 */ 1245static kmem_cache_t *hdr_full_cache; 1246static kmem_cache_t *hdr_l2only_cache; 1247static kmem_cache_t *buf_cache; 1248 1249static void 1250buf_fini(void) 1251{ 1252 int i; 1253 1254 kmem_free(buf_hash_table.ht_table, 1255 (buf_hash_table.ht_mask + 1) * sizeof (void *)); 1256 for (i = 0; i < BUF_LOCKS; i++) 1257 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 1258 kmem_cache_destroy(hdr_full_cache); 1259 kmem_cache_destroy(hdr_l2only_cache); 1260 kmem_cache_destroy(buf_cache); 1261} 1262 1263/* 1264 * Constructor callback - called when the cache is empty 1265 * and a new buf is requested. 1266 */ 1267/* ARGSUSED */ 1268static int 1269hdr_full_cons(void *vbuf, void *unused, int kmflag) 1270{ 1271 arc_buf_hdr_t *hdr = vbuf; 1272 1273 bzero(hdr, HDR_FULL_SIZE); 1274 cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); 1275 refcount_create(&hdr->b_l1hdr.b_refcnt); 1276 mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); 1277 arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); 1278 1279 return (0); 1280} 1281 1282/* ARGSUSED */ 1283static int 1284hdr_l2only_cons(void *vbuf, void *unused, int kmflag) 1285{ 1286 arc_buf_hdr_t *hdr = vbuf; 1287 1288 bzero(hdr, HDR_L2ONLY_SIZE); 1289 arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); 1290 1291 return (0); 1292} 1293 1294/* ARGSUSED */ 1295static int 1296buf_cons(void *vbuf, void *unused, int kmflag) 1297{ 1298 arc_buf_t *buf = vbuf; 1299 1300 bzero(buf, sizeof (arc_buf_t)); 1301 mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); 1302 arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); 1303 1304 return (0); 1305} 1306 1307/* 1308 * Destructor callback - called when a cached buf is 1309 * no longer required. 1310 */ 1311/* ARGSUSED */ 1312static void 1313hdr_full_dest(void *vbuf, void *unused) 1314{ 1315 arc_buf_hdr_t *hdr = vbuf; 1316 1317 ASSERT(BUF_EMPTY(hdr)); 1318 cv_destroy(&hdr->b_l1hdr.b_cv); 1319 refcount_destroy(&hdr->b_l1hdr.b_refcnt); 1320 mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); 1321 arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); 1322} 1323 1324/* ARGSUSED */ 1325static void 1326hdr_l2only_dest(void *vbuf, void *unused) 1327{ 1328 arc_buf_hdr_t *hdr = vbuf; 1329 1330 ASSERT(BUF_EMPTY(hdr)); 1331 arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); 1332} 1333 1334/* ARGSUSED */ 1335static void 1336buf_dest(void *vbuf, void *unused) 1337{ 1338 arc_buf_t *buf = vbuf; 1339 1340 mutex_destroy(&buf->b_evict_lock); 1341 arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); 1342} 1343 1344/* 1345 * Reclaim callback -- invoked when memory is low. 1346 */ 1347/* ARGSUSED */ 1348static void 1349hdr_recl(void *unused) 1350{ 1351 dprintf("hdr_recl called\n"); 1352 /* 1353 * umem calls the reclaim func when we destroy the buf cache, 1354 * which is after we do arc_fini(). 1355 */ 1356 if (!arc_dead) 1357 cv_signal(&arc_reclaim_thr_cv); 1358} 1359 1360static void 1361buf_init(void) 1362{ 1363 uint64_t *ct; 1364 uint64_t hsize = 1ULL << 12; 1365 int i, j; 1366 1367 /* 1368 * The hash table is big enough to fill all of physical memory 1369 * with an average block size of zfs_arc_average_blocksize (default 8K). 1370 * By default, the table will take up 1371 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). 
1372 */ 1373 while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE) 1374 hsize <<= 1; 1375retry: 1376 buf_hash_table.ht_mask = hsize - 1; 1377 buf_hash_table.ht_table = 1378 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); 1379 if (buf_hash_table.ht_table == NULL) { 1380 ASSERT(hsize > (1ULL << 8)); 1381 hsize >>= 1; 1382 goto retry; 1383 } 1384 1385 hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, 1386 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0); 1387 hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", 1388 HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl, 1389 NULL, NULL, 0); 1390 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 1391 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); 1392 1393 for (i = 0; i < 256; i++) 1394 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 1395 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 1396 1397 for (i = 0; i < BUF_LOCKS; i++) { 1398 mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 1399 NULL, MUTEX_DEFAULT, NULL); 1400 } 1401} 1402 1403/* 1404 * Transition between the two allocation states for the arc_buf_hdr struct. 1405 * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without 1406 * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller 1407 * version is used when a cache buffer is only in the L2ARC in order to reduce 1408 * memory usage. 1409 */ 1410static arc_buf_hdr_t * 1411arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) 1412{ 1413 ASSERT(HDR_HAS_L2HDR(hdr)); 1414 1415 arc_buf_hdr_t *nhdr; 1416 l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; 1417 1418 ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || 1419 (old == hdr_l2only_cache && new == hdr_full_cache)); 1420 1421 nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); 1422 1423 ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); 1424 buf_hash_remove(hdr); 1425 1426 bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); 1427 1428 if (new == hdr_full_cache) { 1429 nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; 1430 /* 1431 * arc_access and arc_change_state need to be aware that a 1432 * header has just come out of L2ARC, so we set its state to 1433 * l2c_only even though it's about to change. 1434 */ 1435 nhdr->b_l1hdr.b_state = arc_l2c_only; 1436 } else { 1437 ASSERT(hdr->b_l1hdr.b_buf == NULL); 1438 ASSERT0(hdr->b_l1hdr.b_datacnt); 1439 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 1440 /* 1441 * We might be removing the L1hdr of a buffer which was just 1442 * written out to L2ARC. If such a buffer is compressed then we 1443 * need to free its b_tmp_cdata before destroying the header. 1444 */ 1445 if (hdr->b_l1hdr.b_tmp_cdata != NULL && 1446 HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) 1447 l2arc_release_cdata_buf(hdr); 1448 nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR; 1449 } 1450 /* 1451 * The header has been reallocated so we need to re-insert it into any 1452 * lists it was on. 1453 */ 1454 (void) buf_hash_insert(nhdr, NULL); 1455 1456 ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); 1457 1458 mutex_enter(&dev->l2ad_mtx); 1459 1460 /* 1461 * We must place the realloc'ed header back into the list at 1462 * the same spot. Otherwise, if it's placed earlier in the list, 1463 * l2arc_write_buffers() could find it during the function's 1464 * write phase, and try to write it out to the l2arc. 
1465 */ 1466 list_insert_after(&dev->l2ad_buflist, hdr, nhdr); 1467 list_remove(&dev->l2ad_buflist, hdr); 1468 1469 mutex_exit(&dev->l2ad_mtx); 1470 1471 /* 1472 * Since we're using the pointer address as the tag when 1473 * incrementing and decrementing the l2ad_alloc refcount, we 1474 * must remove the old pointer (that we're about to destroy) and 1475 * add the new pointer to the refcount. Otherwise we'd remove 1476 * the wrong pointer address when calling arc_hdr_destroy() later. 1477 */ 1478 1479 (void) refcount_remove_many(&dev->l2ad_alloc, 1480 hdr->b_l2hdr.b_asize, hdr); 1481 1482 (void) refcount_add_many(&dev->l2ad_alloc, 1483 nhdr->b_l2hdr.b_asize, nhdr); 1484 1485 buf_discard_identity(hdr); 1486 hdr->b_freeze_cksum = NULL; 1487 kmem_cache_free(old, hdr); 1488 1489 return (nhdr); 1490} 1491 1492 1493#define ARC_MINTIME (hz>>4) /* 62 ms */ 1494 1495static void 1496arc_cksum_verify(arc_buf_t *buf) 1497{ 1498 zio_cksum_t zc; 1499 1500 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1501 return; 1502 1503 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1504 if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) { 1505 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1506 return; 1507 } 1508 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 1509 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) 1510 panic("buffer modified while frozen!"); 1511 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1512} 1513 1514static int 1515arc_cksum_equal(arc_buf_t *buf) 1516{ 1517 zio_cksum_t zc; 1518 int equal; 1519 1520 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1521 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 1522 equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); 1523 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1524 1525 return (equal); 1526} 1527 1528static void 1529arc_cksum_compute(arc_buf_t *buf, boolean_t force) 1530{ 1531 if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) 1532 return; 1533 1534 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1535 if (buf->b_hdr->b_freeze_cksum != NULL) { 1536 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1537 return; 1538 } 1539 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); 1540 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, 1541 buf->b_hdr->b_freeze_cksum); 1542 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1543#ifdef illumos 1544 arc_buf_watch(buf); 1545#endif /* illumos */ 1546} 1547 1548#ifdef illumos 1549#ifndef _KERNEL 1550typedef struct procctl { 1551 long cmd; 1552 prwatch_t prwatch; 1553} procctl_t; 1554#endif 1555 1556/* ARGSUSED */ 1557static void 1558arc_buf_unwatch(arc_buf_t *buf) 1559{ 1560#ifndef _KERNEL 1561 if (arc_watch) { 1562 int result; 1563 procctl_t ctl; 1564 ctl.cmd = PCWATCH; 1565 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1566 ctl.prwatch.pr_size = 0; 1567 ctl.prwatch.pr_wflags = 0; 1568 result = write(arc_procfd, &ctl, sizeof (ctl)); 1569 ASSERT3U(result, ==, sizeof (ctl)); 1570 } 1571#endif 1572} 1573 1574/* ARGSUSED */ 1575static void 1576arc_buf_watch(arc_buf_t *buf) 1577{ 1578#ifndef _KERNEL 1579 if (arc_watch) { 1580 int result; 1581 procctl_t ctl; 1582 ctl.cmd = PCWATCH; 1583 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1584 ctl.prwatch.pr_size = buf->b_hdr->b_size; 1585 ctl.prwatch.pr_wflags = WA_WRITE; 1586 result = write(arc_procfd, &ctl, sizeof (ctl)); 1587 ASSERT3U(result, ==, sizeof (ctl)); 1588 } 1589#endif 1590} 1591#endif /* illumos */ 1592 1593static arc_buf_contents_t 1594arc_buf_type(arc_buf_hdr_t *hdr) 1595{ 1596 if 
(HDR_ISTYPE_METADATA(hdr)) { 1597 return (ARC_BUFC_METADATA); 1598 } else { 1599 return (ARC_BUFC_DATA); 1600 } 1601} 1602 1603static uint32_t 1604arc_bufc_to_flags(arc_buf_contents_t type) 1605{ 1606 switch (type) { 1607 case ARC_BUFC_DATA: 1608 /* metadata field is 0 if buffer contains normal data */ 1609 return (0); 1610 case ARC_BUFC_METADATA: 1611 return (ARC_FLAG_BUFC_METADATA); 1612 default: 1613 break; 1614 } 1615 panic("undefined ARC buffer type!"); 1616 return ((uint32_t)-1); 1617} 1618 1619void 1620arc_buf_thaw(arc_buf_t *buf) 1621{ 1622 if (zfs_flags & ZFS_DEBUG_MODIFY) { 1623 if (buf->b_hdr->b_l1hdr.b_state != arc_anon) 1624 panic("modifying non-anon buffer!"); 1625 if (HDR_IO_IN_PROGRESS(buf->b_hdr)) 1626 panic("modifying buffer while i/o in progress!"); 1627 arc_cksum_verify(buf); 1628 } 1629 1630 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1631 if (buf->b_hdr->b_freeze_cksum != NULL) { 1632 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 1633 buf->b_hdr->b_freeze_cksum = NULL; 1634 } 1635 1636#ifdef ZFS_DEBUG 1637 if (zfs_flags & ZFS_DEBUG_MODIFY) { 1638 if (buf->b_hdr->b_l1hdr.b_thawed != NULL) 1639 kmem_free(buf->b_hdr->b_l1hdr.b_thawed, 1); 1640 buf->b_hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP); 1641 } 1642#endif 1643 1644 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1645 1646#ifdef illumos 1647 arc_buf_unwatch(buf); 1648#endif /* illumos */ 1649} 1650 1651void 1652arc_buf_freeze(arc_buf_t *buf) 1653{ 1654 kmutex_t *hash_lock; 1655 1656 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1657 return; 1658 1659 hash_lock = HDR_LOCK(buf->b_hdr); 1660 mutex_enter(hash_lock); 1661 1662 ASSERT(buf->b_hdr->b_freeze_cksum != NULL || 1663 buf->b_hdr->b_l1hdr.b_state == arc_anon); 1664 arc_cksum_compute(buf, B_FALSE); 1665 mutex_exit(hash_lock); 1666 1667} 1668 1669static void 1670add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) 1671{ 1672 ASSERT(HDR_HAS_L1HDR(hdr)); 1673 ASSERT(MUTEX_HELD(hash_lock)); 1674 arc_state_t *state = hdr->b_l1hdr.b_state; 1675 1676 if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && 1677 (state != arc_anon)) { 1678 /* We don't use the L2-only state list. */ 1679 if (state != arc_l2c_only) { 1680 uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt; 1681 list_t *list = &state->arcs_list[arc_buf_type(hdr)]; 1682 uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)]; 1683 1684 ASSERT(!MUTEX_HELD(&state->arcs_mtx)); 1685 mutex_enter(&state->arcs_mtx); 1686 ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); 1687 list_remove(list, hdr); 1688 if (GHOST_STATE(state)) { 1689 ASSERT0(hdr->b_l1hdr.b_datacnt); 1690 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 1691 delta = hdr->b_size; 1692 } 1693 ASSERT(delta > 0); 1694 ASSERT3U(*size, >=, delta); 1695 atomic_add_64(size, -delta); 1696 mutex_exit(&state->arcs_mtx); 1697 } 1698 /* remove the prefetch flag if we get a reference */ 1699 hdr->b_flags &= ~ARC_FLAG_PREFETCH; 1700 } 1701} 1702 1703static int 1704remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) 1705{ 1706 int cnt; 1707 arc_state_t *state = hdr->b_l1hdr.b_state; 1708 1709 ASSERT(HDR_HAS_L1HDR(hdr)); 1710 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); 1711 ASSERT(!GHOST_STATE(state)); 1712 1713 /* 1714 * arc_l2c_only counts as a ghost state so we don't need to explicitly 1715 * check to prevent usage of the arc_l2c_only list. 
1716 */ 1717 if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && 1718 (state != arc_anon)) { 1719 uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)]; 1720 1721 ASSERT(!MUTEX_HELD(&state->arcs_mtx)); 1722 mutex_enter(&state->arcs_mtx); 1723 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 1724 list_insert_head(&state->arcs_list[arc_buf_type(hdr)], hdr); 1725 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 1726 atomic_add_64(size, hdr->b_size * 1727 hdr->b_l1hdr.b_datacnt); 1728 mutex_exit(&state->arcs_mtx); 1729 } 1730 return (cnt); 1731} 1732 1733/* 1734 * Move the supplied buffer to the indicated state. The mutex 1735 * for the buffer must be held by the caller. 1736 */ 1737static void 1738arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, 1739 kmutex_t *hash_lock) 1740{ 1741 arc_state_t *old_state; 1742 int64_t refcnt; 1743 uint32_t datacnt; 1744 uint64_t from_delta, to_delta; 1745 arc_buf_contents_t buftype = arc_buf_type(hdr); 1746 1747 /* 1748 * We almost always have an L1 hdr here, since we call arc_hdr_realloc() 1749 * in arc_read() when bringing a buffer out of the L2ARC. However, the 1750 * L1 hdr doesn't always exist when we change state to arc_anon before 1751 * destroying a header, in which case reallocating to add the L1 hdr is 1752 * pointless. 1753 */ 1754 if (HDR_HAS_L1HDR(hdr)) { 1755 old_state = hdr->b_l1hdr.b_state; 1756 refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); 1757 datacnt = hdr->b_l1hdr.b_datacnt; 1758 } else { 1759 old_state = arc_l2c_only; 1760 refcnt = 0; 1761 datacnt = 0; 1762 } 1763 1764 ASSERT(MUTEX_HELD(hash_lock)); 1765 ASSERT3P(new_state, !=, old_state); 1766 ASSERT(refcnt == 0 || datacnt > 0); 1767 ASSERT(!GHOST_STATE(new_state) || datacnt == 0); 1768 ASSERT(old_state != arc_anon || datacnt <= 1); 1769 1770 from_delta = to_delta = datacnt * hdr->b_size; 1771 1772 /* 1773 * If this buffer is evictable, transfer it from the 1774 * old state list to the new state list. 1775 */ 1776 if (refcnt == 0) { 1777 if (old_state != arc_anon && old_state != arc_l2c_only) { 1778 int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); 1779 uint64_t *size = &old_state->arcs_lsize[buftype]; 1780 1781 if (use_mutex) 1782 mutex_enter(&old_state->arcs_mtx); 1783 1784 ASSERT(HDR_HAS_L1HDR(hdr)); 1785 ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); 1786 list_remove(&old_state->arcs_list[buftype], hdr); 1787 1788 /* 1789 * If prefetching out of the ghost cache, 1790 * we will have a non-zero datacnt. 1791 */ 1792 if (GHOST_STATE(old_state) && datacnt == 0) { 1793 /* ghost elements have a ghost size */ 1794 ASSERT(hdr->b_l1hdr.b_buf == NULL); 1795 from_delta = hdr->b_size; 1796 } 1797 ASSERT3U(*size, >=, from_delta); 1798 atomic_add_64(size, -from_delta); 1799 1800 if (use_mutex) 1801 mutex_exit(&old_state->arcs_mtx); 1802 } 1803 if (new_state != arc_anon && new_state != arc_l2c_only) { 1804 int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); 1805 uint64_t *size = &new_state->arcs_lsize[buftype]; 1806 1807 /* 1808 * An L1 header always exists here, since if we're 1809 * moving to some L1-cached state (i.e. not l2c_only or 1810 * anonymous), we realloc the header to add an L1hdr 1811 * beforehand. 
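 *
 * (Concretely, that is the arc_hdr_realloc(hdr, hdr_l2only_cache,
 * hdr_full_cache) call made on the read path; arc_evict_ghost() below
 * performs the opposite conversion when an L1+L2 header drops to
 * L2-only.)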
1812 */ 1813 ASSERT(HDR_HAS_L1HDR(hdr)); 1814 if (use_mutex) 1815 mutex_enter(&new_state->arcs_mtx); 1816 1817 list_insert_head(&new_state->arcs_list[buftype], hdr); 1818 1819 /* ghost elements have a ghost size */ 1820 if (GHOST_STATE(new_state)) { 1821 ASSERT0(datacnt); 1822 ASSERT(hdr->b_l1hdr.b_buf == NULL); 1823 to_delta = hdr->b_size; 1824 } 1825 atomic_add_64(size, to_delta); 1826 1827 if (use_mutex) 1828 mutex_exit(&new_state->arcs_mtx); 1829 } 1830 } 1831 1832 ASSERT(!BUF_EMPTY(hdr)); 1833 if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) 1834 buf_hash_remove(hdr); 1835 1836 /* adjust state sizes (ignore arc_l2c_only) */ 1837 if (to_delta && new_state != arc_l2c_only) 1838 atomic_add_64(&new_state->arcs_size, to_delta); 1839 if (from_delta && old_state != arc_l2c_only) { 1840 ASSERT3U(old_state->arcs_size, >=, from_delta); 1841 atomic_add_64(&old_state->arcs_size, -from_delta); 1842 } 1843 if (HDR_HAS_L1HDR(hdr)) 1844 hdr->b_l1hdr.b_state = new_state; 1845 1846 /* 1847 * L2 headers should never be on the L2 state list since they don't 1848 * have L1 headers allocated. 1849 */ 1850 ASSERT(list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && 1851 list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); 1852} 1853 1854void 1855arc_space_consume(uint64_t space, arc_space_type_t type) 1856{ 1857 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1858 1859 switch (type) { 1860 case ARC_SPACE_DATA: 1861 ARCSTAT_INCR(arcstat_data_size, space); 1862 break; 1863 case ARC_SPACE_META: 1864 ARCSTAT_INCR(arcstat_metadata_size, space); 1865 break; 1866 case ARC_SPACE_OTHER: 1867 ARCSTAT_INCR(arcstat_other_size, space); 1868 break; 1869 case ARC_SPACE_HDRS: 1870 ARCSTAT_INCR(arcstat_hdr_size, space); 1871 break; 1872 case ARC_SPACE_L2HDRS: 1873 ARCSTAT_INCR(arcstat_l2_hdr_size, space); 1874 break; 1875 } 1876 1877 if (type != ARC_SPACE_DATA) 1878 ARCSTAT_INCR(arcstat_meta_used, space); 1879 1880 atomic_add_64(&arc_size, space); 1881} 1882 1883void 1884arc_space_return(uint64_t space, arc_space_type_t type) 1885{ 1886 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1887 1888 switch (type) { 1889 case ARC_SPACE_DATA: 1890 ARCSTAT_INCR(arcstat_data_size, -space); 1891 break; 1892 case ARC_SPACE_META: 1893 ARCSTAT_INCR(arcstat_metadata_size, -space); 1894 break; 1895 case ARC_SPACE_OTHER: 1896 ARCSTAT_INCR(arcstat_other_size, -space); 1897 break; 1898 case ARC_SPACE_HDRS: 1899 ARCSTAT_INCR(arcstat_hdr_size, -space); 1900 break; 1901 case ARC_SPACE_L2HDRS: 1902 ARCSTAT_INCR(arcstat_l2_hdr_size, -space); 1903 break; 1904 } 1905 1906 if (type != ARC_SPACE_DATA) { 1907 ASSERT(arc_meta_used >= space); 1908 if (arc_meta_max < arc_meta_used) 1909 arc_meta_max = arc_meta_used; 1910 ARCSTAT_INCR(arcstat_meta_used, -space); 1911 } 1912 1913 ASSERT(arc_size >= space); 1914 atomic_add_64(&arc_size, -space); 1915} 1916 1917arc_buf_t * 1918arc_buf_alloc(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type) 1919{ 1920 arc_buf_hdr_t *hdr; 1921 arc_buf_t *buf; 1922 1923 ASSERT3U(size, >, 0); 1924 hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); 1925 ASSERT(BUF_EMPTY(hdr)); 1926 ASSERT3P(hdr->b_freeze_cksum, ==, NULL); 1927 hdr->b_size = size; 1928 hdr->b_spa = spa_load_guid(spa); 1929 1930 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 1931 buf->b_hdr = hdr; 1932 buf->b_data = NULL; 1933 buf->b_efunc = NULL; 1934 buf->b_private = NULL; 1935 buf->b_next = NULL; 1936 1937 hdr->b_flags = arc_bufc_to_flags(type); 1938 hdr->b_flags |= ARC_FLAG_HAS_L1HDR; 1939 1940 hdr->b_l1hdr.b_buf = buf; 1941 
hdr->b_l1hdr.b_state = arc_anon; 1942 hdr->b_l1hdr.b_arc_access = 0; 1943 hdr->b_l1hdr.b_datacnt = 1; 1944 1945 arc_get_data_buf(buf); 1946 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 1947 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); 1948 1949 return (buf); 1950} 1951 1952static char *arc_onloan_tag = "onloan"; 1953 1954/* 1955 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in 1956 * flight data by arc_tempreserve_space() until they are "returned". Loaned 1957 * buffers must be returned to the arc before they can be used by the DMU or 1958 * freed. 1959 */ 1960arc_buf_t * 1961arc_loan_buf(spa_t *spa, int size) 1962{ 1963 arc_buf_t *buf; 1964 1965 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); 1966 1967 atomic_add_64(&arc_loaned_bytes, size); 1968 return (buf); 1969} 1970 1971/* 1972 * Return a loaned arc buffer to the arc. 1973 */ 1974void 1975arc_return_buf(arc_buf_t *buf, void *tag) 1976{ 1977 arc_buf_hdr_t *hdr = buf->b_hdr; 1978 1979 ASSERT(buf->b_data != NULL); 1980 ASSERT(HDR_HAS_L1HDR(hdr)); 1981 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); 1982 (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); 1983 1984 atomic_add_64(&arc_loaned_bytes, -hdr->b_size); 1985} 1986 1987/* Detach an arc_buf from a dbuf (tag) */ 1988void 1989arc_loan_inuse_buf(arc_buf_t *buf, void *tag) 1990{ 1991 arc_buf_hdr_t *hdr = buf->b_hdr; 1992 1993 ASSERT(buf->b_data != NULL); 1994 ASSERT(HDR_HAS_L1HDR(hdr)); 1995 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); 1996 (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); 1997 buf->b_efunc = NULL; 1998 buf->b_private = NULL; 1999 2000 atomic_add_64(&arc_loaned_bytes, hdr->b_size); 2001} 2002 2003static arc_buf_t * 2004arc_buf_clone(arc_buf_t *from) 2005{ 2006 arc_buf_t *buf; 2007 arc_buf_hdr_t *hdr = from->b_hdr; 2008 uint64_t size = hdr->b_size; 2009 2010 ASSERT(HDR_HAS_L1HDR(hdr)); 2011 ASSERT(hdr->b_l1hdr.b_state != arc_anon); 2012 2013 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 2014 buf->b_hdr = hdr; 2015 buf->b_data = NULL; 2016 buf->b_efunc = NULL; 2017 buf->b_private = NULL; 2018 buf->b_next = hdr->b_l1hdr.b_buf; 2019 hdr->b_l1hdr.b_buf = buf; 2020 arc_get_data_buf(buf); 2021 bcopy(from->b_data, buf->b_data, size); 2022 2023 /* 2024 * This buffer already exists in the arc so create a duplicate 2025 * copy for the caller. If the buffer is associated with user data 2026 * then track the size and number of duplicates. These stats will be 2027 * updated as duplicate buffers are created and destroyed. 2028 */ 2029 if (HDR_ISTYPE_DATA(hdr)) { 2030 ARCSTAT_BUMP(arcstat_duplicate_buffers); 2031 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); 2032 } 2033 hdr->b_l1hdr.b_datacnt += 1; 2034 return (buf); 2035} 2036 2037void 2038arc_buf_add_ref(arc_buf_t *buf, void* tag) 2039{ 2040 arc_buf_hdr_t *hdr; 2041 kmutex_t *hash_lock; 2042 2043 /* 2044 * Check to see if this buffer is evicted. Callers 2045 * must verify b_data != NULL to know if the add_ref 2046 * was successful. 
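 *
 * Illustrative caller-side sketch (not taken from this file; the dbuf
 * caller shown is assumed for the example):
 *
 *	arc_buf_add_ref(buf, db);
 *	if (buf->b_data == NULL)
 *		... buffer was already evicted; fall back to arc_read() ...
 *	else
 *		... reference is held and buf->b_data is safe to use ...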
2047 */ 2048 mutex_enter(&buf->b_evict_lock); 2049 if (buf->b_data == NULL) { 2050 mutex_exit(&buf->b_evict_lock); 2051 return; 2052 } 2053 hash_lock = HDR_LOCK(buf->b_hdr); 2054 mutex_enter(hash_lock); 2055 hdr = buf->b_hdr; 2056 ASSERT(HDR_HAS_L1HDR(hdr)); 2057 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 2058 mutex_exit(&buf->b_evict_lock); 2059 2060 ASSERT(hdr->b_l1hdr.b_state == arc_mru || 2061 hdr->b_l1hdr.b_state == arc_mfu); 2062 2063 add_reference(hdr, hash_lock, tag); 2064 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 2065 arc_access(hdr, hash_lock); 2066 mutex_exit(hash_lock); 2067 ARCSTAT_BUMP(arcstat_hits); 2068 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 2069 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 2070 data, metadata, hits); 2071} 2072 2073static void 2074arc_buf_free_on_write(void *data, size_t size, 2075 void (*free_func)(void *, size_t)) 2076{ 2077 l2arc_data_free_t *df; 2078 2079 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); 2080 df->l2df_data = data; 2081 df->l2df_size = size; 2082 df->l2df_func = free_func; 2083 mutex_enter(&l2arc_free_on_write_mtx); 2084 list_insert_head(l2arc_free_on_write, df); 2085 mutex_exit(&l2arc_free_on_write_mtx); 2086} 2087 2088/* 2089 * Free the arc data buffer. If it is an l2arc write in progress, 2090 * the buffer is placed on l2arc_free_on_write to be freed later. 2091 */ 2092static void 2093arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t)) 2094{ 2095 arc_buf_hdr_t *hdr = buf->b_hdr; 2096 2097 if (HDR_L2_WRITING(hdr)) { 2098 arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func); 2099 ARCSTAT_BUMP(arcstat_l2_free_on_write); 2100 } else { 2101 free_func(buf->b_data, hdr->b_size); 2102 } 2103} 2104 2105/* 2106 * Free up buf->b_data and if 'remove' is set, then pull the 2107 * arc_buf_t off of the arc_buf_hdr_t's list and free it. 2108 */ 2109static void 2110arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr) 2111{ 2112 ASSERT(HDR_HAS_L2HDR(hdr)); 2113 ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx)); 2114 2115 /* 2116 * The b_tmp_cdata field is linked off of the b_l1hdr, so if 2117 * that doesn't exist, the header is in the arc_l2c_only state, 2118 * and there isn't anything to free (it's already been freed).
2119 */ 2120 if (!HDR_HAS_L1HDR(hdr)) 2121 return; 2122 2123 if (hdr->b_l1hdr.b_tmp_cdata == NULL) 2124 return; 2125 2126 ASSERT(HDR_L2_WRITING(hdr)); 2127 arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata, hdr->b_size, 2128 zio_data_buf_free); 2129 2130 ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write); 2131 hdr->b_l1hdr.b_tmp_cdata = NULL; 2132} 2133 2134static void 2135arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) 2136{ 2137 arc_buf_t **bufp; 2138 2139 /* free up data associated with the buf */ 2140 if (buf->b_data != NULL) { 2141 arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; 2142 uint64_t size = buf->b_hdr->b_size; 2143 arc_buf_contents_t type = arc_buf_type(buf->b_hdr); 2144 2145 arc_cksum_verify(buf); 2146#ifdef illumos 2147 arc_buf_unwatch(buf); 2148#endif /* illumos */ 2149 2150 if (!recycle) { 2151 if (type == ARC_BUFC_METADATA) { 2152 arc_buf_data_free(buf, zio_buf_free); 2153 arc_space_return(size, ARC_SPACE_META); 2154 } else { 2155 ASSERT(type == ARC_BUFC_DATA); 2156 arc_buf_data_free(buf, zio_data_buf_free); 2157 arc_space_return(size, ARC_SPACE_DATA); 2158 } 2159 } 2160 if (list_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) { 2161 uint64_t *cnt = &state->arcs_lsize[type]; 2162 2163 ASSERT(refcount_is_zero( 2164 &buf->b_hdr->b_l1hdr.b_refcnt)); 2165 ASSERT(state != arc_anon && state != arc_l2c_only); 2166 2167 ASSERT3U(*cnt, >=, size); 2168 atomic_add_64(cnt, -size); 2169 } 2170 ASSERT3U(state->arcs_size, >=, size); 2171 atomic_add_64(&state->arcs_size, -size); 2172 buf->b_data = NULL; 2173 2174 /* 2175 * If we're destroying a duplicate buffer make sure 2176 * that the appropriate statistics are updated. 2177 */ 2178 if (buf->b_hdr->b_l1hdr.b_datacnt > 1 && 2179 HDR_ISTYPE_DATA(buf->b_hdr)) { 2180 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 2181 ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); 2182 } 2183 ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0); 2184 buf->b_hdr->b_l1hdr.b_datacnt -= 1; 2185 } 2186 2187 /* only remove the buf if requested */ 2188 if (!remove) 2189 return; 2190 2191 /* remove the buf from the hdr list */ 2192 for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf; 2193 bufp = &(*bufp)->b_next) 2194 continue; 2195 *bufp = buf->b_next; 2196 buf->b_next = NULL; 2197 2198 ASSERT(buf->b_efunc == NULL); 2199 2200 /* clean up the buf */ 2201 buf->b_hdr = NULL; 2202 kmem_cache_free(buf_cache, buf); 2203} 2204 2205static void 2206arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) 2207{ 2208 l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; 2209 l2arc_dev_t *dev = l2hdr->b_dev; 2210 2211 ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); 2212 ASSERT(HDR_HAS_L2HDR(hdr)); 2213 2214 list_remove(&dev->l2ad_buflist, hdr); 2215 2216 /* 2217 * We don't want to leak the b_tmp_cdata buffer that was 2218 * allocated in l2arc_write_buffers() 2219 */ 2220 arc_buf_l2_cdata_free(hdr); 2221 2222 /* 2223 * If the l2hdr's b_daddr is equal to L2ARC_ADDR_UNSET, then 2224 * this header is being processed by l2arc_write_buffers() (i.e. 2225 * it's in the first stage of l2arc_write_buffers()). 2226 * Re-affirming that truth here, just to serve as a reminder. If 2227 * b_daddr does not equal L2ARC_ADDR_UNSET, then the header may or 2228 * may not have its HDR_L2_WRITING flag set. (the write may have 2229 * completed, in which case HDR_L2_WRITING will be false and the 2230 * b_daddr field will point to the address of the buffer on disk). 
2231 */ 2232 IMPLY(l2hdr->b_daddr == L2ARC_ADDR_UNSET, HDR_L2_WRITING(hdr)); 2233 2234 /* 2235 * If b_daddr is equal to L2ARC_ADDR_UNSET, we're racing with 2236 * l2arc_write_buffers(). Since we've just removed this header 2237 * from the l2arc buffer list, this header will never reach the 2238 * second stage of l2arc_write_buffers(), which increments the 2239 * accounting stats for this header. Thus, we must be careful 2240 * not to decrement them for this header either. 2241 */ 2242 if (l2hdr->b_daddr != L2ARC_ADDR_UNSET) { 2243 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); 2244 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 2245 2246 vdev_space_update(dev->l2ad_vdev, 2247 -l2hdr->b_asize, 0, 0); 2248 2249 (void) refcount_remove_many(&dev->l2ad_alloc, 2250 l2hdr->b_asize, hdr); 2251 } 2252 2253 hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; 2254} 2255 2256static void 2257arc_hdr_destroy(arc_buf_hdr_t *hdr) 2258{ 2259 if (HDR_HAS_L1HDR(hdr)) { 2260 ASSERT(hdr->b_l1hdr.b_buf == NULL || 2261 hdr->b_l1hdr.b_datacnt > 0); 2262 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2263 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 2264 } 2265 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2266 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 2267 2268 if (HDR_HAS_L2HDR(hdr)) { 2269 l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; 2270 boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); 2271 2272 if (!buflist_held) 2273 mutex_enter(&dev->l2ad_mtx); 2274 2275 /* 2276 * Even though we checked this conditional above, we 2277 * need to check this again now that we have the 2278 * l2ad_mtx. This is because we could be racing with 2279 * another thread calling l2arc_evict() which might have 2280 * destroyed this header's L2 portion as we were waiting 2281 * to acquire the l2ad_mtx. If that happens, we don't 2282 * want to re-destroy the header's L2 portion. 
2283 */ 2284 if (HDR_HAS_L2HDR(hdr)) { 2285 if (hdr->b_l2hdr.b_daddr != L2ARC_ADDR_UNSET) 2286 trim_map_free(dev->l2ad_vdev, 2287 hdr->b_l2hdr.b_daddr, 2288 hdr->b_l2hdr.b_asize, 0); 2289 arc_hdr_l2hdr_destroy(hdr); 2290 } 2291 2292 if (!buflist_held) 2293 mutex_exit(&dev->l2ad_mtx); 2294 } 2295 2296 if (!BUF_EMPTY(hdr)) 2297 buf_discard_identity(hdr); 2298 if (hdr->b_freeze_cksum != NULL) { 2299 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 2300 hdr->b_freeze_cksum = NULL; 2301 } 2302 2303 if (HDR_HAS_L1HDR(hdr)) { 2304 while (hdr->b_l1hdr.b_buf) { 2305 arc_buf_t *buf = hdr->b_l1hdr.b_buf; 2306 2307 if (buf->b_efunc != NULL) { 2308 mutex_enter(&arc_eviction_mtx); 2309 mutex_enter(&buf->b_evict_lock); 2310 ASSERT(buf->b_hdr != NULL); 2311 arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE, 2312 FALSE); 2313 hdr->b_l1hdr.b_buf = buf->b_next; 2314 buf->b_hdr = &arc_eviction_hdr; 2315 buf->b_next = arc_eviction_list; 2316 arc_eviction_list = buf; 2317 mutex_exit(&buf->b_evict_lock); 2318 mutex_exit(&arc_eviction_mtx); 2319 } else { 2320 arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE, 2321 TRUE); 2322 } 2323 } 2324#ifdef ZFS_DEBUG 2325 if (hdr->b_l1hdr.b_thawed != NULL) { 2326 kmem_free(hdr->b_l1hdr.b_thawed, 1); 2327 hdr->b_l1hdr.b_thawed = NULL; 2328 } 2329#endif 2330 } 2331 2332 ASSERT3P(hdr->b_hash_next, ==, NULL); 2333 if (HDR_HAS_L1HDR(hdr)) { 2334 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 2335 ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); 2336 kmem_cache_free(hdr_full_cache, hdr); 2337 } else { 2338 kmem_cache_free(hdr_l2only_cache, hdr); 2339 } 2340} 2341 2342void 2343arc_buf_free(arc_buf_t *buf, void *tag) 2344{ 2345 arc_buf_hdr_t *hdr = buf->b_hdr; 2346 int hashed = hdr->b_l1hdr.b_state != arc_anon; 2347 2348 ASSERT(buf->b_efunc == NULL); 2349 ASSERT(buf->b_data != NULL); 2350 2351 if (hashed) { 2352 kmutex_t *hash_lock = HDR_LOCK(hdr); 2353 2354 mutex_enter(hash_lock); 2355 hdr = buf->b_hdr; 2356 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 2357 2358 (void) remove_reference(hdr, hash_lock, tag); 2359 if (hdr->b_l1hdr.b_datacnt > 1) { 2360 arc_buf_destroy(buf, FALSE, TRUE); 2361 } else { 2362 ASSERT(buf == hdr->b_l1hdr.b_buf); 2363 ASSERT(buf->b_efunc == NULL); 2364 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 2365 } 2366 mutex_exit(hash_lock); 2367 } else if (HDR_IO_IN_PROGRESS(hdr)) { 2368 int destroy_hdr; 2369 /* 2370 * We are in the middle of an async write. Don't destroy 2371 * this buffer unless the write completes before we finish 2372 * decrementing the reference count. 
2373 */ 2374 mutex_enter(&arc_eviction_mtx); 2375 (void) remove_reference(hdr, NULL, tag); 2376 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2377 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); 2378 mutex_exit(&arc_eviction_mtx); 2379 if (destroy_hdr) 2380 arc_hdr_destroy(hdr); 2381 } else { 2382 if (remove_reference(hdr, NULL, tag) > 0) 2383 arc_buf_destroy(buf, FALSE, TRUE); 2384 else 2385 arc_hdr_destroy(hdr); 2386 } 2387} 2388 2389boolean_t 2390arc_buf_remove_ref(arc_buf_t *buf, void* tag) 2391{ 2392 arc_buf_hdr_t *hdr = buf->b_hdr; 2393 kmutex_t *hash_lock = HDR_LOCK(hdr); 2394 boolean_t no_callback = (buf->b_efunc == NULL); 2395 2396 if (hdr->b_l1hdr.b_state == arc_anon) { 2397 ASSERT(hdr->b_l1hdr.b_datacnt == 1); 2398 arc_buf_free(buf, tag); 2399 return (no_callback); 2400 } 2401 2402 mutex_enter(hash_lock); 2403 hdr = buf->b_hdr; 2404 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 2405 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 2406 ASSERT(hdr->b_l1hdr.b_state != arc_anon); 2407 ASSERT(buf->b_data != NULL); 2408 2409 (void) remove_reference(hdr, hash_lock, tag); 2410 if (hdr->b_l1hdr.b_datacnt > 1) { 2411 if (no_callback) 2412 arc_buf_destroy(buf, FALSE, TRUE); 2413 } else if (no_callback) { 2414 ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL); 2415 ASSERT(buf->b_efunc == NULL); 2416 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 2417 } 2418 ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 || 2419 refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2420 mutex_exit(hash_lock); 2421 return (no_callback); 2422} 2423 2424int32_t 2425arc_buf_size(arc_buf_t *buf) 2426{ 2427 return (buf->b_hdr->b_size); 2428} 2429 2430/* 2431 * Called from the DMU to determine if the current buffer should be 2432 * evicted. In order to ensure proper locking, the eviction must be initiated 2433 * from the DMU. Return true if the buffer is associated with user data and 2434 * duplicate buffers still exist. 2435 */ 2436boolean_t 2437arc_buf_eviction_needed(arc_buf_t *buf) 2438{ 2439 arc_buf_hdr_t *hdr; 2440 boolean_t evict_needed = B_FALSE; 2441 2442 if (zfs_disable_dup_eviction) 2443 return (B_FALSE); 2444 2445 mutex_enter(&buf->b_evict_lock); 2446 hdr = buf->b_hdr; 2447 if (hdr == NULL) { 2448 /* 2449 * We are in arc_do_user_evicts(); let that function 2450 * perform the eviction. 2451 */ 2452 ASSERT(buf->b_data == NULL); 2453 mutex_exit(&buf->b_evict_lock); 2454 return (B_FALSE); 2455 } else if (buf->b_data == NULL) { 2456 /* 2457 * We have already been added to the arc eviction list; 2458 * recommend eviction. 2459 */ 2460 ASSERT3P(hdr, ==, &arc_eviction_hdr); 2461 mutex_exit(&buf->b_evict_lock); 2462 return (B_TRUE); 2463 } 2464 2465 if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr)) 2466 evict_needed = B_TRUE; 2467 2468 mutex_exit(&buf->b_evict_lock); 2469 return (evict_needed); 2470} 2471 2472/* 2473 * Evict buffers from list until we've removed the specified number of 2474 * bytes. Move the removed buffers to the appropriate evict state. 2475 * If the recycle flag is set, then attempt to "recycle" a buffer: 2476 * - look for a buffer to evict that is `bytes' long. 2477 * - return the data block from this buffer rather than freeing it. 2478 * This flag is used by callers that are trying to make space for a 2479 * new buffer in a full arc cache. 2480 * 2481 * This function makes a "best effort". It skips over any buffers 2482 * it can't get a hash_lock on, and so may not catch all candidates. 2483 * It may also return without evicting as much space as requested. 
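 *
 * As a sketch of the recycle path (a paraphrase of arc_get_data_buf()
 * later in this file, not a verbatim excerpt):
 *
 *	buf->b_data = arc_evict(state, 0, size, TRUE, type);
 *	if (buf->b_data == NULL) {
 *		buf->b_data = zio_buf_alloc(size);	... metadata case ...
 *		ARCSTAT_BUMP(arcstat_recycle_miss);
 *	}
 *
 * i.e. a non-NULL return value is a stolen data block of exactly
 * `bytes' bytes that the caller may reuse without a fresh allocation.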
2484 */ 2485static void * 2486arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, 2487 arc_buf_contents_t type) 2488{ 2489 arc_state_t *evicted_state; 2490 uint64_t bytes_evicted = 0, skipped = 0, missed = 0; 2491 arc_buf_hdr_t *hdr, *hdr_prev = NULL; 2492 kmutex_t *hash_lock; 2493 boolean_t have_lock; 2494 void *stolen = NULL; 2495 arc_buf_hdr_t marker = { 0 }; 2496 int count = 0; 2497 2498 ASSERT(state == arc_mru || state == arc_mfu); 2499 2500 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; 2501 2502 /* 2503 * The ghost list lock must be acquired first in order to prevent 2504 * a 3 party deadlock: 2505 * 2506 * - arc_evict_ghost acquires arc_*_ghost->arcs_mtx, followed by 2507 * l2ad_mtx in arc_hdr_realloc 2508 * - l2arc_write_buffers acquires l2ad_mtx, followed by arc_*->arcs_mtx 2509 * - arc_evict acquires arc_*_ghost->arcs_mtx, followed by 2510 * arc_*_ghost->arcs_mtx and forms a deadlock cycle. 2511 * 2512 * This situation is avoided by acquiring the ghost list lock first. 2513 */ 2514 mutex_enter(&evicted_state->arcs_mtx); 2515 mutex_enter(&state->arcs_mtx); 2516 2517 /* 2518 * Decide which "type" (data vs metadata) to recycle from. 2519 * 2520 * If we are over the metadata limit, recycle from metadata. 2521 * If we are under the metadata minimum, recycle from data. 2522 * Otherwise, recycle from whichever type has the oldest (least 2523 * recently accessed) header. 2524 */ 2525 if (recycle) { 2526 arc_buf_hdr_t *data_hdr = 2527 list_tail(&state->arcs_list[ARC_BUFC_DATA]); 2528 arc_buf_hdr_t *metadata_hdr = 2529 list_tail(&state->arcs_list[ARC_BUFC_METADATA]); 2530 arc_buf_contents_t realtype; 2531 2532 if (data_hdr == NULL) { 2533 realtype = ARC_BUFC_METADATA; 2534 } else if (metadata_hdr == NULL) { 2535 realtype = ARC_BUFC_DATA; 2536 } else if (arc_meta_used >= arc_meta_limit) { 2537 realtype = ARC_BUFC_METADATA; 2538 } else if (arc_meta_used <= arc_meta_min) { 2539 realtype = ARC_BUFC_DATA; 2540 } else if (HDR_HAS_L1HDR(data_hdr) && 2541 HDR_HAS_L1HDR(metadata_hdr) && 2542 data_hdr->b_l1hdr.b_arc_access < 2543 metadata_hdr->b_l1hdr.b_arc_access) { 2544 realtype = ARC_BUFC_DATA; 2545 } else { 2546 realtype = ARC_BUFC_METADATA; 2547 } 2548 if (realtype != type) { 2549 /* 2550 * If we want to evict from a different list, 2551 * we can not recycle, because DATA vs METADATA 2552 * buffers are segregated into different kmem 2553 * caches (and vmem arenas). 2554 */ 2555 type = realtype; 2556 recycle = B_FALSE; 2557 } 2558 } 2559 2560 list_t *list = &state->arcs_list[type]; 2561 2562 for (hdr = list_tail(list); hdr; hdr = hdr_prev) { 2563 hdr_prev = list_prev(list, hdr); 2564 /* prefetch buffers have a minimum lifespan */ 2565 if (HDR_IO_IN_PROGRESS(hdr) || 2566 (spa && hdr->b_spa != spa) || 2567 ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && 2568 ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < 2569 arc_min_prefetch_lifespan)) { 2570 skipped++; 2571 continue; 2572 } 2573 /* "lookahead" for better eviction candidate */ 2574 if (recycle && hdr->b_size != bytes && 2575 hdr_prev && hdr_prev->b_size == bytes) 2576 continue; 2577 2578 /* ignore markers */ 2579 if (hdr->b_spa == 0) 2580 continue; 2581 2582 /* 2583 * It may take a long time to evict all the bufs requested. 2584 * To avoid blocking all arc activity, periodically drop 2585 * the arcs_mtx and give other threads a chance to run 2586 * before reacquiring the lock. 2587 * 2588 * If we are looking for a buffer to recycle, we are in 2589 * the hot code path, so don't sleep. 
2590 */ 2591 if (!recycle && count++ > arc_evict_iterations) { 2592 list_insert_after(list, hdr, &marker); 2593 mutex_exit(&state->arcs_mtx); 2594 mutex_exit(&evicted_state->arcs_mtx); 2595 kpreempt(KPREEMPT_SYNC); 2596 mutex_enter(&evicted_state->arcs_mtx); 2597 mutex_enter(&state->arcs_mtx); 2598 hdr_prev = list_prev(list, &marker); 2599 list_remove(list, &marker); 2600 count = 0; 2601 continue; 2602 } 2603 2604 hash_lock = HDR_LOCK(hdr); 2605 have_lock = MUTEX_HELD(hash_lock); 2606 if (have_lock || mutex_tryenter(hash_lock)) { 2607 ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); 2608 ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0); 2609 while (hdr->b_l1hdr.b_buf) { 2610 arc_buf_t *buf = hdr->b_l1hdr.b_buf; 2611 if (!mutex_tryenter(&buf->b_evict_lock)) { 2612 missed += 1; 2613 break; 2614 } 2615 if (buf->b_data != NULL) { 2616 bytes_evicted += hdr->b_size; 2617 if (recycle && 2618 arc_buf_type(hdr) == type && 2619 hdr->b_size == bytes && 2620 !HDR_L2_WRITING(hdr)) { 2621 stolen = buf->b_data; 2622 recycle = FALSE; 2623 } 2624 } 2625 if (buf->b_efunc != NULL) { 2626 mutex_enter(&arc_eviction_mtx); 2627 arc_buf_destroy(buf, 2628 buf->b_data == stolen, FALSE); 2629 hdr->b_l1hdr.b_buf = buf->b_next; 2630 buf->b_hdr = &arc_eviction_hdr; 2631 buf->b_next = arc_eviction_list; 2632 arc_eviction_list = buf; 2633 mutex_exit(&arc_eviction_mtx); 2634 mutex_exit(&buf->b_evict_lock); 2635 } else { 2636 mutex_exit(&buf->b_evict_lock); 2637 arc_buf_destroy(buf, 2638 buf->b_data == stolen, TRUE); 2639 } 2640 } 2641 2642 if (HDR_HAS_L2HDR(hdr)) { 2643 ARCSTAT_INCR(arcstat_evict_l2_cached, 2644 hdr->b_size); 2645 } else { 2646 if (l2arc_write_eligible(hdr->b_spa, hdr)) { 2647 ARCSTAT_INCR(arcstat_evict_l2_eligible, 2648 hdr->b_size); 2649 } else { 2650 ARCSTAT_INCR( 2651 arcstat_evict_l2_ineligible, 2652 hdr->b_size); 2653 } 2654 } 2655 2656 if (hdr->b_l1hdr.b_datacnt == 0) { 2657 arc_change_state(evicted_state, hdr, hash_lock); 2658 ASSERT(HDR_IN_HASH_TABLE(hdr)); 2659 hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; 2660 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; 2661 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); 2662 } 2663 if (!have_lock) 2664 mutex_exit(hash_lock); 2665 if (bytes >= 0 && bytes_evicted >= bytes) 2666 break; 2667 } else { 2668 missed += 1; 2669 } 2670 } 2671 2672 mutex_exit(&state->arcs_mtx); 2673 mutex_exit(&evicted_state->arcs_mtx); 2674 2675 if (bytes_evicted < bytes) 2676 dprintf("only evicted %lld bytes from %x", 2677 (longlong_t)bytes_evicted, state); 2678 2679 if (skipped) 2680 ARCSTAT_INCR(arcstat_evict_skip, skipped); 2681 2682 if (missed) 2683 ARCSTAT_INCR(arcstat_mutex_miss, missed); 2684 2685 /* 2686 * Note: we have just evicted some data into the ghost state, 2687 * potentially putting the ghost size over the desired size. Rather 2688 * that evicting from the ghost list in this hot code path, leave 2689 * this chore to the arc_reclaim_thread(). 2690 */ 2691 2692 return (stolen); 2693} 2694 2695/* 2696 * Remove buffers from list until we've removed the specified number of 2697 * bytes. Destroy the buffers that are removed. 
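 *
 * Callers later in this file use it in two ways (shown as a sketch,
 * not a verbatim excerpt): arc_adjust() trims a ghost list toward its
 * target with
 *
 *	delta = MIN(arc_mru_ghost->arcs_size, adjustment);
 *	arc_evict_ghost(arc_mru_ghost, 0, delta);
 *
 * while arc_flush() passes bytes = -1 to request that every evictable
 * ghost header be removed.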
2698 */ 2699static void 2700arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) 2701{ 2702 arc_buf_hdr_t *hdr, *hdr_prev; 2703 arc_buf_hdr_t marker = { 0 }; 2704 list_t *list = &state->arcs_list[ARC_BUFC_DATA]; 2705 kmutex_t *hash_lock; 2706 uint64_t bytes_deleted = 0; 2707 uint64_t bufs_skipped = 0; 2708 int count = 0; 2709 2710 ASSERT(GHOST_STATE(state)); 2711top: 2712 mutex_enter(&state->arcs_mtx); 2713 for (hdr = list_tail(list); hdr; hdr = hdr_prev) { 2714 hdr_prev = list_prev(list, hdr); 2715 if (arc_buf_type(hdr) >= ARC_BUFC_NUMTYPES) 2716 panic("invalid hdr=%p", (void *)hdr); 2717 if (spa && hdr->b_spa != spa) 2718 continue; 2719 2720 /* ignore markers */ 2721 if (hdr->b_spa == 0) 2722 continue; 2723 2724 hash_lock = HDR_LOCK(hdr); 2725 /* caller may be trying to modify this buffer, skip it */ 2726 if (MUTEX_HELD(hash_lock)) 2727 continue; 2728 2729 /* 2730 * It may take a long time to evict all the bufs requested. 2731 * To avoid blocking all arc activity, periodically drop 2732 * the arcs_mtx and give other threads a chance to run 2733 * before reacquiring the lock. 2734 */ 2735 if (count++ > arc_evict_iterations) { 2736 list_insert_after(list, hdr, &marker); 2737 mutex_exit(&state->arcs_mtx); 2738 kpreempt(KPREEMPT_SYNC); 2739 mutex_enter(&state->arcs_mtx); 2740 hdr_prev = list_prev(list, &marker); 2741 list_remove(list, &marker); 2742 count = 0; 2743 continue; 2744 } 2745 if (mutex_tryenter(hash_lock)) { 2746 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2747 ASSERT(!HDR_HAS_L1HDR(hdr) || 2748 hdr->b_l1hdr.b_buf == NULL); 2749 ARCSTAT_BUMP(arcstat_deleted); 2750 bytes_deleted += hdr->b_size; 2751 2752 if (HDR_HAS_L2HDR(hdr)) { 2753 /* 2754 * This buffer is cached on the 2nd Level ARC; 2755 * don't destroy the header. 2756 */ 2757 arc_change_state(arc_l2c_only, hdr, hash_lock); 2758 /* 2759 * dropping from L1+L2 cached to L2-only, 2760 * realloc to remove the L1 header. 2761 */ 2762 hdr = arc_hdr_realloc(hdr, hdr_full_cache, 2763 hdr_l2only_cache); 2764 mutex_exit(hash_lock); 2765 } else { 2766 arc_change_state(arc_anon, hdr, hash_lock); 2767 mutex_exit(hash_lock); 2768 arc_hdr_destroy(hdr); 2769 } 2770 2771 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); 2772 if (bytes >= 0 && bytes_deleted >= bytes) 2773 break; 2774 } else if (bytes < 0) { 2775 /* 2776 * Insert a list marker and then wait for the 2777 * hash lock to become available. Once its 2778 * available, restart from where we left off. 
2779 */ 2780 list_insert_after(list, hdr, &marker); 2781 mutex_exit(&state->arcs_mtx); 2782 mutex_enter(hash_lock); 2783 mutex_exit(hash_lock); 2784 mutex_enter(&state->arcs_mtx); 2785 hdr_prev = list_prev(list, &marker); 2786 list_remove(list, &marker); 2787 } else { 2788 bufs_skipped += 1; 2789 } 2790 2791 } 2792 mutex_exit(&state->arcs_mtx); 2793 2794 if (list == &state->arcs_list[ARC_BUFC_DATA] && 2795 (bytes < 0 || bytes_deleted < bytes)) { 2796 list = &state->arcs_list[ARC_BUFC_METADATA]; 2797 goto top; 2798 } 2799 2800 if (bufs_skipped) { 2801 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); 2802 ASSERT(bytes >= 0); 2803 } 2804 2805 if (bytes_deleted < bytes) 2806 dprintf("only deleted %lld bytes from %p", 2807 (longlong_t)bytes_deleted, state); 2808} 2809 2810static void 2811arc_adjust(void) 2812{ 2813 int64_t adjustment, delta; 2814 2815 /* 2816 * Adjust MRU size 2817 */ 2818 2819 adjustment = MIN((int64_t)(arc_size - arc_c), 2820 (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - 2821 arc_p)); 2822 2823 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { 2824 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); 2825 (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA); 2826 adjustment -= delta; 2827 } 2828 2829 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { 2830 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); 2831 (void) arc_evict(arc_mru, 0, delta, FALSE, 2832 ARC_BUFC_METADATA); 2833 } 2834 2835 /* 2836 * Adjust MFU size 2837 */ 2838 2839 adjustment = arc_size - arc_c; 2840 2841 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { 2842 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); 2843 (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA); 2844 adjustment -= delta; 2845 } 2846 2847 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { 2848 int64_t delta = MIN(adjustment, 2849 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); 2850 (void) arc_evict(arc_mfu, 0, delta, FALSE, 2851 ARC_BUFC_METADATA); 2852 } 2853 2854 /* 2855 * Adjust ghost lists 2856 */ 2857 2858 adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; 2859 2860 if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { 2861 delta = MIN(arc_mru_ghost->arcs_size, adjustment); 2862 arc_evict_ghost(arc_mru_ghost, 0, delta); 2863 } 2864 2865 adjustment = 2866 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c; 2867 2868 if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { 2869 delta = MIN(arc_mfu_ghost->arcs_size, adjustment); 2870 arc_evict_ghost(arc_mfu_ghost, 0, delta); 2871 } 2872} 2873 2874static void 2875arc_do_user_evicts(void) 2876{ 2877 mutex_enter(&arc_eviction_mtx); 2878 while (arc_eviction_list != NULL) { 2879 arc_buf_t *buf = arc_eviction_list; 2880 arc_eviction_list = buf->b_next; 2881 mutex_enter(&buf->b_evict_lock); 2882 buf->b_hdr = NULL; 2883 mutex_exit(&buf->b_evict_lock); 2884 mutex_exit(&arc_eviction_mtx); 2885 2886 if (buf->b_efunc != NULL) 2887 VERIFY0(buf->b_efunc(buf->b_private)); 2888 2889 buf->b_efunc = NULL; 2890 buf->b_private = NULL; 2891 kmem_cache_free(buf_cache, buf); 2892 mutex_enter(&arc_eviction_mtx); 2893 } 2894 mutex_exit(&arc_eviction_mtx); 2895} 2896 2897/* 2898 * Flush all *evictable* data from the cache for the given spa. 2899 * NOTE: this will not touch "active" (i.e. referenced) data. 
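 *
 * Usage sketch (the call sites named here are assumptions, not quoted
 * from this file): a single pool's evictable buffers can be dropped
 * with arc_flush(spa), e.g. when that pool is exported or destroyed,
 * while arc_flush(NULL) drains everything evictable, as when the ARC
 * itself is being torn down.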
2900 */ 2901void 2902arc_flush(spa_t *spa) 2903{ 2904 uint64_t guid = 0; 2905 2906 if (spa != NULL) 2907 guid = spa_load_guid(spa); 2908 2909 while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) { 2910 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); 2911 if (spa != NULL) 2912 break; 2913 } 2914 while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) { 2915 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA); 2916 if (spa != NULL) 2917 break; 2918 } 2919 while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) { 2920 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA); 2921 if (spa != NULL) 2922 break; 2923 } 2924 while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) { 2925 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA); 2926 if (spa != NULL) 2927 break; 2928 } 2929 2930 arc_evict_ghost(arc_mru_ghost, guid, -1); 2931 arc_evict_ghost(arc_mfu_ghost, guid, -1); 2932 2933 mutex_enter(&arc_reclaim_thr_lock); 2934 arc_do_user_evicts(); 2935 mutex_exit(&arc_reclaim_thr_lock); 2936 ASSERT(spa || arc_eviction_list == NULL); 2937} 2938 2939void 2940arc_shrink(int64_t to_free) 2941{ 2942 if (arc_c > arc_c_min) { 2943 DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t, 2944 arc_c_min, uint64_t, arc_p, uint64_t, to_free); 2945 if (arc_c > arc_c_min + to_free) 2946 atomic_add_64(&arc_c, -to_free); 2947 else 2948 arc_c = arc_c_min; 2949 2950 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 2951 if (arc_c > arc_size) 2952 arc_c = MAX(arc_size, arc_c_min); 2953 if (arc_p > arc_c) 2954 arc_p = (arc_c >> 1); 2955 2956 DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t, 2957 arc_p); 2958 2959 ASSERT(arc_c >= arc_c_min); 2960 ASSERT((int64_t)arc_p >= 0); 2961 } 2962 2963 if (arc_size > arc_c) { 2964 DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size, 2965 uint64_t, arc_c); 2966 arc_adjust(); 2967 } 2968} 2969 2970static long needfree = 0; 2971 2972typedef enum free_memory_reason_t { 2973 FMR_UNKNOWN, 2974 FMR_NEEDFREE, 2975 FMR_LOTSFREE, 2976 FMR_SWAPFS_MINFREE, 2977 FMR_PAGES_PP_MAXIMUM, 2978 FMR_HEAP_ARENA, 2979 FMR_ZIO_ARENA, 2980 FMR_ZIO_FRAG, 2981} free_memory_reason_t; 2982 2983int64_t last_free_memory; 2984free_memory_reason_t last_free_reason; 2985 2986/* 2987 * Additional reserve of pages for pp_reserve. 2988 */ 2989int64_t arc_pages_pp_reserve = 64; 2990 2991/* 2992 * Additional reserve of pages for swapfs. 2993 */ 2994int64_t arc_swapfs_reserve = 64; 2995 2996/* 2997 * Return the amount of memory that can be consumed before reclaim will be 2998 * needed. Positive if there is sufficient free memory, negative indicates 2999 * the amount of memory that needs to be freed up. 3000 */ 3001static int64_t 3002arc_available_memory(void) 3003{ 3004 int64_t lowest = INT64_MAX; 3005 int64_t n; 3006 free_memory_reason_t r = FMR_UNKNOWN; 3007 3008#ifdef _KERNEL 3009 if (needfree > 0) { 3010 n = PAGESIZE * (-needfree); 3011 if (n < lowest) { 3012 lowest = n; 3013 r = FMR_NEEDFREE; 3014 } 3015 } 3016 3017 /* 3018 * Cooperate with pagedaemon when it's time for it to scan 3019 * and reclaim some pages. 3020 */ 3021 n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target); 3022 if (n < lowest) { 3023 lowest = n; 3024 r = FMR_LOTSFREE; 3025 } 3026 3027#ifdef sun 3028 /* 3029 * check that we're out of range of the pageout scanner. It starts to 3030 * schedule paging if freemem is less than lotsfree and needfree. 3031 * lotsfree is the high-water mark for pageout, and needfree is the 3032 * number of needed free pages. 
We add extra pages here to make sure 3033 * the scanner doesn't start up while we're freeing memory. 3034 */ 3035 n = PAGESIZE * (freemem - lotsfree - needfree - desfree); 3036 if (n < lowest) { 3037 lowest = n; 3038 r = FMR_LOTSFREE; 3039 } 3040 3041 /* 3042 * check to make sure that swapfs has enough space so that anon 3043 * reservations can still succeed. anon_resvmem() checks that the 3044 * availrmem is greater than swapfs_minfree, and the number of reserved 3045 * swap pages. We also add a bit of extra here just to prevent 3046 * circumstances from getting really dire. 3047 */ 3048 n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve - 3049 desfree - arc_swapfs_reserve); 3050 if (n < lowest) { 3051 lowest = n; 3052 r = FMR_SWAPFS_MINFREE; 3053 } 3054 3055 3056 /* 3057 * Check that we have enough availrmem that memory locking (e.g., via 3058 * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum 3059 * stores the number of pages that cannot be locked; when availrmem 3060 * drops below pages_pp_maximum, page locking mechanisms such as 3061 * page_pp_lock() will fail.) 3062 */ 3063 n = PAGESIZE * (availrmem - pages_pp_maximum - 3064 arc_pages_pp_reserve); 3065 if (n < lowest) { 3066 lowest = n; 3067 r = FMR_PAGES_PP_MAXIMUM; 3068 } 3069 3070#endif /* sun */ 3071#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) 3072 /* 3073 * If we're on an i386 platform, it's possible that we'll exhaust the 3074 * kernel heap space before we ever run out of available physical 3075 * memory. Most checks of the size of the heap_area compare against 3076 * tune.t_minarmem, which is the minimum available real memory that we 3077 * can have in the system. However, this is generally fixed at 25 pages 3078 * which is so low that it's useless. In this comparison, we seek to 3079 * calculate the total heap-size, and reclaim if more than 3/4ths of the 3080 * heap is allocated. (Or, in the calculation, if less than 1/4th is 3081 * free) 3082 */ 3083 n = (int64_t)vmem_size(heap_arena, VMEM_FREE) - 3084 (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2); 3085 if (n < lowest) { 3086 lowest = n; 3087 r = FMR_HEAP_ARENA; 3088 } 3089#define zio_arena NULL 3090#else 3091#define zio_arena heap_arena 3092#endif 3093 3094 /* 3095 * If zio data pages are being allocated out of a separate heap segment, 3096 * then enforce that the size of available vmem for this arena remains 3097 * above about 1/16th free. 3098 * 3099 * Note: The 1/16th arena free requirement was put in place 3100 * to aggressively evict memory from the arc in order to avoid 3101 * memory fragmentation issues. 3102 */ 3103 if (zio_arena != NULL) { 3104 n = (int64_t)vmem_size(zio_arena, VMEM_FREE) - 3105 (vmem_size(zio_arena, VMEM_ALLOC) >> 4); 3106 if (n < lowest) { 3107 lowest = n; 3108 r = FMR_ZIO_ARENA; 3109 } 3110 } 3111 3112 /* 3113 * Above limits know nothing about real level of KVA fragmentation. 3114 * Start aggressive reclamation if too little sequential KVA left. 3115 */ 3116 if (lowest > 0) { 3117 n = (vmem_size(heap_arena, VMEM_MAXFREE) < zfs_max_recordsize) ? 
3118 -((int64_t)vmem_size(heap_arena, VMEM_ALLOC) >> 4) : 3119 INT64_MAX; 3120 if (n < lowest) { 3121 lowest = n; 3122 r = FMR_ZIO_FRAG; 3123 } 3124 } 3125 3126#else /* _KERNEL */ 3127 /* Every 100 calls, free a small amount */ 3128 if (spa_get_random(100) == 0) 3129 lowest = -1024; 3130#endif /* _KERNEL */ 3131 3132 last_free_memory = lowest; 3133 last_free_reason = r; 3134 DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r); 3135 return (lowest); 3136} 3137 3138 3139/* 3140 * Determine if the system is under memory pressure and is asking 3141 * to reclaim memory. A return value of TRUE indicates that the system 3142 * is under memory pressure and that the arc should adjust accordingly. 3143 */ 3144static boolean_t 3145arc_reclaim_needed(void) 3146{ 3147 return (arc_available_memory() < 0); 3148} 3149 3150extern kmem_cache_t *zio_buf_cache[]; 3151extern kmem_cache_t *zio_data_buf_cache[]; 3152extern kmem_cache_t *range_seg_cache; 3153 3154static __noinline void 3155arc_kmem_reap_now(void) 3156{ 3157 size_t i; 3158 kmem_cache_t *prev_cache = NULL; 3159 kmem_cache_t *prev_data_cache = NULL; 3160 3161 DTRACE_PROBE(arc__kmem_reap_start); 3162#ifdef _KERNEL 3163 if (arc_meta_used >= arc_meta_limit) { 3164 /* 3165 * We are exceeding our meta-data cache limit. 3166 * Purge some DNLC entries to release holds on meta-data. 3167 */ 3168 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 3169 } 3170#if defined(__i386) 3171 /* 3172 * Reclaim unused memory from all kmem caches. 3173 */ 3174 kmem_reap(); 3175#endif 3176#endif 3177 3178 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 3179 if (zio_buf_cache[i] != prev_cache) { 3180 prev_cache = zio_buf_cache[i]; 3181 kmem_cache_reap_now(zio_buf_cache[i]); 3182 } 3183 if (zio_data_buf_cache[i] != prev_data_cache) { 3184 prev_data_cache = zio_data_buf_cache[i]; 3185 kmem_cache_reap_now(zio_data_buf_cache[i]); 3186 } 3187 } 3188 kmem_cache_reap_now(buf_cache); 3189 kmem_cache_reap_now(hdr_full_cache); 3190 kmem_cache_reap_now(hdr_l2only_cache); 3191 kmem_cache_reap_now(range_seg_cache); 3192 3193#ifdef sun 3194 if (zio_arena != NULL) { 3195 /* 3196 * Ask the vmem arena to reclaim unused memory from its 3197 * quantum caches. 3198 */ 3199 vmem_qcache_reap(zio_arena); 3200 } 3201#endif 3202 DTRACE_PROBE(arc__kmem_reap_end); 3203} 3204 3205static void 3206arc_reclaim_thread(void *dummy __unused) 3207{ 3208 clock_t growtime = 0; 3209 callb_cpr_t cpr; 3210 3211 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); 3212 3213 mutex_enter(&arc_reclaim_thr_lock); 3214 while (arc_thread_exit == 0) { 3215 int64_t free_memory = arc_available_memory(); 3216 if (free_memory < 0) { 3217 3218 arc_no_grow = B_TRUE; 3219 arc_warm = B_TRUE; 3220 3221 /* 3222 * Wait at least zfs_grow_retry (default 60) seconds 3223 * before considering growing. 3224 */ 3225 growtime = ddi_get_lbolt() + (arc_grow_retry * hz); 3226 3227 arc_kmem_reap_now(); 3228 3229 /* 3230 * If we are still low on memory, shrink the ARC 3231 * so that we have arc_shrink_min free space. 
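 *
 * Worked example (the numbers are illustrative only): with
 * arc_c = 4GB and arc_shrink_shift = 7, the headroom target is
 * arc_c >> arc_shrink_shift = 32MB.  If arc_available_memory()
 * still reports a 16MB deficit (free_memory = -16MB), then
 *
 *	to_free = (arc_c >> arc_shrink_shift) - free_memory
 *	        = 32MB - (-16MB) = 48MB
 *
 * is passed to arc_shrink() below.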
3232 */ 3233 free_memory = arc_available_memory(); 3234 3235 int64_t to_free = 3236 (arc_c >> arc_shrink_shift) - free_memory; 3237 if (to_free > 0) { 3238#ifdef _KERNEL 3239 to_free = MAX(to_free, ptob(needfree)); 3240#endif 3241 arc_shrink(to_free); 3242 } 3243 } else if (free_memory < arc_c >> arc_no_grow_shift) { 3244 arc_no_grow = B_TRUE; 3245 } else if (ddi_get_lbolt() >= growtime) { 3246 arc_no_grow = B_FALSE; 3247 } 3248 3249 arc_adjust(); 3250 3251 if (arc_eviction_list != NULL) 3252 arc_do_user_evicts(); 3253 3254#ifdef _KERNEL 3255 if (needfree) { 3256 needfree = 0; 3257 wakeup(&needfree); 3258 } 3259#endif 3260 3261 /* 3262 * This is necessary in order for the mdb ::arc dcmd to 3263 * show up to date information. Since the ::arc command 3264 * does not call the kstat's update function, without 3265 * this call, the command may show stale stats for the 3266 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even 3267 * with this change, the data might be up to 1 second 3268 * out of date; but that should suffice. The arc_state_t 3269 * structures can be queried directly if more accurate 3270 * information is needed. 3271 */ 3272 if (arc_ksp != NULL) 3273 arc_ksp->ks_update(arc_ksp, KSTAT_READ); 3274 3275 /* block until needed, or one second, whichever is shorter */ 3276 CALLB_CPR_SAFE_BEGIN(&cpr); 3277 (void) cv_timedwait(&arc_reclaim_thr_cv, 3278 &arc_reclaim_thr_lock, hz); 3279 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); 3280 } 3281 3282 arc_thread_exit = 0; 3283 cv_broadcast(&arc_reclaim_thr_cv); 3284 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ 3285 thread_exit(); 3286} 3287 3288/* 3289 * Adapt arc info given the number of bytes we are trying to add and 3290 * the state that we are coming from. This function is only called 3291 * when we are adding new content to the cache. 3292 */ 3293static void 3294arc_adapt(int bytes, arc_state_t *state) 3295{ 3296 int mult; 3297 uint64_t arc_p_min = (arc_c >> arc_p_min_shift); 3298 3299 if (state == arc_l2c_only) 3300 return; 3301 3302 ASSERT(bytes > 0); 3303 /* 3304 * Adapt the target size of the MRU list: 3305 * - if we just hit in the MRU ghost list, then increase 3306 * the target size of the MRU list. 3307 * - if we just hit in the MFU ghost list, then increase 3308 * the target size of the MFU list by decreasing the 3309 * target size of the MRU list. 3310 */ 3311 if (state == arc_mru_ghost) { 3312 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? 3313 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); 3314 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ 3315 3316 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); 3317 } else if (state == arc_mfu_ghost) { 3318 uint64_t delta; 3319 3320 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
3321 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); 3322 mult = MIN(mult, 10); 3323 3324 delta = MIN(bytes * mult, arc_p); 3325 arc_p = MAX(arc_p_min, arc_p - delta); 3326 } 3327 ASSERT((int64_t)arc_p >= 0); 3328 3329 if (arc_reclaim_needed()) { 3330 cv_signal(&arc_reclaim_thr_cv); 3331 return; 3332 } 3333 3334 if (arc_no_grow) 3335 return; 3336 3337 if (arc_c >= arc_c_max) 3338 return; 3339 3340 /* 3341 * If we're within (2 * maxblocksize) bytes of the target 3342 * cache size, increment the target cache size 3343 */ 3344 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { 3345 DTRACE_PROBE1(arc__inc_adapt, int, bytes); 3346 atomic_add_64(&arc_c, (int64_t)bytes); 3347 if (arc_c > arc_c_max) 3348 arc_c = arc_c_max; 3349 else if (state == arc_anon) 3350 atomic_add_64(&arc_p, (int64_t)bytes); 3351 if (arc_p > arc_c) 3352 arc_p = arc_c; 3353 } 3354 ASSERT((int64_t)arc_p >= 0); 3355} 3356 3357/* 3358 * Check if the cache has reached its limits and eviction is required 3359 * prior to insert. 3360 */ 3361static int 3362arc_evict_needed(arc_buf_contents_t type) 3363{ 3364 if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) 3365 return (1); 3366 3367 if (arc_reclaim_needed()) 3368 return (1); 3369 3370 return (arc_size > arc_c); 3371} 3372 3373/* 3374 * The buffer, supplied as the first argument, needs a data block. 3375 * So, if we are at cache max, determine which cache should be victimized. 3376 * We have the following cases: 3377 * 3378 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> 3379 * In this situation if we're out of space, but the resident size of the MFU is 3380 * under the limit, victimize the MFU cache to satisfy this insertion request. 3381 * 3382 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> 3383 * Here, we've used up all of the available space for the MRU, so we need to 3384 * evict from our own cache instead. Evict from the set of resident MRU 3385 * entries. 3386 * 3387 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> 3388 * c minus p represents the MFU space in the cache, since p is the size of the 3389 * cache that is dedicated to the MRU. In this situation there's still space on 3390 * the MFU side, so the MRU side needs to be victimized. 3391 * 3392 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> 3393 * MFU's resident set is consuming more space than it has been allotted. In 3394 * this situation, we must victimize our own cache, the MFU, for this insertion. 3395 */ 3396static void 3397arc_get_data_buf(arc_buf_t *buf) 3398{ 3399 arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; 3400 uint64_t size = buf->b_hdr->b_size; 3401 arc_buf_contents_t type = arc_buf_type(buf->b_hdr); 3402 3403 arc_adapt(size, state); 3404 3405 /* 3406 * We have not yet reached cache maximum size, 3407 * just allocate a new buffer. 3408 */ 3409 if (!arc_evict_needed(type)) { 3410 if (type == ARC_BUFC_METADATA) { 3411 buf->b_data = zio_buf_alloc(size); 3412 arc_space_consume(size, ARC_SPACE_META); 3413 } else { 3414 ASSERT(type == ARC_BUFC_DATA); 3415 buf->b_data = zio_data_buf_alloc(size); 3416 arc_space_consume(size, ARC_SPACE_DATA); 3417 } 3418 goto out; 3419 } 3420 3421 /* 3422 * If we are prefetching from the mfu ghost list, this buffer 3423 * will end up on the mru list; so steal space from there. 3424 */ 3425 if (state == arc_mfu_ghost) 3426 state = HDR_PREFETCH(buf->b_hdr) ? 
arc_mru : arc_mfu; 3427 else if (state == arc_mru_ghost) 3428 state = arc_mru; 3429 3430 if (state == arc_mru || state == arc_anon) { 3431 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; 3432 state = (arc_mfu->arcs_lsize[type] >= size && 3433 arc_p > mru_used) ? arc_mfu : arc_mru; 3434 } else { 3435 /* MFU cases */ 3436 uint64_t mfu_space = arc_c - arc_p; 3437 state = (arc_mru->arcs_lsize[type] >= size && 3438 mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; 3439 } 3440 if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) { 3441 if (type == ARC_BUFC_METADATA) { 3442 buf->b_data = zio_buf_alloc(size); 3443 arc_space_consume(size, ARC_SPACE_META); 3444 } else { 3445 ASSERT(type == ARC_BUFC_DATA); 3446 buf->b_data = zio_data_buf_alloc(size); 3447 arc_space_consume(size, ARC_SPACE_DATA); 3448 } 3449 ARCSTAT_BUMP(arcstat_recycle_miss); 3450 } 3451 ASSERT(buf->b_data != NULL); 3452out: 3453 /* 3454 * Update the state size. Note that ghost states have a 3455 * "ghost size" and so don't need to be updated. 3456 */ 3457 if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) { 3458 arc_buf_hdr_t *hdr = buf->b_hdr; 3459 3460 atomic_add_64(&hdr->b_l1hdr.b_state->arcs_size, size); 3461 if (list_link_active(&hdr->b_l1hdr.b_arc_node)) { 3462 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 3463 atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type], 3464 size); 3465 } 3466 /* 3467 * If we are growing the cache, and we are adding anonymous 3468 * data, and we have outgrown arc_p, update arc_p 3469 */ 3470 if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon && 3471 arc_anon->arcs_size + arc_mru->arcs_size > arc_p) 3472 arc_p = MIN(arc_c, arc_p + size); 3473 } 3474 ARCSTAT_BUMP(arcstat_allocated); 3475} 3476 3477/* 3478 * This routine is called whenever a buffer is accessed. 3479 * NOTE: the hash lock is dropped in this function. 3480 */ 3481static void 3482arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) 3483{ 3484 clock_t now; 3485 3486 ASSERT(MUTEX_HELD(hash_lock)); 3487 ASSERT(HDR_HAS_L1HDR(hdr)); 3488 3489 if (hdr->b_l1hdr.b_state == arc_anon) { 3490 /* 3491 * This buffer is not in the cache, and does not 3492 * appear in our "ghost" list. Add the new buffer 3493 * to the MRU state. 3494 */ 3495 3496 ASSERT0(hdr->b_l1hdr.b_arc_access); 3497 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3498 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 3499 arc_change_state(arc_mru, hdr, hash_lock); 3500 3501 } else if (hdr->b_l1hdr.b_state == arc_mru) { 3502 now = ddi_get_lbolt(); 3503 3504 /* 3505 * If this buffer is here because of a prefetch, then either: 3506 * - clear the flag if this is a "referencing" read 3507 * (any subsequent access will bump this into the MFU state). 3508 * or 3509 * - move the buffer to the head of the list if this is 3510 * another prefetch (to make it less likely to be evicted). 3511 */ 3512 if (HDR_PREFETCH(hdr)) { 3513 if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 3514 ASSERT(list_link_active( 3515 &hdr->b_l1hdr.b_arc_node)); 3516 } else { 3517 hdr->b_flags &= ~ARC_FLAG_PREFETCH; 3518 ARCSTAT_BUMP(arcstat_mru_hits); 3519 } 3520 hdr->b_l1hdr.b_arc_access = now; 3521 return; 3522 } 3523 3524 /* 3525 * This buffer has been "accessed" only once so far, 3526 * but it is still in the cache. Move it to the MFU 3527 * state. 3528 */ 3529 if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) { 3530 /* 3531 * More than 125ms have passed since we 3532 * instantiated this buffer. Move it to the 3533 * most frequently used state. 
3534 */ 3535 hdr->b_l1hdr.b_arc_access = now; 3536 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 3537 arc_change_state(arc_mfu, hdr, hash_lock); 3538 } 3539 ARCSTAT_BUMP(arcstat_mru_hits); 3540 } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { 3541 arc_state_t *new_state; 3542 /* 3543 * This buffer has been "accessed" recently, but 3544 * was evicted from the cache. Move it to the 3545 * MFU state. 3546 */ 3547 3548 if (HDR_PREFETCH(hdr)) { 3549 new_state = arc_mru; 3550 if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) 3551 hdr->b_flags &= ~ARC_FLAG_PREFETCH; 3552 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 3553 } else { 3554 new_state = arc_mfu; 3555 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 3556 } 3557 3558 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3559 arc_change_state(new_state, hdr, hash_lock); 3560 3561 ARCSTAT_BUMP(arcstat_mru_ghost_hits); 3562 } else if (hdr->b_l1hdr.b_state == arc_mfu) { 3563 /* 3564 * This buffer has been accessed more than once and is 3565 * still in the cache. Keep it in the MFU state. 3566 * 3567 * NOTE: an add_reference() that occurred when we did 3568 * the arc_read() will have kicked this off the list. 3569 * If it was a prefetch, we will explicitly move it to 3570 * the head of the list now. 3571 */ 3572 if ((HDR_PREFETCH(hdr)) != 0) { 3573 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 3574 ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); 3575 } 3576 ARCSTAT_BUMP(arcstat_mfu_hits); 3577 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3578 } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { 3579 arc_state_t *new_state = arc_mfu; 3580 /* 3581 * This buffer has been accessed more than once but has 3582 * been evicted from the cache. Move it back to the 3583 * MFU state. 3584 */ 3585 3586 if (HDR_PREFETCH(hdr)) { 3587 /* 3588 * This is a prefetch access... 3589 * move this block back to the MRU state. 3590 */ 3591 ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); 3592 new_state = arc_mru; 3593 } 3594 3595 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3596 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 3597 arc_change_state(new_state, hdr, hash_lock); 3598 3599 ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 3600 } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { 3601 /* 3602 * This buffer is on the 2nd Level ARC. 
3603 */ 3604 3605 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3606 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 3607 arc_change_state(arc_mfu, hdr, hash_lock); 3608 } else { 3609 ASSERT(!"invalid arc state"); 3610 } 3611} 3612 3613/* a generic arc_done_func_t which you can use */ 3614/* ARGSUSED */ 3615void 3616arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 3617{ 3618 if (zio == NULL || zio->io_error == 0) 3619 bcopy(buf->b_data, arg, buf->b_hdr->b_size); 3620 VERIFY(arc_buf_remove_ref(buf, arg)); 3621} 3622 3623/* a generic arc_done_func_t */ 3624void 3625arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 3626{ 3627 arc_buf_t **bufp = arg; 3628 if (zio && zio->io_error) { 3629 VERIFY(arc_buf_remove_ref(buf, arg)); 3630 *bufp = NULL; 3631 } else { 3632 *bufp = buf; 3633 ASSERT(buf->b_data); 3634 } 3635} 3636 3637static void 3638arc_read_done(zio_t *zio) 3639{ 3640 arc_buf_hdr_t *hdr; 3641 arc_buf_t *buf; 3642 arc_buf_t *abuf; /* buffer we're assigning to callback */ 3643 kmutex_t *hash_lock = NULL; 3644 arc_callback_t *callback_list, *acb; 3645 int freeable = FALSE; 3646 3647 buf = zio->io_private; 3648 hdr = buf->b_hdr; 3649 3650 /* 3651 * The hdr was inserted into hash-table and removed from lists 3652 * prior to starting I/O. We should find this header, since 3653 * it's in the hash table, and it should be legit since it's 3654 * not possible to evict it during the I/O. The only possible 3655 * reason for it not to be found is if we were freed during the 3656 * read. 3657 */ 3658 if (HDR_IN_HASH_TABLE(hdr)) { 3659 ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); 3660 ASSERT3U(hdr->b_dva.dva_word[0], ==, 3661 BP_IDENTITY(zio->io_bp)->dva_word[0]); 3662 ASSERT3U(hdr->b_dva.dva_word[1], ==, 3663 BP_IDENTITY(zio->io_bp)->dva_word[1]); 3664 3665 arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp, 3666 &hash_lock); 3667 3668 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && 3669 hash_lock == NULL) || 3670 (found == hdr && 3671 DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || 3672 (found == hdr && HDR_L2_READING(hdr))); 3673 } 3674 3675 hdr->b_flags &= ~ARC_FLAG_L2_EVICTED; 3676 if (l2arc_noprefetch && HDR_PREFETCH(hdr)) 3677 hdr->b_flags &= ~ARC_FLAG_L2CACHE; 3678 3679 /* byteswap if necessary */ 3680 callback_list = hdr->b_l1hdr.b_acb; 3681 ASSERT(callback_list != NULL); 3682 if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { 3683 dmu_object_byteswap_t bswap = 3684 DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); 3685 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? 3686 byteswap_uint64_array : 3687 dmu_ot_byteswap[bswap].ob_func; 3688 func(buf->b_data, hdr->b_size); 3689 } 3690 3691 arc_cksum_compute(buf, B_FALSE); 3692#ifdef illumos 3693 arc_buf_watch(buf); 3694#endif /* illumos */ 3695 3696 if (hash_lock && zio->io_error == 0 && 3697 hdr->b_l1hdr.b_state == arc_anon) { 3698 /* 3699 * Only call arc_access on anonymous buffers. This is because 3700 * if we've issued an I/O for an evicted buffer, we've already 3701 * called arc_access (to prevent any simultaneous readers from 3702 * getting confused). 
3703 */ 3704 arc_access(hdr, hash_lock); 3705 } 3706 3707 /* create copies of the data buffer for the callers */ 3708 abuf = buf; 3709 for (acb = callback_list; acb; acb = acb->acb_next) { 3710 if (acb->acb_done) { 3711 if (abuf == NULL) { 3712 ARCSTAT_BUMP(arcstat_duplicate_reads); 3713 abuf = arc_buf_clone(buf); 3714 } 3715 acb->acb_buf = abuf; 3716 abuf = NULL; 3717 } 3718 } 3719 hdr->b_l1hdr.b_acb = NULL; 3720 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; 3721 ASSERT(!HDR_BUF_AVAILABLE(hdr)); 3722 if (abuf == buf) { 3723 ASSERT(buf->b_efunc == NULL); 3724 ASSERT(hdr->b_l1hdr.b_datacnt == 1); 3725 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 3726 } 3727 3728 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || 3729 callback_list != NULL); 3730 3731 if (zio->io_error != 0) { 3732 hdr->b_flags |= ARC_FLAG_IO_ERROR; 3733 if (hdr->b_l1hdr.b_state != arc_anon) 3734 arc_change_state(arc_anon, hdr, hash_lock); 3735 if (HDR_IN_HASH_TABLE(hdr)) 3736 buf_hash_remove(hdr); 3737 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); 3738 } 3739 3740 /* 3741 * Broadcast before we drop the hash_lock to avoid the possibility 3742 * that the hdr (and hence the cv) might be freed before we get to 3743 * the cv_broadcast(). 3744 */ 3745 cv_broadcast(&hdr->b_l1hdr.b_cv); 3746 3747 if (hash_lock != NULL) { 3748 mutex_exit(hash_lock); 3749 } else { 3750 /* 3751 * This block was freed while we waited for the read to 3752 * complete. It has been removed from the hash table and 3753 * moved to the anonymous state (so that it won't show up 3754 * in the cache). 3755 */ 3756 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 3757 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); 3758 } 3759 3760 /* execute each callback and free its structure */ 3761 while ((acb = callback_list) != NULL) { 3762 if (acb->acb_done) 3763 acb->acb_done(zio, acb->acb_buf, acb->acb_private); 3764 3765 if (acb->acb_zio_dummy != NULL) { 3766 acb->acb_zio_dummy->io_error = zio->io_error; 3767 zio_nowait(acb->acb_zio_dummy); 3768 } 3769 3770 callback_list = acb->acb_next; 3771 kmem_free(acb, sizeof (arc_callback_t)); 3772 } 3773 3774 if (freeable) 3775 arc_hdr_destroy(hdr); 3776} 3777 3778/* 3779 * "Read" the block at the specified DVA (in bp) via the 3780 * cache. If the block is found in the cache, invoke the provided 3781 * callback immediately and return. Note that the `zio' parameter 3782 * in the callback will be NULL in this case, since no IO was 3783 * required. If the block is not in the cache pass the read request 3784 * on to the spa with a substitute callback function, so that the 3785 * requested block will be added to the cache. 3786 * 3787 * If a read request arrives for a block that has a read in-progress, 3788 * either wait for the in-progress read to complete (and return the 3789 * results); or, if this is a read with a "done" func, add a record 3790 * to the read to invoke the "done" func when the read completes, 3791 * and return; or just return. 3792 * 3793 * arc_read_done() will invoke all the requested "done" functions 3794 * for readers of this block. 
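 *
 * As a rough, hedged illustration (not part of the original source), a
 * synchronous caller might drive this interface through the generic
 * arc_getbuf_func() callback defined earlier; here "spa", "bp" and "zb"
 * stand in for the pool, block pointer and bookmark the caller already
 * holds:
 *
 *	arc_flags_t aflags = ARC_FLAG_WAIT;
 *	arc_buf_t *abuf = NULL;
 *	int err = arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
 *	if (err == 0 && abuf != NULL) {
 *		... consume abuf->b_data ...
 *		(void) arc_buf_remove_ref(abuf, &abuf);
 *	}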
3795 */ 3796int 3797arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, 3798 void *private, zio_priority_t priority, int zio_flags, 3799 arc_flags_t *arc_flags, const zbookmark_phys_t *zb) 3800{ 3801 arc_buf_hdr_t *hdr = NULL; 3802 arc_buf_t *buf = NULL; 3803 kmutex_t *hash_lock = NULL; 3804 zio_t *rzio; 3805 uint64_t guid = spa_load_guid(spa); 3806 3807 ASSERT(!BP_IS_EMBEDDED(bp) || 3808 BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); 3809 3810top: 3811 if (!BP_IS_EMBEDDED(bp)) { 3812 /* 3813 * Embedded BP's have no DVA and require no I/O to "read". 3814 * Create an anonymous arc buf to back it. 3815 */ 3816 hdr = buf_hash_find(guid, bp, &hash_lock); 3817 } 3818 3819 if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) { 3820 3821 *arc_flags |= ARC_FLAG_CACHED; 3822 3823 if (HDR_IO_IN_PROGRESS(hdr)) { 3824 3825 if (*arc_flags & ARC_FLAG_WAIT) { 3826 cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); 3827 mutex_exit(hash_lock); 3828 goto top; 3829 } 3830 ASSERT(*arc_flags & ARC_FLAG_NOWAIT); 3831 3832 if (done) { 3833 arc_callback_t *acb = NULL; 3834 3835 acb = kmem_zalloc(sizeof (arc_callback_t), 3836 KM_SLEEP); 3837 acb->acb_done = done; 3838 acb->acb_private = private; 3839 if (pio != NULL) 3840 acb->acb_zio_dummy = zio_null(pio, 3841 spa, NULL, NULL, NULL, zio_flags); 3842 3843 ASSERT(acb->acb_done != NULL); 3844 acb->acb_next = hdr->b_l1hdr.b_acb; 3845 hdr->b_l1hdr.b_acb = acb; 3846 add_reference(hdr, hash_lock, private); 3847 mutex_exit(hash_lock); 3848 return (0); 3849 } 3850 mutex_exit(hash_lock); 3851 return (0); 3852 } 3853 3854 ASSERT(hdr->b_l1hdr.b_state == arc_mru || 3855 hdr->b_l1hdr.b_state == arc_mfu); 3856 3857 if (done) { 3858 add_reference(hdr, hash_lock, private); 3859 /* 3860 * If this block is already in use, create a new 3861 * copy of the data so that we will be guaranteed 3862 * that arc_release() will always succeed. 
3863 */ 3864 buf = hdr->b_l1hdr.b_buf; 3865 ASSERT(buf); 3866 ASSERT(buf->b_data); 3867 if (HDR_BUF_AVAILABLE(hdr)) { 3868 ASSERT(buf->b_efunc == NULL); 3869 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; 3870 } else { 3871 buf = arc_buf_clone(buf); 3872 } 3873 3874 } else if (*arc_flags & ARC_FLAG_PREFETCH && 3875 refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 3876 hdr->b_flags |= ARC_FLAG_PREFETCH; 3877 } 3878 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 3879 arc_access(hdr, hash_lock); 3880 if (*arc_flags & ARC_FLAG_L2CACHE) 3881 hdr->b_flags |= ARC_FLAG_L2CACHE; 3882 if (*arc_flags & ARC_FLAG_L2COMPRESS) 3883 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 3884 mutex_exit(hash_lock); 3885 ARCSTAT_BUMP(arcstat_hits); 3886 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 3887 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 3888 data, metadata, hits); 3889 3890 if (done) 3891 done(NULL, buf, private); 3892 } else { 3893 uint64_t size = BP_GET_LSIZE(bp); 3894 arc_callback_t *acb; 3895 vdev_t *vd = NULL; 3896 uint64_t addr = 0; 3897 boolean_t devw = B_FALSE; 3898 enum zio_compress b_compress = ZIO_COMPRESS_OFF; 3899 int32_t b_asize = 0; 3900 3901 if (hdr == NULL) { 3902 /* this block is not in the cache */ 3903 arc_buf_hdr_t *exists = NULL; 3904 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); 3905 buf = arc_buf_alloc(spa, size, private, type); 3906 hdr = buf->b_hdr; 3907 if (!BP_IS_EMBEDDED(bp)) { 3908 hdr->b_dva = *BP_IDENTITY(bp); 3909 hdr->b_birth = BP_PHYSICAL_BIRTH(bp); 3910 exists = buf_hash_insert(hdr, &hash_lock); 3911 } 3912 if (exists != NULL) { 3913 /* somebody beat us to the hash insert */ 3914 mutex_exit(hash_lock); 3915 buf_discard_identity(hdr); 3916 (void) arc_buf_remove_ref(buf, private); 3917 goto top; /* restart the IO request */ 3918 } 3919 3920 /* if this is a prefetch, we don't have a reference */ 3921 if (*arc_flags & ARC_FLAG_PREFETCH) { 3922 (void) remove_reference(hdr, hash_lock, 3923 private); 3924 hdr->b_flags |= ARC_FLAG_PREFETCH; 3925 } 3926 if (*arc_flags & ARC_FLAG_L2CACHE) 3927 hdr->b_flags |= ARC_FLAG_L2CACHE; 3928 if (*arc_flags & ARC_FLAG_L2COMPRESS) 3929 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 3930 if (BP_GET_LEVEL(bp) > 0) 3931 hdr->b_flags |= ARC_FLAG_INDIRECT; 3932 } else { 3933 /* 3934 * This block is in the ghost cache. If it was L2-only 3935 * (and thus didn't have an L1 hdr), we realloc the 3936 * header to add an L1 hdr. 
3937 */ 3938 if (!HDR_HAS_L1HDR(hdr)) { 3939 hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, 3940 hdr_full_cache); 3941 } 3942 3943 ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); 3944 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 3945 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 3946 ASSERT(hdr->b_l1hdr.b_buf == NULL); 3947 3948 /* if this is a prefetch, we don't have a reference */ 3949 if (*arc_flags & ARC_FLAG_PREFETCH) 3950 hdr->b_flags |= ARC_FLAG_PREFETCH; 3951 else 3952 add_reference(hdr, hash_lock, private); 3953 if (*arc_flags & ARC_FLAG_L2CACHE) 3954 hdr->b_flags |= ARC_FLAG_L2CACHE; 3955 if (*arc_flags & ARC_FLAG_L2COMPRESS) 3956 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 3957 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 3958 buf->b_hdr = hdr; 3959 buf->b_data = NULL; 3960 buf->b_efunc = NULL; 3961 buf->b_private = NULL; 3962 buf->b_next = NULL; 3963 hdr->b_l1hdr.b_buf = buf; 3964 ASSERT0(hdr->b_l1hdr.b_datacnt); 3965 hdr->b_l1hdr.b_datacnt = 1; 3966 arc_get_data_buf(buf); 3967 arc_access(hdr, hash_lock); 3968 } 3969 3970 ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); 3971 3972 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); 3973 acb->acb_done = done; 3974 acb->acb_private = private; 3975 3976 ASSERT(hdr->b_l1hdr.b_acb == NULL); 3977 hdr->b_l1hdr.b_acb = acb; 3978 hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; 3979 3980 if (HDR_HAS_L2HDR(hdr) && 3981 (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { 3982 devw = hdr->b_l2hdr.b_dev->l2ad_writing; 3983 addr = hdr->b_l2hdr.b_daddr; 3984 b_compress = HDR_GET_COMPRESS(hdr); 3985 b_asize = hdr->b_l2hdr.b_asize; 3986 /* 3987 * Lock out device removal. 3988 */ 3989 if (vdev_is_dead(vd) || 3990 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) 3991 vd = NULL; 3992 } 3993 3994 if (hash_lock != NULL) 3995 mutex_exit(hash_lock); 3996 3997 /* 3998 * At this point, we have a level 1 cache miss. Try again in 3999 * L2ARC if possible. 4000 */ 4001 ASSERT3U(hdr->b_size, ==, size); 4002 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, 4003 uint64_t, size, zbookmark_phys_t *, zb); 4004 ARCSTAT_BUMP(arcstat_misses); 4005 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 4006 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 4007 data, metadata, misses); 4008#ifdef _KERNEL 4009 curthread->td_ru.ru_inblock++; 4010#endif 4011 4012 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { 4013 /* 4014 * Read from the L2ARC if the following are true: 4015 * 1. The L2ARC vdev was previously cached. 4016 * 2. This buffer still has L2ARC metadata. 4017 * 3. This buffer isn't currently writing to the L2ARC. 4018 * 4. The L2ARC entry wasn't evicted, which may 4019 * also have invalidated the vdev. 4020 * 5. This isn't prefetch and l2arc_noprefetch is set. 4021 */ 4022 if (HDR_HAS_L2HDR(hdr) && 4023 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && 4024 !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { 4025 l2arc_read_callback_t *cb; 4026 4027 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); 4028 ARCSTAT_BUMP(arcstat_l2_hits); 4029 4030 cb = kmem_zalloc(sizeof (l2arc_read_callback_t), 4031 KM_SLEEP); 4032 cb->l2rcb_buf = buf; 4033 cb->l2rcb_spa = spa; 4034 cb->l2rcb_bp = *bp; 4035 cb->l2rcb_zb = *zb; 4036 cb->l2rcb_flags = zio_flags; 4037 cb->l2rcb_compress = b_compress; 4038 4039 ASSERT(addr >= VDEV_LABEL_START_SIZE && 4040 addr + size < vd->vdev_psize - 4041 VDEV_LABEL_END_SIZE); 4042 4043 /* 4044 * l2arc read. The SCL_L2ARC lock will be 4045 * released by l2arc_read_done(). 4046 * Issue a null zio if the underlying buffer 4047 * was squashed to zero size by compression. 
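 * (Clarifying aside, not part of the original comment: ZIO_COMPRESS_EMPTY
 * marks a buffer whose contents were all zeroes and so occupy no space on
 * the cache device; the null zio lets the read completion path reconstruct
 * the data by zero-filling, with no physical read.)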
4048 */ 4049 if (b_compress == ZIO_COMPRESS_EMPTY) { 4050 rzio = zio_null(pio, spa, vd, 4051 l2arc_read_done, cb, 4052 zio_flags | ZIO_FLAG_DONT_CACHE | 4053 ZIO_FLAG_CANFAIL | 4054 ZIO_FLAG_DONT_PROPAGATE | 4055 ZIO_FLAG_DONT_RETRY); 4056 } else { 4057 rzio = zio_read_phys(pio, vd, addr, 4058 b_asize, buf->b_data, 4059 ZIO_CHECKSUM_OFF, 4060 l2arc_read_done, cb, priority, 4061 zio_flags | ZIO_FLAG_DONT_CACHE | 4062 ZIO_FLAG_CANFAIL | 4063 ZIO_FLAG_DONT_PROPAGATE | 4064 ZIO_FLAG_DONT_RETRY, B_FALSE); 4065 } 4066 DTRACE_PROBE2(l2arc__read, vdev_t *, vd, 4067 zio_t *, rzio); 4068 ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize); 4069 4070 if (*arc_flags & ARC_FLAG_NOWAIT) { 4071 zio_nowait(rzio); 4072 return (0); 4073 } 4074 4075 ASSERT(*arc_flags & ARC_FLAG_WAIT); 4076 if (zio_wait(rzio) == 0) 4077 return (0); 4078 4079 /* l2arc read error; goto zio_read() */ 4080 } else { 4081 DTRACE_PROBE1(l2arc__miss, 4082 arc_buf_hdr_t *, hdr); 4083 ARCSTAT_BUMP(arcstat_l2_misses); 4084 if (HDR_L2_WRITING(hdr)) 4085 ARCSTAT_BUMP(arcstat_l2_rw_clash); 4086 spa_config_exit(spa, SCL_L2ARC, vd); 4087 } 4088 } else { 4089 if (vd != NULL) 4090 spa_config_exit(spa, SCL_L2ARC, vd); 4091 if (l2arc_ndev != 0) { 4092 DTRACE_PROBE1(l2arc__miss, 4093 arc_buf_hdr_t *, hdr); 4094 ARCSTAT_BUMP(arcstat_l2_misses); 4095 } 4096 } 4097 4098 rzio = zio_read(pio, spa, bp, buf->b_data, size, 4099 arc_read_done, buf, priority, zio_flags, zb); 4100 4101 if (*arc_flags & ARC_FLAG_WAIT) 4102 return (zio_wait(rzio)); 4103 4104 ASSERT(*arc_flags & ARC_FLAG_NOWAIT); 4105 zio_nowait(rzio); 4106 } 4107 return (0); 4108} 4109 4110void 4111arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) 4112{ 4113 ASSERT(buf->b_hdr != NULL); 4114 ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon); 4115 ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) || 4116 func == NULL); 4117 ASSERT(buf->b_efunc == NULL); 4118 ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); 4119 4120 buf->b_efunc = func; 4121 buf->b_private = private; 4122} 4123 4124/* 4125 * Notify the arc that a block was freed, and thus will never be used again. 4126 */ 4127void 4128arc_freed(spa_t *spa, const blkptr_t *bp) 4129{ 4130 arc_buf_hdr_t *hdr; 4131 kmutex_t *hash_lock; 4132 uint64_t guid = spa_load_guid(spa); 4133 4134 ASSERT(!BP_IS_EMBEDDED(bp)); 4135 4136 hdr = buf_hash_find(guid, bp, &hash_lock); 4137 if (hdr == NULL) 4138 return; 4139 if (HDR_BUF_AVAILABLE(hdr)) { 4140 arc_buf_t *buf = hdr->b_l1hdr.b_buf; 4141 add_reference(hdr, hash_lock, FTAG); 4142 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; 4143 mutex_exit(hash_lock); 4144 4145 arc_release(buf, FTAG); 4146 (void) arc_buf_remove_ref(buf, FTAG); 4147 } else { 4148 mutex_exit(hash_lock); 4149 } 4150 4151} 4152 4153/* 4154 * Clear the user eviction callback set by arc_set_callback(), first calling 4155 * it if it exists. Because the presence of a callback keeps an arc_buf cached 4156 * clearing the callback may result in the arc_buf being destroyed. However, 4157 * it will not result in the *last* arc_buf being destroyed, hence the data 4158 * will remain cached in the ARC. We make a copy of the arc buffer here so 4159 * that we can process the callback without holding any locks. 4160 * 4161 * It's possible that the callback is already in the process of being cleared 4162 * by another thread. In this case we can not clear the callback. 4163 * 4164 * Returns B_TRUE if the callback was successfully called and cleared. 
4165 */ 4166boolean_t 4167arc_clear_callback(arc_buf_t *buf) 4168{ 4169 arc_buf_hdr_t *hdr; 4170 kmutex_t *hash_lock; 4171 arc_evict_func_t *efunc = buf->b_efunc; 4172 void *private = buf->b_private; 4173 4174 mutex_enter(&buf->b_evict_lock); 4175 hdr = buf->b_hdr; 4176 if (hdr == NULL) { 4177 /* 4178 * We are in arc_do_user_evicts(). 4179 */ 4180 ASSERT(buf->b_data == NULL); 4181 mutex_exit(&buf->b_evict_lock); 4182 return (B_FALSE); 4183 } else if (buf->b_data == NULL) { 4184 /* 4185 * We are on the eviction list; process this buffer now 4186 * but let arc_do_user_evicts() do the reaping. 4187 */ 4188 buf->b_efunc = NULL; 4189 mutex_exit(&buf->b_evict_lock); 4190 VERIFY0(efunc(private)); 4191 return (B_TRUE); 4192 } 4193 hash_lock = HDR_LOCK(hdr); 4194 mutex_enter(hash_lock); 4195 hdr = buf->b_hdr; 4196 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 4197 4198 ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <, 4199 hdr->b_l1hdr.b_datacnt); 4200 ASSERT(hdr->b_l1hdr.b_state == arc_mru || 4201 hdr->b_l1hdr.b_state == arc_mfu); 4202 4203 buf->b_efunc = NULL; 4204 buf->b_private = NULL; 4205 4206 if (hdr->b_l1hdr.b_datacnt > 1) { 4207 mutex_exit(&buf->b_evict_lock); 4208 arc_buf_destroy(buf, FALSE, TRUE); 4209 } else { 4210 ASSERT(buf == hdr->b_l1hdr.b_buf); 4211 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 4212 mutex_exit(&buf->b_evict_lock); 4213 } 4214 4215 mutex_exit(hash_lock); 4216 VERIFY0(efunc(private)); 4217 return (B_TRUE); 4218} 4219 4220/* 4221 * Release this buffer from the cache, making it an anonymous buffer. This 4222 * must be done after a read and prior to modifying the buffer contents. 4223 * If the buffer has more than one reference, we must make 4224 * a new hdr for the buffer. 4225 */ 4226void 4227arc_release(arc_buf_t *buf, void *tag) 4228{ 4229 arc_buf_hdr_t *hdr = buf->b_hdr; 4230 4231 /* 4232 * It would be nice to assert that if it's DMU metadata (level > 4233 * 0 || it's the dnode file), then it must be syncing context. 4234 * But we don't know that information at this level. 4235 */ 4236 4237 mutex_enter(&buf->b_evict_lock); 4238 /* 4239 * We don't grab the hash lock prior to this check, because if 4240 * the buffer's header is in the arc_anon state, it won't be 4241 * linked into the hash table. 4242 */ 4243 if (hdr->b_l1hdr.b_state == arc_anon) { 4244 mutex_exit(&buf->b_evict_lock); 4245 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4246 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 4247 ASSERT(!HDR_HAS_L2HDR(hdr)); 4248 ASSERT(BUF_EMPTY(hdr)); 4249 ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1); 4250 ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); 4251 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 4252 4253 ASSERT3P(buf->b_efunc, ==, NULL); 4254 ASSERT3P(buf->b_private, ==, NULL); 4255 4256 hdr->b_l1hdr.b_arc_access = 0; 4257 arc_buf_thaw(buf); 4258 4259 return; 4260 } 4261 4262 kmutex_t *hash_lock = HDR_LOCK(hdr); 4263 mutex_enter(hash_lock); 4264 4265 /* 4266 * This assignment is only valid as long as the hash_lock is 4267 * held, we must be careful not to reference state or the 4268 * b_state field after dropping the lock. 
4269 */ 4270 arc_state_t *state = hdr->b_l1hdr.b_state; 4271 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 4272 ASSERT3P(state, !=, arc_anon); 4273 4274 /* this buffer is not on any list */ 4275 ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0); 4276 4277 if (HDR_HAS_L2HDR(hdr)) { 4278 mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); 4279 4280 /* 4281 * We have to recheck this conditional again now that 4282 * we're holding the l2ad_mtx to prevent a race with 4283 * another thread which might be concurrently calling 4284 * l2arc_evict(). In that case, l2arc_evict() might have 4285 * destroyed the header's L2 portion as we were waiting 4286 * to acquire the l2ad_mtx. 4287 */ 4288 if (HDR_HAS_L2HDR(hdr)) { 4289 if (hdr->b_l2hdr.b_daddr != L2ARC_ADDR_UNSET) 4290 trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev, 4291 hdr->b_l2hdr.b_daddr, 4292 hdr->b_l2hdr.b_asize, 0); 4293 arc_hdr_l2hdr_destroy(hdr); 4294 } 4295 4296 mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); 4297 } 4298 4299 /* 4300 * Do we have more than one buf? 4301 */ 4302 if (hdr->b_l1hdr.b_datacnt > 1) { 4303 arc_buf_hdr_t *nhdr; 4304 arc_buf_t **bufp; 4305 uint64_t blksz = hdr->b_size; 4306 uint64_t spa = hdr->b_spa; 4307 arc_buf_contents_t type = arc_buf_type(hdr); 4308 uint32_t flags = hdr->b_flags; 4309 4310 ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); 4311 /* 4312 * Pull the data off of this hdr and attach it to 4313 * a new anonymous hdr. 4314 */ 4315 (void) remove_reference(hdr, hash_lock, tag); 4316 bufp = &hdr->b_l1hdr.b_buf; 4317 while (*bufp != buf) 4318 bufp = &(*bufp)->b_next; 4319 *bufp = buf->b_next; 4320 buf->b_next = NULL; 4321 4322 ASSERT3P(state, !=, arc_l2c_only); 4323 ASSERT3U(state->arcs_size, >=, hdr->b_size); 4324 atomic_add_64(&state->arcs_size, -hdr->b_size); 4325 if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { 4326 ASSERT3P(state, !=, arc_l2c_only); 4327 uint64_t *size = &state->arcs_lsize[type]; 4328 ASSERT3U(*size, >=, hdr->b_size); 4329 atomic_add_64(size, -hdr->b_size); 4330 } 4331 4332 /* 4333 * We're releasing a duplicate user data buffer, update 4334 * our statistics accordingly. 
4335 */ 4336 if (HDR_ISTYPE_DATA(hdr)) { 4337 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 4338 ARCSTAT_INCR(arcstat_duplicate_buffers_size, 4339 -hdr->b_size); 4340 } 4341 hdr->b_l1hdr.b_datacnt -= 1; 4342 arc_cksum_verify(buf); 4343#ifdef illumos 4344 arc_buf_unwatch(buf); 4345#endif /* illumos */ 4346 4347 mutex_exit(hash_lock); 4348 4349 nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); 4350 nhdr->b_size = blksz; 4351 nhdr->b_spa = spa; 4352 4353 nhdr->b_flags = flags & ARC_FLAG_L2_WRITING; 4354 nhdr->b_flags |= arc_bufc_to_flags(type); 4355 nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; 4356 4357 nhdr->b_l1hdr.b_buf = buf; 4358 nhdr->b_l1hdr.b_datacnt = 1; 4359 nhdr->b_l1hdr.b_state = arc_anon; 4360 nhdr->b_l1hdr.b_arc_access = 0; 4361 nhdr->b_freeze_cksum = NULL; 4362 4363 (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); 4364 buf->b_hdr = nhdr; 4365 mutex_exit(&buf->b_evict_lock); 4366 atomic_add_64(&arc_anon->arcs_size, blksz); 4367 } else { 4368 mutex_exit(&buf->b_evict_lock); 4369 ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); 4370 /* protected by hash lock */ 4371 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 4372 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4373 arc_change_state(arc_anon, hdr, hash_lock); 4374 hdr->b_l1hdr.b_arc_access = 0; 4375 mutex_exit(hash_lock); 4376 4377 buf_discard_identity(hdr); 4378 arc_buf_thaw(buf); 4379 } 4380 buf->b_efunc = NULL; 4381 buf->b_private = NULL; 4382} 4383 4384int 4385arc_released(arc_buf_t *buf) 4386{ 4387 int released; 4388 4389 mutex_enter(&buf->b_evict_lock); 4390 released = (buf->b_data != NULL && 4391 buf->b_hdr->b_l1hdr.b_state == arc_anon); 4392 mutex_exit(&buf->b_evict_lock); 4393 return (released); 4394} 4395 4396#ifdef ZFS_DEBUG 4397int 4398arc_referenced(arc_buf_t *buf) 4399{ 4400 int referenced; 4401 4402 mutex_enter(&buf->b_evict_lock); 4403 referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); 4404 mutex_exit(&buf->b_evict_lock); 4405 return (referenced); 4406} 4407#endif 4408 4409static void 4410arc_write_ready(zio_t *zio) 4411{ 4412 arc_write_callback_t *callback = zio->io_private; 4413 arc_buf_t *buf = callback->awcb_buf; 4414 arc_buf_hdr_t *hdr = buf->b_hdr; 4415 4416 ASSERT(HDR_HAS_L1HDR(hdr)); 4417 ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); 4418 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 4419 callback->awcb_ready(zio, buf, callback->awcb_private); 4420 4421 /* 4422 * If the IO is already in progress, then this is a re-write 4423 * attempt, so we need to thaw and re-compute the cksum. 4424 * It is the responsibility of the callback to handle the 4425 * accounting for any re-write attempt. 4426 */ 4427 if (HDR_IO_IN_PROGRESS(hdr)) { 4428 mutex_enter(&hdr->b_l1hdr.b_freeze_lock); 4429 if (hdr->b_freeze_cksum != NULL) { 4430 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 4431 hdr->b_freeze_cksum = NULL; 4432 } 4433 mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 4434 } 4435 arc_cksum_compute(buf, B_FALSE); 4436 hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; 4437} 4438 4439/* 4440 * The SPA calls this callback for each physical write that happens on behalf 4441 * of a logical write. See the comment in dbuf_write_physdone() for details. 
4442 */ 4443static void 4444arc_write_physdone(zio_t *zio) 4445{ 4446 arc_write_callback_t *cb = zio->io_private; 4447 if (cb->awcb_physdone != NULL) 4448 cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); 4449} 4450 4451static void 4452arc_write_done(zio_t *zio) 4453{ 4454 arc_write_callback_t *callback = zio->io_private; 4455 arc_buf_t *buf = callback->awcb_buf; 4456 arc_buf_hdr_t *hdr = buf->b_hdr; 4457 4458 ASSERT(hdr->b_l1hdr.b_acb == NULL); 4459 4460 if (zio->io_error == 0) { 4461 if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { 4462 buf_discard_identity(hdr); 4463 } else { 4464 hdr->b_dva = *BP_IDENTITY(zio->io_bp); 4465 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); 4466 } 4467 } else { 4468 ASSERT(BUF_EMPTY(hdr)); 4469 } 4470 4471 /* 4472 * If the block to be written was all-zero or compressed enough to be 4473 * embedded in the BP, no write was performed so there will be no 4474 * dva/birth/checksum. The buffer must therefore remain anonymous 4475 * (and uncached). 4476 */ 4477 if (!BUF_EMPTY(hdr)) { 4478 arc_buf_hdr_t *exists; 4479 kmutex_t *hash_lock; 4480 4481 ASSERT(zio->io_error == 0); 4482 4483 arc_cksum_verify(buf); 4484 4485 exists = buf_hash_insert(hdr, &hash_lock); 4486 if (exists != NULL) { 4487 /* 4488 * This can only happen if we overwrite for 4489 * sync-to-convergence, because we remove 4490 * buffers from the hash table when we arc_free(). 4491 */ 4492 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { 4493 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 4494 panic("bad overwrite, hdr=%p exists=%p", 4495 (void *)hdr, (void *)exists); 4496 ASSERT(refcount_is_zero( 4497 &exists->b_l1hdr.b_refcnt)); 4498 arc_change_state(arc_anon, exists, hash_lock); 4499 mutex_exit(hash_lock); 4500 arc_hdr_destroy(exists); 4501 exists = buf_hash_insert(hdr, &hash_lock); 4502 ASSERT3P(exists, ==, NULL); 4503 } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { 4504 /* nopwrite */ 4505 ASSERT(zio->io_prop.zp_nopwrite); 4506 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 4507 panic("bad nopwrite, hdr=%p exists=%p", 4508 (void *)hdr, (void *)exists); 4509 } else { 4510 /* Dedup */ 4511 ASSERT(hdr->b_l1hdr.b_datacnt == 1); 4512 ASSERT(hdr->b_l1hdr.b_state == arc_anon); 4513 ASSERT(BP_GET_DEDUP(zio->io_bp)); 4514 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); 4515 } 4516 } 4517 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; 4518 /* if it's not anon, we are doing a scrub */ 4519 if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) 4520 arc_access(hdr, hash_lock); 4521 mutex_exit(hash_lock); 4522 } else { 4523 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; 4524 } 4525 4526 ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 4527 callback->awcb_done(zio, buf, callback->awcb_private); 4528 4529 kmem_free(callback, sizeof (arc_write_callback_t)); 4530} 4531 4532zio_t * 4533arc_write(zio_t *pio, spa_t *spa, uint64_t txg, 4534 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, 4535 const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone, 4536 arc_done_func_t *done, void *private, zio_priority_t priority, 4537 int zio_flags, const zbookmark_phys_t *zb) 4538{ 4539 arc_buf_hdr_t *hdr = buf->b_hdr; 4540 arc_write_callback_t *callback; 4541 zio_t *zio; 4542 4543 ASSERT(ready != NULL); 4544 ASSERT(done != NULL); 4545 ASSERT(!HDR_IO_ERROR(hdr)); 4546 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4547 ASSERT(hdr->b_l1hdr.b_acb == NULL); 4548 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 4549 if (l2arc) 4550 hdr->b_flags |= ARC_FLAG_L2CACHE; 4551 if (l2arc_compress) 4552 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 
4553 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 4554 callback->awcb_ready = ready; 4555 callback->awcb_physdone = physdone; 4556 callback->awcb_done = done; 4557 callback->awcb_private = private; 4558 callback->awcb_buf = buf; 4559 4560 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, 4561 arc_write_ready, arc_write_physdone, arc_write_done, callback, 4562 priority, zio_flags, zb); 4563 4564 return (zio); 4565} 4566 4567static int 4568arc_memory_throttle(uint64_t reserve, uint64_t txg) 4569{ 4570#ifdef _KERNEL 4571 uint64_t available_memory = ptob(freemem); 4572 static uint64_t page_load = 0; 4573 static uint64_t last_txg = 0; 4574 4575#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) 4576 available_memory = 4577 MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE))); 4578#endif 4579 4580 if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100) 4581 return (0); 4582 4583 if (txg > last_txg) { 4584 last_txg = txg; 4585 page_load = 0; 4586 } 4587 /* 4588 * If we are in pageout, we know that memory is already tight, 4589 * the arc is already going to be evicting, so we just want to 4590 * continue to let page writes occur as quickly as possible. 4591 */ 4592 if (curproc == pageproc) { 4593 if (page_load > MAX(ptob(minfree), available_memory) / 4) 4594 return (SET_ERROR(ERESTART)); 4595 /* Note: reserve is inflated, so we deflate */ 4596 page_load += reserve / 8; 4597 return (0); 4598 } else if (page_load > 0 && arc_reclaim_needed()) { 4599 /* memory is low, delay before restarting */ 4600 ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 4601 return (SET_ERROR(EAGAIN)); 4602 } 4603 page_load = 0; 4604#endif 4605 return (0); 4606} 4607 4608void 4609arc_tempreserve_clear(uint64_t reserve) 4610{ 4611 atomic_add_64(&arc_tempreserve, -reserve); 4612 ASSERT((int64_t)arc_tempreserve >= 0); 4613} 4614 4615int 4616arc_tempreserve_space(uint64_t reserve, uint64_t txg) 4617{ 4618 int error; 4619 uint64_t anon_size; 4620 4621 if (reserve > arc_c/4 && !arc_no_grow) { 4622 arc_c = MIN(arc_c_max, reserve * 4); 4623 DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c); 4624 } 4625 if (reserve > arc_c) 4626 return (SET_ERROR(ENOMEM)); 4627 4628 /* 4629 * Don't count loaned bufs as in flight dirty data to prevent long 4630 * network delays from blocking transactions that are ready to be 4631 * assigned to a txg. 4632 */ 4633 anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0); 4634 4635 /* 4636 * Writes will, almost always, require additional memory allocations 4637 * in order to compress/encrypt/etc the data. We therefore need to 4638 * make sure that there is sufficient available memory for this. 4639 */ 4640 error = arc_memory_throttle(reserve, txg); 4641 if (error != 0) 4642 return (error); 4643 4644 /* 4645 * Throttle writes when the amount of dirty data in the cache 4646 * gets too large. We try to keep the cache less than half full 4647 * of dirty blocks so that our sync times don't grow too large. 4648 * Note: if two requests come in concurrently, we might let them 4649 * both succeed, when one of them should fail. Not a huge deal. 
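 *
 * As an illustrative worked example of the check below: with
 * arc_c = 4 GB the two thresholds are 2 GB and 1 GB, so a request to
 * reserve 64 MB is turned back with ERESTART only if the new reserve
 * plus the outstanding arc_tempreserve plus the anonymous (dirty)
 * data already exceeds roughly 2 GB *and* the anonymous data alone
 * exceeds 1 GB.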
4650 */ 4651 4652 if (reserve + arc_tempreserve + anon_size > arc_c / 2 && 4653 anon_size > arc_c / 4) { 4654 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " 4655 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", 4656 arc_tempreserve>>10, 4657 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, 4658 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, 4659 reserve>>10, arc_c>>10); 4660 return (SET_ERROR(ERESTART)); 4661 } 4662 atomic_add_64(&arc_tempreserve, reserve); 4663 return (0); 4664} 4665 4666static void 4667arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, 4668 kstat_named_t *evict_data, kstat_named_t *evict_metadata) 4669{ 4670 size->value.ui64 = state->arcs_size; 4671 evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA]; 4672 evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA]; 4673} 4674 4675static int 4676arc_kstat_update(kstat_t *ksp, int rw) 4677{ 4678 arc_stats_t *as = ksp->ks_data; 4679 4680 if (rw == KSTAT_WRITE) { 4681 return (EACCES); 4682 } else { 4683 arc_kstat_update_state(arc_anon, 4684 &as->arcstat_anon_size, 4685 &as->arcstat_anon_evictable_data, 4686 &as->arcstat_anon_evictable_metadata); 4687 arc_kstat_update_state(arc_mru, 4688 &as->arcstat_mru_size, 4689 &as->arcstat_mru_evictable_data, 4690 &as->arcstat_mru_evictable_metadata); 4691 arc_kstat_update_state(arc_mru_ghost, 4692 &as->arcstat_mru_ghost_size, 4693 &as->arcstat_mru_ghost_evictable_data, 4694 &as->arcstat_mru_ghost_evictable_metadata); 4695 arc_kstat_update_state(arc_mfu, 4696 &as->arcstat_mfu_size, 4697 &as->arcstat_mfu_evictable_data, 4698 &as->arcstat_mfu_evictable_metadata); 4699 arc_kstat_update_state(arc_mfu_ghost, 4700 &as->arcstat_mfu_ghost_size, 4701 &as->arcstat_mfu_ghost_evictable_data, 4702 &as->arcstat_mfu_ghost_evictable_metadata); 4703 } 4704 4705 return (0); 4706} 4707 4708#ifdef _KERNEL 4709static eventhandler_tag arc_event_lowmem = NULL; 4710 4711static void 4712arc_lowmem(void *arg __unused, int howto __unused) 4713{ 4714 4715 mutex_enter(&arc_reclaim_thr_lock); 4716 /* XXX: Memory deficit should be passed as argument. */ 4717 needfree = btoc(arc_c >> arc_shrink_shift); 4718 DTRACE_PROBE(arc__needfree); 4719 cv_signal(&arc_reclaim_thr_cv); 4720 4721 /* 4722 * It is unsafe to block here in arbitrary threads, because we can come 4723 * here from ARC itself and may hold ARC locks and thus risk a deadlock 4724 * with ARC reclaim thread. 4725 */ 4726 if (curproc == pageproc) 4727 msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0); 4728 mutex_exit(&arc_reclaim_thr_lock); 4729} 4730#endif 4731 4732void 4733arc_init(void) 4734{ 4735 int i, prefetch_tunable_set = 0; 4736 4737 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); 4738 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); 4739 4740 /* Convert seconds to clock ticks */ 4741 arc_min_prefetch_lifespan = 1 * hz; 4742 4743 /* Start out with 1/8 of all memory */ 4744 arc_c = kmem_size() / 8; 4745 4746#ifdef sun 4747#ifdef _KERNEL 4748 /* 4749 * On architectures where the physical memory can be larger 4750 * than the addressable space (intel in 32-bit mode), we may 4751 * need to limit the cache to 1/8 of VM size. 
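 *
 * As an illustrative example of the cap applied below: if the kernel
 * heap arena spans only 2 GB, arc_c is limited to 2 GB / 8 = 256 MB
 * here, no matter how much physical memory is installed.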
4752 */ 4753 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); 4754#endif 4755#endif /* sun */ 4756 /* set min cache to 1/32 of all memory, or 16MB, whichever is more */ 4757 arc_c_min = MAX(arc_c / 4, 16 << 20); 4758 /* set max to 1/2 of all memory, or all but 1GB, whichever is more */ 4759 if (arc_c * 8 >= 1 << 30) 4760 arc_c_max = (arc_c * 8) - (1 << 30); 4761 else 4762 arc_c_max = arc_c_min; 4763 arc_c_max = MAX(arc_c * 5, arc_c_max); 4764 4765#ifdef _KERNEL 4766 /* 4767 * Allow the tunables to override our calculations if they are 4768 * reasonable (ie. over 16MB) 4769 */ 4770 if (zfs_arc_max > 16 << 20 && zfs_arc_max < kmem_size()) 4771 arc_c_max = zfs_arc_max; 4772 if (zfs_arc_min > 16 << 20 && zfs_arc_min <= arc_c_max) 4773 arc_c_min = zfs_arc_min; 4774#endif 4775 4776 arc_c = arc_c_max; 4777 arc_p = (arc_c >> 1); 4778 4779 /* limit meta-data to 1/4 of the arc capacity */ 4780 arc_meta_limit = arc_c_max / 4; 4781 4782 /* Allow the tunable to override if it is reasonable */ 4783 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) 4784 arc_meta_limit = zfs_arc_meta_limit; 4785 4786 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) 4787 arc_c_min = arc_meta_limit / 2; 4788 4789 if (zfs_arc_meta_min > 0) { 4790 arc_meta_min = zfs_arc_meta_min; 4791 } else { 4792 arc_meta_min = arc_c_min / 2; 4793 } 4794 4795 if (zfs_arc_grow_retry > 0) 4796 arc_grow_retry = zfs_arc_grow_retry; 4797 4798 if (zfs_arc_shrink_shift > 0) 4799 arc_shrink_shift = zfs_arc_shrink_shift; 4800 4801 /* 4802 * Ensure that arc_no_grow_shift is less than arc_shrink_shift. 4803 */ 4804 if (arc_no_grow_shift >= arc_shrink_shift) 4805 arc_no_grow_shift = arc_shrink_shift - 1; 4806 4807 if (zfs_arc_p_min_shift > 0) 4808 arc_p_min_shift = zfs_arc_p_min_shift; 4809 4810 /* if kmem_flags are set, lets try to use less memory */ 4811 if (kmem_debugging()) 4812 arc_c = arc_c / 2; 4813 if (arc_c < arc_c_min) 4814 arc_c = arc_c_min; 4815 4816 zfs_arc_min = arc_c_min; 4817 zfs_arc_max = arc_c_max; 4818 4819 arc_anon = &ARC_anon; 4820 arc_mru = &ARC_mru; 4821 arc_mru_ghost = &ARC_mru_ghost; 4822 arc_mfu = &ARC_mfu; 4823 arc_mfu_ghost = &ARC_mfu_ghost; 4824 arc_l2c_only = &ARC_l2c_only; 4825 arc_size = 0; 4826 4827 mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 4828 mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 4829 mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 4830 mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 4831 mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 4832 mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 4833 4834 list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], 4835 sizeof (arc_buf_hdr_t), 4836 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4837 list_create(&arc_mru->arcs_list[ARC_BUFC_DATA], 4838 sizeof (arc_buf_hdr_t), 4839 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4840 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], 4841 sizeof (arc_buf_hdr_t), 4842 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4843 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], 4844 sizeof (arc_buf_hdr_t), 4845 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4846 list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], 4847 sizeof (arc_buf_hdr_t), 4848 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4849 list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], 4850 sizeof (arc_buf_hdr_t), 4851 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4852 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], 4853 sizeof 
(arc_buf_hdr_t), 4854 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4855 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], 4856 sizeof (arc_buf_hdr_t), 4857 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4858 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], 4859 sizeof (arc_buf_hdr_t), 4860 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4861 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], 4862 sizeof (arc_buf_hdr_t), 4863 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4864 4865 buf_init(); 4866 4867 arc_thread_exit = 0; 4868 arc_eviction_list = NULL; 4869 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); 4870 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); 4871 4872 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, 4873 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 4874 4875 if (arc_ksp != NULL) { 4876 arc_ksp->ks_data = &arc_stats; 4877 arc_ksp->ks_update = arc_kstat_update; 4878 kstat_install(arc_ksp); 4879 } 4880 4881 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 4882 TS_RUN, minclsyspri); 4883 4884#ifdef _KERNEL 4885 arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL, 4886 EVENTHANDLER_PRI_FIRST); 4887#endif 4888 4889 arc_dead = FALSE; 4890 arc_warm = B_FALSE; 4891 4892 /* 4893 * Calculate maximum amount of dirty data per pool. 4894 * 4895 * If it has been set by /etc/system, take that. 4896 * Otherwise, use a percentage of physical memory defined by 4897 * zfs_dirty_data_max_percent (default 10%) with a cap at 4898 * zfs_dirty_data_max_max (default 4GB). 4899 */ 4900 if (zfs_dirty_data_max == 0) { 4901 zfs_dirty_data_max = ptob(physmem) * 4902 zfs_dirty_data_max_percent / 100; 4903 zfs_dirty_data_max = MIN(zfs_dirty_data_max, 4904 zfs_dirty_data_max_max); 4905 } 4906 4907#ifdef _KERNEL 4908 if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable)) 4909 prefetch_tunable_set = 1; 4910 4911#ifdef __i386__ 4912 if (prefetch_tunable_set == 0) { 4913 printf("ZFS NOTICE: Prefetch is disabled by default on i386 " 4914 "-- to enable,\n"); 4915 printf(" add \"vfs.zfs.prefetch_disable=0\" " 4916 "to /boot/loader.conf.\n"); 4917 zfs_prefetch_disable = 1; 4918 } 4919#else 4920 if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) && 4921 prefetch_tunable_set == 0) { 4922 printf("ZFS NOTICE: Prefetch is disabled by default if less " 4923 "than 4GB of RAM is present;\n" 4924 " to enable, add \"vfs.zfs.prefetch_disable=0\" " 4925 "to /boot/loader.conf.\n"); 4926 zfs_prefetch_disable = 1; 4927 } 4928#endif 4929 /* Warn about ZFS memory and address space requirements. 
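 * The first check below fires when physical memory is under
 * (256 + 128 + 64) MB = 448 MB, i.e. below the 512 MB that the
 * warning text recommends as a practical minimum.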
*/ 4930 if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) { 4931 printf("ZFS WARNING: Recommended minimum RAM size is 512MB; " 4932 "expect unstable behavior.\n"); 4933 } 4934 if (kmem_size() < 512 * (1 << 20)) { 4935 printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; " 4936 "expect unstable behavior.\n"); 4937 printf(" Consider tuning vm.kmem_size and " 4938 "vm.kmem_size_max\n"); 4939 printf(" in /boot/loader.conf.\n"); 4940 } 4941#endif 4942} 4943 4944void 4945arc_fini(void) 4946{ 4947 mutex_enter(&arc_reclaim_thr_lock); 4948 arc_thread_exit = 1; 4949 cv_signal(&arc_reclaim_thr_cv); 4950 while (arc_thread_exit != 0) 4951 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); 4952 mutex_exit(&arc_reclaim_thr_lock); 4953 4954 arc_flush(NULL); 4955 4956 arc_dead = TRUE; 4957 4958 if (arc_ksp != NULL) { 4959 kstat_delete(arc_ksp); 4960 arc_ksp = NULL; 4961 } 4962 4963 mutex_destroy(&arc_eviction_mtx); 4964 mutex_destroy(&arc_reclaim_thr_lock); 4965 cv_destroy(&arc_reclaim_thr_cv); 4966 4967 list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); 4968 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); 4969 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); 4970 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); 4971 list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); 4972 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); 4973 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); 4974 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); 4975 4976 mutex_destroy(&arc_anon->arcs_mtx); 4977 mutex_destroy(&arc_mru->arcs_mtx); 4978 mutex_destroy(&arc_mru_ghost->arcs_mtx); 4979 mutex_destroy(&arc_mfu->arcs_mtx); 4980 mutex_destroy(&arc_mfu_ghost->arcs_mtx); 4981 mutex_destroy(&arc_l2c_only->arcs_mtx); 4982 4983 buf_fini(); 4984 4985 ASSERT0(arc_loaned_bytes); 4986 4987#ifdef _KERNEL 4988 if (arc_event_lowmem != NULL) 4989 EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); 4990#endif 4991} 4992 4993/* 4994 * Level 2 ARC 4995 * 4996 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. 4997 * It uses dedicated storage devices to hold cached data, which are populated 4998 * using large infrequent writes. The main role of this cache is to boost 4999 * the performance of random read workloads. The intended L2ARC devices 5000 * include short-stroked disks, solid state disks, and other media with 5001 * substantially faster read latency than disk. 5002 * 5003 * +-----------------------+ 5004 * | ARC | 5005 * +-----------------------+ 5006 * | ^ ^ 5007 * | | | 5008 * l2arc_feed_thread() arc_read() 5009 * | | | 5010 * | l2arc read | 5011 * V | | 5012 * +---------------+ | 5013 * | L2ARC | | 5014 * +---------------+ | 5015 * | ^ | 5016 * l2arc_write() | | 5017 * | | | 5018 * V | | 5019 * +-------+ +-------+ 5020 * | vdev | | vdev | 5021 * | cache | | cache | 5022 * +-------+ +-------+ 5023 * +=========+ .-----. 5024 * : L2ARC : |-_____-| 5025 * : devices : | Disks | 5026 * +=========+ `-_____-' 5027 * 5028 * Read requests are satisfied from the following sources, in order: 5029 * 5030 * 1) ARC 5031 * 2) vdev cache of L2ARC devices 5032 * 3) L2ARC devices 5033 * 4) vdev cache of disks 5034 * 5) disks 5035 * 5036 * Some L2ARC device types exhibit extremely slow write performance. 5037 * To accommodate for this there are some significant differences between 5038 * the L2ARC and traditional cache design: 5039 * 5040 * 1. There is no eviction path from the ARC to the L2ARC. 
Evictions from 5041 * the ARC behave as usual, freeing buffers and placing headers on ghost 5042 * lists. The ARC does not send buffers to the L2ARC during eviction as 5043 * this would add inflated write latencies for all ARC memory pressure. 5044 * 5045 * 2. The L2ARC attempts to cache data from the ARC before it is evicted. 5046 * It does this by periodically scanning buffers from the eviction-end of 5047 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are 5048 * not already there. It scans until a headroom of buffers is satisfied, 5049 * which itself is a buffer for ARC eviction. If a compressible buffer is 5050 * found during scanning and selected for writing to an L2ARC device, we 5051 * temporarily boost scanning headroom during the next scan cycle to make 5052 * sure we adapt to compression effects (which might significantly reduce 5053 * the data volume we write to L2ARC). The thread that does this is 5054 * l2arc_feed_thread(), illustrated below; example sizes are included to 5055 * provide a better sense of ratio than this diagram: 5056 * 5057 * head --> tail 5058 * +---------------------+----------+ 5059 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC 5060 * +---------------------+----------+ | o L2ARC eligible 5061 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer 5062 * +---------------------+----------+ | 5063 * 15.9 Gbytes ^ 32 Mbytes | 5064 * headroom | 5065 * l2arc_feed_thread() 5066 * | 5067 * l2arc write hand <--[oooo]--' 5068 * | 8 Mbyte 5069 * | write max 5070 * V 5071 * +==============================+ 5072 * L2ARC dev |####|#|###|###| |####| ... | 5073 * +==============================+ 5074 * 32 Gbytes 5075 * 5076 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of 5077 * evicted, then the L2ARC has cached a buffer much sooner than it probably 5078 * needed to, potentially wasting L2ARC device bandwidth and storage. It is 5079 * safe to say that this is an uncommon case, since buffers at the end of 5080 * the ARC lists have moved there due to inactivity. 5081 * 5082 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, 5083 * then the L2ARC simply misses copying some buffers. This serves as a 5084 * pressure valve to prevent heavy read workloads from both stalling the ARC 5085 * with waits and clogging the L2ARC with writes. This also helps prevent 5086 * the potential for the L2ARC to churn if it attempts to cache content too 5087 * quickly, such as during backups of the entire pool. 5088 * 5089 * 5. After system boot and before the ARC has filled main memory, there are 5090 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru 5091 * lists can remain mostly static. Instead of searching from tail of these 5092 * lists as pictured, the l2arc_feed_thread() will search from the list heads 5093 * for eligible buffers, greatly increasing its chance of finding them. 5094 * 5095 * The L2ARC device write speed is also boosted during this time so that 5096 * the L2ARC warms up faster. Since there have been no ARC evictions yet, 5097 * there are no L2ARC reads, and no fear of degrading read performance 5098 * through increased writes. 5099 * 5100 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that 5101 * the vdev queue can aggregate them into larger and fewer writes. Each 5102 * device is written to in a rotor fashion, sweeping writes through 5103 * available space then repeating. 5104 * 5105 * 7. The L2ARC does not store dirty content. 
It never needs to flush 5106 * write buffers back to disk based storage. 5107 * 5108 * 8. If an ARC buffer is written (and dirtied) which also exists in the 5109 * L2ARC, the now stale L2ARC buffer is immediately dropped. 5110 * 5111 * The performance of the L2ARC can be tweaked by a number of tunables, which 5112 * may be necessary for different workloads: 5113 * 5114 * l2arc_write_max max write bytes per interval 5115 * l2arc_write_boost extra write bytes during device warmup 5116 * l2arc_noprefetch skip caching prefetched buffers 5117 * l2arc_headroom number of max device writes to precache 5118 * l2arc_headroom_boost when we find compressed buffers during ARC 5119 * scanning, we multiply headroom by this 5120 * percentage factor for the next scan cycle, 5121 * since more compressed buffers are likely to 5122 * be present 5123 * l2arc_feed_secs seconds between L2ARC writing 5124 * 5125 * Tunables may be removed or added as future performance improvements are 5126 * integrated, and also may become zpool properties. 5127 * 5128 * There are three key functions that control how the L2ARC warms up: 5129 * 5130 * l2arc_write_eligible() check if a buffer is eligible to cache 5131 * l2arc_write_size() calculate how much to write 5132 * l2arc_write_interval() calculate sleep delay between writes 5133 * 5134 * These three functions determine what to write, how much, and how quickly 5135 * to send writes. 5136 */ 5137 5138static boolean_t 5139l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) 5140{ 5141 /* 5142 * A buffer is *not* eligible for the L2ARC if it: 5143 * 1. belongs to a different spa. 5144 * 2. is already cached on the L2ARC. 5145 * 3. has an I/O in progress (it may be an incomplete read). 5146 * 4. is flagged not eligible (zfs property). 5147 */ 5148 if (hdr->b_spa != spa_guid) { 5149 ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch); 5150 return (B_FALSE); 5151 } 5152 if (HDR_HAS_L2HDR(hdr)) { 5153 ARCSTAT_BUMP(arcstat_l2_write_in_l2); 5154 return (B_FALSE); 5155 } 5156 if (HDR_IO_IN_PROGRESS(hdr)) { 5157 ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress); 5158 return (B_FALSE); 5159 } 5160 if (!HDR_L2CACHE(hdr)) { 5161 ARCSTAT_BUMP(arcstat_l2_write_not_cacheable); 5162 return (B_FALSE); 5163 } 5164 5165 return (B_TRUE); 5166} 5167 5168static uint64_t 5169l2arc_write_size(void) 5170{ 5171 uint64_t size; 5172 5173 /* 5174 * Make sure our globals have meaningful values in case the user 5175 * altered them. 5176 */ 5177 size = l2arc_write_max; 5178 if (size == 0) { 5179 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " 5180 "be greater than zero, resetting it to the default (%d)", 5181 L2ARC_WRITE_SIZE); 5182 size = l2arc_write_max = L2ARC_WRITE_SIZE; 5183 } 5184 5185 if (arc_warm == B_FALSE) 5186 size += l2arc_write_boost; 5187 5188 return (size); 5189 5190} 5191 5192static clock_t 5193l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) 5194{ 5195 clock_t interval, next, now; 5196 5197 /* 5198 * If the ARC lists are busy, increase our write rate; if the 5199 * lists are stale, idle back. This is achieved by checking 5200 * how much we previously wrote - if it was more than half of 5201 * what we wanted, schedule the next write much sooner. 
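 *
 * For example, with the shipped defaults (l2arc_feed_secs = 1,
 * l2arc_feed_min_ms = 200, l2arc_feed_again enabled) and assuming those
 * tunables are untouched: writing more than half of the wanted bytes
 * cuts the delay from hz ticks (one second) to hz / 5 ticks (200 ms).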
5202 */ 5203 if (l2arc_feed_again && wrote > (wanted / 2)) 5204 interval = (hz * l2arc_feed_min_ms) / 1000; 5205 else 5206 interval = hz * l2arc_feed_secs; 5207 5208 now = ddi_get_lbolt(); 5209 next = MAX(now, MIN(now + interval, began + interval)); 5210 5211 return (next); 5212} 5213 5214/* 5215 * Cycle through L2ARC devices. This is how L2ARC load balances. 5216 * If a device is returned, this also returns holding the spa config lock. 5217 */ 5218static l2arc_dev_t * 5219l2arc_dev_get_next(void) 5220{ 5221 l2arc_dev_t *first, *next = NULL; 5222 5223 /* 5224 * Lock out the removal of spas (spa_namespace_lock), then removal 5225 * of cache devices (l2arc_dev_mtx). Once a device has been selected, 5226 * both locks will be dropped and a spa config lock held instead. 5227 */ 5228 mutex_enter(&spa_namespace_lock); 5229 mutex_enter(&l2arc_dev_mtx); 5230 5231 /* if there are no vdevs, there is nothing to do */ 5232 if (l2arc_ndev == 0) 5233 goto out; 5234 5235 first = NULL; 5236 next = l2arc_dev_last; 5237 do { 5238 /* loop around the list looking for a non-faulted vdev */ 5239 if (next == NULL) { 5240 next = list_head(l2arc_dev_list); 5241 } else { 5242 next = list_next(l2arc_dev_list, next); 5243 if (next == NULL) 5244 next = list_head(l2arc_dev_list); 5245 } 5246 5247 /* if we have come back to the start, bail out */ 5248 if (first == NULL) 5249 first = next; 5250 else if (next == first) 5251 break; 5252 5253 } while (vdev_is_dead(next->l2ad_vdev)); 5254 5255 /* if we were unable to find any usable vdevs, return NULL */ 5256 if (vdev_is_dead(next->l2ad_vdev)) 5257 next = NULL; 5258 5259 l2arc_dev_last = next; 5260 5261out: 5262 mutex_exit(&l2arc_dev_mtx); 5263 5264 /* 5265 * Grab the config lock to prevent the 'next' device from being 5266 * removed while we are writing to it. 5267 */ 5268 if (next != NULL) 5269 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); 5270 mutex_exit(&spa_namespace_lock); 5271 5272 return (next); 5273} 5274 5275/* 5276 * Free buffers that were tagged for destruction. 5277 */ 5278static void 5279l2arc_do_free_on_write() 5280{ 5281 list_t *buflist; 5282 l2arc_data_free_t *df, *df_prev; 5283 5284 mutex_enter(&l2arc_free_on_write_mtx); 5285 buflist = l2arc_free_on_write; 5286 5287 for (df = list_tail(buflist); df; df = df_prev) { 5288 df_prev = list_prev(buflist, df); 5289 ASSERT(df->l2df_data != NULL); 5290 ASSERT(df->l2df_func != NULL); 5291 df->l2df_func(df->l2df_data, df->l2df_size); 5292 list_remove(buflist, df); 5293 kmem_free(df, sizeof (l2arc_data_free_t)); 5294 } 5295 5296 mutex_exit(&l2arc_free_on_write_mtx); 5297} 5298 5299/* 5300 * A write to a cache device has completed. Update all headers to allow 5301 * reads from these buffers to begin. 5302 */ 5303static void 5304l2arc_write_done(zio_t *zio) 5305{ 5306 l2arc_write_callback_t *cb; 5307 l2arc_dev_t *dev; 5308 list_t *buflist; 5309 arc_buf_hdr_t *head, *hdr, *hdr_prev; 5310 kmutex_t *hash_lock; 5311 int64_t bytes_dropped = 0; 5312 5313 cb = zio->io_private; 5314 ASSERT(cb != NULL); 5315 dev = cb->l2wcb_dev; 5316 ASSERT(dev != NULL); 5317 head = cb->l2wcb_head; 5318 ASSERT(head != NULL); 5319 buflist = &dev->l2ad_buflist; 5320 ASSERT(buflist != NULL); 5321 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, 5322 l2arc_write_callback_t *, cb); 5323 5324 if (zio->io_error != 0) 5325 ARCSTAT_BUMP(arcstat_l2_writes_error); 5326 5327 mutex_enter(&dev->l2ad_mtx); 5328 5329 /* 5330 * All writes completed, or an error was hit. 
5331 */ 5332 for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { 5333 hdr_prev = list_prev(buflist, hdr); 5334 5335 hash_lock = HDR_LOCK(hdr); 5336 if (!mutex_tryenter(hash_lock)) { 5337 /* 5338 * This buffer misses out. It may be in a stage 5339 * of eviction. Its ARC_FLAG_L2_WRITING flag will be 5340 * left set, denying reads to this buffer. 5341 */ 5342 ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); 5343 continue; 5344 } 5345 5346 /* 5347 * It's possible that this buffer got evicted from the L1 cache 5348 * before we grabbed the vdev + hash locks, in which case 5349 * arc_hdr_realloc freed b_tmp_cdata for us if it was allocated. 5350 * Only free the buffer if we still have an L1 hdr. 5351 */ 5352 if (HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_tmp_cdata != NULL && 5353 HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) 5354 l2arc_release_cdata_buf(hdr); 5355 5356 if (zio->io_error != 0) { 5357 /* 5358 * Error - drop L2ARC entry. 5359 */ 5360 trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev, 5361 hdr->b_l2hdr.b_daddr, hdr->b_l2hdr.b_asize, 0); 5362 hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; 5363 5364 ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); 5365 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 5366 5367 bytes_dropped += hdr->b_l2hdr.b_asize; 5368 (void) refcount_remove_many(&dev->l2ad_alloc, 5369 hdr->b_l2hdr.b_asize, hdr); 5370 } 5371 5372 /* 5373 * Allow ARC to begin reads to this L2ARC entry. 5374 */ 5375 hdr->b_flags &= ~ARC_FLAG_L2_WRITING; 5376 5377 mutex_exit(hash_lock); 5378 } 5379 5380 atomic_inc_64(&l2arc_writes_done); 5381 list_remove(buflist, head); 5382 ASSERT(!HDR_HAS_L1HDR(head)); 5383 kmem_cache_free(hdr_l2only_cache, head); 5384 mutex_exit(&dev->l2ad_mtx); 5385 5386 vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); 5387 5388 l2arc_do_free_on_write(); 5389 5390 kmem_free(cb, sizeof (l2arc_write_callback_t)); 5391} 5392 5393/* 5394 * A read to a cache device completed. Validate buffer contents before 5395 * handing over to the regular ARC routines. 5396 */ 5397static void 5398l2arc_read_done(zio_t *zio) 5399{ 5400 l2arc_read_callback_t *cb; 5401 arc_buf_hdr_t *hdr; 5402 arc_buf_t *buf; 5403 kmutex_t *hash_lock; 5404 int equal; 5405 5406 ASSERT(zio->io_vd != NULL); 5407 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); 5408 5409 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); 5410 5411 cb = zio->io_private; 5412 ASSERT(cb != NULL); 5413 buf = cb->l2rcb_buf; 5414 ASSERT(buf != NULL); 5415 5416 hash_lock = HDR_LOCK(buf->b_hdr); 5417 mutex_enter(hash_lock); 5418 hdr = buf->b_hdr; 5419 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 5420 5421 /* 5422 * If the buffer was compressed, decompress it first. 5423 */ 5424 if (cb->l2rcb_compress != ZIO_COMPRESS_OFF) 5425 l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress); 5426 ASSERT(zio->io_data != NULL); 5427 5428 /* 5429 * Check this survived the L2ARC journey. 5430 */ 5431 equal = arc_cksum_equal(buf); 5432 if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { 5433 mutex_exit(hash_lock); 5434 zio->io_private = buf; 5435 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ 5436 zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ 5437 arc_read_done(zio); 5438 } else { 5439 mutex_exit(hash_lock); 5440 /* 5441 * Buffer didn't survive caching. Increment stats and 5442 * reissue to the original storage device. 
5443 */ 5444 if (zio->io_error != 0) { 5445 ARCSTAT_BUMP(arcstat_l2_io_error); 5446 } else { 5447 zio->io_error = SET_ERROR(EIO); 5448 } 5449 if (!equal) 5450 ARCSTAT_BUMP(arcstat_l2_cksum_bad); 5451 5452 /* 5453 * If there's no waiter, issue an async i/o to the primary 5454 * storage now. If there *is* a waiter, the caller must 5455 * issue the i/o in a context where it's OK to block. 5456 */ 5457 if (zio->io_waiter == NULL) { 5458 zio_t *pio = zio_unique_parent(zio); 5459 5460 ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); 5461 5462 zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp, 5463 buf->b_data, zio->io_size, arc_read_done, buf, 5464 zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); 5465 } 5466 } 5467 5468 kmem_free(cb, sizeof (l2arc_read_callback_t)); 5469} 5470 5471/* 5472 * This is the list priority from which the L2ARC will search for pages to 5473 * cache. This is used within loops (0..3) to cycle through lists in the 5474 * desired order. This order can have a significant effect on cache 5475 * performance. 5476 * 5477 * Currently the metadata lists are hit first, MFU then MRU, followed by 5478 * the data lists. This function returns a locked list, and also returns 5479 * the lock pointer. 5480 */ 5481static list_t * 5482l2arc_list_locked(int list_num, kmutex_t **lock) 5483{ 5484 list_t *list = NULL; 5485 5486 ASSERT(list_num >= 0 && list_num <= 3); 5487 5488 switch (list_num) { 5489 case 0: 5490 list = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; 5491 *lock = &arc_mfu->arcs_mtx; 5492 break; 5493 case 1: 5494 list = &arc_mru->arcs_list[ARC_BUFC_METADATA]; 5495 *lock = &arc_mru->arcs_mtx; 5496 break; 5497 case 2: 5498 list = &arc_mfu->arcs_list[ARC_BUFC_DATA]; 5499 *lock = &arc_mfu->arcs_mtx; 5500 break; 5501 case 3: 5502 list = &arc_mru->arcs_list[ARC_BUFC_DATA]; 5503 *lock = &arc_mru->arcs_mtx; 5504 break; 5505 } 5506 5507 ASSERT(!(MUTEX_HELD(*lock))); 5508 mutex_enter(*lock); 5509 return (list); 5510} 5511 5512/* 5513 * Evict buffers from the device write hand to the distance specified in 5514 * bytes. This distance may span populated buffers, it may span nothing. 5515 * This is clearing a region on the L2ARC device ready for writing. 5516 * If the 'all' boolean is set, every buffer is evicted. 5517 */ 5518static void 5519l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) 5520{ 5521 list_t *buflist; 5522 arc_buf_hdr_t *hdr, *hdr_prev; 5523 kmutex_t *hash_lock; 5524 uint64_t taddr; 5525 5526 buflist = &dev->l2ad_buflist; 5527 5528 if (!all && dev->l2ad_first) { 5529 /* 5530 * This is the first sweep through the device. There is 5531 * nothing to evict. 5532 */ 5533 return; 5534 } 5535 5536 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { 5537 /* 5538 * When nearing the end of the device, evict to the end 5539 * before the device write hand jumps to the start. 5540 */ 5541 taddr = dev->l2ad_end; 5542 } else { 5543 taddr = dev->l2ad_hand + distance; 5544 } 5545 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, 5546 uint64_t, taddr, boolean_t, all); 5547 5548top: 5549 mutex_enter(&dev->l2ad_mtx); 5550 for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { 5551 hdr_prev = list_prev(buflist, hdr); 5552 5553 hash_lock = HDR_LOCK(hdr); 5554 if (!mutex_tryenter(hash_lock)) { 5555 /* 5556 * Missed the hash lock. Retry. 
5557 */ 5558 ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); 5559 mutex_exit(&dev->l2ad_mtx); 5560 mutex_enter(hash_lock); 5561 mutex_exit(hash_lock); 5562 goto top; 5563 } 5564 5565 if (HDR_L2_WRITE_HEAD(hdr)) { 5566 /* 5567 * We hit a write head node. Leave it for 5568 * l2arc_write_done(). 5569 */ 5570 list_remove(buflist, hdr); 5571 mutex_exit(hash_lock); 5572 continue; 5573 } 5574 5575 if (!all && HDR_HAS_L2HDR(hdr) && 5576 (hdr->b_l2hdr.b_daddr > taddr || 5577 hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { 5578 /* 5579 * We've evicted to the target address, 5580 * or the end of the device. 5581 */ 5582 mutex_exit(hash_lock); 5583 break; 5584 } 5585 5586 ASSERT(HDR_HAS_L2HDR(hdr)); 5587 if (!HDR_HAS_L1HDR(hdr)) { 5588 ASSERT(!HDR_L2_READING(hdr)); 5589 /* 5590 * This doesn't exist in the ARC. Destroy. 5591 * arc_hdr_destroy() will call list_remove() 5592 * and decrement arcstat_l2_size. 5593 */ 5594 arc_change_state(arc_anon, hdr, hash_lock); 5595 arc_hdr_destroy(hdr); 5596 } else { 5597 ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only); 5598 ARCSTAT_BUMP(arcstat_l2_evict_l1cached); 5599 /* 5600 * Invalidate issued or about to be issued 5601 * reads, since we may be about to write 5602 * over this location. 5603 */ 5604 if (HDR_L2_READING(hdr)) { 5605 ARCSTAT_BUMP(arcstat_l2_evict_reading); 5606 hdr->b_flags |= ARC_FLAG_L2_EVICTED; 5607 } 5608 5609 arc_hdr_l2hdr_destroy(hdr); 5610 } 5611 mutex_exit(hash_lock); 5612 } 5613 mutex_exit(&dev->l2ad_mtx); 5614} 5615 5616/* 5617 * Find and write ARC buffers to the L2ARC device. 5618 * 5619 * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid 5620 * for reading until they have completed writing. 5621 * The headroom_boost is an in-out parameter used to maintain headroom boost 5622 * state between calls to this function. 5623 * 5624 * Returns the number of bytes actually written (which may be smaller than 5625 * the delta by which the device hand has changed due to alignment). 5626 */ 5627static uint64_t 5628l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, 5629 boolean_t *headroom_boost) 5630{ 5631 arc_buf_hdr_t *hdr, *hdr_prev, *head; 5632 list_t *list; 5633 uint64_t write_asize, write_sz, headroom, buf_compress_minsz; 5634 void *buf_data; 5635 kmutex_t *list_lock; 5636 boolean_t full; 5637 l2arc_write_callback_t *cb; 5638 zio_t *pio, *wzio; 5639 uint64_t guid = spa_load_guid(spa); 5640 const boolean_t do_headroom_boost = *headroom_boost; 5641 int try; 5642 5643 ASSERT(dev->l2ad_vdev != NULL); 5644 5645 /* Lower the flag now, we might want to raise it again later. */ 5646 *headroom_boost = B_FALSE; 5647 5648 pio = NULL; 5649 write_sz = write_asize = 0; 5650 full = B_FALSE; 5651 head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); 5652 head->b_flags |= ARC_FLAG_L2_WRITE_HEAD; 5653 head->b_flags |= ARC_FLAG_HAS_L2HDR; 5654 5655 ARCSTAT_BUMP(arcstat_l2_write_buffer_iter); 5656 /* 5657 * We will want to try to compress buffers that are at least 2x the 5658 * device sector size. 5659 */ 5660 buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift; 5661 5662 /* 5663 * Copy buffers for L2ARC writing. 5664 */ 5665 mutex_enter(&dev->l2ad_mtx); 5666 for (try = 0; try <= 3; try++) { 5667 uint64_t passed_sz = 0; 5668 5669 list = l2arc_list_locked(try, &list_lock); 5670 ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter); 5671 5672 /* 5673 * L2ARC fast warmup. 5674 * 5675 * Until the ARC is warm and starts to evict, read from the 5676 * head of the ARC lists rather than the tail. 
5677 */ 5678 if (arc_warm == B_FALSE) 5679 hdr = list_head(list); 5680 else 5681 hdr = list_tail(list); 5682 if (hdr == NULL) 5683 ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter); 5684 5685 headroom = target_sz * l2arc_headroom; 5686 if (do_headroom_boost) 5687 headroom = (headroom * l2arc_headroom_boost) / 100; 5688 5689 for (; hdr; hdr = hdr_prev) { 5690 kmutex_t *hash_lock; 5691 uint64_t buf_sz; 5692 uint64_t buf_a_sz; 5693 5694 if (arc_warm == B_FALSE) 5695 hdr_prev = list_next(list, hdr); 5696 else 5697 hdr_prev = list_prev(list, hdr); 5698 ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, hdr->b_size); 5699 5700 hash_lock = HDR_LOCK(hdr); 5701 if (!mutex_tryenter(hash_lock)) { 5702 ARCSTAT_BUMP(arcstat_l2_write_trylock_fail); 5703 /* 5704 * Skip this buffer rather than waiting. 5705 */ 5706 continue; 5707 } 5708 5709 passed_sz += hdr->b_size; 5710 if (passed_sz > headroom) { 5711 /* 5712 * Searched too far. 5713 */ 5714 mutex_exit(hash_lock); 5715 ARCSTAT_BUMP(arcstat_l2_write_passed_headroom); 5716 break; 5717 } 5718 5719 if (!l2arc_write_eligible(guid, hdr)) { 5720 mutex_exit(hash_lock); 5721 continue; 5722 } 5723 5724 /* 5725 * Assume that the buffer is not going to be compressed 5726 * and could take more space on disk because of a larger 5727 * disk block size. 5728 */ 5729 buf_sz = hdr->b_size; 5730 buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); 5731 5732 if ((write_asize + buf_a_sz) > target_sz) { 5733 full = B_TRUE; 5734 mutex_exit(hash_lock); 5735 ARCSTAT_BUMP(arcstat_l2_write_full); 5736 break; 5737 } 5738 5739 if (pio == NULL) { 5740 /* 5741 * Insert a dummy header on the buflist so 5742 * l2arc_write_done() can find where the 5743 * write buffers begin without searching. 5744 */ 5745 list_insert_head(&dev->l2ad_buflist, head); 5746 5747 cb = kmem_alloc( 5748 sizeof (l2arc_write_callback_t), KM_SLEEP); 5749 cb->l2wcb_dev = dev; 5750 cb->l2wcb_head = head; 5751 pio = zio_root(spa, l2arc_write_done, cb, 5752 ZIO_FLAG_CANFAIL); 5753 ARCSTAT_BUMP(arcstat_l2_write_pios); 5754 } 5755 5756 /* 5757 * Create and add a new L2ARC header. 5758 */ 5759 hdr->b_l2hdr.b_dev = dev; 5760 hdr->b_flags |= ARC_FLAG_L2_WRITING; 5761 /* 5762 * Temporarily stash the data buffer in b_tmp_cdata. 5763 * The subsequent write step will pick it up from 5764 * there. This is because we can't access b_l1hdr.b_buf 5765 * without holding the hash_lock, which we in turn 5766 * can't access without holding the ARC list locks 5767 * (which we want to avoid during compression/writing). 5768 */ 5769 HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); 5770 hdr->b_l2hdr.b_asize = hdr->b_size; 5771 hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data; 5772 5773 /* 5774 * Explicitly set the b_daddr field to a known 5775 * value which means "invalid address". This 5776 * enables us to differentiate which stage of 5777 * l2arc_write_buffers() the particular header 5778 * is in (e.g. this loop, or the one below). 5779 * ARC_FLAG_L2_WRITING is not enough to make 5780 * this distinction, and we need to know in 5781 * order to do proper l2arc vdev accounting in 5782 * arc_release() and arc_hdr_destroy(). 5783 * 5784 * Note, we can't use a new flag to distinguish 5785 * the two stages because we don't hold the 5786 * header's hash_lock below, in the second stage 5787 * of this function. Thus, we can't simply 5788 * change the b_flags field to denote that the 5789 * IO has been sent.
We can change the b_daddr 5790 * field of the L2 portion, though, since we'll 5791 * be holding the l2ad_mtx, which is why we're 5792 * using it to denote the header's state change. 5793 */ 5794 hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET; 5795 hdr->b_flags |= ARC_FLAG_HAS_L2HDR; 5796 5797 list_insert_head(&dev->l2ad_buflist, hdr); 5798 5799 /* 5800 * Compute and store the buffer cksum before 5801 * writing. On debug the cksum is verified first. 5802 */ 5803 arc_cksum_verify(hdr->b_l1hdr.b_buf); 5804 arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE); 5805 5806 mutex_exit(hash_lock); 5807 5808 write_sz += buf_sz; 5809 write_asize += buf_a_sz; 5810 } 5811 5812 mutex_exit(list_lock); 5813 5814 if (full == B_TRUE) 5815 break; 5816 } 5817 5818 /* No buffers selected for writing? */ 5819 if (pio == NULL) { 5820 ASSERT0(write_sz); 5821 mutex_exit(&dev->l2ad_mtx); 5822 ASSERT(!HDR_HAS_L1HDR(head)); 5823 kmem_cache_free(hdr_l2only_cache, head); 5824 return (0); 5825 } 5826 5827 /* 5828 * Note that elsewhere in this file arcstat_l2_asize 5829 * and the used space on l2ad_vdev are updated using b_asize, 5830 * which is not necessarily rounded up to the device block size. 5831 * To keep accounting consistent we do the same here as well: 5832 * stats_size accumulates the sum of b_asize of the written buffers, 5833 * while write_asize accumulates the sum of b_asize rounded up 5834 * to the device block size. 5835 * The latter sum is used only to validate the correctness of the code. 5836 */ 5837 uint64_t stats_size = 0; 5838 write_asize = 0; 5839 5840 /* 5841 * Now start writing the buffers. We're starting at the write head 5842 * and work backwards, retracing the course of the buffer selector 5843 * loop above. 5844 */ 5845 for (hdr = list_prev(&dev->l2ad_buflist, head); hdr; 5846 hdr = list_prev(&dev->l2ad_buflist, hdr)) { 5847 uint64_t buf_sz; 5848 5849 /* 5850 * We shouldn't need to lock the buffer here, since we flagged 5851 * it as ARC_FLAG_L2_WRITING in the previous step, but we must 5852 * take care to only access its L2 cache parameters. In 5853 * particular, hdr->b_l1hdr.b_buf may be invalid by now due to 5854 * ARC eviction. 5855 */ 5856 hdr->b_l2hdr.b_daddr = dev->l2ad_hand; 5857 5858 if ((HDR_L2COMPRESS(hdr)) && 5859 hdr->b_l2hdr.b_asize >= buf_compress_minsz) { 5860 if (l2arc_compress_buf(hdr)) { 5861 /* 5862 * If compression succeeded, enable headroom 5863 * boost on the next scan cycle. 5864 */ 5865 *headroom_boost = B_TRUE; 5866 } 5867 } 5868 5869 /* 5870 * Pick up the buffer data we had previously stashed away 5871 * (and now potentially also compressed). 5872 */ 5873 buf_data = hdr->b_l1hdr.b_tmp_cdata; 5874 buf_sz = hdr->b_l2hdr.b_asize; 5875 5876 /* 5877 * If the data has not been compressed, then clear b_tmp_cdata 5878 * to make sure that it points only to a temporary compression 5879 * buffer. 5880 */ 5881 if (!L2ARC_IS_VALID_COMPRESS(HDR_GET_COMPRESS(hdr))) 5882 hdr->b_l1hdr.b_tmp_cdata = NULL; 5883 5884 /* 5885 * We need to do this regardless of whether buf_sz is zero or 5886 * not; otherwise, when this l2hdr is evicted we'll 5887 * remove a reference that was never added. 5888 */ 5889 (void) refcount_add_many(&dev->l2ad_alloc, buf_sz, hdr); 5890 5891 /* Compression may have squashed the buffer to zero length.
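 * In that case no physical write is issued and the device hand does not advance, but the header stays on the buflist with the reference added above.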
*/ 5892 if (buf_sz != 0) { 5893 uint64_t buf_a_sz; 5894 5895 wzio = zio_write_phys(pio, dev->l2ad_vdev, 5896 dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, 5897 NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, 5898 ZIO_FLAG_CANFAIL, B_FALSE); 5899 5900 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, 5901 zio_t *, wzio); 5902 (void) zio_nowait(wzio); 5903 5904 stats_size += buf_sz; 5905 5906 /* 5907 * Keep the clock hand suitably device-aligned. 5908 */ 5909 buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); 5910 write_asize += buf_a_sz; 5911 dev->l2ad_hand += buf_a_sz; 5912 } 5913 } 5914 5915 mutex_exit(&dev->l2ad_mtx); 5916 5917 ASSERT3U(write_asize, <=, target_sz); 5918 ARCSTAT_BUMP(arcstat_l2_writes_sent); 5919 ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize); 5920 ARCSTAT_INCR(arcstat_l2_size, write_sz); 5921 ARCSTAT_INCR(arcstat_l2_asize, stats_size); 5922 vdev_space_update(dev->l2ad_vdev, stats_size, 0, 0); 5923 5924 /* 5925 * Bump device hand to the device start if it is approaching the end. 5926 * l2arc_evict() will already have evicted ahead for this case. 5927 */ 5928 if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { 5929 dev->l2ad_hand = dev->l2ad_start; 5930 dev->l2ad_first = B_FALSE; 5931 } 5932 5933 dev->l2ad_writing = B_TRUE; 5934 (void) zio_wait(pio); 5935 dev->l2ad_writing = B_FALSE; 5936 5937 return (write_asize); 5938} 5939 5940/* 5941 * Compresses an L2ARC buffer. 5942 * The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its 5943 * size in l2hdr->b_asize. This routine tries to compress the data and 5944 * depending on the compression result there are three possible outcomes: 5945 * *) The buffer was incompressible. The original l2hdr contents were left 5946 * untouched and are ready for writing to an L2 device. 5947 * *) The buffer was all-zeros, so there is no need to write it to an L2 5948 * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is 5949 * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY. 5950 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary 5951 * data buffer which holds the compressed data to be written, and b_asize 5952 * tells us how much data there is. b_compress is set to the appropriate 5953 * compression algorithm. Once writing is done, invoke 5954 * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer. 5955 * 5956 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the 5957 * buffer was incompressible). 
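 * Note that compression only counts as a success if the result, rounded up to the device's ashift, is still smaller than the original size; otherwise the compressed copy is discarded and the header is left untouched.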
5958 */ 5959static boolean_t 5960l2arc_compress_buf(arc_buf_hdr_t *hdr) 5961{ 5962 void *cdata; 5963 size_t csize, len, rounded; 5964 ASSERT(HDR_HAS_L2HDR(hdr)); 5965 l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; 5966 5967 ASSERT(HDR_HAS_L1HDR(hdr)); 5968 ASSERT(HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF); 5969 ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL); 5970 5971 len = l2hdr->b_asize; 5972 cdata = zio_data_buf_alloc(len); 5973 ASSERT3P(cdata, !=, NULL); 5974 csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata, 5975 cdata, l2hdr->b_asize); 5976 5977 if (csize == 0) { 5978 /* zero block, indicate that there's nothing to write */ 5979 zio_data_buf_free(cdata, len); 5980 HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_EMPTY); 5981 l2hdr->b_asize = 0; 5982 hdr->b_l1hdr.b_tmp_cdata = NULL; 5983 ARCSTAT_BUMP(arcstat_l2_compress_zeros); 5984 return (B_TRUE); 5985 } 5986 5987 rounded = P2ROUNDUP(csize, 5988 (size_t)1 << l2hdr->b_dev->l2ad_vdev->vdev_ashift); 5989 if (rounded < len) { 5990 /* 5991 * Compression succeeded, we'll keep the cdata around for 5992 * writing and release it afterwards. 5993 */ 5994 if (rounded > csize) { 5995 bzero((char *)cdata + csize, rounded - csize); 5996 csize = rounded; 5997 } 5998 HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_LZ4); 5999 l2hdr->b_asize = csize; 6000 hdr->b_l1hdr.b_tmp_cdata = cdata; 6001 ARCSTAT_BUMP(arcstat_l2_compress_successes); 6002 return (B_TRUE); 6003 } else { 6004 /* 6005 * Compression failed, release the compressed buffer. 6006 * l2hdr will be left unmodified. 6007 */ 6008 zio_data_buf_free(cdata, len); 6009 ARCSTAT_BUMP(arcstat_l2_compress_failures); 6010 return (B_FALSE); 6011 } 6012} 6013 6014/* 6015 * Decompresses a zio read back from an l2arc device. On success, the 6016 * underlying zio's io_data buffer is overwritten by the uncompressed 6017 * version. On decompression error (corrupt compressed stream), the 6018 * zio->io_error value is set to signal an I/O error. 6019 * 6020 * Please note that the compressed data stream is not checksummed, so 6021 * if the underlying device is experiencing data corruption, we may feed 6022 * corrupt data to the decompressor, so the decompressor needs to be 6023 * able to handle this situation (LZ4 does). 6024 */ 6025static void 6026l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c) 6027{ 6028 ASSERT(L2ARC_IS_VALID_COMPRESS(c)); 6029 6030 if (zio->io_error != 0) { 6031 /* 6032 * An I/O error has occurred; just restore the original io 6033 * size in preparation for a main pool read. 6034 */ 6035 zio->io_orig_size = zio->io_size = hdr->b_size; 6036 return; 6037 } 6038 6039 if (c == ZIO_COMPRESS_EMPTY) { 6040 /* 6041 * An empty buffer results in a null zio, which means we 6042 * need to fill its io_data after we're done restoring the 6043 * buffer's contents. 6044 */ 6045 ASSERT(hdr->b_l1hdr.b_buf != NULL); 6046 bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size); 6047 zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data; 6048 } else { 6049 ASSERT(zio->io_data != NULL); 6050 /* 6051 * We copy the compressed data from the start of the arc buffer 6052 * (the zio_read will have pulled in only what we need, the 6053 * rest is garbage which we will overwrite at decompression) 6054 * and then decompress back to the ARC data buffer. This way we 6055 * can minimize copying by simply decompressing back over the 6056 * original compressed data (rather than decompressing to an 6057 * aux buffer and then copying back the uncompressed buffer, 6058 * which is likely to be much larger).
6059 */ 6060 uint64_t csize; 6061 void *cdata; 6062 6063 csize = zio->io_size; 6064 cdata = zio_data_buf_alloc(csize); 6065 bcopy(zio->io_data, cdata, csize); 6066 if (zio_decompress_data(c, cdata, zio->io_data, csize, 6067 hdr->b_size) != 0) 6068 zio->io_error = EIO; 6069 zio_data_buf_free(cdata, csize); 6070 } 6071 6072 /* Restore the expected uncompressed IO size. */ 6073 zio->io_orig_size = zio->io_size = hdr->b_size; 6074} 6075 6076/* 6077 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure. 6078 * This buffer serves as a temporary holder of compressed data while 6079 * the buffer entry is being written to an l2arc device. Once that is 6080 * done, we can dispose of it. 6081 */ 6082static void 6083l2arc_release_cdata_buf(arc_buf_hdr_t *hdr) 6084{ 6085 ASSERT(HDR_HAS_L1HDR(hdr)); 6086 if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_EMPTY) { 6087 /* 6088 * If the data was compressed, then we've allocated a 6089 * temporary buffer for it, so now we need to release it. 6090 */ 6091 ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL); 6092 zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata, 6093 hdr->b_size); 6094 hdr->b_l1hdr.b_tmp_cdata = NULL; 6095 } else { 6096 ASSERT(hdr->b_l1hdr.b_tmp_cdata == NULL); 6097 } 6098} 6099 6100/* 6101 * This thread feeds the L2ARC at regular intervals. This is the beating 6102 * heart of the L2ARC. 6103 */ 6104static void 6105l2arc_feed_thread(void *dummy __unused) 6106{ 6107 callb_cpr_t cpr; 6108 l2arc_dev_t *dev; 6109 spa_t *spa; 6110 uint64_t size, wrote; 6111 clock_t begin, next = ddi_get_lbolt(); 6112 boolean_t headroom_boost = B_FALSE; 6113 6114 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); 6115 6116 mutex_enter(&l2arc_feed_thr_lock); 6117 6118 while (l2arc_thread_exit == 0) { 6119 CALLB_CPR_SAFE_BEGIN(&cpr); 6120 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, 6121 next - ddi_get_lbolt()); 6122 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); 6123 next = ddi_get_lbolt() + hz; 6124 6125 /* 6126 * Quick check for L2ARC devices. 6127 */ 6128 mutex_enter(&l2arc_dev_mtx); 6129 if (l2arc_ndev == 0) { 6130 mutex_exit(&l2arc_dev_mtx); 6131 continue; 6132 } 6133 mutex_exit(&l2arc_dev_mtx); 6134 begin = ddi_get_lbolt(); 6135 6136 /* 6137 * This selects the next l2arc device to write to, and in 6138 * doing so the next spa to feed from: dev->l2ad_spa. This 6139 * will return NULL if there are now no l2arc devices or if 6140 * they are all faulted. 6141 * 6142 * If a device is returned, its spa's config lock is also 6143 * held to prevent device removal. l2arc_dev_get_next() 6144 * will grab and release l2arc_dev_mtx. 6145 */ 6146 if ((dev = l2arc_dev_get_next()) == NULL) 6147 continue; 6148 6149 spa = dev->l2ad_spa; 6150 ASSERT(spa != NULL); 6151 6152 /* 6153 * If the pool is read-only then force the feed thread to 6154 * sleep a little longer. 6155 */ 6156 if (!spa_writeable(spa)) { 6157 next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; 6158 spa_config_exit(spa, SCL_L2ARC, dev); 6159 continue; 6160 } 6161 6162 /* 6163 * Avoid contributing to memory pressure. 6164 */ 6165 if (arc_reclaim_needed()) { 6166 ARCSTAT_BUMP(arcstat_l2_abort_lowmem); 6167 spa_config_exit(spa, SCL_L2ARC, dev); 6168 continue; 6169 } 6170 6171 ARCSTAT_BUMP(arcstat_l2_feeds); 6172 6173 size = l2arc_write_size(); 6174 6175 /* 6176 * Evict L2ARC buffers that will be overwritten. 6177 */ 6178 l2arc_evict(dev, size, B_FALSE); 6179 6180 /* 6181 * Write ARC buffers. 
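 * The byte count actually written is passed to l2arc_write_interval() below to pace the next pass.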
6182 */ 6183 wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost); 6184 6185 /* 6186 * Calculate interval between writes. 6187 */ 6188 next = l2arc_write_interval(begin, size, wrote); 6189 spa_config_exit(spa, SCL_L2ARC, dev); 6190 } 6191 6192 l2arc_thread_exit = 0; 6193 cv_broadcast(&l2arc_feed_thr_cv); 6194 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ 6195 thread_exit(); 6196} 6197 6198boolean_t 6199l2arc_vdev_present(vdev_t *vd) 6200{ 6201 l2arc_dev_t *dev; 6202 6203 mutex_enter(&l2arc_dev_mtx); 6204 for (dev = list_head(l2arc_dev_list); dev != NULL; 6205 dev = list_next(l2arc_dev_list, dev)) { 6206 if (dev->l2ad_vdev == vd) 6207 break; 6208 } 6209 mutex_exit(&l2arc_dev_mtx); 6210 6211 return (dev != NULL); 6212} 6213 6214/* 6215 * Add a vdev for use by the L2ARC. By this point the spa has already 6216 * validated the vdev and opened it. 6217 */ 6218void 6219l2arc_add_vdev(spa_t *spa, vdev_t *vd) 6220{ 6221 l2arc_dev_t *adddev; 6222 6223 ASSERT(!l2arc_vdev_present(vd)); 6224 6225 vdev_ashift_optimize(vd); 6226 6227 /* 6228 * Create a new l2arc device entry. 6229 */ 6230 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); 6231 adddev->l2ad_spa = spa; 6232 adddev->l2ad_vdev = vd; 6233 adddev->l2ad_start = VDEV_LABEL_START_SIZE; 6234 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); 6235 adddev->l2ad_hand = adddev->l2ad_start; 6236 adddev->l2ad_first = B_TRUE; 6237 adddev->l2ad_writing = B_FALSE; 6238 6239 mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); 6240 /* 6241 * This is a list of all ARC buffers that are still valid on the 6242 * device. 6243 */ 6244 list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), 6245 offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); 6246 6247 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); 6248 refcount_create(&adddev->l2ad_alloc); 6249 6250 /* 6251 * Add device to global list 6252 */ 6253 mutex_enter(&l2arc_dev_mtx); 6254 list_insert_head(l2arc_dev_list, adddev); 6255 atomic_inc_64(&l2arc_ndev); 6256 mutex_exit(&l2arc_dev_mtx); 6257} 6258 6259/* 6260 * Remove a vdev from the L2ARC. 6261 */ 6262void 6263l2arc_remove_vdev(vdev_t *vd) 6264{ 6265 l2arc_dev_t *dev, *nextdev, *remdev = NULL; 6266 6267 /* 6268 * Find the device by vdev 6269 */ 6270 mutex_enter(&l2arc_dev_mtx); 6271 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { 6272 nextdev = list_next(l2arc_dev_list, dev); 6273 if (vd == dev->l2ad_vdev) { 6274 remdev = dev; 6275 break; 6276 } 6277 } 6278 ASSERT(remdev != NULL); 6279 6280 /* 6281 * Remove device from global list 6282 */ 6283 list_remove(l2arc_dev_list, remdev); 6284 l2arc_dev_last = NULL; /* may have been invalidated */ 6285 atomic_dec_64(&l2arc_ndev); 6286 mutex_exit(&l2arc_dev_mtx); 6287 6288 /* 6289 * Clear all buflists and ARC references. L2ARC device flush. 
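 * Passing 0 for the distance and B_TRUE for 'all' makes l2arc_evict() drop every remaining buffer on this device before it is torn down.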
6290 */ 6291 l2arc_evict(remdev, 0, B_TRUE); 6292 list_destroy(&remdev->l2ad_buflist); 6293 mutex_destroy(&remdev->l2ad_mtx); 6294 refcount_destroy(&remdev->l2ad_alloc); 6295 kmem_free(remdev, sizeof (l2arc_dev_t)); 6296} 6297 6298void 6299l2arc_init(void) 6300{ 6301 l2arc_thread_exit = 0; 6302 l2arc_ndev = 0; 6303 l2arc_writes_sent = 0; 6304 l2arc_writes_done = 0; 6305 6306 mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); 6307 cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); 6308 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); 6309 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); 6310 6311 l2arc_dev_list = &L2ARC_dev_list; 6312 l2arc_free_on_write = &L2ARC_free_on_write; 6313 list_create(l2arc_dev_list, sizeof (l2arc_dev_t), 6314 offsetof(l2arc_dev_t, l2ad_node)); 6315 list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), 6316 offsetof(l2arc_data_free_t, l2df_list_node)); 6317} 6318 6319void 6320l2arc_fini(void) 6321{ 6322 /* 6323 * This is called from dmu_fini(), which is called from spa_fini(); 6324 * Because of this, we can assume that all l2arc devices have 6325 * already been removed when the pools themselves were removed. 6326 */ 6327 6328 l2arc_do_free_on_write(); 6329 6330 mutex_destroy(&l2arc_feed_thr_lock); 6331 cv_destroy(&l2arc_feed_thr_cv); 6332 mutex_destroy(&l2arc_dev_mtx); 6333 mutex_destroy(&l2arc_free_on_write_mtx); 6334 6335 list_destroy(l2arc_dev_list); 6336 list_destroy(l2arc_free_on_write); 6337} 6338 6339void 6340l2arc_start(void) 6341{ 6342 if (!(spa_mode_global & FWRITE)) 6343 return; 6344 6345 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, 6346 TS_RUN, minclsyspri); 6347} 6348 6349void 6350l2arc_stop(void) 6351{ 6352 if (!(spa_mode_global & FWRITE)) 6353 return; 6354 6355 mutex_enter(&l2arc_feed_thr_lock); 6356 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ 6357 l2arc_thread_exit = 1; 6358 while (l2arc_thread_exit != 0) 6359 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); 6360 mutex_exit(&l2arc_feed_thr_lock); 6361} 6362