arc.c revision 288550
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
 * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
 */

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation used here is
 * based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 * Pages in its cache cannot be "locked" into memory.  This makes
 * the eviction algorithm simple: evict the last page in the list.
 * This also makes the performance characteristics easy to reason
 * about.  Our cache is not so simple.  At any given moment, some
 * subset of the blocks in the cache are un-evictable because we
 * have handed out a reference to them.  Blocks are only evictable
 * when there are no external references active.  This makes
 * eviction far more problematic:  we choose to evict the evictable
 * blocks that are the "lowest" in the list.
 *
 * There are times when it is not possible to evict the requested
 * space.  In these circumstances we are unable to adjust the cache
 * size.  To prevent the cache growing unbounded at these times we
 * implement a "cache throttle" that slows the flow of new data
 * into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 * Pages are evicted when the cache is full and there is a cache
 * miss.  Our model has a variable sized cache.  It grows with
 * high use, but also tries to react to memory pressure from the
 * operating system: decreasing its size when system memory is
 * tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size.  All
 * elements of the cache are therefore exactly the same size.  So
 * when adjusting the cache size following a cache miss, it's simply
 * a matter of choosing a single page to evict.  In our model, we
 * have variable sized cache blocks (ranging from 512 bytes to
 * 128K bytes).  We therefore choose a set of blocks to evict to make
 * space for a cache miss that approximates as closely as possible
 * the space used by the new block.
 *
 * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */

/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists.  The arc_read() interface
 * uses method 1, while the internal arc algorithms for
 * adjusting the cache use method 2.  We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * arc list locks.
 *
 * Buffers do not have their own mutexes, rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table.  It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each arc state also has a mutex which is used to protect the
 * buffer list associated with the state.  When attempting to
 * obtain a hash table lock while holding an arc list lock you
 * must use mutex_tryenter() to avoid deadlock.  Also note that
 * the active state mutex must be held before the ghost state mutex.
 *
 * Arc buffers may have an associated eviction callback function.
 * This function will be invoked prior to removing the buffer (e.g.
 * in arc_do_user_evicts()).  Note however that the data associated
 * with the buffer may be evicted prior to the callback.  The callback
 * must be made with *no locks held* (to prevent deadlock).  Additionally,
 * the users of callbacks must ensure that their private data is
 * protected from simultaneous callbacks from arc_clear_callback()
 * and arc_do_user_evicts().
 *
 * Note that the majority of the performance stats are manipulated
 * with atomic operations.
 *
 * The L2ARC uses the l2ad_mtx on each vdev for the following:
 *
 *	- L2ARC buflist creation
 *	- L2ARC buflist eviction
 *	- L2ARC write completion, which walks L2ARC buflists
 *	- ARC header destruction, as it removes from L2ARC buflists
 *	- ARC header release, as it removes from L2ARC buflists
 */

#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/zio_compress.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/dsl_pool.h>
#ifdef _KERNEL
#include <sys/dnlc.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
#include <sys/trim_map.h>
#include <zfs_fletcher.h>
#include <sys/sdt.h>

#include <vm/vm_pageout.h>
#include <machine/vmparam.h>

#ifdef illumos
#ifndef _KERNEL
/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
boolean_t arc_watch = B_FALSE;
int arc_procfd;
#endif
#endif /* illumos */

static kmutex_t		arc_reclaim_thr_lock;
static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
static uint8_t		arc_thread_exit;

#define	ARC_REDUCE_DNLC_PERCENT	3
uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;

typedef enum arc_reclaim_strategy {
	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
} arc_reclaim_strategy_t;

/*
 * The number of iterations through arc_evict_*() before we
 * drop & reacquire the lock.
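 *
 * While an arc list lock is held, each candidate buffer's hash lock must be
 * taken opportunistically, per the locking model above.  A sketch of that
 * pattern (illustrative only, not the exact code used by the evict paths):
 *
 *	hash_lock = HDR_LOCK(hdr);
 *	if (!MUTEX_HELD(hash_lock) && !mutex_tryenter(hash_lock)) {
 *		ARCSTAT_BUMP(arcstat_mutex_miss);
 *		continue;	(skip this header rather than risk deadlock)
 *	}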
167 */ 168int arc_evict_iterations = 100; 169 170/* number of seconds before growing cache again */ 171static int arc_grow_retry = 60; 172 173/* shift of arc_c for calculating both min and max arc_p */ 174static int arc_p_min_shift = 4; 175 176/* log2(fraction of arc to reclaim) */ 177static int arc_shrink_shift = 5; 178 179/* 180 * minimum lifespan of a prefetch block in clock ticks 181 * (initialized in arc_init()) 182 */ 183static int arc_min_prefetch_lifespan; 184 185/* 186 * If this percent of memory is free, don't throttle. 187 */ 188int arc_lotsfree_percent = 10; 189 190static int arc_dead; 191extern int zfs_prefetch_disable; 192 193/* 194 * The arc has filled available memory and has now warmed up. 195 */ 196static boolean_t arc_warm; 197 198uint64_t zfs_arc_max; 199uint64_t zfs_arc_min; 200uint64_t zfs_arc_meta_limit = 0; 201uint64_t zfs_arc_meta_min = 0; 202int zfs_arc_grow_retry = 0; 203int zfs_arc_shrink_shift = 0; 204int zfs_arc_p_min_shift = 0; 205int zfs_disable_dup_eviction = 0; 206uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ 207u_int zfs_arc_free_target = 0; 208 209static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS); 210static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS); 211 212#ifdef _KERNEL 213static void 214arc_free_target_init(void *unused __unused) 215{ 216 217 zfs_arc_free_target = vm_pageout_wakeup_thresh; 218} 219SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, 220 arc_free_target_init, NULL); 221 222TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max); 223TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min); 224TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit); 225TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min); 226TUNABLE_QUAD("vfs.zfs.arc_average_blocksize", &zfs_arc_average_blocksize); 227TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift); 228SYSCTL_DECL(_vfs_zfs); 229SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0, 230 "Maximum ARC size"); 231SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0, 232 "Minimum ARC size"); 233SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN, 234 &zfs_arc_average_blocksize, 0, 235 "ARC average blocksize"); 236SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW, 237 &arc_shrink_shift, 0, 238 "log2(fraction of arc to reclaim)"); 239 240/* 241 * We don't have a tunable for arc_free_target due to the dependency on 242 * pagedaemon initialisation. 243 */ 244SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target, 245 CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int), 246 sysctl_vfs_zfs_arc_free_target, "IU", 247 "Desired number of free pages below which ARC triggers reclaim"); 248 249static int 250sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS) 251{ 252 u_int val; 253 int err; 254 255 val = zfs_arc_free_target; 256 err = sysctl_handle_int(oidp, &val, 0, req); 257 if (err != 0 || req->newptr == NULL) 258 return (err); 259 260 if (val < minfree) 261 return (EINVAL); 262 if (val > cnt.v_page_count) 263 return (EINVAL); 264 265 zfs_arc_free_target = val; 266 267 return (0); 268} 269 270/* 271 * Must be declared here, before the definition of corresponding kstat 272 * macro which uses the same names will confuse the compiler. 
 */
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit,
    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
    sysctl_vfs_zfs_arc_meta_limit, "QU",
    "ARC metadata limit");
#endif

/*
 * Note that buffers can be in one of 6 states:
 *	ARC_anon	- anonymous (discussed below)
 *	ARC_mru		- recently used, currently cached
 *	ARC_mru_ghost	- recently used, no longer in cache
 *	ARC_mfu		- frequently used, currently cached
 *	ARC_mfu_ghost	- frequently used, no longer in cache
 *	ARC_l2c_only	- exists in L2ARC but not other states
 * When there are no active references to the buffer, they are
 * linked onto a list in one of these arc states.  These are
 * the only buffers that can be evicted or deleted.  Within each
 * state there are multiple lists, one for meta-data and one for
 * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 * etc.) is tracked separately so that it can be managed more
 * explicitly: favored over data, limited explicitly.
 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA.  These are buffers that hold dirty block copies
 * before they are written to stable storage.  By definition,
 * they are "ref'd" and are considered part of arc_mru
 * that cannot be freed.  Generally, they will acquire a DVA
 * as they are written and migrate onto the arc_mru list.
 *
 * The ARC_l2c_only state is for buffers that are in the second
 * level ARC but no longer in any of the ARC_m* lists.  The second
 * level ARC itself may also contain buffers that are in any of
 * the ARC_m* states - meaning that a buffer can exist in two
 * places.  The reason for the ARC_l2c_only state is to keep the
 * buffer header in the hash table, so that reads that hit the
 * second level ARC benefit from these fast lookups.
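 *
 * Putting these states together, a typical demand-read block moves through
 * them roughly as follows (a sketch of the common case, not every path): it
 * enters arc_mru on first use, is promoted to arc_mfu if it is referenced
 * again, and when evicted its header is demoted to the matching ghost state
 * (arc_mru_ghost or arc_mfu_ghost) so that a later hit there can tell the
 * adaptive algorithm which side of the cache deserves to grow.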
 */

#define	ARCS_LOCK_PAD		CACHE_LINE_SIZE
struct arcs_lock {
	kmutex_t	arcs_lock;
#ifdef _KERNEL
	unsigned char	pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))];
#endif
};

/*
 * must be power of two for mask use to work
 */
#define	ARC_BUFC_NUMDATALISTS		16
#define	ARC_BUFC_NUMMETADATALISTS	16
#define	ARC_BUFC_NUMLISTS	(ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS)

typedef struct arc_state {
	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */
	uint64_t arcs_size;	/* total amount of data in this state */
	list_t	arcs_lists[ARC_BUFC_NUMLISTS]; /* list of evictable buffers */
	struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE);
} arc_state_t;

#define	ARCS_LOCK(s, i)	(&((s)->arcs_locks[(i)].arcs_lock))

/* The 6 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
static arc_state_t ARC_mru_ghost;
static arc_state_t ARC_mfu;
static arc_state_t ARC_mfu_ghost;
static arc_state_t ARC_l2c_only;

typedef struct arc_stats {
	kstat_named_t arcstat_hits;
	kstat_named_t arcstat_misses;
	kstat_named_t arcstat_demand_data_hits;
	kstat_named_t arcstat_demand_data_misses;
	kstat_named_t arcstat_demand_metadata_hits;
	kstat_named_t arcstat_demand_metadata_misses;
	kstat_named_t arcstat_prefetch_data_hits;
	kstat_named_t arcstat_prefetch_data_misses;
	kstat_named_t arcstat_prefetch_metadata_hits;
	kstat_named_t arcstat_prefetch_metadata_misses;
	kstat_named_t arcstat_mru_hits;
	kstat_named_t arcstat_mru_ghost_hits;
	kstat_named_t arcstat_mfu_hits;
	kstat_named_t arcstat_mfu_ghost_hits;
	kstat_named_t arcstat_allocated;
	kstat_named_t arcstat_deleted;
	kstat_named_t arcstat_stolen;
	kstat_named_t arcstat_recycle_miss;
	/*
	 * Number of buffers that could not be evicted because the hash lock
	 * was held by another thread.  The lock may not necessarily be held
	 * by something using the same buffer, since hash locks are shared
	 * by multiple buffers.
	 */
	kstat_named_t arcstat_mutex_miss;
	/*
	 * Number of buffers skipped because they have I/O in progress, are
	 * indirect prefetch buffers that have not lived long enough, or are
	 * not from the spa we're trying to evict from.
	 */
	kstat_named_t arcstat_evict_skip;
	kstat_named_t arcstat_evict_l2_cached;
	kstat_named_t arcstat_evict_l2_eligible;
	kstat_named_t arcstat_evict_l2_ineligible;
	kstat_named_t arcstat_hash_elements;
	kstat_named_t arcstat_hash_elements_max;
	kstat_named_t arcstat_hash_collisions;
	kstat_named_t arcstat_hash_chains;
	kstat_named_t arcstat_hash_chain_max;
	kstat_named_t arcstat_p;
	kstat_named_t arcstat_c;
	kstat_named_t arcstat_c_min;
	kstat_named_t arcstat_c_max;
	kstat_named_t arcstat_size;
	/*
	 * Number of bytes consumed by internal ARC structures necessary
	 * for tracking purposes; these structures are not actually
	 * backed by ARC buffers. This includes arc_buf_hdr_t structures
	 * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
	 * caches), and arc_buf_t structures (allocated via arc_buf_t
	 * cache).
	 */
	kstat_named_t arcstat_hdr_size;
	/*
	 * Number of bytes consumed by ARC buffers of type equal to
	 * ARC_BUFC_DATA. This is generally consumed by buffers backing
	 * on disk user data (e.g. plain file contents).
403 */ 404 kstat_named_t arcstat_data_size; 405 /* 406 * Number of bytes consumed by ARC buffers of type equal to 407 * ARC_BUFC_METADATA. This is generally consumed by buffers 408 * backing on disk data that is used for internal ZFS 409 * structures (e.g. ZAP, dnode, indirect blocks, etc). 410 */ 411 kstat_named_t arcstat_metadata_size; 412 /* 413 * Number of bytes consumed by various buffers and structures 414 * not actually backed with ARC buffers. This includes bonus 415 * buffers (allocated directly via zio_buf_* functions), 416 * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t 417 * cache), and dnode_t structures (allocated via dnode_t cache). 418 */ 419 kstat_named_t arcstat_other_size; 420 /* 421 * Total number of bytes consumed by ARC buffers residing in the 422 * arc_anon state. This includes *all* buffers in the arc_anon 423 * state; e.g. data, metadata, evictable, and unevictable buffers 424 * are all included in this value. 425 */ 426 kstat_named_t arcstat_anon_size; 427 /* 428 * Number of bytes consumed by ARC buffers that meet the 429 * following criteria: backing buffers of type ARC_BUFC_DATA, 430 * residing in the arc_anon state, and are eligible for eviction 431 * (e.g. have no outstanding holds on the buffer). 432 */ 433 kstat_named_t arcstat_anon_evictable_data; 434 /* 435 * Number of bytes consumed by ARC buffers that meet the 436 * following criteria: backing buffers of type ARC_BUFC_METADATA, 437 * residing in the arc_anon state, and are eligible for eviction 438 * (e.g. have no outstanding holds on the buffer). 439 */ 440 kstat_named_t arcstat_anon_evictable_metadata; 441 /* 442 * Total number of bytes consumed by ARC buffers residing in the 443 * arc_mru state. This includes *all* buffers in the arc_mru 444 * state; e.g. data, metadata, evictable, and unevictable buffers 445 * are all included in this value. 446 */ 447 kstat_named_t arcstat_mru_size; 448 /* 449 * Number of bytes consumed by ARC buffers that meet the 450 * following criteria: backing buffers of type ARC_BUFC_DATA, 451 * residing in the arc_mru state, and are eligible for eviction 452 * (e.g. have no outstanding holds on the buffer). 453 */ 454 kstat_named_t arcstat_mru_evictable_data; 455 /* 456 * Number of bytes consumed by ARC buffers that meet the 457 * following criteria: backing buffers of type ARC_BUFC_METADATA, 458 * residing in the arc_mru state, and are eligible for eviction 459 * (e.g. have no outstanding holds on the buffer). 460 */ 461 kstat_named_t arcstat_mru_evictable_metadata; 462 /* 463 * Total number of bytes that *would have been* consumed by ARC 464 * buffers in the arc_mru_ghost state. The key thing to note 465 * here, is the fact that this size doesn't actually indicate 466 * RAM consumption. The ghost lists only consist of headers and 467 * don't actually have ARC buffers linked off of these headers. 468 * Thus, *if* the headers had associated ARC buffers, these 469 * buffers *would have* consumed this number of bytes. 470 */ 471 kstat_named_t arcstat_mru_ghost_size; 472 /* 473 * Number of bytes that *would have been* consumed by ARC 474 * buffers that are eligible for eviction, of type 475 * ARC_BUFC_DATA, and linked off the arc_mru_ghost state. 476 */ 477 kstat_named_t arcstat_mru_ghost_evictable_data; 478 /* 479 * Number of bytes that *would have been* consumed by ARC 480 * buffers that are eligible for eviction, of type 481 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. 
482 */ 483 kstat_named_t arcstat_mru_ghost_evictable_metadata; 484 /* 485 * Total number of bytes consumed by ARC buffers residing in the 486 * arc_mfu state. This includes *all* buffers in the arc_mfu 487 * state; e.g. data, metadata, evictable, and unevictable buffers 488 * are all included in this value. 489 */ 490 kstat_named_t arcstat_mfu_size; 491 /* 492 * Number of bytes consumed by ARC buffers that are eligible for 493 * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu 494 * state. 495 */ 496 kstat_named_t arcstat_mfu_evictable_data; 497 /* 498 * Number of bytes consumed by ARC buffers that are eligible for 499 * eviction, of type ARC_BUFC_METADATA, and reside in the 500 * arc_mfu state. 501 */ 502 kstat_named_t arcstat_mfu_evictable_metadata; 503 /* 504 * Total number of bytes that *would have been* consumed by ARC 505 * buffers in the arc_mfu_ghost state. See the comment above 506 * arcstat_mru_ghost_size for more details. 507 */ 508 kstat_named_t arcstat_mfu_ghost_size; 509 /* 510 * Number of bytes that *would have been* consumed by ARC 511 * buffers that are eligible for eviction, of type 512 * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state. 513 */ 514 kstat_named_t arcstat_mfu_ghost_evictable_data; 515 /* 516 * Number of bytes that *would have been* consumed by ARC 517 * buffers that are eligible for eviction, of type 518 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. 519 */ 520 kstat_named_t arcstat_mfu_ghost_evictable_metadata; 521 kstat_named_t arcstat_l2_hits; 522 kstat_named_t arcstat_l2_misses; 523 kstat_named_t arcstat_l2_feeds; 524 kstat_named_t arcstat_l2_rw_clash; 525 kstat_named_t arcstat_l2_read_bytes; 526 kstat_named_t arcstat_l2_write_bytes; 527 kstat_named_t arcstat_l2_writes_sent; 528 kstat_named_t arcstat_l2_writes_done; 529 kstat_named_t arcstat_l2_writes_error; 530 kstat_named_t arcstat_l2_writes_hdr_miss; 531 kstat_named_t arcstat_l2_evict_lock_retry; 532 kstat_named_t arcstat_l2_evict_reading; 533 kstat_named_t arcstat_l2_evict_l1cached; 534 kstat_named_t arcstat_l2_free_on_write; 535 kstat_named_t arcstat_l2_cdata_free_on_write; 536 kstat_named_t arcstat_l2_abort_lowmem; 537 kstat_named_t arcstat_l2_cksum_bad; 538 kstat_named_t arcstat_l2_io_error; 539 kstat_named_t arcstat_l2_size; 540 kstat_named_t arcstat_l2_asize; 541 kstat_named_t arcstat_l2_hdr_size; 542 kstat_named_t arcstat_l2_compress_successes; 543 kstat_named_t arcstat_l2_compress_zeros; 544 kstat_named_t arcstat_l2_compress_failures; 545 kstat_named_t arcstat_l2_write_trylock_fail; 546 kstat_named_t arcstat_l2_write_passed_headroom; 547 kstat_named_t arcstat_l2_write_spa_mismatch; 548 kstat_named_t arcstat_l2_write_in_l2; 549 kstat_named_t arcstat_l2_write_hdr_io_in_progress; 550 kstat_named_t arcstat_l2_write_not_cacheable; 551 kstat_named_t arcstat_l2_write_full; 552 kstat_named_t arcstat_l2_write_buffer_iter; 553 kstat_named_t arcstat_l2_write_pios; 554 kstat_named_t arcstat_l2_write_buffer_bytes_scanned; 555 kstat_named_t arcstat_l2_write_buffer_list_iter; 556 kstat_named_t arcstat_l2_write_buffer_list_null_iter; 557 kstat_named_t arcstat_memory_throttle_count; 558 kstat_named_t arcstat_duplicate_buffers; 559 kstat_named_t arcstat_duplicate_buffers_size; 560 kstat_named_t arcstat_duplicate_reads; 561 kstat_named_t arcstat_meta_used; 562 kstat_named_t arcstat_meta_limit; 563 kstat_named_t arcstat_meta_max; 564 kstat_named_t arcstat_meta_min; 565} arc_stats_t; 566 567static arc_stats_t arc_stats = { 568 { "hits", KSTAT_DATA_UINT64 }, 569 { "misses", 
KSTAT_DATA_UINT64 }, 570 { "demand_data_hits", KSTAT_DATA_UINT64 }, 571 { "demand_data_misses", KSTAT_DATA_UINT64 }, 572 { "demand_metadata_hits", KSTAT_DATA_UINT64 }, 573 { "demand_metadata_misses", KSTAT_DATA_UINT64 }, 574 { "prefetch_data_hits", KSTAT_DATA_UINT64 }, 575 { "prefetch_data_misses", KSTAT_DATA_UINT64 }, 576 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, 577 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, 578 { "mru_hits", KSTAT_DATA_UINT64 }, 579 { "mru_ghost_hits", KSTAT_DATA_UINT64 }, 580 { "mfu_hits", KSTAT_DATA_UINT64 }, 581 { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, 582 { "allocated", KSTAT_DATA_UINT64 }, 583 { "deleted", KSTAT_DATA_UINT64 }, 584 { "stolen", KSTAT_DATA_UINT64 }, 585 { "recycle_miss", KSTAT_DATA_UINT64 }, 586 { "mutex_miss", KSTAT_DATA_UINT64 }, 587 { "evict_skip", KSTAT_DATA_UINT64 }, 588 { "evict_l2_cached", KSTAT_DATA_UINT64 }, 589 { "evict_l2_eligible", KSTAT_DATA_UINT64 }, 590 { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, 591 { "hash_elements", KSTAT_DATA_UINT64 }, 592 { "hash_elements_max", KSTAT_DATA_UINT64 }, 593 { "hash_collisions", KSTAT_DATA_UINT64 }, 594 { "hash_chains", KSTAT_DATA_UINT64 }, 595 { "hash_chain_max", KSTAT_DATA_UINT64 }, 596 { "p", KSTAT_DATA_UINT64 }, 597 { "c", KSTAT_DATA_UINT64 }, 598 { "c_min", KSTAT_DATA_UINT64 }, 599 { "c_max", KSTAT_DATA_UINT64 }, 600 { "size", KSTAT_DATA_UINT64 }, 601 { "hdr_size", KSTAT_DATA_UINT64 }, 602 { "data_size", KSTAT_DATA_UINT64 }, 603 { "metadata_size", KSTAT_DATA_UINT64 }, 604 { "other_size", KSTAT_DATA_UINT64 }, 605 { "anon_size", KSTAT_DATA_UINT64 }, 606 { "anon_evictable_data", KSTAT_DATA_UINT64 }, 607 { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, 608 { "mru_size", KSTAT_DATA_UINT64 }, 609 { "mru_evictable_data", KSTAT_DATA_UINT64 }, 610 { "mru_evictable_metadata", KSTAT_DATA_UINT64 }, 611 { "mru_ghost_size", KSTAT_DATA_UINT64 }, 612 { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 }, 613 { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, 614 { "mfu_size", KSTAT_DATA_UINT64 }, 615 { "mfu_evictable_data", KSTAT_DATA_UINT64 }, 616 { "mfu_evictable_metadata", KSTAT_DATA_UINT64 }, 617 { "mfu_ghost_size", KSTAT_DATA_UINT64 }, 618 { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 }, 619 { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, 620 { "l2_hits", KSTAT_DATA_UINT64 }, 621 { "l2_misses", KSTAT_DATA_UINT64 }, 622 { "l2_feeds", KSTAT_DATA_UINT64 }, 623 { "l2_rw_clash", KSTAT_DATA_UINT64 }, 624 { "l2_read_bytes", KSTAT_DATA_UINT64 }, 625 { "l2_write_bytes", KSTAT_DATA_UINT64 }, 626 { "l2_writes_sent", KSTAT_DATA_UINT64 }, 627 { "l2_writes_done", KSTAT_DATA_UINT64 }, 628 { "l2_writes_error", KSTAT_DATA_UINT64 }, 629 { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, 630 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, 631 { "l2_evict_reading", KSTAT_DATA_UINT64 }, 632 { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, 633 { "l2_free_on_write", KSTAT_DATA_UINT64 }, 634 { "l2_cdata_free_on_write", KSTAT_DATA_UINT64 }, 635 { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, 636 { "l2_cksum_bad", KSTAT_DATA_UINT64 }, 637 { "l2_io_error", KSTAT_DATA_UINT64 }, 638 { "l2_size", KSTAT_DATA_UINT64 }, 639 { "l2_asize", KSTAT_DATA_UINT64 }, 640 { "l2_hdr_size", KSTAT_DATA_UINT64 }, 641 { "l2_compress_successes", KSTAT_DATA_UINT64 }, 642 { "l2_compress_zeros", KSTAT_DATA_UINT64 }, 643 { "l2_compress_failures", KSTAT_DATA_UINT64 }, 644 { "l2_write_trylock_fail", KSTAT_DATA_UINT64 }, 645 { "l2_write_passed_headroom", KSTAT_DATA_UINT64 }, 646 { "l2_write_spa_mismatch", KSTAT_DATA_UINT64 }, 647 { "l2_write_in_l2", 
KSTAT_DATA_UINT64 }, 648 { "l2_write_io_in_progress", KSTAT_DATA_UINT64 }, 649 { "l2_write_not_cacheable", KSTAT_DATA_UINT64 }, 650 { "l2_write_full", KSTAT_DATA_UINT64 }, 651 { "l2_write_buffer_iter", KSTAT_DATA_UINT64 }, 652 { "l2_write_pios", KSTAT_DATA_UINT64 }, 653 { "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 }, 654 { "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 }, 655 { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 }, 656 { "memory_throttle_count", KSTAT_DATA_UINT64 }, 657 { "duplicate_buffers", KSTAT_DATA_UINT64 }, 658 { "duplicate_buffers_size", KSTAT_DATA_UINT64 }, 659 { "duplicate_reads", KSTAT_DATA_UINT64 }, 660 { "arc_meta_used", KSTAT_DATA_UINT64 }, 661 { "arc_meta_limit", KSTAT_DATA_UINT64 }, 662 { "arc_meta_max", KSTAT_DATA_UINT64 }, 663 { "arc_meta_min", KSTAT_DATA_UINT64 } 664}; 665 666#define ARCSTAT(stat) (arc_stats.stat.value.ui64) 667 668#define ARCSTAT_INCR(stat, val) \ 669 atomic_add_64(&arc_stats.stat.value.ui64, (val)) 670 671#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) 672#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) 673 674#define ARCSTAT_MAX(stat, val) { \ 675 uint64_t m; \ 676 while ((val) > (m = arc_stats.stat.value.ui64) && \ 677 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ 678 continue; \ 679} 680 681#define ARCSTAT_MAXSTAT(stat) \ 682 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) 683 684/* 685 * We define a macro to allow ARC hits/misses to be easily broken down by 686 * two separate conditions, giving a total of four different subtypes for 687 * each of hits and misses (so eight statistics total). 688 */ 689#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ 690 if (cond1) { \ 691 if (cond2) { \ 692 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ 693 } else { \ 694 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ 695 } \ 696 } else { \ 697 if (cond2) { \ 698 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ 699 } else { \ 700 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ 701 } \ 702 } 703 704kstat_t *arc_ksp; 705static arc_state_t *arc_anon; 706static arc_state_t *arc_mru; 707static arc_state_t *arc_mru_ghost; 708static arc_state_t *arc_mfu; 709static arc_state_t *arc_mfu_ghost; 710static arc_state_t *arc_l2c_only; 711 712/* 713 * There are several ARC variables that are critical to export as kstats -- 714 * but we don't want to have to grovel around in the kstat whenever we wish to 715 * manipulate them. For these variables, we therefore define them to be in 716 * terms of the statistic variable. This assures that we are not introducing 717 * the possibility of inconsistency by having shadow copies of the variables, 718 * while still allowing the code to be readable. 
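 *
 * For example, "arc_size" below expands to arc_stats.arcstat_size.value.ui64,
 * so an update such as
 *
 *	atomic_add_64(&arc_size, space);
 *
 * takes effect directly on the value exported through the arcstats kstat,
 * with no separate shadow variable to keep in sync.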
 */
#define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
#define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
#define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
#define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
#define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
#define	arc_meta_limit	ARCSTAT(arcstat_meta_limit) /* max size for metadata */
#define	arc_meta_min	ARCSTAT(arcstat_meta_min) /* min size for metadata */
#define	arc_meta_used	ARCSTAT(arcstat_meta_used) /* size of metadata */
#define	arc_meta_max	ARCSTAT(arcstat_meta_max) /* max size of metadata */

#define	L2ARC_IS_VALID_COMPRESS(_c_) \
	((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)

static int		arc_no_grow;	/* Don't try to grow cache size */
static uint64_t		arc_tempreserve;
static uint64_t		arc_loaned_bytes;

typedef struct arc_callback arc_callback_t;

struct arc_callback {
	void			*acb_private;
	arc_done_func_t		*acb_done;
	arc_buf_t		*acb_buf;
	zio_t			*acb_zio_dummy;
	arc_callback_t		*acb_next;
};

typedef struct arc_write_callback arc_write_callback_t;

struct arc_write_callback {
	void		*awcb_private;
	arc_done_func_t	*awcb_ready;
	arc_done_func_t	*awcb_physdone;
	arc_done_func_t	*awcb_done;
	arc_buf_t	*awcb_buf;
};

/*
 * ARC buffers are separated into multiple structs as a memory saving measure:
 *   - Common fields struct, always defined, and embedded within it:
 *       - L2-only fields, always allocated but undefined when not in L2ARC
 *       - L1-only fields, only allocated when in L1ARC
 *
 *           Buffer in L1                     Buffer only in L2
 *    +------------------------+          +------------------------+
 *    | arc_buf_hdr_t          |          | arc_buf_hdr_t          |
 *    |                        |          |                        |
 *    |                        |          |                        |
 *    |                        |          |                        |
 *    +------------------------+          +------------------------+
 *    | l2arc_buf_hdr_t        |          | l2arc_buf_hdr_t        |
 *    | (undefined if L1-only) |          |                        |
 *    +------------------------+          +------------------------+
 *    | l1arc_buf_hdr_t        |
 *    |                        |
 *    |                        |
 *    |                        |
 *    |                        |
 *    +------------------------+
 *
 * Because it's possible for the L2ARC to become extremely large, we can wind
 * up eating a lot of memory in L2ARC buffer headers, so the size of a header
 * is minimized by only allocating the fields necessary for an L1-cached buffer
 * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr
 * and l2arc_buf_hdr) are embedded rather than allocated separately to save a
 * couple of words of pointer overhead. arc_hdr_realloc() is used to switch a
 * header between these two allocation states.
 */
typedef struct l1arc_buf_hdr {
	kmutex_t		b_freeze_lock;
#ifdef ZFS_DEBUG
	/*
	 * used for debugging with kmem_flags - by allocating and freeing
	 * b_thawed when the buffer is thawed, we get a record of the stack
	 * trace that thawed it.
795 */ 796 void *b_thawed; 797#endif 798 799 arc_buf_t *b_buf; 800 uint32_t b_datacnt; 801 /* for waiting on writes to complete */ 802 kcondvar_t b_cv; 803 804 /* protected by arc state mutex */ 805 arc_state_t *b_state; 806 list_node_t b_arc_node; 807 808 /* updated atomically */ 809 clock_t b_arc_access; 810 811 /* self protecting */ 812 refcount_t b_refcnt; 813 814 arc_callback_t *b_acb; 815 /* temporary buffer holder for in-flight compressed data */ 816 void *b_tmp_cdata; 817} l1arc_buf_hdr_t; 818 819typedef struct l2arc_dev l2arc_dev_t; 820 821typedef struct l2arc_buf_hdr { 822 /* protected by arc_buf_hdr mutex */ 823 l2arc_dev_t *b_dev; /* L2ARC device */ 824 uint64_t b_daddr; /* disk address, offset byte */ 825 /* real alloc'd buffer size depending on b_compress applied */ 826 int32_t b_asize; 827 828 list_node_t b_l2node; 829} l2arc_buf_hdr_t; 830 831struct arc_buf_hdr { 832 /* protected by hash lock */ 833 dva_t b_dva; 834 uint64_t b_birth; 835 /* 836 * Even though this checksum is only set/verified when a buffer is in 837 * the L1 cache, it needs to be in the set of common fields because it 838 * must be preserved from the time before a buffer is written out to 839 * L2ARC until after it is read back in. 840 */ 841 zio_cksum_t *b_freeze_cksum; 842 843 arc_buf_hdr_t *b_hash_next; 844 arc_flags_t b_flags; 845 846 /* immutable */ 847 int32_t b_size; 848 uint64_t b_spa; 849 850 /* L2ARC fields. Undefined when not in L2ARC. */ 851 l2arc_buf_hdr_t b_l2hdr; 852 /* L1ARC fields. Undefined when in l2arc_only state */ 853 l1arc_buf_hdr_t b_l1hdr; 854}; 855 856#ifdef _KERNEL 857static int 858sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS) 859{ 860 uint64_t val; 861 int err; 862 863 val = arc_meta_limit; 864 err = sysctl_handle_64(oidp, &val, 0, req); 865 if (err != 0 || req->newptr == NULL) 866 return (err); 867 868 if (val <= 0 || val > arc_c_max) 869 return (EINVAL); 870 871 arc_meta_limit = val; 872 return (0); 873} 874#endif 875 876static arc_buf_t *arc_eviction_list; 877static kmutex_t arc_eviction_mtx; 878static arc_buf_hdr_t arc_eviction_hdr; 879 880#define GHOST_STATE(state) \ 881 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ 882 (state) == arc_l2c_only) 883 884#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE) 885#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) 886#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) 887#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) 888#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ) 889#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE) 890 891#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) 892#define HDR_L2COMPRESS(hdr) ((hdr)->b_flags & ARC_FLAG_L2COMPRESS) 893#define HDR_L2_READING(hdr) \ 894 (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ 895 ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) 896#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) 897#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) 898#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) 899 900#define HDR_ISTYPE_METADATA(hdr) \ 901 ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) 902#define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) 903 904#define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) 905#define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) 906 907/* For storing compression mode in b_flags */ 908#define HDR_COMPRESS_OFFSET 24 909#define HDR_COMPRESS_NBITS 7 910 
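/*
 * That is, the zio_compress value occupies bits 24..30 of b_flags; the
 * HDR_GET_COMPRESS()/HDR_SET_COMPRESS() macros below simply mask and shift
 * within that 7-bit field via BF32_GET()/BF32_SET().
 */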
911#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET(hdr->b_flags, \ 912 HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS)) 913#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET(hdr->b_flags, \ 914 HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS, (cmp)) 915 916/* 917 * Other sizes 918 */ 919 920#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) 921#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) 922 923/* 924 * Hash table routines 925 */ 926 927#define HT_LOCK_PAD CACHE_LINE_SIZE 928 929struct ht_lock { 930 kmutex_t ht_lock; 931#ifdef _KERNEL 932 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; 933#endif 934}; 935 936#define BUF_LOCKS 256 937typedef struct buf_hash_table { 938 uint64_t ht_mask; 939 arc_buf_hdr_t **ht_table; 940 struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE); 941} buf_hash_table_t; 942 943static buf_hash_table_t buf_hash_table; 944 945#define BUF_HASH_INDEX(spa, dva, birth) \ 946 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) 947#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) 948#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) 949#define HDR_LOCK(hdr) \ 950 (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) 951 952uint64_t zfs_crc64_table[256]; 953 954/* 955 * Level 2 ARC 956 */ 957 958#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ 959#define L2ARC_HEADROOM 2 /* num of writes */ 960/* 961 * If we discover during ARC scan any buffers to be compressed, we boost 962 * our headroom for the next scanning cycle by this percentage multiple. 963 */ 964#define L2ARC_HEADROOM_BOOST 200 965#define L2ARC_FEED_SECS 1 /* caching interval secs */ 966#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ 967 968#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) 969#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) 970 971/* L2ARC Performance Tunables */ 972uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ 973uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ 974uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ 975uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; 976uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ 977uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ 978boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ 979boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ 980boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ 981 982SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW, 983 &l2arc_write_max, 0, "max write size"); 984SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW, 985 &l2arc_write_boost, 0, "extra write during warmup"); 986SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW, 987 &l2arc_headroom, 0, "number of dev writes"); 988SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW, 989 &l2arc_feed_secs, 0, "interval seconds"); 990SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW, 991 &l2arc_feed_min_ms, 0, "min interval milliseconds"); 992 993SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW, 994 &l2arc_noprefetch, 0, "don't cache prefetch bufs"); 995SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW, 996 &l2arc_feed_again, 0, "turbo warmup"); 997SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW, 998 &l2arc_norw, 0, "no reads during writes"); 999 1000SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD, 1001 
&ARC_anon.arcs_size, 0, "size of anonymous state"); 1002SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD, 1003 &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous state"); 1004SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD, 1005 &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous state"); 1006 1007SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD, 1008 &ARC_mru.arcs_size, 0, "size of mru state"); 1009SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD, 1010 &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state"); 1011SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD, 1012 &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state"); 1013 1014SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD, 1015 &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state"); 1016SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD, 1017 &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0, 1018 "size of metadata in mru ghost state"); 1019SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD, 1020 &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0, 1021 "size of data in mru ghost state"); 1022 1023SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD, 1024 &ARC_mfu.arcs_size, 0, "size of mfu state"); 1025SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD, 1026 &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state"); 1027SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD, 1028 &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state"); 1029 1030SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD, 1031 &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state"); 1032SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD, 1033 &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0, 1034 "size of metadata in mfu ghost state"); 1035SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD, 1036 &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0, 1037 "size of data in mfu ghost state"); 1038 1039SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, 1040 &ARC_l2c_only.arcs_size, 0, "size of mru state"); 1041 1042/* 1043 * L2ARC Internals 1044 */ 1045struct l2arc_dev { 1046 vdev_t *l2ad_vdev; /* vdev */ 1047 spa_t *l2ad_spa; /* spa */ 1048 uint64_t l2ad_hand; /* next write location */ 1049 uint64_t l2ad_start; /* first addr on device */ 1050 uint64_t l2ad_end; /* last addr on device */ 1051 uint64_t l2ad_evict; /* last addr eviction reached */ 1052 boolean_t l2ad_first; /* first sweep through */ 1053 boolean_t l2ad_writing; /* currently writing */ 1054 kmutex_t l2ad_mtx; /* lock for buffer list */ 1055 list_t l2ad_buflist; /* buffer list */ 1056 list_node_t l2ad_node; /* device list node */ 1057}; 1058 1059static list_t L2ARC_dev_list; /* device list */ 1060static list_t *l2arc_dev_list; /* device list pointer */ 1061static kmutex_t l2arc_dev_mtx; /* device list mutex */ 1062static l2arc_dev_t *l2arc_dev_last; /* last device used */ 1063static list_t L2ARC_free_on_write; /* free after write buf list */ 1064static list_t *l2arc_free_on_write; /* free after write list ptr */ 1065static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ 1066static uint64_t l2arc_ndev; /* number of devices */ 1067 1068typedef struct l2arc_read_callback { 1069 arc_buf_t *l2rcb_buf; /* read buffer */ 1070 spa_t *l2rcb_spa; /* spa */ 1071 blkptr_t l2rcb_bp; /* original blkptr */ 1072 zbookmark_phys_t l2rcb_zb; /* original bookmark */ 1073 int 
l2rcb_flags; /* original flags */ 1074 enum zio_compress l2rcb_compress; /* applied compress */ 1075} l2arc_read_callback_t; 1076 1077typedef struct l2arc_write_callback { 1078 l2arc_dev_t *l2wcb_dev; /* device info */ 1079 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ 1080} l2arc_write_callback_t; 1081 1082typedef struct l2arc_data_free { 1083 /* protected by l2arc_free_on_write_mtx */ 1084 void *l2df_data; 1085 size_t l2df_size; 1086 void (*l2df_func)(void *, size_t); 1087 list_node_t l2df_list_node; 1088} l2arc_data_free_t; 1089 1090static kmutex_t l2arc_feed_thr_lock; 1091static kcondvar_t l2arc_feed_thr_cv; 1092static uint8_t l2arc_thread_exit; 1093 1094static void arc_get_data_buf(arc_buf_t *); 1095static void arc_access(arc_buf_hdr_t *, kmutex_t *); 1096static int arc_evict_needed(arc_buf_contents_t); 1097static void arc_evict_ghost(arc_state_t *, uint64_t, int64_t); 1098static void arc_buf_watch(arc_buf_t *); 1099 1100static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); 1101static uint32_t arc_bufc_to_flags(arc_buf_contents_t); 1102 1103static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); 1104static void l2arc_read_done(zio_t *); 1105 1106static boolean_t l2arc_compress_buf(arc_buf_hdr_t *); 1107static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress); 1108static void l2arc_release_cdata_buf(arc_buf_hdr_t *); 1109 1110static uint64_t 1111buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) 1112{ 1113 uint8_t *vdva = (uint8_t *)dva; 1114 uint64_t crc = -1ULL; 1115 int i; 1116 1117 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 1118 1119 for (i = 0; i < sizeof (dva_t); i++) 1120 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; 1121 1122 crc ^= (spa>>8) ^ birth; 1123 1124 return (crc); 1125} 1126 1127#define BUF_EMPTY(buf) \ 1128 ((buf)->b_dva.dva_word[0] == 0 && \ 1129 (buf)->b_dva.dva_word[1] == 0) 1130 1131#define BUF_EQUAL(spa, dva, birth, buf) \ 1132 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 1133 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 1134 ((buf)->b_birth == birth) && ((buf)->b_spa == spa) 1135 1136static void 1137buf_discard_identity(arc_buf_hdr_t *hdr) 1138{ 1139 hdr->b_dva.dva_word[0] = 0; 1140 hdr->b_dva.dva_word[1] = 0; 1141 hdr->b_birth = 0; 1142} 1143 1144static arc_buf_hdr_t * 1145buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) 1146{ 1147 const dva_t *dva = BP_IDENTITY(bp); 1148 uint64_t birth = BP_PHYSICAL_BIRTH(bp); 1149 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 1150 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 1151 arc_buf_hdr_t *hdr; 1152 1153 mutex_enter(hash_lock); 1154 for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; 1155 hdr = hdr->b_hash_next) { 1156 if (BUF_EQUAL(spa, dva, birth, hdr)) { 1157 *lockp = hash_lock; 1158 return (hdr); 1159 } 1160 } 1161 mutex_exit(hash_lock); 1162 *lockp = NULL; 1163 return (NULL); 1164} 1165 1166/* 1167 * Insert an entry into the hash table. If there is already an element 1168 * equal to elem in the hash table, then the already existing element 1169 * will be returned and the new element will not be inserted. 1170 * Otherwise returns NULL. 1171 * If lockp == NULL, the caller is assumed to already hold the hash lock. 
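 *
 * A sketch of the typical miss-path usage (illustrative, simplified from the
 * read path): allocate a header for the missed block, then
 *
 *	exists = buf_hash_insert(hdr, &hash_lock);
 *	if (exists != NULL) {
 *		(another thread inserted the same block first - use theirs)
 *	}
 *	...
 *	mutex_exit(hash_lock);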
1172 */ 1173static arc_buf_hdr_t * 1174buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) 1175{ 1176 uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1177 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 1178 arc_buf_hdr_t *fhdr; 1179 uint32_t i; 1180 1181 ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); 1182 ASSERT(hdr->b_birth != 0); 1183 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 1184 1185 if (lockp != NULL) { 1186 *lockp = hash_lock; 1187 mutex_enter(hash_lock); 1188 } else { 1189 ASSERT(MUTEX_HELD(hash_lock)); 1190 } 1191 1192 for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; 1193 fhdr = fhdr->b_hash_next, i++) { 1194 if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) 1195 return (fhdr); 1196 } 1197 1198 hdr->b_hash_next = buf_hash_table.ht_table[idx]; 1199 buf_hash_table.ht_table[idx] = hdr; 1200 hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; 1201 1202 /* collect some hash table performance data */ 1203 if (i > 0) { 1204 ARCSTAT_BUMP(arcstat_hash_collisions); 1205 if (i == 1) 1206 ARCSTAT_BUMP(arcstat_hash_chains); 1207 1208 ARCSTAT_MAX(arcstat_hash_chain_max, i); 1209 } 1210 1211 ARCSTAT_BUMP(arcstat_hash_elements); 1212 ARCSTAT_MAXSTAT(arcstat_hash_elements); 1213 1214 return (NULL); 1215} 1216 1217static void 1218buf_hash_remove(arc_buf_hdr_t *hdr) 1219{ 1220 arc_buf_hdr_t *fhdr, **hdrp; 1221 uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1222 1223 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 1224 ASSERT(HDR_IN_HASH_TABLE(hdr)); 1225 1226 hdrp = &buf_hash_table.ht_table[idx]; 1227 while ((fhdr = *hdrp) != hdr) { 1228 ASSERT(fhdr != NULL); 1229 hdrp = &fhdr->b_hash_next; 1230 } 1231 *hdrp = hdr->b_hash_next; 1232 hdr->b_hash_next = NULL; 1233 hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE; 1234 1235 /* collect some hash table performance data */ 1236 ARCSTAT_BUMPDOWN(arcstat_hash_elements); 1237 1238 if (buf_hash_table.ht_table[idx] && 1239 buf_hash_table.ht_table[idx]->b_hash_next == NULL) 1240 ARCSTAT_BUMPDOWN(arcstat_hash_chains); 1241} 1242 1243/* 1244 * Global data structures and functions for the buf kmem cache. 1245 */ 1246static kmem_cache_t *hdr_full_cache; 1247static kmem_cache_t *hdr_l2only_cache; 1248static kmem_cache_t *buf_cache; 1249 1250static void 1251buf_fini(void) 1252{ 1253 int i; 1254 1255 kmem_free(buf_hash_table.ht_table, 1256 (buf_hash_table.ht_mask + 1) * sizeof (void *)); 1257 for (i = 0; i < BUF_LOCKS; i++) 1258 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 1259 kmem_cache_destroy(hdr_full_cache); 1260 kmem_cache_destroy(hdr_l2only_cache); 1261 kmem_cache_destroy(buf_cache); 1262} 1263 1264/* 1265 * Constructor callback - called when the cache is empty 1266 * and a new buf is requested. 
1267 */ 1268/* ARGSUSED */ 1269static int 1270hdr_full_cons(void *vbuf, void *unused, int kmflag) 1271{ 1272 arc_buf_hdr_t *hdr = vbuf; 1273 1274 bzero(hdr, HDR_FULL_SIZE); 1275 cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); 1276 refcount_create(&hdr->b_l1hdr.b_refcnt); 1277 mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); 1278 arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); 1279 1280 return (0); 1281} 1282 1283/* ARGSUSED */ 1284static int 1285hdr_l2only_cons(void *vbuf, void *unused, int kmflag) 1286{ 1287 arc_buf_hdr_t *hdr = vbuf; 1288 1289 bzero(hdr, HDR_L2ONLY_SIZE); 1290 arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); 1291 1292 return (0); 1293} 1294 1295/* ARGSUSED */ 1296static int 1297buf_cons(void *vbuf, void *unused, int kmflag) 1298{ 1299 arc_buf_t *buf = vbuf; 1300 1301 bzero(buf, sizeof (arc_buf_t)); 1302 mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); 1303 arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); 1304 1305 return (0); 1306} 1307 1308/* 1309 * Destructor callback - called when a cached buf is 1310 * no longer required. 1311 */ 1312/* ARGSUSED */ 1313static void 1314hdr_full_dest(void *vbuf, void *unused) 1315{ 1316 arc_buf_hdr_t *hdr = vbuf; 1317 1318 ASSERT(BUF_EMPTY(hdr)); 1319 cv_destroy(&hdr->b_l1hdr.b_cv); 1320 refcount_destroy(&hdr->b_l1hdr.b_refcnt); 1321 mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); 1322 arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); 1323} 1324 1325/* ARGSUSED */ 1326static void 1327hdr_l2only_dest(void *vbuf, void *unused) 1328{ 1329 arc_buf_hdr_t *hdr = vbuf; 1330 1331 ASSERT(BUF_EMPTY(hdr)); 1332 arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); 1333} 1334 1335/* ARGSUSED */ 1336static void 1337buf_dest(void *vbuf, void *unused) 1338{ 1339 arc_buf_t *buf = vbuf; 1340 1341 mutex_destroy(&buf->b_evict_lock); 1342 arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); 1343} 1344 1345/* 1346 * Reclaim callback -- invoked when memory is low. 1347 */ 1348/* ARGSUSED */ 1349static void 1350hdr_recl(void *unused) 1351{ 1352 dprintf("hdr_recl called\n"); 1353 /* 1354 * umem calls the reclaim func when we destroy the buf cache, 1355 * which is after we do arc_fini(). 1356 */ 1357 if (!arc_dead) 1358 cv_signal(&arc_reclaim_thr_cv); 1359} 1360 1361static void 1362buf_init(void) 1363{ 1364 uint64_t *ct; 1365 uint64_t hsize = 1ULL << 12; 1366 int i, j; 1367 1368 /* 1369 * The hash table is big enough to fill all of physical memory 1370 * with an average block size of zfs_arc_average_blocksize (default 8K). 1371 * By default, the table will take up 1372 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). 
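 *
 * For example, with 16 GB of physical memory and the default 8 KB average
 * block size, the loop below settles on 2^21 buckets (16 GB / 8 KB), i.e.
 * a 16 MB array of bucket pointers on a 64-bit kernel.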
1373 */ 1374 while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE) 1375 hsize <<= 1; 1376retry: 1377 buf_hash_table.ht_mask = hsize - 1; 1378 buf_hash_table.ht_table = 1379 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); 1380 if (buf_hash_table.ht_table == NULL) { 1381 ASSERT(hsize > (1ULL << 8)); 1382 hsize >>= 1; 1383 goto retry; 1384 } 1385 1386 hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, 1387 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0); 1388 hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", 1389 HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl, 1390 NULL, NULL, 0); 1391 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 1392 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); 1393 1394 for (i = 0; i < 256; i++) 1395 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 1396 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 1397 1398 for (i = 0; i < BUF_LOCKS; i++) { 1399 mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 1400 NULL, MUTEX_DEFAULT, NULL); 1401 } 1402} 1403 1404/* 1405 * Transition between the two allocation states for the arc_buf_hdr struct. 1406 * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without 1407 * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller 1408 * version is used when a cache buffer is only in the L2ARC in order to reduce 1409 * memory usage. 1410 */ 1411static arc_buf_hdr_t * 1412arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) 1413{ 1414 ASSERT(HDR_HAS_L2HDR(hdr)); 1415 1416 arc_buf_hdr_t *nhdr; 1417 l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; 1418 1419 ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || 1420 (old == hdr_l2only_cache && new == hdr_full_cache)); 1421 1422 nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); 1423 1424 ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); 1425 buf_hash_remove(hdr); 1426 1427 bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); 1428 if (new == hdr_full_cache) { 1429 nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; 1430 /* 1431 * arc_access and arc_change_state need to be aware that a 1432 * header has just come out of L2ARC, so we set its state to 1433 * l2c_only even though it's about to change. 1434 */ 1435 nhdr->b_l1hdr.b_state = arc_l2c_only; 1436 } else { 1437 ASSERT(hdr->b_l1hdr.b_buf == NULL); 1438 ASSERT0(hdr->b_l1hdr.b_datacnt); 1439 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 1440 /* 1441 * We might be removing the L1hdr of a buffer which was just 1442 * written out to L2ARC. If such a buffer is compressed then we 1443 * need to free its b_tmp_cdata before destroying the header. 1444 */ 1445 if (hdr->b_l1hdr.b_tmp_cdata != NULL && 1446 HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) 1447 l2arc_release_cdata_buf(hdr); 1448 nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR; 1449 } 1450 /* 1451 * The header has been reallocated so we need to re-insert it into any 1452 * lists it was on. 1453 */ 1454 (void) buf_hash_insert(nhdr, NULL); 1455 1456 ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); 1457 1458 mutex_enter(&dev->l2ad_mtx); 1459 1460 /* 1461 * We must place the realloc'ed header back into the list at 1462 * the same spot. Otherwise, if it's placed earlier in the list, 1463 * l2arc_write_buffers() could find it during the function's 1464 * write phase, and try to write it out to the l2arc. 
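 *
 * (Note the ordering below: the new header is linked in alongside the old
 * one before the old one is removed, so the buflist never transiently
 * loses track of the buffer.)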
1465 */ 1466 list_insert_after(&dev->l2ad_buflist, hdr, nhdr); 1467 list_remove(&dev->l2ad_buflist, hdr); 1468 1469 mutex_exit(&dev->l2ad_mtx); 1470 1471 buf_discard_identity(hdr); 1472 hdr->b_freeze_cksum = NULL; 1473 kmem_cache_free(old, hdr); 1474 1475 return (nhdr); 1476} 1477 1478 1479#define ARC_MINTIME (hz>>4) /* 62 ms */ 1480 1481static void 1482arc_cksum_verify(arc_buf_t *buf) 1483{ 1484 zio_cksum_t zc; 1485 1486 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1487 return; 1488 1489 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1490 if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) { 1491 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1492 return; 1493 } 1494 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 1495 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) 1496 panic("buffer modified while frozen!"); 1497 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1498} 1499 1500static int 1501arc_cksum_equal(arc_buf_t *buf) 1502{ 1503 zio_cksum_t zc; 1504 int equal; 1505 1506 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1507 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 1508 equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); 1509 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1510 1511 return (equal); 1512} 1513 1514static void 1515arc_cksum_compute(arc_buf_t *buf, boolean_t force) 1516{ 1517 if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) 1518 return; 1519 1520 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1521 if (buf->b_hdr->b_freeze_cksum != NULL) { 1522 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1523 return; 1524 } 1525 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); 1526 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, 1527 buf->b_hdr->b_freeze_cksum); 1528 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1529#ifdef illumos 1530 arc_buf_watch(buf); 1531#endif /* illumos */ 1532} 1533 1534#ifdef illumos 1535#ifndef _KERNEL 1536typedef struct procctl { 1537 long cmd; 1538 prwatch_t prwatch; 1539} procctl_t; 1540#endif 1541 1542/* ARGSUSED */ 1543static void 1544arc_buf_unwatch(arc_buf_t *buf) 1545{ 1546#ifndef _KERNEL 1547 if (arc_watch) { 1548 int result; 1549 procctl_t ctl; 1550 ctl.cmd = PCWATCH; 1551 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1552 ctl.prwatch.pr_size = 0; 1553 ctl.prwatch.pr_wflags = 0; 1554 result = write(arc_procfd, &ctl, sizeof (ctl)); 1555 ASSERT3U(result, ==, sizeof (ctl)); 1556 } 1557#endif 1558} 1559 1560/* ARGSUSED */ 1561static void 1562arc_buf_watch(arc_buf_t *buf) 1563{ 1564#ifndef _KERNEL 1565 if (arc_watch) { 1566 int result; 1567 procctl_t ctl; 1568 ctl.cmd = PCWATCH; 1569 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1570 ctl.prwatch.pr_size = buf->b_hdr->b_size; 1571 ctl.prwatch.pr_wflags = WA_WRITE; 1572 result = write(arc_procfd, &ctl, sizeof (ctl)); 1573 ASSERT3U(result, ==, sizeof (ctl)); 1574 } 1575#endif 1576} 1577#endif /* illumos */ 1578 1579static arc_buf_contents_t 1580arc_buf_type(arc_buf_hdr_t *hdr) 1581{ 1582 if (HDR_ISTYPE_METADATA(hdr)) { 1583 return (ARC_BUFC_METADATA); 1584 } else { 1585 return (ARC_BUFC_DATA); 1586 } 1587} 1588 1589static uint32_t 1590arc_bufc_to_flags(arc_buf_contents_t type) 1591{ 1592 switch (type) { 1593 case ARC_BUFC_DATA: 1594 /* metadata field is 0 if buffer contains normal data */ 1595 return (0); 1596 case ARC_BUFC_METADATA: 1597 return (ARC_FLAG_BUFC_METADATA); 1598 default: 1599 break; 1600 } 1601 panic("undefined ARC buffer type!"); 1602 return ((uint32_t)-1); 1603} 1604 1605void 1606arc_buf_thaw(arc_buf_t 
*buf) 1607{ 1608 if (zfs_flags & ZFS_DEBUG_MODIFY) { 1609 if (buf->b_hdr->b_l1hdr.b_state != arc_anon) 1610 panic("modifying non-anon buffer!"); 1611 if (HDR_IO_IN_PROGRESS(buf->b_hdr)) 1612 panic("modifying buffer while i/o in progress!"); 1613 arc_cksum_verify(buf); 1614 } 1615 1616 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1617 if (buf->b_hdr->b_freeze_cksum != NULL) { 1618 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 1619 buf->b_hdr->b_freeze_cksum = NULL; 1620 } 1621 1622#ifdef ZFS_DEBUG 1623 if (zfs_flags & ZFS_DEBUG_MODIFY) { 1624 if (buf->b_hdr->b_l1hdr.b_thawed != NULL) 1625 kmem_free(buf->b_hdr->b_l1hdr.b_thawed, 1); 1626 buf->b_hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP); 1627 } 1628#endif 1629 1630 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1631 1632#ifdef illumos 1633 arc_buf_unwatch(buf); 1634#endif /* illumos */ 1635} 1636 1637void 1638arc_buf_freeze(arc_buf_t *buf) 1639{ 1640 kmutex_t *hash_lock; 1641 1642 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1643 return; 1644 1645 hash_lock = HDR_LOCK(buf->b_hdr); 1646 mutex_enter(hash_lock); 1647 1648 ASSERT(buf->b_hdr->b_freeze_cksum != NULL || 1649 buf->b_hdr->b_l1hdr.b_state == arc_anon); 1650 arc_cksum_compute(buf, B_FALSE); 1651 mutex_exit(hash_lock); 1652 1653} 1654 1655static void 1656get_buf_info(arc_buf_hdr_t *hdr, arc_state_t *state, list_t **list, kmutex_t **lock) 1657{ 1658 uint64_t buf_hashid = buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1659 1660 if (arc_buf_type(hdr) == ARC_BUFC_METADATA) 1661 buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1); 1662 else { 1663 buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1); 1664 buf_hashid += ARC_BUFC_NUMMETADATALISTS; 1665 } 1666 1667 *list = &state->arcs_lists[buf_hashid]; 1668 *lock = ARCS_LOCK(state, buf_hashid); 1669} 1670 1671 1672static void 1673add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) 1674{ 1675 ASSERT(HDR_HAS_L1HDR(hdr)); 1676 ASSERT(MUTEX_HELD(hash_lock)); 1677 arc_state_t *state = hdr->b_l1hdr.b_state; 1678 1679 if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && 1680 (state != arc_anon)) { 1681 /* We don't use the L2-only state list. */ 1682 if (state != arc_l2c_only) { 1683 uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt; 1684 uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)]; 1685 list_t *list; 1686 kmutex_t *lock; 1687 1688 get_buf_info(hdr, state, &list, &lock); 1689 ASSERT(!MUTEX_HELD(lock)); 1690 mutex_enter(lock); 1691 ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); 1692 list_remove(list, hdr); 1693 if (GHOST_STATE(state)) { 1694 ASSERT0(hdr->b_l1hdr.b_datacnt); 1695 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 1696 delta = hdr->b_size; 1697 } 1698 ASSERT(delta > 0); 1699 ASSERT3U(*size, >=, delta); 1700 atomic_add_64(size, -delta); 1701 mutex_exit(lock); 1702 } 1703 /* remove the prefetch flag if we get a reference */ 1704 hdr->b_flags &= ~ARC_FLAG_PREFETCH; 1705 } 1706} 1707 1708static int 1709remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) 1710{ 1711 int cnt; 1712 arc_state_t *state = hdr->b_l1hdr.b_state; 1713 1714 ASSERT(HDR_HAS_L1HDR(hdr)); 1715 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); 1716 ASSERT(!GHOST_STATE(state)); 1717 1718 /* 1719 * arc_l2c_only counts as a ghost state so we don't need to explicitly 1720 * check to prevent usage of the arc_l2c_only list. 
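 *
 * Roughly speaking (the real macro is defined earlier in this file),
 * GHOST_STATE(state) expands to
 *
 *	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||
 *	 (state) == arc_l2c_only)
 *
 * so the ASSERT(!GHOST_STATE(state)) above already rules out the
 * arc_l2c_only state, and the "state != arc_anon" test below only ever
 * sees arc_anon, arc_mru or arc_mfu.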
1721 */ 1722 if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && 1723 (state != arc_anon)) { 1724 uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)]; 1725 list_t *list; 1726 kmutex_t *lock; 1727 1728 get_buf_info(hdr, state, &list, &lock); 1729 ASSERT(!MUTEX_HELD(lock)); 1730 mutex_enter(lock); 1731 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 1732 list_insert_head(list, hdr); 1733 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 1734 atomic_add_64(size, hdr->b_size * 1735 hdr->b_l1hdr.b_datacnt); 1736 mutex_exit(lock); 1737 } 1738 return (cnt); 1739} 1740 1741/* 1742 * Move the supplied buffer to the indicated state. The mutex 1743 * for the buffer must be held by the caller. 1744 */ 1745static void 1746arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, 1747 kmutex_t *hash_lock) 1748{ 1749 arc_state_t *old_state; 1750 int64_t refcnt; 1751 uint32_t datacnt; 1752 uint64_t from_delta, to_delta; 1753 arc_buf_contents_t buftype = arc_buf_type(hdr); 1754 list_t *list; 1755 kmutex_t *lock; 1756 1757 /* 1758 * We almost always have an L1 hdr here, since we call arc_hdr_realloc() 1759 * in arc_read() when bringing a buffer out of the L2ARC. However, the 1760 * L1 hdr doesn't always exist when we change state to arc_anon before 1761 * destroying a header, in which case reallocating to add the L1 hdr is 1762 * pointless. 1763 */ 1764 if (HDR_HAS_L1HDR(hdr)) { 1765 old_state = hdr->b_l1hdr.b_state; 1766 refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); 1767 datacnt = hdr->b_l1hdr.b_datacnt; 1768 } else { 1769 old_state = arc_l2c_only; 1770 refcnt = 0; 1771 datacnt = 0; 1772 } 1773 1774 ASSERT(MUTEX_HELD(hash_lock)); 1775 ASSERT3P(new_state, !=, old_state); 1776 ASSERT(refcnt == 0 || datacnt > 0); 1777 ASSERT(!GHOST_STATE(new_state) || datacnt == 0); 1778 ASSERT(old_state != arc_anon || datacnt <= 1); 1779 1780 from_delta = to_delta = datacnt * hdr->b_size; 1781 1782 /* 1783 * If this buffer is evictable, transfer it from the 1784 * old state list to the new state list. 1785 */ 1786 if (refcnt == 0) { 1787 if (old_state != arc_anon && old_state != arc_l2c_only) { 1788 int use_mutex; 1789 uint64_t *size = &old_state->arcs_lsize[buftype]; 1790 1791 get_buf_info(hdr, old_state, &list, &lock); 1792 use_mutex = !MUTEX_HELD(lock); 1793 if (use_mutex) 1794 mutex_enter(lock); 1795 1796 ASSERT(HDR_HAS_L1HDR(hdr)); 1797 ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); 1798 list_remove(list, hdr); 1799 1800 /* 1801 * If prefetching out of the ghost cache, 1802 * we will have a non-zero datacnt. 1803 */ 1804 if (GHOST_STATE(old_state) && datacnt == 0) { 1805 /* ghost elements have a ghost size */ 1806 ASSERT(hdr->b_l1hdr.b_buf == NULL); 1807 from_delta = hdr->b_size; 1808 } 1809 ASSERT3U(*size, >=, from_delta); 1810 atomic_add_64(size, -from_delta); 1811 1812 if (use_mutex) 1813 mutex_exit(lock); 1814 } 1815 if (new_state != arc_anon && new_state != arc_l2c_only) { 1816 int use_mutex; 1817 uint64_t *size = &new_state->arcs_lsize[buftype]; 1818 1819 /* 1820 * An L1 header always exists here, since if we're 1821 * moving to some L1-cached state (i.e. not l2c_only or 1822 * anonymous), we realloc the header to add an L1hdr 1823 * beforehand. 
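 *
 * For reference, that promotion is performed with the helper defined
 * earlier in this file, roughly as
 *
 *	hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, hdr_full_cache);
 *
 * (arc_read() does this when it finds an L2-only header), so by the
 * time arc_change_state() targets an L1-cached state the full header
 * is already in place and the assertion below holds.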
1824 */ 1825 ASSERT(HDR_HAS_L1HDR(hdr)); 1826 get_buf_info(hdr, new_state, &list, &lock); 1827 use_mutex = !MUTEX_HELD(lock); 1828 if (use_mutex) 1829 mutex_enter(lock); 1830 1831 list_insert_head(list, hdr); 1832 1833 /* ghost elements have a ghost size */ 1834 if (GHOST_STATE(new_state)) { 1835 ASSERT(datacnt == 0); 1836 ASSERT(hdr->b_l1hdr.b_buf == NULL); 1837 to_delta = hdr->b_size; 1838 } 1839 atomic_add_64(size, to_delta); 1840 1841 if (use_mutex) 1842 mutex_exit(lock); 1843 } 1844 } 1845 1846 ASSERT(!BUF_EMPTY(hdr)); 1847 if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) 1848 buf_hash_remove(hdr); 1849 1850 /* adjust state sizes (ignore arc_l2c_only) */ 1851 if (to_delta && new_state != arc_l2c_only) 1852 atomic_add_64(&new_state->arcs_size, to_delta); 1853 if (from_delta && old_state != arc_l2c_only) { 1854 ASSERT3U(old_state->arcs_size, >=, from_delta); 1855 atomic_add_64(&old_state->arcs_size, -from_delta); 1856 } 1857 if (HDR_HAS_L1HDR(hdr)) 1858 hdr->b_l1hdr.b_state = new_state; 1859 1860 /* 1861 * L2 headers should never be on the L2 state list since they don't 1862 * have L1 headers allocated. 1863 */ 1864#ifdef illumos 1865 ASSERT(list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && 1866 list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); 1867#endif 1868} 1869 1870void 1871arc_space_consume(uint64_t space, arc_space_type_t type) 1872{ 1873 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1874 1875 switch (type) { 1876 case ARC_SPACE_DATA: 1877 ARCSTAT_INCR(arcstat_data_size, space); 1878 break; 1879 case ARC_SPACE_META: 1880 ARCSTAT_INCR(arcstat_metadata_size, space); 1881 break; 1882 case ARC_SPACE_OTHER: 1883 ARCSTAT_INCR(arcstat_other_size, space); 1884 break; 1885 case ARC_SPACE_HDRS: 1886 ARCSTAT_INCR(arcstat_hdr_size, space); 1887 break; 1888 case ARC_SPACE_L2HDRS: 1889 ARCSTAT_INCR(arcstat_l2_hdr_size, space); 1890 break; 1891 } 1892 1893 if (type != ARC_SPACE_DATA) 1894 ARCSTAT_INCR(arcstat_meta_used, space); 1895 1896 atomic_add_64(&arc_size, space); 1897} 1898 1899void 1900arc_space_return(uint64_t space, arc_space_type_t type) 1901{ 1902 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1903 1904 switch (type) { 1905 case ARC_SPACE_DATA: 1906 ARCSTAT_INCR(arcstat_data_size, -space); 1907 break; 1908 case ARC_SPACE_META: 1909 ARCSTAT_INCR(arcstat_metadata_size, -space); 1910 break; 1911 case ARC_SPACE_OTHER: 1912 ARCSTAT_INCR(arcstat_other_size, -space); 1913 break; 1914 case ARC_SPACE_HDRS: 1915 ARCSTAT_INCR(arcstat_hdr_size, -space); 1916 break; 1917 case ARC_SPACE_L2HDRS: 1918 ARCSTAT_INCR(arcstat_l2_hdr_size, -space); 1919 break; 1920 } 1921 1922 if (type != ARC_SPACE_DATA) { 1923 ASSERT(arc_meta_used >= space); 1924 if (arc_meta_max < arc_meta_used) 1925 arc_meta_max = arc_meta_used; 1926 ARCSTAT_INCR(arcstat_meta_used, -space); 1927 } 1928 1929 ASSERT(arc_size >= space); 1930 atomic_add_64(&arc_size, -space); 1931} 1932 1933arc_buf_t * 1934arc_buf_alloc(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type) 1935{ 1936 arc_buf_hdr_t *hdr; 1937 arc_buf_t *buf; 1938 1939 ASSERT3U(size, >, 0); 1940 hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); 1941 ASSERT(BUF_EMPTY(hdr)); 1942 ASSERT3P(hdr->b_freeze_cksum, ==, NULL); 1943 hdr->b_size = size; 1944 hdr->b_spa = spa_load_guid(spa); 1945 1946 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 1947 buf->b_hdr = hdr; 1948 buf->b_data = NULL; 1949 buf->b_efunc = NULL; 1950 buf->b_private = NULL; 1951 buf->b_next = NULL; 1952 1953 hdr->b_flags = arc_bufc_to_flags(type); 1954 hdr->b_flags |= 
ARC_FLAG_HAS_L1HDR; 1955 1956 hdr->b_l1hdr.b_buf = buf; 1957 hdr->b_l1hdr.b_state = arc_anon; 1958 hdr->b_l1hdr.b_arc_access = 0; 1959 hdr->b_l1hdr.b_datacnt = 1; 1960 1961 arc_get_data_buf(buf); 1962 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 1963 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); 1964 1965 return (buf); 1966} 1967 1968static char *arc_onloan_tag = "onloan"; 1969 1970/* 1971 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in 1972 * flight data by arc_tempreserve_space() until they are "returned". Loaned 1973 * buffers must be returned to the arc before they can be used by the DMU or 1974 * freed. 1975 */ 1976arc_buf_t * 1977arc_loan_buf(spa_t *spa, int size) 1978{ 1979 arc_buf_t *buf; 1980 1981 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); 1982 1983 atomic_add_64(&arc_loaned_bytes, size); 1984 return (buf); 1985} 1986 1987/* 1988 * Return a loaned arc buffer to the arc. 1989 */ 1990void 1991arc_return_buf(arc_buf_t *buf, void *tag) 1992{ 1993 arc_buf_hdr_t *hdr = buf->b_hdr; 1994 1995 ASSERT(buf->b_data != NULL); 1996 ASSERT(HDR_HAS_L1HDR(hdr)); 1997 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); 1998 (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); 1999 2000 atomic_add_64(&arc_loaned_bytes, -hdr->b_size); 2001} 2002 2003/* Detach an arc_buf from a dbuf (tag) */ 2004void 2005arc_loan_inuse_buf(arc_buf_t *buf, void *tag) 2006{ 2007 arc_buf_hdr_t *hdr = buf->b_hdr; 2008 2009 ASSERT(buf->b_data != NULL); 2010 ASSERT(HDR_HAS_L1HDR(hdr)); 2011 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); 2012 (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); 2013 buf->b_efunc = NULL; 2014 buf->b_private = NULL; 2015 2016 atomic_add_64(&arc_loaned_bytes, hdr->b_size); 2017} 2018 2019static arc_buf_t * 2020arc_buf_clone(arc_buf_t *from) 2021{ 2022 arc_buf_t *buf; 2023 arc_buf_hdr_t *hdr = from->b_hdr; 2024 uint64_t size = hdr->b_size; 2025 2026 ASSERT(HDR_HAS_L1HDR(hdr)); 2027 ASSERT(hdr->b_l1hdr.b_state != arc_anon); 2028 2029 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 2030 buf->b_hdr = hdr; 2031 buf->b_data = NULL; 2032 buf->b_efunc = NULL; 2033 buf->b_private = NULL; 2034 buf->b_next = hdr->b_l1hdr.b_buf; 2035 hdr->b_l1hdr.b_buf = buf; 2036 arc_get_data_buf(buf); 2037 bcopy(from->b_data, buf->b_data, size); 2038 2039 /* 2040 * This buffer already exists in the arc so create a duplicate 2041 * copy for the caller. If the buffer is associated with user data 2042 * then track the size and number of duplicates. These stats will be 2043 * updated as duplicate buffers are created and destroyed. 2044 */ 2045 if (HDR_ISTYPE_DATA(hdr)) { 2046 ARCSTAT_BUMP(arcstat_duplicate_buffers); 2047 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); 2048 } 2049 hdr->b_l1hdr.b_datacnt += 1; 2050 return (buf); 2051} 2052 2053void 2054arc_buf_add_ref(arc_buf_t *buf, void* tag) 2055{ 2056 arc_buf_hdr_t *hdr; 2057 kmutex_t *hash_lock; 2058 2059 /* 2060 * Check to see if this buffer is evicted. Callers 2061 * must verify b_data != NULL to know if the add_ref 2062 * was successful. 
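 *
 * A hypothetical caller honoring that contract would do roughly
 *
 *	arc_buf_add_ref(buf, tag);
 *	if (buf->b_data == NULL) {
 *		... the hold was not taken; fall back to arc_read() ...
 *	}
 *
 * rather than assuming the reference was successfully added.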
2063 */ 2064 mutex_enter(&buf->b_evict_lock); 2065 if (buf->b_data == NULL) { 2066 mutex_exit(&buf->b_evict_lock); 2067 return; 2068 } 2069 hash_lock = HDR_LOCK(buf->b_hdr); 2070 mutex_enter(hash_lock); 2071 hdr = buf->b_hdr; 2072 ASSERT(HDR_HAS_L1HDR(hdr)); 2073 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 2074 mutex_exit(&buf->b_evict_lock); 2075 2076 ASSERT(hdr->b_l1hdr.b_state == arc_mru || 2077 hdr->b_l1hdr.b_state == arc_mfu); 2078 2079 add_reference(hdr, hash_lock, tag); 2080 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 2081 arc_access(hdr, hash_lock); 2082 mutex_exit(hash_lock); 2083 ARCSTAT_BUMP(arcstat_hits); 2084 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 2085 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 2086 data, metadata, hits); 2087} 2088 2089static void 2090arc_buf_free_on_write(void *data, size_t size, 2091 void (*free_func)(void *, size_t)) 2092{ 2093 l2arc_data_free_t *df; 2094 2095 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); 2096 df->l2df_data = data; 2097 df->l2df_size = size; 2098 df->l2df_func = free_func; 2099 mutex_enter(&l2arc_free_on_write_mtx); 2100 list_insert_head(l2arc_free_on_write, df); 2101 mutex_exit(&l2arc_free_on_write_mtx); 2102} 2103 2104/* 2105 * Free the arc data buffer. If it is an l2arc write in progress, 2106 * the buffer is placed on l2arc_free_on_write to be freed later. 2107 */ 2108static void 2109arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t)) 2110{ 2111 arc_buf_hdr_t *hdr = buf->b_hdr; 2112 2113 if (HDR_L2_WRITING(hdr)) { 2114 arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func); 2115 ARCSTAT_BUMP(arcstat_l2_free_on_write); 2116 } else { 2117 free_func(buf->b_data, hdr->b_size); 2118 } 2119} 2120 2121/* 2122 * Free up buf->b_data and if 'remove' is set, then pull the 2123 * arc_buf_t off of the arc_buf_hdr_t's list and free it. 2124 */ 2125static void 2126arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr) 2127{ 2128 ASSERT(HDR_HAS_L2HDR(hdr)); 2129 ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx)); 2130 2131 /* 2132 * The b_tmp_cdata field is linked off of the b_l1hdr, so if 2133 * that doesn't exist, the header is in the arc_l2c_only state, 2134 * and there isn't anything to free (it's already been freed).
2135 */ 2136 if (!HDR_HAS_L1HDR(hdr)) 2137 return; 2138 2139 if (hdr->b_l1hdr.b_tmp_cdata == NULL) 2140 return; 2141 2142 ASSERT(HDR_L2_WRITING(hdr)); 2143 arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata, hdr->b_size, 2144 zio_data_buf_free); 2145 2146 ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write); 2147 hdr->b_l1hdr.b_tmp_cdata = NULL; 2148} 2149 2150static void 2151arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) 2152{ 2153 arc_buf_t **bufp; 2154 2155 /* free up data associated with the buf */ 2156 if (buf->b_data != NULL) { 2157 arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; 2158 uint64_t size = buf->b_hdr->b_size; 2159 arc_buf_contents_t type = arc_buf_type(buf->b_hdr); 2160 2161 arc_cksum_verify(buf); 2162#ifdef illumos 2163 arc_buf_unwatch(buf); 2164#endif /* illumos */ 2165 2166 if (!recycle) { 2167 if (type == ARC_BUFC_METADATA) { 2168 arc_buf_data_free(buf, zio_buf_free); 2169 arc_space_return(size, ARC_SPACE_META); 2170 } else { 2171 ASSERT(type == ARC_BUFC_DATA); 2172 arc_buf_data_free(buf, zio_data_buf_free); 2173 arc_space_return(size, ARC_SPACE_DATA); 2174 } 2175 } 2176 if (list_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) { 2177 uint64_t *cnt = &state->arcs_lsize[type]; 2178 2179 ASSERT(refcount_is_zero( 2180 &buf->b_hdr->b_l1hdr.b_refcnt)); 2181 ASSERT(state != arc_anon && state != arc_l2c_only); 2182 2183 ASSERT3U(*cnt, >=, size); 2184 atomic_add_64(cnt, -size); 2185 } 2186 ASSERT3U(state->arcs_size, >=, size); 2187 atomic_add_64(&state->arcs_size, -size); 2188 buf->b_data = NULL; 2189 2190 /* 2191 * If we're destroying a duplicate buffer make sure 2192 * that the appropriate statistics are updated. 2193 */ 2194 if (buf->b_hdr->b_l1hdr.b_datacnt > 1 && 2195 HDR_ISTYPE_DATA(buf->b_hdr)) { 2196 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 2197 ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); 2198 } 2199 ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0); 2200 buf->b_hdr->b_l1hdr.b_datacnt -= 1; 2201 } 2202 2203 /* only remove the buf if requested */ 2204 if (!remove) 2205 return; 2206 2207 /* remove the buf from the hdr list */ 2208 for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf; 2209 bufp = &(*bufp)->b_next) 2210 continue; 2211 *bufp = buf->b_next; 2212 buf->b_next = NULL; 2213 2214 ASSERT(buf->b_efunc == NULL); 2215 2216 /* clean up the buf */ 2217 buf->b_hdr = NULL; 2218 kmem_cache_free(buf_cache, buf); 2219} 2220 2221static void 2222arc_hdr_destroy(arc_buf_hdr_t *hdr) 2223{ 2224 if (HDR_HAS_L1HDR(hdr)) { 2225 ASSERT(hdr->b_l1hdr.b_buf == NULL || 2226 hdr->b_l1hdr.b_datacnt > 0); 2227 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2228 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 2229 } 2230 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2231 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 2232 2233 if (HDR_HAS_L2HDR(hdr)) { 2234 l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; 2235 boolean_t buflist_held = MUTEX_HELD(&l2hdr->b_dev->l2ad_mtx); 2236 2237 if (!buflist_held) { 2238 mutex_enter(&l2hdr->b_dev->l2ad_mtx); 2239 l2hdr = &hdr->b_l2hdr; 2240 } 2241 2242 trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr, 2243 l2hdr->b_asize, 0); 2244 list_remove(&l2hdr->b_dev->l2ad_buflist, hdr); 2245 2246 /* 2247 * We don't want to leak the b_tmp_cdata buffer that was 2248 * allocated in l2arc_write_buffers() 2249 */ 2250 arc_buf_l2_cdata_free(hdr); 2251 2252 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 2253 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); 2254 2255 if (!buflist_held) 2256 mutex_exit(&l2hdr->b_dev->l2ad_mtx); 2257 2258 hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; 2259 } 2260 2261 
if (!BUF_EMPTY(hdr)) 2262 buf_discard_identity(hdr); 2263 if (hdr->b_freeze_cksum != NULL) { 2264 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 2265 hdr->b_freeze_cksum = NULL; 2266 } 2267 2268 if (HDR_HAS_L1HDR(hdr)) { 2269 while (hdr->b_l1hdr.b_buf) { 2270 arc_buf_t *buf = hdr->b_l1hdr.b_buf; 2271 2272 if (buf->b_efunc != NULL) { 2273 mutex_enter(&arc_eviction_mtx); 2274 mutex_enter(&buf->b_evict_lock); 2275 ASSERT(buf->b_hdr != NULL); 2276 arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE, 2277 FALSE); 2278 hdr->b_l1hdr.b_buf = buf->b_next; 2279 buf->b_hdr = &arc_eviction_hdr; 2280 buf->b_next = arc_eviction_list; 2281 arc_eviction_list = buf; 2282 mutex_exit(&buf->b_evict_lock); 2283 mutex_exit(&arc_eviction_mtx); 2284 } else { 2285 arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE, 2286 TRUE); 2287 } 2288 } 2289#ifdef ZFS_DEBUG 2290 if (hdr->b_l1hdr.b_thawed != NULL) { 2291 kmem_free(hdr->b_l1hdr.b_thawed, 1); 2292 hdr->b_l1hdr.b_thawed = NULL; 2293 } 2294#endif 2295 } 2296 2297 ASSERT3P(hdr->b_hash_next, ==, NULL); 2298 if (HDR_HAS_L1HDR(hdr)) { 2299 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 2300 ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); 2301 kmem_cache_free(hdr_full_cache, hdr); 2302 } else { 2303 kmem_cache_free(hdr_l2only_cache, hdr); 2304 } 2305} 2306 2307void 2308arc_buf_free(arc_buf_t *buf, void *tag) 2309{ 2310 arc_buf_hdr_t *hdr = buf->b_hdr; 2311 int hashed = hdr->b_l1hdr.b_state != arc_anon; 2312 2313 ASSERT(buf->b_efunc == NULL); 2314 ASSERT(buf->b_data != NULL); 2315 2316 if (hashed) { 2317 kmutex_t *hash_lock = HDR_LOCK(hdr); 2318 2319 mutex_enter(hash_lock); 2320 hdr = buf->b_hdr; 2321 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 2322 2323 (void) remove_reference(hdr, hash_lock, tag); 2324 if (hdr->b_l1hdr.b_datacnt > 1) { 2325 arc_buf_destroy(buf, FALSE, TRUE); 2326 } else { 2327 ASSERT(buf == hdr->b_l1hdr.b_buf); 2328 ASSERT(buf->b_efunc == NULL); 2329 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 2330 } 2331 mutex_exit(hash_lock); 2332 } else if (HDR_IO_IN_PROGRESS(hdr)) { 2333 int destroy_hdr; 2334 /* 2335 * We are in the middle of an async write. Don't destroy 2336 * this buffer unless the write completes before we finish 2337 * decrementing the reference count. 
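 *
 * Concretely, destroy_hdr below is re-evaluated as
 * !HDR_IO_IN_PROGRESS(hdr) while arc_eviction_mtx is held, so exactly
 * one party frees the header: this path if the write has already
 * completed by then, otherwise (presumably) the write's completion
 * path once it sees the reference count drop to zero.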
2338 */ 2339 mutex_enter(&arc_eviction_mtx); 2340 (void) remove_reference(hdr, NULL, tag); 2341 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2342 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); 2343 mutex_exit(&arc_eviction_mtx); 2344 if (destroy_hdr) 2345 arc_hdr_destroy(hdr); 2346 } else { 2347 if (remove_reference(hdr, NULL, tag) > 0) 2348 arc_buf_destroy(buf, FALSE, TRUE); 2349 else 2350 arc_hdr_destroy(hdr); 2351 } 2352} 2353 2354boolean_t 2355arc_buf_remove_ref(arc_buf_t *buf, void* tag) 2356{ 2357 arc_buf_hdr_t *hdr = buf->b_hdr; 2358 kmutex_t *hash_lock = HDR_LOCK(hdr); 2359 boolean_t no_callback = (buf->b_efunc == NULL); 2360 2361 if (hdr->b_l1hdr.b_state == arc_anon) { 2362 ASSERT(hdr->b_l1hdr.b_datacnt == 1); 2363 arc_buf_free(buf, tag); 2364 return (no_callback); 2365 } 2366 2367 mutex_enter(hash_lock); 2368 hdr = buf->b_hdr; 2369 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 2370 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 2371 ASSERT(hdr->b_l1hdr.b_state != arc_anon); 2372 ASSERT(buf->b_data != NULL); 2373 2374 (void) remove_reference(hdr, hash_lock, tag); 2375 if (hdr->b_l1hdr.b_datacnt > 1) { 2376 if (no_callback) 2377 arc_buf_destroy(buf, FALSE, TRUE); 2378 } else if (no_callback) { 2379 ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL); 2380 ASSERT(buf->b_efunc == NULL); 2381 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 2382 } 2383 ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 || 2384 refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2385 mutex_exit(hash_lock); 2386 return (no_callback); 2387} 2388 2389int32_t 2390arc_buf_size(arc_buf_t *buf) 2391{ 2392 return (buf->b_hdr->b_size); 2393} 2394 2395/* 2396 * Called from the DMU to determine if the current buffer should be 2397 * evicted. In order to ensure proper locking, the eviction must be initiated 2398 * from the DMU. Return true if the buffer is associated with user data and 2399 * duplicate buffers still exist. 2400 */ 2401boolean_t 2402arc_buf_eviction_needed(arc_buf_t *buf) 2403{ 2404 arc_buf_hdr_t *hdr; 2405 boolean_t evict_needed = B_FALSE; 2406 2407 if (zfs_disable_dup_eviction) 2408 return (B_FALSE); 2409 2410 mutex_enter(&buf->b_evict_lock); 2411 hdr = buf->b_hdr; 2412 if (hdr == NULL) { 2413 /* 2414 * We are in arc_do_user_evicts(); let that function 2415 * perform the eviction. 2416 */ 2417 ASSERT(buf->b_data == NULL); 2418 mutex_exit(&buf->b_evict_lock); 2419 return (B_FALSE); 2420 } else if (buf->b_data == NULL) { 2421 /* 2422 * We have already been added to the arc eviction list; 2423 * recommend eviction. 2424 */ 2425 ASSERT3P(hdr, ==, &arc_eviction_hdr); 2426 mutex_exit(&buf->b_evict_lock); 2427 return (B_TRUE); 2428 } 2429 2430 if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr)) 2431 evict_needed = B_TRUE; 2432 2433 mutex_exit(&buf->b_evict_lock); 2434 return (evict_needed); 2435} 2436 2437/* 2438 * Evict buffers from list until we've removed the specified number of 2439 * bytes. Move the removed buffers to the appropriate evict state. 2440 * If the recycle flag is set, then attempt to "recycle" a buffer: 2441 * - look for a buffer to evict that is `bytes' long. 2442 * - return the data block from this buffer rather than freeing it. 2443 * This flag is used by callers that are trying to make space for a 2444 * new buffer in a full arc cache. 2445 * 2446 * This function makes a "best effort". It skips over any buffers 2447 * it can't get a hash_lock on, and so may not catch all candidates. 2448 * It may also return without evicting as much space as requested. 
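 *
 * The "recycle" contract can be seen in arc_get_data_buf(), which
 * calls this function roughly as
 *
 *	buf->b_data = arc_evict(state, 0, size, TRUE, type);
 *	if (buf->b_data == NULL)
 *		buf->b_data = zio_buf_alloc(size);	(or the data variant)
 *
 * i.e. a NULL return simply means no suitably sized block could be
 * stolen and the caller must allocate fresh memory itself.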
2449 */ 2450static void * 2451arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, 2452 arc_buf_contents_t type) 2453{ 2454 arc_state_t *evicted_state; 2455 uint64_t bytes_evicted = 0, skipped = 0, missed = 0; 2456 int64_t bytes_remaining; 2457 arc_buf_hdr_t *hdr, *hdr_prev = NULL; 2458 list_t *evicted_list, *list, *evicted_list_start, *list_start; 2459 kmutex_t *lock, *evicted_lock; 2460 kmutex_t *hash_lock; 2461 boolean_t have_lock; 2462 void *stolen = NULL; 2463 arc_buf_hdr_t marker = { 0 }; 2464 int count = 0; 2465 static int evict_metadata_offset, evict_data_offset; 2466 int i, idx, offset, list_count, lists; 2467 2468 ASSERT(state == arc_mru || state == arc_mfu); 2469 2470 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; 2471 2472 /* 2473 * Decide which "type" (data vs metadata) to recycle from. 2474 * 2475 * If we are over the metadata limit, recycle from metadata. 2476 * If we are under the metadata minimum, recycle from data. 2477 * Otherwise, recycle from whichever type has the oldest (least 2478 * recently accessed) header. This is not yet implemented. 2479 */ 2480 if (recycle) { 2481 arc_buf_contents_t realtype; 2482 if (state->arcs_lsize[ARC_BUFC_DATA] == 0) { 2483 realtype = ARC_BUFC_METADATA; 2484 } else if (state->arcs_lsize[ARC_BUFC_METADATA] == 0) { 2485 realtype = ARC_BUFC_DATA; 2486 } else if (arc_meta_used >= arc_meta_limit) { 2487 realtype = ARC_BUFC_METADATA; 2488 } else if (arc_meta_used <= arc_meta_min) { 2489 realtype = ARC_BUFC_DATA; 2490#ifdef illumos 2491 } else if (HDR_HAS_L1HDR(data_hdr) && 2492 HDR_HAS_L1HDR(metadata_hdr) && 2493 data_hdr->b_l1hdr.b_arc_access < 2494 metadata_hdr->b_l1hdr.b_arc_access) { 2495 realtype = ARC_BUFC_DATA; 2496 } else { 2497 realtype = ARC_BUFC_METADATA; 2498#else 2499 } else { 2500 /* TODO */ 2501 realtype = type; 2502#endif 2503 } 2504 if (realtype != type) { 2505 /* 2506 * If we want to evict from a different list, 2507 * we can not recycle, because DATA vs METADATA 2508 * buffers are segregated into different kmem 2509 * caches (and vmem arenas). 2510 */ 2511 type = realtype; 2512 recycle = B_FALSE; 2513 } 2514 } 2515 2516 if (type == ARC_BUFC_METADATA) { 2517 offset = 0; 2518 list_count = ARC_BUFC_NUMMETADATALISTS; 2519 list_start = &state->arcs_lists[0]; 2520 evicted_list_start = &evicted_state->arcs_lists[0]; 2521 idx = evict_metadata_offset; 2522 } else { 2523 offset = ARC_BUFC_NUMMETADATALISTS; 2524 list_start = &state->arcs_lists[offset]; 2525 evicted_list_start = &evicted_state->arcs_lists[offset]; 2526 list_count = ARC_BUFC_NUMDATALISTS; 2527 idx = evict_data_offset; 2528 } 2529 bytes_remaining = evicted_state->arcs_lsize[type]; 2530 lists = 0; 2531 2532evict_start: 2533 list = &list_start[idx]; 2534 evicted_list = &evicted_list_start[idx]; 2535 lock = ARCS_LOCK(state, (offset + idx)); 2536 evicted_lock = ARCS_LOCK(evicted_state, (offset + idx)); 2537 2538 /* 2539 * The ghost list lock must be acquired first in order to prevent 2540 * a 3-party deadlock: 2541 * 2542 * - arc_evict_ghost acquires arc_*_ghost->arcs_mtx, followed by 2543 * l2ad_mtx in arc_hdr_realloc 2544 * - l2arc_write_buffers acquires l2ad_mtx, followed by arc_*->arcs_mtx 2545 * - arc_evict acquires arc_*->arcs_mtx, followed by 2546 * arc_*_ghost->arcs_mtx and forms a deadlock cycle. 2547 * 2548 * This situation is avoided by acquiring the ghost list lock first.
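 *
 * Expressed as a lock ordering rule for any thread that needs both
 * kinds of list lock (ghost_lock and list_lock are just stand-in
 * names here):
 *
 *	mutex_enter(ghost_lock);	first: arc_*_ghost->arcs_mtx
 *	mutex_enter(list_lock);		then:  arc_*->arcs_mtx
 *
 * which is exactly the mutex_enter(evicted_lock) / mutex_enter(lock)
 * order used below.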
2549 */ 2550 mutex_enter(evicted_lock); 2551 mutex_enter(lock); 2552 2553 for (hdr = list_tail(list); hdr; hdr = hdr_prev) { 2554 hdr_prev = list_prev(list, hdr); 2555 if (HDR_HAS_L1HDR(hdr)) { 2556 bytes_remaining -= 2557 (hdr->b_size * hdr->b_l1hdr.b_datacnt); 2558 } 2559 /* prefetch buffers have a minimum lifespan */ 2560 if (HDR_IO_IN_PROGRESS(hdr) || 2561 (spa && hdr->b_spa != spa) || 2562 ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && 2563 ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < 2564 arc_min_prefetch_lifespan)) { 2565 skipped++; 2566 continue; 2567 } 2568 /* "lookahead" for better eviction candidate */ 2569 if (recycle && hdr->b_size != bytes && 2570 hdr_prev && hdr_prev->b_size == bytes) 2571 continue; 2572 2573 /* ignore markers */ 2574 if (hdr->b_spa == 0) 2575 continue; 2576 2577 /* 2578 * It may take a long time to evict all the bufs requested. 2579 * To avoid blocking all arc activity, periodically drop 2580 * the arcs_mtx and give other threads a chance to run 2581 * before reacquiring the lock. 2582 * 2583 * If we are looking for a buffer to recycle, we are in 2584 * the hot code path, so don't sleep. 2585 */ 2586 if (!recycle && count++ > arc_evict_iterations) { 2587 list_insert_after(list, hdr, &marker); 2588 mutex_exit(lock); 2589 mutex_exit(evicted_lock); 2590 kpreempt(KPREEMPT_SYNC); 2591 mutex_enter(evicted_lock); 2592 mutex_enter(lock); 2593 hdr_prev = list_prev(list, &marker); 2594 list_remove(list, &marker); 2595 count = 0; 2596 continue; 2597 } 2598 2599 hash_lock = HDR_LOCK(hdr); 2600 have_lock = MUTEX_HELD(hash_lock); 2601 if (have_lock || mutex_tryenter(hash_lock)) { 2602 ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); 2603 ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0); 2604 while (hdr->b_l1hdr.b_buf) { 2605 arc_buf_t *buf = hdr->b_l1hdr.b_buf; 2606 if (!mutex_tryenter(&buf->b_evict_lock)) { 2607 missed += 1; 2608 break; 2609 } 2610 if (buf->b_data != NULL) { 2611 bytes_evicted += hdr->b_size; 2612 if (recycle && 2613 arc_buf_type(hdr) == type && 2614 hdr->b_size == bytes && 2615 !HDR_L2_WRITING(hdr)) { 2616 stolen = buf->b_data; 2617 recycle = FALSE; 2618 } 2619 } 2620 if (buf->b_efunc != NULL) { 2621 mutex_enter(&arc_eviction_mtx); 2622 arc_buf_destroy(buf, 2623 buf->b_data == stolen, FALSE); 2624 hdr->b_l1hdr.b_buf = buf->b_next; 2625 buf->b_hdr = &arc_eviction_hdr; 2626 buf->b_next = arc_eviction_list; 2627 arc_eviction_list = buf; 2628 mutex_exit(&arc_eviction_mtx); 2629 mutex_exit(&buf->b_evict_lock); 2630 } else { 2631 mutex_exit(&buf->b_evict_lock); 2632 arc_buf_destroy(buf, 2633 buf->b_data == stolen, TRUE); 2634 } 2635 } 2636 2637 if (HDR_HAS_L2HDR(hdr)) { 2638 ARCSTAT_INCR(arcstat_evict_l2_cached, 2639 hdr->b_size); 2640 } else { 2641 if (l2arc_write_eligible(hdr->b_spa, hdr)) { 2642 ARCSTAT_INCR(arcstat_evict_l2_eligible, 2643 hdr->b_size); 2644 } else { 2645 ARCSTAT_INCR( 2646 arcstat_evict_l2_ineligible, 2647 hdr->b_size); 2648 } 2649 } 2650 2651 if (hdr->b_l1hdr.b_datacnt == 0) { 2652 arc_change_state(evicted_state, hdr, hash_lock); 2653 ASSERT(HDR_IN_HASH_TABLE(hdr)); 2654 hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; 2655 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; 2656 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); 2657 } 2658 if (!have_lock) 2659 mutex_exit(hash_lock); 2660 if (bytes >= 0 && bytes_evicted >= bytes) 2661 break; 2662 if (bytes_remaining > 0) { 2663 mutex_exit(evicted_lock); 2664 mutex_exit(lock); 2665 idx = ((idx + 1) & (list_count - 1)); 2666 lists++; 2667 goto evict_start; 2668 } 2669 } else { 2670 missed += 1; 2671 } 
2672 } 2673 2674 mutex_exit(lock); 2675 mutex_exit(evicted_lock); 2676 2677 idx = ((idx + 1) & (list_count - 1)); 2678 lists++; 2679 2680 if (bytes_evicted < bytes) { 2681 if (lists < list_count) 2682 goto evict_start; 2683 else 2684 dprintf("only evicted %lld bytes from %x", 2685 (longlong_t)bytes_evicted, state); 2686 } 2687 if (type == ARC_BUFC_METADATA) 2688 evict_metadata_offset = idx; 2689 else 2690 evict_data_offset = idx; 2691 2692 if (skipped) 2693 ARCSTAT_INCR(arcstat_evict_skip, skipped); 2694 2695 if (missed) 2696 ARCSTAT_INCR(arcstat_mutex_miss, missed); 2697 2698 /* 2699 * Note: we have just evicted some data into the ghost state, 2700 * potentially putting the ghost size over the desired size. Rather 2701 * than evicting from the ghost list in this hot code path, leave 2702 * this chore to the arc_reclaim_thread(). 2703 */ 2704 2705 if (stolen) 2706 ARCSTAT_BUMP(arcstat_stolen); 2707 return (stolen); 2708} 2709 2710/* 2711 * Remove buffers from list until we've removed the specified number of 2712 * bytes. Destroy the buffers that are removed. 2713 */ 2714static void 2715arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) 2716{ 2717 arc_buf_hdr_t *hdr, *hdr_prev; 2718 arc_buf_hdr_t marker = { 0 }; 2719 list_t *list, *list_start; 2720 kmutex_t *hash_lock, *lock; 2721 uint64_t bytes_deleted = 0; 2722 uint64_t bufs_skipped = 0; 2723 int count = 0; 2724 static int evict_offset; 2725 int list_count, idx = evict_offset; 2726 int offset, lists = 0; 2727 2728 ASSERT(GHOST_STATE(state)); 2729 2730 /* 2731 * data lists come after metadata lists 2732 */ 2733 list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS]; 2734 list_count = ARC_BUFC_NUMDATALISTS; 2735 offset = ARC_BUFC_NUMMETADATALISTS; 2736 2737evict_start: 2738 list = &list_start[idx]; 2739 lock = ARCS_LOCK(state, idx + offset); 2740 2741 mutex_enter(lock); 2742 for (hdr = list_tail(list); hdr; hdr = hdr_prev) { 2743 hdr_prev = list_prev(list, hdr); 2744 if (arc_buf_type(hdr) >= ARC_BUFC_NUMTYPES) 2745 panic("invalid hdr=%p", (void *)hdr); 2746 if (spa && hdr->b_spa != spa) 2747 continue; 2748 2749 /* ignore markers */ 2750 if (hdr->b_spa == 0) 2751 continue; 2752 2753 hash_lock = HDR_LOCK(hdr); 2754 /* caller may be trying to modify this buffer, skip it */ 2755 if (MUTEX_HELD(hash_lock)) 2756 continue; 2757 2758 /* 2759 * It may take a long time to evict all the bufs requested. 2760 * To avoid blocking all arc activity, periodically drop 2761 * the arcs_mtx and give other threads a chance to run 2762 * before reacquiring the lock. 2763 */ 2764 if (count++ > arc_evict_iterations) { 2765 list_insert_after(list, hdr, &marker); 2766 mutex_exit(lock); 2767 kpreempt(KPREEMPT_SYNC); 2768 mutex_enter(lock); 2769 hdr_prev = list_prev(list, &marker); 2770 list_remove(list, &marker); 2771 count = 0; 2772 continue; 2773 } 2774 if (mutex_tryenter(hash_lock)) { 2775 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2776 ASSERT(!HDR_HAS_L1HDR(hdr) || 2777 hdr->b_l1hdr.b_buf == NULL); 2778 ARCSTAT_BUMP(arcstat_deleted); 2779 bytes_deleted += hdr->b_size; 2780 2781 if (HDR_HAS_L2HDR(hdr)) { 2782 /* 2783 * This buffer is cached on the 2nd Level ARC; 2784 * don't destroy the header. 2785 */ 2786 arc_change_state(arc_l2c_only, hdr, hash_lock); 2787 /* 2788 * dropping from L1+L2 cached to L2-only, 2789 * realloc to remove the L1 header.
2790 */ 2791 hdr = arc_hdr_realloc(hdr, hdr_full_cache, 2792 hdr_l2only_cache); 2793 mutex_exit(hash_lock); 2794 } else { 2795 arc_change_state(arc_anon, hdr, hash_lock); 2796 mutex_exit(hash_lock); 2797 arc_hdr_destroy(hdr); 2798 } 2799 2800 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); 2801 if (bytes >= 0 && bytes_deleted >= bytes) 2802 break; 2803 } else if (bytes < 0) { 2804 /* 2805 * Insert a list marker and then wait for the 2806 * hash lock to become available. Once it's 2807 * available, restart from where we left off. 2808 */ 2809 list_insert_after(list, hdr, &marker); 2810 mutex_exit(lock); 2811 mutex_enter(hash_lock); 2812 mutex_exit(hash_lock); 2813 mutex_enter(lock); 2814 hdr_prev = list_prev(list, &marker); 2815 list_remove(list, &marker); 2816 } else { 2817 bufs_skipped += 1; 2818 } 2819 2820 } 2821 mutex_exit(lock); 2822 idx = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1)); 2823 lists++; 2824 2825 if (lists < list_count) 2826 goto evict_start; 2827 2828 evict_offset = idx; 2829 if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] && 2830 (bytes < 0 || bytes_deleted < bytes)) { 2831 list_start = &state->arcs_lists[0]; 2832 list_count = ARC_BUFC_NUMMETADATALISTS; 2833 offset = lists = 0; 2834 goto evict_start; 2835 } 2836 2837 if (bufs_skipped) { 2838 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); 2839 ASSERT(bytes >= 0); 2840 } 2841 2842 if (bytes_deleted < bytes) 2843 dprintf("only deleted %lld bytes from %p", 2844 (longlong_t)bytes_deleted, state); 2845} 2846 2847static void 2848arc_adjust(void) 2849{ 2850 int64_t adjustment, delta; 2851 2852 /* 2853 * Adjust MRU size 2854 */ 2855 2856 adjustment = MIN((int64_t)(arc_size - arc_c), 2857 (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - 2858 arc_p)); 2859 2860 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { 2861 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); 2862 (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA); 2863 adjustment -= delta; 2864 } 2865 2866 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { 2867 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); 2868 (void) arc_evict(arc_mru, 0, delta, FALSE, 2869 ARC_BUFC_METADATA); 2870 } 2871 2872 /* 2873 * Adjust MFU size 2874 */ 2875 2876 adjustment = arc_size - arc_c; 2877 2878 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { 2879 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); 2880 (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA); 2881 adjustment -= delta; 2882 } 2883 2884 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { 2885 int64_t delta = MIN(adjustment, 2886 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); 2887 (void) arc_evict(arc_mfu, 0, delta, FALSE, 2888 ARC_BUFC_METADATA); 2889 } 2890 2891 /* 2892 * Adjust ghost lists 2893 */ 2894 2895 adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; 2896 2897 if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { 2898 delta = MIN(arc_mru_ghost->arcs_size, adjustment); 2899 arc_evict_ghost(arc_mru_ghost, 0, delta); 2900 } 2901 2902 adjustment = 2903 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c; 2904 2905 if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { 2906 delta = MIN(arc_mfu_ghost->arcs_size, adjustment); 2907 arc_evict_ghost(arc_mfu_ghost, 0, delta); 2908 } 2909} 2910 2911static void 2912arc_do_user_evicts(void) 2913{ 2914 static arc_buf_t *tmp_arc_eviction_list; 2915 2916 /* 2917 * Move list over to avoid LOR 2918 */
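	/*
	 * ("LOR" is a lock order reversal.  Detaching the whole list up
	 * front, as the loop below does, lets each b_efunc callback run
	 * with no ARC-internal locks held, so a callback that re-enters
	 * the ARC cannot deadlock against arc_eviction_mtx.)
	 */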
2919restart: 2920 mutex_enter(&arc_eviction_mtx); 2921 tmp_arc_eviction_list = arc_eviction_list; 2922 arc_eviction_list = NULL; 2923 mutex_exit(&arc_eviction_mtx); 2924 2925 while (tmp_arc_eviction_list != NULL) { 2926 arc_buf_t *buf = tmp_arc_eviction_list; 2927 tmp_arc_eviction_list = buf->b_next; 2928 mutex_enter(&buf->b_evict_lock); 2929 buf->b_hdr = NULL; 2930 mutex_exit(&buf->b_evict_lock); 2931 2932 if (buf->b_efunc != NULL) 2933 VERIFY0(buf->b_efunc(buf->b_private)); 2934 2935 buf->b_efunc = NULL; 2936 buf->b_private = NULL; 2937 kmem_cache_free(buf_cache, buf); 2938 } 2939 2940 if (arc_eviction_list != NULL) 2941 goto restart; 2942} 2943 2944/* 2945 * Flush all *evictable* data from the cache for the given spa. 2946 * NOTE: this will not touch "active" (i.e. referenced) data. 2947 */ 2948void 2949arc_flush(spa_t *spa) 2950{ 2951 uint64_t guid = 0; 2952 2953 if (spa != NULL) 2954 guid = spa_load_guid(spa); 2955 2956 while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) { 2957 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); 2958 if (spa != NULL) 2959 break; 2960 } 2961 while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) { 2962 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA); 2963 if (spa != NULL) 2964 break; 2965 } 2966 while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) { 2967 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA); 2968 if (spa != NULL) 2969 break; 2970 } 2971 while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) { 2972 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA); 2973 if (spa != NULL) 2974 break; 2975 } 2976 2977 arc_evict_ghost(arc_mru_ghost, guid, -1); 2978 arc_evict_ghost(arc_mfu_ghost, guid, -1); 2979 2980 mutex_enter(&arc_reclaim_thr_lock); 2981 arc_do_user_evicts(); 2982 mutex_exit(&arc_reclaim_thr_lock); 2983 ASSERT(spa || arc_eviction_list == NULL); 2984} 2985 2986void 2987arc_shrink(void) 2988{ 2989 2990 if (arc_c > arc_c_min) { 2991 uint64_t to_free; 2992 2993 to_free = arc_c >> arc_shrink_shift; 2994 DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t, 2995 arc_c_min, uint64_t, arc_p, uint64_t, to_free); 2996 if (arc_c > arc_c_min + to_free) 2997 atomic_add_64(&arc_c, -to_free); 2998 else 2999 arc_c = arc_c_min; 3000 3001 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 3002 if (arc_c > arc_size) 3003 arc_c = MAX(arc_size, arc_c_min); 3004 if (arc_p > arc_c) 3005 arc_p = (arc_c >> 1); 3006 3007 DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t, 3008 arc_p); 3009 3010 ASSERT(arc_c >= arc_c_min); 3011 ASSERT((int64_t)arc_p >= 0); 3012 } 3013 3014 if (arc_size > arc_c) { 3015 DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size, 3016 uint64_t, arc_c); 3017 arc_adjust(); 3018 } 3019} 3020 3021static int needfree = 0; 3022 3023static int 3024arc_reclaim_needed(void) 3025{ 3026 3027#ifdef _KERNEL 3028 3029 if (needfree) { 3030 DTRACE_PROBE(arc__reclaim_needfree); 3031 return (1); 3032 } 3033 3034 /* 3035 * Cooperate with pagedaemon when it's time for it to scan 3036 * and reclaim some pages. 3037 */ 3038 if (freemem < zfs_arc_free_target) { 3039 DTRACE_PROBE2(arc__reclaim_freemem, uint64_t, 3040 freemem, uint64_t, zfs_arc_free_target); 3041 return (1); 3042 } 3043 3044#ifdef sun 3045 /* 3046 * take 'desfree' extra pages, so we reclaim sooner, rather than later 3047 */ 3048 extra = desfree; 3049 3050 /* 3051 * check that we're out of range of the pageout scanner. It starts to 3052 * schedule paging if freemem is less than lotsfree and needfree. 
3053 * lotsfree is the high-water mark for pageout, and needfree is the 3054 * number of needed free pages. We add extra pages here to make sure 3055 * the scanner doesn't start up while we're freeing memory. 3056 */ 3057 if (freemem < lotsfree + needfree + extra) 3058 return (1); 3059 3060 /* 3061 * check to make sure that swapfs has enough space so that anon 3062 * reservations can still succeed. anon_resvmem() checks that the 3063 * availrmem is greater than swapfs_minfree, and the number of reserved 3064 * swap pages. We also add a bit of extra here just to prevent 3065 * circumstances from getting really dire. 3066 */ 3067 if (availrmem < swapfs_minfree + swapfs_reserve + extra) 3068 return (1); 3069 3070 /* 3071 * Check that we have enough availrmem that memory locking (e.g., via 3072 * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum 3073 * stores the number of pages that cannot be locked; when availrmem 3074 * drops below pages_pp_maximum, page locking mechanisms such as 3075 * page_pp_lock() will fail.) 3076 */ 3077 if (availrmem <= pages_pp_maximum) 3078 return (1); 3079 3080#endif /* sun */ 3081#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) 3082 /* 3083 * If we're on an i386 platform, it's possible that we'll exhaust the 3084 * kernel heap space before we ever run out of available physical 3085 * memory. Most checks of the size of the heap_area compare against 3086 * tune.t_minarmem, which is the minimum available real memory that we 3087 * can have in the system. However, this is generally fixed at 25 pages 3088 * which is so low that it's useless. In this comparison, we seek to 3089 * calculate the total heap-size, and reclaim if more than 3/4ths of the 3090 * heap is allocated. (Or, in the calculation, if less than 1/4th is 3091 * free) 3092 */ 3093 if (vmem_size(heap_arena, VMEM_FREE) < 3094 (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2)) { 3095 DTRACE_PROBE2(arc__reclaim_used, uint64_t, 3096 vmem_size(heap_arena, VMEM_FREE), uint64_t, 3097 (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2); 3098 return (1); 3099 } 3100#define zio_arena NULL 3101#else 3102#define zio_arena heap_arena 3103#endif 3104 3105 /* 3106 * If zio data pages are being allocated out of a separate heap segment, 3107 * then enforce that the size of available vmem for this arena remains 3108 * above about 1/16th free. 3109 * 3110 * Note: The 1/16th arena free requirement was put in place 3111 * to aggressively evict memory from the arc in order to avoid 3112 * memory fragmentation issues. 3113 */ 3114 if (zio_arena != NULL && 3115 vmem_size(zio_arena, VMEM_FREE) < 3116 (vmem_size(zio_arena, VMEM_ALLOC) >> 4)) 3117 return (1); 3118 3119 /* 3120 * Above limits know nothing about real level of KVA fragmentation. 3121 * Start aggressive reclamation if too little sequential KVA left. 
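 *
 * For example, with zfs_max_recordsize set to 1M, the check below makes
 * the ARC start reclaiming as soon as the largest contiguous free run
 * in the kernel heap arena (VMEM_MAXFREE) drops under 1M, i.e. as soon
 * as a single maximum-sized record could no longer be mapped without
 * further defragmentation.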
3122 */ 3123 if (vmem_size(heap_arena, VMEM_MAXFREE) < zfs_max_recordsize) { 3124 DTRACE_PROBE2(arc__reclaim_maxfree, uint64_t, 3125 vmem_size(heap_arena, VMEM_MAXFREE), 3126 uint64_t, zfs_max_recordsize); 3127 return (1); 3128 } 3129 3130#else /* _KERNEL */ 3131 if (spa_get_random(100) == 0) 3132 return (1); 3133#endif /* _KERNEL */ 3134 DTRACE_PROBE(arc__reclaim_no); 3135 3136 return (0); 3137} 3138 3139extern kmem_cache_t *zio_buf_cache[]; 3140extern kmem_cache_t *zio_data_buf_cache[]; 3141extern kmem_cache_t *range_seg_cache; 3142 3143static __noinline void 3144arc_kmem_reap_now(arc_reclaim_strategy_t strat) 3145{ 3146 size_t i; 3147 kmem_cache_t *prev_cache = NULL; 3148 kmem_cache_t *prev_data_cache = NULL; 3149 3150 DTRACE_PROBE(arc__kmem_reap_start); 3151#ifdef _KERNEL 3152 if (arc_meta_used >= arc_meta_limit) { 3153 /* 3154 * We are exceeding our meta-data cache limit. 3155 * Purge some DNLC entries to release holds on meta-data. 3156 */ 3157 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 3158 } 3159#if defined(__i386) 3160 /* 3161 * Reclaim unused memory from all kmem caches. 3162 */ 3163 kmem_reap(); 3164#endif 3165#endif 3166 3167 /* 3168 * An aggressive reclamation will shrink the cache size as well as 3169 * reap free buffers from the arc kmem caches. 3170 */ 3171 if (strat == ARC_RECLAIM_AGGR) 3172 arc_shrink(); 3173 3174 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 3175 if (zio_buf_cache[i] != prev_cache) { 3176 prev_cache = zio_buf_cache[i]; 3177 kmem_cache_reap_now(zio_buf_cache[i]); 3178 } 3179 if (zio_data_buf_cache[i] != prev_data_cache) { 3180 prev_data_cache = zio_data_buf_cache[i]; 3181 kmem_cache_reap_now(zio_data_buf_cache[i]); 3182 } 3183 } 3184 kmem_cache_reap_now(buf_cache); 3185 kmem_cache_reap_now(hdr_full_cache); 3186 kmem_cache_reap_now(hdr_l2only_cache); 3187 kmem_cache_reap_now(range_seg_cache); 3188 3189#ifdef sun 3190 /* 3191 * Ask the vmem arena to reclaim unused memory from its 3192 * quantum caches. 3193 */ 3194 if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR) 3195 vmem_qcache_reap(zio_arena); 3196#endif 3197 DTRACE_PROBE(arc__kmem_reap_end); 3198} 3199 3200static void 3201arc_reclaim_thread(void *dummy __unused) 3202{ 3203 clock_t growtime = 0; 3204 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; 3205 callb_cpr_t cpr; 3206 3207 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); 3208 3209 mutex_enter(&arc_reclaim_thr_lock); 3210 while (arc_thread_exit == 0) { 3211 if (arc_reclaim_needed()) { 3212 3213 if (arc_no_grow) { 3214 if (last_reclaim == ARC_RECLAIM_CONS) { 3215 DTRACE_PROBE(arc__reclaim_aggr_no_grow); 3216 last_reclaim = ARC_RECLAIM_AGGR; 3217 } else { 3218 last_reclaim = ARC_RECLAIM_CONS; 3219 } 3220 } else { 3221 arc_no_grow = TRUE; 3222 last_reclaim = ARC_RECLAIM_AGGR; 3223 DTRACE_PROBE(arc__reclaim_aggr); 3224 membar_producer(); 3225 } 3226 3227 /* reset the growth delay for every reclaim */ 3228 growtime = ddi_get_lbolt() + (arc_grow_retry * hz); 3229 3230 if (needfree && last_reclaim == ARC_RECLAIM_CONS) { 3231 /* 3232 * If needfree is TRUE our vm_lowmem hook 3233 * was called and in that case we must free some 3234 * memory, so switch to aggressive mode. 
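 *
 * "Aggressive" here simply means ARC_RECLAIM_AGGR: as arc_kmem_reap_now()
 * above shows, that strategy additionally calls arc_shrink() (and, in
 * the sun build, reaps the zio arena's quantum caches) on top of the
 * kmem cache reaping that both strategies perform.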
3235 */ 3236 arc_no_grow = TRUE; 3237 last_reclaim = ARC_RECLAIM_AGGR; 3238 } 3239 arc_kmem_reap_now(last_reclaim); 3240 arc_warm = B_TRUE; 3241 3242 } else if (arc_no_grow && ddi_get_lbolt() >= growtime) { 3243 arc_no_grow = FALSE; 3244 } 3245 3246 arc_adjust(); 3247 3248 if (arc_eviction_list != NULL) 3249 arc_do_user_evicts(); 3250 3251#ifdef _KERNEL 3252 if (needfree) { 3253 needfree = 0; 3254 wakeup(&needfree); 3255 } 3256#endif 3257 3258 /* 3259 * This is necessary in order for the mdb ::arc dcmd to 3260 * show up to date information. Since the ::arc command 3261 * does not call the kstat's update function, without 3262 * this call, the command may show stale stats for the 3263 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even 3264 * with this change, the data might be up to 1 second 3265 * out of date; but that should suffice. The arc_state_t 3266 * structures can be queried directly if more accurate 3267 * information is needed. 3268 */ 3269 if (arc_ksp != NULL) 3270 arc_ksp->ks_update(arc_ksp, KSTAT_READ); 3271 3272 /* block until needed, or one second, whichever is shorter */ 3273 CALLB_CPR_SAFE_BEGIN(&cpr); 3274 (void) cv_timedwait(&arc_reclaim_thr_cv, 3275 &arc_reclaim_thr_lock, hz); 3276 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); 3277 } 3278 3279 arc_thread_exit = 0; 3280 cv_broadcast(&arc_reclaim_thr_cv); 3281 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ 3282 thread_exit(); 3283} 3284 3285/* 3286 * Adapt arc info given the number of bytes we are trying to add and 3287 * the state that we are coming from. This function is only called 3288 * when we are adding new content to the cache. 3289 */ 3290static void 3291arc_adapt(int bytes, arc_state_t *state) 3292{ 3293 int mult; 3294 uint64_t arc_p_min = (arc_c >> arc_p_min_shift); 3295 3296 if (state == arc_l2c_only) 3297 return; 3298 3299 ASSERT(bytes > 0); 3300 /* 3301 * Adapt the target size of the MRU list: 3302 * - if we just hit in the MRU ghost list, then increase 3303 * the target size of the MRU list. 3304 * - if we just hit in the MFU ghost list, then increase 3305 * the target size of the MFU list by decreasing the 3306 * target size of the MRU list. 3307 */ 3308 if (state == arc_mru_ghost) { 3309 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? 3310 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); 3311 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ 3312 3313 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); 3314 } else if (state == arc_mfu_ghost) { 3315 uint64_t delta; 3316 3317 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
3318 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); 3319 mult = MIN(mult, 10); 3320 3321 delta = MIN(bytes * mult, arc_p); 3322 arc_p = MAX(arc_p_min, arc_p - delta); 3323 } 3324 ASSERT((int64_t)arc_p >= 0); 3325 3326 if (arc_reclaim_needed()) { 3327 cv_signal(&arc_reclaim_thr_cv); 3328 return; 3329 } 3330 3331 if (arc_no_grow) 3332 return; 3333 3334 if (arc_c >= arc_c_max) 3335 return; 3336 3337 /* 3338 * If we're within (2 * maxblocksize) bytes of the target 3339 * cache size, increment the target cache size 3340 */ 3341 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { 3342 DTRACE_PROBE1(arc__inc_adapt, int, bytes); 3343 atomic_add_64(&arc_c, (int64_t)bytes); 3344 if (arc_c > arc_c_max) 3345 arc_c = arc_c_max; 3346 else if (state == arc_anon) 3347 atomic_add_64(&arc_p, (int64_t)bytes); 3348 if (arc_p > arc_c) 3349 arc_p = arc_c; 3350 } 3351 ASSERT((int64_t)arc_p >= 0); 3352} 3353 3354/* 3355 * Check if the cache has reached its limits and eviction is required 3356 * prior to insert. 3357 */ 3358static int 3359arc_evict_needed(arc_buf_contents_t type) 3360{ 3361 if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) 3362 return (1); 3363 3364 if (arc_reclaim_needed()) 3365 return (1); 3366 3367 return (arc_size > arc_c); 3368} 3369 3370/* 3371 * The buffer, supplied as the first argument, needs a data block. 3372 * So, if we are at cache max, determine which cache should be victimized. 3373 * We have the following cases: 3374 * 3375 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> 3376 * In this situation if we're out of space, but the resident size of the MFU is 3377 * under the limit, victimize the MFU cache to satisfy this insertion request. 3378 * 3379 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> 3380 * Here, we've used up all of the available space for the MRU, so we need to 3381 * evict from our own cache instead. Evict from the set of resident MRU 3382 * entries. 3383 * 3384 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> 3385 * c minus p represents the MFU space in the cache, since p is the size of the 3386 * cache that is dedicated to the MRU. In this situation there's still space on 3387 * the MFU side, so the MRU side needs to be victimized. 3388 * 3389 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> 3390 * MFU's resident set is consuming more space than it has been allotted. In 3391 * this situation, we must victimize our own cache, the MFU, for this insertion. 3392 */ 3393static void 3394arc_get_data_buf(arc_buf_t *buf) 3395{ 3396 arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; 3397 uint64_t size = buf->b_hdr->b_size; 3398 arc_buf_contents_t type = arc_buf_type(buf->b_hdr); 3399 3400 arc_adapt(size, state); 3401 3402 /* 3403 * We have not yet reached cache maximum size, 3404 * just allocate a new buffer. 3405 */ 3406 if (!arc_evict_needed(type)) { 3407 if (type == ARC_BUFC_METADATA) { 3408 buf->b_data = zio_buf_alloc(size); 3409 arc_space_consume(size, ARC_SPACE_META); 3410 } else { 3411 ASSERT(type == ARC_BUFC_DATA); 3412 buf->b_data = zio_data_buf_alloc(size); 3413 arc_space_consume(size, ARC_SPACE_DATA); 3414 } 3415 goto out; 3416 } 3417 3418 /* 3419 * If we are prefetching from the mfu ghost list, this buffer 3420 * will end up on the mru list; so steal space from there. 3421 */ 3422 if (state == arc_mfu_ghost) 3423 state = HDR_PREFETCH(buf->b_hdr) ? 
arc_mru : arc_mfu; 3424 else if (state == arc_mru_ghost) 3425 state = arc_mru; 3426 3427 if (state == arc_mru || state == arc_anon) { 3428 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; 3429 state = (arc_mfu->arcs_lsize[type] >= size && 3430 arc_p > mru_used) ? arc_mfu : arc_mru; 3431 } else { 3432 /* MFU cases */ 3433 uint64_t mfu_space = arc_c - arc_p; 3434 state = (arc_mru->arcs_lsize[type] >= size && 3435 mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; 3436 } 3437 if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) { 3438 if (type == ARC_BUFC_METADATA) { 3439 buf->b_data = zio_buf_alloc(size); 3440 arc_space_consume(size, ARC_SPACE_META); 3441 } else { 3442 ASSERT(type == ARC_BUFC_DATA); 3443 buf->b_data = zio_data_buf_alloc(size); 3444 arc_space_consume(size, ARC_SPACE_DATA); 3445 } 3446 ARCSTAT_BUMP(arcstat_recycle_miss); 3447 } 3448 ASSERT(buf->b_data != NULL); 3449out: 3450 /* 3451 * Update the state size. Note that ghost states have a 3452 * "ghost size" and so don't need to be updated. 3453 */ 3454 if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) { 3455 arc_buf_hdr_t *hdr = buf->b_hdr; 3456 3457 atomic_add_64(&hdr->b_l1hdr.b_state->arcs_size, size); 3458 if (list_link_active(&hdr->b_l1hdr.b_arc_node)) { 3459 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 3460 atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type], 3461 size); 3462 } 3463 /* 3464 * If we are growing the cache, and we are adding anonymous 3465 * data, and we have outgrown arc_p, update arc_p 3466 */ 3467 if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon && 3468 arc_anon->arcs_size + arc_mru->arcs_size > arc_p) 3469 arc_p = MIN(arc_c, arc_p + size); 3470 } 3471 ARCSTAT_BUMP(arcstat_allocated); 3472} 3473 3474/* 3475 * This routine is called whenever a buffer is accessed. 3476 * NOTE: the hash lock is dropped in this function. 3477 */ 3478static void 3479arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) 3480{ 3481 clock_t now; 3482 3483 ASSERT(MUTEX_HELD(hash_lock)); 3484 ASSERT(HDR_HAS_L1HDR(hdr)); 3485 3486 if (hdr->b_l1hdr.b_state == arc_anon) { 3487 /* 3488 * This buffer is not in the cache, and does not 3489 * appear in our "ghost" list. Add the new buffer 3490 * to the MRU state. 3491 */ 3492 3493 ASSERT0(hdr->b_l1hdr.b_arc_access); 3494 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3495 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 3496 arc_change_state(arc_mru, hdr, hash_lock); 3497 3498 } else if (hdr->b_l1hdr.b_state == arc_mru) { 3499 now = ddi_get_lbolt(); 3500 3501 /* 3502 * If this buffer is here because of a prefetch, then either: 3503 * - clear the flag if this is a "referencing" read 3504 * (any subsequent access will bump this into the MFU state). 3505 * or 3506 * - move the buffer to the head of the list if this is 3507 * another prefetch (to make it less likely to be evicted). 3508 */ 3509 if (HDR_PREFETCH(hdr)) { 3510 if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 3511 ASSERT(list_link_active( 3512 &hdr->b_l1hdr.b_arc_node)); 3513 } else { 3514 hdr->b_flags &= ~ARC_FLAG_PREFETCH; 3515 ARCSTAT_BUMP(arcstat_mru_hits); 3516 } 3517 hdr->b_l1hdr.b_arc_access = now; 3518 return; 3519 } 3520 3521 /* 3522 * This buffer has been "accessed" only once so far, 3523 * but it is still in the cache. Move it to the MFU 3524 * state. 3525 */ 3526 if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) { 3527 /* 3528 * More than ARC_MINTIME (about 62 ms) has passed since we 3529 * instantiated this buffer. Move it to the 3530 * most frequently used state.
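 *
 * ARC_MINTIME is defined earlier in this file as (hz>>4) ticks, so the
 * threshold scales with the tick rate; e.g. with hz = 1000 it works out
 * to 1000/16 = 62 ticks, or roughly 62 ms, between the access that
 * brought the buffer in and the access that promotes it.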
3531 */ 3532 hdr->b_l1hdr.b_arc_access = now; 3533 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 3534 arc_change_state(arc_mfu, hdr, hash_lock); 3535 } 3536 ARCSTAT_BUMP(arcstat_mru_hits); 3537 } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { 3538 arc_state_t *new_state; 3539 /* 3540 * This buffer has been "accessed" recently, but 3541 * was evicted from the cache. Move it to the 3542 * MFU state. 3543 */ 3544 3545 if (HDR_PREFETCH(hdr)) { 3546 new_state = arc_mru; 3547 if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) 3548 hdr->b_flags &= ~ARC_FLAG_PREFETCH; 3549 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 3550 } else { 3551 new_state = arc_mfu; 3552 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 3553 } 3554 3555 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3556 arc_change_state(new_state, hdr, hash_lock); 3557 3558 ARCSTAT_BUMP(arcstat_mru_ghost_hits); 3559 } else if (hdr->b_l1hdr.b_state == arc_mfu) { 3560 /* 3561 * This buffer has been accessed more than once and is 3562 * still in the cache. Keep it in the MFU state. 3563 * 3564 * NOTE: an add_reference() that occurred when we did 3565 * the arc_read() will have kicked this off the list. 3566 * If it was a prefetch, we will explicitly move it to 3567 * the head of the list now. 3568 */ 3569 if ((HDR_PREFETCH(hdr)) != 0) { 3570 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 3571 ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); 3572 } 3573 ARCSTAT_BUMP(arcstat_mfu_hits); 3574 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3575 } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { 3576 arc_state_t *new_state = arc_mfu; 3577 /* 3578 * This buffer has been accessed more than once but has 3579 * been evicted from the cache. Move it back to the 3580 * MFU state. 3581 */ 3582 3583 if (HDR_PREFETCH(hdr)) { 3584 /* 3585 * This is a prefetch access... 3586 * move this block back to the MRU state. 3587 */ 3588 ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); 3589 new_state = arc_mru; 3590 } 3591 3592 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3593 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 3594 arc_change_state(new_state, hdr, hash_lock); 3595 3596 ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 3597 } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { 3598 /* 3599 * This buffer is on the 2nd Level ARC. 
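		 * (It has only L2ARC state and no data in memory.  Being
		 * accessed again means the block is wanted in memory, so
		 * it is promoted directly to the MFU state below.)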
3600 */ 3601 3602 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3603 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 3604 arc_change_state(arc_mfu, hdr, hash_lock); 3605 } else { 3606 ASSERT(!"invalid arc state"); 3607 } 3608} 3609 3610/* a generic arc_done_func_t which you can use */ 3611/* ARGSUSED */ 3612void 3613arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 3614{ 3615 if (zio == NULL || zio->io_error == 0) 3616 bcopy(buf->b_data, arg, buf->b_hdr->b_size); 3617 VERIFY(arc_buf_remove_ref(buf, arg)); 3618} 3619 3620/* a generic arc_done_func_t */ 3621void 3622arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 3623{ 3624 arc_buf_t **bufp = arg; 3625 if (zio && zio->io_error) { 3626 VERIFY(arc_buf_remove_ref(buf, arg)); 3627 *bufp = NULL; 3628 } else { 3629 *bufp = buf; 3630 ASSERT(buf->b_data); 3631 } 3632} 3633 3634static void 3635arc_read_done(zio_t *zio) 3636{ 3637 arc_buf_hdr_t *hdr; 3638 arc_buf_t *buf; 3639 arc_buf_t *abuf; /* buffer we're assigning to callback */ 3640 kmutex_t *hash_lock = NULL; 3641 arc_callback_t *callback_list, *acb; 3642 int freeable = FALSE; 3643 3644 buf = zio->io_private; 3645 hdr = buf->b_hdr; 3646 3647 /* 3648 * The hdr was inserted into hash-table and removed from lists 3649 * prior to starting I/O. We should find this header, since 3650 * it's in the hash table, and it should be legit since it's 3651 * not possible to evict it during the I/O. The only possible 3652 * reason for it not to be found is if we were freed during the 3653 * read. 3654 */ 3655 if (HDR_IN_HASH_TABLE(hdr)) { 3656 ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); 3657 ASSERT3U(hdr->b_dva.dva_word[0], ==, 3658 BP_IDENTITY(zio->io_bp)->dva_word[0]); 3659 ASSERT3U(hdr->b_dva.dva_word[1], ==, 3660 BP_IDENTITY(zio->io_bp)->dva_word[1]); 3661 3662 arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp, 3663 &hash_lock); 3664 3665 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && 3666 hash_lock == NULL) || 3667 (found == hdr && 3668 DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || 3669 (found == hdr && HDR_L2_READING(hdr))); 3670 } 3671 3672 hdr->b_flags &= ~ARC_FLAG_L2_EVICTED; 3673 if (l2arc_noprefetch && HDR_PREFETCH(hdr)) 3674 hdr->b_flags &= ~ARC_FLAG_L2CACHE; 3675 3676 /* byteswap if necessary */ 3677 callback_list = hdr->b_l1hdr.b_acb; 3678 ASSERT(callback_list != NULL); 3679 if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { 3680 dmu_object_byteswap_t bswap = 3681 DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); 3682 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? 3683 byteswap_uint64_array : 3684 dmu_ot_byteswap[bswap].ob_func; 3685 func(buf->b_data, hdr->b_size); 3686 } 3687 3688 arc_cksum_compute(buf, B_FALSE); 3689#ifdef illumos 3690 arc_buf_watch(buf); 3691#endif /* illumos */ 3692 3693 if (hash_lock && zio->io_error == 0 && 3694 hdr->b_l1hdr.b_state == arc_anon) { 3695 /* 3696 * Only call arc_access on anonymous buffers. This is because 3697 * if we've issued an I/O for an evicted buffer, we've already 3698 * called arc_access (to prevent any simultaneous readers from 3699 * getting confused). 
3700 */ 3701 arc_access(hdr, hash_lock); 3702 } 3703 3704 /* create copies of the data buffer for the callers */ 3705 abuf = buf; 3706 for (acb = callback_list; acb; acb = acb->acb_next) { 3707 if (acb->acb_done) { 3708 if (abuf == NULL) { 3709 ARCSTAT_BUMP(arcstat_duplicate_reads); 3710 abuf = arc_buf_clone(buf); 3711 } 3712 acb->acb_buf = abuf; 3713 abuf = NULL; 3714 } 3715 } 3716 hdr->b_l1hdr.b_acb = NULL; 3717 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; 3718 ASSERT(!HDR_BUF_AVAILABLE(hdr)); 3719 if (abuf == buf) { 3720 ASSERT(buf->b_efunc == NULL); 3721 ASSERT(hdr->b_l1hdr.b_datacnt == 1); 3722 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 3723 } 3724 3725 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || 3726 callback_list != NULL); 3727 3728 if (zio->io_error != 0) { 3729 hdr->b_flags |= ARC_FLAG_IO_ERROR; 3730 if (hdr->b_l1hdr.b_state != arc_anon) 3731 arc_change_state(arc_anon, hdr, hash_lock); 3732 if (HDR_IN_HASH_TABLE(hdr)) 3733 buf_hash_remove(hdr); 3734 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); 3735 } 3736 3737 /* 3738 * Broadcast before we drop the hash_lock to avoid the possibility 3739 * that the hdr (and hence the cv) might be freed before we get to 3740 * the cv_broadcast(). 3741 */ 3742 cv_broadcast(&hdr->b_l1hdr.b_cv); 3743 3744 if (hash_lock != NULL) { 3745 mutex_exit(hash_lock); 3746 } else { 3747 /* 3748 * This block was freed while we waited for the read to 3749 * complete. It has been removed from the hash table and 3750 * moved to the anonymous state (so that it won't show up 3751 * in the cache). 3752 */ 3753 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 3754 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); 3755 } 3756 3757 /* execute each callback and free its structure */ 3758 while ((acb = callback_list) != NULL) { 3759 if (acb->acb_done) 3760 acb->acb_done(zio, acb->acb_buf, acb->acb_private); 3761 3762 if (acb->acb_zio_dummy != NULL) { 3763 acb->acb_zio_dummy->io_error = zio->io_error; 3764 zio_nowait(acb->acb_zio_dummy); 3765 } 3766 3767 callback_list = acb->acb_next; 3768 kmem_free(acb, sizeof (arc_callback_t)); 3769 } 3770 3771 if (freeable) 3772 arc_hdr_destroy(hdr); 3773} 3774 3775/* 3776 * "Read" the block block at the specified DVA (in bp) via the 3777 * cache. If the block is found in the cache, invoke the provided 3778 * callback immediately and return. Note that the `zio' parameter 3779 * in the callback will be NULL in this case, since no IO was 3780 * required. If the block is not in the cache pass the read request 3781 * on to the spa with a substitute callback function, so that the 3782 * requested block will be added to the cache. 3783 * 3784 * If a read request arrives for a block that has a read in-progress, 3785 * either wait for the in-progress read to complete (and return the 3786 * results); or, if this is a read with a "done" func, add a record 3787 * to the read to invoke the "done" func when the read completes, 3788 * and return; or just return. 3789 * 3790 * arc_read_done() will invoke all the requested "done" functions 3791 * for readers of this block. 
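 *
 * As a purely illustrative sketch (simplified, no error handling, and
 * "my_copy" is a hypothetical destination buffer), a blocking caller that
 * only wants the data copied out might issue:
 *
 *	arc_flags_t aflags = ARC_FLAG_WAIT;
 *	(void) arc_read(NULL, spa, bp, arc_bcopy_func, my_copy,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
 *
 * With ARC_FLAG_NOWAIT instead, arc_read() returns immediately and the
 * "done" callback fires from arc_read_done() when the I/O completes.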
3792 */ 3793int 3794arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, 3795 void *private, zio_priority_t priority, int zio_flags, 3796 arc_flags_t *arc_flags, const zbookmark_phys_t *zb) 3797{ 3798 arc_buf_hdr_t *hdr = NULL; 3799 arc_buf_t *buf = NULL; 3800 kmutex_t *hash_lock = NULL; 3801 zio_t *rzio; 3802 uint64_t guid = spa_load_guid(spa); 3803 3804 ASSERT(!BP_IS_EMBEDDED(bp) || 3805 BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); 3806 3807top: 3808 if (!BP_IS_EMBEDDED(bp)) { 3809 /* 3810 * Embedded BP's have no DVA and require no I/O to "read". 3811 * Create an anonymous arc buf to back it. 3812 */ 3813 hdr = buf_hash_find(guid, bp, &hash_lock); 3814 } 3815 3816 if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) { 3817 3818 *arc_flags |= ARC_FLAG_CACHED; 3819 3820 if (HDR_IO_IN_PROGRESS(hdr)) { 3821 3822 if (*arc_flags & ARC_FLAG_WAIT) { 3823 cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); 3824 mutex_exit(hash_lock); 3825 goto top; 3826 } 3827 ASSERT(*arc_flags & ARC_FLAG_NOWAIT); 3828 3829 if (done) { 3830 arc_callback_t *acb = NULL; 3831 3832 acb = kmem_zalloc(sizeof (arc_callback_t), 3833 KM_SLEEP); 3834 acb->acb_done = done; 3835 acb->acb_private = private; 3836 if (pio != NULL) 3837 acb->acb_zio_dummy = zio_null(pio, 3838 spa, NULL, NULL, NULL, zio_flags); 3839 3840 ASSERT(acb->acb_done != NULL); 3841 acb->acb_next = hdr->b_l1hdr.b_acb; 3842 hdr->b_l1hdr.b_acb = acb; 3843 add_reference(hdr, hash_lock, private); 3844 mutex_exit(hash_lock); 3845 return (0); 3846 } 3847 mutex_exit(hash_lock); 3848 return (0); 3849 } 3850 3851 ASSERT(hdr->b_l1hdr.b_state == arc_mru || 3852 hdr->b_l1hdr.b_state == arc_mfu); 3853 3854 if (done) { 3855 add_reference(hdr, hash_lock, private); 3856 /* 3857 * If this block is already in use, create a new 3858 * copy of the data so that we will be guaranteed 3859 * that arc_release() will always succeed. 
3860 */ 3861 buf = hdr->b_l1hdr.b_buf; 3862 ASSERT(buf); 3863 ASSERT(buf->b_data); 3864 if (HDR_BUF_AVAILABLE(hdr)) { 3865 ASSERT(buf->b_efunc == NULL); 3866 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; 3867 } else { 3868 buf = arc_buf_clone(buf); 3869 } 3870 3871 } else if (*arc_flags & ARC_FLAG_PREFETCH && 3872 refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 3873 hdr->b_flags |= ARC_FLAG_PREFETCH; 3874 } 3875 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 3876 arc_access(hdr, hash_lock); 3877 if (*arc_flags & ARC_FLAG_L2CACHE) 3878 hdr->b_flags |= ARC_FLAG_L2CACHE; 3879 if (*arc_flags & ARC_FLAG_L2COMPRESS) 3880 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 3881 mutex_exit(hash_lock); 3882 ARCSTAT_BUMP(arcstat_hits); 3883 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 3884 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 3885 data, metadata, hits); 3886 3887 if (done) 3888 done(NULL, buf, private); 3889 } else { 3890 uint64_t size = BP_GET_LSIZE(bp); 3891 arc_callback_t *acb; 3892 vdev_t *vd = NULL; 3893 uint64_t addr = 0; 3894 boolean_t devw = B_FALSE; 3895 enum zio_compress b_compress = ZIO_COMPRESS_OFF; 3896 int32_t b_asize = 0; 3897 3898 if (hdr == NULL) { 3899 /* this block is not in the cache */ 3900 arc_buf_hdr_t *exists = NULL; 3901 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); 3902 buf = arc_buf_alloc(spa, size, private, type); 3903 hdr = buf->b_hdr; 3904 if (!BP_IS_EMBEDDED(bp)) { 3905 hdr->b_dva = *BP_IDENTITY(bp); 3906 hdr->b_birth = BP_PHYSICAL_BIRTH(bp); 3907 exists = buf_hash_insert(hdr, &hash_lock); 3908 } 3909 if (exists != NULL) { 3910 /* somebody beat us to the hash insert */ 3911 mutex_exit(hash_lock); 3912 buf_discard_identity(hdr); 3913 (void) arc_buf_remove_ref(buf, private); 3914 goto top; /* restart the IO request */ 3915 } 3916 3917 /* if this is a prefetch, we don't have a reference */ 3918 if (*arc_flags & ARC_FLAG_PREFETCH) { 3919 (void) remove_reference(hdr, hash_lock, 3920 private); 3921 hdr->b_flags |= ARC_FLAG_PREFETCH; 3922 } 3923 if (*arc_flags & ARC_FLAG_L2CACHE) 3924 hdr->b_flags |= ARC_FLAG_L2CACHE; 3925 if (*arc_flags & ARC_FLAG_L2COMPRESS) 3926 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 3927 if (BP_GET_LEVEL(bp) > 0) 3928 hdr->b_flags |= ARC_FLAG_INDIRECT; 3929 } else { 3930 /* 3931 * This block is in the ghost cache. If it was L2-only 3932 * (and thus didn't have an L1 hdr), we realloc the 3933 * header to add an L1 hdr. 
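			 * (That is, the header migrates from the compact
			 * hdr_l2only_cache to hdr_full_cache; a fresh
			 * arc_buf_t and data buffer are then attached to
			 * it below before the read is issued.)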
3934 */ 3935 if (!HDR_HAS_L1HDR(hdr)) { 3936 hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, 3937 hdr_full_cache); 3938 } 3939 3940 ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); 3941 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 3942 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 3943 ASSERT(hdr->b_l1hdr.b_buf == NULL); 3944 3945 /* if this is a prefetch, we don't have a reference */ 3946 if (*arc_flags & ARC_FLAG_PREFETCH) 3947 hdr->b_flags |= ARC_FLAG_PREFETCH; 3948 else 3949 add_reference(hdr, hash_lock, private); 3950 if (*arc_flags & ARC_FLAG_L2CACHE) 3951 hdr->b_flags |= ARC_FLAG_L2CACHE; 3952 if (*arc_flags & ARC_FLAG_L2COMPRESS) 3953 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 3954 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 3955 buf->b_hdr = hdr; 3956 buf->b_data = NULL; 3957 buf->b_efunc = NULL; 3958 buf->b_private = NULL; 3959 buf->b_next = NULL; 3960 hdr->b_l1hdr.b_buf = buf; 3961 ASSERT0(hdr->b_l1hdr.b_datacnt); 3962 hdr->b_l1hdr.b_datacnt = 1; 3963 arc_get_data_buf(buf); 3964 arc_access(hdr, hash_lock); 3965 } 3966 3967 ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); 3968 3969 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); 3970 acb->acb_done = done; 3971 acb->acb_private = private; 3972 3973 ASSERT(hdr->b_l1hdr.b_acb == NULL); 3974 hdr->b_l1hdr.b_acb = acb; 3975 hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; 3976 3977 if (HDR_HAS_L2HDR(hdr) && 3978 (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { 3979 devw = hdr->b_l2hdr.b_dev->l2ad_writing; 3980 addr = hdr->b_l2hdr.b_daddr; 3981 b_compress = HDR_GET_COMPRESS(hdr); 3982 b_asize = hdr->b_l2hdr.b_asize; 3983 /* 3984 * Lock out device removal. 3985 */ 3986 if (vdev_is_dead(vd) || 3987 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) 3988 vd = NULL; 3989 } 3990 3991 if (hash_lock != NULL) 3992 mutex_exit(hash_lock); 3993 3994 /* 3995 * At this point, we have a level 1 cache miss. Try again in 3996 * L2ARC if possible. 3997 */ 3998 ASSERT3U(hdr->b_size, ==, size); 3999 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, 4000 uint64_t, size, zbookmark_phys_t *, zb); 4001 ARCSTAT_BUMP(arcstat_misses); 4002 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 4003 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 4004 data, metadata, misses); 4005#ifdef _KERNEL 4006 curthread->td_ru.ru_inblock++; 4007#endif 4008 4009 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { 4010 /* 4011 * Read from the L2ARC if the following are true: 4012 * 1. The L2ARC vdev was previously cached. 4013 * 2. This buffer still has L2ARC metadata. 4014 * 3. This buffer isn't currently writing to the L2ARC. 4015 * 4. The L2ARC entry wasn't evicted, which may 4016 * also have invalidated the vdev. 4017 * 5. This isn't prefetch and l2arc_noprefetch is set. 4018 */ 4019 if (HDR_HAS_L2HDR(hdr) && 4020 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && 4021 !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { 4022 l2arc_read_callback_t *cb; 4023 4024 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); 4025 ARCSTAT_BUMP(arcstat_l2_hits); 4026 4027 cb = kmem_zalloc(sizeof (l2arc_read_callback_t), 4028 KM_SLEEP); 4029 cb->l2rcb_buf = buf; 4030 cb->l2rcb_spa = spa; 4031 cb->l2rcb_bp = *bp; 4032 cb->l2rcb_zb = *zb; 4033 cb->l2rcb_flags = zio_flags; 4034 cb->l2rcb_compress = b_compress; 4035 4036 ASSERT(addr >= VDEV_LABEL_START_SIZE && 4037 addr + size < vd->vdev_psize - 4038 VDEV_LABEL_END_SIZE); 4039 4040 /* 4041 * l2arc read. The SCL_L2ARC lock will be 4042 * released by l2arc_read_done(). 4043 * Issue a null zio if the underlying buffer 4044 * was squashed to zero size by compression. 
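				 * (ZIO_COMPRESS_EMPTY means the block
				 * compressed away entirely, typically an
				 * all-zero block, so nothing is read from
				 * the device and l2arc_read_done()
				 * reconstructs the contents when it
				 * decompresses the result.)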
4045 */ 4046 if (b_compress == ZIO_COMPRESS_EMPTY) { 4047 rzio = zio_null(pio, spa, vd, 4048 l2arc_read_done, cb, 4049 zio_flags | ZIO_FLAG_DONT_CACHE | 4050 ZIO_FLAG_CANFAIL | 4051 ZIO_FLAG_DONT_PROPAGATE | 4052 ZIO_FLAG_DONT_RETRY); 4053 } else { 4054 rzio = zio_read_phys(pio, vd, addr, 4055 b_asize, buf->b_data, 4056 ZIO_CHECKSUM_OFF, 4057 l2arc_read_done, cb, priority, 4058 zio_flags | ZIO_FLAG_DONT_CACHE | 4059 ZIO_FLAG_CANFAIL | 4060 ZIO_FLAG_DONT_PROPAGATE | 4061 ZIO_FLAG_DONT_RETRY, B_FALSE); 4062 } 4063 DTRACE_PROBE2(l2arc__read, vdev_t *, vd, 4064 zio_t *, rzio); 4065 ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize); 4066 4067 if (*arc_flags & ARC_FLAG_NOWAIT) { 4068 zio_nowait(rzio); 4069 return (0); 4070 } 4071 4072 ASSERT(*arc_flags & ARC_FLAG_WAIT); 4073 if (zio_wait(rzio) == 0) 4074 return (0); 4075 4076 /* l2arc read error; goto zio_read() */ 4077 } else { 4078 DTRACE_PROBE1(l2arc__miss, 4079 arc_buf_hdr_t *, hdr); 4080 ARCSTAT_BUMP(arcstat_l2_misses); 4081 if (HDR_L2_WRITING(hdr)) 4082 ARCSTAT_BUMP(arcstat_l2_rw_clash); 4083 spa_config_exit(spa, SCL_L2ARC, vd); 4084 } 4085 } else { 4086 if (vd != NULL) 4087 spa_config_exit(spa, SCL_L2ARC, vd); 4088 if (l2arc_ndev != 0) { 4089 DTRACE_PROBE1(l2arc__miss, 4090 arc_buf_hdr_t *, hdr); 4091 ARCSTAT_BUMP(arcstat_l2_misses); 4092 } 4093 } 4094 4095 rzio = zio_read(pio, spa, bp, buf->b_data, size, 4096 arc_read_done, buf, priority, zio_flags, zb); 4097 4098 if (*arc_flags & ARC_FLAG_WAIT) 4099 return (zio_wait(rzio)); 4100 4101 ASSERT(*arc_flags & ARC_FLAG_NOWAIT); 4102 zio_nowait(rzio); 4103 } 4104 return (0); 4105} 4106 4107void 4108arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) 4109{ 4110 ASSERT(buf->b_hdr != NULL); 4111 ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon); 4112 ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) || 4113 func == NULL); 4114 ASSERT(buf->b_efunc == NULL); 4115 ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); 4116 4117 buf->b_efunc = func; 4118 buf->b_private = private; 4119} 4120 4121/* 4122 * Notify the arc that a block was freed, and thus will never be used again. 4123 */ 4124void 4125arc_freed(spa_t *spa, const blkptr_t *bp) 4126{ 4127 arc_buf_hdr_t *hdr; 4128 kmutex_t *hash_lock; 4129 uint64_t guid = spa_load_guid(spa); 4130 4131 ASSERT(!BP_IS_EMBEDDED(bp)); 4132 4133 hdr = buf_hash_find(guid, bp, &hash_lock); 4134 if (hdr == NULL) 4135 return; 4136 if (HDR_BUF_AVAILABLE(hdr)) { 4137 arc_buf_t *buf = hdr->b_l1hdr.b_buf; 4138 add_reference(hdr, hash_lock, FTAG); 4139 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; 4140 mutex_exit(hash_lock); 4141 4142 arc_release(buf, FTAG); 4143 (void) arc_buf_remove_ref(buf, FTAG); 4144 } else { 4145 mutex_exit(hash_lock); 4146 } 4147 4148} 4149 4150/* 4151 * Clear the user eviction callback set by arc_set_callback(), first calling 4152 * it if it exists. Because the presence of a callback keeps an arc_buf cached 4153 * clearing the callback may result in the arc_buf being destroyed. However, 4154 * it will not result in the *last* arc_buf being destroyed, hence the data 4155 * will remain cached in the ARC. We make a copy of the arc buffer here so 4156 * that we can process the callback without holding any locks. 4157 * 4158 * It's possible that the callback is already in the process of being cleared 4159 * by another thread. In this case we can not clear the callback. 4160 * 4161 * Returns B_TRUE if the callback was successfully called and cleared. 
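 *
 * (Concretely, B_FALSE is returned when the buffer's b_hdr has already been
 * cleared, i.e. arc_do_user_evicts() owns the buffer and will invoke the
 * callback itself.)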
4162 */ 4163boolean_t 4164arc_clear_callback(arc_buf_t *buf) 4165{ 4166 arc_buf_hdr_t *hdr; 4167 kmutex_t *hash_lock; 4168 arc_evict_func_t *efunc = buf->b_efunc; 4169 void *private = buf->b_private; 4170 list_t *list, *evicted_list; 4171 kmutex_t *lock, *evicted_lock; 4172 4173 mutex_enter(&buf->b_evict_lock); 4174 hdr = buf->b_hdr; 4175 if (hdr == NULL) { 4176 /* 4177 * We are in arc_do_user_evicts(). 4178 */ 4179 ASSERT(buf->b_data == NULL); 4180 mutex_exit(&buf->b_evict_lock); 4181 return (B_FALSE); 4182 } else if (buf->b_data == NULL) { 4183 /* 4184 * We are on the eviction list; process this buffer now 4185 * but let arc_do_user_evicts() do the reaping. 4186 */ 4187 buf->b_efunc = NULL; 4188 mutex_exit(&buf->b_evict_lock); 4189 VERIFY0(efunc(private)); 4190 return (B_TRUE); 4191 } 4192 hash_lock = HDR_LOCK(hdr); 4193 mutex_enter(hash_lock); 4194 hdr = buf->b_hdr; 4195 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 4196 4197 ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <, 4198 hdr->b_l1hdr.b_datacnt); 4199 ASSERT(hdr->b_l1hdr.b_state == arc_mru || 4200 hdr->b_l1hdr.b_state == arc_mfu); 4201 4202 buf->b_efunc = NULL; 4203 buf->b_private = NULL; 4204 4205 if (hdr->b_l1hdr.b_datacnt > 1) { 4206 mutex_exit(&buf->b_evict_lock); 4207 arc_buf_destroy(buf, FALSE, TRUE); 4208 } else { 4209 ASSERT(buf == hdr->b_l1hdr.b_buf); 4210 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 4211 mutex_exit(&buf->b_evict_lock); 4212 } 4213 4214 mutex_exit(hash_lock); 4215 VERIFY0(efunc(private)); 4216 return (B_TRUE); 4217} 4218 4219/* 4220 * Release this buffer from the cache, making it an anonymous buffer. This 4221 * must be done after a read and prior to modifying the buffer contents. 4222 * If the buffer has more than one reference, we must make 4223 * a new hdr for the buffer. 4224 */ 4225void 4226arc_release(arc_buf_t *buf, void *tag) 4227{ 4228 arc_buf_hdr_t *hdr = buf->b_hdr; 4229 4230 /* 4231 * It would be nice to assert that if it's DMU metadata (level > 4232 * 0 || it's the dnode file), then it must be syncing context. 4233 * But we don't know that information at this level. 4234 */ 4235 4236 mutex_enter(&buf->b_evict_lock); 4237 /* 4238 * We don't grab the hash lock prior to this check, because if 4239 * the buffer's header is in the arc_anon state, it won't be 4240 * linked into the hash table. 4241 */ 4242 if (hdr->b_l1hdr.b_state == arc_anon) { 4243 mutex_exit(&buf->b_evict_lock); 4244 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4245 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 4246 ASSERT(!HDR_HAS_L2HDR(hdr)); 4247 ASSERT(BUF_EMPTY(hdr)); 4248 ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1); 4249 ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); 4250 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 4251 4252 ASSERT3P(buf->b_efunc, ==, NULL); 4253 ASSERT3P(buf->b_private, ==, NULL); 4254 4255 hdr->b_l1hdr.b_arc_access = 0; 4256 arc_buf_thaw(buf); 4257 4258 return; 4259 } 4260 4261 kmutex_t *hash_lock = HDR_LOCK(hdr); 4262 mutex_enter(hash_lock); 4263 4264 /* 4265 * This assignment is only valid as long as the hash_lock is 4266 * held, we must be careful not to reference state or the 4267 * b_state field after dropping the lock. 
4268 */ 4269 arc_state_t *state = hdr->b_l1hdr.b_state; 4270 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 4271 ASSERT3P(state, !=, arc_anon); 4272 4273 /* this buffer is not on any list */ 4274 ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0); 4275 4276 if (HDR_HAS_L2HDR(hdr)) { 4277 ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); 4278 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 4279 4280 mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); 4281 trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev, 4282 hdr->b_l2hdr.b_daddr, hdr->b_l2hdr.b_asize, 0); 4283 list_remove(&hdr->b_l2hdr.b_dev->l2ad_buflist, hdr); 4284 4285 /* 4286 * We don't want to leak the b_tmp_cdata buffer that was 4287 * allocated in l2arc_write_buffers() 4288 */ 4289 arc_buf_l2_cdata_free(hdr); 4290 4291 mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); 4292 4293 hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; 4294 } 4295 4296 /* 4297 * Do we have more than one buf? 4298 */ 4299 if (hdr->b_l1hdr.b_datacnt > 1) { 4300 arc_buf_hdr_t *nhdr; 4301 arc_buf_t **bufp; 4302 uint64_t blksz = hdr->b_size; 4303 uint64_t spa = hdr->b_spa; 4304 arc_buf_contents_t type = arc_buf_type(hdr); 4305 uint32_t flags = hdr->b_flags; 4306 4307 ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); 4308 /* 4309 * Pull the data off of this hdr and attach it to 4310 * a new anonymous hdr. 4311 */ 4312 (void) remove_reference(hdr, hash_lock, tag); 4313 bufp = &hdr->b_l1hdr.b_buf; 4314 while (*bufp != buf) 4315 bufp = &(*bufp)->b_next; 4316 *bufp = buf->b_next; 4317 buf->b_next = NULL; 4318 4319 ASSERT3P(state, !=, arc_l2c_only); 4320 ASSERT3U(state->arcs_size, >=, hdr->b_size); 4321 atomic_add_64(&state->arcs_size, -hdr->b_size); 4322 if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { 4323 ASSERT3P(state, !=, arc_l2c_only); 4324 uint64_t *size = &state->arcs_lsize[type]; 4325 ASSERT3U(*size, >=, hdr->b_size); 4326 atomic_add_64(size, -hdr->b_size); 4327 } 4328 4329 /* 4330 * We're releasing a duplicate user data buffer, update 4331 * our statistics accordingly. 
4332 */ 4333 if (HDR_ISTYPE_DATA(hdr)) { 4334 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 4335 ARCSTAT_INCR(arcstat_duplicate_buffers_size, 4336 -hdr->b_size); 4337 } 4338 hdr->b_l1hdr.b_datacnt -= 1; 4339 arc_cksum_verify(buf); 4340#ifdef illumos 4341 arc_buf_unwatch(buf); 4342#endif /* illumos */ 4343 4344 mutex_exit(hash_lock); 4345 4346 nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); 4347 nhdr->b_size = blksz; 4348 nhdr->b_spa = spa; 4349 4350 nhdr->b_flags = flags & ARC_FLAG_L2_WRITING; 4351 nhdr->b_flags |= arc_bufc_to_flags(type); 4352 nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; 4353 4354 nhdr->b_l1hdr.b_buf = buf; 4355 nhdr->b_l1hdr.b_datacnt = 1; 4356 nhdr->b_l1hdr.b_state = arc_anon; 4357 nhdr->b_l1hdr.b_arc_access = 0; 4358 nhdr->b_freeze_cksum = NULL; 4359 4360 (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); 4361 buf->b_hdr = nhdr; 4362 mutex_exit(&buf->b_evict_lock); 4363 atomic_add_64(&arc_anon->arcs_size, blksz); 4364 } else { 4365 mutex_exit(&buf->b_evict_lock); 4366 ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); 4367 /* protected by hash lock */ 4368 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 4369 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4370 arc_change_state(arc_anon, hdr, hash_lock); 4371 hdr->b_l1hdr.b_arc_access = 0; 4372 mutex_exit(hash_lock); 4373 4374 buf_discard_identity(hdr); 4375 arc_buf_thaw(buf); 4376 } 4377 buf->b_efunc = NULL; 4378 buf->b_private = NULL; 4379} 4380 4381int 4382arc_released(arc_buf_t *buf) 4383{ 4384 int released; 4385 4386 mutex_enter(&buf->b_evict_lock); 4387 released = (buf->b_data != NULL && 4388 buf->b_hdr->b_l1hdr.b_state == arc_anon); 4389 mutex_exit(&buf->b_evict_lock); 4390 return (released); 4391} 4392 4393#ifdef ZFS_DEBUG 4394int 4395arc_referenced(arc_buf_t *buf) 4396{ 4397 int referenced; 4398 4399 mutex_enter(&buf->b_evict_lock); 4400 referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); 4401 mutex_exit(&buf->b_evict_lock); 4402 return (referenced); 4403} 4404#endif 4405 4406static void 4407arc_write_ready(zio_t *zio) 4408{ 4409 arc_write_callback_t *callback = zio->io_private; 4410 arc_buf_t *buf = callback->awcb_buf; 4411 arc_buf_hdr_t *hdr = buf->b_hdr; 4412 4413 ASSERT(HDR_HAS_L1HDR(hdr)); 4414 ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); 4415 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 4416 callback->awcb_ready(zio, buf, callback->awcb_private); 4417 4418 /* 4419 * If the IO is already in progress, then this is a re-write 4420 * attempt, so we need to thaw and re-compute the cksum. 4421 * It is the responsibility of the callback to handle the 4422 * accounting for any re-write attempt. 4423 */ 4424 if (HDR_IO_IN_PROGRESS(hdr)) { 4425 mutex_enter(&hdr->b_l1hdr.b_freeze_lock); 4426 if (hdr->b_freeze_cksum != NULL) { 4427 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 4428 hdr->b_freeze_cksum = NULL; 4429 } 4430 mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 4431 } 4432 arc_cksum_compute(buf, B_FALSE); 4433 hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; 4434} 4435 4436/* 4437 * The SPA calls this callback for each physical write that happens on behalf 4438 * of a logical write. See the comment in dbuf_write_physdone() for details. 
4439 */ 4440static void 4441arc_write_physdone(zio_t *zio) 4442{ 4443 arc_write_callback_t *cb = zio->io_private; 4444 if (cb->awcb_physdone != NULL) 4445 cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); 4446} 4447 4448static void 4449arc_write_done(zio_t *zio) 4450{ 4451 arc_write_callback_t *callback = zio->io_private; 4452 arc_buf_t *buf = callback->awcb_buf; 4453 arc_buf_hdr_t *hdr = buf->b_hdr; 4454 4455 ASSERT(hdr->b_l1hdr.b_acb == NULL); 4456 4457 if (zio->io_error == 0) { 4458 if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { 4459 buf_discard_identity(hdr); 4460 } else { 4461 hdr->b_dva = *BP_IDENTITY(zio->io_bp); 4462 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); 4463 } 4464 } else { 4465 ASSERT(BUF_EMPTY(hdr)); 4466 } 4467 4468 /* 4469 * If the block to be written was all-zero or compressed enough to be 4470 * embedded in the BP, no write was performed so there will be no 4471 * dva/birth/checksum. The buffer must therefore remain anonymous 4472 * (and uncached). 4473 */ 4474 if (!BUF_EMPTY(hdr)) { 4475 arc_buf_hdr_t *exists; 4476 kmutex_t *hash_lock; 4477 4478 ASSERT(zio->io_error == 0); 4479 4480 arc_cksum_verify(buf); 4481 4482 exists = buf_hash_insert(hdr, &hash_lock); 4483 if (exists != NULL) { 4484 /* 4485 * This can only happen if we overwrite for 4486 * sync-to-convergence, because we remove 4487 * buffers from the hash table when we arc_free(). 4488 */ 4489 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { 4490 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 4491 panic("bad overwrite, hdr=%p exists=%p", 4492 (void *)hdr, (void *)exists); 4493 ASSERT(refcount_is_zero( 4494 &exists->b_l1hdr.b_refcnt)); 4495 arc_change_state(arc_anon, exists, hash_lock); 4496 mutex_exit(hash_lock); 4497 arc_hdr_destroy(exists); 4498 exists = buf_hash_insert(hdr, &hash_lock); 4499 ASSERT3P(exists, ==, NULL); 4500 } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { 4501 /* nopwrite */ 4502 ASSERT(zio->io_prop.zp_nopwrite); 4503 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 4504 panic("bad nopwrite, hdr=%p exists=%p", 4505 (void *)hdr, (void *)exists); 4506 } else { 4507 /* Dedup */ 4508 ASSERT(hdr->b_l1hdr.b_datacnt == 1); 4509 ASSERT(hdr->b_l1hdr.b_state == arc_anon); 4510 ASSERT(BP_GET_DEDUP(zio->io_bp)); 4511 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); 4512 } 4513 } 4514 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; 4515 /* if it's not anon, we are doing a scrub */ 4516 if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) 4517 arc_access(hdr, hash_lock); 4518 mutex_exit(hash_lock); 4519 } else { 4520 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; 4521 } 4522 4523 ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 4524 callback->awcb_done(zio, buf, callback->awcb_private); 4525 4526 kmem_free(callback, sizeof (arc_write_callback_t)); 4527} 4528 4529zio_t * 4530arc_write(zio_t *pio, spa_t *spa, uint64_t txg, 4531 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, 4532 const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone, 4533 arc_done_func_t *done, void *private, zio_priority_t priority, 4534 int zio_flags, const zbookmark_phys_t *zb) 4535{ 4536 arc_buf_hdr_t *hdr = buf->b_hdr; 4537 arc_write_callback_t *callback; 4538 zio_t *zio; 4539 4540 ASSERT(ready != NULL); 4541 ASSERT(done != NULL); 4542 ASSERT(!HDR_IO_ERROR(hdr)); 4543 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4544 ASSERT(hdr->b_l1hdr.b_acb == NULL); 4545 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 4546 if (l2arc) 4547 hdr->b_flags |= ARC_FLAG_L2CACHE; 4548 if (l2arc_compress) 4549 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 
4550 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 4551 callback->awcb_ready = ready; 4552 callback->awcb_physdone = physdone; 4553 callback->awcb_done = done; 4554 callback->awcb_private = private; 4555 callback->awcb_buf = buf; 4556 4557 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, 4558 arc_write_ready, arc_write_physdone, arc_write_done, callback, 4559 priority, zio_flags, zb); 4560 4561 return (zio); 4562} 4563 4564static int 4565arc_memory_throttle(uint64_t reserve, uint64_t txg) 4566{ 4567#ifdef _KERNEL 4568 uint64_t available_memory = ptob(freemem); 4569 static uint64_t page_load = 0; 4570 static uint64_t last_txg = 0; 4571 4572#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) 4573 available_memory = 4574 MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE))); 4575#endif 4576 4577 if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100) 4578 return (0); 4579 4580 if (txg > last_txg) { 4581 last_txg = txg; 4582 page_load = 0; 4583 } 4584 /* 4585 * If we are in pageout, we know that memory is already tight, 4586 * the arc is already going to be evicting, so we just want to 4587 * continue to let page writes occur as quickly as possible. 4588 */ 4589 if (curproc == pageproc) { 4590 if (page_load > MAX(ptob(minfree), available_memory) / 4) 4591 return (SET_ERROR(ERESTART)); 4592 /* Note: reserve is inflated, so we deflate */ 4593 page_load += reserve / 8; 4594 return (0); 4595 } else if (page_load > 0 && arc_reclaim_needed()) { 4596 /* memory is low, delay before restarting */ 4597 ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 4598 return (SET_ERROR(EAGAIN)); 4599 } 4600 page_load = 0; 4601#endif 4602 return (0); 4603} 4604 4605static void 4606arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, 4607 kstat_named_t *evict_data, kstat_named_t *evict_metadata) 4608{ 4609 size->value.ui64 = state->arcs_size; 4610 evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA]; 4611 evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA]; 4612} 4613 4614static int 4615arc_kstat_update(kstat_t *ksp, int rw) 4616{ 4617 arc_stats_t *as = ksp->ks_data; 4618 4619 if (rw == KSTAT_WRITE) { 4620 return (EACCES); 4621 } else { 4622 arc_kstat_update_state(arc_anon, 4623 &as->arcstat_anon_size, 4624 &as->arcstat_anon_evictable_data, 4625 &as->arcstat_anon_evictable_metadata); 4626 arc_kstat_update_state(arc_mru, 4627 &as->arcstat_mru_size, 4628 &as->arcstat_mru_evictable_data, 4629 &as->arcstat_mru_evictable_metadata); 4630 arc_kstat_update_state(arc_mru_ghost, 4631 &as->arcstat_mru_ghost_size, 4632 &as->arcstat_mru_ghost_evictable_data, 4633 &as->arcstat_mru_ghost_evictable_metadata); 4634 arc_kstat_update_state(arc_mfu, 4635 &as->arcstat_mfu_size, 4636 &as->arcstat_mfu_evictable_data, 4637 &as->arcstat_mfu_evictable_metadata); 4638 arc_kstat_update_state(arc_mfu_ghost, 4639 &as->arcstat_mfu_ghost_size, 4640 &as->arcstat_mfu_ghost_evictable_data, 4641 &as->arcstat_mfu_ghost_evictable_metadata); 4642 } 4643 4644 return (0); 4645} 4646 4647void 4648arc_tempreserve_clear(uint64_t reserve) 4649{ 4650 atomic_add_64(&arc_tempreserve, -reserve); 4651 ASSERT((int64_t)arc_tempreserve >= 0); 4652} 4653 4654int 4655arc_tempreserve_space(uint64_t reserve, uint64_t txg) 4656{ 4657 int error; 4658 uint64_t anon_size; 4659 4660 if (reserve > arc_c/4 && !arc_no_grow) { 4661 arc_c = MIN(arc_c_max, reserve * 4); 4662 DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c); 4663 } 4664 if (reserve > arc_c) 4665 return (SET_ERROR(ENOMEM)); 4666 4667 /* 4668 * 
Don't count loaned bufs as in flight dirty data to prevent long 4669 * network delays from blocking transactions that are ready to be 4670 * assigned to a txg. 4671 */ 4672 anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0); 4673 4674 /* 4675 * Writes will, almost always, require additional memory allocations 4676 * in order to compress/encrypt/etc the data. We therefore need to 4677 * make sure that there is sufficient available memory for this. 4678 */ 4679 error = arc_memory_throttle(reserve, txg); 4680 if (error != 0) 4681 return (error); 4682 4683 /* 4684 * Throttle writes when the amount of dirty data in the cache 4685 * gets too large. We try to keep the cache less than half full 4686 * of dirty blocks so that our sync times don't grow too large. 4687 * Note: if two requests come in concurrently, we might let them 4688 * both succeed, when one of them should fail. Not a huge deal. 4689 */ 4690 4691 if (reserve + arc_tempreserve + anon_size > arc_c / 2 && 4692 anon_size > arc_c / 4) { 4693 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " 4694 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", 4695 arc_tempreserve>>10, 4696 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, 4697 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, 4698 reserve>>10, arc_c>>10); 4699 return (SET_ERROR(ERESTART)); 4700 } 4701 atomic_add_64(&arc_tempreserve, reserve); 4702 return (0); 4703} 4704 4705static kmutex_t arc_lowmem_lock; 4706#ifdef _KERNEL 4707static eventhandler_tag arc_event_lowmem = NULL; 4708 4709static void 4710arc_lowmem(void *arg __unused, int howto __unused) 4711{ 4712 4713 /* Serialize access via arc_lowmem_lock. */ 4714 mutex_enter(&arc_lowmem_lock); 4715 mutex_enter(&arc_reclaim_thr_lock); 4716 needfree = 1; 4717 DTRACE_PROBE(arc__needfree); 4718 cv_signal(&arc_reclaim_thr_cv); 4719 4720 /* 4721 * It is unsafe to block here in arbitrary threads, because we can come 4722 * here from ARC itself and may hold ARC locks and thus risk a deadlock 4723 * with ARC reclaim thread. 4724 */ 4725 if (curproc == pageproc) { 4726 while (needfree) 4727 msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0); 4728 } 4729 mutex_exit(&arc_reclaim_thr_lock); 4730 mutex_exit(&arc_lowmem_lock); 4731} 4732#endif 4733 4734void 4735arc_init(void) 4736{ 4737 int i, prefetch_tunable_set = 0; 4738 4739 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); 4740 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); 4741 mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL); 4742 4743 /* Convert seconds to clock ticks */ 4744 arc_min_prefetch_lifespan = 1 * hz; 4745 4746 /* Start out with 1/8 of all memory */ 4747 arc_c = kmem_size() / 8; 4748 4749#ifdef sun 4750#ifdef _KERNEL 4751 /* 4752 * On architectures where the physical memory can be larger 4753 * than the addressable space (intel in 32-bit mode), we may 4754 * need to limit the cache to 1/8 of VM size. 4755 */ 4756 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); 4757#endif 4758#endif /* sun */ 4759 /* set min cache to 1/32 of all memory, or 16MB, whichever is more */ 4760 arc_c_min = MAX(arc_c / 4, 16 << 20); 4761 /* set max to 1/2 of all memory, or all but 1GB, whichever is more */ 4762 if (arc_c * 8 >= 1 << 30) 4763 arc_c_max = (arc_c * 8) - (1 << 30); 4764 else 4765 arc_c_max = arc_c_min; 4766 arc_c_max = MAX(arc_c * 5, arc_c_max); 4767 4768#ifdef _KERNEL 4769 /* 4770 * Allow the tunables to override our calculations if they are 4771 * reasonable (ie. 
over 16MB) 4772 */ 4773 if (zfs_arc_max > 16 << 20 && zfs_arc_max < kmem_size()) 4774 arc_c_max = zfs_arc_max; 4775 if (zfs_arc_min > 16 << 20 && zfs_arc_min <= arc_c_max) 4776 arc_c_min = zfs_arc_min; 4777#endif 4778 4779 arc_c = arc_c_max; 4780 arc_p = (arc_c >> 1); 4781 4782 /* limit meta-data to 1/4 of the arc capacity */ 4783 arc_meta_limit = arc_c_max / 4; 4784 4785 /* Allow the tunable to override if it is reasonable */ 4786 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) 4787 arc_meta_limit = zfs_arc_meta_limit; 4788 4789 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) 4790 arc_c_min = arc_meta_limit / 2; 4791 4792 if (zfs_arc_meta_min > 0) { 4793 arc_meta_min = zfs_arc_meta_min; 4794 } else { 4795 arc_meta_min = arc_c_min / 2; 4796 } 4797 4798 if (zfs_arc_grow_retry > 0) 4799 arc_grow_retry = zfs_arc_grow_retry; 4800 4801 if (zfs_arc_shrink_shift > 0) 4802 arc_shrink_shift = zfs_arc_shrink_shift; 4803 4804 if (zfs_arc_p_min_shift > 0) 4805 arc_p_min_shift = zfs_arc_p_min_shift; 4806 4807 /* if kmem_flags are set, lets try to use less memory */ 4808 if (kmem_debugging()) 4809 arc_c = arc_c / 2; 4810 if (arc_c < arc_c_min) 4811 arc_c = arc_c_min; 4812 4813 zfs_arc_min = arc_c_min; 4814 zfs_arc_max = arc_c_max; 4815 4816 arc_anon = &ARC_anon; 4817 arc_mru = &ARC_mru; 4818 arc_mru_ghost = &ARC_mru_ghost; 4819 arc_mfu = &ARC_mfu; 4820 arc_mfu_ghost = &ARC_mfu_ghost; 4821 arc_l2c_only = &ARC_l2c_only; 4822 arc_size = 0; 4823 4824 for (i = 0; i < ARC_BUFC_NUMLISTS; i++) { 4825 mutex_init(&arc_anon->arcs_locks[i].arcs_lock, 4826 NULL, MUTEX_DEFAULT, NULL); 4827 mutex_init(&arc_mru->arcs_locks[i].arcs_lock, 4828 NULL, MUTEX_DEFAULT, NULL); 4829 mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock, 4830 NULL, MUTEX_DEFAULT, NULL); 4831 mutex_init(&arc_mfu->arcs_locks[i].arcs_lock, 4832 NULL, MUTEX_DEFAULT, NULL); 4833 mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock, 4834 NULL, MUTEX_DEFAULT, NULL); 4835 mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock, 4836 NULL, MUTEX_DEFAULT, NULL); 4837 4838 list_create(&arc_mru->arcs_lists[i], 4839 sizeof (arc_buf_hdr_t), 4840 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4841 list_create(&arc_mru_ghost->arcs_lists[i], 4842 sizeof (arc_buf_hdr_t), 4843 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4844 list_create(&arc_mfu->arcs_lists[i], 4845 sizeof (arc_buf_hdr_t), 4846 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4847 list_create(&arc_mfu_ghost->arcs_lists[i], 4848 sizeof (arc_buf_hdr_t), 4849 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4850 list_create(&arc_mfu_ghost->arcs_lists[i], 4851 sizeof (arc_buf_hdr_t), 4852 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4853 list_create(&arc_l2c_only->arcs_lists[i], 4854 sizeof (arc_buf_hdr_t), 4855 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4856 } 4857 4858 buf_init(); 4859 4860 arc_thread_exit = 0; 4861 arc_eviction_list = NULL; 4862 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); 4863 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); 4864 4865 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, 4866 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 4867 4868 if (arc_ksp != NULL) { 4869 arc_ksp->ks_data = &arc_stats; 4870 arc_ksp->ks_update = arc_kstat_update; 4871 kstat_install(arc_ksp); 4872 } 4873 4874 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 4875 TS_RUN, minclsyspri); 4876 4877#ifdef _KERNEL 4878 arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL, 4879 EVENTHANDLER_PRI_FIRST); 4880#endif 
4881 4882 arc_dead = FALSE; 4883 arc_warm = B_FALSE; 4884 4885 /* 4886 * Calculate maximum amount of dirty data per pool. 4887 * 4888 * If it has been set by /etc/system, take that. 4889 * Otherwise, use a percentage of physical memory defined by 4890 * zfs_dirty_data_max_percent (default 10%) with a cap at 4891 * zfs_dirty_data_max_max (default 4GB). 4892 */ 4893 if (zfs_dirty_data_max == 0) { 4894 zfs_dirty_data_max = ptob(physmem) * 4895 zfs_dirty_data_max_percent / 100; 4896 zfs_dirty_data_max = MIN(zfs_dirty_data_max, 4897 zfs_dirty_data_max_max); 4898 } 4899 4900#ifdef _KERNEL 4901 if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable)) 4902 prefetch_tunable_set = 1; 4903 4904#ifdef __i386__ 4905 if (prefetch_tunable_set == 0) { 4906 printf("ZFS NOTICE: Prefetch is disabled by default on i386 " 4907 "-- to enable,\n"); 4908 printf(" add \"vfs.zfs.prefetch_disable=0\" " 4909 "to /boot/loader.conf.\n"); 4910 zfs_prefetch_disable = 1; 4911 } 4912#else 4913 if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) && 4914 prefetch_tunable_set == 0) { 4915 printf("ZFS NOTICE: Prefetch is disabled by default if less " 4916 "than 4GB of RAM is present;\n" 4917 " to enable, add \"vfs.zfs.prefetch_disable=0\" " 4918 "to /boot/loader.conf.\n"); 4919 zfs_prefetch_disable = 1; 4920 } 4921#endif 4922 /* Warn about ZFS memory and address space requirements. */ 4923 if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) { 4924 printf("ZFS WARNING: Recommended minimum RAM size is 512MB; " 4925 "expect unstable behavior.\n"); 4926 } 4927 if (kmem_size() < 512 * (1 << 20)) { 4928 printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; " 4929 "expect unstable behavior.\n"); 4930 printf(" Consider tuning vm.kmem_size and " 4931 "vm.kmem_size_max\n"); 4932 printf(" in /boot/loader.conf.\n"); 4933 } 4934#endif 4935} 4936 4937void 4938arc_fini(void) 4939{ 4940 int i; 4941 4942 mutex_enter(&arc_reclaim_thr_lock); 4943 arc_thread_exit = 1; 4944 cv_signal(&arc_reclaim_thr_cv); 4945 while (arc_thread_exit != 0) 4946 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); 4947 mutex_exit(&arc_reclaim_thr_lock); 4948 4949 arc_flush(NULL); 4950 4951 arc_dead = TRUE; 4952 4953 if (arc_ksp != NULL) { 4954 kstat_delete(arc_ksp); 4955 arc_ksp = NULL; 4956 } 4957 4958 mutex_destroy(&arc_eviction_mtx); 4959 mutex_destroy(&arc_reclaim_thr_lock); 4960 cv_destroy(&arc_reclaim_thr_cv); 4961 4962 for (i = 0; i < ARC_BUFC_NUMLISTS; i++) { 4963 list_destroy(&arc_mru->arcs_lists[i]); 4964 list_destroy(&arc_mru_ghost->arcs_lists[i]); 4965 list_destroy(&arc_mfu->arcs_lists[i]); 4966 list_destroy(&arc_mfu_ghost->arcs_lists[i]); 4967 list_destroy(&arc_l2c_only->arcs_lists[i]); 4968 4969 mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock); 4970 mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock); 4971 mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock); 4972 mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock); 4973 mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock); 4974 mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock); 4975 } 4976 4977 buf_fini(); 4978 4979 ASSERT0(arc_loaned_bytes); 4980 4981 mutex_destroy(&arc_lowmem_lock); 4982#ifdef _KERNEL 4983 if (arc_event_lowmem != NULL) 4984 EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); 4985#endif 4986} 4987 4988/* 4989 * Level 2 ARC 4990 * 4991 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. 4992 * It uses dedicated storage devices to hold cached data, which are populated 4993 * using large infrequent writes. 
The main role of this cache is to boost 4994 * the performance of random read workloads. The intended L2ARC devices 4995 * include short-stroked disks, solid state disks, and other media with 4996 * substantially faster read latency than disk. 4997 * 4998 * +-----------------------+ 4999 * | ARC | 5000 * +-----------------------+ 5001 * | ^ ^ 5002 * | | | 5003 * l2arc_feed_thread() arc_read() 5004 * | | | 5005 * | l2arc read | 5006 * V | | 5007 * +---------------+ | 5008 * | L2ARC | | 5009 * +---------------+ | 5010 * | ^ | 5011 * l2arc_write() | | 5012 * | | | 5013 * V | | 5014 * +-------+ +-------+ 5015 * | vdev | | vdev | 5016 * | cache | | cache | 5017 * +-------+ +-------+ 5018 * +=========+ .-----. 5019 * : L2ARC : |-_____-| 5020 * : devices : | Disks | 5021 * +=========+ `-_____-' 5022 * 5023 * Read requests are satisfied from the following sources, in order: 5024 * 5025 * 1) ARC 5026 * 2) vdev cache of L2ARC devices 5027 * 3) L2ARC devices 5028 * 4) vdev cache of disks 5029 * 5) disks 5030 * 5031 * Some L2ARC device types exhibit extremely slow write performance. 5032 * To accommodate for this there are some significant differences between 5033 * the L2ARC and traditional cache design: 5034 * 5035 * 1. There is no eviction path from the ARC to the L2ARC. Evictions from 5036 * the ARC behave as usual, freeing buffers and placing headers on ghost 5037 * lists. The ARC does not send buffers to the L2ARC during eviction as 5038 * this would add inflated write latencies for all ARC memory pressure. 5039 * 5040 * 2. The L2ARC attempts to cache data from the ARC before it is evicted. 5041 * It does this by periodically scanning buffers from the eviction-end of 5042 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are 5043 * not already there. It scans until a headroom of buffers is satisfied, 5044 * which itself is a buffer for ARC eviction. If a compressible buffer is 5045 * found during scanning and selected for writing to an L2ARC device, we 5046 * temporarily boost scanning headroom during the next scan cycle to make 5047 * sure we adapt to compression effects (which might significantly reduce 5048 * the data volume we write to L2ARC). The thread that does this is 5049 * l2arc_feed_thread(), illustrated below; example sizes are included to 5050 * provide a better sense of ratio than this diagram: 5051 * 5052 * head --> tail 5053 * +---------------------+----------+ 5054 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC 5055 * +---------------------+----------+ | o L2ARC eligible 5056 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer 5057 * +---------------------+----------+ | 5058 * 15.9 Gbytes ^ 32 Mbytes | 5059 * headroom | 5060 * l2arc_feed_thread() 5061 * | 5062 * l2arc write hand <--[oooo]--' 5063 * | 8 Mbyte 5064 * | write max 5065 * V 5066 * +==============================+ 5067 * L2ARC dev |####|#|###|###| |####| ... | 5068 * +==============================+ 5069 * 32 Gbytes 5070 * 5071 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of 5072 * evicted, then the L2ARC has cached a buffer much sooner than it probably 5073 * needed to, potentially wasting L2ARC device bandwidth and storage. It is 5074 * safe to say that this is an uncommon case, since buffers at the end of 5075 * the ARC lists have moved there due to inactivity. 5076 * 5077 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, 5078 * then the L2ARC simply misses copying some buffers. 
This serves as a 5079 * pressure valve to prevent heavy read workloads from both stalling the ARC 5080 * with waits and clogging the L2ARC with writes. This also helps prevent 5081 * the potential for the L2ARC to churn if it attempts to cache content too 5082 * quickly, such as during backups of the entire pool. 5083 * 5084 * 5. After system boot and before the ARC has filled main memory, there are 5085 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru 5086 * lists can remain mostly static. Instead of searching from tail of these 5087 * lists as pictured, the l2arc_feed_thread() will search from the list heads 5088 * for eligible buffers, greatly increasing its chance of finding them. 5089 * 5090 * The L2ARC device write speed is also boosted during this time so that 5091 * the L2ARC warms up faster. Since there have been no ARC evictions yet, 5092 * there are no L2ARC reads, and no fear of degrading read performance 5093 * through increased writes. 5094 * 5095 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that 5096 * the vdev queue can aggregate them into larger and fewer writes. Each 5097 * device is written to in a rotor fashion, sweeping writes through 5098 * available space then repeating. 5099 * 5100 * 7. The L2ARC does not store dirty content. It never needs to flush 5101 * write buffers back to disk based storage. 5102 * 5103 * 8. If an ARC buffer is written (and dirtied) which also exists in the 5104 * L2ARC, the now stale L2ARC buffer is immediately dropped. 5105 * 5106 * The performance of the L2ARC can be tweaked by a number of tunables, which 5107 * may be necessary for different workloads: 5108 * 5109 * l2arc_write_max max write bytes per interval 5110 * l2arc_write_boost extra write bytes during device warmup 5111 * l2arc_noprefetch skip caching prefetched buffers 5112 * l2arc_headroom number of max device writes to precache 5113 * l2arc_headroom_boost when we find compressed buffers during ARC 5114 * scanning, we multiply headroom by this 5115 * percentage factor for the next scan cycle, 5116 * since more compressed buffers are likely to 5117 * be present 5118 * l2arc_feed_secs seconds between L2ARC writing 5119 * 5120 * Tunables may be removed or added as future performance improvements are 5121 * integrated, and also may become zpool properties. 5122 * 5123 * There are three key functions that control how the L2ARC warms up: 5124 * 5125 * l2arc_write_eligible() check if a buffer is eligible to cache 5126 * l2arc_write_size() calculate how much to write 5127 * l2arc_write_interval() calculate sleep delay between writes 5128 * 5129 * These three functions determine what to write, how much, and how quickly 5130 * to send writes. 5131 */ 5132 5133static boolean_t 5134l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) 5135{ 5136 /* 5137 * A buffer is *not* eligible for the L2ARC if it: 5138 * 1. belongs to a different spa. 5139 * 2. is already cached on the L2ARC. 5140 * 3. has an I/O in progress (it may be an incomplete read). 5141 * 4. is flagged not eligible (zfs property). 
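	 *
	 * (Each rejection case bumps its own kstat below, namely
	 * arcstat_l2_write_spa_mismatch, arcstat_l2_write_in_l2,
	 * arcstat_l2_write_hdr_io_in_progress and
	 * arcstat_l2_write_not_cacheable, so arcstats shows why candidate
	 * buffers are being skipped.)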
5142 */ 5143 if (hdr->b_spa != spa_guid) { 5144 ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch); 5145 return (B_FALSE); 5146 } 5147 if (HDR_HAS_L2HDR(hdr)) { 5148 ARCSTAT_BUMP(arcstat_l2_write_in_l2); 5149 return (B_FALSE); 5150 } 5151 if (HDR_IO_IN_PROGRESS(hdr)) { 5152 ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress); 5153 return (B_FALSE); 5154 } 5155 if (!HDR_L2CACHE(hdr)) { 5156 ARCSTAT_BUMP(arcstat_l2_write_not_cacheable); 5157 return (B_FALSE); 5158 } 5159 5160 return (B_TRUE); 5161} 5162 5163static uint64_t 5164l2arc_write_size(void) 5165{ 5166 uint64_t size; 5167 5168 /* 5169 * Make sure our globals have meaningful values in case the user 5170 * altered them. 5171 */ 5172 size = l2arc_write_max; 5173 if (size == 0) { 5174 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " 5175 "be greater than zero, resetting it to the default (%d)", 5176 L2ARC_WRITE_SIZE); 5177 size = l2arc_write_max = L2ARC_WRITE_SIZE; 5178 } 5179 5180 if (arc_warm == B_FALSE) 5181 size += l2arc_write_boost; 5182 5183 return (size); 5184 5185} 5186 5187static clock_t 5188l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) 5189{ 5190 clock_t interval, next, now; 5191 5192 /* 5193 * If the ARC lists are busy, increase our write rate; if the 5194 * lists are stale, idle back. This is achieved by checking 5195 * how much we previously wrote - if it was more than half of 5196 * what we wanted, schedule the next write much sooner. 5197 */ 5198 if (l2arc_feed_again && wrote > (wanted / 2)) 5199 interval = (hz * l2arc_feed_min_ms) / 1000; 5200 else 5201 interval = hz * l2arc_feed_secs; 5202 5203 now = ddi_get_lbolt(); 5204 next = MAX(now, MIN(now + interval, began + interval)); 5205 5206 return (next); 5207} 5208 5209/* 5210 * Cycle through L2ARC devices. This is how L2ARC load balances. 5211 * If a device is returned, this also returns holding the spa config lock. 5212 */ 5213static l2arc_dev_t * 5214l2arc_dev_get_next(void) 5215{ 5216 l2arc_dev_t *first, *next = NULL; 5217 5218 /* 5219 * Lock out the removal of spas (spa_namespace_lock), then removal 5220 * of cache devices (l2arc_dev_mtx). Once a device has been selected, 5221 * both locks will be dropped and a spa config lock held instead. 5222 */ 5223 mutex_enter(&spa_namespace_lock); 5224 mutex_enter(&l2arc_dev_mtx); 5225 5226 /* if there are no vdevs, there is nothing to do */ 5227 if (l2arc_ndev == 0) 5228 goto out; 5229 5230 first = NULL; 5231 next = l2arc_dev_last; 5232 do { 5233 /* loop around the list looking for a non-faulted vdev */ 5234 if (next == NULL) { 5235 next = list_head(l2arc_dev_list); 5236 } else { 5237 next = list_next(l2arc_dev_list, next); 5238 if (next == NULL) 5239 next = list_head(l2arc_dev_list); 5240 } 5241 5242 /* if we have come back to the start, bail out */ 5243 if (first == NULL) 5244 first = next; 5245 else if (next == first) 5246 break; 5247 5248 } while (vdev_is_dead(next->l2ad_vdev)); 5249 5250 /* if we were unable to find any usable vdevs, return NULL */ 5251 if (vdev_is_dead(next->l2ad_vdev)) 5252 next = NULL; 5253 5254 l2arc_dev_last = next; 5255 5256out: 5257 mutex_exit(&l2arc_dev_mtx); 5258 5259 /* 5260 * Grab the config lock to prevent the 'next' device from being 5261 * removed while we are writing to it. 5262 */ 5263 if (next != NULL) 5264 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); 5265 mutex_exit(&spa_namespace_lock); 5266 5267 return (next); 5268} 5269 5270/* 5271 * Free buffers that were tagged for destruction. 
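 * (These are data buffers that could not be freed immediately because an
 * in-flight L2ARC write still referenced them; the free is deferred onto
 * the l2arc_free_on_write list and performed here once the write has
 * completed.)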
5272 */ 5273static void 5274l2arc_do_free_on_write() 5275{ 5276 list_t *buflist; 5277 l2arc_data_free_t *df, *df_prev; 5278 5279 mutex_enter(&l2arc_free_on_write_mtx); 5280 buflist = l2arc_free_on_write; 5281 5282 for (df = list_tail(buflist); df; df = df_prev) { 5283 df_prev = list_prev(buflist, df); 5284 ASSERT(df->l2df_data != NULL); 5285 ASSERT(df->l2df_func != NULL); 5286 df->l2df_func(df->l2df_data, df->l2df_size); 5287 list_remove(buflist, df); 5288 kmem_free(df, sizeof (l2arc_data_free_t)); 5289 } 5290 5291 mutex_exit(&l2arc_free_on_write_mtx); 5292} 5293 5294/* 5295 * A write to a cache device has completed. Update all headers to allow 5296 * reads from these buffers to begin. 5297 */ 5298static void 5299l2arc_write_done(zio_t *zio) 5300{ 5301 l2arc_write_callback_t *cb; 5302 l2arc_dev_t *dev; 5303 list_t *buflist; 5304 arc_buf_hdr_t *head, *hdr, *hdr_prev; 5305 kmutex_t *hash_lock; 5306 int64_t bytes_dropped = 0; 5307 5308 cb = zio->io_private; 5309 ASSERT(cb != NULL); 5310 dev = cb->l2wcb_dev; 5311 ASSERT(dev != NULL); 5312 head = cb->l2wcb_head; 5313 ASSERT(head != NULL); 5314 buflist = &dev->l2ad_buflist; 5315 ASSERT(buflist != NULL); 5316 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, 5317 l2arc_write_callback_t *, cb); 5318 5319 if (zio->io_error != 0) 5320 ARCSTAT_BUMP(arcstat_l2_writes_error); 5321 5322 mutex_enter(&dev->l2ad_mtx); 5323 5324 /* 5325 * All writes completed, or an error was hit. 5326 */ 5327 for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { 5328 hdr_prev = list_prev(buflist, hdr); 5329 5330 hash_lock = HDR_LOCK(hdr); 5331 if (!mutex_tryenter(hash_lock)) { 5332 /* 5333 * This buffer misses out. It may be in a stage 5334 * of eviction. Its ARC_FLAG_L2_WRITING flag will be 5335 * left set, denying reads to this buffer. 5336 */ 5337 ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); 5338 continue; 5339 } 5340 5341 /* 5342 * It's possible that this buffer got evicted from the L1 cache 5343 * before we grabbed the vdev + hash locks, in which case 5344 * arc_hdr_realloc freed b_tmp_cdata for us if it was allocated. 5345 * Only free the buffer if we still have an L1 hdr. 5346 */ 5347 if (HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_tmp_cdata != NULL && 5348 HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) 5349 l2arc_release_cdata_buf(hdr); 5350 5351 if (zio->io_error != 0) { 5352 /* 5353 * Error - drop L2ARC entry. 5354 */ 5355 trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev, 5356 hdr->b_l2hdr.b_daddr, hdr->b_l2hdr.b_asize, 0); 5357 hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; 5358 5359 ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); 5360 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 5361 } 5362 5363 /* 5364 * Allow ARC to begin reads to this L2ARC entry. 5365 */ 5366 hdr->b_flags &= ~ARC_FLAG_L2_WRITING; 5367 5368 mutex_exit(hash_lock); 5369 } 5370 5371 atomic_inc_64(&l2arc_writes_done); 5372 list_remove(buflist, head); 5373 ASSERT(!HDR_HAS_L1HDR(head)); 5374 kmem_cache_free(hdr_l2only_cache, head); 5375 mutex_exit(&dev->l2ad_mtx); 5376 5377 vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); 5378 5379 l2arc_do_free_on_write(); 5380 5381 kmem_free(cb, sizeof (l2arc_write_callback_t)); 5382} 5383 5384/* 5385 * A read to a cache device completed. Validate buffer contents before 5386 * handing over to the regular ARC routines. 
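 * If the checksum does not match or the device I/O failed, the read is
 * reissued to the original pool location below, so a stale or failing
 * cache device costs performance rather than correctness.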
 */
static void
l2arc_read_done(zio_t *zio)
{
	l2arc_read_callback_t *cb;
	arc_buf_hdr_t *hdr;
	arc_buf_t *buf;
	kmutex_t *hash_lock;
	int equal;

	ASSERT(zio->io_vd != NULL);
	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);

	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);

	cb = zio->io_private;
	ASSERT(cb != NULL);
	buf = cb->l2rcb_buf;
	ASSERT(buf != NULL);

	hash_lock = HDR_LOCK(buf->b_hdr);
	mutex_enter(hash_lock);
	hdr = buf->b_hdr;
	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));

	/*
	 * If the buffer was compressed, decompress it first.
	 */
	if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
		l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
	ASSERT(zio->io_data != NULL);

	/*
	 * Check that this buffer survived the L2ARC journey.
	 */
	equal = arc_cksum_equal(buf);
	if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
		mutex_exit(hash_lock);
		zio->io_private = buf;
		zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
		zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
		arc_read_done(zio);
	} else {
		mutex_exit(hash_lock);
		/*
		 * Buffer didn't survive caching.  Increment stats and
		 * reissue to the original storage device.
		 */
		if (zio->io_error != 0) {
			ARCSTAT_BUMP(arcstat_l2_io_error);
		} else {
			zio->io_error = SET_ERROR(EIO);
		}
		if (!equal)
			ARCSTAT_BUMP(arcstat_l2_cksum_bad);

		/*
		 * If there's no waiter, issue an async i/o to the primary
		 * storage now.  If there *is* a waiter, the caller must
		 * issue the i/o in a context where it's OK to block.
		 */
		if (zio->io_waiter == NULL) {
			zio_t *pio = zio_unique_parent(zio);

			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);

			zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
			    buf->b_data, zio->io_size, arc_read_done, buf,
			    zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
		}
	}

	kmem_free(cb, sizeof (l2arc_read_callback_t));
}

/*
 * This is the list priority from which the L2ARC will search for pages to
 * cache.  This is used within loops (0..2 * ARC_BUFC_NUMLISTS - 1) to cycle
 * through the lists in the desired order.  This order can have a significant
 * effect on cache performance.
 *
 * Currently the metadata lists are hit first, MFU then MRU, followed by
 * the data lists.  This function returns a locked list, and also returns
 * the lock pointer.
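 * For example, the first ARC_BUFC_NUMMETADATALISTS values of list_num map
 * to the MFU metadata lists, the next ARC_BUFC_NUMMETADATALISTS values to
 * the MRU metadata lists, and the remaining values to the MFU and then MRU
 * data lists, matching the branches below.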
5471 */ 5472static list_t * 5473l2arc_list_locked(int list_num, kmutex_t **lock) 5474{ 5475 list_t *list = NULL; 5476 int idx; 5477 5478 ASSERT(list_num >= 0 && list_num < 2 * ARC_BUFC_NUMLISTS); 5479 5480 if (list_num < ARC_BUFC_NUMMETADATALISTS) { 5481 idx = list_num; 5482 list = &arc_mfu->arcs_lists[idx]; 5483 *lock = ARCS_LOCK(arc_mfu, idx); 5484 } else if (list_num < ARC_BUFC_NUMMETADATALISTS * 2) { 5485 idx = list_num - ARC_BUFC_NUMMETADATALISTS; 5486 list = &arc_mru->arcs_lists[idx]; 5487 *lock = ARCS_LOCK(arc_mru, idx); 5488 } else if (list_num < (ARC_BUFC_NUMMETADATALISTS * 2 + 5489 ARC_BUFC_NUMDATALISTS)) { 5490 idx = list_num - ARC_BUFC_NUMMETADATALISTS; 5491 list = &arc_mfu->arcs_lists[idx]; 5492 *lock = ARCS_LOCK(arc_mfu, idx); 5493 } else { 5494 idx = list_num - ARC_BUFC_NUMLISTS; 5495 list = &arc_mru->arcs_lists[idx]; 5496 *lock = ARCS_LOCK(arc_mru, idx); 5497 } 5498 5499 ASSERT(!(MUTEX_HELD(*lock))); 5500 mutex_enter(*lock); 5501 return (list); 5502} 5503 5504/* 5505 * Evict buffers from the device write hand to the distance specified in 5506 * bytes. This distance may span populated buffers, it may span nothing. 5507 * This is clearing a region on the L2ARC device ready for writing. 5508 * If the 'all' boolean is set, every buffer is evicted. 5509 */ 5510static void 5511l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) 5512{ 5513 list_t *buflist; 5514 arc_buf_hdr_t *hdr, *hdr_prev; 5515 kmutex_t *hash_lock; 5516 uint64_t taddr; 5517 int64_t bytes_evicted = 0; 5518 5519 buflist = &dev->l2ad_buflist; 5520 5521 if (!all && dev->l2ad_first) { 5522 /* 5523 * This is the first sweep through the device. There is 5524 * nothing to evict. 5525 */ 5526 return; 5527 } 5528 5529 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { 5530 /* 5531 * When nearing the end of the device, evict to the end 5532 * before the device write hand jumps to the start. 5533 */ 5534 taddr = dev->l2ad_end; 5535 } else { 5536 taddr = dev->l2ad_hand + distance; 5537 } 5538 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, 5539 uint64_t, taddr, boolean_t, all); 5540 5541top: 5542 mutex_enter(&dev->l2ad_mtx); 5543 for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { 5544 hdr_prev = list_prev(buflist, hdr); 5545 5546 hash_lock = HDR_LOCK(hdr); 5547 if (!mutex_tryenter(hash_lock)) { 5548 /* 5549 * Missed the hash lock. Retry. 5550 */ 5551 ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); 5552 mutex_exit(&dev->l2ad_mtx); 5553 mutex_enter(hash_lock); 5554 mutex_exit(hash_lock); 5555 goto top; 5556 } 5557 5558 if (HDR_L2_WRITE_HEAD(hdr)) { 5559 /* 5560 * We hit a write head node. Leave it for 5561 * l2arc_write_done(). 5562 */ 5563 list_remove(buflist, hdr); 5564 mutex_exit(hash_lock); 5565 continue; 5566 } 5567 5568 if (!all && HDR_HAS_L2HDR(hdr) && 5569 (hdr->b_l2hdr.b_daddr > taddr || 5570 hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { 5571 /* 5572 * We've evicted to the target address, 5573 * or the end of the device. 5574 */ 5575 mutex_exit(hash_lock); 5576 break; 5577 } 5578 5579 ASSERT(HDR_HAS_L2HDR(hdr)); 5580 if (!HDR_HAS_L1HDR(hdr)) { 5581 ASSERT(!HDR_L2_READING(hdr)); 5582 /* 5583 * This doesn't exist in the ARC. Destroy. 5584 * arc_hdr_destroy() will call list_remove() 5585 * and decrement arcstat_l2_size. 
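			 * (This is the L2-only case: the header has no L1
			 * portion left, so nothing in the ARC proper still
			 * refers to it.)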
5586 */ 5587 arc_change_state(arc_anon, hdr, hash_lock); 5588 arc_hdr_destroy(hdr); 5589 } else { 5590 ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only); 5591 ARCSTAT_BUMP(arcstat_l2_evict_l1cached); 5592 /* 5593 * Invalidate issued or about to be issued 5594 * reads, since we may be about to write 5595 * over this location. 5596 */ 5597 if (HDR_L2_READING(hdr)) { 5598 ARCSTAT_BUMP(arcstat_l2_evict_reading); 5599 hdr->b_flags |= ARC_FLAG_L2_EVICTED; 5600 } 5601 5602 /* Tell ARC this no longer exists in L2ARC. */ 5603 ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); 5604 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 5605 hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; 5606 list_remove(buflist, hdr); 5607 5608 /* This may have been leftover after a failed write. */ 5609 hdr->b_flags &= ~ARC_FLAG_L2_WRITING; 5610 } 5611 mutex_exit(hash_lock); 5612 } 5613 mutex_exit(&dev->l2ad_mtx); 5614 5615 vdev_space_update(dev->l2ad_vdev, -bytes_evicted, 0, 0); 5616 dev->l2ad_evict = taddr; 5617} 5618 5619/* 5620 * Find and write ARC buffers to the L2ARC device. 5621 * 5622 * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid 5623 * for reading until they have completed writing. 5624 * The headroom_boost is an in-out parameter used to maintain headroom boost 5625 * state between calls to this function. 5626 * 5627 * Returns the number of bytes actually written (which may be smaller than 5628 * the delta by which the device hand has changed due to alignment). 5629 */ 5630static uint64_t 5631l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, 5632 boolean_t *headroom_boost) 5633{ 5634 arc_buf_hdr_t *hdr, *hdr_prev, *head; 5635 list_t *list; 5636 uint64_t write_asize, write_sz, headroom, buf_compress_minsz; 5637 void *buf_data; 5638 kmutex_t *list_lock; 5639 boolean_t full; 5640 l2arc_write_callback_t *cb; 5641 zio_t *pio, *wzio; 5642 uint64_t guid = spa_load_guid(spa); 5643 const boolean_t do_headroom_boost = *headroom_boost; 5644 int try; 5645 5646 ASSERT(dev->l2ad_vdev != NULL); 5647 5648 /* Lower the flag now, we might want to raise it again later. */ 5649 *headroom_boost = B_FALSE; 5650 5651 pio = NULL; 5652 write_sz = write_asize = 0; 5653 full = B_FALSE; 5654 head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); 5655 head->b_flags |= ARC_FLAG_L2_WRITE_HEAD; 5656 head->b_flags |= ARC_FLAG_HAS_L2HDR; 5657 5658 ARCSTAT_BUMP(arcstat_l2_write_buffer_iter); 5659 /* 5660 * We will want to try to compress buffers that are at least 2x the 5661 * device sector size. 5662 */ 5663 buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift; 5664 5665 /* 5666 * Copy buffers for L2ARC writing. 5667 */ 5668 mutex_enter(&dev->l2ad_mtx); 5669 for (try = 0; try < 2 * ARC_BUFC_NUMLISTS; try++) { 5670 uint64_t passed_sz = 0; 5671 5672 list = l2arc_list_locked(try, &list_lock); 5673 ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter); 5674 5675 /* 5676 * L2ARC fast warmup. 5677 * 5678 * Until the ARC is warm and starts to evict, read from the 5679 * head of the ARC lists rather than the tail. 
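		 *
		 * The per-list scan depth is bounded by "headroom" below:
		 * target_sz * l2arc_headroom * 2 / ARC_BUFC_NUMLISTS, scaled
		 * by l2arc_headroom_boost / 100 when the previous pass
		 * compressed well.  As a rough worked example (assuming the
		 * default l2arc_headroom of 2 and 16 ARC sublists), an 8 MB
		 * write target allows about 2 MB of headers to be scanned per
		 * list, or twice that with the default boost of 200 applied.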
 */
		if (arc_warm == B_FALSE)
			hdr = list_head(list);
		else
			hdr = list_tail(list);
		if (hdr == NULL)
			ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);

		headroom = target_sz * l2arc_headroom * 2 / ARC_BUFC_NUMLISTS;
		if (do_headroom_boost)
			headroom = (headroom * l2arc_headroom_boost) / 100;

		for (; hdr; hdr = hdr_prev) {
			kmutex_t *hash_lock;
			uint64_t buf_sz;
			uint64_t buf_a_sz;

			if (arc_warm == B_FALSE)
				hdr_prev = list_next(list, hdr);
			else
				hdr_prev = list_prev(list, hdr);
			ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, hdr->b_size);

			hash_lock = HDR_LOCK(hdr);
			if (!mutex_tryenter(hash_lock)) {
				ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
				/*
				 * Skip this buffer rather than waiting.
				 */
				continue;
			}

			passed_sz += hdr->b_size;
			if (passed_sz > headroom) {
				/*
				 * Searched too far.
				 */
				mutex_exit(hash_lock);
				ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
				break;
			}

			if (!l2arc_write_eligible(guid, hdr)) {
				mutex_exit(hash_lock);
				continue;
			}

			/*
			 * Assume that the buffer is not going to be compressed
			 * and could take more space on disk because of a larger
			 * disk block size.
			 */
			buf_sz = hdr->b_size;
			buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);

			if ((write_asize + buf_a_sz) > target_sz) {
				full = B_TRUE;
				mutex_exit(hash_lock);
				ARCSTAT_BUMP(arcstat_l2_write_full);
				break;
			}

			if (pio == NULL) {
				/*
				 * Insert a dummy header on the buflist so
				 * l2arc_write_done() can find where the
				 * write buffers begin without searching.
				 */
				list_insert_head(&dev->l2ad_buflist, head);

				cb = kmem_alloc(
				    sizeof (l2arc_write_callback_t), KM_SLEEP);
				cb->l2wcb_dev = dev;
				cb->l2wcb_head = head;
				pio = zio_root(spa, l2arc_write_done, cb,
				    ZIO_FLAG_CANFAIL);
				ARCSTAT_BUMP(arcstat_l2_write_pios);
			}

			/*
			 * Create and add a new L2ARC header.
			 */
			hdr->b_l2hdr.b_dev = dev;
			hdr->b_flags |= ARC_FLAG_L2_WRITING;
			/*
			 * Temporarily stash the data buffer in b_tmp_cdata.
			 * The subsequent write step will pick it up from
			 * there.  This is because we can't access
			 * b_l1hdr.b_buf without holding the hash_lock, which
			 * we in turn can't access without holding the ARC
			 * list locks (which we want to avoid during
			 * compression/writing).
			 */
			HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
			hdr->b_l2hdr.b_asize = hdr->b_size;
			hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data;

			hdr->b_flags |= ARC_FLAG_HAS_L2HDR;

			list_insert_head(&dev->l2ad_buflist, hdr);

			/*
			 * Compute and store the buffer cksum before
			 * writing.  In debug builds the cksum is verified
			 * first.
			 */
			arc_cksum_verify(hdr->b_l1hdr.b_buf);
			arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE);

			mutex_exit(hash_lock);

			write_sz += buf_sz;
			write_asize += buf_a_sz;
		}

		mutex_exit(list_lock);

		if (full == B_TRUE)
			break;
	}

	/* No buffers selected for writing?
	 */
	if (pio == NULL) {
		ASSERT0(write_sz);
		mutex_exit(&dev->l2ad_mtx);
		ASSERT(!HDR_HAS_L1HDR(head));
		kmem_cache_free(hdr_l2only_cache, head);
		return (0);
	}

	/*
	 * Note that elsewhere in this file arcstat_l2_asize
	 * and the used space on l2ad_vdev are updated using b_asize,
	 * which is not necessarily rounded up to the device block size.
	 * To keep accounting consistent we do the same here as well:
	 * stats_size accumulates the sum of b_asize of the written buffers,
	 * while write_asize accumulates the sum of b_asize rounded up
	 * to the device block size.
	 * The latter sum is used only to validate the correctness of the code.
	 */
	uint64_t stats_size = 0;
	write_asize = 0;

	/*
	 * Now start writing the buffers.  We start at the write head
	 * and work backwards, retracing the course of the buffer selector
	 * loop above.
	 */
	for (hdr = list_prev(&dev->l2ad_buflist, head); hdr;
	    hdr = list_prev(&dev->l2ad_buflist, hdr)) {
		uint64_t buf_sz;

		/*
		 * We shouldn't need to lock the buffer here, since we flagged
		 * it as ARC_FLAG_L2_WRITING in the previous step, but we must
		 * take care to only access its L2 cache parameters.  In
		 * particular, hdr->b_l1hdr.b_buf may be invalid by now due to
		 * ARC eviction.
		 */
		hdr->b_l2hdr.b_daddr = dev->l2ad_hand;

		if ((HDR_L2COMPRESS(hdr)) &&
		    hdr->b_l2hdr.b_asize >= buf_compress_minsz) {
			if (l2arc_compress_buf(hdr)) {
				/*
				 * If compression succeeded, enable headroom
				 * boost on the next scan cycle.
				 */
				*headroom_boost = B_TRUE;
			}
		}

		/*
		 * Pick up the buffer data we had previously stashed away
		 * (and now potentially also compressed).
		 */
		buf_data = hdr->b_l1hdr.b_tmp_cdata;
		buf_sz = hdr->b_l2hdr.b_asize;

		/*
		 * If the data has not been compressed, then clear b_tmp_cdata
		 * to make sure that it points only to a temporary compression
		 * buffer.
		 */
		if (!L2ARC_IS_VALID_COMPRESS(HDR_GET_COMPRESS(hdr)))
			hdr->b_l1hdr.b_tmp_cdata = NULL;

		/* Compression may have squashed the buffer to zero length. */
		if (buf_sz != 0) {
			uint64_t buf_a_sz;

			wzio = zio_write_phys(pio, dev->l2ad_vdev,
			    dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
			    NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_CANFAIL, B_FALSE);

			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
			    zio_t *, wzio);
			(void) zio_nowait(wzio);

			stats_size += buf_sz;
			/*
			 * Keep the clock hand suitably device-aligned.
			 */
			buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
			write_asize += buf_a_sz;
			dev->l2ad_hand += buf_a_sz;
		}
	}

	mutex_exit(&dev->l2ad_mtx);

	ASSERT3U(write_asize, <=, target_sz);
	ARCSTAT_BUMP(arcstat_l2_writes_sent);
	ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
	ARCSTAT_INCR(arcstat_l2_size, write_sz);
	ARCSTAT_INCR(arcstat_l2_asize, stats_size);
	vdev_space_update(dev->l2ad_vdev, stats_size, 0, 0);

	/*
	 * Bump device hand to the device start if it is approaching the end.
	 * l2arc_evict() will already have evicted ahead for this case.
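	 *
	 * In effect the device is treated as a circular log: once fewer than
	 * target_sz bytes remain before l2ad_end, the write hand and the
	 * evict pointer wrap back to l2ad_start, and l2ad_first is cleared so
	 * later passes know the region ahead of the hand may hold live
	 * buffers that must be evicted first.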
5900 */ 5901 if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { 5902 dev->l2ad_hand = dev->l2ad_start; 5903 dev->l2ad_evict = dev->l2ad_start; 5904 dev->l2ad_first = B_FALSE; 5905 } 5906 5907 dev->l2ad_writing = B_TRUE; 5908 (void) zio_wait(pio); 5909 dev->l2ad_writing = B_FALSE; 5910 5911 return (write_asize); 5912} 5913 5914/* 5915 * Compresses an L2ARC buffer. 5916 * The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its 5917 * size in l2hdr->b_asize. This routine tries to compress the data and 5918 * depending on the compression result there are three possible outcomes: 5919 * *) The buffer was incompressible. The original l2hdr contents were left 5920 * untouched and are ready for writing to an L2 device. 5921 * *) The buffer was all-zeros, so there is no need to write it to an L2 5922 * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is 5923 * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY. 5924 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary 5925 * data buffer which holds the compressed data to be written, and b_asize 5926 * tells us how much data there is. b_compress is set to the appropriate 5927 * compression algorithm. Once writing is done, invoke 5928 * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer. 5929 * 5930 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the 5931 * buffer was incompressible). 5932 */ 5933static boolean_t 5934l2arc_compress_buf(arc_buf_hdr_t *hdr) 5935{ 5936 void *cdata; 5937 size_t csize, len, rounded; 5938 ASSERT(HDR_HAS_L2HDR(hdr)); 5939 l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; 5940 5941 ASSERT(HDR_HAS_L1HDR(hdr)); 5942 ASSERT(HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF); 5943 ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL); 5944 5945 len = l2hdr->b_asize; 5946 cdata = zio_data_buf_alloc(len); 5947 ASSERT3P(cdata, !=, NULL); 5948 csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata, 5949 cdata, l2hdr->b_asize); 5950 5951 if (csize == 0) { 5952 /* zero block, indicate that there's nothing to write */ 5953 zio_data_buf_free(cdata, len); 5954 HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_EMPTY); 5955 l2hdr->b_asize = 0; 5956 hdr->b_l1hdr.b_tmp_cdata = NULL; 5957 ARCSTAT_BUMP(arcstat_l2_compress_zeros); 5958 return (B_TRUE); 5959 } 5960 5961 rounded = P2ROUNDUP(csize, 5962 (size_t)1 << l2hdr->b_dev->l2ad_vdev->vdev_ashift); 5963 if (rounded < len) { 5964 /* 5965 * Compression succeeded, we'll keep the cdata around for 5966 * writing and release it afterwards. 5967 */ 5968 if (rounded > csize) { 5969 bzero((char *)cdata + csize, rounded - csize); 5970 csize = rounded; 5971 } 5972 HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_LZ4); 5973 l2hdr->b_asize = csize; 5974 hdr->b_l1hdr.b_tmp_cdata = cdata; 5975 ARCSTAT_BUMP(arcstat_l2_compress_successes); 5976 return (B_TRUE); 5977 } else { 5978 /* 5979 * Compression failed, release the compressed buffer. 5980 * l2hdr will be left unmodified. 5981 */ 5982 zio_data_buf_free(cdata, len); 5983 ARCSTAT_BUMP(arcstat_l2_compress_failures); 5984 return (B_FALSE); 5985 } 5986} 5987 5988/* 5989 * Decompresses a zio read back from an l2arc device. On success, the 5990 * underlying zio's io_data buffer is overwritten by the uncompressed 5991 * version. On decompression error (corrupt compressed stream), the 5992 * zio->io_error value is set to signal an I/O error. 
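 *
 * Three cases are handled, mirroring the outcomes of l2arc_compress_buf():
 * a device read error (only the original io size is restored), a buffer
 * stored as ZIO_COMPRESS_EMPTY (the ARC buffer is simply zero-filled), and
 * a real algorithm such as LZ4 (the payload is decompressed in place over
 * io_data).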
 *
 * Please note that the compressed data stream is not checksummed, so
 * if the underlying device is experiencing data corruption, we may feed
 * corrupt data to the decompressor; the decompressor therefore needs to be
 * able to handle this situation (LZ4 does).
 */
static void
l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
{
	ASSERT(L2ARC_IS_VALID_COMPRESS(c));

	if (zio->io_error != 0) {
		/*
		 * An I/O error has occurred; just restore the original io
		 * size in preparation for a main pool read.
		 */
		zio->io_orig_size = zio->io_size = hdr->b_size;
		return;
	}

	if (c == ZIO_COMPRESS_EMPTY) {
		/*
		 * An empty buffer results in a null zio, which means we
		 * need to fill its io_data after we're done restoring the
		 * buffer's contents.
		 */
		ASSERT(hdr->b_l1hdr.b_buf != NULL);
		bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size);
		zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data;
	} else {
		ASSERT(zio->io_data != NULL);
		/*
		 * We copy the compressed data from the start of the arc buffer
		 * (the zio_read will have pulled in only what we need, the
		 * rest is garbage which we will overwrite at decompression)
		 * and then decompress back to the ARC data buffer.  This way we
		 * can minimize copying by simply decompressing back over the
		 * original compressed data (rather than decompressing to an
		 * aux buffer and then copying back the uncompressed buffer,
		 * which is likely to be much larger).
		 */
		uint64_t csize;
		void *cdata;

		csize = zio->io_size;
		cdata = zio_data_buf_alloc(csize);
		bcopy(zio->io_data, cdata, csize);
		if (zio_decompress_data(c, cdata, zio->io_data, csize,
		    hdr->b_size) != 0)
			zio->io_error = EIO;
		zio_data_buf_free(cdata, csize);
	}

	/* Restore the expected uncompressed IO size. */
	zio->io_orig_size = zio->io_size = hdr->b_size;
}

/*
 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
 * This buffer serves as a temporary holder of compressed data while
 * the buffer entry is being written to an l2arc device.  Once that is
 * done, we can dispose of it.
 */
static void
l2arc_release_cdata_buf(arc_buf_hdr_t *hdr)
{
	ASSERT(HDR_HAS_L1HDR(hdr));
	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_EMPTY) {
		/*
		 * If the data was compressed, then we've allocated a
		 * temporary buffer for it, so now we need to release it.
		 */
		ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
		zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata,
		    hdr->b_size);
		hdr->b_l1hdr.b_tmp_cdata = NULL;
	} else {
		ASSERT(hdr->b_l1hdr.b_tmp_cdata == NULL);
	}
}

/*
 * This thread feeds the L2ARC at regular intervals.  This is the beating
 * heart of the L2ARC.
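 *
 * Each pass selects the next cache device, evicts ahead of that device's
 * write hand, writes up to l2arc_write_size() bytes of ARC buffers, and
 * then sleeps until the deadline computed by l2arc_write_interval().
 * Assuming the default tunables (l2arc_feed_secs of 1 and l2arc_feed_min_ms
 * of 200), that works out to roughly one feed per second, or up to five per
 * second while the previous pass wrote more than half of its target.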
6077 */ 6078static void 6079l2arc_feed_thread(void *dummy __unused) 6080{ 6081 callb_cpr_t cpr; 6082 l2arc_dev_t *dev; 6083 spa_t *spa; 6084 uint64_t size, wrote; 6085 clock_t begin, next = ddi_get_lbolt(); 6086 boolean_t headroom_boost = B_FALSE; 6087 6088 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); 6089 6090 mutex_enter(&l2arc_feed_thr_lock); 6091 6092 while (l2arc_thread_exit == 0) { 6093 CALLB_CPR_SAFE_BEGIN(&cpr); 6094 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, 6095 next - ddi_get_lbolt()); 6096 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); 6097 next = ddi_get_lbolt() + hz; 6098 6099 /* 6100 * Quick check for L2ARC devices. 6101 */ 6102 mutex_enter(&l2arc_dev_mtx); 6103 if (l2arc_ndev == 0) { 6104 mutex_exit(&l2arc_dev_mtx); 6105 continue; 6106 } 6107 mutex_exit(&l2arc_dev_mtx); 6108 begin = ddi_get_lbolt(); 6109 6110 /* 6111 * This selects the next l2arc device to write to, and in 6112 * doing so the next spa to feed from: dev->l2ad_spa. This 6113 * will return NULL if there are now no l2arc devices or if 6114 * they are all faulted. 6115 * 6116 * If a device is returned, its spa's config lock is also 6117 * held to prevent device removal. l2arc_dev_get_next() 6118 * will grab and release l2arc_dev_mtx. 6119 */ 6120 if ((dev = l2arc_dev_get_next()) == NULL) 6121 continue; 6122 6123 spa = dev->l2ad_spa; 6124 ASSERT(spa != NULL); 6125 6126 /* 6127 * If the pool is read-only then force the feed thread to 6128 * sleep a little longer. 6129 */ 6130 if (!spa_writeable(spa)) { 6131 next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; 6132 spa_config_exit(spa, SCL_L2ARC, dev); 6133 continue; 6134 } 6135 6136 /* 6137 * Avoid contributing to memory pressure. 6138 */ 6139 if (arc_reclaim_needed()) { 6140 ARCSTAT_BUMP(arcstat_l2_abort_lowmem); 6141 spa_config_exit(spa, SCL_L2ARC, dev); 6142 continue; 6143 } 6144 6145 ARCSTAT_BUMP(arcstat_l2_feeds); 6146 6147 size = l2arc_write_size(); 6148 6149 /* 6150 * Evict L2ARC buffers that will be overwritten. 6151 */ 6152 l2arc_evict(dev, size, B_FALSE); 6153 6154 /* 6155 * Write ARC buffers. 6156 */ 6157 wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost); 6158 6159 /* 6160 * Calculate interval between writes. 6161 */ 6162 next = l2arc_write_interval(begin, size, wrote); 6163 spa_config_exit(spa, SCL_L2ARC, dev); 6164 } 6165 6166 l2arc_thread_exit = 0; 6167 cv_broadcast(&l2arc_feed_thr_cv); 6168 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ 6169 thread_exit(); 6170} 6171 6172boolean_t 6173l2arc_vdev_present(vdev_t *vd) 6174{ 6175 l2arc_dev_t *dev; 6176 6177 mutex_enter(&l2arc_dev_mtx); 6178 for (dev = list_head(l2arc_dev_list); dev != NULL; 6179 dev = list_next(l2arc_dev_list, dev)) { 6180 if (dev->l2ad_vdev == vd) 6181 break; 6182 } 6183 mutex_exit(&l2arc_dev_mtx); 6184 6185 return (dev != NULL); 6186} 6187 6188/* 6189 * Add a vdev for use by the L2ARC. By this point the spa has already 6190 * validated the vdev and opened it. 6191 */ 6192void 6193l2arc_add_vdev(spa_t *spa, vdev_t *vd) 6194{ 6195 l2arc_dev_t *adddev; 6196 6197 ASSERT(!l2arc_vdev_present(vd)); 6198 6199 vdev_ashift_optimize(vd); 6200 6201 /* 6202 * Create a new l2arc device entry. 
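 * The usable region starts past the leading vdev labels at
 * VDEV_LABEL_START_SIZE; the write hand and the evict pointer both begin
 * there, and l2ad_first notes that the first sweep of the device needs no
 * eviction ahead of the hand.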
6203 */ 6204 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); 6205 adddev->l2ad_spa = spa; 6206 adddev->l2ad_vdev = vd; 6207 adddev->l2ad_start = VDEV_LABEL_START_SIZE; 6208 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); 6209 adddev->l2ad_hand = adddev->l2ad_start; 6210 adddev->l2ad_evict = adddev->l2ad_start; 6211 adddev->l2ad_first = B_TRUE; 6212 adddev->l2ad_writing = B_FALSE; 6213 6214 mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); 6215 /* 6216 * This is a list of all ARC buffers that are still valid on the 6217 * device. 6218 */ 6219 list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), 6220 offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); 6221 6222 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); 6223 6224 /* 6225 * Add device to global list 6226 */ 6227 mutex_enter(&l2arc_dev_mtx); 6228 list_insert_head(l2arc_dev_list, adddev); 6229 atomic_inc_64(&l2arc_ndev); 6230 mutex_exit(&l2arc_dev_mtx); 6231} 6232 6233/* 6234 * Remove a vdev from the L2ARC. 6235 */ 6236void 6237l2arc_remove_vdev(vdev_t *vd) 6238{ 6239 l2arc_dev_t *dev, *nextdev, *remdev = NULL; 6240 6241 /* 6242 * Find the device by vdev 6243 */ 6244 mutex_enter(&l2arc_dev_mtx); 6245 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { 6246 nextdev = list_next(l2arc_dev_list, dev); 6247 if (vd == dev->l2ad_vdev) { 6248 remdev = dev; 6249 break; 6250 } 6251 } 6252 ASSERT(remdev != NULL); 6253 6254 /* 6255 * Remove device from global list 6256 */ 6257 list_remove(l2arc_dev_list, remdev); 6258 l2arc_dev_last = NULL; /* may have been invalidated */ 6259 atomic_dec_64(&l2arc_ndev); 6260 mutex_exit(&l2arc_dev_mtx); 6261 6262 /* 6263 * Clear all buflists and ARC references. L2ARC device flush. 6264 */ 6265 l2arc_evict(remdev, 0, B_TRUE); 6266 list_destroy(&remdev->l2ad_buflist); 6267 mutex_destroy(&remdev->l2ad_mtx); 6268 kmem_free(remdev, sizeof (l2arc_dev_t)); 6269} 6270 6271void 6272l2arc_init(void) 6273{ 6274 l2arc_thread_exit = 0; 6275 l2arc_ndev = 0; 6276 l2arc_writes_sent = 0; 6277 l2arc_writes_done = 0; 6278 6279 mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); 6280 cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); 6281 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); 6282 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); 6283 6284 l2arc_dev_list = &L2ARC_dev_list; 6285 l2arc_free_on_write = &L2ARC_free_on_write; 6286 list_create(l2arc_dev_list, sizeof (l2arc_dev_t), 6287 offsetof(l2arc_dev_t, l2ad_node)); 6288 list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), 6289 offsetof(l2arc_data_free_t, l2df_list_node)); 6290} 6291 6292void 6293l2arc_fini(void) 6294{ 6295 /* 6296 * This is called from dmu_fini(), which is called from spa_fini(); 6297 * Because of this, we can assume that all l2arc devices have 6298 * already been removed when the pools themselves were removed. 
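 * All that is left to tear down here are the global lists and locks, plus
 * any free-on-write entries that are still pending.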
6299 */ 6300 6301 l2arc_do_free_on_write(); 6302 6303 mutex_destroy(&l2arc_feed_thr_lock); 6304 cv_destroy(&l2arc_feed_thr_cv); 6305 mutex_destroy(&l2arc_dev_mtx); 6306 mutex_destroy(&l2arc_free_on_write_mtx); 6307 6308 list_destroy(l2arc_dev_list); 6309 list_destroy(l2arc_free_on_write); 6310} 6311 6312void 6313l2arc_start(void) 6314{ 6315 if (!(spa_mode_global & FWRITE)) 6316 return; 6317 6318 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, 6319 TS_RUN, minclsyspri); 6320} 6321 6322void 6323l2arc_stop(void) 6324{ 6325 if (!(spa_mode_global & FWRITE)) 6326 return; 6327 6328 mutex_enter(&l2arc_feed_thr_lock); 6329 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ 6330 l2arc_thread_exit = 1; 6331 while (l2arc_thread_exit != 0) 6332 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); 6333 mutex_exit(&l2arc_feed_thr_lock); 6334} 6335