arc.c revision 275609
1180740Sdes/*
2180740Sdes * CDDL HEADER START
3226046Sdes *
4226046Sdes * The contents of this file are subject to the terms of the
5226046Sdes * Common Development and Distribution License (the "License").
6180740Sdes * You may not use this file except in compliance with the License.
7180740Sdes *
8180740Sdes * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9180740Sdes * or http://www.opensolaris.org/os/licensing.
10180740Sdes * See the License for the specific language governing permissions
11180740Sdes * and limitations under the License.
12180740Sdes *
13180746Sdes * When distributing Covered Code, include this CDDL HEADER in each
14180746Sdes * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15180746Sdes * If applicable, add the following below this CDDL HEADER, with the
16180740Sdes * fields enclosed by brackets "[]" replaced with your own identifying
17180740Sdes * information: Portions Copyright [yyyy] [name of copyright owner]
18180740Sdes *
19240075Sdes * CDDL HEADER END
20240075Sdes */
21240075Sdes/*
22180740Sdes * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23180740Sdes * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
24180740Sdes * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
25180740Sdes * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
26180740Sdes */
27180740Sdes
28180740Sdes/*
29180740Sdes * DVA-based Adjustable Replacement Cache
30180740Sdes *
31180746Sdes * While much of the theory of operation used here is
32180746Sdes * based on the self-tuning, low overhead replacement cache
33180746Sdes * presented by Megiddo and Modha at FAST 2003, there are some
34180740Sdes * significant differences:
35180740Sdes *
36180740Sdes * 1. The Megiddo and Modha model assumes any page is evictable.
37180740Sdes * Pages in its cache cannot be "locked" into memory.  This makes
38180740Sdes * the eviction algorithm simple: evict the last page in the list.
39180740Sdes * This also makes the performance characteristics easy to reason
40180740Sdes * about.  Our cache is not so simple.  At any given moment, some
41180740Sdes * subset of the blocks in the cache are un-evictable because we
42180740Sdes * have handed out a reference to them.  Blocks are only evictable
43180740Sdes * when there are no external references active.  This makes
44180740Sdes * eviction far more problematic:  we choose to evict the evictable
45180740Sdes * blocks that are the "lowest" in the list.
46180740Sdes *
47180750Sdes * There are times when it is not possible to evict the requested
48180750Sdes * space.  In these circumstances we are unable to adjust the cache
49180750Sdes * size.  To prevent the cache growing unbounded at these times we
50262566Sdes * implement a "cache throttle" that slows the flow of new data
51262566Sdes * into the cache until we can make space available.
52262566Sdes *
53262566Sdes * 2. The Megiddo and Modha model assumes a fixed cache size.
54180740Sdes * Pages are evicted when the cache is full and there is a cache
55180740Sdes * miss.  Our model has a variable sized cache.  It grows with
56180740Sdes * high use, but also tries to react to memory pressure from the
57180740Sdes * operating system: decreasing its size when system memory is
58180740Sdes * tight.
59180740Sdes *
60180740Sdes * 3. The Megiddo and Modha model assumes a fixed page size. All
61180740Sdes * elements of the cache are therefore exactly the same size.  So
62180740Sdes * when adjusting the cache size following a cache miss, it's simply
63180740Sdes * a matter of choosing a single page to evict.  In our model, we
64180740Sdes * have variable sized cache blocks (ranging from 512 bytes to
65180740Sdes * 128K bytes).  We therefore choose a set of blocks to evict to make
66180740Sdes * space for a cache miss that approximates as closely as possible
67180740Sdes * the space used by the new block.
68180740Sdes *
69180740Sdes * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
70180740Sdes * by N. Megiddo & D. Modha, FAST 2003
71180740Sdes */
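
/*
 * Purely illustrative sketch of difference (3) above; it is compiled out and
 * is not the ARC's actual eviction logic.  With variable-sized blocks, making
 * room for a miss of "needed" bytes means walking evictable candidates from
 * the tail of a list and freeing blocks until at least that much space has
 * been recovered.  The example_blk type and evict_one callback are
 * hypothetical and exist only for this sketch.
 */
#if 0
struct example_blk {
	uint64_t		blk_size;	/* 512 bytes .. 128K bytes */
	int			blk_evictable;	/* no external references held */
	struct example_blk	*blk_prev;	/* toward the head ("newer") end */
};

static uint64_t
example_evict_for_miss(struct example_blk *tail, uint64_t needed,
    void (*evict_one)(struct example_blk *))
{
	struct example_blk *blk, *prev;
	uint64_t freed = 0;

	for (blk = tail; blk != NULL && freed < needed; blk = prev) {
		prev = blk->blk_prev;
		if (!blk->blk_evictable)
			continue;	/* referenced buffers are skipped */
		freed += blk->blk_size;
		evict_one(blk);
	}
	return (freed);
}
#endif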
72180740Sdes
73180740Sdes/*
74180740Sdes * The locking model:
75180746Sdes *
76180746Sdes * A new reference to a cache buffer can be obtained in two
77180746Sdes * ways: 1) via a hash table lookup using the DVA as a key,
78180740Sdes * or 2) via one of the ARC lists.  The arc_read() interface
79180740Sdes * uses method 1, while the internal arc algorithms for
80180740Sdes * adjusting the cache use method 2.  We therefore provide two
81262566Sdes * types of locks: 1) the hash table lock array, and 2) the
82248619Sdes * arc list locks.
83248619Sdes *
84197679Sdes * Buffers do not have their own mutexes; rather, they rely on the
85197679Sdes * hash table mutexes for the bulk of their protection (i.e. most
86197679Sdes * fields in the arc_buf_hdr_t are protected by these mutexes).
87180740Sdes *
88180740Sdes * buf_hash_find() returns the appropriate mutex (held) when it
89180740Sdes * locates the requested buffer in the hash table.  It returns
90180740Sdes * NULL for the mutex if the buffer was not in the table.
91180740Sdes *
92180740Sdes * buf_hash_remove() expects the appropriate hash mutex to be
93180740Sdes * already held before it is invoked.
94180740Sdes *
95180740Sdes * Each arc state also has a mutex which is used to protect the
96180740Sdes * buffer list associated with the state.  When attempting to
97180740Sdes * obtain a hash table lock while holding an arc list lock you
98180740Sdes * must use mutex_tryenter() to avoid deadlock.  Also note that
99180740Sdes * the active state mutex must be held before the ghost state mutex.
100180740Sdes *
101180740Sdes * Arc buffers may have an associated eviction callback function.
102180740Sdes * This function will be invoked prior to removing the buffer (e.g.
103180740Sdes * in arc_do_user_evicts()).  Note however that the data associated
104180740Sdes * with the buffer may be evicted prior to the callback.  The callback
105180740Sdes * must be made with *no locks held* (to prevent deadlock).  Additionally,
106180740Sdes * the users of callbacks must ensure that their private data is
107180740Sdes * protected from simultaneous callbacks from arc_clear_callback()
108180740Sdes * and arc_do_user_evicts().
109180740Sdes *
110180740Sdes * Note that the majority of the performance stats are manipulated
111180740Sdes * with atomic operations.
112180740Sdes *
113180740Sdes * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
114180740Sdes *
115180740Sdes *	- L2ARC buflist creation
116180740Sdes *	- L2ARC buflist eviction
117180740Sdes *	- L2ARC write completion, which walks L2ARC buflists
118180740Sdes *	- ARC header destruction, as it removes from L2ARC buflists
119180740Sdes *	- ARC header release, as it removes from L2ARC buflists
120180740Sdes */
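
/*
 * Purely illustrative sketch of the lock-ordering rule above (compiled out,
 * and a hypothetical helper rather than part of the ARC implementation):
 * while an arc list lock is held, the hash lock for a buffer may only be
 * taken with mutex_tryenter(); on failure the buffer is skipped instead of
 * blocking, since a thread that already holds the hash lock may itself be
 * waiting for the list lock.
 */
#if 0
static boolean_t
example_try_hash_lock(arc_buf_hdr_t *hdr, kmutex_t *list_lock)
{
	kmutex_t *hash_lock = HDR_LOCK(hdr);

	ASSERT(MUTEX_HELD(list_lock));
	if (!mutex_tryenter(hash_lock))
		return (B_FALSE);	/* caller counts this as a mutex_miss */
	/* ... operate on hdr under both locks ... */
	mutex_exit(hash_lock);
	return (B_TRUE);
}
#endif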
121180740Sdes
122180740Sdes#include <sys/spa.h>
123180740Sdes#include <sys/zio.h>
124180740Sdes#include <sys/zio_compress.h>
125180740Sdes#include <sys/zfs_context.h>
126180740Sdes#include <sys/arc.h>
127180740Sdes#include <sys/refcount.h>
128180740Sdes#include <sys/vdev.h>
129180740Sdes#include <sys/vdev_impl.h>
130180740Sdes#include <sys/dsl_pool.h>
131180740Sdes#ifdef _KERNEL
132180740Sdes#include <sys/dnlc.h>
133180740Sdes#endif
134180740Sdes#include <sys/callb.h>
135204917Sdes#include <sys/kstat.h>
136204917Sdes#include <sys/trim_map.h>
137204917Sdes#include <zfs_fletcher.h>
138221420Sdes#include <sys/sdt.h>
139221420Sdes
140221420Sdes#include <vm/vm_pageout.h>
141197679Sdes#include <machine/vmparam.h>
142180750Sdes
143180750Sdes#ifdef illumos
144197679Sdes#ifndef _KERNEL
145197679Sdes/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
146197679Sdesboolean_t arc_watch = B_FALSE;
147180740Sdesint arc_procfd;
148180740Sdes#endif
149180740Sdes#endif /* illumos */
150180740Sdes
151180740Sdesstatic kmutex_t		arc_reclaim_thr_lock;
152180740Sdesstatic kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
153180740Sdesstatic uint8_t		arc_thread_exit;
154180740Sdes
155180740Sdes#define	ARC_REDUCE_DNLC_PERCENT	3
156180740Sdesuint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
157180740Sdes
158180740Sdestypedef enum arc_reclaim_strategy {
159221420Sdes	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
160221420Sdes	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
161221420Sdes} arc_reclaim_strategy_t;
162180740Sdes
163180740Sdes/*
164180740Sdes * The number of iterations through arc_evict_*() before we
165180740Sdes * drop & reacquire the lock.
166180740Sdes */
167180740Sdesint arc_evict_iterations = 100;
168180740Sdes
169180740Sdes/* number of seconds before growing cache again */
170180740Sdesstatic int		arc_grow_retry = 60;
171180740Sdes
172180740Sdes/* shift of arc_c for calculating both min and max arc_p */
173180740Sdesstatic int		arc_p_min_shift = 4;
174180740Sdes
175180740Sdes/* log2(fraction of arc to reclaim) */
176180740Sdesstatic int		arc_shrink_shift = 5;
177180740Sdes
178180740Sdes/*
179180740Sdes * minimum lifespan of a prefetch block in clock ticks
180180740Sdes * (initialized in arc_init())
181180740Sdes */
182180740Sdesstatic int		arc_min_prefetch_lifespan;
183180740Sdes
184180740Sdes/*
185180740Sdes * If this percent of memory is free, don't throttle.
186180750Sdes */
187180750Sdesint arc_lotsfree_percent = 10;
188180750Sdes
189262566Sdesstatic int arc_dead;
190262566Sdesextern int zfs_prefetch_disable;
191262566Sdes
192180750Sdes/*
193180750Sdes * The arc has filled available memory and has now warmed up.
194180750Sdes */
195180740Sdesstatic boolean_t arc_warm;
196180740Sdes
197180740Sdesuint64_t zfs_arc_max;
198180740Sdesuint64_t zfs_arc_min;
199180740Sdesuint64_t zfs_arc_meta_limit = 0;
200180740Sdesint zfs_arc_grow_retry = 0;
201180744Sdesint zfs_arc_shrink_shift = 0;
202180744Sdesint zfs_arc_p_min_shift = 0;
203180744Sdesint zfs_disable_dup_eviction = 0;
204180740Sdesuint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
205180740Sdesu_int zfs_arc_free_target = 0;
206180740Sdes
207180746Sdesstatic int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
208180746Sdes
209180746Sdes#ifdef _KERNEL
210180740Sdesstatic void
211180740Sdesarc_free_target_init(void *unused __unused)
212180740Sdes{
213180740Sdes
214180740Sdes	zfs_arc_free_target = vm_pageout_wakeup_thresh;
215180740Sdes}
216180740SdesSYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
217180740Sdes    arc_free_target_init, NULL);
218180740Sdes
219180740SdesTUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);
220180740SdesTUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);
221180740SdesTUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
222262566SdesTUNABLE_QUAD("vfs.zfs.arc_average_blocksize", &zfs_arc_average_blocksize);
223262566SdesTUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift);
224262566SdesSYSCTL_DECL(_vfs_zfs);
225180740SdesSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
226180740Sdes    "Maximum ARC size");
227180740SdesSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
228262566Sdes    "Minimum ARC size");
229262566SdesSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
230262566Sdes    &zfs_arc_average_blocksize, 0,
231262566Sdes    "ARC average blocksize");
232262566SdesSYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
233262566Sdes    &arc_shrink_shift, 0,
234262566Sdes    "log2(fraction of arc to reclaim)");
235262566Sdes
236262566Sdes/*
237262566Sdes * We don't have a tunable for arc_free_target due to the dependency on
238262566Sdes * pagedaemon initialisation.
239262566Sdes */
240262566SdesSYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
241262566Sdes    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
242262566Sdes    sysctl_vfs_zfs_arc_free_target, "IU",
243262566Sdes    "Desired number of free pages below which ARC triggers reclaim");
244262566Sdes
245262566Sdesstatic int
246221420Sdessysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
247221420Sdes{
248221420Sdes	u_int val;
249248619Sdes	int err;
250248619Sdes
251248619Sdes	val = zfs_arc_free_target;
252180740Sdes	err = sysctl_handle_int(oidp, &val, 0, req);
253180740Sdes	if (err != 0 || req->newptr == NULL)
254180740Sdes		return (err);
255180740Sdes
256180740Sdes	if (val < minfree)
257180740Sdes		return (EINVAL);
258262566Sdes	if (val > cnt.v_page_count)
259262566Sdes		return (EINVAL);
260262566Sdes
261180740Sdes	zfs_arc_free_target = val;
262180740Sdes
263180740Sdes	return (0);
264255767Sdes}
265255767Sdes#endif
266255767Sdes
267180740Sdes/*
268180740Sdes * Note that buffers can be in one of 6 states:
269180740Sdes *	ARC_anon	- anonymous (discussed below)
270180740Sdes *	ARC_mru		- recently used, currently cached
271180740Sdes *	ARC_mru_ghost	- recently used, no longer in cache
272180740Sdes *	ARC_mfu		- frequently used, currently cached
273180740Sdes *	ARC_mfu_ghost	- frequently used, no longer in cache
274180740Sdes *	ARC_l2c_only	- exists in L2ARC but not other states
275180740Sdes * When there are no active references to the buffer, they are
276180740Sdes * linked onto a list in one of these arc states.  These are
277180740Sdes * the only buffers that can be evicted or deleted.  Within each
278180740Sdes * state there are multiple lists, one for meta-data and one for
279255767Sdes * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
280255767Sdes * etc.) is tracked separately so that it can be managed more
281255767Sdes * explicitly: favored over data, limited explicitly.
282180740Sdes *
283180740Sdes * Anonymous buffers are buffers that are not associated with
284180740Sdes * a DVA.  These are buffers that hold dirty block copies
285180740Sdes * before they are written to stable storage.  By definition,
286180740Sdes * they are "ref'd" and are considered part of arc_mru
287180740Sdes * that cannot be freed.  Generally, they will acquire a DVA
288180740Sdes * as they are written and migrate onto the arc_mru list.
289180740Sdes *
290180740Sdes * The ARC_l2c_only state is for buffers that are in the second
291180740Sdes * level ARC but no longer in any of the ARC_m* lists.  The second
292180740Sdes * level ARC itself may also contain buffers that are in any of
293180740Sdes * the ARC_m* states - meaning that a buffer can exist in two
294180740Sdes * places.  The reason for the ARC_l2c_only state is to keep the
295180740Sdes * buffer header in the hash table, so that reads that hit the
296180740Sdes * second level ARC benefit from these fast lookups.
297180740Sdes */
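
/*
 * Purely illustrative sketch of the rule above (compiled out; a hypothetical
 * helper, not part of the ARC implementation): a header sits on one of its
 * state's lists, and is therefore a candidate for eviction or deletion, only
 * while it is not anonymous and no external references are outstanding.
 */
#if 0
static boolean_t
example_hdr_evictable(arc_buf_hdr_t *hdr)
{
	return (hdr->b_state != arc_anon &&
	    refcount_count(&hdr->b_refcnt) == 0);
}
#endif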
298180740Sdes
299180740Sdes#define	ARCS_LOCK_PAD		CACHE_LINE_SIZE
300180740Sdesstruct arcs_lock {
301180740Sdes	kmutex_t	arcs_lock;
302248619Sdes#ifdef _KERNEL
303248619Sdes	unsigned char	pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))];
304248619Sdes#endif
305248619Sdes};
306255767Sdes
307255767Sdes/*
308255767Sdes * must be a power of two for mask use to work
309255767Sdes *
310180740Sdes */
311180740Sdes#define ARC_BUFC_NUMDATALISTS		16
312180740Sdes#define ARC_BUFC_NUMMETADATALISTS	16
313180740Sdes#define ARC_BUFC_NUMLISTS	(ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS)
314180740Sdes
315180740Sdestypedef struct arc_state {
316180740Sdes	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */
317180740Sdes	uint64_t arcs_size;	/* total amount of data in this state */
318180740Sdes	list_t	arcs_lists[ARC_BUFC_NUMLISTS]; /* list of evictable buffers */
319180740Sdes	struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE);
320180740Sdes} arc_state_t;
321180740Sdes
322180740Sdes#define ARCS_LOCK(s, i)	(&((s)->arcs_locks[(i)].arcs_lock))
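
/*
 * Purely illustrative sketch (compiled out; a hypothetical helper, not the
 * ARC's own code): each state keeps ARC_BUFC_NUMLISTS sublists so that
 * concurrent scans contend on different locks.  A buffer is assigned to a
 * sublist by hashing its identity and masking with the (power of two) list
 * count, with metadata and data occupying separate halves of the array;
 * compare get_buf_info() later in this file.
 */
#if 0
static void
example_pick_sublist(arc_state_t *state, uint64_t hashval,
    arc_buf_contents_t type, list_t **listp, kmutex_t **lockp)
{
	uint64_t idx;

	if (type == ARC_BUFC_METADATA)
		idx = hashval & (ARC_BUFC_NUMMETADATALISTS - 1);
	else
		idx = (hashval & (ARC_BUFC_NUMDATALISTS - 1)) +
		    ARC_BUFC_NUMMETADATALISTS;

	*listp = &state->arcs_lists[idx];
	*lockp = ARCS_LOCK(state, idx);
}
#endif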
323180740Sdes
324180740Sdes/* The 6 states: */
325180740Sdesstatic arc_state_t ARC_anon;
326180744Sdesstatic arc_state_t ARC_mru;
327180744Sdesstatic arc_state_t ARC_mru_ghost;
328180744Sdesstatic arc_state_t ARC_mfu;
329180744Sdesstatic arc_state_t ARC_mfu_ghost;
330255767Sdesstatic arc_state_t ARC_l2c_only;
331255767Sdes
332255767Sdestypedef struct arc_stats {
333255767Sdes	kstat_named_t arcstat_hits;
334180744Sdes	kstat_named_t arcstat_misses;
335180744Sdes	kstat_named_t arcstat_demand_data_hits;
336180744Sdes	kstat_named_t arcstat_demand_data_misses;
337180744Sdes	kstat_named_t arcstat_demand_metadata_hits;
338180740Sdes	kstat_named_t arcstat_demand_metadata_misses;
339180740Sdes	kstat_named_t arcstat_prefetch_data_hits;
340180740Sdes	kstat_named_t arcstat_prefetch_data_misses;
341180740Sdes	kstat_named_t arcstat_prefetch_metadata_hits;
342180740Sdes	kstat_named_t arcstat_prefetch_metadata_misses;
343180740Sdes	kstat_named_t arcstat_mru_hits;
344180740Sdes	kstat_named_t arcstat_mru_ghost_hits;
345180740Sdes	kstat_named_t arcstat_mfu_hits;
346180740Sdes	kstat_named_t arcstat_mfu_ghost_hits;
347180740Sdes	kstat_named_t arcstat_allocated;
348180740Sdes	kstat_named_t arcstat_deleted;
349180740Sdes	kstat_named_t arcstat_stolen;
350180740Sdes	kstat_named_t arcstat_recycle_miss;
351180740Sdes	/*
352180740Sdes	 * Number of buffers that could not be evicted because the hash lock
353180740Sdes	 * was held by another thread.  The lock may not necessarily be held
354180740Sdes	 * by something using the same buffer, since hash locks are shared
355180740Sdes	 * by multiple buffers.
356180740Sdes	 */
357180740Sdes	kstat_named_t arcstat_mutex_miss;
358180740Sdes	/*
359180740Sdes	 * Number of buffers skipped because they have I/O in progress, are
360180740Sdes	 * indirect prefetch buffers that have not lived long enough, or are
361180740Sdes	 * not from the spa we're trying to evict from.
362180740Sdes	 */
363180740Sdes	kstat_named_t arcstat_evict_skip;
364180740Sdes	kstat_named_t arcstat_evict_l2_cached;
365180740Sdes	kstat_named_t arcstat_evict_l2_eligible;
366255767Sdes	kstat_named_t arcstat_evict_l2_ineligible;
367255767Sdes	kstat_named_t arcstat_hash_elements;
368255767Sdes	kstat_named_t arcstat_hash_elements_max;
369180740Sdes	kstat_named_t arcstat_hash_collisions;
370180740Sdes	kstat_named_t arcstat_hash_chains;
371180740Sdes	kstat_named_t arcstat_hash_chain_max;
372180740Sdes	kstat_named_t arcstat_p;
373180740Sdes	kstat_named_t arcstat_c;
374180740Sdes	kstat_named_t arcstat_c_min;
375180740Sdes	kstat_named_t arcstat_c_max;
376180740Sdes	kstat_named_t arcstat_size;
377180740Sdes	kstat_named_t arcstat_hdr_size;
378180740Sdes	kstat_named_t arcstat_data_size;
379180740Sdes	kstat_named_t arcstat_other_size;
380180740Sdes	kstat_named_t arcstat_l2_hits;
381180740Sdes	kstat_named_t arcstat_l2_misses;
382180740Sdes	kstat_named_t arcstat_l2_feeds;
383180740Sdes	kstat_named_t arcstat_l2_rw_clash;
384221420Sdes	kstat_named_t arcstat_l2_read_bytes;
385221420Sdes	kstat_named_t arcstat_l2_write_bytes;
386221420Sdes	kstat_named_t arcstat_l2_writes_sent;
387248619Sdes	kstat_named_t arcstat_l2_writes_done;
388248619Sdes	kstat_named_t arcstat_l2_writes_error;
389248619Sdes	kstat_named_t arcstat_l2_writes_hdr_miss;
390255767Sdes	kstat_named_t arcstat_l2_evict_lock_retry;
391255767Sdes	kstat_named_t arcstat_l2_evict_reading;
392255767Sdes	kstat_named_t arcstat_l2_free_on_write;
393180740Sdes	kstat_named_t arcstat_l2_cdata_free_on_write;
394180740Sdes	kstat_named_t arcstat_l2_abort_lowmem;
395180740Sdes	kstat_named_t arcstat_l2_cksum_bad;
396180740Sdes	kstat_named_t arcstat_l2_io_error;
397180740Sdes	kstat_named_t arcstat_l2_size;
398180740Sdes	kstat_named_t arcstat_l2_asize;
399180740Sdes	kstat_named_t arcstat_l2_hdr_size;
400180740Sdes	kstat_named_t arcstat_l2_compress_successes;
401180740Sdes	kstat_named_t arcstat_l2_compress_zeros;
402180740Sdes	kstat_named_t arcstat_l2_compress_failures;
403180740Sdes	kstat_named_t arcstat_l2_write_trylock_fail;
404180740Sdes	kstat_named_t arcstat_l2_write_passed_headroom;
405248619Sdes	kstat_named_t arcstat_l2_write_spa_mismatch;
406248619Sdes	kstat_named_t arcstat_l2_write_in_l2;
407248619Sdes	kstat_named_t arcstat_l2_write_hdr_io_in_progress;
408262566Sdes	kstat_named_t arcstat_l2_write_not_cacheable;
409262566Sdes	kstat_named_t arcstat_l2_write_full;
410262566Sdes	kstat_named_t arcstat_l2_write_buffer_iter;
411262566Sdes	kstat_named_t arcstat_l2_write_pios;
412262566Sdes	kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
413262566Sdes	kstat_named_t arcstat_l2_write_buffer_list_iter;
414262566Sdes	kstat_named_t arcstat_l2_write_buffer_list_null_iter;
415262566Sdes	kstat_named_t arcstat_memory_throttle_count;
416262566Sdes	kstat_named_t arcstat_duplicate_buffers;
417264377Sdes	kstat_named_t arcstat_duplicate_buffers_size;
418264377Sdes	kstat_named_t arcstat_duplicate_reads;
419264377Sdes} arc_stats_t;
420262566Sdes
421262566Sdesstatic arc_stats_t arc_stats = {
422262566Sdes	{ "hits",			KSTAT_DATA_UINT64 },
423180740Sdes	{ "misses",			KSTAT_DATA_UINT64 },
424180740Sdes	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
425180740Sdes	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
426180740Sdes	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
427180740Sdes	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
428180740Sdes	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
429264377Sdes	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
430264377Sdes	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
431264377Sdes	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
432180740Sdes	{ "mru_hits",			KSTAT_DATA_UINT64 },
433180740Sdes	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
434180740Sdes	{ "mfu_hits",			KSTAT_DATA_UINT64 },
435180740Sdes	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
436180740Sdes	{ "allocated",			KSTAT_DATA_UINT64 },
437180740Sdes	{ "deleted",			KSTAT_DATA_UINT64 },
438180740Sdes	{ "stolen",			KSTAT_DATA_UINT64 },
439180740Sdes	{ "recycle_miss",		KSTAT_DATA_UINT64 },
440180740Sdes	{ "mutex_miss",			KSTAT_DATA_UINT64 },
441180740Sdes	{ "evict_skip",			KSTAT_DATA_UINT64 },
442180740Sdes	{ "evict_l2_cached",		KSTAT_DATA_UINT64 },
443180740Sdes	{ "evict_l2_eligible",		KSTAT_DATA_UINT64 },
444255767Sdes	{ "evict_l2_ineligible",	KSTAT_DATA_UINT64 },
445255767Sdes	{ "hash_elements",		KSTAT_DATA_UINT64 },
446255767Sdes	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
447180740Sdes	{ "hash_collisions",		KSTAT_DATA_UINT64 },
448180740Sdes	{ "hash_chains",		KSTAT_DATA_UINT64 },
449180740Sdes	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
450180740Sdes	{ "p",				KSTAT_DATA_UINT64 },
451180740Sdes	{ "c",				KSTAT_DATA_UINT64 },
452180740Sdes	{ "c_min",			KSTAT_DATA_UINT64 },
453180750Sdes	{ "c_max",			KSTAT_DATA_UINT64 },
454180750Sdes	{ "size",			KSTAT_DATA_UINT64 },
455180750Sdes	{ "hdr_size",			KSTAT_DATA_UINT64 },
456180740Sdes	{ "data_size",			KSTAT_DATA_UINT64 },
457180740Sdes	{ "other_size",			KSTAT_DATA_UINT64 },
458180740Sdes	{ "l2_hits",			KSTAT_DATA_UINT64 },
459180750Sdes	{ "l2_misses",			KSTAT_DATA_UINT64 },
460180750Sdes	{ "l2_feeds",			KSTAT_DATA_UINT64 },
461180750Sdes	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
462180750Sdes	{ "l2_read_bytes",		KSTAT_DATA_UINT64 },
463180750Sdes	{ "l2_write_bytes",		KSTAT_DATA_UINT64 },
464180750Sdes	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
465262566Sdes	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
466262566Sdes	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
467262566Sdes	{ "l2_writes_hdr_miss",		KSTAT_DATA_UINT64 },
468180750Sdes	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
469180750Sdes	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
470180750Sdes	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
471180740Sdes	{ "l2_cdata_free_on_write",	KSTAT_DATA_UINT64 },
472180740Sdes	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
473180740Sdes	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
474180740Sdes	{ "l2_io_error",		KSTAT_DATA_UINT64 },
475180740Sdes	{ "l2_size",			KSTAT_DATA_UINT64 },
476180740Sdes	{ "l2_asize",			KSTAT_DATA_UINT64 },
477180740Sdes	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
478180740Sdes	{ "l2_compress_successes",	KSTAT_DATA_UINT64 },
479180740Sdes	{ "l2_compress_zeros",		KSTAT_DATA_UINT64 },
480180740Sdes	{ "l2_compress_failures",	KSTAT_DATA_UINT64 },
481180740Sdes	{ "l2_write_trylock_fail",	KSTAT_DATA_UINT64 },
482180740Sdes	{ "l2_write_passed_headroom",	KSTAT_DATA_UINT64 },
483180740Sdes	{ "l2_write_spa_mismatch",	KSTAT_DATA_UINT64 },
484180740Sdes	{ "l2_write_in_l2",		KSTAT_DATA_UINT64 },
485180740Sdes	{ "l2_write_io_in_progress",	KSTAT_DATA_UINT64 },
486180740Sdes	{ "l2_write_not_cacheable",	KSTAT_DATA_UINT64 },
487180740Sdes	{ "l2_write_full",		KSTAT_DATA_UINT64 },
488180740Sdes	{ "l2_write_buffer_iter",	KSTAT_DATA_UINT64 },
489180740Sdes	{ "l2_write_pios",		KSTAT_DATA_UINT64 },
490180740Sdes	{ "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
491180740Sdes	{ "l2_write_buffer_list_iter",	KSTAT_DATA_UINT64 },
492180746Sdes	{ "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
493180746Sdes	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
494180746Sdes	{ "duplicate_buffers",		KSTAT_DATA_UINT64 },
495192595Sdes	{ "duplicate_buffers_size",	KSTAT_DATA_UINT64 },
496192595Sdes	{ "duplicate_reads",		KSTAT_DATA_UINT64 }
497192595Sdes};
498180740Sdes
499180740Sdes#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)
500180740Sdes
501180740Sdes#define	ARCSTAT_INCR(stat, val) \
502180740Sdes	atomic_add_64(&arc_stats.stat.value.ui64, (val))
503180740Sdes
504180740Sdes#define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
505180740Sdes#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)
506180740Sdes
507180740Sdes#define	ARCSTAT_MAX(stat, val) {					\
508180740Sdes	uint64_t m;							\
509180740Sdes	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
510180740Sdes	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
511180740Sdes		continue;						\
512180740Sdes}
513180740Sdes
514180740Sdes#define	ARCSTAT_MAXSTAT(stat) \
515180740Sdes	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
516180740Sdes
517180740Sdes/*
518180740Sdes * We define a macro to allow ARC hits/misses to be easily broken down by
519180744Sdes * two separate conditions, giving a total of four different subtypes for
520180744Sdes * each of hits and misses (so eight statistics total).
521180744Sdes */
522248619Sdes#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
523248619Sdes	if (cond1) {							\
524248619Sdes		if (cond2) {						\
525248619Sdes			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
526248619Sdes		} else {						\
527248619Sdes			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
528180740Sdes		}							\
529180740Sdes	} else {							\
530180740Sdes		if (cond2) {						\
531180740Sdes			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
532180740Sdes		} else {						\
533180740Sdes			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
534180740Sdes		}							\
535180740Sdes	}
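
/*
 * Purely illustrative usage sketch (compiled out; a hypothetical call site,
 * not code used by the ARC): a hit is attributed to exactly one of the four
 * demand/prefetch x data/metadata counters, and the token-pasted kstat name
 * is resolved at compile time, so both conditions must be plain boolean
 * expressions.
 */
#if 0
static void
example_record_hit(boolean_t is_demand, boolean_t is_metadata)
{
	ARCSTAT_CONDSTAT(is_demand, demand, prefetch,
	    is_metadata, metadata, data, hits);
	/* both true => ARCSTAT_BUMP(arcstat_demand_metadata_hits) */
}
#endif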
536180740Sdes
537180740Sdeskstat_t			*arc_ksp;
538180740Sdesstatic arc_state_t	*arc_anon;
539180740Sdesstatic arc_state_t	*arc_mru;
540180740Sdesstatic arc_state_t	*arc_mru_ghost;
541180740Sdesstatic arc_state_t	*arc_mfu;
542180740Sdesstatic arc_state_t	*arc_mfu_ghost;
543180740Sdesstatic arc_state_t	*arc_l2c_only;
544180740Sdes
545180740Sdes/*
546180740Sdes * There are several ARC variables that are critical to export as kstats --
547180740Sdes * but we don't want to have to grovel around in the kstat whenever we wish to
548180740Sdes * manipulate them.  For these variables, we therefore define them to be in
549180740Sdes * terms of the statistic variable.  This assures that we are not introducing
550180740Sdes * the possibility of inconsistency by having shadow copies of the variables,
551180740Sdes * while still allowing the code to be readable.
552180740Sdes */
553180740Sdes#define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
554180740Sdes#define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
555180740Sdes#define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
556180740Sdes#define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
557180740Sdes#define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
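
/*
 * Purely illustrative sketch (compiled out; a hypothetical helper, not the
 * ARC's own code): because the macros above alias the kstat values directly,
 * ordinary loads, stores and atomics on e.g. arc_c are immediately visible
 * to kstat readers without a separate export step.
 */
#if 0
static void
example_grow_target(uint64_t bytes)
{
	atomic_add_64(&arc_c, bytes);
	if (arc_c > arc_c_max)
		arc_c = arc_c_max;	/* the "c" kstat reflects this at once */
}
#endif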
558180740Sdes
559180740Sdes#define	L2ARC_IS_VALID_COMPRESS(_c_) \
560180740Sdes	((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
561180740Sdes
562180740Sdesstatic int		arc_no_grow;	/* Don't try to grow cache size */
563180740Sdesstatic uint64_t		arc_tempreserve;
564180740Sdesstatic uint64_t		arc_loaned_bytes;
565180740Sdesstatic uint64_t		arc_meta_used;
566180740Sdesstatic uint64_t		arc_meta_limit;
567207319Sdesstatic uint64_t		arc_meta_max = 0;
568207319SdesSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_meta_used, CTLFLAG_RD, &arc_meta_used, 0,
569207319Sdes    "ARC metadata used");
570180740SdesSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_meta_limit, CTLFLAG_RW, &arc_meta_limit, 0,
571180740Sdes    "ARC metadata limit");
572180740Sdes
573180740Sdestypedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
574180740Sdes
575180740Sdestypedef struct arc_callback arc_callback_t;
576180740Sdes
577180740Sdesstruct arc_callback {
578180740Sdes	void			*acb_private;
579204917Sdes	arc_done_func_t		*acb_done;
580204917Sdes	arc_buf_t		*acb_buf;
581204917Sdes	zio_t			*acb_zio_dummy;
582180740Sdes	arc_callback_t		*acb_next;
583180740Sdes};
584180740Sdes
585180740Sdestypedef struct arc_write_callback arc_write_callback_t;
586180740Sdes
587180740Sdesstruct arc_write_callback {
588180740Sdes	void		*awcb_private;
589180740Sdes	arc_done_func_t	*awcb_ready;
590180740Sdes	arc_done_func_t	*awcb_physdone;
591180740Sdes	arc_done_func_t	*awcb_done;
592180740Sdes	arc_buf_t	*awcb_buf;
593180740Sdes};
594180740Sdes
595180740Sdesstruct arc_buf_hdr {
596180740Sdes	/* protected by hash lock */
597180740Sdes	dva_t			b_dva;
598180740Sdes	uint64_t		b_birth;
599180740Sdes	uint64_t		b_cksum0;
600180740Sdes
601180740Sdes	kmutex_t		b_freeze_lock;
602180740Sdes	zio_cksum_t		*b_freeze_cksum;
603240075Sdes	void			*b_thawed;
604240075Sdes
605240075Sdes	arc_buf_hdr_t		*b_hash_next;
606180740Sdes	arc_buf_t		*b_buf;
607180740Sdes	uint32_t		b_flags;
608180740Sdes	uint32_t		b_datacnt;
609180740Sdes
610180740Sdes	arc_callback_t		*b_acb;
611180740Sdes	kcondvar_t		b_cv;
612180740Sdes
613180740Sdes	/* immutable */
614180740Sdes	arc_buf_contents_t	b_type;
615180740Sdes	uint64_t		b_size;
616180740Sdes	uint64_t		b_spa;
617180740Sdes
618180740Sdes	/* protected by arc state mutex */
619180740Sdes	arc_state_t		*b_state;
620180740Sdes	list_node_t		b_arc_node;
621180740Sdes
622180740Sdes	/* updated atomically */
623180740Sdes	clock_t			b_arc_access;
624180740Sdes
625180740Sdes	/* self protecting */
626180740Sdes	refcount_t		b_refcnt;
627180740Sdes
628180740Sdes	l2arc_buf_hdr_t		*b_l2hdr;
629180740Sdes	list_node_t		b_l2node;
630180740Sdes};
631180740Sdes
632180740Sdesstatic arc_buf_t *arc_eviction_list;
633180740Sdesstatic kmutex_t arc_eviction_mtx;
634180740Sdesstatic arc_buf_hdr_t arc_eviction_hdr;
635180740Sdesstatic void arc_get_data_buf(arc_buf_t *buf);
636180740Sdesstatic void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
637180740Sdesstatic int arc_evict_needed(arc_buf_contents_t type);
638180740Sdesstatic void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
639262566Sdes#ifdef illumos
640262566Sdesstatic void arc_buf_watch(arc_buf_t *buf);
641262566Sdes#endif /* illumos */
642180740Sdes
643180740Sdesstatic boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
644180740Sdes
645180740Sdes#define	GHOST_STATE(state)	\
646180740Sdes	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
647180740Sdes	(state) == arc_l2c_only)
648180740Sdes
649180740Sdes/*
650180740Sdes * Private ARC flags.  These flags are private ARC only flags that will show up
651197679Sdes * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
652197679Sdes * be passed in as arc_flags in things like arc_read.  However, these flags
653197679Sdes * should never be passed and should only be set by ARC code.  When adding new
654221420Sdes * public flags, make sure not to smash the private ones.
655221420Sdes */
656221420Sdes
657255767Sdes#define	ARC_IN_HASH_TABLE	(1 << 9)	/* this buffer is hashed */
658255767Sdes#define	ARC_IO_IN_PROGRESS	(1 << 10)	/* I/O in progress for buf */
659255767Sdes#define	ARC_IO_ERROR		(1 << 11)	/* I/O failed for buf */
660255767Sdes#define	ARC_FREED_IN_READ	(1 << 12)	/* buf freed while in read */
661255767Sdes#define	ARC_BUF_AVAILABLE	(1 << 13)	/* block not in active use */
662255767Sdes#define	ARC_INDIRECT		(1 << 14)	/* this is an indirect block */
663255767Sdes#define	ARC_FREE_IN_PROGRESS	(1 << 15)	/* hdr about to be freed */
664255767Sdes#define	ARC_L2_WRITING		(1 << 16)	/* L2ARC write in progress */
665255767Sdes#define	ARC_L2_EVICTED		(1 << 17)	/* evicted during I/O */
666180740Sdes#define	ARC_L2_WRITE_HEAD	(1 << 18)	/* head of write list */
667180740Sdes
668180740Sdes#define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_IN_HASH_TABLE)
669240075Sdes#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
670240075Sdes#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_IO_ERROR)
671240075Sdes#define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_PREFETCH)
672221420Sdes#define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FREED_IN_READ)
673221420Sdes#define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_BUF_AVAILABLE)
674221420Sdes#define	HDR_FREE_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
675180740Sdes#define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_L2CACHE)
676180740Sdes#define	HDR_L2_READING(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS &&	\
677180740Sdes				    (hdr)->b_l2hdr != NULL)
678180740Sdes#define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_L2_WRITING)
679180740Sdes#define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_L2_EVICTED)
680180740Sdes#define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_L2_WRITE_HEAD)
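
/*
 * Purely illustrative sketch (compiled out; a hypothetical helper, not the
 * ARC's own code): the private flags above are ordinary bits in b_flags,
 * set and cleared while the header's hash lock is held and tested with the
 * HDR_* macros.
 */
#if 0
static void
example_mark_io_in_progress(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
{
	ASSERT(MUTEX_HELD(hash_lock));
	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
	hdr->b_flags |= ARC_IO_IN_PROGRESS;
	/* ... issue the I/O; the done callback clears the bit again ... */
}
#endif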
681180740Sdes
682180740Sdes/*
683180740Sdes * Other sizes
684180740Sdes */
685180740Sdes
686180740Sdes#define	HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
687180746Sdes#define	L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
688180746Sdes
689180746Sdes/*
690207319Sdes * Hash table routines
691207319Sdes */
692207319Sdes
693180740Sdes#define	HT_LOCK_PAD	CACHE_LINE_SIZE
694180740Sdes
695180740Sdesstruct ht_lock {
696180740Sdes	kmutex_t	ht_lock;
697180740Sdes#ifdef _KERNEL
698180740Sdes	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
699180740Sdes#endif
700180740Sdes};
701180740Sdes
702180740Sdes#define	BUF_LOCKS 256
703180740Sdestypedef struct buf_hash_table {
704180740Sdes	uint64_t ht_mask;
705180740Sdes	arc_buf_hdr_t **ht_table;
706180740Sdes	struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
707180740Sdes} buf_hash_table_t;
708180740Sdes
709180740Sdesstatic buf_hash_table_t buf_hash_table;
710180740Sdes
711180740Sdes#define	BUF_HASH_INDEX(spa, dva, birth) \
712180740Sdes	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
713180740Sdes#define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
714240075Sdes#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
715240075Sdes#define	HDR_LOCK(hdr) \
716240075Sdes	(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
717240075Sdes
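/*
 * Purely illustrative sketch (compiled out; a hypothetical helper, not the
 * ARC's own code): hash chains are striped over only BUF_LOCKS (256)
 * mutexes, so two headers whose bucket indices are congruent modulo
 * BUF_LOCKS share a lock even though they hang off different chains --
 * which is why a "mutex_miss" does not necessarily involve the same buffer.
 */
#if 0
static boolean_t
example_same_lock_stripe(uint64_t idx_a, uint64_t idx_b)
{
	return (BUF_HASH_LOCK(idx_a) == BUF_HASH_LOCK(idx_b));
	/* true iff (idx_a & (BUF_LOCKS - 1)) == (idx_b & (BUF_LOCKS - 1)) */
}
#endif
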
718240075Sdesuint64_t zfs_crc64_table[256];
719240075Sdes
720180740Sdes/*
721180740Sdes * Level 2 ARC
722180740Sdes */
723240075Sdes
724240075Sdes#define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
725240075Sdes#define	L2ARC_HEADROOM		2			/* num of writes */
726255767Sdes/*
727255767Sdes * If we discover during ARC scan any buffers to be compressed, we boost
728255767Sdes * our headroom for the next scanning cycle by this percentage multiple.
729240075Sdes */
730180740Sdes#define	L2ARC_HEADROOM_BOOST	200
731180740Sdes#define	L2ARC_FEED_SECS		1		/* caching interval secs */
732180740Sdes#define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */
733180740Sdes
734180740Sdes#define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
735180740Sdes#define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)
736180740Sdes
737180740Sdes/* L2ARC Performance Tunables */
738180740Sdesuint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
739180740Sdesuint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
740180740Sdesuint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
741180740Sdesuint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
742180740Sdesuint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
743180740Sdesuint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval milliseconds */
744180740Sdesboolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
745180740Sdesboolean_t l2arc_feed_again = B_TRUE;		/* turbo warmup */
746180740Sdesboolean_t l2arc_norw = B_TRUE;			/* no reads during writes */
747180740Sdes
748180740SdesSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
749180740Sdes    &l2arc_write_max, 0, "max write size");
750180740SdesSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
751180740Sdes    &l2arc_write_boost, 0, "extra write during warmup");
752180740SdesSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
753180740Sdes    &l2arc_headroom, 0, "number of dev writes");
754180740SdesSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
755180740Sdes    &l2arc_feed_secs, 0, "interval seconds");
756255767SdesSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
757255767Sdes    &l2arc_feed_min_ms, 0, "min interval milliseconds");
758255767Sdes
759180740SdesSYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
760180740Sdes    &l2arc_noprefetch, 0, "don't cache prefetch bufs");
761180740SdesSYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
762180740Sdes    &l2arc_feed_again, 0, "turbo warmup");
763180740SdesSYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
764180740Sdes    &l2arc_norw, 0, "no reads during writes");
765180740Sdes
766180740SdesSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
767180740Sdes    &ARC_anon.arcs_size, 0, "size of anonymous state");
768180740SdesSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD,
769180740Sdes    &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in anonymous state");
770180740SdesSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD,
771180740Sdes    &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in anonymous state");
772180740Sdes
773180740SdesSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
774180740Sdes    &ARC_mru.arcs_size, 0, "size of mru state");
775180740SdesSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD,
776180740Sdes    &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state");
777180740SdesSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD,
778180740Sdes    &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state");
779180740Sdes
780180740SdesSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
781180740Sdes    &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state");
782180740SdesSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD,
783180740Sdes    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
784180740Sdes    "size of metadata in mru ghost state");
785180740SdesSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD,
786180740Sdes    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
787180740Sdes    "size of data in mru ghost state");
788180740Sdes
789180740SdesSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
790180740Sdes    &ARC_mfu.arcs_size, 0, "size of mfu state");
791180740SdesSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD,
792180740Sdes    &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state");
793180740SdesSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD,
794180740Sdes    &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state");
795180740Sdes
796180740SdesSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
797180740Sdes    &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state");
798180740SdesSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD,
799180740Sdes    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
800180740Sdes    "size of metadata in mfu ghost state");
801180740SdesSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD,
802180740Sdes    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
803180740Sdes    "size of data in mfu ghost state");
804180740Sdes
805180740SdesSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
806180740Sdes    &ARC_l2c_only.arcs_size, 0, "size of l2c_only state");
807180740Sdes
808180740Sdes/*
809180740Sdes * L2ARC Internals
810180740Sdes */
811180740Sdestypedef struct l2arc_dev {
812180740Sdes	vdev_t			*l2ad_vdev;	/* vdev */
813180740Sdes	spa_t			*l2ad_spa;	/* spa */
814180740Sdes	uint64_t		l2ad_hand;	/* next write location */
815180740Sdes	uint64_t		l2ad_start;	/* first addr on device */
816180740Sdes	uint64_t		l2ad_end;	/* last addr on device */
817180740Sdes	uint64_t		l2ad_evict;	/* last addr eviction reached */
818180740Sdes	boolean_t		l2ad_first;	/* first sweep through */
819180740Sdes	boolean_t		l2ad_writing;	/* currently writing */
820180740Sdes	list_t			*l2ad_buflist;	/* buffer list */
821180740Sdes	list_node_t		l2ad_node;	/* device list node */
822180740Sdes} l2arc_dev_t;
823180740Sdes
824180740Sdesstatic list_t L2ARC_dev_list;			/* device list */
825180740Sdesstatic list_t *l2arc_dev_list;			/* device list pointer */
826180740Sdesstatic kmutex_t l2arc_dev_mtx;			/* device list mutex */
827180740Sdesstatic l2arc_dev_t *l2arc_dev_last;		/* last device used */
828180740Sdesstatic kmutex_t l2arc_buflist_mtx;		/* mutex for all buflists */
829180740Sdesstatic list_t L2ARC_free_on_write;		/* free after write buf list */
830180740Sdesstatic list_t *l2arc_free_on_write;		/* free after write list ptr */
831180740Sdesstatic kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
832180740Sdesstatic uint64_t l2arc_ndev;			/* number of devices */
833180740Sdes
834180740Sdestypedef struct l2arc_read_callback {
835180740Sdes	arc_buf_t		*l2rcb_buf;		/* read buffer */
836180740Sdes	spa_t			*l2rcb_spa;		/* spa */
837180740Sdes	blkptr_t		l2rcb_bp;		/* original blkptr */
838180740Sdes	zbookmark_phys_t	l2rcb_zb;		/* original bookmark */
839180740Sdes	int			l2rcb_flags;		/* original flags */
840180740Sdes	enum zio_compress	l2rcb_compress;		/* applied compress */
841180744Sdes} l2arc_read_callback_t;
842180744Sdes
843180744Sdestypedef struct l2arc_write_callback {
844180744Sdes	l2arc_dev_t	*l2wcb_dev;		/* device info */
845180744Sdes	arc_buf_hdr_t	*l2wcb_head;		/* head of write buflist */
846180744Sdes} l2arc_write_callback_t;
847180740Sdes
848180740Sdesstruct l2arc_buf_hdr {
849180740Sdes	/* protected by arc_buf_hdr  mutex */
850180740Sdes	l2arc_dev_t		*b_dev;		/* L2ARC device */
851180740Sdes	uint64_t		b_daddr;	/* disk address, offset byte */
852180740Sdes	/* compression applied to buffer data */
853180740Sdes	enum zio_compress	b_compress;
854180740Sdes	/* real alloc'd buffer size depending on b_compress applied */
855180740Sdes	int			b_asize;
856180740Sdes	/* temporary buffer holder for in-flight compressed data */
857180740Sdes	void			*b_tmp_cdata;
858180740Sdes};
859180740Sdes
860180740Sdestypedef struct l2arc_data_free {
861180740Sdes	/* protected by l2arc_free_on_write_mtx */
862180740Sdes	void		*l2df_data;
863180740Sdes	size_t		l2df_size;
864180740Sdes	void		(*l2df_func)(void *, size_t);
865180740Sdes	list_node_t	l2df_list_node;
866180740Sdes} l2arc_data_free_t;
867180740Sdes
868180740Sdesstatic kmutex_t l2arc_feed_thr_lock;
869180740Sdesstatic kcondvar_t l2arc_feed_thr_cv;
870180740Sdesstatic uint8_t l2arc_thread_exit;
871180740Sdes
872180740Sdesstatic void l2arc_read_done(zio_t *zio);
873180740Sdesstatic void l2arc_hdr_stat_add(void);
874180740Sdesstatic void l2arc_hdr_stat_remove(void);
875180740Sdes
876180740Sdesstatic boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
877226046Sdesstatic void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
878226046Sdes    enum zio_compress c);
879226046Sdesstatic void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
880180740Sdes
881180740Sdesstatic uint64_t
882180740Sdesbuf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
883180740Sdes{
884180740Sdes	uint8_t *vdva = (uint8_t *)dva;
885180740Sdes	uint64_t crc = -1ULL;
886221420Sdes	int i;
887221420Sdes
888221420Sdes	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
889221420Sdes
890221420Sdes	for (i = 0; i < sizeof (dva_t); i++)
891221420Sdes		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
892226046Sdes
893226046Sdes	crc ^= (spa>>8) ^ birth;
894226046Sdes
895226046Sdes	return (crc);
896226046Sdes}
897226046Sdes
898180740Sdes#define	BUF_EMPTY(buf)						\
899180740Sdes	((buf)->b_dva.dva_word[0] == 0 &&			\
900180740Sdes	(buf)->b_dva.dva_word[1] == 0 &&			\
901255767Sdes	(buf)->b_cksum0 == 0)
902255767Sdes
903255767Sdes#define	BUF_EQUAL(spa, dva, birth, buf)				\
904180740Sdes	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
905180740Sdes	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
906180740Sdes	((buf)->b_birth == birth) && ((buf)->b_spa == spa)
907180740Sdes
908180740Sdesstatic void
909180740Sdesbuf_discard_identity(arc_buf_hdr_t *hdr)
910180740Sdes{
911180740Sdes	hdr->b_dva.dva_word[0] = 0;
912180740Sdes	hdr->b_dva.dva_word[1] = 0;
913180740Sdes	hdr->b_birth = 0;
914180740Sdes	hdr->b_cksum0 = 0;
915180740Sdes}
916180740Sdes
917180740Sdesstatic arc_buf_hdr_t *
918180740Sdesbuf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
919180740Sdes{
920180740Sdes	const dva_t *dva = BP_IDENTITY(bp);
921180740Sdes	uint64_t birth = BP_PHYSICAL_BIRTH(bp);
922180740Sdes	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
923180740Sdes	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
924180740Sdes	arc_buf_hdr_t *buf;
925180740Sdes
926180740Sdes	mutex_enter(hash_lock);
927180740Sdes	for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
928204917Sdes	    buf = buf->b_hash_next) {
929204917Sdes		if (BUF_EQUAL(spa, dva, birth, buf)) {
930204917Sdes			*lockp = hash_lock;
931180740Sdes			return (buf);
932180740Sdes		}
933180740Sdes	}
934240075Sdes	mutex_exit(hash_lock);
935240075Sdes	*lockp = NULL;
936240075Sdes	return (NULL);
937180740Sdes}
938180740Sdes
939180740Sdes/*
940180740Sdes * Insert an entry into the hash table.  If there is already an element
941180740Sdes * equal to elem in the hash table, then the already existing element
942180740Sdes * will be returned and the new element will not be inserted.
943204917Sdes * Otherwise returns NULL.
944204917Sdes */
945204917Sdesstatic arc_buf_hdr_t *
946180740Sdesbuf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
947180740Sdes{
948180740Sdes	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
949180740Sdes	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
950180740Sdes	arc_buf_hdr_t *fbuf;
951180740Sdes	uint32_t i;
952180740Sdes
953180740Sdes	ASSERT(!DVA_IS_EMPTY(&buf->b_dva));
954180740Sdes	ASSERT(buf->b_birth != 0);
955180740Sdes	ASSERT(!HDR_IN_HASH_TABLE(buf));
956180740Sdes	*lockp = hash_lock;
957180740Sdes	mutex_enter(hash_lock);
958180740Sdes	for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
959180740Sdes	    fbuf = fbuf->b_hash_next, i++) {
960180740Sdes		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
961180740Sdes			return (fbuf);
962180740Sdes	}
963180740Sdes
964180740Sdes	buf->b_hash_next = buf_hash_table.ht_table[idx];
965180740Sdes	buf_hash_table.ht_table[idx] = buf;
966180740Sdes	buf->b_flags |= ARC_IN_HASH_TABLE;
967180740Sdes
968180740Sdes	/* collect some hash table performance data */
969180740Sdes	if (i > 0) {
970180740Sdes		ARCSTAT_BUMP(arcstat_hash_collisions);
971180740Sdes		if (i == 1)
972180740Sdes			ARCSTAT_BUMP(arcstat_hash_chains);
973207319Sdes
974207319Sdes		ARCSTAT_MAX(arcstat_hash_chain_max, i);
975207319Sdes	}
976180740Sdes
977180740Sdes	ARCSTAT_BUMP(arcstat_hash_elements);
978180740Sdes	ARCSTAT_MAXSTAT(arcstat_hash_elements);
979180740Sdes
980180740Sdes	return (NULL);
981180740Sdes}
982180744Sdes
983180744Sdesstatic void
984180744Sdesbuf_hash_remove(arc_buf_hdr_t *buf)
985180740Sdes{
986180740Sdes	arc_buf_hdr_t *fbuf, **bufp;
987180740Sdes	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
988180740Sdes
989180740Sdes	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
990180740Sdes	ASSERT(HDR_IN_HASH_TABLE(buf));
991180740Sdes
992180740Sdes	bufp = &buf_hash_table.ht_table[idx];
993180740Sdes	while ((fbuf = *bufp) != buf) {
994180740Sdes		ASSERT(fbuf != NULL);
995180740Sdes		bufp = &fbuf->b_hash_next;
996180740Sdes	}
997180740Sdes	*bufp = buf->b_hash_next;
998180740Sdes	buf->b_hash_next = NULL;
999180740Sdes	buf->b_flags &= ~ARC_IN_HASH_TABLE;
1000180740Sdes
1001180740Sdes	/* collect some hash table performance data */
1002180740Sdes	ARCSTAT_BUMPDOWN(arcstat_hash_elements);
1003180740Sdes
1004180740Sdes	if (buf_hash_table.ht_table[idx] &&
1005180740Sdes	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
1006180740Sdes		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
1007180740Sdes}
1008180740Sdes
1009180740Sdes/*
1010180740Sdes * Global data structures and functions for the buf kmem cache.
1011180740Sdes */
1012180740Sdesstatic kmem_cache_t *hdr_cache;
1013180740Sdesstatic kmem_cache_t *buf_cache;
1014180740Sdes
1015180740Sdesstatic void
1016180740Sdesbuf_fini(void)
1017180740Sdes{
1018180740Sdes	int i;
1019180740Sdes
1020180740Sdes	kmem_free(buf_hash_table.ht_table,
1021180750Sdes	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
1022180750Sdes	for (i = 0; i < BUF_LOCKS; i++)
1023180750Sdes		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
1024180750Sdes	kmem_cache_destroy(hdr_cache);
1025180750Sdes	kmem_cache_destroy(buf_cache);
1026180750Sdes}
1027180740Sdes
1028180740Sdes/*
1029180740Sdes * Constructor callback - called when the cache is empty
1030180740Sdes * and a new buf is requested.
1031180740Sdes */
1032180740Sdes/* ARGSUSED */
1033180740Sdesstatic int
1034180740Sdeshdr_cons(void *vbuf, void *unused, int kmflag)
1035180740Sdes{
1036180740Sdes	arc_buf_hdr_t *buf = vbuf;
1037180740Sdes
1038180740Sdes	bzero(buf, sizeof (arc_buf_hdr_t));
1039180740Sdes	refcount_create(&buf->b_refcnt);
1040180740Sdes	cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
1041180740Sdes	mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
1042180740Sdes	arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
1043180740Sdes
1044180740Sdes	return (0);
1045180740Sdes}
1046180740Sdes
1047180740Sdes/* ARGSUSED */
1048180740Sdesstatic int
1049180740Sdesbuf_cons(void *vbuf, void *unused, int kmflag)
1050180740Sdes{
1051180740Sdes	arc_buf_t *buf = vbuf;
1052180740Sdes
1053180740Sdes	bzero(buf, sizeof (arc_buf_t));
1054180740Sdes	mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
1055180740Sdes	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1056180740Sdes
1057180740Sdes	return (0);
1058180740Sdes}
1059180740Sdes
1060180740Sdes/*
1061180740Sdes * Destructor callback - called when a cached buf is
1062180740Sdes * no longer required.
1063240075Sdes */
1064240075Sdes/* ARGSUSED */
1065240075Sdesstatic void
1066180740Sdeshdr_dest(void *vbuf, void *unused)
1067180740Sdes{
1068180740Sdes	arc_buf_hdr_t *buf = vbuf;
1069215116Sdes
1070215116Sdes	ASSERT(BUF_EMPTY(buf));
1071215116Sdes	refcount_destroy(&buf->b_refcnt);
1072180740Sdes	cv_destroy(&buf->b_cv);
1073180740Sdes	mutex_destroy(&buf->b_freeze_lock);
1074180740Sdes	arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
1075180740Sdes}
1076180740Sdes
1077180740Sdes/* ARGSUSED */
1078180740Sdesstatic void
1079180740Sdesbuf_dest(void *vbuf, void *unused)
1080180740Sdes{
1081180740Sdes	arc_buf_t *buf = vbuf;
1082180740Sdes
1083180740Sdes	mutex_destroy(&buf->b_evict_lock);
1084248619Sdes	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1085248619Sdes}
1086248619Sdes
1087180740Sdes/*
1088180740Sdes * Reclaim callback -- invoked when memory is low.
1089180740Sdes */
1090180740Sdes/* ARGSUSED */
1091180740Sdesstatic void
1092180740Sdeshdr_recl(void *unused)
1093255767Sdes{
1094255767Sdes	dprintf("hdr_recl called\n");
1095255767Sdes	/*
1096255767Sdes	 * umem calls the reclaim func when we destroy the buf cache,
1097255767Sdes	 * which is after we do arc_fini().
1098255767Sdes	 */
1099255767Sdes	if (!arc_dead)
1100255767Sdes		cv_signal(&arc_reclaim_thr_cv);
1101255767Sdes}
1102255767Sdes
1103255767Sdesstatic void
1104255767Sdesbuf_init(void)
1105180740Sdes{
1106180740Sdes	uint64_t *ct;
1107180740Sdes	uint64_t hsize = 1ULL << 12;
1108226046Sdes	int i, j;
1109192595Sdes
1110192595Sdes	/*
1111180740Sdes	 * The hash table is big enough to fill all of physical memory
1112180740Sdes	 * with an average block size of zfs_arc_average_blocksize (default 8K).
1113180740Sdes	 * By default, the table will take up
1114226046Sdes	 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
1115180740Sdes	 */
1116180740Sdes	while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE)
1117180740Sdes		hsize <<= 1;
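	/*
	 * Worked example of the sizing above (illustrative numbers only):
	 * with 16 GB of physical memory and the default 8K average block
	 * size, the loop stops at the first power of two >= 16 GB / 8K =
	 * 2^21 buckets, so the table costs 2^21 * sizeof (void *) = 16 MB,
	 * i.e. the "1MB per GB" (with 8-byte pointers) quoted above.
	 */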
1118180740Sdesretry:
1119180740Sdes	buf_hash_table.ht_mask = hsize - 1;
1120180740Sdes	buf_hash_table.ht_table =
1121180740Sdes	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
1122180740Sdes	if (buf_hash_table.ht_table == NULL) {
1123180744Sdes		ASSERT(hsize > (1ULL << 8));
1124180744Sdes		hsize >>= 1;
1125180744Sdes		goto retry;
1126180740Sdes	}
1127180740Sdes
1128180740Sdes	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
1129180740Sdes	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
1130180740Sdes	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
1131180740Sdes	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
1132180740Sdes
1133180740Sdes	for (i = 0; i < 256; i++)
1134180740Sdes		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
1135180740Sdes			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
1136180740Sdes
1137180740Sdes	for (i = 0; i < BUF_LOCKS; i++) {
1138180740Sdes		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
1139180740Sdes		    NULL, MUTEX_DEFAULT, NULL);
1140180740Sdes	}
1141262566Sdes}
1142262566Sdes
1143262566Sdes#define	ARC_MINTIME	(hz>>4) /* 62 ms */
1144180740Sdes
1145180740Sdesstatic void
1146180740Sdesarc_cksum_verify(arc_buf_t *buf)
1147180740Sdes{
1148180740Sdes	zio_cksum_t zc;
1149180740Sdes
1150180740Sdes	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1151180740Sdes		return;
1152180740Sdes
1153180740Sdes	mutex_enter(&buf->b_hdr->b_freeze_lock);
1154180740Sdes	if (buf->b_hdr->b_freeze_cksum == NULL ||
1155180740Sdes	    (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
1156180750Sdes		mutex_exit(&buf->b_hdr->b_freeze_lock);
1157180750Sdes		return;
1158180750Sdes	}
1159180740Sdes	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1160180740Sdes	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
1161180740Sdes		panic("buffer modified while frozen!");
1162180740Sdes	mutex_exit(&buf->b_hdr->b_freeze_lock);
1163180740Sdes}
1164180740Sdes
1165180746Sdesstatic int
1166180746Sdesarc_cksum_equal(arc_buf_t *buf)
1167180746Sdes{
1168180740Sdes	zio_cksum_t zc;
1169180740Sdes	int equal;
1170180740Sdes
1171180740Sdes	mutex_enter(&buf->b_hdr->b_freeze_lock);
1172180740Sdes	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1173180740Sdes	equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
1174180740Sdes	mutex_exit(&buf->b_hdr->b_freeze_lock);
1175180740Sdes
1176180740Sdes	return (equal);
1177180740Sdes}
1178180740Sdes
1179180740Sdesstatic void
1180180750Sdesarc_cksum_compute(arc_buf_t *buf, boolean_t force)
1181180750Sdes{
1182180750Sdes	if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
1183180740Sdes		return;
1184180740Sdes
1185180740Sdes	mutex_enter(&buf->b_hdr->b_freeze_lock);
1186180740Sdes	if (buf->b_hdr->b_freeze_cksum != NULL) {
1187180740Sdes		mutex_exit(&buf->b_hdr->b_freeze_lock);
1188180740Sdes		return;
1189180740Sdes	}
1190180740Sdes	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
1191180740Sdes	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1192180740Sdes	    buf->b_hdr->b_freeze_cksum);
1193180740Sdes	mutex_exit(&buf->b_hdr->b_freeze_lock);
1194180740Sdes#ifdef illumos
1195180740Sdes	arc_buf_watch(buf);
1196180740Sdes#endif /* illumos */
1197180740Sdes}
1198180740Sdes
1199180740Sdes#ifdef illumos
1200180740Sdes#ifndef _KERNEL
1201180740Sdestypedef struct procctl {
1202180740Sdes	long cmd;
1203180740Sdes	prwatch_t prwatch;
1204180740Sdes} procctl_t;
1205180740Sdes#endif
1206180740Sdes
1207180740Sdes/* ARGSUSED */
1208180740Sdesstatic void
1209180740Sdesarc_buf_unwatch(arc_buf_t *buf)
1210180740Sdes{
1211180740Sdes#ifndef _KERNEL
1212180740Sdes	if (arc_watch) {
1213180740Sdes		int result;
1214180740Sdes		procctl_t ctl;
1215180740Sdes		ctl.cmd = PCWATCH;
1216180740Sdes		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1217180740Sdes		ctl.prwatch.pr_size = 0;
1218180740Sdes		ctl.prwatch.pr_wflags = 0;
1219180740Sdes		result = write(arc_procfd, &ctl, sizeof (ctl));
1220180740Sdes		ASSERT3U(result, ==, sizeof (ctl));
1221180740Sdes	}
1222180740Sdes#endif
1223180740Sdes}
1224180740Sdes
1225180740Sdes/* ARGSUSED */
1226180740Sdesstatic void
1227180740Sdesarc_buf_watch(arc_buf_t *buf)
1228180740Sdes{
1229180740Sdes#ifndef _KERNEL
1230180740Sdes	if (arc_watch) {
1231221420Sdes		int result;
1232221420Sdes		procctl_t ctl;
1233221420Sdes		ctl.cmd = PCWATCH;
1234180740Sdes		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1235180740Sdes		ctl.prwatch.pr_size = buf->b_hdr->b_size;
1236180740Sdes		ctl.prwatch.pr_wflags = WA_WRITE;
1237180740Sdes		result = write(arc_procfd, &ctl, sizeof (ctl));
1238180740Sdes		ASSERT3U(result, ==, sizeof (ctl));
1239180740Sdes	}
1240180740Sdes#endif
1241180740Sdes}
1242180740Sdes#endif /* illumos */
1243180740Sdes
1244180740Sdesvoid
1245180740Sdesarc_buf_thaw(arc_buf_t *buf)
1246180740Sdes{
1247180740Sdes	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1248180740Sdes		if (buf->b_hdr->b_state != arc_anon)
1249180740Sdes			panic("modifying non-anon buffer!");
1250180740Sdes		if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
1251180740Sdes			panic("modifying buffer while i/o in progress!");
1252180740Sdes		arc_cksum_verify(buf);
1253180740Sdes	}
1254180740Sdes
1255180744Sdes	mutex_enter(&buf->b_hdr->b_freeze_lock);
1256180744Sdes	if (buf->b_hdr->b_freeze_cksum != NULL) {
1257180744Sdes		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1258262566Sdes		buf->b_hdr->b_freeze_cksum = NULL;
1259262566Sdes	}
1260262566Sdes
1261180740Sdes	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1262180740Sdes		if (buf->b_hdr->b_thawed)
1263180740Sdes			kmem_free(buf->b_hdr->b_thawed, 1);
1264180740Sdes		buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
1265180740Sdes	}
1266180740Sdes
1267180740Sdes	mutex_exit(&buf->b_hdr->b_freeze_lock);
1268180740Sdes
1269180740Sdes#ifdef illumos
1270180740Sdes	arc_buf_unwatch(buf);
1271180740Sdes#endif /* illumos */
1272180740Sdes}
1273180740Sdes
1274180740Sdesvoid
1275180740Sdesarc_buf_freeze(arc_buf_t *buf)
1276180740Sdes{
1277180740Sdes	kmutex_t *hash_lock;
1278180740Sdes
1279180740Sdes	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1280180740Sdes		return;
1281180740Sdes
1282204917Sdes	hash_lock = HDR_LOCK(buf->b_hdr);
1283204917Sdes	mutex_enter(hash_lock);
1284204917Sdes
1285248619Sdes	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1286248619Sdes	    buf->b_hdr->b_state == arc_anon);
1287248619Sdes	arc_cksum_compute(buf, B_FALSE);
1288180740Sdes	mutex_exit(hash_lock);
1289180740Sdes
1290180740Sdes}
1291180740Sdes
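/*
 * Map a header to the sub-list (and matching lock) it hashes to within the
 * given state: metadata headers hash into the first
 * ARC_BUFC_NUMMETADATALISTS lists, data headers into the
 * ARC_BUFC_NUMDATALISTS lists that follow.
 */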
1292180740Sdesstatic void
1293180740Sdesget_buf_info(arc_buf_hdr_t *ab, arc_state_t *state, list_t **list, kmutex_t **lock)
1294180740Sdes{
1295180740Sdes	uint64_t buf_hashid = buf_hash(ab->b_spa, &ab->b_dva, ab->b_birth);
1296180740Sdes
1297180740Sdes	if (ab->b_type == ARC_BUFC_METADATA)
1298180740Sdes		buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1);
1299180740Sdes	else {
1300180740Sdes		buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1);
1301180740Sdes		buf_hashid += ARC_BUFC_NUMMETADATALISTS;
1302180740Sdes	}
1303180740Sdes
1304180740Sdes	*list = &state->arcs_lists[buf_hashid];
1305180740Sdes	*lock = ARCS_LOCK(state, buf_hashid);
1306180740Sdes}
1307180740Sdes
1308180740Sdes
1309180740Sdesstatic void
1310180740Sdesadd_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1311180740Sdes{
1312180740Sdes	ASSERT(MUTEX_HELD(hash_lock));
1313180740Sdes
1314180740Sdes	if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
1315180740Sdes	    (ab->b_state != arc_anon)) {
1316180740Sdes		uint64_t delta = ab->b_size * ab->b_datacnt;
1317180740Sdes		uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
1318180740Sdes		list_t *list;
1319180740Sdes		kmutex_t *lock;
1320180740Sdes
1321180740Sdes		get_buf_info(ab, ab->b_state, &list, &lock);
1322180740Sdes		ASSERT(!MUTEX_HELD(lock));
1323180740Sdes		mutex_enter(lock);
1324180740Sdes		ASSERT(list_link_active(&ab->b_arc_node));
1325180740Sdes		list_remove(list, ab);
1326180740Sdes		if (GHOST_STATE(ab->b_state)) {
1327180740Sdes			ASSERT0(ab->b_datacnt);
1328180740Sdes			ASSERT3P(ab->b_buf, ==, NULL);
1329180740Sdes			delta = ab->b_size;
1330180740Sdes		}
1331180740Sdes		ASSERT(delta > 0);
1332180740Sdes		ASSERT3U(*size, >=, delta);
1333180740Sdes		atomic_add_64(size, -delta);
1334180740Sdes		mutex_exit(lock);
1335180740Sdes		/* remove the prefetch flag if we get a reference */
1336180740Sdes		if (ab->b_flags & ARC_PREFETCH)
1337180740Sdes			ab->b_flags &= ~ARC_PREFETCH;
1338180740Sdes	}
1339180740Sdes}
1340180740Sdes
1341180740Sdesstatic int
1342180740Sdesremove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1343180740Sdes{
1344180740Sdes	int cnt;
1345180740Sdes	arc_state_t *state = ab->b_state;
1346180740Sdes
1347180740Sdes	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1348180744Sdes	ASSERT(!GHOST_STATE(state));
1349180744Sdes
1350180744Sdes	if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
1351180740Sdes	    (state != arc_anon)) {
1352180740Sdes		uint64_t *size = &state->arcs_lsize[ab->b_type];
1353180740Sdes		list_t *list;
1354180740Sdes		kmutex_t *lock;
1355180740Sdes
1356180740Sdes		get_buf_info(ab, state, &list, &lock);
1357180740Sdes		ASSERT(!MUTEX_HELD(lock));
1358180740Sdes		mutex_enter(lock);
1359180740Sdes		ASSERT(!list_link_active(&ab->b_arc_node));
1360180740Sdes		list_insert_head(list, ab);
1361180740Sdes		ASSERT(ab->b_datacnt > 0);
1362180740Sdes		atomic_add_64(size, ab->b_size * ab->b_datacnt);
1363180740Sdes		mutex_exit(lock);
1364180740Sdes	}
1365180740Sdes	return (cnt);
1366180740Sdes}
1367180740Sdes
1368180740Sdes/*
1369180740Sdes * Move the supplied buffer to the indicated state.  The mutex
1370180740Sdes * for the buffer must be held by the caller.
1371180740Sdes */
1372180740Sdesstatic void
1373180740Sdesarc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1374180740Sdes{
1375180740Sdes	arc_state_t *old_state = ab->b_state;
1376180740Sdes	int64_t refcnt = refcount_count(&ab->b_refcnt);
1377180740Sdes	uint64_t from_delta, to_delta;
1378180740Sdes	list_t *list;
1379180740Sdes	kmutex_t *lock;
1380180740Sdes
1381180740Sdes	ASSERT(MUTEX_HELD(hash_lock));
1382180740Sdes	ASSERT3P(new_state, !=, old_state);
1383180740Sdes	ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1384180740Sdes	ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1385180740Sdes	ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
1386180740Sdes
1387192595Sdes	from_delta = to_delta = ab->b_datacnt * ab->b_size;
1388192595Sdes
1389192595Sdes	/*
1390180740Sdes	 * If this buffer is evictable, transfer it from the
1391180740Sdes	 * old state list to the new state list.
1392180740Sdes	 */
1393180740Sdes	if (refcnt == 0) {
1394180740Sdes		if (old_state != arc_anon) {
1395180740Sdes			int use_mutex;
1396180740Sdes			uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1397204917Sdes
1398204917Sdes			get_buf_info(ab, old_state, &list, &lock);
1399204917Sdes			use_mutex = !MUTEX_HELD(lock);
1400180740Sdes			if (use_mutex)
1401180740Sdes				mutex_enter(lock);
1402180740Sdes
1403180740Sdes			ASSERT(list_link_active(&ab->b_arc_node));
1404180740Sdes			list_remove(list, ab);
1405180740Sdes
1406180740Sdes			/*
1407180740Sdes			 * If prefetching out of the ghost cache,
1408180740Sdes			 * we will have a non-zero datacnt.
1409180740Sdes			 */
1410180740Sdes			if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
1411180740Sdes				/* ghost elements have a ghost size */
1412180740Sdes				ASSERT(ab->b_buf == NULL);
1413180740Sdes				from_delta = ab->b_size;
1414180740Sdes			}
1415180740Sdes			ASSERT3U(*size, >=, from_delta);
1416180740Sdes			atomic_add_64(size, -from_delta);
1417180740Sdes
1418180740Sdes			if (use_mutex)
1419180740Sdes				mutex_exit(lock);
1420180740Sdes		}
1421180740Sdes		if (new_state != arc_anon) {
1422180740Sdes			int use_mutex;
1423180740Sdes			uint64_t *size = &new_state->arcs_lsize[ab->b_type];
1424180740Sdes
1425180740Sdes			get_buf_info(ab, new_state, &list, &lock);
1426180740Sdes			use_mutex = !MUTEX_HELD(lock);
1427180740Sdes			if (use_mutex)
1428226046Sdes				mutex_enter(lock);
1429180740Sdes
1430180740Sdes			list_insert_head(list, ab);
1431180740Sdes
1432180740Sdes			/* ghost elements have a ghost size */
1433180740Sdes			if (GHOST_STATE(new_state)) {
1434248619Sdes				ASSERT(ab->b_datacnt == 0);
1435248619Sdes				ASSERT(ab->b_buf == NULL);
1436248619Sdes				to_delta = ab->b_size;
1437180740Sdes			}
1438180740Sdes			atomic_add_64(size, to_delta);
1439180740Sdes
1440180740Sdes			if (use_mutex)
1441180740Sdes				mutex_exit(lock);
1442180740Sdes		}
1443180740Sdes	}
1444180740Sdes
1445180740Sdes	ASSERT(!BUF_EMPTY(ab));
1446197679Sdes	if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
1447197679Sdes		buf_hash_remove(ab);
1448197679Sdes
1449262566Sdes	/* adjust state sizes */
1450221420Sdes	if (to_delta)
1451221420Sdes		atomic_add_64(&new_state->arcs_size, to_delta);
1452262566Sdes	if (from_delta) {
1453262566Sdes		ASSERT3U(old_state->arcs_size, >=, from_delta);
1454262566Sdes		atomic_add_64(&old_state->arcs_size, -from_delta);
1455262566Sdes	}
1456262566Sdes	ab->b_state = new_state;
1457262566Sdes
1458262566Sdes	/* adjust l2arc hdr stats */
1459262566Sdes	if (new_state == arc_l2c_only)
1460262566Sdes		l2arc_hdr_stat_add();
1461248619Sdes	else if (old_state == arc_l2c_only)
1462248619Sdes		l2arc_hdr_stat_remove();
1463248619Sdes}
1464248619Sdes
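/*
 * Account for 'space' bytes of newly allocated ARC metadata of the given
 * type: bump the per-type kstat and charge the space to both arc_meta_used
 * and arc_size.
 */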
1465248619Sdesvoid
1466248619Sdesarc_space_consume(uint64_t space, arc_space_type_t type)
1467180740Sdes{
1468180740Sdes	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1469180740Sdes
1470180740Sdes	switch (type) {
1471180740Sdes	case ARC_SPACE_DATA:
1472180740Sdes		ARCSTAT_INCR(arcstat_data_size, space);
1473180740Sdes		break;
1474180740Sdes	case ARC_SPACE_OTHER:
1475180740Sdes		ARCSTAT_INCR(arcstat_other_size, space);
1476180740Sdes		break;
1477180740Sdes	case ARC_SPACE_HDRS:
1478180740Sdes		ARCSTAT_INCR(arcstat_hdr_size, space);
1479180740Sdes		break;
1480180740Sdes	case ARC_SPACE_L2HDRS:
1481180740Sdes		ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1482180740Sdes		break;
1483180740Sdes	}
1484180740Sdes
1485226046Sdes	atomic_add_64(&arc_meta_used, space);
1486226046Sdes	atomic_add_64(&arc_size, space);
1487226046Sdes}
1488180740Sdes
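/*
 * Undo arc_space_consume(): return 'space' bytes of ARC metadata of the
 * given type, updating the arc_meta_max high-water mark along the way.
 */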
1489180740Sdesvoid
1490180740Sdesarc_space_return(uint64_t space, arc_space_type_t type)
1491180740Sdes{
1492180740Sdes	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1493180740Sdes
1494180740Sdes	switch (type) {
1495180740Sdes	case ARC_SPACE_DATA:
1496180740Sdes		ARCSTAT_INCR(arcstat_data_size, -space);
1497180740Sdes		break;
1498180740Sdes	case ARC_SPACE_OTHER:
1499180740Sdes		ARCSTAT_INCR(arcstat_other_size, -space);
1500180740Sdes		break;
1501248619Sdes	case ARC_SPACE_HDRS:
1502248619Sdes		ARCSTAT_INCR(arcstat_hdr_size, -space);
1503248619Sdes		break;
1504180740Sdes	case ARC_SPACE_L2HDRS:
1505180740Sdes		ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1506180740Sdes		break;
1507180740Sdes	}
1508180740Sdes
1509180740Sdes	ASSERT(arc_meta_used >= space);
1510180740Sdes	if (arc_meta_max < arc_meta_used)
1511180740Sdes		arc_meta_max = arc_meta_used;
1512180740Sdes	atomic_add_64(&arc_meta_used, -space);
1513262566Sdes	ASSERT(arc_size >= space);
1514262566Sdes	atomic_add_64(&arc_size, -space);
1515262566Sdes}
1516226046Sdes
1517226046Sdesarc_buf_t *
1518226046Sdesarc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1519226046Sdes{
1520226046Sdes	arc_buf_hdr_t *hdr;
1521226046Sdes	arc_buf_t *buf;
1522226046Sdes
1523226046Sdes	ASSERT3U(size, >, 0);
1524226046Sdes	hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1525240075Sdes	ASSERT(BUF_EMPTY(hdr));
1526240075Sdes	hdr->b_size = size;
1527240075Sdes	hdr->b_type = type;
1528240075Sdes	hdr->b_spa = spa_load_guid(spa);
1529240075Sdes	hdr->b_state = arc_anon;
1530240075Sdes	hdr->b_arc_access = 0;
1531262566Sdes	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1532262566Sdes	buf->b_hdr = hdr;
1533262566Sdes	buf->b_data = NULL;
1534226046Sdes	buf->b_efunc = NULL;
1535226046Sdes	buf->b_private = NULL;
1536226046Sdes	buf->b_next = NULL;
1537240075Sdes	hdr->b_buf = buf;
1538240075Sdes	arc_get_data_buf(buf);
1539240075Sdes	hdr->b_datacnt = 1;
1540180740Sdes	hdr->b_flags = 0;
1541180740Sdes	ASSERT(refcount_is_zero(&hdr->b_refcnt));
1542180740Sdes	(void) refcount_add(&hdr->b_refcnt, tag);
1543180740Sdes
1544180740Sdes	return (buf);
1545180740Sdes}
1546180740Sdes
1547180740Sdesstatic char *arc_onloan_tag = "onloan";
1548180740Sdes
1549180740Sdes/*
1550180740Sdes * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1551180740Sdes * flight data by arc_tempreserve_space() until they are "returned". Loaned
1552180740Sdes * buffers must be returned to the arc before they can be used by the DMU or
1553180740Sdes * freed.
1554180740Sdes */
1555180740Sdesarc_buf_t *
1556180740Sdesarc_loan_buf(spa_t *spa, int size)
1557180740Sdes{
1558180740Sdes	arc_buf_t *buf;
1559180740Sdes
1560180740Sdes	buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1561180740Sdes
1562180740Sdes	atomic_add_64(&arc_loaned_bytes, size);
1563180740Sdes	return (buf);
1564180740Sdes}
1565180740Sdes
1566180740Sdes/*
1567180740Sdes * Return a loaned arc buffer to the arc.
1568180740Sdes */
1569180740Sdesvoid
1570180740Sdesarc_return_buf(arc_buf_t *buf, void *tag)
1571180740Sdes{
1572180740Sdes	arc_buf_hdr_t *hdr = buf->b_hdr;
1573180740Sdes
1574180740Sdes	ASSERT(buf->b_data != NULL);
1575180740Sdes	(void) refcount_add(&hdr->b_refcnt, tag);
1576180740Sdes	(void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
1577197679Sdes
1578197679Sdes	atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1579197679Sdes}
1580180740Sdes
1581180740Sdes/* Detach an arc_buf from a dbuf (tag) */
1582180740Sdesvoid
1583180740Sdesarc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1584180740Sdes{
1585180740Sdes	arc_buf_hdr_t *hdr;
1586180740Sdes
1587180740Sdes	ASSERT(buf->b_data != NULL);
1588180740Sdes	hdr = buf->b_hdr;
1589180740Sdes	(void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
1590180740Sdes	(void) refcount_remove(&hdr->b_refcnt, tag);
1591180740Sdes	buf->b_efunc = NULL;
1592180740Sdes	buf->b_private = NULL;
1593180740Sdes
1594180740Sdes	atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1595180740Sdes}
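
/*
 * Illustrative sketch, not part of the original source: the intended life
 * cycle of a loaned buffer.  The tag pointer 'db' and the use of
 * SPA_MINBLOCKSIZE are hypothetical; real callers pass their own dbuf tag
 * and block size.
 */
#ifdef ARC_EXAMPLES	/* hypothetical guard; examples are never compiled */
static void
arc_loan_example(spa_t *spa, void *db)
{
	arc_buf_t *abuf = arc_loan_buf(spa, SPA_MINBLOCKSIZE);

	/* ... fill abuf->b_data with SPA_MINBLOCKSIZE bytes ... */

	/*
	 * Hand the buffer back to the ARC under the caller's tag before the
	 * DMU uses or frees it; this drops the "onloan" hold taken above.
	 */
	arc_return_buf(abuf, db);
}
#endif	/* ARC_EXAMPLES */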
1596180740Sdes
1597180740Sdesstatic arc_buf_t *
1598180740Sdesarc_buf_clone(arc_buf_t *from)
1599180740Sdes{
1600180740Sdes	arc_buf_t *buf;
1601180740Sdes	arc_buf_hdr_t *hdr = from->b_hdr;
1602180740Sdes	uint64_t size = hdr->b_size;
1603180740Sdes
1604180740Sdes	ASSERT(hdr->b_state != arc_anon);
1605180740Sdes
1606180740Sdes	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1607180740Sdes	buf->b_hdr = hdr;
1608180740Sdes	buf->b_data = NULL;
1609180740Sdes	buf->b_efunc = NULL;
1610180740Sdes	buf->b_private = NULL;
1611180740Sdes	buf->b_next = hdr->b_buf;
1612180740Sdes	hdr->b_buf = buf;
1613180740Sdes	arc_get_data_buf(buf);
1614180740Sdes	bcopy(from->b_data, buf->b_data, size);
1615180740Sdes
1616180740Sdes	/*
1617180740Sdes	 * This buffer already exists in the arc so create a duplicate
1618180740Sdes	 * copy for the caller.  If the buffer is associated with user data
1619180740Sdes	 * then track the size and number of duplicates.  These stats will be
1620180740Sdes	 * updated as duplicate buffers are created and destroyed.
1621180740Sdes	 */
1622180740Sdes	if (hdr->b_type == ARC_BUFC_DATA) {
1623180740Sdes		ARCSTAT_BUMP(arcstat_duplicate_buffers);
1624180740Sdes		ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1625180740Sdes	}
1626180740Sdes	hdr->b_datacnt += 1;
1627180740Sdes	return (buf);
1628221420Sdes}
1629221420Sdes
1630221420Sdesvoid
1631180740Sdesarc_buf_add_ref(arc_buf_t *buf, void* tag)
1632180740Sdes{
1633180740Sdes	arc_buf_hdr_t *hdr;
1634180740Sdes	kmutex_t *hash_lock;
1635180740Sdes
1636180740Sdes	/*
1637180740Sdes	 * Check to see if this buffer is evicted.  Callers
1638180740Sdes	 * must verify b_data != NULL to know if the add_ref
1639180740Sdes	 * was successful.
1640180740Sdes	 */
1641180740Sdes	mutex_enter(&buf->b_evict_lock);
1642180740Sdes	if (buf->b_data == NULL) {
1643221420Sdes		mutex_exit(&buf->b_evict_lock);
1644221420Sdes		return;
1645221420Sdes	}
1646180740Sdes	hash_lock = HDR_LOCK(buf->b_hdr);
1647180740Sdes	mutex_enter(hash_lock);
1648180740Sdes	hdr = buf->b_hdr;
1649180740Sdes	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1650180740Sdes	mutex_exit(&buf->b_evict_lock);
1651180740Sdes
1652180740Sdes	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1653180740Sdes	add_reference(hdr, hash_lock, tag);
1654180740Sdes	DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1655180740Sdes	arc_access(hdr, hash_lock);
1656180740Sdes	mutex_exit(hash_lock);
1657180740Sdes	ARCSTAT_BUMP(arcstat_hits);
1658180740Sdes	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1659180740Sdes	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1660180740Sdes	    data, metadata, hits);
1661180740Sdes}
1662180740Sdes
1663180740Sdesstatic void
1664180740Sdesarc_buf_free_on_write(void *data, size_t size,
1665180740Sdes    void (*free_func)(void *, size_t))
1666180740Sdes{
1667180740Sdes	l2arc_data_free_t *df;
1668226046Sdes
1669226046Sdes	df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1670226046Sdes	df->l2df_data = data;
1671226046Sdes	df->l2df_size = size;
1672226046Sdes	df->l2df_func = free_func;
1673226046Sdes	mutex_enter(&l2arc_free_on_write_mtx);
1674226046Sdes	list_insert_head(l2arc_free_on_write, df);
1675226046Sdes	mutex_exit(&l2arc_free_on_write_mtx);
1676226046Sdes}
1677226046Sdes
1678226046Sdes/*
1679180740Sdes * Free the arc data buffer.  If it is an l2arc write in progress,
1680180740Sdes * the buffer is placed on l2arc_free_on_write to be freed later.
1681180740Sdes */
1682180740Sdesstatic void
1683180740Sdesarc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
1684180740Sdes{
1685180740Sdes	arc_buf_hdr_t *hdr = buf->b_hdr;
1686180740Sdes
1687180740Sdes	if (HDR_L2_WRITING(hdr)) {
1688180740Sdes		arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func);
1689180740Sdes		ARCSTAT_BUMP(arcstat_l2_free_on_write);
1690180740Sdes	} else {
1691180740Sdes		free_func(buf->b_data, hdr->b_size);
1692180740Sdes	}
1693180740Sdes}
1694180740Sdes
1695180740Sdes/*
1696180740Sdes * Free the header's temporary L2ARC write buffer (b_tmp_cdata), if any;
1697180740Sdes * it is queued on l2arc_free_on_write until the in-flight write completes.
1698180740Sdes */
1699180740Sdesstatic void
1700180740Sdesarc_buf_l2_cdata_free(arc_buf_hdr_t *hdr)
1701180740Sdes{
1702180740Sdes	l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1703180740Sdes
1704180740Sdes	ASSERT(MUTEX_HELD(&l2arc_buflist_mtx));
1705180740Sdes
1706180740Sdes	if (l2hdr->b_tmp_cdata == NULL)
1707180740Sdes		return;
1708180740Sdes
1709	ASSERT(HDR_L2_WRITING(hdr));
1710	arc_buf_free_on_write(l2hdr->b_tmp_cdata, hdr->b_size,
1711	    zio_data_buf_free);
1712	ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write);
1713	l2hdr->b_tmp_cdata = NULL;
1714}
1715
/*
 * Free up buf->b_data and, if 'remove' is set, pull the arc_buf_t off of
 * the arc_buf_hdr_t's list and free it.
 */
1716static void
1717arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
1718{
1719	arc_buf_t **bufp;
1720
1721	/* free up data associated with the buf */
1722	if (buf->b_data) {
1723		arc_state_t *state = buf->b_hdr->b_state;
1724		uint64_t size = buf->b_hdr->b_size;
1725		arc_buf_contents_t type = buf->b_hdr->b_type;
1726
1727		arc_cksum_verify(buf);
1728#ifdef illumos
1729		arc_buf_unwatch(buf);
1730#endif /* illumos */
1731
1732		if (!recycle) {
1733			if (type == ARC_BUFC_METADATA) {
1734				arc_buf_data_free(buf, zio_buf_free);
1735				arc_space_return(size, ARC_SPACE_DATA);
1736			} else {
1737				ASSERT(type == ARC_BUFC_DATA);
1738				arc_buf_data_free(buf, zio_data_buf_free);
1739				ARCSTAT_INCR(arcstat_data_size, -size);
1740				atomic_add_64(&arc_size, -size);
1741			}
1742		}
1743		if (list_link_active(&buf->b_hdr->b_arc_node)) {
1744			uint64_t *cnt = &state->arcs_lsize[type];
1745
1746			ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1747			ASSERT(state != arc_anon);
1748
1749			ASSERT3U(*cnt, >=, size);
1750			atomic_add_64(cnt, -size);
1751		}
1752		ASSERT3U(state->arcs_size, >=, size);
1753		atomic_add_64(&state->arcs_size, -size);
1754		buf->b_data = NULL;
1755
1756		/*
1757		 * If we're destroying a duplicate buffer make sure
1758		 * that the appropriate statistics are updated.
1759		 */
1760		if (buf->b_hdr->b_datacnt > 1 &&
1761		    buf->b_hdr->b_type == ARC_BUFC_DATA) {
1762			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
1763			ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
1764		}
1765		ASSERT(buf->b_hdr->b_datacnt > 0);
1766		buf->b_hdr->b_datacnt -= 1;
1767	}
1768
1769	/* only remove the buf if requested */
1770	if (!remove)
1771		return;
1772
1773	/* remove the buf from the hdr list */
1774	for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1775		continue;
1776	*bufp = buf->b_next;
1777	buf->b_next = NULL;
1778
1779	ASSERT(buf->b_efunc == NULL);
1780
1781	/* clean up the buf */
1782	buf->b_hdr = NULL;
1783	kmem_cache_free(buf_cache, buf);
1784}
1785
1786static void
1787arc_hdr_destroy(arc_buf_hdr_t *hdr)
1788{
1789	ASSERT(refcount_is_zero(&hdr->b_refcnt));
1790	ASSERT3P(hdr->b_state, ==, arc_anon);
1791	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1792	l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1793
1794	if (l2hdr != NULL) {
1795		boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1796		/*
1797		 * To prevent arc_free() and l2arc_evict() from
1798		 * attempting to free the same buffer at the same time,
1799		 * a FREE_IN_PROGRESS flag is given to arc_free() to
1800		 * give it priority.  l2arc_evict() can't destroy this
1801		 * header while we are waiting on l2arc_buflist_mtx.
1802		 *
1803		 * The hdr may be removed from l2ad_buflist before we
1804		 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1805		 */
1806		if (!buflist_held) {
1807			mutex_enter(&l2arc_buflist_mtx);
1808			l2hdr = hdr->b_l2hdr;
1809		}
1810
1811		if (l2hdr != NULL) {
1812			trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr,
1813			    hdr->b_size, 0);
1814			list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1815			arc_buf_l2_cdata_free(hdr);
1816			ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1817			ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
1818			vdev_space_update(l2hdr->b_dev->l2ad_vdev,
1819			    -l2hdr->b_asize, 0, 0);
1820			kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
1821			if (hdr->b_state == arc_l2c_only)
1822				l2arc_hdr_stat_remove();
1823			hdr->b_l2hdr = NULL;
1824		}
1825
1826		if (!buflist_held)
1827			mutex_exit(&l2arc_buflist_mtx);
1828	}
1829
1830	if (!BUF_EMPTY(hdr)) {
1831		ASSERT(!HDR_IN_HASH_TABLE(hdr));
1832		buf_discard_identity(hdr);
1833	}
1834	while (hdr->b_buf) {
1835		arc_buf_t *buf = hdr->b_buf;
1836
1837		if (buf->b_efunc) {
1838			mutex_enter(&arc_eviction_mtx);
1839			mutex_enter(&buf->b_evict_lock);
1840			ASSERT(buf->b_hdr != NULL);
1841			arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1842			hdr->b_buf = buf->b_next;
1843			buf->b_hdr = &arc_eviction_hdr;
1844			buf->b_next = arc_eviction_list;
1845			arc_eviction_list = buf;
1846			mutex_exit(&buf->b_evict_lock);
1847			mutex_exit(&arc_eviction_mtx);
1848		} else {
1849			arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1850		}
1851	}
1852	if (hdr->b_freeze_cksum != NULL) {
1853		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1854		hdr->b_freeze_cksum = NULL;
1855	}
1856	if (hdr->b_thawed) {
1857		kmem_free(hdr->b_thawed, 1);
1858		hdr->b_thawed = NULL;
1859	}
1860
1861	ASSERT(!list_link_active(&hdr->b_arc_node));
1862	ASSERT3P(hdr->b_hash_next, ==, NULL);
1863	ASSERT3P(hdr->b_acb, ==, NULL);
1864	kmem_cache_free(hdr_cache, hdr);
1865}
1866
1867void
1868arc_buf_free(arc_buf_t *buf, void *tag)
1869{
1870	arc_buf_hdr_t *hdr = buf->b_hdr;
1871	int hashed = hdr->b_state != arc_anon;
1872
1873	ASSERT(buf->b_efunc == NULL);
1874	ASSERT(buf->b_data != NULL);
1875
1876	if (hashed) {
1877		kmutex_t *hash_lock = HDR_LOCK(hdr);
1878
1879		mutex_enter(hash_lock);
1880		hdr = buf->b_hdr;
1881		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1882
1883		(void) remove_reference(hdr, hash_lock, tag);
1884		if (hdr->b_datacnt > 1) {
1885			arc_buf_destroy(buf, FALSE, TRUE);
1886		} else {
1887			ASSERT(buf == hdr->b_buf);
1888			ASSERT(buf->b_efunc == NULL);
1889			hdr->b_flags |= ARC_BUF_AVAILABLE;
1890		}
1891		mutex_exit(hash_lock);
1892	} else if (HDR_IO_IN_PROGRESS(hdr)) {
1893		int destroy_hdr;
1894		/*
1895		 * We are in the middle of an async write.  Don't destroy
1896		 * this buffer unless the write completes before we finish
1897		 * decrementing the reference count.
1898		 */
1899		mutex_enter(&arc_eviction_mtx);
1900		(void) remove_reference(hdr, NULL, tag);
1901		ASSERT(refcount_is_zero(&hdr->b_refcnt));
1902		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1903		mutex_exit(&arc_eviction_mtx);
1904		if (destroy_hdr)
1905			arc_hdr_destroy(hdr);
1906	} else {
1907		if (remove_reference(hdr, NULL, tag) > 0)
1908			arc_buf_destroy(buf, FALSE, TRUE);
1909		else
1910			arc_hdr_destroy(hdr);
1911	}
1912}
1913
1914boolean_t
1915arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1916{
1917	arc_buf_hdr_t *hdr = buf->b_hdr;
1918	kmutex_t *hash_lock = HDR_LOCK(hdr);
1919	boolean_t no_callback = (buf->b_efunc == NULL);
1920
1921	if (hdr->b_state == arc_anon) {
1922		ASSERT(hdr->b_datacnt == 1);
1923		arc_buf_free(buf, tag);
1924		return (no_callback);
1925	}
1926
1927	mutex_enter(hash_lock);
1928	hdr = buf->b_hdr;
1929	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1930	ASSERT(hdr->b_state != arc_anon);
1931	ASSERT(buf->b_data != NULL);
1932
1933	(void) remove_reference(hdr, hash_lock, tag);
1934	if (hdr->b_datacnt > 1) {
1935		if (no_callback)
1936			arc_buf_destroy(buf, FALSE, TRUE);
1937	} else if (no_callback) {
1938		ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1939		ASSERT(buf->b_efunc == NULL);
1940		hdr->b_flags |= ARC_BUF_AVAILABLE;
1941	}
1942	ASSERT(no_callback || hdr->b_datacnt > 1 ||
1943	    refcount_is_zero(&hdr->b_refcnt));
1944	mutex_exit(hash_lock);
1945	return (no_callback);
1946}
1947
1948int
1949arc_buf_size(arc_buf_t *buf)
1950{
1951	return (buf->b_hdr->b_size);
1952}
1953
1954/*
1955 * Called from the DMU to determine if the current buffer should be
1956 * evicted. In order to ensure proper locking, the eviction must be initiated
1957 * from the DMU. Return true if the buffer is associated with user data and
1958 * duplicate buffers still exist.
1959 */
1960boolean_t
1961arc_buf_eviction_needed(arc_buf_t *buf)
1962{
1963	arc_buf_hdr_t *hdr;
1964	boolean_t evict_needed = B_FALSE;
1965
1966	if (zfs_disable_dup_eviction)
1967		return (B_FALSE);
1968
1969	mutex_enter(&buf->b_evict_lock);
1970	hdr = buf->b_hdr;
1971	if (hdr == NULL) {
1972		/*
1973		 * We are in arc_do_user_evicts(); let that function
1974		 * perform the eviction.
1975		 */
1976		ASSERT(buf->b_data == NULL);
1977		mutex_exit(&buf->b_evict_lock);
1978		return (B_FALSE);
1979	} else if (buf->b_data == NULL) {
1980		/*
1981		 * We have already been added to the arc eviction list;
1982		 * recommend eviction.
1983		 */
1984		ASSERT3P(hdr, ==, &arc_eviction_hdr);
1985		mutex_exit(&buf->b_evict_lock);
1986		return (B_TRUE);
1987	}
1988
1989	if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
1990		evict_needed = B_TRUE;
1991
1992	mutex_exit(&buf->b_evict_lock);
1993	return (evict_needed);
1994}
1995
1996/*
1997 * Evict buffers from list until we've removed the specified number of
1998 * bytes.  Move the removed buffers to the appropriate evict state.
1999 * If the recycle flag is set, then attempt to "recycle" a buffer:
2000 * - look for a buffer to evict that is `bytes' long.
2001 * - return the data block from this buffer rather than freeing it.
2002 * This flag is used by callers that are trying to make space for a
2003 * new buffer in a full arc cache.
2004 *
2005 * This function makes a "best effort".  It skips over any buffers
2006 * it can't get a hash_lock on, and so may not catch all candidates.
2007 * It may also return without evicting as much space as requested.
2008 */
2009static void *
2010arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
2011    arc_buf_contents_t type)
2012{
2013	arc_state_t *evicted_state;
2014	uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
2015	int64_t bytes_remaining;
2016	arc_buf_hdr_t *ab, *ab_prev = NULL;
2017	list_t *evicted_list, *list, *evicted_list_start, *list_start;
2018	kmutex_t *lock, *evicted_lock;
2019	kmutex_t *hash_lock;
2020	boolean_t have_lock;
2021	void *stolen = NULL;
2022	arc_buf_hdr_t marker = { 0 };
2023	int count = 0;
2024	static int evict_metadata_offset, evict_data_offset;
2025	int i, idx, offset, list_count, lists;
2026
2027	ASSERT(state == arc_mru || state == arc_mfu);
2028
2029	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
2030
2031	if (type == ARC_BUFC_METADATA) {
2032		offset = 0;
2033		list_count = ARC_BUFC_NUMMETADATALISTS;
2034		list_start = &state->arcs_lists[0];
2035		evicted_list_start = &evicted_state->arcs_lists[0];
2036		idx = evict_metadata_offset;
2037	} else {
2038		offset = ARC_BUFC_NUMMETADATALISTS;
2039		list_start = &state->arcs_lists[offset];
2040		evicted_list_start = &evicted_state->arcs_lists[offset];
2041		list_count = ARC_BUFC_NUMDATALISTS;
2042		idx = evict_data_offset;
2043	}
2044	bytes_remaining = evicted_state->arcs_lsize[type];
2045	lists = 0;
2046
2047evict_start:
2048	list = &list_start[idx];
2049	evicted_list = &evicted_list_start[idx];
2050	lock = ARCS_LOCK(state, (offset + idx));
2051	evicted_lock = ARCS_LOCK(evicted_state, (offset + idx));
2052
2053	mutex_enter(lock);
2054	mutex_enter(evicted_lock);
2055
2056	for (ab = list_tail(list); ab; ab = ab_prev) {
2057		ab_prev = list_prev(list, ab);
2058		bytes_remaining -= (ab->b_size * ab->b_datacnt);
2059		/* prefetch buffers have a minimum lifespan */
2060		if (HDR_IO_IN_PROGRESS(ab) ||
2061		    (spa && ab->b_spa != spa) ||
2062		    (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
2063		    ddi_get_lbolt() - ab->b_arc_access <
2064		    arc_min_prefetch_lifespan)) {
2065			skipped++;
2066			continue;
2067		}
2068		/* "lookahead" for better eviction candidate */
2069		if (recycle && ab->b_size != bytes &&
2070		    ab_prev && ab_prev->b_size == bytes)
2071			continue;
2072
2073		/* ignore markers */
2074		if (ab->b_spa == 0)
2075			continue;
2076
2077		/*
2078		 * It may take a long time to evict all the bufs requested.
2079		 * To avoid blocking all arc activity, periodically drop
2080		 * the arcs_mtx and give other threads a chance to run
2081		 * before reacquiring the lock.
2082		 *
2083		 * If we are looking for a buffer to recycle, we are in
2084		 * the hot code path, so don't sleep.
2085		 */
2086		if (!recycle && count++ > arc_evict_iterations) {
2087			list_insert_after(list, ab, &marker);
2088			mutex_exit(evicted_lock);
2089			mutex_exit(lock);
2090			kpreempt(KPREEMPT_SYNC);
2091			mutex_enter(lock);
2092			mutex_enter(evicted_lock);
2093			ab_prev = list_prev(list, &marker);
2094			list_remove(list, &marker);
2095			count = 0;
2096			continue;
2097		}
2098
2099		hash_lock = HDR_LOCK(ab);
2100		have_lock = MUTEX_HELD(hash_lock);
2101		if (have_lock || mutex_tryenter(hash_lock)) {
2102			ASSERT0(refcount_count(&ab->b_refcnt));
2103			ASSERT(ab->b_datacnt > 0);
2104			while (ab->b_buf) {
2105				arc_buf_t *buf = ab->b_buf;
2106				if (!mutex_tryenter(&buf->b_evict_lock)) {
2107					missed += 1;
2108					break;
2109				}
2110				if (buf->b_data) {
2111					bytes_evicted += ab->b_size;
2112					if (recycle && ab->b_type == type &&
2113					    ab->b_size == bytes &&
2114					    !HDR_L2_WRITING(ab)) {
2115						stolen = buf->b_data;
2116						recycle = FALSE;
2117					}
2118				}
2119				if (buf->b_efunc) {
2120					mutex_enter(&arc_eviction_mtx);
2121					arc_buf_destroy(buf,
2122					    buf->b_data == stolen, FALSE);
2123					ab->b_buf = buf->b_next;
2124					buf->b_hdr = &arc_eviction_hdr;
2125					buf->b_next = arc_eviction_list;
2126					arc_eviction_list = buf;
2127					mutex_exit(&arc_eviction_mtx);
2128					mutex_exit(&buf->b_evict_lock);
2129				} else {
2130					mutex_exit(&buf->b_evict_lock);
2131					arc_buf_destroy(buf,
2132					    buf->b_data == stolen, TRUE);
2133				}
2134			}
2135
2136			if (ab->b_l2hdr) {
2137				ARCSTAT_INCR(arcstat_evict_l2_cached,
2138				    ab->b_size);
2139			} else {
2140				if (l2arc_write_eligible(ab->b_spa, ab)) {
2141					ARCSTAT_INCR(arcstat_evict_l2_eligible,
2142					    ab->b_size);
2143				} else {
2144					ARCSTAT_INCR(
2145					    arcstat_evict_l2_ineligible,
2146					    ab->b_size);
2147				}
2148			}
2149
2150			if (ab->b_datacnt == 0) {
2151				arc_change_state(evicted_state, ab, hash_lock);
2152				ASSERT(HDR_IN_HASH_TABLE(ab));
2153				ab->b_flags |= ARC_IN_HASH_TABLE;
2154				ab->b_flags &= ~ARC_BUF_AVAILABLE;
2155				DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
2156			}
2157			if (!have_lock)
2158				mutex_exit(hash_lock);
2159			if (bytes >= 0 && bytes_evicted >= bytes)
2160				break;
2161			if (bytes_remaining > 0) {
2162				mutex_exit(evicted_lock);
2163				mutex_exit(lock);
2164				idx  = ((idx + 1) & (list_count - 1));
2165				lists++;
2166				goto evict_start;
2167			}
2168		} else {
2169			missed += 1;
2170		}
2171	}
2172
2173	mutex_exit(evicted_lock);
2174	mutex_exit(lock);
2175
2176	idx  = ((idx + 1) & (list_count - 1));
2177	lists++;
2178
2179	if (bytes_evicted < bytes) {
2180		if (lists < list_count)
2181			goto evict_start;
2182		else
2183			dprintf("only evicted %lld bytes from %p",
2184			    (longlong_t)bytes_evicted, state);
2185	}
2186	if (type == ARC_BUFC_METADATA)
2187		evict_metadata_offset = idx;
2188	else
2189		evict_data_offset = idx;
2190
2191	if (skipped)
2192		ARCSTAT_INCR(arcstat_evict_skip, skipped);
2193
2194	if (missed)
2195		ARCSTAT_INCR(arcstat_mutex_miss, missed);
2196
2197	/*
2198	 * Note: we have just evicted some data into the ghost state,
2199	 * potentially putting the ghost size over the desired size.  Rather
2200	 * than evicting from the ghost list in this hot code path, leave
2201	 * this chore to the arc_reclaim_thread().
2202	 */
2203
2204	if (stolen)
2205		ARCSTAT_BUMP(arcstat_stolen);
2206	return (stolen);
2207}
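
/*
 * Illustrative sketch, not part of the original source: how the recycle
 * path above is meant to be used to satisfy an allocation of 'size' bytes,
 * mirroring what arc_get_data_buf() does further below.  The function name
 * is hypothetical.
 */
#ifdef ARC_EXAMPLES	/* hypothetical guard; examples are never compiled */
static void *
arc_recycle_example(arc_state_t *state, uint64_t size, arc_buf_contents_t type)
{
	/* Try to steal the data block of an evictable buffer of exactly 'size' bytes. */
	void *data = arc_evict(state, 0, size, TRUE, type);

	if (data == NULL) {
		/* Nothing suitable was found; fall back to a fresh allocation. */
		data = (type == ARC_BUFC_METADATA) ?
		    zio_buf_alloc(size) : zio_data_buf_alloc(size);
	}
	return (data);
}
#endif	/* ARC_EXAMPLES */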
2208
2209/*
2210 * Remove buffers from list until we've removed the specified number of
2211 * bytes.  Destroy the buffers that are removed.
2212 */
2213static void
2214arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
2215{
2216	arc_buf_hdr_t *ab, *ab_prev;
2217	arc_buf_hdr_t marker = { 0 };
2218	list_t *list, *list_start;
2219	kmutex_t *hash_lock, *lock;
2220	uint64_t bytes_deleted = 0;
2221	uint64_t bufs_skipped = 0;
2222	int count = 0;
2223	static int evict_offset;
2224	int list_count, idx = evict_offset;
2225	int offset, lists = 0;
2226
2227	ASSERT(GHOST_STATE(state));
2228
2229	/*
2230	 * data lists come after metadata lists
2231	 */
2232	list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS];
2233	list_count = ARC_BUFC_NUMDATALISTS;
2234	offset = ARC_BUFC_NUMMETADATALISTS;
2235
2236evict_start:
2237	list = &list_start[idx];
2238	lock = ARCS_LOCK(state, idx + offset);
2239
2240	mutex_enter(lock);
2241	for (ab = list_tail(list); ab; ab = ab_prev) {
2242		ab_prev = list_prev(list, ab);
2243		if (ab->b_type > ARC_BUFC_NUMTYPES)
2244			panic("invalid ab=%p", (void *)ab);
2245		if (spa && ab->b_spa != spa)
2246			continue;
2247
2248		/* ignore markers */
2249		if (ab->b_spa == 0)
2250			continue;
2251
2252		hash_lock = HDR_LOCK(ab);
2253		/* caller may be trying to modify this buffer, skip it */
2254		if (MUTEX_HELD(hash_lock))
2255			continue;
2256
2257		/*
2258		 * It may take a long time to evict all the bufs requested.
2259		 * To avoid blocking all arc activity, periodically drop
2260		 * the arcs_mtx and give other threads a chance to run
2261		 * before reacquiring the lock.
2262		 */
2263		if (count++ > arc_evict_iterations) {
2264			list_insert_after(list, ab, &marker);
2265			mutex_exit(lock);
2266			kpreempt(KPREEMPT_SYNC);
2267			mutex_enter(lock);
2268			ab_prev = list_prev(list, &marker);
2269			list_remove(list, &marker);
2270			count = 0;
2271			continue;
2272		}
2273		if (mutex_tryenter(hash_lock)) {
2274			ASSERT(!HDR_IO_IN_PROGRESS(ab));
2275			ASSERT(ab->b_buf == NULL);
2276			ARCSTAT_BUMP(arcstat_deleted);
2277			bytes_deleted += ab->b_size;
2278
2279			if (ab->b_l2hdr != NULL) {
2280				/*
2281				 * This buffer is cached on the 2nd Level ARC;
2282				 * don't destroy the header.
2283				 */
2284				arc_change_state(arc_l2c_only, ab, hash_lock);
2285				mutex_exit(hash_lock);
2286			} else {
2287				arc_change_state(arc_anon, ab, hash_lock);
2288				mutex_exit(hash_lock);
2289				arc_hdr_destroy(ab);
2290			}
2291
2292			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
2293			if (bytes >= 0 && bytes_deleted >= bytes)
2294				break;
2295		} else if (bytes < 0) {
2296			/*
2297			 * Insert a list marker and then wait for the
2298			 * hash lock to become available. Once it's
2299			 * available, restart from where we left off.
2300			 */
2301			list_insert_after(list, ab, &marker);
2302			mutex_exit(lock);
2303			mutex_enter(hash_lock);
2304			mutex_exit(hash_lock);
2305			mutex_enter(lock);
2306			ab_prev = list_prev(list, &marker);
2307			list_remove(list, &marker);
2308		} else {
2309			bufs_skipped += 1;
2310		}
2311
2312	}
2313	mutex_exit(lock);
2314	idx  = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1));
2315	lists++;
2316
2317	if (lists < list_count)
2318		goto evict_start;
2319
2320	evict_offset = idx;
2321	if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] &&
2322	    (bytes < 0 || bytes_deleted < bytes)) {
2323		list_start = &state->arcs_lists[0];
2324		list_count = ARC_BUFC_NUMMETADATALISTS;
2325		offset = lists = 0;
2326		goto evict_start;
2327	}
2328
2329	if (bufs_skipped) {
2330		ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
2331		ASSERT(bytes >= 0);
2332	}
2333
2334	if (bytes_deleted < bytes)
2335		dprintf("only deleted %lld bytes from %p",
2336		    (longlong_t)bytes_deleted, state);
2337}
2338
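/*
 * Shrink the ARC back toward its targets: evict from the MRU until
 * anon + MRU (+ metadata) fits under arc_p, evict from the MFU until the
 * total fits under arc_c, and then trim the ghost lists so that
 * MRU + MRU-ghost and MRU-ghost + MFU-ghost each stay within arc_c.
 */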
2339static void
2340arc_adjust(void)
2341{
2342	int64_t adjustment, delta;
2343
2344	/*
2345	 * Adjust MRU size
2346	 */
2347
2348	adjustment = MIN((int64_t)(arc_size - arc_c),
2349	    (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
2350	    arc_p));
2351
2352	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
2353		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
2354		(void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
2355		adjustment -= delta;
2356	}
2357
2358	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2359		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
2360		(void) arc_evict(arc_mru, 0, delta, FALSE,
2361		    ARC_BUFC_METADATA);
2362	}
2363
2364	/*
2365	 * Adjust MFU size
2366	 */
2367
2368	adjustment = arc_size - arc_c;
2369
2370	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
2371		delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
2372		(void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
2373		adjustment -= delta;
2374	}
2375
2376	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2377		int64_t delta = MIN(adjustment,
2378		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
2379		(void) arc_evict(arc_mfu, 0, delta, FALSE,
2380		    ARC_BUFC_METADATA);
2381	}
2382
2383	/*
2384	 * Adjust ghost lists
2385	 */
2386
2387	adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2388
2389	if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2390		delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2391		arc_evict_ghost(arc_mru_ghost, 0, delta);
2392	}
2393
2394	adjustment =
2395	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2396
2397	if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2398		delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2399		arc_evict_ghost(arc_mfu_ghost, 0, delta);
2400	}
2401}
2402
2403static void
2404arc_do_user_evicts(void)
2405{
2406	static arc_buf_t *tmp_arc_eviction_list;
2407
2408	/*
2409	 * Move the list over to avoid a lock order reversal (LOR).
2410	 */
2411restart:
2412	mutex_enter(&arc_eviction_mtx);
2413	tmp_arc_eviction_list = arc_eviction_list;
2414	arc_eviction_list = NULL;
2415	mutex_exit(&arc_eviction_mtx);
2416
2417	while (tmp_arc_eviction_list != NULL) {
2418		arc_buf_t *buf = tmp_arc_eviction_list;
2419		tmp_arc_eviction_list = buf->b_next;
2420		mutex_enter(&buf->b_evict_lock);
2421		buf->b_hdr = NULL;
2422		mutex_exit(&buf->b_evict_lock);
2423
2424		if (buf->b_efunc != NULL)
2425			VERIFY0(buf->b_efunc(buf->b_private));
2426
2427		buf->b_efunc = NULL;
2428		buf->b_private = NULL;
2429		kmem_cache_free(buf_cache, buf);
2430	}
2431
2432	if (arc_eviction_list != NULL)
2433		goto restart;
2434}
2435
2436/*
2437 * Flush all *evictable* data from the cache for the given spa.
2438 * NOTE: this will not touch "active" (i.e. referenced) data.
2439 */
2440void
2441arc_flush(spa_t *spa)
2442{
2443	uint64_t guid = 0;
2444
2445	if (spa)
2446		guid = spa_load_guid(spa);
2447
2448	while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) {
2449		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
2450		if (spa)
2451			break;
2452	}
2453	while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) {
2454		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
2455		if (spa)
2456			break;
2457	}
2458	while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) {
2459		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
2460		if (spa)
2461			break;
2462	}
2463	while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) {
2464		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
2465		if (spa)
2466			break;
2467	}
2468
2469	arc_evict_ghost(arc_mru_ghost, guid, -1);
2470	arc_evict_ghost(arc_mfu_ghost, guid, -1);
2471
2472	mutex_enter(&arc_reclaim_thr_lock);
2473	arc_do_user_evicts();
2474	mutex_exit(&arc_reclaim_thr_lock);
2475	ASSERT(spa || arc_eviction_list == NULL);
2476}
2477
2478void
2479arc_shrink(void)
2480{
2481
2482	if (arc_c > arc_c_min) {
2483		uint64_t to_free;
2484
2485		to_free = arc_c >> arc_shrink_shift;
2486		DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t,
2487			arc_c_min, uint64_t, arc_p, uint64_t, to_free);
2492		if (arc_c > arc_c_min + to_free)
2493			atomic_add_64(&arc_c, -to_free);
2494		else
2495			arc_c = arc_c_min;
2496
2497		atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2498		if (arc_c > arc_size)
2499			arc_c = MAX(arc_size, arc_c_min);
2500		if (arc_p > arc_c)
2501			arc_p = (arc_c >> 1);
2502
2503		DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t,
2504			arc_p);
2505
2506		ASSERT(arc_c >= arc_c_min);
2507		ASSERT((int64_t)arc_p >= 0);
2508	}
2509
2510	if (arc_size > arc_c) {
2511		DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size,
2512			uint64_t, arc_c);
2513		arc_adjust();
2514	}
2515}
2516
2517static int needfree = 0;
2518
2519static int
2520arc_reclaim_needed(void)
2521{
2522
2523#ifdef _KERNEL
2524
2525	if (needfree) {
2526		DTRACE_PROBE(arc__reclaim_needfree);
2527		return (1);
2528	}
2529
2530	/*
2531	 * Cooperate with pagedaemon when it's time for it to scan
2532	 * and reclaim some pages.
2533	 */
2534	if (freemem < zfs_arc_free_target) {
2535		DTRACE_PROBE2(arc__reclaim_freemem, uint64_t,
2536		    freemem, uint64_t, zfs_arc_free_target);
2537		return (1);
2538	}
2539
2540#ifdef sun
2541	/*
2542	 * take 'desfree' extra pages, so we reclaim sooner, rather than later
2543	 */
2544	uint64_t extra = desfree;
2545
2546	/*
2547	 * check that we're out of range of the pageout scanner.  It starts to
2548	 * schedule paging if freemem is less than lotsfree and needfree.
2549	 * lotsfree is the high-water mark for pageout, and needfree is the
2550	 * number of needed free pages.  We add extra pages here to make sure
2551	 * the scanner doesn't start up while we're freeing memory.
2552	 */
2553	if (freemem < lotsfree + needfree + extra)
2554		return (1);
2555
2556	/*
2557	 * check to make sure that swapfs has enough space so that anon
2558	 * reservations can still succeed. anon_resvmem() checks that the
2559	 * availrmem is greater than swapfs_minfree, and the number of reserved
2560	 * swap pages.  We also add a bit of extra here just to prevent
2561	 * circumstances from getting really dire.
2562	 */
2563	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
2564		return (1);
2565
2566	/*
2567	 * Check that we have enough availrmem that memory locking (e.g., via
2568	 * mlock(3C) or memcntl(2)) can still succeed.  (pages_pp_maximum
2569	 * stores the number of pages that cannot be locked; when availrmem
2570	 * drops below pages_pp_maximum, page locking mechanisms such as
2571	 * page_pp_lock() will fail.)
2572	 */
2573	if (availrmem <= pages_pp_maximum)
2574		return (1);
2575
2576#endif	/* sun */
2577#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
2578	/*
2579	 * If we're on an i386 platform, it's possible that we'll exhaust the
2580	 * kernel heap space before we ever run out of available physical
2581	 * memory.  Most checks of the size of the heap_area compare against
2582	 * tune.t_minarmem, which is the minimum available real memory that we
2583	 * can have in the system.  However, this is generally fixed at 25 pages
2584	 * which is so low that it's useless.  In this comparison, we seek to
2585	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
2586	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
2587	 * free)
2588	 */
2589	if (vmem_size(heap_arena, VMEM_FREE) <
2590	    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2)) {
2591		DTRACE_PROBE2(arc__reclaim_used, uint64_t,
2592		    vmem_size(heap_arena, VMEM_FREE), uint64_t,
2593		    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2);
2594		return (1);
2595	}
2596#endif
2597#ifdef sun
2598	/*
2599	 * If zio data pages are being allocated out of a separate heap segment,
2600	 * then enforce that the size of available vmem for this arena remains
2601	 * above about 1/16th free.
2602	 *
2603	 * Note: The 1/16th arena free requirement was put in place
2604	 * to aggressively evict memory from the arc in order to avoid
2605	 * memory fragmentation issues.
2606	 */
2607	if (zio_arena != NULL &&
2608	    vmem_size(zio_arena, VMEM_FREE) <
2609	    (vmem_size(zio_arena, VMEM_ALLOC) >> 4))
2610		return (1);
2611#endif	/* sun */
2612#else	/* _KERNEL */
2613	if (spa_get_random(100) == 0)
2614		return (1);
2615#endif	/* _KERNEL */
2616	DTRACE_PROBE(arc__reclaim_no);
2617
2618	return (0);
2619}
2620
2621extern kmem_cache_t	*zio_buf_cache[];
2622extern kmem_cache_t	*zio_data_buf_cache[];
2623extern kmem_cache_t	*range_seg_cache;
2624
2625static void __noinline
2626arc_kmem_reap_now(arc_reclaim_strategy_t strat)
2627{
2628	size_t			i;
2629	kmem_cache_t		*prev_cache = NULL;
2630	kmem_cache_t		*prev_data_cache = NULL;
2631
2632	DTRACE_PROBE(arc__kmem_reap_start);
2633#ifdef _KERNEL
2634	if (arc_meta_used >= arc_meta_limit) {
2635		/*
2636		 * We are exceeding our meta-data cache limit.
2637		 * Purge some DNLC entries to release holds on meta-data.
2638		 */
2639		dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
2640	}
2641#if defined(__i386)
2642	/*
2643	 * Reclaim unused memory from all kmem caches.
2644	 */
2645	kmem_reap();
2646#endif
2647#endif
2648
2649	/*
2650	 * An aggressive reclamation will shrink the cache size as well as
2651	 * reap free buffers from the arc kmem caches.
2652	 */
2653	if (strat == ARC_RECLAIM_AGGR)
2654		arc_shrink();
2655
2656	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2657		if (zio_buf_cache[i] != prev_cache) {
2658			prev_cache = zio_buf_cache[i];
2659			kmem_cache_reap_now(zio_buf_cache[i]);
2660		}
2661		if (zio_data_buf_cache[i] != prev_data_cache) {
2662			prev_data_cache = zio_data_buf_cache[i];
2663			kmem_cache_reap_now(zio_data_buf_cache[i]);
2664		}
2665	}
2666	kmem_cache_reap_now(buf_cache);
2667	kmem_cache_reap_now(hdr_cache);
2668	kmem_cache_reap_now(range_seg_cache);
2669
2670#ifdef sun
2671	/*
2672	 * Ask the vmem arena to reclaim unused memory from its
2673	 * quantum caches.
2674	 */
2675	if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
2676		vmem_qcache_reap(zio_arena);
2677#endif
2678	DTRACE_PROBE(arc__kmem_reap_end);
2679}
2680
2681static void
2682arc_reclaim_thread(void *dummy __unused)
2683{
2684	clock_t			growtime = 0;
2685	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
2686	callb_cpr_t		cpr;
2687
2688	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2689
2690	mutex_enter(&arc_reclaim_thr_lock);
2691	while (arc_thread_exit == 0) {
2692		if (arc_reclaim_needed()) {
2693
2694			if (arc_no_grow) {
2695				if (last_reclaim == ARC_RECLAIM_CONS) {
2696					DTRACE_PROBE(arc__reclaim_aggr_no_grow);
2697					last_reclaim = ARC_RECLAIM_AGGR;
2698				} else {
2699					last_reclaim = ARC_RECLAIM_CONS;
2700				}
2701			} else {
2702				arc_no_grow = TRUE;
2703				last_reclaim = ARC_RECLAIM_AGGR;
2704				DTRACE_PROBE(arc__reclaim_aggr);
2705				membar_producer();
2706			}
2707
2708			/* reset the growth delay for every reclaim */
2709			growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
2710
2711			if (needfree && last_reclaim == ARC_RECLAIM_CONS) {
2712				/*
2713				 * If needfree is TRUE our vm_lowmem hook
2714				 * was called and in that case we must free some
2715				 * memory, so switch to aggressive mode.
2716				 */
2717				arc_no_grow = TRUE;
2718				last_reclaim = ARC_RECLAIM_AGGR;
2719			}
2720			arc_kmem_reap_now(last_reclaim);
2721			arc_warm = B_TRUE;
2722
2723		} else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
2724			arc_no_grow = FALSE;
2725		}
2726
2727		arc_adjust();
2728
2729		if (arc_eviction_list != NULL)
2730			arc_do_user_evicts();
2731
2732#ifdef _KERNEL
2733		if (needfree) {
2734			needfree = 0;
2735			wakeup(&needfree);
2736		}
2737#endif
2738
2739		/* block until needed, or one second, whichever is shorter */
2740		CALLB_CPR_SAFE_BEGIN(&cpr);
2741		(void) cv_timedwait(&arc_reclaim_thr_cv,
2742		    &arc_reclaim_thr_lock, hz);
2743		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2744	}
2745
2746	arc_thread_exit = 0;
2747	cv_broadcast(&arc_reclaim_thr_cv);
2748	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
2749	thread_exit();
2750}
2751
2752/*
2753 * Adapt arc info given the number of bytes we are trying to add and
2754 * the state that we are coming from.  This function is only called
2755 * when we are adding new content to the cache.
2756 */
2757static void
2758arc_adapt(int bytes, arc_state_t *state)
2759{
2760	int mult;
2761	uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
2762
2763	if (state == arc_l2c_only)
2764		return;
2765
2766	ASSERT(bytes > 0);
2767	/*
2768	 * Adapt the target size of the MRU list:
2769	 *	- if we just hit in the MRU ghost list, then increase
2770	 *	  the target size of the MRU list.
2771	 *	- if we just hit in the MFU ghost list, then increase
2772	 *	  the target size of the MFU list by decreasing the
2773	 *	  target size of the MRU list.
2774	 */
2775	if (state == arc_mru_ghost) {
2776		mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2777		    1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2778		mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
2779
2780		arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
2781	} else if (state == arc_mfu_ghost) {
2782		uint64_t delta;
2783
2784		mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2785		    1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2786		mult = MIN(mult, 10);
2787
2788		delta = MIN(bytes * mult, arc_p);
2789		arc_p = MAX(arc_p_min, arc_p - delta);
2790	}
2791	ASSERT((int64_t)arc_p >= 0);
2792
2793	if (arc_reclaim_needed()) {
2794		cv_signal(&arc_reclaim_thr_cv);
2795		return;
2796	}
2797
2798	if (arc_no_grow)
2799		return;
2800
2801	if (arc_c >= arc_c_max)
2802		return;
2803
2804	/*
2805	 * If we're within (2 * maxblocksize) bytes of the target
2806	 * cache size, increment the target cache size
2807	 */
2808	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2809		DTRACE_PROBE1(arc__inc_adapt, int, bytes);
2810		atomic_add_64(&arc_c, (int64_t)bytes);
2811		if (arc_c > arc_c_max)
2812			arc_c = arc_c_max;
2813		else if (state == arc_anon)
2814			atomic_add_64(&arc_p, (int64_t)bytes);
2815		if (arc_p > arc_c)
2816			arc_p = arc_c;
2817	}
2818	ASSERT((int64_t)arc_p >= 0);
2819}
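
/*
 * Illustrative worked example, not part of the original source: suppose
 * arc_mru_ghost holds 1 GB and arc_mfu_ghost holds 4 GB.  A hit in the MRU
 * ghost list then uses mult = 4 (the 4 GB / 1 GB ratio, capped at 10), so
 * arc_p grows by 4 * bytes, clamped to arc_c - arc_p_min.  A hit in the MFU
 * ghost list with the same sizes would use mult = 1 and shrink arc_p by
 * bytes, but never below arc_p_min.
 */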
2820
2821/*
2822 * Check if the cache has reached its limits and eviction is required
2823 * prior to insert.
2824 */
2825static int
2826arc_evict_needed(arc_buf_contents_t type)
2827{
2828	if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2829		return (1);
2830
2831	if (arc_reclaim_needed())
2832		return (1);
2833
2834	return (arc_size > arc_c);
2835}
2836
2837/*
2838 * The buffer, supplied as the first argument, needs a data block.
2839 * So, if we are at cache max, determine which cache should be victimized.
2840 * We have the following cases:
2841 *
2842 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2843 * In this situation if we're out of space, but the resident size of the MFU is
2844 * under the limit, victimize the MFU cache to satisfy this insertion request.
2845 *
2846 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2847 * Here, we've used up all of the available space for the MRU, so we need to
2848 * evict from our own cache instead.  Evict from the set of resident MRU
2849 * entries.
2850 *
2851 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2852 * c minus p represents the MFU space in the cache, since p is the size of the
2853 * cache that is dedicated to the MRU.  In this situation there's still space on
2854 * the MFU side, so the MRU side needs to be victimized.
2855 *
2856 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2857 * MFU's resident set is consuming more space than it has been allotted.  In
2858 * this situation, we must victimize our own cache, the MFU, for this insertion.
2859 */
2860static void
2861arc_get_data_buf(arc_buf_t *buf)
2862{
2863	arc_state_t		*state = buf->b_hdr->b_state;
2864	uint64_t		size = buf->b_hdr->b_size;
2865	arc_buf_contents_t	type = buf->b_hdr->b_type;
2866
2867	arc_adapt(size, state);
2868
2869	/*
2870	 * We have not yet reached cache maximum size,
2871	 * just allocate a new buffer.
2872	 */
2873	if (!arc_evict_needed(type)) {
2874		if (type == ARC_BUFC_METADATA) {
2875			buf->b_data = zio_buf_alloc(size);
2876			arc_space_consume(size, ARC_SPACE_DATA);
2877		} else {
2878			ASSERT(type == ARC_BUFC_DATA);
2879			buf->b_data = zio_data_buf_alloc(size);
2880			ARCSTAT_INCR(arcstat_data_size, size);
2881			atomic_add_64(&arc_size, size);
2882		}
2883		goto out;
2884	}
2885
2886	/*
2887	 * If we are prefetching from the mfu ghost list, this buffer
2888	 * will end up on the mru list; so steal space from there.
2889	 */
2890	if (state == arc_mfu_ghost)
2891		state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2892	else if (state == arc_mru_ghost)
2893		state = arc_mru;
2894
2895	if (state == arc_mru || state == arc_anon) {
2896		uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2897		state = (arc_mfu->arcs_lsize[type] >= size &&
2898		    arc_p > mru_used) ? arc_mfu : arc_mru;
2899	} else {
2900		/* MFU cases */
2901		uint64_t mfu_space = arc_c - arc_p;
2902		state =  (arc_mru->arcs_lsize[type] >= size &&
2903		    mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2904	}
2905	if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
2906		if (type == ARC_BUFC_METADATA) {
2907			buf->b_data = zio_buf_alloc(size);
2908			arc_space_consume(size, ARC_SPACE_DATA);
2909		} else {
2910			ASSERT(type == ARC_BUFC_DATA);
2911			buf->b_data = zio_data_buf_alloc(size);
2912			ARCSTAT_INCR(arcstat_data_size, size);
2913			atomic_add_64(&arc_size, size);
2914		}
2915		ARCSTAT_BUMP(arcstat_recycle_miss);
2916	}
2917	ASSERT(buf->b_data != NULL);
2918out:
2919	/*
2920	 * Update the state size.  Note that ghost states have a
2921	 * "ghost size" and so don't need to be updated.
2922	 */
2923	if (!GHOST_STATE(buf->b_hdr->b_state)) {
2924		arc_buf_hdr_t *hdr = buf->b_hdr;
2925
2926		atomic_add_64(&hdr->b_state->arcs_size, size);
2927		if (list_link_active(&hdr->b_arc_node)) {
2928			ASSERT(refcount_is_zero(&hdr->b_refcnt));
2929			atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2930		}
2931		/*
2932		 * If we are growing the cache, and we are adding anonymous
2933		 * data, and we have outgrown arc_p, update arc_p
2934		 */
2935		if (arc_size < arc_c && hdr->b_state == arc_anon &&
2936		    arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2937			arc_p = MIN(arc_c, arc_p + size);
2938	}
2939	ARCSTAT_BUMP(arcstat_allocated);
2940}
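
/*
 * Illustrative sketch, not part of the original source: a simplified,
 * stand-alone restatement of the four victimization cases described above
 * arc_get_data_buf().  It deliberately ignores the arcs_lsize checks the
 * real code performs; the function name is hypothetical.
 */
#ifdef ARC_EXAMPLES	/* hypothetical guard; examples are never compiled */
static arc_state_t *
arc_victim_state_example(arc_state_t *state)
{
	if (state == arc_mru || state == arc_anon) {
		/* Cases 1 and 2: inserting on the MRU side. */
		uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;

		return ((arc_p > mru_used) ? arc_mfu : arc_mru);
	}
	/* Cases 3 and 4: inserting on the MFU side. */
	return ((arc_c - arc_p > arc_mfu->arcs_size) ? arc_mru : arc_mfu);
}
#endif	/* ARC_EXAMPLES */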
2941
2942/*
2943 * This routine is called whenever a buffer is accessed.
2944 * NOTE: the hash lock is dropped in this function.
2945 */
2946static void
2947arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2948{
2949	clock_t now;
2950
2951	ASSERT(MUTEX_HELD(hash_lock));
2952
2953	if (buf->b_state == arc_anon) {
2954		/*
2955		 * This buffer is not in the cache, and does not
2956		 * appear in our "ghost" list.  Add the new buffer
2957		 * to the MRU state.
2958		 */
2959
2960		ASSERT(buf->b_arc_access == 0);
2961		buf->b_arc_access = ddi_get_lbolt();
2962		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2963		arc_change_state(arc_mru, buf, hash_lock);
2964
2965	} else if (buf->b_state == arc_mru) {
2966		now = ddi_get_lbolt();
2967
2968		/*
2969		 * If this buffer is here because of a prefetch, then either:
2970		 * - clear the flag if this is a "referencing" read
2971		 *   (any subsequent access will bump this into the MFU state).
2972		 * or
2973		 * - move the buffer to the head of the list if this is
2974		 *   another prefetch (to make it less likely to be evicted).
2975		 */
2976		if ((buf->b_flags & ARC_PREFETCH) != 0) {
2977			if (refcount_count(&buf->b_refcnt) == 0) {
2978				ASSERT(list_link_active(&buf->b_arc_node));
2979			} else {
2980				buf->b_flags &= ~ARC_PREFETCH;
2981				ARCSTAT_BUMP(arcstat_mru_hits);
2982			}
2983			buf->b_arc_access = now;
2984			return;
2985		}
2986
2987		/*
2988		 * This buffer has been "accessed" only once so far,
2989		 * but it is still in the cache. Move it to the MFU
2990		 * state.
2991		 */
2992		if (now > buf->b_arc_access + ARC_MINTIME) {
2993			/*
2994			 * More than 125ms have passed since we
2995			 * instantiated this buffer.  Move it to the
2996			 * most frequently used state.
2997			 */
2998			buf->b_arc_access = now;
2999			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
3000			arc_change_state(arc_mfu, buf, hash_lock);
3001		}
3002		ARCSTAT_BUMP(arcstat_mru_hits);
3003	} else if (buf->b_state == arc_mru_ghost) {
3004		arc_state_t	*new_state;
3005		/*
3006		 * This buffer has been "accessed" recently, but
3007		 * was evicted from the cache.  Move it to the
3008		 * MFU state.
3009		 */
3010
3011		if (buf->b_flags & ARC_PREFETCH) {
3012			new_state = arc_mru;
3013			if (refcount_count(&buf->b_refcnt) > 0)
3014				buf->b_flags &= ~ARC_PREFETCH;
3015			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
3016		} else {
3017			new_state = arc_mfu;
3018			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
3019		}
3020
3021		buf->b_arc_access = ddi_get_lbolt();
3022		arc_change_state(new_state, buf, hash_lock);
3023
3024		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
3025	} else if (buf->b_state == arc_mfu) {
3026		/*
3027		 * This buffer has been accessed more than once and is
3028		 * still in the cache.  Keep it in the MFU state.
3029		 *
3030		 * NOTE: an add_reference() that occurred when we did
3031		 * the arc_read() will have kicked this off the list.
3032		 * If it was a prefetch, we will explicitly move it to
3033		 * the head of the list now.
3034		 */
3035		if ((buf->b_flags & ARC_PREFETCH) != 0) {
3036			ASSERT(refcount_count(&buf->b_refcnt) == 0);
3037			ASSERT(list_link_active(&buf->b_arc_node));
3038		}
3039		ARCSTAT_BUMP(arcstat_mfu_hits);
3040		buf->b_arc_access = ddi_get_lbolt();
3041	} else if (buf->b_state == arc_mfu_ghost) {
3042		arc_state_t	*new_state = arc_mfu;
3043		/*
3044		 * This buffer has been accessed more than once but has
3045		 * been evicted from the cache.  Move it back to the
3046		 * MFU state.
3047		 */
3048
3049		if (buf->b_flags & ARC_PREFETCH) {
3050			/*
3051			 * This is a prefetch access...
3052			 * move this block back to the MRU state.
3053			 */
3054			ASSERT0(refcount_count(&buf->b_refcnt));
3055			new_state = arc_mru;
3056		}
3057
3058		buf->b_arc_access = ddi_get_lbolt();
3059		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
3060		arc_change_state(new_state, buf, hash_lock);
3061
3062		ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
3063	} else if (buf->b_state == arc_l2c_only) {
3064		/*
3065		 * This buffer is on the 2nd Level ARC.
3066		 */
3067
3068		buf->b_arc_access = ddi_get_lbolt();
3069		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
3070		arc_change_state(arc_mfu, buf, hash_lock);
3071	} else {
3072		ASSERT(!"invalid arc state");
3073	}
3074}
3075
3076/* a generic arc_done_func_t which you can use */
3077/* ARGSUSED */
3078void
3079arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
3080{
3081	if (zio == NULL || zio->io_error == 0)
3082		bcopy(buf->b_data, arg, buf->b_hdr->b_size);
3083	VERIFY(arc_buf_remove_ref(buf, arg));
3084}
3085
3086/* a generic arc_done_func_t */
3087void
3088arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
3089{
3090	arc_buf_t **bufp = arg;
3091	if (zio && zio->io_error) {
3092		VERIFY(arc_buf_remove_ref(buf, arg));
3093		*bufp = NULL;
3094	} else {
3095		*bufp = buf;
3096		ASSERT(buf->b_data);
3097	}
3098}
3099
3100static void
3101arc_read_done(zio_t *zio)
3102{
3103	arc_buf_hdr_t	*hdr;
3104	arc_buf_t	*buf;
3105	arc_buf_t	*abuf;	/* buffer we're assigning to callback */
3106	kmutex_t	*hash_lock = NULL;
3107	arc_callback_t	*callback_list, *acb;
3108	int		freeable = FALSE;
3109
3110	buf = zio->io_private;
3111	hdr = buf->b_hdr;
3112
3113	/*
	 * The hdr was inserted into the hash table and removed from the lists
3115	 * prior to starting I/O.  We should find this header, since
3116	 * it's in the hash table, and it should be legit since it's
3117	 * not possible to evict it during the I/O.  The only possible
3118	 * reason for it not to be found is if we were freed during the
3119	 * read.
3120	 */
3121	if (HDR_IN_HASH_TABLE(hdr)) {
3122		ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
3123		ASSERT3U(hdr->b_dva.dva_word[0], ==,
3124		    BP_IDENTITY(zio->io_bp)->dva_word[0]);
3125		ASSERT3U(hdr->b_dva.dva_word[1], ==,
3126		    BP_IDENTITY(zio->io_bp)->dva_word[1]);
3127
3128		arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp,
3129		    &hash_lock);
3130
3131		ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) &&
3132		    hash_lock == NULL) ||
3133		    (found == hdr &&
3134		    DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
3135		    (found == hdr && HDR_L2_READING(hdr)));
3136	}
3137
3138	hdr->b_flags &= ~ARC_L2_EVICTED;
3139	if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
3140		hdr->b_flags &= ~ARC_L2CACHE;
3141
3142	/* byteswap if necessary */
3143	callback_list = hdr->b_acb;
3144	ASSERT(callback_list != NULL);
3145	if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
3146		dmu_object_byteswap_t bswap =
3147		    DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
3148		arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
3149		    byteswap_uint64_array :
3150		    dmu_ot_byteswap[bswap].ob_func;
3151		func(buf->b_data, hdr->b_size);
3152	}
3153
3154	arc_cksum_compute(buf, B_FALSE);
3155#ifdef illumos
3156	arc_buf_watch(buf);
3157#endif /* illumos */
3158
3159	if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
3160		/*
3161		 * Only call arc_access on anonymous buffers.  This is because
3162		 * if we've issued an I/O for an evicted buffer, we've already
3163		 * called arc_access (to prevent any simultaneous readers from
3164		 * getting confused).
3165		 */
3166		arc_access(hdr, hash_lock);
3167	}
3168
3169	/* create copies of the data buffer for the callers */
3170	abuf = buf;
3171	for (acb = callback_list; acb; acb = acb->acb_next) {
3172		if (acb->acb_done) {
3173			if (abuf == NULL) {
3174				ARCSTAT_BUMP(arcstat_duplicate_reads);
3175				abuf = arc_buf_clone(buf);
3176			}
3177			acb->acb_buf = abuf;
3178			abuf = NULL;
3179		}
3180	}
3181	hdr->b_acb = NULL;
3182	hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3183	ASSERT(!HDR_BUF_AVAILABLE(hdr));
3184	if (abuf == buf) {
3185		ASSERT(buf->b_efunc == NULL);
3186		ASSERT(hdr->b_datacnt == 1);
3187		hdr->b_flags |= ARC_BUF_AVAILABLE;
3188	}
3189
3190	ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
3191
3192	if (zio->io_error != 0) {
3193		hdr->b_flags |= ARC_IO_ERROR;
3194		if (hdr->b_state != arc_anon)
3195			arc_change_state(arc_anon, hdr, hash_lock);
3196		if (HDR_IN_HASH_TABLE(hdr))
3197			buf_hash_remove(hdr);
3198		freeable = refcount_is_zero(&hdr->b_refcnt);
3199	}
3200
3201	/*
3202	 * Broadcast before we drop the hash_lock to avoid the possibility
3203	 * that the hdr (and hence the cv) might be freed before we get to
3204	 * the cv_broadcast().
3205	 */
3206	cv_broadcast(&hdr->b_cv);
3207
3208	if (hash_lock) {
3209		mutex_exit(hash_lock);
3210	} else {
3211		/*
3212		 * This block was freed while we waited for the read to
3213		 * complete.  It has been removed from the hash table and
3214		 * moved to the anonymous state (so that it won't show up
3215		 * in the cache).
3216		 */
3217		ASSERT3P(hdr->b_state, ==, arc_anon);
3218		freeable = refcount_is_zero(&hdr->b_refcnt);
3219	}
3220
3221	/* execute each callback and free its structure */
3222	while ((acb = callback_list) != NULL) {
3223		if (acb->acb_done)
3224			acb->acb_done(zio, acb->acb_buf, acb->acb_private);
3225
3226		if (acb->acb_zio_dummy != NULL) {
3227			acb->acb_zio_dummy->io_error = zio->io_error;
3228			zio_nowait(acb->acb_zio_dummy);
3229		}
3230
3231		callback_list = acb->acb_next;
3232		kmem_free(acb, sizeof (arc_callback_t));
3233	}
3234
3235	if (freeable)
3236		arc_hdr_destroy(hdr);
3237}
3238
3239/*
 * "Read" the block at the specified DVA (in bp) via the
3241 * cache.  If the block is found in the cache, invoke the provided
3242 * callback immediately and return.  Note that the `zio' parameter
3243 * in the callback will be NULL in this case, since no IO was
3244 * required.  If the block is not in the cache pass the read request
3245 * on to the spa with a substitute callback function, so that the
3246 * requested block will be added to the cache.
3247 *
 * If a read request arrives for a block that has an in-progress read,
3249 * either wait for the in-progress read to complete (and return the
3250 * results); or, if this is a read with a "done" func, add a record
3251 * to the read to invoke the "done" func when the read completes,
3252 * and return; or just return.
3253 *
3254 * arc_read_done() will invoke all the requested "done" functions
3255 * for readers of this block.
3256 */
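/*
 * Illustrative usage sketch (not part of this file): a synchronous, cached
 * read using the generic arc_getbuf_func() callback defined above.  The
 * priority and flags shown are typical choices, not requirements:
 *
 *	uint32_t aflags = ARC_WAIT;
 *	arc_buf_t *abuf = NULL;
 *	int err = arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
 *	if (err == 0 && abuf != NULL) {
 *		... use abuf->b_data ...
 *		(void) arc_buf_remove_ref(abuf, &abuf);
 *	}
 */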
3257int
3258arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
3259    void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags,
3260    const zbookmark_phys_t *zb)
3261{
3262	arc_buf_hdr_t *hdr = NULL;
3263	arc_buf_t *buf = NULL;
3264	kmutex_t *hash_lock = NULL;
3265	zio_t *rzio;
3266	uint64_t guid = spa_load_guid(spa);
3267
3268	ASSERT(!BP_IS_EMBEDDED(bp) ||
3269	    BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
3270
3271top:
3272	if (!BP_IS_EMBEDDED(bp)) {
3273		/*
		 * Embedded BPs have no DVA and require no I/O to "read";
		 * an anonymous arc buf is created below to back them, so
		 * only do the hash lookup for non-embedded BPs.
3276		 */
3277		hdr = buf_hash_find(guid, bp, &hash_lock);
3278	}
3279
3280	if (hdr != NULL && hdr->b_datacnt > 0) {
3281
3282		*arc_flags |= ARC_CACHED;
3283
3284		if (HDR_IO_IN_PROGRESS(hdr)) {
3285
3286			if (*arc_flags & ARC_WAIT) {
3287				cv_wait(&hdr->b_cv, hash_lock);
3288				mutex_exit(hash_lock);
3289				goto top;
3290			}
3291			ASSERT(*arc_flags & ARC_NOWAIT);
3292
3293			if (done) {
3294				arc_callback_t	*acb = NULL;
3295
3296				acb = kmem_zalloc(sizeof (arc_callback_t),
3297				    KM_SLEEP);
3298				acb->acb_done = done;
3299				acb->acb_private = private;
3300				if (pio != NULL)
3301					acb->acb_zio_dummy = zio_null(pio,
3302					    spa, NULL, NULL, NULL, zio_flags);
3303
3304				ASSERT(acb->acb_done != NULL);
3305				acb->acb_next = hdr->b_acb;
3306				hdr->b_acb = acb;
3307				add_reference(hdr, hash_lock, private);
3308				mutex_exit(hash_lock);
3309				return (0);
3310			}
3311			mutex_exit(hash_lock);
3312			return (0);
3313		}
3314
3315		ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3316
3317		if (done) {
3318			add_reference(hdr, hash_lock, private);
3319			/*
3320			 * If this block is already in use, create a new
3321			 * copy of the data so that we will be guaranteed
3322			 * that arc_release() will always succeed.
3323			 */
3324			buf = hdr->b_buf;
3325			ASSERT(buf);
3326			ASSERT(buf->b_data);
3327			if (HDR_BUF_AVAILABLE(hdr)) {
3328				ASSERT(buf->b_efunc == NULL);
3329				hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3330			} else {
3331				buf = arc_buf_clone(buf);
3332			}
3333
3334		} else if (*arc_flags & ARC_PREFETCH &&
3335		    refcount_count(&hdr->b_refcnt) == 0) {
3336			hdr->b_flags |= ARC_PREFETCH;
3337		}
3338		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
3339		arc_access(hdr, hash_lock);
3340		if (*arc_flags & ARC_L2CACHE)
3341			hdr->b_flags |= ARC_L2CACHE;
3342		if (*arc_flags & ARC_L2COMPRESS)
3343			hdr->b_flags |= ARC_L2COMPRESS;
3344		mutex_exit(hash_lock);
3345		ARCSTAT_BUMP(arcstat_hits);
3346		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3347		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3348		    data, metadata, hits);
3349
3350		if (done)
3351			done(NULL, buf, private);
3352	} else {
3353		uint64_t size = BP_GET_LSIZE(bp);
3354		arc_callback_t *acb;
3355		vdev_t *vd = NULL;
3356		uint64_t addr = 0;
3357		boolean_t devw = B_FALSE;
3358		enum zio_compress b_compress = ZIO_COMPRESS_OFF;
3359		uint64_t b_asize = 0;
3360
3361		if (hdr == NULL) {
3362			/* this block is not in the cache */
3363			arc_buf_hdr_t *exists = NULL;
3364			arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
3365			buf = arc_buf_alloc(spa, size, private, type);
3366			hdr = buf->b_hdr;
3367			if (!BP_IS_EMBEDDED(bp)) {
3368				hdr->b_dva = *BP_IDENTITY(bp);
3369				hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
3370				hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
3371				exists = buf_hash_insert(hdr, &hash_lock);
3372			}
3373			if (exists != NULL) {
3374				/* somebody beat us to the hash insert */
3375				mutex_exit(hash_lock);
3376				buf_discard_identity(hdr);
3377				(void) arc_buf_remove_ref(buf, private);
3378				goto top; /* restart the IO request */
3379			}
3380			/* if this is a prefetch, we don't have a reference */
3381			if (*arc_flags & ARC_PREFETCH) {
3382				(void) remove_reference(hdr, hash_lock,
3383				    private);
3384				hdr->b_flags |= ARC_PREFETCH;
3385			}
3386			if (*arc_flags & ARC_L2CACHE)
3387				hdr->b_flags |= ARC_L2CACHE;
3388			if (*arc_flags & ARC_L2COMPRESS)
3389				hdr->b_flags |= ARC_L2COMPRESS;
3390			if (BP_GET_LEVEL(bp) > 0)
3391				hdr->b_flags |= ARC_INDIRECT;
3392		} else {
3393			/* this block is in the ghost cache */
3394			ASSERT(GHOST_STATE(hdr->b_state));
3395			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3396			ASSERT0(refcount_count(&hdr->b_refcnt));
3397			ASSERT(hdr->b_buf == NULL);
3398
3399			/* if this is a prefetch, we don't have a reference */
3400			if (*arc_flags & ARC_PREFETCH)
3401				hdr->b_flags |= ARC_PREFETCH;
3402			else
3403				add_reference(hdr, hash_lock, private);
3404			if (*arc_flags & ARC_L2CACHE)
3405				hdr->b_flags |= ARC_L2CACHE;
3406			if (*arc_flags & ARC_L2COMPRESS)
3407				hdr->b_flags |= ARC_L2COMPRESS;
3408			buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
3409			buf->b_hdr = hdr;
3410			buf->b_data = NULL;
3411			buf->b_efunc = NULL;
3412			buf->b_private = NULL;
3413			buf->b_next = NULL;
3414			hdr->b_buf = buf;
3415			ASSERT(hdr->b_datacnt == 0);
3416			hdr->b_datacnt = 1;
3417			arc_get_data_buf(buf);
3418			arc_access(hdr, hash_lock);
3419		}
3420
3421		ASSERT(!GHOST_STATE(hdr->b_state));
3422
3423		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
3424		acb->acb_done = done;
3425		acb->acb_private = private;
3426
3427		ASSERT(hdr->b_acb == NULL);
3428		hdr->b_acb = acb;
3429		hdr->b_flags |= ARC_IO_IN_PROGRESS;
3430
3431		if (hdr->b_l2hdr != NULL &&
3432		    (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
3433			devw = hdr->b_l2hdr->b_dev->l2ad_writing;
3434			addr = hdr->b_l2hdr->b_daddr;
3435			b_compress = hdr->b_l2hdr->b_compress;
3436			b_asize = hdr->b_l2hdr->b_asize;
3437			/*
3438			 * Lock out device removal.
3439			 */
3440			if (vdev_is_dead(vd) ||
3441			    !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
3442				vd = NULL;
3443		}
3444
3445		if (hash_lock != NULL)
3446			mutex_exit(hash_lock);
3447
3448		/*
3449		 * At this point, we have a level 1 cache miss.  Try again in
3450		 * L2ARC if possible.
3451		 */
3452		ASSERT3U(hdr->b_size, ==, size);
3453		DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
3454		    uint64_t, size, zbookmark_phys_t *, zb);
3455		ARCSTAT_BUMP(arcstat_misses);
3456		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3457		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3458		    data, metadata, misses);
3459#ifdef _KERNEL
3460		curthread->td_ru.ru_inblock++;
3461#endif
3462
3463		if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
3464			/*
3465			 * Read from the L2ARC if the following are true:
3466			 * 1. The L2ARC vdev was previously cached.
3467			 * 2. This buffer still has L2ARC metadata.
3468			 * 3. This buffer isn't currently writing to the L2ARC.
3469			 * 4. The L2ARC entry wasn't evicted, which may
3470			 *    also have invalidated the vdev.
			 * 5. This isn't a prefetch while l2arc_noprefetch is enabled.
3472			 */
3473			if (hdr->b_l2hdr != NULL &&
3474			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
3475			    !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
3476				l2arc_read_callback_t *cb;
3477
3478				DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
3479				ARCSTAT_BUMP(arcstat_l2_hits);
3480
3481				cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
3482				    KM_SLEEP);
3483				cb->l2rcb_buf = buf;
3484				cb->l2rcb_spa = spa;
3485				cb->l2rcb_bp = *bp;
3486				cb->l2rcb_zb = *zb;
3487				cb->l2rcb_flags = zio_flags;
3488				cb->l2rcb_compress = b_compress;
3489
3490				ASSERT(addr >= VDEV_LABEL_START_SIZE &&
3491				    addr + size < vd->vdev_psize -
3492				    VDEV_LABEL_END_SIZE);
3493
3494				/*
3495				 * l2arc read.  The SCL_L2ARC lock will be
3496				 * released by l2arc_read_done().
3497				 * Issue a null zio if the underlying buffer
3498				 * was squashed to zero size by compression.
3499				 */
3500				if (b_compress == ZIO_COMPRESS_EMPTY) {
3501					rzio = zio_null(pio, spa, vd,
3502					    l2arc_read_done, cb,
3503					    zio_flags | ZIO_FLAG_DONT_CACHE |
3504					    ZIO_FLAG_CANFAIL |
3505					    ZIO_FLAG_DONT_PROPAGATE |
3506					    ZIO_FLAG_DONT_RETRY);
3507				} else {
3508					rzio = zio_read_phys(pio, vd, addr,
3509					    b_asize, buf->b_data,
3510					    ZIO_CHECKSUM_OFF,
3511					    l2arc_read_done, cb, priority,
3512					    zio_flags | ZIO_FLAG_DONT_CACHE |
3513					    ZIO_FLAG_CANFAIL |
3514					    ZIO_FLAG_DONT_PROPAGATE |
3515					    ZIO_FLAG_DONT_RETRY, B_FALSE);
3516				}
3517				DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
3518				    zio_t *, rzio);
3519				ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize);
3520
3521				if (*arc_flags & ARC_NOWAIT) {
3522					zio_nowait(rzio);
3523					return (0);
3524				}
3525
3526				ASSERT(*arc_flags & ARC_WAIT);
3527				if (zio_wait(rzio) == 0)
3528					return (0);
3529
3530				/* l2arc read error; goto zio_read() */
3531			} else {
3532				DTRACE_PROBE1(l2arc__miss,
3533				    arc_buf_hdr_t *, hdr);
3534				ARCSTAT_BUMP(arcstat_l2_misses);
3535				if (HDR_L2_WRITING(hdr))
3536					ARCSTAT_BUMP(arcstat_l2_rw_clash);
3537				spa_config_exit(spa, SCL_L2ARC, vd);
3538			}
3539		} else {
3540			if (vd != NULL)
3541				spa_config_exit(spa, SCL_L2ARC, vd);
3542			if (l2arc_ndev != 0) {
3543				DTRACE_PROBE1(l2arc__miss,
3544				    arc_buf_hdr_t *, hdr);
3545				ARCSTAT_BUMP(arcstat_l2_misses);
3546			}
3547		}
3548
3549		rzio = zio_read(pio, spa, bp, buf->b_data, size,
3550		    arc_read_done, buf, priority, zio_flags, zb);
3551
3552		if (*arc_flags & ARC_WAIT)
3553			return (zio_wait(rzio));
3554
3555		ASSERT(*arc_flags & ARC_NOWAIT);
3556		zio_nowait(rzio);
3557	}
3558	return (0);
3559}
3560
3561void
3562arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
3563{
3564	ASSERT(buf->b_hdr != NULL);
3565	ASSERT(buf->b_hdr->b_state != arc_anon);
3566	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
3567	ASSERT(buf->b_efunc == NULL);
3568	ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
3569
3570	buf->b_efunc = func;
3571	buf->b_private = private;
3572}
3573
3574/*
3575 * Notify the arc that a block was freed, and thus will never be used again.
3576 */
3577void
3578arc_freed(spa_t *spa, const blkptr_t *bp)
3579{
3580	arc_buf_hdr_t *hdr;
3581	kmutex_t *hash_lock;
3582	uint64_t guid = spa_load_guid(spa);
3583
3584	ASSERT(!BP_IS_EMBEDDED(bp));
3585
3586	hdr = buf_hash_find(guid, bp, &hash_lock);
3587	if (hdr == NULL)
3588		return;
3589	if (HDR_BUF_AVAILABLE(hdr)) {
3590		arc_buf_t *buf = hdr->b_buf;
3591		add_reference(hdr, hash_lock, FTAG);
3592		hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3593		mutex_exit(hash_lock);
3594
3595		arc_release(buf, FTAG);
3596		(void) arc_buf_remove_ref(buf, FTAG);
3597	} else {
3598		mutex_exit(hash_lock);
	}
}
3602
3603/*
3604 * Clear the user eviction callback set by arc_set_callback(), first calling
 * it if it exists.  Because the presence of a callback keeps an arc_buf cached,
 * clearing the callback may result in the arc_buf being destroyed.  However,
3607 * it will not result in the *last* arc_buf being destroyed, hence the data
3608 * will remain cached in the ARC. We make a copy of the arc buffer here so
3609 * that we can process the callback without holding any locks.
3610 *
3611 * It's possible that the callback is already in the process of being cleared
 * by another thread.  In this case we cannot clear the callback.
3613 *
3614 * Returns B_TRUE if the callback was successfully called and cleared.
3615 */
3616boolean_t
3617arc_clear_callback(arc_buf_t *buf)
3618{
3619	arc_buf_hdr_t *hdr;
3620	kmutex_t *hash_lock;
3621	arc_evict_func_t *efunc = buf->b_efunc;
3622	void *private = buf->b_private;
3623	list_t *list, *evicted_list;
3624	kmutex_t *lock, *evicted_lock;
3625
3626	mutex_enter(&buf->b_evict_lock);
3627	hdr = buf->b_hdr;
3628	if (hdr == NULL) {
3629		/*
3630		 * We are in arc_do_user_evicts().
3631		 */
3632		ASSERT(buf->b_data == NULL);
3633		mutex_exit(&buf->b_evict_lock);
3634		return (B_FALSE);
3635	} else if (buf->b_data == NULL) {
3636		/*
3637		 * We are on the eviction list; process this buffer now
3638		 * but let arc_do_user_evicts() do the reaping.
3639		 */
3640		buf->b_efunc = NULL;
3641		mutex_exit(&buf->b_evict_lock);
3642		VERIFY0(efunc(private));
3643		return (B_TRUE);
3644	}
3645	hash_lock = HDR_LOCK(hdr);
3646	mutex_enter(hash_lock);
3647	hdr = buf->b_hdr;
3648	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3649
3650	ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
3651	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3652
3653	buf->b_efunc = NULL;
3654	buf->b_private = NULL;
3655
3656	if (hdr->b_datacnt > 1) {
3657		mutex_exit(&buf->b_evict_lock);
3658		arc_buf_destroy(buf, FALSE, TRUE);
3659	} else {
3660		ASSERT(buf == hdr->b_buf);
3661		hdr->b_flags |= ARC_BUF_AVAILABLE;
3662		mutex_exit(&buf->b_evict_lock);
3663	}
3664
3665	mutex_exit(hash_lock);
3666	VERIFY0(efunc(private));
3667	return (B_TRUE);
3668}
3669
3670/*
3671 * Release this buffer from the cache, making it an anonymous buffer.  This
3672 * must be done after a read and prior to modifying the buffer contents.
3673 * If the buffer has more than one reference, we must make
3674 * a new hdr for the buffer.
3675 */
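/*
 * Typical usage sketch (illustrative): detach a previously read buffer from
 * its cached identity before overwriting its contents:
 *
 *	arc_release(abuf, tag);			(abuf is now anonymous)
 *	bcopy(new_data, abuf->b_data, abuf->b_hdr->b_size);
 *	... the buffer is later written out via arc_write() ...
 */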
3676void
3677arc_release(arc_buf_t *buf, void *tag)
3678{
3679	arc_buf_hdr_t *hdr;
3680	kmutex_t *hash_lock = NULL;
3681	l2arc_buf_hdr_t *l2hdr;
3682	uint64_t buf_size;
3683
3684	/*
3685	 * It would be nice to assert that if it's DMU metadata (level >
3686	 * 0 || it's the dnode file), then it must be syncing context.
3687	 * But we don't know that information at this level.
3688	 */
3689
3690	mutex_enter(&buf->b_evict_lock);
3691	hdr = buf->b_hdr;
3692
3693	/* this buffer is not on any list */
3694	ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3695
3696	if (hdr->b_state == arc_anon) {
3697		/* this buffer is already released */
3698		ASSERT(buf->b_efunc == NULL);
3699	} else {
3700		hash_lock = HDR_LOCK(hdr);
3701		mutex_enter(hash_lock);
3702		hdr = buf->b_hdr;
3703		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3704	}
3705
3706	l2hdr = hdr->b_l2hdr;
3707	if (l2hdr) {
3708		mutex_enter(&l2arc_buflist_mtx);
3709		arc_buf_l2_cdata_free(hdr);
3710		hdr->b_l2hdr = NULL;
3711		list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3712	}
3713	buf_size = hdr->b_size;
3714
3715	/*
3716	 * Do we have more than one buf?
3717	 */
3718	if (hdr->b_datacnt > 1) {
3719		arc_buf_hdr_t *nhdr;
3720		arc_buf_t **bufp;
3721		uint64_t blksz = hdr->b_size;
3722		uint64_t spa = hdr->b_spa;
3723		arc_buf_contents_t type = hdr->b_type;
3724		uint32_t flags = hdr->b_flags;
3725
3726		ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3727		/*
3728		 * Pull the data off of this hdr and attach it to
3729		 * a new anonymous hdr.
3730		 */
3731		(void) remove_reference(hdr, hash_lock, tag);
3732		bufp = &hdr->b_buf;
3733		while (*bufp != buf)
3734			bufp = &(*bufp)->b_next;
3735		*bufp = buf->b_next;
3736		buf->b_next = NULL;
3737
3738		ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
3739		atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3740		if (refcount_is_zero(&hdr->b_refcnt)) {
3741			uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
3742			ASSERT3U(*size, >=, hdr->b_size);
3743			atomic_add_64(size, -hdr->b_size);
3744		}
3745
3746		/*
		 * We're releasing a duplicate user data buffer; update
3748		 * our statistics accordingly.
3749		 */
3750		if (hdr->b_type == ARC_BUFC_DATA) {
3751			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
3752			ARCSTAT_INCR(arcstat_duplicate_buffers_size,
3753			    -hdr->b_size);
3754		}
3755		hdr->b_datacnt -= 1;
3756		arc_cksum_verify(buf);
3757#ifdef illumos
3758		arc_buf_unwatch(buf);
3759#endif /* illumos */
3760
3761		mutex_exit(hash_lock);
3762
3763		nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3764		nhdr->b_size = blksz;
3765		nhdr->b_spa = spa;
3766		nhdr->b_type = type;
3767		nhdr->b_buf = buf;
3768		nhdr->b_state = arc_anon;
3769		nhdr->b_arc_access = 0;
3770		nhdr->b_flags = flags & ARC_L2_WRITING;
3771		nhdr->b_l2hdr = NULL;
3772		nhdr->b_datacnt = 1;
3773		nhdr->b_freeze_cksum = NULL;
3774		(void) refcount_add(&nhdr->b_refcnt, tag);
3775		buf->b_hdr = nhdr;
3776		mutex_exit(&buf->b_evict_lock);
3777		atomic_add_64(&arc_anon->arcs_size, blksz);
3778	} else {
3779		mutex_exit(&buf->b_evict_lock);
3780		ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3781		ASSERT(!list_link_active(&hdr->b_arc_node));
3782		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3783		if (hdr->b_state != arc_anon)
3784			arc_change_state(arc_anon, hdr, hash_lock);
3785		hdr->b_arc_access = 0;
3786		if (hash_lock)
3787			mutex_exit(hash_lock);
3788
3789		buf_discard_identity(hdr);
3790		arc_buf_thaw(buf);
3791	}
3792	buf->b_efunc = NULL;
3793	buf->b_private = NULL;
3794
3795	if (l2hdr) {
3796		ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
3797		vdev_space_update(l2hdr->b_dev->l2ad_vdev,
3798		    -l2hdr->b_asize, 0, 0);
3799		trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr,
3800		    hdr->b_size, 0);
3801		kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3802		ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3803		mutex_exit(&l2arc_buflist_mtx);
3804	}
3805}
3806
3807int
3808arc_released(arc_buf_t *buf)
3809{
3810	int released;
3811
3812	mutex_enter(&buf->b_evict_lock);
3813	released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3814	mutex_exit(&buf->b_evict_lock);
3815	return (released);
3816}
3817
3818#ifdef ZFS_DEBUG
3819int
3820arc_referenced(arc_buf_t *buf)
3821{
3822	int referenced;
3823
3824	mutex_enter(&buf->b_evict_lock);
3825	referenced = (refcount_count(&buf->b_hdr->b_refcnt));
3826	mutex_exit(&buf->b_evict_lock);
3827	return (referenced);
3828}
3829#endif
3830
3831static void
3832arc_write_ready(zio_t *zio)
3833{
3834	arc_write_callback_t *callback = zio->io_private;
3835	arc_buf_t *buf = callback->awcb_buf;
3836	arc_buf_hdr_t *hdr = buf->b_hdr;
3837
3838	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3839	callback->awcb_ready(zio, buf, callback->awcb_private);
3840
3841	/*
3842	 * If the IO is already in progress, then this is a re-write
3843	 * attempt, so we need to thaw and re-compute the cksum.
3844	 * It is the responsibility of the callback to handle the
3845	 * accounting for any re-write attempt.
3846	 */
3847	if (HDR_IO_IN_PROGRESS(hdr)) {
3848		mutex_enter(&hdr->b_freeze_lock);
3849		if (hdr->b_freeze_cksum != NULL) {
3850			kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3851			hdr->b_freeze_cksum = NULL;
3852		}
3853		mutex_exit(&hdr->b_freeze_lock);
3854	}
3855	arc_cksum_compute(buf, B_FALSE);
3856	hdr->b_flags |= ARC_IO_IN_PROGRESS;
3857}
3858
3859/*
3860 * The SPA calls this callback for each physical write that happens on behalf
3861 * of a logical write.  See the comment in dbuf_write_physdone() for details.
3862 */
3863static void
3864arc_write_physdone(zio_t *zio)
3865{
3866	arc_write_callback_t *cb = zio->io_private;
3867	if (cb->awcb_physdone != NULL)
3868		cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
3869}
3870
3871static void
3872arc_write_done(zio_t *zio)
3873{
3874	arc_write_callback_t *callback = zio->io_private;
3875	arc_buf_t *buf = callback->awcb_buf;
3876	arc_buf_hdr_t *hdr = buf->b_hdr;
3877
3878	ASSERT(hdr->b_acb == NULL);
3879
3880	if (zio->io_error == 0) {
3881		if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
3882			buf_discard_identity(hdr);
3883		} else {
3884			hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3885			hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3886			hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3887		}
3888	} else {
3889		ASSERT(BUF_EMPTY(hdr));
3890	}
3891
3892	/*
3893	 * If the block to be written was all-zero or compressed enough to be
3894	 * embedded in the BP, no write was performed so there will be no
3895	 * dva/birth/checksum.  The buffer must therefore remain anonymous
3896	 * (and uncached).
3897	 */
3898	if (!BUF_EMPTY(hdr)) {
3899		arc_buf_hdr_t *exists;
3900		kmutex_t *hash_lock;
3901
3902		ASSERT(zio->io_error == 0);
3903
3904		arc_cksum_verify(buf);
3905
3906		exists = buf_hash_insert(hdr, &hash_lock);
3907		if (exists) {
3908			/*
3909			 * This can only happen if we overwrite for
3910			 * sync-to-convergence, because we remove
3911			 * buffers from the hash table when we arc_free().
3912			 */
3913			if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
3914				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3915					panic("bad overwrite, hdr=%p exists=%p",
3916					    (void *)hdr, (void *)exists);
3917				ASSERT(refcount_is_zero(&exists->b_refcnt));
3918				arc_change_state(arc_anon, exists, hash_lock);
3919				mutex_exit(hash_lock);
3920				arc_hdr_destroy(exists);
3921				exists = buf_hash_insert(hdr, &hash_lock);
3922				ASSERT3P(exists, ==, NULL);
3923			} else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
3924				/* nopwrite */
3925				ASSERT(zio->io_prop.zp_nopwrite);
3926				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3927					panic("bad nopwrite, hdr=%p exists=%p",
3928					    (void *)hdr, (void *)exists);
3929			} else {
3930				/* Dedup */
3931				ASSERT(hdr->b_datacnt == 1);
3932				ASSERT(hdr->b_state == arc_anon);
3933				ASSERT(BP_GET_DEDUP(zio->io_bp));
3934				ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
3935			}
3936		}
3937		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3938		/* if it's not anon, we are doing a scrub */
3939		if (!exists && hdr->b_state == arc_anon)
3940			arc_access(hdr, hash_lock);
3941		mutex_exit(hash_lock);
3942	} else {
3943		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3944	}
3945
3946	ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3947	callback->awcb_done(zio, buf, callback->awcb_private);
3948
3949	kmem_free(callback, sizeof (arc_write_callback_t));
3950}
3951
3952zio_t *
3953arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3954    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
3955    const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
3956    arc_done_func_t *done, void *private, zio_priority_t priority,
3957    int zio_flags, const zbookmark_phys_t *zb)
3958{
3959	arc_buf_hdr_t *hdr = buf->b_hdr;
3960	arc_write_callback_t *callback;
3961	zio_t *zio;
3962
3963	ASSERT(ready != NULL);
3964	ASSERT(done != NULL);
3965	ASSERT(!HDR_IO_ERROR(hdr));
3966	ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3967	ASSERT(hdr->b_acb == NULL);
3968	if (l2arc)
3969		hdr->b_flags |= ARC_L2CACHE;
3970	if (l2arc_compress)
3971		hdr->b_flags |= ARC_L2COMPRESS;
3972	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3973	callback->awcb_ready = ready;
3974	callback->awcb_physdone = physdone;
3975	callback->awcb_done = done;
3976	callback->awcb_private = private;
3977	callback->awcb_buf = buf;
3978
3979	zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
3980	    arc_write_ready, arc_write_physdone, arc_write_done, callback,
3981	    priority, zio_flags, zb);
3982
3983	return (zio);
3984}
3985
3986static int
3987arc_memory_throttle(uint64_t reserve, uint64_t txg)
3988{
3989#ifdef _KERNEL
3990	uint64_t available_memory = ptob(freemem);
3991	static uint64_t page_load = 0;
3992	static uint64_t last_txg = 0;
3993
3994#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
3995	available_memory =
3996	    MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE)));
3997#endif
3998
3999	if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100)
4000		return (0);
4001
4002	if (txg > last_txg) {
4003		last_txg = txg;
4004		page_load = 0;
4005	}
4006	/*
	 * If we are in pageout, we know that memory is already tight
	 * and the ARC is already evicting, so we just want to let
	 * page writes continue as quickly as possible.
4010	 */
4011	if (curproc == pageproc) {
4012		if (page_load > MAX(ptob(minfree), available_memory) / 4)
4013			return (SET_ERROR(ERESTART));
4014		/* Note: reserve is inflated, so we deflate */
4015		page_load += reserve / 8;
4016		return (0);
4017	} else if (page_load > 0 && arc_reclaim_needed()) {
4018		/* memory is low, delay before restarting */
4019		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
4020		return (SET_ERROR(EAGAIN));
4021	}
4022	page_load = 0;
4023#endif
4024	return (0);
4025}
4026
4027void
4028arc_tempreserve_clear(uint64_t reserve)
4029{
4030	atomic_add_64(&arc_tempreserve, -reserve);
4031	ASSERT((int64_t)arc_tempreserve >= 0);
4032}
4033
4034int
4035arc_tempreserve_space(uint64_t reserve, uint64_t txg)
4036{
4037	int error;
4038	uint64_t anon_size;
4039
4040	if (reserve > arc_c/4 && !arc_no_grow) {
4041		arc_c = MIN(arc_c_max, reserve * 4);
4042		DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c);
4043	}
4044	if (reserve > arc_c)
4045		return (SET_ERROR(ENOMEM));
4046
4047	/*
	 * Don't count loaned bufs as in-flight dirty data to prevent long
4049	 * network delays from blocking transactions that are ready to be
4050	 * assigned to a txg.
4051	 */
4052	anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
4053
4054	/*
	 * Writes will almost always require additional memory allocations
	 * in order to compress/encrypt/etc. the data.  We therefore need to
4057	 * make sure that there is sufficient available memory for this.
4058	 */
4059	error = arc_memory_throttle(reserve, txg);
4060	if (error != 0)
4061		return (error);
4062
4063	/*
4064	 * Throttle writes when the amount of dirty data in the cache
4065	 * gets too large.  We try to keep the cache less than half full
4066	 * of dirty blocks so that our sync times don't grow too large.
4067	 * Note: if two requests come in concurrently, we might let them
4068	 * both succeed, when one of them should fail.  Not a huge deal.
4069	 */
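	/*
	 * Worked example with illustrative numbers: with arc_c = 4GB, a new
	 * reservation is rejected with ERESTART once reserve +
	 * arc_tempreserve + anon_size exceeds 2GB while anon_size alone
	 * exceeds 1GB.
	 */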
4070
4071	if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
4072	    anon_size > arc_c / 4) {
4073		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
4074		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
4075		    arc_tempreserve>>10,
4076		    arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
4077		    arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
4078		    reserve>>10, arc_c>>10);
4079		return (SET_ERROR(ERESTART));
4080	}
4081	atomic_add_64(&arc_tempreserve, reserve);
4082	return (0);
4083}
4084
4085static kmutex_t arc_lowmem_lock;
4086#ifdef _KERNEL
4087static eventhandler_tag arc_event_lowmem = NULL;
4088
4089static void
4090arc_lowmem(void *arg __unused, int howto __unused)
4091{
4092
4093	/* Serialize access via arc_lowmem_lock. */
4094	mutex_enter(&arc_lowmem_lock);
4095	mutex_enter(&arc_reclaim_thr_lock);
4096	needfree = 1;
4097	DTRACE_PROBE(arc__needfree);
4098	cv_signal(&arc_reclaim_thr_cv);
4099
4100	/*
	 * It is unsafe to block here in arbitrary threads, because we can come
	 * here from ARC itself and may hold ARC locks, and thus risk a deadlock
	 * with the ARC reclaim thread.
4104	 */
4105	if (curproc == pageproc) {
4106		while (needfree)
4107			msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0);
4108	}
4109	mutex_exit(&arc_reclaim_thr_lock);
4110	mutex_exit(&arc_lowmem_lock);
4111}
4112#endif
4113
4114void
4115arc_init(void)
4116{
4117	int i, prefetch_tunable_set = 0;
4118
4119	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
4120	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
4121	mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL);
4122
4123	/* Convert seconds to clock ticks */
4124	arc_min_prefetch_lifespan = 1 * hz;
4125
4126	/* Start out with 1/8 of all memory */
4127	arc_c = kmem_size() / 8;
4128
4129#ifdef sun
4130#ifdef _KERNEL
4131	/*
4132	 * On architectures where the physical memory can be larger
4133	 * than the addressable space (intel in 32-bit mode), we may
4134	 * need to limit the cache to 1/8 of VM size.
4135	 */
4136	arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
4137#endif
4138#endif	/* sun */
4139	/* set min cache to 1/32 of all memory, or 16MB, whichever is more */
4140	arc_c_min = MAX(arc_c / 4, 64<<18);
	/* set max to 5/8 of all memory, or all but 1GB, whichever is more */
4142	if (arc_c * 8 >= 1<<30)
4143		arc_c_max = (arc_c * 8) - (1<<30);
4144	else
4145		arc_c_max = arc_c_min;
4146	arc_c_max = MAX(arc_c * 5, arc_c_max);
4147
4148#ifdef _KERNEL
4149	/*
	 * Allow the tunables to override our calculations if they are
	 * reasonable (i.e. over 16MB).
4152	 */
4153	if (zfs_arc_max > 64<<18 && zfs_arc_max < kmem_size())
4154		arc_c_max = zfs_arc_max;
4155	if (zfs_arc_min > 64<<18 && zfs_arc_min <= arc_c_max)
4156		arc_c_min = zfs_arc_min;
4157#endif
4158
4159	arc_c = arc_c_max;
4160	arc_p = (arc_c >> 1);
4161
4162	/* limit meta-data to 1/4 of the arc capacity */
4163	arc_meta_limit = arc_c_max / 4;
4164
4165	/* Allow the tunable to override if it is reasonable */
4166	if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
4167		arc_meta_limit = zfs_arc_meta_limit;
4168
4169	if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
4170		arc_c_min = arc_meta_limit / 2;
4171
4172	if (zfs_arc_grow_retry > 0)
4173		arc_grow_retry = zfs_arc_grow_retry;
4174
4175	if (zfs_arc_shrink_shift > 0)
4176		arc_shrink_shift = zfs_arc_shrink_shift;
4177
4178	if (zfs_arc_p_min_shift > 0)
4179		arc_p_min_shift = zfs_arc_p_min_shift;
4180
	/* if kmem_flags are set, let's try to use less memory */
4182	if (kmem_debugging())
4183		arc_c = arc_c / 2;
4184	if (arc_c < arc_c_min)
4185		arc_c = arc_c_min;
4186
4187	zfs_arc_min = arc_c_min;
4188	zfs_arc_max = arc_c_max;
4189
4190	arc_anon = &ARC_anon;
4191	arc_mru = &ARC_mru;
4192	arc_mru_ghost = &ARC_mru_ghost;
4193	arc_mfu = &ARC_mfu;
4194	arc_mfu_ghost = &ARC_mfu_ghost;
4195	arc_l2c_only = &ARC_l2c_only;
4196	arc_size = 0;
4197
4198	for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
4199		mutex_init(&arc_anon->arcs_locks[i].arcs_lock,
4200		    NULL, MUTEX_DEFAULT, NULL);
4201		mutex_init(&arc_mru->arcs_locks[i].arcs_lock,
4202		    NULL, MUTEX_DEFAULT, NULL);
4203		mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock,
4204		    NULL, MUTEX_DEFAULT, NULL);
4205		mutex_init(&arc_mfu->arcs_locks[i].arcs_lock,
4206		    NULL, MUTEX_DEFAULT, NULL);
4207		mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock,
4208		    NULL, MUTEX_DEFAULT, NULL);
4209		mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock,
4210		    NULL, MUTEX_DEFAULT, NULL);
4211
4212		list_create(&arc_mru->arcs_lists[i],
4213		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4214		list_create(&arc_mru_ghost->arcs_lists[i],
4215		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4216		list_create(&arc_mfu->arcs_lists[i],
4217		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4218		list_create(&arc_mfu_ghost->arcs_lists[i],
4219		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4222		list_create(&arc_l2c_only->arcs_lists[i],
4223		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4224	}
4225
4226	buf_init();
4227
4228	arc_thread_exit = 0;
4229	arc_eviction_list = NULL;
4230	mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
4231	bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
4232
4233	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
4234	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
4235
4236	if (arc_ksp != NULL) {
4237		arc_ksp->ks_data = &arc_stats;
4238		kstat_install(arc_ksp);
4239	}
4240
4241	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
4242	    TS_RUN, minclsyspri);
4243
4244#ifdef _KERNEL
4245	arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
4246	    EVENTHANDLER_PRI_FIRST);
4247#endif
4248
4249	arc_dead = FALSE;
4250	arc_warm = B_FALSE;
4251
4252	/*
4253	 * Calculate maximum amount of dirty data per pool.
4254	 *
4255	 * If it has been set by /etc/system, take that.
4256	 * Otherwise, use a percentage of physical memory defined by
4257	 * zfs_dirty_data_max_percent (default 10%) with a cap at
4258	 * zfs_dirty_data_max_max (default 4GB).
4259	 */
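	/*
	 * For example (illustrative numbers): on a machine with 32GB of
	 * physical memory and the default zfs_dirty_data_max_percent of 10,
	 * this yields roughly 3.2GB, which is below the 4GB default cap.
	 */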
4260	if (zfs_dirty_data_max == 0) {
4261		zfs_dirty_data_max = ptob(physmem) *
4262		    zfs_dirty_data_max_percent / 100;
4263		zfs_dirty_data_max = MIN(zfs_dirty_data_max,
4264		    zfs_dirty_data_max_max);
4265	}
4266
4267#ifdef _KERNEL
4268	if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable))
4269		prefetch_tunable_set = 1;
4270
4271#ifdef __i386__
4272	if (prefetch_tunable_set == 0) {
4273		printf("ZFS NOTICE: Prefetch is disabled by default on i386 "
4274		    "-- to enable,\n");
4275		printf("            add \"vfs.zfs.prefetch_disable=0\" "
4276		    "to /boot/loader.conf.\n");
4277		zfs_prefetch_disable = 1;
4278	}
4279#else
4280	if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) &&
4281	    prefetch_tunable_set == 0) {
4282		printf("ZFS NOTICE: Prefetch is disabled by default if less "
4283		    "than 4GB of RAM is present;\n"
4284		    "            to enable, add \"vfs.zfs.prefetch_disable=0\" "
4285		    "to /boot/loader.conf.\n");
4286		zfs_prefetch_disable = 1;
4287	}
4288#endif
4289	/* Warn about ZFS memory and address space requirements. */
4290	if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
4291		printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
4292		    "expect unstable behavior.\n");
4293	}
4294	if (kmem_size() < 512 * (1 << 20)) {
4295		printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
4296		    "expect unstable behavior.\n");
4297		printf("             Consider tuning vm.kmem_size and "
4298		    "vm.kmem_size_max\n");
4299		printf("             in /boot/loader.conf.\n");
4300	}
4301#endif
4302}
4303
4304void
4305arc_fini(void)
4306{
4307	int i;
4308
4309	mutex_enter(&arc_reclaim_thr_lock);
4310	arc_thread_exit = 1;
4311	cv_signal(&arc_reclaim_thr_cv);
4312	while (arc_thread_exit != 0)
4313		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
4314	mutex_exit(&arc_reclaim_thr_lock);
4315
4316	arc_flush(NULL);
4317
4318	arc_dead = TRUE;
4319
4320	if (arc_ksp != NULL) {
4321		kstat_delete(arc_ksp);
4322		arc_ksp = NULL;
4323	}
4324
4325	mutex_destroy(&arc_eviction_mtx);
4326	mutex_destroy(&arc_reclaim_thr_lock);
4327	cv_destroy(&arc_reclaim_thr_cv);
4328
4329	for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
4330		list_destroy(&arc_mru->arcs_lists[i]);
4331		list_destroy(&arc_mru_ghost->arcs_lists[i]);
4332		list_destroy(&arc_mfu->arcs_lists[i]);
4333		list_destroy(&arc_mfu_ghost->arcs_lists[i]);
4334		list_destroy(&arc_l2c_only->arcs_lists[i]);
4335
4336		mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock);
4337		mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock);
4338		mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock);
4339		mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock);
4340		mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock);
4341		mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock);
4342	}
4343
4344	buf_fini();
4345
4346	ASSERT(arc_loaned_bytes == 0);
4347
4348	mutex_destroy(&arc_lowmem_lock);
4349#ifdef _KERNEL
4350	if (arc_event_lowmem != NULL)
4351		EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
4352#endif
4353}
4354
4355/*
4356 * Level 2 ARC
4357 *
4358 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
4359 * It uses dedicated storage devices to hold cached data, which are populated
4360 * using large infrequent writes.  The main role of this cache is to boost
4361 * the performance of random read workloads.  The intended L2ARC devices
4362 * include short-stroked disks, solid state disks, and other media with
4363 * substantially faster read latency than disk.
4364 *
4365 *                 +-----------------------+
4366 *                 |         ARC           |
4367 *                 +-----------------------+
4368 *                    |         ^     ^
4369 *                    |         |     |
4370 *      l2arc_feed_thread()    arc_read()
4371 *                    |         |     |
4372 *                    |  l2arc read   |
4373 *                    V         |     |
4374 *               +---------------+    |
4375 *               |     L2ARC     |    |
4376 *               +---------------+    |
4377 *                   |    ^           |
4378 *          l2arc_write() |           |
4379 *                   |    |           |
4380 *                   V    |           |
4381 *                 +-------+      +-------+
4382 *                 | vdev  |      | vdev  |
4383 *                 | cache |      | cache |
4384 *                 +-------+      +-------+
4385 *                 +=========+     .-----.
4386 *                 :  L2ARC  :    |-_____-|
4387 *                 : devices :    | Disks |
4388 *                 +=========+    `-_____-'
4389 *
4390 * Read requests are satisfied from the following sources, in order:
4391 *
4392 *	1) ARC
4393 *	2) vdev cache of L2ARC devices
4394 *	3) L2ARC devices
4395 *	4) vdev cache of disks
4396 *	5) disks
4397 *
4398 * Some L2ARC device types exhibit extremely slow write performance.
 * To accommodate this, there are some significant differences between
4400 * the L2ARC and traditional cache design:
4401 *
4402 * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
4403 * the ARC behave as usual, freeing buffers and placing headers on ghost
4404 * lists.  The ARC does not send buffers to the L2ARC during eviction as
4405 * this would add inflated write latencies for all ARC memory pressure.
4406 *
4407 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
4408 * It does this by periodically scanning buffers from the eviction-end of
4409 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
 * not already there.  It scans until a headroom of buffers is satisfied,
 * which itself acts as a cushion against ARC eviction.  If a compressible
 * buffer is found during scanning and selected for writing to an L2ARC
 * device, we
4413 * temporarily boost scanning headroom during the next scan cycle to make
4414 * sure we adapt to compression effects (which might significantly reduce
4415 * the data volume we write to L2ARC). The thread that does this is
4416 * l2arc_feed_thread(), illustrated below; example sizes are included to
4417 * provide a better sense of ratio than this diagram:
4418 *
4419 *	       head -->                        tail
4420 *	        +---------------------+----------+
4421 *	ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
4422 *	        +---------------------+----------+   |   o L2ARC eligible
4423 *	ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
4424 *	        +---------------------+----------+   |
4425 *	             15.9 Gbytes      ^ 32 Mbytes    |
4426 *	                           headroom          |
4427 *	                                      l2arc_feed_thread()
4428 *	                                             |
4429 *	                 l2arc write hand <--[oooo]--'
4430 *	                         |           8 Mbyte
4431 *	                         |          write max
4432 *	                         V
4433 *		  +==============================+
4434 *	L2ARC dev |####|#|###|###|    |####| ... |
4435 *	          +==============================+
4436 *	                     32 Gbytes
4437 *
4438 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
4439 * evicted, then the L2ARC has cached a buffer much sooner than it probably
4440 * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
4441 * safe to say that this is an uncommon case, since buffers at the end of
4442 * the ARC lists have moved there due to inactivity.
4443 *
4444 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
4445 * then the L2ARC simply misses copying some buffers.  This serves as a
4446 * pressure valve to prevent heavy read workloads from both stalling the ARC
4447 * with waits and clogging the L2ARC with writes.  This also helps prevent
4448 * the potential for the L2ARC to churn if it attempts to cache content too
4449 * quickly, such as during backups of the entire pool.
4450 *
4451 * 5. After system boot and before the ARC has filled main memory, there are
4452 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
4453 * lists can remain mostly static.  Instead of searching from tail of these
4454 * lists as pictured, the l2arc_feed_thread() will search from the list heads
4455 * for eligible buffers, greatly increasing its chance of finding them.
4456 *
4457 * The L2ARC device write speed is also boosted during this time so that
4458 * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
4459 * there are no L2ARC reads, and no fear of degrading read performance
4460 * through increased writes.
4461 *
4462 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
4463 * the vdev queue can aggregate them into larger and fewer writes.  Each
4464 * device is written to in a rotor fashion, sweeping writes through
4465 * available space then repeating.
4466 *
4467 * 7. The L2ARC does not store dirty content.  It never needs to flush
4468 * write buffers back to disk based storage.
4469 *
4470 * 8. If an ARC buffer is written (and dirtied) which also exists in the
4471 * L2ARC, the now stale L2ARC buffer is immediately dropped.
4472 *
4473 * The performance of the L2ARC can be tweaked by a number of tunables, which
4474 * may be necessary for different workloads:
4475 *
4476 *	l2arc_write_max		max write bytes per interval
4477 *	l2arc_write_boost	extra write bytes during device warmup
4478 *	l2arc_noprefetch	skip caching prefetched buffers
4479 *	l2arc_headroom		number of max device writes to precache
4480 *	l2arc_headroom_boost	when we find compressed buffers during ARC
4481 *				scanning, we multiply headroom by this
4482 *				percentage factor for the next scan cycle,
4483 *				since more compressed buffers are likely to
4484 *				be present
4485 *	l2arc_feed_secs		seconds between L2ARC writing
4486 *
4487 * Tunables may be removed or added as future performance improvements are
4488 * integrated, and also may become zpool properties.
4489 *
4490 * There are three key functions that control how the L2ARC warms up:
4491 *
4492 *	l2arc_write_eligible()	check if a buffer is eligible to cache
4493 *	l2arc_write_size()	calculate how much to write
4494 *	l2arc_write_interval()	calculate sleep delay between writes
4495 *
4496 * These three functions determine what to write, how much, and how quickly
4497 * to send writes.
4498 */
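/*
 * Illustrative FreeBSD tuning sketch (not prescriptive): the tunables above
 * are exposed under the vfs.zfs.* sysctl tree and can be set from
 * /boot/loader.conf, e.g.:
 *
 *	vfs.zfs.l2arc_write_max="16777216"	16MB per feed interval
 *	vfs.zfs.l2arc_write_boost="33554432"	extra 32MB while warming up
 *	vfs.zfs.l2arc_noprefetch="0"		also cache prefetched buffers
 *	vfs.zfs.l2arc_feed_secs="1"		seconds between feed cycles
 */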
4499
4500static boolean_t
4501l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
4502{
4503	/*
4504	 * A buffer is *not* eligible for the L2ARC if it:
4505	 * 1. belongs to a different spa.
4506	 * 2. is already cached on the L2ARC.
4507	 * 3. has an I/O in progress (it may be an incomplete read).
4508	 * 4. is flagged not eligible (zfs property).
4509	 */
4510	if (ab->b_spa != spa_guid) {
4511		ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
4512		return (B_FALSE);
4513	}
4514	if (ab->b_l2hdr != NULL) {
4515		ARCSTAT_BUMP(arcstat_l2_write_in_l2);
4516		return (B_FALSE);
4517	}
4518	if (HDR_IO_IN_PROGRESS(ab)) {
4519		ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
4520		return (B_FALSE);
4521	}
4522	if (!HDR_L2CACHE(ab)) {
4523		ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
4524		return (B_FALSE);
4525	}
4526
4527	return (B_TRUE);
4528}
4529
4530static uint64_t
4531l2arc_write_size(void)
4532{
4533	uint64_t size;
4534
4535	/*
4536	 * Make sure our globals have meaningful values in case the user
4537	 * altered them.
4538	 */
4539	size = l2arc_write_max;
4540	if (size == 0) {
4541		cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
4542		    "be greater than zero, resetting it to the default (%d)",
4543		    L2ARC_WRITE_SIZE);
4544		size = l2arc_write_max = L2ARC_WRITE_SIZE;
4545	}
4546
4547	if (arc_warm == B_FALSE)
4548		size += l2arc_write_boost;
4549
	return (size);
}
4553
4554static clock_t
4555l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
4556{
4557	clock_t interval, next, now;
4558
4559	/*
4560	 * If the ARC lists are busy, increase our write rate; if the
4561	 * lists are stale, idle back.  This is achieved by checking
4562	 * how much we previously wrote - if it was more than half of
4563	 * what we wanted, schedule the next write much sooner.
4564	 */
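	/*
	 * Example, assuming the default l2arc_feed_secs of 1 second,
	 * l2arc_feed_min_ms of 200 and l2arc_feed_again enabled: writing
	 * more than half of the wanted bytes schedules the next feed cycle
	 * roughly 200ms later instead of a full second later.
	 */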
4565	if (l2arc_feed_again && wrote > (wanted / 2))
4566		interval = (hz * l2arc_feed_min_ms) / 1000;
4567	else
4568		interval = hz * l2arc_feed_secs;
4569
4570	now = ddi_get_lbolt();
4571	next = MAX(now, MIN(now + interval, began + interval));
4572
4573	return (next);
4574}
4575
4576static void
4577l2arc_hdr_stat_add(void)
4578{
4579	ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
4580	ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4581}
4582
4583static void
4584l2arc_hdr_stat_remove(void)
4585{
4586	ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
4587	ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4588}
4589
4590/*
4591 * Cycle through L2ARC devices.  This is how L2ARC load balances.
4592 * If a device is returned, this also returns holding the spa config lock.
4593 */
4594static l2arc_dev_t *
4595l2arc_dev_get_next(void)
4596{
4597	l2arc_dev_t *first, *next = NULL;
4598
4599	/*
4600	 * Lock out the removal of spas (spa_namespace_lock), then removal
4601	 * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
4602	 * both locks will be dropped and a spa config lock held instead.
4603	 */
4604	mutex_enter(&spa_namespace_lock);
4605	mutex_enter(&l2arc_dev_mtx);
4606
4607	/* if there are no vdevs, there is nothing to do */
4608	if (l2arc_ndev == 0)
4609		goto out;
4610
4611	first = NULL;
4612	next = l2arc_dev_last;
4613	do {
4614		/* loop around the list looking for a non-faulted vdev */
4615		if (next == NULL) {
4616			next = list_head(l2arc_dev_list);
4617		} else {
4618			next = list_next(l2arc_dev_list, next);
4619			if (next == NULL)
4620				next = list_head(l2arc_dev_list);
4621		}
4622
4623		/* if we have come back to the start, bail out */
4624		if (first == NULL)
4625			first = next;
4626		else if (next == first)
4627			break;
4628
4629	} while (vdev_is_dead(next->l2ad_vdev));
4630
4631	/* if we were unable to find any usable vdevs, return NULL */
4632	if (vdev_is_dead(next->l2ad_vdev))
4633		next = NULL;
4634
4635	l2arc_dev_last = next;
4636
4637out:
4638	mutex_exit(&l2arc_dev_mtx);
4639
4640	/*
4641	 * Grab the config lock to prevent the 'next' device from being
4642	 * removed while we are writing to it.
4643	 */
4644	if (next != NULL)
4645		spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4646	mutex_exit(&spa_namespace_lock);
4647
4648	return (next);
4649}
4650
4651/*
4652 * Free buffers that were tagged for destruction.
4653 */
4654static void
4655l2arc_do_free_on_write()
4656{
4657	list_t *buflist;
4658	l2arc_data_free_t *df, *df_prev;
4659
4660	mutex_enter(&l2arc_free_on_write_mtx);
4661	buflist = l2arc_free_on_write;
4662
4663	for (df = list_tail(buflist); df; df = df_prev) {
4664		df_prev = list_prev(buflist, df);
4665		ASSERT(df->l2df_data != NULL);
4666		ASSERT(df->l2df_func != NULL);
4667		df->l2df_func(df->l2df_data, df->l2df_size);
4668		list_remove(buflist, df);
4669		kmem_free(df, sizeof (l2arc_data_free_t));
4670	}
4671
4672	mutex_exit(&l2arc_free_on_write_mtx);
4673}
4674
4675/*
4676 * A write to a cache device has completed.  Update all headers to allow
4677 * reads from these buffers to begin.
4678 */
4679static void
4680l2arc_write_done(zio_t *zio)
4681{
4682	l2arc_write_callback_t *cb;
4683	l2arc_dev_t *dev;
4684	list_t *buflist;
4685	arc_buf_hdr_t *head, *ab, *ab_prev;
4686	l2arc_buf_hdr_t *abl2;
4687	kmutex_t *hash_lock;
4688	int64_t bytes_dropped = 0;
4689
4690	cb = zio->io_private;
4691	ASSERT(cb != NULL);
4692	dev = cb->l2wcb_dev;
4693	ASSERT(dev != NULL);
4694	head = cb->l2wcb_head;
4695	ASSERT(head != NULL);
4696	buflist = dev->l2ad_buflist;
4697	ASSERT(buflist != NULL);
4698	DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4699	    l2arc_write_callback_t *, cb);
4700
4701	if (zio->io_error != 0)
4702		ARCSTAT_BUMP(arcstat_l2_writes_error);
4703
4704	mutex_enter(&l2arc_buflist_mtx);
4705
4706	/*
4707	 * All writes completed, or an error was hit.
4708	 */
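	/*
	 * Walk from the dummy write head toward the front of the buflist;
	 * the entries ahead of it are the headers queued by this write pass
	 * (each was pushed onto the list head after the marker).
	 */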
4709	for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4710		ab_prev = list_prev(buflist, ab);
4711		abl2 = ab->b_l2hdr;
4712
4713		/*
4714		 * Release the temporary compressed buffer as soon as possible.
4715		 */
4716		if (abl2->b_compress != ZIO_COMPRESS_OFF)
4717			l2arc_release_cdata_buf(ab);
4718
4719		hash_lock = HDR_LOCK(ab);
4720		if (!mutex_tryenter(hash_lock)) {
4721			/*
4722			 * This buffer misses out.  It may be in the middle
4723			 * of being evicted.  Its ARC_L2_WRITING flag will be
4724			 * left set, denying reads to this buffer.
4725			 */
4726			ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4727			continue;
4728		}
4729
4730		if (zio->io_error != 0) {
4731			/*
4732			 * Error - drop L2ARC entry.
4733			 */
4734			list_remove(buflist, ab);
4735			ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4736			bytes_dropped += abl2->b_asize;
4737			ab->b_l2hdr = NULL;
4738			trim_map_free(abl2->b_dev->l2ad_vdev, abl2->b_daddr,
4739			    ab->b_size, 0);
4740			kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4741			ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4742		}
4743
4744		/*
4745		 * Allow ARC to begin reads to this L2ARC entry.
4746		 */
4747		ab->b_flags &= ~ARC_L2_WRITING;
4748
4749		mutex_exit(hash_lock);
4750	}
4751
4752	atomic_inc_64(&l2arc_writes_done);
4753	list_remove(buflist, head);
4754	kmem_cache_free(hdr_cache, head);
4755	mutex_exit(&l2arc_buflist_mtx);
4756
4757	vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
4758
4759	l2arc_do_free_on_write();
4760
4761	kmem_free(cb, sizeof (l2arc_write_callback_t));
4762}
4763
4764/*
4765 * A read to a cache device completed.  Validate buffer contents before
4766 * handing over to the regular ARC routines.
4767 */
4768static void
4769l2arc_read_done(zio_t *zio)
4770{
4771	l2arc_read_callback_t *cb;
4772	arc_buf_hdr_t *hdr;
4773	arc_buf_t *buf;
4774	kmutex_t *hash_lock;
4775	int equal;
4776
4777	ASSERT(zio->io_vd != NULL);
4778	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4779
4780	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4781
4782	cb = zio->io_private;
4783	ASSERT(cb != NULL);
4784	buf = cb->l2rcb_buf;
4785	ASSERT(buf != NULL);
4786
4787	hash_lock = HDR_LOCK(buf->b_hdr);
4788	mutex_enter(hash_lock);
4789	hdr = buf->b_hdr;
4790	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4791
4792	/*
4793	 * If the buffer was compressed, decompress it first.
4794	 */
4795	if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
4796		l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
4797	ASSERT(zio->io_data != NULL);
4798
4799	/*
4800	 * Check this survived the L2ARC journey.
4801	 */
4802	equal = arc_cksum_equal(buf);
4803	if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4804		mutex_exit(hash_lock);
4805		zio->io_private = buf;
4806		zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
4807		zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
4808		arc_read_done(zio);
4809	} else {
4810		mutex_exit(hash_lock);
4811		/*
4812		 * Buffer didn't survive caching.  Increment stats and
4813		 * reissue to the original storage device.
4814		 */
4815		if (zio->io_error != 0) {
4816			ARCSTAT_BUMP(arcstat_l2_io_error);
4817		} else {
4818			zio->io_error = SET_ERROR(EIO);
4819		}
4820		if (!equal)
4821			ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4822
4823		/*
4824		 * If there's no waiter, issue an async i/o to the primary
4825		 * storage now.  If there *is* a waiter, the caller must
4826		 * issue the i/o in a context where it's OK to block.
4827		 */
4828		if (zio->io_waiter == NULL) {
4829			zio_t *pio = zio_unique_parent(zio);
4830
4831			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4832
4833			zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
4834			    buf->b_data, zio->io_size, arc_read_done, buf,
4835			    zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4836		}
4837	}
4838
4839	kmem_free(cb, sizeof (l2arc_read_callback_t));
4840}
4841
4842/*
4843 * This is the list priority from which the L2ARC will search for pages to
4844 * cache.  This is used within loops (0..3) to cycle through lists in the
4845 * desired order.  This order can have a significant effect on cache
4846 * performance.
4847 *
4848 * Currently the metadata lists are hit first, MFU then MRU, followed by
4849 * the data lists.  This function returns a locked list, and also returns
4850 * the lock pointer.
4851 */
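/*
 * For example, assuming ARC_BUFC_NUMMETADATALISTS == ARC_BUFC_NUMDATALISTS ==
 * 16 (a sketch; the actual constants are defined elsewhere in this code),
 * the mapping is:
 *	list_num  0 .. 15 -> arc_mfu metadata lists
 *	list_num 16 .. 31 -> arc_mru metadata lists
 *	list_num 32 .. 47 -> arc_mfu data lists
 *	list_num 48 .. 63 -> arc_mru data lists
 */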
4852static list_t *
4853l2arc_list_locked(int list_num, kmutex_t **lock)
4854{
4855	list_t *list = NULL;
4856	int idx;
4857
4858	ASSERT(list_num >= 0 && list_num < 2 * ARC_BUFC_NUMLISTS);
4859
4860	if (list_num < ARC_BUFC_NUMMETADATALISTS) {
4861		idx = list_num;
4862		list = &arc_mfu->arcs_lists[idx];
4863		*lock = ARCS_LOCK(arc_mfu, idx);
4864	} else if (list_num < ARC_BUFC_NUMMETADATALISTS * 2) {
4865		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
4866		list = &arc_mru->arcs_lists[idx];
4867		*lock = ARCS_LOCK(arc_mru, idx);
4868	} else if (list_num < (ARC_BUFC_NUMMETADATALISTS * 2 +
4869		ARC_BUFC_NUMDATALISTS)) {
4870		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
4871		list = &arc_mfu->arcs_lists[idx];
4872		*lock = ARCS_LOCK(arc_mfu, idx);
4873	} else {
4874		idx = list_num - ARC_BUFC_NUMLISTS;
4875		list = &arc_mru->arcs_lists[idx];
4876		*lock = ARCS_LOCK(arc_mru, idx);
4877	}
4878
4879	ASSERT(!(MUTEX_HELD(*lock)));
4880	mutex_enter(*lock);
4881	return (list);
4882}
4883
4884/*
4885 * Evict buffers from the device write hand to the distance specified in
4886 * bytes.  This distance may span populated buffers, or it may span nothing.
4887 * It clears a region of the L2ARC device to make it ready for writing.
4888 * If the 'all' boolean is set, every buffer is evicted.
4889 */
4890static void
4891l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4892{
4893	list_t *buflist;
4894	l2arc_buf_hdr_t *abl2;
4895	arc_buf_hdr_t *ab, *ab_prev;
4896	kmutex_t *hash_lock;
4897	uint64_t taddr;
4898	int64_t bytes_evicted = 0;
4899
4900	buflist = dev->l2ad_buflist;
4901
4902	if (buflist == NULL)
4903		return;
4904
4905	if (!all && dev->l2ad_first) {
4906		/*
4907		 * This is the first sweep through the device.  There is
4908		 * nothing to evict.
4909		 */
4910		return;
4911	}
4912
4913	if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4914		/*
4915		 * When nearing the end of the device, evict to the end
4916		 * before the device write hand jumps to the start.
4917		 */
4918		taddr = dev->l2ad_end;
4919	} else {
4920		taddr = dev->l2ad_hand + distance;
4921	}
4922	DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4923	    uint64_t, taddr, boolean_t, all);
4924
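	/*
	 * Walk the buflist from the tail (the least recently written
	 * entries) toward the head, dropping each header's L2ARC state
	 * until we reach an entry that lies outside the region being
	 * reclaimed.
	 */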
4925top:
4926	mutex_enter(&l2arc_buflist_mtx);
4927	for (ab = list_tail(buflist); ab; ab = ab_prev) {
4928		ab_prev = list_prev(buflist, ab);
4929
4930		hash_lock = HDR_LOCK(ab);
4931		if (!mutex_tryenter(hash_lock)) {
4932			/*
4933			 * Missed the hash lock.  Drop the buflist lock, wait
			 * for the holder to finish, then restart the scan.
4934			 */
4935			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4936			mutex_exit(&l2arc_buflist_mtx);
4937			mutex_enter(hash_lock);
4938			mutex_exit(hash_lock);
4939			goto top;
4940		}
4941
4942		if (HDR_L2_WRITE_HEAD(ab)) {
4943			/*
4944			 * We hit a write head node.  Leave it for
4945			 * l2arc_write_done().
4946			 */
4947			list_remove(buflist, ab);
4948			mutex_exit(hash_lock);
4949			continue;
4950		}
4951
4952		if (!all && ab->b_l2hdr != NULL &&
4953		    (ab->b_l2hdr->b_daddr > taddr ||
4954		    ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4955			/*
4956			 * We've evicted to the target address,
4957			 * or the end of the device.
4958			 */
4959			mutex_exit(hash_lock);
4960			break;
4961		}
4962
4963		if (HDR_FREE_IN_PROGRESS(ab)) {
4964			/*
4965			 * Already on the path to destruction.
4966			 */
4967			mutex_exit(hash_lock);
4968			continue;
4969		}
4970
4971		if (ab->b_state == arc_l2c_only) {
4972			ASSERT(!HDR_L2_READING(ab));
4973			/*
4974			 * This doesn't exist in the ARC.  Destroy.
4975			 * arc_hdr_destroy() will call list_remove()
4976			 * and decrement arcstat_l2_size.
4977			 */
4978			arc_change_state(arc_anon, ab, hash_lock);
4979			arc_hdr_destroy(ab);
4980		} else {
4981			/*
4982			 * Invalidate issued or about to be issued
4983			 * reads, since we may be about to write
4984			 * over this location.
4985			 */
4986			if (HDR_L2_READING(ab)) {
4987				ARCSTAT_BUMP(arcstat_l2_evict_reading);
4988				ab->b_flags |= ARC_L2_EVICTED;
4989			}
4990
4991			/*
4992			 * Tell ARC this no longer exists in L2ARC.
4993			 */
4994			if (ab->b_l2hdr != NULL) {
4995				abl2 = ab->b_l2hdr;
4996				ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4997				bytes_evicted += abl2->b_asize;
4998				ab->b_l2hdr = NULL;
4999				/*
5000				 * We are destroying l2hdr, so ensure that
5001				 * its compressed buffer, if any, is not leaked.
5002				 */
5003				ASSERT(abl2->b_tmp_cdata == NULL);
5004				kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
5005				ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
5006			}
5007			list_remove(buflist, ab);
5008
5009			/*
5010			 * This may have been leftover after a
5011			 * failed write.
5012			 */
5013			ab->b_flags &= ~ARC_L2_WRITING;
5014		}
5015		mutex_exit(hash_lock);
5016	}
5017	mutex_exit(&l2arc_buflist_mtx);
5018
5019	vdev_space_update(dev->l2ad_vdev, -bytes_evicted, 0, 0);
5020	dev->l2ad_evict = taddr;
5021}
5022
5023/*
5024 * Find and write ARC buffers to the L2ARC device.
5025 *
5026 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
5027 * for reading until they have completed writing.
5028 * The headroom_boost is an in-out parameter used to maintain headroom boost
5029 * state between calls to this function.
5030 *
5031 * Returns the number of bytes actually written (which may be smaller than
5032 * the delta by which the device hand has changed due to alignment).
5033 */
5034static uint64_t
5035l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
5036    boolean_t *headroom_boost)
5037{
5038	arc_buf_hdr_t *ab, *ab_prev, *head;
5039	list_t *list;
5040	uint64_t write_asize, write_psize, write_sz, headroom,
5041	    buf_compress_minsz;
5042	void *buf_data;
5043	kmutex_t *list_lock;
5044	boolean_t full;
5045	l2arc_write_callback_t *cb;
5046	zio_t *pio, *wzio;
5047	uint64_t guid = spa_load_guid(spa);
5048	const boolean_t do_headroom_boost = *headroom_boost;
5049	int try;
5050
5051	ASSERT(dev->l2ad_vdev != NULL);
5052
5053	/* Lower the flag now, we might want to raise it again later. */
5054	*headroom_boost = B_FALSE;
5055
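	/*
	 * Three byte counters are kept for this pass: write_sz is the
	 * uncompressed size of the selected ARC buffers, write_asize is
	 * what is actually issued to the device (after any compression),
	 * and write_psize is the ashift-aligned space consumed on the vdev.
	 */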
5056	pio = NULL;
5057	write_sz = write_asize = write_psize = 0;
5058	full = B_FALSE;
5059	head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
5060	head->b_flags |= ARC_L2_WRITE_HEAD;
5061
5062	ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
5063	/*
5064	 * We will want to try to compress buffers that are at least 2x the
5065	 * device sector size.
5066	 */
5067	buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
5068
5069	/*
5070	 * Copy buffers for L2ARC writing.
5071	 */
5072	mutex_enter(&l2arc_buflist_mtx);
5073	for (try = 0; try < 2 * ARC_BUFC_NUMLISTS; try++) {
5074		uint64_t passed_sz = 0;
5075
5076		list = l2arc_list_locked(try, &list_lock);
5077		ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);
5078
5079		/*
5080		 * L2ARC fast warmup.
5081		 *
5082		 * Until the ARC is warm and starts to evict, read from the
5083		 * head of the ARC lists rather than the tail.
5084		 */
5085		if (arc_warm == B_FALSE)
5086			ab = list_head(list);
5087		else
5088			ab = list_tail(list);
5089		if (ab == NULL)
5090			ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);
5091
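		/*
		 * Limit how far down this list we scan (enforced by the
		 * passed_sz check below): a share of l2arc_headroom times
		 * the target write size.  When compression paid off on the
		 * previous feed cycle, the limit is scaled by
		 * l2arc_headroom_boost percent.
		 */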
5092		headroom = target_sz * l2arc_headroom * 2 / ARC_BUFC_NUMLISTS;
5093		if (do_headroom_boost)
5094			headroom = (headroom * l2arc_headroom_boost) / 100;
5095
5096		for (; ab; ab = ab_prev) {
5097			l2arc_buf_hdr_t *l2hdr;
5098			kmutex_t *hash_lock;
5099			uint64_t buf_sz;
5100
5101			if (arc_warm == B_FALSE)
5102				ab_prev = list_next(list, ab);
5103			else
5104				ab_prev = list_prev(list, ab);
5105			ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, ab->b_size);
5106
5107			hash_lock = HDR_LOCK(ab);
5108			if (!mutex_tryenter(hash_lock)) {
5109				ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
5110				/*
5111				 * Skip this buffer rather than waiting.
5112				 */
5113				continue;
5114			}
5115
5116			passed_sz += ab->b_size;
5117			if (passed_sz > headroom) {
5118				/*
5119				 * Searched too far.
5120				 */
5121				mutex_exit(hash_lock);
5122				ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
5123				break;
5124			}
5125
5126			if (!l2arc_write_eligible(guid, ab)) {
5127				mutex_exit(hash_lock);
5128				continue;
5129			}
5130
5131			if ((write_sz + ab->b_size) > target_sz) {
5132				full = B_TRUE;
5133				mutex_exit(hash_lock);
5134				ARCSTAT_BUMP(arcstat_l2_write_full);
5135				break;
5136			}
5137
5138			if (pio == NULL) {
5139				/*
5140				 * Insert a dummy header on the buflist so
5141				 * l2arc_write_done() can find where the
5142				 * write buffers begin without searching.
5143				 */
5144				list_insert_head(dev->l2ad_buflist, head);
5145
5146				cb = kmem_alloc(
5147				    sizeof (l2arc_write_callback_t), KM_SLEEP);
5148				cb->l2wcb_dev = dev;
5149				cb->l2wcb_head = head;
5150				pio = zio_root(spa, l2arc_write_done, cb,
5151				    ZIO_FLAG_CANFAIL);
5152				ARCSTAT_BUMP(arcstat_l2_write_pios);
5153			}
5154
5155			/*
5156			 * Create and add a new L2ARC header.
5157			 */
5158			l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
5159			l2hdr->b_dev = dev;
5160			ab->b_flags |= ARC_L2_WRITING;
5161
5162			/*
5163			 * Temporarily stash the data buffer in b_tmp_cdata.
5164			 * The subsequent write step will pick it up from
5165			 * there.  This is because we can't access ab->b_buf
5166			 * without holding the hash_lock, which we in turn
5167			 * can't access without holding the ARC list locks
5168			 * (which we want to avoid during compression/writing).
5169			 */
5170			l2hdr->b_compress = ZIO_COMPRESS_OFF;
5171			l2hdr->b_asize = ab->b_size;
5172			l2hdr->b_tmp_cdata = ab->b_buf->b_data;
5173
5174			buf_sz = ab->b_size;
5175			ab->b_l2hdr = l2hdr;
5176
5177			list_insert_head(dev->l2ad_buflist, ab);
5178
5179			/*
5180			 * Compute and store the buffer cksum before
5181			 * writing.  On debug the cksum is verified first.
5182			 */
5183			arc_cksum_verify(ab->b_buf);
5184			arc_cksum_compute(ab->b_buf, B_TRUE);
5185
5186			mutex_exit(hash_lock);
5187
5188			write_sz += buf_sz;
5189		}
5190
5191		mutex_exit(list_lock);
5192
5193		if (full == B_TRUE)
5194			break;
5195	}
5196
5197	/* No buffers selected for writing? */
5198	if (pio == NULL) {
5199		ASSERT0(write_sz);
5200		mutex_exit(&l2arc_buflist_mtx);
5201		kmem_cache_free(hdr_cache, head);
5202		return (0);
5203	}
5204
5205	/*
5206	 * Now start writing the buffers. We're starting at the write head
5207	 * and work backwards, retracing the course of the buffer selector
5208	 * loop above.
5209	 */
5210	for (ab = list_prev(dev->l2ad_buflist, head); ab;
5211	    ab = list_prev(dev->l2ad_buflist, ab)) {
5212		l2arc_buf_hdr_t *l2hdr;
5213		uint64_t buf_sz;
5214
5215		/*
5216		 * We shouldn't need to lock the buffer here, since we flagged
5217		 * it as ARC_L2_WRITING in the previous step, but we must take
5218		 * care to only access its L2 cache parameters. In particular,
5219		 * ab->b_buf may be invalid by now due to ARC eviction.
5220		 */
5221		l2hdr = ab->b_l2hdr;
5222		l2hdr->b_daddr = dev->l2ad_hand;
5223
5224		if ((ab->b_flags & ARC_L2COMPRESS) &&
5225		    l2hdr->b_asize >= buf_compress_minsz) {
5226			if (l2arc_compress_buf(l2hdr)) {
5227				/*
5228				 * If compression succeeded, enable headroom
5229				 * boost on the next scan cycle.
5230				 */
5231				*headroom_boost = B_TRUE;
5232			}
5233		}
5234
5235		/*
5236		 * Pick up the buffer data we had previously stashed away
5237		 * (and now potentially also compressed).
5238		 */
5239		buf_data = l2hdr->b_tmp_cdata;
5240		buf_sz = l2hdr->b_asize;
5241
5242		/*
5243		 * If the data has not been compressed, then clear b_tmp_cdata
5244		 * to make sure that it points only to a temporary compression
5245		 * buffer.
5246		 */
5247		if (!L2ARC_IS_VALID_COMPRESS(l2hdr->b_compress))
5248			l2hdr->b_tmp_cdata = NULL;
5249
5250		/* Compression may have squashed the buffer to zero length. */
5251		if (buf_sz != 0) {
5252			uint64_t buf_p_sz;
5253
5254			wzio = zio_write_phys(pio, dev->l2ad_vdev,
5255			    dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
5256			    NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
5257			    ZIO_FLAG_CANFAIL, B_FALSE);
5258
5259			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
5260			    zio_t *, wzio);
5261			(void) zio_nowait(wzio);
5262
5263			write_asize += buf_sz;
5264			/*
5265			 * Keep the clock hand suitably device-aligned.
5266			 */
5267			buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
5268			write_psize += buf_p_sz;
5269			dev->l2ad_hand += buf_p_sz;
5270		}
5271	}
5272
5273	mutex_exit(&l2arc_buflist_mtx);
5274
5275	ASSERT3U(write_asize, <=, target_sz);
5276	ARCSTAT_BUMP(arcstat_l2_writes_sent);
5277	ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
5278	ARCSTAT_INCR(arcstat_l2_size, write_sz);
5279	ARCSTAT_INCR(arcstat_l2_asize, write_asize);
5280	vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
5281
5282	/*
5283	 * Bump device hand to the device start if it is approaching the end.
5284	 * l2arc_evict() will already have evicted ahead for this case.
5285	 */
5286	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
5287		dev->l2ad_hand = dev->l2ad_start;
5288		dev->l2ad_evict = dev->l2ad_start;
5289		dev->l2ad_first = B_FALSE;
5290	}
5291
5292	dev->l2ad_writing = B_TRUE;
5293	(void) zio_wait(pio);
5294	dev->l2ad_writing = B_FALSE;
5295
5296	return (write_asize);
5297}
5298
5299/*
5300 * Compresses an L2ARC buffer.
5301 * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
5302 * size in l2hdr->b_asize. This routine tries to compress the data and
5303 * depending on the compression result there are three possible outcomes:
5304 * *) The buffer was incompressible. The original l2hdr contents were left
5305 *    untouched and are ready for writing to an L2 device.
5306 * *) The buffer was all-zeros, so there is no need to write it to an L2
5307 *    device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
5308 *    set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
5309 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
5310 *    data buffer which holds the compressed data to be written, and b_asize
5311 *    tells us how much data there is. b_compress is set to the appropriate
5312 *    compression algorithm. Once writing is done, invoke
5313 *    l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
5314 *
5315 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
5316 * buffer was incompressible).
5317 */
5318static boolean_t
5319l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
5320{
5321	void *cdata;
5322	size_t csize, len, rounded;
5323
5324	ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
5325	ASSERT(l2hdr->b_tmp_cdata != NULL);
5326
5327	len = l2hdr->b_asize;
5328	cdata = zio_data_buf_alloc(len);
5329	csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
5330	    cdata, l2hdr->b_asize);
5331
5332	if (csize == 0) {
5333		/* zero block, indicate that there's nothing to write */
5334		zio_data_buf_free(cdata, len);
5335		l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
5336		l2hdr->b_asize = 0;
5337		l2hdr->b_tmp_cdata = NULL;
5338		ARCSTAT_BUMP(arcstat_l2_compress_zeros);
5339		return (B_TRUE);
5340	}
5341
5342	rounded = P2ROUNDUP(csize,
5343	    (size_t)1 << l2hdr->b_dev->l2ad_vdev->vdev_ashift);
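	/*
	 * Only keep the compressed copy if, after rounding up to the device's
	 * sector size, it is still strictly smaller than the original buffer;
	 * otherwise writing the uncompressed data costs the same space.
	 */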
5344	if (rounded < len) {
5345		/*
5346		 * Compression succeeded, we'll keep the cdata around for
5347		 * writing and release it afterwards.
5348		 */
5349		if (rounded > csize) {
5350			bzero((char *)cdata + csize, rounded - csize);
5351			csize = rounded;
5352		}
5353		l2hdr->b_compress = ZIO_COMPRESS_LZ4;
5354		l2hdr->b_asize = csize;
5355		l2hdr->b_tmp_cdata = cdata;
5356		ARCSTAT_BUMP(arcstat_l2_compress_successes);
5357		return (B_TRUE);
5358	} else {
5359		/*
5360		 * Compression failed, release the compressed buffer.
5361		 * l2hdr will be left unmodified.
5362		 */
5363		zio_data_buf_free(cdata, len);
5364		ARCSTAT_BUMP(arcstat_l2_compress_failures);
5365		return (B_FALSE);
5366	}
5367}
5368
5369/*
5370 * Decompresses a zio read back from an l2arc device. On success, the
5371 * underlying zio's io_data buffer is overwritten by the uncompressed
5372 * version. On decompression error (corrupt compressed stream), the
5373 * zio->io_error value is set to signal an I/O error.
5374 *
5375 * Please note that the compressed data stream is not checksummed, so
5376 * if the underlying device is experiencing data corruption, we may feed
5377 * corrupt data to the decompressor; the decompressor therefore needs to
5378 * be able to handle this situation (LZ4 does).
5379 */
5380static void
5381l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
5382{
5383	ASSERT(L2ARC_IS_VALID_COMPRESS(c));
5384
5385	if (zio->io_error != 0) {
5386		/*
5387		 * An io error has occurred, just restore the original io
5388		 * size in preparation for a main pool read.
5389		 */
5390		zio->io_orig_size = zio->io_size = hdr->b_size;
5391		return;
5392	}
5393
5394	if (c == ZIO_COMPRESS_EMPTY) {
5395		/*
5396		 * An empty buffer results in a null zio, which means we
5397		 * need to fill its io_data after we're done restoring the
5398		 * buffer's contents.
5399		 */
5400		ASSERT(hdr->b_buf != NULL);
5401		bzero(hdr->b_buf->b_data, hdr->b_size);
5402		zio->io_data = zio->io_orig_data = hdr->b_buf->b_data;
5403	} else {
5404		ASSERT(zio->io_data != NULL);
5405		/*
5406		 * We copy the compressed data from the start of the arc buffer
5407		 * (the zio_read will have pulled in only what we need, the
5408		 * rest is garbage which we will overwrite at decompression)
5409		 * and then decompress back to the ARC data buffer. This way we
5410		 * can minimize copying by simply decompressing back over the
5411		 * original compressed data (rather than decompressing to an
5412		 * aux buffer and then copying back the uncompressed buffer,
5413		 * which is likely to be much larger).
5414		 */
5415		uint64_t csize;
5416		void *cdata;
5417
5418		csize = zio->io_size;
5419		cdata = zio_data_buf_alloc(csize);
5420		bcopy(zio->io_data, cdata, csize);
5421		if (zio_decompress_data(c, cdata, zio->io_data, csize,
5422		    hdr->b_size) != 0)
5423			zio->io_error = SET_ERROR(EIO);
5424		zio_data_buf_free(cdata, csize);
5425	}
5426
5427	/* Restore the expected uncompressed IO size. */
5428	zio->io_orig_size = zio->io_size = hdr->b_size;
5429}
5430
5431/*
5432 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
5433 * This buffer serves as a temporary holder of compressed data while
5434 * the buffer entry is being written to an l2arc device. Once that is
5435 * done, we can dispose of it.
5436 */
5437static void
5438l2arc_release_cdata_buf(arc_buf_hdr_t *ab)
5439{
5440	l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
5441
5442	ASSERT(L2ARC_IS_VALID_COMPRESS(l2hdr->b_compress));
5443	if (l2hdr->b_compress != ZIO_COMPRESS_EMPTY) {
5444		/*
5445		 * If the data was compressed, then we've allocated a
5446		 * temporary buffer for it, so now we need to release it.
5447		 */
5448		ASSERT(l2hdr->b_tmp_cdata != NULL);
5449		zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size);
5450		l2hdr->b_tmp_cdata = NULL;
5451	} else {
5452		ASSERT(l2hdr->b_tmp_cdata == NULL);
5453	}
5454}
5455
5456/*
5457 * This thread feeds the L2ARC at regular intervals.  This is the beating
5458 * heart of the L2ARC.
5459 */
5460static void
5461l2arc_feed_thread(void *dummy __unused)
5462{
5463	callb_cpr_t cpr;
5464	l2arc_dev_t *dev;
5465	spa_t *spa;
5466	uint64_t size, wrote;
5467	clock_t begin, next = ddi_get_lbolt();
5468	boolean_t headroom_boost = B_FALSE;
5469
5470	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
5471
5472	mutex_enter(&l2arc_feed_thr_lock);
5473
5474	while (l2arc_thread_exit == 0) {
5475		CALLB_CPR_SAFE_BEGIN(&cpr);
5476		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
5477		    next - ddi_get_lbolt());
5478		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
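		/*
		 * By default check again in about one second; a successful
		 * write pass below recomputes this via l2arc_write_interval().
		 */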
5479		next = ddi_get_lbolt() + hz;
5480
5481		/*
5482		 * Quick check for L2ARC devices.
5483		 */
5484		mutex_enter(&l2arc_dev_mtx);
5485		if (l2arc_ndev == 0) {
5486			mutex_exit(&l2arc_dev_mtx);
5487			continue;
5488		}
5489		mutex_exit(&l2arc_dev_mtx);
5490		begin = ddi_get_lbolt();
5491
5492		/*
5493		 * This selects the next l2arc device to write to, and in
5494		 * doing so the next spa to feed from: dev->l2ad_spa.   This
5495		 * will return NULL if there are now no l2arc devices or if
5496		 * they are all faulted.
5497		 *
5498		 * If a device is returned, its spa's config lock is also
5499		 * held to prevent device removal.  l2arc_dev_get_next()
5500		 * will grab and release l2arc_dev_mtx.
5501		 */
5502		if ((dev = l2arc_dev_get_next()) == NULL)
5503			continue;
5504
5505		spa = dev->l2ad_spa;
5506		ASSERT(spa != NULL);
5507
5508		/*
5509		 * If the pool is read-only then force the feed thread to
5510		 * sleep a little longer.
5511		 */
5512		if (!spa_writeable(spa)) {
5513			next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
5514			spa_config_exit(spa, SCL_L2ARC, dev);
5515			continue;
5516		}
5517
5518		/*
5519		 * Avoid contributing to memory pressure.
5520		 */
5521		if (arc_reclaim_needed()) {
5522			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
5523			spa_config_exit(spa, SCL_L2ARC, dev);
5524			continue;
5525		}
5526
5527		ARCSTAT_BUMP(arcstat_l2_feeds);
5528
5529		size = l2arc_write_size();
5530
5531		/*
5532		 * Evict L2ARC buffers that will be overwritten.
5533		 */
5534		l2arc_evict(dev, size, B_FALSE);
5535
5536		/*
5537		 * Write ARC buffers.
5538		 */
5539		wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
5540
5541		/*
5542		 * Calculate interval between writes.
5543		 */
5544		next = l2arc_write_interval(begin, size, wrote);
5545		spa_config_exit(spa, SCL_L2ARC, dev);
5546	}
5547
5548	l2arc_thread_exit = 0;
5549	cv_broadcast(&l2arc_feed_thr_cv);
5550	CALLB_CPR_EXIT(&cpr);		/* drops l2arc_feed_thr_lock */
5551	thread_exit();
5552}
5553
5554boolean_t
5555l2arc_vdev_present(vdev_t *vd)
5556{
5557	l2arc_dev_t *dev;
5558
5559	mutex_enter(&l2arc_dev_mtx);
5560	for (dev = list_head(l2arc_dev_list); dev != NULL;
5561	    dev = list_next(l2arc_dev_list, dev)) {
5562		if (dev->l2ad_vdev == vd)
5563			break;
5564	}
5565	mutex_exit(&l2arc_dev_mtx);
5566
5567	return (dev != NULL);
5568}
5569
5570/*
5571 * Add a vdev for use by the L2ARC.  By this point the spa has already
5572 * validated the vdev and opened it.
5573 */
5574void
5575l2arc_add_vdev(spa_t *spa, vdev_t *vd)
5576{
5577	l2arc_dev_t *adddev;
5578
5579	ASSERT(!l2arc_vdev_present(vd));
5580
5581	vdev_ashift_optimize(vd);
5582
5583	/*
5584	 * Create a new l2arc device entry.
5585	 */
5586	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
5587	adddev->l2ad_spa = spa;
5588	adddev->l2ad_vdev = vd;
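	/*
	 * The writable region starts past the front vdev labels and boot
	 * block area (VDEV_LABEL_START_SIZE) and spans the vdev's minimum
	 * allocatable size; the write hand and evict pointer begin there.
	 */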
5589	adddev->l2ad_start = VDEV_LABEL_START_SIZE;
5590	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
5591	adddev->l2ad_hand = adddev->l2ad_start;
5592	adddev->l2ad_evict = adddev->l2ad_start;
5593	adddev->l2ad_first = B_TRUE;
5594	adddev->l2ad_writing = B_FALSE;
5595
5596	/*
5597	 * This is a list of all ARC buffers that are still valid on the
5598	 * device.
5599	 */
5600	adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
5601	list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
5602	    offsetof(arc_buf_hdr_t, b_l2node));
5603
5604	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
5605
5606	/*
5607	 * Add device to global list
5608	 */
5609	mutex_enter(&l2arc_dev_mtx);
5610	list_insert_head(l2arc_dev_list, adddev);
5611	atomic_inc_64(&l2arc_ndev);
5612	mutex_exit(&l2arc_dev_mtx);
5613}
5614
5615/*
5616 * Remove a vdev from the L2ARC.
5617 */
5618void
5619l2arc_remove_vdev(vdev_t *vd)
5620{
5621	l2arc_dev_t *dev, *nextdev, *remdev = NULL;
5622
5623	/*
5624	 * Find the device by vdev
5625	 */
5626	mutex_enter(&l2arc_dev_mtx);
5627	for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
5628		nextdev = list_next(l2arc_dev_list, dev);
5629		if (vd == dev->l2ad_vdev) {
5630			remdev = dev;
5631			break;
5632		}
5633	}
5634	ASSERT(remdev != NULL);
5635
5636	/*
5637	 * Remove device from global list
5638	 */
5639	list_remove(l2arc_dev_list, remdev);
5640	l2arc_dev_last = NULL;		/* may have been invalidated */
5641	atomic_dec_64(&l2arc_ndev);
5642	mutex_exit(&l2arc_dev_mtx);
5643
5644	/*
5645	 * Clear all buflists and ARC references.  L2ARC device flush.
5646	 */
5647	l2arc_evict(remdev, 0, B_TRUE);
5648	list_destroy(remdev->l2ad_buflist);
5649	kmem_free(remdev->l2ad_buflist, sizeof (list_t));
5650	kmem_free(remdev, sizeof (l2arc_dev_t));
5651}
5652
5653void
5654l2arc_init(void)
5655{
5656	l2arc_thread_exit = 0;
5657	l2arc_ndev = 0;
5658	l2arc_writes_sent = 0;
5659	l2arc_writes_done = 0;
5660
5661	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
5662	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
5663	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
5664	mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
5665	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
5666
5667	l2arc_dev_list = &L2ARC_dev_list;
5668	l2arc_free_on_write = &L2ARC_free_on_write;
5669	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
5670	    offsetof(l2arc_dev_t, l2ad_node));
5671	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
5672	    offsetof(l2arc_data_free_t, l2df_list_node));
5673}
5674
5675void
5676l2arc_fini(void)
5677{
5678	/*
5679	 * This is called from dmu_fini(), which is called from spa_fini().
5680	 * Because of this, we can assume that all l2arc devices have
5681	 * already been removed when the pools themselves were removed.
5682	 */
5683
5684	l2arc_do_free_on_write();
5685
5686	mutex_destroy(&l2arc_feed_thr_lock);
5687	cv_destroy(&l2arc_feed_thr_cv);
5688	mutex_destroy(&l2arc_dev_mtx);
5689	mutex_destroy(&l2arc_buflist_mtx);
5690	mutex_destroy(&l2arc_free_on_write_mtx);
5691
5692	list_destroy(l2arc_dev_list);
5693	list_destroy(l2arc_free_on_write);
5694}
5695
5696void
5697l2arc_start(void)
5698{
5699	if (!(spa_mode_global & FWRITE))
5700		return;
5701
5702	(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
5703	    TS_RUN, minclsyspri);
5704}
5705
5706void
5707l2arc_stop(void)
5708{
5709	if (!(spa_mode_global & FWRITE))
5710		return;
5711
5712	mutex_enter(&l2arc_feed_thr_lock);
5713	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
5714	l2arc_thread_exit = 1;
5715	while (l2arc_thread_exit != 0)
5716		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
5717	mutex_exit(&l2arc_feed_thr_lock);
5718}
5719