1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
24 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
25 * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
26 */
27
28/*
29 * DVA-based Adjustable Replacement Cache
30 *
31 * While much of the theory of operation used here is
32 * based on the self-tuning, low overhead replacement cache
33 * presented by Megiddo and Modha at FAST 2003, there are some
34 * significant differences:
35 *
36 * 1. The Megiddo and Modha model assumes any page is evictable.
37 * Pages in its cache cannot be "locked" into memory.  This makes
38 * the eviction algorithm simple: evict the last page in the list.
39 * This also makes the performance characteristics easy to reason
40 * about.  Our cache is not so simple.  At any given moment, some
41 * subset of the blocks in the cache are un-evictable because we
42 * have handed out a reference to them.  Blocks are only evictable
43 * when there are no external references active.  This makes
44 * eviction far more problematic:  we choose to evict the evictable
45 * blocks that are the "lowest" in the list.
46 *
47 * There are times when it is not possible to evict the requested
48 * space.  In these circumstances we are unable to adjust the cache
49 * size.  To prevent the cache growing unbounded at these times we
50 * implement a "cache throttle" that slows the flow of new data
51 * into the cache until we can make space available.
52 *
53 * 2. The Megiddo and Modha model assumes a fixed cache size.
54 * Pages are evicted when the cache is full and there is a cache
55 * miss.  Our model has a variable sized cache.  It grows with
56 * high use, but also tries to react to memory pressure from the
57 * operating system: decreasing its size when system memory is
58 * tight.
59 *
60 * 3. The Megiddo and Modha model assumes a fixed page size. All
61 * elements of the cache are therefore exactly the same size.  So
62 * when adjusting the cache size following a cache miss, it's simply
63 * a matter of choosing a single page to evict.  In our model, we
64 * have variable sized cache blocks (ranging from 512 bytes to
65 * 128K bytes).  We therefore choose a set of blocks to evict to make
66 * space for a cache miss that approximates as closely as possible
67 * the space used by the new block.
68 *
69 * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
70 * by N. Megiddo & D. Modha, FAST 2003
71 */
72
73/*
74 * The locking model:
75 *
76 * A new reference to a cache buffer can be obtained in two
77 * ways: 1) via a hash table lookup using the DVA as a key,
78 * or 2) via one of the ARC lists.  The arc_read() interface
79 * uses method 1, while the internal arc algorithms for
80 * adjusting the cache use method 2.  We therefore provide two
81 * types of locks: 1) the hash table lock array, and 2) the
82 * arc list locks.
83 *
84 * Buffers do not have their own mutexes; rather, they rely on the
85 * hash table mutexes for the bulk of their protection (i.e. most
86 * fields in the arc_buf_hdr_t are protected by these mutexes).
87 *
88 * buf_hash_find() returns the appropriate mutex (held) when it
89 * locates the requested buffer in the hash table.  It returns
90 * NULL for the mutex if the buffer was not in the table.
91 *
92 * buf_hash_remove() expects the appropriate hash mutex to be
93 * already held before it is invoked.
94 *
95 * Each arc state also has a mutex which is used to protect the
96 * buffer list associated with the state.  When attempting to
97 * obtain a hash table lock while holding an arc list lock you
98 * must use mutex_tryenter() to avoid deadlock.  Also note that
99 * the active state mutex must be held before the ghost state mutex.
100 *
101 * Arc buffers may have an associated eviction callback function.
102 * This function will be invoked prior to removing the buffer (e.g.
103 * in arc_do_user_evicts()).  Note however that the data associated
104 * with the buffer may be evicted prior to the callback.  The callback
105 * must be made with *no locks held* (to prevent deadlock).  Additionally,
106 * the users of callbacks must ensure that their private data is
107 * protected from simultaneous callbacks from arc_clear_callback()
108 * and arc_do_user_evicts().
109 *
110 * Note that the majority of the performance stats are manipulated
111 * with atomic operations.
112 *
113 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
114 *
115 *	- L2ARC buflist creation
116 *	- L2ARC buflist eviction
117 *	- L2ARC write completion, which walks L2ARC buflists
118 *	- ARC header destruction, as it removes from L2ARC buflists
119 *	- ARC header release, as it removes from L2ARC buflists
120 */
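/*
 * Illustrative sketch (commentary added here, not part of the original
 * code): because of the lock ordering rules above, a thread that already
 * holds an arc list lock may only *try* to take a buffer's hash lock and
 * must skip the buffer if the attempt fails rather than block:
 *
 *	kmutex_t *hash_lock = HDR_LOCK(ab);
 *	if (!mutex_tryenter(hash_lock)) {
 *		ARCSTAT_BUMP(arcstat_mutex_miss);
 *		continue;		(skip this buffer, try the next)
 *	}
 *	... evict or move the buffer ...
 *	mutex_exit(hash_lock);
 *
 * This is the pattern the eviction code follows; arcstat_mutex_miss
 * (defined below) counts how often the tryenter fails.
 */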
121
122#include <sys/spa.h>
123#include <sys/zio.h>
124#include <sys/zio_compress.h>
125#include <sys/zfs_context.h>
126#include <sys/arc.h>
127#include <sys/refcount.h>
128#include <sys/vdev.h>
129#include <sys/vdev_impl.h>
130#include <sys/dsl_pool.h>
131#ifdef _KERNEL
132#include <sys/dnlc.h>
133#endif
134#include <sys/callb.h>
135#include <sys/kstat.h>
136#include <sys/trim_map.h>
137#include <zfs_fletcher.h>
138#include <sys/sdt.h>
139
140#include <vm/vm_pageout.h>
141
142#ifdef illumos
143#ifndef _KERNEL
144/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
145boolean_t arc_watch = B_FALSE;
146int arc_procfd;
147#endif
148#endif /* illumos */
149
150static kmutex_t		arc_reclaim_thr_lock;
151static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
152static uint8_t		arc_thread_exit;
153
154#define	ARC_REDUCE_DNLC_PERCENT	3
155uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
156
157typedef enum arc_reclaim_strategy {
158	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
159	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
160} arc_reclaim_strategy_t;
161
162/*
163 * The number of iterations through arc_evict_*() before we
164 * drop & reacquire the lock.
165 */
166int arc_evict_iterations = 100;
167
168/* number of seconds before growing cache again */
169static int		arc_grow_retry = 60;
170
171/* shift of arc_c for calculating both min and max arc_p */
172static int		arc_p_min_shift = 4;
173
174/* log2(fraction of arc to reclaim) */
175static int		arc_shrink_shift = 5;
176
177/*
178 * minimum lifespan of a prefetch block in clock ticks
179 * (initialized in arc_init())
180 */
181static int		arc_min_prefetch_lifespan;
182
183/*
184 * If this percent of memory is free, don't throttle.
185 */
186int arc_lotsfree_percent = 10;
187
188static int arc_dead;
189extern int zfs_prefetch_disable;
190
191/*
192 * The arc has filled available memory and has now warmed up.
193 */
194static boolean_t arc_warm;
195
196/*
197 * These tunables are for performance analysis.
198 */
199uint64_t zfs_arc_max;
200uint64_t zfs_arc_min;
201uint64_t zfs_arc_meta_limit = 0;
202int zfs_arc_grow_retry = 0;
203int zfs_arc_shrink_shift = 0;
204int zfs_arc_p_min_shift = 0;
205int zfs_disable_dup_eviction = 0;
206uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
207
208TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);
209TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);
210TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
211TUNABLE_QUAD("vfs.zfs.arc_average_blocksize", &zfs_arc_average_blocksize);
212SYSCTL_DECL(_vfs_zfs);
213SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
214    "Maximum ARC size");
215SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
216    "Minimum ARC size");
217SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
218    &zfs_arc_average_blocksize, 0,
219    "ARC average blocksize");
220
221/*
222 * Note that buffers can be in one of 6 states:
223 *	ARC_anon	- anonymous (discussed below)
224 *	ARC_mru		- recently used, currently cached
225 *	ARC_mru_ghost	- recently used, no longer in cache
226 *	ARC_mfu		- frequently used, currently cached
227 *	ARC_mfu_ghost	- frequently used, no longer in cache
228 *	ARC_l2c_only	- exists in L2ARC but not other states
229 * When there are no active references to a buffer, it is
230 * linked onto a list in one of these arc states.  These are
231 * the only buffers that can be evicted or deleted.  Within each
232 * state there are multiple lists, one for meta-data and one for
233 * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
234 * etc.) is tracked separately so that it can be managed more
235 * explicitly: favored over data, limited explicitly.
236 *
237 * Anonymous buffers are buffers that are not associated with
238 * a DVA.  These are buffers that hold dirty block copies
239 * before they are written to stable storage.  By definition,
240 * they are "ref'd" and are considered part of arc_mru
241 * that cannot be freed.  Generally, they will acquire a DVA
242 * as they are written and migrate onto the arc_mru list.
243 *
244 * The ARC_l2c_only state is for buffers that are in the second
245 * level ARC but no longer in any of the ARC_m* lists.  The second
246 * level ARC itself may also contain buffers that are in any of
247 * the ARC_m* states - meaning that a buffer can exist in two
248 * places.  The reason for the ARC_l2c_only state is to keep the
249 * buffer header in the hash table, so that reads that hit the
250 * second level ARC benefit from these fast lookups.
251 */
252
253#define	ARCS_LOCK_PAD		CACHE_LINE_SIZE
254struct arcs_lock {
255	kmutex_t	arcs_lock;
256#ifdef _KERNEL
257	unsigned char	pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))];
258#endif
259};
260
261/*
262 * Must be a power of two for mask use to work.
263 */
265#define ARC_BUFC_NUMDATALISTS		16
266#define ARC_BUFC_NUMMETADATALISTS	16
267#define ARC_BUFC_NUMLISTS	(ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS)
268
269typedef struct arc_state {
270	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */
271	uint64_t arcs_size;	/* total amount of data in this state */
272	list_t	arcs_lists[ARC_BUFC_NUMLISTS]; /* list of evictable buffers */
273	struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE);
274} arc_state_t;
275
276#define ARCS_LOCK(s, i)	(&((s)->arcs_locks[(i)].arcs_lock))
277
278/* The 6 states: */
279static arc_state_t ARC_anon;
280static arc_state_t ARC_mru;
281static arc_state_t ARC_mru_ghost;
282static arc_state_t ARC_mfu;
283static arc_state_t ARC_mfu_ghost;
284static arc_state_t ARC_l2c_only;
285
286typedef struct arc_stats {
287	kstat_named_t arcstat_hits;
288	kstat_named_t arcstat_misses;
289	kstat_named_t arcstat_demand_data_hits;
290	kstat_named_t arcstat_demand_data_misses;
291	kstat_named_t arcstat_demand_metadata_hits;
292	kstat_named_t arcstat_demand_metadata_misses;
293	kstat_named_t arcstat_prefetch_data_hits;
294	kstat_named_t arcstat_prefetch_data_misses;
295	kstat_named_t arcstat_prefetch_metadata_hits;
296	kstat_named_t arcstat_prefetch_metadata_misses;
297	kstat_named_t arcstat_mru_hits;
298	kstat_named_t arcstat_mru_ghost_hits;
299	kstat_named_t arcstat_mfu_hits;
300	kstat_named_t arcstat_mfu_ghost_hits;
301	kstat_named_t arcstat_allocated;
302	kstat_named_t arcstat_deleted;
303	kstat_named_t arcstat_stolen;
304	kstat_named_t arcstat_recycle_miss;
305	/*
306	 * Number of buffers that could not be evicted because the hash lock
307	 * was held by another thread.  The lock may not necessarily be held
308	 * by something using the same buffer, since hash locks are shared
309	 * by multiple buffers.
310	 */
311	kstat_named_t arcstat_mutex_miss;
312	/*
313	 * Number of buffers skipped because they have I/O in progress, are
314	 * indirect prefetch buffers that have not lived long enough, or are
315	 * not from the spa we're trying to evict from.
316	 */
317	kstat_named_t arcstat_evict_skip;
318	kstat_named_t arcstat_evict_l2_cached;
319	kstat_named_t arcstat_evict_l2_eligible;
320	kstat_named_t arcstat_evict_l2_ineligible;
321	kstat_named_t arcstat_hash_elements;
322	kstat_named_t arcstat_hash_elements_max;
323	kstat_named_t arcstat_hash_collisions;
324	kstat_named_t arcstat_hash_chains;
325	kstat_named_t arcstat_hash_chain_max;
326	kstat_named_t arcstat_p;
327	kstat_named_t arcstat_c;
328	kstat_named_t arcstat_c_min;
329	kstat_named_t arcstat_c_max;
330	kstat_named_t arcstat_size;
331	kstat_named_t arcstat_hdr_size;
332	kstat_named_t arcstat_data_size;
333	kstat_named_t arcstat_other_size;
334	kstat_named_t arcstat_l2_hits;
335	kstat_named_t arcstat_l2_misses;
336	kstat_named_t arcstat_l2_feeds;
337	kstat_named_t arcstat_l2_rw_clash;
338	kstat_named_t arcstat_l2_read_bytes;
339	kstat_named_t arcstat_l2_write_bytes;
340	kstat_named_t arcstat_l2_writes_sent;
341	kstat_named_t arcstat_l2_writes_done;
342	kstat_named_t arcstat_l2_writes_error;
343	kstat_named_t arcstat_l2_writes_hdr_miss;
344	kstat_named_t arcstat_l2_evict_lock_retry;
345	kstat_named_t arcstat_l2_evict_reading;
346	kstat_named_t arcstat_l2_free_on_write;
347	kstat_named_t arcstat_l2_cdata_free_on_write;
348	kstat_named_t arcstat_l2_abort_lowmem;
349	kstat_named_t arcstat_l2_cksum_bad;
350	kstat_named_t arcstat_l2_io_error;
351	kstat_named_t arcstat_l2_size;
352	kstat_named_t arcstat_l2_asize;
353	kstat_named_t arcstat_l2_hdr_size;
354	kstat_named_t arcstat_l2_compress_successes;
355	kstat_named_t arcstat_l2_compress_zeros;
356	kstat_named_t arcstat_l2_compress_failures;
357	kstat_named_t arcstat_l2_write_trylock_fail;
358	kstat_named_t arcstat_l2_write_passed_headroom;
359	kstat_named_t arcstat_l2_write_spa_mismatch;
360	kstat_named_t arcstat_l2_write_in_l2;
361	kstat_named_t arcstat_l2_write_hdr_io_in_progress;
362	kstat_named_t arcstat_l2_write_not_cacheable;
363	kstat_named_t arcstat_l2_write_full;
364	kstat_named_t arcstat_l2_write_buffer_iter;
365	kstat_named_t arcstat_l2_write_pios;
366	kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
367	kstat_named_t arcstat_l2_write_buffer_list_iter;
368	kstat_named_t arcstat_l2_write_buffer_list_null_iter;
369	kstat_named_t arcstat_memory_throttle_count;
370	kstat_named_t arcstat_duplicate_buffers;
371	kstat_named_t arcstat_duplicate_buffers_size;
372	kstat_named_t arcstat_duplicate_reads;
373} arc_stats_t;
374
375static arc_stats_t arc_stats = {
376	{ "hits",			KSTAT_DATA_UINT64 },
377	{ "misses",			KSTAT_DATA_UINT64 },
378	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
379	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
380	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
381	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
382	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
383	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
384	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
385	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
386	{ "mru_hits",			KSTAT_DATA_UINT64 },
387	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
388	{ "mfu_hits",			KSTAT_DATA_UINT64 },
389	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
390	{ "allocated",			KSTAT_DATA_UINT64 },
391	{ "deleted",			KSTAT_DATA_UINT64 },
392	{ "stolen",			KSTAT_DATA_UINT64 },
393	{ "recycle_miss",		KSTAT_DATA_UINT64 },
394	{ "mutex_miss",			KSTAT_DATA_UINT64 },
395	{ "evict_skip",			KSTAT_DATA_UINT64 },
396	{ "evict_l2_cached",		KSTAT_DATA_UINT64 },
397	{ "evict_l2_eligible",		KSTAT_DATA_UINT64 },
398	{ "evict_l2_ineligible",	KSTAT_DATA_UINT64 },
399	{ "hash_elements",		KSTAT_DATA_UINT64 },
400	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
401	{ "hash_collisions",		KSTAT_DATA_UINT64 },
402	{ "hash_chains",		KSTAT_DATA_UINT64 },
403	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
404	{ "p",				KSTAT_DATA_UINT64 },
405	{ "c",				KSTAT_DATA_UINT64 },
406	{ "c_min",			KSTAT_DATA_UINT64 },
407	{ "c_max",			KSTAT_DATA_UINT64 },
408	{ "size",			KSTAT_DATA_UINT64 },
409	{ "hdr_size",			KSTAT_DATA_UINT64 },
410	{ "data_size",			KSTAT_DATA_UINT64 },
411	{ "other_size",			KSTAT_DATA_UINT64 },
412	{ "l2_hits",			KSTAT_DATA_UINT64 },
413	{ "l2_misses",			KSTAT_DATA_UINT64 },
414	{ "l2_feeds",			KSTAT_DATA_UINT64 },
415	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
416	{ "l2_read_bytes",		KSTAT_DATA_UINT64 },
417	{ "l2_write_bytes",		KSTAT_DATA_UINT64 },
418	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
419	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
420	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
421	{ "l2_writes_hdr_miss",		KSTAT_DATA_UINT64 },
422	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
423	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
424	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
425	{ "l2_cdata_free_on_write",	KSTAT_DATA_UINT64 },
426	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
427	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
428	{ "l2_io_error",		KSTAT_DATA_UINT64 },
429	{ "l2_size",			KSTAT_DATA_UINT64 },
430	{ "l2_asize",			KSTAT_DATA_UINT64 },
431	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
432	{ "l2_compress_successes",	KSTAT_DATA_UINT64 },
433	{ "l2_compress_zeros",		KSTAT_DATA_UINT64 },
434	{ "l2_compress_failures",	KSTAT_DATA_UINT64 },
435	{ "l2_write_trylock_fail",	KSTAT_DATA_UINT64 },
436	{ "l2_write_passed_headroom",	KSTAT_DATA_UINT64 },
437	{ "l2_write_spa_mismatch",	KSTAT_DATA_UINT64 },
438	{ "l2_write_in_l2",		KSTAT_DATA_UINT64 },
439	{ "l2_write_io_in_progress",	KSTAT_DATA_UINT64 },
440	{ "l2_write_not_cacheable",	KSTAT_DATA_UINT64 },
441	{ "l2_write_full",		KSTAT_DATA_UINT64 },
442	{ "l2_write_buffer_iter",	KSTAT_DATA_UINT64 },
443	{ "l2_write_pios",		KSTAT_DATA_UINT64 },
444	{ "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
445	{ "l2_write_buffer_list_iter",	KSTAT_DATA_UINT64 },
446	{ "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
447	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
448	{ "duplicate_buffers",		KSTAT_DATA_UINT64 },
449	{ "duplicate_buffers_size",	KSTAT_DATA_UINT64 },
450	{ "duplicate_reads",		KSTAT_DATA_UINT64 }
451};
452
453#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)
454
455#define	ARCSTAT_INCR(stat, val) \
456	atomic_add_64(&arc_stats.stat.value.ui64, (val))
457
458#define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
459#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)
460
461#define	ARCSTAT_MAX(stat, val) {					\
462	uint64_t m;							\
463	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
464	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
465		continue;						\
466}
467
468#define	ARCSTAT_MAXSTAT(stat) \
469	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
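/*
 * Commentary (added): ARCSTAT_MAX() maintains a lock-free running maximum.
 * The loop re-reads the current value and retries the compare-and-swap
 * until either the stored value is already >= val or the CAS installs val,
 * so a concurrent updater can never overwrite a larger observation with a
 * smaller one.  For example, ARCSTAT_MAX(arcstat_hash_chain_max, i) in
 * buf_hash_insert() records the longest hash chain ever walked.
 */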
470
471/*
472 * We define a macro to allow ARC hits/misses to be easily broken down by
473 * two separate conditions, giving a total of four different subtypes for
474 * each of hits and misses (so eight statistics total).
475 */
476#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
477	if (cond1) {							\
478		if (cond2) {						\
479			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
480		} else {						\
481			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
482		}							\
483	} else {							\
484		if (cond2) {						\
485			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
486		} else {						\
487			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
488		}							\
489	}
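/*
 * Example (added for illustration; it mirrors the call in arc_buf_add_ref()
 * further down):
 *
 *	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
 *	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
 *	    data, metadata, hits);
 *
 * bumps exactly one of arcstat_demand_data_hits,
 * arcstat_demand_metadata_hits, arcstat_prefetch_data_hits or
 * arcstat_prefetch_metadata_hits, depending on the two conditions.
 */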
490
491kstat_t			*arc_ksp;
492static arc_state_t	*arc_anon;
493static arc_state_t	*arc_mru;
494static arc_state_t	*arc_mru_ghost;
495static arc_state_t	*arc_mfu;
496static arc_state_t	*arc_mfu_ghost;
497static arc_state_t	*arc_l2c_only;
498
499/*
500 * There are several ARC variables that are critical to export as kstats --
501 * but we don't want to have to grovel around in the kstat whenever we wish to
502 * manipulate them.  For these variables, we therefore define them to be in
503 * terms of the statistic variable.  This assures that we are not introducing
504 * the possibility of inconsistency by having shadow copies of the variables,
505 * while still allowing the code to be readable.
506 */
507#define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
508#define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
509#define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
510#define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
511#define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
512
513#define	L2ARC_IS_VALID_COMPRESS(_c_) \
514	((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
515
516static int		arc_no_grow;	/* Don't try to grow cache size */
517static uint64_t		arc_tempreserve;
518static uint64_t		arc_loaned_bytes;
519static uint64_t		arc_meta_used;
520static uint64_t		arc_meta_limit;
521static uint64_t		arc_meta_max = 0;
522SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_meta_used, CTLFLAG_RD, &arc_meta_used, 0,
523    "ARC metadata used");
524SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_meta_limit, CTLFLAG_RW, &arc_meta_limit, 0,
525    "ARC metadata limit");
526
527typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
528
529typedef struct arc_callback arc_callback_t;
530
531struct arc_callback {
532	void			*acb_private;
533	arc_done_func_t		*acb_done;
534	arc_buf_t		*acb_buf;
535	zio_t			*acb_zio_dummy;
536	arc_callback_t		*acb_next;
537};
538
539typedef struct arc_write_callback arc_write_callback_t;
540
541struct arc_write_callback {
542	void		*awcb_private;
543	arc_done_func_t	*awcb_ready;
544	arc_done_func_t	*awcb_physdone;
545	arc_done_func_t	*awcb_done;
546	arc_buf_t	*awcb_buf;
547};
548
549struct arc_buf_hdr {
550	/* protected by hash lock */
551	dva_t			b_dva;
552	uint64_t		b_birth;
553	uint64_t		b_cksum0;
554
555	kmutex_t		b_freeze_lock;
556	zio_cksum_t		*b_freeze_cksum;
557	void			*b_thawed;
558
559	arc_buf_hdr_t		*b_hash_next;
560	arc_buf_t		*b_buf;
561	uint32_t		b_flags;
562	uint32_t		b_datacnt;
563
564	arc_callback_t		*b_acb;
565	kcondvar_t		b_cv;
566
567	/* immutable */
568	arc_buf_contents_t	b_type;
569	uint64_t		b_size;
570	uint64_t		b_spa;
571
572	/* protected by arc state mutex */
573	arc_state_t		*b_state;
574	list_node_t		b_arc_node;
575
576	/* updated atomically */
577	clock_t			b_arc_access;
578
579	/* self protecting */
580	refcount_t		b_refcnt;
581
582	l2arc_buf_hdr_t		*b_l2hdr;
583	list_node_t		b_l2node;
584};
585
586static arc_buf_t *arc_eviction_list;
587static kmutex_t arc_eviction_mtx;
588static arc_buf_hdr_t arc_eviction_hdr;
589static void arc_get_data_buf(arc_buf_t *buf);
590static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
591static int arc_evict_needed(arc_buf_contents_t type);
592static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
593#ifdef illumos
594static void arc_buf_watch(arc_buf_t *buf);
595#endif /* illumos */
596
597static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
598
599#define	GHOST_STATE(state)	\
600	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
601	(state) == arc_l2c_only)
602
603/*
604 * Private ARC flags.  These flags are private, ARC-only flags that will show up
605 * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
606 * be passed in as arc_flags in things like arc_read.  However, these flags
607 * should never be passed and should only be set by ARC code.  When adding new
608 * public flags, make sure not to smash the private ones.
609 */
610
611#define	ARC_IN_HASH_TABLE	(1 << 9)	/* this buffer is hashed */
612#define	ARC_IO_IN_PROGRESS	(1 << 10)	/* I/O in progress for buf */
613#define	ARC_IO_ERROR		(1 << 11)	/* I/O failed for buf */
614#define	ARC_FREED_IN_READ	(1 << 12)	/* buf freed while in read */
615#define	ARC_BUF_AVAILABLE	(1 << 13)	/* block not in active use */
616#define	ARC_INDIRECT		(1 << 14)	/* this is an indirect block */
617#define	ARC_FREE_IN_PROGRESS	(1 << 15)	/* hdr about to be freed */
618#define	ARC_L2_WRITING		(1 << 16)	/* L2ARC write in progress */
619#define	ARC_L2_EVICTED		(1 << 17)	/* evicted during I/O */
620#define	ARC_L2_WRITE_HEAD	(1 << 18)	/* head of write list */
621
622#define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_IN_HASH_TABLE)
623#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
624#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_IO_ERROR)
625#define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_PREFETCH)
626#define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FREED_IN_READ)
627#define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_BUF_AVAILABLE)
628#define	HDR_FREE_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
629#define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_L2CACHE)
630#define	HDR_L2_READING(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS &&	\
631				    (hdr)->b_l2hdr != NULL)
632#define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_L2_WRITING)
633#define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_L2_EVICTED)
634#define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_L2_WRITE_HEAD)
635
636/*
637 * Other sizes
638 */
639
640#define	HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
641#define	L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
642
643/*
644 * Hash table routines
645 */
646
647#define	HT_LOCK_PAD	CACHE_LINE_SIZE
648
649struct ht_lock {
650	kmutex_t	ht_lock;
651#ifdef _KERNEL
652	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
653#endif
654};
655
656#define	BUF_LOCKS 256
657typedef struct buf_hash_table {
658	uint64_t ht_mask;
659	arc_buf_hdr_t **ht_table;
660	struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
661} buf_hash_table_t;
662
663static buf_hash_table_t buf_hash_table;
664
665#define	BUF_HASH_INDEX(spa, dva, birth) \
666	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
667#define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
668#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
669#define	HDR_LOCK(hdr) \
670	(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
671
672uint64_t zfs_crc64_table[256];
673
674/*
675 * Level 2 ARC
676 */
677
678#define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
679#define	L2ARC_HEADROOM		2			/* num of writes */
680/*
681 * If we discover during ARC scan any buffers to be compressed, we boost
682 * our headroom for the next scanning cycle by this percentage multiple.
683 */
684#define	L2ARC_HEADROOM_BOOST	200
685#define	L2ARC_FEED_SECS		1		/* caching interval secs */
686#define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */
687
688#define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
689#define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)
690
691/* L2ARC Performance Tunables */
692uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
693uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
694uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
695uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
696uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
697uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval milliseconds */
698boolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
699boolean_t l2arc_feed_again = B_TRUE;		/* turbo warmup */
700boolean_t l2arc_norw = B_TRUE;			/* no reads during writes */
701
702SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
703    &l2arc_write_max, 0, "max write size");
704SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
705    &l2arc_write_boost, 0, "extra write during warmup");
706SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
707    &l2arc_headroom, 0, "number of dev writes");
708SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
709    &l2arc_feed_secs, 0, "interval seconds");
710SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
711    &l2arc_feed_min_ms, 0, "min interval milliseconds");
712
713SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
714    &l2arc_noprefetch, 0, "don't cache prefetch bufs");
715SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
716    &l2arc_feed_again, 0, "turbo warmup");
717SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
718    &l2arc_norw, 0, "no reads during writes");
719
720SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
721    &ARC_anon.arcs_size, 0, "size of anonymous state");
722SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD,
723    &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in anonymous state");
724SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD,
725    &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in anonymous state");
726
727SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
728    &ARC_mru.arcs_size, 0, "size of mru state");
729SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD,
730    &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state");
731SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD,
732    &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state");
733
734SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
735    &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state");
736SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD,
737    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
738    "size of metadata in mru ghost state");
739SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD,
740    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
741    "size of data in mru ghost state");
742
743SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
744    &ARC_mfu.arcs_size, 0, "size of mfu state");
745SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD,
746    &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state");
747SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD,
748    &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state");
749
750SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
751    &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state");
752SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD,
753    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
754    "size of metadata in mfu ghost state");
755SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD,
756    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
757    "size of data in mfu ghost state");
758
759SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
760    &ARC_l2c_only.arcs_size, 0, "size of l2c_only state");
761
762/*
763 * L2ARC Internals
764 */
765typedef struct l2arc_dev {
766	vdev_t			*l2ad_vdev;	/* vdev */
767	spa_t			*l2ad_spa;	/* spa */
768	uint64_t		l2ad_hand;	/* next write location */
769	uint64_t		l2ad_start;	/* first addr on device */
770	uint64_t		l2ad_end;	/* last addr on device */
771	uint64_t		l2ad_evict;	/* last addr eviction reached */
772	boolean_t		l2ad_first;	/* first sweep through */
773	boolean_t		l2ad_writing;	/* currently writing */
774	list_t			*l2ad_buflist;	/* buffer list */
775	list_node_t		l2ad_node;	/* device list node */
776} l2arc_dev_t;
777
778static list_t L2ARC_dev_list;			/* device list */
779static list_t *l2arc_dev_list;			/* device list pointer */
780static kmutex_t l2arc_dev_mtx;			/* device list mutex */
781static l2arc_dev_t *l2arc_dev_last;		/* last device used */
782static kmutex_t l2arc_buflist_mtx;		/* mutex for all buflists */
783static list_t L2ARC_free_on_write;		/* free after write buf list */
784static list_t *l2arc_free_on_write;		/* free after write list ptr */
785static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
786static uint64_t l2arc_ndev;			/* number of devices */
787
788typedef struct l2arc_read_callback {
789	arc_buf_t		*l2rcb_buf;		/* read buffer */
790	spa_t			*l2rcb_spa;		/* spa */
791	blkptr_t		l2rcb_bp;		/* original blkptr */
792	zbookmark_phys_t	l2rcb_zb;		/* original bookmark */
793	int			l2rcb_flags;		/* original flags */
794	enum zio_compress	l2rcb_compress;		/* applied compress */
795} l2arc_read_callback_t;
796
797typedef struct l2arc_write_callback {
798	l2arc_dev_t	*l2wcb_dev;		/* device info */
799	arc_buf_hdr_t	*l2wcb_head;		/* head of write buflist */
800} l2arc_write_callback_t;
801
802struct l2arc_buf_hdr {
803	/* protected by arc_buf_hdr  mutex */
804	l2arc_dev_t		*b_dev;		/* L2ARC device */
805	uint64_t		b_daddr;	/* disk address, offset byte */
806	/* compression applied to buffer data */
807	enum zio_compress	b_compress;
808	/* real alloc'd buffer size depending on b_compress applied */
809	int			b_asize;
810	/* temporary buffer holder for in-flight compressed data */
811	void			*b_tmp_cdata;
812};
813
814typedef struct l2arc_data_free {
815	/* protected by l2arc_free_on_write_mtx */
816	void		*l2df_data;
817	size_t		l2df_size;
818	void		(*l2df_func)(void *, size_t);
819	list_node_t	l2df_list_node;
820} l2arc_data_free_t;
821
822static kmutex_t l2arc_feed_thr_lock;
823static kcondvar_t l2arc_feed_thr_cv;
824static uint8_t l2arc_thread_exit;
825
826static void l2arc_read_done(zio_t *zio);
827static void l2arc_hdr_stat_add(void);
828static void l2arc_hdr_stat_remove(void);
829
830static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
831static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
832    enum zio_compress c);
833static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
834
835static uint64_t
836buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
837{
838	uint8_t *vdva = (uint8_t *)dva;
839	uint64_t crc = -1ULL;
840	int i;
841
842	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
843
844	for (i = 0; i < sizeof (dva_t); i++)
845		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
846
847	crc ^= (spa>>8) ^ birth;
848
849	return (crc);
850}
851
852#define	BUF_EMPTY(buf)						\
853	((buf)->b_dva.dva_word[0] == 0 &&			\
854	(buf)->b_dva.dva_word[1] == 0 &&			\
855	(buf)->b_cksum0 == 0)
856
857#define	BUF_EQUAL(spa, dva, birth, buf)				\
858	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
859	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
860	((buf)->b_birth == birth) && ((buf)->b_spa == spa)
861
862static void
863buf_discard_identity(arc_buf_hdr_t *hdr)
864{
865	hdr->b_dva.dva_word[0] = 0;
866	hdr->b_dva.dva_word[1] = 0;
867	hdr->b_birth = 0;
868	hdr->b_cksum0 = 0;
869}
870
871static arc_buf_hdr_t *
872buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
873{
874	const dva_t *dva = BP_IDENTITY(bp);
875	uint64_t birth = BP_PHYSICAL_BIRTH(bp);
876	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
877	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
878	arc_buf_hdr_t *buf;
879
880	mutex_enter(hash_lock);
881	for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
882	    buf = buf->b_hash_next) {
883		if (BUF_EQUAL(spa, dva, birth, buf)) {
884			*lockp = hash_lock;
885			return (buf);
886		}
887	}
888	mutex_exit(hash_lock);
889	*lockp = NULL;
890	return (NULL);
891}
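/*
 * Sketch of the calling convention above (illustrative; arc_read() and
 * friends follow this pattern):
 *
 *	kmutex_t *hash_lock;
 *	arc_buf_hdr_t *hdr = buf_hash_find(guid, bp, &hash_lock);
 *	if (hdr != NULL) {
 *		... hdr fields are stable while hash_lock is held ...
 *		mutex_exit(hash_lock);
 *	}
 *
 * On a miss, *lockp is set to NULL and no lock is held on return.
 */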
892
893/*
894 * Insert an entry into the hash table.  If there is already an element
895 * equal to elem in the hash table, then the already existing element
896 * will be returned and the new element will not be inserted.
897 * Otherwise returns NULL.
898 */
899static arc_buf_hdr_t *
900buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
901{
902	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
903	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
904	arc_buf_hdr_t *fbuf;
905	uint32_t i;
906
907	ASSERT(!DVA_IS_EMPTY(&buf->b_dva));
908	ASSERT(buf->b_birth != 0);
909	ASSERT(!HDR_IN_HASH_TABLE(buf));
910	*lockp = hash_lock;
911	mutex_enter(hash_lock);
912	for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
913	    fbuf = fbuf->b_hash_next, i++) {
914		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
915			return (fbuf);
916	}
917
918	buf->b_hash_next = buf_hash_table.ht_table[idx];
919	buf_hash_table.ht_table[idx] = buf;
920	buf->b_flags |= ARC_IN_HASH_TABLE;
921
922	/* collect some hash table performance data */
923	if (i > 0) {
924		ARCSTAT_BUMP(arcstat_hash_collisions);
925		if (i == 1)
926			ARCSTAT_BUMP(arcstat_hash_chains);
927
928		ARCSTAT_MAX(arcstat_hash_chain_max, i);
929	}
930
931	ARCSTAT_BUMP(arcstat_hash_elements);
932	ARCSTAT_MAXSTAT(arcstat_hash_elements);
933
934	return (NULL);
935}
936
937static void
938buf_hash_remove(arc_buf_hdr_t *buf)
939{
940	arc_buf_hdr_t *fbuf, **bufp;
941	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
942
943	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
944	ASSERT(HDR_IN_HASH_TABLE(buf));
945
946	bufp = &buf_hash_table.ht_table[idx];
947	while ((fbuf = *bufp) != buf) {
948		ASSERT(fbuf != NULL);
949		bufp = &fbuf->b_hash_next;
950	}
951	*bufp = buf->b_hash_next;
952	buf->b_hash_next = NULL;
953	buf->b_flags &= ~ARC_IN_HASH_TABLE;
954
955	/* collect some hash table performance data */
956	ARCSTAT_BUMPDOWN(arcstat_hash_elements);
957
958	if (buf_hash_table.ht_table[idx] &&
959	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
960		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
961}
962
963/*
964 * Global data structures and functions for the buf kmem cache.
965 */
966static kmem_cache_t *hdr_cache;
967static kmem_cache_t *buf_cache;
968
969static void
970buf_fini(void)
971{
972	int i;
973
974	kmem_free(buf_hash_table.ht_table,
975	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
976	for (i = 0; i < BUF_LOCKS; i++)
977		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
978	kmem_cache_destroy(hdr_cache);
979	kmem_cache_destroy(buf_cache);
980}
981
982/*
983 * Constructor callback - called when the cache is empty
984 * and a new buf is requested.
985 */
986/* ARGSUSED */
987static int
988hdr_cons(void *vbuf, void *unused, int kmflag)
989{
990	arc_buf_hdr_t *buf = vbuf;
991
992	bzero(buf, sizeof (arc_buf_hdr_t));
993	refcount_create(&buf->b_refcnt);
994	cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
995	mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
996	arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
997
998	return (0);
999}
1000
1001/* ARGSUSED */
1002static int
1003buf_cons(void *vbuf, void *unused, int kmflag)
1004{
1005	arc_buf_t *buf = vbuf;
1006
1007	bzero(buf, sizeof (arc_buf_t));
1008	mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
1009	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1010
1011	return (0);
1012}
1013
1014/*
1015 * Destructor callback - called when a cached buf is
1016 * no longer required.
1017 */
1018/* ARGSUSED */
1019static void
1020hdr_dest(void *vbuf, void *unused)
1021{
1022	arc_buf_hdr_t *buf = vbuf;
1023
1024	ASSERT(BUF_EMPTY(buf));
1025	refcount_destroy(&buf->b_refcnt);
1026	cv_destroy(&buf->b_cv);
1027	mutex_destroy(&buf->b_freeze_lock);
1028	arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
1029}
1030
1031/* ARGSUSED */
1032static void
1033buf_dest(void *vbuf, void *unused)
1034{
1035	arc_buf_t *buf = vbuf;
1036
1037	mutex_destroy(&buf->b_evict_lock);
1038	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1039}
1040
1041/*
1042 * Reclaim callback -- invoked when memory is low.
1043 */
1044/* ARGSUSED */
1045static void
1046hdr_recl(void *unused)
1047{
1048	dprintf("hdr_recl called\n");
1049	/*
1050	 * umem calls the reclaim func when we destroy the buf cache,
1051	 * which is after we do arc_fini().
1052	 */
1053	if (!arc_dead)
1054		cv_signal(&arc_reclaim_thr_cv);
1055}
1056
1057static void
1058buf_init(void)
1059{
1060	uint64_t *ct;
1061	uint64_t hsize = 1ULL << 12;
1062	int i, j;
1063
1064	/*
1065	 * The hash table is big enough to fill all of physical memory
1066	 * with an average block size of zfs_arc_average_blocksize (default 8K).
1067	 * By default, the table will take up
1068	 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
1069	 */
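	/*
	 * Worked example (added, assuming the 8K default): with 8 GB of
	 * physical memory, hsize doubles from 2^12 until hsize * 8K >= 8 GB,
	 * i.e. hsize = 2^20 buckets, costing 2^20 * sizeof (void *) = 8 MB
	 * of table -- the ~1 MB per GB noted above for 8-byte pointers.
	 */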
1070	while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE)
1071		hsize <<= 1;
1072retry:
1073	buf_hash_table.ht_mask = hsize - 1;
1074	buf_hash_table.ht_table =
1075	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
1076	if (buf_hash_table.ht_table == NULL) {
1077		ASSERT(hsize > (1ULL << 8));
1078		hsize >>= 1;
1079		goto retry;
1080	}
1081
1082	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
1083	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
1084	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
1085	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
1086
1087	for (i = 0; i < 256; i++)
1088		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
1089			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
1090
1091	for (i = 0; i < BUF_LOCKS; i++) {
1092		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
1093		    NULL, MUTEX_DEFAULT, NULL);
1094	}
1095}
1096
1097#define	ARC_MINTIME	(hz>>4) /* 62 ms */
1098
1099static void
1100arc_cksum_verify(arc_buf_t *buf)
1101{
1102	zio_cksum_t zc;
1103
1104	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1105		return;
1106
1107	mutex_enter(&buf->b_hdr->b_freeze_lock);
1108	if (buf->b_hdr->b_freeze_cksum == NULL ||
1109	    (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
1110		mutex_exit(&buf->b_hdr->b_freeze_lock);
1111		return;
1112	}
1113	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1114	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
1115		panic("buffer modified while frozen!");
1116	mutex_exit(&buf->b_hdr->b_freeze_lock);
1117}
1118
1119static int
1120arc_cksum_equal(arc_buf_t *buf)
1121{
1122	zio_cksum_t zc;
1123	int equal;
1124
1125	mutex_enter(&buf->b_hdr->b_freeze_lock);
1126	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1127	equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
1128	mutex_exit(&buf->b_hdr->b_freeze_lock);
1129
1130	return (equal);
1131}
1132
1133static void
1134arc_cksum_compute(arc_buf_t *buf, boolean_t force)
1135{
1136	if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
1137		return;
1138
1139	mutex_enter(&buf->b_hdr->b_freeze_lock);
1140	if (buf->b_hdr->b_freeze_cksum != NULL) {
1141		mutex_exit(&buf->b_hdr->b_freeze_lock);
1142		return;
1143	}
1144	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
1145	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1146	    buf->b_hdr->b_freeze_cksum);
1147	mutex_exit(&buf->b_hdr->b_freeze_lock);
1148#ifdef illumos
1149	arc_buf_watch(buf);
1150#endif /* illumos */
1151}
1152
1153#ifdef illumos
1154#ifndef _KERNEL
1155typedef struct procctl {
1156	long cmd;
1157	prwatch_t prwatch;
1158} procctl_t;
1159#endif
1160
1161/* ARGSUSED */
1162static void
1163arc_buf_unwatch(arc_buf_t *buf)
1164{
1165#ifndef _KERNEL
1166	if (arc_watch) {
1167		int result;
1168		procctl_t ctl;
1169		ctl.cmd = PCWATCH;
1170		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1171		ctl.prwatch.pr_size = 0;
1172		ctl.prwatch.pr_wflags = 0;
1173		result = write(arc_procfd, &ctl, sizeof (ctl));
1174		ASSERT3U(result, ==, sizeof (ctl));
1175	}
1176#endif
1177}
1178
1179/* ARGSUSED */
1180static void
1181arc_buf_watch(arc_buf_t *buf)
1182{
1183#ifndef _KERNEL
1184	if (arc_watch) {
1185		int result;
1186		procctl_t ctl;
1187		ctl.cmd = PCWATCH;
1188		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1189		ctl.prwatch.pr_size = buf->b_hdr->b_size;
1190		ctl.prwatch.pr_wflags = WA_WRITE;
1191		result = write(arc_procfd, &ctl, sizeof (ctl));
1192		ASSERT3U(result, ==, sizeof (ctl));
1193	}
1194#endif
1195}
1196#endif /* illumos */
1197
1198void
1199arc_buf_thaw(arc_buf_t *buf)
1200{
1201	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1202		if (buf->b_hdr->b_state != arc_anon)
1203			panic("modifying non-anon buffer!");
1204		if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
1205			panic("modifying buffer while i/o in progress!");
1206		arc_cksum_verify(buf);
1207	}
1208
1209	mutex_enter(&buf->b_hdr->b_freeze_lock);
1210	if (buf->b_hdr->b_freeze_cksum != NULL) {
1211		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1212		buf->b_hdr->b_freeze_cksum = NULL;
1213	}
1214
1215	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1216		if (buf->b_hdr->b_thawed)
1217			kmem_free(buf->b_hdr->b_thawed, 1);
1218		buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
1219	}
1220
1221	mutex_exit(&buf->b_hdr->b_freeze_lock);
1222
1223#ifdef illumos
1224	arc_buf_unwatch(buf);
1225#endif /* illumos */
1226}
1227
1228void
1229arc_buf_freeze(arc_buf_t *buf)
1230{
1231	kmutex_t *hash_lock;
1232
1233	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1234		return;
1235
1236	hash_lock = HDR_LOCK(buf->b_hdr);
1237	mutex_enter(hash_lock);
1238
1239	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1240	    buf->b_hdr->b_state == arc_anon);
1241	arc_cksum_compute(buf, B_FALSE);
1242	mutex_exit(hash_lock);
1243
1244}
1245
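/*
 * Map buffer 'ab' to the sublist (and the lock that protects it) within
 * 'state' that the buffer hashes to.  Metadata buffers land in the first
 * ARC_BUFC_NUMMETADATALISTS lists; data buffers are offset into the
 * remaining ARC_BUFC_NUMDATALISTS lists.
 */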
1246static void
1247get_buf_info(arc_buf_hdr_t *ab, arc_state_t *state, list_t **list, kmutex_t **lock)
1248{
1249	uint64_t buf_hashid = buf_hash(ab->b_spa, &ab->b_dva, ab->b_birth);
1250
1251	if (ab->b_type == ARC_BUFC_METADATA)
1252		buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1);
1253	else {
1254		buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1);
1255		buf_hashid += ARC_BUFC_NUMMETADATALISTS;
1256	}
1257
1258	*list = &state->arcs_lists[buf_hashid];
1259	*lock = ARCS_LOCK(state, buf_hashid);
1260}
1261
1262
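/*
 * Add a reference to 'ab' under its hash lock.  On the 0 -> 1 transition
 * of a non-anonymous header, the header is removed from its state's
 * evictable list and the state's evictable size (arcs_lsize) is reduced;
 * a referenced buffer also loses its ARC_PREFETCH flag.
 */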
1263static void
1264add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1265{
1266	ASSERT(MUTEX_HELD(hash_lock));
1267
1268	if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
1269	    (ab->b_state != arc_anon)) {
1270		uint64_t delta = ab->b_size * ab->b_datacnt;
1271		uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
1272		list_t *list;
1273		kmutex_t *lock;
1274
1275		get_buf_info(ab, ab->b_state, &list, &lock);
1276		ASSERT(!MUTEX_HELD(lock));
1277		mutex_enter(lock);
1278		ASSERT(list_link_active(&ab->b_arc_node));
1279		list_remove(list, ab);
1280		if (GHOST_STATE(ab->b_state)) {
1281			ASSERT0(ab->b_datacnt);
1282			ASSERT3P(ab->b_buf, ==, NULL);
1283			delta = ab->b_size;
1284		}
1285		ASSERT(delta > 0);
1286		ASSERT3U(*size, >=, delta);
1287		atomic_add_64(size, -delta);
1288		mutex_exit(lock);
1289		/* remove the prefetch flag if we get a reference */
1290		if (ab->b_flags & ARC_PREFETCH)
1291			ab->b_flags &= ~ARC_PREFETCH;
1292	}
1293}
1294
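/*
 * Drop a reference to 'ab'.  On the 1 -> 0 transition of a non-anonymous
 * header, the header becomes evictable again: it is inserted at the head
 * of its state's list and arcs_lsize is increased.  Returns the remaining
 * reference count.
 */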
1295static int
1296remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1297{
1298	int cnt;
1299	arc_state_t *state = ab->b_state;
1300
1301	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1302	ASSERT(!GHOST_STATE(state));
1303
1304	if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
1305	    (state != arc_anon)) {
1306		uint64_t *size = &state->arcs_lsize[ab->b_type];
1307		list_t *list;
1308		kmutex_t *lock;
1309
1310		get_buf_info(ab, state, &list, &lock);
1311		ASSERT(!MUTEX_HELD(lock));
1312		mutex_enter(lock);
1313		ASSERT(!list_link_active(&ab->b_arc_node));
1314		list_insert_head(list, ab);
1315		ASSERT(ab->b_datacnt > 0);
1316		atomic_add_64(size, ab->b_size * ab->b_datacnt);
1317		mutex_exit(lock);
1318	}
1319	return (cnt);
1320}
1321
1322/*
1323 * Move the supplied buffer to the indicated state.  The mutex
1324 * for the buffer must be held by the caller.
1325 */
1326static void
1327arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1328{
1329	arc_state_t *old_state = ab->b_state;
1330	int64_t refcnt = refcount_count(&ab->b_refcnt);
1331	uint64_t from_delta, to_delta;
1332	list_t *list;
1333	kmutex_t *lock;
1334
1335	ASSERT(MUTEX_HELD(hash_lock));
1336	ASSERT3P(new_state, !=, old_state);
1337	ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1338	ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1339	ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
1340
1341	from_delta = to_delta = ab->b_datacnt * ab->b_size;
1342
1343	/*
1344	 * If this buffer is evictable, transfer it from the
1345	 * old state list to the new state list.
1346	 */
1347	if (refcnt == 0) {
1348		if (old_state != arc_anon) {
1349			int use_mutex;
1350			uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1351
1352			get_buf_info(ab, old_state, &list, &lock);
1353			use_mutex = !MUTEX_HELD(lock);
1354			if (use_mutex)
1355				mutex_enter(lock);
1356
1357			ASSERT(list_link_active(&ab->b_arc_node));
1358			list_remove(list, ab);
1359
1360			/*
1361			 * If prefetching out of the ghost cache,
1362			 * we will have a non-zero datacnt.
1363			 */
1364			if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
1365				/* ghost elements have a ghost size */
1366				ASSERT(ab->b_buf == NULL);
1367				from_delta = ab->b_size;
1368			}
1369			ASSERT3U(*size, >=, from_delta);
1370			atomic_add_64(size, -from_delta);
1371
1372			if (use_mutex)
1373				mutex_exit(lock);
1374		}
1375		if (new_state != arc_anon) {
1376			int use_mutex;
1377			uint64_t *size = &new_state->arcs_lsize[ab->b_type];
1378
1379			get_buf_info(ab, new_state, &list, &lock);
1380			use_mutex = !MUTEX_HELD(lock);
1381			if (use_mutex)
1382				mutex_enter(lock);
1383
1384			list_insert_head(list, ab);
1385
1386			/* ghost elements have a ghost size */
1387			if (GHOST_STATE(new_state)) {
1388				ASSERT(ab->b_datacnt == 0);
1389				ASSERT(ab->b_buf == NULL);
1390				to_delta = ab->b_size;
1391			}
1392			atomic_add_64(size, to_delta);
1393
1394			if (use_mutex)
1395				mutex_exit(lock);
1396		}
1397	}
1398
1399	ASSERT(!BUF_EMPTY(ab));
1400	if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
1401		buf_hash_remove(ab);
1402
1403	/* adjust state sizes */
1404	if (to_delta)
1405		atomic_add_64(&new_state->arcs_size, to_delta);
1406	if (from_delta) {
1407		ASSERT3U(old_state->arcs_size, >=, from_delta);
1408		atomic_add_64(&old_state->arcs_size, -from_delta);
1409	}
1410	ab->b_state = new_state;
1411
1412	/* adjust l2arc hdr stats */
1413	if (new_state == arc_l2c_only)
1414		l2arc_hdr_stat_add();
1415	else if (old_state == arc_l2c_only)
1416		l2arc_hdr_stat_remove();
1417}
1418
1419void
1420arc_space_consume(uint64_t space, arc_space_type_t type)
1421{
1422	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1423
1424	switch (type) {
1425	case ARC_SPACE_DATA:
1426		ARCSTAT_INCR(arcstat_data_size, space);
1427		break;
1428	case ARC_SPACE_OTHER:
1429		ARCSTAT_INCR(arcstat_other_size, space);
1430		break;
1431	case ARC_SPACE_HDRS:
1432		ARCSTAT_INCR(arcstat_hdr_size, space);
1433		break;
1434	case ARC_SPACE_L2HDRS:
1435		ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1436		break;
1437	}
1438
1439	atomic_add_64(&arc_meta_used, space);
1440	atomic_add_64(&arc_size, space);
1441}
1442
1443void
1444arc_space_return(uint64_t space, arc_space_type_t type)
1445{
1446	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1447
1448	switch (type) {
1449	case ARC_SPACE_DATA:
1450		ARCSTAT_INCR(arcstat_data_size, -space);
1451		break;
1452	case ARC_SPACE_OTHER:
1453		ARCSTAT_INCR(arcstat_other_size, -space);
1454		break;
1455	case ARC_SPACE_HDRS:
1456		ARCSTAT_INCR(arcstat_hdr_size, -space);
1457		break;
1458	case ARC_SPACE_L2HDRS:
1459		ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1460		break;
1461	}
1462
1463	ASSERT(arc_meta_used >= space);
1464	if (arc_meta_max < arc_meta_used)
1465		arc_meta_max = arc_meta_used;
1466	atomic_add_64(&arc_meta_used, -space);
1467	ASSERT(arc_size >= space);
1468	atomic_add_64(&arc_size, -space);
1469}
1470
1471void *
1472arc_data_buf_alloc(uint64_t size)
1473{
1474	if (arc_evict_needed(ARC_BUFC_DATA))
1475		cv_signal(&arc_reclaim_thr_cv);
1476	atomic_add_64(&arc_size, size);
1477	return (zio_data_buf_alloc(size));
1478}
1479
1480void
1481arc_data_buf_free(void *buf, uint64_t size)
1482{
1483	zio_data_buf_free(buf, size);
1484	ASSERT(arc_size >= size);
1485	atomic_add_64(&arc_size, -size);
1486}
1487
1488arc_buf_t *
1489arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1490{
1491	arc_buf_hdr_t *hdr;
1492	arc_buf_t *buf;
1493
1494	ASSERT3U(size, >, 0);
1495	hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1496	ASSERT(BUF_EMPTY(hdr));
1497	hdr->b_size = size;
1498	hdr->b_type = type;
1499	hdr->b_spa = spa_load_guid(spa);
1500	hdr->b_state = arc_anon;
1501	hdr->b_arc_access = 0;
1502	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1503	buf->b_hdr = hdr;
1504	buf->b_data = NULL;
1505	buf->b_efunc = NULL;
1506	buf->b_private = NULL;
1507	buf->b_next = NULL;
1508	hdr->b_buf = buf;
1509	arc_get_data_buf(buf);
1510	hdr->b_datacnt = 1;
1511	hdr->b_flags = 0;
1512	ASSERT(refcount_is_zero(&hdr->b_refcnt));
1513	(void) refcount_add(&hdr->b_refcnt, tag);
1514
1515	return (buf);
1516}
1517
1518static char *arc_onloan_tag = "onloan";
1519
1520/*
1521 * Loan out an anonymous arc buffer.  Loaned buffers are not counted as
1522 * in-flight data by arc_tempreserve_space() until they are "returned".
1523 * Loaned buffers must be returned to the arc before they can be used by
1524 * the DMU or freed.
1525 */
1526arc_buf_t *
1527arc_loan_buf(spa_t *spa, int size)
1528{
1529	arc_buf_t *buf;
1530
1531	buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1532
1533	atomic_add_64(&arc_loaned_bytes, size);
1534	return (buf);
1535}
1536
1537/*
1538 * Return a loaned arc buffer to the arc.
1539 */
1540void
1541arc_return_buf(arc_buf_t *buf, void *tag)
1542{
1543	arc_buf_hdr_t *hdr = buf->b_hdr;
1544
1545	ASSERT(buf->b_data != NULL);
1546	(void) refcount_add(&hdr->b_refcnt, tag);
1547	(void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
1548
1549	atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1550}
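/*
 * Sketch of the loan lifecycle (illustrative only; the real callers are in
 * the DMU):
 *
 *	arc_buf_t *buf = arc_loan_buf(spa, size);	held by arc_onloan_tag
 *	... fill buf->b_data with dirty data ...
 *	arc_return_buf(buf, tag);			ownership moves to tag
 *
 * arc_loaned_bytes tracks how much data is currently out on loan.
 */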
1551
1552/* Detach an arc_buf from a dbuf (tag) */
1553void
1554arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1555{
1556	arc_buf_hdr_t *hdr;
1557
1558	ASSERT(buf->b_data != NULL);
1559	hdr = buf->b_hdr;
1560	(void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
1561	(void) refcount_remove(&hdr->b_refcnt, tag);
1562	buf->b_efunc = NULL;
1563	buf->b_private = NULL;
1564
1565	atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1566}
1567
1568static arc_buf_t *
1569arc_buf_clone(arc_buf_t *from)
1570{
1571	arc_buf_t *buf;
1572	arc_buf_hdr_t *hdr = from->b_hdr;
1573	uint64_t size = hdr->b_size;
1574
1575	ASSERT(hdr->b_state != arc_anon);
1576
1577	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1578	buf->b_hdr = hdr;
1579	buf->b_data = NULL;
1580	buf->b_efunc = NULL;
1581	buf->b_private = NULL;
1582	buf->b_next = hdr->b_buf;
1583	hdr->b_buf = buf;
1584	arc_get_data_buf(buf);
1585	bcopy(from->b_data, buf->b_data, size);
1586
1587	/*
1588	 * This buffer already exists in the arc so create a duplicate
1589	 * copy for the caller.  If the buffer is associated with user data
1590	 * then track the size and number of duplicates.  These stats will be
1591	 * updated as duplicate buffers are created and destroyed.
1592	 */
1593	if (hdr->b_type == ARC_BUFC_DATA) {
1594		ARCSTAT_BUMP(arcstat_duplicate_buffers);
1595		ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1596	}
1597	hdr->b_datacnt += 1;
1598	return (buf);
1599}
1600
1601void
1602arc_buf_add_ref(arc_buf_t *buf, void* tag)
1603{
1604	arc_buf_hdr_t *hdr;
1605	kmutex_t *hash_lock;
1606
1607	/*
1608	 * Check to see if this buffer is evicted.  Callers
1609	 * must verify b_data != NULL to know if the add_ref
1610	 * was successful.
1611	 */
1612	mutex_enter(&buf->b_evict_lock);
1613	if (buf->b_data == NULL) {
1614		mutex_exit(&buf->b_evict_lock);
1615		return;
1616	}
1617	hash_lock = HDR_LOCK(buf->b_hdr);
1618	mutex_enter(hash_lock);
1619	hdr = buf->b_hdr;
1620	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1621	mutex_exit(&buf->b_evict_lock);
1622
1623	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1624	add_reference(hdr, hash_lock, tag);
1625	DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1626	arc_access(hdr, hash_lock);
1627	mutex_exit(hash_lock);
1628	ARCSTAT_BUMP(arcstat_hits);
1629	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1630	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1631	    data, metadata, hits);
1632}
1633
1634static void
1635arc_buf_free_on_write(void *data, size_t size,
1636    void (*free_func)(void *, size_t))
1637{
1638	l2arc_data_free_t *df;
1639
1640	df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1641	df->l2df_data = data;
1642	df->l2df_size = size;
1643	df->l2df_func = free_func;
1644	mutex_enter(&l2arc_free_on_write_mtx);
1645	list_insert_head(l2arc_free_on_write, df);
1646	mutex_exit(&l2arc_free_on_write_mtx);
1647}
1648
1649/*
1650 * Free the arc data buffer.  If an L2ARC write is in progress,
1651 * the buffer is placed on l2arc_free_on_write to be freed later.
1652 */
1653static void
1654arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
1655{
1656	arc_buf_hdr_t *hdr = buf->b_hdr;
1657
1658	if (HDR_L2_WRITING(hdr)) {
1659		arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func);
1660		ARCSTAT_BUMP(arcstat_l2_free_on_write);
1661	} else {
1662		free_func(buf->b_data, hdr->b_size);
1663	}
1664}
1665
1666/*
1667 * Free up buf->b_data and if 'remove' is set, then pull the
1668 * arc_buf_t off of the the arc_buf_hdr_t's list and free it.
1669 */
1670static void
1671arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr)
1672{
1673	l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1674
1675	ASSERT(MUTEX_HELD(&l2arc_buflist_mtx));
1676
1677	if (l2hdr->b_tmp_cdata == NULL)
1678		return;
1679
1680	ASSERT(HDR_L2_WRITING(hdr));
1681	arc_buf_free_on_write(l2hdr->b_tmp_cdata, hdr->b_size,
1682	    zio_data_buf_free);
1683	ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write);
1684	l2hdr->b_tmp_cdata = NULL;
1685}
1686
/*
 * Free up buf->b_data and, if 'remove' is set, pull the arc_buf_t off
 * of the arc_buf_hdr_t's list and free it.
 */
static void
arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
1689{
1690	arc_buf_t **bufp;
1691
1692	/* free up data associated with the buf */
1693	if (buf->b_data) {
1694		arc_state_t *state = buf->b_hdr->b_state;
1695		uint64_t size = buf->b_hdr->b_size;
1696		arc_buf_contents_t type = buf->b_hdr->b_type;
1697
1698		arc_cksum_verify(buf);
1699#ifdef illumos
1700		arc_buf_unwatch(buf);
1701#endif /* illumos */
1702
1703		if (!recycle) {
1704			if (type == ARC_BUFC_METADATA) {
1705				arc_buf_data_free(buf, zio_buf_free);
1706				arc_space_return(size, ARC_SPACE_DATA);
1707			} else {
1708				ASSERT(type == ARC_BUFC_DATA);
1709				arc_buf_data_free(buf, zio_data_buf_free);
1710				ARCSTAT_INCR(arcstat_data_size, -size);
1711				atomic_add_64(&arc_size, -size);
1712			}
1713		}
1714		if (list_link_active(&buf->b_hdr->b_arc_node)) {
1715			uint64_t *cnt = &state->arcs_lsize[type];
1716
1717			ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1718			ASSERT(state != arc_anon);
1719
1720			ASSERT3U(*cnt, >=, size);
1721			atomic_add_64(cnt, -size);
1722		}
1723		ASSERT3U(state->arcs_size, >=, size);
1724		atomic_add_64(&state->arcs_size, -size);
1725		buf->b_data = NULL;
1726
1727		/*
1728		 * If we're destroying a duplicate buffer make sure
1729		 * that the appropriate statistics are updated.
1730		 */
1731		if (buf->b_hdr->b_datacnt > 1 &&
1732		    buf->b_hdr->b_type == ARC_BUFC_DATA) {
1733			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
1734			ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
1735		}
1736		ASSERT(buf->b_hdr->b_datacnt > 0);
1737		buf->b_hdr->b_datacnt -= 1;
1738	}
1739
1740	/* only remove the buf if requested */
1741	if (!remove)
1742		return;
1743
1744	/* remove the buf from the hdr list */
1745	for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1746		continue;
1747	*bufp = buf->b_next;
1748	buf->b_next = NULL;
1749
1750	ASSERT(buf->b_efunc == NULL);
1751
1752	/* clean up the buf */
1753	buf->b_hdr = NULL;
1754	kmem_cache_free(buf_cache, buf);
1755}
1756
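/*
 * Destroy an anonymous, unreferenced header: tear down any L2ARC state,
 * free or queue for user eviction each remaining buffer, and return the
 * header to its kmem cache.
 */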
1757static void
1758arc_hdr_destroy(arc_buf_hdr_t *hdr)
1759{
1760	ASSERT(refcount_is_zero(&hdr->b_refcnt));
1761	ASSERT3P(hdr->b_state, ==, arc_anon);
1762	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1763	l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1764
1765	if (l2hdr != NULL) {
1766		boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1767		/*
1768		 * To prevent arc_free() and l2arc_evict() from
1769		 * attempting to free the same buffer at the same time,
1770		 * a FREE_IN_PROGRESS flag is given to arc_free() to
1771		 * give it priority.  l2arc_evict() can't destroy this
1772		 * header while we are waiting on l2arc_buflist_mtx.
1773		 *
1774		 * The hdr may be removed from l2ad_buflist before we
1775		 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1776		 */
1777		if (!buflist_held) {
1778			mutex_enter(&l2arc_buflist_mtx);
1779			l2hdr = hdr->b_l2hdr;
1780		}
1781
1782		if (l2hdr != NULL) {
1783			trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr,
1784			    hdr->b_size, 0);
1785			list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1786			arc_buf_l2_cdata_free(hdr);
1787			ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1788			ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
1789			vdev_space_update(l2hdr->b_dev->l2ad_vdev,
1790			    -l2hdr->b_asize, 0, 0);
1791			kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
1792			if (hdr->b_state == arc_l2c_only)
1793				l2arc_hdr_stat_remove();
1794			hdr->b_l2hdr = NULL;
1795		}
1796
1797		if (!buflist_held)
1798			mutex_exit(&l2arc_buflist_mtx);
1799	}
1800
1801	if (!BUF_EMPTY(hdr)) {
1802		ASSERT(!HDR_IN_HASH_TABLE(hdr));
1803		buf_discard_identity(hdr);
1804	}
1805	while (hdr->b_buf) {
1806		arc_buf_t *buf = hdr->b_buf;
1807
1808		if (buf->b_efunc) {
1809			mutex_enter(&arc_eviction_mtx);
1810			mutex_enter(&buf->b_evict_lock);
1811			ASSERT(buf->b_hdr != NULL);
1812			arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1813			hdr->b_buf = buf->b_next;
1814			buf->b_hdr = &arc_eviction_hdr;
1815			buf->b_next = arc_eviction_list;
1816			arc_eviction_list = buf;
1817			mutex_exit(&buf->b_evict_lock);
1818			mutex_exit(&arc_eviction_mtx);
1819		} else {
1820			arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1821		}
1822	}
1823	if (hdr->b_freeze_cksum != NULL) {
1824		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1825		hdr->b_freeze_cksum = NULL;
1826	}
1827	if (hdr->b_thawed) {
1828		kmem_free(hdr->b_thawed, 1);
1829		hdr->b_thawed = NULL;
1830	}
1831
1832	ASSERT(!list_link_active(&hdr->b_arc_node));
1833	ASSERT3P(hdr->b_hash_next, ==, NULL);
1834	ASSERT3P(hdr->b_acb, ==, NULL);
1835	kmem_cache_free(hdr_cache, hdr);
1836}
1837
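/*
 * Release the reference held by 'tag' and free the buffer.  For cached
 * (hashed) headers the last remaining buffer is not freed but marked
 * ARC_BUF_AVAILABLE; anonymous headers are destroyed once no other
 * references remain and no async write is in progress.
 */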
1838void
1839arc_buf_free(arc_buf_t *buf, void *tag)
1840{
1841	arc_buf_hdr_t *hdr = buf->b_hdr;
1842	int hashed = hdr->b_state != arc_anon;
1843
1844	ASSERT(buf->b_efunc == NULL);
1845	ASSERT(buf->b_data != NULL);
1846
1847	if (hashed) {
1848		kmutex_t *hash_lock = HDR_LOCK(hdr);
1849
1850		mutex_enter(hash_lock);
1851		hdr = buf->b_hdr;
1852		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1853
1854		(void) remove_reference(hdr, hash_lock, tag);
1855		if (hdr->b_datacnt > 1) {
1856			arc_buf_destroy(buf, FALSE, TRUE);
1857		} else {
1858			ASSERT(buf == hdr->b_buf);
1859			ASSERT(buf->b_efunc == NULL);
1860			hdr->b_flags |= ARC_BUF_AVAILABLE;
1861		}
1862		mutex_exit(hash_lock);
1863	} else if (HDR_IO_IN_PROGRESS(hdr)) {
1864		int destroy_hdr;
1865		/*
1866		 * We are in the middle of an async write.  Don't destroy
1867		 * this buffer unless the write completes before we finish
1868		 * decrementing the reference count.
1869		 */
1870		mutex_enter(&arc_eviction_mtx);
1871		(void) remove_reference(hdr, NULL, tag);
1872		ASSERT(refcount_is_zero(&hdr->b_refcnt));
1873		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1874		mutex_exit(&arc_eviction_mtx);
1875		if (destroy_hdr)
1876			arc_hdr_destroy(hdr);
1877	} else {
1878		if (remove_reference(hdr, NULL, tag) > 0)
1879			arc_buf_destroy(buf, FALSE, TRUE);
1880		else
1881			arc_hdr_destroy(hdr);
1882	}
1883}
1884
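/*
 * Drop the reference held by 'tag'.  Returns B_TRUE if the buffer had no
 * eviction callback, in which case it was either destroyed or left in
 * the cache marked ARC_BUF_AVAILABLE.
 */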
1885boolean_t
1886arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1887{
1888	arc_buf_hdr_t *hdr = buf->b_hdr;
1889	kmutex_t *hash_lock = HDR_LOCK(hdr);
1890	boolean_t no_callback = (buf->b_efunc == NULL);
1891
1892	if (hdr->b_state == arc_anon) {
1893		ASSERT(hdr->b_datacnt == 1);
1894		arc_buf_free(buf, tag);
1895		return (no_callback);
1896	}
1897
1898	mutex_enter(hash_lock);
1899	hdr = buf->b_hdr;
1900	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1901	ASSERT(hdr->b_state != arc_anon);
1902	ASSERT(buf->b_data != NULL);
1903
1904	(void) remove_reference(hdr, hash_lock, tag);
1905	if (hdr->b_datacnt > 1) {
1906		if (no_callback)
1907			arc_buf_destroy(buf, FALSE, TRUE);
1908	} else if (no_callback) {
1909		ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1910		ASSERT(buf->b_efunc == NULL);
1911		hdr->b_flags |= ARC_BUF_AVAILABLE;
1912	}
1913	ASSERT(no_callback || hdr->b_datacnt > 1 ||
1914	    refcount_is_zero(&hdr->b_refcnt));
1915	mutex_exit(hash_lock);
1916	return (no_callback);
1917}
1918
1919int
1920arc_buf_size(arc_buf_t *buf)
1921{
1922	return (buf->b_hdr->b_size);
1923}
1924
1925/*
1926 * Called from the DMU to determine if the current buffer should be
1927 * evicted. In order to ensure proper locking, the eviction must be initiated
1928 * from the DMU. Return true if the buffer is associated with user data and
1929 * duplicate buffers still exist.
1930 */
1931boolean_t
1932arc_buf_eviction_needed(arc_buf_t *buf)
1933{
1934	arc_buf_hdr_t *hdr;
1935	boolean_t evict_needed = B_FALSE;
1936
1937	if (zfs_disable_dup_eviction)
1938		return (B_FALSE);
1939
1940	mutex_enter(&buf->b_evict_lock);
1941	hdr = buf->b_hdr;
1942	if (hdr == NULL) {
1943		/*
1944		 * We are in arc_do_user_evicts(); let that function
1945		 * perform the eviction.
1946		 */
1947		ASSERT(buf->b_data == NULL);
1948		mutex_exit(&buf->b_evict_lock);
1949		return (B_FALSE);
1950	} else if (buf->b_data == NULL) {
1951		/*
1952		 * We have already been added to the arc eviction list;
1953		 * recommend eviction.
1954		 */
1955		ASSERT3P(hdr, ==, &arc_eviction_hdr);
1956		mutex_exit(&buf->b_evict_lock);
1957		return (B_TRUE);
1958	}
1959
1960	if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
1961		evict_needed = B_TRUE;
1962
1963	mutex_exit(&buf->b_evict_lock);
1964	return (evict_needed);
1965}
1966
1967/*
1968 * Evict buffers from list until we've removed the specified number of
1969 * bytes.  Move the removed buffers to the appropriate evict state.
1970 * If the recycle flag is set, then attempt to "recycle" a buffer:
1971 * - look for a buffer to evict that is `bytes' long.
1972 * - return the data block from this buffer rather than freeing it.
1973 * This flag is used by callers that are trying to make space for a
1974 * new buffer in a full arc cache.
1975 *
1976 * This function makes a "best effort".  It skips over any buffers
1977 * it can't get a hash_lock on, and so may not catch all candidates.
1978 * It may also return without evicting as much space as requested.
1979 */
1980static void *
1981arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
1982    arc_buf_contents_t type)
1983{
1984	arc_state_t *evicted_state;
1985	uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1986	int64_t bytes_remaining;
1987	arc_buf_hdr_t *ab, *ab_prev = NULL;
1988	list_t *evicted_list, *list, *evicted_list_start, *list_start;
1989	kmutex_t *lock, *evicted_lock;
1990	kmutex_t *hash_lock;
1991	boolean_t have_lock;
1992	void *stolen = NULL;
1993	arc_buf_hdr_t marker = { 0 };
1994	int count = 0;
1995	static int evict_metadata_offset, evict_data_offset;
1996	int i, idx, offset, list_count, lists;
1997
1998	ASSERT(state == arc_mru || state == arc_mfu);
1999
2000	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
2001
2002	if (type == ARC_BUFC_METADATA) {
2003		offset = 0;
2004		list_count = ARC_BUFC_NUMMETADATALISTS;
2005		list_start = &state->arcs_lists[0];
2006		evicted_list_start = &evicted_state->arcs_lists[0];
2007		idx = evict_metadata_offset;
2008	} else {
2009		offset = ARC_BUFC_NUMMETADATALISTS;
2010		list_start = &state->arcs_lists[offset];
2011		evicted_list_start = &evicted_state->arcs_lists[offset];
2012		list_count = ARC_BUFC_NUMDATALISTS;
2013		idx = evict_data_offset;
2014	}
2015	bytes_remaining = evicted_state->arcs_lsize[type];
2016	lists = 0;
2017
2018evict_start:
2019	list = &list_start[idx];
2020	evicted_list = &evicted_list_start[idx];
2021	lock = ARCS_LOCK(state, (offset + idx));
2022	evicted_lock = ARCS_LOCK(evicted_state, (offset + idx));
2023
2024	mutex_enter(lock);
2025	mutex_enter(evicted_lock);
2026
2027	for (ab = list_tail(list); ab; ab = ab_prev) {
2028		ab_prev = list_prev(list, ab);
2029		bytes_remaining -= (ab->b_size * ab->b_datacnt);
2030		/* prefetch buffers have a minimum lifespan */
2031		if (HDR_IO_IN_PROGRESS(ab) ||
2032		    (spa && ab->b_spa != spa) ||
2033		    (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
2034		    ddi_get_lbolt() - ab->b_arc_access <
2035		    arc_min_prefetch_lifespan)) {
2036			skipped++;
2037			continue;
2038		}
2039		/* "lookahead" for better eviction candidate */
2040		if (recycle && ab->b_size != bytes &&
2041		    ab_prev && ab_prev->b_size == bytes)
2042			continue;
2043
2044		/* ignore markers */
2045		if (ab->b_spa == 0)
2046			continue;
2047
2048		/*
2049		 * It may take a long time to evict all the bufs requested.
2050		 * To avoid blocking all arc activity, periodically drop
2051		 * the arcs_mtx and give other threads a chance to run
2052		 * before reacquiring the lock.
2053		 *
2054		 * If we are looking for a buffer to recycle, we are in
2055		 * the hot code path, so don't sleep.
2056		 */
2057		if (!recycle && count++ > arc_evict_iterations) {
2058			list_insert_after(list, ab, &marker);
2059			mutex_exit(evicted_lock);
2060			mutex_exit(lock);
2061			kpreempt(KPREEMPT_SYNC);
2062			mutex_enter(lock);
2063			mutex_enter(evicted_lock);
2064			ab_prev = list_prev(list, &marker);
2065			list_remove(list, &marker);
2066			count = 0;
2067			continue;
2068		}
2069
2070		hash_lock = HDR_LOCK(ab);
2071		have_lock = MUTEX_HELD(hash_lock);
2072		if (have_lock || mutex_tryenter(hash_lock)) {
2073			ASSERT0(refcount_count(&ab->b_refcnt));
2074			ASSERT(ab->b_datacnt > 0);
2075			while (ab->b_buf) {
2076				arc_buf_t *buf = ab->b_buf;
2077				if (!mutex_tryenter(&buf->b_evict_lock)) {
2078					missed += 1;
2079					break;
2080				}
2081				if (buf->b_data) {
2082					bytes_evicted += ab->b_size;
2083					if (recycle && ab->b_type == type &&
2084					    ab->b_size == bytes &&
2085					    !HDR_L2_WRITING(ab)) {
2086						stolen = buf->b_data;
2087						recycle = FALSE;
2088					}
2089				}
2090				if (buf->b_efunc) {
2091					mutex_enter(&arc_eviction_mtx);
2092					arc_buf_destroy(buf,
2093					    buf->b_data == stolen, FALSE);
2094					ab->b_buf = buf->b_next;
2095					buf->b_hdr = &arc_eviction_hdr;
2096					buf->b_next = arc_eviction_list;
2097					arc_eviction_list = buf;
2098					mutex_exit(&arc_eviction_mtx);
2099					mutex_exit(&buf->b_evict_lock);
2100				} else {
2101					mutex_exit(&buf->b_evict_lock);
2102					arc_buf_destroy(buf,
2103					    buf->b_data == stolen, TRUE);
2104				}
2105			}
2106
2107			if (ab->b_l2hdr) {
2108				ARCSTAT_INCR(arcstat_evict_l2_cached,
2109				    ab->b_size);
2110			} else {
2111				if (l2arc_write_eligible(ab->b_spa, ab)) {
2112					ARCSTAT_INCR(arcstat_evict_l2_eligible,
2113					    ab->b_size);
2114				} else {
2115					ARCSTAT_INCR(
2116					    arcstat_evict_l2_ineligible,
2117					    ab->b_size);
2118				}
2119			}
2120
2121			if (ab->b_datacnt == 0) {
2122				arc_change_state(evicted_state, ab, hash_lock);
2123				ASSERT(HDR_IN_HASH_TABLE(ab));
2124				ab->b_flags |= ARC_IN_HASH_TABLE;
2125				ab->b_flags &= ~ARC_BUF_AVAILABLE;
2126				DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
2127			}
2128			if (!have_lock)
2129				mutex_exit(hash_lock);
2130			if (bytes >= 0 && bytes_evicted >= bytes)
2131				break;
2132			if (bytes_remaining > 0) {
2133				mutex_exit(evicted_lock);
2134				mutex_exit(lock);
2135				idx  = ((idx + 1) & (list_count - 1));
2136				lists++;
2137				goto evict_start;
2138			}
2139		} else {
2140			missed += 1;
2141		}
2142	}
2143
2144	mutex_exit(evicted_lock);
2145	mutex_exit(lock);
2146
2147	idx  = ((idx + 1) & (list_count - 1));
2148	lists++;
2149
2150	if (bytes_evicted < bytes) {
2151		if (lists < list_count)
2152			goto evict_start;
2153		else
2154			dprintf("only evicted %lld bytes from %x",
2155			    (longlong_t)bytes_evicted, state);
2156	}
2157	if (type == ARC_BUFC_METADATA)
2158		evict_metadata_offset = idx;
2159	else
2160		evict_data_offset = idx;
2161
2162	if (skipped)
2163		ARCSTAT_INCR(arcstat_evict_skip, skipped);
2164
2165	if (missed)
2166		ARCSTAT_INCR(arcstat_mutex_miss, missed);
2167
2168	/*
2169	 * Note: we have just evicted some data into the ghost state,
2170	 * potentially putting the ghost size over the desired size.  Rather
	 * than evicting from the ghost list in this hot code path, leave
2172	 * this chore to the arc_reclaim_thread().
2173	 */
2174
2175	if (stolen)
2176		ARCSTAT_BUMP(arcstat_stolen);
2177	return (stolen);
2178}
2179
2180/*
2181 * Remove buffers from list until we've removed the specified number of
2182 * bytes.  Destroy the buffers that are removed.
2183 */
2184static void
2185arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
2186{
2187	arc_buf_hdr_t *ab, *ab_prev;
2188	arc_buf_hdr_t marker = { 0 };
2189	list_t *list, *list_start;
2190	kmutex_t *hash_lock, *lock;
2191	uint64_t bytes_deleted = 0;
2192	uint64_t bufs_skipped = 0;
2193	int count = 0;
2194	static int evict_offset;
2195	int list_count, idx = evict_offset;
2196	int offset, lists = 0;
2197
2198	ASSERT(GHOST_STATE(state));
2199
2200	/*
2201	 * data lists come after metadata lists
2202	 */
2203	list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS];
2204	list_count = ARC_BUFC_NUMDATALISTS;
2205	offset = ARC_BUFC_NUMMETADATALISTS;
2206
2207evict_start:
2208	list = &list_start[idx];
2209	lock = ARCS_LOCK(state, idx + offset);
2210
2211	mutex_enter(lock);
2212	for (ab = list_tail(list); ab; ab = ab_prev) {
2213		ab_prev = list_prev(list, ab);
2214		if (ab->b_type > ARC_BUFC_NUMTYPES)
2215			panic("invalid ab=%p", (void *)ab);
2216		if (spa && ab->b_spa != spa)
2217			continue;
2218
2219		/* ignore markers */
2220		if (ab->b_spa == 0)
2221			continue;
2222
2223		hash_lock = HDR_LOCK(ab);
2224		/* caller may be trying to modify this buffer, skip it */
2225		if (MUTEX_HELD(hash_lock))
2226			continue;
2227
2228		/*
2229		 * It may take a long time to evict all the bufs requested.
2230		 * To avoid blocking all arc activity, periodically drop
2231		 * the arcs_mtx and give other threads a chance to run
2232		 * before reacquiring the lock.
2233		 */
2234		if (count++ > arc_evict_iterations) {
2235			list_insert_after(list, ab, &marker);
2236			mutex_exit(lock);
2237			kpreempt(KPREEMPT_SYNC);
2238			mutex_enter(lock);
2239			ab_prev = list_prev(list, &marker);
2240			list_remove(list, &marker);
2241			count = 0;
2242			continue;
2243		}
2244		if (mutex_tryenter(hash_lock)) {
2245			ASSERT(!HDR_IO_IN_PROGRESS(ab));
2246			ASSERT(ab->b_buf == NULL);
2247			ARCSTAT_BUMP(arcstat_deleted);
2248			bytes_deleted += ab->b_size;
2249
2250			if (ab->b_l2hdr != NULL) {
2251				/*
2252				 * This buffer is cached on the 2nd Level ARC;
2253				 * don't destroy the header.
2254				 */
2255				arc_change_state(arc_l2c_only, ab, hash_lock);
2256				mutex_exit(hash_lock);
2257			} else {
2258				arc_change_state(arc_anon, ab, hash_lock);
2259				mutex_exit(hash_lock);
2260				arc_hdr_destroy(ab);
2261			}
2262
2263			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
2264			if (bytes >= 0 && bytes_deleted >= bytes)
2265				break;
2266		} else if (bytes < 0) {
2267			/*
2268			 * Insert a list marker and then wait for the
			 * hash lock to become available. Once it's
2270			 * available, restart from where we left off.
2271			 */
2272			list_insert_after(list, ab, &marker);
2273			mutex_exit(lock);
2274			mutex_enter(hash_lock);
2275			mutex_exit(hash_lock);
2276			mutex_enter(lock);
2277			ab_prev = list_prev(list, &marker);
2278			list_remove(list, &marker);
2279		} else {
2280			bufs_skipped += 1;
2281		}
2282
2283	}
2284	mutex_exit(lock);
2285	idx  = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1));
2286	lists++;
2287
2288	if (lists < list_count)
2289		goto evict_start;
2290
2291	evict_offset = idx;
	if ((uintptr_t)list >
	    (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] &&
2293	    (bytes < 0 || bytes_deleted < bytes)) {
2294		list_start = &state->arcs_lists[0];
2295		list_count = ARC_BUFC_NUMMETADATALISTS;
2296		offset = lists = 0;
2297		goto evict_start;
2298	}
2299
2300	if (bufs_skipped) {
2301		ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
2302		ASSERT(bytes >= 0);
2303	}
2304
2305	if (bytes_deleted < bytes)
2306		dprintf("only deleted %lld bytes from %p",
2307		    (longlong_t)bytes_deleted, state);
2308}
2309
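/*
 * Evict data and metadata from the MRU and MFU lists, and shrink the
 * ghost lists, until arc_size and the MRU target arc_p are back within
 * their bounds.
 */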
2310static void
2311arc_adjust(void)
2312{
2313	int64_t adjustment, delta;
2314
2315	/*
2316	 * Adjust MRU size
2317	 */
2318
2319	adjustment = MIN((int64_t)(arc_size - arc_c),
2320	    (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
2321	    arc_p));
2322
2323	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
2324		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
2325		(void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
2326		adjustment -= delta;
2327	}
2328
2329	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2330		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
2331		(void) arc_evict(arc_mru, 0, delta, FALSE,
2332		    ARC_BUFC_METADATA);
2333	}
2334
2335	/*
2336	 * Adjust MFU size
2337	 */
2338
2339	adjustment = arc_size - arc_c;
2340
2341	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
2342		delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
2343		(void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
2344		adjustment -= delta;
2345	}
2346
2347	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2348		int64_t delta = MIN(adjustment,
2349		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
2350		(void) arc_evict(arc_mfu, 0, delta, FALSE,
2351		    ARC_BUFC_METADATA);
2352	}
2353
2354	/*
2355	 * Adjust ghost lists
2356	 */
2357
2358	adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2359
2360	if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2361		delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2362		arc_evict_ghost(arc_mru_ghost, 0, delta);
2363	}
2364
2365	adjustment =
2366	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2367
2368	if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2369		delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2370		arc_evict_ghost(arc_mfu_ghost, 0, delta);
2371	}
2372}
2373
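/*
 * Run the pending user eviction callbacks (b_efunc) for buffers queued
 * on arc_eviction_list and free their arc_buf structures.
 */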
2374static void
2375arc_do_user_evicts(void)
2376{
2377	static arc_buf_t *tmp_arc_eviction_list;
2378
2379	/*
2380	 * Move list over to avoid LOR
2381	 */
2382restart:
2383	mutex_enter(&arc_eviction_mtx);
2384	tmp_arc_eviction_list = arc_eviction_list;
2385	arc_eviction_list = NULL;
2386	mutex_exit(&arc_eviction_mtx);
2387
2388	while (tmp_arc_eviction_list != NULL) {
2389		arc_buf_t *buf = tmp_arc_eviction_list;
2390		tmp_arc_eviction_list = buf->b_next;
2391		mutex_enter(&buf->b_evict_lock);
2392		buf->b_hdr = NULL;
2393		mutex_exit(&buf->b_evict_lock);
2394
2395		if (buf->b_efunc != NULL)
2396			VERIFY0(buf->b_efunc(buf->b_private));
2397
2398		buf->b_efunc = NULL;
2399		buf->b_private = NULL;
2400		kmem_cache_free(buf_cache, buf);
2401	}
2402
2403	if (arc_eviction_list != NULL)
2404		goto restart;
2405}
2406
2407/*
2408 * Flush all *evictable* data from the cache for the given spa.
2409 * NOTE: this will not touch "active" (i.e. referenced) data.
2410 */
2411void
2412arc_flush(spa_t *spa)
2413{
2414	uint64_t guid = 0;
2415
2416	if (spa)
2417		guid = spa_load_guid(spa);
2418
2419	while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) {
2420		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
2421		if (spa)
2422			break;
2423	}
2424	while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) {
2425		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
2426		if (spa)
2427			break;
2428	}
2429	while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) {
2430		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
2431		if (spa)
2432			break;
2433	}
2434	while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) {
2435		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
2436		if (spa)
2437			break;
2438	}
2439
2440	arc_evict_ghost(arc_mru_ghost, guid, -1);
2441	arc_evict_ghost(arc_mfu_ghost, guid, -1);
2442
2443	mutex_enter(&arc_reclaim_thr_lock);
2444	arc_do_user_evicts();
2445	mutex_exit(&arc_reclaim_thr_lock);
2446	ASSERT(spa || arc_eviction_list == NULL);
2447}
2448
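/*
 * Shrink the target cache size arc_c (and the MRU target arc_p) in
 * response to memory pressure, then evict via arc_adjust() if the cache
 * is now over its target.
 */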
2449void
2450arc_shrink(void)
2451{
2452	if (arc_c > arc_c_min) {
2453		uint64_t to_free;
2454
		to_free = arc_c >> arc_shrink_shift;
2460		if (arc_c > arc_c_min + to_free)
2461			atomic_add_64(&arc_c, -to_free);
2462		else
2463			arc_c = arc_c_min;
2464
2465		atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2466		if (arc_c > arc_size)
2467			arc_c = MAX(arc_size, arc_c_min);
2468		if (arc_p > arc_c)
2469			arc_p = (arc_c >> 1);
2470		ASSERT(arc_c >= arc_c_min);
2471		ASSERT((int64_t)arc_p >= 0);
2472	}
2473
2474	if (arc_size > arc_c)
2475		arc_adjust();
2476}
2477
2478static int needfree = 0;
2479
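/*
 * Return non-zero if the ARC should give memory back to the system:
 * either our lowmem hook has fired (needfree), the pagedaemon wants to
 * scan, or kernel memory usage is getting too high.
 */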
2480static int
2481arc_reclaim_needed(void)
2482{
2483
2484#ifdef _KERNEL
2485
2486	if (needfree)
2487		return (1);
2488
2489	/*
2490	 * Cooperate with pagedaemon when it's time for it to scan
2491	 * and reclaim some pages.
2492	 */
2493	if (vm_paging_needed())
2494		return (1);
2495
2496#ifdef sun
2497	/*
2498	 * take 'desfree' extra pages, so we reclaim sooner, rather than later
2499	 */
	uint64_t extra = desfree;
2501
2502	/*
2503	 * check that we're out of range of the pageout scanner.  It starts to
2504	 * schedule paging if freemem is less than lotsfree and needfree.
2505	 * lotsfree is the high-water mark for pageout, and needfree is the
2506	 * number of needed free pages.  We add extra pages here to make sure
2507	 * the scanner doesn't start up while we're freeing memory.
2508	 */
2509	if (freemem < lotsfree + needfree + extra)
2510		return (1);
2511
2512	/*
2513	 * check to make sure that swapfs has enough space so that anon
2514	 * reservations can still succeed. anon_resvmem() checks that the
2515	 * availrmem is greater than swapfs_minfree, and the number of reserved
2516	 * swap pages.  We also add a bit of extra here just to prevent
2517	 * circumstances from getting really dire.
2518	 */
2519	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
2520		return (1);
2521
2522#if defined(__i386)
2523	/*
2524	 * If we're on an i386 platform, it's possible that we'll exhaust the
2525	 * kernel heap space before we ever run out of available physical
2526	 * memory.  Most checks of the size of the heap_area compare against
2527	 * tune.t_minarmem, which is the minimum available real memory that we
2528	 * can have in the system.  However, this is generally fixed at 25 pages
2529	 * which is so low that it's useless.  In this comparison, we seek to
2530	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
2531	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
2532	 * free)
2533	 */
2534	if (btop(vmem_size(heap_arena, VMEM_FREE)) <
2535	    (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
2536		return (1);
2537#endif
2538#else	/* !sun */
2539	if (kmem_used() > (kmem_size() * 3) / 4)
2540		return (1);
2541#endif	/* sun */
2542
2543#else
2544	if (spa_get_random(100) == 0)
2545		return (1);
2546#endif
2547	return (0);
2548}
2549
2550extern kmem_cache_t	*zio_buf_cache[];
2551extern kmem_cache_t	*zio_data_buf_cache[];
2552
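/*
 * Reap unused memory from the zio_buf, zio_data_buf, buf and hdr kmem
 * caches; an aggressive reclaim also shrinks the ARC targets first via
 * arc_shrink().
 */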
2553static void
2554arc_kmem_reap_now(arc_reclaim_strategy_t strat)
2555{
2556	size_t			i;
2557	kmem_cache_t		*prev_cache = NULL;
2558	kmem_cache_t		*prev_data_cache = NULL;
2559
2560#ifdef _KERNEL
2561	if (arc_meta_used >= arc_meta_limit) {
2562		/*
2563		 * We are exceeding our meta-data cache limit.
2564		 * Purge some DNLC entries to release holds on meta-data.
2565		 */
2566		dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
2567	}
2568#if defined(__i386)
2569	/*
2570	 * Reclaim unused memory from all kmem caches.
2571	 */
2572	kmem_reap();
2573#endif
2574#endif
2575
2576	/*
2577	 * An aggressive reclamation will shrink the cache size as well as
2578	 * reap free buffers from the arc kmem caches.
2579	 */
2580	if (strat == ARC_RECLAIM_AGGR)
2581		arc_shrink();
2582
2583	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2584		if (zio_buf_cache[i] != prev_cache) {
2585			prev_cache = zio_buf_cache[i];
2586			kmem_cache_reap_now(zio_buf_cache[i]);
2587		}
2588		if (zio_data_buf_cache[i] != prev_data_cache) {
2589			prev_data_cache = zio_data_buf_cache[i];
2590			kmem_cache_reap_now(zio_data_buf_cache[i]);
2591		}
2592	}
2593	kmem_cache_reap_now(buf_cache);
2594	kmem_cache_reap_now(hdr_cache);
2595}
2596
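/*
 * Reclaim thread: wakes up once a second (or when signalled) to shrink
 * and reap the caches when memory is tight, keep the lists within their
 * targets via arc_adjust(), and run pending user eviction callbacks.
 */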
2597static void
2598arc_reclaim_thread(void *dummy __unused)
2599{
2600	clock_t			growtime = 0;
2601	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
2602	callb_cpr_t		cpr;
2603
2604	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2605
2606	mutex_enter(&arc_reclaim_thr_lock);
2607	while (arc_thread_exit == 0) {
2608		if (arc_reclaim_needed()) {
2609
2610			if (arc_no_grow) {
2611				if (last_reclaim == ARC_RECLAIM_CONS) {
2612					last_reclaim = ARC_RECLAIM_AGGR;
2613				} else {
2614					last_reclaim = ARC_RECLAIM_CONS;
2615				}
2616			} else {
2617				arc_no_grow = TRUE;
2618				last_reclaim = ARC_RECLAIM_AGGR;
2619				membar_producer();
2620			}
2621
2622			/* reset the growth delay for every reclaim */
2623			growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
2624
2625			if (needfree && last_reclaim == ARC_RECLAIM_CONS) {
2626				/*
2627				 * If needfree is TRUE our vm_lowmem hook
2628				 * was called and in that case we must free some
2629				 * memory, so switch to aggressive mode.
2630				 */
2631				arc_no_grow = TRUE;
2632				last_reclaim = ARC_RECLAIM_AGGR;
2633			}
2634			arc_kmem_reap_now(last_reclaim);
2635			arc_warm = B_TRUE;
2636
2637		} else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
2638			arc_no_grow = FALSE;
2639		}
2640
2641		arc_adjust();
2642
2643		if (arc_eviction_list != NULL)
2644			arc_do_user_evicts();
2645
2646#ifdef _KERNEL
2647		if (needfree) {
2648			needfree = 0;
2649			wakeup(&needfree);
2650		}
2651#endif
2652
2653		/* block until needed, or one second, whichever is shorter */
2654		CALLB_CPR_SAFE_BEGIN(&cpr);
2655		(void) cv_timedwait(&arc_reclaim_thr_cv,
2656		    &arc_reclaim_thr_lock, hz);
2657		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2658	}
2659
2660	arc_thread_exit = 0;
2661	cv_broadcast(&arc_reclaim_thr_cv);
2662	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
2663	thread_exit();
2664}
2665
2666/*
2667 * Adapt arc info given the number of bytes we are trying to add and
 * the state that we are coming from.  This function is only called
2669 * when we are adding new content to the cache.
2670 */
2671static void
2672arc_adapt(int bytes, arc_state_t *state)
2673{
2674	int mult;
2675	uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
2676
2677	if (state == arc_l2c_only)
2678		return;
2679
2680	ASSERT(bytes > 0);
2681	/*
2682	 * Adapt the target size of the MRU list:
2683	 *	- if we just hit in the MRU ghost list, then increase
2684	 *	  the target size of the MRU list.
2685	 *	- if we just hit in the MFU ghost list, then increase
2686	 *	  the target size of the MFU list by decreasing the
2687	 *	  target size of the MRU list.
2688	 */
2689	if (state == arc_mru_ghost) {
2690		mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2691		    1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2692		mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
2693
2694		arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
2695	} else if (state == arc_mfu_ghost) {
2696		uint64_t delta;
2697
2698		mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2699		    1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2700		mult = MIN(mult, 10);
2701
2702		delta = MIN(bytes * mult, arc_p);
2703		arc_p = MAX(arc_p_min, arc_p - delta);
2704	}
2705	ASSERT((int64_t)arc_p >= 0);
2706
2707	if (arc_reclaim_needed()) {
2708		cv_signal(&arc_reclaim_thr_cv);
2709		return;
2710	}
2711
2712	if (arc_no_grow)
2713		return;
2714
2715	if (arc_c >= arc_c_max)
2716		return;
2717
2718	/*
2719	 * If we're within (2 * maxblocksize) bytes of the target
2720	 * cache size, increment the target cache size
2721	 */
2722	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2723		atomic_add_64(&arc_c, (int64_t)bytes);
2724		if (arc_c > arc_c_max)
2725			arc_c = arc_c_max;
2726		else if (state == arc_anon)
2727			atomic_add_64(&arc_p, (int64_t)bytes);
2728		if (arc_p > arc_c)
2729			arc_p = arc_c;
2730	}
2731	ASSERT((int64_t)arc_p >= 0);
2732}
2733
2734/*
2735 * Check if the cache has reached its limits and eviction is required
2736 * prior to insert.
2737 */
2738static int
2739arc_evict_needed(arc_buf_contents_t type)
2740{
2741	if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2742		return (1);
2743
2744#ifdef sun
2745#ifdef _KERNEL
2746	/*
2747	 * If zio data pages are being allocated out of a separate heap segment,
2748	 * then enforce that the size of available vmem for this area remains
2749	 * above about 1/32nd free.
2750	 */
2751	if (type == ARC_BUFC_DATA && zio_arena != NULL &&
2752	    vmem_size(zio_arena, VMEM_FREE) <
2753	    (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
2754		return (1);
2755#endif
2756#endif	/* sun */
2757
2758	if (arc_reclaim_needed())
2759		return (1);
2760
2761	return (arc_size > arc_c);
2762}
2763
2764/*
2765 * The buffer, supplied as the first argument, needs a data block.
2766 * So, if we are at cache max, determine which cache should be victimized.
2767 * We have the following cases:
2768 *
2769 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2770 * In this situation if we're out of space, but the resident size of the MFU is
2771 * under the limit, victimize the MFU cache to satisfy this insertion request.
2772 *
2773 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2774 * Here, we've used up all of the available space for the MRU, so we need to
2775 * evict from our own cache instead.  Evict from the set of resident MRU
2776 * entries.
2777 *
2778 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2779 * c minus p represents the MFU space in the cache, since p is the size of the
2780 * cache that is dedicated to the MRU.  In this situation there's still space on
2781 * the MFU side, so the MRU side needs to be victimized.
2782 *
2783 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2784 * MFU's resident set is consuming more space than it has been allotted.  In
2785 * this situation, we must victimize our own cache, the MFU, for this insertion.
2786 */
2787static void
2788arc_get_data_buf(arc_buf_t *buf)
2789{
2790	arc_state_t		*state = buf->b_hdr->b_state;
2791	uint64_t		size = buf->b_hdr->b_size;
2792	arc_buf_contents_t	type = buf->b_hdr->b_type;
2793
2794	arc_adapt(size, state);
2795
2796	/*
2797	 * We have not yet reached cache maximum size,
2798	 * just allocate a new buffer.
2799	 */
2800	if (!arc_evict_needed(type)) {
2801		if (type == ARC_BUFC_METADATA) {
2802			buf->b_data = zio_buf_alloc(size);
2803			arc_space_consume(size, ARC_SPACE_DATA);
2804		} else {
2805			ASSERT(type == ARC_BUFC_DATA);
2806			buf->b_data = zio_data_buf_alloc(size);
2807			ARCSTAT_INCR(arcstat_data_size, size);
2808			atomic_add_64(&arc_size, size);
2809		}
2810		goto out;
2811	}
2812
2813	/*
2814	 * If we are prefetching from the mfu ghost list, this buffer
2815	 * will end up on the mru list; so steal space from there.
2816	 */
2817	if (state == arc_mfu_ghost)
2818		state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2819	else if (state == arc_mru_ghost)
2820		state = arc_mru;
2821
2822	if (state == arc_mru || state == arc_anon) {
2823		uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2824		state = (arc_mfu->arcs_lsize[type] >= size &&
2825		    arc_p > mru_used) ? arc_mfu : arc_mru;
2826	} else {
2827		/* MFU cases */
2828		uint64_t mfu_space = arc_c - arc_p;
2829		state =  (arc_mru->arcs_lsize[type] >= size &&
2830		    mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2831	}
2832	if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
2833		if (type == ARC_BUFC_METADATA) {
2834			buf->b_data = zio_buf_alloc(size);
2835			arc_space_consume(size, ARC_SPACE_DATA);
2836		} else {
2837			ASSERT(type == ARC_BUFC_DATA);
2838			buf->b_data = zio_data_buf_alloc(size);
2839			ARCSTAT_INCR(arcstat_data_size, size);
2840			atomic_add_64(&arc_size, size);
2841		}
2842		ARCSTAT_BUMP(arcstat_recycle_miss);
2843	}
2844	ASSERT(buf->b_data != NULL);
2845out:
2846	/*
2847	 * Update the state size.  Note that ghost states have a
2848	 * "ghost size" and so don't need to be updated.
2849	 */
2850	if (!GHOST_STATE(buf->b_hdr->b_state)) {
2851		arc_buf_hdr_t *hdr = buf->b_hdr;
2852
2853		atomic_add_64(&hdr->b_state->arcs_size, size);
2854		if (list_link_active(&hdr->b_arc_node)) {
2855			ASSERT(refcount_is_zero(&hdr->b_refcnt));
2856			atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2857		}
2858		/*
2859		 * If we are growing the cache, and we are adding anonymous
2860		 * data, and we have outgrown arc_p, update arc_p
2861		 */
2862		if (arc_size < arc_c && hdr->b_state == arc_anon &&
2863		    arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2864			arc_p = MIN(arc_c, arc_p + size);
2865	}
2866	ARCSTAT_BUMP(arcstat_allocated);
2867}
2868
2869/*
2870 * This routine is called whenever a buffer is accessed.
2871 * NOTE: the hash lock is dropped in this function.
2872 */
2873static void
2874arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2875{
2876	clock_t now;
2877
2878	ASSERT(MUTEX_HELD(hash_lock));
2879
2880	if (buf->b_state == arc_anon) {
2881		/*
2882		 * This buffer is not in the cache, and does not
2883		 * appear in our "ghost" list.  Add the new buffer
2884		 * to the MRU state.
2885		 */
2886
2887		ASSERT(buf->b_arc_access == 0);
2888		buf->b_arc_access = ddi_get_lbolt();
2889		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2890		arc_change_state(arc_mru, buf, hash_lock);
2891
2892	} else if (buf->b_state == arc_mru) {
2893		now = ddi_get_lbolt();
2894
2895		/*
2896		 * If this buffer is here because of a prefetch, then either:
2897		 * - clear the flag if this is a "referencing" read
2898		 *   (any subsequent access will bump this into the MFU state).
2899		 * or
2900		 * - move the buffer to the head of the list if this is
2901		 *   another prefetch (to make it less likely to be evicted).
2902		 */
2903		if ((buf->b_flags & ARC_PREFETCH) != 0) {
2904			if (refcount_count(&buf->b_refcnt) == 0) {
2905				ASSERT(list_link_active(&buf->b_arc_node));
2906			} else {
2907				buf->b_flags &= ~ARC_PREFETCH;
2908				ARCSTAT_BUMP(arcstat_mru_hits);
2909			}
2910			buf->b_arc_access = now;
2911			return;
2912		}
2913
2914		/*
2915		 * This buffer has been "accessed" only once so far,
2916		 * but it is still in the cache. Move it to the MFU
2917		 * state.
2918		 */
2919		if (now > buf->b_arc_access + ARC_MINTIME) {
2920			/*
2921			 * More than 125ms have passed since we
2922			 * instantiated this buffer.  Move it to the
2923			 * most frequently used state.
2924			 */
2925			buf->b_arc_access = now;
2926			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2927			arc_change_state(arc_mfu, buf, hash_lock);
2928		}
2929		ARCSTAT_BUMP(arcstat_mru_hits);
2930	} else if (buf->b_state == arc_mru_ghost) {
2931		arc_state_t	*new_state;
2932		/*
2933		 * This buffer has been "accessed" recently, but
2934		 * was evicted from the cache.  Move it to the
2935		 * MFU state.
2936		 */
2937
2938		if (buf->b_flags & ARC_PREFETCH) {
2939			new_state = arc_mru;
2940			if (refcount_count(&buf->b_refcnt) > 0)
2941				buf->b_flags &= ~ARC_PREFETCH;
2942			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2943		} else {
2944			new_state = arc_mfu;
2945			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2946		}
2947
2948		buf->b_arc_access = ddi_get_lbolt();
2949		arc_change_state(new_state, buf, hash_lock);
2950
2951		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
2952	} else if (buf->b_state == arc_mfu) {
2953		/*
2954		 * This buffer has been accessed more than once and is
2955		 * still in the cache.  Keep it in the MFU state.
2956		 *
2957		 * NOTE: an add_reference() that occurred when we did
2958		 * the arc_read() will have kicked this off the list.
2959		 * If it was a prefetch, we will explicitly move it to
2960		 * the head of the list now.
2961		 */
2962		if ((buf->b_flags & ARC_PREFETCH) != 0) {
2963			ASSERT(refcount_count(&buf->b_refcnt) == 0);
2964			ASSERT(list_link_active(&buf->b_arc_node));
2965		}
2966		ARCSTAT_BUMP(arcstat_mfu_hits);
2967		buf->b_arc_access = ddi_get_lbolt();
2968	} else if (buf->b_state == arc_mfu_ghost) {
2969		arc_state_t	*new_state = arc_mfu;
2970		/*
2971		 * This buffer has been accessed more than once but has
2972		 * been evicted from the cache.  Move it back to the
2973		 * MFU state.
2974		 */
2975
2976		if (buf->b_flags & ARC_PREFETCH) {
2977			/*
2978			 * This is a prefetch access...
2979			 * move this block back to the MRU state.
2980			 */
2981			ASSERT0(refcount_count(&buf->b_refcnt));
2982			new_state = arc_mru;
2983		}
2984
2985		buf->b_arc_access = ddi_get_lbolt();
2986		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2987		arc_change_state(new_state, buf, hash_lock);
2988
2989		ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
2990	} else if (buf->b_state == arc_l2c_only) {
2991		/*
2992		 * This buffer is on the 2nd Level ARC.
2993		 */
2994
2995		buf->b_arc_access = ddi_get_lbolt();
2996		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2997		arc_change_state(arc_mfu, buf, hash_lock);
2998	} else {
2999		ASSERT(!"invalid arc state");
3000	}
3001}
3002
3003/* a generic arc_done_func_t which you can use */
3004/* ARGSUSED */
3005void
3006arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
3007{
3008	if (zio == NULL || zio->io_error == 0)
3009		bcopy(buf->b_data, arg, buf->b_hdr->b_size);
3010	VERIFY(arc_buf_remove_ref(buf, arg));
3011}
3012
3013/* a generic arc_done_func_t */
3014void
3015arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
3016{
3017	arc_buf_t **bufp = arg;
3018	if (zio && zio->io_error) {
3019		VERIFY(arc_buf_remove_ref(buf, arg));
3020		*bufp = NULL;
3021	} else {
3022		*bufp = buf;
3023		ASSERT(buf->b_data);
3024	}
3025}
3026
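/*
 * zio completion callback for reads issued by arc_read().  Byteswaps the
 * data if necessary, hands a buffer (cloned when there are multiple
 * waiters) to each registered callback, and clears ARC_IO_IN_PROGRESS.
 */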
3027static void
3028arc_read_done(zio_t *zio)
3029{
3030	arc_buf_hdr_t	*hdr;
3031	arc_buf_t	*buf;
3032	arc_buf_t	*abuf;	/* buffer we're assigning to callback */
3033	kmutex_t	*hash_lock = NULL;
3034	arc_callback_t	*callback_list, *acb;
3035	int		freeable = FALSE;
3036
3037	buf = zio->io_private;
3038	hdr = buf->b_hdr;
3039
3040	/*
3041	 * The hdr was inserted into hash-table and removed from lists
3042	 * prior to starting I/O.  We should find this header, since
3043	 * it's in the hash table, and it should be legit since it's
3044	 * not possible to evict it during the I/O.  The only possible
3045	 * reason for it not to be found is if we were freed during the
3046	 * read.
3047	 */
3048	if (HDR_IN_HASH_TABLE(hdr)) {
3049		ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
3050		ASSERT3U(hdr->b_dva.dva_word[0], ==,
3051		    BP_IDENTITY(zio->io_bp)->dva_word[0]);
3052		ASSERT3U(hdr->b_dva.dva_word[1], ==,
3053		    BP_IDENTITY(zio->io_bp)->dva_word[1]);
3054
3055		arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp,
3056		    &hash_lock);
3057
3058		ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) &&
3059		    hash_lock == NULL) ||
3060		    (found == hdr &&
3061		    DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
3062		    (found == hdr && HDR_L2_READING(hdr)));
3063	}
3064
3065	hdr->b_flags &= ~ARC_L2_EVICTED;
3066	if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
3067		hdr->b_flags &= ~ARC_L2CACHE;
3068
3069	/* byteswap if necessary */
3070	callback_list = hdr->b_acb;
3071	ASSERT(callback_list != NULL);
3072	if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
3073		dmu_object_byteswap_t bswap =
3074		    DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
3075		arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
3076		    byteswap_uint64_array :
3077		    dmu_ot_byteswap[bswap].ob_func;
3078		func(buf->b_data, hdr->b_size);
3079	}
3080
3081	arc_cksum_compute(buf, B_FALSE);
3082#ifdef illumos
3083	arc_buf_watch(buf);
3084#endif /* illumos */
3085
3086	if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
3087		/*
3088		 * Only call arc_access on anonymous buffers.  This is because
3089		 * if we've issued an I/O for an evicted buffer, we've already
3090		 * called arc_access (to prevent any simultaneous readers from
3091		 * getting confused).
3092		 */
3093		arc_access(hdr, hash_lock);
3094	}
3095
3096	/* create copies of the data buffer for the callers */
3097	abuf = buf;
3098	for (acb = callback_list; acb; acb = acb->acb_next) {
3099		if (acb->acb_done) {
3100			if (abuf == NULL) {
3101				ARCSTAT_BUMP(arcstat_duplicate_reads);
3102				abuf = arc_buf_clone(buf);
3103			}
3104			acb->acb_buf = abuf;
3105			abuf = NULL;
3106		}
3107	}
3108	hdr->b_acb = NULL;
3109	hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3110	ASSERT(!HDR_BUF_AVAILABLE(hdr));
3111	if (abuf == buf) {
3112		ASSERT(buf->b_efunc == NULL);
3113		ASSERT(hdr->b_datacnt == 1);
3114		hdr->b_flags |= ARC_BUF_AVAILABLE;
3115	}
3116
3117	ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
3118
3119	if (zio->io_error != 0) {
3120		hdr->b_flags |= ARC_IO_ERROR;
3121		if (hdr->b_state != arc_anon)
3122			arc_change_state(arc_anon, hdr, hash_lock);
3123		if (HDR_IN_HASH_TABLE(hdr))
3124			buf_hash_remove(hdr);
3125		freeable = refcount_is_zero(&hdr->b_refcnt);
3126	}
3127
3128	/*
3129	 * Broadcast before we drop the hash_lock to avoid the possibility
3130	 * that the hdr (and hence the cv) might be freed before we get to
3131	 * the cv_broadcast().
3132	 */
3133	cv_broadcast(&hdr->b_cv);
3134
3135	if (hash_lock) {
3136		mutex_exit(hash_lock);
3137	} else {
3138		/*
3139		 * This block was freed while we waited for the read to
3140		 * complete.  It has been removed from the hash table and
3141		 * moved to the anonymous state (so that it won't show up
3142		 * in the cache).
3143		 */
3144		ASSERT3P(hdr->b_state, ==, arc_anon);
3145		freeable = refcount_is_zero(&hdr->b_refcnt);
3146	}
3147
3148	/* execute each callback and free its structure */
3149	while ((acb = callback_list) != NULL) {
3150		if (acb->acb_done)
3151			acb->acb_done(zio, acb->acb_buf, acb->acb_private);
3152
3153		if (acb->acb_zio_dummy != NULL) {
3154			acb->acb_zio_dummy->io_error = zio->io_error;
3155			zio_nowait(acb->acb_zio_dummy);
3156		}
3157
3158		callback_list = acb->acb_next;
3159		kmem_free(acb, sizeof (arc_callback_t));
3160	}
3161
3162	if (freeable)
3163		arc_hdr_destroy(hdr);
3164}
3165
3166/*
3167 * "Read" the block block at the specified DVA (in bp) via the
3168 * cache.  If the block is found in the cache, invoke the provided
3169 * callback immediately and return.  Note that the `zio' parameter
3170 * in the callback will be NULL in this case, since no IO was
3171 * required.  If the block is not in the cache pass the read request
3172 * on to the spa with a substitute callback function, so that the
3173 * requested block will be added to the cache.
3174 *
3175 * If a read request arrives for a block that has a read in-progress,
3176 * either wait for the in-progress read to complete (and return the
3177 * results); or, if this is a read with a "done" func, add a record
3178 * to the read to invoke the "done" func when the read completes,
3179 * and return; or just return.
3180 *
3181 * arc_read_done() will invoke all the requested "done" functions
3182 * for readers of this block.
3183 */
3184int
3185arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
3186    void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags,
3187    const zbookmark_phys_t *zb)
3188{
3189	arc_buf_hdr_t *hdr = NULL;
3190	arc_buf_t *buf = NULL;
3191	kmutex_t *hash_lock = NULL;
3192	zio_t *rzio;
3193	uint64_t guid = spa_load_guid(spa);
3194
3195	ASSERT(!BP_IS_EMBEDDED(bp) ||
3196	    BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
3197
3198top:
3199	if (!BP_IS_EMBEDDED(bp)) {
3200		/*
3201		 * Embedded BP's have no DVA and require no I/O to "read".
3202		 * Create an anonymous arc buf to back it.
3203		 */
3204		hdr = buf_hash_find(guid, bp, &hash_lock);
3205	}
3206
3207	if (hdr != NULL && hdr->b_datacnt > 0) {
3208
3209		*arc_flags |= ARC_CACHED;
3210
3211		if (HDR_IO_IN_PROGRESS(hdr)) {
3212
3213			if (*arc_flags & ARC_WAIT) {
3214				cv_wait(&hdr->b_cv, hash_lock);
3215				mutex_exit(hash_lock);
3216				goto top;
3217			}
3218			ASSERT(*arc_flags & ARC_NOWAIT);
3219
3220			if (done) {
3221				arc_callback_t	*acb = NULL;
3222
3223				acb = kmem_zalloc(sizeof (arc_callback_t),
3224				    KM_SLEEP);
3225				acb->acb_done = done;
3226				acb->acb_private = private;
3227				if (pio != NULL)
3228					acb->acb_zio_dummy = zio_null(pio,
3229					    spa, NULL, NULL, NULL, zio_flags);
3230
3231				ASSERT(acb->acb_done != NULL);
3232				acb->acb_next = hdr->b_acb;
3233				hdr->b_acb = acb;
3234				add_reference(hdr, hash_lock, private);
3235				mutex_exit(hash_lock);
3236				return (0);
3237			}
3238			mutex_exit(hash_lock);
3239			return (0);
3240		}
3241
3242		ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3243
3244		if (done) {
3245			add_reference(hdr, hash_lock, private);
3246			/*
3247			 * If this block is already in use, create a new
3248			 * copy of the data so that we will be guaranteed
3249			 * that arc_release() will always succeed.
3250			 */
3251			buf = hdr->b_buf;
3252			ASSERT(buf);
3253			ASSERT(buf->b_data);
3254			if (HDR_BUF_AVAILABLE(hdr)) {
3255				ASSERT(buf->b_efunc == NULL);
3256				hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3257			} else {
3258				buf = arc_buf_clone(buf);
3259			}
3260
3261		} else if (*arc_flags & ARC_PREFETCH &&
3262		    refcount_count(&hdr->b_refcnt) == 0) {
3263			hdr->b_flags |= ARC_PREFETCH;
3264		}
3265		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
3266		arc_access(hdr, hash_lock);
3267		if (*arc_flags & ARC_L2CACHE)
3268			hdr->b_flags |= ARC_L2CACHE;
3269		if (*arc_flags & ARC_L2COMPRESS)
3270			hdr->b_flags |= ARC_L2COMPRESS;
3271		mutex_exit(hash_lock);
3272		ARCSTAT_BUMP(arcstat_hits);
3273		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3274		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3275		    data, metadata, hits);
3276
3277		if (done)
3278			done(NULL, buf, private);
3279	} else {
3280		uint64_t size = BP_GET_LSIZE(bp);
3281		arc_callback_t *acb;
3282		vdev_t *vd = NULL;
3283		uint64_t addr = 0;
3284		boolean_t devw = B_FALSE;
3285		enum zio_compress b_compress = ZIO_COMPRESS_OFF;
3286		uint64_t b_asize = 0;
3287
3288		if (hdr == NULL) {
3289			/* this block is not in the cache */
3290			arc_buf_hdr_t *exists = NULL;
3291			arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
3292			buf = arc_buf_alloc(spa, size, private, type);
3293			hdr = buf->b_hdr;
3294			if (!BP_IS_EMBEDDED(bp)) {
3295				hdr->b_dva = *BP_IDENTITY(bp);
3296				hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
3297				hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
3298				exists = buf_hash_insert(hdr, &hash_lock);
3299			}
3300			if (exists != NULL) {
3301				/* somebody beat us to the hash insert */
3302				mutex_exit(hash_lock);
3303				buf_discard_identity(hdr);
3304				(void) arc_buf_remove_ref(buf, private);
3305				goto top; /* restart the IO request */
3306			}
3307			/* if this is a prefetch, we don't have a reference */
3308			if (*arc_flags & ARC_PREFETCH) {
3309				(void) remove_reference(hdr, hash_lock,
3310				    private);
3311				hdr->b_flags |= ARC_PREFETCH;
3312			}
3313			if (*arc_flags & ARC_L2CACHE)
3314				hdr->b_flags |= ARC_L2CACHE;
3315			if (*arc_flags & ARC_L2COMPRESS)
3316				hdr->b_flags |= ARC_L2COMPRESS;
3317			if (BP_GET_LEVEL(bp) > 0)
3318				hdr->b_flags |= ARC_INDIRECT;
3319		} else {
3320			/* this block is in the ghost cache */
3321			ASSERT(GHOST_STATE(hdr->b_state));
3322			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3323			ASSERT0(refcount_count(&hdr->b_refcnt));
3324			ASSERT(hdr->b_buf == NULL);
3325
3326			/* if this is a prefetch, we don't have a reference */
3327			if (*arc_flags & ARC_PREFETCH)
3328				hdr->b_flags |= ARC_PREFETCH;
3329			else
3330				add_reference(hdr, hash_lock, private);
3331			if (*arc_flags & ARC_L2CACHE)
3332				hdr->b_flags |= ARC_L2CACHE;
3333			if (*arc_flags & ARC_L2COMPRESS)
3334				hdr->b_flags |= ARC_L2COMPRESS;
3335			buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
3336			buf->b_hdr = hdr;
3337			buf->b_data = NULL;
3338			buf->b_efunc = NULL;
3339			buf->b_private = NULL;
3340			buf->b_next = NULL;
3341			hdr->b_buf = buf;
3342			ASSERT(hdr->b_datacnt == 0);
3343			hdr->b_datacnt = 1;
3344			arc_get_data_buf(buf);
3345			arc_access(hdr, hash_lock);
3346		}
3347
3348		ASSERT(!GHOST_STATE(hdr->b_state));
3349
3350		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
3351		acb->acb_done = done;
3352		acb->acb_private = private;
3353
3354		ASSERT(hdr->b_acb == NULL);
3355		hdr->b_acb = acb;
3356		hdr->b_flags |= ARC_IO_IN_PROGRESS;
3357
3358		if (hdr->b_l2hdr != NULL &&
3359		    (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
3360			devw = hdr->b_l2hdr->b_dev->l2ad_writing;
3361			addr = hdr->b_l2hdr->b_daddr;
3362			b_compress = hdr->b_l2hdr->b_compress;
3363			b_asize = hdr->b_l2hdr->b_asize;
3364			/*
3365			 * Lock out device removal.
3366			 */
3367			if (vdev_is_dead(vd) ||
3368			    !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
3369				vd = NULL;
3370		}
3371
3372		if (hash_lock != NULL)
3373			mutex_exit(hash_lock);
3374
3375		/*
3376		 * At this point, we have a level 1 cache miss.  Try again in
3377		 * L2ARC if possible.
3378		 */
3379		ASSERT3U(hdr->b_size, ==, size);
3380		DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
3381		    uint64_t, size, zbookmark_phys_t *, zb);
3382		ARCSTAT_BUMP(arcstat_misses);
3383		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3384		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3385		    data, metadata, misses);
3386#ifdef _KERNEL
3387		curthread->td_ru.ru_inblock++;
3388#endif
3389
3390		if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
3391			/*
3392			 * Read from the L2ARC if the following are true:
3393			 * 1. The L2ARC vdev was previously cached.
3394			 * 2. This buffer still has L2ARC metadata.
3395			 * 3. This buffer isn't currently writing to the L2ARC.
3396			 * 4. The L2ARC entry wasn't evicted, which may
3397			 *    also have invalidated the vdev.
			 * 5. This isn't a prefetch that l2arc_noprefetch
			 *    tells us to skip.
3399			 */
3400			if (hdr->b_l2hdr != NULL &&
3401			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
3402			    !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
3403				l2arc_read_callback_t *cb;
3404
3405				DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
3406				ARCSTAT_BUMP(arcstat_l2_hits);
3407
3408				cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
3409				    KM_SLEEP);
3410				cb->l2rcb_buf = buf;
3411				cb->l2rcb_spa = spa;
3412				cb->l2rcb_bp = *bp;
3413				cb->l2rcb_zb = *zb;
3414				cb->l2rcb_flags = zio_flags;
3415				cb->l2rcb_compress = b_compress;
3416
3417				ASSERT(addr >= VDEV_LABEL_START_SIZE &&
3418				    addr + size < vd->vdev_psize -
3419				    VDEV_LABEL_END_SIZE);
3420
3421				/*
3422				 * l2arc read.  The SCL_L2ARC lock will be
3423				 * released by l2arc_read_done().
3424				 * Issue a null zio if the underlying buffer
3425				 * was squashed to zero size by compression.
3426				 */
3427				if (b_compress == ZIO_COMPRESS_EMPTY) {
3428					rzio = zio_null(pio, spa, vd,
3429					    l2arc_read_done, cb,
3430					    zio_flags | ZIO_FLAG_DONT_CACHE |
3431					    ZIO_FLAG_CANFAIL |
3432					    ZIO_FLAG_DONT_PROPAGATE |
3433					    ZIO_FLAG_DONT_RETRY);
3434				} else {
3435					rzio = zio_read_phys(pio, vd, addr,
3436					    b_asize, buf->b_data,
3437					    ZIO_CHECKSUM_OFF,
3438					    l2arc_read_done, cb, priority,
3439					    zio_flags | ZIO_FLAG_DONT_CACHE |
3440					    ZIO_FLAG_CANFAIL |
3441					    ZIO_FLAG_DONT_PROPAGATE |
3442					    ZIO_FLAG_DONT_RETRY, B_FALSE);
3443				}
3444				DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
3445				    zio_t *, rzio);
3446				ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize);
3447
3448				if (*arc_flags & ARC_NOWAIT) {
3449					zio_nowait(rzio);
3450					return (0);
3451				}
3452
3453				ASSERT(*arc_flags & ARC_WAIT);
3454				if (zio_wait(rzio) == 0)
3455					return (0);
3456
3457				/* l2arc read error; goto zio_read() */
3458			} else {
3459				DTRACE_PROBE1(l2arc__miss,
3460				    arc_buf_hdr_t *, hdr);
3461				ARCSTAT_BUMP(arcstat_l2_misses);
3462				if (HDR_L2_WRITING(hdr))
3463					ARCSTAT_BUMP(arcstat_l2_rw_clash);
3464				spa_config_exit(spa, SCL_L2ARC, vd);
3465			}
3466		} else {
3467			if (vd != NULL)
3468				spa_config_exit(spa, SCL_L2ARC, vd);
3469			if (l2arc_ndev != 0) {
3470				DTRACE_PROBE1(l2arc__miss,
3471				    arc_buf_hdr_t *, hdr);
3472				ARCSTAT_BUMP(arcstat_l2_misses);
3473			}
3474		}
3475
3476		rzio = zio_read(pio, spa, bp, buf->b_data, size,
3477		    arc_read_done, buf, priority, zio_flags, zb);
3478
3479		if (*arc_flags & ARC_WAIT)
3480			return (zio_wait(rzio));
3481
3482		ASSERT(*arc_flags & ARC_NOWAIT);
3483		zio_nowait(rzio);
3484	}
3485	return (0);
3486}
3487
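/*
 * Register an eviction callback (b_efunc) and its private argument to be
 * invoked when this buffer is evicted from the cache; cleared via
 * arc_clear_callback().
 */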
3488void
3489arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
3490{
3491	ASSERT(buf->b_hdr != NULL);
3492	ASSERT(buf->b_hdr->b_state != arc_anon);
3493	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
3494	ASSERT(buf->b_efunc == NULL);
3495	ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
3496
3497	buf->b_efunc = func;
3498	buf->b_private = private;
3499}
3500
3501/*
3502 * Notify the arc that a block was freed, and thus will never be used again.
3503 */
3504void
3505arc_freed(spa_t *spa, const blkptr_t *bp)
3506{
3507	arc_buf_hdr_t *hdr;
3508	kmutex_t *hash_lock;
3509	uint64_t guid = spa_load_guid(spa);
3510
3511	ASSERT(!BP_IS_EMBEDDED(bp));
3512
3513	hdr = buf_hash_find(guid, bp, &hash_lock);
3514	if (hdr == NULL)
3515		return;
3516	if (HDR_BUF_AVAILABLE(hdr)) {
3517		arc_buf_t *buf = hdr->b_buf;
3518		add_reference(hdr, hash_lock, FTAG);
3519		hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3520		mutex_exit(hash_lock);
3521
3522		arc_release(buf, FTAG);
3523		(void) arc_buf_remove_ref(buf, FTAG);
3524	} else {
3525		mutex_exit(hash_lock);
3526	}
3527
3528}
3529
3530/*
3531 * Clear the user eviction callback set by arc_set_callback(), first calling
3532 * it if it exists.  Because the presence of a callback keeps an arc_buf cached,
3533 * clearing the callback may result in the arc_buf being destroyed.  However,
3534 * it will not result in the *last* arc_buf being destroyed, hence the data
3535 * will remain cached in the ARC. We make a copy of the arc buffer here so
3536 * that we can process the callback without holding any locks.
3537 *
3538 * It's possible that the callback is already in the process of being cleared
3539 * by another thread.  In this case we cannot clear the callback.
3540 *
3541 * Returns B_TRUE if the callback was successfully called and cleared.
3542 */
3543boolean_t
3544arc_clear_callback(arc_buf_t *buf)
3545{
3546	arc_buf_hdr_t *hdr;
3547	kmutex_t *hash_lock;
3548	arc_evict_func_t *efunc = buf->b_efunc;
3549	void *private = buf->b_private;
3550	list_t *list, *evicted_list;
3551	kmutex_t *lock, *evicted_lock;
3552
3553	mutex_enter(&buf->b_evict_lock);
3554	hdr = buf->b_hdr;
3555	if (hdr == NULL) {
3556		/*
3557		 * We are in arc_do_user_evicts().
3558		 */
3559		ASSERT(buf->b_data == NULL);
3560		mutex_exit(&buf->b_evict_lock);
3561		return (B_FALSE);
3562	} else if (buf->b_data == NULL) {
3563		/*
3564		 * We are on the eviction list; process this buffer now
3565		 * but let arc_do_user_evicts() do the reaping.
3566		 */
3567		buf->b_efunc = NULL;
3568		mutex_exit(&buf->b_evict_lock);
3569		VERIFY0(efunc(private));
3570		return (B_TRUE);
3571	}
3572	hash_lock = HDR_LOCK(hdr);
3573	mutex_enter(hash_lock);
3574	hdr = buf->b_hdr;
3575	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3576
3577	ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
3578	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3579
3580	buf->b_efunc = NULL;
3581	buf->b_private = NULL;
3582
3583	if (hdr->b_datacnt > 1) {
3584		mutex_exit(&buf->b_evict_lock);
3585		arc_buf_destroy(buf, FALSE, TRUE);
3586	} else {
3587		ASSERT(buf == hdr->b_buf);
3588		hdr->b_flags |= ARC_BUF_AVAILABLE;
3589		mutex_exit(&buf->b_evict_lock);
3590	}
3591
3592	mutex_exit(hash_lock);
3593	VERIFY0(efunc(private));
3594	return (B_TRUE);
3595}
3596
3597/*
3598 * Release this buffer from the cache, making it an anonymous buffer.  This
3599 * must be done after a read and prior to modifying the buffer contents.
3600 * If the buffer has more than one reference, we must make
3601 * a new hdr for the buffer.
3602 */
3603void
3604arc_release(arc_buf_t *buf, void *tag)
3605{
3606	arc_buf_hdr_t *hdr;
3607	kmutex_t *hash_lock = NULL;
3608	l2arc_buf_hdr_t *l2hdr;
3609	uint64_t buf_size;
3610
3611	/*
3612	 * It would be nice to assert that if it's DMU metadata (level >
3613	 * 0 || it's the dnode file), then it must be syncing context.
3614	 * But we don't know that information at this level.
3615	 */
3616
3617	mutex_enter(&buf->b_evict_lock);
3618	hdr = buf->b_hdr;
3619
3620	/* this buffer is not on any list */
3621	ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3622
3623	if (hdr->b_state == arc_anon) {
3624		/* this buffer is already released */
3625		ASSERT(buf->b_efunc == NULL);
3626	} else {
3627		hash_lock = HDR_LOCK(hdr);
3628		mutex_enter(hash_lock);
3629		hdr = buf->b_hdr;
3630		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3631	}
3632
3633	l2hdr = hdr->b_l2hdr;
3634	if (l2hdr) {
3635		mutex_enter(&l2arc_buflist_mtx);
3636		arc_buf_l2_cdata_free(hdr);
3637		hdr->b_l2hdr = NULL;
3638		list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3639	}
3640	buf_size = hdr->b_size;
3641
3642	/*
3643	 * Do we have more than one buf?
3644	 */
3645	if (hdr->b_datacnt > 1) {
3646		arc_buf_hdr_t *nhdr;
3647		arc_buf_t **bufp;
3648		uint64_t blksz = hdr->b_size;
3649		uint64_t spa = hdr->b_spa;
3650		arc_buf_contents_t type = hdr->b_type;
3651		uint32_t flags = hdr->b_flags;
3652
3653		ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3654		/*
3655		 * Pull the data off of this hdr and attach it to
3656		 * a new anonymous hdr.
3657		 */
3658		(void) remove_reference(hdr, hash_lock, tag);
3659		bufp = &hdr->b_buf;
3660		while (*bufp != buf)
3661			bufp = &(*bufp)->b_next;
3662		*bufp = buf->b_next;
3663		buf->b_next = NULL;
3664
3665		ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
3666		atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3667		if (refcount_is_zero(&hdr->b_refcnt)) {
3668			uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
3669			ASSERT3U(*size, >=, hdr->b_size);
3670			atomic_add_64(size, -hdr->b_size);
3671		}
3672
3673		/*
3674		 * We're releasing a duplicate user data buffer, so update
3675		 * our statistics accordingly.
3676		 */
3677		if (hdr->b_type == ARC_BUFC_DATA) {
3678			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
3679			ARCSTAT_INCR(arcstat_duplicate_buffers_size,
3680			    -hdr->b_size);
3681		}
3682		hdr->b_datacnt -= 1;
3683		arc_cksum_verify(buf);
3684#ifdef illumos
3685		arc_buf_unwatch(buf);
3686#endif /* illumos */
3687
3688		mutex_exit(hash_lock);
3689
3690		nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3691		nhdr->b_size = blksz;
3692		nhdr->b_spa = spa;
3693		nhdr->b_type = type;
3694		nhdr->b_buf = buf;
3695		nhdr->b_state = arc_anon;
3696		nhdr->b_arc_access = 0;
3697		nhdr->b_flags = flags & ARC_L2_WRITING;
3698		nhdr->b_l2hdr = NULL;
3699		nhdr->b_datacnt = 1;
3700		nhdr->b_freeze_cksum = NULL;
3701		(void) refcount_add(&nhdr->b_refcnt, tag);
3702		buf->b_hdr = nhdr;
3703		mutex_exit(&buf->b_evict_lock);
3704		atomic_add_64(&arc_anon->arcs_size, blksz);
3705	} else {
3706		mutex_exit(&buf->b_evict_lock);
3707		ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3708		ASSERT(!list_link_active(&hdr->b_arc_node));
3709		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3710		if (hdr->b_state != arc_anon)
3711			arc_change_state(arc_anon, hdr, hash_lock);
3712		hdr->b_arc_access = 0;
3713		if (hash_lock)
3714			mutex_exit(hash_lock);
3715
3716		buf_discard_identity(hdr);
3717		arc_buf_thaw(buf);
3718	}
3719	buf->b_efunc = NULL;
3720	buf->b_private = NULL;
3721
3722	if (l2hdr) {
3723		ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
3724		vdev_space_update(l2hdr->b_dev->l2ad_vdev,
3725		    -l2hdr->b_asize, 0, 0);
3726		trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr,
3727		    hdr->b_size, 0);
3728		kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3729		ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3730		mutex_exit(&l2arc_buflist_mtx);
3731	}
3732}
3733
3734int
3735arc_released(arc_buf_t *buf)
3736{
3737	int released;
3738
3739	mutex_enter(&buf->b_evict_lock);
3740	released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3741	mutex_exit(&buf->b_evict_lock);
3742	return (released);
3743}
3744
3745#ifdef ZFS_DEBUG
3746int
3747arc_referenced(arc_buf_t *buf)
3748{
3749	int referenced;
3750
3751	mutex_enter(&buf->b_evict_lock);
3752	referenced = (refcount_count(&buf->b_hdr->b_refcnt));
3753	mutex_exit(&buf->b_evict_lock);
3754	return (referenced);
3755}
3756#endif
3757
3758static void
3759arc_write_ready(zio_t *zio)
3760{
3761	arc_write_callback_t *callback = zio->io_private;
3762	arc_buf_t *buf = callback->awcb_buf;
3763	arc_buf_hdr_t *hdr = buf->b_hdr;
3764
3765	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3766	callback->awcb_ready(zio, buf, callback->awcb_private);
3767
3768	/*
3769	 * If the IO is already in progress, then this is a re-write
3770	 * attempt, so we need to thaw and re-compute the cksum.
3771	 * It is the responsibility of the callback to handle the
3772	 * accounting for any re-write attempt.
3773	 */
3774	if (HDR_IO_IN_PROGRESS(hdr)) {
3775		mutex_enter(&hdr->b_freeze_lock);
3776		if (hdr->b_freeze_cksum != NULL) {
3777			kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3778			hdr->b_freeze_cksum = NULL;
3779		}
3780		mutex_exit(&hdr->b_freeze_lock);
3781	}
3782	arc_cksum_compute(buf, B_FALSE);
3783	hdr->b_flags |= ARC_IO_IN_PROGRESS;
3784}
3785
3786/*
3787 * The SPA calls this callback for each physical write that happens on behalf
3788 * of a logical write.  See the comment in dbuf_write_physdone() for details.
3789 */
3790static void
3791arc_write_physdone(zio_t *zio)
3792{
3793	arc_write_callback_t *cb = zio->io_private;
3794	if (cb->awcb_physdone != NULL)
3795		cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
3796}
3797
3798static void
3799arc_write_done(zio_t *zio)
3800{
3801	arc_write_callback_t *callback = zio->io_private;
3802	arc_buf_t *buf = callback->awcb_buf;
3803	arc_buf_hdr_t *hdr = buf->b_hdr;
3804
3805	ASSERT(hdr->b_acb == NULL);
3806
3807	if (zio->io_error == 0) {
3808		if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
3809			buf_discard_identity(hdr);
3810		} else {
3811			hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3812			hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3813			hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3814		}
3815	} else {
3816		ASSERT(BUF_EMPTY(hdr));
3817	}
3818
3819	/*
3820	 * If the block to be written was all-zero or compressed enough to be
3821	 * embedded in the BP, no write was performed so there will be no
3822	 * dva/birth/checksum.  The buffer must therefore remain anonymous
3823	 * (and uncached).
3824	 */
3825	if (!BUF_EMPTY(hdr)) {
3826		arc_buf_hdr_t *exists;
3827		kmutex_t *hash_lock;
3828
3829		ASSERT(zio->io_error == 0);
3830
3831		arc_cksum_verify(buf);
3832
3833		exists = buf_hash_insert(hdr, &hash_lock);
3834		if (exists) {
3835			/*
3836			 * This can only happen if we overwrite for
3837			 * sync-to-convergence, because we remove
3838			 * buffers from the hash table when we arc_free().
3839			 */
3840			if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
3841				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3842					panic("bad overwrite, hdr=%p exists=%p",
3843					    (void *)hdr, (void *)exists);
3844				ASSERT(refcount_is_zero(&exists->b_refcnt));
3845				arc_change_state(arc_anon, exists, hash_lock);
3846				mutex_exit(hash_lock);
3847				arc_hdr_destroy(exists);
3848				exists = buf_hash_insert(hdr, &hash_lock);
3849				ASSERT3P(exists, ==, NULL);
3850			} else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
3851				/* nopwrite */
3852				ASSERT(zio->io_prop.zp_nopwrite);
3853				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3854					panic("bad nopwrite, hdr=%p exists=%p",
3855					    (void *)hdr, (void *)exists);
3856			} else {
3857				/* Dedup */
3858				ASSERT(hdr->b_datacnt == 1);
3859				ASSERT(hdr->b_state == arc_anon);
3860				ASSERT(BP_GET_DEDUP(zio->io_bp));
3861				ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
3862			}
3863		}
3864		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3865		/* if it's not anon, we are doing a scrub */
3866		if (!exists && hdr->b_state == arc_anon)
3867			arc_access(hdr, hash_lock);
3868		mutex_exit(hash_lock);
3869	} else {
3870		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3871	}
3872
3873	ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3874	callback->awcb_done(zio, buf, callback->awcb_private);
3875
3876	kmem_free(callback, sizeof (arc_write_callback_t));
3877}
3878
3879zio_t *
3880arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3881    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
3882    const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
3883    arc_done_func_t *done, void *private, zio_priority_t priority,
3884    int zio_flags, const zbookmark_phys_t *zb)
3885{
3886	arc_buf_hdr_t *hdr = buf->b_hdr;
3887	arc_write_callback_t *callback;
3888	zio_t *zio;
3889
3890	ASSERT(ready != NULL);
3891	ASSERT(done != NULL);
3892	ASSERT(!HDR_IO_ERROR(hdr));
3893	ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3894	ASSERT(hdr->b_acb == NULL);
3895	if (l2arc)
3896		hdr->b_flags |= ARC_L2CACHE;
3897	if (l2arc_compress)
3898		hdr->b_flags |= ARC_L2COMPRESS;
3899	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3900	callback->awcb_ready = ready;
3901	callback->awcb_physdone = physdone;
3902	callback->awcb_done = done;
3903	callback->awcb_private = private;
3904	callback->awcb_buf = buf;
3905
3906	zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
3907	    arc_write_ready, arc_write_physdone, arc_write_done, callback,
3908	    priority, zio_flags, zb);
3909
3910	return (zio);
3911}
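
/*
 * Illustrative sketch, not part of the implementation: a hypothetical
 * arc_write() caller.  The three callbacks map onto the helpers above:
 * the ready callback fires from arc_write_ready() as the write is being
 * prepared, physdone fires from arc_write_physdone() after each physical
 * write, and done fires from arc_write_done() once the block pointer is
 * final.  The callback names, zio property setup and flags below are made
 * up for the example.
 *
 *	zio_t *wzio = arc_write(pio, spa, txg, bp, buf,
 *	    B_TRUE, B_FALSE, &zp,
 *	    example_ready, example_physdone, example_done, cb_arg,
 *	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, zb);
 *	zio_nowait(wzio);
 *
 * The two booleans correspond to the l2arc and l2arc_compress arguments,
 * i.e. "cache this block in the L2ARC, but do not compress it there".
 */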
3912
3913static int
3914arc_memory_throttle(uint64_t reserve, uint64_t txg)
3915{
3916#ifdef _KERNEL
3917	uint64_t available_memory =
3918	    ptoa((uintmax_t)cnt.v_free_count + cnt.v_cache_count);
3919	static uint64_t page_load = 0;
3920	static uint64_t last_txg = 0;
3921
3922#ifdef sun
3923#if defined(__i386)
3924	available_memory =
3925	    MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
3926#endif
3927#endif	/* sun */
3928
3929	if (cnt.v_free_count + cnt.v_cache_count >
3930	    (uint64_t)physmem * arc_lotsfree_percent / 100)
3931		return (0);
3932
3933	if (txg > last_txg) {
3934		last_txg = txg;
3935		page_load = 0;
3936	}
3937	/*
3938	 * If we are in pageout, we know that memory is already tight
3939	 * and the ARC is already going to be evicting, so we just want to
3940	 * continue to let page writes occur as quickly as possible.
3941	 */
3942	if (curproc == pageproc) {
3943		if (page_load > available_memory / 4)
3944			return (SET_ERROR(ERESTART));
3945		/* Note: reserve is inflated, so we deflate */
3946		page_load += reserve / 8;
3947		return (0);
3948	} else if (page_load > 0 && arc_reclaim_needed()) {
3949		/* memory is low, delay before restarting */
3950		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3951		return (SET_ERROR(EAGAIN));
3952	}
3953	page_load = 0;
3954#endif
3955	return (0);
3956}
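
/*
 * Worked example of the throttle above (hypothetical numbers): with 512MB
 * of free + cache pages, a pageout thread is allowed to keep dirtying until
 * page_load exceeds available_memory / 4 = 128MB.  Because the reserve
 * passed in is inflated by the caller, only reserve / 8 is charged, so a
 * 16MB reserve adds 2MB to page_load.  Once the 128MB budget is exceeded,
 * ERESTART pushes the work back until a later transaction group resets
 * page_load.
 */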
3957
3958void
3959arc_tempreserve_clear(uint64_t reserve)
3960{
3961	atomic_add_64(&arc_tempreserve, -reserve);
3962	ASSERT((int64_t)arc_tempreserve >= 0);
3963}
3964
3965int
3966arc_tempreserve_space(uint64_t reserve, uint64_t txg)
3967{
3968	int error;
3969	uint64_t anon_size;
3970
3971	if (reserve > arc_c/4 && !arc_no_grow)
3972		arc_c = MIN(arc_c_max, reserve * 4);
3973	if (reserve > arc_c)
3974		return (SET_ERROR(ENOMEM));
3975
3976	/*
3977	 * Don't count loaned bufs as in flight dirty data to prevent long
3978	 * network delays from blocking transactions that are ready to be
3979	 * assigned to a txg.
3980	 */
3981	anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
3982
3983	/*
3984	 * Writes will, almost always, require additional memory allocations
3985	 * in order to compress/encrypt/etc the data.  We therefore need to
3986	 * make sure that there is sufficient available memory for this.
3987	 */
3988	error = arc_memory_throttle(reserve, txg);
3989	if (error != 0)
3990		return (error);
3991
3992	/*
3993	 * Throttle writes when the amount of dirty data in the cache
3994	 * gets too large.  We try to keep the cache less than half full
3995	 * of dirty blocks so that our sync times don't grow too large.
3996	 * Note: if two requests come in concurrently, we might let them
3997	 * both succeed, when one of them should fail.  Not a huge deal.
3998	 */
3999
4000	if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
4001	    anon_size > arc_c / 4) {
4002		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
4003		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
4004		    arc_tempreserve>>10,
4005		    arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
4006		    arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
4007		    reserve>>10, arc_c>>10);
4008		return (SET_ERROR(ERESTART));
4009	}
4010	atomic_add_64(&arc_tempreserve, reserve);
4011	return (0);
4012}
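
/*
 * Worked example of the dirty-data throttle above (hypothetical sizes):
 * with arc_c = 1GB, a reservation is turned back with ERESTART only when
 * both reserve + arc_tempreserve + anon_size > 512MB (arc_c / 2) and
 * anon_size > 256MB (arc_c / 4).  A large reservation against an otherwise
 * clean cache is therefore admitted, while one arriving when anonymous
 * (not-yet-synced) data already exceeds a quarter of the cache is retried
 * after some of that data has been synced out.
 */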
4013
4014static kmutex_t arc_lowmem_lock;
4015#ifdef _KERNEL
4016static eventhandler_tag arc_event_lowmem = NULL;
4017
4018static void
4019arc_lowmem(void *arg __unused, int howto __unused)
4020{
4021
4022	/* Serialize access via arc_lowmem_lock. */
4023	mutex_enter(&arc_lowmem_lock);
4024	mutex_enter(&arc_reclaim_thr_lock);
4025	needfree = 1;
4026	cv_signal(&arc_reclaim_thr_cv);
4027
4028	/*
4029	 * It is unsafe to block here in arbitrary threads, because we can come
4030	 * here from ARC itself and may hold ARC locks and thus risk a deadlock
4031	 * with the ARC reclaim thread.
4032	 */
4033	if (curproc == pageproc) {
4034		while (needfree)
4035			msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0);
4036	}
4037	mutex_exit(&arc_reclaim_thr_lock);
4038	mutex_exit(&arc_lowmem_lock);
4039}
4040#endif
4041
4042void
4043arc_init(void)
4044{
4045	int i, prefetch_tunable_set = 0;
4046
4047	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
4048	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
4049	mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL);
4050
4051	/* Convert seconds to clock ticks */
4052	arc_min_prefetch_lifespan = 1 * hz;
4053
4054	/* Start out with 1/8 of all memory */
4055	arc_c = kmem_size() / 8;
4056
4057#ifdef sun
4058#ifdef _KERNEL
4059	/*
4060	 * On architectures where the physical memory can be larger
4061	 * than the addressable space (Intel in 32-bit mode), we may
4062	 * need to limit the cache to 1/8 of VM size.
4063	 */
4064	arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
4065#endif
4066#endif	/* sun */
4067	/* set min cache to 1/32 of all memory, or 16MB, whichever is more */
4068	arc_c_min = MAX(arc_c / 4, 64<<18);
4069	/* set max to 5/8 of all memory, or all but 1GB, whichever is more */
4070	if (arc_c * 8 >= 1<<30)
4071		arc_c_max = (arc_c * 8) - (1<<30);
4072	else
4073		arc_c_max = arc_c_min;
4074	arc_c_max = MAX(arc_c * 5, arc_c_max);
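
	/*
	 * Worked example of the sizing above (hypothetical machine): with
	 * kmem_size() = 16GB, arc_c starts at 2GB, so
	 * arc_c_min = MAX(2GB / 4, 16MB) = 512MB, and since arc_c * 8 >= 1GB,
	 * arc_c_max = MAX(2GB * 5, 16GB - 1GB) = 15GB.  The tunables checked
	 * below may still override both values.
	 */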
4075
4076#ifdef _KERNEL
4077	/*
4078	 * Allow the tunables to override our calculations if they are
4079	 * reasonable (i.e. over 16MB).
4080	 */
4081	if (zfs_arc_max > 64<<18 && zfs_arc_max < kmem_size())
4082		arc_c_max = zfs_arc_max;
4083	if (zfs_arc_min > 64<<18 && zfs_arc_min <= arc_c_max)
4084		arc_c_min = zfs_arc_min;
4085#endif
4086
4087	arc_c = arc_c_max;
4088	arc_p = (arc_c >> 1);
4089
4090	/* limit meta-data to 1/4 of the arc capacity */
4091	arc_meta_limit = arc_c_max / 4;
4092
4093	/* Allow the tunable to override if it is reasonable */
4094	if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
4095		arc_meta_limit = zfs_arc_meta_limit;
4096
4097	if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
4098		arc_c_min = arc_meta_limit / 2;
4099
4100	if (zfs_arc_grow_retry > 0)
4101		arc_grow_retry = zfs_arc_grow_retry;
4102
4103	if (zfs_arc_shrink_shift > 0)
4104		arc_shrink_shift = zfs_arc_shrink_shift;
4105
4106	if (zfs_arc_p_min_shift > 0)
4107		arc_p_min_shift = zfs_arc_p_min_shift;
4108
4109	/* if kmem_flags are set, let's try to use less memory */
4110	if (kmem_debugging())
4111		arc_c = arc_c / 2;
4112	if (arc_c < arc_c_min)
4113		arc_c = arc_c_min;
4114
4115	zfs_arc_min = arc_c_min;
4116	zfs_arc_max = arc_c_max;
4117
4118	arc_anon = &ARC_anon;
4119	arc_mru = &ARC_mru;
4120	arc_mru_ghost = &ARC_mru_ghost;
4121	arc_mfu = &ARC_mfu;
4122	arc_mfu_ghost = &ARC_mfu_ghost;
4123	arc_l2c_only = &ARC_l2c_only;
4124	arc_size = 0;
4125
4126	for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
4127		mutex_init(&arc_anon->arcs_locks[i].arcs_lock,
4128		    NULL, MUTEX_DEFAULT, NULL);
4129		mutex_init(&arc_mru->arcs_locks[i].arcs_lock,
4130		    NULL, MUTEX_DEFAULT, NULL);
4131		mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock,
4132		    NULL, MUTEX_DEFAULT, NULL);
4133		mutex_init(&arc_mfu->arcs_locks[i].arcs_lock,
4134		    NULL, MUTEX_DEFAULT, NULL);
4135		mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock,
4136		    NULL, MUTEX_DEFAULT, NULL);
4137		mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock,
4138		    NULL, MUTEX_DEFAULT, NULL);
4139
4140		list_create(&arc_mru->arcs_lists[i],
4141		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4142		list_create(&arc_mru_ghost->arcs_lists[i],
4143		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4144		list_create(&arc_mfu->arcs_lists[i],
4145		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4146		list_create(&arc_mfu_ghost->arcs_lists[i],
4147		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4150		list_create(&arc_l2c_only->arcs_lists[i],
4151		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4152	}
4153
4154	buf_init();
4155
4156	arc_thread_exit = 0;
4157	arc_eviction_list = NULL;
4158	mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
4159	bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
4160
4161	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
4162	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
4163
4164	if (arc_ksp != NULL) {
4165		arc_ksp->ks_data = &arc_stats;
4166		kstat_install(arc_ksp);
4167	}
4168
4169	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
4170	    TS_RUN, minclsyspri);
4171
4172#ifdef _KERNEL
4173	arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
4174	    EVENTHANDLER_PRI_FIRST);
4175#endif
4176
4177	arc_dead = FALSE;
4178	arc_warm = B_FALSE;
4179
4180	/*
4181	 * Calculate maximum amount of dirty data per pool.
4182	 *
4183	 * If it has been set by /etc/system, take that.
4184	 * Otherwise, use a percentage of physical memory defined by
4185	 * zfs_dirty_data_max_percent (default 10%) with a cap at
4186	 * zfs_dirty_data_max_max (default 4GB).
4187	 */
4188	if (zfs_dirty_data_max == 0) {
4189		zfs_dirty_data_max = ptob(physmem) *
4190		    zfs_dirty_data_max_percent / 100;
4191		zfs_dirty_data_max = MIN(zfs_dirty_data_max,
4192		    zfs_dirty_data_max_max);
4193	}
4194
4195#ifdef _KERNEL
4196	if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable))
4197		prefetch_tunable_set = 1;
4198
4199#ifdef __i386__
4200	if (prefetch_tunable_set == 0) {
4201		printf("ZFS NOTICE: Prefetch is disabled by default on i386 "
4202		    "-- to enable,\n");
4203		printf("            add \"vfs.zfs.prefetch_disable=0\" "
4204		    "to /boot/loader.conf.\n");
4205		zfs_prefetch_disable = 1;
4206	}
4207#else
4208	if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) &&
4209	    prefetch_tunable_set == 0) {
4210		printf("ZFS NOTICE: Prefetch is disabled by default if less "
4211		    "than 4GB of RAM is present;\n"
4212		    "            to enable, add \"vfs.zfs.prefetch_disable=0\" "
4213		    "to /boot/loader.conf.\n");
4214		zfs_prefetch_disable = 1;
4215	}
4216#endif
4217	/* Warn about ZFS memory and address space requirements. */
4218	if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
4219		printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
4220		    "expect unstable behavior.\n");
4221	}
4222	if (kmem_size() < 512 * (1 << 20)) {
4223		printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
4224		    "expect unstable behavior.\n");
4225		printf("             Consider tuning vm.kmem_size and "
4226		    "vm.kmem_size_max\n");
4227		printf("             in /boot/loader.conf.\n");
4228	}
4229#endif
4230}
4231
4232void
4233arc_fini(void)
4234{
4235	int i;
4236
4237	mutex_enter(&arc_reclaim_thr_lock);
4238	arc_thread_exit = 1;
4239	cv_signal(&arc_reclaim_thr_cv);
4240	while (arc_thread_exit != 0)
4241		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
4242	mutex_exit(&arc_reclaim_thr_lock);
4243
4244	arc_flush(NULL);
4245
4246	arc_dead = TRUE;
4247
4248	if (arc_ksp != NULL) {
4249		kstat_delete(arc_ksp);
4250		arc_ksp = NULL;
4251	}
4252
4253	mutex_destroy(&arc_eviction_mtx);
4254	mutex_destroy(&arc_reclaim_thr_lock);
4255	cv_destroy(&arc_reclaim_thr_cv);
4256
4257	for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
4258		list_destroy(&arc_mru->arcs_lists[i]);
4259		list_destroy(&arc_mru_ghost->arcs_lists[i]);
4260		list_destroy(&arc_mfu->arcs_lists[i]);
4261		list_destroy(&arc_mfu_ghost->arcs_lists[i]);
4262		list_destroy(&arc_l2c_only->arcs_lists[i]);
4263
4264		mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock);
4265		mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock);
4266		mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock);
4267		mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock);
4268		mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock);
4269		mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock);
4270	}
4271
4272	buf_fini();
4273
4274	ASSERT(arc_loaned_bytes == 0);
4275
4276	mutex_destroy(&arc_lowmem_lock);
4277#ifdef _KERNEL
4278	if (arc_event_lowmem != NULL)
4279		EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
4280#endif
4281}
4282
4283/*
4284 * Level 2 ARC
4285 *
4286 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
4287 * It uses dedicated storage devices to hold cached data, which are populated
4288 * using large infrequent writes.  The main role of this cache is to boost
4289 * the performance of random read workloads.  The intended L2ARC devices
4290 * include short-stroked disks, solid state disks, and other media with
4291 * substantially faster read latency than disk.
4292 *
4293 *                 +-----------------------+
4294 *                 |         ARC           |
4295 *                 +-----------------------+
4296 *                    |         ^     ^
4297 *                    |         |     |
4298 *      l2arc_feed_thread()    arc_read()
4299 *                    |         |     |
4300 *                    |  l2arc read   |
4301 *                    V         |     |
4302 *               +---------------+    |
4303 *               |     L2ARC     |    |
4304 *               +---------------+    |
4305 *                   |    ^           |
4306 *          l2arc_write() |           |
4307 *                   |    |           |
4308 *                   V    |           |
4309 *                 +-------+      +-------+
4310 *                 | vdev  |      | vdev  |
4311 *                 | cache |      | cache |
4312 *                 +-------+      +-------+
4313 *                 +=========+     .-----.
4314 *                 :  L2ARC  :    |-_____-|
4315 *                 : devices :    | Disks |
4316 *                 +=========+    `-_____-'
4317 *
4318 * Read requests are satisfied from the following sources, in order:
4319 *
4320 *	1) ARC
4321 *	2) vdev cache of L2ARC devices
4322 *	3) L2ARC devices
4323 *	4) vdev cache of disks
4324 *	5) disks
4325 *
4326 * Some L2ARC device types exhibit extremely slow write performance.
4327 * To accommodate this, there are some significant differences between
4328 * the L2ARC and traditional cache design:
4329 *
4330 * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
4331 * the ARC behave as usual, freeing buffers and placing headers on ghost
4332 * lists.  The ARC does not send buffers to the L2ARC during eviction as
4333 * this would add inflated write latencies for all ARC memory pressure.
4334 *
4335 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
4336 * It does this by periodically scanning buffers from the eviction-end of
4337 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
4338 * not already there. It scans until a headroom of buffers is satisfied,
4339 * which itself is a buffer for ARC eviction. If a compressible buffer is
4340 * found during scanning and selected for writing to an L2ARC device, we
4341 * temporarily boost scanning headroom during the next scan cycle to make
4342 * sure we adapt to compression effects (which might significantly reduce
4343 * the data volume we write to L2ARC). The thread that does this is
4344 * l2arc_feed_thread(), illustrated below; example sizes are included to
4345 * provide a better sense of ratio than this diagram:
4346 *
4347 *	       head -->                        tail
4348 *	        +---------------------+----------+
4349 *	ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
4350 *	        +---------------------+----------+   |   o L2ARC eligible
4351 *	ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
4352 *	        +---------------------+----------+   |
4353 *	             15.9 Gbytes      ^ 32 Mbytes    |
4354 *	                           headroom          |
4355 *	                                      l2arc_feed_thread()
4356 *	                                             |
4357 *	                 l2arc write hand <--[oooo]--'
4358 *	                         |           8 Mbyte
4359 *	                         |          write max
4360 *	                         V
4361 *		  +==============================+
4362 *	L2ARC dev |####|#|###|###|    |####| ... |
4363 *	          +==============================+
4364 *	                     32 Gbytes
4365 *
4366 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
4367 * evicted, then the L2ARC has cached a buffer much sooner than it probably
4368 * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
4369 * safe to say that this is an uncommon case, since buffers at the end of
4370 * the ARC lists have moved there due to inactivity.
4371 *
4372 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
4373 * then the L2ARC simply misses copying some buffers.  This serves as a
4374 * pressure valve to prevent heavy read workloads from both stalling the ARC
4375 * with waits and clogging the L2ARC with writes.  This also helps prevent
4376 * the potential for the L2ARC to churn if it attempts to cache content too
4377 * quickly, such as during backups of the entire pool.
4378 *
4379 * 5. After system boot and before the ARC has filled main memory, there are
4380 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
4381 * lists can remain mostly static.  Instead of searching from the tail of these
4382 * lists as pictured, the l2arc_feed_thread() will search from the list heads
4383 * for eligible buffers, greatly increasing its chance of finding them.
4384 *
4385 * The L2ARC device write speed is also boosted during this time so that
4386 * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
4387 * there are no L2ARC reads, and no fear of degrading read performance
4388 * through increased writes.
4389 *
4390 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
4391 * the vdev queue can aggregate them into larger and fewer writes.  Each
4392 * device is written to in a rotor fashion, sweeping writes through
4393 * available space then repeating.
4394 *
4395 * 7. The L2ARC does not store dirty content.  It never needs to flush
4396 * write buffers back to disk-based storage.
4397 *
4398 * 8. If an ARC buffer is written (and dirtied) which also exists in the
4399 * L2ARC, the now stale L2ARC buffer is immediately dropped.
4400 *
4401 * The performance of the L2ARC can be tweaked by a number of tunables, which
4402 * may be necessary for different workloads:
4403 *
4404 *	l2arc_write_max		max write bytes per interval
4405 *	l2arc_write_boost	extra write bytes during device warmup
4406 *	l2arc_noprefetch	skip caching prefetched buffers
4407 *	l2arc_headroom		number of max device writes to precache
4408 *	l2arc_headroom_boost	when we find compressed buffers during ARC
4409 *				scanning, we multiply headroom by this
4410 *				percentage factor for the next scan cycle,
4411 *				since more compressed buffers are likely to
4412 *				be present
4413 *	l2arc_feed_secs		seconds between L2ARC writing
4414 *
4415 * Tunables may be removed or added as future performance improvements are
4416 * integrated, and also may become zpool properties.
4417 *
4418 * There are three key functions that control how the L2ARC warms up:
4419 *
4420 *	l2arc_write_eligible()	check if a buffer is eligible to cache
4421 *	l2arc_write_size()	calculate how much to write
4422 *	l2arc_write_interval()	calculate sleep delay between writes
4423 *
4424 * These three functions determine what to write, how much, and how quickly
4425 * to send writes.
4426 */
4427
4428static boolean_t
4429l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
4430{
4431	/*
4432	 * A buffer is *not* eligible for the L2ARC if it:
4433	 * 1. belongs to a different spa.
4434	 * 2. is already cached on the L2ARC.
4435	 * 3. has an I/O in progress (it may be an incomplete read).
4436	 * 4. is flagged not eligible (zfs property).
4437	 */
4438	if (ab->b_spa != spa_guid) {
4439		ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
4440		return (B_FALSE);
4441	}
4442	if (ab->b_l2hdr != NULL) {
4443		ARCSTAT_BUMP(arcstat_l2_write_in_l2);
4444		return (B_FALSE);
4445	}
4446	if (HDR_IO_IN_PROGRESS(ab)) {
4447		ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
4448		return (B_FALSE);
4449	}
4450	if (!HDR_L2CACHE(ab)) {
4451		ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
4452		return (B_FALSE);
4453	}
4454
4455	return (B_TRUE);
4456}
4457
4458static uint64_t
4459l2arc_write_size(void)
4460{
4461	uint64_t size;
4462
4463	/*
4464	 * Make sure our globals have meaningful values in case the user
4465	 * altered them.
4466	 */
4467	size = l2arc_write_max;
4468	if (size == 0) {
4469		cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
4470		    "be greater than zero, resetting it to the default (%d)",
4471		    L2ARC_WRITE_SIZE);
4472		size = l2arc_write_max = L2ARC_WRITE_SIZE;
4473	}
4474
4475	if (arc_warm == B_FALSE)
4476		size += l2arc_write_boost;
4477
4478	return (size);
4479
4480}
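
/*
 * Worked example of the write sizing above (assuming the shipped defaults
 * of l2arc_write_max = 8MB and l2arc_write_boost = 8MB): until the ARC has
 * started evicting (arc_warm == B_FALSE), each feed cycle targets 16MB of
 * L2ARC writes; once the ARC is warm it drops back to 8MB per cycle.
 */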
4481
4482static clock_t
4483l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
4484{
4485	clock_t interval, next, now;
4486
4487	/*
4488	 * If the ARC lists are busy, increase our write rate; if the
4489	 * lists are stale, idle back.  This is achieved by checking
4490	 * how much we previously wrote - if it was more than half of
4491	 * what we wanted, schedule the next write much sooner.
4492	 */
4493	if (l2arc_feed_again && wrote > (wanted / 2))
4494		interval = (hz * l2arc_feed_min_ms) / 1000;
4495	else
4496		interval = hz * l2arc_feed_secs;
4497
4498	now = ddi_get_lbolt();
4499	next = MAX(now, MIN(now + interval, began + interval));
4500
4501	return (next);
4502}
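
/*
 * Worked example of the pacing above (assuming l2arc_feed_secs = 1,
 * l2arc_feed_min_ms = 200 and l2arc_feed_again enabled): a cycle that
 * wanted 8MB and managed to write 6MB (more than half) schedules the next
 * write hz / 5 ticks (200ms) after the previous one began, while a cycle
 * that wrote little idles back to the full one-second interval.
 */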
4503
4504static void
4505l2arc_hdr_stat_add(void)
4506{
4507	ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
4508	ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4509}
4510
4511static void
4512l2arc_hdr_stat_remove(void)
4513{
4514	ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
4515	ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4516}
4517
4518/*
4519 * Cycle through L2ARC devices.  This is how L2ARC load balances.
4520 * If a device is returned, this also returns holding the spa config lock.
4521 */
4522static l2arc_dev_t *
4523l2arc_dev_get_next(void)
4524{
4525	l2arc_dev_t *first, *next = NULL;
4526
4527	/*
4528	 * Lock out the removal of spas (spa_namespace_lock), then removal
4529	 * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
4530	 * both locks will be dropped and a spa config lock held instead.
4531	 */
4532	mutex_enter(&spa_namespace_lock);
4533	mutex_enter(&l2arc_dev_mtx);
4534
4535	/* if there are no vdevs, there is nothing to do */
4536	if (l2arc_ndev == 0)
4537		goto out;
4538
4539	first = NULL;
4540	next = l2arc_dev_last;
4541	do {
4542		/* loop around the list looking for a non-faulted vdev */
4543		if (next == NULL) {
4544			next = list_head(l2arc_dev_list);
4545		} else {
4546			next = list_next(l2arc_dev_list, next);
4547			if (next == NULL)
4548				next = list_head(l2arc_dev_list);
4549		}
4550
4551		/* if we have come back to the start, bail out */
4552		if (first == NULL)
4553			first = next;
4554		else if (next == first)
4555			break;
4556
4557	} while (vdev_is_dead(next->l2ad_vdev));
4558
4559	/* if we were unable to find any usable vdevs, return NULL */
4560	if (vdev_is_dead(next->l2ad_vdev))
4561		next = NULL;
4562
4563	l2arc_dev_last = next;
4564
4565out:
4566	mutex_exit(&l2arc_dev_mtx);
4567
4568	/*
4569	 * Grab the config lock to prevent the 'next' device from being
4570	 * removed while we are writing to it.
4571	 */
4572	if (next != NULL)
4573		spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4574	mutex_exit(&spa_namespace_lock);
4575
4576	return (next);
4577}
4578
4579/*
4580 * Free buffers that were tagged for destruction.
4581 */
4582static void
4583l2arc_do_free_on_write(void)
4584{
4585	list_t *buflist;
4586	l2arc_data_free_t *df, *df_prev;
4587
4588	mutex_enter(&l2arc_free_on_write_mtx);
4589	buflist = l2arc_free_on_write;
4590
4591	for (df = list_tail(buflist); df; df = df_prev) {
4592		df_prev = list_prev(buflist, df);
4593		ASSERT(df->l2df_data != NULL);
4594		ASSERT(df->l2df_func != NULL);
4595		df->l2df_func(df->l2df_data, df->l2df_size);
4596		list_remove(buflist, df);
4597		kmem_free(df, sizeof (l2arc_data_free_t));
4598	}
4599
4600	mutex_exit(&l2arc_free_on_write_mtx);
4601}
4602
4603/*
4604 * A write to a cache device has completed.  Update all headers to allow
4605 * reads from these buffers to begin.
4606 */
4607static void
4608l2arc_write_done(zio_t *zio)
4609{
4610	l2arc_write_callback_t *cb;
4611	l2arc_dev_t *dev;
4612	list_t *buflist;
4613	arc_buf_hdr_t *head, *ab, *ab_prev;
4614	l2arc_buf_hdr_t *abl2;
4615	kmutex_t *hash_lock;
4616	int64_t bytes_dropped = 0;
4617
4618	cb = zio->io_private;
4619	ASSERT(cb != NULL);
4620	dev = cb->l2wcb_dev;
4621	ASSERT(dev != NULL);
4622	head = cb->l2wcb_head;
4623	ASSERT(head != NULL);
4624	buflist = dev->l2ad_buflist;
4625	ASSERT(buflist != NULL);
4626	DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4627	    l2arc_write_callback_t *, cb);
4628
4629	if (zio->io_error != 0)
4630		ARCSTAT_BUMP(arcstat_l2_writes_error);
4631
4632	mutex_enter(&l2arc_buflist_mtx);
4633
4634	/*
4635	 * All writes completed, or an error was hit.
4636	 */
4637	for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4638		ab_prev = list_prev(buflist, ab);
4639		abl2 = ab->b_l2hdr;
4640
4641		/*
4642		 * Release the temporary compressed buffer as soon as possible.
4643		 */
4644		if (abl2->b_compress != ZIO_COMPRESS_OFF)
4645			l2arc_release_cdata_buf(ab);
4646
4647		hash_lock = HDR_LOCK(ab);
4648		if (!mutex_tryenter(hash_lock)) {
4649			/*
4650			 * This buffer misses out.  It may be in a stage
4651			 * of eviction.  Its ARC_L2_WRITING flag will be
4652			 * left set, denying reads to this buffer.
4653			 */
4654			ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4655			continue;
4656		}
4657
4658		if (zio->io_error != 0) {
4659			/*
4660			 * Error - drop L2ARC entry.
4661			 */
4662			list_remove(buflist, ab);
4663			ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4664			bytes_dropped += abl2->b_asize;
4665			ab->b_l2hdr = NULL;
4666			trim_map_free(abl2->b_dev->l2ad_vdev, abl2->b_daddr,
4667			    ab->b_size, 0);
4668			kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4669			ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4670		}
4671
4672		/*
4673		 * Allow ARC to begin reads to this L2ARC entry.
4674		 */
4675		ab->b_flags &= ~ARC_L2_WRITING;
4676
4677		mutex_exit(hash_lock);
4678	}
4679
4680	atomic_inc_64(&l2arc_writes_done);
4681	list_remove(buflist, head);
4682	kmem_cache_free(hdr_cache, head);
4683	mutex_exit(&l2arc_buflist_mtx);
4684
4685	vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
4686
4687	l2arc_do_free_on_write();
4688
4689	kmem_free(cb, sizeof (l2arc_write_callback_t));
4690}
4691
4692/*
4693 * A read to a cache device completed.  Validate buffer contents before
4694 * handing over to the regular ARC routines.
4695 */
4696static void
4697l2arc_read_done(zio_t *zio)
4698{
4699	l2arc_read_callback_t *cb;
4700	arc_buf_hdr_t *hdr;
4701	arc_buf_t *buf;
4702	kmutex_t *hash_lock;
4703	int equal;
4704
4705	ASSERT(zio->io_vd != NULL);
4706	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4707
4708	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4709
4710	cb = zio->io_private;
4711	ASSERT(cb != NULL);
4712	buf = cb->l2rcb_buf;
4713	ASSERT(buf != NULL);
4714
4715	hash_lock = HDR_LOCK(buf->b_hdr);
4716	mutex_enter(hash_lock);
4717	hdr = buf->b_hdr;
4718	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4719
4720	/*
4721	 * If the buffer was compressed, decompress it first.
4722	 */
4723	if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
4724		l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
4725	ASSERT(zio->io_data != NULL);
4726
4727	/*
4728	 * Check whether this buffer survived the L2ARC journey.
4729	 */
4730	equal = arc_cksum_equal(buf);
4731	if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4732		mutex_exit(hash_lock);
4733		zio->io_private = buf;
4734		zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
4735		zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
4736		arc_read_done(zio);
4737	} else {
4738		mutex_exit(hash_lock);
4739		/*
4740		 * Buffer didn't survive caching.  Increment stats and
4741		 * reissue to the original storage device.
4742		 */
4743		if (zio->io_error != 0) {
4744			ARCSTAT_BUMP(arcstat_l2_io_error);
4745		} else {
4746			zio->io_error = SET_ERROR(EIO);
4747		}
4748		if (!equal)
4749			ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4750
4751		/*
4752		 * If there's no waiter, issue an async i/o to the primary
4753		 * storage now.  If there *is* a waiter, the caller must
4754		 * issue the i/o in a context where it's OK to block.
4755		 */
4756		if (zio->io_waiter == NULL) {
4757			zio_t *pio = zio_unique_parent(zio);
4758
4759			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4760
4761			zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
4762			    buf->b_data, zio->io_size, arc_read_done, buf,
4763			    zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4764		}
4765	}
4766
4767	kmem_free(cb, sizeof (l2arc_read_callback_t));
4768}
4769
4770/*
4771 * This is the list priority from which the L2ARC will search for pages to
4772 * cache.  This is used within loops (0 .. 2 * ARC_BUFC_NUMLISTS - 1) to
4773 * cycle through lists in the desired order.  This order can have a
4774 * significant effect on cache performance.
4775 *
4776 * Currently the metadata lists are hit first, MFU then MRU, followed by
4777 * the data lists.  This function returns a locked list, and also returns
4778 * the lock pointer.
4779 */
4780static list_t *
4781l2arc_list_locked(int list_num, kmutex_t **lock)
4782{
4783	list_t *list = NULL;
4784	int idx;
4785
4786	ASSERT(list_num >= 0 && list_num < 2 * ARC_BUFC_NUMLISTS);
4787
4788	if (list_num < ARC_BUFC_NUMMETADATALISTS) {
4789		idx = list_num;
4790		list = &arc_mfu->arcs_lists[idx];
4791		*lock = ARCS_LOCK(arc_mfu, idx);
4792	} else if (list_num < ARC_BUFC_NUMMETADATALISTS * 2) {
4793		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
4794		list = &arc_mru->arcs_lists[idx];
4795		*lock = ARCS_LOCK(arc_mru, idx);
4796	} else if (list_num < (ARC_BUFC_NUMMETADATALISTS * 2 +
4797		ARC_BUFC_NUMDATALISTS)) {
4798		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
4799		list = &arc_mfu->arcs_lists[idx];
4800		*lock = ARCS_LOCK(arc_mfu, idx);
4801	} else {
4802		idx = list_num - ARC_BUFC_NUMLISTS;
4803		list = &arc_mru->arcs_lists[idx];
4804		*lock = ARCS_LOCK(arc_mru, idx);
4805	}
4806
4807	ASSERT(!(MUTEX_HELD(*lock)));
4808	mutex_enter(*lock);
4809	return (list);
4810}
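
/*
 * Illustrative mapping for the selector above, writing N for
 * ARC_BUFC_NUMMETADATALISTS and D for ARC_BUFC_NUMDATALISTS:
 *
 *	list_num		list returned (arcs_lists index)
 *	[0, N)			arc_mfu metadata, index list_num
 *	[N, 2N)			arc_mru metadata, index list_num - N
 *	[2N, 2N + D)		arc_mfu data, index list_num - N
 *	[2N + D, 2N + 2D)	arc_mru data, index list_num - (N + D)
 *
 * i.e. MFU metadata first, then MRU metadata, then MFU data, then MRU data,
 * matching the priority described in the comment above.
 */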
4811
4812/*
4813 * Evict buffers from the device write hand to the distance specified in
4814 * bytes.  This distance may span populated buffers, it may span nothing.
4815 * This is clearing a region on the L2ARC device ready for writing.
4816 * If the 'all' boolean is set, every buffer is evicted.
4817 */
4818static void
4819l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4820{
4821	list_t *buflist;
4822	l2arc_buf_hdr_t *abl2;
4823	arc_buf_hdr_t *ab, *ab_prev;
4824	kmutex_t *hash_lock;
4825	uint64_t taddr;
4826	int64_t bytes_evicted = 0;
4827
4828	buflist = dev->l2ad_buflist;
4829
4830	if (buflist == NULL)
4831		return;
4832
4833	if (!all && dev->l2ad_first) {
4834		/*
4835		 * This is the first sweep through the device.  There is
4836		 * nothing to evict.
4837		 */
4838		return;
4839	}
4840
4841	if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4842		/*
4843		 * When nearing the end of the device, evict to the end
4844		 * before the device write hand jumps to the start.
4845		 */
4846		taddr = dev->l2ad_end;
4847	} else {
4848		taddr = dev->l2ad_hand + distance;
4849	}
4850	DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4851	    uint64_t, taddr, boolean_t, all);
4852
4853top:
4854	mutex_enter(&l2arc_buflist_mtx);
4855	for (ab = list_tail(buflist); ab; ab = ab_prev) {
4856		ab_prev = list_prev(buflist, ab);
4857
4858		hash_lock = HDR_LOCK(ab);
4859		if (!mutex_tryenter(hash_lock)) {
4860			/*
4861			 * Missed the hash lock.  Retry.
4862			 */
4863			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4864			mutex_exit(&l2arc_buflist_mtx);
4865			mutex_enter(hash_lock);
4866			mutex_exit(hash_lock);
4867			goto top;
4868		}
4869
4870		if (HDR_L2_WRITE_HEAD(ab)) {
4871			/*
4872			 * We hit a write head node.  Leave it for
4873			 * l2arc_write_done().
4874			 */
4875			list_remove(buflist, ab);
4876			mutex_exit(hash_lock);
4877			continue;
4878		}
4879
4880		if (!all && ab->b_l2hdr != NULL &&
4881		    (ab->b_l2hdr->b_daddr > taddr ||
4882		    ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4883			/*
4884			 * We've evicted to the target address,
4885			 * or the end of the device.
4886			 */
4887			mutex_exit(hash_lock);
4888			break;
4889		}
4890
4891		if (HDR_FREE_IN_PROGRESS(ab)) {
4892			/*
4893			 * Already on the path to destruction.
4894			 */
4895			mutex_exit(hash_lock);
4896			continue;
4897		}
4898
4899		if (ab->b_state == arc_l2c_only) {
4900			ASSERT(!HDR_L2_READING(ab));
4901			/*
4902			 * This doesn't exist in the ARC.  Destroy.
4903			 * arc_hdr_destroy() will call list_remove()
4904			 * and decrement arcstat_l2_size.
4905			 */
4906			arc_change_state(arc_anon, ab, hash_lock);
4907			arc_hdr_destroy(ab);
4908		} else {
4909			/*
4910			 * Invalidate issued or about to be issued
4911			 * reads, since we may be about to write
4912			 * over this location.
4913			 */
4914			if (HDR_L2_READING(ab)) {
4915				ARCSTAT_BUMP(arcstat_l2_evict_reading);
4916				ab->b_flags |= ARC_L2_EVICTED;
4917			}
4918
4919			/*
4920			 * Tell ARC this no longer exists in L2ARC.
4921			 */
4922			if (ab->b_l2hdr != NULL) {
4923				abl2 = ab->b_l2hdr;
4924				ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4925				bytes_evicted += abl2->b_asize;
4926				ab->b_l2hdr = NULL;
4927				/*
4928				 * We are destroying l2hdr, so ensure that
4929				 * its compressed buffer, if any, is not leaked.
4930				 */
4931				ASSERT(abl2->b_tmp_cdata == NULL);
4932				kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4933				ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4934			}
4935			list_remove(buflist, ab);
4936
4937			/*
4938			 * This may have been leftover after a
4939			 * failed write.
4940			 */
4941			ab->b_flags &= ~ARC_L2_WRITING;
4942		}
4943		mutex_exit(hash_lock);
4944	}
4945	mutex_exit(&l2arc_buflist_mtx);
4946
4947	vdev_space_update(dev->l2ad_vdev, -bytes_evicted, 0, 0);
4948	dev->l2ad_evict = taddr;
4949}
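
/*
 * Worked example of the eviction target above (hypothetical geometry): on a
 * device with l2ad_end = 32GB, a write hand at 31.99GB and an 8MB distance
 * fall within 2 * distance of the end, so taddr is clamped to l2ad_end and
 * eviction runs all the way to the end of the device; the write hand will
 * then wrap back to l2ad_start in l2arc_write_buffers().  In the common
 * case taddr is simply l2ad_hand + distance.
 */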
4950
4951/*
4952 * Find and write ARC buffers to the L2ARC device.
4953 *
4954 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4955 * for reading until they have completed writing.
4956 * The headroom_boost is an in-out parameter used to maintain headroom boost
4957 * state between calls to this function.
4958 *
4959 * Returns the number of bytes actually written (which may be smaller than
4960 * the delta by which the device hand has changed due to alignment).
4961 */
4962static uint64_t
4963l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
4964    boolean_t *headroom_boost)
4965{
4966	arc_buf_hdr_t *ab, *ab_prev, *head;
4967	list_t *list;
4968	uint64_t write_asize, write_psize, write_sz, headroom,
4969	    buf_compress_minsz;
4970	void *buf_data;
4971	kmutex_t *list_lock;
4972	boolean_t full;
4973	l2arc_write_callback_t *cb;
4974	zio_t *pio, *wzio;
4975	uint64_t guid = spa_load_guid(spa);
4976	const boolean_t do_headroom_boost = *headroom_boost;
4977	int try;
4978
4979	ASSERT(dev->l2ad_vdev != NULL);
4980
4981	/* Lower the flag now, we might want to raise it again later. */
4982	*headroom_boost = B_FALSE;
4983
4984	pio = NULL;
4985	write_sz = write_asize = write_psize = 0;
4986	full = B_FALSE;
4987	head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4988	head->b_flags |= ARC_L2_WRITE_HEAD;
4989
4990	ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
4991	/*
4992	 * We will want to try to compress buffers that are at least 2x the
4993	 * device sector size.
4994	 */
4995	buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
4996
4997	/*
4998	 * Copy buffers for L2ARC writing.
4999	 */
5000	mutex_enter(&l2arc_buflist_mtx);
5001	for (try = 0; try < 2 * ARC_BUFC_NUMLISTS; try++) {
5002		uint64_t passed_sz = 0;
5003
5004		list = l2arc_list_locked(try, &list_lock);
5005		ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);
5006
5007		/*
5008		 * L2ARC fast warmup.
5009		 *
5010		 * Until the ARC is warm and starts to evict, read from the
5011		 * head of the ARC lists rather than the tail.
5012		 */
5013		if (arc_warm == B_FALSE)
5014			ab = list_head(list);
5015		else
5016			ab = list_tail(list);
5017		if (ab == NULL)
5018			ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);
5019
5020		headroom = target_sz * l2arc_headroom;
5021		if (do_headroom_boost)
5022			headroom = (headroom * l2arc_headroom_boost) / 100;
5023
5024		for (; ab; ab = ab_prev) {
5025			l2arc_buf_hdr_t *l2hdr;
5026			kmutex_t *hash_lock;
5027			uint64_t buf_sz;
5028
5029			if (arc_warm == B_FALSE)
5030				ab_prev = list_next(list, ab);
5031			else
5032				ab_prev = list_prev(list, ab);
5033			ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, ab->b_size);
5034
5035			hash_lock = HDR_LOCK(ab);
5036			if (!mutex_tryenter(hash_lock)) {
5037				ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
5038				/*
5039				 * Skip this buffer rather than waiting.
5040				 */
5041				continue;
5042			}
5043
5044			passed_sz += ab->b_size;
5045			if (passed_sz > headroom) {
5046				/*
5047				 * Searched too far.
5048				 */
5049				mutex_exit(hash_lock);
5050				ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
5051				break;
5052			}
5053
5054			if (!l2arc_write_eligible(guid, ab)) {
5055				mutex_exit(hash_lock);
5056				continue;
5057			}
5058
5059			if ((write_sz + ab->b_size) > target_sz) {
5060				full = B_TRUE;
5061				mutex_exit(hash_lock);
5062				ARCSTAT_BUMP(arcstat_l2_write_full);
5063				break;
5064			}
5065
5066			if (pio == NULL) {
5067				/*
5068				 * Insert a dummy header on the buflist so
5069				 * l2arc_write_done() can find where the
5070				 * write buffers begin without searching.
5071				 */
5072				list_insert_head(dev->l2ad_buflist, head);
5073
5074				cb = kmem_alloc(
5075				    sizeof (l2arc_write_callback_t), KM_SLEEP);
5076				cb->l2wcb_dev = dev;
5077				cb->l2wcb_head = head;
5078				pio = zio_root(spa, l2arc_write_done, cb,
5079				    ZIO_FLAG_CANFAIL);
5080				ARCSTAT_BUMP(arcstat_l2_write_pios);
5081			}
5082
5083			/*
5084			 * Create and add a new L2ARC header.
5085			 */
5086			l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
5087			l2hdr->b_dev = dev;
5088			ab->b_flags |= ARC_L2_WRITING;
5089
5090			/*
5091			 * Temporarily stash the data buffer in b_tmp_cdata.
5092			 * The subsequent write step will pick it up from
5093			 * there. This is because we can't access ab->b_buf
5094			 * without holding the hash_lock, which we in turn
5095			 * can't access without holding the ARC list locks
5096			 * (which we want to avoid during compression/writing).
5097			 */
5098			l2hdr->b_compress = ZIO_COMPRESS_OFF;
5099			l2hdr->b_asize = ab->b_size;
5100			l2hdr->b_tmp_cdata = ab->b_buf->b_data;
5101
5102			buf_sz = ab->b_size;
5103			ab->b_l2hdr = l2hdr;
5104
5105			list_insert_head(dev->l2ad_buflist, ab);
5106
5107			/*
5108			 * Compute and store the buffer cksum before
5109			 * writing.  On debug builds the cksum is verified first.
5110			 */
5111			arc_cksum_verify(ab->b_buf);
5112			arc_cksum_compute(ab->b_buf, B_TRUE);
5113
5114			mutex_exit(hash_lock);
5115
5116			write_sz += buf_sz;
5117		}
5118
5119		mutex_exit(list_lock);
5120
5121		if (full == B_TRUE)
5122			break;
5123	}
5124
5125	/* No buffers selected for writing? */
5126	if (pio == NULL) {
5127		ASSERT0(write_sz);
5128		mutex_exit(&l2arc_buflist_mtx);
5129		kmem_cache_free(hdr_cache, head);
5130		return (0);
5131	}
5132
5133	/*
5134	 * Now start writing the buffers. We're starting at the write head
5135	 * and work backwards, retracing the course of the buffer selector
5136	 * loop above.
5137	 */
5138	for (ab = list_prev(dev->l2ad_buflist, head); ab;
5139	    ab = list_prev(dev->l2ad_buflist, ab)) {
5140		l2arc_buf_hdr_t *l2hdr;
5141		uint64_t buf_sz;
5142
5143		/*
5144		 * We shouldn't need to lock the buffer here, since we flagged
5145		 * it as ARC_L2_WRITING in the previous step, but we must take
5146		 * care to only access its L2 cache parameters. In particular,
5147		 * ab->b_buf may be invalid by now due to ARC eviction.
5148		 */
5149		l2hdr = ab->b_l2hdr;
5150		l2hdr->b_daddr = dev->l2ad_hand;
5151
5152		if ((ab->b_flags & ARC_L2COMPRESS) &&
5153		    l2hdr->b_asize >= buf_compress_minsz) {
5154			if (l2arc_compress_buf(l2hdr)) {
5155				/*
5156				 * If compression succeeded, enable headroom
5157				 * boost on the next scan cycle.
5158				 */
5159				*headroom_boost = B_TRUE;
5160			}
5161		}
5162
5163		/*
5164		 * Pick up the buffer data we had previously stashed away
5165		 * (and now potentially also compressed).
5166		 */
5167		buf_data = l2hdr->b_tmp_cdata;
5168		buf_sz = l2hdr->b_asize;
5169
5170		/*
5171		 * If the data was not compressed, b_tmp_cdata still points at
5172		 * the ARC buffer itself; clear it so that b_tmp_cdata only
5173		 * ever refers to a temporary compression buffer.
5174		 */
5175		if (!L2ARC_IS_VALID_COMPRESS(l2hdr->b_compress))
5176			l2hdr->b_tmp_cdata = NULL;
5177
5178		/* Compression may have squashed the buffer to zero length. */
5179		if (buf_sz != 0) {
5180			uint64_t buf_p_sz;
5181
5182			wzio = zio_write_phys(pio, dev->l2ad_vdev,
5183			    dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
5184			    NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
5185			    ZIO_FLAG_CANFAIL, B_FALSE);
5186
5187			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
5188			    zio_t *, wzio);
5189			(void) zio_nowait(wzio);
5190
5191			write_asize += buf_sz;
5192			/*
5193			 * Keep the clock hand suitably device-aligned.
5194			 */
5195			buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
5196			write_psize += buf_p_sz;
5197			dev->l2ad_hand += buf_p_sz;
5198		}
5199	}
5200
5201	mutex_exit(&l2arc_buflist_mtx);
5202
5203	ASSERT3U(write_asize, <=, target_sz);
5204	ARCSTAT_BUMP(arcstat_l2_writes_sent);
5205	ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
5206	ARCSTAT_INCR(arcstat_l2_size, write_sz);
5207	ARCSTAT_INCR(arcstat_l2_asize, write_asize);
5208	vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
5209
5210	/*
5211	 * Bump device hand to the device start if it is approaching the end.
5212	 * l2arc_evict() will already have evicted ahead for this case.
5213	 */
5214	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
5215		dev->l2ad_hand = dev->l2ad_start;
5216		dev->l2ad_evict = dev->l2ad_start;
5217		dev->l2ad_first = B_FALSE;
5218	}
5219
5220	dev->l2ad_writing = B_TRUE;
5221	(void) zio_wait(pio);
5222	dev->l2ad_writing = B_FALSE;
5223
5224	return (write_asize);
5225}
5226
5227/*
5228 * Compresses an L2ARC buffer.
5229 * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
5230 * size in l2hdr->b_asize. This routine tries to compress the data and
5231 * depending on the compression result there are three possible outcomes:
5232 * *) The buffer was incompressible. The original l2hdr contents were left
5233 *    untouched and are ready for writing to an L2 device.
5234 * *) The buffer was all-zeros, so there is no need to write it to an L2
5235 *    device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
5236 *    set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
5237 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
5238 *    data buffer which holds the compressed data to be written, and b_asize
5239 *    tells us how much data there is. b_compress is set to the appropriate
5240 *    compression algorithm. Once writing is done, invoke
5241 *    l2arc_release_cdata_buf() on this l2hdr to free this temporary buffer.
5242 *
5243 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
5244 * buffer was incompressible).
5245 */
5246static boolean_t
5247l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
5248{
5249	void *cdata;
5250	size_t csize, len, rounded;
5251
5252	ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
5253	ASSERT(l2hdr->b_tmp_cdata != NULL);
5254
5255	len = l2hdr->b_asize;
5256	cdata = zio_data_buf_alloc(len);
5257	csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
5258	    cdata, l2hdr->b_asize);
5259
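	/*
	 * Round the compressed size up to a multiple of the minimum block
	 * size and zero-fill the padding so that we never write
	 * uninitialized bytes to the cache device.
	 */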
5260	rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE);
5261	if (rounded > csize) {
5262		bzero((char *)cdata + csize, rounded - csize);
5263		csize = rounded;
5264	}
5265
5266	if (csize == 0) {
5267		/* zero block, indicate that there's nothing to write */
5268		zio_data_buf_free(cdata, len);
5269		l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
5270		l2hdr->b_asize = 0;
5271		l2hdr->b_tmp_cdata = NULL;
5272		ARCSTAT_BUMP(arcstat_l2_compress_zeros);
5273		return (B_TRUE);
5274	} else if (csize > 0 && csize < len) {
5275		/*
5276		 * Compression succeeded, we'll keep the cdata around for
5277		 * writing and release it afterwards.
5278		 */
5279		l2hdr->b_compress = ZIO_COMPRESS_LZ4;
5280		l2hdr->b_asize = csize;
5281		l2hdr->b_tmp_cdata = cdata;
5282		ARCSTAT_BUMP(arcstat_l2_compress_successes);
5283		return (B_TRUE);
5284	} else {
5285		/*
5286		 * Compression failed, release the compressed buffer.
5287		 * l2hdr will be left unmodified.
5288		 */
5289		zio_data_buf_free(cdata, len);
5290		ARCSTAT_BUMP(arcstat_l2_compress_failures);
5291		return (B_FALSE);
5292	}
5293}
5294
5295/*
5296 * Decompresses a zio read back from an l2arc device. On success, the
5297 * underlying zio's io_data buffer is overwritten by the uncompressed
5298 * version. On decompression error (corrupt compressed stream), the
5299 * zio->io_error value is set to signal an I/O error.
5300 *
5301 * Please note that the compressed data stream is not checksummed, so
5302 * if the underlying device is experiencing data corruption, we may
5303 * feed corrupt data to the decompressor; the decompressor therefore
5304 * needs to be able to handle this situation (LZ4 does).
5305 */
5306static void
5307l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
5308{
5309	ASSERT(L2ARC_IS_VALID_COMPRESS(c));
5310
5311	if (zio->io_error != 0) {
5312		/*
5313		 * An I/O error has occurred; just restore the original I/O
5314		 * size in preparation for a main pool read.
5315		 */
5316		zio->io_orig_size = zio->io_size = hdr->b_size;
5317		return;
5318	}
5319
5320	if (c == ZIO_COMPRESS_EMPTY) {
5321		/*
5322		 * An empty buffer results in a null zio, which means we
5323		 * need to fill its io_data after we're done restoring the
5324		 * buffer's contents.
5325		 */
5326		ASSERT(hdr->b_buf != NULL);
5327		bzero(hdr->b_buf->b_data, hdr->b_size);
5328		zio->io_data = zio->io_orig_data = hdr->b_buf->b_data;
5329	} else {
5330		ASSERT(zio->io_data != NULL);
5331		/*
5332		 * We copy the compressed data from the start of the arc buffer
5333		 * (the zio_read will have pulled in only what we need, the
5334		 * rest is garbage which we will overwrite at decompression)
5335		 * and then decompress back to the ARC data buffer. This way we
5336		 * can minimize copying by simply decompressing back over the
5337		 * original compressed data (rather than decompressing to an
5338		 * aux buffer and then copying back the uncompressed buffer,
5339		 * which is likely to be much larger).
5340		 */
5341		uint64_t csize;
5342		void *cdata;
5343
5344		csize = zio->io_size;
5345		cdata = zio_data_buf_alloc(csize);
5346		bcopy(zio->io_data, cdata, csize);
5347		if (zio_decompress_data(c, cdata, zio->io_data, csize,
5348		    hdr->b_size) != 0)
5349			zio->io_error = EIO;
5350		zio_data_buf_free(cdata, csize);
5351	}
5352
5353	/* Restore the expected uncompressed IO size. */
5354	zio->io_orig_size = zio->io_size = hdr->b_size;
5355}
5356
5357/*
5358 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
5359 * This buffer serves as a temporary holder of compressed data while
5360 * the buffer entry is being written to an l2arc device. Once that is
5361 * done, we can dispose of it.
5362 */
5363static void
5364l2arc_release_cdata_buf(arc_buf_hdr_t *ab)
5365{
5366	l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
5367
5368	ASSERT(L2ARC_IS_VALID_COMPRESS(l2hdr->b_compress));
5369	if (l2hdr->b_compress != ZIO_COMPRESS_EMPTY) {
5370		/*
5371		 * If the data was compressed, then we've allocated a
5372		 * temporary buffer for it, so now we need to release it.
5373		 */
5374		ASSERT(l2hdr->b_tmp_cdata != NULL);
5375		zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size);
5376		l2hdr->b_tmp_cdata = NULL;
5377	} else {
5378		ASSERT(l2hdr->b_tmp_cdata == NULL);
5379	}
5380}
5381
5382/*
5383 * This thread feeds the L2ARC at regular intervals.  This is the beating
5384 * heart of the L2ARC.
5385 */
5386static void
5387l2arc_feed_thread(void *dummy __unused)
5388{
5389	callb_cpr_t cpr;
5390	l2arc_dev_t *dev;
5391	spa_t *spa;
5392	uint64_t size, wrote;
5393	clock_t begin, next = ddi_get_lbolt();
5394	boolean_t headroom_boost = B_FALSE;
5395
5396	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
5397
5398	mutex_enter(&l2arc_feed_thr_lock);
5399
5400	while (l2arc_thread_exit == 0) {
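		/*
		 * Sleep until the next scheduled feed time, or until we are
		 * signalled to exit.  The CPR callbacks keep this wait safe
		 * across a system suspend/resume cycle.
		 */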
5401		CALLB_CPR_SAFE_BEGIN(&cpr);
5402		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
5403		    next - ddi_get_lbolt());
5404		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
5405		next = ddi_get_lbolt() + hz;
5406
5407		/*
5408		 * Quick check for L2ARC devices.
5409		 */
5410		mutex_enter(&l2arc_dev_mtx);
5411		if (l2arc_ndev == 0) {
5412			mutex_exit(&l2arc_dev_mtx);
5413			continue;
5414		}
5415		mutex_exit(&l2arc_dev_mtx);
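		/*
		 * Note when this feed cycle started so that
		 * l2arc_write_interval() can size the next sleep.
		 */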
5416		begin = ddi_get_lbolt();
5417
5418		/*
5419		 * This selects the next l2arc device to write to, and in
5420		 * doing so the next spa to feed from: dev->l2ad_spa.  This
5421		 * will return NULL if there are now no l2arc devices or if
5422		 * they are all faulted.
5423		 *
5424		 * If a device is returned, its spa's config lock is also
5425		 * held to prevent device removal.  l2arc_dev_get_next()
5426		 * will grab and release l2arc_dev_mtx.
5427		 */
5428		if ((dev = l2arc_dev_get_next()) == NULL)
5429			continue;
5430
5431		spa = dev->l2ad_spa;
5432		ASSERT(spa != NULL);
5433
5434		/*
5435		 * If the pool is read-only then force the feed thread to
5436		 * sleep a little longer.
5437		 */
5438		if (!spa_writeable(spa)) {
5439			next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
5440			spa_config_exit(spa, SCL_L2ARC, dev);
5441			continue;
5442		}
5443
5444		/*
5445		 * Avoid contributing to memory pressure.
5446		 */
5447		if (arc_reclaim_needed()) {
5448			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
5449			spa_config_exit(spa, SCL_L2ARC, dev);
5450			continue;
5451		}
5452
5453		ARCSTAT_BUMP(arcstat_l2_feeds);
5454
5455		size = l2arc_write_size();
5456
5457		/*
5458		 * Evict L2ARC buffers that will be overwritten.
5459		 */
5460		l2arc_evict(dev, size, B_FALSE);
5461
5462		/*
5463		 * Write ARC buffers.
5464		 */
5465		wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
5466
5467		/*
5468		 * Calculate interval between writes.
5469		 */
5470		next = l2arc_write_interval(begin, size, wrote);
5471		spa_config_exit(spa, SCL_L2ARC, dev);
5472	}
5473
5474	l2arc_thread_exit = 0;
5475	cv_broadcast(&l2arc_feed_thr_cv);
5476	CALLB_CPR_EXIT(&cpr);		/* drops l2arc_feed_thr_lock */
5477	thread_exit();
5478}
5479
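/*
 * Returns B_TRUE if the given vdev is currently registered as an L2ARC
 * device, B_FALSE otherwise.
 */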
5480boolean_t
5481l2arc_vdev_present(vdev_t *vd)
5482{
5483	l2arc_dev_t *dev;
5484
5485	mutex_enter(&l2arc_dev_mtx);
5486	for (dev = list_head(l2arc_dev_list); dev != NULL;
5487	    dev = list_next(l2arc_dev_list, dev)) {
5488		if (dev->l2ad_vdev == vd)
5489			break;
5490	}
5491	mutex_exit(&l2arc_dev_mtx);
5492
5493	return (dev != NULL);
5494}
5495
5496/*
5497 * Add a vdev for use by the L2ARC.  By this point the spa has already
5498 * validated the vdev and opened it.
5499 */
5500void
5501l2arc_add_vdev(spa_t *spa, vdev_t *vd)
5502{
5503	l2arc_dev_t *adddev;
5504
5505	ASSERT(!l2arc_vdev_present(vd));
5506
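	/*
	 * Give the vdev layer a chance to settle on an optimal ashift for
	 * the cache device before we start allocating from it.
	 */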
5507	vdev_ashift_optimize(vd);
5508
5509	/*
5510	 * Create a new l2arc device entry.
5511	 */
5512	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
5513	adddev->l2ad_spa = spa;
5514	adddev->l2ad_vdev = vd;
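	/*
	 * The writable region begins past the front vdev labels and spans
	 * the device's minimum asize; the write hand and the evict pointer
	 * both start at the beginning of that region.
	 */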
5515	adddev->l2ad_start = VDEV_LABEL_START_SIZE;
5516	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
5517	adddev->l2ad_hand = adddev->l2ad_start;
5518	adddev->l2ad_evict = adddev->l2ad_start;
5519	adddev->l2ad_first = B_TRUE;
5520	adddev->l2ad_writing = B_FALSE;
5521
5522	/*
5523	 * This is a list of all ARC buffers that are still valid on the
5524	 * device.
5525	 */
5526	adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
5527	list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
5528	    offsetof(arc_buf_hdr_t, b_l2node));
5529
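	/*
	 * Register the capacity we intend to use with the vdev's space
	 * accounting.
	 */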
5530	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
5531
5532	/*
5533	 * Add device to global list
5534	 */
5535	mutex_enter(&l2arc_dev_mtx);
5536	list_insert_head(l2arc_dev_list, adddev);
5537	atomic_inc_64(&l2arc_ndev);
5538	mutex_exit(&l2arc_dev_mtx);
5539}
5540
5541/*
5542 * Remove a vdev from the L2ARC.
5543 */
5544void
5545l2arc_remove_vdev(vdev_t *vd)
5546{
5547	l2arc_dev_t *dev, *nextdev, *remdev = NULL;
5548
5549	/*
5550	 * Find the device by vdev
5551	 */
5552	mutex_enter(&l2arc_dev_mtx);
5553	for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
5554		nextdev = list_next(l2arc_dev_list, dev);
5555		if (vd == dev->l2ad_vdev) {
5556			remdev = dev;
5557			break;
5558		}
5559	}
5560	ASSERT(remdev != NULL);
5561
5562	/*
5563	 * Remove device from global list
5564	 */
5565	list_remove(l2arc_dev_list, remdev);
5566	l2arc_dev_last = NULL;		/* may have been invalidated */
5567	atomic_dec_64(&l2arc_ndev);
5568	mutex_exit(&l2arc_dev_mtx);
5569
5570	/*
5571	 * Clear all buflists and ARC references.  L2ARC device flush.
5572	 */
5573	l2arc_evict(remdev, 0, B_TRUE);
5574	list_destroy(remdev->l2ad_buflist);
5575	kmem_free(remdev->l2ad_buflist, sizeof (list_t));
5576	kmem_free(remdev, sizeof (l2arc_dev_t));
5577}
5578
5579void
5580l2arc_init(void)
5581{
5582	l2arc_thread_exit = 0;
5583	l2arc_ndev = 0;
5584	l2arc_writes_sent = 0;
5585	l2arc_writes_done = 0;
5586
5587	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
5588	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
5589	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
5590	mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
5591	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
5592
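	/*
	 * The global device and free-on-write lists are statically
	 * allocated; point the working pointers at them and initialize
	 * the list heads.
	 */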
5593	l2arc_dev_list = &L2ARC_dev_list;
5594	l2arc_free_on_write = &L2ARC_free_on_write;
5595	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
5596	    offsetof(l2arc_dev_t, l2ad_node));
5597	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
5598	    offsetof(l2arc_data_free_t, l2df_list_node));
5599}
5600
5601void
5602l2arc_fini(void)
5603{
5604	/*
5605	 * This is called from dmu_fini(), which is called from spa_fini();
5606	 * because of this, we can assume that all l2arc devices have
5607	 * already been removed when the pools themselves were removed.
5608	 */
5609
5610	l2arc_do_free_on_write();
5611
5612	mutex_destroy(&l2arc_feed_thr_lock);
5613	cv_destroy(&l2arc_feed_thr_cv);
5614	mutex_destroy(&l2arc_dev_mtx);
5615	mutex_destroy(&l2arc_buflist_mtx);
5616	mutex_destroy(&l2arc_free_on_write_mtx);
5617
5618	list_destroy(l2arc_dev_list);
5619	list_destroy(l2arc_free_on_write);
5620}
5621
5622void
5623l2arc_start(void)
5624{
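	/*
	 * The feed thread only issues writes, so there is nothing to
	 * start when the pools are opened read-only.
	 */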
5625	if (!(spa_mode_global & FWRITE))
5626		return;
5627
5628	(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
5629	    TS_RUN, minclsyspri);
5630}
5631
5632void
5633l2arc_stop(void)
5634{
5635	if (!(spa_mode_global & FWRITE))
5636		return;
5637
5638	mutex_enter(&l2arc_feed_thr_lock);
5639	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
5640	l2arc_thread_exit = 1;
5641	while (l2arc_thread_exit != 0)
5642		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
5643	mutex_exit(&l2arc_feed_thr_lock);
5644}
5645