arc.c revision 269846
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
24 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
25 * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
26 */
27
28/*
29 * DVA-based Adjustable Replacement Cache
30 *
31 * While much of the theory of operation used here is
32 * based on the self-tuning, low overhead replacement cache
33 * presented by Megiddo and Modha at FAST 2003, there are some
34 * significant differences:
35 *
36 * 1. The Megiddo and Modha model assumes any page is evictable.
37 * Pages in its cache cannot be "locked" into memory.  This makes
38 * the eviction algorithm simple: evict the last page in the list.
 39 * This also makes the performance characteristics easy to reason
40 * about.  Our cache is not so simple.  At any given moment, some
41 * subset of the blocks in the cache are un-evictable because we
42 * have handed out a reference to them.  Blocks are only evictable
43 * when there are no external references active.  This makes
44 * eviction far more problematic:  we choose to evict the evictable
45 * blocks that are the "lowest" in the list.
46 *
47 * There are times when it is not possible to evict the requested
48 * space.  In these circumstances we are unable to adjust the cache
 49 * size.  To prevent the cache from growing unbounded at these times we
50 * implement a "cache throttle" that slows the flow of new data
51 * into the cache until we can make space available.
52 *
53 * 2. The Megiddo and Modha model assumes a fixed cache size.
54 * Pages are evicted when the cache is full and there is a cache
55 * miss.  Our model has a variable sized cache.  It grows with
56 * high use, but also tries to react to memory pressure from the
57 * operating system: decreasing its size when system memory is
58 * tight.
59 *
60 * 3. The Megiddo and Modha model assumes a fixed page size. All
61 * elements of the cache are therefore exactly the same size.  So
 62 * when adjusting the cache size following a cache miss, it's simply
63 * a matter of choosing a single page to evict.  In our model, we
 64 * have variable sized cache blocks (ranging from 512 bytes to
65 * 128K bytes).  We therefore choose a set of blocks to evict to make
66 * space for a cache miss that approximates as closely as possible
67 * the space used by the new block.
68 *
69 * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
70 * by N. Megiddo & D. Modha, FAST 2003
71 */
72
73/*
74 * The locking model:
75 *
76 * A new reference to a cache buffer can be obtained in two
77 * ways: 1) via a hash table lookup using the DVA as a key,
78 * or 2) via one of the ARC lists.  The arc_read() interface
79 * uses method 1, while the internal arc algorithms for
80 * adjusting the cache use method 2.  We therefore provide two
81 * types of locks: 1) the hash table lock array, and 2) the
82 * arc list locks.
83 *
 84 * Buffers do not have their own mutexes; rather, they rely on the
 85 * hash table mutexes for the bulk of their protection (i.e. most
 86 * fields in the arc_buf_hdr_t are protected by these mutexes).
87 *
88 * buf_hash_find() returns the appropriate mutex (held) when it
89 * locates the requested buffer in the hash table.  It returns
90 * NULL for the mutex if the buffer was not in the table.
91 *
92 * buf_hash_remove() expects the appropriate hash mutex to be
93 * already held before it is invoked.
94 *
95 * Each arc state also has a mutex which is used to protect the
96 * buffer list associated with the state.  When attempting to
97 * obtain a hash table lock while holding an arc list lock you
 98 * must use mutex_tryenter() to avoid deadlock.  Also note that
99 * the active state mutex must be held before the ghost state mutex.
100 *
101 * Arc buffers may have an associated eviction callback function.
102 * This function will be invoked prior to removing the buffer (e.g.
103 * in arc_do_user_evicts()).  Note however that the data associated
104 * with the buffer may be evicted prior to the callback.  The callback
105 * must be made with *no locks held* (to prevent deadlock).  Additionally,
106 * the users of callbacks must ensure that their private data is
107 * protected from simultaneous callbacks from arc_clear_callback()
108 * and arc_do_user_evicts().
109 *
110 * Note that the majority of the performance stats are manipulated
111 * with atomic operations.
112 *
113 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
114 *
115 *	- L2ARC buflist creation
116 *	- L2ARC buflist eviction
117 *	- L2ARC write completion, which walks L2ARC buflists
118 *	- ARC header destruction, as it removes from L2ARC buflists
119 *	- ARC header release, as it removes from L2ARC buflists
120 */
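
/*
 * A minimal sketch of the ordering rule above, as applied by the eviction
 * paths (illustrative only; the real loops also handle recycling and the
 * evict_skip/mutex_miss accounting):
 *
 *	mutex_enter(ARCS_LOCK(state, idx));		(list lock taken first)
 *	for (ab = list_tail(list); ab != NULL; ab = ab_prev) {
 *		ab_prev = list_prev(list, ab);
 *		hash_lock = HDR_LOCK(ab);
 *		if (!mutex_tryenter(hash_lock)) {	(never block here)
 *			ARCSTAT_BUMP(arcstat_mutex_miss);
 *			continue;
 *		}
 *		... evict or skip ab ...
 *		mutex_exit(hash_lock);
 *	}
 *	mutex_exit(ARCS_LOCK(state, idx));
 */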
121
122#include <sys/spa.h>
123#include <sys/zio.h>
124#include <sys/zio_compress.h>
125#include <sys/zfs_context.h>
126#include <sys/arc.h>
127#include <sys/refcount.h>
128#include <sys/vdev.h>
129#include <sys/vdev_impl.h>
130#include <sys/dsl_pool.h>
131#ifdef _KERNEL
132#include <sys/dnlc.h>
133#endif
134#include <sys/callb.h>
135#include <sys/kstat.h>
136#include <sys/trim_map.h>
137#include <zfs_fletcher.h>
138#include <sys/sdt.h>
139
140#include <vm/vm_pageout.h>
141
142#ifdef illumos
143#ifndef _KERNEL
144/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
145boolean_t arc_watch = B_FALSE;
146int arc_procfd;
147#endif
148#endif /* illumos */
149
150static kmutex_t		arc_reclaim_thr_lock;
151static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
152static uint8_t		arc_thread_exit;
153
154#define	ARC_REDUCE_DNLC_PERCENT	3
155uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
156
157typedef enum arc_reclaim_strategy {
158	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
159	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
160} arc_reclaim_strategy_t;
161
162/*
163 * The number of iterations through arc_evict_*() before we
164 * drop & reacquire the lock.
165 */
166int arc_evict_iterations = 100;
167
168/* number of seconds before growing cache again */
169static int		arc_grow_retry = 60;
170
171/* shift of arc_c for calculating both min and max arc_p */
172static int		arc_p_min_shift = 4;
173
174/* log2(fraction of arc to reclaim) */
175static int		arc_shrink_shift = 5;
176
177/*
178 * minimum lifespan of a prefetch block in clock ticks
179 * (initialized in arc_init())
180 */
181static int		arc_min_prefetch_lifespan;
182
183/*
184 * If this percent of memory is free, don't throttle.
185 */
186int arc_lotsfree_percent = 10;
187
188static int arc_dead;
189extern int zfs_prefetch_disable;
190
191/*
192 * The arc has filled available memory and has now warmed up.
193 */
194static boolean_t arc_warm;
195
196/*
197 * These tunables are for performance analysis.
198 */
199uint64_t zfs_arc_max;
200uint64_t zfs_arc_min;
201uint64_t zfs_arc_meta_limit = 0;
202int zfs_arc_grow_retry = 0;
203int zfs_arc_shrink_shift = 0;
204int zfs_arc_p_min_shift = 0;
205int zfs_disable_dup_eviction = 0;
206uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
207
208TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);
209TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);
210TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
211TUNABLE_QUAD("vfs.zfs.arc_average_blocksize", &zfs_arc_average_blocksize);
212SYSCTL_DECL(_vfs_zfs);
213SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
214    "Maximum ARC size");
215SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
216    "Minimum ARC size");
217SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
218    &zfs_arc_average_blocksize, 0,
219    "ARC average blocksize");
220
221/*
222 * Note that buffers can be in one of 6 states:
223 *	ARC_anon	- anonymous (discussed below)
224 *	ARC_mru		- recently used, currently cached
 225 *	ARC_mru_ghost	- recently used, no longer in cache
226 *	ARC_mfu		- frequently used, currently cached
227 *	ARC_mfu_ghost	- frequently used, no longer in cache
228 *	ARC_l2c_only	- exists in L2ARC but not other states
 229 * When there are no active references to the buffer, they are
 230 * linked onto a list in one of these arc states.  These are
231 * the only buffers that can be evicted or deleted.  Within each
232 * state there are multiple lists, one for meta-data and one for
233 * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
234 * etc.) is tracked separately so that it can be managed more
235 * explicitly: favored over data, limited explicitly.
236 *
237 * Anonymous buffers are buffers that are not associated with
238 * a DVA.  These are buffers that hold dirty block copies
239 * before they are written to stable storage.  By definition,
240 * they are "ref'd" and are considered part of arc_mru
 241 * that cannot be freed.  Generally, they will acquire a DVA
242 * as they are written and migrate onto the arc_mru list.
243 *
244 * The ARC_l2c_only state is for buffers that are in the second
245 * level ARC but no longer in any of the ARC_m* lists.  The second
246 * level ARC itself may also contain buffers that are in any of
247 * the ARC_m* states - meaning that a buffer can exist in two
248 * places.  The reason for the ARC_l2c_only state is to keep the
249 * buffer header in the hash table, so that reads that hit the
250 * second level ARC benefit from these fast lookups.
251 */
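
/*
 * A rough sketch of the transitions described above (illustrative only):
 *
 *	dirty block copy          -> ARC_anon   (ref'd, no DVA, not evictable)
 *	written out, DVA acquired -> ARC_mru    (hashed; evictable when unref'd)
 *	data evicted              -> ARC_mru_ghost / ARC_mfu_ghost (header only)
 *	present only on an L2ARC device -> ARC_l2c_only (header kept in the
 *	    hash table so L2ARC reads still get fast lookups)
 */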
252
253#define	ARCS_LOCK_PAD		CACHE_LINE_SIZE
254struct arcs_lock {
255	kmutex_t	arcs_lock;
256#ifdef _KERNEL
257	unsigned char	pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))];
258#endif
259};
260
261/*
 262 * must be a power of two for mask use to work
263 *
264 */
265#define ARC_BUFC_NUMDATALISTS		16
266#define ARC_BUFC_NUMMETADATALISTS	16
267#define ARC_BUFC_NUMLISTS	(ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS)
268
269typedef struct arc_state {
270	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */
271	uint64_t arcs_size;	/* total amount of data in this state */
272	list_t	arcs_lists[ARC_BUFC_NUMLISTS]; /* list of evictable buffers */
273	struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE);
274} arc_state_t;
275
276#define ARCS_LOCK(s, i)	(&((s)->arcs_locks[(i)].arcs_lock))
277
278/* The 6 states: */
279static arc_state_t ARC_anon;
280static arc_state_t ARC_mru;
281static arc_state_t ARC_mru_ghost;
282static arc_state_t ARC_mfu;
283static arc_state_t ARC_mfu_ghost;
284static arc_state_t ARC_l2c_only;
285
286typedef struct arc_stats {
287	kstat_named_t arcstat_hits;
288	kstat_named_t arcstat_misses;
289	kstat_named_t arcstat_demand_data_hits;
290	kstat_named_t arcstat_demand_data_misses;
291	kstat_named_t arcstat_demand_metadata_hits;
292	kstat_named_t arcstat_demand_metadata_misses;
293	kstat_named_t arcstat_prefetch_data_hits;
294	kstat_named_t arcstat_prefetch_data_misses;
295	kstat_named_t arcstat_prefetch_metadata_hits;
296	kstat_named_t arcstat_prefetch_metadata_misses;
297	kstat_named_t arcstat_mru_hits;
298	kstat_named_t arcstat_mru_ghost_hits;
299	kstat_named_t arcstat_mfu_hits;
300	kstat_named_t arcstat_mfu_ghost_hits;
301	kstat_named_t arcstat_allocated;
302	kstat_named_t arcstat_deleted;
303	kstat_named_t arcstat_stolen;
304	kstat_named_t arcstat_recycle_miss;
305	/*
306	 * Number of buffers that could not be evicted because the hash lock
307	 * was held by another thread.  The lock may not necessarily be held
308	 * by something using the same buffer, since hash locks are shared
309	 * by multiple buffers.
310	 */
311	kstat_named_t arcstat_mutex_miss;
312	/*
313	 * Number of buffers skipped because they have I/O in progress, are
 314	 * indirect prefetch buffers that have not lived long enough, or are
315	 * not from the spa we're trying to evict from.
316	 */
317	kstat_named_t arcstat_evict_skip;
318	kstat_named_t arcstat_evict_l2_cached;
319	kstat_named_t arcstat_evict_l2_eligible;
320	kstat_named_t arcstat_evict_l2_ineligible;
321	kstat_named_t arcstat_hash_elements;
322	kstat_named_t arcstat_hash_elements_max;
323	kstat_named_t arcstat_hash_collisions;
324	kstat_named_t arcstat_hash_chains;
325	kstat_named_t arcstat_hash_chain_max;
326	kstat_named_t arcstat_p;
327	kstat_named_t arcstat_c;
328	kstat_named_t arcstat_c_min;
329	kstat_named_t arcstat_c_max;
330	kstat_named_t arcstat_size;
331	kstat_named_t arcstat_hdr_size;
332	kstat_named_t arcstat_data_size;
333	kstat_named_t arcstat_other_size;
334	kstat_named_t arcstat_l2_hits;
335	kstat_named_t arcstat_l2_misses;
336	kstat_named_t arcstat_l2_feeds;
337	kstat_named_t arcstat_l2_rw_clash;
338	kstat_named_t arcstat_l2_read_bytes;
339	kstat_named_t arcstat_l2_write_bytes;
340	kstat_named_t arcstat_l2_writes_sent;
341	kstat_named_t arcstat_l2_writes_done;
342	kstat_named_t arcstat_l2_writes_error;
343	kstat_named_t arcstat_l2_writes_hdr_miss;
344	kstat_named_t arcstat_l2_evict_lock_retry;
345	kstat_named_t arcstat_l2_evict_reading;
346	kstat_named_t arcstat_l2_free_on_write;
347	kstat_named_t arcstat_l2_abort_lowmem;
348	kstat_named_t arcstat_l2_cksum_bad;
349	kstat_named_t arcstat_l2_io_error;
350	kstat_named_t arcstat_l2_size;
351	kstat_named_t arcstat_l2_asize;
352	kstat_named_t arcstat_l2_hdr_size;
353	kstat_named_t arcstat_l2_compress_successes;
354	kstat_named_t arcstat_l2_compress_zeros;
355	kstat_named_t arcstat_l2_compress_failures;
356	kstat_named_t arcstat_l2_write_trylock_fail;
357	kstat_named_t arcstat_l2_write_passed_headroom;
358	kstat_named_t arcstat_l2_write_spa_mismatch;
359	kstat_named_t arcstat_l2_write_in_l2;
360	kstat_named_t arcstat_l2_write_hdr_io_in_progress;
361	kstat_named_t arcstat_l2_write_not_cacheable;
362	kstat_named_t arcstat_l2_write_full;
363	kstat_named_t arcstat_l2_write_buffer_iter;
364	kstat_named_t arcstat_l2_write_pios;
365	kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
366	kstat_named_t arcstat_l2_write_buffer_list_iter;
367	kstat_named_t arcstat_l2_write_buffer_list_null_iter;
368	kstat_named_t arcstat_memory_throttle_count;
369	kstat_named_t arcstat_duplicate_buffers;
370	kstat_named_t arcstat_duplicate_buffers_size;
371	kstat_named_t arcstat_duplicate_reads;
372} arc_stats_t;
373
374static arc_stats_t arc_stats = {
375	{ "hits",			KSTAT_DATA_UINT64 },
376	{ "misses",			KSTAT_DATA_UINT64 },
377	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
378	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
379	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
380	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
381	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
382	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
383	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
384	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
385	{ "mru_hits",			KSTAT_DATA_UINT64 },
386	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
387	{ "mfu_hits",			KSTAT_DATA_UINT64 },
388	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
389	{ "allocated",			KSTAT_DATA_UINT64 },
390	{ "deleted",			KSTAT_DATA_UINT64 },
391	{ "stolen",			KSTAT_DATA_UINT64 },
392	{ "recycle_miss",		KSTAT_DATA_UINT64 },
393	{ "mutex_miss",			KSTAT_DATA_UINT64 },
394	{ "evict_skip",			KSTAT_DATA_UINT64 },
395	{ "evict_l2_cached",		KSTAT_DATA_UINT64 },
396	{ "evict_l2_eligible",		KSTAT_DATA_UINT64 },
397	{ "evict_l2_ineligible",	KSTAT_DATA_UINT64 },
398	{ "hash_elements",		KSTAT_DATA_UINT64 },
399	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
400	{ "hash_collisions",		KSTAT_DATA_UINT64 },
401	{ "hash_chains",		KSTAT_DATA_UINT64 },
402	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
403	{ "p",				KSTAT_DATA_UINT64 },
404	{ "c",				KSTAT_DATA_UINT64 },
405	{ "c_min",			KSTAT_DATA_UINT64 },
406	{ "c_max",			KSTAT_DATA_UINT64 },
407	{ "size",			KSTAT_DATA_UINT64 },
408	{ "hdr_size",			KSTAT_DATA_UINT64 },
409	{ "data_size",			KSTAT_DATA_UINT64 },
410	{ "other_size",			KSTAT_DATA_UINT64 },
411	{ "l2_hits",			KSTAT_DATA_UINT64 },
412	{ "l2_misses",			KSTAT_DATA_UINT64 },
413	{ "l2_feeds",			KSTAT_DATA_UINT64 },
414	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
415	{ "l2_read_bytes",		KSTAT_DATA_UINT64 },
416	{ "l2_write_bytes",		KSTAT_DATA_UINT64 },
417	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
418	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
419	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
420	{ "l2_writes_hdr_miss",		KSTAT_DATA_UINT64 },
421	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
422	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
423	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
424	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
425	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
426	{ "l2_io_error",		KSTAT_DATA_UINT64 },
427	{ "l2_size",			KSTAT_DATA_UINT64 },
428	{ "l2_asize",			KSTAT_DATA_UINT64 },
429	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
430	{ "l2_compress_successes",	KSTAT_DATA_UINT64 },
431	{ "l2_compress_zeros",		KSTAT_DATA_UINT64 },
432	{ "l2_compress_failures",	KSTAT_DATA_UINT64 },
433	{ "l2_write_trylock_fail",	KSTAT_DATA_UINT64 },
434	{ "l2_write_passed_headroom",	KSTAT_DATA_UINT64 },
435	{ "l2_write_spa_mismatch",	KSTAT_DATA_UINT64 },
436	{ "l2_write_in_l2",		KSTAT_DATA_UINT64 },
437	{ "l2_write_io_in_progress",	KSTAT_DATA_UINT64 },
438	{ "l2_write_not_cacheable",	KSTAT_DATA_UINT64 },
439	{ "l2_write_full",		KSTAT_DATA_UINT64 },
440	{ "l2_write_buffer_iter",	KSTAT_DATA_UINT64 },
441	{ "l2_write_pios",		KSTAT_DATA_UINT64 },
442	{ "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
443	{ "l2_write_buffer_list_iter",	KSTAT_DATA_UINT64 },
444	{ "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
445	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
446	{ "duplicate_buffers",		KSTAT_DATA_UINT64 },
447	{ "duplicate_buffers_size",	KSTAT_DATA_UINT64 },
448	{ "duplicate_reads",		KSTAT_DATA_UINT64 }
449};
450
451#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)
452
453#define	ARCSTAT_INCR(stat, val) \
454	atomic_add_64(&arc_stats.stat.value.ui64, (val))
455
456#define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
457#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)
458
459#define	ARCSTAT_MAX(stat, val) {					\
460	uint64_t m;							\
461	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
462	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
463		continue;						\
464}
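
/*
 * ARCSTAT_MAX() is a lock-free monotonic maximum: the loop retries the
 * compare-and-swap only while the proposed value still exceeds whatever
 * the recorded maximum has become in the meantime.
 */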
465
466#define	ARCSTAT_MAXSTAT(stat) \
467	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
468
469/*
470 * We define a macro to allow ARC hits/misses to be easily broken down by
471 * two separate conditions, giving a total of four different subtypes for
472 * each of hits and misses (so eight statistics total).
473 */
474#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
475	if (cond1) {							\
476		if (cond2) {						\
477			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
478		} else {						\
479			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
480		}							\
481	} else {							\
482		if (cond2) {						\
483			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
484		} else {						\
485			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
486		}							\
487	}
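
/*
 * For example, the hit-accounting call used later in this file,
 *
 *	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
 *	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
 *	    data, metadata, hits);
 *
 * bumps exactly one of arcstat_demand_data_hits,
 * arcstat_demand_metadata_hits, arcstat_prefetch_data_hits or
 * arcstat_prefetch_metadata_hits.
 */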
488
489kstat_t			*arc_ksp;
490static arc_state_t	*arc_anon;
491static arc_state_t	*arc_mru;
492static arc_state_t	*arc_mru_ghost;
493static arc_state_t	*arc_mfu;
494static arc_state_t	*arc_mfu_ghost;
495static arc_state_t	*arc_l2c_only;
496
497/*
498 * There are several ARC variables that are critical to export as kstats --
499 * but we don't want to have to grovel around in the kstat whenever we wish to
500 * manipulate them.  For these variables, we therefore define them to be in
501 * terms of the statistic variable.  This assures that we are not introducing
502 * the possibility of inconsistency by having shadow copies of the variables,
503 * while still allowing the code to be readable.
504 */
505#define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
506#define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
507#define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
508#define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
509#define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
510
511#define	L2ARC_IS_VALID_COMPRESS(_c_) \
512	((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
513
514static int		arc_no_grow;	/* Don't try to grow cache size */
515static uint64_t		arc_tempreserve;
516static uint64_t		arc_loaned_bytes;
517static uint64_t		arc_meta_used;
518static uint64_t		arc_meta_limit;
519static uint64_t		arc_meta_max = 0;
520SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_meta_used, CTLFLAG_RD, &arc_meta_used, 0,
521    "ARC metadata used");
522SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_meta_limit, CTLFLAG_RW, &arc_meta_limit, 0,
523    "ARC metadata limit");
524
525typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
526
527typedef struct arc_callback arc_callback_t;
528
529struct arc_callback {
530	void			*acb_private;
531	arc_done_func_t		*acb_done;
532	arc_buf_t		*acb_buf;
533	zio_t			*acb_zio_dummy;
534	arc_callback_t		*acb_next;
535};
536
537typedef struct arc_write_callback arc_write_callback_t;
538
539struct arc_write_callback {
540	void		*awcb_private;
541	arc_done_func_t	*awcb_ready;
542	arc_done_func_t	*awcb_physdone;
543	arc_done_func_t	*awcb_done;
544	arc_buf_t	*awcb_buf;
545};
546
547struct arc_buf_hdr {
548	/* protected by hash lock */
549	dva_t			b_dva;
550	uint64_t		b_birth;
551	uint64_t		b_cksum0;
552
553	kmutex_t		b_freeze_lock;
554	zio_cksum_t		*b_freeze_cksum;
555	void			*b_thawed;
556
557	arc_buf_hdr_t		*b_hash_next;
558	arc_buf_t		*b_buf;
559	uint32_t		b_flags;
560	uint32_t		b_datacnt;
561
562	arc_callback_t		*b_acb;
563	kcondvar_t		b_cv;
564
565	/* immutable */
566	arc_buf_contents_t	b_type;
567	uint64_t		b_size;
568	uint64_t		b_spa;
569
570	/* protected by arc state mutex */
571	arc_state_t		*b_state;
572	list_node_t		b_arc_node;
573
574	/* updated atomically */
575	clock_t			b_arc_access;
576
577	/* self protecting */
578	refcount_t		b_refcnt;
579
580	l2arc_buf_hdr_t		*b_l2hdr;
581	list_node_t		b_l2node;
582};
583
584static arc_buf_t *arc_eviction_list;
585static kmutex_t arc_eviction_mtx;
586static arc_buf_hdr_t arc_eviction_hdr;
587static void arc_get_data_buf(arc_buf_t *buf);
588static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
589static int arc_evict_needed(arc_buf_contents_t type);
590static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
591#ifdef illumos
592static void arc_buf_watch(arc_buf_t *buf);
593#endif /* illumos */
594
595static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
596
597#define	GHOST_STATE(state)	\
598	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
599	(state) == arc_l2c_only)
600
601/*
602 * Private ARC flags.  These flags are private ARC only flags that will show up
 603 * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
604 * be passed in as arc_flags in things like arc_read.  However, these flags
605 * should never be passed and should only be set by ARC code.  When adding new
606 * public flags, make sure not to smash the private ones.
607 */
608
609#define	ARC_IN_HASH_TABLE	(1 << 9)	/* this buffer is hashed */
610#define	ARC_IO_IN_PROGRESS	(1 << 10)	/* I/O in progress for buf */
611#define	ARC_IO_ERROR		(1 << 11)	/* I/O failed for buf */
612#define	ARC_FREED_IN_READ	(1 << 12)	/* buf freed while in read */
613#define	ARC_BUF_AVAILABLE	(1 << 13)	/* block not in active use */
614#define	ARC_INDIRECT		(1 << 14)	/* this is an indirect block */
615#define	ARC_FREE_IN_PROGRESS	(1 << 15)	/* hdr about to be freed */
616#define	ARC_L2_WRITING		(1 << 16)	/* L2ARC write in progress */
617#define	ARC_L2_EVICTED		(1 << 17)	/* evicted during I/O */
618#define	ARC_L2_WRITE_HEAD	(1 << 18)	/* head of write list */
619
620#define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_IN_HASH_TABLE)
621#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
622#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_IO_ERROR)
623#define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_PREFETCH)
624#define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FREED_IN_READ)
625#define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_BUF_AVAILABLE)
626#define	HDR_FREE_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
627#define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_L2CACHE)
628#define	HDR_L2_READING(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS &&	\
629				    (hdr)->b_l2hdr != NULL)
630#define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_L2_WRITING)
631#define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_L2_EVICTED)
632#define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_L2_WRITE_HEAD)
633
634/*
635 * Other sizes
636 */
637
638#define	HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
639#define	L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
640
641/*
642 * Hash table routines
643 */
644
645#define	HT_LOCK_PAD	CACHE_LINE_SIZE
646
647struct ht_lock {
648	kmutex_t	ht_lock;
649#ifdef _KERNEL
650	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
651#endif
652};
653
654#define	BUF_LOCKS 256
655typedef struct buf_hash_table {
656	uint64_t ht_mask;
657	arc_buf_hdr_t **ht_table;
658	struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
659} buf_hash_table_t;
660
661static buf_hash_table_t buf_hash_table;
662
663#define	BUF_HASH_INDEX(spa, dva, birth) \
664	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
665#define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
666#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
667#define	HDR_LOCK(hdr) \
668	(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
669
670uint64_t zfs_crc64_table[256];
671
672/*
673 * Level 2 ARC
674 */
675
676#define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
677#define	L2ARC_HEADROOM		2			/* num of writes */
678/*
679 * If we discover during ARC scan any buffers to be compressed, we boost
680 * our headroom for the next scanning cycle by this percentage multiple.
681 */
682#define	L2ARC_HEADROOM_BOOST	200
683#define	L2ARC_FEED_SECS		1		/* caching interval secs */
684#define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */
685
686#define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
687#define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)
688
689/* L2ARC Performance Tunables */
690uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
691uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
692uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
693uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
694uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
695uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval milliseconds */
696boolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
697boolean_t l2arc_feed_again = B_TRUE;		/* turbo warmup */
698boolean_t l2arc_norw = B_TRUE;			/* no reads during writes */
699
700SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
701    &l2arc_write_max, 0, "max write size");
702SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
703    &l2arc_write_boost, 0, "extra write during warmup");
704SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
705    &l2arc_headroom, 0, "number of dev writes");
706SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
707    &l2arc_feed_secs, 0, "interval seconds");
708SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
709    &l2arc_feed_min_ms, 0, "min interval milliseconds");
710
711SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
712    &l2arc_noprefetch, 0, "don't cache prefetch bufs");
713SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
714    &l2arc_feed_again, 0, "turbo warmup");
715SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
716    &l2arc_norw, 0, "no reads during writes");
717
718SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
719    &ARC_anon.arcs_size, 0, "size of anonymous state");
720SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD,
721    &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous state");
722SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD,
723    &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous state");
724
725SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
726    &ARC_mru.arcs_size, 0, "size of mru state");
727SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD,
728    &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state");
729SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD,
730    &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state");
731
732SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
733    &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state");
734SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD,
735    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
736    "size of metadata in mru ghost state");
737SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD,
738    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
739    "size of data in mru ghost state");
740
741SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
742    &ARC_mfu.arcs_size, 0, "size of mfu state");
743SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD,
744    &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state");
745SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD,
746    &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state");
747
748SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
749    &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state");
750SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD,
751    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
752    "size of metadata in mfu ghost state");
753SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD,
754    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
755    "size of data in mfu ghost state");
756
757SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
758    &ARC_l2c_only.arcs_size, 0, "size of mru state");
759
760/*
761 * L2ARC Internals
762 */
763typedef struct l2arc_dev {
764	vdev_t			*l2ad_vdev;	/* vdev */
765	spa_t			*l2ad_spa;	/* spa */
766	uint64_t		l2ad_hand;	/* next write location */
767	uint64_t		l2ad_start;	/* first addr on device */
768	uint64_t		l2ad_end;	/* last addr on device */
769	uint64_t		l2ad_evict;	/* last addr eviction reached */
770	boolean_t		l2ad_first;	/* first sweep through */
771	boolean_t		l2ad_writing;	/* currently writing */
772	list_t			*l2ad_buflist;	/* buffer list */
773	list_node_t		l2ad_node;	/* device list node */
774} l2arc_dev_t;
775
776static list_t L2ARC_dev_list;			/* device list */
777static list_t *l2arc_dev_list;			/* device list pointer */
778static kmutex_t l2arc_dev_mtx;			/* device list mutex */
779static l2arc_dev_t *l2arc_dev_last;		/* last device used */
780static kmutex_t l2arc_buflist_mtx;		/* mutex for all buflists */
781static list_t L2ARC_free_on_write;		/* free after write buf list */
782static list_t *l2arc_free_on_write;		/* free after write list ptr */
783static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
784static uint64_t l2arc_ndev;			/* number of devices */
785
786typedef struct l2arc_read_callback {
787	arc_buf_t		*l2rcb_buf;		/* read buffer */
788	spa_t			*l2rcb_spa;		/* spa */
789	blkptr_t		l2rcb_bp;		/* original blkptr */
790	zbookmark_phys_t	l2rcb_zb;		/* original bookmark */
791	int			l2rcb_flags;		/* original flags */
792	enum zio_compress	l2rcb_compress;		/* applied compress */
793} l2arc_read_callback_t;
794
795typedef struct l2arc_write_callback {
796	l2arc_dev_t	*l2wcb_dev;		/* device info */
797	arc_buf_hdr_t	*l2wcb_head;		/* head of write buflist */
798} l2arc_write_callback_t;
799
800struct l2arc_buf_hdr {
801	/* protected by arc_buf_hdr  mutex */
802	l2arc_dev_t		*b_dev;		/* L2ARC device */
803	uint64_t		b_daddr;	/* disk address, offset byte */
804	/* compression applied to buffer data */
805	enum zio_compress	b_compress;
806	/* real alloc'd buffer size depending on b_compress applied */
807	int			b_asize;
808	/* temporary buffer holder for in-flight compressed data */
809	void			*b_tmp_cdata;
810};
811
812typedef struct l2arc_data_free {
813	/* protected by l2arc_free_on_write_mtx */
814	void		*l2df_data;
815	size_t		l2df_size;
816	void		(*l2df_func)(void *, size_t);
817	list_node_t	l2df_list_node;
818} l2arc_data_free_t;
819
820static kmutex_t l2arc_feed_thr_lock;
821static kcondvar_t l2arc_feed_thr_cv;
822static uint8_t l2arc_thread_exit;
823
824static void l2arc_read_done(zio_t *zio);
825static void l2arc_hdr_stat_add(void);
826static void l2arc_hdr_stat_remove(void);
827
828static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
829static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
830    enum zio_compress c);
831static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
832
833static uint64_t
834buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
835{
836	uint8_t *vdva = (uint8_t *)dva;
837	uint64_t crc = -1ULL;
838	int i;
839
840	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
841
842	for (i = 0; i < sizeof (dva_t); i++)
843		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
844
845	crc ^= (spa>>8) ^ birth;
846
847	return (crc);
848}
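
/*
 * A typical lookup folds buf_hash() through the macros defined above,
 * e.g. (as in buf_hash_find() below):
 *
 *	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
 *	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 */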
849
850#define	BUF_EMPTY(buf)						\
851	((buf)->b_dva.dva_word[0] == 0 &&			\
852	(buf)->b_dva.dva_word[1] == 0 &&			\
853	(buf)->b_cksum0 == 0)
854
855#define	BUF_EQUAL(spa, dva, birth, buf)				\
856	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
857	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
858	((buf)->b_birth == birth) && ((buf)->b_spa == spa)
859
860static void
861buf_discard_identity(arc_buf_hdr_t *hdr)
862{
863	hdr->b_dva.dva_word[0] = 0;
864	hdr->b_dva.dva_word[1] = 0;
865	hdr->b_birth = 0;
866	hdr->b_cksum0 = 0;
867}
868
869static arc_buf_hdr_t *
870buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
871{
872	const dva_t *dva = BP_IDENTITY(bp);
873	uint64_t birth = BP_PHYSICAL_BIRTH(bp);
874	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
875	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
876	arc_buf_hdr_t *buf;
877
878	mutex_enter(hash_lock);
879	for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
880	    buf = buf->b_hash_next) {
881		if (BUF_EQUAL(spa, dva, birth, buf)) {
882			*lockp = hash_lock;
883			return (buf);
884		}
885	}
886	mutex_exit(hash_lock);
887	*lockp = NULL;
888	return (NULL);
889}
890
891/*
892 * Insert an entry into the hash table.  If there is already an element
893 * equal to elem in the hash table, then the already existing element
894 * will be returned and the new element will not be inserted.
895 * Otherwise returns NULL.
896 */
897static arc_buf_hdr_t *
898buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
899{
900	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
901	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
902	arc_buf_hdr_t *fbuf;
903	uint32_t i;
904
905	ASSERT(!DVA_IS_EMPTY(&buf->b_dva));
906	ASSERT(buf->b_birth != 0);
907	ASSERT(!HDR_IN_HASH_TABLE(buf));
908	*lockp = hash_lock;
909	mutex_enter(hash_lock);
910	for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
911	    fbuf = fbuf->b_hash_next, i++) {
912		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
913			return (fbuf);
914	}
915
916	buf->b_hash_next = buf_hash_table.ht_table[idx];
917	buf_hash_table.ht_table[idx] = buf;
918	buf->b_flags |= ARC_IN_HASH_TABLE;
919
920	/* collect some hash table performance data */
921	if (i > 0) {
922		ARCSTAT_BUMP(arcstat_hash_collisions);
923		if (i == 1)
924			ARCSTAT_BUMP(arcstat_hash_chains);
925
926		ARCSTAT_MAX(arcstat_hash_chain_max, i);
927	}
928
929	ARCSTAT_BUMP(arcstat_hash_elements);
930	ARCSTAT_MAXSTAT(arcstat_hash_elements);
931
932	return (NULL);
933}
934
935static void
936buf_hash_remove(arc_buf_hdr_t *buf)
937{
938	arc_buf_hdr_t *fbuf, **bufp;
939	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
940
941	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
942	ASSERT(HDR_IN_HASH_TABLE(buf));
943
944	bufp = &buf_hash_table.ht_table[idx];
945	while ((fbuf = *bufp) != buf) {
946		ASSERT(fbuf != NULL);
947		bufp = &fbuf->b_hash_next;
948	}
949	*bufp = buf->b_hash_next;
950	buf->b_hash_next = NULL;
951	buf->b_flags &= ~ARC_IN_HASH_TABLE;
952
953	/* collect some hash table performance data */
954	ARCSTAT_BUMPDOWN(arcstat_hash_elements);
955
956	if (buf_hash_table.ht_table[idx] &&
957	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
958		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
959}
960
961/*
962 * Global data structures and functions for the buf kmem cache.
963 */
964static kmem_cache_t *hdr_cache;
965static kmem_cache_t *buf_cache;
966
967static void
968buf_fini(void)
969{
970	int i;
971
972	kmem_free(buf_hash_table.ht_table,
973	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
974	for (i = 0; i < BUF_LOCKS; i++)
975		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
976	kmem_cache_destroy(hdr_cache);
977	kmem_cache_destroy(buf_cache);
978}
979
980/*
981 * Constructor callback - called when the cache is empty
982 * and a new buf is requested.
983 */
984/* ARGSUSED */
985static int
986hdr_cons(void *vbuf, void *unused, int kmflag)
987{
988	arc_buf_hdr_t *buf = vbuf;
989
990	bzero(buf, sizeof (arc_buf_hdr_t));
991	refcount_create(&buf->b_refcnt);
992	cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
993	mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
994	arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
995
996	return (0);
997}
998
999/* ARGSUSED */
1000static int
1001buf_cons(void *vbuf, void *unused, int kmflag)
1002{
1003	arc_buf_t *buf = vbuf;
1004
1005	bzero(buf, sizeof (arc_buf_t));
1006	mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
1007	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1008
1009	return (0);
1010}
1011
1012/*
1013 * Destructor callback - called when a cached buf is
1014 * no longer required.
1015 */
1016/* ARGSUSED */
1017static void
1018hdr_dest(void *vbuf, void *unused)
1019{
1020	arc_buf_hdr_t *buf = vbuf;
1021
1022	ASSERT(BUF_EMPTY(buf));
1023	refcount_destroy(&buf->b_refcnt);
1024	cv_destroy(&buf->b_cv);
1025	mutex_destroy(&buf->b_freeze_lock);
1026	arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
1027}
1028
1029/* ARGSUSED */
1030static void
1031buf_dest(void *vbuf, void *unused)
1032{
1033	arc_buf_t *buf = vbuf;
1034
1035	mutex_destroy(&buf->b_evict_lock);
1036	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1037}
1038
1039/*
1040 * Reclaim callback -- invoked when memory is low.
1041 */
1042/* ARGSUSED */
1043static void
1044hdr_recl(void *unused)
1045{
1046	dprintf("hdr_recl called\n");
1047	/*
1048	 * umem calls the reclaim func when we destroy the buf cache,
1049	 * which is after we do arc_fini().
1050	 */
1051	if (!arc_dead)
1052		cv_signal(&arc_reclaim_thr_cv);
1053}
1054
1055static void
1056buf_init(void)
1057{
1058	uint64_t *ct;
1059	uint64_t hsize = 1ULL << 12;
1060	int i, j;
1061
1062	/*
1063	 * The hash table is big enough to fill all of physical memory
1064	 * with an average block size of zfs_arc_average_blocksize (default 8K).
1065	 * By default, the table will take up
1066	 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
1067	 */
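	/*
	 * Worked example (assuming 16GB of physical memory and the default
	 * 8K zfs_arc_average_blocksize): hsize doubles from 2^12 up to 2^21
	 * entries, so the table consumes 2^21 * sizeof (void *) = 16MB,
	 * i.e. the "1MB per GB" figure noted above.
	 */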
1068	while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE)
1069		hsize <<= 1;
1070retry:
1071	buf_hash_table.ht_mask = hsize - 1;
1072	buf_hash_table.ht_table =
1073	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
1074	if (buf_hash_table.ht_table == NULL) {
1075		ASSERT(hsize > (1ULL << 8));
1076		hsize >>= 1;
1077		goto retry;
1078	}
1079
1080	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
1081	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
1082	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
1083	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
1084
1085	for (i = 0; i < 256; i++)
1086		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
1087			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
1088
1089	for (i = 0; i < BUF_LOCKS; i++) {
1090		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
1091		    NULL, MUTEX_DEFAULT, NULL);
1092	}
1093}
1094
1095#define	ARC_MINTIME	(hz>>4) /* 62 ms */
1096
1097static void
1098arc_cksum_verify(arc_buf_t *buf)
1099{
1100	zio_cksum_t zc;
1101
1102	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1103		return;
1104
1105	mutex_enter(&buf->b_hdr->b_freeze_lock);
1106	if (buf->b_hdr->b_freeze_cksum == NULL ||
1107	    (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
1108		mutex_exit(&buf->b_hdr->b_freeze_lock);
1109		return;
1110	}
1111	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1112	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
1113		panic("buffer modified while frozen!");
1114	mutex_exit(&buf->b_hdr->b_freeze_lock);
1115}
1116
1117static int
1118arc_cksum_equal(arc_buf_t *buf)
1119{
1120	zio_cksum_t zc;
1121	int equal;
1122
1123	mutex_enter(&buf->b_hdr->b_freeze_lock);
1124	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1125	equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
1126	mutex_exit(&buf->b_hdr->b_freeze_lock);
1127
1128	return (equal);
1129}
1130
1131static void
1132arc_cksum_compute(arc_buf_t *buf, boolean_t force)
1133{
1134	if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
1135		return;
1136
1137	mutex_enter(&buf->b_hdr->b_freeze_lock);
1138	if (buf->b_hdr->b_freeze_cksum != NULL) {
1139		mutex_exit(&buf->b_hdr->b_freeze_lock);
1140		return;
1141	}
1142	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
1143	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1144	    buf->b_hdr->b_freeze_cksum);
1145	mutex_exit(&buf->b_hdr->b_freeze_lock);
1146#ifdef illumos
1147	arc_buf_watch(buf);
1148#endif /* illumos */
1149}
1150
1151#ifdef illumos
1152#ifndef _KERNEL
1153typedef struct procctl {
1154	long cmd;
1155	prwatch_t prwatch;
1156} procctl_t;
1157#endif
1158
1159/* ARGSUSED */
1160static void
1161arc_buf_unwatch(arc_buf_t *buf)
1162{
1163#ifndef _KERNEL
1164	if (arc_watch) {
1165		int result;
1166		procctl_t ctl;
1167		ctl.cmd = PCWATCH;
1168		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1169		ctl.prwatch.pr_size = 0;
1170		ctl.prwatch.pr_wflags = 0;
1171		result = write(arc_procfd, &ctl, sizeof (ctl));
1172		ASSERT3U(result, ==, sizeof (ctl));
1173	}
1174#endif
1175}
1176
1177/* ARGSUSED */
1178static void
1179arc_buf_watch(arc_buf_t *buf)
1180{
1181#ifndef _KERNEL
1182	if (arc_watch) {
1183		int result;
1184		procctl_t ctl;
1185		ctl.cmd = PCWATCH;
1186		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1187		ctl.prwatch.pr_size = buf->b_hdr->b_size;
1188		ctl.prwatch.pr_wflags = WA_WRITE;
1189		result = write(arc_procfd, &ctl, sizeof (ctl));
1190		ASSERT3U(result, ==, sizeof (ctl));
1191	}
1192#endif
1193}
1194#endif /* illumos */
1195
1196void
1197arc_buf_thaw(arc_buf_t *buf)
1198{
1199	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1200		if (buf->b_hdr->b_state != arc_anon)
1201			panic("modifying non-anon buffer!");
1202		if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
1203			panic("modifying buffer while i/o in progress!");
1204		arc_cksum_verify(buf);
1205	}
1206
1207	mutex_enter(&buf->b_hdr->b_freeze_lock);
1208	if (buf->b_hdr->b_freeze_cksum != NULL) {
1209		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1210		buf->b_hdr->b_freeze_cksum = NULL;
1211	}
1212
1213	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1214		if (buf->b_hdr->b_thawed)
1215			kmem_free(buf->b_hdr->b_thawed, 1);
1216		buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
1217	}
1218
1219	mutex_exit(&buf->b_hdr->b_freeze_lock);
1220
1221#ifdef illumos
1222	arc_buf_unwatch(buf);
1223#endif /* illumos */
1224}
1225
1226void
1227arc_buf_freeze(arc_buf_t *buf)
1228{
1229	kmutex_t *hash_lock;
1230
1231	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1232		return;
1233
1234	hash_lock = HDR_LOCK(buf->b_hdr);
1235	mutex_enter(hash_lock);
1236
1237	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1238	    buf->b_hdr->b_state == arc_anon);
1239	arc_cksum_compute(buf, B_FALSE);
1240	mutex_exit(hash_lock);
1241
1242}
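
/*
 * A minimal sketch of the ZFS_DEBUG_MODIFY freeze/thaw cycle (illustrative
 * only; new_data is a stand-in for whatever the caller writes): freeze a
 * buffer once its contents should be stable, thaw it before a legitimate
 * rewrite, and let arc_cksum_verify() panic on any modification made while
 * frozen.
 *
 *	arc_buf_freeze(buf);			(checksum recorded)
 *	...
 *	arc_buf_thaw(buf);			(checksum discarded)
 *	bcopy(new_data, buf->b_data, buf->b_hdr->b_size);
 *	arc_buf_freeze(buf);			(fresh checksum recorded)
 */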
1243
1244static void
1245get_buf_info(arc_buf_hdr_t *ab, arc_state_t *state, list_t **list, kmutex_t **lock)
1246{
1247	uint64_t buf_hashid = buf_hash(ab->b_spa, &ab->b_dva, ab->b_birth);
1248
1249	if (ab->b_type == ARC_BUFC_METADATA)
1250		buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1);
1251	else {
1252		buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1);
1253		buf_hashid += ARC_BUFC_NUMMETADATALISTS;
1254	}
1255
1256	*list = &state->arcs_lists[buf_hashid];
1257	*lock = ARCS_LOCK(state, buf_hashid);
1258}
1259
1260
1261static void
1262add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1263{
1264	ASSERT(MUTEX_HELD(hash_lock));
1265
1266	if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
1267	    (ab->b_state != arc_anon)) {
1268		uint64_t delta = ab->b_size * ab->b_datacnt;
1269		uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
1270		list_t *list;
1271		kmutex_t *lock;
1272
1273		get_buf_info(ab, ab->b_state, &list, &lock);
1274		ASSERT(!MUTEX_HELD(lock));
1275		mutex_enter(lock);
1276		ASSERT(list_link_active(&ab->b_arc_node));
1277		list_remove(list, ab);
1278		if (GHOST_STATE(ab->b_state)) {
1279			ASSERT0(ab->b_datacnt);
1280			ASSERT3P(ab->b_buf, ==, NULL);
1281			delta = ab->b_size;
1282		}
1283		ASSERT(delta > 0);
1284		ASSERT3U(*size, >=, delta);
1285		atomic_add_64(size, -delta);
1286		mutex_exit(lock);
1287		/* remove the prefetch flag if we get a reference */
1288		if (ab->b_flags & ARC_PREFETCH)
1289			ab->b_flags &= ~ARC_PREFETCH;
1290	}
1291}
1292
1293static int
1294remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1295{
1296	int cnt;
1297	arc_state_t *state = ab->b_state;
1298
1299	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1300	ASSERT(!GHOST_STATE(state));
1301
1302	if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
1303	    (state != arc_anon)) {
1304		uint64_t *size = &state->arcs_lsize[ab->b_type];
1305		list_t *list;
1306		kmutex_t *lock;
1307
1308		get_buf_info(ab, state, &list, &lock);
1309		ASSERT(!MUTEX_HELD(lock));
1310		mutex_enter(lock);
1311		ASSERT(!list_link_active(&ab->b_arc_node));
1312		list_insert_head(list, ab);
1313		ASSERT(ab->b_datacnt > 0);
1314		atomic_add_64(size, ab->b_size * ab->b_datacnt);
1315		mutex_exit(lock);
1316	}
1317	return (cnt);
1318}
1319
1320/*
1321 * Move the supplied buffer to the indicated state.  The mutex
1322 * for the buffer must be held by the caller.
1323 */
1324static void
1325arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1326{
1327	arc_state_t *old_state = ab->b_state;
1328	int64_t refcnt = refcount_count(&ab->b_refcnt);
1329	uint64_t from_delta, to_delta;
1330	list_t *list;
1331	kmutex_t *lock;
1332
1333	ASSERT(MUTEX_HELD(hash_lock));
1334	ASSERT3P(new_state, !=, old_state);
1335	ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1336	ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1337	ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
1338
1339	from_delta = to_delta = ab->b_datacnt * ab->b_size;
1340
1341	/*
1342	 * If this buffer is evictable, transfer it from the
1343	 * old state list to the new state list.
1344	 */
1345	if (refcnt == 0) {
1346		if (old_state != arc_anon) {
1347			int use_mutex;
1348			uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1349
1350			get_buf_info(ab, old_state, &list, &lock);
1351			use_mutex = !MUTEX_HELD(lock);
1352			if (use_mutex)
1353				mutex_enter(lock);
1354
1355			ASSERT(list_link_active(&ab->b_arc_node));
1356			list_remove(list, ab);
1357
1358			/*
1359			 * If prefetching out of the ghost cache,
1360			 * we will have a non-zero datacnt.
1361			 */
1362			if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
1363				/* ghost elements have a ghost size */
1364				ASSERT(ab->b_buf == NULL);
1365				from_delta = ab->b_size;
1366			}
1367			ASSERT3U(*size, >=, from_delta);
1368			atomic_add_64(size, -from_delta);
1369
1370			if (use_mutex)
1371				mutex_exit(lock);
1372		}
1373		if (new_state != arc_anon) {
1374			int use_mutex;
1375			uint64_t *size = &new_state->arcs_lsize[ab->b_type];
1376
1377			get_buf_info(ab, new_state, &list, &lock);
1378			use_mutex = !MUTEX_HELD(lock);
1379			if (use_mutex)
1380				mutex_enter(lock);
1381
1382			list_insert_head(list, ab);
1383
1384			/* ghost elements have a ghost size */
1385			if (GHOST_STATE(new_state)) {
1386				ASSERT(ab->b_datacnt == 0);
1387				ASSERT(ab->b_buf == NULL);
1388				to_delta = ab->b_size;
1389			}
1390			atomic_add_64(size, to_delta);
1391
1392			if (use_mutex)
1393				mutex_exit(lock);
1394		}
1395	}
1396
1397	ASSERT(!BUF_EMPTY(ab));
1398	if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
1399		buf_hash_remove(ab);
1400
1401	/* adjust state sizes */
1402	if (to_delta)
1403		atomic_add_64(&new_state->arcs_size, to_delta);
1404	if (from_delta) {
1405		ASSERT3U(old_state->arcs_size, >=, from_delta);
1406		atomic_add_64(&old_state->arcs_size, -from_delta);
1407	}
1408	ab->b_state = new_state;
1409
1410	/* adjust l2arc hdr stats */
1411	if (new_state == arc_l2c_only)
1412		l2arc_hdr_stat_add();
1413	else if (old_state == arc_l2c_only)
1414		l2arc_hdr_stat_remove();
1415}
1416
1417void
1418arc_space_consume(uint64_t space, arc_space_type_t type)
1419{
1420	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1421
1422	switch (type) {
1423	case ARC_SPACE_DATA:
1424		ARCSTAT_INCR(arcstat_data_size, space);
1425		break;
1426	case ARC_SPACE_OTHER:
1427		ARCSTAT_INCR(arcstat_other_size, space);
1428		break;
1429	case ARC_SPACE_HDRS:
1430		ARCSTAT_INCR(arcstat_hdr_size, space);
1431		break;
1432	case ARC_SPACE_L2HDRS:
1433		ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1434		break;
1435	}
1436
1437	atomic_add_64(&arc_meta_used, space);
1438	atomic_add_64(&arc_size, space);
1439}
1440
1441void
1442arc_space_return(uint64_t space, arc_space_type_t type)
1443{
1444	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1445
1446	switch (type) {
1447	case ARC_SPACE_DATA:
1448		ARCSTAT_INCR(arcstat_data_size, -space);
1449		break;
1450	case ARC_SPACE_OTHER:
1451		ARCSTAT_INCR(arcstat_other_size, -space);
1452		break;
1453	case ARC_SPACE_HDRS:
1454		ARCSTAT_INCR(arcstat_hdr_size, -space);
1455		break;
1456	case ARC_SPACE_L2HDRS:
1457		ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1458		break;
1459	}
1460
1461	ASSERT(arc_meta_used >= space);
1462	if (arc_meta_max < arc_meta_used)
1463		arc_meta_max = arc_meta_used;
1464	atomic_add_64(&arc_meta_used, -space);
1465	ASSERT(arc_size >= space);
1466	atomic_add_64(&arc_size, -space);
1467}
1468
1469void *
1470arc_data_buf_alloc(uint64_t size)
1471{
1472	if (arc_evict_needed(ARC_BUFC_DATA))
1473		cv_signal(&arc_reclaim_thr_cv);
1474	atomic_add_64(&arc_size, size);
1475	return (zio_data_buf_alloc(size));
1476}
1477
1478void
1479arc_data_buf_free(void *buf, uint64_t size)
1480{
1481	zio_data_buf_free(buf, size);
1482	ASSERT(arc_size >= size);
1483	atomic_add_64(&arc_size, -size);
1484}
1485
1486arc_buf_t *
1487arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1488{
1489	arc_buf_hdr_t *hdr;
1490	arc_buf_t *buf;
1491
1492	ASSERT3U(size, >, 0);
1493	hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1494	ASSERT(BUF_EMPTY(hdr));
1495	hdr->b_size = size;
1496	hdr->b_type = type;
1497	hdr->b_spa = spa_load_guid(spa);
1498	hdr->b_state = arc_anon;
1499	hdr->b_arc_access = 0;
1500	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1501	buf->b_hdr = hdr;
1502	buf->b_data = NULL;
1503	buf->b_efunc = NULL;
1504	buf->b_private = NULL;
1505	buf->b_next = NULL;
1506	hdr->b_buf = buf;
1507	arc_get_data_buf(buf);
1508	hdr->b_datacnt = 1;
1509	hdr->b_flags = 0;
1510	ASSERT(refcount_is_zero(&hdr->b_refcnt));
1511	(void) refcount_add(&hdr->b_refcnt, tag);
1512
1513	return (buf);
1514}
1515
1516static char *arc_onloan_tag = "onloan";
1517
1518/*
1519 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1520 * flight data by arc_tempreserve_space() until they are "returned". Loaned
1521 * buffers must be returned to the arc before they can be used by the DMU or
1522 * freed.
1523 */
1524arc_buf_t *
1525arc_loan_buf(spa_t *spa, int size)
1526{
1527	arc_buf_t *buf;
1528
1529	buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1530
1531	atomic_add_64(&arc_loaned_bytes, size);
1532	return (buf);
1533}
1534
1535/*
1536 * Return a loaned arc buffer to the arc.
1537 */
1538void
1539arc_return_buf(arc_buf_t *buf, void *tag)
1540{
1541	arc_buf_hdr_t *hdr = buf->b_hdr;
1542
1543	ASSERT(buf->b_data != NULL);
1544	(void) refcount_add(&hdr->b_refcnt, tag);
1545	(void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
1546
1547	atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1548}
1549
1550/* Detach an arc_buf from a dbuf (tag) */
1551void
1552arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1553{
1554	arc_buf_hdr_t *hdr;
1555
1556	ASSERT(buf->b_data != NULL);
1557	hdr = buf->b_hdr;
1558	(void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
1559	(void) refcount_remove(&hdr->b_refcnt, tag);
1560	buf->b_efunc = NULL;
1561	buf->b_private = NULL;
1562
1563	atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1564}
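
/*
 * Sketch of the loaned-buffer lifecycle (illustrative only; "db_tag" stands
 * in for whatever tag the eventual owner uses):
 *
 *	buf = arc_loan_buf(spa, size);		(counted in arc_loaned_bytes)
 *	... fill buf->b_data ...
 *	arc_return_buf(buf, db_tag);		(ownership handed back to the ARC)
 *
 * Conversely, arc_loan_inuse_buf(buf, db_tag) detaches a buffer that is
 * already referenced under "db_tag" and places it back on loan.
 */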
1565
1566static arc_buf_t *
1567arc_buf_clone(arc_buf_t *from)
1568{
1569	arc_buf_t *buf;
1570	arc_buf_hdr_t *hdr = from->b_hdr;
1571	uint64_t size = hdr->b_size;
1572
1573	ASSERT(hdr->b_state != arc_anon);
1574
1575	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1576	buf->b_hdr = hdr;
1577	buf->b_data = NULL;
1578	buf->b_efunc = NULL;
1579	buf->b_private = NULL;
1580	buf->b_next = hdr->b_buf;
1581	hdr->b_buf = buf;
1582	arc_get_data_buf(buf);
1583	bcopy(from->b_data, buf->b_data, size);
1584
1585	/*
1586	 * This buffer already exists in the arc so create a duplicate
1587	 * copy for the caller.  If the buffer is associated with user data
1588	 * then track the size and number of duplicates.  These stats will be
1589	 * updated as duplicate buffers are created and destroyed.
1590	 */
1591	if (hdr->b_type == ARC_BUFC_DATA) {
1592		ARCSTAT_BUMP(arcstat_duplicate_buffers);
1593		ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1594	}
1595	hdr->b_datacnt += 1;
1596	return (buf);
1597}
1598
1599void
1600arc_buf_add_ref(arc_buf_t *buf, void* tag)
1601{
1602	arc_buf_hdr_t *hdr;
1603	kmutex_t *hash_lock;
1604
1605	/*
1606	 * Check to see if this buffer is evicted.  Callers
1607	 * must verify b_data != NULL to know if the add_ref
1608	 * was successful.
1609	 */
1610	mutex_enter(&buf->b_evict_lock);
1611	if (buf->b_data == NULL) {
1612		mutex_exit(&buf->b_evict_lock);
1613		return;
1614	}
1615	hash_lock = HDR_LOCK(buf->b_hdr);
1616	mutex_enter(hash_lock);
1617	hdr = buf->b_hdr;
1618	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1619	mutex_exit(&buf->b_evict_lock);
1620
1621	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1622	add_reference(hdr, hash_lock, tag);
1623	DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1624	arc_access(hdr, hash_lock);
1625	mutex_exit(hash_lock);
1626	ARCSTAT_BUMP(arcstat_hits);
1627	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1628	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1629	    data, metadata, hits);
1630}
1631
1632/*
1633 * Free the arc data buffer.  If it is an l2arc write in progress,
1634 * the buffer is placed on l2arc_free_on_write to be freed later.
1635 */
1636static void
1637arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
1638{
1639	arc_buf_hdr_t *hdr = buf->b_hdr;
1640
1641	if (HDR_L2_WRITING(hdr)) {
1642		l2arc_data_free_t *df;
1643		df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1644		df->l2df_data = buf->b_data;
1645		df->l2df_size = hdr->b_size;
1646		df->l2df_func = free_func;
1647		mutex_enter(&l2arc_free_on_write_mtx);
1648		list_insert_head(l2arc_free_on_write, df);
1649		mutex_exit(&l2arc_free_on_write_mtx);
1650		ARCSTAT_BUMP(arcstat_l2_free_on_write);
1651	} else {
1652		free_func(buf->b_data, hdr->b_size);
1653	}
1654}
1655
1656/*
1657 * Free up buf->b_data and if 'remove' is set, then pull the
 1658 * arc_buf_t off of the arc_buf_hdr_t's list and free it.
1659 */
1660static void
1661arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
1662{
1663	arc_buf_t **bufp;
1664
1665	/* free up data associated with the buf */
1666	if (buf->b_data) {
1667		arc_state_t *state = buf->b_hdr->b_state;
1668		uint64_t size = buf->b_hdr->b_size;
1669		arc_buf_contents_t type = buf->b_hdr->b_type;
1670
1671		arc_cksum_verify(buf);
1672#ifdef illumos
1673		arc_buf_unwatch(buf);
1674#endif /* illumos */
1675
1676		if (!recycle) {
1677			if (type == ARC_BUFC_METADATA) {
1678				arc_buf_data_free(buf, zio_buf_free);
1679				arc_space_return(size, ARC_SPACE_DATA);
1680			} else {
1681				ASSERT(type == ARC_BUFC_DATA);
1682				arc_buf_data_free(buf, zio_data_buf_free);
1683				ARCSTAT_INCR(arcstat_data_size, -size);
1684				atomic_add_64(&arc_size, -size);
1685			}
1686		}
1687		if (list_link_active(&buf->b_hdr->b_arc_node)) {
1688			uint64_t *cnt = &state->arcs_lsize[type];
1689
1690			ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1691			ASSERT(state != arc_anon);
1692
1693			ASSERT3U(*cnt, >=, size);
1694			atomic_add_64(cnt, -size);
1695		}
1696		ASSERT3U(state->arcs_size, >=, size);
1697		atomic_add_64(&state->arcs_size, -size);
1698		buf->b_data = NULL;
1699
1700		/*
1701		 * If we're destroying a duplicate buffer make sure
1702		 * that the appropriate statistics are updated.
1703		 */
1704		if (buf->b_hdr->b_datacnt > 1 &&
1705		    buf->b_hdr->b_type == ARC_BUFC_DATA) {
1706			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
1707			ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
1708		}
1709		ASSERT(buf->b_hdr->b_datacnt > 0);
1710		buf->b_hdr->b_datacnt -= 1;
1711	}
1712
1713	/* only remove the buf if requested */
1714	if (!remove)
1715		return;
1716
1717	/* remove the buf from the hdr list */
1718	for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1719		continue;
1720	*bufp = buf->b_next;
1721	buf->b_next = NULL;
1722
1723	ASSERT(buf->b_efunc == NULL);
1724
1725	/* clean up the buf */
1726	buf->b_hdr = NULL;
1727	kmem_cache_free(buf_cache, buf);
1728}
1729
1730static void
1731arc_hdr_destroy(arc_buf_hdr_t *hdr)
1732{
1733	ASSERT(refcount_is_zero(&hdr->b_refcnt));
1734	ASSERT3P(hdr->b_state, ==, arc_anon);
1735	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1736	l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1737
1738	if (l2hdr != NULL) {
1739		boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1740		/*
1741		 * To prevent arc_free() and l2arc_evict() from
1742		 * attempting to free the same buffer at the same time,
1743		 * a FREE_IN_PROGRESS flag is given to arc_free() to
1744		 * give it priority.  l2arc_evict() can't destroy this
1745		 * header while we are waiting on l2arc_buflist_mtx.
1746		 *
1747		 * The hdr may be removed from l2ad_buflist before we
1748		 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1749		 */
1750		if (!buflist_held) {
1751			mutex_enter(&l2arc_buflist_mtx);
1752			l2hdr = hdr->b_l2hdr;
1753		}
1754
1755		if (l2hdr != NULL) {
1756			trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr,
1757			    hdr->b_size, 0);
1758			list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1759			ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1760			ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
1761			vdev_space_update(l2hdr->b_dev->l2ad_vdev,
1762			    -l2hdr->b_asize, 0, 0);
1763			kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
1764			if (hdr->b_state == arc_l2c_only)
1765				l2arc_hdr_stat_remove();
1766			hdr->b_l2hdr = NULL;
1767		}
1768
1769		if (!buflist_held)
1770			mutex_exit(&l2arc_buflist_mtx);
1771	}
1772
1773	if (!BUF_EMPTY(hdr)) {
1774		ASSERT(!HDR_IN_HASH_TABLE(hdr));
1775		buf_discard_identity(hdr);
1776	}
1777	while (hdr->b_buf) {
1778		arc_buf_t *buf = hdr->b_buf;
1779
1780		if (buf->b_efunc) {
1781			mutex_enter(&arc_eviction_mtx);
1782			mutex_enter(&buf->b_evict_lock);
1783			ASSERT(buf->b_hdr != NULL);
1784			arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1785			hdr->b_buf = buf->b_next;
1786			buf->b_hdr = &arc_eviction_hdr;
1787			buf->b_next = arc_eviction_list;
1788			arc_eviction_list = buf;
1789			mutex_exit(&buf->b_evict_lock);
1790			mutex_exit(&arc_eviction_mtx);
1791		} else {
1792			arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1793		}
1794	}
1795	if (hdr->b_freeze_cksum != NULL) {
1796		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1797		hdr->b_freeze_cksum = NULL;
1798	}
1799	if (hdr->b_thawed) {
1800		kmem_free(hdr->b_thawed, 1);
1801		hdr->b_thawed = NULL;
1802	}
1803
1804	ASSERT(!list_link_active(&hdr->b_arc_node));
1805	ASSERT3P(hdr->b_hash_next, ==, NULL);
1806	ASSERT3P(hdr->b_acb, ==, NULL);
1807	kmem_cache_free(hdr_cache, hdr);
1808}
1809
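/*
 * Drop the caller's hold on a buffer that has no eviction callback.  If
 * the header is hashed, the buffer is destroyed (or, when it is the last
 * copy, simply marked ARC_BUF_AVAILABLE).  If an async write is still in
 * progress, destruction of the header is deferred to the write-completion
 * path.  Otherwise the buffer is anonymous and is torn down here.
 */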
1810void
1811arc_buf_free(arc_buf_t *buf, void *tag)
1812{
1813	arc_buf_hdr_t *hdr = buf->b_hdr;
1814	int hashed = hdr->b_state != arc_anon;
1815
1816	ASSERT(buf->b_efunc == NULL);
1817	ASSERT(buf->b_data != NULL);
1818
1819	if (hashed) {
1820		kmutex_t *hash_lock = HDR_LOCK(hdr);
1821
1822		mutex_enter(hash_lock);
1823		hdr = buf->b_hdr;
1824		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1825
1826		(void) remove_reference(hdr, hash_lock, tag);
1827		if (hdr->b_datacnt > 1) {
1828			arc_buf_destroy(buf, FALSE, TRUE);
1829		} else {
1830			ASSERT(buf == hdr->b_buf);
1831			ASSERT(buf->b_efunc == NULL);
1832			hdr->b_flags |= ARC_BUF_AVAILABLE;
1833		}
1834		mutex_exit(hash_lock);
1835	} else if (HDR_IO_IN_PROGRESS(hdr)) {
1836		int destroy_hdr;
1837		/*
1838		 * We are in the middle of an async write.  Don't destroy
1839		 * this buffer unless the write completes before we finish
1840		 * decrementing the reference count.
1841		 */
1842		mutex_enter(&arc_eviction_mtx);
1843		(void) remove_reference(hdr, NULL, tag);
1844		ASSERT(refcount_is_zero(&hdr->b_refcnt));
1845		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1846		mutex_exit(&arc_eviction_mtx);
1847		if (destroy_hdr)
1848			arc_hdr_destroy(hdr);
1849	} else {
1850		if (remove_reference(hdr, NULL, tag) > 0)
1851			arc_buf_destroy(buf, FALSE, TRUE);
1852		else
1853			arc_hdr_destroy(hdr);
1854	}
1855}
1856
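/*
 * Drop the hold identified by 'tag'.  If the buffer has no eviction
 * callback it is destroyed (or marked ARC_BUF_AVAILABLE when it is the
 * last copy); buffers with a callback are kept for the callback's owner.
 * Returns B_TRUE if no callback was registered, i.e. if this call released
 * the buffer itself.
 */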
1857boolean_t
1858arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1859{
1860	arc_buf_hdr_t *hdr = buf->b_hdr;
1861	kmutex_t *hash_lock = HDR_LOCK(hdr);
1862	boolean_t no_callback = (buf->b_efunc == NULL);
1863
1864	if (hdr->b_state == arc_anon) {
1865		ASSERT(hdr->b_datacnt == 1);
1866		arc_buf_free(buf, tag);
1867		return (no_callback);
1868	}
1869
1870	mutex_enter(hash_lock);
1871	hdr = buf->b_hdr;
1872	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1873	ASSERT(hdr->b_state != arc_anon);
1874	ASSERT(buf->b_data != NULL);
1875
1876	(void) remove_reference(hdr, hash_lock, tag);
1877	if (hdr->b_datacnt > 1) {
1878		if (no_callback)
1879			arc_buf_destroy(buf, FALSE, TRUE);
1880	} else if (no_callback) {
1881		ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1882		ASSERT(buf->b_efunc == NULL);
1883		hdr->b_flags |= ARC_BUF_AVAILABLE;
1884	}
1885	ASSERT(no_callback || hdr->b_datacnt > 1 ||
1886	    refcount_is_zero(&hdr->b_refcnt));
1887	mutex_exit(hash_lock);
1888	return (no_callback);
1889}
1890
1891int
1892arc_buf_size(arc_buf_t *buf)
1893{
1894	return (buf->b_hdr->b_size);
1895}
1896
1897/*
1898 * Called from the DMU to determine if the current buffer should be
1899 * evicted. In order to ensure proper locking, the eviction must be initiated
1900 * from the DMU. Return true if the buffer is associated with user data and
1901 * duplicate buffers still exist.
1902 */
1903boolean_t
1904arc_buf_eviction_needed(arc_buf_t *buf)
1905{
1906	arc_buf_hdr_t *hdr;
1907	boolean_t evict_needed = B_FALSE;
1908
1909	if (zfs_disable_dup_eviction)
1910		return (B_FALSE);
1911
1912	mutex_enter(&buf->b_evict_lock);
1913	hdr = buf->b_hdr;
1914	if (hdr == NULL) {
1915		/*
1916		 * We are in arc_do_user_evicts(); let that function
1917		 * perform the eviction.
1918		 */
1919		ASSERT(buf->b_data == NULL);
1920		mutex_exit(&buf->b_evict_lock);
1921		return (B_FALSE);
1922	} else if (buf->b_data == NULL) {
1923		/*
1924		 * We have already been added to the arc eviction list;
1925		 * recommend eviction.
1926		 */
1927		ASSERT3P(hdr, ==, &arc_eviction_hdr);
1928		mutex_exit(&buf->b_evict_lock);
1929		return (B_TRUE);
1930	}
1931
1932	if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
1933		evict_needed = B_TRUE;
1934
1935	mutex_exit(&buf->b_evict_lock);
1936	return (evict_needed);
1937}
1938
1939/*
1940 * Evict buffers from list until we've removed the specified number of
1941 * bytes.  Move the removed buffers to the appropriate evict state.
1942 * If the recycle flag is set, then attempt to "recycle" a buffer:
1943 * - look for a buffer to evict that is `bytes' long.
1944 * - return the data block from this buffer rather than freeing it.
1945 * This flag is used by callers that are trying to make space for a
1946 * new buffer in a full arc cache.
1947 *
1948 * This function makes a "best effort".  It skips over any buffers
1949 * it can't get a hash_lock on, and so may not catch all candidates.
1950 * It may also return without evicting as much space as requested.
1951 */
1952static void *
1953arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
1954    arc_buf_contents_t type)
1955{
1956	arc_state_t *evicted_state;
1957	uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1958	int64_t bytes_remaining;
1959	arc_buf_hdr_t *ab, *ab_prev = NULL;
1960	list_t *evicted_list, *list, *evicted_list_start, *list_start;
1961	kmutex_t *lock, *evicted_lock;
1962	kmutex_t *hash_lock;
1963	boolean_t have_lock;
1964	void *stolen = NULL;
1965	arc_buf_hdr_t marker = { 0 };
1966	int count = 0;
1967	static int evict_metadata_offset, evict_data_offset;
1968	int i, idx, offset, list_count, lists;
1969
1970	ASSERT(state == arc_mru || state == arc_mfu);
1971
1972	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1973
1974	if (type == ARC_BUFC_METADATA) {
1975		offset = 0;
1976		list_count = ARC_BUFC_NUMMETADATALISTS;
1977		list_start = &state->arcs_lists[0];
1978		evicted_list_start = &evicted_state->arcs_lists[0];
1979		idx = evict_metadata_offset;
1980	} else {
1981		offset = ARC_BUFC_NUMMETADATALISTS;
1982		list_start = &state->arcs_lists[offset];
1983		evicted_list_start = &evicted_state->arcs_lists[offset];
1984		list_count = ARC_BUFC_NUMDATALISTS;
1985		idx = evict_data_offset;
1986	}
1987	bytes_remaining = evicted_state->arcs_lsize[type];
1988	lists = 0;
1989
1990evict_start:
1991	list = &list_start[idx];
1992	evicted_list = &evicted_list_start[idx];
1993	lock = ARCS_LOCK(state, (offset + idx));
1994	evicted_lock = ARCS_LOCK(evicted_state, (offset + idx));
1995
1996	mutex_enter(lock);
1997	mutex_enter(evicted_lock);
1998
1999	for (ab = list_tail(list); ab; ab = ab_prev) {
2000		ab_prev = list_prev(list, ab);
2001		bytes_remaining -= (ab->b_size * ab->b_datacnt);
2002		/* prefetch buffers have a minimum lifespan */
2003		if (HDR_IO_IN_PROGRESS(ab) ||
2004		    (spa && ab->b_spa != spa) ||
2005		    (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
2006		    ddi_get_lbolt() - ab->b_arc_access <
2007		    arc_min_prefetch_lifespan)) {
2008			skipped++;
2009			continue;
2010		}
2011		/* "lookahead" for better eviction candidate */
2012		if (recycle && ab->b_size != bytes &&
2013		    ab_prev && ab_prev->b_size == bytes)
2014			continue;
2015
2016		/* ignore markers */
2017		if (ab->b_spa == 0)
2018			continue;
2019
2020		/*
2021		 * It may take a long time to evict all the bufs requested.
2022		 * To avoid blocking all arc activity, periodically drop
2023		 * the arcs_mtx and give other threads a chance to run
2024		 * before reacquiring the lock.
2025		 *
2026		 * If we are looking for a buffer to recycle, we are in
2027		 * the hot code path, so don't sleep.
2028		 */
2029		if (!recycle && count++ > arc_evict_iterations) {
2030			list_insert_after(list, ab, &marker);
2031			mutex_exit(evicted_lock);
2032			mutex_exit(lock);
2033			kpreempt(KPREEMPT_SYNC);
2034			mutex_enter(lock);
2035			mutex_enter(evicted_lock);
2036			ab_prev = list_prev(list, &marker);
2037			list_remove(list, &marker);
2038			count = 0;
2039			continue;
2040		}
2041
2042		hash_lock = HDR_LOCK(ab);
2043		have_lock = MUTEX_HELD(hash_lock);
2044		if (have_lock || mutex_tryenter(hash_lock)) {
2045			ASSERT0(refcount_count(&ab->b_refcnt));
2046			ASSERT(ab->b_datacnt > 0);
2047			while (ab->b_buf) {
2048				arc_buf_t *buf = ab->b_buf;
2049				if (!mutex_tryenter(&buf->b_evict_lock)) {
2050					missed += 1;
2051					break;
2052				}
2053				if (buf->b_data) {
2054					bytes_evicted += ab->b_size;
2055					if (recycle && ab->b_type == type &&
2056					    ab->b_size == bytes &&
2057					    !HDR_L2_WRITING(ab)) {
2058						stolen = buf->b_data;
2059						recycle = FALSE;
2060					}
2061				}
2062				if (buf->b_efunc) {
2063					mutex_enter(&arc_eviction_mtx);
2064					arc_buf_destroy(buf,
2065					    buf->b_data == stolen, FALSE);
2066					ab->b_buf = buf->b_next;
2067					buf->b_hdr = &arc_eviction_hdr;
2068					buf->b_next = arc_eviction_list;
2069					arc_eviction_list = buf;
2070					mutex_exit(&arc_eviction_mtx);
2071					mutex_exit(&buf->b_evict_lock);
2072				} else {
2073					mutex_exit(&buf->b_evict_lock);
2074					arc_buf_destroy(buf,
2075					    buf->b_data == stolen, TRUE);
2076				}
2077			}
2078
2079			if (ab->b_l2hdr) {
2080				ARCSTAT_INCR(arcstat_evict_l2_cached,
2081				    ab->b_size);
2082			} else {
2083				if (l2arc_write_eligible(ab->b_spa, ab)) {
2084					ARCSTAT_INCR(arcstat_evict_l2_eligible,
2085					    ab->b_size);
2086				} else {
2087					ARCSTAT_INCR(
2088					    arcstat_evict_l2_ineligible,
2089					    ab->b_size);
2090				}
2091			}
2092
2093			if (ab->b_datacnt == 0) {
2094				arc_change_state(evicted_state, ab, hash_lock);
2095				ASSERT(HDR_IN_HASH_TABLE(ab));
2096				ab->b_flags |= ARC_IN_HASH_TABLE;
2097				ab->b_flags &= ~ARC_BUF_AVAILABLE;
2098				DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
2099			}
2100			if (!have_lock)
2101				mutex_exit(hash_lock);
2102			if (bytes >= 0 && bytes_evicted >= bytes)
2103				break;
2104			if (bytes_remaining > 0) {
2105				mutex_exit(evicted_lock);
2106				mutex_exit(lock);
2107				idx  = ((idx + 1) & (list_count - 1));
2108				lists++;
2109				goto evict_start;
2110			}
2111		} else {
2112			missed += 1;
2113		}
2114	}
2115
2116	mutex_exit(evicted_lock);
2117	mutex_exit(lock);
2118
2119	idx  = ((idx + 1) & (list_count - 1));
2120	lists++;
2121
2122	if (bytes_evicted < bytes) {
2123		if (lists < list_count)
2124			goto evict_start;
2125		else
2126			dprintf("only evicted %lld bytes from %x",
2127			    (longlong_t)bytes_evicted, state);
2128	}
2129	if (type == ARC_BUFC_METADATA)
2130		evict_metadata_offset = idx;
2131	else
2132		evict_data_offset = idx;
2133
2134	if (skipped)
2135		ARCSTAT_INCR(arcstat_evict_skip, skipped);
2136
2137	if (missed)
2138		ARCSTAT_INCR(arcstat_mutex_miss, missed);
2139
2140	/*
2141	 * Note: we have just evicted some data into the ghost state,
2142	 * potentially putting the ghost size over the desired size.  Rather
2143	 * than evicting from the ghost list in this hot code path, leave
2144	 * this chore to the arc_reclaim_thread().
2145	 */
2146
2147	if (stolen)
2148		ARCSTAT_BUMP(arcstat_stolen);
2149	return (stolen);
2150}
2151
2152/*
2153 * Remove buffers from list until we've removed the specified number of
2154 * bytes.  Destroy the buffers that are removed.
2155 */
2156static void
2157arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
2158{
2159	arc_buf_hdr_t *ab, *ab_prev;
2160	arc_buf_hdr_t marker = { 0 };
2161	list_t *list, *list_start;
2162	kmutex_t *hash_lock, *lock;
2163	uint64_t bytes_deleted = 0;
2164	uint64_t bufs_skipped = 0;
2165	int count = 0;
2166	static int evict_offset;
2167	int list_count, idx = evict_offset;
2168	int offset, lists = 0;
2169
2170	ASSERT(GHOST_STATE(state));
2171
2172	/*
2173	 * data lists come after metadata lists
2174	 */
2175	list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS];
2176	list_count = ARC_BUFC_NUMDATALISTS;
2177	offset = ARC_BUFC_NUMMETADATALISTS;
2178
2179evict_start:
2180	list = &list_start[idx];
2181	lock = ARCS_LOCK(state, idx + offset);
2182
2183	mutex_enter(lock);
2184	for (ab = list_tail(list); ab; ab = ab_prev) {
2185		ab_prev = list_prev(list, ab);
2186		if (ab->b_type > ARC_BUFC_NUMTYPES)
2187			panic("invalid ab=%p", (void *)ab);
2188		if (spa && ab->b_spa != spa)
2189			continue;
2190
2191		/* ignore markers */
2192		if (ab->b_spa == 0)
2193			continue;
2194
2195		hash_lock = HDR_LOCK(ab);
2196		/* caller may be trying to modify this buffer, skip it */
2197		if (MUTEX_HELD(hash_lock))
2198			continue;
2199
2200		/*
2201		 * It may take a long time to evict all the bufs requested.
2202		 * To avoid blocking all arc activity, periodically drop
2203		 * the arcs_mtx and give other threads a chance to run
2204		 * before reacquiring the lock.
2205		 */
2206		if (count++ > arc_evict_iterations) {
2207			list_insert_after(list, ab, &marker);
2208			mutex_exit(lock);
2209			kpreempt(KPREEMPT_SYNC);
2210			mutex_enter(lock);
2211			ab_prev = list_prev(list, &marker);
2212			list_remove(list, &marker);
2213			count = 0;
2214			continue;
2215		}
2216		if (mutex_tryenter(hash_lock)) {
2217			ASSERT(!HDR_IO_IN_PROGRESS(ab));
2218			ASSERT(ab->b_buf == NULL);
2219			ARCSTAT_BUMP(arcstat_deleted);
2220			bytes_deleted += ab->b_size;
2221
2222			if (ab->b_l2hdr != NULL) {
2223				/*
2224				 * This buffer is cached on the 2nd Level ARC;
2225				 * don't destroy the header.
2226				 */
2227				arc_change_state(arc_l2c_only, ab, hash_lock);
2228				mutex_exit(hash_lock);
2229			} else {
2230				arc_change_state(arc_anon, ab, hash_lock);
2231				mutex_exit(hash_lock);
2232				arc_hdr_destroy(ab);
2233			}
2234
2235			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
2236			if (bytes >= 0 && bytes_deleted >= bytes)
2237				break;
2238		} else if (bytes < 0) {
2239			/*
2240			 * Insert a list marker and then wait for the
2241			 * hash lock to become available. Once it's
2242			 * available, restart from where we left off.
2243			 */
2244			list_insert_after(list, ab, &marker);
2245			mutex_exit(lock);
2246			mutex_enter(hash_lock);
2247			mutex_exit(hash_lock);
2248			mutex_enter(lock);
2249			ab_prev = list_prev(list, &marker);
2250			list_remove(list, &marker);
2251		} else {
2252			bufs_skipped += 1;
2253		}
2254
2255	}
2256	mutex_exit(lock);
2257	idx  = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1));
2258	lists++;
2259
2260	if (lists < list_count)
2261		goto evict_start;
2262
2263	evict_offset = idx;
2264	if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] &&
2265	    (bytes < 0 || bytes_deleted < bytes)) {
2266		list_start = &state->arcs_lists[0];
2267		list_count = ARC_BUFC_NUMMETADATALISTS;
2268		offset = lists = 0;
2269		goto evict_start;
2270	}
2271
2272	if (bufs_skipped) {
2273		ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
2274		ASSERT(bytes >= 0);
2275	}
2276
2277	if (bytes_deleted < bytes)
2278		dprintf("only deleted %lld bytes from %p",
2279		    (longlong_t)bytes_deleted, state);
2280}
2281
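/*
 * Evict just enough from each list to bring the ARC back within its
 * targets: first MRU data and metadata (toward arc_p), then MFU (toward
 * arc_c), and finally the ghost lists so that mru + mru_ghost and
 * mru_ghost + mfu_ghost each stay within arc_c.
 */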
2282static void
2283arc_adjust(void)
2284{
2285	int64_t adjustment, delta;
2286
2287	/*
2288	 * Adjust MRU size
2289	 */
2290
2291	adjustment = MIN((int64_t)(arc_size - arc_c),
2292	    (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
2293	    arc_p));
2294
2295	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
2296		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
2297		(void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
2298		adjustment -= delta;
2299	}
2300
2301	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2302		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
2303		(void) arc_evict(arc_mru, 0, delta, FALSE,
2304		    ARC_BUFC_METADATA);
2305	}
2306
2307	/*
2308	 * Adjust MFU size
2309	 */
2310
2311	adjustment = arc_size - arc_c;
2312
2313	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
2314		delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
2315		(void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
2316		adjustment -= delta;
2317	}
2318
2319	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2320		int64_t delta = MIN(adjustment,
2321		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
2322		(void) arc_evict(arc_mfu, 0, delta, FALSE,
2323		    ARC_BUFC_METADATA);
2324	}
2325
2326	/*
2327	 * Adjust ghost lists
2328	 */
2329
2330	adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2331
2332	if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2333		delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2334		arc_evict_ghost(arc_mru_ghost, 0, delta);
2335	}
2336
2337	adjustment =
2338	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2339
2340	if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2341		delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2342		arc_evict_ghost(arc_mfu_ghost, 0, delta);
2343	}
2344}
2345
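/*
 * Run the deferred eviction callbacks for buffers that were placed on
 * arc_eviction_list by arc_evict() and arc_hdr_destroy(), then free the
 * arc_buf_t structures themselves.
 */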
2346static void
2347arc_do_user_evicts(void)
2348{
2349	static arc_buf_t *tmp_arc_eviction_list;
2350
2351	/*
2352	 * Move the list aside to avoid a lock order reversal (LOR).
2353	 */
2354restart:
2355	mutex_enter(&arc_eviction_mtx);
2356	tmp_arc_eviction_list = arc_eviction_list;
2357	arc_eviction_list = NULL;
2358	mutex_exit(&arc_eviction_mtx);
2359
2360	while (tmp_arc_eviction_list != NULL) {
2361		arc_buf_t *buf = tmp_arc_eviction_list;
2362		tmp_arc_eviction_list = buf->b_next;
2363		mutex_enter(&buf->b_evict_lock);
2364		buf->b_hdr = NULL;
2365		mutex_exit(&buf->b_evict_lock);
2366
2367		if (buf->b_efunc != NULL)
2368			VERIFY0(buf->b_efunc(buf->b_private));
2369
2370		buf->b_efunc = NULL;
2371		buf->b_private = NULL;
2372		kmem_cache_free(buf_cache, buf);
2373	}
2374
2375	if (arc_eviction_list != NULL)
2376		goto restart;
2377}
2378
2379/*
2380 * Flush all *evictable* data from the cache for the given spa.
2381 * NOTE: this will not touch "active" (i.e. referenced) data.
2382 */
2383void
2384arc_flush(spa_t *spa)
2385{
2386	uint64_t guid = 0;
2387
2388	if (spa)
2389		guid = spa_load_guid(spa);
2390
2391	while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) {
2392		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
2393		if (spa)
2394			break;
2395	}
2396	while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) {
2397		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
2398		if (spa)
2399			break;
2400	}
2401	while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) {
2402		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
2403		if (spa)
2404			break;
2405	}
2406	while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) {
2407		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
2408		if (spa)
2409			break;
2410	}
2411
2412	arc_evict_ghost(arc_mru_ghost, guid, -1);
2413	arc_evict_ghost(arc_mfu_ghost, guid, -1);
2414
2415	mutex_enter(&arc_reclaim_thr_lock);
2416	arc_do_user_evicts();
2417	mutex_exit(&arc_reclaim_thr_lock);
2418	ASSERT(spa || arc_eviction_list == NULL);
2419}
2420
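/*
 * Reduce the target cache size (arc_c) by 1/2^arc_shrink_shift, scale
 * arc_p down with it, and evict immediately if the cache is now over the
 * new target.
 */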
2421void
2422arc_shrink(void)
2423{
2424	if (arc_c > arc_c_min) {
2425		uint64_t to_free;
2426
2428		to_free = arc_c >> arc_shrink_shift;
2432		if (arc_c > arc_c_min + to_free)
2433			atomic_add_64(&arc_c, -to_free);
2434		else
2435			arc_c = arc_c_min;
2436
2437		atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2438		if (arc_c > arc_size)
2439			arc_c = MAX(arc_size, arc_c_min);
2440		if (arc_p > arc_c)
2441			arc_p = (arc_c >> 1);
2442		ASSERT(arc_c >= arc_c_min);
2443		ASSERT((int64_t)arc_p >= 0);
2444	}
2445
2446	if (arc_size > arc_c)
2447		arc_adjust();
2448}
2449
2450static int needfree = 0;
2451
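/*
 * Decide whether the ARC should give memory back to the system: either our
 * lowmem hook has fired (needfree), the pagedaemon wants to scan, or (on
 * FreeBSD) the kmem arena is more than 3/4 full.
 */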
2452static int
2453arc_reclaim_needed(void)
2454{
2455
2456#ifdef _KERNEL
2457
2458	if (needfree)
2459		return (1);
2460
2461	/*
2462	 * Cooperate with pagedaemon when it's time for it to scan
2463	 * and reclaim some pages.
2464	 */
2465	if (vm_paging_needed())
2466		return (1);
2467
2468#ifdef sun
2469	/*
2470	 * take 'desfree' extra pages, so we reclaim sooner, rather than later
2471	 */
2472	extra = desfree;
2473
2474	/*
2475	 * check that we're out of range of the pageout scanner.  It starts to
2476	 * schedule paging if freemem is less than lotsfree and needfree.
2477	 * lotsfree is the high-water mark for pageout, and needfree is the
2478	 * number of needed free pages.  We add extra pages here to make sure
2479	 * the scanner doesn't start up while we're freeing memory.
2480	 */
2481	if (freemem < lotsfree + needfree + extra)
2482		return (1);
2483
2484	/*
2485	 * check to make sure that swapfs has enough space so that anon
2486	 * reservations can still succeed. anon_resvmem() checks that the
2487	 * availrmem is greater than swapfs_minfree, and the number of reserved
2488	 * swap pages.  We also add a bit of extra here just to prevent
2489	 * circumstances from getting really dire.
2490	 */
2491	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
2492		return (1);
2493
2494#if defined(__i386)
2495	/*
2496	 * If we're on an i386 platform, it's possible that we'll exhaust the
2497	 * kernel heap space before we ever run out of available physical
2498	 * memory.  Most checks of the size of the heap_area compare against
2499	 * tune.t_minarmem, which is the minimum available real memory that we
2500	 * can have in the system.  However, this is generally fixed at 25 pages
2501	 * which is so low that it's useless.  In this comparison, we seek to
2502	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
2503	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
2504	 * free)
2505	 */
2506	if (btop(vmem_size(heap_arena, VMEM_FREE)) <
2507	    (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
2508		return (1);
2509#endif
2510#else	/* !sun */
2511	if (kmem_used() > (kmem_size() * 3) / 4)
2512		return (1);
2513#endif	/* sun */
2514
2515#else
2516	if (spa_get_random(100) == 0)
2517		return (1);
2518#endif
2519	return (0);
2520}
2521
2522extern kmem_cache_t	*zio_buf_cache[];
2523extern kmem_cache_t	*zio_data_buf_cache[];
2524
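/*
 * Return memory to the kernel: purge DNLC entries if the metadata limit
 * has been exceeded, shrink the cache targets on an aggressive reclaim,
 * and reap unused memory from the zio buffer caches and the ARC's own
 * buf and hdr caches.
 */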
2525static void
2526arc_kmem_reap_now(arc_reclaim_strategy_t strat)
2527{
2528	size_t			i;
2529	kmem_cache_t		*prev_cache = NULL;
2530	kmem_cache_t		*prev_data_cache = NULL;
2531
2532#ifdef _KERNEL
2533	if (arc_meta_used >= arc_meta_limit) {
2534		/*
2535		 * We are exceeding our meta-data cache limit.
2536		 * Purge some DNLC entries to release holds on meta-data.
2537		 */
2538		dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
2539	}
2540#if defined(__i386)
2541	/*
2542	 * Reclaim unused memory from all kmem caches.
2543	 */
2544	kmem_reap();
2545#endif
2546#endif
2547
2548	/*
2549	 * An aggressive reclamation will shrink the cache size as well as
2550	 * reap free buffers from the arc kmem caches.
2551	 */
2552	if (strat == ARC_RECLAIM_AGGR)
2553		arc_shrink();
2554
2555	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2556		if (zio_buf_cache[i] != prev_cache) {
2557			prev_cache = zio_buf_cache[i];
2558			kmem_cache_reap_now(zio_buf_cache[i]);
2559		}
2560		if (zio_data_buf_cache[i] != prev_data_cache) {
2561			prev_data_cache = zio_data_buf_cache[i];
2562			kmem_cache_reap_now(zio_data_buf_cache[i]);
2563		}
2564	}
2565	kmem_cache_reap_now(buf_cache);
2566	kmem_cache_reap_now(hdr_cache);
2567}
2568
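/*
 * Main ARC housekeeping thread.  Once a second (or when signalled) it
 * checks for memory pressure, reaps the kmem caches, re-enables cache
 * growth once the pressure has passed, runs arc_adjust(), and processes
 * any deferred user eviction callbacks.
 */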
2569static void
2570arc_reclaim_thread(void *dummy __unused)
2571{
2572	clock_t			growtime = 0;
2573	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
2574	callb_cpr_t		cpr;
2575
2576	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2577
2578	mutex_enter(&arc_reclaim_thr_lock);
2579	while (arc_thread_exit == 0) {
2580		if (arc_reclaim_needed()) {
2581
2582			if (arc_no_grow) {
2583				if (last_reclaim == ARC_RECLAIM_CONS) {
2584					last_reclaim = ARC_RECLAIM_AGGR;
2585				} else {
2586					last_reclaim = ARC_RECLAIM_CONS;
2587				}
2588			} else {
2589				arc_no_grow = TRUE;
2590				last_reclaim = ARC_RECLAIM_AGGR;
2591				membar_producer();
2592			}
2593
2594			/* reset the growth delay for every reclaim */
2595			growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
2596
2597			if (needfree && last_reclaim == ARC_RECLAIM_CONS) {
2598				/*
2599				 * If needfree is TRUE, our vm_lowmem hook
2600				 * was called and in that case we must free some
2601				 * memory, so switch to aggressive mode.
2602				 */
2603				arc_no_grow = TRUE;
2604				last_reclaim = ARC_RECLAIM_AGGR;
2605			}
2606			arc_kmem_reap_now(last_reclaim);
2607			arc_warm = B_TRUE;
2608
2609		} else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
2610			arc_no_grow = FALSE;
2611		}
2612
2613		arc_adjust();
2614
2615		if (arc_eviction_list != NULL)
2616			arc_do_user_evicts();
2617
2618#ifdef _KERNEL
2619		if (needfree) {
2620			needfree = 0;
2621			wakeup(&needfree);
2622		}
2623#endif
2624
2625		/* block until needed, or one second, whichever is shorter */
2626		CALLB_CPR_SAFE_BEGIN(&cpr);
2627		(void) cv_timedwait(&arc_reclaim_thr_cv,
2628		    &arc_reclaim_thr_lock, hz);
2629		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2630	}
2631
2632	arc_thread_exit = 0;
2633	cv_broadcast(&arc_reclaim_thr_cv);
2634	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
2635	thread_exit();
2636}
2637
2638/*
2639 * Adapt arc info given the number of bytes we are trying to add and
2640 * the state that we are coming from.  This function is only called
2641 * when we are adding new content to the cache.
2642 */
2643static void
2644arc_adapt(int bytes, arc_state_t *state)
2645{
2646	int mult;
2647	uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
2648
2649	if (state == arc_l2c_only)
2650		return;
2651
2652	ASSERT(bytes > 0);
2653	/*
2654	 * Adapt the target size of the MRU list:
2655	 *	- if we just hit in the MRU ghost list, then increase
2656	 *	  the target size of the MRU list.
2657	 *	- if we just hit in the MFU ghost list, then increase
2658	 *	  the target size of the MFU list by decreasing the
2659	 *	  target size of the MRU list.
2660	 */
2661	if (state == arc_mru_ghost) {
2662		mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2663		    1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2664		mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
2665
2666		arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
2667	} else if (state == arc_mfu_ghost) {
2668		uint64_t delta;
2669
2670		mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2671		    1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2672		mult = MIN(mult, 10);
2673
2674		delta = MIN(bytes * mult, arc_p);
2675		arc_p = MAX(arc_p_min, arc_p - delta);
2676	}
2677	ASSERT((int64_t)arc_p >= 0);
2678
2679	if (arc_reclaim_needed()) {
2680		cv_signal(&arc_reclaim_thr_cv);
2681		return;
2682	}
2683
2684	if (arc_no_grow)
2685		return;
2686
2687	if (arc_c >= arc_c_max)
2688		return;
2689
2690	/*
2691	 * If we're within (2 * maxblocksize) bytes of the target
2692	 * cache size, increment the target cache size
2693	 */
2694	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2695		atomic_add_64(&arc_c, (int64_t)bytes);
2696		if (arc_c > arc_c_max)
2697			arc_c = arc_c_max;
2698		else if (state == arc_anon)
2699			atomic_add_64(&arc_p, (int64_t)bytes);
2700		if (arc_p > arc_c)
2701			arc_p = arc_c;
2702	}
2703	ASSERT((int64_t)arc_p >= 0);
2704}
2705
2706/*
2707 * Check if the cache has reached its limits and eviction is required
2708 * prior to insert.
2709 */
2710static int
2711arc_evict_needed(arc_buf_contents_t type)
2712{
2713	if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2714		return (1);
2715
2716#ifdef sun
2717#ifdef _KERNEL
2718	/*
2719	 * If zio data pages are being allocated out of a separate heap segment,
2720	 * then enforce that the size of available vmem for this area remains
2721	 * above about 1/32nd free.
2722	 */
2723	if (type == ARC_BUFC_DATA && zio_arena != NULL &&
2724	    vmem_size(zio_arena, VMEM_FREE) <
2725	    (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
2726		return (1);
2727#endif
2728#endif	/* sun */
2729
2730	if (arc_reclaim_needed())
2731		return (1);
2732
2733	return (arc_size > arc_c);
2734}
2735
2736/*
2737 * The buffer, supplied as the first argument, needs a data block.
2738 * So, if we are at cache max, determine which cache should be victimized.
2739 * We have the following cases:
2740 *
2741 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2742 * In this situation if we're out of space, but the resident size of the MFU is
2743 * under the limit, victimize the MFU cache to satisfy this insertion request.
2744 *
2745 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2746 * Here, we've used up all of the available space for the MRU, so we need to
2747 * evict from our own cache instead.  Evict from the set of resident MRU
2748 * entries.
2749 *
2750 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2751 * c minus p represents the MFU space in the cache, since p is the size of the
2752 * cache that is dedicated to the MRU.  In this situation there's still space on
2753 * the MFU side, so the MRU side needs to be victimized.
2754 *
2755 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2756 * MFU's resident set is consuming more space than it has been allotted.  In
2757 * this situation, we must victimize our own cache, the MFU, for this insertion.
2758 */
2759static void
2760arc_get_data_buf(arc_buf_t *buf)
2761{
2762	arc_state_t		*state = buf->b_hdr->b_state;
2763	uint64_t		size = buf->b_hdr->b_size;
2764	arc_buf_contents_t	type = buf->b_hdr->b_type;
2765
2766	arc_adapt(size, state);
2767
2768	/*
2769	 * We have not yet reached cache maximum size,
2770	 * just allocate a new buffer.
2771	 */
2772	if (!arc_evict_needed(type)) {
2773		if (type == ARC_BUFC_METADATA) {
2774			buf->b_data = zio_buf_alloc(size);
2775			arc_space_consume(size, ARC_SPACE_DATA);
2776		} else {
2777			ASSERT(type == ARC_BUFC_DATA);
2778			buf->b_data = zio_data_buf_alloc(size);
2779			ARCSTAT_INCR(arcstat_data_size, size);
2780			atomic_add_64(&arc_size, size);
2781		}
2782		goto out;
2783	}
2784
2785	/*
2786	 * If we are prefetching from the mfu ghost list, this buffer
2787	 * will end up on the mru list; so steal space from there.
2788	 */
2789	if (state == arc_mfu_ghost)
2790		state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2791	else if (state == arc_mru_ghost)
2792		state = arc_mru;
2793
2794	if (state == arc_mru || state == arc_anon) {
2795		uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2796		state = (arc_mfu->arcs_lsize[type] >= size &&
2797		    arc_p > mru_used) ? arc_mfu : arc_mru;
2798	} else {
2799		/* MFU cases */
2800		uint64_t mfu_space = arc_c - arc_p;
2801		state =  (arc_mru->arcs_lsize[type] >= size &&
2802		    mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2803	}
2804	if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
2805		if (type == ARC_BUFC_METADATA) {
2806			buf->b_data = zio_buf_alloc(size);
2807			arc_space_consume(size, ARC_SPACE_DATA);
2808		} else {
2809			ASSERT(type == ARC_BUFC_DATA);
2810			buf->b_data = zio_data_buf_alloc(size);
2811			ARCSTAT_INCR(arcstat_data_size, size);
2812			atomic_add_64(&arc_size, size);
2813		}
2814		ARCSTAT_BUMP(arcstat_recycle_miss);
2815	}
2816	ASSERT(buf->b_data != NULL);
2817out:
2818	/*
2819	 * Update the state size.  Note that ghost states have a
2820	 * "ghost size" and so don't need to be updated.
2821	 */
2822	if (!GHOST_STATE(buf->b_hdr->b_state)) {
2823		arc_buf_hdr_t *hdr = buf->b_hdr;
2824
2825		atomic_add_64(&hdr->b_state->arcs_size, size);
2826		if (list_link_active(&hdr->b_arc_node)) {
2827			ASSERT(refcount_is_zero(&hdr->b_refcnt));
2828			atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2829		}
2830		/*
2831		 * If we are growing the cache, and we are adding anonymous
2832		 * data, and we have outgrown arc_p, update arc_p
2833		 */
2834		if (arc_size < arc_c && hdr->b_state == arc_anon &&
2835		    arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2836			arc_p = MIN(arc_c, arc_p + size);
2837	}
2838	ARCSTAT_BUMP(arcstat_allocated);
2839}
2840
2841/*
2842 * This routine is called whenever a buffer is accessed.
2843 * NOTE: the hash lock is dropped in this function.
2844 */
2845static void
2846arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2847{
2848	clock_t now;
2849
2850	ASSERT(MUTEX_HELD(hash_lock));
2851
2852	if (buf->b_state == arc_anon) {
2853		/*
2854		 * This buffer is not in the cache, and does not
2855		 * appear in our "ghost" list.  Add the new buffer
2856		 * to the MRU state.
2857		 */
2858
2859		ASSERT(buf->b_arc_access == 0);
2860		buf->b_arc_access = ddi_get_lbolt();
2861		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2862		arc_change_state(arc_mru, buf, hash_lock);
2863
2864	} else if (buf->b_state == arc_mru) {
2865		now = ddi_get_lbolt();
2866
2867		/*
2868		 * If this buffer is here because of a prefetch, then either:
2869		 * - clear the flag if this is a "referencing" read
2870		 *   (any subsequent access will bump this into the MFU state).
2871		 * or
2872		 * - move the buffer to the head of the list if this is
2873		 *   another prefetch (to make it less likely to be evicted).
2874		 */
2875		if ((buf->b_flags & ARC_PREFETCH) != 0) {
2876			if (refcount_count(&buf->b_refcnt) == 0) {
2877				ASSERT(list_link_active(&buf->b_arc_node));
2878			} else {
2879				buf->b_flags &= ~ARC_PREFETCH;
2880				ARCSTAT_BUMP(arcstat_mru_hits);
2881			}
2882			buf->b_arc_access = now;
2883			return;
2884		}
2885
2886		/*
2887		 * This buffer has been "accessed" only once so far,
2888		 * but it is still in the cache. Move it to the MFU
2889		 * state.
2890		 */
2891		if (now > buf->b_arc_access + ARC_MINTIME) {
2892			/*
2893			 * More than 125ms have passed since we
2894			 * instantiated this buffer.  Move it to the
2895			 * most frequently used state.
2896			 */
2897			buf->b_arc_access = now;
2898			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2899			arc_change_state(arc_mfu, buf, hash_lock);
2900		}
2901		ARCSTAT_BUMP(arcstat_mru_hits);
2902	} else if (buf->b_state == arc_mru_ghost) {
2903		arc_state_t	*new_state;
2904		/*
2905		 * This buffer has been "accessed" recently, but
2906		 * was evicted from the cache.  Move it to the
2907		 * MFU state.
2908		 */
2909
2910		if (buf->b_flags & ARC_PREFETCH) {
2911			new_state = arc_mru;
2912			if (refcount_count(&buf->b_refcnt) > 0)
2913				buf->b_flags &= ~ARC_PREFETCH;
2914			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2915		} else {
2916			new_state = arc_mfu;
2917			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2918		}
2919
2920		buf->b_arc_access = ddi_get_lbolt();
2921		arc_change_state(new_state, buf, hash_lock);
2922
2923		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
2924	} else if (buf->b_state == arc_mfu) {
2925		/*
2926		 * This buffer has been accessed more than once and is
2927		 * still in the cache.  Keep it in the MFU state.
2928		 *
2929		 * NOTE: an add_reference() that occurred when we did
2930		 * the arc_read() will have kicked this off the list.
2931		 * If it was a prefetch, we will explicitly move it to
2932		 * the head of the list now.
2933		 */
2934		if ((buf->b_flags & ARC_PREFETCH) != 0) {
2935			ASSERT(refcount_count(&buf->b_refcnt) == 0);
2936			ASSERT(list_link_active(&buf->b_arc_node));
2937		}
2938		ARCSTAT_BUMP(arcstat_mfu_hits);
2939		buf->b_arc_access = ddi_get_lbolt();
2940	} else if (buf->b_state == arc_mfu_ghost) {
2941		arc_state_t	*new_state = arc_mfu;
2942		/*
2943		 * This buffer has been accessed more than once but has
2944		 * been evicted from the cache.  Move it back to the
2945		 * MFU state.
2946		 */
2947
2948		if (buf->b_flags & ARC_PREFETCH) {
2949			/*
2950			 * This is a prefetch access...
2951			 * move this block back to the MRU state.
2952			 */
2953			ASSERT0(refcount_count(&buf->b_refcnt));
2954			new_state = arc_mru;
2955		}
2956
2957		buf->b_arc_access = ddi_get_lbolt();
2958		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2959		arc_change_state(new_state, buf, hash_lock);
2960
2961		ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
2962	} else if (buf->b_state == arc_l2c_only) {
2963		/*
2964		 * This buffer is on the 2nd Level ARC.
2965		 */
2966
2967		buf->b_arc_access = ddi_get_lbolt();
2968		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2969		arc_change_state(arc_mfu, buf, hash_lock);
2970	} else {
2971		ASSERT(!"invalid arc state");
2972	}
2973}
2974
2975/* a generic arc_done_func_t which you can use */
2976/* ARGSUSED */
2977void
2978arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
2979{
2980	if (zio == NULL || zio->io_error == 0)
2981		bcopy(buf->b_data, arg, buf->b_hdr->b_size);
2982	VERIFY(arc_buf_remove_ref(buf, arg));
2983}
2984
2985/* a generic arc_done_func_t */
2986void
2987arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
2988{
2989	arc_buf_t **bufp = arg;
2990	if (zio && zio->io_error) {
2991		VERIFY(arc_buf_remove_ref(buf, arg));
2992		*bufp = NULL;
2993	} else {
2994		*bufp = buf;
2995		ASSERT(buf->b_data);
2996	}
2997}
2998
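/*
 * I/O completion callback for ARC reads.  Re-find the header in the hash
 * table, byteswap the data if needed, compute the checksum, move anonymous
 * headers through arc_access(), hand a buffer (cloning as necessary) to
 * every registered callback, and finally destroy the header if the block
 * was freed while the read was in flight.
 */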
2999static void
3000arc_read_done(zio_t *zio)
3001{
3002	arc_buf_hdr_t	*hdr;
3003	arc_buf_t	*buf;
3004	arc_buf_t	*abuf;	/* buffer we're assigning to callback */
3005	kmutex_t	*hash_lock = NULL;
3006	arc_callback_t	*callback_list, *acb;
3007	int		freeable = FALSE;
3008
3009	buf = zio->io_private;
3010	hdr = buf->b_hdr;
3011
3012	/*
3013	 * The hdr was inserted into hash-table and removed from lists
3014	 * prior to starting I/O.  We should find this header, since
3015	 * it's in the hash table, and it should be legit since it's
3016	 * not possible to evict it during the I/O.  The only possible
3017	 * reason for it not to be found is if we were freed during the
3018	 * read.
3019	 */
3020	if (HDR_IN_HASH_TABLE(hdr)) {
3021		ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
3022		ASSERT3U(hdr->b_dva.dva_word[0], ==,
3023		    BP_IDENTITY(zio->io_bp)->dva_word[0]);
3024		ASSERT3U(hdr->b_dva.dva_word[1], ==,
3025		    BP_IDENTITY(zio->io_bp)->dva_word[1]);
3026
3027		arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp,
3028		    &hash_lock);
3029
3030		ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) &&
3031		    hash_lock == NULL) ||
3032		    (found == hdr &&
3033		    DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
3034		    (found == hdr && HDR_L2_READING(hdr)));
3035	}
3036
3037	hdr->b_flags &= ~ARC_L2_EVICTED;
3038	if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
3039		hdr->b_flags &= ~ARC_L2CACHE;
3040
3041	/* byteswap if necessary */
3042	callback_list = hdr->b_acb;
3043	ASSERT(callback_list != NULL);
3044	if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
3045		dmu_object_byteswap_t bswap =
3046		    DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
3047		arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
3048		    byteswap_uint64_array :
3049		    dmu_ot_byteswap[bswap].ob_func;
3050		func(buf->b_data, hdr->b_size);
3051	}
3052
3053	arc_cksum_compute(buf, B_FALSE);
3054#ifdef illumos
3055	arc_buf_watch(buf);
3056#endif /* illumos */
3057
3058	if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
3059		/*
3060		 * Only call arc_access on anonymous buffers.  This is because
3061		 * if we've issued an I/O for an evicted buffer, we've already
3062		 * called arc_access (to prevent any simultaneous readers from
3063		 * getting confused).
3064		 */
3065		arc_access(hdr, hash_lock);
3066	}
3067
3068	/* create copies of the data buffer for the callers */
3069	abuf = buf;
3070	for (acb = callback_list; acb; acb = acb->acb_next) {
3071		if (acb->acb_done) {
3072			if (abuf == NULL) {
3073				ARCSTAT_BUMP(arcstat_duplicate_reads);
3074				abuf = arc_buf_clone(buf);
3075			}
3076			acb->acb_buf = abuf;
3077			abuf = NULL;
3078		}
3079	}
3080	hdr->b_acb = NULL;
3081	hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3082	ASSERT(!HDR_BUF_AVAILABLE(hdr));
3083	if (abuf == buf) {
3084		ASSERT(buf->b_efunc == NULL);
3085		ASSERT(hdr->b_datacnt == 1);
3086		hdr->b_flags |= ARC_BUF_AVAILABLE;
3087	}
3088
3089	ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
3090
3091	if (zio->io_error != 0) {
3092		hdr->b_flags |= ARC_IO_ERROR;
3093		if (hdr->b_state != arc_anon)
3094			arc_change_state(arc_anon, hdr, hash_lock);
3095		if (HDR_IN_HASH_TABLE(hdr))
3096			buf_hash_remove(hdr);
3097		freeable = refcount_is_zero(&hdr->b_refcnt);
3098	}
3099
3100	/*
3101	 * Broadcast before we drop the hash_lock to avoid the possibility
3102	 * that the hdr (and hence the cv) might be freed before we get to
3103	 * the cv_broadcast().
3104	 */
3105	cv_broadcast(&hdr->b_cv);
3106
3107	if (hash_lock) {
3108		mutex_exit(hash_lock);
3109	} else {
3110		/*
3111		 * This block was freed while we waited for the read to
3112		 * complete.  It has been removed from the hash table and
3113		 * moved to the anonymous state (so that it won't show up
3114		 * in the cache).
3115		 */
3116		ASSERT3P(hdr->b_state, ==, arc_anon);
3117		freeable = refcount_is_zero(&hdr->b_refcnt);
3118	}
3119
3120	/* execute each callback and free its structure */
3121	while ((acb = callback_list) != NULL) {
3122		if (acb->acb_done)
3123			acb->acb_done(zio, acb->acb_buf, acb->acb_private);
3124
3125		if (acb->acb_zio_dummy != NULL) {
3126			acb->acb_zio_dummy->io_error = zio->io_error;
3127			zio_nowait(acb->acb_zio_dummy);
3128		}
3129
3130		callback_list = acb->acb_next;
3131		kmem_free(acb, sizeof (arc_callback_t));
3132	}
3133
3134	if (freeable)
3135		arc_hdr_destroy(hdr);
3136}
3137
3138/*
3139 * "Read" the block at the specified DVA (in bp) via the
3140 * cache.  If the block is found in the cache, invoke the provided
3141 * callback immediately and return.  Note that the `zio' parameter
3142 * in the callback will be NULL in this case, since no IO was
3143 * required.  If the block is not in the cache, pass the read request
3144 * on to the spa with a substitute callback function, so that the
3145 * requested block will be added to the cache.
3146 *
3147 * If a read request arrives for a block that has a read in-progress,
3148 * either wait for the in-progress read to complete (and return the
3149 * results); or, if this is a read with a "done" func, add a record
3150 * to the read to invoke the "done" func when the read completes,
3151 * and return; or just return.
3152 *
3153 * arc_read_done() will invoke all the requested "done" functions
3154 * for readers of this block.
3155 */
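/*
 * Typical synchronous use, shown as an illustrative sketch only (the
 * priority and zio flag values are examples, and 'zb' is a caller-
 * initialized bookmark):
 *
 *	arc_buf_t *abuf = NULL;
 *	uint32_t aflags = ARC_WAIT;
 *	int err = arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
 *	if (err == 0) {
 *		... use abuf->b_data, then drop the hold ...
 *		(void) arc_buf_remove_ref(abuf, &abuf);
 *	}
 */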
3156int
3157arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
3158    void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags,
3159    const zbookmark_phys_t *zb)
3160{
3161	arc_buf_hdr_t *hdr = NULL;
3162	arc_buf_t *buf = NULL;
3163	kmutex_t *hash_lock = NULL;
3164	zio_t *rzio;
3165	uint64_t guid = spa_load_guid(spa);
3166
3167	ASSERT(!BP_IS_EMBEDDED(bp) ||
3168	    BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
3169
3170top:
3171	if (!BP_IS_EMBEDDED(bp)) {
3172		/*
3173		 * Embedded BP's have no DVA and require no I/O to "read".
3174		 * Create an anonymous arc buf to back it.
3175		 */
3176		hdr = buf_hash_find(guid, bp, &hash_lock);
3177	}
3178
3179	if (hdr != NULL && hdr->b_datacnt > 0) {
3180
3181		*arc_flags |= ARC_CACHED;
3182
3183		if (HDR_IO_IN_PROGRESS(hdr)) {
3184
3185			if (*arc_flags & ARC_WAIT) {
3186				cv_wait(&hdr->b_cv, hash_lock);
3187				mutex_exit(hash_lock);
3188				goto top;
3189			}
3190			ASSERT(*arc_flags & ARC_NOWAIT);
3191
3192			if (done) {
3193				arc_callback_t	*acb = NULL;
3194
3195				acb = kmem_zalloc(sizeof (arc_callback_t),
3196				    KM_SLEEP);
3197				acb->acb_done = done;
3198				acb->acb_private = private;
3199				if (pio != NULL)
3200					acb->acb_zio_dummy = zio_null(pio,
3201					    spa, NULL, NULL, NULL, zio_flags);
3202
3203				ASSERT(acb->acb_done != NULL);
3204				acb->acb_next = hdr->b_acb;
3205				hdr->b_acb = acb;
3206				add_reference(hdr, hash_lock, private);
3207				mutex_exit(hash_lock);
3208				return (0);
3209			}
3210			mutex_exit(hash_lock);
3211			return (0);
3212		}
3213
3214		ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3215
3216		if (done) {
3217			add_reference(hdr, hash_lock, private);
3218			/*
3219			 * If this block is already in use, create a new
3220			 * copy of the data so that we will be guaranteed
3221			 * that arc_release() will always succeed.
3222			 */
3223			buf = hdr->b_buf;
3224			ASSERT(buf);
3225			ASSERT(buf->b_data);
3226			if (HDR_BUF_AVAILABLE(hdr)) {
3227				ASSERT(buf->b_efunc == NULL);
3228				hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3229			} else {
3230				buf = arc_buf_clone(buf);
3231			}
3232
3233		} else if (*arc_flags & ARC_PREFETCH &&
3234		    refcount_count(&hdr->b_refcnt) == 0) {
3235			hdr->b_flags |= ARC_PREFETCH;
3236		}
3237		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
3238		arc_access(hdr, hash_lock);
3239		if (*arc_flags & ARC_L2CACHE)
3240			hdr->b_flags |= ARC_L2CACHE;
3241		if (*arc_flags & ARC_L2COMPRESS)
3242			hdr->b_flags |= ARC_L2COMPRESS;
3243		mutex_exit(hash_lock);
3244		ARCSTAT_BUMP(arcstat_hits);
3245		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3246		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3247		    data, metadata, hits);
3248
3249		if (done)
3250			done(NULL, buf, private);
3251	} else {
3252		uint64_t size = BP_GET_LSIZE(bp);
3253		arc_callback_t *acb;
3254		vdev_t *vd = NULL;
3255		uint64_t addr = 0;
3256		boolean_t devw = B_FALSE;
3257		enum zio_compress b_compress = ZIO_COMPRESS_OFF;
3258		uint64_t b_asize = 0;
3259
3260		if (hdr == NULL) {
3261			/* this block is not in the cache */
3262			arc_buf_hdr_t *exists = NULL;
3263			arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
3264			buf = arc_buf_alloc(spa, size, private, type);
3265			hdr = buf->b_hdr;
3266			if (!BP_IS_EMBEDDED(bp)) {
3267				hdr->b_dva = *BP_IDENTITY(bp);
3268				hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
3269				hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
3270				exists = buf_hash_insert(hdr, &hash_lock);
3271			}
3272			if (exists != NULL) {
3273				/* somebody beat us to the hash insert */
3274				mutex_exit(hash_lock);
3275				buf_discard_identity(hdr);
3276				(void) arc_buf_remove_ref(buf, private);
3277				goto top; /* restart the IO request */
3278			}
3279			/* if this is a prefetch, we don't have a reference */
3280			if (*arc_flags & ARC_PREFETCH) {
3281				(void) remove_reference(hdr, hash_lock,
3282				    private);
3283				hdr->b_flags |= ARC_PREFETCH;
3284			}
3285			if (*arc_flags & ARC_L2CACHE)
3286				hdr->b_flags |= ARC_L2CACHE;
3287			if (*arc_flags & ARC_L2COMPRESS)
3288				hdr->b_flags |= ARC_L2COMPRESS;
3289			if (BP_GET_LEVEL(bp) > 0)
3290				hdr->b_flags |= ARC_INDIRECT;
3291		} else {
3292			/* this block is in the ghost cache */
3293			ASSERT(GHOST_STATE(hdr->b_state));
3294			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3295			ASSERT0(refcount_count(&hdr->b_refcnt));
3296			ASSERT(hdr->b_buf == NULL);
3297
3298			/* if this is a prefetch, we don't have a reference */
3299			if (*arc_flags & ARC_PREFETCH)
3300				hdr->b_flags |= ARC_PREFETCH;
3301			else
3302				add_reference(hdr, hash_lock, private);
3303			if (*arc_flags & ARC_L2CACHE)
3304				hdr->b_flags |= ARC_L2CACHE;
3305			if (*arc_flags & ARC_L2COMPRESS)
3306				hdr->b_flags |= ARC_L2COMPRESS;
3307			buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
3308			buf->b_hdr = hdr;
3309			buf->b_data = NULL;
3310			buf->b_efunc = NULL;
3311			buf->b_private = NULL;
3312			buf->b_next = NULL;
3313			hdr->b_buf = buf;
3314			ASSERT(hdr->b_datacnt == 0);
3315			hdr->b_datacnt = 1;
3316			arc_get_data_buf(buf);
3317			arc_access(hdr, hash_lock);
3318		}
3319
3320		ASSERT(!GHOST_STATE(hdr->b_state));
3321
3322		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
3323		acb->acb_done = done;
3324		acb->acb_private = private;
3325
3326		ASSERT(hdr->b_acb == NULL);
3327		hdr->b_acb = acb;
3328		hdr->b_flags |= ARC_IO_IN_PROGRESS;
3329
3330		if (hdr->b_l2hdr != NULL &&
3331		    (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
3332			devw = hdr->b_l2hdr->b_dev->l2ad_writing;
3333			addr = hdr->b_l2hdr->b_daddr;
3334			b_compress = hdr->b_l2hdr->b_compress;
3335			b_asize = hdr->b_l2hdr->b_asize;
3336			/*
3337			 * Lock out device removal.
3338			 */
3339			if (vdev_is_dead(vd) ||
3340			    !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
3341				vd = NULL;
3342		}
3343
3344		if (hash_lock != NULL)
3345			mutex_exit(hash_lock);
3346
3347		/*
3348		 * At this point, we have a level 1 cache miss.  Try again in
3349		 * L2ARC if possible.
3350		 */
3351		ASSERT3U(hdr->b_size, ==, size);
3352		DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
3353		    uint64_t, size, zbookmark_phys_t *, zb);
3354		ARCSTAT_BUMP(arcstat_misses);
3355		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3356		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3357		    data, metadata, misses);
3358#ifdef _KERNEL
3359		curthread->td_ru.ru_inblock++;
3360#endif
3361
3362		if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
3363			/*
3364			 * Read from the L2ARC if the following are true:
3365			 * 1. The L2ARC vdev was previously cached.
3366			 * 2. This buffer still has L2ARC metadata.
3367			 * 3. This buffer isn't currently writing to the L2ARC.
3368			 * 4. The L2ARC entry wasn't evicted, which may
3369			 *    also have invalidated the vdev.
3370			 * 5. This isn't a prefetch with l2arc_noprefetch enabled.
3371			 */
3372			if (hdr->b_l2hdr != NULL &&
3373			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
3374			    !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
3375				l2arc_read_callback_t *cb;
3376
3377				DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
3378				ARCSTAT_BUMP(arcstat_l2_hits);
3379
3380				cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
3381				    KM_SLEEP);
3382				cb->l2rcb_buf = buf;
3383				cb->l2rcb_spa = spa;
3384				cb->l2rcb_bp = *bp;
3385				cb->l2rcb_zb = *zb;
3386				cb->l2rcb_flags = zio_flags;
3387				cb->l2rcb_compress = b_compress;
3388
3389				ASSERT(addr >= VDEV_LABEL_START_SIZE &&
3390				    addr + size < vd->vdev_psize -
3391				    VDEV_LABEL_END_SIZE);
3392
3393				/*
3394				 * l2arc read.  The SCL_L2ARC lock will be
3395				 * released by l2arc_read_done().
3396				 * Issue a null zio if the underlying buffer
3397				 * was squashed to zero size by compression.
3398				 */
3399				if (b_compress == ZIO_COMPRESS_EMPTY) {
3400					rzio = zio_null(pio, spa, vd,
3401					    l2arc_read_done, cb,
3402					    zio_flags | ZIO_FLAG_DONT_CACHE |
3403					    ZIO_FLAG_CANFAIL |
3404					    ZIO_FLAG_DONT_PROPAGATE |
3405					    ZIO_FLAG_DONT_RETRY);
3406				} else {
3407					rzio = zio_read_phys(pio, vd, addr,
3408					    b_asize, buf->b_data,
3409					    ZIO_CHECKSUM_OFF,
3410					    l2arc_read_done, cb, priority,
3411					    zio_flags | ZIO_FLAG_DONT_CACHE |
3412					    ZIO_FLAG_CANFAIL |
3413					    ZIO_FLAG_DONT_PROPAGATE |
3414					    ZIO_FLAG_DONT_RETRY, B_FALSE);
3415				}
3416				DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
3417				    zio_t *, rzio);
3418				ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize);
3419
3420				if (*arc_flags & ARC_NOWAIT) {
3421					zio_nowait(rzio);
3422					return (0);
3423				}
3424
3425				ASSERT(*arc_flags & ARC_WAIT);
3426				if (zio_wait(rzio) == 0)
3427					return (0);
3428
3429				/* l2arc read error; goto zio_read() */
3430			} else {
3431				DTRACE_PROBE1(l2arc__miss,
3432				    arc_buf_hdr_t *, hdr);
3433				ARCSTAT_BUMP(arcstat_l2_misses);
3434				if (HDR_L2_WRITING(hdr))
3435					ARCSTAT_BUMP(arcstat_l2_rw_clash);
3436				spa_config_exit(spa, SCL_L2ARC, vd);
3437			}
3438		} else {
3439			if (vd != NULL)
3440				spa_config_exit(spa, SCL_L2ARC, vd);
3441			if (l2arc_ndev != 0) {
3442				DTRACE_PROBE1(l2arc__miss,
3443				    arc_buf_hdr_t *, hdr);
3444				ARCSTAT_BUMP(arcstat_l2_misses);
3445			}
3446		}
3447
3448		rzio = zio_read(pio, spa, bp, buf->b_data, size,
3449		    arc_read_done, buf, priority, zio_flags, zb);
3450
3451		if (*arc_flags & ARC_WAIT)
3452			return (zio_wait(rzio));
3453
3454		ASSERT(*arc_flags & ARC_NOWAIT);
3455		zio_nowait(rzio);
3456	}
3457	return (0);
3458}
3459
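/*
 * Register an eviction callback for this buffer.  When the ARC decides to
 * evict the buffer, 'func' is invoked with 'private' so that the owner
 * (e.g. the DMU's dbuf layer) can drop its hold rather than have the data
 * vanish underneath it.
 */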
3460void
3461arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
3462{
3463	ASSERT(buf->b_hdr != NULL);
3464	ASSERT(buf->b_hdr->b_state != arc_anon);
3465	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
3466	ASSERT(buf->b_efunc == NULL);
3467	ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
3468
3469	buf->b_efunc = func;
3470	buf->b_private = private;
3471}
3472
3473/*
3474 * Notify the arc that a block was freed, and thus will never be used again.
3475 */
3476void
3477arc_freed(spa_t *spa, const blkptr_t *bp)
3478{
3479	arc_buf_hdr_t *hdr;
3480	kmutex_t *hash_lock;
3481	uint64_t guid = spa_load_guid(spa);
3482
3483	ASSERT(!BP_IS_EMBEDDED(bp));
3484
3485	hdr = buf_hash_find(guid, bp, &hash_lock);
3486	if (hdr == NULL)
3487		return;
3488	if (HDR_BUF_AVAILABLE(hdr)) {
3489		arc_buf_t *buf = hdr->b_buf;
3490		add_reference(hdr, hash_lock, FTAG);
3491		hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3492		mutex_exit(hash_lock);
3493
3494		arc_release(buf, FTAG);
3495		(void) arc_buf_remove_ref(buf, FTAG);
3496	} else {
3497		mutex_exit(hash_lock);
3498	}
3499
3500}
3501
3502/*
3503 * Clear the user eviction callback set by arc_set_callback(), first calling
3504	 * it if it exists.  Because the presence of a callback keeps an arc_buf cached,
3505 * clearing the callback may result in the arc_buf being destroyed.  However,
3506 * it will not result in the *last* arc_buf being destroyed, hence the data
3507 * will remain cached in the ARC. We make a copy of the arc buffer here so
3508 * that we can process the callback without holding any locks.
3509 *
3510 * It's possible that the callback is already in the process of being cleared
3511	 * by another thread.  In this case we cannot clear the callback.
3512 *
3513 * Returns B_TRUE if the callback was successfully called and cleared.
3514 */
3515boolean_t
3516arc_clear_callback(arc_buf_t *buf)
3517{
3518	arc_buf_hdr_t *hdr;
3519	kmutex_t *hash_lock;
3520	arc_evict_func_t *efunc = buf->b_efunc;
3521	void *private = buf->b_private;
3522	list_t *list, *evicted_list;
3523	kmutex_t *lock, *evicted_lock;
3524
3525	mutex_enter(&buf->b_evict_lock);
3526	hdr = buf->b_hdr;
3527	if (hdr == NULL) {
3528		/*
3529		 * We are in arc_do_user_evicts().
3530		 */
3531		ASSERT(buf->b_data == NULL);
3532		mutex_exit(&buf->b_evict_lock);
3533		return (B_FALSE);
3534	} else if (buf->b_data == NULL) {
3535		/*
3536		 * We are on the eviction list; process this buffer now
3537		 * but let arc_do_user_evicts() do the reaping.
3538		 */
3539		buf->b_efunc = NULL;
3540		mutex_exit(&buf->b_evict_lock);
3541		VERIFY0(efunc(private));
3542		return (B_TRUE);
3543	}
3544	hash_lock = HDR_LOCK(hdr);
3545	mutex_enter(hash_lock);
3546	hdr = buf->b_hdr;
3547	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3548
3549	ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
3550	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3551
3552	buf->b_efunc = NULL;
3553	buf->b_private = NULL;
3554
3555	if (hdr->b_datacnt > 1) {
3556		mutex_exit(&buf->b_evict_lock);
3557		arc_buf_destroy(buf, FALSE, TRUE);
3558	} else {
3559		ASSERT(buf == hdr->b_buf);
3560		hdr->b_flags |= ARC_BUF_AVAILABLE;
3561		mutex_exit(&buf->b_evict_lock);
3562	}
3563
3564	mutex_exit(hash_lock);
3565	VERIFY0(efunc(private));
3566	return (B_TRUE);
3567}
3568
3569/*
3570 * Release this buffer from the cache, making it an anonymous buffer.  This
3571 * must be done after a read and prior to modifying the buffer contents.
3572 * If the buffer has more than one reference, we must make
3573 * a new hdr for the buffer.
3574 */
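/*
 * A minimal sketch of the expected calling pattern (the real callers live
 * in the DMU/dbuf layer, not here):
 *
 *	buf = ...;			arc_buf_t from a completed arc_read()
 *	arc_release(buf, tag);		make it anonymous and writable
 *	... modify buf->b_data ...
 *	arc_write(pio, spa, txg, bp, buf, ...);
 */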
3575void
3576arc_release(arc_buf_t *buf, void *tag)
3577{
3578	arc_buf_hdr_t *hdr;
3579	kmutex_t *hash_lock = NULL;
3580	l2arc_buf_hdr_t *l2hdr;
3581	uint64_t buf_size;
3582
3583	/*
3584	 * It would be nice to assert that if it's DMU metadata (level >
3585	 * 0 || it's the dnode file), then it must be syncing context.
3586	 * But we don't know that information at this level.
3587	 */
3588
3589	mutex_enter(&buf->b_evict_lock);
3590	hdr = buf->b_hdr;
3591
3592	/* this buffer is not on any list */
3593	ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3594
3595	if (hdr->b_state == arc_anon) {
3596		/* this buffer is already released */
3597		ASSERT(buf->b_efunc == NULL);
3598	} else {
3599		hash_lock = HDR_LOCK(hdr);
3600		mutex_enter(hash_lock);
3601		hdr = buf->b_hdr;
3602		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3603	}
3604
3605	l2hdr = hdr->b_l2hdr;
3606	if (l2hdr) {
3607		mutex_enter(&l2arc_buflist_mtx);
3608		hdr->b_l2hdr = NULL;
3609		list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3610	}
3611	buf_size = hdr->b_size;
3612
3613	/*
3614	 * Do we have more than one buf?
3615	 */
3616	if (hdr->b_datacnt > 1) {
3617		arc_buf_hdr_t *nhdr;
3618		arc_buf_t **bufp;
3619		uint64_t blksz = hdr->b_size;
3620		uint64_t spa = hdr->b_spa;
3621		arc_buf_contents_t type = hdr->b_type;
3622		uint32_t flags = hdr->b_flags;
3623
3624		ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3625		/*
3626		 * Pull the data off of this hdr and attach it to
3627		 * a new anonymous hdr.
3628		 */
3629		(void) remove_reference(hdr, hash_lock, tag);
3630		bufp = &hdr->b_buf;
3631		while (*bufp != buf)
3632			bufp = &(*bufp)->b_next;
3633		*bufp = buf->b_next;
3634		buf->b_next = NULL;
3635
3636		ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
3637		atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3638		if (refcount_is_zero(&hdr->b_refcnt)) {
3639			uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
3640			ASSERT3U(*size, >=, hdr->b_size);
3641			atomic_add_64(size, -hdr->b_size);
3642		}
3643
3644		/*
3645		 * We're releasing a duplicate user data buffer, so update
3646		 * our statistics accordingly.
3647		 */
3648		if (hdr->b_type == ARC_BUFC_DATA) {
3649			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
3650			ARCSTAT_INCR(arcstat_duplicate_buffers_size,
3651			    -hdr->b_size);
3652		}
3653		hdr->b_datacnt -= 1;
3654		arc_cksum_verify(buf);
3655#ifdef illumos
3656		arc_buf_unwatch(buf);
3657#endif /* illumos */
3658
3659		mutex_exit(hash_lock);
3660
3661		nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3662		nhdr->b_size = blksz;
3663		nhdr->b_spa = spa;
3664		nhdr->b_type = type;
3665		nhdr->b_buf = buf;
3666		nhdr->b_state = arc_anon;
3667		nhdr->b_arc_access = 0;
3668		nhdr->b_flags = flags & ARC_L2_WRITING;
3669		nhdr->b_l2hdr = NULL;
3670		nhdr->b_datacnt = 1;
3671		nhdr->b_freeze_cksum = NULL;
3672		(void) refcount_add(&nhdr->b_refcnt, tag);
3673		buf->b_hdr = nhdr;
3674		mutex_exit(&buf->b_evict_lock);
3675		atomic_add_64(&arc_anon->arcs_size, blksz);
3676	} else {
3677		mutex_exit(&buf->b_evict_lock);
3678		ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3679		ASSERT(!list_link_active(&hdr->b_arc_node));
3680		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3681		if (hdr->b_state != arc_anon)
3682			arc_change_state(arc_anon, hdr, hash_lock);
3683		hdr->b_arc_access = 0;
3684		if (hash_lock)
3685			mutex_exit(hash_lock);
3686
3687		buf_discard_identity(hdr);
3688		arc_buf_thaw(buf);
3689	}
3690	buf->b_efunc = NULL;
3691	buf->b_private = NULL;
3692
3693	if (l2hdr) {
3694		ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
3695		vdev_space_update(l2hdr->b_dev->l2ad_vdev,
3696		    -l2hdr->b_asize, 0, 0);
3697		trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr,
3698		    hdr->b_size, 0);
3699		kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3700		ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3701		mutex_exit(&l2arc_buflist_mtx);
3702	}
3703}
3704
3705int
3706arc_released(arc_buf_t *buf)
3707{
3708	int released;
3709
3710	mutex_enter(&buf->b_evict_lock);
3711	released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3712	mutex_exit(&buf->b_evict_lock);
3713	return (released);
3714}
3715
3716#ifdef ZFS_DEBUG
3717int
3718arc_referenced(arc_buf_t *buf)
3719{
3720	int referenced;
3721
3722	mutex_enter(&buf->b_evict_lock);
3723	referenced = (refcount_count(&buf->b_hdr->b_refcnt));
3724	mutex_exit(&buf->b_evict_lock);
3725	return (referenced);
3726}
3727#endif
3728
3729static void
3730arc_write_ready(zio_t *zio)
3731{
3732	arc_write_callback_t *callback = zio->io_private;
3733	arc_buf_t *buf = callback->awcb_buf;
3734	arc_buf_hdr_t *hdr = buf->b_hdr;
3735
3736	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3737	callback->awcb_ready(zio, buf, callback->awcb_private);
3738
3739	/*
3740	 * If the IO is already in progress, then this is a re-write
3741	 * attempt, so we need to thaw and re-compute the cksum.
3742	 * It is the responsibility of the callback to handle the
3743	 * accounting for any re-write attempt.
3744	 */
3745	if (HDR_IO_IN_PROGRESS(hdr)) {
3746		mutex_enter(&hdr->b_freeze_lock);
3747		if (hdr->b_freeze_cksum != NULL) {
3748			kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3749			hdr->b_freeze_cksum = NULL;
3750		}
3751		mutex_exit(&hdr->b_freeze_lock);
3752	}
3753	arc_cksum_compute(buf, B_FALSE);
3754	hdr->b_flags |= ARC_IO_IN_PROGRESS;
3755}
3756
3757/*
3758 * The SPA calls this callback for each physical write that happens on behalf
3759 * of a logical write.  See the comment in dbuf_write_physdone() for details.
3760 */
3761static void
3762arc_write_physdone(zio_t *zio)
3763{
3764	arc_write_callback_t *cb = zio->io_private;
3765	if (cb->awcb_physdone != NULL)
3766		cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
3767}
3768
3769static void
3770arc_write_done(zio_t *zio)
3771{
3772	arc_write_callback_t *callback = zio->io_private;
3773	arc_buf_t *buf = callback->awcb_buf;
3774	arc_buf_hdr_t *hdr = buf->b_hdr;
3775
3776	ASSERT(hdr->b_acb == NULL);
3777
3778	if (zio->io_error == 0) {
3779		if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
3780			buf_discard_identity(hdr);
3781		} else {
3782			hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3783			hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3784			hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3785		}
3786	} else {
3787		ASSERT(BUF_EMPTY(hdr));
3788	}
3789
3790	/*
3791	 * If the block to be written was all-zero or compressed enough to be
3792	 * embedded in the BP, no write was performed so there will be no
3793	 * dva/birth/checksum.  The buffer must therefore remain anonymous
3794	 * (and uncached).
3795	 */
3796	if (!BUF_EMPTY(hdr)) {
3797		arc_buf_hdr_t *exists;
3798		kmutex_t *hash_lock;
3799
3800		ASSERT(zio->io_error == 0);
3801
3802		arc_cksum_verify(buf);
3803
3804		exists = buf_hash_insert(hdr, &hash_lock);
3805		if (exists) {
3806			/*
3807			 * This can only happen if we overwrite for
3808			 * sync-to-convergence, because we remove
3809			 * buffers from the hash table when we arc_free().
3810			 */
3811			if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
3812				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3813					panic("bad overwrite, hdr=%p exists=%p",
3814					    (void *)hdr, (void *)exists);
3815				ASSERT(refcount_is_zero(&exists->b_refcnt));
3816				arc_change_state(arc_anon, exists, hash_lock);
3817				mutex_exit(hash_lock);
3818				arc_hdr_destroy(exists);
3819				exists = buf_hash_insert(hdr, &hash_lock);
3820				ASSERT3P(exists, ==, NULL);
3821			} else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
3822				/* nopwrite */
3823				ASSERT(zio->io_prop.zp_nopwrite);
3824				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3825					panic("bad nopwrite, hdr=%p exists=%p",
3826					    (void *)hdr, (void *)exists);
3827			} else {
3828				/* Dedup */
3829				ASSERT(hdr->b_datacnt == 1);
3830				ASSERT(hdr->b_state == arc_anon);
3831				ASSERT(BP_GET_DEDUP(zio->io_bp));
3832				ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
3833			}
3834		}
3835		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3836		/* if it's not anon, we are doing a scrub */
3837		if (!exists && hdr->b_state == arc_anon)
3838			arc_access(hdr, hash_lock);
3839		mutex_exit(hash_lock);
3840	} else {
3841		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3842	}
3843
3844	ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3845	callback->awcb_done(zio, buf, callback->awcb_private);
3846
3847	kmem_free(callback, sizeof (arc_write_callback_t));
3848}
3849
3850zio_t *
3851arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3852    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
3853    const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
3854    arc_done_func_t *done, void *private, zio_priority_t priority,
3855    int zio_flags, const zbookmark_phys_t *zb)
3856{
3857	arc_buf_hdr_t *hdr = buf->b_hdr;
3858	arc_write_callback_t *callback;
3859	zio_t *zio;
3860
3861	ASSERT(ready != NULL);
3862	ASSERT(done != NULL);
3863	ASSERT(!HDR_IO_ERROR(hdr));
3864	ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3865	ASSERT(hdr->b_acb == NULL);
3866	if (l2arc)
3867		hdr->b_flags |= ARC_L2CACHE;
3868	if (l2arc_compress)
3869		hdr->b_flags |= ARC_L2COMPRESS;
3870	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3871	callback->awcb_ready = ready;
3872	callback->awcb_physdone = physdone;
3873	callback->awcb_done = done;
3874	callback->awcb_private = private;
3875	callback->awcb_buf = buf;
3876
3877	zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
3878	    arc_write_ready, arc_write_physdone, arc_write_done, callback,
3879	    priority, zio_flags, zb);
3880
3881	return (zio);
3882}
3883
3884static int
3885arc_memory_throttle(uint64_t reserve, uint64_t txg)
3886{
3887#ifdef _KERNEL
3888	uint64_t available_memory =
3889	    ptoa((uintmax_t)cnt.v_free_count + cnt.v_cache_count);
3890	static uint64_t page_load = 0;
3891	static uint64_t last_txg = 0;
3892
3893#ifdef sun
3894#if defined(__i386)
3895	available_memory =
3896	    MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
3897#endif
3898#endif	/* sun */
3899
3900	if (cnt.v_free_count + cnt.v_cache_count >
3901	    (uint64_t)physmem * arc_lotsfree_percent / 100)
3902		return (0);
3903
3904	if (txg > last_txg) {
3905		last_txg = txg;
3906		page_load = 0;
3907	}
3908	/*
3909	 * If we are in pageout, we know that memory is already tight and
3910	 * the ARC is already going to be evicting, so we just want to let
3911	 * page writes continue as quickly as possible.
3912	 */
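	/*
	 * A sketch of the accounting below: the pageout process may
	 * accumulate up to 1/4 of available_memory worth of (deflated)
	 * reservations within a single txg before being asked to retry
	 * with ERESTART; each reservation is counted at 1/8 of its
	 * inflated size, and page_load is reset when the txg advances.
	 */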
3913	if (curproc == pageproc) {
3914		if (page_load > available_memory / 4)
3915			return (SET_ERROR(ERESTART));
3916		/* Note: reserve is inflated, so we deflate */
3917		page_load += reserve / 8;
3918		return (0);
3919	} else if (page_load > 0 && arc_reclaim_needed()) {
3920		/* memory is low, delay before restarting */
3921		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3922		return (SET_ERROR(EAGAIN));
3923	}
3924	page_load = 0;
3925#endif
3926	return (0);
3927}
3928
3929void
3930arc_tempreserve_clear(uint64_t reserve)
3931{
3932	atomic_add_64(&arc_tempreserve, -reserve);
3933	ASSERT((int64_t)arc_tempreserve >= 0);
3934}
3935
3936int
3937arc_tempreserve_space(uint64_t reserve, uint64_t txg)
3938{
3939	int error;
3940	uint64_t anon_size;
3941
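	/*
	 * If a single reservation would exceed 1/4 of the current target
	 * size, grow the target (capped at arc_c_max) so the reservation
	 * fits within that bound; if it still exceeds the whole target,
	 * fail with ENOMEM.
	 */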
3942	if (reserve > arc_c/4 && !arc_no_grow)
3943		arc_c = MIN(arc_c_max, reserve * 4);
3944	if (reserve > arc_c)
3945		return (SET_ERROR(ENOMEM));
3946
3947	/*
3948	 * Don't count loaned bufs as in-flight dirty data to prevent long
3949	 * network delays from blocking transactions that are ready to be
3950	 * assigned to a txg.
3951	 */
3952	anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
3953
3954	/*
3955	 * Writes will, almost always, require additional memory allocations
3956	 * in order to compress/encrypt/etc the data.  We therefore need to
3957	 * make sure that there is sufficient available memory for this.
3958	 */
3959	error = arc_memory_throttle(reserve, txg);
3960	if (error != 0)
3961		return (error);
3962
3963	/*
3964	 * Throttle writes when the amount of dirty data in the cache
3965	 * gets too large.  We try to keep the cache less than half full
3966	 * of dirty blocks so that our sync times don't grow too large.
3967	 * Note: if two requests come in concurrently, we might let them
3968	 * both succeed, when one of them should fail.  Not a huge deal.
3969	 */
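	/*
	 * For example, with arc_c at 1GB a reservation is rejected with
	 * ERESTART once reserve + arc_tempreserve + anon_size would exceed
	 * 512MB while anon_size alone exceeds 256MB.
	 */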
3970
3971	if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
3972	    anon_size > arc_c / 4) {
3973		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
3974		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3975		    arc_tempreserve>>10,
3976		    arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3977		    arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3978		    reserve>>10, arc_c>>10);
3979		return (SET_ERROR(ERESTART));
3980	}
3981	atomic_add_64(&arc_tempreserve, reserve);
3982	return (0);
3983}
3984
3985static kmutex_t arc_lowmem_lock;
3986#ifdef _KERNEL
3987static eventhandler_tag arc_event_lowmem = NULL;
3988
3989static void
3990arc_lowmem(void *arg __unused, int howto __unused)
3991{
3992
3993	/* Serialize access via arc_lowmem_lock. */
3994	mutex_enter(&arc_lowmem_lock);
3995	mutex_enter(&arc_reclaim_thr_lock);
3996	needfree = 1;
3997	cv_signal(&arc_reclaim_thr_cv);
3998
3999	/*
4000	 * It is unsafe to block here in arbitrary threads, because we can come
4001	 * here from ARC itself and may hold ARC locks and thus risk a deadlock
4002	 * with the ARC reclaim thread.
4003	 */
4004	if (curproc == pageproc) {
4005		while (needfree)
4006			msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0);
4007	}
4008	mutex_exit(&arc_reclaim_thr_lock);
4009	mutex_exit(&arc_lowmem_lock);
4010}
4011#endif
4012
4013void
4014arc_init(void)
4015{
4016	int i, prefetch_tunable_set = 0;
4017
4018	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
4019	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
4020	mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL);
4021
4022	/* Convert seconds to clock ticks */
4023	arc_min_prefetch_lifespan = 1 * hz;
4024
4025	/* Start out with 1/8 of all memory */
4026	arc_c = kmem_size() / 8;
4027
4028#ifdef sun
4029#ifdef _KERNEL
4030	/*
4031	 * On architectures where the physical memory can be larger
4032	 * than the addressable space (intel in 32-bit mode), we may
4033	 * need to limit the cache to 1/8 of VM size.
4034	 */
4035	arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
4036#endif
4037#endif	/* sun */
4038	/* set min cache to 1/32 of all memory, or 16MB, whichever is more */
4039	arc_c_min = MAX(arc_c / 4, 64<<18);
4040	/* set max to 5/8 of all memory, or all but 1GB, whichever is more */
4041	if (arc_c * 8 >= 1<<30)
4042		arc_c_max = (arc_c * 8) - (1<<30);
4043	else
4044		arc_c_max = arc_c_min;
4045	arc_c_max = MAX(arc_c * 5, arc_c_max);
4046
4047#ifdef _KERNEL
4048	/*
4049	 * Allow the tunables to override our calculations if they are
4050	 * reasonable (i.e. over 16MB)
4051	 */
4052	if (zfs_arc_max > 64<<18 && zfs_arc_max < kmem_size())
4053		arc_c_max = zfs_arc_max;
4054	if (zfs_arc_min > 64<<18 && zfs_arc_min <= arc_c_max)
4055		arc_c_min = zfs_arc_min;
4056#endif
4057
4058	arc_c = arc_c_max;
4059	arc_p = (arc_c >> 1);
4060
4061	/* limit meta-data to 1/4 of the arc capacity */
4062	arc_meta_limit = arc_c_max / 4;
4063
4064	/* Allow the tunable to override if it is reasonable */
4065	if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
4066		arc_meta_limit = zfs_arc_meta_limit;
4067
4068	if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
4069		arc_c_min = arc_meta_limit / 2;
4070
4071	if (zfs_arc_grow_retry > 0)
4072		arc_grow_retry = zfs_arc_grow_retry;
4073
4074	if (zfs_arc_shrink_shift > 0)
4075		arc_shrink_shift = zfs_arc_shrink_shift;
4076
4077	if (zfs_arc_p_min_shift > 0)
4078		arc_p_min_shift = zfs_arc_p_min_shift;
4079
4080	/* if kmem_flags are set, let's try to use less memory */
4081	if (kmem_debugging())
4082		arc_c = arc_c / 2;
4083	if (arc_c < arc_c_min)
4084		arc_c = arc_c_min;
4085
4086	zfs_arc_min = arc_c_min;
4087	zfs_arc_max = arc_c_max;
4088
4089	arc_anon = &ARC_anon;
4090	arc_mru = &ARC_mru;
4091	arc_mru_ghost = &ARC_mru_ghost;
4092	arc_mfu = &ARC_mfu;
4093	arc_mfu_ghost = &ARC_mfu_ghost;
4094	arc_l2c_only = &ARC_l2c_only;
4095	arc_size = 0;
4096
4097	for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
4098		mutex_init(&arc_anon->arcs_locks[i].arcs_lock,
4099		    NULL, MUTEX_DEFAULT, NULL);
4100		mutex_init(&arc_mru->arcs_locks[i].arcs_lock,
4101		    NULL, MUTEX_DEFAULT, NULL);
4102		mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock,
4103		    NULL, MUTEX_DEFAULT, NULL);
4104		mutex_init(&arc_mfu->arcs_locks[i].arcs_lock,
4105		    NULL, MUTEX_DEFAULT, NULL);
4106		mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock,
4107		    NULL, MUTEX_DEFAULT, NULL);
4108		mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock,
4109		    NULL, MUTEX_DEFAULT, NULL);
4110
4111		list_create(&arc_mru->arcs_lists[i],
4112		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4113		list_create(&arc_mru_ghost->arcs_lists[i],
4114		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4115		list_create(&arc_mfu->arcs_lists[i],
4116		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4117		list_create(&arc_mfu_ghost->arcs_lists[i],
4118		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4121		list_create(&arc_l2c_only->arcs_lists[i],
4122		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4123	}
4124
4125	buf_init();
4126
4127	arc_thread_exit = 0;
4128	arc_eviction_list = NULL;
4129	mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
4130	bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
4131
4132	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
4133	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
4134
4135	if (arc_ksp != NULL) {
4136		arc_ksp->ks_data = &arc_stats;
4137		kstat_install(arc_ksp);
4138	}
4139
4140	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
4141	    TS_RUN, minclsyspri);
4142
4143#ifdef _KERNEL
4144	arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
4145	    EVENTHANDLER_PRI_FIRST);
4146#endif
4147
4148	arc_dead = FALSE;
4149	arc_warm = B_FALSE;
4150
4151	/*
4152	 * Calculate maximum amount of dirty data per pool.
4153	 *
4154	 * If it has been set by /etc/system, take that.
4155	 * Otherwise, use a percentage of physical memory defined by
4156	 * zfs_dirty_data_max_percent (default 10%) with a cap at
4157	 * zfs_dirty_data_max_max (default 4GB).
4158	 */
4159	if (zfs_dirty_data_max == 0) {
4160		zfs_dirty_data_max = ptob(physmem) *
4161		    zfs_dirty_data_max_percent / 100;
4162		zfs_dirty_data_max = MIN(zfs_dirty_data_max,
4163		    zfs_dirty_data_max_max);
4164	}
4165
4166#ifdef _KERNEL
4167	if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable))
4168		prefetch_tunable_set = 1;
4169
4170#ifdef __i386__
4171	if (prefetch_tunable_set == 0) {
4172		printf("ZFS NOTICE: Prefetch is disabled by default on i386 "
4173		    "-- to enable,\n");
4174		printf("            add \"vfs.zfs.prefetch_disable=0\" "
4175		    "to /boot/loader.conf.\n");
4176		zfs_prefetch_disable = 1;
4177	}
4178#else
4179	if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) &&
4180	    prefetch_tunable_set == 0) {
4181		printf("ZFS NOTICE: Prefetch is disabled by default if less "
4182		    "than 4GB of RAM is present;\n"
4183		    "            to enable, add \"vfs.zfs.prefetch_disable=0\" "
4184		    "to /boot/loader.conf.\n");
4185		zfs_prefetch_disable = 1;
4186	}
4187#endif
4188	/* Warn about ZFS memory and address space requirements. */
4189	if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
4190		printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
4191		    "expect unstable behavior.\n");
4192	}
4193	if (kmem_size() < 512 * (1 << 20)) {
4194		printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
4195		    "expect unstable behavior.\n");
4196		printf("             Consider tuning vm.kmem_size and "
4197		    "vm.kmem_size_max\n");
4198		printf("             in /boot/loader.conf.\n");
4199	}
4200#endif
4201}
4202
4203void
4204arc_fini(void)
4205{
4206	int i;
4207
4208	mutex_enter(&arc_reclaim_thr_lock);
4209	arc_thread_exit = 1;
4210	cv_signal(&arc_reclaim_thr_cv);
4211	while (arc_thread_exit != 0)
4212		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
4213	mutex_exit(&arc_reclaim_thr_lock);
4214
4215	arc_flush(NULL);
4216
4217	arc_dead = TRUE;
4218
4219	if (arc_ksp != NULL) {
4220		kstat_delete(arc_ksp);
4221		arc_ksp = NULL;
4222	}
4223
4224	mutex_destroy(&arc_eviction_mtx);
4225	mutex_destroy(&arc_reclaim_thr_lock);
4226	cv_destroy(&arc_reclaim_thr_cv);
4227
4228	for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
4229		list_destroy(&arc_mru->arcs_lists[i]);
4230		list_destroy(&arc_mru_ghost->arcs_lists[i]);
4231		list_destroy(&arc_mfu->arcs_lists[i]);
4232		list_destroy(&arc_mfu_ghost->arcs_lists[i]);
4233		list_destroy(&arc_l2c_only->arcs_lists[i]);
4234
4235		mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock);
4236		mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock);
4237		mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock);
4238		mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock);
4239		mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock);
4240		mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock);
4241	}
4242
4243	buf_fini();
4244
4245	ASSERT(arc_loaned_bytes == 0);
4246
4247	mutex_destroy(&arc_lowmem_lock);
4248#ifdef _KERNEL
4249	if (arc_event_lowmem != NULL)
4250		EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
4251#endif
4252}
4253
4254/*
4255 * Level 2 ARC
4256 *
4257 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
4258 * It uses dedicated storage devices to hold cached data, which are populated
4259 * using large infrequent writes.  The main role of this cache is to boost
4260 * the performance of random read workloads.  The intended L2ARC devices
4261 * include short-stroked disks, solid state disks, and other media with
4262 * substantially faster read latency than disk.
4263 *
4264 *                 +-----------------------+
4265 *                 |         ARC           |
4266 *                 +-----------------------+
4267 *                    |         ^     ^
4268 *                    |         |     |
4269 *      l2arc_feed_thread()    arc_read()
4270 *                    |         |     |
4271 *                    |  l2arc read   |
4272 *                    V         |     |
4273 *               +---------------+    |
4274 *               |     L2ARC     |    |
4275 *               +---------------+    |
4276 *                   |    ^           |
4277 *          l2arc_write() |           |
4278 *                   |    |           |
4279 *                   V    |           |
4280 *                 +-------+      +-------+
4281 *                 | vdev  |      | vdev  |
4282 *                 | cache |      | cache |
4283 *                 +-------+      +-------+
4284 *                 +=========+     .-----.
4285 *                 :  L2ARC  :    |-_____-|
4286 *                 : devices :    | Disks |
4287 *                 +=========+    `-_____-'
4288 *
4289 * Read requests are satisfied from the following sources, in order:
4290 *
4291 *	1) ARC
4292 *	2) vdev cache of L2ARC devices
4293 *	3) L2ARC devices
4294 *	4) vdev cache of disks
4295 *	5) disks
4296 *
4297 * Some L2ARC device types exhibit extremely slow write performance.
4298	 * To accommodate this, there are some significant differences between
4299 * the L2ARC and traditional cache design:
4300 *
4301 * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
4302 * the ARC behave as usual, freeing buffers and placing headers on ghost
4303 * lists.  The ARC does not send buffers to the L2ARC during eviction as
4304 * this would add inflated write latencies for all ARC memory pressure.
4305 *
4306 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
4307 * It does this by periodically scanning buffers from the eviction-end of
4308 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
4309 * not already there. It scans until a headroom of buffers is satisfied,
4310 * which itself is a buffer for ARC eviction. If a compressible buffer is
4311 * found during scanning and selected for writing to an L2ARC device, we
4312 * temporarily boost scanning headroom during the next scan cycle to make
4313 * sure we adapt to compression effects (which might significantly reduce
4314 * the data volume we write to L2ARC). The thread that does this is
4315 * l2arc_feed_thread(), illustrated below; example sizes are included to
4316 * provide a better sense of ratio than this diagram:
4317 *
4318 *	       head -->                        tail
4319 *	        +---------------------+----------+
4320 *	ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
4321 *	        +---------------------+----------+   |   o L2ARC eligible
4322 *	ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
4323 *	        +---------------------+----------+   |
4324 *	             15.9 Gbytes      ^ 32 Mbytes    |
4325 *	                           headroom          |
4326 *	                                      l2arc_feed_thread()
4327 *	                                             |
4328 *	                 l2arc write hand <--[oooo]--'
4329 *	                         |           8 Mbyte
4330 *	                         |          write max
4331 *	                         V
4332 *		  +==============================+
4333 *	L2ARC dev |####|#|###|###|    |####| ... |
4334 *	          +==============================+
4335 *	                     32 Gbytes
4336 *
4337 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
4338 * evicted, then the L2ARC has cached a buffer much sooner than it probably
4339 * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
4340 * safe to say that this is an uncommon case, since buffers at the end of
4341 * the ARC lists have moved there due to inactivity.
4342 *
4343 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
4344 * then the L2ARC simply misses copying some buffers.  This serves as a
4345 * pressure valve to prevent heavy read workloads from both stalling the ARC
4346 * with waits and clogging the L2ARC with writes.  This also helps prevent
4347 * the potential for the L2ARC to churn if it attempts to cache content too
4348 * quickly, such as during backups of the entire pool.
4349 *
4350 * 5. After system boot and before the ARC has filled main memory, there are
4351 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
4352	 * lists can remain mostly static.  Instead of searching from the tail of these
4353 * lists as pictured, the l2arc_feed_thread() will search from the list heads
4354 * for eligible buffers, greatly increasing its chance of finding them.
4355 *
4356 * The L2ARC device write speed is also boosted during this time so that
4357 * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
4358 * there are no L2ARC reads, and no fear of degrading read performance
4359 * through increased writes.
4360 *
4361 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
4362 * the vdev queue can aggregate them into larger and fewer writes.  Each
4363 * device is written to in a rotor fashion, sweeping writes through
4364 * available space then repeating.
4365 *
4366 * 7. The L2ARC does not store dirty content.  It never needs to flush
4367 * write buffers back to disk based storage.
4368 *
4369 * 8. If an ARC buffer is written (and dirtied) which also exists in the
4370 * L2ARC, the now stale L2ARC buffer is immediately dropped.
4371 *
4372 * The performance of the L2ARC can be tweaked by a number of tunables, which
4373 * may be necessary for different workloads:
4374 *
4375 *	l2arc_write_max		max write bytes per interval
4376 *	l2arc_write_boost	extra write bytes during device warmup
4377 *	l2arc_noprefetch	skip caching prefetched buffers
4378 *	l2arc_headroom		number of max device writes to precache
4379 *	l2arc_headroom_boost	when we find compressed buffers during ARC
4380 *				scanning, we multiply headroom by this
4381 *				percentage factor for the next scan cycle,
4382 *				since more compressed buffers are likely to
4383 *				be present
4384 *	l2arc_feed_secs		seconds between L2ARC writing
4385 *
4386 * Tunables may be removed or added as future performance improvements are
4387 * integrated, and also may become zpool properties.
4388 *
4389 * There are three key functions that control how the L2ARC warms up:
4390 *
4391 *	l2arc_write_eligible()	check if a buffer is eligible to cache
4392 *	l2arc_write_size()	calculate how much to write
4393 *	l2arc_write_interval()	calculate sleep delay between writes
4394 *
4395 * These three functions determine what to write, how much, and how quickly
4396	 * to send writes; a rough sketch of how they fit together follows.
4397 */
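
/*
 * A rough sketch (not the actual l2arc_feed_thread() loop, which is defined
 * later in this file) of how the three functions above fit together once
 * l2arc_dev_get_next() has selected a device:
 *
 *	size = l2arc_write_size();
 *	l2arc_evict(dev, size, B_FALSE);
 *	wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
 *	next = l2arc_write_interval(begin, size, wrote);
 *	(sleep until 'next', then repeat)
 */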
4398
4399static boolean_t
4400l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
4401{
4402	/*
4403	 * A buffer is *not* eligible for the L2ARC if it:
4404	 * 1. belongs to a different spa.
4405	 * 2. is already cached on the L2ARC.
4406	 * 3. has an I/O in progress (it may be an incomplete read).
4407	 * 4. is flagged not eligible (zfs property).
4408	 */
4409	if (ab->b_spa != spa_guid) {
4410		ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
4411		return (B_FALSE);
4412	}
4413	if (ab->b_l2hdr != NULL) {
4414		ARCSTAT_BUMP(arcstat_l2_write_in_l2);
4415		return (B_FALSE);
4416	}
4417	if (HDR_IO_IN_PROGRESS(ab)) {
4418		ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
4419		return (B_FALSE);
4420	}
4421	if (!HDR_L2CACHE(ab)) {
4422		ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
4423		return (B_FALSE);
4424	}
4425
4426	return (B_TRUE);
4427}
4428
4429static uint64_t
4430l2arc_write_size(void)
4431{
4432	uint64_t size;
4433
4434	/*
4435	 * Make sure our globals have meaningful values in case the user
4436	 * altered them.
4437	 */
4438	size = l2arc_write_max;
4439	if (size == 0) {
4440		cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
4441		    "be greater than zero, resetting it to the default (%d)",
4442		    L2ARC_WRITE_SIZE);
4443		size = l2arc_write_max = L2ARC_WRITE_SIZE;
4444	}
4445
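	/*
	 * Before the ARC has warmed up (arc_warm is still B_FALSE), boost
	 * the per-interval write size so the cache device fills faster;
	 * see point 5 of the L2ARC comment above.
	 */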
4446	if (arc_warm == B_FALSE)
4447		size += l2arc_write_boost;
4448
4449	return (size);
4450
4451}
4452
4453static clock_t
4454l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
4455{
4456	clock_t interval, next, now;
4457
4458	/*
4459	 * If the ARC lists are busy, increase our write rate; if the
4460	 * lists are stale, idle back.  This is achieved by checking
4461	 * how much we previously wrote - if it was more than half of
4462	 * what we wanted, schedule the next write much sooner.
4463	 */
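	/*
	 * For example, with the defaults defined earlier in this file
	 * (nominally l2arc_feed_secs = 1 and l2arc_feed_min_ms = 200), a
	 * pass that wrote more than half of what it wanted is followed
	 * roughly 200ms after 'began' rather than a full second later.
	 */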
4464	if (l2arc_feed_again && wrote > (wanted / 2))
4465		interval = (hz * l2arc_feed_min_ms) / 1000;
4466	else
4467		interval = hz * l2arc_feed_secs;
4468
4469	now = ddi_get_lbolt();
4470	next = MAX(now, MIN(now + interval, began + interval));
4471
4472	return (next);
4473}
4474
4475static void
4476l2arc_hdr_stat_add(void)
4477{
4478	ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
4479	ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4480}
4481
4482static void
4483l2arc_hdr_stat_remove(void)
4484{
4485	ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
4486	ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4487}
4488
4489/*
4490 * Cycle through L2ARC devices.  This is how L2ARC load balances.
4491 * If a device is returned, this also returns holding the spa config lock.
4492 */
4493static l2arc_dev_t *
4494l2arc_dev_get_next(void)
4495{
4496	l2arc_dev_t *first, *next = NULL;
4497
4498	/*
4499	 * Lock out the removal of spas (spa_namespace_lock), then removal
4500	 * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
4501	 * both locks will be dropped and a spa config lock held instead.
4502	 */
4503	mutex_enter(&spa_namespace_lock);
4504	mutex_enter(&l2arc_dev_mtx);
4505
4506	/* if there are no vdevs, there is nothing to do */
4507	if (l2arc_ndev == 0)
4508		goto out;
4509
4510	first = NULL;
4511	next = l2arc_dev_last;
4512	do {
4513		/* loop around the list looking for a non-faulted vdev */
4514		if (next == NULL) {
4515			next = list_head(l2arc_dev_list);
4516		} else {
4517			next = list_next(l2arc_dev_list, next);
4518			if (next == NULL)
4519				next = list_head(l2arc_dev_list);
4520		}
4521
4522		/* if we have come back to the start, bail out */
4523		if (first == NULL)
4524			first = next;
4525		else if (next == first)
4526			break;
4527
4528	} while (vdev_is_dead(next->l2ad_vdev));
4529
4530	/* if we were unable to find any usable vdevs, return NULL */
4531	if (vdev_is_dead(next->l2ad_vdev))
4532		next = NULL;
4533
4534	l2arc_dev_last = next;
4535
4536out:
4537	mutex_exit(&l2arc_dev_mtx);
4538
4539	/*
4540	 * Grab the config lock to prevent the 'next' device from being
4541	 * removed while we are writing to it.
4542	 */
4543	if (next != NULL)
4544		spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4545	mutex_exit(&spa_namespace_lock);
4546
4547	return (next);
4548}
4549
4550/*
4551 * Free buffers that were tagged for destruction.
4552 */
4553static void
4554l2arc_do_free_on_write()
4555{
4556	list_t *buflist;
4557	l2arc_data_free_t *df, *df_prev;
4558
4559	mutex_enter(&l2arc_free_on_write_mtx);
4560	buflist = l2arc_free_on_write;
4561
4562	for (df = list_tail(buflist); df; df = df_prev) {
4563		df_prev = list_prev(buflist, df);
4564		ASSERT(df->l2df_data != NULL);
4565		ASSERT(df->l2df_func != NULL);
4566		df->l2df_func(df->l2df_data, df->l2df_size);
4567		list_remove(buflist, df);
4568		kmem_free(df, sizeof (l2arc_data_free_t));
4569	}
4570
4571	mutex_exit(&l2arc_free_on_write_mtx);
4572}
4573
4574/*
4575 * A write to a cache device has completed.  Update all headers to allow
4576 * reads from these buffers to begin.
4577 */
4578static void
4579l2arc_write_done(zio_t *zio)
4580{
4581	l2arc_write_callback_t *cb;
4582	l2arc_dev_t *dev;
4583	list_t *buflist;
4584	arc_buf_hdr_t *head, *ab, *ab_prev;
4585	l2arc_buf_hdr_t *abl2;
4586	kmutex_t *hash_lock;
4587	int64_t bytes_dropped = 0;
4588
4589	cb = zio->io_private;
4590	ASSERT(cb != NULL);
4591	dev = cb->l2wcb_dev;
4592	ASSERT(dev != NULL);
4593	head = cb->l2wcb_head;
4594	ASSERT(head != NULL);
4595	buflist = dev->l2ad_buflist;
4596	ASSERT(buflist != NULL);
4597	DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4598	    l2arc_write_callback_t *, cb);
4599
4600	if (zio->io_error != 0)
4601		ARCSTAT_BUMP(arcstat_l2_writes_error);
4602
4603	mutex_enter(&l2arc_buflist_mtx);
4604
4605	/*
4606	 * All writes completed, or an error was hit.
4607	 */
4608	for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4609		ab_prev = list_prev(buflist, ab);
4610		abl2 = ab->b_l2hdr;
4611
4612		/*
4613		 * Release the temporary compressed buffer as soon as possible.
4614		 */
4615		if (abl2->b_compress != ZIO_COMPRESS_OFF)
4616			l2arc_release_cdata_buf(ab);
4617
4618		hash_lock = HDR_LOCK(ab);
4619		if (!mutex_tryenter(hash_lock)) {
4620			/*
4621			 * This buffer misses out.  It may be in a stage
4622			 * of eviction.  Its ARC_L2_WRITING flag will be
4623			 * left set, denying reads to this buffer.
4624			 */
4625			ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4626			continue;
4627		}
4628
4629		if (zio->io_error != 0) {
4630			/*
4631			 * Error - drop L2ARC entry.
4632			 */
4633			list_remove(buflist, ab);
4634			ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4635			bytes_dropped += abl2->b_asize;
4636			ab->b_l2hdr = NULL;
4637			trim_map_free(abl2->b_dev->l2ad_vdev, abl2->b_daddr,
4638			    ab->b_size, 0);
4639			kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4640			ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4641		}
4642
4643		/*
4644		 * Allow ARC to begin reads to this L2ARC entry.
4645		 */
4646		ab->b_flags &= ~ARC_L2_WRITING;
4647
4648		mutex_exit(hash_lock);
4649	}
4650
4651	atomic_inc_64(&l2arc_writes_done);
4652	list_remove(buflist, head);
4653	kmem_cache_free(hdr_cache, head);
4654	mutex_exit(&l2arc_buflist_mtx);
4655
4656	vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
4657
4658	l2arc_do_free_on_write();
4659
4660	kmem_free(cb, sizeof (l2arc_write_callback_t));
4661}
4662
4663/*
4664 * A read to a cache device completed.  Validate buffer contents before
4665 * handing over to the regular ARC routines.
4666 */
4667static void
4668l2arc_read_done(zio_t *zio)
4669{
4670	l2arc_read_callback_t *cb;
4671	arc_buf_hdr_t *hdr;
4672	arc_buf_t *buf;
4673	kmutex_t *hash_lock;
4674	int equal;
4675
4676	ASSERT(zio->io_vd != NULL);
4677	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4678
4679	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4680
4681	cb = zio->io_private;
4682	ASSERT(cb != NULL);
4683	buf = cb->l2rcb_buf;
4684	ASSERT(buf != NULL);
4685
4686	hash_lock = HDR_LOCK(buf->b_hdr);
4687	mutex_enter(hash_lock);
4688	hdr = buf->b_hdr;
4689	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4690
4691	/*
4692	 * If the buffer was compressed, decompress it first.
4693	 */
4694	if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
4695		l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
4696	ASSERT(zio->io_data != NULL);
4697
4698	/*
4699	 * Check this survived the L2ARC journey.
4700	 */
4701	equal = arc_cksum_equal(buf);
4702	if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4703		mutex_exit(hash_lock);
4704		zio->io_private = buf;
4705		zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
4706		zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
4707		arc_read_done(zio);
4708	} else {
4709		mutex_exit(hash_lock);
4710		/*
4711		 * Buffer didn't survive caching.  Increment stats and
4712		 * reissue to the original storage device.
4713		 */
4714		if (zio->io_error != 0) {
4715			ARCSTAT_BUMP(arcstat_l2_io_error);
4716		} else {
4717			zio->io_error = SET_ERROR(EIO);
4718		}
4719		if (!equal)
4720			ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4721
4722		/*
4723		 * If there's no waiter, issue an async i/o to the primary
4724		 * storage now.  If there *is* a waiter, the caller must
4725		 * issue the i/o in a context where it's OK to block.
4726		 */
4727		if (zio->io_waiter == NULL) {
4728			zio_t *pio = zio_unique_parent(zio);
4729
4730			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4731
4732			zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
4733			    buf->b_data, zio->io_size, arc_read_done, buf,
4734			    zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4735		}
4736	}
4737
4738	kmem_free(cb, sizeof (l2arc_read_callback_t));
4739}
4740
4741/*
4742 * This is the list priority from which the L2ARC will search for pages to
4743	 * cache.  This is used within loops (0 .. 2 * ARC_BUFC_NUMLISTS - 1) to
4744	 * cycle through the lists in the desired order.  This order can have a
4745	 * significant effect on cache performance.
4746 *
4747 * Currently the metadata lists are hit first, MFU then MRU, followed by
4748 * the data lists.  This function returns a locked list, and also returns
4749 * the lock pointer.
4750 */
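/*
 * With M = ARC_BUFC_NUMMETADATALISTS and D = ARC_BUFC_NUMDATALISTS, the
 * mapping implemented below is:
 *
 *	[0, M)			MFU metadata lists
 *	[M, 2M)			MRU metadata lists
 *	[2M, 2M + D)		MFU data lists
 *	[2M + D, 2M + 2D)	MRU data lists
 *
 * Data lists occupy indices [M, M + D) of arcs_lists[], which is why the
 * MFU data case subtracts only M while the MRU data case subtracts
 * ARC_BUFC_NUMLISTS (= M + D).
 */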
4751static list_t *
4752l2arc_list_locked(int list_num, kmutex_t **lock)
4753{
4754	list_t *list = NULL;
4755	int idx;
4756
4757	ASSERT(list_num >= 0 && list_num < 2 * ARC_BUFC_NUMLISTS);
4758
4759	if (list_num < ARC_BUFC_NUMMETADATALISTS) {
4760		idx = list_num;
4761		list = &arc_mfu->arcs_lists[idx];
4762		*lock = ARCS_LOCK(arc_mfu, idx);
4763	} else if (list_num < ARC_BUFC_NUMMETADATALISTS * 2) {
4764		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
4765		list = &arc_mru->arcs_lists[idx];
4766		*lock = ARCS_LOCK(arc_mru, idx);
4767	} else if (list_num < (ARC_BUFC_NUMMETADATALISTS * 2 +
4768	    ARC_BUFC_NUMDATALISTS)) {
4769		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
4770		list = &arc_mfu->arcs_lists[idx];
4771		*lock = ARCS_LOCK(arc_mfu, idx);
4772	} else {
4773		idx = list_num - ARC_BUFC_NUMLISTS;
4774		list = &arc_mru->arcs_lists[idx];
4775		*lock = ARCS_LOCK(arc_mru, idx);
4776	}
4777
4778	ASSERT(!(MUTEX_HELD(*lock)));
4779	mutex_enter(*lock);
4780	return (list);
4781}
4782
4783/*
4784 * Evict buffers from the device write hand to the distance specified in
4785	 * bytes.  This distance may span populated buffers, or it may span nothing.
4786 * This is clearing a region on the L2ARC device ready for writing.
4787 * If the 'all' boolean is set, every buffer is evicted.
4788 */
4789static void
4790l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4791{
4792	list_t *buflist;
4793	l2arc_buf_hdr_t *abl2;
4794	arc_buf_hdr_t *ab, *ab_prev;
4795	kmutex_t *hash_lock;
4796	uint64_t taddr;
4797	int64_t bytes_evicted = 0;
4798
4799	buflist = dev->l2ad_buflist;
4800
4801	if (buflist == NULL)
4802		return;
4803
4804	if (!all && dev->l2ad_first) {
4805		/*
4806		 * This is the first sweep through the device.  There is
4807		 * nothing to evict.
4808		 */
4809		return;
4810	}
4811
4812	if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4813		/*
4814		 * When nearing the end of the device, evict to the end
4815		 * before the device write hand jumps to the start.
4816		 */
4817		taddr = dev->l2ad_end;
4818	} else {
4819		taddr = dev->l2ad_hand + distance;
4820	}
4821	DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4822	    uint64_t, taddr, boolean_t, all);
4823
4824top:
4825	mutex_enter(&l2arc_buflist_mtx);
4826	for (ab = list_tail(buflist); ab; ab = ab_prev) {
4827		ab_prev = list_prev(buflist, ab);
4828
4829		hash_lock = HDR_LOCK(ab);
4830		if (!mutex_tryenter(hash_lock)) {
4831			/*
4832			 * Missed the hash lock.  Retry.
4833			 */
4834			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4835			mutex_exit(&l2arc_buflist_mtx);
4836			mutex_enter(hash_lock);
4837			mutex_exit(hash_lock);
4838			goto top;
4839		}
4840
4841		if (HDR_L2_WRITE_HEAD(ab)) {
4842			/*
4843			 * We hit a write head node.  Leave it for
4844			 * l2arc_write_done().
4845			 */
4846			list_remove(buflist, ab);
4847			mutex_exit(hash_lock);
4848			continue;
4849		}
4850
4851		if (!all && ab->b_l2hdr != NULL &&
4852		    (ab->b_l2hdr->b_daddr > taddr ||
4853		    ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4854			/*
4855			 * We've evicted to the target address,
4856			 * or the end of the device.
4857			 */
4858			mutex_exit(hash_lock);
4859			break;
4860		}
4861
4862		if (HDR_FREE_IN_PROGRESS(ab)) {
4863			/*
4864			 * Already on the path to destruction.
4865			 */
4866			mutex_exit(hash_lock);
4867			continue;
4868		}
4869
4870		if (ab->b_state == arc_l2c_only) {
4871			ASSERT(!HDR_L2_READING(ab));
4872			/*
4873			 * This doesn't exist in the ARC.  Destroy.
4874			 * arc_hdr_destroy() will call list_remove()
4875			 * and decrement arcstat_l2_size.
4876			 */
4877			arc_change_state(arc_anon, ab, hash_lock);
4878			arc_hdr_destroy(ab);
4879		} else {
4880			/*
4881			 * Invalidate issued or about to be issued
4882			 * reads, since we may be about to write
4883			 * over this location.
4884			 */
4885			if (HDR_L2_READING(ab)) {
4886				ARCSTAT_BUMP(arcstat_l2_evict_reading);
4887				ab->b_flags |= ARC_L2_EVICTED;
4888			}
4889
4890			/*
4891			 * Tell ARC this no longer exists in L2ARC.
4892			 */
4893			if (ab->b_l2hdr != NULL) {
4894				abl2 = ab->b_l2hdr;
4895				ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4896				bytes_evicted += abl2->b_asize;
4897				ab->b_l2hdr = NULL;
4898				kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4899				ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4900			}
4901			list_remove(buflist, ab);
4902
4903			/*
4904			 * This may have been leftover after a
4905			 * failed write.
4906			 */
4907			ab->b_flags &= ~ARC_L2_WRITING;
4908		}
4909		mutex_exit(hash_lock);
4910	}
4911	mutex_exit(&l2arc_buflist_mtx);
4912
4913	vdev_space_update(dev->l2ad_vdev, -bytes_evicted, 0, 0);
4914	dev->l2ad_evict = taddr;
4915}
4916
4917/*
4918 * Find and write ARC buffers to the L2ARC device.
4919 *
4920 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4921 * for reading until they have completed writing.
4922 * The headroom_boost is an in-out parameter used to maintain headroom boost
4923 * state between calls to this function.
4924 *
4925 * Returns the number of bytes actually written (which may be smaller than
4926 * the delta by which the device hand has changed due to alignment).
4927 */
4928static uint64_t
4929l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
4930    boolean_t *headroom_boost)
4931{
4932	arc_buf_hdr_t *ab, *ab_prev, *head;
4933	list_t *list;
4934	uint64_t write_asize, write_psize, write_sz, headroom,
4935	    buf_compress_minsz;
4936	void *buf_data;
4937	kmutex_t *list_lock;
4938	boolean_t full;
4939	l2arc_write_callback_t *cb;
4940	zio_t *pio, *wzio;
4941	uint64_t guid = spa_load_guid(spa);
4942	const boolean_t do_headroom_boost = *headroom_boost;
4943	int try;
4944
4945	ASSERT(dev->l2ad_vdev != NULL);
4946
4947	/* Lower the flag now, we might want to raise it again later. */
4948	*headroom_boost = B_FALSE;
4949
4950	pio = NULL;
4951	write_sz = write_asize = write_psize = 0;
4952	full = B_FALSE;
4953	head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4954	head->b_flags |= ARC_L2_WRITE_HEAD;
4955
4956	ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
4957	/*
4958	 * We will want to try to compress buffers that are at least 2x the
4959	 * device sector size.
4960	 */
4961	buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
4962
4963	/*
4964	 * Copy buffers for L2ARC writing.
4965	 */
4966	mutex_enter(&l2arc_buflist_mtx);
4967	for (try = 0; try < 2 * ARC_BUFC_NUMLISTS; try++) {
4968		uint64_t passed_sz = 0;
4969
4970		list = l2arc_list_locked(try, &list_lock);
4971		ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);
4972
4973		/*
4974		 * L2ARC fast warmup.
4975		 *
4976		 * Until the ARC is warm and starts to evict, read from the
4977		 * head of the ARC lists rather than the tail.
4978		 */
4979		if (arc_warm == B_FALSE)
4980			ab = list_head(list);
4981		else
4982			ab = list_tail(list);
4983		if (ab == NULL)
4984			ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);
4985
4986		headroom = target_sz * l2arc_headroom;
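		/*
		 * Limit how far past the tail we scan: a multiple
		 * (l2arc_headroom) of the target write size, further scaled
		 * by l2arc_headroom_boost percent when the previous pass
		 * found compressible buffers.
		 */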
4987		if (do_headroom_boost)
4988			headroom = (headroom * l2arc_headroom_boost) / 100;
4989
4990		for (; ab; ab = ab_prev) {
4991			l2arc_buf_hdr_t *l2hdr;
4992			kmutex_t *hash_lock;
4993			uint64_t buf_sz;
4994
4995			if (arc_warm == B_FALSE)
4996				ab_prev = list_next(list, ab);
4997			else
4998				ab_prev = list_prev(list, ab);
4999			ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, ab->b_size);
5000
5001			hash_lock = HDR_LOCK(ab);
5002			if (!mutex_tryenter(hash_lock)) {
5003				ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
5004				/*
5005				 * Skip this buffer rather than waiting.
5006				 */
5007				continue;
5008			}
5009
5010			passed_sz += ab->b_size;
5011			if (passed_sz > headroom) {
5012				/*
5013				 * Searched too far.
5014				 */
5015				mutex_exit(hash_lock);
5016				ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
5017				break;
5018			}
5019
5020			if (!l2arc_write_eligible(guid, ab)) {
5021				mutex_exit(hash_lock);
5022				continue;
5023			}
5024
5025			if ((write_sz + ab->b_size) > target_sz) {
5026				full = B_TRUE;
5027				mutex_exit(hash_lock);
5028				ARCSTAT_BUMP(arcstat_l2_write_full);
5029				break;
5030			}
5031
5032			if (pio == NULL) {
5033				/*
5034				 * Insert a dummy header on the buflist so
5035				 * l2arc_write_done() can find where the
5036				 * write buffers begin without searching.
5037				 */
5038				list_insert_head(dev->l2ad_buflist, head);
5039
5040				cb = kmem_alloc(
5041				    sizeof (l2arc_write_callback_t), KM_SLEEP);
5042				cb->l2wcb_dev = dev;
5043				cb->l2wcb_head = head;
5044				pio = zio_root(spa, l2arc_write_done, cb,
5045				    ZIO_FLAG_CANFAIL);
5046				ARCSTAT_BUMP(arcstat_l2_write_pios);
5047			}
5048
5049			/*
5050			 * Create and add a new L2ARC header.
5051			 */
5052			l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
5053			l2hdr->b_dev = dev;
5054			ab->b_flags |= ARC_L2_WRITING;
5055
5056			/*
5057			 * Temporarily stash the data buffer in b_tmp_cdata.
5058			 * The subsequent write step will pick it up from
5059			 * there. This is because we can't access ab->b_buf
5060			 * without holding the hash_lock, which we in turn
5061			 * can't access without holding the ARC list locks
5062			 * (which we want to avoid during compression/writing).
5063			 */
5064			l2hdr->b_compress = ZIO_COMPRESS_OFF;
5065			l2hdr->b_asize = ab->b_size;
5066			l2hdr->b_tmp_cdata = ab->b_buf->b_data;
5067
5068			buf_sz = ab->b_size;
5069			ab->b_l2hdr = l2hdr;
5070
5071			list_insert_head(dev->l2ad_buflist, ab);
5072
5073			/*
5074			 * Compute and store the buffer cksum before
5075			 * writing.  On debug the cksum is verified first.
5076			 */
5077			arc_cksum_verify(ab->b_buf);
5078			arc_cksum_compute(ab->b_buf, B_TRUE);
5079
5080			mutex_exit(hash_lock);
5081
5082			write_sz += buf_sz;
5083		}
5084
5085		mutex_exit(list_lock);
5086
5087		if (full == B_TRUE)
5088			break;
5089	}
5090
5091	/* No buffers selected for writing? */
5092	if (pio == NULL) {
5093		ASSERT0(write_sz);
5094		mutex_exit(&l2arc_buflist_mtx);
5095		kmem_cache_free(hdr_cache, head);
5096		return (0);
5097	}
5098
5099	/*
5100	 * Now start writing the buffers.  We start at the write head
5101	 * and work backwards, retracing the course of the buffer selector
5102	 * loop above.
5103	 */
5104	for (ab = list_prev(dev->l2ad_buflist, head); ab;
5105	    ab = list_prev(dev->l2ad_buflist, ab)) {
5106		l2arc_buf_hdr_t *l2hdr;
5107		uint64_t buf_sz;
5108
5109		/*
5110		 * We shouldn't need to lock the buffer here, since we flagged
5111		 * it as ARC_L2_WRITING in the previous step, but we must take
5112		 * care to only access its L2 cache parameters. In particular,
5113		 * ab->b_buf may be invalid by now due to ARC eviction.
5114		 */
5115		l2hdr = ab->b_l2hdr;
5116		l2hdr->b_daddr = dev->l2ad_hand;
5117
5118		if ((ab->b_flags & ARC_L2COMPRESS) &&
5119		    l2hdr->b_asize >= buf_compress_minsz) {
5120			if (l2arc_compress_buf(l2hdr)) {
5121				/*
5122				 * If compression succeeded, enable headroom
5123				 * boost on the next scan cycle.
5124				 */
5125				*headroom_boost = B_TRUE;
5126			}
5127		}
5128
5129		/*
5130		 * Pick up the buffer data we had previously stashed away
5131		 * (and now potentially also compressed).
5132		 */
5133		buf_data = l2hdr->b_tmp_cdata;
5134		buf_sz = l2hdr->b_asize;
5135
5136		/* Compression may have squashed the buffer to zero length. */
5137		if (buf_sz != 0) {
5138			uint64_t buf_p_sz;
5139
5140			wzio = zio_write_phys(pio, dev->l2ad_vdev,
5141			    dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
5142			    NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
5143			    ZIO_FLAG_CANFAIL, B_FALSE);
5144
5145			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
5146			    zio_t *, wzio);
5147			(void) zio_nowait(wzio);
5148
5149			write_asize += buf_sz;
5150			/*
5151			 * Keep the clock hand suitably device-aligned.
5152			 */
5153			buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
5154			write_psize += buf_p_sz;
5155			dev->l2ad_hand += buf_p_sz;
5156		}
5157	}
5158
5159	mutex_exit(&l2arc_buflist_mtx);
5160
5161	ASSERT3U(write_asize, <=, target_sz);
5162	ARCSTAT_BUMP(arcstat_l2_writes_sent);
5163	ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
5164	ARCSTAT_INCR(arcstat_l2_size, write_sz);
5165	ARCSTAT_INCR(arcstat_l2_asize, write_asize);
5166	vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0);
5167
5168	/*
5169	 * Bump device hand to the device start if it is approaching the end.
5170	 * l2arc_evict() will already have evicted ahead for this case.
5171	 */
5172	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
5173		dev->l2ad_hand = dev->l2ad_start;
5174		dev->l2ad_evict = dev->l2ad_start;
5175		dev->l2ad_first = B_FALSE;
5176	}
5177
5178	dev->l2ad_writing = B_TRUE;
5179	(void) zio_wait(pio);
5180	dev->l2ad_writing = B_FALSE;
5181
5182	return (write_asize);
5183}
5184
5185/*
5186 * Compresses an L2ARC buffer.
5187 * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
5188 * size in l2hdr->b_asize. This routine tries to compress the data and
5189 * depending on the compression result there are three possible outcomes:
5190 * *) The buffer was incompressible. The original l2hdr contents were left
5191 *    untouched and are ready for writing to an L2 device.
5192 * *) The buffer was all-zeros, so there is no need to write it to an L2
5193	 *    device. To indicate this situation, b_tmp_cdata is NULL'ed, b_asize is
5194 *    set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
5195 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
5196 *    data buffer which holds the compressed data to be written, and b_asize
5197 *    tells us how much data there is. b_compress is set to the appropriate
5198 *    compression algorithm. Once writing is done, invoke
5199 *    l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
5200 *
5201 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
5202 * buffer was incompressible).
5203 */
5204static boolean_t
5205l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
5206{
5207	void *cdata;
5208	size_t csize, len, rounded;
5209
5210	ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
5211	ASSERT(l2hdr->b_tmp_cdata != NULL);
5212
5213	len = l2hdr->b_asize;
5214	cdata = zio_data_buf_alloc(len);
5215	csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
5216	    cdata, l2hdr->b_asize);
5217
5218	rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE);
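	/*
	 * Pad the compressed data out to a multiple of SPA_MINBLOCKSIZE
	 * (512 bytes) and zero the padding so that the amount written to
	 * the device is sector-aligned.
	 */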
5219	if (rounded > csize) {
5220		bzero((char *)cdata + csize, rounded - csize);
5221		csize = rounded;
5222	}
5223
5224	if (csize == 0) {
5225		/* zero block, indicate that there's nothing to write */
5226		zio_data_buf_free(cdata, len);
5227		l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
5228		l2hdr->b_asize = 0;
5229		l2hdr->b_tmp_cdata = NULL;
5230		ARCSTAT_BUMP(arcstat_l2_compress_zeros);
5231		return (B_TRUE);
5232	} else if (csize > 0 && csize < len) {
5233		/*
5234		 * Compression succeeded, we'll keep the cdata around for
5235		 * writing and release it afterwards.
5236		 */
5237		l2hdr->b_compress = ZIO_COMPRESS_LZ4;
5238		l2hdr->b_asize = csize;
5239		l2hdr->b_tmp_cdata = cdata;
5240		ARCSTAT_BUMP(arcstat_l2_compress_successes);
5241		return (B_TRUE);
5242	} else {
5243		/*
5244		 * Compression failed, release the compressed buffer.
5245		 * l2hdr will be left unmodified.
5246		 */
5247		zio_data_buf_free(cdata, len);
5248		ARCSTAT_BUMP(arcstat_l2_compress_failures);
5249		return (B_FALSE);
5250	}
5251}
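
/*
 * The function above has three possible results: an all-zero (empty)
 * buffer, a successfully compressed buffer, or an incompressible one.
 * Rounding before the comparison means that a block whose compressed form
 * still occupies as many minimum-size units as the original is treated as
 * incompressible, since nothing would be saved on the device.  The
 * guarded-out sketch below isolates that classification, assuming some
 * compressor already produced csize bytes from a len-byte input;
 * SKETCH_MINBLOCK stands in for SPA_MINBLOCKSIZE, and sketch_outcome_t
 * and sketch_classify() are hypothetical names.
 */
#if 0
#include <stddef.h>
#include <stdio.h>

#define	SKETCH_MINBLOCK	512

typedef enum sketch_outcome {
	SKETCH_EMPTY,		/* all-zero input, nothing to write */
	SKETCH_COMPRESSED,	/* keep the compressed copy for the write */
	SKETCH_INCOMPRESSIBLE	/* write the original bytes instead */
} sketch_outcome_t;

static sketch_outcome_t
sketch_classify(size_t csize, size_t len)
{
	/* Round the compressed size up to the minimum block size. */
	size_t rounded = (csize + SKETCH_MINBLOCK - 1) &
	    ~((size_t)SKETCH_MINBLOCK - 1);

	if (csize == 0)
		return (SKETCH_EMPTY);
	if (rounded < len)
		return (SKETCH_COMPRESSED);
	return (SKETCH_INCOMPRESSIBLE);
}

int
main(void)
{
	/* Prints "0 1 2": empty, compressed, incompressible. */
	printf("%d %d %d\n",
	    (int)sketch_classify(0, 8192),
	    (int)sketch_classify(1000, 8192),
	    (int)sketch_classify(8000, 8192));
	return (0);
}
#endif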
5252
5253/*
5254 * Decompresses a zio read back from an l2arc device. On success, the
5255 * underlying zio's io_data buffer is overwritten by the uncompressed
5256 * version. On decompression error (corrupt compressed stream), the
5257 * zio->io_error value is set to signal an I/O error.
5258 *
5259 * Note that the compressed data stream is not checksummed, so if the
5260 * underlying device is experiencing data corruption, we may feed corrupt
5261 * data to the decompressor; the decompressor therefore needs to be able
5262 * to handle this situation (LZ4 does).
5263 */
5264static void
5265l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
5266{
5267	ASSERT(L2ARC_IS_VALID_COMPRESS(c));
5268
5269	if (zio->io_error != 0) {
5270		/*
5271		 * An I/O error has occurred; just restore the original I/O
5272		 * size in preparation for a main pool read.
5273		 */
5274		zio->io_orig_size = zio->io_size = hdr->b_size;
5275		return;
5276	}
5277
5278	if (c == ZIO_COMPRESS_EMPTY) {
5279		/*
5280		 * An empty buffer results in a null zio, which means we
5281		 * need to fill its io_data after we're done restoring the
5282		 * buffer's contents.
5283		 */
5284		ASSERT(hdr->b_buf != NULL);
5285		bzero(hdr->b_buf->b_data, hdr->b_size);
5286		zio->io_data = zio->io_orig_data = hdr->b_buf->b_data;
5287	} else {
5288		ASSERT(zio->io_data != NULL);
5289		/*
5290		 * We copy the compressed data from the start of the arc buffer
5291		 * (the zio_read will have pulled in only what we need, the
5292		 * rest is garbage which we will overwrite at decompression)
5293		 * and then decompress back to the ARC data buffer. This way we
5294		 * can minimize copying by simply decompressing back over the
5295		 * original compressed data (rather than decompressing to an
5296		 * aux buffer and then copying back the uncompressed buffer,
5297		 * which is likely to be much larger).
5298		 */
5299		uint64_t csize;
5300		void *cdata;
5301
5302		csize = zio->io_size;
5303		cdata = zio_data_buf_alloc(csize);
5304		bcopy(zio->io_data, cdata, csize);
5305		if (zio_decompress_data(c, cdata, zio->io_data, csize,
5306		    hdr->b_size) != 0)
5307			zio->io_error = EIO;
5308		zio_data_buf_free(cdata, csize);
5309	}
5310
5311	/* Restore the expected uncompressed IO size. */
5312	zio->io_orig_size = zio->io_size = hdr->b_size;
5313}
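
/*
 * The decompression path above copies the compressed bytes aside and then
 * expands them back over the same (uncompressed-sized) buffer, trading a
 * small scratch allocation for avoiding a second full-size copy.  The
 * guarded-out sketch below shows only that memory-management pattern; the
 * trivial "codec" (each input byte expands to two output bytes) is purely
 * a stand-in for LZ4, and all sketch_* names are hypothetical.
 */
#if 0
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Toy decompressor: expand each source byte into two destination bytes. */
static void
sketch_decompress(const unsigned char *src, size_t csize, unsigned char *dst)
{
	for (size_t i = 0; i < csize; i++) {
		dst[2 * i] = src[i];
		dst[2 * i + 1] = src[i];
	}
}

int
main(void)
{
	size_t csize = 4, usize = 8;
	/* Buffer sized for the uncompressed data, holding compressed bytes. */
	unsigned char buf[8] = { 'a', 'b', 'c', 'd' };
	unsigned char *scratch = malloc(csize);

	if (scratch == NULL)
		return (1);
	memcpy(scratch, buf, csize);		/* copy the compressed prefix aside */
	sketch_decompress(scratch, csize, buf);	/* expand back over the buffer */
	free(scratch);

	fwrite(buf, 1, usize, stdout);		/* prints "aabbccdd" */
	putchar('\n');
	return (0);
}
#endif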
5314
5315/*
5316 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
5317 * This buffer serves as a temporary holder of compressed data while
5318 * the buffer entry is being written to an l2arc device. Once that is
5319 * done, we can dispose of it.
5320 */
5321static void
5322l2arc_release_cdata_buf(arc_buf_hdr_t *ab)
5323{
5324	l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
5325
5326	if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) {
5327		/*
5328		 * If the data was compressed, then we've allocated a
5329		 * temporary buffer for it, so now we need to release it.
5330		 */
5331		ASSERT(l2hdr->b_tmp_cdata != NULL);
5332		zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size);
5333	}
5334	l2hdr->b_tmp_cdata = NULL;
5335}
5336
5337/*
5338 * This thread feeds the L2ARC at regular intervals.  This is the beating
5339 * heart of the L2ARC.
5340 */
5341static void
5342l2arc_feed_thread(void *dummy __unused)
5343{
5344	callb_cpr_t cpr;
5345	l2arc_dev_t *dev;
5346	spa_t *spa;
5347	uint64_t size, wrote;
5348	clock_t begin, next = ddi_get_lbolt();
5349	boolean_t headroom_boost = B_FALSE;
5350
5351	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
5352
5353	mutex_enter(&l2arc_feed_thr_lock);
5354
5355	while (l2arc_thread_exit == 0) {
5356		CALLB_CPR_SAFE_BEGIN(&cpr);
5357		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
5358		    next - ddi_get_lbolt());
5359		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
5360		next = ddi_get_lbolt() + hz;
5361
5362		/*
5363		 * Quick check for L2ARC devices.
5364		 */
5365		mutex_enter(&l2arc_dev_mtx);
5366		if (l2arc_ndev == 0) {
5367			mutex_exit(&l2arc_dev_mtx);
5368			continue;
5369		}
5370		mutex_exit(&l2arc_dev_mtx);
5371		begin = ddi_get_lbolt();
5372
5373		/*
5374		 * This selects the next l2arc device to write to, and in
5375		 * doing so the next spa to feed from: dev->l2ad_spa.   This
5376		 * will return NULL if there are now no l2arc devices or if
5377		 * they are all faulted.
5378		 *
5379		 * If a device is returned, its spa's config lock is also
5380		 * held to prevent device removal.  l2arc_dev_get_next()
5381		 * will grab and release l2arc_dev_mtx.
5382		 */
5383		if ((dev = l2arc_dev_get_next()) == NULL)
5384			continue;
5385
5386		spa = dev->l2ad_spa;
5387		ASSERT(spa != NULL);
5388
5389		/*
5390		 * If the pool is read-only then force the feed thread to
5391		 * sleep a little longer.
5392		 */
5393		if (!spa_writeable(spa)) {
5394			next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
5395			spa_config_exit(spa, SCL_L2ARC, dev);
5396			continue;
5397		}
5398
5399		/*
5400		 * Avoid contributing to memory pressure.
5401		 */
5402		if (arc_reclaim_needed()) {
5403			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
5404			spa_config_exit(spa, SCL_L2ARC, dev);
5405			continue;
5406		}
5407
5408		ARCSTAT_BUMP(arcstat_l2_feeds);
5409
5410		size = l2arc_write_size();
5411
5412		/*
5413		 * Evict L2ARC buffers that will be overwritten.
5414		 */
5415		l2arc_evict(dev, size, B_FALSE);
5416
5417		/*
5418		 * Write ARC buffers.
5419		 */
5420		wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
5421
5422		/*
5423		 * Calculate interval between writes.
5424		 */
5425		next = l2arc_write_interval(begin, size, wrote);
5426		spa_config_exit(spa, SCL_L2ARC, dev);
5427	}
5428
5429	l2arc_thread_exit = 0;
5430	cv_broadcast(&l2arc_feed_thr_cv);
5431	CALLB_CPR_EXIT(&cpr);		/* drops l2arc_feed_thr_lock */
5432	thread_exit();
5433}
5434
5435boolean_t
5436l2arc_vdev_present(vdev_t *vd)
5437{
5438	l2arc_dev_t *dev;
5439
5440	mutex_enter(&l2arc_dev_mtx);
5441	for (dev = list_head(l2arc_dev_list); dev != NULL;
5442	    dev = list_next(l2arc_dev_list, dev)) {
5443		if (dev->l2ad_vdev == vd)
5444			break;
5445	}
5446	mutex_exit(&l2arc_dev_mtx);
5447
5448	return (dev != NULL);
5449}
5450
5451/*
5452 * Add a vdev for use by the L2ARC.  By this point the spa has already
5453 * validated the vdev and opened it.
5454 */
5455void
5456l2arc_add_vdev(spa_t *spa, vdev_t *vd)
5457{
5458	l2arc_dev_t *adddev;
5459
5460	ASSERT(!l2arc_vdev_present(vd));
5461
5462	vdev_ashift_optimize(vd);
5463
5464	/*
5465	 * Create a new l2arc device entry.
5466	 */
5467	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
5468	adddev->l2ad_spa = spa;
5469	adddev->l2ad_vdev = vd;
5470	adddev->l2ad_start = VDEV_LABEL_START_SIZE;
5471	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
5472	adddev->l2ad_hand = adddev->l2ad_start;
5473	adddev->l2ad_evict = adddev->l2ad_start;
5474	adddev->l2ad_first = B_TRUE;
5475	adddev->l2ad_writing = B_FALSE;
5476
5477	/*
5478	 * This is a list of all ARC buffers that are still valid on the
5479	 * device.
5480	 */
5481	adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
5482	list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
5483	    offsetof(arc_buf_hdr_t, b_l2node));
5484
5485	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
5486
5487	/*
5488	 * Add device to global list
5489	 */
5490	mutex_enter(&l2arc_dev_mtx);
5491	list_insert_head(l2arc_dev_list, adddev);
5492	atomic_inc_64(&l2arc_ndev);
5493	mutex_exit(&l2arc_dev_mtx);
5494}
5495
5496/*
5497 * Remove a vdev from the L2ARC.
5498 */
5499void
5500l2arc_remove_vdev(vdev_t *vd)
5501{
5502	l2arc_dev_t *dev, *nextdev, *remdev = NULL;
5503
5504	/*
5505	 * Find the device by vdev
5506	 */
5507	mutex_enter(&l2arc_dev_mtx);
5508	for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
5509		nextdev = list_next(l2arc_dev_list, dev);
5510		if (vd == dev->l2ad_vdev) {
5511			remdev = dev;
5512			break;
5513		}
5514	}
5515	ASSERT(remdev != NULL);
5516
5517	/*
5518	 * Remove device from global list
5519	 */
5520	list_remove(l2arc_dev_list, remdev);
5521	l2arc_dev_last = NULL;		/* may have been invalidated */
5522	atomic_dec_64(&l2arc_ndev);
5523	mutex_exit(&l2arc_dev_mtx);
5524
5525	/*
5526	 * Clear all buflists and ARC references (an L2ARC device flush).
5527	 */
5528	l2arc_evict(remdev, 0, B_TRUE);
5529	list_destroy(remdev->l2ad_buflist);
5530	kmem_free(remdev->l2ad_buflist, sizeof (list_t));
5531	kmem_free(remdev, sizeof (l2arc_dev_t));
5532}
5533
5534void
5535l2arc_init(void)
5536{
5537	l2arc_thread_exit = 0;
5538	l2arc_ndev = 0;
5539	l2arc_writes_sent = 0;
5540	l2arc_writes_done = 0;
5541
5542	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
5543	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
5544	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
5545	mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
5546	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
5547
5548	l2arc_dev_list = &L2ARC_dev_list;
5549	l2arc_free_on_write = &L2ARC_free_on_write;
5550	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
5551	    offsetof(l2arc_dev_t, l2ad_node));
5552	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
5553	    offsetof(l2arc_data_free_t, l2df_list_node));
5554}
5555
5556void
5557l2arc_fini(void)
5558{
5559	/*
5560	 * This is called from dmu_fini(), which is called from spa_fini().
5561	 * Because of this, we can assume that all l2arc devices have
5562	 * already been removed when the pools themselves were removed.
5563	 */
5564
5565	l2arc_do_free_on_write();
5566
5567	mutex_destroy(&l2arc_feed_thr_lock);
5568	cv_destroy(&l2arc_feed_thr_cv);
5569	mutex_destroy(&l2arc_dev_mtx);
5570	mutex_destroy(&l2arc_buflist_mtx);
5571	mutex_destroy(&l2arc_free_on_write_mtx);
5572
5573	list_destroy(l2arc_dev_list);
5574	list_destroy(l2arc_free_on_write);
5575}
5576
5577void
5578l2arc_start(void)
5579{
5580	if (!(spa_mode_global & FWRITE))
5581		return;
5582
5583	(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
5584	    TS_RUN, minclsyspri);
5585}
5586
5587void
5588l2arc_stop(void)
5589{
5590	if (!(spa_mode_global & FWRITE))
5591		return;
5592
5593	mutex_enter(&l2arc_feed_thr_lock);
5594	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
5595	l2arc_thread_exit = 1;
5596	while (l2arc_thread_exit != 0)
5597		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
5598	mutex_exit(&l2arc_feed_thr_lock);
5599}
5600
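/*
 * l2arc_feed_thread() and l2arc_stop() together implement a simple
 * worker-thread lifecycle: the worker sleeps on a condition variable
 * until its next deadline, and the stop path raises a flag, signals the
 * worker, and waits for the worker to acknowledge by clearing the flag.
 * The guarded-out sketch below models that handshake with POSIX threads
 * as a stand-in for the kernel's cv_*()/CALLB_CPR machinery; all
 * sketch_* names are hypothetical, and pthread_cond_timedwait() takes an
 * absolute deadline rather than the tick delta cv_timedwait() uses.
 */
#if 0
#include <pthread.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static pthread_mutex_t sketch_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t sketch_cv = PTHREAD_COND_INITIALIZER;
static int sketch_exit;		/* 1 = stop requested, 0 = running/acked */

static void *
sketch_feed_thread(void *arg)
{
	struct timespec next;

	(void) arg;
	pthread_mutex_lock(&sketch_lock);
	while (sketch_exit == 0) {
		/* Sleep until the next deadline, or until we are kicked. */
		clock_gettime(CLOCK_REALTIME, &next);
		next.tv_sec += 1;	/* like "next = ddi_get_lbolt() + hz" */
		(void) pthread_cond_timedwait(&sketch_cv, &sketch_lock, &next);
		if (sketch_exit != 0)
			break;
		printf("feed pass\n");	/* evict + write would happen here */
	}
	/* Acknowledge the stop request by clearing the flag. */
	sketch_exit = 0;
	pthread_cond_broadcast(&sketch_cv);
	pthread_mutex_unlock(&sketch_lock);
	return (NULL);
}

int
main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, sketch_feed_thread, NULL);
	sleep(2);

	/* Stop handshake, mirroring l2arc_stop(). */
	pthread_mutex_lock(&sketch_lock);
	sketch_exit = 1;
	pthread_cond_signal(&sketch_cv);
	while (sketch_exit != 0)
		pthread_cond_wait(&sketch_cv, &sketch_lock);
	pthread_mutex_unlock(&sketch_lock);

	pthread_join(tid, NULL);
	return (0);
}
#endif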