zio.c revision 265741
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
24 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
25 */
26
27#include <sys/zfs_context.h>
28#include <sys/fm/fs/zfs.h>
29#include <sys/spa.h>
30#include <sys/txg.h>
31#include <sys/spa_impl.h>
32#include <sys/vdev_impl.h>
33#include <sys/zio_impl.h>
34#include <sys/zio_compress.h>
35#include <sys/zio_checksum.h>
36#include <sys/dmu_objset.h>
37#include <sys/arc.h>
38#include <sys/ddt.h>
39#include <sys/trim_map.h>
40#include <sys/zfeature.h>
41
42SYSCTL_DECL(_vfs_zfs);
43SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
44#if defined(__amd64__)
45static int zio_use_uma = 1;
46#else
47static int zio_use_uma = 0;
48#endif
49TUNABLE_INT("vfs.zfs.zio.use_uma", &zio_use_uma);
50SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0,
51    "Use uma(9) for ZIO allocations");
52static int zio_exclude_metadata = 0;
53TUNABLE_INT("vfs.zfs.zio.exclude_metadata", &zio_exclude_metadata);
54SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0,
55    "Exclude metadata buffers from dumps as well");
56
57zio_trim_stats_t zio_trim_stats = {
58	{ "bytes",		KSTAT_DATA_UINT64,
59	  "Number of bytes successfully TRIMmed" },
60	{ "success",		KSTAT_DATA_UINT64,
61	  "Number of successful TRIM requests" },
62	{ "unsupported",	KSTAT_DATA_UINT64,
63	  "Number of TRIM requests that failed because TRIM is not supported" },
64	{ "failed",		KSTAT_DATA_UINT64,
65	  "Number of TRIM requests that failed for reasons other than not supported" },
66};
67
68static kstat_t *zio_trim_ksp;
69
70/*
71 * ==========================================================================
72 * I/O type descriptions
73 * ==========================================================================
74 */
75const char *zio_type_name[ZIO_TYPES] = {
76	"zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
77	"zio_ioctl"
78};
79
80/*
81 * ==========================================================================
82 * I/O kmem caches
83 * ==========================================================================
84 */
85kmem_cache_t *zio_cache;
86kmem_cache_t *zio_link_cache;
87kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
88kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
89
90#ifdef _KERNEL
91extern vmem_t *zio_alloc_arena;
92#endif
93
94/*
95 * The following actions directly affect the spa's sync-to-convergence logic.
96 * The values below define the sync pass when we start performing the action.
97 * Care should be taken when changing these values as they directly impact
98 * spa_sync() performance. Tuning these values may introduce subtle performance
99 * pathologies and should only be done in the context of performance analysis.
100 * These tunables will eventually be removed and replaced with #defines once
101 * enough analysis has been done to determine optimal values.
102 *
103 * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
104 * regular blocks are not deferred.
105 */
106int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
107TUNABLE_INT("vfs.zfs.sync_pass_deferred_free", &zfs_sync_pass_deferred_free);
108SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN,
109    &zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass");
110int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
111TUNABLE_INT("vfs.zfs.sync_pass_dont_compress", &zfs_sync_pass_dont_compress);
112SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN,
113    &zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass");
114int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
115TUNABLE_INT("vfs.zfs.sync_pass_rewrite", &zfs_sync_pass_rewrite);
116SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN,
117    &zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass");
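
/*
 * Illustrative note (not from this file): the sysctls above are CTLFLAG_RDTUN,
 * so they are set as boot-time loader tunables and are read-only at runtime,
 * e.g. (hypothetical values, not a recommendation):
 *
 *	# echo 'vfs.zfs.sync_pass_dont_compress=6' >> /boot/loader.conf
 *	# sysctl vfs.zfs.sync_pass_deferred_free
 */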
118
119/*
120 * An allocating zio is one that either currently has the DVA allocate
121 * stage set or will have it later in its lifetime.
122 */
123#define	IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
124
125boolean_t	zio_requeue_io_start_cut_in_line = B_TRUE;
126
127#ifdef ZFS_DEBUG
128int zio_buf_debug_limit = 16384;
129#else
130int zio_buf_debug_limit = 0;
131#endif
132
133void
134zio_init(void)
135{
136	size_t c;
137	zio_cache = kmem_cache_create("zio_cache",
138	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
139	zio_link_cache = kmem_cache_create("zio_link_cache",
140	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
141	if (!zio_use_uma)
142		goto out;
143
144	/*
145	 * For small buffers, we want a cache for each multiple of
146	 * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
147	 * for each quarter-power of 2.  For large buffers, we want
148	 * a cache for each multiple of PAGESIZE.
149	 */
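	/*
	 * Worked example (illustrative, assuming SPA_MINBLOCKSIZE == 512 and
	 * PAGESIZE == 4096): a 1536-byte buffer gets align 512; a 12K buffer
	 * is a multiple of PAGESIZE, so align 4096; a 5K buffer falls back to
	 * the quarter-power-of-2 rule, p2 == 4096, so align p2 >> 2 == 1024.
	 */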
150	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
151		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
152		size_t p2 = size;
153		size_t align = 0;
154		size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;
155
156		while (p2 & (p2 - 1))
157			p2 &= p2 - 1;
158
159#ifdef illumos
160#ifndef _KERNEL
161		/*
162		 * If we are using watchpoints, put each buffer on its own page,
163		 * to eliminate the performance overhead of trapping to the
164		 * kernel when modifying a non-watched buffer that shares the
165		 * page with a watched buffer.
166		 */
167		if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
168			continue;
169#endif
170#endif /* illumos */
171		if (size <= 4 * SPA_MINBLOCKSIZE) {
172			align = SPA_MINBLOCKSIZE;
173		} else if (IS_P2ALIGNED(size, PAGESIZE)) {
174			align = PAGESIZE;
175		} else if (IS_P2ALIGNED(size, p2 >> 2)) {
176			align = p2 >> 2;
177		}
178
179		if (align != 0) {
180			char name[36];
181			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
182			zio_buf_cache[c] = kmem_cache_create(name, size,
183			    align, NULL, NULL, NULL, NULL, NULL, cflags);
184
185			/*
186			 * Since zio_data bufs do not appear in crash dumps, we
187			 * pass KMC_NOTOUCH so that no allocator metadata is
188			 * stored with the buffers.
189			 */
190			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
191			zio_data_buf_cache[c] = kmem_cache_create(name, size,
192			    align, NULL, NULL, NULL, NULL, NULL,
193			    cflags | KMC_NOTOUCH | KMC_NODEBUG);
194		}
195	}
196
197	while (--c != 0) {
198		ASSERT(zio_buf_cache[c] != NULL);
199		if (zio_buf_cache[c - 1] == NULL)
200			zio_buf_cache[c - 1] = zio_buf_cache[c];
201
202		ASSERT(zio_data_buf_cache[c] != NULL);
203		if (zio_data_buf_cache[c - 1] == NULL)
204			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
205	}
206out:
207
208	zio_inject_init();
209
210	zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc",
211	    KSTAT_TYPE_NAMED,
212	    sizeof(zio_trim_stats) / sizeof(kstat_named_t),
213	    KSTAT_FLAG_VIRTUAL);
214
215	if (zio_trim_ksp != NULL) {
216		zio_trim_ksp->ks_data = &zio_trim_stats;
217		kstat_install(zio_trim_ksp);
218	}
219}
220
221void
222zio_fini(void)
223{
224	size_t c;
225	kmem_cache_t *last_cache = NULL;
226	kmem_cache_t *last_data_cache = NULL;
227
228	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
229		if (zio_buf_cache[c] != last_cache) {
230			last_cache = zio_buf_cache[c];
231			kmem_cache_destroy(zio_buf_cache[c]);
232		}
233		zio_buf_cache[c] = NULL;
234
235		if (zio_data_buf_cache[c] != last_data_cache) {
236			last_data_cache = zio_data_buf_cache[c];
237			kmem_cache_destroy(zio_data_buf_cache[c]);
238		}
239		zio_data_buf_cache[c] = NULL;
240	}
241
242	kmem_cache_destroy(zio_link_cache);
243	kmem_cache_destroy(zio_cache);
244
245	zio_inject_fini();
246
247	if (zio_trim_ksp != NULL) {
248		kstat_delete(zio_trim_ksp);
249		zio_trim_ksp = NULL;
250	}
251}
252
253/*
254 * ==========================================================================
255 * Allocate and free I/O buffers
256 * ==========================================================================
257 */
258
259/*
260 * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
261 * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
262 * useful to inspect ZFS metadata, but if possible, we should avoid keeping
263 * excess / transient data in-core during a crashdump.
264 */
265void *
266zio_buf_alloc(size_t size)
267{
268	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
269	int flags = zio_exclude_metadata ? KM_NODEBUG : 0;
270
271	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
272
273	if (zio_use_uma)
274		return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
275	else
276		return (kmem_alloc(size, KM_SLEEP|flags));
277}
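
/*
 * A minimal usage sketch (illustrative only): a buffer obtained from
 * zio_buf_alloc() must be returned with zio_buf_free() using the same size,
 * since the size selects the backing kmem cache.
 *
 *	void *mdbuf = zio_buf_alloc(SPA_MINBLOCKSIZE);
 *	... fill mdbuf with metadata ...
 *	zio_buf_free(mdbuf, SPA_MINBLOCKSIZE);
 */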
278
279/*
280 * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
281 * crashdump if the kernel panics.  This exists to limit the amount of ZFS data
282 * that shows up in a kernel crashdump, thus reducing the amount of kernel heap
283 * dumped to disk when the kernel panics.
284 */
285void *
286zio_data_buf_alloc(size_t size)
287{
288	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
289
290	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
291
292	if (zio_use_uma)
293		return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
294	else
295		return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG));
296}
297
298void
299zio_buf_free(void *buf, size_t size)
300{
301	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
302
303	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
304
305	if (zio_use_uma)
306		kmem_cache_free(zio_buf_cache[c], buf);
307	else
308		kmem_free(buf, size);
309}
310
311void
312zio_data_buf_free(void *buf, size_t size)
313{
314	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
315
316	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
317
318	if (zio_use_uma)
319		kmem_cache_free(zio_data_buf_cache[c], buf);
320	else
321		kmem_free(buf, size);
322}
323
324/*
325 * ==========================================================================
326 * Push and pop I/O transform buffers
327 * ==========================================================================
328 */
329static void
330zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
331	zio_transform_func_t *transform)
332{
333	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
334
335	zt->zt_orig_data = zio->io_data;
336	zt->zt_orig_size = zio->io_size;
337	zt->zt_bufsize = bufsize;
338	zt->zt_transform = transform;
339
340	zt->zt_next = zio->io_transform_stack;
341	zio->io_transform_stack = zt;
342
343	zio->io_data = data;
344	zio->io_size = size;
345}
346
347static void
348zio_pop_transforms(zio_t *zio)
349{
350	zio_transform_t *zt;
351
352	while ((zt = zio->io_transform_stack) != NULL) {
353		if (zt->zt_transform != NULL)
354			zt->zt_transform(zio,
355			    zt->zt_orig_data, zt->zt_orig_size);
356
357		if (zt->zt_bufsize != 0)
358			zio_buf_free(zio->io_data, zt->zt_bufsize);
359
360		zio->io_data = zt->zt_orig_data;
361		zio->io_size = zt->zt_orig_size;
362		zio->io_transform_stack = zt->zt_next;
363
364		kmem_free(zt, sizeof (zio_transform_t));
365	}
366}
367
368/*
369 * ==========================================================================
370 * I/O transform callbacks for subblocks and decompression
371 * ==========================================================================
372 */
373static void
374zio_subblock(zio_t *zio, void *data, uint64_t size)
375{
376	ASSERT(zio->io_size > size);
377
378	if (zio->io_type == ZIO_TYPE_READ)
379		bcopy(zio->io_data, data, size);
380}
381
382static void
383zio_decompress(zio_t *zio, void *data, uint64_t size)
384{
385	if (zio->io_error == 0 &&
386	    zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
387	    zio->io_data, data, zio->io_size, size) != 0)
388		zio->io_error = SET_ERROR(EIO);
389}
390
391/*
392 * ==========================================================================
393 * I/O parent/child relationships and pipeline interlocks
394 * ==========================================================================
395 */
396/*
397 * NOTE - Callers to zio_walk_parents() and zio_walk_children() must
398 *        continue calling these functions until they return NULL.
399 *        Otherwise, the next caller will pick up the list walk in
400 *        some indeterminate state.  (Otherwise every caller would
401 *        have to pass in a cookie to keep the state represented by
402 *        io_walk_link, which gets annoying.)
403 */
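
/*
 * Illustrative walk (a sketch only): iterate until NULL so that io_walk_link
 * is left reset for the next caller, per the NOTE above.
 *
 *	zio_t *pio;
 *	while ((pio = zio_walk_parents(cio)) != NULL)
 *		... examine pio ...
 */
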
404zio_t *
405zio_walk_parents(zio_t *cio)
406{
407	zio_link_t *zl = cio->io_walk_link;
408	list_t *pl = &cio->io_parent_list;
409
410	zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
411	cio->io_walk_link = zl;
412
413	if (zl == NULL)
414		return (NULL);
415
416	ASSERT(zl->zl_child == cio);
417	return (zl->zl_parent);
418}
419
420zio_t *
421zio_walk_children(zio_t *pio)
422{
423	zio_link_t *zl = pio->io_walk_link;
424	list_t *cl = &pio->io_child_list;
425
426	zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
427	pio->io_walk_link = zl;
428
429	if (zl == NULL)
430		return (NULL);
431
432	ASSERT(zl->zl_parent == pio);
433	return (zl->zl_child);
434}
435
436zio_t *
437zio_unique_parent(zio_t *cio)
438{
439	zio_t *pio = zio_walk_parents(cio);
440
441	VERIFY(zio_walk_parents(cio) == NULL);
442	return (pio);
443}
444
445void
446zio_add_child(zio_t *pio, zio_t *cio)
447{
448	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
449
450	/*
451	 * Logical I/Os can have logical, gang, or vdev children.
452	 * Gang I/Os can have gang or vdev children.
453	 * Vdev I/Os can only have vdev children.
454	 * The following ASSERT captures all of these constraints.
455	 */
456	ASSERT(cio->io_child_type <= pio->io_child_type);
457
458	zl->zl_parent = pio;
459	zl->zl_child = cio;
460
461	mutex_enter(&cio->io_lock);
462	mutex_enter(&pio->io_lock);
463
464	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
465
466	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
467		pio->io_children[cio->io_child_type][w] += !cio->io_state[w];
468
469	list_insert_head(&pio->io_child_list, zl);
470	list_insert_head(&cio->io_parent_list, zl);
471
472	pio->io_child_count++;
473	cio->io_parent_count++;
474
475	mutex_exit(&pio->io_lock);
476	mutex_exit(&cio->io_lock);
477}
478
479static void
480zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
481{
482	ASSERT(zl->zl_parent == pio);
483	ASSERT(zl->zl_child == cio);
484
485	mutex_enter(&cio->io_lock);
486	mutex_enter(&pio->io_lock);
487
488	list_remove(&pio->io_child_list, zl);
489	list_remove(&cio->io_parent_list, zl);
490
491	pio->io_child_count--;
492	cio->io_parent_count--;
493
494	mutex_exit(&pio->io_lock);
495	mutex_exit(&cio->io_lock);
496
497	kmem_cache_free(zio_link_cache, zl);
498}
499
500static boolean_t
501zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
502{
503	uint64_t *countp = &zio->io_children[child][wait];
504	boolean_t waiting = B_FALSE;
505
506	mutex_enter(&zio->io_lock);
507	ASSERT(zio->io_stall == NULL);
508	if (*countp != 0) {
509		zio->io_stage >>= 1;
510		zio->io_stall = countp;
511		waiting = B_TRUE;
512	}
513	mutex_exit(&zio->io_lock);
514
515	return (waiting);
516}
517
518static void
519zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
520{
521	uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
522	int *errorp = &pio->io_child_error[zio->io_child_type];
523
524	mutex_enter(&pio->io_lock);
525	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
526		*errorp = zio_worst_error(*errorp, zio->io_error);
527	pio->io_reexecute |= zio->io_reexecute;
528	ASSERT3U(*countp, >, 0);
529
530	(*countp)--;
531
532	if (*countp == 0 && pio->io_stall == countp) {
533		pio->io_stall = NULL;
534		mutex_exit(&pio->io_lock);
535		zio_execute(pio);
536	} else {
537		mutex_exit(&pio->io_lock);
538	}
539}
540
541static void
542zio_inherit_child_errors(zio_t *zio, enum zio_child c)
543{
544	if (zio->io_child_error[c] != 0 && zio->io_error == 0)
545		zio->io_error = zio->io_child_error[c];
546}
547
548/*
549 * ==========================================================================
550 * Create the various types of I/O (read, write, free, etc)
551 * ==========================================================================
552 */
553static zio_t *
554zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
555    void *data, uint64_t size, zio_done_func_t *done, void *private,
556    zio_type_t type, zio_priority_t priority, enum zio_flag flags,
557    vdev_t *vd, uint64_t offset, const zbookmark_t *zb,
558    enum zio_stage stage, enum zio_stage pipeline)
559{
560	zio_t *zio;
561
562	ASSERT3U(type == ZIO_TYPE_FREE || size, <=, SPA_MAXBLOCKSIZE);
563	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
564	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
565
566	ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
567	ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
568	ASSERT(vd || stage == ZIO_STAGE_OPEN);
569
570	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
571	bzero(zio, sizeof (zio_t));
572
573	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
574	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
575
576	list_create(&zio->io_parent_list, sizeof (zio_link_t),
577	    offsetof(zio_link_t, zl_parent_node));
578	list_create(&zio->io_child_list, sizeof (zio_link_t),
579	    offsetof(zio_link_t, zl_child_node));
580
581	if (vd != NULL)
582		zio->io_child_type = ZIO_CHILD_VDEV;
583	else if (flags & ZIO_FLAG_GANG_CHILD)
584		zio->io_child_type = ZIO_CHILD_GANG;
585	else if (flags & ZIO_FLAG_DDT_CHILD)
586		zio->io_child_type = ZIO_CHILD_DDT;
587	else
588		zio->io_child_type = ZIO_CHILD_LOGICAL;
589
590	if (bp != NULL) {
591		zio->io_bp = (blkptr_t *)bp;
592		zio->io_bp_copy = *bp;
593		zio->io_bp_orig = *bp;
594		if (type != ZIO_TYPE_WRITE ||
595		    zio->io_child_type == ZIO_CHILD_DDT)
596			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
597		if (zio->io_child_type == ZIO_CHILD_LOGICAL)
598			zio->io_logical = zio;
599		if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
600			pipeline |= ZIO_GANG_STAGES;
601	}
602
603	zio->io_spa = spa;
604	zio->io_txg = txg;
605	zio->io_done = done;
606	zio->io_private = private;
607	zio->io_type = type;
608	zio->io_priority = priority;
609	zio->io_vd = vd;
610	zio->io_offset = offset;
611	zio->io_orig_data = zio->io_data = data;
612	zio->io_orig_size = zio->io_size = size;
613	zio->io_orig_flags = zio->io_flags = flags;
614	zio->io_orig_stage = zio->io_stage = stage;
615	zio->io_orig_pipeline = zio->io_pipeline = pipeline;
616
617	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
618	zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
619
620	if (zb != NULL)
621		zio->io_bookmark = *zb;
622
623	if (pio != NULL) {
624		if (zio->io_logical == NULL)
625			zio->io_logical = pio->io_logical;
626		if (zio->io_child_type == ZIO_CHILD_GANG)
627			zio->io_gang_leader = pio->io_gang_leader;
628		zio_add_child(pio, zio);
629	}
630
631	return (zio);
632}
633
634static void
635zio_destroy(zio_t *zio)
636{
637	list_destroy(&zio->io_parent_list);
638	list_destroy(&zio->io_child_list);
639	mutex_destroy(&zio->io_lock);
640	cv_destroy(&zio->io_cv);
641	kmem_cache_free(zio_cache, zio);
642}
643
644zio_t *
645zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
646    void *private, enum zio_flag flags)
647{
648	zio_t *zio;
649
650	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
651	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
652	    ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
653
654	return (zio);
655}
656
657zio_t *
658zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
659{
660	return (zio_null(NULL, spa, NULL, done, private, flags));
661}
662
663zio_t *
664zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
665    void *data, uint64_t size, zio_done_func_t *done, void *private,
666    zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb)
667{
668	zio_t *zio;
669
670	zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
671	    data, size, done, private,
672	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
673	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
674	    ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
675
676	return (zio);
677}
678
679zio_t *
680zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
681    void *data, uint64_t size, const zio_prop_t *zp,
682    zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done,
683    void *private,
684    zio_priority_t priority, enum zio_flag flags, const zbookmark_t *zb)
685{
686	zio_t *zio;
687
688	ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
689	    zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
690	    zp->zp_compress >= ZIO_COMPRESS_OFF &&
691	    zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
692	    DMU_OT_IS_VALID(zp->zp_type) &&
693	    zp->zp_level < 32 &&
694	    zp->zp_copies > 0 &&
695	    zp->zp_copies <= spa_max_replication(spa));
696
697	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
698	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
699	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
700	    ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
701
702	zio->io_ready = ready;
703	zio->io_physdone = physdone;
704	zio->io_prop = *zp;
705
706	return (zio);
707}
708
709zio_t *
710zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
711    uint64_t size, zio_done_func_t *done, void *private,
712    zio_priority_t priority, enum zio_flag flags, zbookmark_t *zb)
713{
714	zio_t *zio;
715
716	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
717	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
718	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
719
720	return (zio);
721}
722
723void
724zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
725{
726	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
727	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
728	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
729	ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
730
731	/*
732	 * We must reset the io_prop to match the values that existed
733	 * when the bp was first written by dmu_sync(), keeping in mind
734	 * that nopwrite and dedup are mutually exclusive.
735	 */
736	zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
737	zio->io_prop.zp_nopwrite = nopwrite;
738	zio->io_prop.zp_copies = copies;
739	zio->io_bp_override = bp;
740}
741
742void
743zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
744{
745	metaslab_check_free(spa, bp);
746
747	/*
748	 * Frees that are for the currently-syncing txg, are not going to be
749	 * deferred, and will not need to do a read (i.e. not GANG or
750	 * DEDUP), can be processed immediately.  Otherwise, put them on the
751	 * in-memory list for later processing.
752	 */
753	if (zfs_trim_enabled || BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
754	    txg != spa->spa_syncing_txg ||
755	    spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
756		bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
757	} else {
758		VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp,
759		    BP_GET_PSIZE(bp), 0)));
760	}
761}
762
763zio_t *
764zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
765    uint64_t size, enum zio_flag flags)
766{
767	zio_t *zio;
768	enum zio_stage stage = ZIO_FREE_PIPELINE;
769
770	dprintf_bp(bp, "freeing in txg %llu, pass %u",
771	    (longlong_t)txg, spa->spa_sync_pass);
772
773	ASSERT(!BP_IS_HOLE(bp));
774	ASSERT(spa_syncing_txg(spa) == txg);
775	ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);
776
777	metaslab_check_free(spa, bp);
778	arc_freed(spa, bp);
779
780	if (zfs_trim_enabled)
781		stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START |
782		    ZIO_STAGE_VDEV_IO_ASSESS;
783	/*
784	 * GANG and DEDUP blocks can induce a read (for the gang block header,
785	 * or the DDT), so issue them asynchronously so that this thread is
786	 * not tied up.
787	 */
788	else if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
789		stage |= ZIO_STAGE_ISSUE_ASYNC;
790
791	zio = zio_create(pio, spa, txg, bp, NULL, size,
792	    NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
793	    NULL, 0, NULL, ZIO_STAGE_OPEN, stage);
794
795	return (zio);
796}
797
798zio_t *
799zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
800    zio_done_func_t *done, void *private, enum zio_flag flags)
801{
802	zio_t *zio;
803
804	/*
805	 * A claim is an allocation of a specific block.  Claims are needed
806	 * to support immediate writes in the intent log.  The issue is that
807	 * immediate writes contain committed data, but in a txg that was
808	 * *not* committed.  Upon opening the pool after an unclean shutdown,
809	 * the intent log claims all blocks that contain immediate write data
810	 * so that the SPA knows they're in use.
811	 *
812	 * All claims *must* be resolved in the first txg -- before the SPA
813	 * starts allocating blocks -- so that nothing is allocated twice.
814	 * If txg == 0 we just verify that the block is claimable.
815	 */
816	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
817	ASSERT(txg == spa_first_txg(spa) || txg == 0);
818	ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));	/* zdb(1M) */
819
820	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
821	    done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
822	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
823
824	return (zio);
825}
826
827zio_t *
828zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset,
829    uint64_t size, zio_done_func_t *done, void *private,
830    enum zio_flag flags)
831{
832	zio_t *zio;
833	int c;
834
835	if (vd->vdev_children == 0) {
836		zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private,
837		    ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, offset, NULL,
838		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
839
840		zio->io_cmd = cmd;
841	} else {
842		zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
843
844		for (c = 0; c < vd->vdev_children; c++)
845			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
846			    offset, size, done, private, flags));
847	}
848
849	return (zio);
850}
851
852zio_t *
853zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
854    void *data, int checksum, zio_done_func_t *done, void *private,
855    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
856{
857	zio_t *zio;
858
859	ASSERT(vd->vdev_children == 0);
860	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
861	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
862	ASSERT3U(offset + size, <=, vd->vdev_psize);
863
864	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
865	    ZIO_TYPE_READ, priority, flags, vd, offset, NULL,
866	    ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
867
868	zio->io_prop.zp_checksum = checksum;
869
870	return (zio);
871}
872
873zio_t *
874zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
875    void *data, int checksum, zio_done_func_t *done, void *private,
876    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
877{
878	zio_t *zio;
879
880	ASSERT(vd->vdev_children == 0);
881	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
882	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
883	ASSERT3U(offset + size, <=, vd->vdev_psize);
884
885	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
886	    ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL,
887	    ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
888
889	zio->io_prop.zp_checksum = checksum;
890
891	if (zio_checksum_table[checksum].ci_eck) {
892		/*
893		 * zec checksums are necessarily destructive -- they modify
894		 * the end of the write buffer to hold the verifier/checksum.
895		 * Therefore, we must make a local copy in case the data is
896		 * being written to multiple places in parallel.
897		 */
898		void *wbuf = zio_buf_alloc(size);
899		bcopy(data, wbuf, size);
900		zio_push_transform(zio, wbuf, size, size, NULL);
901	}
902
903	return (zio);
904}
905
906/*
907 * Create a child I/O to do some work for us.
908 */
909zio_t *
910zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
911	void *data, uint64_t size, int type, zio_priority_t priority,
912	enum zio_flag flags, zio_done_func_t *done, void *private)
913{
914	enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
915	zio_t *zio;
916
917	ASSERT(vd->vdev_parent ==
918	    (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));
919
920	if (type == ZIO_TYPE_READ && bp != NULL) {
921		/*
922		 * If we have the bp, then the child should perform the
923		 * checksum and the parent need not.  This pushes error
924		 * detection as close to the leaves as possible and
925		 * eliminates redundant checksums in the interior nodes.
926		 */
927		pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
928		pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
929	}
930
931	if (vd->vdev_children == 0)
932		offset += VDEV_LABEL_START_SIZE;
933
934	flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;
935
936	/*
937	 * If we've decided to do a repair, the write is not speculative --
938	 * even if the original read was.
939	 */
940	if (flags & ZIO_FLAG_IO_REPAIR)
941		flags &= ~ZIO_FLAG_SPECULATIVE;
942
943	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
944	    done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
945	    ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
946
947	zio->io_physdone = pio->io_physdone;
948	if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
949		zio->io_logical->io_phys_children++;
950
951	return (zio);
952}
953
954zio_t *
955zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
956	int type, zio_priority_t priority, enum zio_flag flags,
957	zio_done_func_t *done, void *private)
958{
959	zio_t *zio;
960
961	ASSERT(vd->vdev_ops->vdev_op_leaf);
962
963	zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
964	    data, size, done, private, type, priority,
965	    flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
966	    vd, offset, NULL,
967	    ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
968
969	return (zio);
970}
971
972void
973zio_flush(zio_t *zio, vdev_t *vd)
974{
975	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0,
976	    NULL, NULL,
977	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
978}
979
980zio_t *
981zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size)
982{
983
984	ASSERT(vd->vdev_ops->vdev_op_leaf);
985
986	return zio_ioctl(zio, spa, vd, DKIOCTRIM, offset, size,
987	    NULL, NULL,
988	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY);
989}
990
991void
992zio_shrink(zio_t *zio, uint64_t size)
993{
994	ASSERT(zio->io_executor == NULL);
995	ASSERT(zio->io_orig_size == zio->io_size);
996	ASSERT(size <= zio->io_size);
997
998	/*
999	 * We don't shrink for raidz because of problems with the
1000	 * reconstruction when reading back less than the block size.
1001	 * Note, BP_IS_RAIDZ() assumes no compression.
1002	 */
1003	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
1004	if (!BP_IS_RAIDZ(zio->io_bp))
1005		zio->io_orig_size = zio->io_size = size;
1006}
1007
1008/*
1009 * ==========================================================================
1010 * Prepare to read and write logical blocks
1011 * ==========================================================================
1012 */
1013
1014static int
1015zio_read_bp_init(zio_t **ziop)
1016{
1017	zio_t *zio = *ziop;
1018	blkptr_t *bp = zio->io_bp;
1019
1020	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
1021	    zio->io_child_type == ZIO_CHILD_LOGICAL &&
1022	    !(zio->io_flags & ZIO_FLAG_RAW)) {
1023		uint64_t psize = BP_GET_PSIZE(bp);
1024		void *cbuf = zio_buf_alloc(psize);
1025
1026		zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
1027	}
1028
1029	if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
1030		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
1031
1032	if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
1033		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
1034
1035	if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
1036		zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
1037
1038	return (ZIO_PIPELINE_CONTINUE);
1039}
1040
1041static int
1042zio_write_bp_init(zio_t **ziop)
1043{
1044	zio_t *zio = *ziop;
1045	spa_t *spa = zio->io_spa;
1046	zio_prop_t *zp = &zio->io_prop;
1047	enum zio_compress compress = zp->zp_compress;
1048	blkptr_t *bp = zio->io_bp;
1049	uint64_t lsize = zio->io_size;
1050	uint64_t psize = lsize;
1051	int pass = 1;
1052
1053	/*
1054	 * If our children haven't all reached the ready stage,
1055	 * wait for them and then repeat this pipeline stage.
1056	 */
1057	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
1058	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
1059		return (ZIO_PIPELINE_STOP);
1060
1061	if (!IO_IS_ALLOCATING(zio))
1062		return (ZIO_PIPELINE_CONTINUE);
1063
1064	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
1065
1066	if (zio->io_bp_override) {
1067		ASSERT(bp->blk_birth != zio->io_txg);
1068		ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
1069
1070		*bp = *zio->io_bp_override;
1071		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1072
1073		/*
1074		 * If we've been overridden and nopwrite is set, then
1075		 * set the flag accordingly to indicate that a nopwrite
1076		 * has already occurred.
1077		 */
1078		if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
1079			ASSERT(!zp->zp_dedup);
1080			zio->io_flags |= ZIO_FLAG_NOPWRITE;
1081			return (ZIO_PIPELINE_CONTINUE);
1082		}
1083
1084		ASSERT(!zp->zp_nopwrite);
1085
1086		if (BP_IS_HOLE(bp) || !zp->zp_dedup)
1087			return (ZIO_PIPELINE_CONTINUE);
1088
1089		ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
1090		    zp->zp_dedup_verify);
1091
1092		if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
1093			BP_SET_DEDUP(bp, 1);
1094			zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
1095			return (ZIO_PIPELINE_CONTINUE);
1096		}
1097		zio->io_bp_override = NULL;
1098		BP_ZERO(bp);
1099	}
1100
1101	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
1102		/*
1103		 * We're rewriting an existing block, which means we're
1104		 * working on behalf of spa_sync().  For spa_sync() to
1105		 * converge, it must eventually be the case that we don't
1106		 * have to allocate new blocks.  But compression changes
1107		 * the blocksize, which forces a reallocate, and makes
1108		 * convergence take longer.  Therefore, after the first
1109		 * few passes, stop compressing to ensure convergence.
1110		 */
1111		pass = spa_sync_pass(spa);
1112
1113		ASSERT(zio->io_txg == spa_syncing_txg(spa));
1114		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1115		ASSERT(!BP_GET_DEDUP(bp));
1116
1117		if (pass >= zfs_sync_pass_dont_compress)
1118			compress = ZIO_COMPRESS_OFF;
1119
1120		/* Make sure someone doesn't change their mind on overwrites */
1121		ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp),
1122		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
1123	}
1124
1125	if (compress != ZIO_COMPRESS_OFF) {
1126		metaslab_class_t *mc = spa_normal_class(spa);
1127		void *cbuf = zio_buf_alloc(lsize);
1128		psize = zio_compress_data(compress, zio->io_data, cbuf, lsize,
1129		    (size_t)metaslab_class_get_minblocksize(mc));
1130		if (psize == 0 || psize == lsize) {
1131			compress = ZIO_COMPRESS_OFF;
1132			zio_buf_free(cbuf, lsize);
1133		} else {
1134			ASSERT(psize < lsize);
1135			zio_push_transform(zio, cbuf, psize, lsize, NULL);
1136		}
1137	}
1138
1139	/*
1140	 * The final pass of spa_sync() must be all rewrites, but the first
1141	 * few passes offer a trade-off: allocating blocks defers convergence,
1142	 * but newly allocated blocks are sequential, so they can be written
1143	 * to disk faster.  Therefore, we allow the first few passes of
1144	 * spa_sync() to allocate new blocks, but force rewrites after that.
1145	 * There should only be a handful of blocks after pass 1 in any case.
1146	 */
1147	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
1148	    BP_GET_PSIZE(bp) == psize &&
1149	    pass >= zfs_sync_pass_rewrite) {
1150		ASSERT(psize != 0);
1151		enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
1152		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
1153		zio->io_flags |= ZIO_FLAG_IO_REWRITE;
1154	} else {
1155		BP_ZERO(bp);
1156		zio->io_pipeline = ZIO_WRITE_PIPELINE;
1157	}
1158
1159	if (psize == 0) {
1160		if (zio->io_bp_orig.blk_birth != 0 &&
1161		    spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
1162			BP_SET_LSIZE(bp, lsize);
1163			BP_SET_TYPE(bp, zp->zp_type);
1164			BP_SET_LEVEL(bp, zp->zp_level);
1165			BP_SET_BIRTH(bp, zio->io_txg, 0);
1166		}
1167		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1168	} else {
1169		ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
1170		BP_SET_LSIZE(bp, lsize);
1171		BP_SET_TYPE(bp, zp->zp_type);
1172		BP_SET_LEVEL(bp, zp->zp_level);
1173		BP_SET_PSIZE(bp, psize);
1174		BP_SET_COMPRESS(bp, compress);
1175		BP_SET_CHECKSUM(bp, zp->zp_checksum);
1176		BP_SET_DEDUP(bp, zp->zp_dedup);
1177		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
1178		if (zp->zp_dedup) {
1179			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1180			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1181			zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
1182		}
1183		if (zp->zp_nopwrite) {
1184			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1185			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1186			zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
1187		}
1188	}
1189
1190	return (ZIO_PIPELINE_CONTINUE);
1191}
1192
1193static int
1194zio_free_bp_init(zio_t **ziop)
1195{
1196	zio_t *zio = *ziop;
1197	blkptr_t *bp = zio->io_bp;
1198
1199	if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
1200		if (BP_GET_DEDUP(bp))
1201			zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
1202	}
1203
1204	return (ZIO_PIPELINE_CONTINUE);
1205}
1206
1207/*
1208 * ==========================================================================
1209 * Execute the I/O pipeline
1210 * ==========================================================================
1211 */
1212
1213static void
1214zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
1215{
1216	spa_t *spa = zio->io_spa;
1217	zio_type_t t = zio->io_type;
1218	int flags = (cutinline ? TQ_FRONT : 0);
1219
1220	ASSERT(q == ZIO_TASKQ_ISSUE || q == ZIO_TASKQ_INTERRUPT);
1221
1222	/*
1223	 * If we're a config writer or a probe, the normal issue and
1224	 * interrupt threads may all be blocked waiting for the config lock.
1225	 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
1226	 */
1227	if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
1228		t = ZIO_TYPE_NULL;
1229
1230	/*
1231	 * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
1232	 */
1233	if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
1234		t = ZIO_TYPE_NULL;
1235
1236	/*
1237	 * If this is a high priority I/O, then use the high priority taskq if
1238	 * available.
1239	 */
1240	if (zio->io_priority == ZIO_PRIORITY_NOW &&
1241	    spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
1242		q++;
1243
1244	ASSERT3U(q, <, ZIO_TASKQ_TYPES);
1245
1246	/*
1247	 * NB: We are assuming that the zio can only be dispatched
1248	 * to a single taskq at a time.  It would be a grievous error
1249	 * to dispatch the zio to another taskq at the same time.
1250	 */
1251#if defined(illumos) || !defined(_KERNEL)
1252	ASSERT(zio->io_tqent.tqent_next == NULL);
1253#else
1254	ASSERT(zio->io_tqent.tqent_task.ta_pending == 0);
1255#endif
1256	spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
1257	    flags, &zio->io_tqent);
1258}
1259
1260static boolean_t
1261zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
1262{
1263	kthread_t *executor = zio->io_executor;
1264	spa_t *spa = zio->io_spa;
1265
1266	for (zio_type_t t = 0; t < ZIO_TYPES; t++) {
1267		spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
1268		uint_t i;
1269		for (i = 0; i < tqs->stqs_count; i++) {
1270			if (taskq_member(tqs->stqs_taskq[i], executor))
1271				return (B_TRUE);
1272		}
1273	}
1274
1275	return (B_FALSE);
1276}
1277
1278static int
1279zio_issue_async(zio_t **ziop)
1280{
1281	zio_t *zio = *ziop;
1282
1283	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
1284
1285	return (ZIO_PIPELINE_STOP);
1286}
1287
1288void
1289zio_interrupt(zio_t *zio)
1290{
1291	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
1292}
1293
1294/*
1295 * Execute the I/O pipeline until one of the following occurs:
1296 *
1297 *	(1) the I/O completes
1298 *	(2) the pipeline stalls waiting for dependent child I/Os
1299 *	(3) the I/O issues, so we're waiting for an I/O completion interrupt
1300 *	(4) the I/O is delegated by vdev-level caching or aggregation
1301 *	(5) the I/O is deferred due to vdev-level queueing
1302 *	(6) the I/O is handed off to another thread.
1303 *
1304 * In all cases, the pipeline stops whenever there's no CPU work; it never
1305 * burns a thread in cv_wait().
1306 *
1307 * There's no locking on io_stage because there's no legitimate way
1308 * for multiple threads to be attempting to process the same I/O.
1309 */
1310static zio_pipe_stage_t *zio_pipeline[];
1311
1312void
1313zio_execute(zio_t *zio)
1314{
1315	zio->io_executor = curthread;
1316
1317	while (zio->io_stage < ZIO_STAGE_DONE) {
1318		enum zio_stage pipeline = zio->io_pipeline;
1319		enum zio_stage stage = zio->io_stage;
1320		int rv;
1321
1322		ASSERT(!MUTEX_HELD(&zio->io_lock));
1323		ASSERT(ISP2(stage));
1324		ASSERT(zio->io_stall == NULL);
1325
1326		do {
1327			stage <<= 1;
1328		} while ((stage & pipeline) == 0);
1329
1330		ASSERT(stage <= ZIO_STAGE_DONE);
1331
1332		/*
1333		 * If we are in interrupt context and this pipeline stage
1334		 * will grab a config lock that is held across I/O,
1335		 * or may wait for an I/O that needs an interrupt thread
1336		 * to complete, issue async to avoid deadlock.
1337		 *
1338		 * For VDEV_IO_START, we cut in line so that the io will
1339		 * be sent to disk promptly.
1340		 */
1341		if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
1342		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
1343			boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
1344			    zio_requeue_io_start_cut_in_line : B_FALSE;
1345			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
1346			return;
1347		}
1348
1349		zio->io_stage = stage;
1350		rv = zio_pipeline[highbit64(stage) - 1](&zio);
1351
1352		if (rv == ZIO_PIPELINE_STOP)
1353			return;
1354
1355		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
1356	}
1357}
1358
1359/*
1360 * ==========================================================================
1361 * Initiate I/O, either sync or async
1362 * ==========================================================================
1363 */
1364int
1365zio_wait(zio_t *zio)
1366{
1367	int error;
1368
1369	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
1370	ASSERT(zio->io_executor == NULL);
1371
1372	zio->io_waiter = curthread;
1373
1374	zio_execute(zio);
1375
1376	mutex_enter(&zio->io_lock);
1377	while (zio->io_executor != NULL)
1378		cv_wait(&zio->io_cv, &zio->io_lock);
1379	mutex_exit(&zio->io_lock);
1380
1381	error = zio->io_error;
1382	zio_destroy(zio);
1383
1384	return (error);
1385}
1386
1387void
1388zio_nowait(zio_t *zio)
1389{
1390	ASSERT(zio->io_executor == NULL);
1391
1392	if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
1393	    zio_unique_parent(zio) == NULL) {
1394		/*
1395		 * This is a logical async I/O with no parent to wait for it.
1396		 * We add it to the spa_async_zio_root "Godfather" I/O, which
1397		 * will ensure it completes prior to unloading the pool.
1398		 */
1399		spa_t *spa = zio->io_spa;
1400
1401		zio_add_child(spa->spa_async_zio_root, zio);
1402	}
1403
1404	zio_execute(zio);
1405}
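
/*
 * Illustrative caller patterns (a sketch, not taken from this file; the
 * buffer, bookmark, and property arguments are hypothetical):
 *
 *	error = zio_wait(zio_read(NULL, spa, bp, buf, BP_GET_LSIZE(bp),
 *	    NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb));
 *
 *	zio_nowait(zio_write(pio, spa, txg, bp, buf, size, &zp, ready_cb,
 *	    NULL, done_cb, cb_arg, ZIO_PRIORITY_ASYNC_WRITE, 0, &zb));
 */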
1406
1407/*
1408 * ==========================================================================
1409 * Reexecute or suspend/resume failed I/O
1410 * ==========================================================================
1411 */
1412
1413static void
1414zio_reexecute(zio_t *pio)
1415{
1416	zio_t *cio, *cio_next;
1417
1418	ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
1419	ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
1420	ASSERT(pio->io_gang_leader == NULL);
1421	ASSERT(pio->io_gang_tree == NULL);
1422
1423	pio->io_flags = pio->io_orig_flags;
1424	pio->io_stage = pio->io_orig_stage;
1425	pio->io_pipeline = pio->io_orig_pipeline;
1426	pio->io_reexecute = 0;
1427	pio->io_flags |= ZIO_FLAG_REEXECUTED;
1428	pio->io_error = 0;
1429	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
1430		pio->io_state[w] = 0;
1431	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
1432		pio->io_child_error[c] = 0;
1433
1434	if (IO_IS_ALLOCATING(pio))
1435		BP_ZERO(pio->io_bp);
1436
1437	/*
1438	 * As we reexecute pio's children, new children could be created.
1439	 * New children go to the head of pio's io_child_list, however,
1440	 * so we will (correctly) not reexecute them.  The key is that
1441	 * the remainder of pio's io_child_list, from 'cio_next' onward,
1442	 * cannot be affected by any side effects of reexecuting 'cio'.
1443	 */
1444	for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
1445		cio_next = zio_walk_children(pio);
1446		mutex_enter(&pio->io_lock);
1447		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
1448			pio->io_children[cio->io_child_type][w]++;
1449		mutex_exit(&pio->io_lock);
1450		zio_reexecute(cio);
1451	}
1452
1453	/*
1454	 * Now that all children have been reexecuted, execute the parent.
1455	 * We don't reexecute "The Godfather" I/O here as it's the
1456	 * responsibility of the caller to wait on him.
1457	 */
1458	if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
1459		zio_execute(pio);
1460}
1461
1462void
1463zio_suspend(spa_t *spa, zio_t *zio)
1464{
1465	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
1466		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
1467		    "failure and the failure mode property for this pool "
1468		    "is set to panic.", spa_name(spa));
1469
1470	zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);
1471
1472	mutex_enter(&spa->spa_suspend_lock);
1473
1474	if (spa->spa_suspend_zio_root == NULL)
1475		spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
1476		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
1477		    ZIO_FLAG_GODFATHER);
1478
1479	spa->spa_suspended = B_TRUE;
1480
1481	if (zio != NULL) {
1482		ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
1483		ASSERT(zio != spa->spa_suspend_zio_root);
1484		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1485		ASSERT(zio_unique_parent(zio) == NULL);
1486		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
1487		zio_add_child(spa->spa_suspend_zio_root, zio);
1488	}
1489
1490	mutex_exit(&spa->spa_suspend_lock);
1491}
1492
1493int
1494zio_resume(spa_t *spa)
1495{
1496	zio_t *pio;
1497
1498	/*
1499	 * Reexecute all previously suspended I/O.
1500	 */
1501	mutex_enter(&spa->spa_suspend_lock);
1502	spa->spa_suspended = B_FALSE;
1503	cv_broadcast(&spa->spa_suspend_cv);
1504	pio = spa->spa_suspend_zio_root;
1505	spa->spa_suspend_zio_root = NULL;
1506	mutex_exit(&spa->spa_suspend_lock);
1507
1508	if (pio == NULL)
1509		return (0);
1510
1511	zio_reexecute(pio);
1512	return (zio_wait(pio));
1513}
1514
1515void
1516zio_resume_wait(spa_t *spa)
1517{
1518	mutex_enter(&spa->spa_suspend_lock);
1519	while (spa_suspended(spa))
1520		cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
1521	mutex_exit(&spa->spa_suspend_lock);
1522}
1523
1524/*
1525 * ==========================================================================
1526 * Gang blocks.
1527 *
1528 * A gang block is a collection of small blocks that looks to the DMU
1529 * like one large block.  When zio_dva_allocate() cannot find a block
1530 * of the requested size, due to either severe fragmentation or the pool
1531 * being nearly full, it calls zio_write_gang_block() to construct the
1532 * block from smaller fragments.
1533 *
1534 * A gang block consists of a gang header (zio_gbh_phys_t) and up to
1535 * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
1536 * an indirect block: it's an array of block pointers.  It consumes
1537 * only one sector and hence is allocatable regardless of fragmentation.
1538 * The gang header's bps point to its gang members, which hold the data.
1539 *
1540 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
1541 * as the verifier to ensure uniqueness of the SHA256 checksum.
1542 * Critically, the gang block bp's blk_cksum is the checksum of the data,
1543 * not the gang header.  This ensures that data block signatures (needed for
1544 * deduplication) are independent of how the block is physically stored.
1545 *
1546 * Gang blocks can be nested: a gang member may itself be a gang block.
1547 * Thus every gang block is a tree in which root and all interior nodes are
1548 * gang headers, and the leaves are normal blocks that contain user data.
1549 * The root of the gang tree is called the gang leader.
1550 *
1551 * To perform any operation (read, rewrite, free, claim) on a gang block,
1552 * zio_gang_assemble() first assembles the gang tree (minus data leaves)
1553 * in the io_gang_tree field of the original logical i/o by recursively
1554 * reading the gang leader and all gang headers below it.  This yields
1555 * an in-core tree containing the contents of every gang header and the
1556 * bps for every constituent of the gang block.
1557 *
1558 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
1559 * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
1560 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
1561 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
1562 * zio_read_gang() is a wrapper around zio_read() that omits reading gang
1563 * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
1564 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
1565 * of the gang header plus zio_checksum_compute() of the data to update the
1566 * gang header's blk_cksum as described above.
1567 *
1568 * The two-phase assemble/issue model solves the problem of partial failure --
1569 * what if you'd freed part of a gang block but then couldn't read the
1570 * gang header for another part?  Assembling the entire gang tree first
1571 * ensures that all the necessary gang header I/O has succeeded before
1572 * starting the actual work of free, claim, or write.  Once the gang tree
1573 * is assembled, free and claim are in-memory operations that cannot fail.
1574 *
1575 * In the event that a gang write fails, zio_dva_unallocate() walks the
1576 * gang tree to immediately free (i.e. insert back into the space map)
1577 * everything we've allocated.  This ensures that we don't get ENOSPC
1578 * errors during repeated suspend/resume cycles due to a flaky device.
1579 *
1580 * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
1581 * the gang tree, we won't modify the block, so we can safely defer the free
1582 * (knowing that the block is still intact).  If we *can* assemble the gang
1583 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
1584 * each constituent bp and we can allocate a new block on the next sync pass.
1585 *
1586 * In all cases, the gang tree allows complete recovery from partial failure.
1587 * ==========================================================================
1588 */
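
/*
 * A condensed sketch of the two-phase model described above (illustrative;
 * the real flow is in zio_gang_assemble() and zio_gang_issue() below):
 *
 *	zio->io_gang_leader = zio;
 *	zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
 *	... wait for all ZIO_CHILD_GANG children (gang-header reads) ...
 *	zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
 */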
1589
1590static zio_t *
1591zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1592{
1593	if (gn != NULL)
1594		return (pio);
1595
1596	return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
1597	    NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
1598	    &pio->io_bookmark));
1599}
1600
1601zio_t *
1602zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1603{
1604	zio_t *zio;
1605
1606	if (gn != NULL) {
1607		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1608		    gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
1609		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1610		/*
1611		 * As we rewrite each gang header, the pipeline will compute
1612		 * a new gang block header checksum for it; but no one will
1613		 * compute a new data checksum, so we do that here.  The one
1614		 * exception is the gang leader: the pipeline already computed
1615		 * its data checksum because that stage precedes gang assembly.
1616		 * (Presently, nothing actually uses interior data checksums;
1617		 * this is just good hygiene.)
1618		 */
1619		if (gn != pio->io_gang_leader->io_gang_tree) {
1620			zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
1621			    data, BP_GET_PSIZE(bp));
1622		}
1623		/*
1624		 * If we are here to damage data for testing purposes,
1625		 * leave the GBH alone so that we can detect the damage.
1626		 */
1627		if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
1628			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
1629	} else {
1630		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1631		    data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
1632		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1633	}
1634
1635	return (zio);
1636}
1637
1638/* ARGSUSED */
1639zio_t *
1640zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1641{
1642	return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
1643	    BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp),
1644	    ZIO_GANG_CHILD_FLAGS(pio)));
1645}
1646
1647/* ARGSUSED */
1648zio_t *
1649zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1650{
1651	return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
1652	    NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
1653}
1654
1655static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
1656	NULL,
1657	zio_read_gang,
1658	zio_rewrite_gang,
1659	zio_free_gang,
1660	zio_claim_gang,
1661	NULL
1662};
1663
1664static void zio_gang_tree_assemble_done(zio_t *zio);
1665
1666static zio_gang_node_t *
1667zio_gang_node_alloc(zio_gang_node_t **gnpp)
1668{
1669	zio_gang_node_t *gn;
1670
1671	ASSERT(*gnpp == NULL);
1672
1673	gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
1674	gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
1675	*gnpp = gn;
1676
1677	return (gn);
1678}
1679
1680static void
1681zio_gang_node_free(zio_gang_node_t **gnpp)
1682{
1683	zio_gang_node_t *gn = *gnpp;
1684
1685	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
1686		ASSERT(gn->gn_child[g] == NULL);
1687
1688	zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
1689	kmem_free(gn, sizeof (*gn));
1690	*gnpp = NULL;
1691}
1692
1693static void
1694zio_gang_tree_free(zio_gang_node_t **gnpp)
1695{
1696	zio_gang_node_t *gn = *gnpp;
1697
1698	if (gn == NULL)
1699		return;
1700
1701	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
1702		zio_gang_tree_free(&gn->gn_child[g]);
1703
1704	zio_gang_node_free(gnpp);
1705}
1706
1707static void
1708zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
1709{
1710	zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
1711
1712	ASSERT(gio->io_gang_leader == gio);
1713	ASSERT(BP_IS_GANG(bp));
1714
1715	zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
1716	    SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
1717	    gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
1718}
1719
1720static void
1721zio_gang_tree_assemble_done(zio_t *zio)
1722{
1723	zio_t *gio = zio->io_gang_leader;
1724	zio_gang_node_t *gn = zio->io_private;
1725	blkptr_t *bp = zio->io_bp;
1726
1727	ASSERT(gio == zio_unique_parent(zio));
1728	ASSERT(zio->io_child_count == 0);
1729
1730	if (zio->io_error)
1731		return;
1732
1733	if (BP_SHOULD_BYTESWAP(bp))
1734		byteswap_uint64_array(zio->io_data, zio->io_size);
1735
1736	ASSERT(zio->io_data == gn->gn_gbh);
1737	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
1738	ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
1739
1740	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1741		blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
1742		if (!BP_IS_GANG(gbp))
1743			continue;
1744		zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
1745	}
1746}
1747
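/*
 * Walk an assembled gang tree, applying the I/O-type-specific issue
 * function to every block pointer while advancing 'data' through the
 * caller's buffer by each member's physical size.
 */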
1748static void
1749zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
1750{
1751	zio_t *gio = pio->io_gang_leader;
1752	zio_t *zio;
1753
1754	ASSERT(BP_IS_GANG(bp) == !!gn);
1755	ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
1756	ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);
1757
1758	/*
1759	 * If you're a gang header, your data is in gn->gn_gbh.
1760	 * If you're a gang member, your data is in 'data' and gn == NULL.
1761	 */
1762	zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);
1763
1764	if (gn != NULL) {
1765		ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
1766
1767		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1768			blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
1769			if (BP_IS_HOLE(gbp))
1770				continue;
1771			zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
1772			data = (char *)data + BP_GET_PSIZE(gbp);
1773		}
1774	}
1775
1776	if (gn == gio->io_gang_tree && gio->io_data != NULL)
1777		ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);
1778
1779	if (zio != pio)
1780		zio_nowait(zio);
1781}
1782
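/*
 * Pipeline stage: this zio operates on a gang block, so make it its own
 * gang leader and start assembling the gang tree.
 */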
1783static int
1784zio_gang_assemble(zio_t **ziop)
1785{
1786	zio_t *zio = *ziop;
1787	blkptr_t *bp = zio->io_bp;
1788
1789	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
1790	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
1791
1792	zio->io_gang_leader = zio;
1793
1794	zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
1795
1796	return (ZIO_PIPELINE_CONTINUE);
1797}
1798
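/*
 * Pipeline stage: once gang tree assembly has completed, issue the original
 * I/O against every block in the tree; if assembly failed, just free it.
 */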
1799static int
1800zio_gang_issue(zio_t **ziop)
1801{
1802	zio_t *zio = *ziop;
1803	blkptr_t *bp = zio->io_bp;
1804
1805	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
1806		return (ZIO_PIPELINE_STOP);
1807
1808	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
1809	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
1810
1811	if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
1812		zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
1813	else
1814		zio_gang_tree_free(&zio->io_gang_tree);
1815
1816	zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1817
1818	return (ZIO_PIPELINE_CONTINUE);
1819}
1820
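/*
 * Ready callback for gang member writes: fold each member's allocated size
 * into the corresponding DVA of the parent gang header, so that the header
 * accounts for all the space consumed beneath it.
 */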
1821static void
1822zio_write_gang_member_ready(zio_t *zio)
1823{
1824	zio_t *pio = zio_unique_parent(zio);
1825	zio_t *gio = zio->io_gang_leader;
1826	dva_t *cdva = zio->io_bp->blk_dva;
1827	dva_t *pdva = pio->io_bp->blk_dva;
1828	uint64_t asize;
1829
1830	if (BP_IS_HOLE(zio->io_bp))
1831		return;
1832
1833	ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
1834
1835	ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
1836	ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
1837	ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
1838	ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
1839	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
1840
1841	mutex_enter(&pio->io_lock);
1842	for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
1843		ASSERT(DVA_GET_GANG(&pdva[d]));
1844		asize = DVA_GET_ASIZE(&pdva[d]);
1845		asize += DVA_GET_ASIZE(&cdva[d]);
1846		DVA_SET_ASIZE(&pdva[d], asize);
1847	}
1848	mutex_exit(&pio->io_lock);
1849}
1850
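/*
 * Called when a contiguous allocation of pio's size fails: allocate a gang
 * header (with one extra copy, capped at the pool's replication limit),
 * turn pio into the header write, and split the data across up to
 * SPA_GBH_NBLKPTRS child writes.
 */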
1851static int
1852zio_write_gang_block(zio_t *pio)
1853{
1854	spa_t *spa = pio->io_spa;
1855	blkptr_t *bp = pio->io_bp;
1856	zio_t *gio = pio->io_gang_leader;
1857	zio_t *zio;
1858	zio_gang_node_t *gn, **gnpp;
1859	zio_gbh_phys_t *gbh;
1860	uint64_t txg = pio->io_txg;
1861	uint64_t resid = pio->io_size;
1862	uint64_t lsize;
1863	int copies = gio->io_prop.zp_copies;
1864	int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
1865	zio_prop_t zp;
1866	int error;
1867
1868	error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
1869	    bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
1870	    METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
1871	if (error) {
1872		pio->io_error = error;
1873		return (ZIO_PIPELINE_CONTINUE);
1874	}
1875
1876	if (pio == gio) {
1877		gnpp = &gio->io_gang_tree;
1878	} else {
1879		gnpp = pio->io_private;
1880		ASSERT(pio->io_ready == zio_write_gang_member_ready);
1881	}
1882
1883	gn = zio_gang_node_alloc(gnpp);
1884	gbh = gn->gn_gbh;
1885	bzero(gbh, SPA_GANGBLOCKSIZE);
1886
1887	/*
1888	 * Create the gang header.
1889	 */
1890	zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
1891	    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1892
1893	/*
1894	 * Create and nowait the gang children.
1895	 */
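	/*
	 * Each pass carves off roughly 1/(remaining slots) of the residual,
	 * rounded up to SPA_MINBLOCKSIZE, so the members come out nearly
	 * equal in size: e.g. with three block pointers per gang header,
	 * a 96K residual splits into three 32K members.
	 */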
1896	for (int g = 0; resid != 0; resid -= lsize, g++) {
1897		lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
1898		    SPA_MINBLOCKSIZE);
1899		ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
1900
1901		zp.zp_checksum = gio->io_prop.zp_checksum;
1902		zp.zp_compress = ZIO_COMPRESS_OFF;
1903		zp.zp_type = DMU_OT_NONE;
1904		zp.zp_level = 0;
1905		zp.zp_copies = gio->io_prop.zp_copies;
1906		zp.zp_dedup = B_FALSE;
1907		zp.zp_dedup_verify = B_FALSE;
1908		zp.zp_nopwrite = B_FALSE;
1909
1910		zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
1911		    (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
1912		    zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g],
1913		    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
1914		    &pio->io_bookmark));
1915	}
1916
1917	/*
1918	 * Set pio's pipeline to just wait for zio to finish.
1919	 */
1920	pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1921
1922	zio_nowait(zio);
1923
1924	return (ZIO_PIPELINE_CONTINUE);
1925}
1926
1927/*
1928 * The zio_nop_write stage in the pipeline determines if allocating
1929 * a new bp is necessary.  By leveraging a cryptographically secure checksum,
1930 * such as SHA256, we can compare the checksums of the new data and the old
1931 * to determine if allocating a new block is required.  The nopwrite
1932 * feature can handle writes in either syncing or open context (i.e. zil
1933 * writes) and as a result is mutually exclusive with dedup.
1934 */
1935static int
1936zio_nop_write(zio_t **ziop)
1937{
1938	zio_t *zio = *ziop;
1939	blkptr_t *bp = zio->io_bp;
1940	blkptr_t *bp_orig = &zio->io_bp_orig;
1941	zio_prop_t *zp = &zio->io_prop;
1942
1943	ASSERT(BP_GET_LEVEL(bp) == 0);
1944	ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1945	ASSERT(zp->zp_nopwrite);
1946	ASSERT(!zp->zp_dedup);
1947	ASSERT(zio->io_bp_override == NULL);
1948	ASSERT(IO_IS_ALLOCATING(zio));
1949
1950	/*
1951	 * Check to see if the original bp and the new bp have matching
1952	 * characteristics (i.e. same checksum, compression algorithms, etc).
1953	 * If they don't then just continue with the pipeline which will
1954	 * allocate a new bp.
1955	 */
1956	if (BP_IS_HOLE(bp_orig) ||
1957	    !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup ||
1958	    BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
1959	    BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
1960	    BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
1961	    zp->zp_copies != BP_GET_NDVAS(bp_orig))
1962		return (ZIO_PIPELINE_CONTINUE);
1963
1964	/*
1965	 * If the checksums match then reset the pipeline so that we
1966	 * avoid allocating a new bp and issuing any I/O.
1967	 */
1968	if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
1969		ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup);
1970		ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
1971		ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
1972		ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
1973		ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
1974		    sizeof (uint64_t)) == 0);
1975
1976		*bp = *bp_orig;
1977		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1978		zio->io_flags |= ZIO_FLAG_NOPWRITE;
1979	}
1980
1981	return (ZIO_PIPELINE_CONTINUE);
1982}
1983
1984/*
1985 * ==========================================================================
1986 * Dedup
1987 * ==========================================================================
1988 */
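/*
 * Completion callback for the sibling reads issued by zio_ddt_read_start()
 * when a dedup'd block fails to read: the first error-free copy becomes the
 * entry's repair data; every other buffer is freed.
 */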
1989static void
1990zio_ddt_child_read_done(zio_t *zio)
1991{
1992	blkptr_t *bp = zio->io_bp;
1993	ddt_entry_t *dde = zio->io_private;
1994	ddt_phys_t *ddp;
1995	zio_t *pio = zio_unique_parent(zio);
1996
1997	mutex_enter(&pio->io_lock);
1998	ddp = ddt_phys_select(dde, bp);
1999	if (zio->io_error == 0)
2000		ddt_phys_clear(ddp);	/* this ddp doesn't need repair */
2001	if (zio->io_error == 0 && dde->dde_repair_data == NULL)
2002		dde->dde_repair_data = zio->io_data;
2003	else
2004		zio_buf_free(zio->io_data, zio->io_size);
2005	mutex_exit(&pio->io_lock);
2006}
2007
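/*
 * Pipeline stage: start a read of a dedup'd block.  Normally we just read
 * the named bp; if a previous attempt failed, look up the block's DDT entry
 * and read every other copy recorded there in search of good repair data.
 */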
2008static int
2009zio_ddt_read_start(zio_t **ziop)
2010{
2011	zio_t *zio = *ziop;
2012	blkptr_t *bp = zio->io_bp;
2013
2014	ASSERT(BP_GET_DEDUP(bp));
2015	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
2016	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2017
2018	if (zio->io_child_error[ZIO_CHILD_DDT]) {
2019		ddt_t *ddt = ddt_select(zio->io_spa, bp);
2020		ddt_entry_t *dde = ddt_repair_start(ddt, bp);
2021		ddt_phys_t *ddp = dde->dde_phys;
2022		ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
2023		blkptr_t blk;
2024
2025		ASSERT(zio->io_vsd == NULL);
2026		zio->io_vsd = dde;
2027
2028		if (ddp_self == NULL)
2029			return (ZIO_PIPELINE_CONTINUE);
2030
2031		for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
2032			if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
2033				continue;
2034			ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
2035			    &blk);
2036			zio_nowait(zio_read(zio, zio->io_spa, &blk,
2037			    zio_buf_alloc(zio->io_size), zio->io_size,
2038			    zio_ddt_child_read_done, dde, zio->io_priority,
2039			    ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
2040			    &zio->io_bookmark));
2041		}
2042		return (ZIO_PIPELINE_CONTINUE);
2043	}
2044
2045	zio_nowait(zio_read(zio, zio->io_spa, bp,
2046	    zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
2047	    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
2048
2049	return (ZIO_PIPELINE_CONTINUE);
2050}
2051
2052static int
2053zio_ddt_read_done(zio_t **ziop)
2054{
2055	zio_t *zio = *ziop;
2056	blkptr_t *bp = zio->io_bp;
2057
2058	if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
2059		return (ZIO_PIPELINE_STOP);
2060
2061	ASSERT(BP_GET_DEDUP(bp));
2062	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
2063	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2064
2065	if (zio->io_child_error[ZIO_CHILD_DDT]) {
2066		ddt_t *ddt = ddt_select(zio->io_spa, bp);
2067		ddt_entry_t *dde = zio->io_vsd;
2068		if (ddt == NULL) {
2069			ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
2070			return (ZIO_PIPELINE_CONTINUE);
2071		}
2072		if (dde == NULL) {
2073			zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
2074			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
2075			return (ZIO_PIPELINE_STOP);
2076		}
2077		if (dde->dde_repair_data != NULL) {
2078			bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
2079			zio->io_child_error[ZIO_CHILD_DDT] = 0;
2080		}
2081		ddt_repair_done(ddt, dde);
2082		zio->io_vsd = NULL;
2083	}
2084
2085	ASSERT(zio->io_vsd == NULL);
2086
2087	return (ZIO_PIPELINE_CONTINUE);
2088}
2089
2090static boolean_t
2091zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
2092{
2093	spa_t *spa = zio->io_spa;
2094
2095	/*
2096	 * Note: we compare the original data, not the transformed data,
2097	 * because when zio->io_bp is an override bp, we will not have
2098	 * pushed the I/O transforms.  That's an important optimization
2099	 * because otherwise we'd compress/encrypt all dmu_sync() data twice.
2100	 */
2101	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
2102		zio_t *lio = dde->dde_lead_zio[p];
2103
2104		if (lio != NULL) {
2105			return (lio->io_orig_size != zio->io_orig_size ||
2106			    bcmp(zio->io_orig_data, lio->io_orig_data,
2107			    zio->io_orig_size) != 0);
2108		}
2109	}
2110
2111	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
2112		ddt_phys_t *ddp = &dde->dde_phys[p];
2113
2114		if (ddp->ddp_phys_birth != 0) {
2115			arc_buf_t *abuf = NULL;
2116			uint32_t aflags = ARC_WAIT;
2117			blkptr_t blk = *zio->io_bp;
2118			int error;
2119
2120			ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
2121
2122			ddt_exit(ddt);
2123
2124			error = arc_read(NULL, spa, &blk,
2125			    arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
2126			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
2127			    &aflags, &zio->io_bookmark);
2128
2129			if (error == 0) {
2130				if (arc_buf_size(abuf) != zio->io_orig_size ||
2131				    bcmp(abuf->b_data, zio->io_orig_data,
2132				    zio->io_orig_size) != 0)
2133					error = SET_ERROR(EEXIST);
2134				VERIFY(arc_buf_remove_ref(abuf, &abuf));
2135			}
2136
2137			ddt_enter(ddt);
2138			return (error != 0);
2139		}
2140	}
2141
2142	return (B_FALSE);
2143}
2144
2145static void
2146zio_ddt_child_write_ready(zio_t *zio)
2147{
2148	int p = zio->io_prop.zp_copies;
2149	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
2150	ddt_entry_t *dde = zio->io_private;
2151	ddt_phys_t *ddp = &dde->dde_phys[p];
2152	zio_t *pio;
2153
2154	if (zio->io_error)
2155		return;
2156
2157	ddt_enter(ddt);
2158
2159	ASSERT(dde->dde_lead_zio[p] == zio);
2160
2161	ddt_phys_fill(ddp, zio->io_bp);
2162
2163	while ((pio = zio_walk_parents(zio)) != NULL)
2164		ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
2165
2166	ddt_exit(ddt);
2167}
2168
2169static void
2170zio_ddt_child_write_done(zio_t *zio)
2171{
2172	int p = zio->io_prop.zp_copies;
2173	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
2174	ddt_entry_t *dde = zio->io_private;
2175	ddt_phys_t *ddp = &dde->dde_phys[p];
2176
2177	ddt_enter(ddt);
2178
2179	ASSERT(ddp->ddp_refcnt == 0);
2180	ASSERT(dde->dde_lead_zio[p] == zio);
2181	dde->dde_lead_zio[p] = NULL;
2182
2183	if (zio->io_error == 0) {
2184		while (zio_walk_parents(zio) != NULL)
2185			ddt_phys_addref(ddp);
2186	} else {
2187		ddt_phys_clear(ddp);
2188	}
2189
2190	ddt_exit(ddt);
2191}
2192
2193static void
2194zio_ddt_ditto_write_done(zio_t *zio)
2195{
2196	int p = DDT_PHYS_DITTO;
2197	zio_prop_t *zp = &zio->io_prop;
2198	blkptr_t *bp = zio->io_bp;
2199	ddt_t *ddt = ddt_select(zio->io_spa, bp);
2200	ddt_entry_t *dde = zio->io_private;
2201	ddt_phys_t *ddp = &dde->dde_phys[p];
2202	ddt_key_t *ddk = &dde->dde_key;
2203
2204	ddt_enter(ddt);
2205
2206	ASSERT(ddp->ddp_refcnt == 0);
2207	ASSERT(dde->dde_lead_zio[p] == zio);
2208	dde->dde_lead_zio[p] = NULL;
2209
2210	if (zio->io_error == 0) {
2211		ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
2212		ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
2213		ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
2214		if (ddp->ddp_phys_birth != 0)
2215			ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
2216		ddt_phys_fill(ddp, bp);
2217	}
2218
2219	ddt_exit(ddt);
2220}
2221
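/*
 * Pipeline stage: write a dedup'd block.  Under the DDT lock, look up (or
 * create) the block's entry; on a dedup-verify collision, restart with a
 * stronger checksum or fall back to an ordinary write.  Otherwise either
 * reference an existing on-disk copy, attach to an in-flight lead write,
 * or issue a new child write and become the lead.  A separate "ditto"
 * child may be issued to add copies for heavily referenced entries.
 */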
2222static int
2223zio_ddt_write(zio_t **ziop)
2224{
2225	zio_t *zio = *ziop;
2226	spa_t *spa = zio->io_spa;
2227	blkptr_t *bp = zio->io_bp;
2228	uint64_t txg = zio->io_txg;
2229	zio_prop_t *zp = &zio->io_prop;
2230	int p = zp->zp_copies;
2231	int ditto_copies;
2232	zio_t *cio = NULL;
2233	zio_t *dio = NULL;
2234	ddt_t *ddt = ddt_select(spa, bp);
2235	ddt_entry_t *dde;
2236	ddt_phys_t *ddp;
2237
2238	ASSERT(BP_GET_DEDUP(bp));
2239	ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
2240	ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
2241
2242	ddt_enter(ddt);
2243	dde = ddt_lookup(ddt, bp, B_TRUE);
2244	ddp = &dde->dde_phys[p];
2245
2246	if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
2247		/*
2248		 * If we're using a weak checksum, upgrade to a strong checksum
2249		 * and try again.  If we're already using a strong checksum,
2250		 * we can't resolve it, so just convert to an ordinary write.
2251		 * (And automatically e-mail a paper to Nature?)
2252		 */
2253		if (!zio_checksum_table[zp->zp_checksum].ci_dedup) {
2254			zp->zp_checksum = spa_dedup_checksum(spa);
2255			zio_pop_transforms(zio);
2256			zio->io_stage = ZIO_STAGE_OPEN;
2257			BP_ZERO(bp);
2258		} else {
2259			zp->zp_dedup = B_FALSE;
2260		}
2261		zio->io_pipeline = ZIO_WRITE_PIPELINE;
2262		ddt_exit(ddt);
2263		return (ZIO_PIPELINE_CONTINUE);
2264	}
2265
2266	ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
2267	ASSERT(ditto_copies < SPA_DVAS_PER_BP);
2268
2269	if (ditto_copies > ddt_ditto_copies_present(dde) &&
2270	    dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
2271		zio_prop_t czp = *zp;
2272
2273		czp.zp_copies = ditto_copies;
2274
2275		/*
2276		 * If we arrived here with an override bp, we won't have run
2277		 * the transform stack, so we won't have the data we need to
2278		 * generate a child i/o.  So, toss the override bp and restart.
2279		 * This is safe, because using the override bp is just an
2280		 * optimization; and it's rare, so the cost doesn't matter.
2281		 */
2282		if (zio->io_bp_override) {
2283			zio_pop_transforms(zio);
2284			zio->io_stage = ZIO_STAGE_OPEN;
2285			zio->io_pipeline = ZIO_WRITE_PIPELINE;
2286			zio->io_bp_override = NULL;
2287			BP_ZERO(bp);
2288			ddt_exit(ddt);
2289			return (ZIO_PIPELINE_CONTINUE);
2290		}
2291
2292		dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
2293		    zio->io_orig_size, &czp, NULL, NULL,
2294		    zio_ddt_ditto_write_done, dde, zio->io_priority,
2295		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
2296
2297		zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
2298		dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
2299	}
2300
2301	if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
2302		if (ddp->ddp_phys_birth != 0)
2303			ddt_bp_fill(ddp, bp, txg);
2304		if (dde->dde_lead_zio[p] != NULL)
2305			zio_add_child(zio, dde->dde_lead_zio[p]);
2306		else
2307			ddt_phys_addref(ddp);
2308	} else if (zio->io_bp_override) {
2309		ASSERT(bp->blk_birth == txg);
2310		ASSERT(BP_EQUAL(bp, zio->io_bp_override));
2311		ddt_phys_fill(ddp, bp);
2312		ddt_phys_addref(ddp);
2313	} else {
2314		cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
2315		    zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL,
2316		    zio_ddt_child_write_done, dde, zio->io_priority,
2317		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
2318
2319		zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
2320		dde->dde_lead_zio[p] = cio;
2321	}
2322
2323	ddt_exit(ddt);
2324
2325	if (cio)
2326		zio_nowait(cio);
2327	if (dio)
2328		zio_nowait(dio);
2329
2330	return (ZIO_PIPELINE_CONTINUE);
2331}
2332
2333ddt_entry_t *freedde; /* for debugging */
2334
2335static int
2336zio_ddt_free(zio_t **ziop)
2337{
2338	zio_t *zio = *ziop;
2339	spa_t *spa = zio->io_spa;
2340	blkptr_t *bp = zio->io_bp;
2341	ddt_t *ddt = ddt_select(spa, bp);
2342	ddt_entry_t *dde;
2343	ddt_phys_t *ddp;
2344
2345	ASSERT(BP_GET_DEDUP(bp));
2346	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2347
2348	ddt_enter(ddt);
2349	freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
2350	ddp = ddt_phys_select(dde, bp);
2351	ddt_phys_decref(ddp);
2352	ddt_exit(ddt);
2353
2354	return (ZIO_PIPELINE_CONTINUE);
2355}
2356
2357/*
2358 * ==========================================================================
2359 * Allocate and free blocks
2360 * ==========================================================================
2361 */
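/*
 * Pipeline stage: allocate DVAs for this block from the normal metaslab
 * class; on ENOSPC, blocks larger than SPA_MINBLOCKSIZE fall back to being
 * written as gang blocks.
 */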
2362static int
2363zio_dva_allocate(zio_t **ziop)
2364{
2365	zio_t *zio = *ziop;
2366	spa_t *spa = zio->io_spa;
2367	metaslab_class_t *mc = spa_normal_class(spa);
2368	blkptr_t *bp = zio->io_bp;
2369	int error;
2370	int flags = 0;
2371
2372	if (zio->io_gang_leader == NULL) {
2373		ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
2374		zio->io_gang_leader = zio;
2375	}
2376
2377	ASSERT(BP_IS_HOLE(bp));
2378	ASSERT0(BP_GET_NDVAS(bp));
2379	ASSERT3U(zio->io_prop.zp_copies, >, 0);
2380	ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
2381	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
2382
2383	/*
2384	 * The dump device does not support gang blocks so allocation on
2385	 * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
2386	 * the "fast" gang feature.
2387	 */
2388	flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
2389	flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
2390	    METASLAB_GANG_CHILD : 0;
2391	error = metaslab_alloc(spa, mc, zio->io_size, bp,
2392	    zio->io_prop.zp_copies, zio->io_txg, NULL, flags);
2393
2394	if (error) {
2395		spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
2396		    "size %llu, error %d", spa_name(spa), zio, zio->io_size,
2397		    error);
2398		if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
2399			return (zio_write_gang_block(zio));
2400		zio->io_error = error;
2401	}
2402
2403	return (ZIO_PIPELINE_CONTINUE);
2404}
2405
2406static int
2407zio_dva_free(zio_t **ziop)
2408{
2409	zio_t *zio = *ziop;
2410
2411	metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
2412
2413	return (ZIO_PIPELINE_CONTINUE);
2414}
2415
2416static int
2417zio_dva_claim(zio_t **ziop)
2418{
2419	zio_t *zio = *ziop;
2420	int error;
2421
2422	error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
2423	if (error)
2424		zio->io_error = error;
2425
2426	return (ZIO_PIPELINE_CONTINUE);
2427}
2428
2429/*
2430 * Undo an allocation.  This is used by zio_done() when an I/O fails
2431 * and we want to give back the block we just allocated.
2432 * This handles both normal blocks and gang blocks.
2433 */
2434static void
2435zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
2436{
2437	ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
2438	ASSERT(zio->io_bp_override == NULL);
2439
2440	if (!BP_IS_HOLE(bp))
2441		metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
2442
2443	if (gn != NULL) {
2444		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
2445			zio_dva_unallocate(zio, gn->gn_child[g],
2446			    &gn->gn_gbh->zg_blkptr[g]);
2447		}
2448	}
2449}
2450
2451/*
2452 * Try to allocate an intent log block.  Return 0 on success, errno on failure.
2453 */
2454int
2455zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
2456    uint64_t size, boolean_t use_slog)
2457{
2458	int error = 1;
2459
2460	ASSERT(txg > spa_syncing_txg(spa));
2461
2462	/*
2463	 * ZIL blocks are always contiguous (i.e. not gang blocks) so we
2464	 * set the METASLAB_GANG_AVOID flag so that they don't "fast gang"
2465	 * when allocating them.
2466	 */
2467	if (use_slog) {
2468		error = metaslab_alloc(spa, spa_log_class(spa), size,
2469		    new_bp, 1, txg, old_bp,
2470		    METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
2471	}
2472
2473	if (error) {
2474		error = metaslab_alloc(spa, spa_normal_class(spa), size,
2475		    new_bp, 1, txg, old_bp,
2476		    METASLAB_HINTBP_AVOID);
2477	}
2478
2479	if (error == 0) {
2480		BP_SET_LSIZE(new_bp, size);
2481		BP_SET_PSIZE(new_bp, size);
2482		BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
2483		BP_SET_CHECKSUM(new_bp,
2484		    spa_version(spa) >= SPA_VERSION_SLIM_ZIL
2485		    ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
2486		BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
2487		BP_SET_LEVEL(new_bp, 0);
2488		BP_SET_DEDUP(new_bp, 0);
2489		BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
2490	}
2491
2492	return (error);
2493}
2494
2495/*
2496 * Free an intent log block.
2497 */
2498void
2499zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
2500{
2501	ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
2502	ASSERT(!BP_IS_GANG(bp));
2503
2504	zio_free(spa, txg, bp);
2505}
2506
2507/*
2508 * ==========================================================================
2509 * Read, write and delete to physical devices
2510 * ==========================================================================
2511 */
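/*
 * Pipeline stage: hand the I/O to the vdev layer.  An I/O with no explicit
 * vdev targets a block pointer and goes through the mirror ops, which fan
 * it out across the bp's DVAs; otherwise the buffer is padded up to the
 * top-level vdev's ashift if needed, unnecessary repair writes are
 * discarded, and on leaf vdevs reads and writes pass through the vdev
 * cache and queue before reaching the vdev's io_start method.
 */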
2512static int
2513zio_vdev_io_start(zio_t **ziop)
2514{
2515	zio_t *zio = *ziop;
2516	vdev_t *vd = zio->io_vd;
2517	uint64_t align;
2518	spa_t *spa = zio->io_spa;
2519
2520	ASSERT(zio->io_error == 0);
2521	ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
2522
2523	if (vd == NULL) {
2524		if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
2525			spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
2526
2527		/*
2528		 * The mirror_ops handle multiple DVAs in a single BP.
2529		 */
2530		return (vdev_mirror_ops.vdev_op_io_start(zio));
2531	}
2532
2533	if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE) {
2534		trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg);
2535		return (ZIO_PIPELINE_CONTINUE);
2536	}
2537
2538	/*
2539	 * We keep track of time-sensitive I/Os so that the scan thread
2540	 * can quickly react to certain workloads.  In particular, we care
2541	 * about non-scrubbing, top-level reads and writes with the following
2542	 * characteristics:
2543	 * 	- synchronous writes of user data to non-slog devices
2544	 *	- any reads of user data
2545	 * When these conditions are met, adjust the timestamp of spa_last_io
2546	 * which allows the scan thread to adjust its workload accordingly.
2547	 */
2548	if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
2549	    vd == vd->vdev_top && !vd->vdev_islog &&
2550	    zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
2551	    zio->io_txg != spa_syncing_txg(spa)) {
2552		uint64_t old = spa->spa_last_io;
2553		uint64_t new = ddi_get_lbolt64();
2554		if (old != new)
2555			(void) atomic_cas_64(&spa->spa_last_io, old, new);
2556	}
2557
2558	align = 1ULL << vd->vdev_top->vdev_ashift;
2559
2560	if (P2PHASE(zio->io_size, align) != 0) {
2561		uint64_t asize = P2ROUNDUP(zio->io_size, align);
2562		char *abuf = NULL;
2563		if (zio->io_type == ZIO_TYPE_READ ||
2564		    zio->io_type == ZIO_TYPE_WRITE)
2565			abuf = zio_buf_alloc(asize);
2566		ASSERT(vd == vd->vdev_top);
2567		if (zio->io_type == ZIO_TYPE_WRITE) {
2568			bcopy(zio->io_data, abuf, zio->io_size);
2569			bzero(abuf + zio->io_size, asize - zio->io_size);
2570		}
2571		zio_push_transform(zio, abuf, asize, abuf ? asize : 0,
2572		    zio_subblock);
2573	}
2574
2575	ASSERT(P2PHASE(zio->io_offset, align) == 0);
2576	ASSERT(P2PHASE(zio->io_size, align) == 0);
2577	VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa));
2578
2579	/*
2580	 * If this is a repair I/O, and there's no self-healing involved --
2581	 * that is, we're just resilvering what we expect to resilver --
2582	 * then don't do the I/O unless zio's txg is actually in vd's DTL.
2583	 * This prevents spurious resilvering with nested replication.
2584	 * For example, given a mirror of mirrors, (A+B)+(C+D), if only
2585	 * A is out of date, we'll read from C+D, then use the data to
2586	 * resilver A+B -- but we don't actually want to resilver B, just A.
2587	 * The top-level mirror has no way to know this, so instead we just
2588	 * discard unnecessary repairs as we work our way down the vdev tree.
2589	 * The same logic applies to any form of nested replication:
2590	 * ditto + mirror, RAID-Z + replacing, etc.  This covers them all.
2591	 */
2592	if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
2593	    !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
2594	    zio->io_txg != 0 &&	/* not a delegated i/o */
2595	    !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
2596		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
2597		zio_vdev_io_bypass(zio);
2598		return (ZIO_PIPELINE_CONTINUE);
2599	}
2600
2601	if (vd->vdev_ops->vdev_op_leaf &&
2602	    (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
2603
2604		if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio))
2605			return (ZIO_PIPELINE_CONTINUE);
2606
2607		if ((zio = vdev_queue_io(zio)) == NULL)
2608			return (ZIO_PIPELINE_STOP);
2609		*ziop = zio;
2610
2611		if (!vdev_accessible(vd, zio)) {
2612			zio->io_error = SET_ERROR(ENXIO);
2613			zio_interrupt(zio);
2614			return (ZIO_PIPELINE_STOP);
2615		}
2616	}
2617
2618	/*
2619	 * Note that we ignore repair writes for TRIM because they can conflict
2620	 * with normal writes. This isn't an issue because, by definition, we
2621	 * only repair blocks that aren't freed.
2622	 */
2623	if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_WRITE &&
2624	    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
2625		if (!trim_map_write_start(zio))
2626			return (ZIO_PIPELINE_STOP);
2627	}
2628
2629	return (vd->vdev_ops->vdev_op_io_start(zio));
2630}
2631
2632static int
2633zio_vdev_io_done(zio_t **ziop)
2634{
2635	zio_t *zio = *ziop;
2636	vdev_t *vd = zio->io_vd;
2637	vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
2638	boolean_t unexpected_error = B_FALSE;
2639
2640	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
2641		return (ZIO_PIPELINE_STOP);
2642
2643	ASSERT(zio->io_type == ZIO_TYPE_READ ||
2644	    zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE);
2645
2646	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
2647	    (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
2648
2649		if (zio->io_type == ZIO_TYPE_WRITE &&
2650		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR))
2651			trim_map_write_done(zio);
2652
2653		vdev_queue_io_done(zio);
2654
2655		if (zio->io_type == ZIO_TYPE_WRITE)
2656			vdev_cache_write(zio);
2657
2658		if (zio_injection_enabled && zio->io_error == 0)
2659			zio->io_error = zio_handle_device_injection(vd,
2660			    zio, EIO);
2661
2662		if (zio_injection_enabled && zio->io_error == 0)
2663			zio->io_error = zio_handle_label_injection(zio, EIO);
2664
2665		if (zio->io_error) {
2666			if (!vdev_accessible(vd, zio)) {
2667				zio->io_error = SET_ERROR(ENXIO);
2668			} else {
2669				unexpected_error = B_TRUE;
2670			}
2671		}
2672	}
2673
2674	ops->vdev_op_io_done(zio);
2675
2676	if (unexpected_error)
2677		VERIFY(vdev_probe(vd, zio) == NULL);
2678
2679	return (ZIO_PIPELINE_CONTINUE);
2680}
2681
2682/*
2683 * For non-raidz ZIOs, we can just copy aside the bad data read from the
2684 * disk, and use that to finish the checksum ereport later.
2685 */
2686static void
2687zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
2688    const void *good_buf)
2689{
2690	/* no processing needed */
2691	zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
2692}
2693
2694/*ARGSUSED*/
2695void
2696zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
2697{
2698	void *buf = zio_buf_alloc(zio->io_size);
2699
2700	bcopy(zio->io_data, buf, zio->io_size);
2701
2702	zcr->zcr_cbinfo = zio->io_size;
2703	zcr->zcr_cbdata = buf;
2704	zcr->zcr_finish = zio_vsd_default_cksum_finish;
2705	zcr->zcr_free = zio_buf_free;
2706}
2707
2708static int
2709zio_vdev_io_assess(zio_t **ziop)
2710{
2711	zio_t *zio = *ziop;
2712	vdev_t *vd = zio->io_vd;
2713
2714	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
2715		return (ZIO_PIPELINE_STOP);
2716
2717	if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
2718		spa_config_exit(zio->io_spa, SCL_ZIO, zio);
2719
2720	if (zio->io_vsd != NULL) {
2721		zio->io_vsd_ops->vsd_free(zio);
2722		zio->io_vsd = NULL;
2723	}
2724
2725	if (zio_injection_enabled && zio->io_error == 0)
2726		zio->io_error = zio_handle_fault_injection(zio, EIO);
2727
2728	if (zio->io_type == ZIO_TYPE_IOCTL && zio->io_cmd == DKIOCTRIM)
2729		switch (zio->io_error) {
2730		case 0:
2731			ZIO_TRIM_STAT_INCR(bytes, zio->io_size);
2732			ZIO_TRIM_STAT_BUMP(success);
2733			break;
2734		case EOPNOTSUPP:
2735			ZIO_TRIM_STAT_BUMP(unsupported);
2736			break;
2737		default:
2738			ZIO_TRIM_STAT_BUMP(failed);
2739			break;
2740		}
2741
2742	/*
2743	 * If the I/O failed, determine whether we should attempt to retry it.
2744	 *
2745	 * On retry, we cut in line in the issue queue, since we don't want
2746	 * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
2747	 */
2748	if (zio->io_error && vd == NULL &&
2749	    !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
2750		ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE));	/* not a leaf */
2751		ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS));	/* not a leaf */
2752		zio->io_error = 0;
2753		zio->io_flags |= ZIO_FLAG_IO_RETRY |
2754		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
2755		zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
2756		zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
2757		    zio_requeue_io_start_cut_in_line);
2758		return (ZIO_PIPELINE_STOP);
2759	}
2760
2761	/*
2762	 * If we got an error on a leaf device, convert it to ENXIO
2763	 * if the device is not accessible at all.
2764	 */
2765	if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
2766	    !vdev_accessible(vd, zio))
2767		zio->io_error = SET_ERROR(ENXIO);
2768
2769	/*
2770	 * If we can't write to an interior vdev (mirror or RAID-Z),
2771	 * set vdev_cant_write so that we stop trying to allocate from it.
2772	 */
2773	if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
2774	    vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
2775		vd->vdev_cant_write = B_TRUE;
2776	}
2777
2778	if (zio->io_error)
2779		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2780
2781	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
2782	    zio->io_physdone != NULL) {
2783		ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
2784		ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
2785		zio->io_physdone(zio->io_logical);
2786	}
2787
2788	return (ZIO_PIPELINE_CONTINUE);
2789}
2790
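/*
 * The helpers below let vdev implementations rewind or skip vdev I/O
 * stages: shifting io_stage right by one bit makes zio_execute() re-run
 * the current stage, while zio_vdev_io_bypass() jumps straight to the
 * assess stage.
 */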
2791void
2792zio_vdev_io_reissue(zio_t *zio)
2793{
2794	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
2795	ASSERT(zio->io_error == 0);
2796
2797	zio->io_stage >>= 1;
2798}
2799
2800void
2801zio_vdev_io_redone(zio_t *zio)
2802{
2803	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
2804
2805	zio->io_stage >>= 1;
2806}
2807
2808void
2809zio_vdev_io_bypass(zio_t *zio)
2810{
2811	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
2812	ASSERT(zio->io_error == 0);
2813
2814	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
2815	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
2816}
2817
2818/*
2819 * ==========================================================================
2820 * Generate and verify checksums
2821 * ==========================================================================
2822 */
2823static int
2824zio_checksum_generate(zio_t **ziop)
2825{
2826	zio_t *zio = *ziop;
2827	blkptr_t *bp = zio->io_bp;
2828	enum zio_checksum checksum;
2829
2830	if (bp == NULL) {
2831		/*
2832		 * This is zio_write_phys().
2833		 * We're either generating a label checksum, or none at all.
2834		 */
2835		checksum = zio->io_prop.zp_checksum;
2836
2837		if (checksum == ZIO_CHECKSUM_OFF)
2838			return (ZIO_PIPELINE_CONTINUE);
2839
2840		ASSERT(checksum == ZIO_CHECKSUM_LABEL);
2841	} else {
2842		if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
2843			ASSERT(!IO_IS_ALLOCATING(zio));
2844			checksum = ZIO_CHECKSUM_GANG_HEADER;
2845		} else {
2846			checksum = BP_GET_CHECKSUM(bp);
2847		}
2848	}
2849
2850	zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);
2851
2852	return (ZIO_PIPELINE_CONTINUE);
2853}
2854
2855static int
2856zio_checksum_verify(zio_t **ziop)
2857{
2858	zio_t *zio = *ziop;
2859	zio_bad_cksum_t info;
2860	blkptr_t *bp = zio->io_bp;
2861	int error;
2862
2863	ASSERT(zio->io_vd != NULL);
2864
2865	if (bp == NULL) {
2866		/*
2867		 * This is zio_read_phys().
2868		 * We're either verifying a label checksum, or nothing at all.
2869		 */
2870		if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
2871			return (ZIO_PIPELINE_CONTINUE);
2872
2873		ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
2874	}
2875
2876	if ((error = zio_checksum_error(zio, &info)) != 0) {
2877		zio->io_error = error;
2878		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2879			zfs_ereport_start_checksum(zio->io_spa,
2880			    zio->io_vd, zio, zio->io_offset,
2881			    zio->io_size, NULL, &info);
2882		}
2883	}
2884
2885	return (ZIO_PIPELINE_CONTINUE);
2886}
2887
2888/*
2889 * Called by RAID-Z to ensure we don't compute the checksum twice.
2890 */
2891void
2892zio_checksum_verified(zio_t *zio)
2893{
2894	zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
2895}
2896
2897/*
2898 * ==========================================================================
2899 * Error rank.  Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
2900 * An error of 0 indicates success.  ENXIO indicates whole-device failure,
2901 * which may be transient (e.g. unplugged) or permanent.  ECKSUM and EIO
2902 * indicate errors that are specific to one I/O, and most likely permanent.
2903 * Any other error is presumed to be worse because we weren't expecting it.
2904 * ==========================================================================
2905 */
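/*
 * For example, zio_worst_error(ENXIO, ECKSUM) is ECKSUM, and any errno
 * missing from the rank table is treated as worse than all ranked errors.
 */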
2906int
2907zio_worst_error(int e1, int e2)
2908{
2909	static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
2910	int r1, r2;
2911
2912	for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
2913		if (e1 == zio_error_rank[r1])
2914			break;
2915
2916	for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
2917		if (e2 == zio_error_rank[r2])
2918			break;
2919
2920	return (r1 > r2 ? e1 : e2);
2921}
2922
2923/*
2924 * ==========================================================================
2925 * I/O completion
2926 * ==========================================================================
2927 */
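/*
 * Pipeline stage: the zio is "ready" -- for allocating writes its block
 * pointer now has DVAs -- so fire the ready callback and notify parents
 * waiting on ZIO_WAIT_READY; NODATA writes that did not become gang blocks
 * skip the vdev stages entirely.
 */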
2928static int
2929zio_ready(zio_t **ziop)
2930{
2931	zio_t *zio = *ziop;
2932	blkptr_t *bp = zio->io_bp;
2933	zio_t *pio, *pio_next;
2934
2935	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
2936	    zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
2937		return (ZIO_PIPELINE_STOP);
2938
2939	if (zio->io_ready) {
2940		ASSERT(IO_IS_ALLOCATING(zio));
2941		ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
2942		    (zio->io_flags & ZIO_FLAG_NOPWRITE));
2943		ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
2944
2945		zio->io_ready(zio);
2946	}
2947
2948	if (bp != NULL && bp != &zio->io_bp_copy)
2949		zio->io_bp_copy = *bp;
2950
2951	if (zio->io_error)
2952		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2953
2954	mutex_enter(&zio->io_lock);
2955	zio->io_state[ZIO_WAIT_READY] = 1;
2956	pio = zio_walk_parents(zio);
2957	mutex_exit(&zio->io_lock);
2958
2959	/*
2960	 * As we notify zio's parents, new parents could be added.
2961	 * New parents go to the head of zio's io_parent_list, however,
2962	 * so we will (correctly) not notify them.  The remainder of zio's
2963	 * io_parent_list, from 'pio_next' onward, cannot change because
2964	 * all parents must wait for us to be done before they can be done.
2965	 */
2966	for (; pio != NULL; pio = pio_next) {
2967		pio_next = zio_walk_parents(zio);
2968		zio_notify_parent(pio, zio, ZIO_WAIT_READY);
2969	}
2970
2971	if (zio->io_flags & ZIO_FLAG_NODATA) {
2972		if (BP_IS_GANG(bp)) {
2973			zio->io_flags &= ~ZIO_FLAG_NODATA;
2974		} else {
2975			ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE);
2976			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
2977		}
2978	}
2979
2980	if (zio_injection_enabled &&
2981	    zio->io_spa->spa_syncing_txg == zio->io_txg)
2982		zio_handle_ignored_writes(zio);
2983
2984	return (ZIO_PIPELINE_CONTINUE);
2985}
2986
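/*
 * Final pipeline stage: wait for all remaining children, sanity-check the
 * resulting block pointer, update vdev stats, post any FMA ereports, decide
 * whether the I/O must be reexecuted or the pool suspended, and finally
 * notify parents and either wake the waiter (zio_wait()) or destroy the zio.
 */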
2987static int
2988zio_done(zio_t **ziop)
2989{
2990	zio_t *zio = *ziop;
2991	spa_t *spa = zio->io_spa;
2992	zio_t *lio = zio->io_logical;
2993	blkptr_t *bp = zio->io_bp;
2994	vdev_t *vd = zio->io_vd;
2995	uint64_t psize = zio->io_size;
2996	zio_t *pio, *pio_next;
2997
2998	/*
2999	 * If our children haven't all completed,
3000	 * wait for them and then repeat this pipeline stage.
3001	 */
3002	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
3003	    zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
3004	    zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
3005	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
3006		return (ZIO_PIPELINE_STOP);
3007
3008	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
3009		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
3010			ASSERT(zio->io_children[c][w] == 0);
3011
3012	if (bp != NULL) {
3013		ASSERT(bp->blk_pad[0] == 0);
3014		ASSERT(bp->blk_pad[1] == 0);
3015		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
3016		    (bp == zio_unique_parent(zio)->io_bp));
3017		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
3018		    zio->io_bp_override == NULL &&
3019		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
3020			ASSERT(!BP_SHOULD_BYTESWAP(bp));
3021			ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp));
3022			ASSERT(BP_COUNT_GANG(bp) == 0 ||
3023			    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
3024		}
3025		if (zio->io_flags & ZIO_FLAG_NOPWRITE)
3026			VERIFY(BP_EQUAL(bp, &zio->io_bp_orig));
3027	}
3028
3029	/*
3030	 * If there were child vdev/gang/ddt errors, they apply to us now.
3031	 */
3032	zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
3033	zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
3034	zio_inherit_child_errors(zio, ZIO_CHILD_DDT);
3035
3036	/*
3037	 * If the I/O on the transformed data was successful, generate any
3038	 * checksum reports now while we still have the transformed data.
3039	 */
3040	if (zio->io_error == 0) {
3041		while (zio->io_cksum_report != NULL) {
3042			zio_cksum_report_t *zcr = zio->io_cksum_report;
3043			uint64_t align = zcr->zcr_align;
3044			uint64_t asize = P2ROUNDUP(psize, align);
3045			char *abuf = zio->io_data;
3046
3047			if (asize != psize) {
3048				abuf = zio_buf_alloc(asize);
3049				bcopy(zio->io_data, abuf, psize);
3050				bzero(abuf + psize, asize - psize);
3051			}
3052
3053			zio->io_cksum_report = zcr->zcr_next;
3054			zcr->zcr_next = NULL;
3055			zcr->zcr_finish(zcr, abuf);
3056			zfs_ereport_free_checksum(zcr);
3057
3058			if (asize != psize)
3059				zio_buf_free(abuf, asize);
3060		}
3061	}
3062
3063	zio_pop_transforms(zio);	/* note: may set zio->io_error */
3064
3065	vdev_stat_update(zio, psize);
3066
3067	if (zio->io_error) {
3068		/*
3069		 * If this I/O is attached to a particular vdev,
3070		 * generate an error message describing the I/O failure
3071		 * at the block level.  We ignore these errors if the
3072		 * device is currently unavailable.
3073		 */
3074		if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
3075			zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);
3076
3077		if ((zio->io_error == EIO || !(zio->io_flags &
3078		    (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
3079		    zio == lio) {
3080			/*
3081			 * For logical I/O requests, tell the SPA to log the
3082			 * error and generate a logical data ereport.
3083			 */
3084			spa_log_error(spa, zio);
3085			zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
3086			    0, 0);
3087		}
3088	}
3089
3090	if (zio->io_error && zio == lio) {
3091		/*
3092		 * Determine whether zio should be reexecuted.  This will
3093		 * propagate all the way to the root via zio_notify_parent().
3094		 */
3095		ASSERT(vd == NULL && bp != NULL);
3096		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
3097
3098		if (IO_IS_ALLOCATING(zio) &&
3099		    !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
3100			if (zio->io_error != ENOSPC)
3101				zio->io_reexecute |= ZIO_REEXECUTE_NOW;
3102			else
3103				zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
3104		}
3105
3106		if ((zio->io_type == ZIO_TYPE_READ ||
3107		    zio->io_type == ZIO_TYPE_FREE) &&
3108		    !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
3109		    zio->io_error == ENXIO &&
3110		    spa_load_state(spa) == SPA_LOAD_NONE &&
3111		    spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
3112			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
3113
3114		if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
3115			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
3116
3117		/*
3118		 * Here is a possibly good place to attempt to do
3119		 * either combinatorial reconstruction or error correction
3120		 * based on checksums.  It also might be a good place
3121		 * to send out preliminary ereports before we suspend
3122		 * processing.
3123		 */
3124	}
3125
3126	/*
3127	 * If there were logical child errors, they apply to us now.
3128	 * We defer this until now to avoid conflating logical child
3129	 * errors with errors that happened to the zio itself when
3130	 * updating vdev stats and reporting FMA events above.
3131	 */
3132	zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
3133
3134	if ((zio->io_error || zio->io_reexecute) &&
3135	    IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
3136	    !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
3137		zio_dva_unallocate(zio, zio->io_gang_tree, bp);
3138
3139	zio_gang_tree_free(&zio->io_gang_tree);
3140
3141	/*
3142	 * Godfather I/Os should never suspend.
3143	 */
3144	if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
3145	    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
3146		zio->io_reexecute = 0;
3147
3148	if (zio->io_reexecute) {
3149		/*
3150		 * This is a logical I/O that wants to reexecute.
3151		 *
3152		 * Reexecute is top-down.  When an i/o fails, if it's not
3153		 * the root, it simply notifies its parent and sticks around.
3154		 * The parent, seeing that it still has children in zio_done(),
3155		 * does the same.  This percolates all the way up to the root.
3156		 * The root i/o will reexecute or suspend the entire tree.
3157		 *
3158		 * This approach ensures that zio_reexecute() honors
3159		 * all the original i/o dependency relationships, e.g.
3160		 * parents not executing until children are ready.
3161		 */
3162		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
3163
3164		zio->io_gang_leader = NULL;
3165
3166		mutex_enter(&zio->io_lock);
3167		zio->io_state[ZIO_WAIT_DONE] = 1;
3168		mutex_exit(&zio->io_lock);
3169
3170		/*
3171		 * "The Godfather" I/O monitors its children but is
3172		 * not a true parent to them. It will track them through
3173		 * the pipeline but severs its ties whenever they get into
3174		 * trouble (e.g. suspended). This allows "The Godfather"
3175		 * I/O to return status without blocking.
3176		 */
3177		for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
3178			zio_link_t *zl = zio->io_walk_link;
3179			pio_next = zio_walk_parents(zio);
3180
3181			if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
3182			    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
3183				zio_remove_child(pio, zio, zl);
3184				zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
3185			}
3186		}
3187
3188		if ((pio = zio_unique_parent(zio)) != NULL) {
3189			/*
3190			 * We're not a root i/o, so there's nothing to do
3191			 * but notify our parent.  Don't propagate errors
3192			 * upward since we haven't permanently failed yet.
3193			 */
3194			ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
3195			zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
3196			zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
3197		} else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
3198			/*
3199			 * We'd fail again if we reexecuted now, so suspend
3200			 * until conditions improve (e.g. device comes online).
3201			 */
3202			zio_suspend(spa, zio);
3203		} else {
3204			/*
3205			 * Reexecution is potentially a huge amount of work.
3206			 * Hand it off to the otherwise-unused claim taskq.
3207			 */
3208#if defined(illumos) || !defined(_KERNEL)
3209			ASSERT(zio->io_tqent.tqent_next == NULL);
3210#else
3211			ASSERT(zio->io_tqent.tqent_task.ta_pending == 0);
3212#endif
3213			spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM,
3214			    ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio,
3215			    0, &zio->io_tqent);
3216		}
3217		return (ZIO_PIPELINE_STOP);
3218	}
3219
3220	ASSERT(zio->io_child_count == 0);
3221	ASSERT(zio->io_reexecute == 0);
3222	ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
3223
3224	/*
3225	 * Report any checksum errors, since the I/O is complete.
3226	 */
3227	while (zio->io_cksum_report != NULL) {
3228		zio_cksum_report_t *zcr = zio->io_cksum_report;
3229		zio->io_cksum_report = zcr->zcr_next;
3230		zcr->zcr_next = NULL;
3231		zcr->zcr_finish(zcr, NULL);
3232		zfs_ereport_free_checksum(zcr);
3233	}
3234
3235	/*
3236	 * It is the responsibility of the done callback to ensure that this
3237	 * particular zio is no longer discoverable for adoption, and as
3238	 * such, cannot acquire any new parents.
3239	 */
3240	if (zio->io_done)
3241		zio->io_done(zio);
3242
3243	mutex_enter(&zio->io_lock);
3244	zio->io_state[ZIO_WAIT_DONE] = 1;
3245	mutex_exit(&zio->io_lock);
3246
3247	for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
3248		zio_link_t *zl = zio->io_walk_link;
3249		pio_next = zio_walk_parents(zio);
3250		zio_remove_child(pio, zio, zl);
3251		zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
3252	}
3253
3254	if (zio->io_waiter != NULL) {
3255		mutex_enter(&zio->io_lock);
3256		zio->io_executor = NULL;
3257		cv_broadcast(&zio->io_cv);
3258		mutex_exit(&zio->io_lock);
3259	} else {
3260		zio_destroy(zio);
3261	}
3262
3263	return (ZIO_PIPELINE_STOP);
3264}
3265
3266/*
3267 * ==========================================================================
3268 * I/O pipeline definition
3269 * ==========================================================================
3270 */
3271static zio_pipe_stage_t *zio_pipeline[] = {
3272	NULL,
3273	zio_read_bp_init,
3274	zio_free_bp_init,
3275	zio_issue_async,
3276	zio_write_bp_init,
3277	zio_checksum_generate,
3278	zio_nop_write,
3279	zio_ddt_read_start,
3280	zio_ddt_read_done,
3281	zio_ddt_write,
3282	zio_ddt_free,
3283	zio_gang_assemble,
3284	zio_gang_issue,
3285	zio_dva_allocate,
3286	zio_dva_free,
3287	zio_dva_claim,
3288	zio_ready,
3289	zio_vdev_io_start,
3290	zio_vdev_io_done,
3291	zio_vdev_io_assess,
3292	zio_checksum_verify,
3293	zio_done
3294};
3295
3296/* dnp is the dnode for zb1->zb_object */
3297boolean_t
3298zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
3299    const zbookmark_t *zb2)
3300{
3301	uint64_t zb1nextL0, zb2thisobj;
3302
3303	ASSERT(zb1->zb_objset == zb2->zb_objset);
3304	ASSERT(zb2->zb_level == 0);
3305
3306	/*
3307	 * A bookmark in the deadlist is considered to be after
3308	 * everything else.
3309	 */
3310	if (zb2->zb_object == DMU_DEADLIST_OBJECT)
3311		return (B_TRUE);
3312
3313	/* The objset_phys_t isn't before anything. */
3314	if (dnp == NULL)
3315		return (B_FALSE);
3316
3317	zb1nextL0 = (zb1->zb_blkid + 1) <<
3318	    ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
3319
3320	zb2thisobj = zb2->zb_object ? zb2->zb_object :
3321	    zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
3322
3323	if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
3324		uint64_t nextobj = zb1nextL0 *
3325		    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
3326		return (nextobj <= zb2thisobj);
3327	}
3328
3329	if (zb1->zb_object < zb2thisobj)
3330		return (B_TRUE);
3331	if (zb1->zb_object > zb2thisobj)
3332		return (B_FALSE);
3333	if (zb2->zb_object == DMU_META_DNODE_OBJECT)
3334		return (B_FALSE);
3335	return (zb1nextL0 <= zb2->zb_blkid);
3336}
3337