zio.c revision 273348
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
24 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
25 */
26
27#include <sys/zfs_context.h>
28#include <sys/fm/fs/zfs.h>
29#include <sys/spa.h>
30#include <sys/txg.h>
31#include <sys/spa_impl.h>
32#include <sys/vdev_impl.h>
33#include <sys/zio_impl.h>
34#include <sys/zio_compress.h>
35#include <sys/zio_checksum.h>
36#include <sys/dmu_objset.h>
37#include <sys/arc.h>
38#include <sys/ddt.h>
39#include <sys/trim_map.h>
40#include <sys/blkptr.h>
41#include <sys/zfeature.h>
42
43SYSCTL_DECL(_vfs_zfs);
44SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
45#if defined(__amd64__)
46static int zio_use_uma = 1;
47#else
48static int zio_use_uma = 0;
49#endif
50TUNABLE_INT("vfs.zfs.zio.use_uma", &zio_use_uma);
51SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0,
52    "Use uma(9) for ZIO allocations");
53static int zio_exclude_metadata = 0;
54TUNABLE_INT("vfs.zfs.zio.exclude_metadata", &zio_exclude_metadata);
55SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0,
56    "Exclude metadata buffers from dumps as well");
57
58zio_trim_stats_t zio_trim_stats = {
59	{ "bytes",		KSTAT_DATA_UINT64,
60	  "Number of bytes successfully TRIMmed" },
61	{ "success",		KSTAT_DATA_UINT64,
62	  "Number of successful TRIM requests" },
63	{ "unsupported",	KSTAT_DATA_UINT64,
64	  "Number of TRIM requests that failed because TRIM is not supported" },
65	{ "failed",		KSTAT_DATA_UINT64,
66	  "Number of TRIM requests that failed for reasons other than not supported" },
67};
68
69static kstat_t *zio_trim_ksp;
70
71/*
72 * ==========================================================================
73 * I/O type descriptions
74 * ==========================================================================
75 */
76const char *zio_type_name[ZIO_TYPES] = {
77	"zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
78	"zio_ioctl"
79};
80
81/*
82 * ==========================================================================
83 * I/O kmem caches
84 * ==========================================================================
85 */
86kmem_cache_t *zio_cache;
87kmem_cache_t *zio_link_cache;
88kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
89kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
90
91#ifdef _KERNEL
92extern vmem_t *zio_alloc_arena;
93#endif
94
95/*
96 * The following actions directly effect the spa's sync-to-convergence logic.
97 * The values below define the sync pass when we start performing the action.
98 * Care should be taken when changing these values as they directly impact
99 * spa_sync() performance. Tuning these values may introduce subtle performance
100 * pathologies and should only be done in the context of performance analysis.
101 * These tunables will eventually be removed and replaced with #defines once
102 * enough analysis has been done to determine optimal values.
103 *
104 * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
105 * regular blocks are not deferred.
106 */
107int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
108TUNABLE_INT("vfs.zfs.sync_pass_deferred_free", &zfs_sync_pass_deferred_free);
109SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN,
110    &zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass");
111int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
112TUNABLE_INT("vfs.zfs.sync_pass_dont_compress", &zfs_sync_pass_dont_compress);
113SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN,
114    &zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass");
115int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
116TUNABLE_INT("vfs.zfs.sync_pass_rewrite", &zfs_sync_pass_rewrite);
117SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN,
118    &zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass");
119
120/*
121 * An allocating zio is one that either currently has the DVA allocate
122 * stage set or will have it later in its lifetime.
123 */
124#define	IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
125
126boolean_t	zio_requeue_io_start_cut_in_line = B_TRUE;
127
128#ifdef ZFS_DEBUG
129int zio_buf_debug_limit = 16384;
130#else
131int zio_buf_debug_limit = 0;
132#endif
133
134void
135zio_init(void)
136{
137	size_t c;
138	zio_cache = kmem_cache_create("zio_cache",
139	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
140	zio_link_cache = kmem_cache_create("zio_link_cache",
141	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
142	if (!zio_use_uma)
143		goto out;
144
145	/*
146	 * For small buffers, we want a cache for each multiple of
147	 * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
148	 * for each quarter-power of 2.  For large buffers, we want
149	 * a cache for each multiple of PAGESIZE.
150	 */
151	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
152		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
153		size_t p2 = size;
154		size_t align = 0;
155		size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;
156
157		while (p2 & (p2 - 1))
158			p2 &= p2 - 1;
159
160#ifdef illumos
161#ifndef _KERNEL
162		/*
163		 * If we are using watchpoints, put each buffer on its own page,
164		 * to eliminate the performance overhead of trapping to the
165		 * kernel when modifying a non-watched buffer that shares the
166		 * page with a watched buffer.
167		 */
168		if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
169			continue;
170#endif
171#endif /* illumos */
172		if (size <= 4 * SPA_MINBLOCKSIZE) {
173			align = SPA_MINBLOCKSIZE;
174		} else if (IS_P2ALIGNED(size, PAGESIZE)) {
175			align = PAGESIZE;
176		} else if (IS_P2ALIGNED(size, p2 >> 2)) {
177			align = p2 >> 2;
178		}
179
180		if (align != 0) {
181			char name[36];
182			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
183			zio_buf_cache[c] = kmem_cache_create(name, size,
184			    align, NULL, NULL, NULL, NULL, NULL, cflags);
185
186			/*
187			 * Since zio_data bufs do not appear in crash dumps, we
188			 * pass KMC_NOTOUCH so that no allocator metadata is
189			 * stored with the buffers.
190			 */
191			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
192			zio_data_buf_cache[c] = kmem_cache_create(name, size,
193			    align, NULL, NULL, NULL, NULL, NULL,
194			    cflags | KMC_NOTOUCH | KMC_NODEBUG);
195		}
196	}
197
198	while (--c != 0) {
199		ASSERT(zio_buf_cache[c] != NULL);
200		if (zio_buf_cache[c - 1] == NULL)
201			zio_buf_cache[c - 1] = zio_buf_cache[c];
202
203		ASSERT(zio_data_buf_cache[c] != NULL);
204		if (zio_data_buf_cache[c - 1] == NULL)
205			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
206	}
207out:
208
209	zio_inject_init();
210
211	zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc",
212	    KSTAT_TYPE_NAMED,
213	    sizeof(zio_trim_stats) / sizeof(kstat_named_t),
214	    KSTAT_FLAG_VIRTUAL);
215
216	if (zio_trim_ksp != NULL) {
217		zio_trim_ksp->ks_data = &zio_trim_stats;
218		kstat_install(zio_trim_ksp);
219	}
220}
221
222void
223zio_fini(void)
224{
225	size_t c;
226	kmem_cache_t *last_cache = NULL;
227	kmem_cache_t *last_data_cache = NULL;
228
229	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
230		if (zio_buf_cache[c] != last_cache) {
231			last_cache = zio_buf_cache[c];
232			kmem_cache_destroy(zio_buf_cache[c]);
233		}
234		zio_buf_cache[c] = NULL;
235
236		if (zio_data_buf_cache[c] != last_data_cache) {
237			last_data_cache = zio_data_buf_cache[c];
238			kmem_cache_destroy(zio_data_buf_cache[c]);
239		}
240		zio_data_buf_cache[c] = NULL;
241	}
242
243	kmem_cache_destroy(zio_link_cache);
244	kmem_cache_destroy(zio_cache);
245
246	zio_inject_fini();
247
248	if (zio_trim_ksp != NULL) {
249		kstat_delete(zio_trim_ksp);
250		zio_trim_ksp = NULL;
251	}
252}
253
254/*
255 * ==========================================================================
256 * Allocate and free I/O buffers
257 * ==========================================================================
258 */
259
260/*
261 * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
262 * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
263 * useful to inspect ZFS metadata, but if possible, we should avoid keeping
264 * excess / transient data in-core during a crashdump.
265 */
266void *
267zio_buf_alloc(size_t size)
268{
269	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
270	int flags = zio_exclude_metadata ? KM_NODEBUG : 0;
271
272	ASSERT3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
273
274	if (zio_use_uma)
275		return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
276	else
277		return (kmem_alloc(size, KM_SLEEP|flags));
278}
279
280/*
281 * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
282 * crashdump if the kernel panics.  This exists so that we will limit the amount
283 * of ZFS data that shows up in a kernel crashdump.  (Thus reducing the amount
284 * of kernel heap dumped to disk when the kernel panics)
285 */
286void *
287zio_data_buf_alloc(size_t size)
288{
289	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
290
291	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
292
293	if (zio_use_uma)
294		return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
295	else
296		return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG));
297}
298
299void
300zio_buf_free(void *buf, size_t size)
301{
302	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
303
304	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
305
306	if (zio_use_uma)
307		kmem_cache_free(zio_buf_cache[c], buf);
308	else
309		kmem_free(buf, size);
310}
311
312void
313zio_data_buf_free(void *buf, size_t size)
314{
315	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
316
317	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
318
319	if (zio_use_uma)
320		kmem_cache_free(zio_data_buf_cache[c], buf);
321	else
322		kmem_free(buf, size);
323}
324
325/*
326 * ==========================================================================
327 * Push and pop I/O transform buffers
328 * ==========================================================================
329 */
330static void
331zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
332	zio_transform_func_t *transform)
333{
334	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
335
336	zt->zt_orig_data = zio->io_data;
337	zt->zt_orig_size = zio->io_size;
338	zt->zt_bufsize = bufsize;
339	zt->zt_transform = transform;
340
341	zt->zt_next = zio->io_transform_stack;
342	zio->io_transform_stack = zt;
343
344	zio->io_data = data;
345	zio->io_size = size;
346}
347
348static void
349zio_pop_transforms(zio_t *zio)
350{
351	zio_transform_t *zt;
352
353	while ((zt = zio->io_transform_stack) != NULL) {
354		if (zt->zt_transform != NULL)
355			zt->zt_transform(zio,
356			    zt->zt_orig_data, zt->zt_orig_size);
357
358		if (zt->zt_bufsize != 0)
359			zio_buf_free(zio->io_data, zt->zt_bufsize);
360
361		zio->io_data = zt->zt_orig_data;
362		zio->io_size = zt->zt_orig_size;
363		zio->io_transform_stack = zt->zt_next;
364
365		kmem_free(zt, sizeof (zio_transform_t));
366	}
367}
368
369/*
370 * ==========================================================================
371 * I/O transform callbacks for subblocks and decompression
372 * ==========================================================================
373 */
374static void
375zio_subblock(zio_t *zio, void *data, uint64_t size)
376{
377	ASSERT(zio->io_size > size);
378
379	if (zio->io_type == ZIO_TYPE_READ)
380		bcopy(zio->io_data, data, size);
381}
382
383static void
384zio_decompress(zio_t *zio, void *data, uint64_t size)
385{
386	if (zio->io_error == 0 &&
387	    zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
388	    zio->io_data, data, zio->io_size, size) != 0)
389		zio->io_error = SET_ERROR(EIO);
390}
391
392/*
393 * ==========================================================================
394 * I/O parent/child relationships and pipeline interlocks
395 * ==========================================================================
396 */
397/*
398 * NOTE - Callers to zio_walk_parents() and zio_walk_children must
399 *        continue calling these functions until they return NULL.
400 *        Otherwise, the next caller will pick up the list walk in
401 *        some indeterminate state.  (Otherwise every caller would
402 *        have to pass in a cookie to keep the state represented by
403 *        io_walk_link, which gets annoying.)
404 */
405zio_t *
406zio_walk_parents(zio_t *cio)
407{
408	zio_link_t *zl = cio->io_walk_link;
409	list_t *pl = &cio->io_parent_list;
410
411	zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
412	cio->io_walk_link = zl;
413
414	if (zl == NULL)
415		return (NULL);
416
417	ASSERT(zl->zl_child == cio);
418	return (zl->zl_parent);
419}
420
421zio_t *
422zio_walk_children(zio_t *pio)
423{
424	zio_link_t *zl = pio->io_walk_link;
425	list_t *cl = &pio->io_child_list;
426
427	zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
428	pio->io_walk_link = zl;
429
430	if (zl == NULL)
431		return (NULL);
432
433	ASSERT(zl->zl_parent == pio);
434	return (zl->zl_child);
435}
436
437zio_t *
438zio_unique_parent(zio_t *cio)
439{
440	zio_t *pio = zio_walk_parents(cio);
441
442	VERIFY(zio_walk_parents(cio) == NULL);
443	return (pio);
444}
445
446void
447zio_add_child(zio_t *pio, zio_t *cio)
448{
449	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
450
451	/*
452	 * Logical I/Os can have logical, gang, or vdev children.
453	 * Gang I/Os can have gang or vdev children.
454	 * Vdev I/Os can only have vdev children.
455	 * The following ASSERT captures all of these constraints.
456	 */
457	ASSERT(cio->io_child_type <= pio->io_child_type);
458
459	zl->zl_parent = pio;
460	zl->zl_child = cio;
461
462	mutex_enter(&cio->io_lock);
463	mutex_enter(&pio->io_lock);
464
465	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
466
467	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
468		pio->io_children[cio->io_child_type][w] += !cio->io_state[w];
469
470	list_insert_head(&pio->io_child_list, zl);
471	list_insert_head(&cio->io_parent_list, zl);
472
473	pio->io_child_count++;
474	cio->io_parent_count++;
475
476	mutex_exit(&pio->io_lock);
477	mutex_exit(&cio->io_lock);
478}
479
480static void
481zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
482{
483	ASSERT(zl->zl_parent == pio);
484	ASSERT(zl->zl_child == cio);
485
486	mutex_enter(&cio->io_lock);
487	mutex_enter(&pio->io_lock);
488
489	list_remove(&pio->io_child_list, zl);
490	list_remove(&cio->io_parent_list, zl);
491
492	pio->io_child_count--;
493	cio->io_parent_count--;
494
495	mutex_exit(&pio->io_lock);
496	mutex_exit(&cio->io_lock);
497
498	kmem_cache_free(zio_link_cache, zl);
499}
500
501static boolean_t
502zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
503{
504	uint64_t *countp = &zio->io_children[child][wait];
505	boolean_t waiting = B_FALSE;
506
507	mutex_enter(&zio->io_lock);
508	ASSERT(zio->io_stall == NULL);
509	if (*countp != 0) {
510		zio->io_stage >>= 1;
511		zio->io_stall = countp;
512		waiting = B_TRUE;
513	}
514	mutex_exit(&zio->io_lock);
515
516	return (waiting);
517}
518
519static void
520zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
521{
522	uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
523	int *errorp = &pio->io_child_error[zio->io_child_type];
524
525	mutex_enter(&pio->io_lock);
526	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
527		*errorp = zio_worst_error(*errorp, zio->io_error);
528	pio->io_reexecute |= zio->io_reexecute;
529	ASSERT3U(*countp, >, 0);
530
531	(*countp)--;
532
533	if (*countp == 0 && pio->io_stall == countp) {
534		pio->io_stall = NULL;
535		mutex_exit(&pio->io_lock);
536		zio_execute(pio);
537	} else {
538		mutex_exit(&pio->io_lock);
539	}
540}
541
542static void
543zio_inherit_child_errors(zio_t *zio, enum zio_child c)
544{
545	if (zio->io_child_error[c] != 0 && zio->io_error == 0)
546		zio->io_error = zio->io_child_error[c];
547}
548
549/*
550 * ==========================================================================
551 * Create the various types of I/O (read, write, free, etc)
552 * ==========================================================================
553 */
554static zio_t *
555zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
556    void *data, uint64_t size, zio_done_func_t *done, void *private,
557    zio_type_t type, zio_priority_t priority, enum zio_flag flags,
558    vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb,
559    enum zio_stage stage, enum zio_stage pipeline)
560{
561	zio_t *zio;
562
563	ASSERT3U(type == ZIO_TYPE_FREE || size, <=, SPA_MAXBLOCKSIZE);
564	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
565	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
566
567	ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
568	ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
569	ASSERT(vd || stage == ZIO_STAGE_OPEN);
570
571	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
572	bzero(zio, sizeof (zio_t));
573
574	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
575	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
576
577	list_create(&zio->io_parent_list, sizeof (zio_link_t),
578	    offsetof(zio_link_t, zl_parent_node));
579	list_create(&zio->io_child_list, sizeof (zio_link_t),
580	    offsetof(zio_link_t, zl_child_node));
581
582	if (vd != NULL)
583		zio->io_child_type = ZIO_CHILD_VDEV;
584	else if (flags & ZIO_FLAG_GANG_CHILD)
585		zio->io_child_type = ZIO_CHILD_GANG;
586	else if (flags & ZIO_FLAG_DDT_CHILD)
587		zio->io_child_type = ZIO_CHILD_DDT;
588	else
589		zio->io_child_type = ZIO_CHILD_LOGICAL;
590
591	if (bp != NULL) {
592		zio->io_bp = (blkptr_t *)bp;
593		zio->io_bp_copy = *bp;
594		zio->io_bp_orig = *bp;
595		if (type != ZIO_TYPE_WRITE ||
596		    zio->io_child_type == ZIO_CHILD_DDT)
597			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
598		if (zio->io_child_type == ZIO_CHILD_LOGICAL)
599			zio->io_logical = zio;
600		if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
601			pipeline |= ZIO_GANG_STAGES;
602	}
603
604	zio->io_spa = spa;
605	zio->io_txg = txg;
606	zio->io_done = done;
607	zio->io_private = private;
608	zio->io_type = type;
609	zio->io_priority = priority;
610	zio->io_vd = vd;
611	zio->io_offset = offset;
612	zio->io_orig_data = zio->io_data = data;
613	zio->io_orig_size = zio->io_size = size;
614	zio->io_orig_flags = zio->io_flags = flags;
615	zio->io_orig_stage = zio->io_stage = stage;
616	zio->io_orig_pipeline = zio->io_pipeline = pipeline;
617
618	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
619	zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
620
621	if (zb != NULL)
622		zio->io_bookmark = *zb;
623
624	if (pio != NULL) {
625		if (zio->io_logical == NULL)
626			zio->io_logical = pio->io_logical;
627		if (zio->io_child_type == ZIO_CHILD_GANG)
628			zio->io_gang_leader = pio->io_gang_leader;
629		zio_add_child(pio, zio);
630	}
631
632	return (zio);
633}
634
635static void
636zio_destroy(zio_t *zio)
637{
638	list_destroy(&zio->io_parent_list);
639	list_destroy(&zio->io_child_list);
640	mutex_destroy(&zio->io_lock);
641	cv_destroy(&zio->io_cv);
642	kmem_cache_free(zio_cache, zio);
643}
644
645zio_t *
646zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
647    void *private, enum zio_flag flags)
648{
649	zio_t *zio;
650
651	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
652	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
653	    ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
654
655	return (zio);
656}
657
658zio_t *
659zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
660{
661	return (zio_null(NULL, spa, NULL, done, private, flags));
662}
663
664zio_t *
665zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
666    void *data, uint64_t size, zio_done_func_t *done, void *private,
667    zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
668{
669	zio_t *zio;
670
671	zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
672	    data, size, done, private,
673	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
674	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
675	    ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
676
677	return (zio);
678}
679
680zio_t *
681zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
682    void *data, uint64_t size, const zio_prop_t *zp,
683    zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done,
684    void *private,
685    zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
686{
687	zio_t *zio;
688
689	ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
690	    zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
691	    zp->zp_compress >= ZIO_COMPRESS_OFF &&
692	    zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
693	    DMU_OT_IS_VALID(zp->zp_type) &&
694	    zp->zp_level < 32 &&
695	    zp->zp_copies > 0 &&
696	    zp->zp_copies <= spa_max_replication(spa));
697
698	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
699	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
700	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
701	    ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
702
703	zio->io_ready = ready;
704	zio->io_physdone = physdone;
705	zio->io_prop = *zp;
706
707	/*
708	 * Data can be NULL if we are going to call zio_write_override() to
709	 * provide the already-allocated BP.  But we may need the data to
710	 * verify a dedup hit (if requested).  In this case, don't try to
711	 * dedup (just take the already-allocated BP verbatim).
712	 */
713	if (data == NULL && zio->io_prop.zp_dedup_verify) {
714		zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
715	}
716
717	return (zio);
718}
719
720zio_t *
721zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
722    uint64_t size, zio_done_func_t *done, void *private,
723    zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
724{
725	zio_t *zio;
726
727	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
728	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
729	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
730
731	return (zio);
732}
733
734void
735zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
736{
737	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
738	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
739	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
740	ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
741
742	/*
743	 * We must reset the io_prop to match the values that existed
744	 * when the bp was first written by dmu_sync() keeping in mind
745	 * that nopwrite and dedup are mutually exclusive.
746	 */
747	zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
748	zio->io_prop.zp_nopwrite = nopwrite;
749	zio->io_prop.zp_copies = copies;
750	zio->io_bp_override = bp;
751}
752
753void
754zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
755{
756
757	/*
758	 * The check for EMBEDDED is a performance optimization.  We
759	 * process the free here (by ignoring it) rather than
760	 * putting it on the list and then processing it in zio_free_sync().
761	 */
762	if (BP_IS_EMBEDDED(bp))
763		return;
764	metaslab_check_free(spa, bp);
765
766	/*
767	 * Frees that are for the currently-syncing txg, are not going to be
768	 * deferred, and which will not need to do a read (i.e. not GANG or
769	 * DEDUP), can be processed immediately.  Otherwise, put them on the
770	 * in-memory list for later processing.
771	 */
772	if (zfs_trim_enabled || BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
773	    txg != spa->spa_syncing_txg ||
774	    spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
775		bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
776	} else {
777		VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp,
778		    BP_GET_PSIZE(bp), 0)));
779	}
780}
781
782zio_t *
783zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
784    uint64_t size, enum zio_flag flags)
785{
786	zio_t *zio;
787	enum zio_stage stage = ZIO_FREE_PIPELINE;
788
789	ASSERT(!BP_IS_HOLE(bp));
790	ASSERT(spa_syncing_txg(spa) == txg);
791	ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);
792
793	if (BP_IS_EMBEDDED(bp))
794		return (zio_null(pio, spa, NULL, NULL, NULL, 0));
795
796	metaslab_check_free(spa, bp);
797	arc_freed(spa, bp);
798
799	if (zfs_trim_enabled)
800		stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START |
801		    ZIO_STAGE_VDEV_IO_ASSESS;
802	/*
803	 * GANG and DEDUP blocks can induce a read (for the gang block header,
804	 * or the DDT), so issue them asynchronously so that this thread is
805	 * not tied up.
806	 */
807	else if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
808		stage |= ZIO_STAGE_ISSUE_ASYNC;
809
810	flags |= ZIO_FLAG_DONT_QUEUE;
811
812	zio = zio_create(pio, spa, txg, bp, NULL, size,
813	    NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
814	    NULL, 0, NULL, ZIO_STAGE_OPEN, stage);
815
816	return (zio);
817}
818
819zio_t *
820zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
821    zio_done_func_t *done, void *private, enum zio_flag flags)
822{
823	zio_t *zio;
824
825	dprintf_bp(bp, "claiming in txg %llu", txg);
826
827	if (BP_IS_EMBEDDED(bp))
828		return (zio_null(pio, spa, NULL, NULL, NULL, 0));
829
830	/*
831	 * A claim is an allocation of a specific block.  Claims are needed
832	 * to support immediate writes in the intent log.  The issue is that
833	 * immediate writes contain committed data, but in a txg that was
834	 * *not* committed.  Upon opening the pool after an unclean shutdown,
835	 * the intent log claims all blocks that contain immediate write data
836	 * so that the SPA knows they're in use.
837	 *
838	 * All claims *must* be resolved in the first txg -- before the SPA
839	 * starts allocating blocks -- so that nothing is allocated twice.
840	 * If txg == 0 we just verify that the block is claimable.
841	 */
842	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
843	ASSERT(txg == spa_first_txg(spa) || txg == 0);
844	ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));	/* zdb(1M) */
845
846	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
847	    done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
848	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
849
850	return (zio);
851}
852
853zio_t *
854zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset,
855    uint64_t size, zio_done_func_t *done, void *private,
856    zio_priority_t priority, enum zio_flag flags)
857{
858	zio_t *zio;
859	int c;
860
861	if (vd->vdev_children == 0) {
862		zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private,
863		    ZIO_TYPE_IOCTL, priority, flags, vd, offset, NULL,
864		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
865
866		zio->io_cmd = cmd;
867	} else {
868		zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
869
870		for (c = 0; c < vd->vdev_children; c++)
871			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
872			    offset, size, done, private, priority, flags));
873	}
874
875	return (zio);
876}
877
878zio_t *
879zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
880    void *data, int checksum, zio_done_func_t *done, void *private,
881    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
882{
883	zio_t *zio;
884
885	ASSERT(vd->vdev_children == 0);
886	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
887	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
888	ASSERT3U(offset + size, <=, vd->vdev_psize);
889
890	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
891	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
892	    NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
893
894	zio->io_prop.zp_checksum = checksum;
895
896	return (zio);
897}
898
899zio_t *
900zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
901    void *data, int checksum, zio_done_func_t *done, void *private,
902    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
903{
904	zio_t *zio;
905
906	ASSERT(vd->vdev_children == 0);
907	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
908	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
909	ASSERT3U(offset + size, <=, vd->vdev_psize);
910
911	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
912	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
913	    NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
914
915	zio->io_prop.zp_checksum = checksum;
916
917	if (zio_checksum_table[checksum].ci_eck) {
918		/*
919		 * zec checksums are necessarily destructive -- they modify
920		 * the end of the write buffer to hold the verifier/checksum.
921		 * Therefore, we must make a local copy in case the data is
922		 * being written to multiple places in parallel.
923		 */
924		void *wbuf = zio_buf_alloc(size);
925		bcopy(data, wbuf, size);
926		zio_push_transform(zio, wbuf, size, size, NULL);
927	}
928
929	return (zio);
930}
931
932/*
933 * Create a child I/O to do some work for us.
934 */
935zio_t *
936zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
937	void *data, uint64_t size, int type, zio_priority_t priority,
938	enum zio_flag flags, zio_done_func_t *done, void *private)
939{
940	enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
941	zio_t *zio;
942
943	ASSERT(vd->vdev_parent ==
944	    (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));
945
946	if (type == ZIO_TYPE_READ && bp != NULL) {
947		/*
948		 * If we have the bp, then the child should perform the
949		 * checksum and the parent need not.  This pushes error
950		 * detection as close to the leaves as possible and
951		 * eliminates redundant checksums in the interior nodes.
952		 */
953		pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
954		pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
955	}
956
957	/* Not all IO types require vdev io done stage e.g. free */
958	if (!(pio->io_pipeline & ZIO_STAGE_VDEV_IO_DONE))
959		pipeline &= ~ZIO_STAGE_VDEV_IO_DONE;
960
961	if (vd->vdev_children == 0)
962		offset += VDEV_LABEL_START_SIZE;
963
964	flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;
965
966	/*
967	 * If we've decided to do a repair, the write is not speculative --
968	 * even if the original read was.
969	 */
970	if (flags & ZIO_FLAG_IO_REPAIR)
971		flags &= ~ZIO_FLAG_SPECULATIVE;
972
973	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
974	    done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
975	    ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
976
977	zio->io_physdone = pio->io_physdone;
978	if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
979		zio->io_logical->io_phys_children++;
980
981	return (zio);
982}
983
984zio_t *
985zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
986	int type, zio_priority_t priority, enum zio_flag flags,
987	zio_done_func_t *done, void *private)
988{
989	zio_t *zio;
990
991	ASSERT(vd->vdev_ops->vdev_op_leaf);
992
993	zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
994	    data, size, done, private, type, priority,
995	    flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
996	    vd, offset, NULL,
997	    ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
998
999	return (zio);
1000}
1001
1002void
1003zio_flush(zio_t *zio, vdev_t *vd)
1004{
1005	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0,
1006	    NULL, NULL, ZIO_PRIORITY_NOW,
1007	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
1008}
1009
1010zio_t *
1011zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size)
1012{
1013
1014	ASSERT(vd->vdev_ops->vdev_op_leaf);
1015
1016	return (zio_create(zio, spa, 0, NULL, NULL, size, NULL, NULL,
1017	    ZIO_TYPE_FREE, ZIO_PRIORITY_TRIM, ZIO_FLAG_DONT_AGGREGATE |
1018	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY,
1019	    vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PHYS_PIPELINE));
1020}
1021
1022void
1023zio_shrink(zio_t *zio, uint64_t size)
1024{
1025	ASSERT(zio->io_executor == NULL);
1026	ASSERT(zio->io_orig_size == zio->io_size);
1027	ASSERT(size <= zio->io_size);
1028
1029	/*
1030	 * We don't shrink for raidz because of problems with the
1031	 * reconstruction when reading back less than the block size.
1032	 * Note, BP_IS_RAIDZ() assumes no compression.
1033	 */
1034	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
1035	if (!BP_IS_RAIDZ(zio->io_bp))
1036		zio->io_orig_size = zio->io_size = size;
1037}
1038
1039/*
1040 * ==========================================================================
1041 * Prepare to read and write logical blocks
1042 * ==========================================================================
1043 */
1044
1045static int
1046zio_read_bp_init(zio_t *zio)
1047{
1048	blkptr_t *bp = zio->io_bp;
1049
1050	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
1051	    zio->io_child_type == ZIO_CHILD_LOGICAL &&
1052	    !(zio->io_flags & ZIO_FLAG_RAW)) {
1053		uint64_t psize =
1054		    BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
1055		void *cbuf = zio_buf_alloc(psize);
1056
1057		zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
1058	}
1059
1060	if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
1061		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1062		decode_embedded_bp_compressed(bp, zio->io_data);
1063	} else {
1064		ASSERT(!BP_IS_EMBEDDED(bp));
1065	}
1066
1067	if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
1068		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
1069
1070	if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
1071		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
1072
1073	if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
1074		zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
1075
1076	return (ZIO_PIPELINE_CONTINUE);
1077}
1078
1079static int
1080zio_write_bp_init(zio_t *zio)
1081{
1082	spa_t *spa = zio->io_spa;
1083	zio_prop_t *zp = &zio->io_prop;
1084	enum zio_compress compress = zp->zp_compress;
1085	blkptr_t *bp = zio->io_bp;
1086	uint64_t lsize = zio->io_size;
1087	uint64_t psize = lsize;
1088	int pass = 1;
1089
1090	/*
1091	 * If our children haven't all reached the ready stage,
1092	 * wait for them and then repeat this pipeline stage.
1093	 */
1094	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
1095	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
1096		return (ZIO_PIPELINE_STOP);
1097
1098	if (!IO_IS_ALLOCATING(zio))
1099		return (ZIO_PIPELINE_CONTINUE);
1100
1101	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
1102
1103	if (zio->io_bp_override) {
1104		ASSERT(bp->blk_birth != zio->io_txg);
1105		ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
1106
1107		*bp = *zio->io_bp_override;
1108		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1109
1110		if (BP_IS_EMBEDDED(bp))
1111			return (ZIO_PIPELINE_CONTINUE);
1112
1113		/*
1114		 * If we've been overridden and nopwrite is set then
1115		 * set the flag accordingly to indicate that a nopwrite
1116		 * has already occurred.
1117		 */
1118		if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
1119			ASSERT(!zp->zp_dedup);
1120			zio->io_flags |= ZIO_FLAG_NOPWRITE;
1121			return (ZIO_PIPELINE_CONTINUE);
1122		}
1123
1124		ASSERT(!zp->zp_nopwrite);
1125
1126		if (BP_IS_HOLE(bp) || !zp->zp_dedup)
1127			return (ZIO_PIPELINE_CONTINUE);
1128
1129		ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
1130		    zp->zp_dedup_verify);
1131
1132		if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
1133			BP_SET_DEDUP(bp, 1);
1134			zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
1135			return (ZIO_PIPELINE_CONTINUE);
1136		}
1137		zio->io_bp_override = NULL;
1138		BP_ZERO(bp);
1139	}
1140
1141	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
1142		/*
1143		 * We're rewriting an existing block, which means we're
1144		 * working on behalf of spa_sync().  For spa_sync() to
1145		 * converge, it must eventually be the case that we don't
1146		 * have to allocate new blocks.  But compression changes
1147		 * the blocksize, which forces a reallocate, and makes
1148		 * convergence take longer.  Therefore, after the first
1149		 * few passes, stop compressing to ensure convergence.
1150		 */
1151		pass = spa_sync_pass(spa);
1152
1153		ASSERT(zio->io_txg == spa_syncing_txg(spa));
1154		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1155		ASSERT(!BP_GET_DEDUP(bp));
1156
1157		if (pass >= zfs_sync_pass_dont_compress)
1158			compress = ZIO_COMPRESS_OFF;
1159
1160		/* Make sure someone doesn't change their mind on overwrites */
1161		ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
1162		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
1163	}
1164
1165	if (compress != ZIO_COMPRESS_OFF) {
1166		void *cbuf = zio_buf_alloc(lsize);
1167		psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
1168		if (psize == 0 || psize == lsize) {
1169			compress = ZIO_COMPRESS_OFF;
1170			zio_buf_free(cbuf, lsize);
1171		} else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE &&
1172		    zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
1173		    spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
1174			encode_embedded_bp_compressed(bp,
1175			    cbuf, compress, lsize, psize);
1176			BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
1177			BP_SET_TYPE(bp, zio->io_prop.zp_type);
1178			BP_SET_LEVEL(bp, zio->io_prop.zp_level);
1179			zio_buf_free(cbuf, lsize);
1180			bp->blk_birth = zio->io_txg;
1181			zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1182			ASSERT(spa_feature_is_active(spa,
1183			    SPA_FEATURE_EMBEDDED_DATA));
1184			return (ZIO_PIPELINE_CONTINUE);
1185		} else {
1186			/*
1187			 * Round up compressed size to MINBLOCKSIZE and
1188			 * zero the tail.
1189			 */
1190			size_t rounded =
1191			    P2ROUNDUP(psize, (size_t)SPA_MINBLOCKSIZE);
1192			if (rounded > psize) {
1193				bzero((char *)cbuf + psize, rounded - psize);
1194				psize = rounded;
1195			}
1196			if (psize == lsize) {
1197				compress = ZIO_COMPRESS_OFF;
1198				zio_buf_free(cbuf, lsize);
1199			} else {
1200				zio_push_transform(zio, cbuf,
1201				    psize, lsize, NULL);
1202			}
1203		}
1204	}
1205
1206	/*
1207	 * The final pass of spa_sync() must be all rewrites, but the first
1208	 * few passes offer a trade-off: allocating blocks defers convergence,
1209	 * but newly allocated blocks are sequential, so they can be written
1210	 * to disk faster.  Therefore, we allow the first few passes of
1211	 * spa_sync() to allocate new blocks, but force rewrites after that.
1212	 * There should only be a handful of blocks after pass 1 in any case.
1213	 */
1214	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
1215	    BP_GET_PSIZE(bp) == psize &&
1216	    pass >= zfs_sync_pass_rewrite) {
1217		ASSERT(psize != 0);
1218		enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
1219		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
1220		zio->io_flags |= ZIO_FLAG_IO_REWRITE;
1221	} else {
1222		BP_ZERO(bp);
1223		zio->io_pipeline = ZIO_WRITE_PIPELINE;
1224	}
1225
1226	if (psize == 0) {
1227		if (zio->io_bp_orig.blk_birth != 0 &&
1228		    spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
1229			BP_SET_LSIZE(bp, lsize);
1230			BP_SET_TYPE(bp, zp->zp_type);
1231			BP_SET_LEVEL(bp, zp->zp_level);
1232			BP_SET_BIRTH(bp, zio->io_txg, 0);
1233		}
1234		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1235	} else {
1236		ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
1237		BP_SET_LSIZE(bp, lsize);
1238		BP_SET_TYPE(bp, zp->zp_type);
1239		BP_SET_LEVEL(bp, zp->zp_level);
1240		BP_SET_PSIZE(bp, psize);
1241		BP_SET_COMPRESS(bp, compress);
1242		BP_SET_CHECKSUM(bp, zp->zp_checksum);
1243		BP_SET_DEDUP(bp, zp->zp_dedup);
1244		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
1245		if (zp->zp_dedup) {
1246			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1247			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1248			zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
1249		}
1250		if (zp->zp_nopwrite) {
1251			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1252			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1253			zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
1254		}
1255	}
1256
1257	return (ZIO_PIPELINE_CONTINUE);
1258}
1259
1260static int
1261zio_free_bp_init(zio_t *zio)
1262{
1263	blkptr_t *bp = zio->io_bp;
1264
1265	if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
1266		if (BP_GET_DEDUP(bp))
1267			zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
1268	}
1269
1270	return (ZIO_PIPELINE_CONTINUE);
1271}
1272
1273/*
1274 * ==========================================================================
1275 * Execute the I/O pipeline
1276 * ==========================================================================
1277 */
1278
1279static void
1280zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
1281{
1282	spa_t *spa = zio->io_spa;
1283	zio_type_t t = zio->io_type;
1284	int flags = (cutinline ? TQ_FRONT : 0);
1285
1286	ASSERT(q == ZIO_TASKQ_ISSUE || q == ZIO_TASKQ_INTERRUPT);
1287
1288	/*
1289	 * If we're a config writer or a probe, the normal issue and
1290	 * interrupt threads may all be blocked waiting for the config lock.
1291	 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
1292	 */
1293	if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
1294		t = ZIO_TYPE_NULL;
1295
1296	/*
1297	 * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
1298	 */
1299	if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
1300		t = ZIO_TYPE_NULL;
1301
1302	/*
1303	 * If this is a high priority I/O, then use the high priority taskq if
1304	 * available.
1305	 */
1306	if (zio->io_priority == ZIO_PRIORITY_NOW &&
1307	    spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
1308		q++;
1309
1310	ASSERT3U(q, <, ZIO_TASKQ_TYPES);
1311
1312	/*
1313	 * NB: We are assuming that the zio can only be dispatched
1314	 * to a single taskq at a time.  It would be a grievous error
1315	 * to dispatch the zio to another taskq at the same time.
1316	 */
1317#if defined(illumos) || !defined(_KERNEL)
1318	ASSERT(zio->io_tqent.tqent_next == NULL);
1319#else
1320	ASSERT(zio->io_tqent.tqent_task.ta_pending == 0);
1321#endif
1322	spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
1323	    flags, &zio->io_tqent);
1324}
1325
1326static boolean_t
1327zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
1328{
1329	kthread_t *executor = zio->io_executor;
1330	spa_t *spa = zio->io_spa;
1331
1332	for (zio_type_t t = 0; t < ZIO_TYPES; t++) {
1333		spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
1334		uint_t i;
1335		for (i = 0; i < tqs->stqs_count; i++) {
1336			if (taskq_member(tqs->stqs_taskq[i], executor))
1337				return (B_TRUE);
1338		}
1339	}
1340
1341	return (B_FALSE);
1342}
1343
1344static int
1345zio_issue_async(zio_t *zio)
1346{
1347	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
1348
1349	return (ZIO_PIPELINE_STOP);
1350}
1351
1352void
1353zio_interrupt(zio_t *zio)
1354{
1355	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
1356}
1357
1358/*
1359 * Execute the I/O pipeline until one of the following occurs:
1360 *
1361 *	(1) the I/O completes
1362 *	(2) the pipeline stalls waiting for dependent child I/Os
1363 *	(3) the I/O issues, so we're waiting for an I/O completion interrupt
1364 *	(4) the I/O is delegated by vdev-level caching or aggregation
1365 *	(5) the I/O is deferred due to vdev-level queueing
1366 *	(6) the I/O is handed off to another thread.
1367 *
1368 * In all cases, the pipeline stops whenever there's no CPU work; it never
1369 * burns a thread in cv_wait().
1370 *
1371 * There's no locking on io_stage because there's no legitimate way
1372 * for multiple threads to be attempting to process the same I/O.
1373 */
1374static zio_pipe_stage_t *zio_pipeline[];
1375
1376void
1377zio_execute(zio_t *zio)
1378{
1379	zio->io_executor = curthread;
1380
1381	while (zio->io_stage < ZIO_STAGE_DONE) {
1382		enum zio_stage pipeline = zio->io_pipeline;
1383		enum zio_stage stage = zio->io_stage;
1384		int rv;
1385
1386		ASSERT(!MUTEX_HELD(&zio->io_lock));
1387		ASSERT(ISP2(stage));
1388		ASSERT(zio->io_stall == NULL);
1389
1390		do {
1391			stage <<= 1;
1392		} while ((stage & pipeline) == 0);
1393
1394		ASSERT(stage <= ZIO_STAGE_DONE);
1395
1396		/*
1397		 * If we are in interrupt context and this pipeline stage
1398		 * will grab a config lock that is held across I/O,
1399		 * or may wait for an I/O that needs an interrupt thread
1400		 * to complete, issue async to avoid deadlock.
1401		 *
1402		 * For VDEV_IO_START, we cut in line so that the io will
1403		 * be sent to disk promptly.
1404		 */
1405		if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
1406		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
1407			boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
1408			    zio_requeue_io_start_cut_in_line : B_FALSE;
1409			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
1410			return;
1411		}
1412
1413		zio->io_stage = stage;
1414		rv = zio_pipeline[highbit64(stage) - 1](zio);
1415
1416		if (rv == ZIO_PIPELINE_STOP)
1417			return;
1418
1419		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
1420	}
1421}
1422
1423/*
1424 * ==========================================================================
1425 * Initiate I/O, either sync or async
1426 * ==========================================================================
1427 */
1428int
1429zio_wait(zio_t *zio)
1430{
1431	int error;
1432
1433	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
1434	ASSERT(zio->io_executor == NULL);
1435
1436	zio->io_waiter = curthread;
1437
1438	zio_execute(zio);
1439
1440	mutex_enter(&zio->io_lock);
1441	while (zio->io_executor != NULL)
1442		cv_wait(&zio->io_cv, &zio->io_lock);
1443	mutex_exit(&zio->io_lock);
1444
1445	error = zio->io_error;
1446	zio_destroy(zio);
1447
1448	return (error);
1449}
1450
1451void
1452zio_nowait(zio_t *zio)
1453{
1454	ASSERT(zio->io_executor == NULL);
1455
1456	if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
1457	    zio_unique_parent(zio) == NULL) {
1458		/*
1459		 * This is a logical async I/O with no parent to wait for it.
1460		 * We add it to the spa_async_root_zio "Godfather" I/O which
1461		 * will ensure they complete prior to unloading the pool.
1462		 */
1463		spa_t *spa = zio->io_spa;
1464
1465		zio_add_child(spa->spa_async_zio_root[CPU_SEQID], zio);
1466	}
1467
1468	zio_execute(zio);
1469}
1470
1471/*
1472 * ==========================================================================
1473 * Reexecute or suspend/resume failed I/O
1474 * ==========================================================================
1475 */
1476
1477static void
1478zio_reexecute(zio_t *pio)
1479{
1480	zio_t *cio, *cio_next;
1481
1482	ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
1483	ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
1484	ASSERT(pio->io_gang_leader == NULL);
1485	ASSERT(pio->io_gang_tree == NULL);
1486
1487	pio->io_flags = pio->io_orig_flags;
1488	pio->io_stage = pio->io_orig_stage;
1489	pio->io_pipeline = pio->io_orig_pipeline;
1490	pio->io_reexecute = 0;
1491	pio->io_flags |= ZIO_FLAG_REEXECUTED;
1492	pio->io_error = 0;
1493	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
1494		pio->io_state[w] = 0;
1495	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
1496		pio->io_child_error[c] = 0;
1497
1498	if (IO_IS_ALLOCATING(pio))
1499		BP_ZERO(pio->io_bp);
1500
1501	/*
1502	 * As we reexecute pio's children, new children could be created.
1503	 * New children go to the head of pio's io_child_list, however,
1504	 * so we will (correctly) not reexecute them.  The key is that
1505	 * the remainder of pio's io_child_list, from 'cio_next' onward,
1506	 * cannot be affected by any side effects of reexecuting 'cio'.
1507	 */
1508	for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
1509		cio_next = zio_walk_children(pio);
1510		mutex_enter(&pio->io_lock);
1511		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
1512			pio->io_children[cio->io_child_type][w]++;
1513		mutex_exit(&pio->io_lock);
1514		zio_reexecute(cio);
1515	}
1516
1517	/*
1518	 * Now that all children have been reexecuted, execute the parent.
1519	 * We don't reexecute "The Godfather" I/O here as it's the
1520	 * responsibility of the caller to wait on him.
1521	 */
1522	if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
1523		zio_execute(pio);
1524}
1525
1526void
1527zio_suspend(spa_t *spa, zio_t *zio)
1528{
1529	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
1530		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
1531		    "failure and the failure mode property for this pool "
1532		    "is set to panic.", spa_name(spa));
1533
1534	zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);
1535
1536	mutex_enter(&spa->spa_suspend_lock);
1537
1538	if (spa->spa_suspend_zio_root == NULL)
1539		spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
1540		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
1541		    ZIO_FLAG_GODFATHER);
1542
1543	spa->spa_suspended = B_TRUE;
1544
1545	if (zio != NULL) {
1546		ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
1547		ASSERT(zio != spa->spa_suspend_zio_root);
1548		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1549		ASSERT(zio_unique_parent(zio) == NULL);
1550		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
1551		zio_add_child(spa->spa_suspend_zio_root, zio);
1552	}
1553
1554	mutex_exit(&spa->spa_suspend_lock);
1555}
1556
1557int
1558zio_resume(spa_t *spa)
1559{
1560	zio_t *pio;
1561
1562	/*
1563	 * Reexecute all previously suspended i/o.
1564	 */
1565	mutex_enter(&spa->spa_suspend_lock);
1566	spa->spa_suspended = B_FALSE;
1567	cv_broadcast(&spa->spa_suspend_cv);
1568	pio = spa->spa_suspend_zio_root;
1569	spa->spa_suspend_zio_root = NULL;
1570	mutex_exit(&spa->spa_suspend_lock);
1571
1572	if (pio == NULL)
1573		return (0);
1574
1575	zio_reexecute(pio);
1576	return (zio_wait(pio));
1577}
1578
1579void
1580zio_resume_wait(spa_t *spa)
1581{
1582	mutex_enter(&spa->spa_suspend_lock);
1583	while (spa_suspended(spa))
1584		cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
1585	mutex_exit(&spa->spa_suspend_lock);
1586}
1587
1588/*
1589 * ==========================================================================
1590 * Gang blocks.
1591 *
1592 * A gang block is a collection of small blocks that looks to the DMU
1593 * like one large block.  When zio_dva_allocate() cannot find a block
1594 * of the requested size, due to either severe fragmentation or the pool
1595 * being nearly full, it calls zio_write_gang_block() to construct the
1596 * block from smaller fragments.
1597 *
1598 * A gang block consists of a gang header (zio_gbh_phys_t) and up to
1599 * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
1600 * an indirect block: it's an array of block pointers.  It consumes
1601 * only one sector and hence is allocatable regardless of fragmentation.
1602 * The gang header's bps point to its gang members, which hold the data.
1603 *
1604 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
1605 * as the verifier to ensure uniqueness of the SHA256 checksum.
1606 * Critically, the gang block bp's blk_cksum is the checksum of the data,
1607 * not the gang header.  This ensures that data block signatures (needed for
1608 * deduplication) are independent of how the block is physically stored.
1609 *
1610 * Gang blocks can be nested: a gang member may itself be a gang block.
1611 * Thus every gang block is a tree in which root and all interior nodes are
1612 * gang headers, and the leaves are normal blocks that contain user data.
1613 * The root of the gang tree is called the gang leader.
1614 *
1615 * To perform any operation (read, rewrite, free, claim) on a gang block,
1616 * zio_gang_assemble() first assembles the gang tree (minus data leaves)
1617 * in the io_gang_tree field of the original logical i/o by recursively
1618 * reading the gang leader and all gang headers below it.  This yields
1619 * an in-core tree containing the contents of every gang header and the
1620 * bps for every constituent of the gang block.
1621 *
1622 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
1623 * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
1624 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
1625 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
1626 * zio_read_gang() is a wrapper around zio_read() that omits reading gang
1627 * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
1628 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
1629 * of the gang header plus zio_checksum_compute() of the data to update the
1630 * gang header's blk_cksum as described above.
1631 *
1632 * The two-phase assemble/issue model solves the problem of partial failure --
1633 * what if you'd freed part of a gang block but then couldn't read the
1634 * gang header for another part?  Assembling the entire gang tree first
1635 * ensures that all the necessary gang header I/O has succeeded before
1636 * starting the actual work of free, claim, or write.  Once the gang tree
1637 * is assembled, free and claim are in-memory operations that cannot fail.
1638 *
1639 * In the event that a gang write fails, zio_dva_unallocate() walks the
1640 * gang tree to immediately free (i.e. insert back into the space map)
1641 * everything we've allocated.  This ensures that we don't get ENOSPC
1642 * errors during repeated suspend/resume cycles due to a flaky device.
1643 *
1644 * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
1645 * the gang tree, we won't modify the block, so we can safely defer the free
1646 * (knowing that the block is still intact).  If we *can* assemble the gang
1647 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
1648 * each constituent bp and we can allocate a new block on the next sync pass.
1649 *
1650 * In all cases, the gang tree allows complete recovery from partial failure.
1651 * ==========================================================================
1652 */
1653
1654static zio_t *
1655zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1656{
1657	if (gn != NULL)
1658		return (pio);
1659
1660	return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
1661	    NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
1662	    &pio->io_bookmark));
1663}
1664
1665zio_t *
1666zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1667{
1668	zio_t *zio;
1669
1670	if (gn != NULL) {
1671		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1672		    gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
1673		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1674		/*
1675		 * As we rewrite each gang header, the pipeline will compute
1676		 * a new gang block header checksum for it; but no one will
1677		 * compute a new data checksum, so we do that here.  The one
1678		 * exception is the gang leader: the pipeline already computed
1679		 * its data checksum because that stage precedes gang assembly.
1680		 * (Presently, nothing actually uses interior data checksums;
1681		 * this is just good hygiene.)
1682		 */
1683		if (gn != pio->io_gang_leader->io_gang_tree) {
1684			zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
1685			    data, BP_GET_PSIZE(bp));
1686		}
1687		/*
1688		 * If we are here to damage data for testing purposes,
1689		 * leave the GBH alone so that we can detect the damage.
1690		 */
1691		if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
1692			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
1693	} else {
1694		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1695		    data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
1696		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1697	}
1698
1699	return (zio);
1700}
1701
1702/* ARGSUSED */
1703zio_t *
1704zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1705{
1706	return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
1707	    BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp),
1708	    ZIO_GANG_CHILD_FLAGS(pio)));
1709}
1710
1711/* ARGSUSED */
1712zio_t *
1713zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1714{
1715	return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
1716	    NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
1717}
1718
1719static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
1720	NULL,
1721	zio_read_gang,
1722	zio_rewrite_gang,
1723	zio_free_gang,
1724	zio_claim_gang,
1725	NULL
1726};
1727
1728static void zio_gang_tree_assemble_done(zio_t *zio);
1729
1730static zio_gang_node_t *
1731zio_gang_node_alloc(zio_gang_node_t **gnpp)
1732{
1733	zio_gang_node_t *gn;
1734
1735	ASSERT(*gnpp == NULL);
1736
1737	gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
1738	gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
1739	*gnpp = gn;
1740
1741	return (gn);
1742}
1743
1744static void
1745zio_gang_node_free(zio_gang_node_t **gnpp)
1746{
1747	zio_gang_node_t *gn = *gnpp;
1748
1749	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
1750		ASSERT(gn->gn_child[g] == NULL);
1751
1752	zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
1753	kmem_free(gn, sizeof (*gn));
1754	*gnpp = NULL;
1755}
1756
1757static void
1758zio_gang_tree_free(zio_gang_node_t **gnpp)
1759{
1760	zio_gang_node_t *gn = *gnpp;
1761
1762	if (gn == NULL)
1763		return;
1764
1765	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
1766		zio_gang_tree_free(&gn->gn_child[g]);
1767
1768	zio_gang_node_free(gnpp);
1769}
1770
1771static void
1772zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
1773{
1774	zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
1775
1776	ASSERT(gio->io_gang_leader == gio);
1777	ASSERT(BP_IS_GANG(bp));
1778
1779	zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
1780	    SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
1781	    gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
1782}
1783
1784static void
1785zio_gang_tree_assemble_done(zio_t *zio)
1786{
1787	zio_t *gio = zio->io_gang_leader;
1788	zio_gang_node_t *gn = zio->io_private;
1789	blkptr_t *bp = zio->io_bp;
1790
1791	ASSERT(gio == zio_unique_parent(zio));
1792	ASSERT(zio->io_child_count == 0);
1793
1794	if (zio->io_error)
1795		return;
1796
1797	if (BP_SHOULD_BYTESWAP(bp))
1798		byteswap_uint64_array(zio->io_data, zio->io_size);
1799
1800	ASSERT(zio->io_data == gn->gn_gbh);
1801	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
1802	ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
1803
1804	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1805		blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
1806		if (!BP_IS_GANG(gbp))
1807			continue;
1808		zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
1809	}
1810}
1811
1812static void
1813zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
1814{
1815	zio_t *gio = pio->io_gang_leader;
1816	zio_t *zio;
1817
1818	ASSERT(BP_IS_GANG(bp) == !!gn);
1819	ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
1820	ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);
1821
1822	/*
1823	 * If you're a gang header, your data is in gn->gn_gbh.
1824	 * If you're a gang member, your data is in 'data' and gn == NULL.
1825	 */
1826	zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);
1827
1828	if (gn != NULL) {
1829		ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
1830
1831		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1832			blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
1833			if (BP_IS_HOLE(gbp))
1834				continue;
1835			zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
1836			data = (char *)data + BP_GET_PSIZE(gbp);
1837		}
1838	}
1839
1840	if (gn == gio->io_gang_tree && gio->io_data != NULL)
1841		ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);
1842
1843	if (zio != pio)
1844		zio_nowait(zio);
1845}
1846
1847static int
1848zio_gang_assemble(zio_t *zio)
1849{
1850	blkptr_t *bp = zio->io_bp;
1851
1852	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
1853	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
1854
1855	zio->io_gang_leader = zio;
1856
1857	zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
1858
1859	return (ZIO_PIPELINE_CONTINUE);
1860}
1861
1862static int
1863zio_gang_issue(zio_t *zio)
1864{
1865	blkptr_t *bp = zio->io_bp;
1866
1867	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
1868		return (ZIO_PIPELINE_STOP);
1869
1870	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
1871	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
1872
1873	if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
1874		zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
1875	else
1876		zio_gang_tree_free(&zio->io_gang_tree);
1877
1878	zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1879
1880	return (ZIO_PIPELINE_CONTINUE);
1881}
1882
1883static void
1884zio_write_gang_member_ready(zio_t *zio)
1885{
1886	zio_t *pio = zio_unique_parent(zio);
1887	zio_t *gio = zio->io_gang_leader;
1888	dva_t *cdva = zio->io_bp->blk_dva;
1889	dva_t *pdva = pio->io_bp->blk_dva;
1890	uint64_t asize;
1891
1892	if (BP_IS_HOLE(zio->io_bp))
1893		return;
1894
1895	ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
1896
1897	ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
1898	ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
1899	ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
1900	ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
1901	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
1902
1903	mutex_enter(&pio->io_lock);
1904	for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
1905		ASSERT(DVA_GET_GANG(&pdva[d]));
1906		asize = DVA_GET_ASIZE(&pdva[d]);
1907		asize += DVA_GET_ASIZE(&cdva[d]);
1908		DVA_SET_ASIZE(&pdva[d], asize);
1909	}
1910	mutex_exit(&pio->io_lock);
1911}
1912
1913static int
1914zio_write_gang_block(zio_t *pio)
1915{
1916	spa_t *spa = pio->io_spa;
1917	blkptr_t *bp = pio->io_bp;
1918	zio_t *gio = pio->io_gang_leader;
1919	zio_t *zio;
1920	zio_gang_node_t *gn, **gnpp;
1921	zio_gbh_phys_t *gbh;
1922	uint64_t txg = pio->io_txg;
1923	uint64_t resid = pio->io_size;
1924	uint64_t lsize;
1925	int copies = gio->io_prop.zp_copies;
1926	int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
1927	zio_prop_t zp;
1928	int error;
1929
1930	error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
1931	    bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
1932	    METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
1933	if (error) {
1934		pio->io_error = error;
1935		return (ZIO_PIPELINE_CONTINUE);
1936	}
1937
1938	if (pio == gio) {
1939		gnpp = &gio->io_gang_tree;
1940	} else {
1941		gnpp = pio->io_private;
1942		ASSERT(pio->io_ready == zio_write_gang_member_ready);
1943	}
1944
1945	gn = zio_gang_node_alloc(gnpp);
1946	gbh = gn->gn_gbh;
1947	bzero(gbh, SPA_GANGBLOCKSIZE);
1948
1949	/*
1950	 * Create the gang header.
1951	 */
1952	zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
1953	    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1954
1955	/*
1956	 * Create and nowait the gang children.
1957	 */
1958	for (int g = 0; resid != 0; resid -= lsize, g++) {
1959		lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
1960		    SPA_MINBLOCKSIZE);
1961		ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
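		/*
		 * Worked example (illustrative, assuming SPA_GBH_NBLKPTRS == 3
		 * and SPA_MINBLOCKSIZE == 512): splitting a 128K write gives
		 * lsize = P2ROUNDUP(131072 / 3, 512) = 44032 on the first
		 * pass, then 43520 and 43520 on the next two, consuming all
		 * of resid in three roughly equal gang children.
		 */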
1962
1963		zp.zp_checksum = gio->io_prop.zp_checksum;
1964		zp.zp_compress = ZIO_COMPRESS_OFF;
1965		zp.zp_type = DMU_OT_NONE;
1966		zp.zp_level = 0;
1967		zp.zp_copies = gio->io_prop.zp_copies;
1968		zp.zp_dedup = B_FALSE;
1969		zp.zp_dedup_verify = B_FALSE;
1970		zp.zp_nopwrite = B_FALSE;
1971
1972		zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
1973		    (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
1974		    zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g],
1975		    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
1976		    &pio->io_bookmark));
1977	}
1978
1979	/*
1980	 * Set pio's pipeline to just wait for zio to finish.
1981	 */
1982	pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1983
1984	zio_nowait(zio);
1985
1986	return (ZIO_PIPELINE_CONTINUE);
1987}
1988
1989/*
1990 * The zio_nop_write stage in the pipeline determines whether allocating a
1991 * new bp is necessary.  By leveraging a cryptographically secure checksum,
1992 * such as SHA256, we can compare the checksums of the new data and the old;
1993 * if they match, the existing block can be reused.  The nopwrite feature
1994 * can handle writes in either syncing or open context (i.e. zil writes)
1995 * and as a result is mutually exclusive with dedup.
1996 */
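
/*
 * Worked example (illustrative): when a block is rewritten with byte-for-byte
 * identical data and the same checksum, compression, and copies settings, the
 * newly computed blk_cksum matches the one saved in io_bp_orig.  In that case
 * zio_nop_write() below copies bp_orig back into the bp, sets
 * ZIO_FLAG_NOPWRITE, and collapses the pipeline to the interlock stages, so
 * no new DVA is allocated and no vdev I/O is issued.
 */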
1997static int
1998zio_nop_write(zio_t *zio)
1999{
2000	blkptr_t *bp = zio->io_bp;
2001	blkptr_t *bp_orig = &zio->io_bp_orig;
2002	zio_prop_t *zp = &zio->io_prop;
2003
2004	ASSERT(BP_GET_LEVEL(bp) == 0);
2005	ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
2006	ASSERT(zp->zp_nopwrite);
2007	ASSERT(!zp->zp_dedup);
2008	ASSERT(zio->io_bp_override == NULL);
2009	ASSERT(IO_IS_ALLOCATING(zio));
2010
2011	/*
2012	 * Check to see if the original bp and the new bp have matching
2013	 * characteristics (i.e. same checksum, compression algorithms, etc).
2014	 * If they don't then just continue with the pipeline which will
2015	 * allocate a new bp.
2016	 */
2017	if (BP_IS_HOLE(bp_orig) ||
2018	    !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup ||
2019	    BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
2020	    BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
2021	    BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
2022	    zp->zp_copies != BP_GET_NDVAS(bp_orig))
2023		return (ZIO_PIPELINE_CONTINUE);
2024
2025	/*
2026	 * If the checksums match then reset the pipeline so that we
2027	 * avoid allocating a new bp and issuing any I/O.
2028	 */
2029	if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
2030		ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup);
2031		ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
2032		ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
2033		ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
2034		ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
2035		    sizeof (uint64_t)) == 0);
2036
2037		*bp = *bp_orig;
2038		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2039		zio->io_flags |= ZIO_FLAG_NOPWRITE;
2040	}
2041
2042	return (ZIO_PIPELINE_CONTINUE);
2043}
2044
2045/*
2046 * ==========================================================================
2047 * Dedup
2048 * ==========================================================================
2049 */
2050static void
2051zio_ddt_child_read_done(zio_t *zio)
2052{
2053	blkptr_t *bp = zio->io_bp;
2054	ddt_entry_t *dde = zio->io_private;
2055	ddt_phys_t *ddp;
2056	zio_t *pio = zio_unique_parent(zio);
2057
2058	mutex_enter(&pio->io_lock);
2059	ddp = ddt_phys_select(dde, bp);
2060	if (zio->io_error == 0)
2061		ddt_phys_clear(ddp);	/* this ddp doesn't need repair */
2062	if (zio->io_error == 0 && dde->dde_repair_data == NULL)
2063		dde->dde_repair_data = zio->io_data;
2064	else
2065		zio_buf_free(zio->io_data, zio->io_size);
2066	mutex_exit(&pio->io_lock);
2067}
2068
2069static int
2070zio_ddt_read_start(zio_t *zio)
2071{
2072	blkptr_t *bp = zio->io_bp;
2073
2074	ASSERT(BP_GET_DEDUP(bp));
2075	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
2076	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2077
2078	if (zio->io_child_error[ZIO_CHILD_DDT]) {
2079		ddt_t *ddt = ddt_select(zio->io_spa, bp);
2080		ddt_entry_t *dde = ddt_repair_start(ddt, bp);
2081		ddt_phys_t *ddp = dde->dde_phys;
2082		ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
2083		blkptr_t blk;
2084
2085		ASSERT(zio->io_vsd == NULL);
2086		zio->io_vsd = dde;
2087
2088		if (ddp_self == NULL)
2089			return (ZIO_PIPELINE_CONTINUE);
2090
2091		for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
2092			if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
2093				continue;
2094			ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
2095			    &blk);
2096			zio_nowait(zio_read(zio, zio->io_spa, &blk,
2097			    zio_buf_alloc(zio->io_size), zio->io_size,
2098			    zio_ddt_child_read_done, dde, zio->io_priority,
2099			    ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
2100			    &zio->io_bookmark));
2101		}
2102		return (ZIO_PIPELINE_CONTINUE);
2103	}
2104
2105	zio_nowait(zio_read(zio, zio->io_spa, bp,
2106	    zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
2107	    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
2108
2109	return (ZIO_PIPELINE_CONTINUE);
2110}
2111
2112static int
2113zio_ddt_read_done(zio_t *zio)
2114{
2115	blkptr_t *bp = zio->io_bp;
2116
2117	if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
2118		return (ZIO_PIPELINE_STOP);
2119
2120	ASSERT(BP_GET_DEDUP(bp));
2121	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
2122	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2123
2124	if (zio->io_child_error[ZIO_CHILD_DDT]) {
2125		ddt_t *ddt = ddt_select(zio->io_spa, bp);
2126		ddt_entry_t *dde = zio->io_vsd;
2127		if (ddt == NULL) {
2128			ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
2129			return (ZIO_PIPELINE_CONTINUE);
2130		}
2131		if (dde == NULL) {
2132			zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
2133			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
2134			return (ZIO_PIPELINE_STOP);
2135		}
2136		if (dde->dde_repair_data != NULL) {
2137			bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
2138			zio->io_child_error[ZIO_CHILD_DDT] = 0;
2139		}
2140		ddt_repair_done(ddt, dde);
2141		zio->io_vsd = NULL;
2142	}
2143
2144	ASSERT(zio->io_vsd == NULL);
2145
2146	return (ZIO_PIPELINE_CONTINUE);
2147}
2148
2149static boolean_t
2150zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
2151{
2152	spa_t *spa = zio->io_spa;
2153
2154	/*
2155	 * Note: we compare the original data, not the transformed data,
2156	 * because when zio->io_bp is an override bp, we will not have
2157	 * pushed the I/O transforms.  That's an important optimization
2158	 * because otherwise we'd compress/encrypt all dmu_sync() data twice.
2159	 */
2160	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
2161		zio_t *lio = dde->dde_lead_zio[p];
2162
2163		if (lio != NULL) {
2164			return (lio->io_orig_size != zio->io_orig_size ||
2165			    bcmp(zio->io_orig_data, lio->io_orig_data,
2166			    zio->io_orig_size) != 0);
2167		}
2168	}
2169
2170	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
2171		ddt_phys_t *ddp = &dde->dde_phys[p];
2172
2173		if (ddp->ddp_phys_birth != 0) {
2174			arc_buf_t *abuf = NULL;
2175			uint32_t aflags = ARC_WAIT;
2176			blkptr_t blk = *zio->io_bp;
2177			int error;
2178
2179			ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
2180
2181			ddt_exit(ddt);
2182
2183			error = arc_read(NULL, spa, &blk,
2184			    arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
2185			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
2186			    &aflags, &zio->io_bookmark);
2187
2188			if (error == 0) {
2189				if (arc_buf_size(abuf) != zio->io_orig_size ||
2190				    bcmp(abuf->b_data, zio->io_orig_data,
2191				    zio->io_orig_size) != 0)
2192					error = SET_ERROR(EEXIST);
2193				VERIFY(arc_buf_remove_ref(abuf, &abuf));
2194			}
2195
2196			ddt_enter(ddt);
2197			return (error != 0);
2198		}
2199	}
2200
2201	return (B_FALSE);
2202}
2203
2204static void
2205zio_ddt_child_write_ready(zio_t *zio)
2206{
2207	int p = zio->io_prop.zp_copies;
2208	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
2209	ddt_entry_t *dde = zio->io_private;
2210	ddt_phys_t *ddp = &dde->dde_phys[p];
2211	zio_t *pio;
2212
2213	if (zio->io_error)
2214		return;
2215
2216	ddt_enter(ddt);
2217
2218	ASSERT(dde->dde_lead_zio[p] == zio);
2219
2220	ddt_phys_fill(ddp, zio->io_bp);
2221
2222	while ((pio = zio_walk_parents(zio)) != NULL)
2223		ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
2224
2225	ddt_exit(ddt);
2226}
2227
2228static void
2229zio_ddt_child_write_done(zio_t *zio)
2230{
2231	int p = zio->io_prop.zp_copies;
2232	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
2233	ddt_entry_t *dde = zio->io_private;
2234	ddt_phys_t *ddp = &dde->dde_phys[p];
2235
2236	ddt_enter(ddt);
2237
2238	ASSERT(ddp->ddp_refcnt == 0);
2239	ASSERT(dde->dde_lead_zio[p] == zio);
2240	dde->dde_lead_zio[p] = NULL;
2241
2242	if (zio->io_error == 0) {
2243		while (zio_walk_parents(zio) != NULL)
2244			ddt_phys_addref(ddp);
2245	} else {
2246		ddt_phys_clear(ddp);
2247	}
2248
2249	ddt_exit(ddt);
2250}
2251
2252static void
2253zio_ddt_ditto_write_done(zio_t *zio)
2254{
2255	int p = DDT_PHYS_DITTO;
2256	zio_prop_t *zp = &zio->io_prop;
2257	blkptr_t *bp = zio->io_bp;
2258	ddt_t *ddt = ddt_select(zio->io_spa, bp);
2259	ddt_entry_t *dde = zio->io_private;
2260	ddt_phys_t *ddp = &dde->dde_phys[p];
2261	ddt_key_t *ddk = &dde->dde_key;
2262
2263	ddt_enter(ddt);
2264
2265	ASSERT(ddp->ddp_refcnt == 0);
2266	ASSERT(dde->dde_lead_zio[p] == zio);
2267	dde->dde_lead_zio[p] = NULL;
2268
2269	if (zio->io_error == 0) {
2270		ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
2271		ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
2272		ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
2273		if (ddp->ddp_phys_birth != 0)
2274			ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
2275		ddt_phys_fill(ddp, bp);
2276	}
2277
2278	ddt_exit(ddt);
2279}
2280
2281static int
2282zio_ddt_write(zio_t *zio)
2283{
2284	spa_t *spa = zio->io_spa;
2285	blkptr_t *bp = zio->io_bp;
2286	uint64_t txg = zio->io_txg;
2287	zio_prop_t *zp = &zio->io_prop;
2288	int p = zp->zp_copies;
2289	int ditto_copies;
2290	zio_t *cio = NULL;
2291	zio_t *dio = NULL;
2292	ddt_t *ddt = ddt_select(spa, bp);
2293	ddt_entry_t *dde;
2294	ddt_phys_t *ddp;
2295
2296	ASSERT(BP_GET_DEDUP(bp));
2297	ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
2298	ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
2299
2300	ddt_enter(ddt);
2301	dde = ddt_lookup(ddt, bp, B_TRUE);
2302	ddp = &dde->dde_phys[p];
2303
2304	if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
2305		/*
2306		 * If we're using a weak checksum, upgrade to a strong checksum
2307		 * and try again.  If we're already using a strong checksum,
2308		 * we can't resolve it, so just convert to an ordinary write.
2309		 * (And automatically e-mail a paper to Nature?)
2310		 */
2311		if (!zio_checksum_table[zp->zp_checksum].ci_dedup) {
2312			zp->zp_checksum = spa_dedup_checksum(spa);
2313			zio_pop_transforms(zio);
2314			zio->io_stage = ZIO_STAGE_OPEN;
2315			BP_ZERO(bp);
2316		} else {
2317			zp->zp_dedup = B_FALSE;
2318		}
2319		zio->io_pipeline = ZIO_WRITE_PIPELINE;
2320		ddt_exit(ddt);
2321		return (ZIO_PIPELINE_CONTINUE);
2322	}
2323
2324	ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
2325	ASSERT(ditto_copies < SPA_DVAS_PER_BP);
2326
2327	if (ditto_copies > ddt_ditto_copies_present(dde) &&
2328	    dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
2329		zio_prop_t czp = *zp;
2330
2331		czp.zp_copies = ditto_copies;
2332
2333		/*
2334		 * If we arrived here with an override bp, we won't have run
2335		 * the transform stack, so we won't have the data we need to
2336		 * generate a child i/o.  So, toss the override bp and restart.
2337		 * This is safe, because using the override bp is just an
2338		 * optimization; and it's rare, so the cost doesn't matter.
2339		 */
2340		if (zio->io_bp_override) {
2341			zio_pop_transforms(zio);
2342			zio->io_stage = ZIO_STAGE_OPEN;
2343			zio->io_pipeline = ZIO_WRITE_PIPELINE;
2344			zio->io_bp_override = NULL;
2345			BP_ZERO(bp);
2346			ddt_exit(ddt);
2347			return (ZIO_PIPELINE_CONTINUE);
2348		}
2349
2350		dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
2351		    zio->io_orig_size, &czp, NULL, NULL,
2352		    zio_ddt_ditto_write_done, dde, zio->io_priority,
2353		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
2354
2355		zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
2356		dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
2357	}
2358
2359	if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
2360		if (ddp->ddp_phys_birth != 0)
2361			ddt_bp_fill(ddp, bp, txg);
2362		if (dde->dde_lead_zio[p] != NULL)
2363			zio_add_child(zio, dde->dde_lead_zio[p]);
2364		else
2365			ddt_phys_addref(ddp);
2366	} else if (zio->io_bp_override) {
2367		ASSERT(bp->blk_birth == txg);
2368		ASSERT(BP_EQUAL(bp, zio->io_bp_override));
2369		ddt_phys_fill(ddp, bp);
2370		ddt_phys_addref(ddp);
2371	} else {
2372		cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
2373		    zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL,
2374		    zio_ddt_child_write_done, dde, zio->io_priority,
2375		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
2376
2377		zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
2378		dde->dde_lead_zio[p] = cio;
2379	}
2380
2381	ddt_exit(ddt);
2382
2383	if (cio)
2384		zio_nowait(cio);
2385	if (dio)
2386		zio_nowait(dio);
2387
2388	return (ZIO_PIPELINE_CONTINUE);
2389}
2390
2391ddt_entry_t *freedde; /* for debugging */
2392
2393static int
2394zio_ddt_free(zio_t *zio)
2395{
2396	spa_t *spa = zio->io_spa;
2397	blkptr_t *bp = zio->io_bp;
2398	ddt_t *ddt = ddt_select(spa, bp);
2399	ddt_entry_t *dde;
2400	ddt_phys_t *ddp;
2401
2402	ASSERT(BP_GET_DEDUP(bp));
2403	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2404
2405	ddt_enter(ddt);
2406	freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
2407	ddp = ddt_phys_select(dde, bp);
2408	ddt_phys_decref(ddp);
2409	ddt_exit(ddt);
2410
2411	return (ZIO_PIPELINE_CONTINUE);
2412}
2413
2414/*
2415 * ==========================================================================
2416 * Allocate and free blocks
2417 * ==========================================================================
2418 */
2419static int
2420zio_dva_allocate(zio_t *zio)
2421{
2422	spa_t *spa = zio->io_spa;
2423	metaslab_class_t *mc = spa_normal_class(spa);
2424	blkptr_t *bp = zio->io_bp;
2425	int error;
2426	int flags = 0;
2427
2428	if (zio->io_gang_leader == NULL) {
2429		ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
2430		zio->io_gang_leader = zio;
2431	}
2432
2433	ASSERT(BP_IS_HOLE(bp));
2434	ASSERT0(BP_GET_NDVAS(bp));
2435	ASSERT3U(zio->io_prop.zp_copies, >, 0);
2436	ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
2437	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
2438
2439	/*
2440	 * The dump device does not support gang blocks so allocation on
2441	 * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
2442	 * the "fast" gang feature.
2443	 */
2444	flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
2445	flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
2446	    METASLAB_GANG_CHILD : 0;
2447	error = metaslab_alloc(spa, mc, zio->io_size, bp,
2448	    zio->io_prop.zp_copies, zio->io_txg, NULL, flags);
2449
2450	if (error) {
2451		spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
2452		    "size %llu, error %d", spa_name(spa), zio, zio->io_size,
2453		    error);
2454		if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
2455			return (zio_write_gang_block(zio));
2456		zio->io_error = error;
2457	}
2458
2459	return (ZIO_PIPELINE_CONTINUE);
2460}
2461
2462static int
2463zio_dva_free(zio_t *zio)
2464{
2465	metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
2466
2467	return (ZIO_PIPELINE_CONTINUE);
2468}
2469
2470static int
2471zio_dva_claim(zio_t *zio)
2472{
2473	int error;
2474
2475	error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
2476	if (error)
2477		zio->io_error = error;
2478
2479	return (ZIO_PIPELINE_CONTINUE);
2480}
2481
2482/*
2483 * Undo an allocation.  This is used by zio_done() when an I/O fails
2484 * and we want to give back the block we just allocated.
2485 * This handles both normal blocks and gang blocks.
2486 */
2487static void
2488zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
2489{
2490	ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
2491	ASSERT(zio->io_bp_override == NULL);
2492
2493	if (!BP_IS_HOLE(bp))
2494		metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
2495
2496	if (gn != NULL) {
2497		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
2498			zio_dva_unallocate(zio, gn->gn_child[g],
2499			    &gn->gn_gbh->zg_blkptr[g]);
2500		}
2501	}
2502}
2503
2504/*
2505 * Try to allocate an intent log block.  Return 0 on success, errno on failure.
2506 */
2507int
2508zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
2509    uint64_t size, boolean_t use_slog)
2510{
2511	int error = 1;
2512
2513	ASSERT(txg > spa_syncing_txg(spa));
2514
2515	/*
2516	 * ZIL blocks are always contiguous (i.e. not gang blocks), so we set
2517	 * the METASLAB_GANG_AVOID flag to prevent them from being "fast ganged"
2518	 * when allocating them.
2519	 */
2520	if (use_slog) {
2521		error = metaslab_alloc(spa, spa_log_class(spa), size,
2522		    new_bp, 1, txg, old_bp,
2523		    METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
2524	}
2525
2526	if (error) {
2527		error = metaslab_alloc(spa, spa_normal_class(spa), size,
2528		    new_bp, 1, txg, old_bp,
2529		    METASLAB_HINTBP_AVOID);
2530	}
2531
2532	if (error == 0) {
2533		BP_SET_LSIZE(new_bp, size);
2534		BP_SET_PSIZE(new_bp, size);
2535		BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
2536		BP_SET_CHECKSUM(new_bp,
2537		    spa_version(spa) >= SPA_VERSION_SLIM_ZIL
2538		    ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
2539		BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
2540		BP_SET_LEVEL(new_bp, 0);
2541		BP_SET_DEDUP(new_bp, 0);
2542		BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
2543	}
2544
2545	return (error);
2546}
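
/*
 * Minimal usage sketch (illustrative only; the real caller is the ZIL code in
 * zil.c, and "my_bp" and "my_size" below are placeholder names):
 *
 *	blkptr_t my_bp;
 *
 *	BP_ZERO(&my_bp);
 *	error = zio_alloc_zil(spa, txg, &my_bp, NULL, my_size, B_TRUE);
 *	if (error == 0) {
 *		issue the log write described by my_bp, and once the
 *		block is no longer needed:
 *		zio_free_zil(spa, txg, &my_bp);
 *	}
 *
 * On success the new bp comes back fully initialized (DMU_OT_INTENT_LOG,
 * ZILOG/ZILOG2 checksum, compression off); on failure the caller gets an
 * errno and must fall back or retry in a later txg.
 */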
2547
2548/*
2549 * Free an intent log block.
2550 */
2551void
2552zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
2553{
2554	ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
2555	ASSERT(!BP_IS_GANG(bp));
2556
2557	zio_free(spa, txg, bp);
2558}
2559
2560/*
2561 * ==========================================================================
2562 * Read, write and delete to physical devices
2563 * ==========================================================================
2564 */
2565static int
2566zio_vdev_io_start(zio_t *zio)
2567{
2568	vdev_t *vd = zio->io_vd;
2569	uint64_t align;
2570	spa_t *spa = zio->io_spa;
2571	int ret;
2572
2573	ASSERT(zio->io_error == 0);
2574	ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
2575
2576	if (vd == NULL) {
2577		if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
2578			spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
2579
2580		/*
2581		 * The mirror_ops handle multiple DVAs in a single BP.
2582		 */
2583		return (vdev_mirror_ops.vdev_op_io_start(zio));
2584	}
2585
2586	if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE &&
2587	    zio->io_priority == ZIO_PRIORITY_NOW) {
2588		trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg);
2589		return (ZIO_PIPELINE_CONTINUE);
2590	}
2591
2592	/*
2593	 * We keep track of time-sensitive I/Os so that the scan thread
2594	 * can quickly react to certain workloads.  In particular, we care
2595	 * about non-scrubbing, top-level reads and writes with the following
2596	 * characteristics:
2597	 * 	- synchronous writes of user data to non-slog devices
2598	 *	- any reads of user data
2599	 * When these conditions are met, adjust the timestamp of spa_last_io
2600	 * which allows the scan thread to adjust its workload accordingly.
2601	 */
2602	if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
2603	    vd == vd->vdev_top && !vd->vdev_islog &&
2604	    zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
2605	    zio->io_txg != spa_syncing_txg(spa)) {
2606		uint64_t old = spa->spa_last_io;
2607		uint64_t new = ddi_get_lbolt64();
2608		if (old != new)
2609			(void) atomic_cas_64(&spa->spa_last_io, old, new);
2610	}
2611
2612	align = 1ULL << vd->vdev_top->vdev_ashift;
2613
2614	if ((!(zio->io_flags & ZIO_FLAG_PHYSICAL) ||
2615	    (vd->vdev_top->vdev_physical_ashift > SPA_MINBLOCKSHIFT)) &&
2616	    P2PHASE(zio->io_size, align) != 0) {
2617		/* Transform logical writes to be a full physical block size. */
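		/*
		 * Worked example (illustrative): on a top-level vdev with
		 * vdev_ashift == 12 (4K sectors), a 1024-byte logical write
		 * gets asize = P2ROUNDUP(1024, 4096) = 4096; the data is
		 * copied into abuf and the remaining 3072 bytes are zeroed
		 * before the I/O is pushed down.
		 */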
2618		uint64_t asize = P2ROUNDUP(zio->io_size, align);
2619		char *abuf = NULL;
2620		if (zio->io_type == ZIO_TYPE_READ ||
2621		    zio->io_type == ZIO_TYPE_WRITE)
2622			abuf = zio_buf_alloc(asize);
2623		ASSERT(vd == vd->vdev_top);
2624		if (zio->io_type == ZIO_TYPE_WRITE) {
2625			bcopy(zio->io_data, abuf, zio->io_size);
2626			bzero(abuf + zio->io_size, asize - zio->io_size);
2627		}
2628		zio_push_transform(zio, abuf, asize, abuf ? asize : 0,
2629		    zio_subblock);
2630	}
2631
2632	/*
2633	 * If this is not a physical io, make sure that it is properly aligned
2634	 * before proceeding.
2635	 */
2636	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
2637		ASSERT0(P2PHASE(zio->io_offset, align));
2638		ASSERT0(P2PHASE(zio->io_size, align));
2639	} else {
2640		/*
2641		 * For physical writes, we allow 512b aligned writes and assume
2642		 * the device will perform a read-modify-write as necessary.
2643		 */
2644		ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE));
2645		ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE));
2646	}
2647
2648	VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa));
2649
2650	/*
2651	 * If this is a repair I/O, and there's no self-healing involved --
2652	 * that is, we're just resilvering what we expect to resilver --
2653	 * then don't do the I/O unless zio's txg is actually in vd's DTL.
2654	 * This prevents spurious resilvering with nested replication.
2655	 * For example, given a mirror of mirrors, (A+B)+(C+D), if only
2656	 * A is out of date, we'll read from C+D, then use the data to
2657	 * resilver A+B -- but we don't actually want to resilver B, just A.
2658	 * The top-level mirror has no way to know this, so instead we just
2659	 * discard unnecessary repairs as we work our way down the vdev tree.
2660	 * The same logic applies to any form of nested replication:
2661	 * ditto + mirror, RAID-Z + replacing, etc.  This covers them all.
2662	 */
2663	if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
2664	    !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
2665	    zio->io_txg != 0 &&	/* not a delegated i/o */
2666	    !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
2667		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
2668		zio_vdev_io_bypass(zio);
2669		return (ZIO_PIPELINE_CONTINUE);
2670	}
2671
2672	if (vd->vdev_ops->vdev_op_leaf) {
2673		switch (zio->io_type) {
2674		case ZIO_TYPE_READ:
2675			if (vdev_cache_read(zio))
2676				return (ZIO_PIPELINE_CONTINUE);
2677			/* FALLTHROUGH */
2678		case ZIO_TYPE_WRITE:
2679		case ZIO_TYPE_FREE:
2680			if ((zio = vdev_queue_io(zio)) == NULL)
2681				return (ZIO_PIPELINE_STOP);
2682
2683			if (!vdev_accessible(vd, zio)) {
2684				zio->io_error = SET_ERROR(ENXIO);
2685				zio_interrupt(zio);
2686				return (ZIO_PIPELINE_STOP);
2687			}
2688			break;
2689		}
2690		/*
2691		 * Note that we ignore repair writes for TRIM because they can
2692		 * conflict with normal writes. This isn't an issue because, by
2693		 * definition, we only repair blocks that aren't freed.
2694		 */
2695		if (zio->io_type == ZIO_TYPE_WRITE &&
2696		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
2697		    !trim_map_write_start(zio))
2698			return (ZIO_PIPELINE_STOP);
2699	}
2700
2701	ret = vd->vdev_ops->vdev_op_io_start(zio);
2702	ASSERT(ret == ZIO_PIPELINE_STOP);
2703
2704	return (ret);
2705}
2706
2707static int
2708zio_vdev_io_done(zio_t *zio)
2709{
2710	vdev_t *vd = zio->io_vd;
2711	vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
2712	boolean_t unexpected_error = B_FALSE;
2713
2714	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
2715		return (ZIO_PIPELINE_STOP);
2716
2717	ASSERT(zio->io_type == ZIO_TYPE_READ ||
2718	    zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE);
2719
2720	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
2721	    (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE ||
2722	    zio->io_type == ZIO_TYPE_FREE)) {
2723
2724		if (zio->io_type == ZIO_TYPE_WRITE &&
2725		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR))
2726			trim_map_write_done(zio);
2727
2728		vdev_queue_io_done(zio);
2729
2730		if (zio->io_type == ZIO_TYPE_WRITE)
2731			vdev_cache_write(zio);
2732
2733		if (zio_injection_enabled && zio->io_error == 0)
2734			zio->io_error = zio_handle_device_injection(vd,
2735			    zio, EIO);
2736
2737		if (zio_injection_enabled && zio->io_error == 0)
2738			zio->io_error = zio_handle_label_injection(zio, EIO);
2739
2740		if (zio->io_error) {
2741			if (zio->io_error == ENOTSUP &&
2742			    zio->io_type == ZIO_TYPE_FREE) {
2743				/* Not all devices support TRIM. */
2744			} else if (!vdev_accessible(vd, zio)) {
2745				zio->io_error = SET_ERROR(ENXIO);
2746			} else {
2747				unexpected_error = B_TRUE;
2748			}
2749		}
2750	}
2751
2752	ops->vdev_op_io_done(zio);
2753
2754	if (unexpected_error)
2755		VERIFY(vdev_probe(vd, zio) == NULL);
2756
2757	return (ZIO_PIPELINE_CONTINUE);
2758}
2759
2760/*
2761 * For non-raidz ZIOs, we can just copy aside the bad data read from the
2762 * disk, and use that to finish the checksum ereport later.
2763 */
2764static void
2765zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
2766    const void *good_buf)
2767{
2768	/* no processing needed */
2769	zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
2770}
2771
2772/*ARGSUSED*/
2773void
2774zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
2775{
2776	void *buf = zio_buf_alloc(zio->io_size);
2777
2778	bcopy(zio->io_data, buf, zio->io_size);
2779
2780	zcr->zcr_cbinfo = zio->io_size;
2781	zcr->zcr_cbdata = buf;
2782	zcr->zcr_finish = zio_vsd_default_cksum_finish;
2783	zcr->zcr_free = zio_buf_free;
2784}
2785
2786static int
2787zio_vdev_io_assess(zio_t *zio)
2788{
2789	vdev_t *vd = zio->io_vd;
2790
2791	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
2792		return (ZIO_PIPELINE_STOP);
2793
2794	if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
2795		spa_config_exit(zio->io_spa, SCL_ZIO, zio);
2796
2797	if (zio->io_vsd != NULL) {
2798		zio->io_vsd_ops->vsd_free(zio);
2799		zio->io_vsd = NULL;
2800	}
2801
2802	if (zio_injection_enabled && zio->io_error == 0)
2803		zio->io_error = zio_handle_fault_injection(zio, EIO);
2804
2805	if (zio->io_type == ZIO_TYPE_FREE &&
2806	    zio->io_priority != ZIO_PRIORITY_NOW) {
2807		switch (zio->io_error) {
2808		case 0:
2809			ZIO_TRIM_STAT_INCR(bytes, zio->io_size);
2810			ZIO_TRIM_STAT_BUMP(success);
2811			break;
2812		case EOPNOTSUPP:
2813			ZIO_TRIM_STAT_BUMP(unsupported);
2814			break;
2815		default:
2816			ZIO_TRIM_STAT_BUMP(failed);
2817			break;
2818		}
2819	}
2820
2821	/*
2822	 * If the I/O failed, determine whether we should attempt to retry it.
2823	 *
2824	 * On retry, we cut in line in the issue queue, since we don't want
2825	 * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
2826	 */
2827	if (zio->io_error && vd == NULL &&
2828	    !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
2829		ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE));	/* not a leaf */
2830		ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS));	/* not a leaf */
2831		zio->io_error = 0;
2832		zio->io_flags |= ZIO_FLAG_IO_RETRY |
2833		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
2834		zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
2835		zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
2836		    zio_requeue_io_start_cut_in_line);
2837		return (ZIO_PIPELINE_STOP);
2838	}
2839
2840	/*
2841	 * If we got an error on a leaf device, convert it to ENXIO
2842	 * if the device is not accessible at all.
2843	 */
2844	if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
2845	    !vdev_accessible(vd, zio))
2846		zio->io_error = SET_ERROR(ENXIO);
2847
2848	/*
2849	 * If we can't write to an interior vdev (mirror or RAID-Z),
2850	 * set vdev_cant_write so that we stop trying to allocate from it.
2851	 */
2852	if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
2853	    vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
2854		vd->vdev_cant_write = B_TRUE;
2855	}
2856
2857	if (zio->io_error)
2858		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2859
2860	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
2861	    zio->io_physdone != NULL) {
2862		ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
2863		ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
2864		zio->io_physdone(zio->io_logical);
2865	}
2866
2867	return (ZIO_PIPELINE_CONTINUE);
2868}
2869
2870void
2871zio_vdev_io_reissue(zio_t *zio)
2872{
2873	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
2874	ASSERT(zio->io_error == 0);
2875
2876	zio->io_stage >>= 1;
2877}
2878
2879void
2880zio_vdev_io_redone(zio_t *zio)
2881{
2882	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
2883
2884	zio->io_stage >>= 1;
2885}
2886
2887void
2888zio_vdev_io_bypass(zio_t *zio)
2889{
2890	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
2891	ASSERT(zio->io_error == 0);
2892
2893	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
2894	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
2895}
2896
2897/*
2898 * ==========================================================================
2899 * Generate and verify checksums
2900 * ==========================================================================
2901 */
2902static int
2903zio_checksum_generate(zio_t *zio)
2904{
2905	blkptr_t *bp = zio->io_bp;
2906	enum zio_checksum checksum;
2907
2908	if (bp == NULL) {
2909		/*
2910		 * This is zio_write_phys().
2911		 * We're either generating a label checksum, or none at all.
2912		 */
2913		checksum = zio->io_prop.zp_checksum;
2914
2915		if (checksum == ZIO_CHECKSUM_OFF)
2916			return (ZIO_PIPELINE_CONTINUE);
2917
2918		ASSERT(checksum == ZIO_CHECKSUM_LABEL);
2919	} else {
2920		if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
2921			ASSERT(!IO_IS_ALLOCATING(zio));
2922			checksum = ZIO_CHECKSUM_GANG_HEADER;
2923		} else {
2924			checksum = BP_GET_CHECKSUM(bp);
2925		}
2926	}
2927
2928	zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);
2929
2930	return (ZIO_PIPELINE_CONTINUE);
2931}
2932
2933static int
2934zio_checksum_verify(zio_t *zio)
2935{
2936	zio_bad_cksum_t info;
2937	blkptr_t *bp = zio->io_bp;
2938	int error;
2939
2940	ASSERT(zio->io_vd != NULL);
2941
2942	if (bp == NULL) {
2943		/*
2944		 * This is zio_read_phys().
2945		 * We're either verifying a label checksum, or nothing at all.
2946		 */
2947		if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
2948			return (ZIO_PIPELINE_CONTINUE);
2949
2950		ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
2951	}
2952
2953	if ((error = zio_checksum_error(zio, &info)) != 0) {
2954		zio->io_error = error;
2955		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2956			zfs_ereport_start_checksum(zio->io_spa,
2957			    zio->io_vd, zio, zio->io_offset,
2958			    zio->io_size, NULL, &info);
2959		}
2960	}
2961
2962	return (ZIO_PIPELINE_CONTINUE);
2963}
2964
2965/*
2966 * Called by RAID-Z to ensure we don't compute the checksum twice.
2967 */
2968void
2969zio_checksum_verified(zio_t *zio)
2970{
2971	zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
2972}
2973
2974/*
2975 * ==========================================================================
2976 * Error rank.  Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
2977 * An error of 0 indicates success.  ENXIO indicates whole-device failure,
2978 * which may be transient (e.g. unplugged) or permanent.  ECKSUM and EIO
2979 * indicate errors that are specific to one I/O, and most likely permanent.
2980 * Any other error is presumed to be worse because we weren't expecting it.
2981 * ==========================================================================
2982 */
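/*
 * For example (illustrative): zio_worst_error(ENXIO, ECKSUM) returns ECKSUM,
 * and an error missing from the rank list below (e.g. EFAULT) outranks all
 * of the listed ones.
 */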
2983int
2984zio_worst_error(int e1, int e2)
2985{
2986	static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
2987	int r1, r2;
2988
2989	for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
2990		if (e1 == zio_error_rank[r1])
2991			break;
2992
2993	for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
2994		if (e2 == zio_error_rank[r2])
2995			break;
2996
2997	return (r1 > r2 ? e1 : e2);
2998}
2999
3000/*
3001 * ==========================================================================
3002 * I/O completion
3003 * ==========================================================================
3004 */
3005static int
3006zio_ready(zio_t *zio)
3007{
3008	blkptr_t *bp = zio->io_bp;
3009	zio_t *pio, *pio_next;
3010
3011	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
3012	    zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
3013		return (ZIO_PIPELINE_STOP);
3014
3015	if (zio->io_ready) {
3016		ASSERT(IO_IS_ALLOCATING(zio));
3017		ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
3018		    (zio->io_flags & ZIO_FLAG_NOPWRITE));
3019		ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
3020
3021		zio->io_ready(zio);
3022	}
3023
3024	if (bp != NULL && bp != &zio->io_bp_copy)
3025		zio->io_bp_copy = *bp;
3026
3027	if (zio->io_error)
3028		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
3029
3030	mutex_enter(&zio->io_lock);
3031	zio->io_state[ZIO_WAIT_READY] = 1;
3032	pio = zio_walk_parents(zio);
3033	mutex_exit(&zio->io_lock);
3034
3035	/*
3036	 * As we notify zio's parents, new parents could be added.
3037	 * New parents go to the head of zio's io_parent_list, however,
3038	 * so we will (correctly) not notify them.  The remainder of zio's
3039	 * io_parent_list, from 'pio_next' onward, cannot change because
3040	 * all parents must wait for us to be done before they can be done.
3041	 */
3042	for (; pio != NULL; pio = pio_next) {
3043		pio_next = zio_walk_parents(zio);
3044		zio_notify_parent(pio, zio, ZIO_WAIT_READY);
3045	}
3046
3047	if (zio->io_flags & ZIO_FLAG_NODATA) {
3048		if (BP_IS_GANG(bp)) {
3049			zio->io_flags &= ~ZIO_FLAG_NODATA;
3050		} else {
3051			ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE);
3052			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
3053		}
3054	}
3055
3056	if (zio_injection_enabled &&
3057	    zio->io_spa->spa_syncing_txg == zio->io_txg)
3058		zio_handle_ignored_writes(zio);
3059
3060	return (ZIO_PIPELINE_CONTINUE);
3061}
3062
3063static int
3064zio_done(zio_t *zio)
3065{
3066	spa_t *spa = zio->io_spa;
3067	zio_t *lio = zio->io_logical;
3068	blkptr_t *bp = zio->io_bp;
3069	vdev_t *vd = zio->io_vd;
3070	uint64_t psize = zio->io_size;
3071	zio_t *pio, *pio_next;
3072
3073	/*
3074	 * If our children haven't all completed,
3075	 * wait for them and then repeat this pipeline stage.
3076	 */
3077	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
3078	    zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
3079	    zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
3080	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
3081		return (ZIO_PIPELINE_STOP);
3082
3083	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
3084		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
3085			ASSERT(zio->io_children[c][w] == 0);
3086
3087	if (bp != NULL && !BP_IS_EMBEDDED(bp)) {
3088		ASSERT(bp->blk_pad[0] == 0);
3089		ASSERT(bp->blk_pad[1] == 0);
3090		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
3091		    (bp == zio_unique_parent(zio)->io_bp));
3092		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
3093		    zio->io_bp_override == NULL &&
3094		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
3095			ASSERT(!BP_SHOULD_BYTESWAP(bp));
3096			ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp));
3097			ASSERT(BP_COUNT_GANG(bp) == 0 ||
3098			    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
3099		}
3100		if (zio->io_flags & ZIO_FLAG_NOPWRITE)
3101			VERIFY(BP_EQUAL(bp, &zio->io_bp_orig));
3102	}
3103
3104	/*
3105	 * If there were child vdev/gang/ddt errors, they apply to us now.
3106	 */
3107	zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
3108	zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
3109	zio_inherit_child_errors(zio, ZIO_CHILD_DDT);
3110
3111	/*
3112	 * If the I/O on the transformed data was successful, generate any
3113	 * checksum reports now while we still have the transformed data.
3114	 */
3115	if (zio->io_error == 0) {
3116		while (zio->io_cksum_report != NULL) {
3117			zio_cksum_report_t *zcr = zio->io_cksum_report;
3118			uint64_t align = zcr->zcr_align;
3119			uint64_t asize = P2ROUNDUP(psize, align);
3120			char *abuf = zio->io_data;
3121
3122			if (asize != psize) {
3123				abuf = zio_buf_alloc(asize);
3124				bcopy(zio->io_data, abuf, psize);
3125				bzero(abuf + psize, asize - psize);
3126			}
3127
3128			zio->io_cksum_report = zcr->zcr_next;
3129			zcr->zcr_next = NULL;
3130			zcr->zcr_finish(zcr, abuf);
3131			zfs_ereport_free_checksum(zcr);
3132
3133			if (asize != psize)
3134				zio_buf_free(abuf, asize);
3135		}
3136	}
3137
3138	zio_pop_transforms(zio);	/* note: may set zio->io_error */
3139
3140	vdev_stat_update(zio, psize);
3141
3142	if (zio->io_error) {
3143		/*
3144		 * If this I/O is attached to a particular vdev,
3145		 * generate an error message describing the I/O failure
3146		 * at the block level.  We ignore these errors if the
3147		 * device is currently unavailable.
3148		 */
3149		if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
3150			zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);
3151
3152		if ((zio->io_error == EIO || !(zio->io_flags &
3153		    (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
3154		    zio == lio) {
3155			/*
3156			 * For logical I/O requests, tell the SPA to log the
3157			 * error and generate a logical data ereport.
3158			 */
3159			spa_log_error(spa, zio);
3160			zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
3161			    0, 0);
3162		}
3163	}
3164
3165	if (zio->io_error && zio == lio) {
3166		/*
3167		 * Determine whether zio should be reexecuted.  This will
3168		 * propagate all the way to the root via zio_notify_parent().
3169		 */
3170		ASSERT(vd == NULL && bp != NULL);
3171		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
3172
3173		if (IO_IS_ALLOCATING(zio) &&
3174		    !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
3175			if (zio->io_error != ENOSPC)
3176				zio->io_reexecute |= ZIO_REEXECUTE_NOW;
3177			else
3178				zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
3179		}
3180
3181		if ((zio->io_type == ZIO_TYPE_READ ||
3182		    zio->io_type == ZIO_TYPE_FREE) &&
3183		    !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
3184		    zio->io_error == ENXIO &&
3185		    spa_load_state(spa) == SPA_LOAD_NONE &&
3186		    spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
3187			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
3188
3189		if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
3190			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
3191
3192		/*
3193		 * Here is a possibly good place to attempt to do
3194		 * either combinatorial reconstruction or error correction
3195		 * based on checksums.  It also might be a good place
3196		 * to send out preliminary ereports before we suspend
3197		 * processing.
3198		 */
3199	}
3200
3201	/*
3202	 * If there were logical child errors, they apply to us now.
3203	 * We defer this until now to avoid conflating logical child
3204	 * errors with errors that happened to the zio itself when
3205	 * updating vdev stats and reporting FMA events above.
3206	 */
3207	zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
3208
3209	if ((zio->io_error || zio->io_reexecute) &&
3210	    IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
3211	    !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
3212		zio_dva_unallocate(zio, zio->io_gang_tree, bp);
3213
3214	zio_gang_tree_free(&zio->io_gang_tree);
3215
3216	/*
3217	 * Godfather I/Os should never suspend.
3218	 */
3219	if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
3220	    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
3221		zio->io_reexecute = 0;
3222
3223	if (zio->io_reexecute) {
3224		/*
3225		 * This is a logical I/O that wants to reexecute.
3226		 *
3227		 * Reexecute is top-down.  When an i/o fails, if it's not
3228		 * the root, it simply notifies its parent and sticks around.
3229		 * The parent, seeing that it still has children in zio_done(),
3230		 * does the same.  This percolates all the way up to the root.
3231		 * The root i/o will reexecute or suspend the entire tree.
3232		 *
3233		 * This approach ensures that zio_reexecute() honors
3234		 * all the original i/o dependency relationships, e.g.
3235		 * parents not executing until children are ready.
3236		 */
3237		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
3238
3239		zio->io_gang_leader = NULL;
3240
3241		mutex_enter(&zio->io_lock);
3242		zio->io_state[ZIO_WAIT_DONE] = 1;
3243		mutex_exit(&zio->io_lock);
3244
3245		/*
3246		 * "The Godfather" I/O monitors its children but is
3247		 * not a true parent to them. It will track them through
3248		 * the pipeline but severs its ties whenever they get into
3249		 * trouble (e.g. suspended). This allows "The Godfather"
3250		 * I/O to return status without blocking.
3251		 */
3252		for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
3253			zio_link_t *zl = zio->io_walk_link;
3254			pio_next = zio_walk_parents(zio);
3255
3256			if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
3257			    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
3258				zio_remove_child(pio, zio, zl);
3259				zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
3260			}
3261		}
3262
3263		if ((pio = zio_unique_parent(zio)) != NULL) {
3264			/*
3265			 * We're not a root i/o, so there's nothing to do
3266			 * but notify our parent.  Don't propagate errors
3267			 * upward since we haven't permanently failed yet.
3268			 */
3269			ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
3270			zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
3271			zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
3272		} else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
3273			/*
3274			 * We'd fail again if we reexecuted now, so suspend
3275			 * until conditions improve (e.g. device comes online).
3276			 */
3277			zio_suspend(spa, zio);
3278		} else {
3279			/*
3280			 * Reexecution is potentially a huge amount of work.
3281			 * Hand it off to the otherwise-unused claim taskq.
3282			 */
3283#if defined(illumos) || !defined(_KERNEL)
3284			ASSERT(zio->io_tqent.tqent_next == NULL);
3285#else
3286			ASSERT(zio->io_tqent.tqent_task.ta_pending == 0);
3287#endif
3288			spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM,
3289			    ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio,
3290			    0, &zio->io_tqent);
3291		}
3292		return (ZIO_PIPELINE_STOP);
3293	}
3294
3295	ASSERT(zio->io_child_count == 0);
3296	ASSERT(zio->io_reexecute == 0);
3297	ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
3298
3299	/*
3300	 * Report any checksum errors, since the I/O is complete.
3301	 */
3302	while (zio->io_cksum_report != NULL) {
3303		zio_cksum_report_t *zcr = zio->io_cksum_report;
3304		zio->io_cksum_report = zcr->zcr_next;
3305		zcr->zcr_next = NULL;
3306		zcr->zcr_finish(zcr, NULL);
3307		zfs_ereport_free_checksum(zcr);
3308	}
3309
3310	/*
3311	 * It is the responsibility of the done callback to ensure that this
3312	 * particular zio is no longer discoverable for adoption, and as
3313	 * such, cannot acquire any new parents.
3314	 */
3315	if (zio->io_done)
3316		zio->io_done(zio);
3317
3318	mutex_enter(&zio->io_lock);
3319	zio->io_state[ZIO_WAIT_DONE] = 1;
3320	mutex_exit(&zio->io_lock);
3321
3322	for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
3323		zio_link_t *zl = zio->io_walk_link;
3324		pio_next = zio_walk_parents(zio);
3325		zio_remove_child(pio, zio, zl);
3326		zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
3327	}
3328
3329	if (zio->io_waiter != NULL) {
3330		mutex_enter(&zio->io_lock);
3331		zio->io_executor = NULL;
3332		cv_broadcast(&zio->io_cv);
3333		mutex_exit(&zio->io_lock);
3334	} else {
3335		zio_destroy(zio);
3336	}
3337
3338	return (ZIO_PIPELINE_STOP);
3339}
3340
3341/*
3342 * ==========================================================================
3343 * I/O pipeline definition
3344 * ==========================================================================
3345 */
3346static zio_pipe_stage_t *zio_pipeline[] = {
3347	NULL,
3348	zio_read_bp_init,
3349	zio_free_bp_init,
3350	zio_issue_async,
3351	zio_write_bp_init,
3352	zio_checksum_generate,
3353	zio_nop_write,
3354	zio_ddt_read_start,
3355	zio_ddt_read_done,
3356	zio_ddt_write,
3357	zio_ddt_free,
3358	zio_gang_assemble,
3359	zio_gang_issue,
3360	zio_dva_allocate,
3361	zio_dva_free,
3362	zio_dva_claim,
3363	zio_ready,
3364	zio_vdev_io_start,
3365	zio_vdev_io_done,
3366	zio_vdev_io_assess,
3367	zio_checksum_verify,
3368	zio_done
3369};
3370
3371/* dnp is the dnode for zb1->zb_object */
3372boolean_t
3373zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_phys_t *zb1,
3374    const zbookmark_phys_t *zb2)
3375{
3376	uint64_t zb1nextL0, zb2thisobj;
3377
3378	ASSERT(zb1->zb_objset == zb2->zb_objset);
3379	ASSERT(zb2->zb_level == 0);
3380
3381	/* The objset_phys_t isn't before anything. */
3382	if (dnp == NULL)
3383		return (B_FALSE);
3384
3385	zb1nextL0 = (zb1->zb_blkid + 1) <<
3386	    ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
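	/*
	 * Worked example (illustrative): with dn_indblkshift == 17 and
	 * SPA_BLKPTRSHIFT == 7, each level-1 block covers 2^10 = 1024 L0
	 * blkids, so a zb1 at level 1, blkid 3 yields zb1nextL0 = 4 << 10 =
	 * 4096, i.e. the first L0 blkid past the range zb1 describes.
	 */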
3387
3388	zb2thisobj = zb2->zb_object ? zb2->zb_object :
3389	    zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
3390
3391	if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
3392		uint64_t nextobj = zb1nextL0 *
3393		    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
3394		return (nextobj <= zb2thisobj);
3395	}
3396
3397	if (zb1->zb_object < zb2thisobj)
3398		return (B_TRUE);
3399	if (zb1->zb_object > zb2thisobj)
3400		return (B_FALSE);
3401	if (zb2->zb_object == DMU_META_DNODE_OBJECT)
3402		return (B_FALSE);
3403	return (zb1nextL0 <= zb2->zb_blkid);
3404}
3405