zio.c revision 260742
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2013 by Delphix. All rights reserved.
24 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
25 */
26
27#include <sys/zfs_context.h>
28#include <sys/fm/fs/zfs.h>
29#include <sys/spa.h>
30#include <sys/txg.h>
31#include <sys/spa_impl.h>
32#include <sys/vdev_impl.h>
33#include <sys/zio_impl.h>
34#include <sys/zio_compress.h>
35#include <sys/zio_checksum.h>
36#include <sys/dmu_objset.h>
37#include <sys/arc.h>
38#include <sys/ddt.h>
39#include <sys/trim_map.h>
40
41SYSCTL_DECL(_vfs_zfs);
42SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
43#if defined(__amd64__)
44static int zio_use_uma = 1;
45#else
46static int zio_use_uma = 0;
47#endif
48TUNABLE_INT("vfs.zfs.zio.use_uma", &zio_use_uma);
49SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0,
50    "Use uma(9) for ZIO allocations");
51static int zio_exclude_metadata = 0;
52TUNABLE_INT("vfs.zfs.zio.exclude_metadata", &zio_exclude_metadata);
53SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0,
54    "Exclude metadata buffers from dumps as well");
55
56zio_trim_stats_t zio_trim_stats = {
57	{ "bytes",		KSTAT_DATA_UINT64,
58	  "Number of bytes successfully TRIMmed" },
59	{ "success",		KSTAT_DATA_UINT64,
60	  "Number of successful TRIM requests" },
61	{ "unsupported",	KSTAT_DATA_UINT64,
62	  "Number of TRIM requests that failed because TRIM is not supported" },
63	{ "failed",		KSTAT_DATA_UINT64,
64	  "Number of TRIM requests that failed for reasons other than not supported" },
65};
66
67static kstat_t *zio_trim_ksp;
68
69/*
70 * ==========================================================================
71 * I/O priority table
72 * ==========================================================================
73 */
74uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
75	0,	/* ZIO_PRIORITY_NOW		*/
76	0,	/* ZIO_PRIORITY_SYNC_READ	*/
77	0,	/* ZIO_PRIORITY_SYNC_WRITE	*/
78	0,	/* ZIO_PRIORITY_LOG_WRITE	*/
79	1,	/* ZIO_PRIORITY_CACHE_FILL	*/
80	1,	/* ZIO_PRIORITY_AGG		*/
81	4,	/* ZIO_PRIORITY_FREE		*/
82	4,	/* ZIO_PRIORITY_ASYNC_WRITE	*/
83	6,	/* ZIO_PRIORITY_ASYNC_READ	*/
84	10,	/* ZIO_PRIORITY_RESILVER	*/
85	20,	/* ZIO_PRIORITY_SCRUB		*/
86	2,	/* ZIO_PRIORITY_DDT_PREFETCH	*/
87	30,	/* ZIO_PRIORITY_TRIM		*/
88};
89
90/*
91 * ==========================================================================
92 * I/O type descriptions
93 * ==========================================================================
94 */
95char *zio_type_name[ZIO_TYPES] = {
96	"zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
97	"zio_ioctl"
98};
99
100/*
101 * ==========================================================================
102 * I/O kmem caches
103 * ==========================================================================
104 */
105kmem_cache_t *zio_cache;
106kmem_cache_t *zio_link_cache;
107kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
108kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
109
110#ifdef _KERNEL
111extern vmem_t *zio_alloc_arena;
112#endif
113extern int zfs_mg_alloc_failures;
114
115/*
116 * The following actions directly affect the spa's sync-to-convergence logic.
117 * The values below define the sync pass when we start performing the action.
118 * Care should be taken when changing these values as they directly impact
119 * spa_sync() performance. Tuning these values may introduce subtle performance
120 * pathologies and should only be done in the context of performance analysis.
121 * These tunables will eventually be removed and replaced with #defines once
122 * enough analysis has been done to determine optimal values.
123 *
124 * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
125 * regular blocks are not deferred.
126 */
127int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
128TUNABLE_INT("vfs.zfs.sync_pass_deferred_free", &zfs_sync_pass_deferred_free);
129SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN,
130    &zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass");
131int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
132TUNABLE_INT("vfs.zfs.sync_pass_dont_compress", &zfs_sync_pass_dont_compress);
133SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN,
134    &zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass");
135int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
136TUNABLE_INT("vfs.zfs.sync_pass_rewrite", &zfs_sync_pass_rewrite);
137SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN,
138    &zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass");
139
140/*
141 * An allocating zio is one that either currently has the DVA allocate
142 * stage set or will have it later in its lifetime.
143 */
144#define	IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
145
146boolean_t	zio_requeue_io_start_cut_in_line = B_TRUE;
147
148#ifdef ZFS_DEBUG
149int zio_buf_debug_limit = 16384;
150#else
151int zio_buf_debug_limit = 0;
152#endif
153
154void
155zio_init(void)
156{
157	size_t c;
158	zio_cache = kmem_cache_create("zio_cache",
159	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
160	zio_link_cache = kmem_cache_create("zio_link_cache",
161	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
162	if (!zio_use_uma)
163		goto out;
164
165	/*
166	 * For small buffers, we want a cache for each multiple of
167	 * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
168	 * for each quarter-power of 2.  For large buffers, we want
169	 * a cache for each multiple of PAGESIZE.
170	 */
171	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
172		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
173		size_t p2 = size;
174		size_t align = 0;
175		size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;
176
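		/*
		 * Reduce p2 to the largest power of two that does not
		 * exceed this buffer size.
		 */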
177		while (p2 & (p2 - 1))
178			p2 &= p2 - 1;
179
180#ifdef illumos
181#ifndef _KERNEL
182		/*
183		 * If we are using watchpoints, put each buffer on its own page,
184		 * to eliminate the performance overhead of trapping to the
185		 * kernel when modifying a non-watched buffer that shares the
186		 * page with a watched buffer.
187		 */
188		if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
189			continue;
190#endif
191#endif /* illumos */
192		if (size <= 4 * SPA_MINBLOCKSIZE) {
193			align = SPA_MINBLOCKSIZE;
194		} else if (IS_P2ALIGNED(size, PAGESIZE)) {
195			align = PAGESIZE;
196		} else if (IS_P2ALIGNED(size, p2 >> 2)) {
197			align = p2 >> 2;
198		}
199
200		if (align != 0) {
201			char name[36];
202			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
203			zio_buf_cache[c] = kmem_cache_create(name, size,
204			    align, NULL, NULL, NULL, NULL, NULL, cflags);
205
206			/*
207			 * Since zio_data bufs do not appear in crash dumps, we
208			 * pass KMC_NOTOUCH so that no allocator metadata is
209			 * stored with the buffers.
210			 */
211			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
212			zio_data_buf_cache[c] = kmem_cache_create(name, size,
213			    align, NULL, NULL, NULL, NULL, NULL,
214			    cflags | KMC_NOTOUCH | KMC_NODEBUG);
215		}
216	}
217
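	/*
	 * Fill in the gaps: any buffer size that did not get a dedicated
	 * cache above falls back to the next larger cache that was created.
	 */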
218	while (--c != 0) {
219		ASSERT(zio_buf_cache[c] != NULL);
220		if (zio_buf_cache[c - 1] == NULL)
221			zio_buf_cache[c - 1] = zio_buf_cache[c];
222
223		ASSERT(zio_data_buf_cache[c] != NULL);
224		if (zio_data_buf_cache[c - 1] == NULL)
225			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
226	}
227out:
228
229	/*
230	 * The zio write taskqs have 1 thread per CPU; allow 1/2 of the taskqs
231	 * to fail 3 times per txg, or 8 failures, whichever is greater.
232	 */
233	if (zfs_mg_alloc_failures == 0)
234		zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8);
235	else if (zfs_mg_alloc_failures < 8)
236		zfs_mg_alloc_failures = 8;
237
238	zio_inject_init();
239
240	zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc",
241	    KSTAT_TYPE_NAMED,
242	    sizeof(zio_trim_stats) / sizeof(kstat_named_t),
243	    KSTAT_FLAG_VIRTUAL);
244
245	if (zio_trim_ksp != NULL) {
246		zio_trim_ksp->ks_data = &zio_trim_stats;
247		kstat_install(zio_trim_ksp);
248	}
249}
250
251void
252zio_fini(void)
253{
254	size_t c;
255	kmem_cache_t *last_cache = NULL;
256	kmem_cache_t *last_data_cache = NULL;
257
258	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
259		if (zio_buf_cache[c] != last_cache) {
260			last_cache = zio_buf_cache[c];
261			kmem_cache_destroy(zio_buf_cache[c]);
262		}
263		zio_buf_cache[c] = NULL;
264
265		if (zio_data_buf_cache[c] != last_data_cache) {
266			last_data_cache = zio_data_buf_cache[c];
267			kmem_cache_destroy(zio_data_buf_cache[c]);
268		}
269		zio_data_buf_cache[c] = NULL;
270	}
271
272	kmem_cache_destroy(zio_link_cache);
273	kmem_cache_destroy(zio_cache);
274
275	zio_inject_fini();
276
277	if (zio_trim_ksp != NULL) {
278		kstat_delete(zio_trim_ksp);
279		zio_trim_ksp = NULL;
280	}
281}
282
283/*
284 * ==========================================================================
285 * Allocate and free I/O buffers
286 * ==========================================================================
287 */
288
289/*
290 * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
291 * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
292 * useful to inspect ZFS metadata, but if possible, we should avoid keeping
293 * excess / transient data in-core during a crashdump.
294 */
295void *
296zio_buf_alloc(size_t size)
297{
298	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
299	int flags = zio_exclude_metadata ? KM_NODEBUG : 0;
300
301	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
302
303	if (zio_use_uma)
304		return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
305	else
306		return (kmem_alloc(size, KM_SLEEP|flags));
307}
308
309/*
310 * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
311 * crashdump if the kernel panics.  This exists so that we can limit the amount
312 * of ZFS data that shows up in a kernel crashdump, thus reducing the amount
313 * of kernel heap dumped to disk when the kernel panics.
314 */
315void *
316zio_data_buf_alloc(size_t size)
317{
318	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
319
320	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
321
322	if (zio_use_uma)
323		return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
324	else
325		return (kmem_alloc(size, KM_SLEEP | KM_NODEBUG));
326}
327
328void
329zio_buf_free(void *buf, size_t size)
330{
331	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
332
333	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
334
335	if (zio_use_uma)
336		kmem_cache_free(zio_buf_cache[c], buf);
337	else
338		kmem_free(buf, size);
339}
340
341void
342zio_data_buf_free(void *buf, size_t size)
343{
344	size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
345
346	ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
347
348	if (zio_use_uma)
349		kmem_cache_free(zio_data_buf_cache[c], buf);
350	else
351		kmem_free(buf, size);
352}
353
354/*
355 * ==========================================================================
356 * Push and pop I/O transform buffers
357 * ==========================================================================
358 */
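/*
 * Each pushed transform records the zio's current data buffer and size so
 * that zio_pop_transforms() can restore them later.  On pop, the optional
 * transform callback (e.g. zio_decompress) gets a chance to process the
 * transformed data back into the original buffer, and any buffer allocated
 * by the pipeline (zt_bufsize != 0) is freed.
 */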
359static void
360zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
361	zio_transform_func_t *transform)
362{
363	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
364
365	zt->zt_orig_data = zio->io_data;
366	zt->zt_orig_size = zio->io_size;
367	zt->zt_bufsize = bufsize;
368	zt->zt_transform = transform;
369
370	zt->zt_next = zio->io_transform_stack;
371	zio->io_transform_stack = zt;
372
373	zio->io_data = data;
374	zio->io_size = size;
375}
376
377static void
378zio_pop_transforms(zio_t *zio)
379{
380	zio_transform_t *zt;
381
382	while ((zt = zio->io_transform_stack) != NULL) {
383		if (zt->zt_transform != NULL)
384			zt->zt_transform(zio,
385			    zt->zt_orig_data, zt->zt_orig_size);
386
387		if (zt->zt_bufsize != 0)
388			zio_buf_free(zio->io_data, zt->zt_bufsize);
389
390		zio->io_data = zt->zt_orig_data;
391		zio->io_size = zt->zt_orig_size;
392		zio->io_transform_stack = zt->zt_next;
393
394		kmem_free(zt, sizeof (zio_transform_t));
395	}
396}
397
398/*
399 * ==========================================================================
400 * I/O transform callbacks for subblocks and decompression
401 * ==========================================================================
402 */
403static void
404zio_subblock(zio_t *zio, void *data, uint64_t size)
405{
406	ASSERT(zio->io_size > size);
407
408	if (zio->io_type == ZIO_TYPE_READ)
409		bcopy(zio->io_data, data, size);
410}
411
412static void
413zio_decompress(zio_t *zio, void *data, uint64_t size)
414{
415	if (zio->io_error == 0 &&
416	    zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
417	    zio->io_data, data, zio->io_size, size) != 0)
418		zio->io_error = SET_ERROR(EIO);
419}
420
421/*
422 * ==========================================================================
423 * I/O parent/child relationships and pipeline interlocks
424 * ==========================================================================
425 */
426/*
427 * NOTE - Callers to zio_walk_parents() and zio_walk_children() must
428 *        continue calling these functions until they return NULL.
429 *        Otherwise, the next caller will pick up the list walk in
430 *        some indeterminate state.  (The alternative would be for every
431 *        caller to pass in a cookie to keep the state represented by
432 *        io_walk_link, which gets annoying.)
433 */
434zio_t *
435zio_walk_parents(zio_t *cio)
436{
437	zio_link_t *zl = cio->io_walk_link;
438	list_t *pl = &cio->io_parent_list;
439
440	zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
441	cio->io_walk_link = zl;
442
443	if (zl == NULL)
444		return (NULL);
445
446	ASSERT(zl->zl_child == cio);
447	return (zl->zl_parent);
448}
449
450zio_t *
451zio_walk_children(zio_t *pio)
452{
453	zio_link_t *zl = pio->io_walk_link;
454	list_t *cl = &pio->io_child_list;
455
456	zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
457	pio->io_walk_link = zl;
458
459	if (zl == NULL)
460		return (NULL);
461
462	ASSERT(zl->zl_parent == pio);
463	return (zl->zl_child);
464}
465
466zio_t *
467zio_unique_parent(zio_t *cio)
468{
469	zio_t *pio = zio_walk_parents(cio);
470
471	VERIFY(zio_walk_parents(cio) == NULL);
472	return (pio);
473}
474
475void
476zio_add_child(zio_t *pio, zio_t *cio)
477{
478	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
479
480	/*
481	 * Logical I/Os can have logical, gang, or vdev children.
482	 * Gang I/Os can have gang or vdev children.
483	 * Vdev I/Os can only have vdev children.
484	 * The following ASSERT captures all of these constraints.
485	 */
486	ASSERT(cio->io_child_type <= pio->io_child_type);
487
488	zl->zl_parent = pio;
489	zl->zl_child = cio;
490
491	mutex_enter(&cio->io_lock);
492	mutex_enter(&pio->io_lock);
493
494	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
495
496	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
497		pio->io_children[cio->io_child_type][w] += !cio->io_state[w];
498
499	list_insert_head(&pio->io_child_list, zl);
500	list_insert_head(&cio->io_parent_list, zl);
501
502	pio->io_child_count++;
503	cio->io_parent_count++;
504
505	mutex_exit(&pio->io_lock);
506	mutex_exit(&cio->io_lock);
507}
508
509static void
510zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
511{
512	ASSERT(zl->zl_parent == pio);
513	ASSERT(zl->zl_child == cio);
514
515	mutex_enter(&cio->io_lock);
516	mutex_enter(&pio->io_lock);
517
518	list_remove(&pio->io_child_list, zl);
519	list_remove(&cio->io_parent_list, zl);
520
521	pio->io_child_count--;
522	cio->io_parent_count--;
523
524	mutex_exit(&pio->io_lock);
525	mutex_exit(&cio->io_lock);
526
527	kmem_cache_free(zio_link_cache, zl);
528}
529
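/*
 * zio_wait_for_children() and zio_notify_parent() implement the pipeline
 * interlock: if a zio still has outstanding children of the given type and
 * wait stage, it backs io_stage up by one stage and records the outstanding
 * counter in io_stall.  When the last such child completes,
 * zio_notify_parent() clears io_stall and re-dispatches the parent through
 * zio_execute(), which then re-runs the stalled stage.
 */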
530static boolean_t
531zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
532{
533	uint64_t *countp = &zio->io_children[child][wait];
534	boolean_t waiting = B_FALSE;
535
536	mutex_enter(&zio->io_lock);
537	ASSERT(zio->io_stall == NULL);
538	if (*countp != 0) {
539		zio->io_stage >>= 1;
540		zio->io_stall = countp;
541		waiting = B_TRUE;
542	}
543	mutex_exit(&zio->io_lock);
544
545	return (waiting);
546}
547
548static void
549zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
550{
551	uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
552	int *errorp = &pio->io_child_error[zio->io_child_type];
553
554	mutex_enter(&pio->io_lock);
555	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
556		*errorp = zio_worst_error(*errorp, zio->io_error);
557	pio->io_reexecute |= zio->io_reexecute;
558	ASSERT3U(*countp, >, 0);
559	if (--*countp == 0 && pio->io_stall == countp) {
560		pio->io_stall = NULL;
561		mutex_exit(&pio->io_lock);
562		zio_execute(pio);
563	} else {
564		mutex_exit(&pio->io_lock);
565	}
566}
567
568static void
569zio_inherit_child_errors(zio_t *zio, enum zio_child c)
570{
571	if (zio->io_child_error[c] != 0 && zio->io_error == 0)
572		zio->io_error = zio->io_child_error[c];
573}
574
575/*
576 * ==========================================================================
577 * Create the various types of I/O (read, write, free, etc)
578 * ==========================================================================
579 */
580static zio_t *
581zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
582    void *data, uint64_t size, zio_done_func_t *done, void *private,
583    zio_type_t type, int priority, enum zio_flag flags,
584    vdev_t *vd, uint64_t offset, const zbookmark_t *zb,
585    enum zio_stage stage, enum zio_stage pipeline)
586{
587	zio_t *zio;
588
589	ASSERT3U(type == ZIO_TYPE_FREE || size, <=, SPA_MAXBLOCKSIZE);
590	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
591	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
592
593	ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
594	ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
595	ASSERT(vd || stage == ZIO_STAGE_OPEN);
596
597	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
598	bzero(zio, sizeof (zio_t));
599
600	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
601	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
602
603	list_create(&zio->io_parent_list, sizeof (zio_link_t),
604	    offsetof(zio_link_t, zl_parent_node));
605	list_create(&zio->io_child_list, sizeof (zio_link_t),
606	    offsetof(zio_link_t, zl_child_node));
607
608	if (vd != NULL)
609		zio->io_child_type = ZIO_CHILD_VDEV;
610	else if (flags & ZIO_FLAG_GANG_CHILD)
611		zio->io_child_type = ZIO_CHILD_GANG;
612	else if (flags & ZIO_FLAG_DDT_CHILD)
613		zio->io_child_type = ZIO_CHILD_DDT;
614	else
615		zio->io_child_type = ZIO_CHILD_LOGICAL;
616
617	if (bp != NULL) {
618		zio->io_bp = (blkptr_t *)bp;
619		zio->io_bp_copy = *bp;
620		zio->io_bp_orig = *bp;
621		if (type != ZIO_TYPE_WRITE ||
622		    zio->io_child_type == ZIO_CHILD_DDT)
623			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
624		if (zio->io_child_type == ZIO_CHILD_LOGICAL)
625			zio->io_logical = zio;
626		if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
627			pipeline |= ZIO_GANG_STAGES;
628	}
629
630	zio->io_spa = spa;
631	zio->io_txg = txg;
632	zio->io_done = done;
633	zio->io_private = private;
634	zio->io_type = type;
635	zio->io_priority = priority;
636	zio->io_vd = vd;
637	zio->io_offset = offset;
638	zio->io_orig_data = zio->io_data = data;
639	zio->io_orig_size = zio->io_size = size;
640	zio->io_orig_flags = zio->io_flags = flags;
641	zio->io_orig_stage = zio->io_stage = stage;
642	zio->io_orig_pipeline = zio->io_pipeline = pipeline;
643
644	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
645	zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
646
647	if (zb != NULL)
648		zio->io_bookmark = *zb;
649
650	if (pio != NULL) {
651		if (zio->io_logical == NULL)
652			zio->io_logical = pio->io_logical;
653		if (zio->io_child_type == ZIO_CHILD_GANG)
654			zio->io_gang_leader = pio->io_gang_leader;
655		zio_add_child(pio, zio);
656	}
657
658	return (zio);
659}
660
661static void
662zio_destroy(zio_t *zio)
663{
664	list_destroy(&zio->io_parent_list);
665	list_destroy(&zio->io_child_list);
666	mutex_destroy(&zio->io_lock);
667	cv_destroy(&zio->io_cv);
668	kmem_cache_free(zio_cache, zio);
669}
670
671zio_t *
672zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
673    void *private, enum zio_flag flags)
674{
675	zio_t *zio;
676
677	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
678	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
679	    ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
680
681	return (zio);
682}
683
684zio_t *
685zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
686{
687	return (zio_null(NULL, spa, NULL, done, private, flags));
688}
689
690zio_t *
691zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
692    void *data, uint64_t size, zio_done_func_t *done, void *private,
693    int priority, enum zio_flag flags, const zbookmark_t *zb)
694{
695	zio_t *zio;
696
697	zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
698	    data, size, done, private,
699	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
700	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
701	    ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
702
703	return (zio);
704}
705
706zio_t *
707zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
708    void *data, uint64_t size, const zio_prop_t *zp,
709    zio_done_func_t *ready, zio_done_func_t *done, void *private,
710    int priority, enum zio_flag flags, const zbookmark_t *zb)
711{
712	zio_t *zio;
713
714	ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
715	    zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
716	    zp->zp_compress >= ZIO_COMPRESS_OFF &&
717	    zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
718	    DMU_OT_IS_VALID(zp->zp_type) &&
719	    zp->zp_level < 32 &&
720	    zp->zp_copies > 0 &&
721	    zp->zp_copies <= spa_max_replication(spa));
722
723	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
724	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
725	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
726	    ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
727
728	zio->io_ready = ready;
729	zio->io_prop = *zp;
730
731	return (zio);
732}
733
734zio_t *
735zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
736    uint64_t size, zio_done_func_t *done, void *private, int priority,
737    enum zio_flag flags, zbookmark_t *zb)
738{
739	zio_t *zio;
740
741	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
742	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
743	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
744
745	return (zio);
746}
747
748void
749zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
750{
751	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
752	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
753	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
754	ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
755
756	/*
757	 * We must reset the io_prop to match the values that existed
758	 * when the bp was first written by dmu_sync() keeping in mind
759	 * that nopwrite and dedup are mutually exclusive.
760	 */
761	zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
762	zio->io_prop.zp_nopwrite = nopwrite;
763	zio->io_prop.zp_copies = copies;
764	zio->io_bp_override = bp;
765}
766
767void
768zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
769{
770	metaslab_check_free(spa, bp);
771
772	/*
773	 * Frees that are for the currently-syncing txg, are not going to be
774	 * deferred, and will not need to do a read (i.e. not GANG or
775	 * DEDUP) can be processed immediately.  Otherwise, put them on the
776	 * in-memory list for later processing.
777	 */
778	if (zfs_trim_enabled || BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
779	    txg != spa->spa_syncing_txg ||
780	    spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
781		bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
782	} else {
783		VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp,
784		    BP_GET_PSIZE(bp), 0)));
785	}
786}
787
788zio_t *
789zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
790    uint64_t size, enum zio_flag flags)
791{
792	zio_t *zio;
793	enum zio_stage stage = ZIO_FREE_PIPELINE;
794
795	dprintf_bp(bp, "freeing in txg %llu, pass %u",
796	    (longlong_t)txg, spa->spa_sync_pass);
797
798	ASSERT(!BP_IS_HOLE(bp));
799	ASSERT(spa_syncing_txg(spa) == txg);
800	ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);
801
802	metaslab_check_free(spa, bp);
803	arc_freed(spa, bp);
804
805	if (zfs_trim_enabled)
806		stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START |
807		    ZIO_STAGE_VDEV_IO_ASSESS;
808	/*
809	 * GANG and DEDUP blocks can induce a read (for the gang block header,
810	 * or the DDT), so issue them asynchronously so that this thread is
811	 * not tied up.
812	 */
813	else if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
814		stage |= ZIO_STAGE_ISSUE_ASYNC;
815
816	zio = zio_create(pio, spa, txg, bp, NULL, size,
817	    NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
818	    NULL, 0, NULL, ZIO_STAGE_OPEN, stage);
819
820	return (zio);
821}
822
823zio_t *
824zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
825    zio_done_func_t *done, void *private, enum zio_flag flags)
826{
827	zio_t *zio;
828
829	/*
830	 * A claim is an allocation of a specific block.  Claims are needed
831	 * to support immediate writes in the intent log.  The issue is that
832	 * immediate writes contain committed data, but in a txg that was
833	 * *not* committed.  Upon opening the pool after an unclean shutdown,
834	 * the intent log claims all blocks that contain immediate write data
835	 * so that the SPA knows they're in use.
836	 *
837	 * All claims *must* be resolved in the first txg -- before the SPA
838	 * starts allocating blocks -- so that nothing is allocated twice.
839	 * If txg == 0 we just verify that the block is claimable.
840	 */
841	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
842	ASSERT(txg == spa_first_txg(spa) || txg == 0);
843	ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));	/* zdb(1M) */
844
845	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
846	    done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
847	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
848
849	return (zio);
850}
851
852zio_t *
853zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset,
854    uint64_t size, zio_done_func_t *done, void *private, int priority,
855    enum zio_flag flags)
856{
857	zio_t *zio;
858	int c;
859
860	if (vd->vdev_children == 0) {
861		zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private,
862		    ZIO_TYPE_IOCTL, priority, flags, vd, offset, NULL,
863		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
864
865		zio->io_cmd = cmd;
866	} else {
867		zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
868
869		for (c = 0; c < vd->vdev_children; c++)
870			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
871			    offset, size, done, private, priority, flags));
872	}
873
874	return (zio);
875}
876
877zio_t *
878zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
879    void *data, int checksum, zio_done_func_t *done, void *private,
880    int priority, enum zio_flag flags, boolean_t labels)
881{
882	zio_t *zio;
883
884	ASSERT(vd->vdev_children == 0);
885	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
886	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
887	ASSERT3U(offset + size, <=, vd->vdev_psize);
888
889	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
890	    ZIO_TYPE_READ, priority, flags, vd, offset, NULL,
891	    ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
892
893	zio->io_prop.zp_checksum = checksum;
894
895	return (zio);
896}
897
898zio_t *
899zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
900    void *data, int checksum, zio_done_func_t *done, void *private,
901    int priority, enum zio_flag flags, boolean_t labels)
902{
903	zio_t *zio;
904
905	ASSERT(vd->vdev_children == 0);
906	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
907	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
908	ASSERT3U(offset + size, <=, vd->vdev_psize);
909
910	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
911	    ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL,
912	    ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
913
914	zio->io_prop.zp_checksum = checksum;
915
916	if (zio_checksum_table[checksum].ci_eck) {
917		/*
918		 * zec checksums are necessarily destructive -- they modify
919		 * the end of the write buffer to hold the verifier/checksum.
920		 * Therefore, we must make a local copy in case the data is
921		 * being written to multiple places in parallel.
922		 */
923		void *wbuf = zio_buf_alloc(size);
924		bcopy(data, wbuf, size);
925		zio_push_transform(zio, wbuf, size, size, NULL);
926	}
927
928	return (zio);
929}
930
931/*
932 * Create a child I/O to do some work for us.
933 */
934zio_t *
935zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
936	void *data, uint64_t size, int type, int priority, enum zio_flag flags,
937	zio_done_func_t *done, void *private)
938{
939	enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
940	zio_t *zio;
941
942	ASSERT(vd->vdev_parent ==
943	    (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));
944
945	if (type == ZIO_TYPE_READ && bp != NULL) {
946		/*
947		 * If we have the bp, then the child should perform the
948		 * checksum and the parent need not.  This pushes error
949		 * detection as close to the leaves as possible and
950		 * eliminates redundant checksums in the interior nodes.
951		 */
952		pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
953		pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
954	}
955
956	if (vd->vdev_children == 0)
957		offset += VDEV_LABEL_START_SIZE;
958
959	flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;
960
961	/*
962	 * If we've decided to do a repair, the write is not speculative --
963	 * even if the original read was.
964	 */
965	if (flags & ZIO_FLAG_IO_REPAIR)
966		flags &= ~ZIO_FLAG_SPECULATIVE;
967
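	/*
	 * Start the child at the stage just before VDEV_IO_START
	 * (ZIO_STAGE_VDEV_IO_START >> 1) so that zio_execute() advances it
	 * directly into the vdev I/O stages of the child pipeline.
	 */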
968	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
969	    done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
970	    ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
971
972	return (zio);
973}
974
975zio_t *
976zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
977	int type, int priority, enum zio_flag flags,
978	zio_done_func_t *done, void *private)
979{
980	zio_t *zio;
981
982	ASSERT(vd->vdev_ops->vdev_op_leaf);
983
984	zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
985	    data, size, done, private, type, priority,
986	    flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY,
987	    vd, offset, NULL,
988	    ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
989
990	return (zio);
991}
992
993void
994zio_flush(zio_t *zio, vdev_t *vd)
995{
996	zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0,
997	    NULL, NULL, ZIO_PRIORITY_NOW,
998	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
999}
1000
1001zio_t *
1002zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset, uint64_t size)
1003{
1004
1005	ASSERT(vd->vdev_ops->vdev_op_leaf);
1006
1007	return zio_ioctl(zio, spa, vd, DKIOCTRIM, offset, size,
1008	    NULL, NULL, ZIO_PRIORITY_TRIM,
1009	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY);
1010}
1011
1012void
1013zio_shrink(zio_t *zio, uint64_t size)
1014{
1015	ASSERT(zio->io_executor == NULL);
1016	ASSERT(zio->io_orig_size == zio->io_size);
1017	ASSERT(size <= zio->io_size);
1018
1019	/*
1020	 * We don't shrink for raidz because of problems with the
1021	 * reconstruction when reading back less than the block size.
1022	 * Note, BP_IS_RAIDZ() assumes no compression.
1023	 */
1024	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
1025	if (!BP_IS_RAIDZ(zio->io_bp))
1026		zio->io_orig_size = zio->io_size = size;
1027}
1028
1029/*
1030 * ==========================================================================
1031 * Prepare to read and write logical blocks
1032 * ==========================================================================
1033 */
1034
1035static int
1036zio_read_bp_init(zio_t *zio)
1037{
1038	blkptr_t *bp = zio->io_bp;
1039
1040	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
1041	    zio->io_child_type == ZIO_CHILD_LOGICAL &&
1042	    !(zio->io_flags & ZIO_FLAG_RAW)) {
1043		uint64_t psize = BP_GET_PSIZE(bp);
1044		void *cbuf = zio_buf_alloc(psize);
1045
1046		zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
1047	}
1048
1049	if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
1050		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
1051
1052	if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
1053		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
1054
1055	if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
1056		zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
1057
1058	return (ZIO_PIPELINE_CONTINUE);
1059}
1060
1061static int
1062zio_write_bp_init(zio_t *zio)
1063{
1064	spa_t *spa = zio->io_spa;
1065	zio_prop_t *zp = &zio->io_prop;
1066	enum zio_compress compress = zp->zp_compress;
1067	blkptr_t *bp = zio->io_bp;
1068	uint64_t lsize = zio->io_size;
1069	uint64_t psize = lsize;
1070	int pass = 1;
1071
1072	/*
1073	 * If our children haven't all reached the ready stage,
1074	 * wait for them and then repeat this pipeline stage.
1075	 */
1076	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
1077	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
1078		return (ZIO_PIPELINE_STOP);
1079
1080	if (!IO_IS_ALLOCATING(zio))
1081		return (ZIO_PIPELINE_CONTINUE);
1082
1083	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
1084
1085	if (zio->io_bp_override) {
1086		ASSERT(bp->blk_birth != zio->io_txg);
1087		ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
1088
1089		*bp = *zio->io_bp_override;
1090		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1091
1092		/*
1093		 * If we've been overridden and nopwrite is set then
1094		 * set the flag accordingly to indicate that a nopwrite
1095		 * has already occurred.
1096		 */
1097		if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
1098			ASSERT(!zp->zp_dedup);
1099			zio->io_flags |= ZIO_FLAG_NOPWRITE;
1100			return (ZIO_PIPELINE_CONTINUE);
1101		}
1102
1103		ASSERT(!zp->zp_nopwrite);
1104
1105		if (BP_IS_HOLE(bp) || !zp->zp_dedup)
1106			return (ZIO_PIPELINE_CONTINUE);
1107
1108		ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
1109		    zp->zp_dedup_verify);
1110
1111		if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
1112			BP_SET_DEDUP(bp, 1);
1113			zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
1114			return (ZIO_PIPELINE_CONTINUE);
1115		}
1116		zio->io_bp_override = NULL;
1117		BP_ZERO(bp);
1118	}
1119
1120	if (bp->blk_birth == zio->io_txg) {
1121		/*
1122		 * We're rewriting an existing block, which means we're
1123		 * working on behalf of spa_sync().  For spa_sync() to
1124		 * converge, it must eventually be the case that we don't
1125		 * have to allocate new blocks.  But compression changes
1126		 * the blocksize, which forces a reallocate, and makes
1127		 * convergence take longer.  Therefore, after the first
1128		 * few passes, stop compressing to ensure convergence.
1129		 */
1130		pass = spa_sync_pass(spa);
1131
1132		ASSERT(zio->io_txg == spa_syncing_txg(spa));
1133		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1134		ASSERT(!BP_GET_DEDUP(bp));
1135
1136		if (pass >= zfs_sync_pass_dont_compress)
1137			compress = ZIO_COMPRESS_OFF;
1138
1139		/* Make sure someone doesn't change their mind on overwrites */
1140		ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp),
1141		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
1142	}
1143
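	/*
	 * Try compressing into a temporary buffer.  If compression yields no
	 * savings (zio_compress_data() returns 0 or the full lsize), discard
	 * the buffer and write the block uncompressed; otherwise push the
	 * compressed buffer as a transform.
	 */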
1144	if (compress != ZIO_COMPRESS_OFF) {
1145		metaslab_class_t *mc = spa_normal_class(spa);
1146		void *cbuf = zio_buf_alloc(lsize);
1147		psize = zio_compress_data(compress, zio->io_data, cbuf, lsize,
1148		    (size_t)metaslab_class_get_minblocksize(mc));
1149		if (psize == 0 || psize == lsize) {
1150			compress = ZIO_COMPRESS_OFF;
1151			zio_buf_free(cbuf, lsize);
1152		} else {
1153			ASSERT(psize < lsize);
1154			zio_push_transform(zio, cbuf, psize, lsize, NULL);
1155		}
1156	}
1157
1158	/*
1159	 * The final pass of spa_sync() must be all rewrites, but the first
1160	 * few passes offer a trade-off: allocating blocks defers convergence,
1161	 * but newly allocated blocks are sequential, so they can be written
1162	 * to disk faster.  Therefore, we allow the first few passes of
1163	 * spa_sync() to allocate new blocks, but force rewrites after that.
1164	 * There should only be a handful of blocks after pass 1 in any case.
1165	 */
1166	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize &&
1167	    pass >= zfs_sync_pass_rewrite) {
1168		ASSERT(psize != 0);
1169		enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
1170		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
1171		zio->io_flags |= ZIO_FLAG_IO_REWRITE;
1172	} else {
1173		BP_ZERO(bp);
1174		zio->io_pipeline = ZIO_WRITE_PIPELINE;
1175	}
1176
1177	if (psize == 0) {
1178		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1179	} else {
1180		ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
1181		BP_SET_LSIZE(bp, lsize);
1182		BP_SET_PSIZE(bp, psize);
1183		BP_SET_COMPRESS(bp, compress);
1184		BP_SET_CHECKSUM(bp, zp->zp_checksum);
1185		BP_SET_TYPE(bp, zp->zp_type);
1186		BP_SET_LEVEL(bp, zp->zp_level);
1187		BP_SET_DEDUP(bp, zp->zp_dedup);
1188		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
1189		if (zp->zp_dedup) {
1190			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1191			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1192			zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
1193		}
1194		if (zp->zp_nopwrite) {
1195			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1196			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1197			zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
1198		}
1199	}
1200
1201	return (ZIO_PIPELINE_CONTINUE);
1202}
1203
1204static int
1205zio_free_bp_init(zio_t *zio)
1206{
1207	blkptr_t *bp = zio->io_bp;
1208
1209	if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
1210		if (BP_GET_DEDUP(bp))
1211			zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
1212	}
1213
1214	return (ZIO_PIPELINE_CONTINUE);
1215}
1216
1217/*
1218 * ==========================================================================
1219 * Execute the I/O pipeline
1220 * ==========================================================================
1221 */
1222
1223static void
1224zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline)
1225{
1226	spa_t *spa = zio->io_spa;
1227	zio_type_t t = zio->io_type;
1228	int flags = (cutinline ? TQ_FRONT : 0);
1229
1230	ASSERT(q == ZIO_TASKQ_ISSUE || q == ZIO_TASKQ_INTERRUPT);
1231
1232	/*
1233	 * If we're a config writer or a probe, the normal issue and
1234	 * interrupt threads may all be blocked waiting for the config lock.
1235	 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
1236	 */
1237	if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
1238		t = ZIO_TYPE_NULL;
1239
1240	/*
1241	 * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
1242	 */
1243	if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
1244		t = ZIO_TYPE_NULL;
1245
1246	/*
1247	 * If this is a high priority I/O, then use the high priority taskq.
1248	 */
1249	if (zio->io_priority == ZIO_PRIORITY_NOW &&
1250	    spa->spa_zio_taskq[t][q + 1] != NULL)
1251		q++;
1252
1253	ASSERT3U(q, <, ZIO_TASKQ_TYPES);
1254
1255	/*
1256	 * NB: We are assuming that the zio can only be dispatched
1257	 * to a single taskq at a time.  It would be a grievous error
1258	 * to dispatch the zio to another taskq at the same time.
1259	 */
1260#if defined(illumos) || !defined(_KERNEL)
1261	ASSERT(zio->io_tqent.tqent_next == NULL);
1262#else
1263	ASSERT(zio->io_tqent.tqent_task.ta_pending == 0);
1264#endif
1265	taskq_dispatch_ent(spa->spa_zio_taskq[t][q],
1266	    (task_func_t *)zio_execute, zio, flags, &zio->io_tqent);
1267}
1268
1269static boolean_t
1270zio_taskq_member(zio_t *zio, enum zio_taskq_type q)
1271{
1272	kthread_t *executor = zio->io_executor;
1273	spa_t *spa = zio->io_spa;
1274
1275	for (zio_type_t t = 0; t < ZIO_TYPES; t++)
1276		if (taskq_member(spa->spa_zio_taskq[t][q], executor))
1277			return (B_TRUE);
1278
1279	return (B_FALSE);
1280}
1281
1282static int
1283zio_issue_async(zio_t *zio)
1284{
1285	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
1286
1287	return (ZIO_PIPELINE_STOP);
1288}
1289
1290void
1291zio_interrupt(zio_t *zio)
1292{
1293	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
1294}
1295
1296/*
1297 * Execute the I/O pipeline until one of the following occurs:
1298 *
1299 *	(1) the I/O completes
1300 *	(2) the pipeline stalls waiting for dependent child I/Os
1301 *	(3) the I/O issues, so we're waiting for an I/O completion interrupt
1302 *	(4) the I/O is delegated by vdev-level caching or aggregation
1303 *	(5) the I/O is deferred due to vdev-level queueing
1304 *	(6) the I/O is handed off to another thread.
1305 *
1306 * In all cases, the pipeline stops whenever there's no CPU work; it never
1307 * burns a thread in cv_wait().
1308 *
1309 * There's no locking on io_stage because there's no legitimate way
1310 * for multiple threads to be attempting to process the same I/O.
1311 */
1312static zio_pipe_stage_t *zio_pipeline[];
1313
1314void
1315zio_execute(zio_t *zio)
1316{
1317	zio->io_executor = curthread;
1318
1319	while (zio->io_stage < ZIO_STAGE_DONE) {
1320		enum zio_stage pipeline = zio->io_pipeline;
1321		enum zio_stage stage = zio->io_stage;
1322		int rv;
1323
1324		ASSERT(!MUTEX_HELD(&zio->io_lock));
1325		ASSERT(ISP2(stage));
1326		ASSERT(zio->io_stall == NULL);
1327
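		/* Advance to the next stage enabled in this zio's pipeline. */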
1328		do {
1329			stage <<= 1;
1330		} while ((stage & pipeline) == 0);
1331
1332		ASSERT(stage <= ZIO_STAGE_DONE);
1333
1334		/*
1335		 * If we are in interrupt context and this pipeline stage
1336		 * will grab a config lock that is held across I/O,
1337		 * or may wait for an I/O that needs an interrupt thread
1338		 * to complete, issue async to avoid deadlock.
1339		 *
1340		 * For VDEV_IO_START, we cut in line so that the io will
1341		 * be sent to disk promptly.
1342		 */
1343		if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
1344		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
1345			boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
1346			    zio_requeue_io_start_cut_in_line : B_FALSE;
1347			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
1348			return;
1349		}
1350
1351		zio->io_stage = stage;
1352		rv = zio_pipeline[highbit(stage) - 1](zio);
1353
1354		if (rv == ZIO_PIPELINE_STOP)
1355			return;
1356
1357		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
1358	}
1359}
1360
1361/*
1362 * ==========================================================================
1363 * Initiate I/O, either sync or async
1364 * ==========================================================================
1365 */
1366int
1367zio_wait(zio_t *zio)
1368{
1369	int error;
1370
1371	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
1372	ASSERT(zio->io_executor == NULL);
1373
1374	zio->io_waiter = curthread;
1375
1376	zio_execute(zio);
1377
1378	mutex_enter(&zio->io_lock);
1379	while (zio->io_executor != NULL)
1380		cv_wait(&zio->io_cv, &zio->io_lock);
1381	mutex_exit(&zio->io_lock);
1382
1383	error = zio->io_error;
1384	zio_destroy(zio);
1385
1386	return (error);
1387}
1388
1389void
1390zio_nowait(zio_t *zio)
1391{
1392	ASSERT(zio->io_executor == NULL);
1393
1394	if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
1395	    zio_unique_parent(zio) == NULL) {
1396		/*
1397		 * This is a logical async I/O with no parent to wait for it.
1398		 * We add it to the spa_async_root_zio "Godfather" I/O which
1399		 * will ensure they complete prior to unloading the pool.
1400		 */
1401		spa_t *spa = zio->io_spa;
1402
1403		zio_add_child(spa->spa_async_zio_root, zio);
1404	}
1405
1406	zio_execute(zio);
1407}
1408
1409/*
1410 * ==========================================================================
1411 * Reexecute or suspend/resume failed I/O
1412 * ==========================================================================
1413 */
1414
1415static void
1416zio_reexecute(zio_t *pio)
1417{
1418	zio_t *cio, *cio_next;
1419
1420	ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
1421	ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
1422	ASSERT(pio->io_gang_leader == NULL);
1423	ASSERT(pio->io_gang_tree == NULL);
1424
1425	pio->io_flags = pio->io_orig_flags;
1426	pio->io_stage = pio->io_orig_stage;
1427	pio->io_pipeline = pio->io_orig_pipeline;
1428	pio->io_reexecute = 0;
1429	pio->io_flags |= ZIO_FLAG_REEXECUTED;
1430	pio->io_error = 0;
1431	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
1432		pio->io_state[w] = 0;
1433	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
1434		pio->io_child_error[c] = 0;
1435
1436	if (IO_IS_ALLOCATING(pio))
1437		BP_ZERO(pio->io_bp);
1438
1439	/*
1440	 * As we reexecute pio's children, new children could be created.
1441	 * New children go to the head of pio's io_child_list, however,
1442	 * so we will (correctly) not reexecute them.  The key is that
1443	 * the remainder of pio's io_child_list, from 'cio_next' onward,
1444	 * cannot be affected by any side effects of reexecuting 'cio'.
1445	 */
1446	for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
1447		cio_next = zio_walk_children(pio);
1448		mutex_enter(&pio->io_lock);
1449		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
1450			pio->io_children[cio->io_child_type][w]++;
1451		mutex_exit(&pio->io_lock);
1452		zio_reexecute(cio);
1453	}
1454
1455	/*
1456	 * Now that all children have been reexecuted, execute the parent.
1457	 * We don't reexecute "The Godfather" I/O here as it's the
1458	 * responsibility of the caller to wait on him.
1459	 */
1460	if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
1461		zio_execute(pio);
1462}
1463
1464void
1465zio_suspend(spa_t *spa, zio_t *zio)
1466{
1467	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
1468		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
1469		    "failure and the failure mode property for this pool "
1470		    "is set to panic.", spa_name(spa));
1471
1472	zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);
1473
1474	mutex_enter(&spa->spa_suspend_lock);
1475
1476	if (spa->spa_suspend_zio_root == NULL)
1477		spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
1478		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
1479		    ZIO_FLAG_GODFATHER);
1480
1481	spa->spa_suspended = B_TRUE;
1482
1483	if (zio != NULL) {
1484		ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
1485		ASSERT(zio != spa->spa_suspend_zio_root);
1486		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1487		ASSERT(zio_unique_parent(zio) == NULL);
1488		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
1489		zio_add_child(spa->spa_suspend_zio_root, zio);
1490	}
1491
1492	mutex_exit(&spa->spa_suspend_lock);
1493}
1494
1495int
1496zio_resume(spa_t *spa)
1497{
1498	zio_t *pio;
1499
1500	/*
1501	 * Reexecute all previously suspended i/o.
1502	 */
1503	mutex_enter(&spa->spa_suspend_lock);
1504	spa->spa_suspended = B_FALSE;
1505	cv_broadcast(&spa->spa_suspend_cv);
1506	pio = spa->spa_suspend_zio_root;
1507	spa->spa_suspend_zio_root = NULL;
1508	mutex_exit(&spa->spa_suspend_lock);
1509
1510	if (pio == NULL)
1511		return (0);
1512
1513	zio_reexecute(pio);
1514	return (zio_wait(pio));
1515}
1516
1517void
1518zio_resume_wait(spa_t *spa)
1519{
1520	mutex_enter(&spa->spa_suspend_lock);
1521	while (spa_suspended(spa))
1522		cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
1523	mutex_exit(&spa->spa_suspend_lock);
1524}
1525
1526/*
1527 * ==========================================================================
1528 * Gang blocks.
1529 *
1530 * A gang block is a collection of small blocks that looks to the DMU
1531 * like one large block.  When zio_dva_allocate() cannot find a block
1532 * of the requested size, due to either severe fragmentation or the pool
1533 * being nearly full, it calls zio_write_gang_block() to construct the
1534 * block from smaller fragments.
1535 *
1536 * A gang block consists of a gang header (zio_gbh_phys_t) and up to
1537 * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
1538 * an indirect block: it's an array of block pointers.  It consumes
1539 * only one sector and hence is allocatable regardless of fragmentation.
1540 * The gang header's bps point to its gang members, which hold the data.
1541 *
1542 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
1543 * as the verifier to ensure uniqueness of the SHA256 checksum.
1544 * Critically, the gang block bp's blk_cksum is the checksum of the data,
1545 * not the gang header.  This ensures that data block signatures (needed for
1546 * deduplication) are independent of how the block is physically stored.
1547 *
1548 * Gang blocks can be nested: a gang member may itself be a gang block.
1549 * Thus every gang block is a tree in which root and all interior nodes are
1550 * gang headers, and the leaves are normal blocks that contain user data.
1551 * The root of the gang tree is called the gang leader.
1552 *
1553 * To perform any operation (read, rewrite, free, claim) on a gang block,
1554 * zio_gang_assemble() first assembles the gang tree (minus data leaves)
1555 * in the io_gang_tree field of the original logical i/o by recursively
1556 * reading the gang leader and all gang headers below it.  This yields
1557 * an in-core tree containing the contents of every gang header and the
1558 * bps for every constituent of the gang block.
1559 *
1560 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
1561 * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
1562 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
1563 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
1564 * zio_read_gang() is a wrapper around zio_read() that omits reading gang
1565 * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
1566 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
1567 * of the gang header plus zio_checksum_compute() of the data to update the
1568 * gang header's blk_cksum as described above.
1569 *
1570 * The two-phase assemble/issue model solves the problem of partial failure --
1571 * what if you'd freed part of a gang block but then couldn't read the
1572 * gang header for another part?  Assembling the entire gang tree first
1573 * ensures that all the necessary gang header I/O has succeeded before
1574 * starting the actual work of free, claim, or write.  Once the gang tree
1575 * is assembled, free and claim are in-memory operations that cannot fail.
1576 *
1577 * In the event that a gang write fails, zio_dva_unallocate() walks the
1578 * gang tree to immediately free (i.e. insert back into the space map)
1579 * everything we've allocated.  This ensures that we don't get ENOSPC
1580 * errors during repeated suspend/resume cycles due to a flaky device.
1581 *
1582 * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
1583 * the gang tree, we won't modify the block, so we can safely defer the free
1584 * (knowing that the block is still intact).  If we *can* assemble the gang
1585 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
1586 * each constituent bp and we can allocate a new block on the next sync pass.
1587 *
1588 * In all cases, the gang tree allows complete recovery from partial failure.
1589 * ==========================================================================
1590 */
1591
1592static zio_t *
1593zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1594{
1595	if (gn != NULL)
1596		return (pio);
1597
1598	return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
1599	    NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
1600	    &pio->io_bookmark));
1601}
1602
1603zio_t *
1604zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1605{
1606	zio_t *zio;
1607
1608	if (gn != NULL) {
1609		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1610		    gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
1611		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1612		/*
1613		 * As we rewrite each gang header, the pipeline will compute
1614		 * a new gang block header checksum for it; but no one will
1615		 * compute a new data checksum, so we do that here.  The one
1616		 * exception is the gang leader: the pipeline already computed
1617		 * its data checksum because that stage precedes gang assembly.
1618		 * (Presently, nothing actually uses interior data checksums;
1619		 * this is just good hygiene.)
1620		 */
1621		if (gn != pio->io_gang_leader->io_gang_tree) {
1622			zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
1623			    data, BP_GET_PSIZE(bp));
1624		}
1625		/*
1626		 * If we are here to damage data for testing purposes,
1627		 * leave the GBH alone so that we can detect the damage.
1628		 */
1629		if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
1630			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
1631	} else {
1632		zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1633		    data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
1634		    ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1635	}
1636
1637	return (zio);
1638}
1639
1640/* ARGSUSED */
1641zio_t *
1642zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1643{
1644	return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
1645	    BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp),
1646	    ZIO_GANG_CHILD_FLAGS(pio)));
1647}
1648
1649/* ARGSUSED */
1650zio_t *
1651zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1652{
1653	return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
1654	    NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
1655}
1656
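/*
 * Per-I/O-type callbacks used by zio_gang_tree_issue(), indexed by io_type.
 * Null and ioctl I/Os never operate on gang blocks, so those slots are NULL.
 */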
1657static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
1658	NULL,
1659	zio_read_gang,
1660	zio_rewrite_gang,
1661	zio_free_gang,
1662	zio_claim_gang,
1663	NULL
1664};
1665
1666static void zio_gang_tree_assemble_done(zio_t *zio);
1667
1668static zio_gang_node_t *
1669zio_gang_node_alloc(zio_gang_node_t **gnpp)
1670{
1671	zio_gang_node_t *gn;
1672
1673	ASSERT(*gnpp == NULL);
1674
1675	gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
1676	gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
1677	*gnpp = gn;
1678
1679	return (gn);
1680}
1681
1682static void
1683zio_gang_node_free(zio_gang_node_t **gnpp)
1684{
1685	zio_gang_node_t *gn = *gnpp;
1686
1687	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
1688		ASSERT(gn->gn_child[g] == NULL);
1689
1690	zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
1691	kmem_free(gn, sizeof (*gn));
1692	*gnpp = NULL;
1693}
1694
1695static void
1696zio_gang_tree_free(zio_gang_node_t **gnpp)
1697{
1698	zio_gang_node_t *gn = *gnpp;
1699
1700	if (gn == NULL)
1701		return;
1702
1703	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
1704		zio_gang_tree_free(&gn->gn_child[g]);
1705
1706	zio_gang_node_free(gnpp);
1707}
1708
1709static void
1710zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
1711{
1712	zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
1713
1714	ASSERT(gio->io_gang_leader == gio);
1715	ASSERT(BP_IS_GANG(bp));
1716
1717	zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
1718	    SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
1719	    gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
1720}
1721
1722static void
1723zio_gang_tree_assemble_done(zio_t *zio)
1724{
1725	zio_t *gio = zio->io_gang_leader;
1726	zio_gang_node_t *gn = zio->io_private;
1727	blkptr_t *bp = zio->io_bp;
1728
1729	ASSERT(gio == zio_unique_parent(zio));
1730	ASSERT(zio->io_child_count == 0);
1731
1732	if (zio->io_error)
1733		return;
1734
1735	if (BP_SHOULD_BYTESWAP(bp))
1736		byteswap_uint64_array(zio->io_data, zio->io_size);
1737
1738	ASSERT(zio->io_data == gn->gn_gbh);
1739	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
1740	ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
1741
1742	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1743		blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
1744		if (!BP_IS_GANG(gbp))
1745			continue;
1746		zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
1747	}
1748}
1749
1750static void
1751zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
1752{
1753	zio_t *gio = pio->io_gang_leader;
1754	zio_t *zio;
1755
1756	ASSERT(BP_IS_GANG(bp) == !!gn);
1757	ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
1758	ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);
1759
1760	/*
1761	 * If you're a gang header, your data is in gn->gn_gbh.
1762	 * If you're a gang member, your data is in 'data' and gn == NULL.
1763	 */
1764	zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);
1765
1766	if (gn != NULL) {
1767		ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
1768
1769		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1770			blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
1771			if (BP_IS_HOLE(gbp))
1772				continue;
1773			zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
1774			data = (char *)data + BP_GET_PSIZE(gbp);
1775		}
1776	}
1777
1778	if (gn == gio->io_gang_tree && gio->io_data != NULL)
1779		ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);
1780
1781	if (zio != pio)
1782		zio_nowait(zio);
1783}
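
/*
 * Note that 'data' acts as a cursor in the walk above: every non-hole gang
 * member consumes BP_GET_PSIZE() bytes of the caller's buffer, so once the
 * walk of the tree root completes the cursor must have advanced exactly
 * io_size bytes (the ASSERT3P above).  Gang headers consume no caller
 * data; their payload lives in gn->gn_gbh.
 */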
1784
1785static int
1786zio_gang_assemble(zio_t *zio)
1787{
1788	blkptr_t *bp = zio->io_bp;
1789
1790	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
1791	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
1792
1793	zio->io_gang_leader = zio;
1794
1795	zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
1796
1797	return (ZIO_PIPELINE_CONTINUE);
1798}
1799
1800static int
1801zio_gang_issue(zio_t *zio)
1802{
1803	blkptr_t *bp = zio->io_bp;
1804
1805	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
1806		return (ZIO_PIPELINE_STOP);
1807
1808	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
1809	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
1810
1811	if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
1812		zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
1813	else
1814		zio_gang_tree_free(&zio->io_gang_tree);
1815
1816	zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1817
1818	return (ZIO_PIPELINE_CONTINUE);
1819}
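
/*
 * Gang I/O is therefore two-phase: zio_gang_assemble() reads every gang
 * header into an in-core tree of zio_gang_node_t's, and once those gang
 * children complete, zio_gang_issue() replays the tree using the leader's
 * own I/O type.  If any header read failed, the tree is simply freed and
 * the gang child error is inherited later in zio_done().
 */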
1820
1821static void
1822zio_write_gang_member_ready(zio_t *zio)
1823{
1824	zio_t *pio = zio_unique_parent(zio);
1825	zio_t *gio = zio->io_gang_leader;
1826	dva_t *cdva = zio->io_bp->blk_dva;
1827	dva_t *pdva = pio->io_bp->blk_dva;
1828	uint64_t asize;
1829
1830	if (BP_IS_HOLE(zio->io_bp))
1831		return;
1832
1833	ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
1834
1835	ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
1836	ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
1837	ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
1838	ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
1839	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
1840
1841	mutex_enter(&pio->io_lock);
1842	for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
1843		ASSERT(DVA_GET_GANG(&pdva[d]));
1844		asize = DVA_GET_ASIZE(&pdva[d]);
1845		asize += DVA_GET_ASIZE(&cdva[d]);
1846		DVA_SET_ASIZE(&pdva[d], asize);
1847	}
1848	mutex_exit(&pio->io_lock);
1849}
1850
1851static int
1852zio_write_gang_block(zio_t *pio)
1853{
1854	spa_t *spa = pio->io_spa;
1855	blkptr_t *bp = pio->io_bp;
1856	zio_t *gio = pio->io_gang_leader;
1857	zio_t *zio;
1858	zio_gang_node_t *gn, **gnpp;
1859	zio_gbh_phys_t *gbh;
1860	uint64_t txg = pio->io_txg;
1861	uint64_t resid = pio->io_size;
1862	uint64_t lsize;
1863	int copies = gio->io_prop.zp_copies;
1864	int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
1865	zio_prop_t zp;
1866	int error;
1867
1868	error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
1869	    bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
1870	    METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
1871	if (error) {
1872		pio->io_error = error;
1873		return (ZIO_PIPELINE_CONTINUE);
1874	}
1875
1876	if (pio == gio) {
1877		gnpp = &gio->io_gang_tree;
1878	} else {
1879		gnpp = pio->io_private;
1880		ASSERT(pio->io_ready == zio_write_gang_member_ready);
1881	}
1882
1883	gn = zio_gang_node_alloc(gnpp);
1884	gbh = gn->gn_gbh;
1885	bzero(gbh, SPA_GANGBLOCKSIZE);
1886
1887	/*
1888	 * Create the gang header.
1889	 */
1890	zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
1891	    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1892
1893	/*
1894	 * Create and nowait the gang children.
1895	 */
1896	for (int g = 0; resid != 0; resid -= lsize, g++) {
1897		lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
1898		    SPA_MINBLOCKSIZE);
1899		ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
1900
1901		zp.zp_checksum = gio->io_prop.zp_checksum;
1902		zp.zp_compress = ZIO_COMPRESS_OFF;
1903		zp.zp_type = DMU_OT_NONE;
1904		zp.zp_level = 0;
1905		zp.zp_copies = gio->io_prop.zp_copies;
1906		zp.zp_dedup = B_FALSE;
1907		zp.zp_dedup_verify = B_FALSE;
1908		zp.zp_nopwrite = B_FALSE;
1909
1910		zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
1911		    (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
1912		    zio_write_gang_member_ready, NULL, &gn->gn_child[g],
1913		    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
1914		    &pio->io_bookmark));
1915	}
1916
1917	/*
1918	 * Set pio's pipeline to just wait for zio to finish.
1919	 */
1920	pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1921
1922	zio_nowait(zio);
1923
1924	return (ZIO_PIPELINE_CONTINUE);
1925}
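
/*
 * A worked example of the carve-up above, assuming the usual one-sector
 * gang header with SPA_GBH_NBLKPTRS == 3 and SPA_MINBLOCKSIZE == 512
 * (illustrative values, not asserted here):
 *
 *	resid = 96K:	g=0  lsize = P2ROUNDUP(96K/3, 512) = 32K, resid -> 64K
 *			g=1  lsize = P2ROUNDUP(64K/2, 512) = 32K, resid -> 32K
 *			g=2  lsize = P2ROUNDUP(32K/1, 512) = 32K, resid -> 0
 *
 * That is, the remaining data is split as evenly as possible across the
 * remaining gang slots, rounded up to the minimum block size.  Any member
 * write may itself gang again if its own allocation fails.
 */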
1926
1927/*
1928 * The zio_nop_write stage in the pipeline determines if allocating
1929 * a new bp is necessary.  By leveraging a cryptographically secure checksum,
1930 * such as SHA256, we can compare the checksums of the new data and the old
1931 * to determine if allocating a new block is required.  The nopwrite
1932 * feature can handle writes in either syncing or open context (i.e. zil
1933 * writes) and as a result is mutually exclusive with dedup.
1934 */
1935static int
1936zio_nop_write(zio_t *zio)
1937{
1938	blkptr_t *bp = zio->io_bp;
1939	blkptr_t *bp_orig = &zio->io_bp_orig;
1940	zio_prop_t *zp = &zio->io_prop;
1941
1942	ASSERT(BP_GET_LEVEL(bp) == 0);
1943	ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1944	ASSERT(zp->zp_nopwrite);
1945	ASSERT(!zp->zp_dedup);
1946	ASSERT(zio->io_bp_override == NULL);
1947	ASSERT(IO_IS_ALLOCATING(zio));
1948
1949	/*
1950	 * Check to see if the original bp and the new bp have matching
1951	 * characteristics (i.e. same checksum, compression algorithms, etc).
1952	 * If they don't then just continue with the pipeline which will
1953	 * allocate a new bp.
1954	 */
1955	if (BP_IS_HOLE(bp_orig) ||
1956	    !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup ||
1957	    BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
1958	    BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
1959	    BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
1960	    zp->zp_copies != BP_GET_NDVAS(bp_orig))
1961		return (ZIO_PIPELINE_CONTINUE);
1962
1963	/*
1964	 * If the checksums match then reset the pipeline so that we
1965	 * avoid allocating a new bp and issuing any I/O.
1966	 */
1967	if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
1968		ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup);
1969		ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
1970		ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
1971		ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
1972		ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
1973		    sizeof (uint64_t)) == 0);
1974
1975		*bp = *bp_orig;
1976		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1977		zio->io_flags |= ZIO_FLAG_NOPWRITE;
1978	}
1979
1980	return (ZIO_PIPELINE_CONTINUE);
1981}
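
/*
 * In short, a nopwrite is taken only when the existing bp already describes
 * the data being written: the checksum algorithm must be dedup-capable
 * (collision-resistant), the checksum, compression, dedup and copies
 * properties must match the original bp, and the newly computed checksum
 * must equal the old one.  For example, rewriting a block with
 * checksum=sha256 and unchanged contents passes every test above and the
 * write collapses to the interlock pipeline with ZIO_FLAG_NOPWRITE set;
 * change any property or the data itself and the pipeline falls through to
 * a normal allocating write.
 */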
1982
1983/*
1984 * ==========================================================================
1985 * Dedup
1986 * ==========================================================================
1987 */
1988static void
1989zio_ddt_child_read_done(zio_t *zio)
1990{
1991	blkptr_t *bp = zio->io_bp;
1992	ddt_entry_t *dde = zio->io_private;
1993	ddt_phys_t *ddp;
1994	zio_t *pio = zio_unique_parent(zio);
1995
1996	mutex_enter(&pio->io_lock);
1997	ddp = ddt_phys_select(dde, bp);
1998	if (zio->io_error == 0)
1999		ddt_phys_clear(ddp);	/* this ddp doesn't need repair */
2000	if (zio->io_error == 0 && dde->dde_repair_data == NULL)
2001		dde->dde_repair_data = zio->io_data;
2002	else
2003		zio_buf_free(zio->io_data, zio->io_size);
2004	mutex_exit(&pio->io_lock);
2005}
2006
2007static int
2008zio_ddt_read_start(zio_t *zio)
2009{
2010	blkptr_t *bp = zio->io_bp;
2011
2012	ASSERT(BP_GET_DEDUP(bp));
2013	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
2014	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2015
2016	if (zio->io_child_error[ZIO_CHILD_DDT]) {
2017		ddt_t *ddt = ddt_select(zio->io_spa, bp);
2018		ddt_entry_t *dde = ddt_repair_start(ddt, bp);
2019		ddt_phys_t *ddp = dde->dde_phys;
2020		ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
2021		blkptr_t blk;
2022
2023		ASSERT(zio->io_vsd == NULL);
2024		zio->io_vsd = dde;
2025
2026		if (ddp_self == NULL)
2027			return (ZIO_PIPELINE_CONTINUE);
2028
2029		for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
2030			if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
2031				continue;
2032			ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
2033			    &blk);
2034			zio_nowait(zio_read(zio, zio->io_spa, &blk,
2035			    zio_buf_alloc(zio->io_size), zio->io_size,
2036			    zio_ddt_child_read_done, dde, zio->io_priority,
2037			    ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
2038			    &zio->io_bookmark));
2039		}
2040		return (ZIO_PIPELINE_CONTINUE);
2041	}
2042
2043	zio_nowait(zio_read(zio, zio->io_spa, bp,
2044	    zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
2045	    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
2046
2047	return (ZIO_PIPELINE_CONTINUE);
2048}
2049
2050static int
2051zio_ddt_read_done(zio_t *zio)
2052{
2053	blkptr_t *bp = zio->io_bp;
2054
2055	if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
2056		return (ZIO_PIPELINE_STOP);
2057
2058	ASSERT(BP_GET_DEDUP(bp));
2059	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
2060	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2061
2062	if (zio->io_child_error[ZIO_CHILD_DDT]) {
2063		ddt_t *ddt = ddt_select(zio->io_spa, bp);
2064		ddt_entry_t *dde = zio->io_vsd;
2065		if (ddt == NULL) {
2066			ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
2067			return (ZIO_PIPELINE_CONTINUE);
2068		}
2069		if (dde == NULL) {
2070			zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
2071			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
2072			return (ZIO_PIPELINE_STOP);
2073		}
2074		if (dde->dde_repair_data != NULL) {
2075			bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
2076			zio->io_child_error[ZIO_CHILD_DDT] = 0;
2077		}
2078		ddt_repair_done(ddt, dde);
2079		zio->io_vsd = NULL;
2080	}
2081
2082	ASSERT(zio->io_vsd == NULL);
2083
2084	return (ZIO_PIPELINE_CONTINUE);
2085}
2086
2087static boolean_t
2088zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
2089{
2090	spa_t *spa = zio->io_spa;
2091
2092	/*
2093	 * Note: we compare the original data, not the transformed data,
2094	 * because when zio->io_bp is an override bp, we will not have
2095	 * pushed the I/O transforms.  That's an important optimization
2096	 * because otherwise we'd compress/encrypt all dmu_sync() data twice.
2097	 */
2098	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
2099		zio_t *lio = dde->dde_lead_zio[p];
2100
2101		if (lio != NULL) {
2102			return (lio->io_orig_size != zio->io_orig_size ||
2103			    bcmp(zio->io_orig_data, lio->io_orig_data,
2104			    zio->io_orig_size) != 0);
2105		}
2106	}
2107
2108	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
2109		ddt_phys_t *ddp = &dde->dde_phys[p];
2110
2111		if (ddp->ddp_phys_birth != 0) {
2112			arc_buf_t *abuf = NULL;
2113			uint32_t aflags = ARC_WAIT;
2114			blkptr_t blk = *zio->io_bp;
2115			int error;
2116
2117			ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
2118
2119			ddt_exit(ddt);
2120
2121			error = arc_read(NULL, spa, &blk,
2122			    arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
2123			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
2124			    &aflags, &zio->io_bookmark);
2125
2126			if (error == 0) {
2127				if (arc_buf_size(abuf) != zio->io_orig_size ||
2128				    bcmp(abuf->b_data, zio->io_orig_data,
2129				    zio->io_orig_size) != 0)
2130					error = SET_ERROR(EEXIST);
2131				VERIFY(arc_buf_remove_ref(abuf, &abuf));
2132			}
2133
2134			ddt_enter(ddt);
2135			return (error != 0);
2136		}
2137	}
2138
2139	return (B_FALSE);
2140}
2141
2142static void
2143zio_ddt_child_write_ready(zio_t *zio)
2144{
2145	int p = zio->io_prop.zp_copies;
2146	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
2147	ddt_entry_t *dde = zio->io_private;
2148	ddt_phys_t *ddp = &dde->dde_phys[p];
2149	zio_t *pio;
2150
2151	if (zio->io_error)
2152		return;
2153
2154	ddt_enter(ddt);
2155
2156	ASSERT(dde->dde_lead_zio[p] == zio);
2157
2158	ddt_phys_fill(ddp, zio->io_bp);
2159
2160	while ((pio = zio_walk_parents(zio)) != NULL)
2161		ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
2162
2163	ddt_exit(ddt);
2164}
2165
2166static void
2167zio_ddt_child_write_done(zio_t *zio)
2168{
2169	int p = zio->io_prop.zp_copies;
2170	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
2171	ddt_entry_t *dde = zio->io_private;
2172	ddt_phys_t *ddp = &dde->dde_phys[p];
2173
2174	ddt_enter(ddt);
2175
2176	ASSERT(ddp->ddp_refcnt == 0);
2177	ASSERT(dde->dde_lead_zio[p] == zio);
2178	dde->dde_lead_zio[p] = NULL;
2179
2180	if (zio->io_error == 0) {
2181		while (zio_walk_parents(zio) != NULL)
2182			ddt_phys_addref(ddp);
2183	} else {
2184		ddt_phys_clear(ddp);
2185	}
2186
2187	ddt_exit(ddt);
2188}
2189
2190static void
2191zio_ddt_ditto_write_done(zio_t *zio)
2192{
2193	int p = DDT_PHYS_DITTO;
2194	zio_prop_t *zp = &zio->io_prop;
2195	blkptr_t *bp = zio->io_bp;
2196	ddt_t *ddt = ddt_select(zio->io_spa, bp);
2197	ddt_entry_t *dde = zio->io_private;
2198	ddt_phys_t *ddp = &dde->dde_phys[p];
2199	ddt_key_t *ddk = &dde->dde_key;
2200
2201	ddt_enter(ddt);
2202
2203	ASSERT(ddp->ddp_refcnt == 0);
2204	ASSERT(dde->dde_lead_zio[p] == zio);
2205	dde->dde_lead_zio[p] = NULL;
2206
2207	if (zio->io_error == 0) {
2208		ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
2209		ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
2210		ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
2211		if (ddp->ddp_phys_birth != 0)
2212			ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
2213		ddt_phys_fill(ddp, bp);
2214	}
2215
2216	ddt_exit(ddt);
2217}
2218
2219static int
2220zio_ddt_write(zio_t *zio)
2221{
2222	spa_t *spa = zio->io_spa;
2223	blkptr_t *bp = zio->io_bp;
2224	uint64_t txg = zio->io_txg;
2225	zio_prop_t *zp = &zio->io_prop;
2226	int p = zp->zp_copies;
2227	int ditto_copies;
2228	zio_t *cio = NULL;
2229	zio_t *dio = NULL;
2230	ddt_t *ddt = ddt_select(spa, bp);
2231	ddt_entry_t *dde;
2232	ddt_phys_t *ddp;
2233
2234	ASSERT(BP_GET_DEDUP(bp));
2235	ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
2236	ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
2237
2238	ddt_enter(ddt);
2239	dde = ddt_lookup(ddt, bp, B_TRUE);
2240	ddp = &dde->dde_phys[p];
2241
2242	if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
2243		/*
2244		 * If we're using a weak checksum, upgrade to a strong checksum
2245		 * and try again.  If we're already using a strong checksum,
2246		 * we can't resolve it, so just convert to an ordinary write.
2247		 * (And automatically e-mail a paper to Nature?)
2248		 */
2249		if (!zio_checksum_table[zp->zp_checksum].ci_dedup) {
2250			zp->zp_checksum = spa_dedup_checksum(spa);
2251			zio_pop_transforms(zio);
2252			zio->io_stage = ZIO_STAGE_OPEN;
2253			BP_ZERO(bp);
2254		} else {
2255			zp->zp_dedup = B_FALSE;
2256		}
2257		zio->io_pipeline = ZIO_WRITE_PIPELINE;
2258		ddt_exit(ddt);
2259		return (ZIO_PIPELINE_CONTINUE);
2260	}
2261
2262	ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
2263	ASSERT(ditto_copies < SPA_DVAS_PER_BP);
2264
2265	if (ditto_copies > ddt_ditto_copies_present(dde) &&
2266	    dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
2267		zio_prop_t czp = *zp;
2268
2269		czp.zp_copies = ditto_copies;
2270
2271		/*
2272		 * If we arrived here with an override bp, we won't have run
2273		 * the transform stack, so we won't have the data we need to
2274		 * generate a child i/o.  So, toss the override bp and restart.
2275		 * This is safe, because using the override bp is just an
2276		 * optimization; and it's rare, so the cost doesn't matter.
2277		 */
2278		if (zio->io_bp_override) {
2279			zio_pop_transforms(zio);
2280			zio->io_stage = ZIO_STAGE_OPEN;
2281			zio->io_pipeline = ZIO_WRITE_PIPELINE;
2282			zio->io_bp_override = NULL;
2283			BP_ZERO(bp);
2284			ddt_exit(ddt);
2285			return (ZIO_PIPELINE_CONTINUE);
2286		}
2287
2288		dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
2289		    zio->io_orig_size, &czp, NULL,
2290		    zio_ddt_ditto_write_done, dde, zio->io_priority,
2291		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
2292
2293		zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
2294		dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
2295	}
2296
2297	if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
2298		if (ddp->ddp_phys_birth != 0)
2299			ddt_bp_fill(ddp, bp, txg);
2300		if (dde->dde_lead_zio[p] != NULL)
2301			zio_add_child(zio, dde->dde_lead_zio[p]);
2302		else
2303			ddt_phys_addref(ddp);
2304	} else if (zio->io_bp_override) {
2305		ASSERT(bp->blk_birth == txg);
2306		ASSERT(BP_EQUAL(bp, zio->io_bp_override));
2307		ddt_phys_fill(ddp, bp);
2308		ddt_phys_addref(ddp);
2309	} else {
2310		cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
2311		    zio->io_orig_size, zp, zio_ddt_child_write_ready,
2312		    zio_ddt_child_write_done, dde, zio->io_priority,
2313		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
2314
2315		zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
2316		dde->dde_lead_zio[p] = cio;
2317	}
2318
2319	ddt_exit(ddt);
2320
2321	if (cio)
2322		zio_nowait(cio);
2323	if (dio)
2324		zio_nowait(dio);
2325
2326	return (ZIO_PIPELINE_CONTINUE);
2327}
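
/*
 * To summarize the cases above: if the DDT entry already has a physical
 * birth (or another zio is already leading the write for this copies
 * class) we just take a reference or piggy-back on that leader; if we
 * arrived with an override bp the data was already written (e.g. by
 * dmu_sync()) and we only record it in the DDT; otherwise we issue a real
 * child write (cio) whose ready/done callbacks fill and reference the
 * ddt_phys_t.  A separate ditto write (dio) may be issued alongside any of
 * these when the entry has become popular enough to deserve extra copies.
 */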
2328
2329ddt_entry_t *freedde; /* for debugging */
2330
2331static int
2332zio_ddt_free(zio_t *zio)
2333{
2334	spa_t *spa = zio->io_spa;
2335	blkptr_t *bp = zio->io_bp;
2336	ddt_t *ddt = ddt_select(spa, bp);
2337	ddt_entry_t *dde;
2338	ddt_phys_t *ddp;
2339
2340	ASSERT(BP_GET_DEDUP(bp));
2341	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2342
2343	ddt_enter(ddt);
2344	freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
2345	ddp = ddt_phys_select(dde, bp);
2346	ddt_phys_decref(ddp);
2347	ddt_exit(ddt);
2348
2349	return (ZIO_PIPELINE_CONTINUE);
2350}
2351
2352/*
2353 * ==========================================================================
2354 * Allocate and free blocks
2355 * ==========================================================================
2356 */
2357static int
2358zio_dva_allocate(zio_t *zio)
2359{
2360	spa_t *spa = zio->io_spa;
2361	metaslab_class_t *mc = spa_normal_class(spa);
2362	blkptr_t *bp = zio->io_bp;
2363	int error;
2364	int flags = 0;
2365
2366	if (zio->io_gang_leader == NULL) {
2367		ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
2368		zio->io_gang_leader = zio;
2369	}
2370
2371	ASSERT(BP_IS_HOLE(bp));
2372	ASSERT0(BP_GET_NDVAS(bp));
2373	ASSERT3U(zio->io_prop.zp_copies, >, 0);
2374	ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
2375	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
2376
2377	/*
2378	 * The dump device does not support gang blocks so allocation on
2379	 * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
2380	 * the "fast" gang feature.
2381	 */
2382	flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
2383	flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
2384	    METASLAB_GANG_CHILD : 0;
2385	error = metaslab_alloc(spa, mc, zio->io_size, bp,
2386	    zio->io_prop.zp_copies, zio->io_txg, NULL, flags);
2387
2388	if (error) {
2389		spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
2390		    "size %llu, error %d", spa_name(spa), zio, zio->io_size,
2391		    error);
2392		if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
2393			return (zio_write_gang_block(zio));
2394		zio->io_error = error;
2395	}
2396
2397	return (ZIO_PIPELINE_CONTINUE);
2398}
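
/*
 * Note the ENOSPC fallback above: when the normal class cannot satisfy a
 * single contiguous allocation and the block is larger than
 * SPA_MINBLOCKSIZE, the write is retried as a gang block, which only needs
 * several smaller allocations plus a SPA_GANGBLOCKSIZE header.
 */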
2399
2400static int
2401zio_dva_free(zio_t *zio)
2402{
2403	metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
2404
2405	return (ZIO_PIPELINE_CONTINUE);
2406}
2407
2408static int
2409zio_dva_claim(zio_t *zio)
2410{
2411	int error;
2412
2413	error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
2414	if (error)
2415		zio->io_error = error;
2416
2417	return (ZIO_PIPELINE_CONTINUE);
2418}
2419
2420/*
2421 * Undo an allocation.  This is used by zio_done() when an I/O fails
2422 * and we want to give back the block we just allocated.
2423 * This handles both normal blocks and gang blocks.
2424 */
2425static void
2426zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
2427{
2428	ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
2429	ASSERT(zio->io_bp_override == NULL);
2430
2431	if (!BP_IS_HOLE(bp))
2432		metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
2433
2434	if (gn != NULL) {
2435		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
2436			zio_dva_unallocate(zio, gn->gn_child[g],
2437			    &gn->gn_gbh->zg_blkptr[g]);
2438		}
2439	}
2440}
2441
2442/*
2443 * Try to allocate an intent log block.  Return 0 on success, errno on failure.
2444 */
2445int
2446zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
2447    uint64_t size, boolean_t use_slog)
2448{
2449	int error = 1;
2450
2451	ASSERT(txg > spa_syncing_txg(spa));
2452
2453	/*
2454	 * ZIL blocks are always contiguous (i.e. not gang blocks) so we
2455	 * set the METASLAB_GANG_AVOID flag so that they don't "fast gang"
2456	 * when allocating them.
2457	 */
2458	if (use_slog) {
2459		error = metaslab_alloc(spa, spa_log_class(spa), size,
2460		    new_bp, 1, txg, old_bp,
2461		    METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
2462	}
2463
2464	if (error) {
2465		error = metaslab_alloc(spa, spa_normal_class(spa), size,
2466		    new_bp, 1, txg, old_bp,
2467		    METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
2468	}
2469
2470	if (error == 0) {
2471		BP_SET_LSIZE(new_bp, size);
2472		BP_SET_PSIZE(new_bp, size);
2473		BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
2474		BP_SET_CHECKSUM(new_bp,
2475		    spa_version(spa) >= SPA_VERSION_SLIM_ZIL
2476		    ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
2477		BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
2478		BP_SET_LEVEL(new_bp, 0);
2479		BP_SET_DEDUP(new_bp, 0);
2480		BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
2481	}
2482
2483	return (error);
2484}
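
/*
 * A minimal usage sketch (hypothetical caller, not taken from this file):
 *
 *	blkptr_t new_bp;
 *
 *	BP_ZERO(&new_bp);
 *	if (zio_alloc_zil(spa, txg, &new_bp, old_bp, size, use_slog) == 0) {
 *		... issue the log-block write described by new_bp, which now
 *		... carries its DVAs and the ZILOG/ZILOG2 checksum type
 *	} else {
 *		... fall back, e.g. force a txg wait instead of logging
 *	}
 *
 * The slog class is preferred when use_slog is set, with the normal class
 * as the fallback, exactly as coded above.
 */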
2485
2486/*
2487 * Free an intent log block.
2488 */
2489void
2490zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
2491{
2492	ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
2493	ASSERT(!BP_IS_GANG(bp));
2494
2495	zio_free(spa, txg, bp);
2496}
2497
2498/*
2499 * ==========================================================================
2500 * Read, write and delete to physical devices
2501 * ==========================================================================
2502 */
2503static int
2504zio_vdev_io_start(zio_t *zio)
2505{
2506	vdev_t *vd = zio->io_vd;
2507	uint64_t align;
2508	spa_t *spa = zio->io_spa;
2509
2510	ASSERT(zio->io_error == 0);
2511	ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
2512
2513	if (vd == NULL) {
2514		if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
2515			spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
2516
2517		/*
2518		 * The mirror_ops handle multiple DVAs in a single BP.
2519		 */
2520		return (vdev_mirror_ops.vdev_op_io_start(zio));
2521	}
2522
2523	if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE) {
2524		trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg);
2525		return (ZIO_PIPELINE_CONTINUE);
2526	}
2527
2528	/*
2529	 * We keep track of time-sensitive I/Os so that the scan thread
2530	 * can quickly react to certain workloads.  In particular, we care
2531	 * about non-scrubbing, top-level reads and writes with the following
2532	 * characteristics:
2533	 * 	- synchronous writes of user data to non-slog devices
2534	 *	- any reads of user data
2535	 * When these conditions are met, adjust the timestamp of spa_last_io
2536	 * which allows the scan thread to adjust its workload accordingly.
2537	 */
2538	if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
2539	    vd == vd->vdev_top && !vd->vdev_islog &&
2540	    zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
2541	    zio->io_txg != spa_syncing_txg(spa)) {
2542		uint64_t old = spa->spa_last_io;
2543		uint64_t new = ddi_get_lbolt64();
2544		if (old != new)
2545			(void) atomic_cas_64(&spa->spa_last_io, old, new);
2546	}
2547
2548	align = 1ULL << vd->vdev_top->vdev_ashift;
2549
2550	if (P2PHASE(zio->io_size, align) != 0) {
2551		uint64_t asize = P2ROUNDUP(zio->io_size, align);
2552		char *abuf = NULL;
2553		if (zio->io_type == ZIO_TYPE_READ ||
2554		    zio->io_type == ZIO_TYPE_WRITE)
2555			abuf = zio_buf_alloc(asize);
2556		ASSERT(vd == vd->vdev_top);
2557		if (zio->io_type == ZIO_TYPE_WRITE) {
2558			bcopy(zio->io_data, abuf, zio->io_size);
2559			bzero(abuf + zio->io_size, asize - zio->io_size);
2560		}
2561		zio_push_transform(zio, abuf, asize, abuf ? asize : 0,
2562		    zio_subblock);
2563	}
2564
2565	ASSERT(P2PHASE(zio->io_offset, align) == 0);
2566	ASSERT(P2PHASE(zio->io_size, align) == 0);
2567	VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa));
2568
2569	/*
2570	 * If this is a repair I/O, and there's no self-healing involved --
2571	 * that is, we're just resilvering what we expect to resilver --
2572	 * then don't do the I/O unless zio's txg is actually in vd's DTL.
2573	 * This prevents spurious resilvering with nested replication.
2574	 * For example, given a mirror of mirrors, (A+B)+(C+D), if only
2575	 * A is out of date, we'll read from C+D, then use the data to
2576	 * resilver A+B -- but we don't actually want to resilver B, just A.
2577	 * The top-level mirror has no way to know this, so instead we just
2578	 * discard unnecessary repairs as we work our way down the vdev tree.
2579	 * The same logic applies to any form of nested replication:
2580	 * ditto + mirror, RAID-Z + replacing, etc.  This covers them all.
2581	 */
2582	if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
2583	    !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
2584	    zio->io_txg != 0 &&	/* not a delegated i/o */
2585	    !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
2586		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
2587		zio_vdev_io_bypass(zio);
2588		return (ZIO_PIPELINE_CONTINUE);
2589	}
2590
2591	if (vd->vdev_ops->vdev_op_leaf &&
2592	    (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
2593
2594		if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
2595			return (ZIO_PIPELINE_CONTINUE);
2596
2597		if ((zio = vdev_queue_io(zio)) == NULL)
2598			return (ZIO_PIPELINE_STOP);
2599
2600		if (!vdev_accessible(vd, zio)) {
2601			zio->io_error = SET_ERROR(ENXIO);
2602			zio_interrupt(zio);
2603			return (ZIO_PIPELINE_STOP);
2604		}
2605	}
2606
2607	/*
2608	 * Note that we ignore repair writes for TRIM because they can conflict
2609	 * with normal writes. This isn't an issue because, by definition, we
2610	 * only repair blocks that aren't freed.
2611	 */
2612	if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_WRITE &&
2613	    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
2614		if (!trim_map_write_start(zio))
2615			return (ZIO_PIPELINE_STOP);
2616	}
2617
2618	return (vd->vdev_ops->vdev_op_io_start(zio));
2619}
2620
2621static int
2622zio_vdev_io_done(zio_t *zio)
2623{
2624	vdev_t *vd = zio->io_vd;
2625	vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
2626	boolean_t unexpected_error = B_FALSE;
2627
2628	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
2629		return (ZIO_PIPELINE_STOP);
2630
2631	ASSERT(zio->io_type == ZIO_TYPE_READ ||
2632	    zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE);
2633
2634	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
2635	    (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
2636
2637		if (zio->io_type == ZIO_TYPE_WRITE &&
2638		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR))
2639			trim_map_write_done(zio);
2640
2641		vdev_queue_io_done(zio);
2642
2643		if (zio->io_type == ZIO_TYPE_WRITE)
2644			vdev_cache_write(zio);
2645
2646		if (zio_injection_enabled && zio->io_error == 0)
2647			zio->io_error = zio_handle_device_injection(vd,
2648			    zio, EIO);
2649
2650		if (zio_injection_enabled && zio->io_error == 0)
2651			zio->io_error = zio_handle_label_injection(zio, EIO);
2652
2653		if (zio->io_error) {
2654			if (!vdev_accessible(vd, zio)) {
2655				zio->io_error = SET_ERROR(ENXIO);
2656			} else {
2657				unexpected_error = B_TRUE;
2658			}
2659		}
2660	}
2661
2662	ops->vdev_op_io_done(zio);
2663
2664	if (unexpected_error)
2665		VERIFY(vdev_probe(vd, zio) == NULL);
2666
2667	return (ZIO_PIPELINE_CONTINUE);
2668}
2669
2670/*
2671 * For non-raidz ZIOs, we can just copy aside the bad data read from the
2672 * disk, and use that to finish the checksum ereport later.
2673 */
2674static void
2675zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
2676    const void *good_buf)
2677{
2678	/* no processing needed */
2679	zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
2680}
2681
2682/*ARGSUSED*/
2683void
2684zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
2685{
2686	void *buf = zio_buf_alloc(zio->io_size);
2687
2688	bcopy(zio->io_data, buf, zio->io_size);
2689
2690	zcr->zcr_cbinfo = zio->io_size;
2691	zcr->zcr_cbdata = buf;
2692	zcr->zcr_finish = zio_vsd_default_cksum_finish;
2693	zcr->zcr_free = zio_buf_free;
2694}
2695
2696static int
2697zio_vdev_io_assess(zio_t *zio)
2698{
2699	vdev_t *vd = zio->io_vd;
2700
2701	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
2702		return (ZIO_PIPELINE_STOP);
2703
2704	if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
2705		spa_config_exit(zio->io_spa, SCL_ZIO, zio);
2706
2707	if (zio->io_vsd != NULL) {
2708		zio->io_vsd_ops->vsd_free(zio);
2709		zio->io_vsd = NULL;
2710	}
2711
2712	if (zio_injection_enabled && zio->io_error == 0)
2713		zio->io_error = zio_handle_fault_injection(zio, EIO);
2714
2715	if (zio->io_type == ZIO_TYPE_IOCTL && zio->io_cmd == DKIOCTRIM)
2716		switch (zio->io_error) {
2717		case 0:
2718			ZIO_TRIM_STAT_INCR(bytes, zio->io_size);
2719			ZIO_TRIM_STAT_BUMP(success);
2720			break;
2721		case EOPNOTSUPP:
2722			ZIO_TRIM_STAT_BUMP(unsupported);
2723			break;
2724		default:
2725			ZIO_TRIM_STAT_BUMP(failed);
2726			break;
2727		}
2728
2729	/*
2730	 * If the I/O failed, determine whether we should attempt to retry it.
2731	 *
2732	 * On retry, we cut in line in the issue queue, since we don't want
2733	 * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
2734	 */
2735	if (zio->io_error && vd == NULL &&
2736	    !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
2737		ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE));	/* not a leaf */
2738		ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS));	/* not a leaf */
2739		zio->io_error = 0;
2740		zio->io_flags |= ZIO_FLAG_IO_RETRY |
2741		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
2742		zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
2743		zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
2744		    zio_requeue_io_start_cut_in_line);
2745		return (ZIO_PIPELINE_STOP);
2746	}
2747
2748	/*
2749	 * If we got an error on a leaf device, convert it to ENXIO
2750	 * if the device is not accessible at all.
2751	 */
2752	if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
2753	    !vdev_accessible(vd, zio))
2754		zio->io_error = SET_ERROR(ENXIO);
2755
2756	/*
2757	 * If we can't write to an interior vdev (mirror or RAID-Z),
2758	 * set vdev_cant_write so that we stop trying to allocate from it.
2759	 */
2760	if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
2761	    vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
2762		vd->vdev_cant_write = B_TRUE;
2763	}
2764
2765	if (zio->io_error)
2766		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2767
2768	return (ZIO_PIPELINE_CONTINUE);
2769}
2770
2771void
2772zio_vdev_io_reissue(zio_t *zio)
2773{
2774	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
2775	ASSERT(zio->io_error == 0);
2776
2777	zio->io_stage >>= 1;
2778}
2779
2780void
2781zio_vdev_io_redone(zio_t *zio)
2782{
2783	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
2784
2785	zio->io_stage >>= 1;
2786}
2787
2788void
2789zio_vdev_io_bypass(zio_t *zio)
2790{
2791	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
2792	ASSERT(zio->io_error == 0);
2793
2794	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
2795	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
2796}
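
/*
 * The ">> 1" manipulations above and in zio_vdev_io_assess() rely on
 * pipeline stages being one-hot bits: zio_execute() (earlier in this file)
 * advances to the next stage whose bit is set in io_pipeline and is greater
 * than io_stage, so setting io_stage to (STAGE >> 1) makes STAGE itself the
 * next stage to run.  zio_vdev_io_reissue() and zio_vdev_io_redone() thus
 * re-run the vdev start/done stages, while zio_vdev_io_bypass() skips
 * straight to the assess stage.
 */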
2797
2798/*
2799 * ==========================================================================
2800 * Generate and verify checksums
2801 * ==========================================================================
2802 */
2803static int
2804zio_checksum_generate(zio_t *zio)
2805{
2806	blkptr_t *bp = zio->io_bp;
2807	enum zio_checksum checksum;
2808
2809	if (bp == NULL) {
2810		/*
2811		 * This is zio_write_phys().
2812		 * We're either generating a label checksum, or none at all.
2813		 */
2814		checksum = zio->io_prop.zp_checksum;
2815
2816		if (checksum == ZIO_CHECKSUM_OFF)
2817			return (ZIO_PIPELINE_CONTINUE);
2818
2819		ASSERT(checksum == ZIO_CHECKSUM_LABEL);
2820	} else {
2821		if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
2822			ASSERT(!IO_IS_ALLOCATING(zio));
2823			checksum = ZIO_CHECKSUM_GANG_HEADER;
2824		} else {
2825			checksum = BP_GET_CHECKSUM(bp);
2826		}
2827	}
2828
2829	zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);
2830
2831	return (ZIO_PIPELINE_CONTINUE);
2832}
2833
2834static int
2835zio_checksum_verify(zio_t *zio)
2836{
2837	zio_bad_cksum_t info;
2838	blkptr_t *bp = zio->io_bp;
2839	int error;
2840
2841	ASSERT(zio->io_vd != NULL);
2842
2843	if (bp == NULL) {
2844		/*
2845		 * This is zio_read_phys().
2846		 * We're either verifying a label checksum, or nothing at all.
2847		 */
2848		if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
2849			return (ZIO_PIPELINE_CONTINUE);
2850
2851		ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
2852	}
2853
2854	if ((error = zio_checksum_error(zio, &info)) != 0) {
2855		zio->io_error = error;
2856		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2857			zfs_ereport_start_checksum(zio->io_spa,
2858			    zio->io_vd, zio, zio->io_offset,
2859			    zio->io_size, NULL, &info);
2860		}
2861	}
2862
2863	return (ZIO_PIPELINE_CONTINUE);
2864}
2865
2866/*
2867 * Called by RAID-Z to ensure we don't compute the checksum twice.
2868 */
2869void
2870zio_checksum_verified(zio_t *zio)
2871{
2872	zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
2873}
2874
2875/*
2876 * ==========================================================================
2877 * Error rank.  Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
2878 * An error of 0 indicates success.  ENXIO indicates whole-device failure,
2879 * which may be transient (e.g. unplugged) or permanent.  ECKSUM and EIO
2880 * indicate errors that are specific to one I/O, and most likely permanent.
2881 * Any other error is presumed to be worse because we weren't expecting it.
2882 * ==========================================================================
2883 */
2884int
2885zio_worst_error(int e1, int e2)
2886{
2887	static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
2888	int r1, r2;
2889
2890	for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
2891		if (e1 == zio_error_rank[r1])
2892			break;
2893
2894	for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
2895		if (e2 == zio_error_rank[r2])
2896			break;
2897
2898	return (r1 > r2 ? e1 : e2);
2899}
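
/*
 * For example, zio_worst_error(ENXIO, ECKSUM) returns ECKSUM and
 * zio_worst_error(EIO, 0) returns EIO.  An errno that is not in the rank
 * table at all (say EINVAL) falls off the end of its loop and therefore
 * outranks everything, matching the "any other error is presumed to be
 * worse" rule above.
 */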
2900
2901/*
2902 * ==========================================================================
2903 * I/O completion
2904 * ==========================================================================
2905 */
2906static int
2907zio_ready(zio_t *zio)
2908{
2909	blkptr_t *bp = zio->io_bp;
2910	zio_t *pio, *pio_next;
2911
2912	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
2913	    zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
2914		return (ZIO_PIPELINE_STOP);
2915
2916	if (zio->io_ready) {
2917		ASSERT(IO_IS_ALLOCATING(zio));
2918		ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
2919		    (zio->io_flags & ZIO_FLAG_NOPWRITE));
2920		ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
2921
2922		zio->io_ready(zio);
2923	}
2924
2925	if (bp != NULL && bp != &zio->io_bp_copy)
2926		zio->io_bp_copy = *bp;
2927
2928	if (zio->io_error)
2929		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2930
2931	mutex_enter(&zio->io_lock);
2932	zio->io_state[ZIO_WAIT_READY] = 1;
2933	pio = zio_walk_parents(zio);
2934	mutex_exit(&zio->io_lock);
2935
2936	/*
2937	 * As we notify zio's parents, new parents could be added.
2938	 * New parents go to the head of zio's io_parent_list, however,
2939	 * so we will (correctly) not notify them.  The remainder of zio's
2940	 * io_parent_list, from 'pio_next' onward, cannot change because
2941	 * all parents must wait for us to be done before they can be done.
2942	 */
2943	for (; pio != NULL; pio = pio_next) {
2944		pio_next = zio_walk_parents(zio);
2945		zio_notify_parent(pio, zio, ZIO_WAIT_READY);
2946	}
2947
2948	if (zio->io_flags & ZIO_FLAG_NODATA) {
2949		if (BP_IS_GANG(bp)) {
2950			zio->io_flags &= ~ZIO_FLAG_NODATA;
2951		} else {
2952			ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE);
2953			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
2954		}
2955	}
2956
2957	if (zio_injection_enabled &&
2958	    zio->io_spa->spa_syncing_txg == zio->io_txg)
2959		zio_handle_ignored_writes(zio);
2960
2961	return (ZIO_PIPELINE_CONTINUE);
2962}
2963
2964static int
2965zio_done(zio_t *zio)
2966{
2967	spa_t *spa = zio->io_spa;
2968	zio_t *lio = zio->io_logical;
2969	blkptr_t *bp = zio->io_bp;
2970	vdev_t *vd = zio->io_vd;
2971	uint64_t psize = zio->io_size;
2972	zio_t *pio, *pio_next;
2973
2974	/*
2975	 * If our children haven't all completed,
2976	 * wait for them and then repeat this pipeline stage.
2977	 */
2978	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
2979	    zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
2980	    zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
2981	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
2982		return (ZIO_PIPELINE_STOP);
2983
2984	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
2985		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
2986			ASSERT(zio->io_children[c][w] == 0);
2987
2988	if (bp != NULL) {
2989		ASSERT(bp->blk_pad[0] == 0);
2990		ASSERT(bp->blk_pad[1] == 0);
2991		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
2992		    (bp == zio_unique_parent(zio)->io_bp));
2993		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
2994		    zio->io_bp_override == NULL &&
2995		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
2996			ASSERT(!BP_SHOULD_BYTESWAP(bp));
2997			ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp));
2998			ASSERT(BP_COUNT_GANG(bp) == 0 ||
2999			    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
3000		}
3001		if (zio->io_flags & ZIO_FLAG_NOPWRITE)
3002			VERIFY(BP_EQUAL(bp, &zio->io_bp_orig));
3003	}
3004
3005	/*
3006	 * If there were child vdev/gang/ddt errors, they apply to us now.
3007	 */
3008	zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
3009	zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
3010	zio_inherit_child_errors(zio, ZIO_CHILD_DDT);
3011
3012	/*
3013	 * If the I/O on the transformed data was successful, generate any
3014	 * checksum reports now while we still have the transformed data.
3015	 */
3016	if (zio->io_error == 0) {
3017		while (zio->io_cksum_report != NULL) {
3018			zio_cksum_report_t *zcr = zio->io_cksum_report;
3019			uint64_t align = zcr->zcr_align;
3020			uint64_t asize = P2ROUNDUP(psize, align);
3021			char *abuf = zio->io_data;
3022
3023			if (asize != psize) {
3024				abuf = zio_buf_alloc(asize);
3025				bcopy(zio->io_data, abuf, psize);
3026				bzero(abuf + psize, asize - psize);
3027			}
3028
3029			zio->io_cksum_report = zcr->zcr_next;
3030			zcr->zcr_next = NULL;
3031			zcr->zcr_finish(zcr, abuf);
3032			zfs_ereport_free_checksum(zcr);
3033
3034			if (asize != psize)
3035				zio_buf_free(abuf, asize);
3036		}
3037	}
3038
3039	zio_pop_transforms(zio);	/* note: may set zio->io_error */
3040
3041	vdev_stat_update(zio, psize);
3042
3043	if (zio->io_error) {
3044		/*
3045		 * If this I/O is attached to a particular vdev,
3046		 * generate an error message describing the I/O failure
3047		 * at the block level.  We ignore these errors if the
3048		 * device is currently unavailable.
3049		 */
3050		if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
3051			zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);
3052
3053		if ((zio->io_error == EIO || !(zio->io_flags &
3054		    (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
3055		    zio == lio) {
3056			/*
3057			 * For logical I/O requests, tell the SPA to log the
3058			 * error and generate a logical data ereport.
3059			 */
3060			spa_log_error(spa, zio);
3061			zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
3062			    0, 0);
3063		}
3064	}
3065
3066	if (zio->io_error && zio == lio) {
3067		/*
3068		 * Determine whether zio should be reexecuted.  This will
3069		 * propagate all the way to the root via zio_notify_parent().
3070		 */
3071		ASSERT(vd == NULL && bp != NULL);
3072		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
3073
3074		if (IO_IS_ALLOCATING(zio) &&
3075		    !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
3076			if (zio->io_error != ENOSPC)
3077				zio->io_reexecute |= ZIO_REEXECUTE_NOW;
3078			else
3079				zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
3080		}
3081
3082		if ((zio->io_type == ZIO_TYPE_READ ||
3083		    zio->io_type == ZIO_TYPE_FREE) &&
3084		    !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
3085		    zio->io_error == ENXIO &&
3086		    spa_load_state(spa) == SPA_LOAD_NONE &&
3087		    spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
3088			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
3089
3090		if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
3091			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
3092
3093		/*
3094		 * Here is a possibly good place to attempt to do
3095		 * either combinatorial reconstruction or error correction
3096		 * based on checksums.  It also might be a good place
3097		 * to send out preliminary ereports before we suspend
3098		 * processing.
3099		 */
3100	}
3101
3102	/*
3103	 * If there were logical child errors, they apply to us now.
3104	 * We defer this until now to avoid conflating logical child
3105	 * errors with errors that happened to the zio itself when
3106	 * updating vdev stats and reporting FMA events above.
3107	 */
3108	zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
3109
3110	if ((zio->io_error || zio->io_reexecute) &&
3111	    IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
3112	    !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
3113		zio_dva_unallocate(zio, zio->io_gang_tree, bp);
3114
3115	zio_gang_tree_free(&zio->io_gang_tree);
3116
3117	/*
3118	 * Godfather I/Os should never suspend.
3119	 */
3120	if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
3121	    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
3122		zio->io_reexecute = 0;
3123
3124	if (zio->io_reexecute) {
3125		/*
3126		 * This is a logical I/O that wants to reexecute.
3127		 *
3128		 * Reexecute is top-down.  When an i/o fails, if it's not
3129		 * the root, it simply notifies its parent and sticks around.
3130		 * The parent, seeing that it still has children in zio_done(),
3131		 * does the same.  This percolates all the way up to the root.
3132		 * The root i/o will reexecute or suspend the entire tree.
3133		 *
3134		 * This approach ensures that zio_reexecute() honors
3135		 * all the original i/o dependency relationships, e.g.
3136		 * parents not executing until children are ready.
3137		 */
3138		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
3139
3140		zio->io_gang_leader = NULL;
3141
3142		mutex_enter(&zio->io_lock);
3143		zio->io_state[ZIO_WAIT_DONE] = 1;
3144		mutex_exit(&zio->io_lock);
3145
3146		/*
3147		 * "The Godfather" I/O monitors its children but is
3148		 * not a true parent to them. It will track them through
3149		 * the pipeline but severs its ties whenever they get into
3150		 * trouble (e.g. suspended). This allows "The Godfather"
3151		 * I/O to return status without blocking.
3152		 */
3153		for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
3154			zio_link_t *zl = zio->io_walk_link;
3155			pio_next = zio_walk_parents(zio);
3156
3157			if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
3158			    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
3159				zio_remove_child(pio, zio, zl);
3160				zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
3161			}
3162		}
3163
3164		if ((pio = zio_unique_parent(zio)) != NULL) {
3165			/*
3166			 * We're not a root i/o, so there's nothing to do
3167			 * but notify our parent.  Don't propagate errors
3168			 * upward since we haven't permanently failed yet.
3169			 */
3170			ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
3171			zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
3172			zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
3173		} else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
3174			/*
3175			 * We'd fail again if we reexecuted now, so suspend
3176			 * until conditions improve (e.g. device comes online).
3177			 */
3178			zio_suspend(spa, zio);
3179		} else {
3180			/*
3181			 * Reexecution is potentially a huge amount of work.
3182			 * Hand it off to the otherwise-unused claim taskq.
3183			 */
3184#if defined(illumos) || !defined(_KERNEL)
3185			ASSERT(zio->io_tqent.tqent_next == NULL);
3186#else
3187			ASSERT(zio->io_tqent.tqent_task.ta_pending == 0);
3188#endif
3189			(void) taskq_dispatch_ent(
3190			    spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
3191			    (task_func_t *)zio_reexecute, zio, 0,
3192			    &zio->io_tqent);
3193		}
3194		return (ZIO_PIPELINE_STOP);
3195	}
3196
3197	ASSERT(zio->io_child_count == 0);
3198	ASSERT(zio->io_reexecute == 0);
3199	ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
3200
3201	/*
3202	 * Report any checksum errors, since the I/O is complete.
3203	 */
3204	while (zio->io_cksum_report != NULL) {
3205		zio_cksum_report_t *zcr = zio->io_cksum_report;
3206		zio->io_cksum_report = zcr->zcr_next;
3207		zcr->zcr_next = NULL;
3208		zcr->zcr_finish(zcr, NULL);
3209		zfs_ereport_free_checksum(zcr);
3210	}
3211
3212	/*
3213	 * It is the responsibility of the done callback to ensure that this
3214	 * particular zio is no longer discoverable for adoption, and as
3215	 * such, cannot acquire any new parents.
3216	 */
3217	if (zio->io_done)
3218		zio->io_done(zio);
3219
3220	mutex_enter(&zio->io_lock);
3221	zio->io_state[ZIO_WAIT_DONE] = 1;
3222	mutex_exit(&zio->io_lock);
3223
3224	for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
3225		zio_link_t *zl = zio->io_walk_link;
3226		pio_next = zio_walk_parents(zio);
3227		zio_remove_child(pio, zio, zl);
3228		zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
3229	}
3230
3231	if (zio->io_waiter != NULL) {
3232		mutex_enter(&zio->io_lock);
3233		zio->io_executor = NULL;
3234		cv_broadcast(&zio->io_cv);
3235		mutex_exit(&zio->io_lock);
3236	} else {
3237		zio_destroy(zio);
3238	}
3239
3240	return (ZIO_PIPELINE_STOP);
3241}
3242
3243/*
3244 * ==========================================================================
3245 * I/O pipeline definition
3246 * ==========================================================================
3247 */
3248static zio_pipe_stage_t *zio_pipeline[] = {
3249	NULL,
3250	zio_read_bp_init,
3251	zio_free_bp_init,
3252	zio_issue_async,
3253	zio_write_bp_init,
3254	zio_checksum_generate,
3255	zio_nop_write,
3256	zio_ddt_read_start,
3257	zio_ddt_read_done,
3258	zio_ddt_write,
3259	zio_ddt_free,
3260	zio_gang_assemble,
3261	zio_gang_issue,
3262	zio_dva_allocate,
3263	zio_dva_free,
3264	zio_dva_claim,
3265	zio_ready,
3266	zio_vdev_io_start,
3267	zio_vdev_io_done,
3268	zio_vdev_io_assess,
3269	zio_checksum_verify,
3270	zio_done
3271};
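
/*
 * The order of this table parallels the zio_stage enumeration; the leading
 * NULL slot corresponds to the open stage, which is only ever a starting
 * point and is never dispatched through the table.  Each stage returns
 * ZIO_PIPELINE_CONTINUE to advance, or ZIO_PIPELINE_STOP once the zio has
 * been handed off (to a taskq, to child I/Os, or to its waiter).
 */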
3272
3273/* dnp is the dnode for zb1->zb_object */
3274boolean_t
3275zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
3276    const zbookmark_t *zb2)
3277{
3278	uint64_t zb1nextL0, zb2thisobj;
3279
3280	ASSERT(zb1->zb_objset == zb2->zb_objset);
3281	ASSERT(zb2->zb_level == 0);
3282
3283	/*
3284	 * A bookmark in the deadlist is considered to be after
3285	 * everything else.
3286	 */
3287	if (zb2->zb_object == DMU_DEADLIST_OBJECT)
3288		return (B_TRUE);
3289
3290	/* The objset_phys_t isn't before anything. */
3291	if (dnp == NULL)
3292		return (B_FALSE);
3293
3294	zb1nextL0 = (zb1->zb_blkid + 1) <<
3295	    ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
3296
3297	zb2thisobj = zb2->zb_object ? zb2->zb_object :
3298	    zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
3299
3300	if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
3301		uint64_t nextobj = zb1nextL0 *
3302		    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
3303		return (nextobj <= zb2thisobj);
3304	}
3305
3306	if (zb1->zb_object < zb2thisobj)
3307		return (B_TRUE);
3308	if (zb1->zb_object > zb2thisobj)
3309		return (B_FALSE);
3310	if (zb2->zb_object == DMU_META_DNODE_OBJECT)
3311		return (B_FALSE);
3312	return (zb1nextL0 <= zb2->zb_blkid);
3313}
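
/*
 * A worked example of the arithmetic above, assuming a 16K indirect block
 * size (dn_indblkshift == 14) and 128-byte block pointers
 * (SPA_BLKPTRSHIFT == 7), so each level-1 pointer covers 2^7 = 128 level-0
 * blocks: for zb1 = (level 1, blkid 2), zb1nextL0 = (2 + 1) << 7 = 384,
 * the first level-0 blkid strictly after zb1's subtree.  zb1 is "before" a
 * level-0 bookmark zb2 in the same (non-meta-dnode) object iff
 * zb2->zb_blkid >= 384.
 */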
3314