1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
25 * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
26 */
27
28#include <sys/zfs_context.h>
29#include <sys/spa.h>
30#include <sys/spa_impl.h>
31#include <sys/zap.h>
32#include <sys/vdev_impl.h>
33#include <sys/metaslab_impl.h>
34#include <sys/zio.h>
35#include <sys/zio_checksum.h>
36#include <sys/dmu_tx.h>
37#include <sys/abd.h>
38#include <sys/zfs_rlock.h>
39#include <sys/fs/zfs.h>
40#include <sys/fm/fs/zfs.h>
41#include <sys/vdev_raidz.h>
42#include <sys/vdev_raidz_impl.h>
43#include <sys/vdev_draid.h>
44#include <sys/uberblock_impl.h>
45#include <sys/dsl_scan.h>
46
47#ifdef ZFS_DEBUG
48#include <sys/vdev.h>	/* For vdev_xlate() in vdev_raidz_io_verify() */
49#endif
50
51/*
52 * Virtual device vector for RAID-Z.
53 *
54 * This vdev supports single, double, and triple parity. For single parity,
55 * we use a simple XOR of all the data columns. For double or triple parity,
56 * we use a special case of Reed-Solomon coding. This extends the
57 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
58 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
59 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
60 * former is also based. The latter is designed to provide higher performance
61 * for writes.
62 *
63 * Note that the Plank paper claimed to support arbitrary N+M, but was then
64 * amended six years later identifying a critical flaw that invalidates its
65 * claims. Nevertheless, the technique can be adapted to work for up to
66 * triple parity. For additional parity, the amendment "Note: Correction to
67 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
68 * is viable, but the additional complexity means that write performance will
69 * suffer.
70 *
71 * All of the methods above operate on a Galois field, defined over the
72 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
73 * can be expressed with a single byte. Briefly, the operations on the
74 * field are defined as follows:
75 *
76 *   o addition (+) is represented by a bitwise XOR
77 *   o subtraction (-) is therefore identical to addition: A + B = A - B
78 *   o multiplication of A by 2 is defined by the following bitwise expression:
79 *
80 *	(A * 2)_7 = A_6
81 *	(A * 2)_6 = A_5
82 *	(A * 2)_5 = A_4
83 *	(A * 2)_4 = A_3 + A_7
84 *	(A * 2)_3 = A_2 + A_7
85 *	(A * 2)_2 = A_1 + A_7
86 *	(A * 2)_1 = A_0
87 *	(A * 2)_0 = A_7
88 *
89 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
90 * As an aside, this multiplication is derived from the error correcting
91 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
92 *
93 * Observe that any number in the field (except for 0) can be expressed as a
94 * power of 2 -- a generator for the field. We store a table of the powers of
95 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
96 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
97 * than field addition). The inverse of a field element A (A^-1) is therefore
98 * A^(255 - 1) = A^254 (since A^255 = 1 for any nonzero A).
99 *
100 * The up-to-three parity columns, P, Q, R over several data columns,
101 * D_0, ... D_n-1, can be expressed by field operations:
102 *
103 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
104 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
105 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
106 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
107 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
108 *
109 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
110 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
111 * independent coefficients. (There are no additional coefficients that have
112 * this property which is why the uncorrected Plank method breaks down.)
113 *
114 * See the reconstruction code below for how P, Q and R can be used individually
115 * or in concert to recover missing data columns.
116 */
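/*
 * As a hedged illustration of the log/exp table approach described above
 * (gf_mul_example is an illustrative name, not part of this driver; the
 * vdev_raidz_pow2[] and vdev_raidz_log2[] tables are the same ones used by
 * the reconstruction code later in this file):
 *
 *	static inline uint8_t
 *	gf_mul_example(uint8_t a, uint8_t b)
 *	{
 *		if (a == 0 || b == 0)
 *			return (0);
 *		return (vdev_raidz_pow2[(vdev_raidz_log2[a] +
 *		    vdev_raidz_log2[b]) % 255]);
 *	}
 *
 * Since the multiplicative group has order 255, the sum of the logs is
 * reduced mod 255 before being looked up in the power table.
 */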
117
118#define	VDEV_RAIDZ_P		0
119#define	VDEV_RAIDZ_Q		1
120#define	VDEV_RAIDZ_R		2
121
122#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
123#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
124
125/*
126 * We provide a mechanism to perform the field multiplication operation on a
127 * 64-bit value all at once rather than a byte at a time. This works by
128 * creating a mask from the top bit in each byte and using that to
129 * conditionally apply the XOR of 0x1d.
130 */
131#define	VDEV_RAIDZ_64MUL_2(x, mask) \
132{ \
133	(mask) = (x) & 0x8080808080808080ULL; \
134	(mask) = ((mask) << 1) - ((mask) >> 7); \
135	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
136	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
137}
138
139#define	VDEV_RAIDZ_64MUL_4(x, mask) \
140{ \
141	VDEV_RAIDZ_64MUL_2((x), mask); \
142	VDEV_RAIDZ_64MUL_2((x), mask); \
143}
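/*
 * For example (an illustrative sketch, not code from this driver), multiplying
 * every byte of an n-word buffer by 2 in the field can be done one 64-bit
 * word at a time:
 *
 *	uint64_t mask;
 *	for (int i = 0; i < n; i++)
 *		VDEV_RAIDZ_64MUL_2(buf[i], mask);
 *
 * which is equivalent to applying VDEV_RAIDZ_MUL_2() to each of the eight
 * bytes packed into every word. The parity and reconstruction routines below
 * (e.g. vdev_raidz_pq_func()) use these macros in exactly this way to advance
 * the Q and R columns.
 */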
144
145
146/*
147 * Big Theory Statement for how a RAIDZ VDEV is expanded
148 *
149 * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion
150 * works with all three RAIDZ parity choices: RAIDZ1, 2, and 3. VDEVs
151 * that have been previously expanded can be expanded again.
152 *
153 * The RAIDZ VDEV must be healthy (must be able to write to all the drives in
154 * the VDEV) when an expansion starts.  The expansion will pause if any
155 * disk in the VDEV fails, and resume once the VDEV is healthy again. All other
156 * operations on the pool can continue while an expansion is in progress (e.g.
157 * read/write, snapshot, zpool add, etc.), except for zpool checkpoint, zpool
158 * trim, and zpool initialize, which can't be run during an expansion.  Following
159 * a reboot or export/import, the expansion resumes where it left off.
160 *
161 * == Reflowing the Data ==
162 *
163 * The expansion involves reflowing (copying) the data from the current set
164 * of disks to spread it across the new set which now has one more disk. This
165 * reflow operation is similar to reflowing text when the column width of a
166 * text editor window is expanded. The text doesn't change but the location of
167 * the text changes to accommodate the new width. An example reflow result for
168 * a 4-wide RAIDZ1 to a 5-wide is shown below.
169 *
170 *                            Reflow End State
171 *            Each letter indicates a parity group (logical stripe)
172 *
173 *         Before expansion                         After Expansion
174 *     D1     D2     D3     D4               D1     D2     D3     D4     D5
175 *  +------+------+------+------+         +------+------+------+------+------+
176 *  |      |      |      |      |         |      |      |      |      |      |
177 *  |  A   |  A   |  A   |  A   |         |  A   |  A   |  A   |  A   |  B   |
178 *  |     1|     2|     3|     4|         |     1|     2|     3|     4|     5|
179 *  +------+------+------+------+         +------+------+------+------+------+
180 *  |      |      |      |      |         |      |      |      |      |      |
181 *  |  B   |  B   |  C   |  C   |         |  B   |  C   |  C   |  C   |  C   |
182 *  |     5|     6|     7|     8|         |     6|     7|     8|     9|    10|
183 *  +------+------+------+------+         +------+------+------+------+------+
184 *  |      |      |      |      |         |      |      |      |      |      |
185 *  |  C   |  C   |  D   |  D   |         |  D   |  D   |  E   |  E   |  E   |
186 *  |     9|    10|    11|    12|         |    11|    12|    13|    14|    15|
187 *  +------+------+------+------+         +------+------+------+------+------+
188 *  |      |      |      |      |         |      |      |      |      |      |
189 *  |  E   |  E   |  E   |  E   |   -->   |  E   |  F   |  F   |  G   |  G   |
190 *  |    13|    14|    15|    16|         |    16|    17|    18|    19|    20|
191 *  +------+------+------+------+         +------+------+------+------+------+
192 *  |      |      |      |      |         |      |      |      |      |      |
193 *  |  F   |  F   |  G   |  G   |         |  G   |  G   |  H   |  H   |  H   |
194 *  |    17|    18|    19|    20|         |    21|    22|    23|    24|    25|
195 *  +------+------+------+------+         +------+------+------+------+------+
196 *  |      |      |      |      |         |      |      |      |      |      |
197 *  |  G   |  G   |  H   |  H   |         |  H   |  I   |  I   |  J   |  J   |
198 *  |    21|    22|    23|    24|         |    26|    27|    28|    29|    30|
199 *  +------+------+------+------+         +------+------+------+------+------+
200 *  |      |      |      |      |         |      |      |      |      |      |
201 *  |  H   |  H   |  I   |  I   |         |  J   |  J   |      |      |  K   |
202 *  |    25|    26|    27|    28|         |    31|    32|    33|    34|    35|
203 *  +------+------+------+------+         +------+------+------+------+------+
204 *
205 * This reflow approach has several advantages. There is no need to read or
206 * modify the block pointers or recompute any block checksums.  The reflow
207 * doesn't need to know where the parity sectors reside. We can read and write
208 * data sequentially and the copy can occur in a background thread in open
209 * context. The design also allows for fast discovery of what data to copy.
210 *
211 * The VDEV metaslabs are processed, one at a time, to copy the block data to
212 * have it flow across all the disks. The metaslab is disabled for allocations
213 * during the copy. As an optimization, we only copy the allocated data which
214 * can be determined by looking at the metaslab range tree. During the copy we
215 * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still
216 * need to be able to survive losing parity count disks).  This means we
217 * cannot overwrite data during the reflow that would be needed if a disk is
218 * lost.
219 *
220 * After the reflow completes, all newly-written blocks will have the new
221 * layout, i.e., they will have the parity to data ratio implied by the new
222 * number of disks in the RAIDZ group.  Even though the reflow copies all of
223 * the allocated space (data and parity), it is only rearranged, not changed.
224 *
225 * This act of reflowing the data has a few implications about blocks
226 * that were written before the reflow completes:
227 *
228 *  - Old blocks will still use the same amount of space (i.e., they will have
229 *    the parity to data ratio implied by the old number of disks in the RAIDZ
230 *    group).
231 *  - Reading old blocks will be slightly slower than before the reflow, for
232 *    two reasons. First, we will have to read from all disks in the RAIDZ
233 *    VDEV, rather than being able to skip the children that contain only
234 *    parity of this block (because the data of a single block is now spread
235 *    out across all the disks).  Second, in most cases there will be an extra
236 *    bcopy, needed to rearrange the data back to its original layout in memory.
237 *
238 * == Scratch Area ==
239 *
240 * As we copy the block data, we can only progress to the point that writes
241 * will not overlap with blocks whose progress has not yet been recorded on
242 * disk.  Since partially-copied rows are always read from the old location,
243 * we need to stop one row before the sector-wise overlap, to prevent any
244 * row-wise overlap. For example, in the diagram above, when we reflow sector
245 * B6 it will overwrite the original location for B5.
246 *
247 * To get around this, a scratch space is used so that we can start copying
248 * without risking data loss by overlapping the row. As an added benefit, it
249 * improves performance at the beginning of the reflow, but that small perf
250 * boost wouldn't be worth the complexity on its own.
251 *
252 * Ideally we want to copy at least 2 * (new_width)^2 so that we have a
253 * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max
254 * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice
255 * the widths will likely be single digits so we can get a substantial chunk
256 * size using only a few MB of scratch per disk.
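 * For example (illustrative numbers): with a new width of 10 and 4K sectors,
 * 2 * (new_width)^2 is 200 sectors, or about 800K of copied data in total,
 * which works out to roughly 80K of scratch per disk.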
257 *
258 * The scratch area is persisted to disk and holds a large amount of reflowed
259 * state. It lets us always read the partially written stripes when a disk fails
260 * or the copy is interrupted (crash) during the initial copying phase, and also
261 * lets us get past a small chunk size restriction.  At a minimum, the scratch
262 * space must be large enough to get us to the point that one row does not
263 * overlap itself when moved (i.e., new_width^2).  But going larger is even better. We
264 * use the 3.5 MiB reserved "boot" space that resides after the ZFS disk labels
265 * as our scratch space to handle overwriting the initial part of the VDEV.
266 *
267 *	0     256K   512K                    4M
268 *	+------+------+-----------------------+-----------------------------
269 *	| VDEV | VDEV |   Boot Block (3.5M)   |  Allocatable space ...
270 *	|  L0  |  L1  |       Reserved        |     (Metaslabs)
271 *	+------+------+-----------------------+-------------------------------
272 *                        Scratch Area
273 *
274 * == Reflow Progress Updates ==
275 * After the initial scratch-based reflow, the expansion process works
276 * similarly to device removal. We create a new open context thread which
277 * reflows the data, and periodically kicks off sync tasks to update logical
278 * state. In this case, state is the committed progress (offset of next data
279 * to copy). We need to persist the completed offset on disk, so that if we
280 * crash we know which format each VDEV offset is in.
281 *
282 * == Time Dependent Geometry ==
283 *
284 * In non-expanded RAIDZ, blocks are read from disk in a column by column
285 * fashion. For a multi-row block, the second sector is in the first column
286 * not in the second column. This allows us to issue full reads for each
287 * column directly into the request buffer. The block data is thus laid out
288 * sequentially in a column-by-column fashion.
289 *
290 * For example, in the before expansion diagram above, one logical block might
291 * be sectors G19-H26. The parity is in G19,H23; and the data is in
292 * G20,H24,G21,H25,G22,H26.
293 *
294 * After a block is reflowed, the sectors that were all in the original column
295 * data can now reside in different columns. When reading from an expanded
296 * VDEV, we need to know the logical stripe width for each block so we can
297 * reconstitute the block's data after the reads are completed. Likewise,
298 * when we perform the combinatorial reconstruction we need to know the
299 * original width so we can retry combinations from the past layouts.
300 *
301 * Time dependent geometry is what we call having blocks with different layouts
302 * (stripe widths) in the same VDEV. This time-dependent geometry uses the
303 * block's birth time (+ the time expansion ended) to establish the correct
304 * width for a given block. After an expansion completes, we record the time
305 * for blocks written with a particular width (geometry).
306 *
307 * == On Disk Format Changes ==
308 *
309 * A new pool feature flag, 'raidz_expansion', is added; its reference count is
310 * the number of RAIDZ VDEVs that have been expanded.
311 *
312 * The blocks on an expanded RAIDZ VDEV can have different logical stripe widths.
313 *
314 * Since the uberblock can point to arbitrary blocks, which might be on the
315 * expanding RAIDZ and might or might not have been expanded yet, we need to
316 * know which way a block is laid out before reading it. This info is the next
317 * offset that needs to be reflowed, and we persist it in the uberblock, in
318 * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev label.
319 * After the expansion is complete, we then use the raidz_expand_txgs array
320 * (see below) to determine how to read a block, and the ub_raidz_reflow_info
321 * field is no longer required.
322 *
323 * The uberblock's ub_raidz_reflow_info field also holds the scratch space
324 * state (i.e., active or not) which is also required before reading a block
325 * during the initial phase of reflowing the data.
326 *
327 * The top-level RAIDZ VDEV has two new entries in the nvlist:
328 *
329 * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here
330 *                            and used after the expansion is complete to
331 *                            determine how to read a raidz block
332 * 'raidz_expanding' boolean: present during reflow and removed after completion;
333 *                            used during a spa import to resume an unfinished
334 *                            expansion
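 *
 * As a hedged sketch (not how the driver itself consumes its config, and
 * assuming the txgs are stored as a uint64 array), a reader of a top-level
 * vdev config nvlist could inspect these entries by their literal names
 * using the standard nvlist interfaces:
 *
 *	uint64_t *txgs;
 *	uint_t ntxgs;
 *	boolean_t expanding = nvlist_exists(nvl, "raidz_expanding");
 *	int err = nvlist_lookup_uint64_array(nvl, "raidz_expand_txgs",
 *	    &txgs, &ntxgs);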
335 *
336 * And finally the VDEV's top zap adds the following informational entries:
337 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE
338 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME
339 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME
340 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED
341 */
342
343/*
344 * For testing only: pause the raidz expansion after reflowing this amount.
345 * (accessed by ZTS and ztest)
346 */
347#ifdef	_KERNEL
348static
349#endif	/* _KERNEL */
350unsigned long raidz_expand_max_reflow_bytes = 0;
351
352/*
353 * For testing only: pause the raidz expansion at a certain point.
354 */
355uint_t raidz_expand_pause_point = 0;
356
357/*
358 * Maximum amount of copy I/O (in bytes) outstanding at once.
359 */
360static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE;
361
362/*
363 * Apply raidz map abds aggregation if the number of rows in the map is equal
364 * to or greater than the value below.
365 */
366static unsigned long raidz_io_aggregate_rows = 4;
367
368/*
369 * Automatically start a pool scrub when a RAIDZ expansion completes in
370 * order to verify the checksums of all blocks which have been copied
371 * during the expansion.  Automatic scrubbing is enabled by default and
372 * is strongly recommended.
373 */
374static int zfs_scrub_after_expand = 1;
375
376static void
377vdev_raidz_row_free(raidz_row_t *rr)
378{
379	for (int c = 0; c < rr->rr_cols; c++) {
380		raidz_col_t *rc = &rr->rr_col[c];
381
382		if (rc->rc_size != 0)
383			abd_free(rc->rc_abd);
384		if (rc->rc_orig_data != NULL)
385			abd_free(rc->rc_orig_data);
386	}
387
388	if (rr->rr_abd_empty != NULL)
389		abd_free(rr->rr_abd_empty);
390
391	kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols]));
392}
393
394void
395vdev_raidz_map_free(raidz_map_t *rm)
396{
397	for (int i = 0; i < rm->rm_nrows; i++)
398		vdev_raidz_row_free(rm->rm_row[i]);
399
400	if (rm->rm_nphys_cols) {
401		for (int i = 0; i < rm->rm_nphys_cols; i++) {
402			if (rm->rm_phys_col[i].rc_abd != NULL)
403				abd_free(rm->rm_phys_col[i].rc_abd);
404		}
405
406		kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) *
407		    rm->rm_nphys_cols);
408	}
409
410	ASSERT3P(rm->rm_lr, ==, NULL);
411	kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
412}
413
414static void
415vdev_raidz_map_free_vsd(zio_t *zio)
416{
417	raidz_map_t *rm = zio->io_vsd;
418
419	vdev_raidz_map_free(rm);
420}
421
422static int
423vdev_raidz_reflow_compare(const void *x1, const void *x2)
424{
425	const reflow_node_t *l = x1;
426	const reflow_node_t *r = x2;
427
428	return (TREE_CMP(l->re_txg, r->re_txg));
429}
430
431const zio_vsd_ops_t vdev_raidz_vsd_ops = {
432	.vsd_free = vdev_raidz_map_free_vsd,
433};
434
435raidz_row_t *
436vdev_raidz_row_alloc(int cols)
437{
438	raidz_row_t *rr =
439	    kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP);
440
441	rr->rr_cols = cols;
442	rr->rr_scols = cols;
443
444	for (int c = 0; c < cols; c++) {
445		raidz_col_t *rc = &rr->rr_col[c];
446		rc->rc_shadow_devidx = INT_MAX;
447		rc->rc_shadow_offset = UINT64_MAX;
448		rc->rc_allow_repair = 1;
449	}
450	return (rr);
451}
452
453static void
454vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift)
455{
456	int c;
457	int nwrapped = 0;
458	uint64_t off = 0;
459	raidz_row_t *rr = rm->rm_row[0];
460
461	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
462	ASSERT3U(rm->rm_nrows, ==, 1);
463
464	/*
465	 * Pad any parity columns with additional space to account for skip
466	 * sectors.
467	 */
468	if (rm->rm_skipstart < rr->rr_firstdatacol) {
469		ASSERT0(rm->rm_skipstart);
470		nwrapped = rm->rm_nskip;
471	} else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) {
472		nwrapped =
473		    (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols;
474	}
475
476	/*
477	 * Optional single skip sectors (rc_size == 0) will be handled in
478	 * vdev_raidz_io_start_write().
479	 */
480	int skipped = rr->rr_scols - rr->rr_cols;
481
482	/* Allocate buffers for the parity columns */
483	for (c = 0; c < rr->rr_firstdatacol; c++) {
484		raidz_col_t *rc = &rr->rr_col[c];
485
486		/*
487		 * Parity columns will pad out a linear ABD to account for
488		 * the skip sector. A linear ABD is used here because
489		 * parity calculations use the ABD buffer directly to calculate
490		 * parity. This avoids doing a memcpy back to the ABD after the
491		 * parity has been calculated. By issuing the parity column
492		 * with the skip sector we can reduce contention on the child
493		 * VDEV queue locks (vq_lock).
494		 */
495		if (c < nwrapped) {
496			rc->rc_abd = abd_alloc_linear(
497			    rc->rc_size + (1ULL << ashift), B_FALSE);
498			abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift);
499			skipped++;
500		} else {
501			rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
502		}
503	}
504
505	for (off = 0; c < rr->rr_cols; c++) {
506		raidz_col_t *rc = &rr->rr_col[c];
507		abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct,
508		    zio->io_abd, off, rc->rc_size);
509
510		/*
511		 * Generate I/O for skip sectors to improve aggregation
512		 * continuity. We will use gang ABD's to reduce contention
513		 * on the child VDEV queue locks (vq_lock) by issuing
514		 * a single I/O that contains the data and skip sector.
515		 *
516		 * It is important to make sure that rc_size is not updated
517		 * even though we are adding a skip sector to the ABD. When
518		 * calculating the parity in vdev_raidz_generate_parity_row()
519		 * the rc_size is used to iterate through the ABD's. We can
520		 * not have zero'd out skip sectors used for calculating
521		 * parity for raidz, because those same sectors are not used
522		 * during reconstruction.
523		 */
524		if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) {
525			rc->rc_abd = abd_alloc_gang();
526			abd_gang_add(rc->rc_abd, abd, B_TRUE);
527			abd_gang_add(rc->rc_abd,
528			    abd_get_zeros(1ULL << ashift), B_TRUE);
529			skipped++;
530		} else {
531			rc->rc_abd = abd;
532		}
533		off += rc->rc_size;
534	}
535
536	ASSERT3U(off, ==, zio->io_size);
537	ASSERT3S(skipped, ==, rm->rm_nskip);
538}
539
540static void
541vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm)
542{
543	int c;
544	raidz_row_t *rr = rm->rm_row[0];
545
546	ASSERT3U(rm->rm_nrows, ==, 1);
547
548	/* Allocate buffers for the parity columns */
549	for (c = 0; c < rr->rr_firstdatacol; c++)
550		rr->rr_col[c].rc_abd =
551		    abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE);
552
553	for (uint64_t off = 0; c < rr->rr_cols; c++) {
554		raidz_col_t *rc = &rr->rr_col[c];
555		rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
556		    zio->io_abd, off, rc->rc_size);
557		off += rc->rc_size;
558	}
559}
560
561/*
562 * Divides the IO evenly across all child vdevs; usually, dcols is
563 * the number of children in the target vdev.
564 *
565 * Avoid inlining the function to keep vdev_raidz_io_start(), which
566 * is this function's only caller, as small as possible on the stack.
567 */
568noinline raidz_map_t *
569vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
570    uint64_t nparity)
571{
572	raidz_row_t *rr;
573	/* The starting RAIDZ (parent) vdev sector of the block. */
574	uint64_t b = zio->io_offset >> ashift;
575	/* The zio's size in units of the vdev's minimum sector size. */
576	uint64_t s = zio->io_size >> ashift;
577	/* The first column for this stripe. */
578	uint64_t f = b % dcols;
579	/* The starting byte offset on each child vdev. */
580	uint64_t o = (b / dcols) << ashift;
581	uint64_t acols, scols;
582
583	raidz_map_t *rm =
584	    kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
585	rm->rm_nrows = 1;
586
587	/*
588	 * "Quotient": The number of data sectors for this stripe on all but
589	 * the "big column" child vdevs that also contain "remainder" data.
590	 */
591	uint64_t q = s / (dcols - nparity);
592
593	/*
594	 * "Remainder": The number of partial stripe data sectors in this I/O.
595	 * This will add a sector to some, but not all, child vdevs.
596	 */
597	uint64_t r = s - q * (dcols - nparity);
598
599	/* The number of "big columns" - those which contain remainder data. */
600	uint64_t bc = (r == 0 ? 0 : r + nparity);
601
602	/*
603	 * The total number of data and parity sectors associated with
604	 * this I/O.
605	 */
606	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
607
608	/*
609	 * acols: The columns that will be accessed.
610	 * scols: The columns that will be accessed or skipped.
611	 */
612	if (q == 0) {
613		/* Our I/O request doesn't span all child vdevs. */
614		acols = bc;
615		scols = MIN(dcols, roundup(bc, nparity + 1));
616	} else {
617		acols = dcols;
618		scols = dcols;
619	}
620
621	ASSERT3U(acols, <=, scols);
622	rr = vdev_raidz_row_alloc(scols);
623	rm->rm_row[0] = rr;
624	rr->rr_cols = acols;
625	rr->rr_bigcols = bc;
626	rr->rr_firstdatacol = nparity;
627#ifdef ZFS_DEBUG
628	rr->rr_offset = zio->io_offset;
629	rr->rr_size = zio->io_size;
630#endif
631
632	uint64_t asize = 0;
633
634	for (uint64_t c = 0; c < scols; c++) {
635		raidz_col_t *rc = &rr->rr_col[c];
636		uint64_t col = f + c;
637		uint64_t coff = o;
638		if (col >= dcols) {
639			col -= dcols;
640			coff += 1ULL << ashift;
641		}
642		rc->rc_devidx = col;
643		rc->rc_offset = coff;
644
645		if (c >= acols)
646			rc->rc_size = 0;
647		else if (c < bc)
648			rc->rc_size = (q + 1) << ashift;
649		else
650			rc->rc_size = q << ashift;
651
652		asize += rc->rc_size;
653	}
654
655	ASSERT3U(asize, ==, tot << ashift);
656	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
657	rm->rm_skipstart = bc;
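
	/*
	 * A worked example with illustrative numbers: for a RAIDZ1 vdev with
	 * dcols = 5 and a zio of s = 10 sectors, q = 10 / 4 = 2, r = 2,
	 * bc = r + nparity = 3 and tot = 10 + 1 * (2 + 1) = 13.  Columns 0-2
	 * (the parity column and the two "big" data columns) each get
	 * q + 1 = 3 sectors while columns 3-4 get q = 2 sectors, for
	 * 3 * 3 + 2 * 2 = 13 = tot sectors in all, leaving
	 * rm_nskip = roundup(13, 2) - 13 = 1 skip sector starting at
	 * column rm_skipstart = bc = 3.
	 */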
658
659	/*
660	 * If all data stored spans all columns, there's a danger that parity
661	 * will always be on the same device and, since parity isn't read
662	 * during normal operation, that device's I/O bandwidth won't be
663	 * used effectively. We therefore switch the parity every 1MB.
664	 *
665	 * ... at least that was, ostensibly, the theory. As a practical
666	 * matter unless we juggle the parity between all devices evenly, we
667	 * won't see any benefit. Further, occasional writes that aren't a
668	 * multiple of the LCM of the number of children and the minimum
669	 * stripe width are sufficient to avoid pessimal behavior.
670	 * Unfortunately, this decision created an implicit on-disk format
671	 * requirement that we need to support for all eternity, but only
672	 * for single-parity RAID-Z.
673	 *
674	 * If we intend to skip a sector in the zeroth column for padding
675	 * we must make sure to note this swap. We will never intend to
676	 * skip the first column since at least one data and one parity
677	 * column must appear in each row.
678	 */
679	ASSERT(rr->rr_cols >= 2);
680	ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
681
682	if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
683		uint64_t devidx = rr->rr_col[0].rc_devidx;
684		o = rr->rr_col[0].rc_offset;
685		rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
686		rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
687		rr->rr_col[1].rc_devidx = devidx;
688		rr->rr_col[1].rc_offset = o;
689		if (rm->rm_skipstart == 0)
690			rm->rm_skipstart = 1;
691	}
692
693	if (zio->io_type == ZIO_TYPE_WRITE) {
694		vdev_raidz_map_alloc_write(zio, rm, ashift);
695	} else {
696		vdev_raidz_map_alloc_read(zio, rm);
697	}
698	/* init RAIDZ parity ops */
699	rm->rm_ops = vdev_raidz_math_get_ops();
700
701	return (rm);
702}
703
704/*
705 * Everything before reflow_offset_synced should have been moved to the new
706 * location (read and write completed).  However, this may not yet be reflected
707 * in the on-disk format (e.g. raidz_reflow_sync() has been called but the
708 * uberblock has not yet been written). If reflow is not in progress,
709 * reflow_offset_synced should be UINT64_MAX. For each row, if the row is
710 * entirely before reflow_offset_synced, it will come from the new location.
711 * Otherwise this row will come from the old location.  Therefore, rows that
712 * straddle the reflow_offset_synced will come from the old location.
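 *
 * As a hedged sketch of the per-row decision this implies (the names here
 * are descriptive placeholders, not this file's actual variables):
 *
 *	boolean_t row_is_reflowed =
 *	    (row_start_sector + row_width_sectors <=
 *	    (reflow_offset_synced >> ashift));
 *
 * A row for which row_is_reflowed is true is read from its new
 * (post-expansion) location; otherwise it is read from the old location.
 * This matches the per-row check in vdev_raidz_map_alloc_expanded() below.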
713 *
714 * For writes, reflow_offset_next is the next offset to copy.  If a sector has
715 * been copied, but not yet reflected in the on-disk progress
716 * (reflow_offset_synced), it will also be written to the new (already copied)
717 * offset.
718 */
719noinline raidz_map_t *
720vdev_raidz_map_alloc_expanded(zio_t *zio,
721    uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
722    uint64_t nparity, uint64_t reflow_offset_synced,
723    uint64_t reflow_offset_next, boolean_t use_scratch)
724{
725	abd_t *abd = zio->io_abd;
726	uint64_t offset = zio->io_offset;
727	uint64_t size = zio->io_size;
728
729	/* The zio's size in units of the vdev's minimum sector size. */
730	uint64_t s = size >> ashift;
731
732	/*
733	 * "Quotient": The number of data sectors for this stripe on all but
734	 * the "big column" child vdevs that also contain "remainder" data.
735	 * AKA "full rows"
736	 */
737	uint64_t q = s / (logical_cols - nparity);
738
739	/*
740	 * "Remainder": The number of partial stripe data sectors in this I/O.
741	 * This will add a sector to some, but not all, child vdevs.
742	 */
743	uint64_t r = s - q * (logical_cols - nparity);
744
745	/* The number of "big columns" - those which contain remainder data. */
746	uint64_t bc = (r == 0 ? 0 : r + nparity);
747
748	/*
749	 * The total number of data and parity sectors associated with
750	 * this I/O.
751	 */
752	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
753
754	/* How many rows contain data (not skip) */
755	uint64_t rows = howmany(tot, logical_cols);
756	int cols = MIN(tot, logical_cols);
757
758	raidz_map_t *rm =
759	    kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
760	    KM_SLEEP);
761	rm->rm_nrows = rows;
762	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
763	rm->rm_skipstart = bc;
764	uint64_t asize = 0;
765
766	for (uint64_t row = 0; row < rows; row++) {
767		boolean_t row_use_scratch = B_FALSE;
768		raidz_row_t *rr = vdev_raidz_row_alloc(cols);
769		rm->rm_row[row] = rr;
770
771		/* The starting RAIDZ (parent) vdev sector of the row. */
772		uint64_t b = (offset >> ashift) + row * logical_cols;
773
774		/*
775		 * If we are in the middle of a reflow, and the copying has
776		 * not yet completed for any part of this row, then use the
777		 * old location of this row.  Note that reflow_offset_synced
778		 * reflects the i/o that's been completed, because it's
779		 * updated by a synctask, after zio_wait(spa_txg_zio[]).
780		 * This is sufficient for our check, even if that progress
781		 * has not yet been recorded to disk (reflected in
782		 * spa_ubsync).  Also note that we consider the last row to
783		 * be "full width" (`cols`-wide rather than `bc`-wide) for
784		 * this calculation. This causes a tiny bit of unnecessary
785		 * double-writes but is safe and simpler to calculate.
786		 */
787		int row_phys_cols = physical_cols;
788		if (b + cols > reflow_offset_synced >> ashift)
789			row_phys_cols--;
790		else if (use_scratch)
791			row_use_scratch = B_TRUE;
792
793		/* starting child of this row */
794		uint64_t child_id = b % row_phys_cols;
795		/* The starting byte offset on each child vdev. */
796		uint64_t child_offset = (b / row_phys_cols) << ashift;
797
798		/*
799		 * Note, rr_cols is the entire width of the block, even
800		 * if this row is shorter.  This is needed because parity
801		 * generation (for Q and R) needs to know the entire width,
802		 * because it treats the short row as though it was
803		 * full-width (and the "phantom" sectors were zero-filled).
804		 *
805		 * Another approach to this would be to set cols shorter
806		 * (to just the number of columns that we might do i/o to)
807		 * and have another mechanism to tell the parity generation
808		 * about the "entire width".  Reconstruction (at least
809		 * vdev_raidz_reconstruct_general()) would also need to
810		 * know about the "entire width".
811		 */
812		rr->rr_firstdatacol = nparity;
813#ifdef ZFS_DEBUG
814		/*
815		 * note: rr_size is PSIZE, not ASIZE
816		 */
817		rr->rr_offset = b << ashift;
818		rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift;
819#endif
820
821		for (int c = 0; c < rr->rr_cols; c++, child_id++) {
822			if (child_id >= row_phys_cols) {
823				child_id -= row_phys_cols;
824				child_offset += 1ULL << ashift;
825			}
826			raidz_col_t *rc = &rr->rr_col[c];
827			rc->rc_devidx = child_id;
828			rc->rc_offset = child_offset;
829
830			/*
831			 * Get this from the scratch space if appropriate.
832			 * This only happens if we crashed in the middle of
833			 * raidz_reflow_scratch_sync() (while it's running,
834			 * the rangelock prevents us from doing concurrent
835			 * io), and even then only during zpool import or
836			 * when the pool is imported readonly.
837			 */
838			if (row_use_scratch)
839				rc->rc_offset -= VDEV_BOOT_SIZE;
840
841			uint64_t dc = c - rr->rr_firstdatacol;
842			if (c < rr->rr_firstdatacol) {
843				rc->rc_size = 1ULL << ashift;
844
845				/*
846				 * Parity sectors' rc_abd's are set below
847				 * after determining if this is an aggregation.
848				 */
849			} else if (row == rows - 1 && bc != 0 && c >= bc) {
850				/*
851				 * Past the end of the block (even including
852				 * skip sectors).  This sector is part of the
853				 * map so that we have full rows for p/q parity
854				 * generation.
855				 */
856				rc->rc_size = 0;
857				rc->rc_abd = NULL;
858			} else {
859				/* "data column" (col excluding parity) */
860				uint64_t off;
861
862				if (c < bc || r == 0) {
863					off = dc * rows + row;
864				} else {
865					off = r * rows +
866					    (dc - r) * (rows - 1) + row;
867				}
868				rc->rc_size = 1ULL << ashift;
869				rc->rc_abd = abd_get_offset_struct(
870				    &rc->rc_abdstruct, abd, off << ashift,
871				    rc->rc_size);
872			}
873
874			if (rc->rc_size == 0)
875				continue;
876
877			/*
878			 * If any part of this row is in both old and new
879			 * locations, the primary location is the old
880			 * location. If this sector was already copied to the
881			 * new location, we need to also write to the new,
882			 * "shadow" location.
883			 *
884			 * Note, `row_phys_cols != physical_cols` indicates
885			 * that the primary location is the old location.
886			 * `b+c < reflow_offset_next` indicates that the copy
887			 * to the new location has been initiated. We know
888			 * that the copy has completed because we have the
889			 * rangelock, which is held exclusively while the
890			 * copy is in progress.
891			 */
892			if (row_use_scratch ||
893			    (row_phys_cols != physical_cols &&
894			    b + c < reflow_offset_next >> ashift)) {
895				rc->rc_shadow_devidx = (b + c) % physical_cols;
896				rc->rc_shadow_offset =
897				    ((b + c) / physical_cols) << ashift;
898				if (row_use_scratch)
899					rc->rc_shadow_offset -= VDEV_BOOT_SIZE;
900			}
901
902			asize += rc->rc_size;
903		}
904
905		/*
906		 * See comment in vdev_raidz_map_alloc()
907		 */
908		if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
909		    (offset & (1ULL << 20))) {
910			ASSERT(rr->rr_cols >= 2);
911			ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
912
913			int devidx0 = rr->rr_col[0].rc_devidx;
914			uint64_t offset0 = rr->rr_col[0].rc_offset;
915			int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx;
916			uint64_t shadow_offset0 =
917			    rr->rr_col[0].rc_shadow_offset;
918
919			rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
920			rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
921			rr->rr_col[0].rc_shadow_devidx =
922			    rr->rr_col[1].rc_shadow_devidx;
923			rr->rr_col[0].rc_shadow_offset =
924			    rr->rr_col[1].rc_shadow_offset;
925
926			rr->rr_col[1].rc_devidx = devidx0;
927			rr->rr_col[1].rc_offset = offset0;
928			rr->rr_col[1].rc_shadow_devidx = shadow_devidx0;
929			rr->rr_col[1].rc_shadow_offset = shadow_offset0;
930		}
931	}
932	ASSERT3U(asize, ==, tot << ashift);
933
934	/*
935	 * Determine if the block is contiguous, in which case we can use
936	 * an aggregation.
937	 */
938	if (rows >= raidz_io_aggregate_rows) {
939		rm->rm_nphys_cols = physical_cols;
940		rm->rm_phys_col =
941		    kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols,
942		    KM_SLEEP);
943
944		/*
945		 * Determine the aggregate io's offset and size, and check
946		 * that the io is contiguous.
947		 */
948		for (int i = 0;
949		    i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) {
950			raidz_row_t *rr = rm->rm_row[i];
951			for (int c = 0; c < rr->rr_cols; c++) {
952				raidz_col_t *rc = &rr->rr_col[c];
953				raidz_col_t *prc =
954				    &rm->rm_phys_col[rc->rc_devidx];
955
956				if (rc->rc_size == 0)
957					continue;
958
959				if (prc->rc_size == 0) {
960					ASSERT0(prc->rc_offset);
961					prc->rc_offset = rc->rc_offset;
962				} else if (prc->rc_offset + prc->rc_size !=
963				    rc->rc_offset) {
964					/*
965					 * This block is not contiguous and
966					 * therefore can't be aggregated.
967					 * This is expected to be rare, so
968					 * the cost of allocating and then
969					 * freeing rm_phys_col is not
970					 * significant.
971					 */
972					kmem_free(rm->rm_phys_col,
973					    sizeof (raidz_col_t) *
974					    rm->rm_nphys_cols);
975					rm->rm_phys_col = NULL;
976					rm->rm_nphys_cols = 0;
977					break;
978				}
979				prc->rc_size += rc->rc_size;
980			}
981		}
982	}
983	if (rm->rm_phys_col != NULL) {
984		/*
985		 * Allocate aggregate ABD's.
986		 */
987		for (int i = 0; i < rm->rm_nphys_cols; i++) {
988			raidz_col_t *prc = &rm->rm_phys_col[i];
989
990			prc->rc_devidx = i;
991
992			if (prc->rc_size == 0)
993				continue;
994
995			prc->rc_abd =
996			    abd_alloc_linear(rm->rm_phys_col[i].rc_size,
997			    B_FALSE);
998		}
999
1000		/*
1001		 * Point the parity abd's into the aggregate abd's.
1002		 */
1003		for (int i = 0; i < rm->rm_nrows; i++) {
1004			raidz_row_t *rr = rm->rm_row[i];
1005			for (int c = 0; c < rr->rr_firstdatacol; c++) {
1006				raidz_col_t *rc = &rr->rr_col[c];
1007				raidz_col_t *prc =
1008				    &rm->rm_phys_col[rc->rc_devidx];
1009				rc->rc_abd =
1010				    abd_get_offset_struct(&rc->rc_abdstruct,
1011				    prc->rc_abd,
1012				    rc->rc_offset - prc->rc_offset,
1013				    rc->rc_size);
1014			}
1015		}
1016	} else {
1017		/*
1018		 * Allocate new abd's for the parity sectors.
1019		 */
1020		for (int i = 0; i < rm->rm_nrows; i++) {
1021			raidz_row_t *rr = rm->rm_row[i];
1022			for (int c = 0; c < rr->rr_firstdatacol; c++) {
1023				raidz_col_t *rc = &rr->rr_col[c];
1024				rc->rc_abd =
1025				    abd_alloc_linear(rc->rc_size,
1026				    B_TRUE);
1027			}
1028		}
1029	}
1030	/* init RAIDZ parity ops */
1031	rm->rm_ops = vdev_raidz_math_get_ops();
1032
1033	return (rm);
1034}
1035
1036struct pqr_struct {
1037	uint64_t *p;
1038	uint64_t *q;
1039	uint64_t *r;
1040};
1041
1042static int
1043vdev_raidz_p_func(void *buf, size_t size, void *private)
1044{
1045	struct pqr_struct *pqr = private;
1046	const uint64_t *src = buf;
1047	int cnt = size / sizeof (src[0]);
1048
1049	ASSERT(pqr->p && !pqr->q && !pqr->r);
1050
1051	for (int i = 0; i < cnt; i++, src++, pqr->p++)
1052		*pqr->p ^= *src;
1053
1054	return (0);
1055}
1056
1057static int
1058vdev_raidz_pq_func(void *buf, size_t size, void *private)
1059{
1060	struct pqr_struct *pqr = private;
1061	const uint64_t *src = buf;
1062	uint64_t mask;
1063	int cnt = size / sizeof (src[0]);
1064
1065	ASSERT(pqr->p && pqr->q && !pqr->r);
1066
1067	for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
1068		*pqr->p ^= *src;
1069		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
1070		*pqr->q ^= *src;
1071	}
1072
1073	return (0);
1074}
1075
1076static int
1077vdev_raidz_pqr_func(void *buf, size_t size, void *private)
1078{
1079	struct pqr_struct *pqr = private;
1080	const uint64_t *src = buf;
1081	uint64_t mask;
1082	int cnt = size / sizeof (src[0]);
1083
1084	ASSERT(pqr->p && pqr->q && pqr->r);
1085
1086	for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
1087		*pqr->p ^= *src;
1088		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
1089		*pqr->q ^= *src;
1090		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
1091		*pqr->r ^= *src;
1092	}
1093
1094	return (0);
1095}
1096
1097static void
1098vdev_raidz_generate_parity_p(raidz_row_t *rr)
1099{
1100	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1101
1102	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1103		abd_t *src = rr->rr_col[c].rc_abd;
1104
1105		if (c == rr->rr_firstdatacol) {
1106			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1107		} else {
1108			struct pqr_struct pqr = { p, NULL, NULL };
1109			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1110			    vdev_raidz_p_func, &pqr);
1111		}
1112	}
1113}
1114
1115static void
1116vdev_raidz_generate_parity_pq(raidz_row_t *rr)
1117{
1118	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1119	uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1120	uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
1121	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1122	    rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1123
1124	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1125		abd_t *src = rr->rr_col[c].rc_abd;
1126
1127		uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
1128
1129		if (c == rr->rr_firstdatacol) {
1130			ASSERT(ccnt == pcnt || ccnt == 0);
1131			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1132			(void) memcpy(q, p, rr->rr_col[c].rc_size);
1133
1134			for (uint64_t i = ccnt; i < pcnt; i++) {
1135				p[i] = 0;
1136				q[i] = 0;
1137			}
1138		} else {
1139			struct pqr_struct pqr = { p, q, NULL };
1140
1141			ASSERT(ccnt <= pcnt);
1142			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1143			    vdev_raidz_pq_func, &pqr);
1144
1145			/*
1146			 * Treat short columns as though they are full of 0s.
1147			 * Note that there's therefore nothing needed for P.
1148			 */
1149			uint64_t mask;
1150			for (uint64_t i = ccnt; i < pcnt; i++) {
1151				VDEV_RAIDZ_64MUL_2(q[i], mask);
1152			}
1153		}
1154	}
1155}
1156
1157static void
1158vdev_raidz_generate_parity_pqr(raidz_row_t *rr)
1159{
1160	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1161	uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1162	uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd);
1163	uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
1164	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1165	    rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1166	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1167	    rr->rr_col[VDEV_RAIDZ_R].rc_size);
1168
1169	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1170		abd_t *src = rr->rr_col[c].rc_abd;
1171
1172		uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
1173
1174		if (c == rr->rr_firstdatacol) {
1175			ASSERT(ccnt == pcnt || ccnt == 0);
1176			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1177			(void) memcpy(q, p, rr->rr_col[c].rc_size);
1178			(void) memcpy(r, p, rr->rr_col[c].rc_size);
1179
1180			for (uint64_t i = ccnt; i < pcnt; i++) {
1181				p[i] = 0;
1182				q[i] = 0;
1183				r[i] = 0;
1184			}
1185		} else {
1186			struct pqr_struct pqr = { p, q, r };
1187
1188			ASSERT(ccnt <= pcnt);
1189			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1190			    vdev_raidz_pqr_func, &pqr);
1191
1192			/*
1193			 * Treat short columns as though they are full of 0s.
1194			 * Note that there's therefore nothing needed for P.
1195			 */
1196			uint64_t mask;
1197			for (uint64_t i = ccnt; i < pcnt; i++) {
1198				VDEV_RAIDZ_64MUL_2(q[i], mask);
1199				VDEV_RAIDZ_64MUL_4(r[i], mask);
1200			}
1201		}
1202	}
1203}
1204
1205/*
1206 * Generate RAID parity in the first virtual columns according to the number of
1207 * parity columns available.
1208 */
1209void
1210vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr)
1211{
1212	if (rr->rr_cols == 0) {
1213		/*
1214		 * We are handling this block one row at a time (because
1215		 * this block has a different logical vs physical width,
1216		 * due to RAIDZ expansion), and this is a pad-only row,
1217		 * which has no parity.
1218		 */
1219		return;
1220	}
1221
1222	/* Generate using the new math implementation */
1223	if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL)
1224		return;
1225
1226	switch (rr->rr_firstdatacol) {
1227	case 1:
1228		vdev_raidz_generate_parity_p(rr);
1229		break;
1230	case 2:
1231		vdev_raidz_generate_parity_pq(rr);
1232		break;
1233	case 3:
1234		vdev_raidz_generate_parity_pqr(rr);
1235		break;
1236	default:
1237		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
1238	}
1239}
1240
1241void
1242vdev_raidz_generate_parity(raidz_map_t *rm)
1243{
1244	for (int i = 0; i < rm->rm_nrows; i++) {
1245		raidz_row_t *rr = rm->rm_row[i];
1246		vdev_raidz_generate_parity_row(rm, rr);
1247	}
1248}
1249
1250static int
1251vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
1252{
1253	(void) private;
1254	uint64_t *dst = dbuf;
1255	uint64_t *src = sbuf;
1256	int cnt = size / sizeof (src[0]);
1257
1258	for (int i = 0; i < cnt; i++) {
1259		dst[i] ^= src[i];
1260	}
1261
1262	return (0);
1263}
1264
1265static int
1266vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
1267    void *private)
1268{
1269	(void) private;
1270	uint64_t *dst = dbuf;
1271	uint64_t *src = sbuf;
1272	uint64_t mask;
1273	int cnt = size / sizeof (dst[0]);
1274
1275	for (int i = 0; i < cnt; i++, dst++, src++) {
1276		VDEV_RAIDZ_64MUL_2(*dst, mask);
1277		*dst ^= *src;
1278	}
1279
1280	return (0);
1281}
1282
1283static int
1284vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
1285{
1286	(void) private;
1287	uint64_t *dst = buf;
1288	uint64_t mask;
1289	int cnt = size / sizeof (dst[0]);
1290
1291	for (int i = 0; i < cnt; i++, dst++) {
1292		/* same operation as vdev_raidz_reconst_q_pre_func() on dst */
1293		VDEV_RAIDZ_64MUL_2(*dst, mask);
1294	}
1295
1296	return (0);
1297}
1298
1299struct reconst_q_struct {
1300	uint64_t *q;
1301	int exp;
1302};
1303
1304static int
1305vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
1306{
1307	struct reconst_q_struct *rq = private;
1308	uint64_t *dst = buf;
1309	int cnt = size / sizeof (dst[0]);
1310
1311	for (int i = 0; i < cnt; i++, dst++, rq->q++) {
1312		int j;
1313		uint8_t *b;
1314
1315		*dst ^= *rq->q;
1316		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
1317			*b = vdev_raidz_exp2(*b, rq->exp);
1318		}
1319	}
1320
1321	return (0);
1322}
1323
1324struct reconst_pq_struct {
1325	uint8_t *p;
1326	uint8_t *q;
1327	uint8_t *pxy;
1328	uint8_t *qxy;
1329	int aexp;
1330	int bexp;
1331};
1332
1333static int
1334vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
1335{
1336	struct reconst_pq_struct *rpq = private;
1337	uint8_t *xd = xbuf;
1338	uint8_t *yd = ybuf;
1339
1340	for (int i = 0; i < size;
1341	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
1342		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
1343		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
1344		*yd = *rpq->p ^ *rpq->pxy ^ *xd;
1345	}
1346
1347	return (0);
1348}
1349
1350static int
1351vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
1352{
1353	struct reconst_pq_struct *rpq = private;
1354	uint8_t *xd = xbuf;
1355
1356	for (int i = 0; i < size;
1357	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
1358		/* same operation as vdev_raidz_reconst_pq_func() on xd */
1359		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
1360		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
1361	}
1362
1363	return (0);
1364}
1365
1366static void
1367vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts)
1368{
1369	int x = tgts[0];
1370	abd_t *dst, *src;
1371
1372	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1373		zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x);
1374
1375	ASSERT3U(ntgts, ==, 1);
1376	ASSERT3U(x, >=, rr->rr_firstdatacol);
1377	ASSERT3U(x, <, rr->rr_cols);
1378
1379	ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size);
1380
1381	src = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
1382	dst = rr->rr_col[x].rc_abd;
1383
1384	abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size);
1385
1386	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1387		uint64_t size = MIN(rr->rr_col[x].rc_size,
1388		    rr->rr_col[c].rc_size);
1389
1390		src = rr->rr_col[c].rc_abd;
1391
1392		if (c == x)
1393			continue;
1394
1395		(void) abd_iterate_func2(dst, src, 0, 0, size,
1396		    vdev_raidz_reconst_p_func, NULL);
1397	}
1398}
1399
1400static void
1401vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts)
1402{
1403	int x = tgts[0];
1404	int c, exp;
1405	abd_t *dst, *src;
1406
1407	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1408		zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x);
1409
1410	ASSERT(ntgts == 1);
1411
1412	ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1413
1414	for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1415		uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size,
1416		    rr->rr_col[c].rc_size);
1417
1418		src = rr->rr_col[c].rc_abd;
1419		dst = rr->rr_col[x].rc_abd;
1420
1421		if (c == rr->rr_firstdatacol) {
1422			abd_copy(dst, src, size);
1423			if (rr->rr_col[x].rc_size > size) {
1424				abd_zero_off(dst, size,
1425				    rr->rr_col[x].rc_size - size);
1426			}
1427		} else {
1428			ASSERT3U(size, <=, rr->rr_col[x].rc_size);
1429			(void) abd_iterate_func2(dst, src, 0, 0, size,
1430			    vdev_raidz_reconst_q_pre_func, NULL);
1431			(void) abd_iterate_func(dst,
1432			    size, rr->rr_col[x].rc_size - size,
1433			    vdev_raidz_reconst_q_pre_tail_func, NULL);
1434		}
1435	}
1436
1437	src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
1438	dst = rr->rr_col[x].rc_abd;
1439	exp = 255 - (rr->rr_cols - 1 - x);
1440
1441	struct reconst_q_struct rq = { abd_to_buf(src), exp };
1442	(void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size,
1443	    vdev_raidz_reconst_q_post_func, &rq);
1444}
1445
1446static void
1447vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts)
1448{
1449	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
1450	abd_t *pdata, *qdata;
1451	uint64_t xsize, ysize;
1452	int x = tgts[0];
1453	int y = tgts[1];
1454	abd_t *xd, *yd;
1455
1456	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1457		zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y);
1458
1459	ASSERT(ntgts == 2);
1460	ASSERT(x < y);
1461	ASSERT(x >= rr->rr_firstdatacol);
1462	ASSERT(y < rr->rr_cols);
1463
1464	ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size);
1465
1466	/*
1467	 * Move the parity data aside -- we're going to compute parity as
1468	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
1469	 * reuse the parity generation mechanism without trashing the actual
1470	 * parity so we make those columns appear to be full of zeros by
1471	 * setting their lengths to zero.
1472	 */
1473	pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
1474	qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
1475	xsize = rr->rr_col[x].rc_size;
1476	ysize = rr->rr_col[y].rc_size;
1477
1478	rr->rr_col[VDEV_RAIDZ_P].rc_abd =
1479	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
1480	rr->rr_col[VDEV_RAIDZ_Q].rc_abd =
1481	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
1482	rr->rr_col[x].rc_size = 0;
1483	rr->rr_col[y].rc_size = 0;
1484
1485	vdev_raidz_generate_parity_pq(rr);
1486
1487	rr->rr_col[x].rc_size = xsize;
1488	rr->rr_col[y].rc_size = ysize;
1489
1490	p = abd_to_buf(pdata);
1491	q = abd_to_buf(qdata);
1492	pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1493	qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1494	xd = rr->rr_col[x].rc_abd;
1495	yd = rr->rr_col[y].rc_abd;
1496
1497	/*
1498	 * We now have:
1499	 *	Pxy = P + D_x + D_y
1500	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
1501	 *
1502	 * We can then solve for D_x:
1503	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
1504	 * where
1505	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
1506	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
1507	 *
1508	 * With D_x in hand, we can easily solve for D_y:
1509	 *	D_y = P + Pxy + D_x
1510	 */
1511
1512	a = vdev_raidz_pow2[255 + x - y];
1513	b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)];
1514	tmp = 255 - vdev_raidz_log2[a ^ 1];
1515
1516	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
1517	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
1518
1519	ASSERT3U(xsize, >=, ysize);
1520	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
1521
1522	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
1523	    vdev_raidz_reconst_pq_func, &rpq);
1524	(void) abd_iterate_func(xd, ysize, xsize - ysize,
1525	    vdev_raidz_reconst_pq_tail_func, &rpq);
1526
1527	abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1528	abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1529
1530	/*
1531	 * Restore the saved parity data.
1532	 */
1533	rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata;
1534	rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata;
1535}
1536
1537/*
1538 * In the general case of reconstruction, we must solve the system of linear
1539 * equations defined by the coefficients used to generate parity as well as
1540 * the contents of the data and parity disks. This can be expressed with
1541 * vectors for the original data (D) and the actual data (d) and parity (p)
1542 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
1543 *
1544 *            __   __                     __     __
1545 *            |     |         __     __   |  p_0  |
1546 *            |  V  |         |  D_0  |   | p_m-1 |
1547 *            |     |    x    |   :   | = |  d_0  |
1548 *            |  I  |         | D_n-1 |   |   :   |
1549 *            |     |         ~~     ~~   | d_n-1 |
1550 *            ~~   ~~                     ~~     ~~
1551 *
1552 * I is simply a square identity matrix of size n, and V is a Vandermonde
1553 * matrix defined by the coefficients we chose for the various parity columns
1554 * (1, 2, 4). Note that these values were chosen for simplicity and speedy
1555 * computation as well as for linear separability.
1556 *
1557 *      __               __               __     __
1558 *      |   1   ..  1 1 1 |               |  p_0  |
1559 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
1560 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
1561 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
1562 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
1563 *      |   :       : : : |   |   :   |   |  d_2  |
1564 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
1565 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
1566 *      |   0   ..  0 0 1 |               | d_n-1 |
1567 *      ~~               ~~               ~~     ~~
1568 *
1569 * Note that I, V, d, and p are known. To compute D, we must invert the
1570 * matrix and use the known data and parity values to reconstruct the unknown
1571 * data values. We begin by removing the rows in V|I and d|p that correspond
1572 * to failed or missing columns; we then make V|I square (n x n) and d|p
1573 * sized n by removing rows corresponding to unused parity from the bottom up
1574 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
1575 * using Gauss-Jordan elimination. In the example below we use m=3 parity
1576 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
1577 *           __                               __
1578 *           |  1   1   1   1   1   1   1   1  |
1579 *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
1580 *           |  19 205 116  29  64  16  4   1  |      / /
1581 *           |  1   0   0   0   0   0   0   0  |     / /
1582 *           |  0   1   0   0   0   0   0   0  | <--' /
1583 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
1584 *           |  0   0   0   1   0   0   0   0  |
1585 *           |  0   0   0   0   1   0   0   0  |
1586 *           |  0   0   0   0   0   1   0   0  |
1587 *           |  0   0   0   0   0   0   1   0  |
1588 *           |  0   0   0   0   0   0   0   1  |
1589 *           ~~                               ~~
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           |  19 205 116  29  64  16  4   1  |
 *           |  1   0   0   0   0   0   0   0  |
 *  (V|I)' = |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
1603 *
1604 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1605 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1606 * matrix is not singular.
1607 * __                                                                 __
1608 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1609 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1610 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1611 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1612 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1613 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1614 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1615 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1616 * ~~                                                                 ~~
1617 * __                                                                 __
1618 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1619 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1620 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1621 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1622 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1623 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1624 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1625 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1626 * ~~                                                                 ~~
1627 * __                                                                 __
1628 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1629 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1630 * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1631 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1632 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1633 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1634 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1635 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1636 * ~~                                                                 ~~
1637 * __                                                                 __
1638 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1639 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1640 * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1641 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1642 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1643 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1644 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1645 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1646 * ~~                                                                 ~~
1647 * __                                                                 __
1648 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1649 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1650 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1651 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1652 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1653 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1654 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1655 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1656 * ~~                                                                 ~~
1657 * __                                                                 __
1658 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1659 * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1660 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1661 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1662 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1663 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1664 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1665 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1666 * ~~                                                                 ~~
1667 *                   __                               __
1668 *                   |  0   0   1   0   0   0   0   0  |
1669 *                   | 167 100  5   41 159 169 217 208 |
1670 *                   | 166 100  4   40 158 168 216 209 |
1671 *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1672 *                   |  0   0   0   0   1   0   0   0  |
1673 *                   |  0   0   0   0   0   1   0   0  |
1674 *                   |  0   0   0   0   0   0   1   0  |
1675 *                   |  0   0   0   0   0   0   0   1  |
1676 *                   ~~                               ~~
1677 *
1678 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1679 * of the missing data.
1680 *
1681 * As is apparent from the example above, the only non-trivial rows in the
1682 * inverse matrix correspond to the data disks that we're trying to
1683 * reconstruct. Indeed, those are the only rows we need as the others would
1684 * only be useful for reconstructing data known or assumed to be valid. For
1685 * that reason, we only build the coefficients in the rows that correspond to
1686 * targeted columns.
1687 */
1688
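/*
 * As a bridge between the example above and the code below:
 * vdev_raidz_matrix_init() constructs exactly these Vandermonde rows for the
 * parity columns of interest, with rows[i][j] = 2^(map[i] * (n - 1 - j)) and
 * the exponent reduced mod 255.  A map value of 0 produces the all-ones P
 * row, 1 produces the Q row of descending powers of 2 (128 64 ... 2 1 for
 * n = 8), and 2 produces the R row of descending powers of 4 (19 205 116
 * ... 4 1 in the example).
 */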
1689static void
1690vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map,
1691    uint8_t **rows)
1692{
1693	int i, j;
1694	int pow;
1695
1696	ASSERT(n == rr->rr_cols - rr->rr_firstdatacol);
1697
1698	/*
1699	 * Fill in the missing rows of interest.
1700	 */
1701	for (i = 0; i < nmap; i++) {
1702		ASSERT3S(0, <=, map[i]);
1703		ASSERT3S(map[i], <=, 2);
1704
1705		pow = map[i] * n;
1706		if (pow > 255)
1707			pow -= 255;
1708		ASSERT(pow <= 255);
1709
1710		for (j = 0; j < n; j++) {
1711			pow -= map[i];
1712			if (pow < 0)
1713				pow += 255;
1714			rows[i][j] = vdev_raidz_pow2[pow];
1715		}
1716	}
1717}
1718
1719static void
1720vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing,
1721    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1722{
1723	int i, j, ii, jj;
1724	uint8_t log;
1725
1726	/*
1727	 * Assert that the first nmissing entries from the array of used
1728	 * columns correspond to parity columns and that subsequent entries
1729	 * correspond to data columns.
1730	 */
1731	for (i = 0; i < nmissing; i++) {
1732		ASSERT3S(used[i], <, rr->rr_firstdatacol);
1733	}
1734	for (; i < n; i++) {
1735		ASSERT3S(used[i], >=, rr->rr_firstdatacol);
1736	}
1737
1738	/*
1739	 * First initialize the storage where we'll compute the inverse rows.
1740	 */
1741	for (i = 0; i < nmissing; i++) {
1742		for (j = 0; j < n; j++) {
1743			invrows[i][j] = (i == j) ? 1 : 0;
1744		}
1745	}
1746
1747	/*
1748	 * Subtract all trivial rows from the rows of consequence.
1749	 */
1750	for (i = 0; i < nmissing; i++) {
1751		for (j = nmissing; j < n; j++) {
1752			ASSERT3U(used[j], >=, rr->rr_firstdatacol);
1753			jj = used[j] - rr->rr_firstdatacol;
1754			ASSERT3S(jj, <, n);
1755			invrows[i][j] = rows[i][jj];
1756			rows[i][jj] = 0;
1757		}
1758	}
1759
1760	/*
1761	 * For each of the rows of interest, we must normalize it and subtract
1762	 * a multiple of it from the other rows.
1763	 */
1764	for (i = 0; i < nmissing; i++) {
1765		for (j = 0; j < missing[i]; j++) {
1766			ASSERT0(rows[i][j]);
1767		}
1768		ASSERT3U(rows[i][missing[i]], !=, 0);
1769
1770		/*
1771		 * Compute the inverse of the first element and multiply each
1772		 * element in the row by that value.
1773		 */
1774		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1775
1776		for (j = 0; j < n; j++) {
1777			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1778			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1779		}
1780
1781		for (ii = 0; ii < nmissing; ii++) {
1782			if (i == ii)
1783				continue;
1784
1785			ASSERT3U(rows[ii][missing[i]], !=, 0);
1786
1787			log = vdev_raidz_log2[rows[ii][missing[i]]];
1788
1789			for (j = 0; j < n; j++) {
1790				rows[ii][j] ^=
1791				    vdev_raidz_exp2(rows[i][j], log);
1792				invrows[ii][j] ^=
1793				    vdev_raidz_exp2(invrows[i][j], log);
1794			}
1795		}
1796	}
1797
1798	/*
	 * Verify that the data left in the rows is properly part of
	 * an identity matrix.
1801	 */
1802	for (i = 0; i < nmissing; i++) {
1803		for (j = 0; j < n; j++) {
1804			if (j == missing[i]) {
1805				ASSERT3U(rows[i][j], ==, 1);
1806			} else {
1807				ASSERT0(rows[i][j]);
1808			}
1809		}
1810	}
1811}
1812
1813static void
1814vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing,
1815    int *missing, uint8_t **invrows, const uint8_t *used)
1816{
1817	int i, j, x, cc, c;
1818	uint8_t *src;
1819	uint64_t ccount;
1820	uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL };
1821	uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 };
1822	uint8_t log = 0;
1823	uint8_t val;
1824	int ll;
1825	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1826	uint8_t *p, *pp;
1827	size_t psize;
1828
1829	psize = sizeof (invlog[0][0]) * n * nmissing;
1830	p = kmem_alloc(psize, KM_SLEEP);
1831
1832	for (pp = p, i = 0; i < nmissing; i++) {
1833		invlog[i] = pp;
1834		pp += n;
1835	}
1836
1837	for (i = 0; i < nmissing; i++) {
1838		for (j = 0; j < n; j++) {
1839			ASSERT3U(invrows[i][j], !=, 0);
1840			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1841		}
1842	}
1843
1844	for (i = 0; i < n; i++) {
1845		c = used[i];
1846		ASSERT3U(c, <, rr->rr_cols);
1847
1848		ccount = rr->rr_col[c].rc_size;
1849		ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0);
1850		if (ccount == 0)
1851			continue;
1852		src = abd_to_buf(rr->rr_col[c].rc_abd);
1853		for (j = 0; j < nmissing; j++) {
1854			cc = missing[j] + rr->rr_firstdatacol;
1855			ASSERT3U(cc, >=, rr->rr_firstdatacol);
1856			ASSERT3U(cc, <, rr->rr_cols);
1857			ASSERT3U(cc, !=, c);
1858
1859			dcount[j] = rr->rr_col[cc].rc_size;
1860			if (dcount[j] != 0)
1861				dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd);
1862		}
1863
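		/*
		 * Log-domain multiply-accumulate: each destination byte is
		 * the GF(2^8) sum (XOR) over the used columns of
		 * invrows[cc][i] * src[x], where each product is computed
		 * as 2^(invlog[cc][i] + log2(src[x])) with the exponent
		 * reduced mod 255; zero source bytes contribute nothing.
		 */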
1864		for (x = 0; x < ccount; x++, src++) {
1865			if (*src != 0)
1866				log = vdev_raidz_log2[*src];
1867
1868			for (cc = 0; cc < nmissing; cc++) {
1869				if (x >= dcount[cc])
1870					continue;
1871
1872				if (*src == 0) {
1873					val = 0;
1874				} else {
1875					if ((ll = log + invlog[cc][i]) >= 255)
1876						ll -= 255;
1877					val = vdev_raidz_pow2[ll];
1878				}
1879
1880				if (i == 0)
1881					dst[cc][x] = val;
1882				else
1883					dst[cc][x] ^= val;
1884			}
1885		}
1886	}
1887
1888	kmem_free(p, psize);
1889}
1890
1891static void
1892vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
1893{
1894	int i, c, t, tt;
1895	unsigned int n;
1896	unsigned int nmissing_rows;
1897	int missing_rows[VDEV_RAIDZ_MAXPARITY];
1898	int parity_map[VDEV_RAIDZ_MAXPARITY];
1899	uint8_t *p, *pp;
1900	size_t psize;
1901	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1902	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1903	uint8_t *used;
1904
1905	abd_t **bufs = NULL;
1906
1907	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1908		zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts);
1909	/*
1910	 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
1911	 * temporary linear ABDs if any non-linear ABDs are found.
1912	 */
1913	for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) {
1914		ASSERT(rr->rr_col[i].rc_abd != NULL);
1915		if (!abd_is_linear(rr->rr_col[i].rc_abd)) {
1916			bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *),
1917			    KM_PUSHPAGE);
1918
1919			for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1920				raidz_col_t *col = &rr->rr_col[c];
1921
1922				bufs[c] = col->rc_abd;
1923				if (bufs[c] != NULL) {
1924					col->rc_abd = abd_alloc_linear(
1925					    col->rc_size, B_TRUE);
1926					abd_copy(col->rc_abd, bufs[c],
1927					    col->rc_size);
1928				}
1929			}
1930
1931			break;
1932		}
1933	}
1934
1935	n = rr->rr_cols - rr->rr_firstdatacol;
1936
1937	/*
1938	 * Figure out which data columns are missing.
1939	 */
1940	nmissing_rows = 0;
1941	for (t = 0; t < ntgts; t++) {
1942		if (tgts[t] >= rr->rr_firstdatacol) {
1943			missing_rows[nmissing_rows++] =
1944			    tgts[t] - rr->rr_firstdatacol;
1945		}
1946	}
1947
1948	/*
1949	 * Figure out which parity columns to use to help generate the missing
1950	 * data columns.
1951	 */
1952	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1953		ASSERT(tt < ntgts);
1954		ASSERT(c < rr->rr_firstdatacol);
1955
1956		/*
1957		 * Skip any targeted parity columns.
1958		 */
1959		if (c == tgts[tt]) {
1960			tt++;
1961			continue;
1962		}
1963
1964		parity_map[i] = c;
1965		i++;
1966	}
1967
1968	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1969	    nmissing_rows * n + sizeof (used[0]) * n;
1970	p = kmem_alloc(psize, KM_SLEEP);
1971
1972	for (pp = p, i = 0; i < nmissing_rows; i++) {
1973		rows[i] = pp;
1974		pp += n;
1975		invrows[i] = pp;
1976		pp += n;
1977	}
1978	used = pp;
1979
1980	for (i = 0; i < nmissing_rows; i++) {
1981		used[i] = parity_map[i];
1982	}
1983
1984	for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1985		if (tt < nmissing_rows &&
1986		    c == missing_rows[tt] + rr->rr_firstdatacol) {
1987			tt++;
1988			continue;
1989		}
1990
1991		ASSERT3S(i, <, n);
1992		used[i] = c;
1993		i++;
1994	}
1995
1996	/*
1997	 * Initialize the interesting rows of the matrix.
1998	 */
1999	vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows);
2000
2001	/*
2002	 * Invert the matrix.
2003	 */
2004	vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows,
2005	    invrows, used);
2006
2007	/*
2008	 * Reconstruct the missing data using the generated matrix.
2009	 */
2010	vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows,
2011	    invrows, used);
2012
2013	kmem_free(p, psize);
2014
2015	/*
2016	 * copy back from temporary linear abds and free them
2017	 */
2018	if (bufs) {
2019		for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
2020			raidz_col_t *col = &rr->rr_col[c];
2021
2022			if (bufs[c] != NULL) {
2023				abd_copy(bufs[c], col->rc_abd, col->rc_size);
2024				abd_free(col->rc_abd);
2025			}
2026			col->rc_abd = bufs[c];
2027		}
2028		kmem_free(bufs, rr->rr_cols * sizeof (abd_t *));
2029	}
2030}
2031
2032static void
2033vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr,
2034    const int *t, int nt)
2035{
2036	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
2037	int ntgts;
2038	int i, c, ret;
2039	int nbadparity, nbaddata;
2040	int parity_valid[VDEV_RAIDZ_MAXPARITY];
2041
2042	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
2043		zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)",
2044		    rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata,
2045		    (int)rr->rr_missingparity);
2046	}
2047
2048	nbadparity = rr->rr_firstdatacol;
2049	nbaddata = rr->rr_cols - nbadparity;
2050	ntgts = 0;
2051	for (i = 0, c = 0; c < rr->rr_cols; c++) {
2052		if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
2053			zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u "
2054			    "offset=%llx error=%u)",
2055			    rr, c, (int)rr->rr_col[c].rc_devidx,
2056			    (long long)rr->rr_col[c].rc_offset,
2057			    (int)rr->rr_col[c].rc_error);
2058		}
2059		if (c < rr->rr_firstdatacol)
2060			parity_valid[c] = B_FALSE;
2061
2062		if (i < nt && c == t[i]) {
2063			tgts[ntgts++] = c;
2064			i++;
2065		} else if (rr->rr_col[c].rc_error != 0) {
2066			tgts[ntgts++] = c;
2067		} else if (c >= rr->rr_firstdatacol) {
2068			nbaddata--;
2069		} else {
2070			parity_valid[c] = B_TRUE;
2071			nbadparity--;
2072		}
2073	}
2074
2075	ASSERT(ntgts >= nt);
2076	ASSERT(nbaddata >= 0);
2077	ASSERT(nbaddata + nbadparity == ntgts);
2078
2079	dt = &tgts[nbadparity];
2080
2081	/* Reconstruct using the new math implementation */
2082	ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata);
2083	if (ret != RAIDZ_ORIGINAL_IMPL)
2084		return;
2085
2086	/*
2087	 * See if we can use any of our optimized reconstruction routines.
2088	 */
2089	switch (nbaddata) {
2090	case 1:
2091		if (parity_valid[VDEV_RAIDZ_P]) {
2092			vdev_raidz_reconstruct_p(rr, dt, 1);
2093			return;
2094		}
2095
2096		ASSERT(rr->rr_firstdatacol > 1);
2097
2098		if (parity_valid[VDEV_RAIDZ_Q]) {
2099			vdev_raidz_reconstruct_q(rr, dt, 1);
2100			return;
2101		}
2102
2103		ASSERT(rr->rr_firstdatacol > 2);
2104		break;
2105
2106	case 2:
2107		ASSERT(rr->rr_firstdatacol > 1);
2108
2109		if (parity_valid[VDEV_RAIDZ_P] &&
2110		    parity_valid[VDEV_RAIDZ_Q]) {
2111			vdev_raidz_reconstruct_pq(rr, dt, 2);
2112			return;
2113		}
2114
2115		ASSERT(rr->rr_firstdatacol > 2);
2116
2117		break;
2118	}
2119
2120	vdev_raidz_reconstruct_general(rr, tgts, ntgts);
2121}
2122
2123static int
2124vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
2125    uint64_t *logical_ashift, uint64_t *physical_ashift)
2126{
2127	vdev_raidz_t *vdrz = vd->vdev_tsd;
2128	uint64_t nparity = vdrz->vd_nparity;
2129	int c;
2130	int lasterror = 0;
2131	int numerrors = 0;
2132
2133	ASSERT(nparity > 0);
2134
2135	if (nparity > VDEV_RAIDZ_MAXPARITY ||
2136	    vd->vdev_children < nparity + 1) {
2137		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
2138		return (SET_ERROR(EINVAL));
2139	}
2140
2141	vdev_open_children(vd);
2142
2143	for (c = 0; c < vd->vdev_children; c++) {
2144		vdev_t *cvd = vd->vdev_child[c];
2145
2146		if (cvd->vdev_open_error != 0) {
2147			lasterror = cvd->vdev_open_error;
2148			numerrors++;
2149			continue;
2150		}
2151
2152		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
2153		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
2154		*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
2155	}
2156	for (c = 0; c < vd->vdev_children; c++) {
2157		vdev_t *cvd = vd->vdev_child[c];
2158
2159		if (cvd->vdev_open_error != 0)
2160			continue;
2161		*physical_ashift = vdev_best_ashift(*logical_ashift,
2162		    *physical_ashift, cvd->vdev_physical_ashift);
2163	}
2164
2165	if (vd->vdev_rz_expanding) {
2166		*asize *= vd->vdev_children - 1;
2167		*max_asize *= vd->vdev_children - 1;
2168
2169		vd->vdev_min_asize = *asize;
2170	} else {
2171		*asize *= vd->vdev_children;
2172		*max_asize *= vd->vdev_children;
2173	}
2174
2175	if (numerrors > nparity) {
2176		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
2177		return (lasterror);
2178	}
2179
2180	return (0);
2181}
2182
2183static void
2184vdev_raidz_close(vdev_t *vd)
2185{
2186	for (int c = 0; c < vd->vdev_children; c++) {
2187		if (vd->vdev_child[c] != NULL)
2188			vdev_close(vd->vdev_child[c]);
2189	}
2190}
2191
2192/*
2193 * Return the logical width to use, given the txg in which the allocation
2194 * happened.  Note that BP_GET_BIRTH() is usually the txg in which the
 * BP was allocated.  Remapped BPs (those relocated due to device removal;
 * see remap_blkptr_cb()) will have a more recent physical birth, which
 * reflects when the BP was relocated, but we can ignore these because they
 * can't be on RAIDZ (device removal doesn't support RAIDZ).
2199 */
2200static uint64_t
2201vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg)
2202{
2203	reflow_node_t lookup = {
2204		.re_txg = txg,
2205	};
2206	avl_index_t where;
2207
2208	uint64_t width;
2209	mutex_enter(&vdrz->vd_expand_lock);
2210	reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where);
2211	if (re != NULL) {
2212		width = re->re_logical_width;
2213	} else {
2214		re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE);
2215		if (re != NULL)
2216			width = re->re_logical_width;
2217		else
2218			width = vdrz->vd_original_width;
2219	}
2220	mutex_exit(&vdrz->vd_expand_lock);
2221	return (width);
2222}
2223
2224/*
 * Note: If the RAIDZ vdev has been expanded, older BPs may have allocated
2226 * more space due to the lower data-to-parity ratio.  In this case it's
2227 * important to pass in the correct txg.  Note that vdev_gang_header_asize()
2228 * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE,
2229 * regardless of txg.  This is assured because for a single data sector, we
2230 * allocate P+1 sectors regardless of width ("cols", which is at least P+1).
2231 */
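/*
 * A purely illustrative example of the calculation below: with ashift = 12,
 * nparity = 2 and cols = 6, a 32K psize is 8 data sectors; parity adds
 * 2 * ceil(8 / 4) = 4 sectors, and rounding the total of 12 up to a multiple
 * of nparity + 1 = 3 leaves it at 12 sectors, i.e. an asize of 48K.
 */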
2232static uint64_t
2233vdev_raidz_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
2234{
2235	vdev_raidz_t *vdrz = vd->vdev_tsd;
2236	uint64_t asize;
2237	uint64_t ashift = vd->vdev_top->vdev_ashift;
2238	uint64_t cols = vdrz->vd_original_width;
2239	uint64_t nparity = vdrz->vd_nparity;
2240
2241	cols = vdev_raidz_get_logical_width(vdrz, txg);
2242
2243	asize = ((psize - 1) >> ashift) + 1;
2244	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
2245	asize = roundup(asize, nparity + 1) << ashift;
2246
2247#ifdef ZFS_DEBUG
2248	uint64_t asize_new = ((psize - 1) >> ashift) + 1;
2249	uint64_t ncols_new = vdrz->vd_physical_width;
2250	asize_new += nparity * ((asize_new + ncols_new - nparity - 1) /
2251	    (ncols_new - nparity));
2252	asize_new = roundup(asize_new, nparity + 1) << ashift;
2253	VERIFY3U(asize_new, <=, asize);
2254#endif
2255
2256	return (asize);
2257}
2258
2259/*
2260 * The allocatable space for a raidz vdev is N * sizeof(smallest child)
2261 * so each child must provide at least 1/Nth of its asize.
2262 */
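/*
 * For example (illustrative numbers only): a 6-child raidz vdev with a
 * vdev_min_asize of 600GB requires each child to provide at least 100GB.
 */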
2263static uint64_t
2264vdev_raidz_min_asize(vdev_t *vd)
2265{
2266	return ((vd->vdev_min_asize + vd->vdev_children - 1) /
2267	    vd->vdev_children);
2268}
2269
2270void
2271vdev_raidz_child_done(zio_t *zio)
2272{
2273	raidz_col_t *rc = zio->io_private;
2274
2275	ASSERT3P(rc->rc_abd, !=, NULL);
2276	rc->rc_error = zio->io_error;
2277	rc->rc_tried = 1;
2278	rc->rc_skipped = 0;
2279}
2280
2281static void
2282vdev_raidz_shadow_child_done(zio_t *zio)
2283{
2284	raidz_col_t *rc = zio->io_private;
2285
2286	rc->rc_shadow_error = zio->io_error;
2287}
2288
2289static void
2290vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col)
2291{
2292	(void) rm;
2293#ifdef ZFS_DEBUG
2294	range_seg64_t logical_rs, physical_rs, remain_rs;
2295	logical_rs.rs_start = rr->rr_offset;
2296	logical_rs.rs_end = logical_rs.rs_start +
2297	    vdev_raidz_asize(zio->io_vd, rr->rr_size,
2298	    BP_GET_BIRTH(zio->io_bp));
2299
2300	raidz_col_t *rc = &rr->rr_col[col];
2301	vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
2302
2303	vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs);
2304	ASSERT(vdev_xlate_is_empty(&remain_rs));
2305	if (vdev_xlate_is_empty(&physical_rs)) {
2306		/*
2307		 * If we are in the middle of expansion, the
2308		 * physical->logical mapping is changing so vdev_xlate()
2309		 * can't give us a reliable answer.
2310		 */
2311		return;
2312	}
2313	ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
2314	ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
2315	/*
2316	 * It would be nice to assert that rs_end is equal
2317	 * to rc_offset + rc_size but there might be an
	 * optional I/O at the end that is not accounted for in
2319	 * rc_size.
2320	 */
2321	if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
2322		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
2323		    rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift));
2324	} else {
2325		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
2326	}
2327#endif
2328}
2329
2330static void
2331vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr)
2332{
2333	vdev_t *vd = zio->io_vd;
2334	raidz_map_t *rm = zio->io_vsd;
2335
2336	vdev_raidz_generate_parity_row(rm, rr);
2337
2338	for (int c = 0; c < rr->rr_scols; c++) {
2339		raidz_col_t *rc = &rr->rr_col[c];
2340		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2341
2342		/* Verify physical to logical translation */
2343		vdev_raidz_io_verify(zio, rm, rr, c);
2344
2345		if (rc->rc_size == 0)
2346			continue;
2347
2348		ASSERT3U(rc->rc_offset + rc->rc_size, <,
2349		    cvd->vdev_psize - VDEV_LABEL_END_SIZE);
2350
2351		ASSERT3P(rc->rc_abd, !=, NULL);
2352		zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2353		    rc->rc_offset, rc->rc_abd,
2354		    abd_get_size(rc->rc_abd), zio->io_type,
2355		    zio->io_priority, 0, vdev_raidz_child_done, rc));
2356
2357		if (rc->rc_shadow_devidx != INT_MAX) {
2358			vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx];
2359
2360			ASSERT3U(
2361			    rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <,
2362			    cvd2->vdev_psize - VDEV_LABEL_END_SIZE);
2363
2364			zio_nowait(zio_vdev_child_io(zio, NULL, cvd2,
2365			    rc->rc_shadow_offset, rc->rc_abd,
2366			    abd_get_size(rc->rc_abd),
2367			    zio->io_type, zio->io_priority, 0,
2368			    vdev_raidz_shadow_child_done, rc));
2369		}
2370	}
2371}
2372
2373/*
2374 * Generate optional I/Os for skip sectors to improve aggregation contiguity.
2375 * This only works for vdev_raidz_map_alloc() (not _expanded()).
2376 */
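/*
 * As an illustrative example: on a raidz1 vdev, a block occupying two data
 * sectors plus one parity sector is padded with a single skip sector so the
 * allocation is a multiple of nparity + 1 = 2.  The optional ZIO_FLAG_NODATA
 * write issued below covers that skip sector so adjacent writes can still be
 * aggregated across it.
 */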
2377static void
2378raidz_start_skip_writes(zio_t *zio)
2379{
2380	vdev_t *vd = zio->io_vd;
2381	uint64_t ashift = vd->vdev_top->vdev_ashift;
2382	raidz_map_t *rm = zio->io_vsd;
2383	ASSERT3U(rm->rm_nrows, ==, 1);
2384	raidz_row_t *rr = rm->rm_row[0];
2385	for (int c = 0; c < rr->rr_scols; c++) {
2386		raidz_col_t *rc = &rr->rr_col[c];
2387		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2388		if (rc->rc_size != 0)
2389			continue;
2390		ASSERT3P(rc->rc_abd, ==, NULL);
2391
2392		ASSERT3U(rc->rc_offset, <,
2393		    cvd->vdev_psize - VDEV_LABEL_END_SIZE);
2394
2395		zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset,
2396		    NULL, 1ULL << ashift, zio->io_type, zio->io_priority,
2397		    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
2398	}
2399}
2400
2401static void
2402vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
2403{
2404	vdev_t *vd = zio->io_vd;
2405
2406	/*
2407	 * Iterate over the columns in reverse order so that we hit the parity
2408	 * last -- any errors along the way will force us to read the parity.
2409	 */
2410	for (int c = rr->rr_cols - 1; c >= 0; c--) {
2411		raidz_col_t *rc = &rr->rr_col[c];
2412		if (rc->rc_size == 0)
2413			continue;
2414		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2415		if (!vdev_readable(cvd)) {
2416			if (c >= rr->rr_firstdatacol)
2417				rr->rr_missingdata++;
2418			else
2419				rr->rr_missingparity++;
2420			rc->rc_error = SET_ERROR(ENXIO);
2421			rc->rc_tried = 1;	/* don't even try */
2422			rc->rc_skipped = 1;
2423			continue;
2424		}
2425		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
2426			if (c >= rr->rr_firstdatacol)
2427				rr->rr_missingdata++;
2428			else
2429				rr->rr_missingparity++;
2430			rc->rc_error = SET_ERROR(ESTALE);
2431			rc->rc_skipped = 1;
2432			continue;
2433		}
2434		if (forceparity ||
2435		    c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
2436		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
2437			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2438			    rc->rc_offset, rc->rc_abd, rc->rc_size,
2439			    zio->io_type, zio->io_priority, 0,
2440			    vdev_raidz_child_done, rc));
2441		}
2442	}
2443}
2444
2445static void
2446vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm)
2447{
2448	vdev_t *vd = zio->io_vd;
2449
2450	for (int i = 0; i < rm->rm_nphys_cols; i++) {
2451		raidz_col_t *prc = &rm->rm_phys_col[i];
2452		if (prc->rc_size == 0)
2453			continue;
2454
2455		ASSERT3U(prc->rc_devidx, ==, i);
2456		vdev_t *cvd = vd->vdev_child[i];
2457		if (!vdev_readable(cvd)) {
2458			prc->rc_error = SET_ERROR(ENXIO);
2459			prc->rc_tried = 1;	/* don't even try */
2460			prc->rc_skipped = 1;
2461			continue;
2462		}
2463		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
2464			prc->rc_error = SET_ERROR(ESTALE);
2465			prc->rc_skipped = 1;
2466			continue;
2467		}
2468		zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2469		    prc->rc_offset, prc->rc_abd, prc->rc_size,
2470		    zio->io_type, zio->io_priority, 0,
2471		    vdev_raidz_child_done, prc));
2472	}
2473}
2474
2475static void
2476vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm)
2477{
2478	/*
2479	 * If there are multiple rows, we will be hitting
2480	 * all disks, so go ahead and read the parity so
2481	 * that we are reading in decent size chunks.
2482	 */
2483	boolean_t forceparity = rm->rm_nrows > 1;
2484
2485	if (rm->rm_phys_col) {
2486		vdev_raidz_io_start_read_phys_cols(zio, rm);
2487	} else {
2488		for (int i = 0; i < rm->rm_nrows; i++) {
2489			raidz_row_t *rr = rm->rm_row[i];
2490			vdev_raidz_io_start_read_row(zio, rr, forceparity);
2491		}
2492	}
2493}
2494
2495/*
2496 * Start an IO operation on a RAIDZ VDev
2497 *
2498 * Outline:
2499 * - For write operations:
2500 *   1. Generate the parity data
2501 *   2. Create child zio write operations to each column's vdev, for both
2502 *      data and parity.
2503 *   3. If the column skips any sectors for padding, create optional dummy
2504 *      write zio children for those areas to improve aggregation continuity.
2505 * - For read operations:
2506 *   1. Create child zio read operations to each data column's vdev to read
2507 *      the range of data required for zio.
2508 *   2. If this is a scrub or resilver operation, or if any of the data
2509 *      vdevs have had errors, then create zio read operations to the parity
2510 *      columns' VDevs as well.
2511 */
2512static void
2513vdev_raidz_io_start(zio_t *zio)
2514{
2515	vdev_t *vd = zio->io_vd;
2516	vdev_t *tvd = vd->vdev_top;
2517	vdev_raidz_t *vdrz = vd->vdev_tsd;
2518	raidz_map_t *rm;
2519
2520	uint64_t logical_width = vdev_raidz_get_logical_width(vdrz,
2521	    BP_GET_BIRTH(zio->io_bp));
2522	if (logical_width != vdrz->vd_physical_width) {
2523		zfs_locked_range_t *lr = NULL;
2524		uint64_t synced_offset = UINT64_MAX;
2525		uint64_t next_offset = UINT64_MAX;
2526		boolean_t use_scratch = B_FALSE;
2527		/*
2528		 * Note: when the expansion is completing, we set
2529		 * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync())
2530		 * in a later txg than when we last update spa_ubsync's state
2531		 * (see the end of spa_raidz_expand_thread()).  Therefore we
2532		 * may see vre_state!=SCANNING before
2533		 * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected
2534		 * on disk, but the copying progress has been synced to disk
2535		 * (and reflected in spa_ubsync).  In this case it's fine to
2536		 * treat the expansion as completed, since if we crash there's
2537		 * no additional copying to do.
2538		 */
2539		if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
2540			ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==,
2541			    &vdrz->vn_vre);
2542			lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock,
2543			    zio->io_offset, zio->io_size, RL_READER);
2544			use_scratch =
2545			    (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) ==
2546			    RRSS_SCRATCH_VALID);
2547			synced_offset =
2548			    RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync);
2549			next_offset = vdrz->vn_vre.vre_offset;
2550			/*
2551			 * If we haven't resumed expanding since importing the
2552			 * pool, vre_offset won't have been set yet.  In
2553			 * this case the next offset to be copied is the same
2554			 * as what was synced.
2555			 */
2556			if (next_offset == UINT64_MAX) {
2557				next_offset = synced_offset;
2558			}
2559		}
2560		if (use_scratch) {
2561			zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced="
2562			    "%lld next_offset=%lld use_scratch=%u",
2563			    zio,
2564			    zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ",
2565			    (long long)zio->io_offset,
2566			    (long long)synced_offset,
2567			    (long long)next_offset,
2568			    use_scratch);
2569		}
2570
2571		rm = vdev_raidz_map_alloc_expanded(zio,
2572		    tvd->vdev_ashift, vdrz->vd_physical_width,
2573		    logical_width, vdrz->vd_nparity,
2574		    synced_offset, next_offset, use_scratch);
2575		rm->rm_lr = lr;
2576	} else {
2577		rm = vdev_raidz_map_alloc(zio,
2578		    tvd->vdev_ashift, logical_width, vdrz->vd_nparity);
2579	}
2580	rm->rm_original_width = vdrz->vd_original_width;
2581
2582	zio->io_vsd = rm;
2583	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
2584	if (zio->io_type == ZIO_TYPE_WRITE) {
2585		for (int i = 0; i < rm->rm_nrows; i++) {
2586			vdev_raidz_io_start_write(zio, rm->rm_row[i]);
2587		}
2588
2589		if (logical_width == vdrz->vd_physical_width) {
2590			raidz_start_skip_writes(zio);
2591		}
2592	} else {
2593		ASSERT(zio->io_type == ZIO_TYPE_READ);
2594		vdev_raidz_io_start_read(zio, rm);
2595	}
2596
2597	zio_execute(zio);
2598}
2599
2600/*
2601 * Report a checksum error for a child of a RAID-Z device.
2602 */
2603void
2604vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
2605{
2606	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
2607
2608	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) &&
2609	    zio->io_priority != ZIO_PRIORITY_REBUILD) {
2610		zio_bad_cksum_t zbc;
2611		raidz_map_t *rm = zio->io_vsd;
2612
2613		zbc.zbc_has_cksum = 0;
2614		zbc.zbc_injected = rm->rm_ecksuminjected;
2615
2616		mutex_enter(&vd->vdev_stat_lock);
2617		vd->vdev_stat.vs_checksum_errors++;
2618		mutex_exit(&vd->vdev_stat_lock);
2619		(void) zfs_ereport_post_checksum(zio->io_spa, vd,
2620		    &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
2621		    rc->rc_abd, bad_data, &zbc);
2622	}
2623}
2624
2625/*
2626 * We keep track of whether or not there were any injected errors, so that
2627 * any ereports we generate can note it.
2628 */
2629static int
2630raidz_checksum_verify(zio_t *zio)
2631{
2632	zio_bad_cksum_t zbc = {0};
2633	raidz_map_t *rm = zio->io_vsd;
2634
2635	int ret = zio_checksum_error(zio, &zbc);
2636	if (ret != 0 && zbc.zbc_injected != 0)
2637		rm->rm_ecksuminjected = 1;
2638
2639	return (ret);
2640}
2641
2642/*
2643 * Generate the parity from the data columns. If we tried and were able to
2644 * read the parity without error, verify that the generated parity matches the
2645 * data we read. If it doesn't, we fire off a checksum error. Return the
2646 * number of such failures.
2647 */
2648static int
2649raidz_parity_verify(zio_t *zio, raidz_row_t *rr)
2650{
2651	abd_t *orig[VDEV_RAIDZ_MAXPARITY];
2652	int c, ret = 0;
2653	raidz_map_t *rm = zio->io_vsd;
2654	raidz_col_t *rc;
2655
2656	blkptr_t *bp = zio->io_bp;
2657	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
2658	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
2659
2660	if (checksum == ZIO_CHECKSUM_NOPARITY)
2661		return (ret);
2662
2663	for (c = 0; c < rr->rr_firstdatacol; c++) {
2664		rc = &rr->rr_col[c];
2665		if (!rc->rc_tried || rc->rc_error != 0)
2666			continue;
2667
2668		orig[c] = rc->rc_abd;
2669		ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size);
2670		rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
2671	}
2672
2673	/*
2674	 * Verify any empty sectors are zero filled to ensure the parity
2675	 * is calculated correctly even if these non-data sectors are damaged.
2676	 */
2677	if (rr->rr_nempty && rr->rr_abd_empty != NULL)
2678		ret += vdev_draid_map_verify_empty(zio, rr);
2679
2680	/*
	 * This regenerates parity even for columns with !rc_tried ||
	 * rc_error != 0.  That isn't harmful, but it does have the side
	 * effect of repairing damage we didn't know about (i.e. parity may
	 * be corrected even when we return 0).
2684	 */
2685	vdev_raidz_generate_parity_row(rm, rr);
2686
2687	for (c = 0; c < rr->rr_firstdatacol; c++) {
2688		rc = &rr->rr_col[c];
2689
2690		if (!rc->rc_tried || rc->rc_error != 0)
2691			continue;
2692
2693		if (abd_cmp(orig[c], rc->rc_abd) != 0) {
2694			zfs_dbgmsg("found error on col=%u devidx=%u off %llx",
2695			    c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset);
2696			vdev_raidz_checksum_error(zio, rc, orig[c]);
2697			rc->rc_error = SET_ERROR(ECKSUM);
2698			ret++;
2699		}
2700		abd_free(orig[c]);
2701	}
2702
2703	return (ret);
2704}
2705
2706static int
2707vdev_raidz_worst_error(raidz_row_t *rr)
2708{
2709	int error = 0;
2710
2711	for (int c = 0; c < rr->rr_cols; c++) {
2712		error = zio_worst_error(error, rr->rr_col[c].rc_error);
2713		error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error);
2714	}
2715
2716	return (error);
2717}
2718
2719static void
2720vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
2721{
2722	int unexpected_errors = 0;
2723	int parity_errors = 0;
2724	int parity_untried = 0;
2725	int data_errors = 0;
2726
2727	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
2728
2729	for (int c = 0; c < rr->rr_cols; c++) {
2730		raidz_col_t *rc = &rr->rr_col[c];
2731
2732		if (rc->rc_error) {
2733			if (c < rr->rr_firstdatacol)
2734				parity_errors++;
2735			else
2736				data_errors++;
2737
2738			if (!rc->rc_skipped)
2739				unexpected_errors++;
2740		} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
2741			parity_untried++;
2742		}
2743
2744		if (rc->rc_force_repair)
2745			unexpected_errors++;
2746	}
2747
2748	/*
2749	 * If we read more parity disks than were used for
2750	 * reconstruction, confirm that the other parity disks produced
2751	 * correct data.
2752	 *
2753	 * Note that we also regenerate parity when resilvering so we
2754	 * can write it out to failed devices later.
2755	 */
2756	if (parity_errors + parity_untried <
2757	    rr->rr_firstdatacol - data_errors ||
2758	    (zio->io_flags & ZIO_FLAG_RESILVER)) {
2759		int n = raidz_parity_verify(zio, rr);
2760		unexpected_errors += n;
2761	}
2762
2763	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2764	    (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2765		/*
2766		 * Use the good data we have in hand to repair damaged children.
2767		 */
2768		for (int c = 0; c < rr->rr_cols; c++) {
2769			raidz_col_t *rc = &rr->rr_col[c];
2770			vdev_t *vd = zio->io_vd;
2771			vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2772
2773			if (!rc->rc_allow_repair) {
2774				continue;
2775			} else if (!rc->rc_force_repair &&
2776			    (rc->rc_error == 0 || rc->rc_size == 0)) {
2777				continue;
2778			}
2779
2780			zfs_dbgmsg("zio=%px repairing c=%u devidx=%u "
2781			    "offset=%llx",
2782			    zio, c, rc->rc_devidx, (long long)rc->rc_offset);
2783
2784			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2785			    rc->rc_offset, rc->rc_abd, rc->rc_size,
2786			    ZIO_TYPE_WRITE,
2787			    zio->io_priority == ZIO_PRIORITY_REBUILD ?
2788			    ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
2789			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2790			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2791		}
2792	}
2793
2794	/*
2795	 * Scrub or resilver i/o's: overwrite any shadow locations with the
2796	 * good data.  This ensures that if we've already copied this sector,
2797	 * it will be corrected if it was damaged.  This writes more than is
2798	 * necessary, but since expansion is paused during scrub/resilver, at
2799	 * most a single row will have a shadow location.
2800	 */
2801	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2802	    (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) {
2803		for (int c = 0; c < rr->rr_cols; c++) {
2804			raidz_col_t *rc = &rr->rr_col[c];
2805			vdev_t *vd = zio->io_vd;
2806
2807			if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0)
2808				continue;
2809			vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx];
2810
2811			/*
2812			 * Note: We don't want to update the repair stats
2813			 * because that would incorrectly indicate that there
2814			 * was bad data to repair, which we aren't sure about.
2815			 * By clearing the SCAN_THREAD flag, we prevent this
2816			 * from happening, despite having the REPAIR flag set.
2817			 * We need to set SELF_HEAL so that this i/o can't be
2818			 * bypassed by zio_vdev_io_start().
2819			 */
2820			zio_t *cio = zio_vdev_child_io(zio, NULL, cvd,
2821			    rc->rc_shadow_offset, rc->rc_abd, rc->rc_size,
2822			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2823			    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
2824			    NULL, NULL);
2825			cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD;
2826			zio_nowait(cio);
2827		}
2828	}
2829}
2830
2831static void
2832raidz_restore_orig_data(raidz_map_t *rm)
2833{
2834	for (int i = 0; i < rm->rm_nrows; i++) {
2835		raidz_row_t *rr = rm->rm_row[i];
2836		for (int c = 0; c < rr->rr_cols; c++) {
2837			raidz_col_t *rc = &rr->rr_col[c];
2838			if (rc->rc_need_orig_restore) {
2839				abd_copy(rc->rc_abd,
2840				    rc->rc_orig_data, rc->rc_size);
2841				rc->rc_need_orig_restore = B_FALSE;
2842			}
2843		}
2844	}
2845}
2846
2847/*
 * During raidz_reconstruct() for an expanded VDEV, failure simulation needs
 * special consideration.  See the note in raidz_reconstruct() on simulating
 * failure of a pre-expansion device.
2851 *
2852 * Treating logical child i as failed, return TRUE if the given column should
2853 * be treated as failed.  The idea of logical children allows us to imagine
2854 * that a disk silently failed before a RAIDZ expansion (reads from this disk
2855 * succeed but return the wrong data).  Since the expansion doesn't verify
2856 * checksums, the incorrect data will be moved to new locations spread among
2857 * the children (going diagonally across them).
2858 *
2859 * Higher "logical child failures" (values of `i`) indicate these
2860 * "pre-expansion failures".  The first physical_width values imagine that a
2861 * current child failed; the next physical_width-1 values imagine that a
2862 * child failed before the most recent expansion; the next physical_width-2
2863 * values imagine a child failed in the expansion before that, etc.
2864 */
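/*
 * As an illustrative example: with physical_width = 5 and original_width = 4
 * there are 5 + 4 = 9 logical children.  Logical ids 0-4 simulate the
 * failure of one of the current children, while ids 5-8 simulate a child
 * that silently failed before the expansion from 4 to 5 children, whose bad
 * sectors are now spread diagonally across the current children.
 */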
2865static boolean_t
2866raidz_simulate_failure(int physical_width, int original_width, int ashift,
2867    int i, raidz_col_t *rc)
2868{
2869	uint64_t sector_id =
2870	    physical_width * (rc->rc_offset >> ashift) +
2871	    rc->rc_devidx;
2872
2873	for (int w = physical_width; w >= original_width; w--) {
2874		if (i < w) {
2875			return (sector_id % w == i);
2876		} else {
2877			i -= w;
2878		}
2879	}
2880	ASSERT(!"invalid logical child id");
2881	return (B_FALSE);
2882}
2883
2884/*
2885 * returns EINVAL if reconstruction of the block will not be possible
2886 * returns ECKSUM if this specific reconstruction failed
2887 * returns 0 on successful reconstruction
2888 */
2889static int
2890raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
2891{
2892	raidz_map_t *rm = zio->io_vsd;
2893	int physical_width = zio->io_vd->vdev_children;
2894	int original_width = (rm->rm_original_width != 0) ?
2895	    rm->rm_original_width : physical_width;
2896	int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT;
2897
2898	if (dbgmsg) {
2899		zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u "
2900		    "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts);
2901	}
2902
2903	/* Reconstruct each row */
2904	for (int r = 0; r < rm->rm_nrows; r++) {
2905		raidz_row_t *rr = rm->rm_row[r];
2906		int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */
2907		int t = 0;
2908		int dead = 0;
2909		int dead_data = 0;
2910
2911		if (dbgmsg)
2912			zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r);
2913
2914		for (int c = 0; c < rr->rr_cols; c++) {
2915			raidz_col_t *rc = &rr->rr_col[c];
2916			ASSERT0(rc->rc_need_orig_restore);
2917			if (rc->rc_error != 0) {
2918				dead++;
2919				if (c >= nparity)
2920					dead_data++;
2921				continue;
2922			}
2923			if (rc->rc_size == 0)
2924				continue;
2925			for (int lt = 0; lt < ntgts; lt++) {
2926				if (raidz_simulate_failure(physical_width,
2927				    original_width,
2928				    zio->io_vd->vdev_top->vdev_ashift,
2929				    ltgts[lt], rc)) {
2930					if (rc->rc_orig_data == NULL) {
2931						rc->rc_orig_data =
2932						    abd_alloc_linear(
2933						    rc->rc_size, B_TRUE);
2934						abd_copy(rc->rc_orig_data,
2935						    rc->rc_abd, rc->rc_size);
2936					}
2937					rc->rc_need_orig_restore = B_TRUE;
2938
2939					dead++;
2940					if (c >= nparity)
2941						dead_data++;
2942					/*
2943					 * Note: simulating failure of a
2944					 * pre-expansion device can hit more
2945					 * than one column, in which case we
2946					 * might try to simulate more failures
2947					 * than can be reconstructed, which is
2948					 * also more than the size of my_tgts.
2949					 * This check prevents accessing past
2950					 * the end of my_tgts.  The "dead >
2951					 * nparity" check below will fail this
2952					 * reconstruction attempt.
2953					 */
2954					if (t < VDEV_RAIDZ_MAXPARITY) {
2955						my_tgts[t++] = c;
2956						if (dbgmsg) {
2957							zfs_dbgmsg("simulating "
2958							    "failure of col %u "
2959							    "devidx %u", c,
2960							    (int)rc->rc_devidx);
2961						}
2962					}
2963					break;
2964				}
2965			}
2966		}
2967		if (dead > nparity) {
2968			/* reconstruction not possible */
2969			if (dbgmsg) {
2970				zfs_dbgmsg("reconstruction not possible; "
2971				    "too many failures");
2972			}
2973			raidz_restore_orig_data(rm);
2974			return (EINVAL);
2975		}
2976		if (dead_data > 0)
2977			vdev_raidz_reconstruct_row(rm, rr, my_tgts, t);
2978	}
2979
2980	/* Check for success */
2981	if (raidz_checksum_verify(zio) == 0) {
2982
2983		/* Reconstruction succeeded - report errors */
2984		for (int i = 0; i < rm->rm_nrows; i++) {
2985			raidz_row_t *rr = rm->rm_row[i];
2986
2987			for (int c = 0; c < rr->rr_cols; c++) {
2988				raidz_col_t *rc = &rr->rr_col[c];
2989				if (rc->rc_need_orig_restore) {
2990					/*
2991					 * Note: if this is a parity column,
2992					 * we don't really know if it's wrong.
2993					 * We need to let
2994					 * vdev_raidz_io_done_verified() check
2995					 * it, and if we set rc_error, it will
2996					 * think that it is a "known" error
2997					 * that doesn't need to be checked
2998					 * or corrected.
2999					 */
3000					if (rc->rc_error == 0 &&
3001					    c >= rr->rr_firstdatacol) {
3002						vdev_raidz_checksum_error(zio,
3003						    rc, rc->rc_orig_data);
3004						rc->rc_error =
3005						    SET_ERROR(ECKSUM);
3006					}
3007					rc->rc_need_orig_restore = B_FALSE;
3008				}
3009			}
3010
3011			vdev_raidz_io_done_verified(zio, rr);
3012		}
3013
3014		zio_checksum_verified(zio);
3015
3016		if (dbgmsg) {
3017			zfs_dbgmsg("reconstruction successful "
3018			    "(checksum verified)");
3019		}
3020		return (0);
3021	}
3022
3023	/* Reconstruction failed - restore original data */
3024	raidz_restore_orig_data(rm);
3025	if (dbgmsg) {
3026		zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum "
3027		    "failed", zio);
3028	}
3029	return (ECKSUM);
3030}
3031
3032/*
3033 * Iterate over all combinations of N bad vdevs and attempt a reconstruction.
3034 * Note that the algorithm below is non-optimal because it doesn't take into
3035 * account how reconstruction is actually performed. For example, with
3036 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
3037 * is targeted as invalid as if columns 1 and 4 are targeted since in both
3038 * cases we'd only use parity information in column 0.
3039 *
3040 * The order that we find the various possible combinations of failed
3041 * disks is dictated by these rules:
3042 * - Examine each "slot" (the "i" in tgts[i])
3043 *   - Try to increment this slot (tgts[i] += 1)
3044 *   - if we can't increment because it runs into the next slot,
3045 *     reset our slot to the minimum, and examine the next slot
3046 *
3047 *  For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose
3048 *  3 columns to reconstruct), we will generate the following sequence:
3049 *
3050 *  STATE        ACTION
3051 *  0 1 2        special case: skip since these are all parity
3052 *  0 1   3      first slot: reset to 0; middle slot: increment to 2
3053 *  0   2 3      first slot: increment to 1
3054 *    1 2 3      first: reset to 0; middle: reset to 1; last: increment to 4
3055 *  0 1     4    first: reset to 0; middle: increment to 2
3056 *  0   2   4    first: increment to 1
3057 *    1 2   4    first: reset to 0; middle: increment to 3
3058 *  0     3 4    first: increment to 1
3059 *    1   3 4    first: increment to 2
3060 *      2 3 4    first: reset to 0; middle: reset to 1; last: increment to 5
3061 *  0 1       5  first: reset to 0; middle: increment to 2
3062 *  0   2     5  first: increment to 1
3063 *    1 2     5  first: reset to 0; middle: increment to 3
3064 *  0     3   5  first: increment to 1
3065 *    1   3   5  first: increment to 2
3066 *      2 3   5  first: reset to 0; middle: increment to 4
3067 *  0       4 5  first: increment to 1
3068 *    1     4 5  first: increment to 2
3069 *      2   4 5  first: increment to 3
3070 *        3 4 5  done
3071 *
3072 * This strategy works for dRAID but is less efficient when there are a large
3073 * number of child vdevs and therefore permutations to check. Furthermore,
3074 * since the raidz_map_t rows likely do not overlap, reconstruction would be
3075 * possible as long as there are no more than nparity data errors per row.
3076 * These additional permutations are not currently checked but could be as
3077 * a future improvement.
3078 *
3079 * Returns 0 on success, ECKSUM on failure.
3080 */
3081static int
3082vdev_raidz_combrec(zio_t *zio)
3083{
3084	int nparity = vdev_get_nparity(zio->io_vd);
3085	raidz_map_t *rm = zio->io_vsd;
3086	int physical_width = zio->io_vd->vdev_children;
3087	int original_width = (rm->rm_original_width != 0) ?
3088	    rm->rm_original_width : physical_width;
3089
3090	for (int i = 0; i < rm->rm_nrows; i++) {
3091		raidz_row_t *rr = rm->rm_row[i];
3092		int total_errors = 0;
3093
3094		for (int c = 0; c < rr->rr_cols; c++) {
3095			if (rr->rr_col[c].rc_error)
3096				total_errors++;
3097		}
3098
3099		if (total_errors > nparity)
3100			return (vdev_raidz_worst_error(rr));
3101	}
3102
3103	for (int num_failures = 1; num_failures <= nparity; num_failures++) {
3104		int tstore[VDEV_RAIDZ_MAXPARITY + 2];
3105		int *ltgts = &tstore[1]; /* value is logical child ID */
3106
3107
3108		/*
3109		 * Determine number of logical children, n.  See comment
3110		 * above raidz_simulate_failure().
3111		 */
3112		int n = 0;
3113		for (int w = physical_width;
3114		    w >= original_width; w--) {
3115			n += w;
3116		}
3117
3118		ASSERT3U(num_failures, <=, nparity);
3119		ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY);
3120
		/*
		 * Handle corner cases in combrec logic: ltgts[-1] = -1 lets
		 * slot 0 reset to 0 via ltgts[t - 1] + 1, and
		 * ltgts[num_failures] = n is a sentinel above every valid
		 * logical child ID so the "next slot" comparison also
		 * works for the last slot.
		 */
3122		ltgts[-1] = -1;
3123		for (int i = 0; i < num_failures; i++) {
3124			ltgts[i] = i;
3125		}
3126		ltgts[num_failures] = n;
3127
3128		for (;;) {
3129			int err = raidz_reconstruct(zio, ltgts, num_failures,
3130			    nparity);
3131			if (err == EINVAL) {
3132				/*
3133				 * Reconstruction not possible with this #
3134				 * failures; try more failures.
3135				 */
3136				break;
3137			} else if (err == 0)
3138				return (0);
3139
3140			/* Compute next targets to try */
3141			for (int t = 0; ; t++) {
3142				ASSERT3U(t, <, num_failures);
3143				ltgts[t]++;
3144				if (ltgts[t] == n) {
3145					/* try more failures */
3146					ASSERT3U(t, ==, num_failures - 1);
3147					if (zfs_flags &
3148					    ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
3149						zfs_dbgmsg("reconstruction "
3150						    "failed for num_failures="
3151						    "%u; tried all "
3152						    "combinations",
3153						    num_failures);
3154					}
3155					break;
3156				}
3157
3158				ASSERT3U(ltgts[t], <, n);
3159				ASSERT3U(ltgts[t], <=, ltgts[t + 1]);
3160
3161				/*
3162				 * If that spot is available, we're done here.
3163				 * Try the next combination.
3164				 */
3165				if (ltgts[t] != ltgts[t + 1])
3166					break; // found next combination
3167
3168				/*
3169				 * Otherwise, reset this tgt to the minimum,
3170				 * and move on to the next tgt.
3171				 */
3172				ltgts[t] = ltgts[t - 1] + 1;
3173				ASSERT3U(ltgts[t], ==, t);
3174			}
3175
3176			/* Increase the number of failures and keep trying. */
3177			if (ltgts[num_failures - 1] == n)
3178				break;
3179		}
3180	}
3181	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
3182		zfs_dbgmsg("reconstruction failed for all num_failures");
3183	return (ECKSUM);
3184}
3185
3186void
3187vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
3188{
3189	for (uint64_t row = 0; row < rm->rm_nrows; row++) {
3190		raidz_row_t *rr = rm->rm_row[row];
3191		vdev_raidz_reconstruct_row(rm, rr, t, nt);
3192	}
3193}
3194
3195/*
3196 * Complete a write IO operation on a RAIDZ VDev
3197 *
3198 * Outline:
3199 *   1. Check for errors on the child IOs.
3200 *   2. Return, setting an error code if too few child VDevs were written
3201 *      to reconstruct the data later.  Note that partial writes are
3202 *      considered successful if they can be reconstructed at all.
3203 */
3204static void
3205vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
3206{
3207	int normal_errors = 0;
3208	int shadow_errors = 0;
3209
3210	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
3211	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
3212	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
3213
3214	for (int c = 0; c < rr->rr_cols; c++) {
3215		raidz_col_t *rc = &rr->rr_col[c];
3216
3217		if (rc->rc_error != 0) {
3218			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
3219			normal_errors++;
3220		}
3221		if (rc->rc_shadow_error != 0) {
3222			ASSERT(rc->rc_shadow_error != ECKSUM);
3223			shadow_errors++;
3224		}
3225	}
3226
3227	/*
3228	 * Treat partial writes as a success. If we couldn't write enough
3229	 * columns to reconstruct the data, the I/O failed.  Otherwise, good
3230	 * enough.  Note that in the case of a shadow write (during raidz
3231	 * expansion), depending on if we crash, either the normal (old) or
3232	 * shadow (new) location may become the "real" version of the block,
3233	 * so both locations must have sufficient redundancy.
3234	 *
3235	 * Now that we support write reallocation, it would be better
3236	 * to treat partial failure as real failure unless there are
3237	 * no non-degraded top-level vdevs left, and not update DTLs
3238	 * if we intend to reallocate.
3239	 */
3240	if (normal_errors > rr->rr_firstdatacol ||
3241	    shadow_errors > rr->rr_firstdatacol) {
3242		zio->io_error = zio_worst_error(zio->io_error,
3243		    vdev_raidz_worst_error(rr));
3244	}
3245}
3246
3247static void
3248vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
3249    raidz_row_t *rr)
3250{
3251	int parity_errors = 0;
3252	int parity_untried = 0;
3253	int data_errors = 0;
3254	int total_errors = 0;
3255
3256	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
3257	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
3258
3259	for (int c = 0; c < rr->rr_cols; c++) {
3260		raidz_col_t *rc = &rr->rr_col[c];
3261
3262		/*
3263		 * If scrubbing and a replacing/sparing child vdev determined
3264		 * that not all of its children have an identical copy of the
3265		 * data, then clear the error so the column is treated like
3266		 * any other read and force a repair to correct the damage.
3267		 */
3268		if (rc->rc_error == ECKSUM) {
3269			ASSERT(zio->io_flags & ZIO_FLAG_SCRUB);
3270			vdev_raidz_checksum_error(zio, rc, rc->rc_abd);
3271			rc->rc_force_repair = 1;
3272			rc->rc_error = 0;
3273		}
3274
3275		if (rc->rc_error) {
3276			if (c < rr->rr_firstdatacol)
3277				parity_errors++;
3278			else
3279				data_errors++;
3280
3281			total_errors++;
3282		} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
3283			parity_untried++;
3284		}
3285	}
3286
3287	/*
3288	 * If there were data errors and the number of errors we saw was
3289	 * correctable -- less than or equal to the number of parity disks read
3290	 * -- reconstruct based on the missing data.
3291	 */
3292	if (data_errors != 0 &&
3293	    total_errors <= rr->rr_firstdatacol - parity_untried) {
3294		/*
3295		 * We either attempt to read all the parity columns or
3296		 * none of them. If we didn't try to read parity, we
3297		 * wouldn't be here in the correctable case. There must
3298		 * also have been fewer parity errors than parity
3299		 * columns or, again, we wouldn't be in this code path.
3300		 */
3301		ASSERT(parity_untried == 0);
3302		ASSERT(parity_errors < rr->rr_firstdatacol);
3303
3304		/*
3305		 * Identify the data columns that reported an error.
3306		 */
3307		int n = 0;
3308		int tgts[VDEV_RAIDZ_MAXPARITY];
3309		for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
3310			raidz_col_t *rc = &rr->rr_col[c];
3311			if (rc->rc_error != 0) {
3312				ASSERT(n < VDEV_RAIDZ_MAXPARITY);
3313				tgts[n++] = c;
3314			}
3315		}
3316
3317		ASSERT(rr->rr_firstdatacol >= n);
3318
3319		vdev_raidz_reconstruct_row(rm, rr, tgts, n);
3320	}
3321}
3322
3323/*
3324 * Return the number of reads issued.
3325 */
3326static int
3327vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
3328{
3329	vdev_t *vd = zio->io_vd;
3330	int nread = 0;
3331
3332	rr->rr_missingdata = 0;
3333	rr->rr_missingparity = 0;
3334
3335	/*
3336	 * If this row contains empty sectors which are not required
3337	 * for a normal read then allocate an ABD for them now so they
3338	 * may be read, verified, and any needed repairs performed.
3339	 */
3340	if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL)
3341		vdev_draid_map_alloc_empty(zio, rr);
3342
3343	for (int c = 0; c < rr->rr_cols; c++) {
3344		raidz_col_t *rc = &rr->rr_col[c];
3345		if (rc->rc_tried || rc->rc_size == 0)
3346			continue;
3347
3348		zio_nowait(zio_vdev_child_io(zio, NULL,
3349		    vd->vdev_child[rc->rc_devidx],
3350		    rc->rc_offset, rc->rc_abd, rc->rc_size,
3351		    zio->io_type, zio->io_priority, 0,
3352		    vdev_raidz_child_done, rc));
3353		nread++;
3354	}
3355	return (nread);
3356}
3357
3358/*
3359 * We're here because either there were too many errors to even attempt
3360 * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec()
3361 * failed. In either case, there is enough bad data to prevent reconstruction.
3362 * Start checksum ereports for all children which haven't failed.
3363 */
3364static void
3365vdev_raidz_io_done_unrecoverable(zio_t *zio)
3366{
3367	raidz_map_t *rm = zio->io_vsd;
3368
3369	for (int i = 0; i < rm->rm_nrows; i++) {
3370		raidz_row_t *rr = rm->rm_row[i];
3371
3372		for (int c = 0; c < rr->rr_cols; c++) {
3373			raidz_col_t *rc = &rr->rr_col[c];
3374			vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
3375
3376			if (rc->rc_error != 0)
3377				continue;
3378
3379			zio_bad_cksum_t zbc;
3380			zbc.zbc_has_cksum = 0;
3381			zbc.zbc_injected = rm->rm_ecksuminjected;
3382
3383			mutex_enter(&cvd->vdev_stat_lock);
3384			cvd->vdev_stat.vs_checksum_errors++;
3385			mutex_exit(&cvd->vdev_stat_lock);
3386			(void) zfs_ereport_start_checksum(zio->io_spa,
3387			    cvd, &zio->io_bookmark, zio, rc->rc_offset,
3388			    rc->rc_size, &zbc);
3389		}
3390	}
3391}
3392
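/*
 * Complete an IO operation on a RAIDZ vdev.  For writes, check that enough
 * columns were written in each row.  For reads, copy any aggregated data and
 * status into the individual rows, reconstruct any known-missing columns,
 * and verify the checksum.  If verification fails, retry with all columns
 * read and finally fall back to combinatorial reconstruction.
 */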
3393void
3394vdev_raidz_io_done(zio_t *zio)
3395{
3396	raidz_map_t *rm = zio->io_vsd;
3397
3398	ASSERT(zio->io_bp != NULL);
3399	if (zio->io_type == ZIO_TYPE_WRITE) {
3400		for (int i = 0; i < rm->rm_nrows; i++) {
3401			vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
3402		}
3403	} else {
3404		if (rm->rm_phys_col) {
3405			/*
3406			 * This is an aggregated read.  Copy the data and status
3407			 * from the aggregate abd's to the individual rows.
3408			 */
3409			for (int i = 0; i < rm->rm_nrows; i++) {
3410				raidz_row_t *rr = rm->rm_row[i];
3411
3412				for (int c = 0; c < rr->rr_cols; c++) {
3413					raidz_col_t *rc = &rr->rr_col[c];
3414					if (rc->rc_tried || rc->rc_size == 0)
3415						continue;
3416
3417					raidz_col_t *prc =
3418					    &rm->rm_phys_col[rc->rc_devidx];
3419					rc->rc_error = prc->rc_error;
3420					rc->rc_tried = prc->rc_tried;
3421					rc->rc_skipped = prc->rc_skipped;
3422					if (c >= rr->rr_firstdatacol) {
3423						/*
3424						 * Note: this is slightly faster
3425						 * than using abd_copy_off().
3426						 */
3427						char *physbuf = abd_to_buf(
3428						    prc->rc_abd);
3429						void *physloc = physbuf +
3430						    rc->rc_offset -
3431						    prc->rc_offset;
3432
3433						abd_copy_from_buf(rc->rc_abd,
3434						    physloc, rc->rc_size);
3435					}
3436				}
3437			}
3438		}
3439
3440		for (int i = 0; i < rm->rm_nrows; i++) {
3441			raidz_row_t *rr = rm->rm_row[i];
3442			vdev_raidz_io_done_reconstruct_known_missing(zio,
3443			    rm, rr);
3444		}
3445
3446		if (raidz_checksum_verify(zio) == 0) {
3447			for (int i = 0; i < rm->rm_nrows; i++) {
3448				raidz_row_t *rr = rm->rm_row[i];
3449				vdev_raidz_io_done_verified(zio, rr);
3450			}
3451			zio_checksum_verified(zio);
3452		} else {
3453			/*
3454			 * A sequential resilver has no checksum which makes
3455			 * combinatorial reconstruction impossible. This code
3456			 * path is unreachable since raidz_checksum_verify()
3457			 * has no checksum to verify and must succeed.
3458			 */
3459			ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD);
3460
3461			/*
3462			 * This isn't a typical situation -- either we got a
3463			 * read error or a child silently returned bad data.
3464			 * Read every block so we can try again with as much
3465			 * data and parity as we can track down. If we've
3466			 * already been through once before, all children will
3467			 * be marked as tried so we'll proceed to combinatorial
3468			 * reconstruction.
3469			 */
3470			int nread = 0;
3471			for (int i = 0; i < rm->rm_nrows; i++) {
3472				nread += vdev_raidz_read_all(zio,
3473				    rm->rm_row[i]);
3474			}
3475			if (nread != 0) {
3476				/*
3477				 * Normally our stage is VDEV_IO_DONE, but if
3478				 * we've already called redone(), it will have
3479				 * changed to VDEV_IO_START, in which case we
3480				 * don't want to call redone() again.
3481				 */
3482				if (zio->io_stage != ZIO_STAGE_VDEV_IO_START)
3483					zio_vdev_io_redone(zio);
3484				return;
3485			}
3486			/*
3487			 * It would be too expensive to try every possible
3488			 * combination of failed sectors in every row, so
3489			 * instead we try every combination of failed current or
3490			 * past physical disk. This means that if the incorrect
3491			 * sectors were all on Nparity disks at any point in the
3492			 * past, we will find the correct data.  The only known
3493			 * case where this is less durable than a non-expanded
3494			 * RAIDZ, is if we have a silent failure during
3495			 * expansion.  In that case, one block could be
3496			 * partially in the old format and partially in the
3497			 * new format, so we would have lost some sectors from
3498			 * the old format and some from the new format.
3499			 *
3500			 * e.g. logical_width=4 physical_width=6
3501			 * the 15 (6+5+4) possible failed disks are:
3502			 * width=6 child=0
3503			 * width=6 child=1
3504			 * width=6 child=2
3505			 * width=6 child=3
3506			 * width=6 child=4
3507			 * width=6 child=5
3508			 * width=5 child=0
3509			 * width=5 child=1
3510			 * width=5 child=2
3511			 * width=5 child=3
3512			 * width=5 child=4
3513			 * width=4 child=0
3514			 * width=4 child=1
3515			 * width=4 child=2
3516			 * width=4 child=3
3517			 * And we will try every combination of Nparity of these
3518			 * failing.
3519			 *
3520			 * As a first pass, we can generate every combo,
3521			 * and try reconstructing, ignoring any known
3522			 * failures.  If any row has too many known + simulated
3523			 * failures, then we bail on reconstructing with this
3524			 * number of simulated failures.  As an improvement,
3525			 * we could detect the number of whole known failures
3526			 * (i.e. we have known failures on these disks for
3527			 * every row; the disks never succeeded), and
3528			 * subtract that from the max # failures to simulate.
3529			 * We could go even further like the current
3530			 * combrec code, but that doesn't seem like it
3531			 * gains us very much.  If we simulate a failure
3532			 * that is also a known failure, that's fine.
3533			 */
3534			zio->io_error = vdev_raidz_combrec(zio);
3535			if (zio->io_error == ECKSUM &&
3536			    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
3537				vdev_raidz_io_done_unrecoverable(zio);
3538			}
3539		}
3540	}
3541	if (rm->rm_lr != NULL) {
3542		zfs_rangelock_exit(rm->rm_lr);
3543		rm->rm_lr = NULL;
3544	}
3545}
3546
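/*
 * Derive the top-level raidz vdev state from its children: it cannot be
 * opened if more children are faulted than there is parity, it is degraded
 * if any child is faulted or degraded, and healthy otherwise.
 */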
3547static void
3548vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
3549{
3550	vdev_raidz_t *vdrz = vd->vdev_tsd;
3551	if (faulted > vdrz->vd_nparity)
3552		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
3553		    VDEV_AUX_NO_REPLICAS);
3554	else if (degraded + faulted != 0)
3555		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
3556	else
3557		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
3558}
3559
3560/*
3561 * Determine if any portion of the provided block resides on a child vdev
3562 * with a dirty DTL and therefore needs to be resilvered.  The function
3563 * assumes that at least one DTL is dirty which implies that full stripe
3564 * width blocks must be resilvered.
3565 */
3566static boolean_t
3567vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
3568    uint64_t phys_birth)
3569{
3570	vdev_raidz_t *vdrz = vd->vdev_tsd;
3571
3572	/*
3573	 * If we're in the middle of a RAIDZ expansion, this block may be in
3574	 * the old and/or new location.  For simplicity, always resilver it.
3575	 */
3576	if (vdrz->vn_vre.vre_state == DSS_SCANNING)
3577		return (B_TRUE);
3578
3579	uint64_t dcols = vd->vdev_children;
3580	uint64_t nparity = vdrz->vd_nparity;
3581	uint64_t ashift = vd->vdev_top->vdev_ashift;
3582	/* The starting RAIDZ (parent) vdev sector of the block. */
3583	uint64_t b = DVA_GET_OFFSET(dva) >> ashift;
3584	/* The zio's size in units of the vdev's minimum sector size. */
3585	uint64_t s = ((psize - 1) >> ashift) + 1;
3586	/* The first column for this stripe. */
3587	uint64_t f = b % dcols;
3588
3589	/* Unreachable by sequential resilver. */
3590	ASSERT3U(phys_birth, !=, TXG_UNKNOWN);
3591
3592	if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
3593		return (B_FALSE);
3594
3595	if (s + nparity >= dcols)
3596		return (B_TRUE);
3597
3598	for (uint64_t c = 0; c < s + nparity; c++) {
3599		uint64_t devidx = (f + c) % dcols;
3600		vdev_t *cvd = vd->vdev_child[devidx];
3601
3602		/*
3603		 * dsl_scan_need_resilver() already checked vd with
3604		 * vdev_dtl_contains(). So here we just check cvd with
3605		 * vdev_dtl_empty(), which is cheaper and a good approximation.
3606		 */
3607		if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
3608			return (B_TRUE);
3609	}
3610
3611	return (B_FALSE);
3612}
3613
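/*
 * Translate a logical range on the top-level raidz vdev into the physical
 * range it occupies on the child vdev cvd.  During expansion the mapping is
 * in flux, so an empty range is returned instead.
 */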
3614static void
3615vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs,
3616    range_seg64_t *physical_rs, range_seg64_t *remain_rs)
3617{
3618	(void) remain_rs;
3619
3620	vdev_t *raidvd = cvd->vdev_parent;
3621	ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
3622
3623	vdev_raidz_t *vdrz = raidvd->vdev_tsd;
3624
3625	if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
3626		/*
3627		 * We're in the middle of expansion, in which case the
3628		 * translation is in flux.  Any answer we give may be wrong
3629		 * by the time we return, so it isn't safe for the caller to
3630		 * act on it.  Therefore we say that this range isn't present
3631		 * on any children.  The only consumers of this are "zpool
3632		 * initialize" and trimming, both of which are "best effort"
3633		 * anyway.
3634		 */
3635		physical_rs->rs_start = physical_rs->rs_end = 0;
3636		remain_rs->rs_start = remain_rs->rs_end = 0;
3637		return;
3638	}
3639
3640	uint64_t width = vdrz->vd_physical_width;
3641	uint64_t tgt_col = cvd->vdev_id;
3642	uint64_t ashift = raidvd->vdev_top->vdev_ashift;
3643
3644	/* make sure the offsets are block-aligned */
3645	ASSERT0(logical_rs->rs_start % (1 << ashift));
3646	ASSERT0(logical_rs->rs_end % (1 << ashift));
3647	uint64_t b_start = logical_rs->rs_start >> ashift;
3648	uint64_t b_end = logical_rs->rs_end >> ashift;
3649
3650	uint64_t start_row = 0;
3651	if (b_start > tgt_col) /* avoid underflow */
3652		start_row = ((b_start - tgt_col - 1) / width) + 1;
3653
3654	uint64_t end_row = 0;
3655	if (b_end > tgt_col)
3656		end_row = ((b_end - tgt_col - 1) / width) + 1;
3657
3658	physical_rs->rs_start = start_row << ashift;
3659	physical_rs->rs_end = end_row << ashift;
3660
3661	ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start);
3662	ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
3663	    logical_rs->rs_end - logical_rs->rs_start);
3664}
3665
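/*
 * Sync task that commits the reflow progress made in this txg: it records
 * the new reflow offset in the uberblock and the updated bytes-copied count
 * in the vdev's top-level ZAP.
 */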
3666static void
3667raidz_reflow_sync(void *arg, dmu_tx_t *tx)
3668{
3669	spa_t *spa = arg;
3670	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
3671	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
3672
3673	/*
3674	 * Ensure there are no i/os to the range that is being committed.
3675	 */
3676	uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock);
3677	ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset);
3678
3679	mutex_enter(&vre->vre_lock);
3680	uint64_t new_offset =
3681	    MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset);
3682	/*
3683	 * We should not have committed anything that failed.
3684	 */
3685	VERIFY3U(vre->vre_failed_offset, >=, old_offset);
3686	mutex_exit(&vre->vre_lock);
3687
3688	zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
3689	    old_offset, new_offset - old_offset,
3690	    RL_WRITER);
3691
3692	/*
3693	 * Update the uberblock that will be written when this txg completes.
3694	 */
3695	RAIDZ_REFLOW_SET(&spa->spa_uberblock,
3696	    RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset);
3697	vre->vre_offset_pertxg[txgoff] = 0;
3698	zfs_rangelock_exit(lr);
3699
3700	mutex_enter(&vre->vre_lock);
3701	vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff];
3702	vre->vre_bytes_copied_pertxg[txgoff] = 0;
3703	mutex_exit(&vre->vre_lock);
3704
3705	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
3706	VERIFY0(zap_update(spa->spa_meta_objset,
3707	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
3708	    sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx));
3709}
3710
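/*
 * Sync task run when the reflow has copied the entire device: record the new
 * logical width, persist the completed state and end time in the top-level
 * ZAP, clear the in-progress markers, restart any suspended initialize/trim
 * activity, and optionally schedule a scrub.
 */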
3711static void
3712raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx)
3713{
3714	spa_t *spa = arg;
3715	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
3716	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
3717	vdev_raidz_t *vdrz = raidvd->vdev_tsd;
3718
3719	for (int i = 0; i < TXG_SIZE; i++)
3720		VERIFY0(vre->vre_offset_pertxg[i]);
3721
3722	reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
3723	re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES;
3724	re->re_logical_width = vdrz->vd_physical_width;
3725	mutex_enter(&vdrz->vd_expand_lock);
3726	avl_add(&vdrz->vd_expand_txgs, re);
3727	mutex_exit(&vdrz->vd_expand_lock);
3728
3729	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
3730
3731	/*
3732	 * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS
3733	 * will get written (based on vd_expand_txgs).
3734	 */
3735	vdev_config_dirty(vd);
3736
3737	/*
3738	 * Before we change vre_state, the on-disk state must reflect that we
3739	 * have completed all copying, so that vdev_raidz_io_start() can use
3740	 * vre_state to determine if the reflow is in progress.  See also the
3741	 * end of spa_raidz_expand_thread().
3742	 */
3743	VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==,
3744	    raidvd->vdev_ms_count << raidvd->vdev_ms_shift);
3745
3746	vre->vre_end_time = gethrestime_sec();
3747	vre->vre_state = DSS_FINISHED;
3748
3749	uint64_t state = vre->vre_state;
3750	VERIFY0(zap_update(spa->spa_meta_objset,
3751	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
3752	    sizeof (state), 1, &state, tx));
3753
3754	uint64_t end_time = vre->vre_end_time;
3755	VERIFY0(zap_update(spa->spa_meta_objset,
3756	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
3757	    sizeof (end_time), 1, &end_time, tx));
3758
3759	spa->spa_uberblock.ub_raidz_reflow_info = 0;
3760
3761	spa_history_log_internal(spa, "raidz vdev expansion completed",  tx,
3762	    "%s vdev %llu new width %llu", spa_name(spa),
3763	    (unsigned long long)vd->vdev_id,
3764	    (unsigned long long)vd->vdev_children);
3765
3766	spa->spa_raidz_expand = NULL;
3767	raidvd->vdev_rz_expanding = B_FALSE;
3768
3769	spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
3770	spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
3771	spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
3772
3773	spa_notify_waiters(spa);
3774
3775	/*
3776	 * While we're in syncing context take the opportunity to
3777	 * set up a scrub. All the data has been successfully copied
3778	 * but we have not validated any checksums.
3779	 */
3780	pool_scan_func_t func = POOL_SCAN_SCRUB;
3781	if (zfs_scrub_after_expand && dsl_scan_setup_check(&func, tx) == 0)
3782		dsl_scan_setup_sync(&func, tx);
3783}
3784
3785/*
3786 * Struct for one copy zio.
3787 */
3788typedef struct raidz_reflow_arg {
3789	vdev_raidz_expand_t *rra_vre;
3790	zfs_locked_range_t *rra_lr;
3791	uint64_t rra_txg;
3792} raidz_reflow_arg_t;
3793
3794/*
3795 * The write of the new location is done.
3796 */
3797static void
3798raidz_reflow_write_done(zio_t *zio)
3799{
3800	raidz_reflow_arg_t *rra = zio->io_private;
3801	vdev_raidz_expand_t *vre = rra->rra_vre;
3802
3803	abd_free(zio->io_abd);
3804
3805	mutex_enter(&vre->vre_lock);
3806	if (zio->io_error != 0) {
3807		/* Force a reflow pause on errors */
3808		vre->vre_failed_offset =
3809		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
3810	}
3811	ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size);
3812	vre->vre_outstanding_bytes -= zio->io_size;
3813	if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length <
3814	    vre->vre_failed_offset) {
3815		vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] +=
3816		    zio->io_size;
3817	}
3818	cv_signal(&vre->vre_cv);
3819	mutex_exit(&vre->vre_lock);
3820
3821	zfs_rangelock_exit(rra->rra_lr);
3822
3823	kmem_free(rra, sizeof (*rra));
3824	spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
3825}
3826
3827/*
3828 * The read of the old location is done.  The parent zio is the write to
3829 * the new location.  Allow it to start.
3830 */
3831static void
3832raidz_reflow_read_done(zio_t *zio)
3833{
3834	raidz_reflow_arg_t *rra = zio->io_private;
3835	vdev_raidz_expand_t *vre = rra->rra_vre;
3836
3837	/*
3838	 * If the read failed, or if it was done on a vdev that is not fully
3839	 * healthy (e.g. a child that has a resilver in progress), we may not
3840	 * have the correct data.  Note that it's OK if the write proceeds.
3841	 * It may write garbage but the location is otherwise unused and we
3842	 * will retry later due to vre_failed_offset.
3843	 */
3844	if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) {
3845		zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu "
3846		    "err=%u partial_dtl_empty=%u missing_dtl_empty=%u",
3847		    (long long)rra->rra_lr->lr_offset,
3848		    (long long)rra->rra_lr->lr_length,
3849		    (long long)rra->rra_txg,
3850		    zio->io_error,
3851		    vdev_dtl_empty(zio->io_vd, DTL_PARTIAL),
3852		    vdev_dtl_empty(zio->io_vd, DTL_MISSING));
3853		mutex_enter(&vre->vre_lock);
3854		/* Force a reflow pause on errors */
3855		vre->vre_failed_offset =
3856		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
3857		mutex_exit(&vre->vre_lock);
3858	}
3859
3860	zio_nowait(zio_unique_parent(zio));
3861}
3862
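/*
 * Record that the reflow has progressed to the given offset, registering
 * raidz_reflow_sync() the first time progress is recorded in this txg.
 */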
3863static void
3864raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset,
3865    dmu_tx_t *tx)
3866{
3867	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
3868	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
3869
3870	if (offset == 0)
3871		return;
3872
3873	mutex_enter(&vre->vre_lock);
3874	ASSERT3U(vre->vre_offset, <=, offset);
3875	vre->vre_offset = offset;
3876	mutex_exit(&vre->vre_lock);
3877
3878	if (vre->vre_offset_pertxg[txgoff] == 0) {
3879		dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync,
3880		    spa, tx);
3881	}
3882	vre->vre_offset_pertxg[txgoff] = offset;
3883}
3884
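/*
 * Return B_TRUE if any child of the raidz vdev is not a leaf (i.e. a
 * replacing or spare vdev is currently attached).
 */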
3885static boolean_t
3886vdev_raidz_expand_child_replacing(vdev_t *raidz_vd)
3887{
3888	for (int i = 0; i < raidz_vd->vdev_children; i++) {
3889		/* Quick check if a child is being replaced */
3890		if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf)
3891			return (B_TRUE);
3892	}
3893	return (B_FALSE);
3894}
3895
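/*
 * Copy the next allocated sector in the range tree to its new (reflowed)
 * location.  Returns B_TRUE if the caller should wait for the current txg
 * to sync before more copying can proceed (either because writing further
 * would overlap rows whose copy progress has not yet been synced to disk,
 * or because a replacing vdev was detected), and B_FALSE otherwise.
 */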
3896static boolean_t
3897raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt,
3898    dmu_tx_t *tx)
3899{
3900	spa_t *spa = vd->vdev_spa;
3901	int ashift = vd->vdev_top->vdev_ashift;
3902	uint64_t offset, size;
3903
3904	if (!range_tree_find_in(rt, 0, vd->vdev_top->vdev_asize,
3905	    &offset, &size)) {
3906		return (B_FALSE);
3907	}
3908	ASSERT(IS_P2ALIGNED(offset, 1 << ashift));
3909	ASSERT3U(size, >=, 1 << ashift);
3910	uint64_t length = 1 << ashift;
3911	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
3912
3913	uint64_t blkid = offset >> ashift;
3914
3915	int old_children = vd->vdev_children - 1;
3916
3917	/*
3918	 * We can only progress to the point that writes will not overlap
3919	 * with blocks whose progress has not yet been recorded on disk.
3920	 * Since partially-copied rows are still read from the old location,
3921	 * we need to stop one row before the sector-wise overlap, to prevent
3922	 * row-wise overlap.
3923	 *
3924	 * Note that even if we are skipping over a large unallocated region,
3925	 * we can't move the on-disk progress to `offset`, because concurrent
3926	 * writes/allocations could still use the currently-unallocated
3927	 * region.
3928	 */
3929	uint64_t ubsync_blkid =
3930	    RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift;
3931	uint64_t next_overwrite_blkid = ubsync_blkid +
3932	    ubsync_blkid / old_children - old_children;
3933	VERIFY3U(next_overwrite_blkid, >, ubsync_blkid);
3934
3935	if (blkid >= next_overwrite_blkid) {
3936		raidz_reflow_record_progress(vre,
3937		    next_overwrite_blkid << ashift, tx);
3938		return (B_TRUE);
3939	}
3940
3941	range_tree_remove(rt, offset, length);
3942
3943	raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra), KM_SLEEP);
3944	rra->rra_vre = vre;
3945	rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock,
3946	    offset, length, RL_WRITER);
3947	rra->rra_txg = dmu_tx_get_txg(tx);
3948
3949	raidz_reflow_record_progress(vre, offset + length, tx);
3950
3951	mutex_enter(&vre->vre_lock);
3952	vre->vre_outstanding_bytes += length;
3953	mutex_exit(&vre->vre_lock);
3954
3955	/*
3956	 * SCL_STATE will be released when the read and write are done,
3957	 * by raidz_reflow_write_done().
3958	 */
3959	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
3960
3961	/* check if a replacing vdev was added, if so treat it as an error */
3962	if (vdev_raidz_expand_child_replacing(vd)) {
3963		zfs_dbgmsg("replacing vdev encountered, reflow paused at "
3964		    "offset=%llu txg=%llu",
3965		    (long long)rra->rra_lr->lr_offset,
3966		    (long long)rra->rra_txg);
3967
3968		mutex_enter(&vre->vre_lock);
3969		vre->vre_failed_offset =
3970		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
3971		cv_signal(&vre->vre_cv);
3972		mutex_exit(&vre->vre_lock);
3973
3974		/* drop everything we acquired */
3975		zfs_rangelock_exit(rra->rra_lr);
3976		kmem_free(rra, sizeof (*rra));
3977		spa_config_exit(spa, SCL_STATE, spa);
3978		return (B_TRUE);
3979	}
3980
3981	zio_t *pio = spa->spa_txg_zio[txgoff];
3982	abd_t *abd = abd_alloc_for_io(length, B_FALSE);
3983	zio_t *write_zio = zio_vdev_child_io(pio, NULL,
3984	    vd->vdev_child[blkid % vd->vdev_children],
3985	    (blkid / vd->vdev_children) << ashift,
3986	    abd, length,
3987	    ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
3988	    ZIO_FLAG_CANFAIL,
3989	    raidz_reflow_write_done, rra);
3990
3991	zio_nowait(zio_vdev_child_io(write_zio, NULL,
3992	    vd->vdev_child[blkid % old_children],
3993	    (blkid / old_children) << ashift,
3994	    abd, length,
3995	    ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
3996	    ZIO_FLAG_CANFAIL,
3997	    raidz_reflow_read_done, rra));
3998
3999	return (B_FALSE);
4000}
4001
4002/*
4003 * For testing (ztest specific)
4004 */
4005static void
4006raidz_expand_pause(uint_t pause_point)
4007{
4008	while (raidz_expand_pause_point != 0 &&
4009	    raidz_expand_pause_point <= pause_point)
4010		delay(hz);
4011}
4012
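/*
 * Child i/o completion for the scratch-area copies: fold the child's error
 * into the parent zio.
 */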
4013static void
4014raidz_scratch_child_done(zio_t *zio)
4015{
4016	zio_t *pio = zio->io_private;
4017
4018	mutex_enter(&pio->io_lock);
4019	pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
4020	mutex_exit(&pio->io_lock);
4021}
4022
4023/*
4024 * Reflow the beginning portion of the vdev into an intermediate scratch area
4025 * in memory and on disk. This operation must be persisted on disk before we
4026 * proceed to overwrite the beginning portion with the reflowed data.
4027 *
4028 * This multi-step task can fail to complete if disk errors are encountered;
4029 * if so, we return here after a pause (waiting for disks to become healthy).
4030 */
4031static void
4032raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
4033{
4034	vdev_raidz_expand_t *vre = arg;
4035	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
4036	zio_t *pio;
4037	int error;
4038
4039	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
4040	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4041	int ashift = raidvd->vdev_ashift;
4042	uint64_t write_size = P2ALIGN_TYPED(VDEV_BOOT_SIZE, 1 << ashift,
4043	    uint64_t);
4044	uint64_t logical_size = write_size * raidvd->vdev_children;
4045	uint64_t read_size =
4046	    P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)),
4047	    1 << ashift);
4048
4049	/*
4050	 * The scratch space must be large enough to get us to the point
4051	 * that one row does not overlap itself when moved.  This is checked
4052	 * by vdev_raidz_attach_check().
4053	 */
4054	VERIFY3U(write_size, >=, raidvd->vdev_children << ashift);
4055	VERIFY3U(write_size, <=, VDEV_BOOT_SIZE);
4056	VERIFY3U(write_size, <=, read_size);
4057
4058	zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
4059	    0, logical_size, RL_WRITER);
4060
4061	abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
4062	    KM_SLEEP);
4063	for (int i = 0; i < raidvd->vdev_children; i++) {
4064		abds[i] = abd_alloc_linear(read_size, B_FALSE);
4065	}
4066
4067	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1);
4068
4069	/*
4070	 * If we have already written the scratch area then we must read from
4071	 * there, since new writes were redirected there while we were paused
4072	 * or the original location may have been partially overwritten with
4073	 * reflowed data.
4074	 */
4075	if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) {
4076		VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size);
4077		/*
4078		 * Read from scratch space.
4079		 */
4080		pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4081		for (int i = 0; i < raidvd->vdev_children; i++) {
4082			/*
4083			 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE
4084			 * to the offset to calculate the physical offset to
4085			 * write to.  Passing in a negative offset makes us
4086			 * access the scratch area.
4087			 */
4088			zio_nowait(zio_vdev_child_io(pio, NULL,
4089			    raidvd->vdev_child[i],
4090			    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
4091			    write_size, ZIO_TYPE_READ, ZIO_PRIORITY_ASYNC_READ,
4092			    ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
4093		}
4094		error = zio_wait(pio);
4095		if (error != 0) {
4096			zfs_dbgmsg("reflow: error %d reading scratch location",
4097			    error);
4098			goto io_error_exit;
4099		}
4100		goto overwrite;
4101	}
4102
4103	/*
4104	 * Read from original location.
4105	 */
4106	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4107	for (int i = 0; i < raidvd->vdev_children - 1; i++) {
4108		ASSERT0(vdev_is_dead(raidvd->vdev_child[i]));
4109		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4110		    0, abds[i], read_size, ZIO_TYPE_READ,
4111		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
4112		    raidz_scratch_child_done, pio));
4113	}
4114	error = zio_wait(pio);
4115	if (error != 0) {
4116		zfs_dbgmsg("reflow: error %d reading original location", error);
4117io_error_exit:
4118		for (int i = 0; i < raidvd->vdev_children; i++)
4119			abd_free(abds[i]);
4120		kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4121		zfs_rangelock_exit(lr);
4122		spa_config_exit(spa, SCL_STATE, FTAG);
4123		return;
4124	}
4125
4126	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2);
4127
4128	/*
4129	 * Reflow in memory.
4130	 */
4131	uint64_t logical_sectors = logical_size >> ashift;
4132	for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) {
4133		int oldchild = i % (raidvd->vdev_children - 1);
4134		uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift;
4135
4136		int newchild = i % raidvd->vdev_children;
4137		uint64_t newoff = (i / raidvd->vdev_children) << ashift;
4138
4139		/* a single sector should not be copying over itself */
4140		ASSERT(!(newchild == oldchild && newoff == oldoff));
4141
4142		abd_copy_off(abds[newchild], abds[oldchild],
4143		    newoff, oldoff, 1 << ashift);
4144	}
4145
4146	/*
4147	 * Verify that we filled in everything we intended to (write_size on
4148	 * each child).
4149	 */
4150	VERIFY0(logical_sectors % raidvd->vdev_children);
4151	VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==,
4152	    write_size);
4153
4154	/*
4155	 * Write to scratch location (boot area).
4156	 */
4157	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4158	for (int i = 0; i < raidvd->vdev_children; i++) {
4159		/*
4160		 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
4161		 * the offset to calculate the physical offset to write to.
4162		 * Passing in a negative offset lets us access the boot area.
4163		 */
4164		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4165		    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
4166		    write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
4167		    ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
4168	}
4169	error = zio_wait(pio);
4170	if (error != 0) {
4171		zfs_dbgmsg("reflow: error %d writing scratch location", error);
4172		goto io_error_exit;
4173	}
4174	pio = zio_root(spa, NULL, NULL, 0);
4175	zio_flush(pio, raidvd);
4176	zio_wait(pio);
4177
4178	zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area",
4179	    (long long)logical_size);
4180
4181	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3);
4182
4183	/*
4184	 * Update uberblock to indicate that scratch space is valid.  This is
4185	 * needed because after this point, the real location may be
4186	 * overwritten.  If we crash, we need to get the data from the
4187	 * scratch space, rather than the real location.
4188	 *
4189	 * Note: ub_timestamp is bumped so that vdev_uberblock_compare()
4190	 * will prefer this uberblock.
4191	 */
4192	RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size);
4193	spa->spa_ubsync.ub_timestamp++;
4194	ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4195	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4196	if (spa_multihost(spa))
4197		mmp_update_uberblock(spa, &spa->spa_ubsync);
4198
4199	zfs_dbgmsg("reflow: uberblock updated "
4200	    "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)",
4201	    (long long)spa->spa_ubsync.ub_txg,
4202	    (long long)logical_size,
4203	    (long long)spa->spa_ubsync.ub_timestamp);
4204
4205	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID);
4206
4207	/*
4208	 * Overwrite with reflow'ed data.
4209	 */
4210overwrite:
4211	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4212	for (int i = 0; i < raidvd->vdev_children; i++) {
4213		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4214		    0, abds[i], write_size, ZIO_TYPE_WRITE,
4215		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL,
4216		    raidz_scratch_child_done, pio));
4217	}
4218	error = zio_wait(pio);
4219	if (error != 0) {
4220		/*
4221		 * When we exit early here and drop the range lock, new
4222		 * writes will go into the scratch area so we'll need to
4223		 * read from there when we return after pausing.
4224		 */
4225		zfs_dbgmsg("reflow: error %d writing real location", error);
4226		/*
4227		 * Update the uberblock that is written when this txg completes.
4228		 */
4229		RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID,
4230		    logical_size);
4231		goto io_error_exit;
4232	}
4233	pio = zio_root(spa, NULL, NULL, 0);
4234	zio_flush(pio, raidvd);
4235	zio_wait(pio);
4236
4237	zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location",
4238	    (long long)logical_size);
4239	for (int i = 0; i < raidvd->vdev_children; i++)
4240		abd_free(abds[i]);
4241	kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4242
4243	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED);
4244
4245	/*
4246	 * Update uberblock to indicate that the initial part has been
4247	 * reflow'ed.  This is needed because after this point (when we exit
4248	 * the rangelock), we allow regular writes to this region, which will
4249	 * be written to the new location only (because reflow_offset_next ==
4250	 * reflow_offset_synced).  If we crashed and re-copied from the
4251	 * scratch space, we would lose the regular writes.
4252	 */
4253	RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED,
4254	    logical_size);
4255	spa->spa_ubsync.ub_timestamp++;
4256	ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4257	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4258	if (spa_multihost(spa))
4259		mmp_update_uberblock(spa, &spa->spa_ubsync);
4260
4261	zfs_dbgmsg("reflow: uberblock updated "
4262	    "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
4263	    (long long)spa->spa_ubsync.ub_txg,
4264	    (long long)logical_size,
4265	    (long long)spa->spa_ubsync.ub_timestamp);
4266
4267	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1);
4268
4269	/*
4270	 * Update progress.
4271	 */
4272	vre->vre_offset = logical_size;
4273	zfs_rangelock_exit(lr);
4274	spa_config_exit(spa, SCL_STATE, FTAG);
4275
4276	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4277	vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
4278	vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
4279	/*
4280	 * Note - raidz_reflow_sync() will update the uberblock state to
4281	 * RRSS_SCRATCH_INVALID_SYNCED_REFLOW
4282	 */
4283	raidz_reflow_sync(spa, tx);
4284
4285	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2);
4286}
4287
4288/*
4289 * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work
4290 * here.  No other i/o can be in progress, so we don't need the vre_rangelock.
4291 */
4292void
4293vdev_raidz_reflow_copy_scratch(spa_t *spa)
4294{
4295	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4296	uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock);
4297	ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID);
4298
4299	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
4300	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4301	ASSERT0(logical_size % raidvd->vdev_children);
4302	uint64_t write_size = logical_size / raidvd->vdev_children;
4303
4304	zio_t *pio;
4305
4306	/*
4307	 * Read from scratch space.
4308	 */
4309	abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
4310	    KM_SLEEP);
4311	for (int i = 0; i < raidvd->vdev_children; i++) {
4312		abds[i] = abd_alloc_linear(write_size, B_FALSE);
4313	}
4314
4315	pio = zio_root(spa, NULL, NULL, 0);
4316	for (int i = 0; i < raidvd->vdev_children; i++) {
4317		/*
4318		 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
4319		 * the offset to calculate the physical offset to write to.
4320		 * Passing in a negative offset lets us access the boot area.
4321		 */
4322		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4323		    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
4324		    write_size, ZIO_TYPE_READ,
4325		    ZIO_PRIORITY_ASYNC_READ, 0,
4326		    raidz_scratch_child_done, pio));
4327	}
4328	zio_wait(pio);
4329
4330	/*
4331	 * Overwrite real location with reflow'ed data.
4332	 */
4333	pio = zio_root(spa, NULL, NULL, 0);
4334	for (int i = 0; i < raidvd->vdev_children; i++) {
4335		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4336		    0, abds[i], write_size, ZIO_TYPE_WRITE,
4337		    ZIO_PRIORITY_ASYNC_WRITE, 0,
4338		    raidz_scratch_child_done, pio));
4339	}
4340	zio_wait(pio);
4341	pio = zio_root(spa, NULL, NULL, 0);
4342	zio_flush(pio, raidvd);
4343	zio_wait(pio);
4344
4345	zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) "
4346	    "to real location", (long long)logical_size);
4347
4348	for (int i = 0; i < raidvd->vdev_children; i++)
4349		abd_free(abds[i]);
4350	kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4351
4352	/*
4353	 * Update uberblock.
4354	 */
4355	RAIDZ_REFLOW_SET(&spa->spa_ubsync,
4356	    RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size);
4357	spa->spa_ubsync.ub_timestamp++;
4358	VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4359	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4360	if (spa_multihost(spa))
4361		mmp_update_uberblock(spa, &spa->spa_ubsync);
4362
4363	zfs_dbgmsg("reflow recovery: uberblock updated "
4364	    "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
4365	    (long long)spa->spa_ubsync.ub_txg,
4366	    (long long)logical_size,
4367	    (long long)spa->spa_ubsync.ub_timestamp);
4368
4369	dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool,
4370	    spa_first_txg(spa));
4371	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4372	vre->vre_offset = logical_size;
4373	vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
4374	vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
4375	/*
4376	 * Note that raidz_reflow_sync() will update the uberblock once more
4377	 */
4378	raidz_reflow_sync(spa, tx);
4379
4380	dmu_tx_commit(tx);
4381
4382	spa_config_exit(spa, SCL_STATE, FTAG);
4383}
4384
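/*
 * zthr check function: there is work for the expansion thread when an
 * expansion exists and it is not paused waiting for a resilver.
 */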
4385static boolean_t
4386spa_raidz_expand_thread_check(void *arg, zthr_t *zthr)
4387{
4388	(void) zthr;
4389	spa_t *spa = arg;
4390
4391	return (spa->spa_raidz_expand != NULL &&
4392	    !spa->spa_raidz_expand->vre_waiting_for_resilver);
4393}
4394
4395/*
4396 * RAIDZ expansion background thread
4397 *
4398 * Can be called multiple times if the reflow is paused
4399 */
4400static void
4401spa_raidz_expand_thread(void *arg, zthr_t *zthr)
4402{
4403	spa_t *spa = arg;
4404	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4405
4406	if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID)
4407		vre->vre_offset = 0;
4408	else
4409		vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync);
4410
4411	/* Reflow the beginning portion using the scratch area */
4412	if (vre->vre_offset == 0) {
4413		VERIFY0(dsl_sync_task(spa_name(spa),
4414		    NULL, raidz_reflow_scratch_sync,
4415		    vre, 0, ZFS_SPACE_CHECK_NONE));
4416
4417		/* if we encountered errors then pause */
4418		if (vre->vre_offset == 0) {
4419			mutex_enter(&vre->vre_lock);
4420			vre->vre_waiting_for_resilver = B_TRUE;
4421			mutex_exit(&vre->vre_lock);
4422			return;
4423		}
4424	}
4425
4426	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4427	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4428
4429	uint64_t guid = raidvd->vdev_guid;
4430
4431	/* Iterate over all the remaining metaslabs */
4432	for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift;
4433	    i < raidvd->vdev_ms_count &&
4434	    !zthr_iscancelled(zthr) &&
4435	    vre->vre_failed_offset == UINT64_MAX; i++) {
4436		metaslab_t *msp = raidvd->vdev_ms[i];
4437
4438		metaslab_disable(msp);
4439		mutex_enter(&msp->ms_lock);
4440
4441		/*
4442		 * The metaslab may be newly created (for the expanded
4443		 * space), in which case its trees won't exist yet,
4444		 * so we need to bail out early.
4445		 */
4446		if (msp->ms_new) {
4447			mutex_exit(&msp->ms_lock);
4448			metaslab_enable(msp, B_FALSE, B_FALSE);
4449			continue;
4450		}
4451
4452		VERIFY0(metaslab_load(msp));
4453
4454		/*
4455		 * We want to copy everything except the free (allocatable)
4456		 * space.  Note that there may be a little bit more free
4457		 * space (e.g. in ms_defer), and it's fine to copy that too.
4458		 */
4459		range_tree_t *rt = range_tree_create(NULL, RANGE_SEG64,
4460		    NULL, 0, 0);
4461		range_tree_add(rt, msp->ms_start, msp->ms_size);
4462		range_tree_walk(msp->ms_allocatable, range_tree_remove, rt);
4463		mutex_exit(&msp->ms_lock);
4464
4465		/*
4466		 * Force the last sector of each metaslab to be copied.  This
4467		 * ensures that we advance the on-disk progress to the end of
4468		 * this metaslab while the metaslab is disabled.  Otherwise, we
4469		 * could move past this metaslab without advancing the on-disk
4470		 * progress, and then an allocation to this metaslab would not
4471		 * be copied.
4472		 */
4473		int sectorsz = 1 << raidvd->vdev_ashift;
4474		uint64_t ms_last_offset = msp->ms_start +
4475		    msp->ms_size - sectorsz;
4476		if (!range_tree_contains(rt, ms_last_offset, sectorsz)) {
4477			range_tree_add(rt, ms_last_offset, sectorsz);
4478		}
4479
4480		/*
4481		 * When we are resuming from a paused expansion (i.e.
4482		 * when importing a pool with an expansion in progress),
4483		 * discard any state that we have already processed.
4484		 */
4485		range_tree_clear(rt, 0, vre->vre_offset);
4486
4487		while (!zthr_iscancelled(zthr) &&
4488		    !range_tree_is_empty(rt) &&
4489		    vre->vre_failed_offset == UINT64_MAX) {
4490
4491			/*
4492			 * We need to periodically drop the config lock so that
4493			 * writers can get in.  Additionally, we can't wait
4494			 * for a txg to sync while holding a config lock
4495			 * (since a waiting writer could cause a 3-way deadlock
4496			 * with the sync thread, which also gets a config
4497			 * lock for reader).  So we can't hold the config lock
4498			 * while calling dmu_tx_assign().
4499			 */
4500			spa_config_exit(spa, SCL_CONFIG, FTAG);
4501
4502			/*
4503			 * If requested, pause the reflow when the amount
4504			 * specified by raidz_expand_max_reflow_bytes is reached.
4505			 *
4506			 * This pause is only used during testing or debugging.
4507			 */
4508			while (raidz_expand_max_reflow_bytes != 0 &&
4509			    raidz_expand_max_reflow_bytes <=
4510			    vre->vre_bytes_copied && !zthr_iscancelled(zthr)) {
4511				delay(hz);
4512			}
4513
4514			mutex_enter(&vre->vre_lock);
4515			while (vre->vre_outstanding_bytes >
4516			    raidz_expand_max_copy_bytes) {
4517				cv_wait(&vre->vre_cv, &vre->vre_lock);
4518			}
4519			mutex_exit(&vre->vre_lock);
4520
4521			dmu_tx_t *tx =
4522			    dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
4523
4524			VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
4525			uint64_t txg = dmu_tx_get_txg(tx);
4526
4527			/*
4528			 * Reacquire the vdev_config lock.  Theoretically, the
4529			 * vdev_t that we're expanding may have changed.
4530			 */
4531			spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4532			raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4533
4534			boolean_t needsync =
4535			    raidz_reflow_impl(raidvd, vre, rt, tx);
4536
4537			dmu_tx_commit(tx);
4538
4539			if (needsync) {
4540				spa_config_exit(spa, SCL_CONFIG, FTAG);
4541				txg_wait_synced(spa->spa_dsl_pool, txg);
4542				spa_config_enter(spa, SCL_CONFIG, FTAG,
4543				    RW_READER);
4544			}
4545		}
4546
4547		spa_config_exit(spa, SCL_CONFIG, FTAG);
4548
4549		metaslab_enable(msp, B_FALSE, B_FALSE);
4550		range_tree_vacate(rt, NULL, NULL);
4551		range_tree_destroy(rt);
4552
4553		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4554		raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4555	}
4556
4557	spa_config_exit(spa, SCL_CONFIG, FTAG);
4558
4559	/*
4560	 * The txg_wait_synced() here ensures that all reflow zio's have
4561	 * completed, and vre_failed_offset has been set if necessary.  It
4562	 * also ensures that the progress of the last raidz_reflow_sync() is
4563	 * written to disk before raidz_reflow_complete_sync() changes the
4564	 * in-memory vre_state.  vdev_raidz_io_start() uses vre_state to
4565	 * determine if a reflow is in progress, in which case we may need to
4566	 * write to both old and new locations.  Therefore we can only change
4567	 * vre_state once this is not necessary, which is once the on-disk
4568	 * progress (in spa_ubsync) has been set past any possible writes (to
4569	 * the end of the last metaslab).
4570	 */
4571	txg_wait_synced(spa->spa_dsl_pool, 0);
4572
4573	if (!zthr_iscancelled(zthr) &&
4574	    vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) {
4575		/*
4576		 * We are not being canceled or paused, so the reflow must be
4577		 * complete. In that case also mark it as completed on disk.
4578		 */
4579		ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX);
4580		VERIFY0(dsl_sync_task(spa_name(spa), NULL,
4581		    raidz_reflow_complete_sync, spa,
4582		    0, ZFS_SPACE_CHECK_NONE));
4583		(void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL);
4584	} else {
4585		/*
4586		 * Wait for all copy zio's to complete and for all the
4587		 * raidz_reflow_sync() synctasks to be run.
4588		 */
4589		spa_history_log_internal(spa, "reflow pause",
4590		    NULL, "offset=%llu failed_offset=%lld",
4591		    (long long)vre->vre_offset,
4592		    (long long)vre->vre_failed_offset);
4593		mutex_enter(&vre->vre_lock);
4594		if (vre->vre_failed_offset != UINT64_MAX) {
4595			/*
4596			 * Reset progress so that we will retry everything
4597			 * after the point that something failed.
4598			 */
4599			vre->vre_offset = vre->vre_failed_offset;
4600			vre->vre_failed_offset = UINT64_MAX;
4601			vre->vre_waiting_for_resilver = B_TRUE;
4602		}
4603		mutex_exit(&vre->vre_lock);
4604	}
4605}
4606
4607void
4608spa_start_raidz_expansion_thread(spa_t *spa)
4609{
4610	ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL);
4611	spa->spa_raidz_expand_zthr = zthr_create("raidz_expand",
4612	    spa_raidz_expand_thread_check, spa_raidz_expand_thread,
4613	    spa, defclsyspri);
4614}
4615
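/*
 * Called whenever a vdev's DTLs are reassessed.  If a paused expansion was
 * waiting for a resilver of this raidz vdev to complete, wake it up.
 */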
4616void
4617raidz_dtl_reassessed(vdev_t *vd)
4618{
4619	spa_t *spa = vd->vdev_spa;
4620	if (spa->spa_raidz_expand != NULL) {
4621		vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4622		/*
4623		 * We get called often from vdev_dtl_reassess(), so make
4624		 * sure it's our vdev and that any replacing is complete.
4625		 */
4626		if (vd->vdev_top->vdev_id == vre->vre_vdev_id &&
4627		    !vdev_raidz_expand_child_replacing(vd->vdev_top)) {
4628			mutex_enter(&vre->vre_lock);
4629			if (vre->vre_waiting_for_resilver) {
4630				vdev_dbgmsg(vd, "DTL reassessed, "
4631				    "continuing raidz expansion");
4632				vre->vre_waiting_for_resilver = B_FALSE;
4633				zthr_wakeup(spa->spa_raidz_expand_zthr);
4634			}
4635			mutex_exit(&vre->vre_lock);
4636		}
4637	}
4638}
4639
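/*
 * Check that the expanded vdev's first row will still fit in the boot-area
 * scratch space; if not, the attach (expansion) is not allowed.
 */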
4640int
4641vdev_raidz_attach_check(vdev_t *new_child)
4642{
4643	vdev_t *raidvd = new_child->vdev_parent;
4644	uint64_t new_children = raidvd->vdev_children;
4645
4646	/*
4647	 * We use the "boot" space as scratch space to handle overwriting the
4648	 * initial part of the vdev.  If it is too small, then this expansion
4649	 * is not allowed.  This would be very unusual (e.g. ashift > 13 and
4650	 * >200 children).
4651	 */
4652	if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) {
4653		return (EINVAL);
4654	}
4655	return (0);
4656}
4657
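/*
 * Sync task run when a new child has been attached to a raidz vdev: bump the
 * physical width, persist the expansion start state in the top-level ZAP,
 * and wake the expansion thread to begin the reflow.
 */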
4658void
4659vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx)
4660{
4661	vdev_t *new_child = arg;
4662	spa_t *spa = new_child->vdev_spa;
4663	vdev_t *raidvd = new_child->vdev_parent;
4664	vdev_raidz_t *vdrz = raidvd->vdev_tsd;
4665	ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops);
4666	ASSERT3P(raidvd->vdev_top, ==, raidvd);
4667	ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width);
4668	ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1);
4669	ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==,
4670	    new_child);
4671
4672	spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx);
4673
4674	vdrz->vd_physical_width++;
4675
4676	VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info);
4677	vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id;
4678	vdrz->vn_vre.vre_offset = 0;
4679	vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
4680	spa->spa_raidz_expand = &vdrz->vn_vre;
4681	zthr_wakeup(spa->spa_raidz_expand_zthr);
4682
4683	/*
4684	 * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get
4685	 * written to the config.
4686	 */
4687	vdev_config_dirty(raidvd);
4688
4689	vdrz->vn_vre.vre_start_time = gethrestime_sec();
4690	vdrz->vn_vre.vre_end_time = 0;
4691	vdrz->vn_vre.vre_state = DSS_SCANNING;
4692	vdrz->vn_vre.vre_bytes_copied = 0;
4693
4694	uint64_t state = vdrz->vn_vre.vre_state;
4695	VERIFY0(zap_update(spa->spa_meta_objset,
4696	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
4697	    sizeof (state), 1, &state, tx));
4698
4699	uint64_t start_time = vdrz->vn_vre.vre_start_time;
4700	VERIFY0(zap_update(spa->spa_meta_objset,
4701	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
4702	    sizeof (start_time), 1, &start_time, tx));
4703
4704	(void) zap_remove(spa->spa_meta_objset,
4705	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx);
4706	(void) zap_remove(spa->spa_meta_objset,
4707	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx);
4708
4709	spa_history_log_internal(spa, "raidz vdev expansion started",  tx,
4710	    "%s vdev %llu new width %llu", spa_name(spa),
4711	    (unsigned long long)raidvd->vdev_id,
4712	    (unsigned long long)raidvd->vdev_children);
4713}
4714
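/*
 * Load any persisted expansion state (scan state, start/end times, and bytes
 * copied) from the vdev's top-level ZAP.
 */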
4715int
4716vdev_raidz_load(vdev_t *vd)
4717{
4718	vdev_raidz_t *vdrz = vd->vdev_tsd;
4719	int err;
4720
4721	uint64_t state = DSS_NONE;
4722	uint64_t start_time = 0;
4723	uint64_t end_time = 0;
4724	uint64_t bytes_copied = 0;
4725
4726	if (vd->vdev_top_zap != 0) {
4727		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4728		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
4729		    sizeof (state), 1, &state);
4730		if (err != 0 && err != ENOENT)
4731			return (err);
4732
4733		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4734		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
4735		    sizeof (start_time), 1, &start_time);
4736		if (err != 0 && err != ENOENT)
4737			return (err);
4738
4739		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4740		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
4741		    sizeof (end_time), 1, &end_time);
4742		if (err != 0 && err != ENOENT)
4743			return (err);
4744
4745		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4746		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
4747		    sizeof (bytes_copied), 1, &bytes_copied);
4748		if (err != 0 && err != ENOENT)
4749			return (err);
4750	}
4751
4752	/*
4753	 * If we are in the middle of expansion, vre_state should have
4754	 * already been set by vdev_raidz_init().
4755	 */
4756	EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING);
4757	vdrz->vn_vre.vre_state = (dsl_scan_state_t)state;
4758	vdrz->vn_vre.vre_start_time = start_time;
4759	vdrz->vn_vre.vre_end_time = end_time;
4760	vdrz->vn_vre.vre_bytes_copied = bytes_copied;
4761
4762	return (0);
4763}
4764
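/*
 * Report expansion progress.  If no expansion is in progress, report the
 * most recently completed expansion of any top-level raidz vdev.
 */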
4765int
4766spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres)
4767{
4768	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4769
4770	if (vre == NULL) {
4771		/* no expansion in progress; find most recent completed */
4772		for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
4773			vdev_t *vd = spa->spa_root_vdev->vdev_child[c];
4774			if (vd->vdev_ops == &vdev_raidz_ops) {
4775				vdev_raidz_t *vdrz = vd->vdev_tsd;
4776
4777				if (vdrz->vn_vre.vre_end_time != 0 &&
4778				    (vre == NULL ||
4779				    vdrz->vn_vre.vre_end_time >
4780				    vre->vre_end_time)) {
4781					vre = &vdrz->vn_vre;
4782				}
4783			}
4784		}
4785	}
4786
4787	if (vre == NULL) {
4788		return (SET_ERROR(ENOENT));
4789	}
4790
4791	pres->pres_state = vre->vre_state;
4792	pres->pres_expanding_vdev = vre->vre_vdev_id;
4793
4794	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
4795	pres->pres_to_reflow = vd->vdev_stat.vs_alloc;
4796
4797	mutex_enter(&vre->vre_lock);
4798	pres->pres_reflowed = vre->vre_bytes_copied;
4799	for (int i = 0; i < TXG_SIZE; i++)
4800		pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i];
4801	mutex_exit(&vre->vre_lock);
4802
4803	pres->pres_start_time = vre->vre_start_time;
4804	pres->pres_end_time = vre->vre_end_time;
4805	pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver;
4806
4807	return (0);
4808}
4809
4810/*
4811 * Initialize private RAIDZ specific fields from the nvlist.
4812 */
4813static int
4814vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
4815{
4816	uint_t children;
4817	nvlist_t **child;
4818	int error = nvlist_lookup_nvlist_array(nv,
4819	    ZPOOL_CONFIG_CHILDREN, &child, &children);
4820	if (error != 0)
4821		return (SET_ERROR(EINVAL));
4822
4823	uint64_t nparity;
4824	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) {
4825		if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
4826			return (SET_ERROR(EINVAL));
4827
4828		/*
4829		 * Previous versions could only support 1 or 2 parity
4830		 * devices.
4831		 */
4832		if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2)
4833			return (SET_ERROR(EINVAL));
4834		else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3)
4835			return (SET_ERROR(EINVAL));
4836	} else {
4837		/*
4838		 * We require the parity to be specified for SPAs that
4839		 * support multiple parity levels.
4840		 */
4841		if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
4842			return (SET_ERROR(EINVAL));
4843
4844		/*
4845		 * Otherwise, we default to 1 parity device for RAID-Z.
4846		 */
4847		nparity = 1;
4848	}
4849
4850	vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
4851	vdrz->vn_vre.vre_vdev_id = -1;
4852	vdrz->vn_vre.vre_offset = UINT64_MAX;
4853	vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
4854	mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL);
4855	cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL);
4856	zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL);
4857	mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL);
4858	avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare,
4859	    sizeof (reflow_node_t), offsetof(reflow_node_t, re_link));
4860
4861	vdrz->vd_physical_width = children;
4862	vdrz->vd_nparity = nparity;
4863
4864	/* note, the ID does not exist when creating a pool */
4865	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
4866	    &vdrz->vn_vre.vre_vdev_id);
4867
4868	boolean_t reflow_in_progress =
4869	    nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
4870	if (reflow_in_progress) {
4871		spa->spa_raidz_expand = &vdrz->vn_vre;
4872		vdrz->vn_vre.vre_state = DSS_SCANNING;
4873	}
4874
4875	vdrz->vd_original_width = children;
4876	uint64_t *txgs;
4877	unsigned int txgs_size = 0;
4878	error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
4879	    &txgs, &txgs_size);
4880	if (error == 0) {
4881		for (int i = 0; i < txgs_size; i++) {
4882			reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
4883			re->re_txg = txgs[txgs_size - i - 1];
4884			re->re_logical_width = vdrz->vd_physical_width - i;
4885
4886			if (reflow_in_progress)
4887				re->re_logical_width--;
4888
4889			avl_add(&vdrz->vd_expand_txgs, re);
4890		}
4891
4892		vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size;
4893	}
4894	if (reflow_in_progress) {
4895		vdrz->vd_original_width--;
4896		zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions",
4897		    children, txgs_size);
4898	}
4899
4900	*tsd = vdrz;
4901
4902	return (0);
4903}

static void
vdev_raidz_fini(vdev_t *vd)
{
	vdev_raidz_t *vdrz = vd->vdev_tsd;
	if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre)
		vd->vdev_spa->spa_raidz_expand = NULL;
	reflow_node_t *re;
	void *cookie = NULL;
	avl_tree_t *tree = &vdrz->vd_expand_txgs;
	while ((re = avl_destroy_nodes(tree, &cookie)) != NULL)
		kmem_free(re, sizeof (*re));
	avl_destroy(&vdrz->vd_expand_txgs);
	mutex_destroy(&vdrz->vd_expand_lock);
	mutex_destroy(&vdrz->vn_vre.vre_lock);
	cv_destroy(&vdrz->vn_vre.vre_cv);
	zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock);
	kmem_free(vdrz, sizeof (*vdrz));
}

/*
 * Add RAIDZ specific fields to the config nvlist.
 */
static void
vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
{
	ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops);
	vdev_raidz_t *vdrz = vd->vdev_tsd;

	/*
	 * Make sure someone hasn't managed to sneak a fancy new vdev
	 * into a crufty old storage pool.
	 */
	ASSERT(vdrz->vd_nparity == 1 ||
	    (vdrz->vd_nparity <= 2 &&
	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) ||
	    (vdrz->vd_nparity <= 3 &&
	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3));

	/*
	 * Note that we'll add these even on storage pools where they
	 * aren't strictly required -- older software will just ignore
	 * them.
	 */
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity);

	if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
		fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
	}

	mutex_enter(&vdrz->vd_expand_lock);
	if (!avl_is_empty(&vdrz->vd_expand_txgs)) {
		uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs);
		uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count,
		    KM_SLEEP);
		uint64_t i = 0;

		for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs);
		    re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) {
			txgs[i++] = re->re_txg;
		}

		fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
		    txgs, count);

		kmem_free(txgs, sizeof (uint64_t) * count);
	}
	mutex_exit(&vdrz->vd_expand_lock);
}
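
/*
 * Illustrative output (hypothetical values): for a raidz2 vdev with two
 * completed expansions and a third in progress, the function above adds
 * roughly the following to the vdev's config nvlist:
 *
 *	ZPOOL_CONFIG_NPARITY		2
 *	ZPOOL_CONFIG_RAIDZ_EXPANDING	(flag present while reflowing)
 *	ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS	[ 1000, 2000 ]  (ascending txgs)
 *
 * These sit alongside the common vdev fields filled in by the generic
 * config generation code.
 */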

static uint64_t
vdev_raidz_nparity(vdev_t *vd)
{
	vdev_raidz_t *vdrz = vd->vdev_tsd;
	return (vdrz->vd_nparity);
}

static uint64_t
vdev_raidz_ndisks(vdev_t *vd)
{
	return (vd->vdev_children);
}

vdev_ops_t vdev_raidz_ops = {
	.vdev_op_init = vdev_raidz_init,
	.vdev_op_fini = vdev_raidz_fini,
	.vdev_op_open = vdev_raidz_open,
	.vdev_op_close = vdev_raidz_close,
	.vdev_op_asize = vdev_raidz_asize,
	.vdev_op_min_asize = vdev_raidz_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_raidz_io_start,
	.vdev_op_io_done = vdev_raidz_io_done,
	.vdev_op_state_change = vdev_raidz_state_change,
	.vdev_op_need_resilver = vdev_raidz_need_resilver,
	.vdev_op_hold = NULL,
	.vdev_op_rele = NULL,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_raidz_xlate,
	.vdev_op_rebuild_asize = NULL,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = vdev_raidz_config_generate,
	.vdev_op_nparity = vdev_raidz_nparity,
	.vdev_op_ndisks = vdev_raidz_ndisks,
	.vdev_op_type = VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
};

/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW,
	"For testing, pause RAIDZ expansion after reflowing this many bytes");
ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW,
	"Max amount of concurrent i/o for RAIDZ expansion");
ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW,
	"For expanded RAIDZ, aggregate reads that have more rows than this");
ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW,
	"For expanded RAIDZ, automatically start a pool scrub when expansion "
	"completes");
/* END CSTYLED */
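
/*
 * Usage sketch (Linux, assuming the usual module parameter plumbing): the
 * declarations above surface as writable parameters named <prefix><name>,
 * for example:
 *
 *	echo 1048576 > /sys/module/zfs/parameters/raidz_expand_max_reflow_bytes
 *	echo 0 > /sys/module/zfs/parameters/zfs_scrub_after_expand
 *
 * The first pauses an expansion for testing after roughly 1 MiB has been
 * reflowed; the second disables the automatic scrub that otherwise starts
 * when an expansion completes.
 */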