md_intel.c revision 260385
1178476Sjb/*-
2178476Sjb * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
3178476Sjb * Copyright (c) 2000 - 2008 Søren Schmidt <sos@FreeBSD.org>
4178476Sjb * All rights reserved.
5178476Sjb *
6178476Sjb * Redistribution and use in source and binary forms, with or without
7178476Sjb * modification, are permitted provided that the following conditions
8178476Sjb * are met:
9178476Sjb * 1. Redistributions of source code must retain the above copyright
10178476Sjb *    notice, this list of conditions and the following disclaimer.
11178476Sjb * 2. Redistributions in binary form must reproduce the above copyright
12178476Sjb *    notice, this list of conditions and the following disclaimer in the
13178476Sjb *    documentation and/or other materials provided with the distribution.
14178476Sjb *
15178476Sjb * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
16178476Sjb * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17178476Sjb * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18178476Sjb * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
19178476Sjb * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20178476Sjb * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21178476Sjb * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22178476Sjb * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23178476Sjb * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24178476Sjb * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25178476Sjb * SUCH DAMAGE.
26178476Sjb */
27178476Sjb
28178476Sjb#include <sys/cdefs.h>
29178476Sjb__FBSDID("$FreeBSD: stable/10/sys/geom/raid/md_intel.c 260385 2014-01-07 01:32:23Z scottl $");
30178476Sjb
31178476Sjb#include <sys/param.h>
32178476Sjb#include <sys/bio.h>
33178476Sjb#include <sys/endian.h>
34178476Sjb#include <sys/kernel.h>
35178476Sjb#include <sys/kobj.h>
36178476Sjb#include <sys/limits.h>
37178476Sjb#include <sys/lock.h>
38178476Sjb#include <sys/malloc.h>
39178476Sjb#include <sys/mutex.h>
40178476Sjb#include <sys/systm.h>
41178476Sjb#include <sys/taskqueue.h>
42178476Sjb#include <geom/geom.h>
43178476Sjb#include "geom/raid/g_raid.h"
44178476Sjb#include "g_raid_md_if.h"
45178476Sjb
46178476Sjbstatic MALLOC_DEFINE(M_MD_INTEL, "md_intel_data", "GEOM_RAID Intel metadata");
47178476Sjb
/*
 * One RAID map of a volume.  The structure is variable-length: disk_idx[]
 * really carries total_disks entries, and whatever follows the map in the
 * metadata starts right after the last of them (see intel_get_map() and
 * intel_get_volume()).
 */
struct intel_raid_map {
	uint32_t	offset;		/* Low 32 bits, see offset_hi. */
	uint32_t	disk_sectors;	/* Low 32 bits, see disk_sectors_hi. */
	uint32_t	stripe_count;	/* Low 32 bits, see stripe_count_hi. */
	uint16_t	strip_sectors;

	uint8_t		status;
#define INTEL_S_READY           0x00
#define INTEL_S_UNINITIALIZED   0x01
#define INTEL_S_DEGRADED        0x02
#define INTEL_S_FAILURE         0x03

	uint8_t		type;
#define INTEL_T_RAID0           0x00
#define INTEL_T_RAID1           0x01
#define INTEL_T_RAID5           0x05

	uint8_t		total_disks;	/* Number of disk_idx[] entries. */
	uint8_t		total_domains;
	uint8_t		failed_disk_num;
	uint8_t		ddf;
	uint32_t	offset_hi;	/* High 32 bits of offset. */
	uint32_t	disk_sectors_hi; /* High 32 bits of disk_sectors. */
	uint32_t	stripe_count_hi; /* High 32 bits of stripe_count. */
	uint32_t	filler_2[4];

	/* Low 24 bits select a disk, bit 24 is the rebuild flag. */
	uint32_t	disk_idx[1];	/* total_disks entries. */
#define INTEL_DI_IDX	0x00ffffff
#define INTEL_DI_RBLD	0x01000000
} __packed;
76178476Sjb
77178476Sjbstruct intel_raid_vol {
78178476Sjb	uint8_t		name[16];
79178476Sjb	u_int64_t	total_sectors __packed;
80178476Sjb	uint32_t	state;
81178476Sjb#define INTEL_ST_BOOTABLE		0x00000001
82178476Sjb#define INTEL_ST_BOOT_DEVICE		0x00000002
83178476Sjb#define INTEL_ST_READ_COALESCING	0x00000004
84178476Sjb#define INTEL_ST_WRITE_COALESCING	0x00000008
85178476Sjb#define INTEL_ST_LAST_SHUTDOWN_DIRTY	0x00000010
86178476Sjb#define INTEL_ST_HIDDEN_AT_BOOT		0x00000020
87178476Sjb#define INTEL_ST_CURRENTLY_HIDDEN	0x00000040
88178476Sjb#define INTEL_ST_VERIFY_AND_FIX		0x00000080
89#define INTEL_ST_MAP_STATE_UNINIT	0x00000100
90#define INTEL_ST_NO_AUTO_RECOVERY	0x00000200
91#define INTEL_ST_CLONE_N_GO		0x00000400
92#define INTEL_ST_CLONE_MAN_SYNC		0x00000800
93#define INTEL_ST_CNG_MASTER_DISK_NUM	0x00001000
94	uint32_t	reserved;
95	uint8_t		migr_priority;
96	uint8_t		num_sub_vols;
97	uint8_t		tid;
98	uint8_t		cng_master_disk;
99	uint16_t	cache_policy;
100	uint8_t		cng_state;
101#define INTEL_CNGST_UPDATED		0
102#define INTEL_CNGST_NEEDS_UPDATE	1
103#define INTEL_CNGST_MASTER_MISSING	2
104	uint8_t		cng_sub_state;
105	uint32_t	filler_0[10];
106
107	uint32_t	curr_migr_unit;
108	uint32_t	checkpoint_id;
109	uint8_t		migr_state;
110	uint8_t		migr_type;
111#define INTEL_MT_INIT		0
112#define INTEL_MT_REBUILD	1
113#define INTEL_MT_VERIFY		2
114#define INTEL_MT_GEN_MIGR	3
115#define INTEL_MT_STATE_CHANGE	4
116#define INTEL_MT_REPAIR		5
117	uint8_t		dirty;
118	uint8_t		fs_state;
119	uint16_t	verify_errors;
120	uint16_t	bad_blocks;
121	uint32_t	curr_migr_unit_hi;
122	uint32_t	filler_1[3];
123	struct intel_raid_map map[1];	/* 2 entries if migr_state != 0. */
124} __packed;
125
/*
 * Per-disk record of the metadata disk table.  Disks are matched by their
 * serial string (see intel_meta_find_disk()); the sector count is split
 * into low and high 32-bit halves.
 */
struct intel_raid_disk {
#define INTEL_SERIAL_LEN	16
	uint8_t		serial[INTEL_SERIAL_LEN];
	uint32_t	sectors;	/* Low 32 bits, see sectors_hi. */
	uint32_t	id;

	uint32_t	flags;
#define INTEL_F_SPARE		0x01
#define INTEL_F_ASSIGNED	0x02
#define INTEL_F_FAILED		0x04
#define INTEL_F_ONLINE		0x08
#define INTEL_F_DISABLED	0x80

	uint32_t	owner_cfg_num;
	uint32_t	sectors_hi;	/* High 32 bits of sectors. */
	uint32_t	filler[3];
} __packed;
141
142struct intel_raid_conf {
143	uint8_t		intel_id[24];
144#define INTEL_MAGIC             "Intel Raid ISM Cfg Sig. "
145
146	uint8_t		version[6];
147#define INTEL_VERSION_1000	"1.0.00"	/* RAID0 */
148#define INTEL_VERSION_1100	"1.1.00"	/* RAID1 */
149#define INTEL_VERSION_1200	"1.2.00"	/* Many volumes */
150#define INTEL_VERSION_1201	"1.2.01"	/* 3 or 4 disks */
151#define INTEL_VERSION_1202	"1.2.02"	/* RAID5 */
152#define INTEL_VERSION_1204	"1.2.04"	/* 5 or 6 disks */
153#define INTEL_VERSION_1206	"1.2.06"	/* CNG */
154#define INTEL_VERSION_1300	"1.3.00"	/* Attributes */
155
156	uint8_t		dummy_0[2];
157	uint32_t	checksum;
158	uint32_t	config_size;
159	uint32_t	config_id;
160	uint32_t	generation;
161	uint32_t	error_log_size;
162	uint32_t	attributes;
163#define INTEL_ATTR_RAID0	0x00000001
164#define INTEL_ATTR_RAID1	0x00000002
165#define INTEL_ATTR_RAID10	0x00000004
166#define INTEL_ATTR_RAID1E	0x00000008
167#define INTEL_ATTR_RAID5	0x00000010
168#define INTEL_ATTR_RAIDCNG	0x00000020
169#define INTEL_ATTR_EXT_STRIP	0x00000040
170#define INTEL_ATTR_NVM_CACHE	0x02000000
171#define INTEL_ATTR_2TB_DISK	0x04000000
172#define INTEL_ATTR_BBM		0x08000000
173#define INTEL_ATTR_NVM_CACHE2	0x10000000
174#define INTEL_ATTR_2TB		0x20000000
175#define INTEL_ATTR_PM		0x40000000
176#define INTEL_ATTR_CHECKSUM	0x80000000
177
178	uint8_t		total_disks;
179	uint8_t		total_volumes;
180	uint8_t		error_log_pos;
181	uint8_t		dummy_2[1];
182	uint32_t	cache_size;
183	uint32_t	orig_config_id;
184	uint32_t	pwr_cycle_count;
185	uint32_t	bbm_log_size;
186	uint32_t	filler_0[35];
187	struct intel_raid_disk	disk[1];	/* total_disks entries. */
188	/* Here goes total_volumes of struct intel_raid_vol. */
189} __packed;
190
/*
 * Attribute bits this module accepts; intel_meta_read() rejects version
 * 1.3.00 metadata that has any other bit set.
 */
#define INTEL_ATTR_SUPPORTED						\
    (INTEL_ATTR_RAID0 | INTEL_ATTR_RAID1 | INTEL_ATTR_RAID10 |		\
     INTEL_ATTR_RAID1E | INTEL_ATTR_RAID5 | INTEL_ATTR_RAIDCNG |	\
     INTEL_ATTR_EXT_STRIP | INTEL_ATTR_2TB_DISK | INTEL_ATTR_2TB |	\
     INTEL_ATTR_PM | INTEL_ATTR_CHECKSUM)
195
/*
 * Upper bound on the metadata size for an array of ndisks disks: the
 * anchor with its disk table, two volumes, two extra maps and the
 * additional disk_idx[] entries of four maps.  The "- 1" terms account
 * for the one array element already included in each sizeof().
 * The argument is parenthesized so that expressions expand correctly.
 */
#define INTEL_MAX_MD_SIZE(ndisks)				\
    (sizeof(struct intel_raid_conf) +				\
     sizeof(struct intel_raid_disk) * ((ndisks) - 1) +		\
     sizeof(struct intel_raid_vol) * 2 +			\
     sizeof(struct intel_raid_map) * 2 +			\
     sizeof(uint32_t) * ((ndisks) - 1) * 4)
202
203struct g_raid_md_intel_perdisk {
204	struct intel_raid_conf	*pd_meta;
205	int			 pd_disk_pos;
206	struct intel_raid_disk	 pd_disk_meta;
207};
208
/* Per-volume private data, reached through g_raid_volume's v_md_data. */
struct g_raid_md_intel_pervolume {
	/* Index of the volume in the metadata, as used by intel_get_volume(). */
	int			 pv_volume_pos;
	int			 pv_cng;
	int			 pv_cng_man_sync;
	int			 pv_cng_master_disk;
};
215
216struct g_raid_md_intel_object {
217	struct g_raid_md_object	 mdio_base;
218	uint32_t		 mdio_config_id;
219	uint32_t		 mdio_orig_config_id;
220	uint32_t		 mdio_generation;
221	struct intel_raid_conf	*mdio_meta;
222	struct callout		 mdio_start_co;	/* STARTING state timer. */
223	int			 mdio_disks_present;
224	int			 mdio_started;
225	int			 mdio_incomplete;
226	struct root_hold_token	*mdio_rootmount; /* Root mount delay token. */
227};
228
229static g_raid_md_create_t g_raid_md_create_intel;
230static g_raid_md_taste_t g_raid_md_taste_intel;
231static g_raid_md_event_t g_raid_md_event_intel;
232static g_raid_md_ctl_t g_raid_md_ctl_intel;
233static g_raid_md_write_t g_raid_md_write_intel;
234static g_raid_md_fail_disk_t g_raid_md_fail_disk_intel;
235static g_raid_md_free_disk_t g_raid_md_free_disk_intel;
236static g_raid_md_free_volume_t g_raid_md_free_volume_intel;
237static g_raid_md_free_t g_raid_md_free_intel;
238
239static kobj_method_t g_raid_md_intel_methods[] = {
240	KOBJMETHOD(g_raid_md_create,	g_raid_md_create_intel),
241	KOBJMETHOD(g_raid_md_taste,	g_raid_md_taste_intel),
242	KOBJMETHOD(g_raid_md_event,	g_raid_md_event_intel),
243	KOBJMETHOD(g_raid_md_ctl,	g_raid_md_ctl_intel),
244	KOBJMETHOD(g_raid_md_write,	g_raid_md_write_intel),
245	KOBJMETHOD(g_raid_md_fail_disk,	g_raid_md_fail_disk_intel),
246	KOBJMETHOD(g_raid_md_free_disk,	g_raid_md_free_disk_intel),
247	KOBJMETHOD(g_raid_md_free_volume,	g_raid_md_free_volume_intel),
248	KOBJMETHOD(g_raid_md_free,	g_raid_md_free_intel),
249	{ 0, 0 }
250};
251
252static struct g_raid_md_class g_raid_md_intel_class = {
253	"Intel",
254	g_raid_md_intel_methods,
255	sizeof(struct g_raid_md_intel_object),
256	.mdc_enable = 1,
257	.mdc_priority = 100
258};
259
260
261static struct intel_raid_map *
262intel_get_map(struct intel_raid_vol *mvol, int i)
263{
264	struct intel_raid_map *mmap;
265
266	if (i > (mvol->migr_state ? 1 : 0))
267		return (NULL);
268	mmap = &mvol->map[0];
269	for (; i > 0; i--) {
270		mmap = (struct intel_raid_map *)
271		    &mmap->disk_idx[mmap->total_disks];
272	}
273	return ((struct intel_raid_map *)mmap);
274}
275
276static struct intel_raid_vol *
277intel_get_volume(struct intel_raid_conf *meta, int i)
278{
279	struct intel_raid_vol *mvol;
280	struct intel_raid_map *mmap;
281
282	if (i > 1)
283		return (NULL);
284	mvol = (struct intel_raid_vol *)&meta->disk[meta->total_disks];
285	for (; i > 0; i--) {
286		mmap = intel_get_map(mvol, mvol->migr_state ? 1 : 0);
287		mvol = (struct intel_raid_vol *)
288		    &mmap->disk_idx[mmap->total_disks];
289	}
290	return (mvol);
291}
292
293static off_t
294intel_get_map_offset(struct intel_raid_map *mmap)
295{
296	off_t offset = (off_t)mmap->offset_hi << 32;
297
298	offset += mmap->offset;
299	return (offset);
300}
301
302static void
303intel_set_map_offset(struct intel_raid_map *mmap, off_t offset)
304{
305
306	mmap->offset = offset & 0xffffffff;
307	mmap->offset_hi = offset >> 32;
308}
309
310static off_t
311intel_get_map_disk_sectors(struct intel_raid_map *mmap)
312{
313	off_t disk_sectors = (off_t)mmap->disk_sectors_hi << 32;
314
315	disk_sectors += mmap->disk_sectors;
316	return (disk_sectors);
317}
318
319static void
320intel_set_map_disk_sectors(struct intel_raid_map *mmap, off_t disk_sectors)
321{
322
323	mmap->disk_sectors = disk_sectors & 0xffffffff;
324	mmap->disk_sectors_hi = disk_sectors >> 32;
325}
326
327static void
328intel_set_map_stripe_count(struct intel_raid_map *mmap, off_t stripe_count)
329{
330
331	mmap->stripe_count = stripe_count & 0xffffffff;
332	mmap->stripe_count_hi = stripe_count >> 32;
333}
334
335static off_t
336intel_get_disk_sectors(struct intel_raid_disk *disk)
337{
338	off_t sectors = (off_t)disk->sectors_hi << 32;
339
340	sectors += disk->sectors;
341	return (sectors);
342}
343
344static void
345intel_set_disk_sectors(struct intel_raid_disk *disk, off_t sectors)
346{
347
348	disk->sectors = sectors & 0xffffffff;
349	disk->sectors_hi = sectors >> 32;
350}
351
352static off_t
353intel_get_vol_curr_migr_unit(struct intel_raid_vol *vol)
354{
355	off_t curr_migr_unit = (off_t)vol->curr_migr_unit_hi << 32;
356
357	curr_migr_unit += vol->curr_migr_unit;
358	return (curr_migr_unit);
359}
360
361static void
362intel_set_vol_curr_migr_unit(struct intel_raid_vol *vol, off_t curr_migr_unit)
363{
364
365	vol->curr_migr_unit = curr_migr_unit & 0xffffffff;
366	vol->curr_migr_unit_hi = curr_migr_unit >> 32;
367}
368
369static void
370g_raid_md_intel_print(struct intel_raid_conf *meta)
371{
372	struct intel_raid_vol *mvol;
373	struct intel_raid_map *mmap;
374	int i, j, k;
375
376	if (g_raid_debug < 1)
377		return;
378
379	printf("********* ATA Intel MatrixRAID Metadata *********\n");
380	printf("intel_id            <%.24s>\n", meta->intel_id);
381	printf("version             <%.6s>\n", meta->version);
382	printf("checksum            0x%08x\n", meta->checksum);
383	printf("config_size         0x%08x\n", meta->config_size);
384	printf("config_id           0x%08x\n", meta->config_id);
385	printf("generation          0x%08x\n", meta->generation);
386	printf("error_log_size      %d\n", meta->error_log_size);
387	printf("attributes          0x%08x\n", meta->attributes);
388	printf("total_disks         %u\n", meta->total_disks);
389	printf("total_volumes       %u\n", meta->total_volumes);
390	printf("error_log_pos       %u\n", meta->error_log_pos);
391	printf("cache_size          %u\n", meta->cache_size);
392	printf("orig_config_id      0x%08x\n", meta->orig_config_id);
393	printf("pwr_cycle_count     %u\n", meta->pwr_cycle_count);
394	printf("bbm_log_size        %u\n", meta->bbm_log_size);
395	printf("DISK#   serial disk_sectors disk_sectors_hi disk_id flags owner\n");
396	for (i = 0; i < meta->total_disks; i++ ) {
397		printf("    %d   <%.16s> %u %u 0x%08x 0x%08x %08x\n", i,
398		    meta->disk[i].serial, meta->disk[i].sectors,
399		    meta->disk[i].sectors_hi, meta->disk[i].id,
400		    meta->disk[i].flags, meta->disk[i].owner_cfg_num);
401	}
402	for (i = 0; i < meta->total_volumes; i++) {
403		mvol = intel_get_volume(meta, i);
404		printf(" ****** Volume %d ******\n", i);
405		printf(" name               %.16s\n", mvol->name);
406		printf(" total_sectors      %ju\n", mvol->total_sectors);
407		printf(" state              0x%08x\n", mvol->state);
408		printf(" reserved           %u\n", mvol->reserved);
409		printf(" migr_priority      %u\n", mvol->migr_priority);
410		printf(" num_sub_vols       %u\n", mvol->num_sub_vols);
411		printf(" tid                %u\n", mvol->tid);
412		printf(" cng_master_disk    %u\n", mvol->cng_master_disk);
413		printf(" cache_policy       %u\n", mvol->cache_policy);
414		printf(" cng_state          %u\n", mvol->cng_state);
415		printf(" cng_sub_state      %u\n", mvol->cng_sub_state);
416		printf(" curr_migr_unit     %u\n", mvol->curr_migr_unit);
417		printf(" curr_migr_unit_hi  %u\n", mvol->curr_migr_unit_hi);
418		printf(" checkpoint_id      %u\n", mvol->checkpoint_id);
419		printf(" migr_state         %u\n", mvol->migr_state);
420		printf(" migr_type          %u\n", mvol->migr_type);
421		printf(" dirty              %u\n", mvol->dirty);
422		printf(" fs_state           %u\n", mvol->fs_state);
423		printf(" verify_errors      %u\n", mvol->verify_errors);
424		printf(" bad_blocks         %u\n", mvol->bad_blocks);
425
426		for (j = 0; j < (mvol->migr_state ? 2 : 1); j++) {
427			printf("  *** Map %d ***\n", j);
428			mmap = intel_get_map(mvol, j);
429			printf("  offset            %u\n", mmap->offset);
430			printf("  offset_hi         %u\n", mmap->offset_hi);
431			printf("  disk_sectors      %u\n", mmap->disk_sectors);
432			printf("  disk_sectors_hi   %u\n", mmap->disk_sectors_hi);
433			printf("  stripe_count      %u\n", mmap->stripe_count);
434			printf("  stripe_count_hi   %u\n", mmap->stripe_count_hi);
435			printf("  strip_sectors     %u\n", mmap->strip_sectors);
436			printf("  status            %u\n", mmap->status);
437			printf("  type              %u\n", mmap->type);
438			printf("  total_disks       %u\n", mmap->total_disks);
439			printf("  total_domains     %u\n", mmap->total_domains);
440			printf("  failed_disk_num   %u\n", mmap->failed_disk_num);
441			printf("  ddf               %u\n", mmap->ddf);
442			printf("  disk_idx         ");
443			for (k = 0; k < mmap->total_disks; k++)
444				printf(" 0x%08x", mmap->disk_idx[k]);
445			printf("\n");
446		}
447	}
448	printf("=================================================\n");
449}
450
451static struct intel_raid_conf *
452intel_meta_copy(struct intel_raid_conf *meta)
453{
454	struct intel_raid_conf *nmeta;
455
456	nmeta = malloc(meta->config_size, M_MD_INTEL, M_WAITOK);
457	memcpy(nmeta, meta, meta->config_size);
458	return (nmeta);
459}
460
461static int
462intel_meta_find_disk(struct intel_raid_conf *meta, char *serial)
463{
464	int pos;
465
466	for (pos = 0; pos < meta->total_disks; pos++) {
467		if (strncmp(meta->disk[pos].serial,
468		    serial, INTEL_SERIAL_LEN) == 0)
469			return (pos);
470	}
471	return (-1);
472}
473
474static struct intel_raid_conf *
475intel_meta_read(struct g_consumer *cp)
476{
477	struct g_provider *pp;
478	struct intel_raid_conf *meta;
479	struct intel_raid_vol *mvol;
480	struct intel_raid_map *mmap, *mmap1;
481	char *buf;
482	int error, i, j, k, left, size;
483	uint32_t checksum, *ptr;
484
485	pp = cp->provider;
486
487	/* Read the anchor sector. */
488	buf = g_read_data(cp,
489	    pp->mediasize - pp->sectorsize * 2, pp->sectorsize, &error);
490	if (buf == NULL) {
491		G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).",
492		    pp->name, error);
493		return (NULL);
494	}
495	meta = (struct intel_raid_conf *)buf;
496
497	/* Check if this is an Intel RAID struct */
498	if (strncmp(meta->intel_id, INTEL_MAGIC, strlen(INTEL_MAGIC))) {
499		G_RAID_DEBUG(1, "Intel signature check failed on %s", pp->name);
500		g_free(buf);
501		return (NULL);
502	}
503	if (meta->config_size > 65536 ||
504	    meta->config_size < sizeof(struct intel_raid_conf)) {
505		G_RAID_DEBUG(1, "Intel metadata size looks wrong: %d",
506		    meta->config_size);
507		g_free(buf);
508		return (NULL);
509	}
510	size = meta->config_size;
511	meta = malloc(size, M_MD_INTEL, M_WAITOK);
512	memcpy(meta, buf, min(size, pp->sectorsize));
513	g_free(buf);
514
515	/* Read all the rest, if needed. */
516	if (meta->config_size > pp->sectorsize) {
517		left = (meta->config_size - 1) / pp->sectorsize;
518		buf = g_read_data(cp,
519		    pp->mediasize - pp->sectorsize * (2 + left),
520		    pp->sectorsize * left, &error);
521		if (buf == NULL) {
522			G_RAID_DEBUG(1, "Cannot read remaining metadata"
523			    " part from %s (error=%d).",
524			    pp->name, error);
525			free(meta, M_MD_INTEL);
526			return (NULL);
527		}
528		memcpy(((char *)meta) + pp->sectorsize, buf,
529		    pp->sectorsize * left);
530		g_free(buf);
531	}
532
533	/* Check metadata checksum. */
534	for (checksum = 0, ptr = (uint32_t *)meta, i = 0;
535	    i < (meta->config_size / sizeof(uint32_t)); i++) {
536		checksum += *ptr++;
537	}
538	checksum -= meta->checksum;
539	if (checksum != meta->checksum) {
540		G_RAID_DEBUG(1, "Intel checksum check failed on %s", pp->name);
541		free(meta, M_MD_INTEL);
542		return (NULL);
543	}
544
545	/* Validate metadata size. */
546	size = sizeof(struct intel_raid_conf) +
547	    sizeof(struct intel_raid_disk) * (meta->total_disks - 1) +
548	    sizeof(struct intel_raid_vol) * meta->total_volumes;
549	if (size > meta->config_size) {
550badsize:
551		G_RAID_DEBUG(1, "Intel metadata size incorrect %d < %d",
552		    meta->config_size, size);
553		free(meta, M_MD_INTEL);
554		return (NULL);
555	}
556	for (i = 0; i < meta->total_volumes; i++) {
557		mvol = intel_get_volume(meta, i);
558		mmap = intel_get_map(mvol, 0);
559		size += 4 * (mmap->total_disks - 1);
560		if (size > meta->config_size)
561			goto badsize;
562		if (mvol->migr_state) {
563			size += sizeof(struct intel_raid_map);
564			if (size > meta->config_size)
565				goto badsize;
566			mmap = intel_get_map(mvol, 1);
567			size += 4 * (mmap->total_disks - 1);
568			if (size > meta->config_size)
569				goto badsize;
570		}
571	}
572
573	g_raid_md_intel_print(meta);
574
575	if (strncmp(meta->version, INTEL_VERSION_1300, 6) > 0) {
576		G_RAID_DEBUG(1, "Intel unsupported version: '%.6s'",
577		    meta->version);
578		free(meta, M_MD_INTEL);
579		return (NULL);
580	}
581
582	if (strncmp(meta->version, INTEL_VERSION_1300, 6) >= 0 &&
583	    (meta->attributes & ~INTEL_ATTR_SUPPORTED) != 0) {
584		G_RAID_DEBUG(1, "Intel unsupported attributes: 0x%08x",
585		    meta->attributes & ~INTEL_ATTR_SUPPORTED);
586		free(meta, M_MD_INTEL);
587		return (NULL);
588	}
589
590	/* Validate disk indexes. */
591	for (i = 0; i < meta->total_volumes; i++) {
592		mvol = intel_get_volume(meta, i);
593		for (j = 0; j < (mvol->migr_state ? 2 : 1); j++) {
594			mmap = intel_get_map(mvol, j);
595			for (k = 0; k < mmap->total_disks; k++) {
596				if ((mmap->disk_idx[k] & INTEL_DI_IDX) >
597				    meta->total_disks) {
598					G_RAID_DEBUG(1, "Intel metadata disk"
599					    " index %d too big (>%d)",
600					    mmap->disk_idx[k] & INTEL_DI_IDX,
601					    meta->total_disks);
602					free(meta, M_MD_INTEL);
603					return (NULL);
604				}
605			}
606		}
607	}
608
609	/* Validate migration types. */
610	for (i = 0; i < meta->total_volumes; i++) {
611		mvol = intel_get_volume(meta, i);
612		/* Deny unknown migration types. */
613		if (mvol->migr_state &&
614		    mvol->migr_type != INTEL_MT_INIT &&
615		    mvol->migr_type != INTEL_MT_REBUILD &&
616		    mvol->migr_type != INTEL_MT_VERIFY &&
617		    mvol->migr_type != INTEL_MT_GEN_MIGR &&
618		    mvol->migr_type != INTEL_MT_REPAIR) {
619			G_RAID_DEBUG(1, "Intel metadata has unsupported"
620			    " migration type %d", mvol->migr_type);
621			free(meta, M_MD_INTEL);
622			return (NULL);
623		}
624		/* Deny general migrations except SINGLE->RAID1. */
625		if (mvol->migr_state &&
626		    mvol->migr_type == INTEL_MT_GEN_MIGR) {
627			mmap = intel_get_map(mvol, 0);
628			mmap1 = intel_get_map(mvol, 1);
629			if (mmap1->total_disks != 1 ||
630			    mmap->type != INTEL_T_RAID1 ||
631			    mmap->total_disks != 2 ||
632			    mmap->offset != mmap1->offset ||
633			    mmap->disk_sectors != mmap1->disk_sectors ||
634			    mmap->total_domains != mmap->total_disks ||
635			    mmap->offset_hi != mmap1->offset_hi ||
636			    mmap->disk_sectors_hi != mmap1->disk_sectors_hi ||
637			    (mmap->disk_idx[0] != mmap1->disk_idx[0] &&
638			     mmap->disk_idx[0] != mmap1->disk_idx[1])) {
639				G_RAID_DEBUG(1, "Intel metadata has unsupported"
640				    " variant of general migration");
641				free(meta, M_MD_INTEL);
642				return (NULL);
643			}
644		}
645	}
646
647	return (meta);
648}
649
650static int
651intel_meta_write(struct g_consumer *cp, struct intel_raid_conf *meta)
652{
653	struct g_provider *pp;
654	char *buf;
655	int error, i, sectors;
656	uint32_t checksum, *ptr;
657
658	pp = cp->provider;
659
660	/* Recalculate checksum for case if metadata were changed. */
661	meta->checksum = 0;
662	for (checksum = 0, ptr = (uint32_t *)meta, i = 0;
663	    i < (meta->config_size / sizeof(uint32_t)); i++) {
664		checksum += *ptr++;
665	}
666	meta->checksum = checksum;
667
668	/* Create and fill buffer. */
669	sectors = (meta->config_size + pp->sectorsize - 1) / pp->sectorsize;
670	buf = malloc(sectors * pp->sectorsize, M_MD_INTEL, M_WAITOK | M_ZERO);
671	if (sectors > 1) {
672		memcpy(buf, ((char *)meta) + pp->sectorsize,
673		    (sectors - 1) * pp->sectorsize);
674	}
675	memcpy(buf + (sectors - 1) * pp->sectorsize, meta, pp->sectorsize);
676
677	error = g_write_data(cp,
678	    pp->mediasize - pp->sectorsize * (1 + sectors),
679	    buf, pp->sectorsize * sectors);
680	if (error != 0) {
681		G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).",
682		    pp->name, error);
683	}
684
685	free(buf, M_MD_INTEL);
686	return (error);
687}
688
689static int
690intel_meta_erase(struct g_consumer *cp)
691{
692	struct g_provider *pp;
693	char *buf;
694	int error;
695
696	pp = cp->provider;
697	buf = malloc(pp->sectorsize, M_MD_INTEL, M_WAITOK | M_ZERO);
698	error = g_write_data(cp,
699	    pp->mediasize - 2 * pp->sectorsize,
700	    buf, pp->sectorsize);
701	if (error != 0) {
702		G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).",
703		    pp->name, error);
704	}
705	free(buf, M_MD_INTEL);
706	return (error);
707}
708
709static int
710intel_meta_write_spare(struct g_consumer *cp, struct intel_raid_disk *d)
711{
712	struct intel_raid_conf *meta;
713	int error;
714
715	/* Fill anchor and single disk. */
716	meta = malloc(INTEL_MAX_MD_SIZE(1), M_MD_INTEL, M_WAITOK | M_ZERO);
717	memcpy(&meta->intel_id[0], INTEL_MAGIC, sizeof(INTEL_MAGIC) - 1);
718	memcpy(&meta->version[0], INTEL_VERSION_1000,
719	    sizeof(INTEL_VERSION_1000) - 1);
720	meta->config_size = INTEL_MAX_MD_SIZE(1);
721	meta->config_id = meta->orig_config_id = arc4random();
722	meta->generation = 1;
723	meta->total_disks = 1;
724	meta->disk[0] = *d;
725	error = intel_meta_write(cp, meta);
726	free(meta, M_MD_INTEL);
727	return (error);
728}
729
730static struct g_raid_disk *
731g_raid_md_intel_get_disk(struct g_raid_softc *sc, int id)
732{
733	struct g_raid_disk	*disk;
734	struct g_raid_md_intel_perdisk *pd;
735
736	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
737		pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
738		if (pd->pd_disk_pos == id)
739			break;
740	}
741	return (disk);
742}
743
744static int
745g_raid_md_intel_supported(int level, int qual, int disks, int force)
746{
747
748	switch (level) {
749	case G_RAID_VOLUME_RL_RAID0:
750		if (disks < 1)
751			return (0);
752		if (!force && (disks < 2 || disks > 6))
753			return (0);
754		break;
755	case G_RAID_VOLUME_RL_RAID1:
756		if (disks < 1)
757			return (0);
758		if (!force && (disks != 2))
759			return (0);
760		break;
761	case G_RAID_VOLUME_RL_RAID1E:
762		if (disks < 2)
763			return (0);
764		if (!force && (disks != 4))
765			return (0);
766		break;
767	case G_RAID_VOLUME_RL_RAID5:
768		if (disks < 3)
769			return (0);
770		if (!force && disks > 6)
771			return (0);
772		if (qual != G_RAID_VOLUME_RLQ_R5LA)
773			return (0);
774		break;
775	default:
776		return (0);
777	}
778	if (level != G_RAID_VOLUME_RL_RAID5 && qual != G_RAID_VOLUME_RLQ_NONE)
779		return (0);
780	return (1);
781}
782
783static struct g_raid_volume *
784g_raid_md_intel_get_volume(struct g_raid_softc *sc, int id)
785{
786	struct g_raid_volume	*mvol;
787	struct g_raid_md_intel_pervolume *pv;
788
789	TAILQ_FOREACH(mvol, &sc->sc_volumes, v_next) {
790		pv = mvol->v_md_data;
791		if (pv->pv_volume_pos == id)
792			break;
793	}
794	return (mvol);
795}
796
797static int
798g_raid_md_intel_start_disk(struct g_raid_disk *disk)
799{
800	struct g_raid_softc *sc;
801	struct g_raid_subdisk *sd, *tmpsd;
802	struct g_raid_disk *olddisk, *tmpdisk;
803	struct g_raid_md_object *md;
804	struct g_raid_md_intel_object *mdi;
805	struct g_raid_md_intel_pervolume *pv;
806	struct g_raid_md_intel_perdisk *pd, *oldpd;
807	struct intel_raid_conf *meta;
808	struct intel_raid_vol *mvol;
809	struct intel_raid_map *mmap0, *mmap1;
810	int disk_pos, resurrection = 0, migr_global, i;
811
812	sc = disk->d_softc;
813	md = sc->sc_md;
814	mdi = (struct g_raid_md_intel_object *)md;
815	meta = mdi->mdio_meta;
816	pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
817	olddisk = NULL;
818
819	/* Find disk position in metadata by it's serial. */
820	disk_pos = intel_meta_find_disk(meta, pd->pd_disk_meta.serial);
821	if (disk_pos < 0) {
822		G_RAID_DEBUG1(1, sc, "Unknown, probably new or stale disk");
823		/* Failed stale disk is useless for us. */
824		if ((pd->pd_disk_meta.flags & INTEL_F_FAILED) &&
825		    !(pd->pd_disk_meta.flags & INTEL_F_DISABLED)) {
826			g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE_FAILED);
827			return (0);
828		}
829		/* If we are in the start process, that's all for now. */
830		if (!mdi->mdio_started)
831			goto nofit;
832		/*
833		 * If we have already started - try to get use of the disk.
834		 * Try to replace OFFLINE disks first, then FAILED.
835		 */
836		TAILQ_FOREACH(tmpdisk, &sc->sc_disks, d_next) {
837			if (tmpdisk->d_state != G_RAID_DISK_S_OFFLINE &&
838			    tmpdisk->d_state != G_RAID_DISK_S_FAILED)
839				continue;
840			/* Make sure this disk is big enough. */
841			TAILQ_FOREACH(sd, &tmpdisk->d_subdisks, sd_next) {
842				off_t disk_sectors =
843				    intel_get_disk_sectors(&pd->pd_disk_meta);
844
845				if (sd->sd_offset + sd->sd_size + 4096 >
846				    disk_sectors * 512) {
847					G_RAID_DEBUG1(1, sc,
848					    "Disk too small (%llu < %llu)",
849					    (unsigned long long)
850					    disk_sectors * 512,
851					    (unsigned long long)
852					    sd->sd_offset + sd->sd_size + 4096);
853					break;
854				}
855			}
856			if (sd != NULL)
857				continue;
858			if (tmpdisk->d_state == G_RAID_DISK_S_OFFLINE) {
859				olddisk = tmpdisk;
860				break;
861			} else if (olddisk == NULL)
862				olddisk = tmpdisk;
863		}
864		if (olddisk == NULL) {
865nofit:
866			if (pd->pd_disk_meta.flags & INTEL_F_SPARE) {
867				g_raid_change_disk_state(disk,
868				    G_RAID_DISK_S_SPARE);
869				return (1);
870			} else {
871				g_raid_change_disk_state(disk,
872				    G_RAID_DISK_S_STALE);
873				return (0);
874			}
875		}
876		oldpd = (struct g_raid_md_intel_perdisk *)olddisk->d_md_data;
877		disk_pos = oldpd->pd_disk_pos;
878		resurrection = 1;
879	}
880
881	if (olddisk == NULL) {
882		/* Find placeholder by position. */
883		olddisk = g_raid_md_intel_get_disk(sc, disk_pos);
884		if (olddisk == NULL)
885			panic("No disk at position %d!", disk_pos);
886		if (olddisk->d_state != G_RAID_DISK_S_OFFLINE) {
887			G_RAID_DEBUG1(1, sc, "More then one disk for pos %d",
888			    disk_pos);
889			g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE);
890			return (0);
891		}
892		oldpd = (struct g_raid_md_intel_perdisk *)olddisk->d_md_data;
893	}
894
895	/* Replace failed disk or placeholder with new disk. */
896	TAILQ_FOREACH_SAFE(sd, &olddisk->d_subdisks, sd_next, tmpsd) {
897		TAILQ_REMOVE(&olddisk->d_subdisks, sd, sd_next);
898		TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
899		sd->sd_disk = disk;
900	}
901	oldpd->pd_disk_pos = -2;
902	pd->pd_disk_pos = disk_pos;
903
904	/* If it was placeholder -- destroy it. */
905	if (olddisk->d_state == G_RAID_DISK_S_OFFLINE) {
906		g_raid_destroy_disk(olddisk);
907	} else {
908		/* Otherwise, make it STALE_FAILED. */
909		g_raid_change_disk_state(olddisk, G_RAID_DISK_S_STALE_FAILED);
910		/* Update global metadata just in case. */
911		memcpy(&meta->disk[disk_pos], &pd->pd_disk_meta,
912		    sizeof(struct intel_raid_disk));
913	}
914
915	/* Welcome the new disk. */
916	if ((meta->disk[disk_pos].flags & INTEL_F_DISABLED) &&
917	    !(pd->pd_disk_meta.flags & INTEL_F_SPARE))
918		g_raid_change_disk_state(disk, G_RAID_DISK_S_DISABLED);
919	else if (resurrection)
920		g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);
921	else if (meta->disk[disk_pos].flags & INTEL_F_FAILED)
922		g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED);
923	else if (meta->disk[disk_pos].flags & INTEL_F_SPARE)
924		g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE);
925	else
926		g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);
927	TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
928		pv = sd->sd_volume->v_md_data;
929		mvol = intel_get_volume(meta, pv->pv_volume_pos);
930		mmap0 = intel_get_map(mvol, 0);
931		if (mvol->migr_state)
932			mmap1 = intel_get_map(mvol, 1);
933		else
934			mmap1 = mmap0;
935
936		migr_global = 1;
937		for (i = 0; i < mmap0->total_disks; i++) {
938			if ((mmap0->disk_idx[i] & INTEL_DI_RBLD) == 0 &&
939			    (mmap1->disk_idx[i] & INTEL_DI_RBLD) != 0)
940				migr_global = 0;
941		}
942
943		if ((meta->disk[disk_pos].flags & INTEL_F_DISABLED) &&
944		    !(pd->pd_disk_meta.flags & INTEL_F_SPARE)) {
945			/* Disabled disk, useless. */
946			g_raid_change_subdisk_state(sd,
947			    G_RAID_SUBDISK_S_NONE);
948		} else if (resurrection) {
949			/* Stale disk, almost same as new. */
950			g_raid_change_subdisk_state(sd,
951			    G_RAID_SUBDISK_S_NEW);
952		} else if (meta->disk[disk_pos].flags & INTEL_F_FAILED) {
953			/* Failed disk, almost useless. */
954			g_raid_change_subdisk_state(sd,
955			    G_RAID_SUBDISK_S_FAILED);
956		} else if (mvol->migr_state == 0) {
957			if (mmap0->status == INTEL_S_UNINITIALIZED &&
958			    (!pv->pv_cng || pv->pv_cng_master_disk != disk_pos)) {
959				/* Freshly created uninitialized volume. */
960				g_raid_change_subdisk_state(sd,
961				    G_RAID_SUBDISK_S_UNINITIALIZED);
962			} else if (mmap0->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) {
963				/* Freshly inserted disk. */
964				g_raid_change_subdisk_state(sd,
965				    G_RAID_SUBDISK_S_NEW);
966			} else if (mvol->dirty && (!pv->pv_cng ||
967			    pv->pv_cng_master_disk != disk_pos)) {
968				/* Dirty volume (unclean shutdown). */
969				g_raid_change_subdisk_state(sd,
970				    G_RAID_SUBDISK_S_STALE);
971			} else {
972				/* Up to date disk. */
973				g_raid_change_subdisk_state(sd,
974				    G_RAID_SUBDISK_S_ACTIVE);
975			}
976		} else if (mvol->migr_type == INTEL_MT_INIT ||
977			   mvol->migr_type == INTEL_MT_REBUILD) {
978			if (mmap0->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) {
979				/* Freshly inserted disk. */
980				g_raid_change_subdisk_state(sd,
981				    G_RAID_SUBDISK_S_NEW);
982			} else if (mmap1->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) {
983				/* Rebuilding disk. */
984				g_raid_change_subdisk_state(sd,
985				    G_RAID_SUBDISK_S_REBUILD);
986				if (mvol->dirty) {
987					sd->sd_rebuild_pos = 0;
988				} else {
989					sd->sd_rebuild_pos =
990					    intel_get_vol_curr_migr_unit(mvol) *
991					    sd->sd_volume->v_strip_size *
992					    mmap0->total_domains;
993				}
994			} else if (mvol->migr_type == INTEL_MT_INIT &&
995			    migr_global) {
996				/* Freshly created uninitialized volume. */
997				g_raid_change_subdisk_state(sd,
998				    G_RAID_SUBDISK_S_UNINITIALIZED);
999			} else if (mvol->dirty && (!pv->pv_cng ||
1000			    pv->pv_cng_master_disk != disk_pos)) {
1001				/* Dirty volume (unclean shutdown). */
1002				g_raid_change_subdisk_state(sd,
1003				    G_RAID_SUBDISK_S_STALE);
1004			} else {
1005				/* Up to date disk. */
1006				g_raid_change_subdisk_state(sd,
1007				    G_RAID_SUBDISK_S_ACTIVE);
1008			}
1009		} else if (mvol->migr_type == INTEL_MT_VERIFY ||
1010			   mvol->migr_type == INTEL_MT_REPAIR) {
1011			if (mmap0->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) {
1012				/* Freshly inserted disk. */
1013				g_raid_change_subdisk_state(sd,
1014				    G_RAID_SUBDISK_S_NEW);
1015			} else if ((mmap1->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) ||
1016			    migr_global) {
1017				/* Resyncing disk. */
1018				g_raid_change_subdisk_state(sd,
1019				    G_RAID_SUBDISK_S_RESYNC);
1020				if (mvol->dirty) {
1021					sd->sd_rebuild_pos = 0;
1022				} else {
1023					sd->sd_rebuild_pos =
1024					    intel_get_vol_curr_migr_unit(mvol) *
1025					    sd->sd_volume->v_strip_size *
1026					    mmap0->total_domains;
1027				}
1028			} else if (mvol->dirty) {
1029				/* Dirty volume (unclean shutdown). */
1030				g_raid_change_subdisk_state(sd,
1031				    G_RAID_SUBDISK_S_STALE);
1032			} else {
1033				/* Up to date disk. */
1034				g_raid_change_subdisk_state(sd,
1035				    G_RAID_SUBDISK_S_ACTIVE);
1036			}
1037		} else if (mvol->migr_type == INTEL_MT_GEN_MIGR) {
1038			if ((mmap1->disk_idx[0] & INTEL_DI_IDX) != disk_pos) {
1039				/* Freshly inserted disk. */
1040				g_raid_change_subdisk_state(sd,
1041				    G_RAID_SUBDISK_S_NEW);
1042			} else {
1043				/* Up to date disk. */
1044				g_raid_change_subdisk_state(sd,
1045				    G_RAID_SUBDISK_S_ACTIVE);
1046			}
1047		}
1048		g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
1049		    G_RAID_EVENT_SUBDISK);
1050	}
1051
1052	/* Update status of our need for spare. */
1053	if (mdi->mdio_started) {
1054		mdi->mdio_incomplete =
1055		    (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) +
1056		     g_raid_ndisks(sc, G_RAID_DISK_S_DISABLED) <
1057		     meta->total_disks);
1058	}
1059
1060	return (resurrection);
1061}
1062
1063static void
1064g_disk_md_intel_retaste(void *arg, int pending)
1065{
1066
1067	G_RAID_DEBUG(1, "Array is not complete, trying to retaste.");
1068	g_retaste(&g_raid_class);
1069	free(arg, M_MD_INTEL);
1070}
1071
1072static void
1073g_raid_md_intel_refill(struct g_raid_softc *sc)
1074{
1075	struct g_raid_md_object *md;
1076	struct g_raid_md_intel_object *mdi;
1077	struct intel_raid_conf *meta;
1078	struct g_raid_disk *disk;
1079	struct task *task;
1080	int update, na;
1081
1082	md = sc->sc_md;
1083	mdi = (struct g_raid_md_intel_object *)md;
1084	meta = mdi->mdio_meta;
1085	update = 0;
1086	do {
1087		/* Make sure we miss anything. */
1088		na = g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) +
1089		    g_raid_ndisks(sc, G_RAID_DISK_S_DISABLED);
1090		if (na == meta->total_disks)
1091			break;
1092
1093		G_RAID_DEBUG1(1, md->mdo_softc,
1094		    "Array is not complete (%d of %d), "
1095		    "trying to refill.", na, meta->total_disks);
1096
1097		/* Try to get use some of STALE disks. */
1098		TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1099			if (disk->d_state == G_RAID_DISK_S_STALE) {
1100				update += g_raid_md_intel_start_disk(disk);
1101				if (disk->d_state == G_RAID_DISK_S_ACTIVE ||
1102				    disk->d_state == G_RAID_DISK_S_DISABLED)
1103					break;
1104			}
1105		}
1106		if (disk != NULL)
1107			continue;
1108
1109		/* Try to get use some of SPARE disks. */
1110		TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1111			if (disk->d_state == G_RAID_DISK_S_SPARE) {
1112				update += g_raid_md_intel_start_disk(disk);
1113				if (disk->d_state == G_RAID_DISK_S_ACTIVE)
1114					break;
1115			}
1116		}
1117	} while (disk != NULL);
1118
1119	/* Write new metadata if we changed something. */
1120	if (update) {
1121		g_raid_md_write_intel(md, NULL, NULL, NULL);
1122		meta = mdi->mdio_meta;
1123	}
1124
1125	/* Update status of our need for spare. */
1126	mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) +
1127	    g_raid_ndisks(sc, G_RAID_DISK_S_DISABLED) < meta->total_disks);
1128
1129	/* Request retaste hoping to find spare. */
1130	if (mdi->mdio_incomplete) {
1131		task = malloc(sizeof(struct task),
1132		    M_MD_INTEL, M_WAITOK | M_ZERO);
1133		TASK_INIT(task, 0, g_disk_md_intel_retaste, task);
1134		taskqueue_enqueue(taskqueue_swi, task);
1135	}
1136}
1137
1138static void
1139g_raid_md_intel_start(struct g_raid_softc *sc)
1140{
1141	struct g_raid_md_object *md;
1142	struct g_raid_md_intel_object *mdi;
1143	struct g_raid_md_intel_pervolume *pv;
1144	struct g_raid_md_intel_perdisk *pd;
1145	struct intel_raid_conf *meta;
1146	struct intel_raid_vol *mvol;
1147	struct intel_raid_map *mmap;
1148	struct g_raid_volume *vol;
1149	struct g_raid_subdisk *sd;
1150	struct g_raid_disk *disk;
1151	int i, j, disk_pos;
1152
1153	md = sc->sc_md;
1154	mdi = (struct g_raid_md_intel_object *)md;
1155	meta = mdi->mdio_meta;
1156
1157	/* Create volumes and subdisks. */
1158	for (i = 0; i < meta->total_volumes; i++) {
1159		mvol = intel_get_volume(meta, i);
1160		mmap = intel_get_map(mvol, 0);
1161		vol = g_raid_create_volume(sc, mvol->name, mvol->tid - 1);
1162		pv = malloc(sizeof(*pv), M_MD_INTEL, M_WAITOK | M_ZERO);
1163		pv->pv_volume_pos = i;
1164		pv->pv_cng = (mvol->state & INTEL_ST_CLONE_N_GO) != 0;
1165		pv->pv_cng_man_sync = (mvol->state & INTEL_ST_CLONE_MAN_SYNC) != 0;
1166		if (mvol->cng_master_disk < mmap->total_disks)
1167			pv->pv_cng_master_disk = mvol->cng_master_disk;
1168		vol->v_md_data = pv;
1169		vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE;
1170		if (mmap->type == INTEL_T_RAID0)
1171			vol->v_raid_level = G_RAID_VOLUME_RL_RAID0;
1172		else if (mmap->type == INTEL_T_RAID1 &&
1173		    mmap->total_domains >= 2 &&
1174		    mmap->total_domains <= mmap->total_disks) {
1175			/* Assume total_domains is correct. */
1176			if (mmap->total_domains == mmap->total_disks)
1177				vol->v_raid_level = G_RAID_VOLUME_RL_RAID1;
1178			else
1179				vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E;
1180		} else if (mmap->type == INTEL_T_RAID1) {
1181			/* total_domains looks wrong. */
1182			if (mmap->total_disks <= 2)
1183				vol->v_raid_level = G_RAID_VOLUME_RL_RAID1;
1184			else
1185				vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E;
1186		} else if (mmap->type == INTEL_T_RAID5) {
1187			vol->v_raid_level = G_RAID_VOLUME_RL_RAID5;
1188			vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LA;
1189		} else
1190			vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN;
1191		vol->v_strip_size = (u_int)mmap->strip_sectors * 512; //ZZZ
1192		vol->v_disks_count = mmap->total_disks;
1193		vol->v_mediasize = (off_t)mvol->total_sectors * 512; //ZZZ
1194		vol->v_sectorsize = 512; //ZZZ
1195		for (j = 0; j < vol->v_disks_count; j++) {
1196			sd = &vol->v_subdisks[j];
1197			sd->sd_offset = intel_get_map_offset(mmap) * 512; //ZZZ
1198			sd->sd_size = intel_get_map_disk_sectors(mmap) * 512; //ZZZ
1199		}
1200		g_raid_start_volume(vol);
1201	}
1202
1203	/* Create disk placeholders to store data for later writing. */
1204	for (disk_pos = 0; disk_pos < meta->total_disks; disk_pos++) {
1205		pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO);
1206		pd->pd_disk_pos = disk_pos;
1207		pd->pd_disk_meta = meta->disk[disk_pos];
1208		disk = g_raid_create_disk(sc);
1209		disk->d_md_data = (void *)pd;
1210		disk->d_state = G_RAID_DISK_S_OFFLINE;
1211		for (i = 0; i < meta->total_volumes; i++) {
1212			mvol = intel_get_volume(meta, i);
1213			mmap = intel_get_map(mvol, 0);
1214			for (j = 0; j < mmap->total_disks; j++) {
1215				if ((mmap->disk_idx[j] & INTEL_DI_IDX) == disk_pos)
1216					break;
1217			}
1218			if (j == mmap->total_disks)
1219				continue;
1220			vol = g_raid_md_intel_get_volume(sc, i);
1221			sd = &vol->v_subdisks[j];
1222			sd->sd_disk = disk;
1223			TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
1224		}
1225	}
1226
1227	/* Make all disks found till the moment take their places. */
1228	do {
1229		TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1230			if (disk->d_state == G_RAID_DISK_S_NONE) {
1231				g_raid_md_intel_start_disk(disk);
1232				break;
1233			}
1234		}
1235	} while (disk != NULL);
1236
1237	mdi->mdio_started = 1;
1238	G_RAID_DEBUG1(0, sc, "Array started.");
1239	g_raid_md_write_intel(md, NULL, NULL, NULL);
1240
1241	/* Pickup any STALE/SPARE disks to refill array if needed. */
1242	g_raid_md_intel_refill(sc);
1243
1244	TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
1245		g_raid_event_send(vol, G_RAID_VOLUME_E_START,
1246		    G_RAID_EVENT_VOLUME);
1247	}
1248
1249	callout_stop(&mdi->mdio_start_co);
1250	G_RAID_DEBUG1(1, sc, "root_mount_rel %p", mdi->mdio_rootmount);
1251	root_mount_rel(mdi->mdio_rootmount);
1252	mdi->mdio_rootmount = NULL;
1253}
1254
1255static void
1256g_raid_md_intel_new_disk(struct g_raid_disk *disk)
1257{
1258	struct g_raid_softc *sc;
1259	struct g_raid_md_object *md;
1260	struct g_raid_md_intel_object *mdi;
1261	struct intel_raid_conf *pdmeta;
1262	struct g_raid_md_intel_perdisk *pd;
1263
1264	sc = disk->d_softc;
1265	md = sc->sc_md;
1266	mdi = (struct g_raid_md_intel_object *)md;
1267	pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
1268	pdmeta = pd->pd_meta;
1269
1270	if (mdi->mdio_started) {
1271		if (g_raid_md_intel_start_disk(disk))
1272			g_raid_md_write_intel(md, NULL, NULL, NULL);
1273	} else {
1274		/* If we haven't started yet - check metadata freshness. */
1275		if (mdi->mdio_meta == NULL ||
1276		    ((int32_t)(pdmeta->generation - mdi->mdio_generation)) > 0) {
1277			G_RAID_DEBUG1(1, sc, "Newer disk");
1278			if (mdi->mdio_meta != NULL)
1279				free(mdi->mdio_meta, M_MD_INTEL);
1280			mdi->mdio_meta = intel_meta_copy(pdmeta);
1281			mdi->mdio_generation = mdi->mdio_meta->generation;
1282			mdi->mdio_disks_present = 1;
1283		} else if (pdmeta->generation == mdi->mdio_generation) {
1284			mdi->mdio_disks_present++;
1285			G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)",
1286			    mdi->mdio_disks_present,
1287			    mdi->mdio_meta->total_disks);
1288		} else {
1289			G_RAID_DEBUG1(1, sc, "Older disk");
1290		}
1291		/* If we collected all needed disks - start array. */
1292		if (mdi->mdio_disks_present == mdi->mdio_meta->total_disks)
1293			g_raid_md_intel_start(sc);
1294	}
1295}
1296
1297static void
1298g_raid_intel_go(void *arg)
1299{
1300	struct g_raid_softc *sc;
1301	struct g_raid_md_object *md;
1302	struct g_raid_md_intel_object *mdi;
1303
1304	sc = arg;
1305	md = sc->sc_md;
1306	mdi = (struct g_raid_md_intel_object *)md;
1307	if (!mdi->mdio_started) {
1308		G_RAID_DEBUG1(0, sc, "Force array start due to timeout.");
1309		g_raid_event_send(sc, G_RAID_NODE_E_START, 0);
1310	}
1311}
1312
1313static int
1314g_raid_md_create_intel(struct g_raid_md_object *md, struct g_class *mp,
1315    struct g_geom **gp)
1316{
1317	struct g_raid_softc *sc;
1318	struct g_raid_md_intel_object *mdi;
1319	char name[16];
1320
1321	mdi = (struct g_raid_md_intel_object *)md;
1322	mdi->mdio_config_id = mdi->mdio_orig_config_id = arc4random();
1323	mdi->mdio_generation = 0;
1324	snprintf(name, sizeof(name), "Intel-%08x", mdi->mdio_config_id);
1325	sc = g_raid_create_node(mp, name, md);
1326	if (sc == NULL)
1327		return (G_RAID_MD_TASTE_FAIL);
1328	md->mdo_softc = sc;
1329	*gp = sc->sc_geom;
1330	return (G_RAID_MD_TASTE_NEW);
1331}
1332
/*
 * Return the last N characters of the serial label.  The Linux and
 * ataraid(7) code always uses the last 16 characters of the label to
 * store into the Intel meta format.  Generalize this to N characters
 * since that's easy.  Labels can be up to 20 characters for SATA drives
 * and up to 251 characters for SAS drives.  Since Intel controllers don't
 * support SAS drives, just stick with the SATA limits for stack
 * friendliness.
 *
 * 'serial' receives exactly 'serlen' bytes via strncpy(): it is zero
 * padded when the label is shorter than 'serlen' and is NOT NUL-terminated
 * when the label is 'serlen' characters or longer.
 *
 * Returns 0 on success or the error reported by g_io_getattr().
 */
static int
g_raid_md_get_label(struct g_consumer *cp, char *serial, int serlen)
{
	char ident[24];
	int error, identlen, skip;

	identlen = sizeof(ident);
	error = g_io_getattr("GEOM::ident", cp, &identlen, ident);
	if (error != 0)
		return (error);

	/* Skip the leading characters that do not fit into 'serlen'. */
	identlen = strlen(ident);
	skip = (identlen > serlen) ? identlen - serlen : 0;
	strncpy(serial, ident + skip, serlen);
	return (0);
}
1359
1360static int
1361g_raid_md_taste_intel(struct g_raid_md_object *md, struct g_class *mp,
1362                              struct g_consumer *cp, struct g_geom **gp)
1363{
1364	struct g_consumer *rcp;
1365	struct g_provider *pp;
1366	struct g_raid_md_intel_object *mdi, *mdi1;
1367	struct g_raid_softc *sc;
1368	struct g_raid_disk *disk;
1369	struct intel_raid_conf *meta;
1370	struct g_raid_md_intel_perdisk *pd;
1371	struct g_geom *geom;
1372	int error, disk_pos, result, spare, len;
1373	char serial[INTEL_SERIAL_LEN];
1374	char name[16];
1375	uint16_t vendor;
1376
1377	G_RAID_DEBUG(1, "Tasting Intel on %s", cp->provider->name);
1378	mdi = (struct g_raid_md_intel_object *)md;
1379	pp = cp->provider;
1380
1381	/* Read metadata from device. */
1382	meta = NULL;
1383	vendor = 0xffff;
1384	disk_pos = 0;
1385	if (g_access(cp, 1, 0, 0) != 0)
1386		return (G_RAID_MD_TASTE_FAIL);
1387	g_topology_unlock();
1388	error = g_raid_md_get_label(cp, serial, sizeof(serial));
1389	if (error != 0) {
1390		G_RAID_DEBUG(1, "Cannot get serial number from %s (error=%d).",
1391		    pp->name, error);
1392		goto fail2;
1393	}
1394	len = 2;
1395	if (pp->geom->rank == 1)
1396		g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor);
1397	meta = intel_meta_read(cp);
1398	g_topology_lock();
1399	g_access(cp, -1, 0, 0);
1400	if (meta == NULL) {
1401		if (g_raid_aggressive_spare) {
1402			if (vendor != 0x8086) {
1403				G_RAID_DEBUG(1,
1404				    "Intel vendor mismatch 0x%04x != 0x8086",
1405				    vendor);
1406			} else {
1407				G_RAID_DEBUG(1,
1408				    "No Intel metadata, forcing spare.");
1409				spare = 2;
1410				goto search;
1411			}
1412		}
1413		return (G_RAID_MD_TASTE_FAIL);
1414	}
1415
1416	/* Check this disk position in obtained metadata. */
1417	disk_pos = intel_meta_find_disk(meta, serial);
1418	if (disk_pos < 0) {
1419		G_RAID_DEBUG(1, "Intel serial '%s' not found", serial);
1420		goto fail1;
1421	}
1422	if (intel_get_disk_sectors(&meta->disk[disk_pos]) !=
1423	    (pp->mediasize / pp->sectorsize)) {
1424		G_RAID_DEBUG(1, "Intel size mismatch %ju != %ju",
1425		    intel_get_disk_sectors(&meta->disk[disk_pos]),
1426		    (off_t)(pp->mediasize / pp->sectorsize));
1427		goto fail1;
1428	}
1429
1430	G_RAID_DEBUG(1, "Intel disk position %d", disk_pos);
1431	spare = meta->disk[disk_pos].flags & INTEL_F_SPARE;
1432
1433search:
1434	/* Search for matching node. */
1435	sc = NULL;
1436	mdi1 = NULL;
1437	LIST_FOREACH(geom, &mp->geom, geom) {
1438		sc = geom->softc;
1439		if (sc == NULL)
1440			continue;
1441		if (sc->sc_stopping != 0)
1442			continue;
1443		if (sc->sc_md->mdo_class != md->mdo_class)
1444			continue;
1445		mdi1 = (struct g_raid_md_intel_object *)sc->sc_md;
1446		if (spare) {
1447			if (mdi1->mdio_incomplete)
1448				break;
1449		} else {
1450			if (mdi1->mdio_config_id == meta->config_id)
1451				break;
1452		}
1453	}
1454
1455	/* Found matching node. */
1456	if (geom != NULL) {
1457		G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name);
1458		result = G_RAID_MD_TASTE_EXISTING;
1459
1460	} else if (spare) { /* Not found needy node -- left for later. */
1461		G_RAID_DEBUG(1, "Spare is not needed at this time");
1462		goto fail1;
1463
1464	} else { /* Not found matching node -- create one. */
1465		result = G_RAID_MD_TASTE_NEW;
1466		mdi->mdio_config_id = meta->config_id;
1467		mdi->mdio_orig_config_id = meta->orig_config_id;
1468		snprintf(name, sizeof(name), "Intel-%08x", meta->config_id);
1469		sc = g_raid_create_node(mp, name, md);
1470		md->mdo_softc = sc;
1471		geom = sc->sc_geom;
1472		callout_init(&mdi->mdio_start_co, 1);
1473		callout_reset(&mdi->mdio_start_co, g_raid_start_timeout * hz,
1474		    g_raid_intel_go, sc);
1475		mdi->mdio_rootmount = root_mount_hold("GRAID-Intel");
1476		G_RAID_DEBUG1(1, sc, "root_mount_hold %p", mdi->mdio_rootmount);
1477	}
1478
1479	rcp = g_new_consumer(geom);
1480	rcp->flags |= G_CF_DIRECT_RECEIVE;
1481	g_attach(rcp, pp);
1482	if (g_access(rcp, 1, 1, 1) != 0)
1483		; //goto fail1;
1484
1485	g_topology_unlock();
1486	sx_xlock(&sc->sc_lock);
1487
1488	pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO);
1489	pd->pd_meta = meta;
1490	pd->pd_disk_pos = -1;
1491	if (spare == 2) {
1492		memcpy(&pd->pd_disk_meta.serial[0], serial, INTEL_SERIAL_LEN);
1493		intel_set_disk_sectors(&pd->pd_disk_meta,
1494		    pp->mediasize / pp->sectorsize);
1495		pd->pd_disk_meta.id = 0;
1496		pd->pd_disk_meta.flags = INTEL_F_SPARE;
1497	} else {
1498		pd->pd_disk_meta = meta->disk[disk_pos];
1499	}
1500	disk = g_raid_create_disk(sc);
1501	disk->d_md_data = (void *)pd;
1502	disk->d_consumer = rcp;
1503	rcp->private = disk;
1504
1505	g_raid_get_disk_info(disk);
1506
1507	g_raid_md_intel_new_disk(disk);
1508
1509	sx_xunlock(&sc->sc_lock);
1510	g_topology_lock();
1511	*gp = geom;
1512	return (result);
1513fail2:
1514	g_topology_lock();
1515	g_access(cp, -1, 0, 0);
1516fail1:
1517	free(meta, M_MD_INTEL);
1518	return (G_RAID_MD_TASTE_FAIL);
1519}
1520
1521static int
1522g_raid_md_event_intel(struct g_raid_md_object *md,
1523    struct g_raid_disk *disk, u_int event)
1524{
1525	struct g_raid_softc *sc;
1526	struct g_raid_subdisk *sd;
1527	struct g_raid_md_intel_object *mdi;
1528	struct g_raid_md_intel_perdisk *pd;
1529
1530	sc = md->mdo_softc;
1531	mdi = (struct g_raid_md_intel_object *)md;
1532	if (disk == NULL) {
1533		switch (event) {
1534		case G_RAID_NODE_E_START:
1535			if (!mdi->mdio_started)
1536				g_raid_md_intel_start(sc);
1537			return (0);
1538		}
1539		return (-1);
1540	}
1541	pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
1542	switch (event) {
1543	case G_RAID_DISK_E_DISCONNECTED:
1544		/* If disk was assigned, just update statuses. */
1545		if (pd->pd_disk_pos >= 0) {
1546			g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE);
1547			if (disk->d_consumer) {
1548				g_raid_kill_consumer(sc, disk->d_consumer);
1549				disk->d_consumer = NULL;
1550			}
1551			TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
1552				g_raid_change_subdisk_state(sd,
1553				    G_RAID_SUBDISK_S_NONE);
1554				g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED,
1555				    G_RAID_EVENT_SUBDISK);
1556			}
1557		} else {
1558			/* Otherwise -- delete. */
1559			g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE);
1560			g_raid_destroy_disk(disk);
1561		}
1562
1563		/* Write updated metadata to all disks. */
1564		g_raid_md_write_intel(md, NULL, NULL, NULL);
1565
1566		/* Check if anything left except placeholders. */
1567		if (g_raid_ndisks(sc, -1) ==
1568		    g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE))
1569			g_raid_destroy_node(sc, 0);
1570		else
1571			g_raid_md_intel_refill(sc);
1572		return (0);
1573	}
1574	return (-2);
1575}
1576
1577static int
1578g_raid_md_ctl_intel(struct g_raid_md_object *md,
1579    struct gctl_req *req)
1580{
1581	struct g_raid_softc *sc;
1582	struct g_raid_volume *vol, *vol1;
1583	struct g_raid_subdisk *sd;
1584	struct g_raid_disk *disk;
1585	struct g_raid_md_intel_object *mdi;
1586	struct g_raid_md_intel_pervolume *pv;
1587	struct g_raid_md_intel_perdisk *pd;
1588	struct g_consumer *cp;
1589	struct g_provider *pp;
1590	char arg[16], serial[INTEL_SERIAL_LEN];
1591	const char *nodename, *verb, *volname, *levelname, *diskname;
1592	char *tmp;
1593	int *nargs, *force;
1594	off_t off, size, sectorsize, strip, disk_sectors;
1595	intmax_t *sizearg, *striparg;
1596	int numdisks, i, len, level, qual, update;
1597	int error;
1598
1599	sc = md->mdo_softc;
1600	mdi = (struct g_raid_md_intel_object *)md;
1601	verb = gctl_get_param(req, "verb", NULL);
1602	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
1603	error = 0;
1604	if (strcmp(verb, "label") == 0) {
1605
1606		if (*nargs < 4) {
1607			gctl_error(req, "Invalid number of arguments.");
1608			return (-1);
1609		}
1610		volname = gctl_get_asciiparam(req, "arg1");
1611		if (volname == NULL) {
1612			gctl_error(req, "No volume name.");
1613			return (-2);
1614		}
1615		levelname = gctl_get_asciiparam(req, "arg2");
1616		if (levelname == NULL) {
1617			gctl_error(req, "No RAID level.");
1618			return (-3);
1619		}
1620		if (strcasecmp(levelname, "RAID5") == 0)
1621			levelname = "RAID5-LA";
1622		if (g_raid_volume_str2level(levelname, &level, &qual)) {
1623			gctl_error(req, "Unknown RAID level '%s'.", levelname);
1624			return (-4);
1625		}
1626		numdisks = *nargs - 3;
1627		force = gctl_get_paraml(req, "force", sizeof(*force));
1628		if (!g_raid_md_intel_supported(level, qual, numdisks,
1629		    force ? *force : 0)) {
1630			gctl_error(req, "Unsupported RAID level "
1631			    "(0x%02x/0x%02x), or number of disks (%d).",
1632			    level, qual, numdisks);
1633			return (-5);
1634		}
1635
1636		/* Search for disks, connect them and probe. */
1637		size = 0x7fffffffffffffffllu;
1638		sectorsize = 0;
1639		for (i = 0; i < numdisks; i++) {
1640			snprintf(arg, sizeof(arg), "arg%d", i + 3);
1641			diskname = gctl_get_asciiparam(req, arg);
1642			if (diskname == NULL) {
1643				gctl_error(req, "No disk name (%s).", arg);
1644				error = -6;
1645				break;
1646			}
1647			if (strcmp(diskname, "NONE") == 0) {
1648				cp = NULL;
1649				pp = NULL;
1650			} else {
1651				g_topology_lock();
1652				cp = g_raid_open_consumer(sc, diskname);
1653				if (cp == NULL) {
1654					gctl_error(req, "Can't open disk '%s'.",
1655					    diskname);
1656					g_topology_unlock();
1657					error = -7;
1658					break;
1659				}
1660				pp = cp->provider;
1661			}
1662			pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO);
1663			pd->pd_disk_pos = i;
1664			disk = g_raid_create_disk(sc);
1665			disk->d_md_data = (void *)pd;
1666			disk->d_consumer = cp;
1667			if (cp == NULL) {
1668				strcpy(&pd->pd_disk_meta.serial[0], "NONE");
1669				pd->pd_disk_meta.id = 0xffffffff;
1670				pd->pd_disk_meta.flags = INTEL_F_ASSIGNED;
1671				continue;
1672			}
1673			cp->private = disk;
1674			g_topology_unlock();
1675
1676			error = g_raid_md_get_label(cp,
1677			    &pd->pd_disk_meta.serial[0], INTEL_SERIAL_LEN);
1678			if (error != 0) {
1679				gctl_error(req,
1680				    "Can't get serial for provider '%s'.",
1681				    diskname);
1682				error = -8;
1683				break;
1684			}
1685
1686			g_raid_get_disk_info(disk);
1687
1688			intel_set_disk_sectors(&pd->pd_disk_meta,
1689			    pp->mediasize / pp->sectorsize);
1690			if (size > pp->mediasize)
1691				size = pp->mediasize;
1692			if (sectorsize < pp->sectorsize)
1693				sectorsize = pp->sectorsize;
1694			pd->pd_disk_meta.id = 0;
1695			pd->pd_disk_meta.flags = INTEL_F_ASSIGNED | INTEL_F_ONLINE;
1696		}
1697		if (error != 0)
1698			return (error);
1699
1700		if (sectorsize <= 0) {
1701			gctl_error(req, "Can't get sector size.");
1702			return (-8);
1703		}
1704
1705		/* Reserve some space for metadata. */
1706		size -= ((4096 + sectorsize - 1) / sectorsize) * sectorsize;
1707
1708		/* Handle size argument. */
1709		len = sizeof(*sizearg);
1710		sizearg = gctl_get_param(req, "size", &len);
1711		if (sizearg != NULL && len == sizeof(*sizearg) &&
1712		    *sizearg > 0) {
1713			if (*sizearg > size) {
1714				gctl_error(req, "Size too big %lld > %lld.",
1715				    (long long)*sizearg, (long long)size);
1716				return (-9);
1717			}
1718			size = *sizearg;
1719		}
1720
1721		/* Handle strip argument. */
1722		strip = 131072;
1723		len = sizeof(*striparg);
1724		striparg = gctl_get_param(req, "strip", &len);
1725		if (striparg != NULL && len == sizeof(*striparg) &&
1726		    *striparg > 0) {
1727			if (*striparg < sectorsize) {
1728				gctl_error(req, "Strip size too small.");
1729				return (-10);
1730			}
1731			if (*striparg % sectorsize != 0) {
1732				gctl_error(req, "Incorrect strip size.");
1733				return (-11);
1734			}
1735			if (strip > 65535 * sectorsize) {
1736				gctl_error(req, "Strip size too big.");
1737				return (-12);
1738			}
1739			strip = *striparg;
1740		}
1741
1742		/* Round size down to strip or sector. */
1743		if (level == G_RAID_VOLUME_RL_RAID1)
1744			size -= (size % sectorsize);
1745		else if (level == G_RAID_VOLUME_RL_RAID1E &&
1746		    (numdisks & 1) != 0)
1747			size -= (size % (2 * strip));
1748		else
1749			size -= (size % strip);
1750		if (size <= 0) {
1751			gctl_error(req, "Size too small.");
1752			return (-13);
1753		}
1754
1755		/* We have all we need, create things: volume, ... */
1756		mdi->mdio_started = 1;
1757		vol = g_raid_create_volume(sc, volname, -1);
1758		pv = malloc(sizeof(*pv), M_MD_INTEL, M_WAITOK | M_ZERO);
1759		pv->pv_volume_pos = 0;
1760		vol->v_md_data = pv;
1761		vol->v_raid_level = level;
1762		vol->v_raid_level_qualifier = qual;
1763		vol->v_strip_size = strip;
1764		vol->v_disks_count = numdisks;
1765		if (level == G_RAID_VOLUME_RL_RAID0)
1766			vol->v_mediasize = size * numdisks;
1767		else if (level == G_RAID_VOLUME_RL_RAID1)
1768			vol->v_mediasize = size;
1769		else if (level == G_RAID_VOLUME_RL_RAID5)
1770			vol->v_mediasize = size * (numdisks - 1);
1771		else { /* RAID1E */
1772			vol->v_mediasize = ((size * numdisks) / strip / 2) *
1773			    strip;
1774		}
1775		vol->v_sectorsize = sectorsize;
1776		g_raid_start_volume(vol);
1777
1778		/* , and subdisks. */
1779		TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1780			pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
1781			sd = &vol->v_subdisks[pd->pd_disk_pos];
1782			sd->sd_disk = disk;
1783			sd->sd_offset = 0;
1784			sd->sd_size = size;
1785			TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
1786			if (sd->sd_disk->d_consumer != NULL) {
1787				g_raid_change_disk_state(disk,
1788				    G_RAID_DISK_S_ACTIVE);
1789				if (level == G_RAID_VOLUME_RL_RAID5)
1790					g_raid_change_subdisk_state(sd,
1791					    G_RAID_SUBDISK_S_UNINITIALIZED);
1792				else
1793					g_raid_change_subdisk_state(sd,
1794					    G_RAID_SUBDISK_S_ACTIVE);
1795				g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
1796				    G_RAID_EVENT_SUBDISK);
1797			} else {
1798				g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE);
1799			}
1800		}
1801
1802		/* Write metadata based on created entities. */
1803		G_RAID_DEBUG1(0, sc, "Array started.");
1804		g_raid_md_write_intel(md, NULL, NULL, NULL);
1805
1806		/* Pickup any STALE/SPARE disks to refill array if needed. */
1807		g_raid_md_intel_refill(sc);
1808
1809		g_raid_event_send(vol, G_RAID_VOLUME_E_START,
1810		    G_RAID_EVENT_VOLUME);
1811		return (0);
1812	}
1813	if (strcmp(verb, "add") == 0) {
1814
1815		if (*nargs != 3) {
1816			gctl_error(req, "Invalid number of arguments.");
1817			return (-1);
1818		}
1819		volname = gctl_get_asciiparam(req, "arg1");
1820		if (volname == NULL) {
1821			gctl_error(req, "No volume name.");
1822			return (-2);
1823		}
1824		levelname = gctl_get_asciiparam(req, "arg2");
1825		if (levelname == NULL) {
1826			gctl_error(req, "No RAID level.");
1827			return (-3);
1828		}
1829		if (strcasecmp(levelname, "RAID5") == 0)
1830			levelname = "RAID5-LA";
1831		if (g_raid_volume_str2level(levelname, &level, &qual)) {
1832			gctl_error(req, "Unknown RAID level '%s'.", levelname);
1833			return (-4);
1834		}
1835
1836		/* Look for existing volumes. */
1837		i = 0;
1838		vol1 = NULL;
1839		TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
1840			vol1 = vol;
1841			i++;
1842		}
1843		if (i > 1) {
1844			gctl_error(req, "Maximum two volumes supported.");
1845			return (-6);
1846		}
1847		if (vol1 == NULL) {
1848			gctl_error(req, "At least one volume must exist.");
1849			return (-7);
1850		}
1851
1852		numdisks = vol1->v_disks_count;
1853		force = gctl_get_paraml(req, "force", sizeof(*force));
1854		if (!g_raid_md_intel_supported(level, qual, numdisks,
1855		    force ? *force : 0)) {
1856			gctl_error(req, "Unsupported RAID level "
1857			    "(0x%02x/0x%02x), or number of disks (%d).",
1858			    level, qual, numdisks);
1859			return (-5);
1860		}
1861
1862		/* Collect info about present disks. */
1863		size = 0x7fffffffffffffffllu;
1864		sectorsize = 512;
1865		for (i = 0; i < numdisks; i++) {
1866			disk = vol1->v_subdisks[i].sd_disk;
1867			pd = (struct g_raid_md_intel_perdisk *)
1868			    disk->d_md_data;
1869			disk_sectors =
1870			    intel_get_disk_sectors(&pd->pd_disk_meta);
1871
1872			if (disk_sectors * 512 < size)
1873				size = disk_sectors * 512;
1874			if (disk->d_consumer != NULL &&
1875			    disk->d_consumer->provider != NULL &&
1876			    disk->d_consumer->provider->sectorsize >
1877			     sectorsize) {
1878				sectorsize =
1879				    disk->d_consumer->provider->sectorsize;
1880			}
1881		}
1882
1883		/* Reserve some space for metadata. */
1884		size -= ((4096 + sectorsize - 1) / sectorsize) * sectorsize;
1885
1886		/* Decide insert before or after. */
1887		sd = &vol1->v_subdisks[0];
1888		if (sd->sd_offset >
1889		    size - (sd->sd_offset + sd->sd_size)) {
1890			off = 0;
1891			size = sd->sd_offset;
1892		} else {
1893			off = sd->sd_offset + sd->sd_size;
1894			size = size - (sd->sd_offset + sd->sd_size);
1895		}
1896
1897		/* Handle strip argument. */
1898		strip = 131072;
1899		len = sizeof(*striparg);
1900		striparg = gctl_get_param(req, "strip", &len);
1901		if (striparg != NULL && len == sizeof(*striparg) &&
1902		    *striparg > 0) {
1903			if (*striparg < sectorsize) {
1904				gctl_error(req, "Strip size too small.");
1905				return (-10);
1906			}
1907			if (*striparg % sectorsize != 0) {
1908				gctl_error(req, "Incorrect strip size.");
1909				return (-11);
1910			}
1911			if (strip > 65535 * sectorsize) {
1912				gctl_error(req, "Strip size too big.");
1913				return (-12);
1914			}
1915			strip = *striparg;
1916		}
1917
1918		/* Round offset up to strip. */
1919		if (off % strip != 0) {
1920			size -= strip - off % strip;
1921			off += strip - off % strip;
1922		}
1923
1924		/* Handle size argument. */
1925		len = sizeof(*sizearg);
1926		sizearg = gctl_get_param(req, "size", &len);
1927		if (sizearg != NULL && len == sizeof(*sizearg) &&
1928		    *sizearg > 0) {
1929			if (*sizearg > size) {
1930				gctl_error(req, "Size too big %lld > %lld.",
1931				    (long long)*sizearg, (long long)size);
1932				return (-9);
1933			}
1934			size = *sizearg;
1935		}
1936
1937		/* Round size down to strip or sector. */
1938		if (level == G_RAID_VOLUME_RL_RAID1)
1939			size -= (size % sectorsize);
1940		else
1941			size -= (size % strip);
1942		if (size <= 0) {
1943			gctl_error(req, "Size too small.");
1944			return (-13);
1945		}
1946		if (size > 0xffffffffllu * sectorsize) {
1947			gctl_error(req, "Size too big.");
1948			return (-14);
1949		}
1950
1951		/* We have all we need, create things: volume, ... */
1952		vol = g_raid_create_volume(sc, volname, -1);
1953		pv = malloc(sizeof(*pv), M_MD_INTEL, M_WAITOK | M_ZERO);
1954		pv->pv_volume_pos = i;
1955		vol->v_md_data = pv;
1956		vol->v_raid_level = level;
1957		vol->v_raid_level_qualifier = qual;
1958		vol->v_strip_size = strip;
1959		vol->v_disks_count = numdisks;
1960		if (level == G_RAID_VOLUME_RL_RAID0)
1961			vol->v_mediasize = size * numdisks;
1962		else if (level == G_RAID_VOLUME_RL_RAID1)
1963			vol->v_mediasize = size;
1964		else if (level == G_RAID_VOLUME_RL_RAID5)
1965			vol->v_mediasize = size * (numdisks - 1);
1966		else { /* RAID1E */
1967			vol->v_mediasize = ((size * numdisks) / strip / 2) *
1968			    strip;
1969		}
1970		vol->v_sectorsize = sectorsize;
1971		g_raid_start_volume(vol);
1972
1973		/* , and subdisks. */
1974		for (i = 0; i < numdisks; i++) {
1975			disk = vol1->v_subdisks[i].sd_disk;
1976			sd = &vol->v_subdisks[i];
1977			sd->sd_disk = disk;
1978			sd->sd_offset = off;
1979			sd->sd_size = size;
1980			TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
1981			if (disk->d_state == G_RAID_DISK_S_ACTIVE) {
1982				if (level == G_RAID_VOLUME_RL_RAID5)
1983					g_raid_change_subdisk_state(sd,
1984					    G_RAID_SUBDISK_S_UNINITIALIZED);
1985				else
1986					g_raid_change_subdisk_state(sd,
1987					    G_RAID_SUBDISK_S_ACTIVE);
1988				g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
1989				    G_RAID_EVENT_SUBDISK);
1990			}
1991		}
1992
1993		/* Write metadata based on created entities. */
1994		g_raid_md_write_intel(md, NULL, NULL, NULL);
1995
1996		g_raid_event_send(vol, G_RAID_VOLUME_E_START,
1997		    G_RAID_EVENT_VOLUME);
1998		return (0);
1999	}
2000	if (strcmp(verb, "delete") == 0) {
2001
2002		nodename = gctl_get_asciiparam(req, "arg0");
2003		if (nodename != NULL && strcasecmp(sc->sc_name, nodename) != 0)
2004			nodename = NULL;
2005
2006		/* Full node destruction. */
2007		if (*nargs == 1 && nodename != NULL) {
2008			/* Check if some volume is still open. */
2009			force = gctl_get_paraml(req, "force", sizeof(*force));
2010			if (force != NULL && *force == 0 &&
2011			    g_raid_nopens(sc) != 0) {
2012				gctl_error(req, "Some volume is still open.");
2013				return (-4);
2014			}
2015
2016			TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
2017				if (disk->d_consumer)
2018					intel_meta_erase(disk->d_consumer);
2019			}
2020			g_raid_destroy_node(sc, 0);
2021			return (0);
2022		}
2023
2024		/* Destroy specified volume. If it was last - all node. */
2025		if (*nargs > 2) {
2026			gctl_error(req, "Invalid number of arguments.");
2027			return (-1);
2028		}
2029		volname = gctl_get_asciiparam(req,
2030		    nodename != NULL ? "arg1" : "arg0");
2031		if (volname == NULL) {
2032			gctl_error(req, "No volume name.");
2033			return (-2);
2034		}
2035
2036		/* Search for volume. */
2037		TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
2038			if (strcmp(vol->v_name, volname) == 0)
2039				break;
2040			pp = vol->v_provider;
2041			if (pp == NULL)
2042				continue;
2043			if (strcmp(pp->name, volname) == 0)
2044				break;
2045			if (strncmp(pp->name, "raid/", 5) == 0 &&
2046			    strcmp(pp->name + 5, volname) == 0)
2047				break;
2048		}
2049		if (vol == NULL) {
2050			i = strtol(volname, &tmp, 10);
2051			if (verb != volname && tmp[0] == 0) {
2052				TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
2053					if (vol->v_global_id == i)
2054						break;
2055				}
2056			}
2057		}
2058		if (vol == NULL) {
2059			gctl_error(req, "Volume '%s' not found.", volname);
2060			return (-3);
2061		}
2062
2063		/* Check if volume is still open. */
2064		force = gctl_get_paraml(req, "force", sizeof(*force));
2065		if (force != NULL && *force == 0 &&
2066		    vol->v_provider_open != 0) {
2067			gctl_error(req, "Volume is still open.");
2068			return (-4);
2069		}
2070
2071		/* Destroy volume and potentially node. */
2072		i = 0;
2073		TAILQ_FOREACH(vol1, &sc->sc_volumes, v_next)
2074			i++;
2075		if (i >= 2) {
2076			g_raid_destroy_volume(vol);
2077			g_raid_md_write_intel(md, NULL, NULL, NULL);
2078		} else {
2079			TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
2080				if (disk->d_consumer)
2081					intel_meta_erase(disk->d_consumer);
2082			}
2083			g_raid_destroy_node(sc, 0);
2084		}
2085		return (0);
2086	}
2087	if (strcmp(verb, "remove") == 0 ||
2088	    strcmp(verb, "fail") == 0) {
2089		if (*nargs < 2) {
2090			gctl_error(req, "Invalid number of arguments.");
2091			return (-1);
2092		}
2093		for (i = 1; i < *nargs; i++) {
2094			snprintf(arg, sizeof(arg), "arg%d", i);
2095			diskname = gctl_get_asciiparam(req, arg);
2096			if (diskname == NULL) {
2097				gctl_error(req, "No disk name (%s).", arg);
2098				error = -2;
2099				break;
2100			}
2101			if (strncmp(diskname, "/dev/", 5) == 0)
2102				diskname += 5;
2103
2104			TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
2105				if (disk->d_consumer != NULL &&
2106				    disk->d_consumer->provider != NULL &&
2107				    strcmp(disk->d_consumer->provider->name,
2108				     diskname) == 0)
2109					break;
2110			}
2111			if (disk == NULL) {
2112				gctl_error(req, "Disk '%s' not found.",
2113				    diskname);
2114				error = -3;
2115				break;
2116			}
2117
2118			if (strcmp(verb, "fail") == 0) {
2119				g_raid_md_fail_disk_intel(md, NULL, disk);
2120				continue;
2121			}
2122
2123			pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
2124
2125			/* Erase metadata on deleting disk. */
2126			intel_meta_erase(disk->d_consumer);
2127
2128			/* If disk was assigned, just update statuses. */
2129			if (pd->pd_disk_pos >= 0) {
2130				g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE);
2131				g_raid_kill_consumer(sc, disk->d_consumer);
2132				disk->d_consumer = NULL;
2133				TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
2134					g_raid_change_subdisk_state(sd,
2135					    G_RAID_SUBDISK_S_NONE);
2136					g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED,
2137					    G_RAID_EVENT_SUBDISK);
2138				}
2139			} else {
2140				/* Otherwise -- delete. */
2141				g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE);
2142				g_raid_destroy_disk(disk);
2143			}
2144		}
2145
2146		/* Write updated metadata to remaining disks. */
2147		g_raid_md_write_intel(md, NULL, NULL, NULL);
2148
2149		/* Check if anything left except placeholders. */
2150		if (g_raid_ndisks(sc, -1) ==
2151		    g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE))
2152			g_raid_destroy_node(sc, 0);
2153		else
2154			g_raid_md_intel_refill(sc);
2155		return (error);
2156	}
2157	if (strcmp(verb, "insert") == 0) {
2158		if (*nargs < 2) {
2159			gctl_error(req, "Invalid number of arguments.");
2160			return (-1);
2161		}
2162		update = 0;
2163		for (i = 1; i < *nargs; i++) {
2164			/* Get disk name. */
2165			snprintf(arg, sizeof(arg), "arg%d", i);
2166			diskname = gctl_get_asciiparam(req, arg);
2167			if (diskname == NULL) {
2168				gctl_error(req, "No disk name (%s).", arg);
2169				error = -3;
2170				break;
2171			}
2172
2173			/* Try to find provider with specified name. */
2174			g_topology_lock();
2175			cp = g_raid_open_consumer(sc, diskname);
2176			if (cp == NULL) {
2177				gctl_error(req, "Can't open disk '%s'.",
2178				    diskname);
2179				g_topology_unlock();
2180				error = -4;
2181				break;
2182			}
2183			pp = cp->provider;
2184			g_topology_unlock();
2185
2186			/* Read disk serial. */
2187			error = g_raid_md_get_label(cp,
2188			    &serial[0], INTEL_SERIAL_LEN);
2189			if (error != 0) {
2190				gctl_error(req,
2191				    "Can't get serial for provider '%s'.",
2192				    diskname);
2193				g_raid_kill_consumer(sc, cp);
2194				error = -7;
2195				break;
2196			}
2197
2198			pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO);
2199			pd->pd_disk_pos = -1;
2200
2201			disk = g_raid_create_disk(sc);
2202			disk->d_consumer = cp;
2203			disk->d_md_data = (void *)pd;
2204			cp->private = disk;
2205
2206			g_raid_get_disk_info(disk);
2207
2208			memcpy(&pd->pd_disk_meta.serial[0], &serial[0],
2209			    INTEL_SERIAL_LEN);
2210			intel_set_disk_sectors(&pd->pd_disk_meta,
2211			    pp->mediasize / pp->sectorsize);
2212			pd->pd_disk_meta.id = 0;
2213			pd->pd_disk_meta.flags = INTEL_F_SPARE;
2214
2215			/* Welcome the "new" disk. */
2216			update += g_raid_md_intel_start_disk(disk);
2217			if (disk->d_state == G_RAID_DISK_S_SPARE) {
2218				intel_meta_write_spare(cp, &pd->pd_disk_meta);
2219				g_raid_destroy_disk(disk);
2220			} else if (disk->d_state != G_RAID_DISK_S_ACTIVE) {
2221				gctl_error(req, "Disk '%s' doesn't fit.",
2222				    diskname);
2223				g_raid_destroy_disk(disk);
2224				error = -8;
2225				break;
2226			}
2227		}
2228
2229		/* Write new metadata if we changed something. */
2230		if (update)
2231			g_raid_md_write_intel(md, NULL, NULL, NULL);
2232		return (error);
2233	}
2234	return (-100);
2235}
2236
2237static int
2238g_raid_md_write_intel(struct g_raid_md_object *md, struct g_raid_volume *tvol,
2239    struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk)
2240{
2241	struct g_raid_softc *sc;
2242	struct g_raid_volume *vol;
2243	struct g_raid_subdisk *sd;
2244	struct g_raid_disk *disk;
2245	struct g_raid_md_intel_object *mdi;
2246	struct g_raid_md_intel_pervolume *pv;
2247	struct g_raid_md_intel_perdisk *pd;
2248	struct intel_raid_conf *meta;
2249	struct intel_raid_vol *mvol;
2250	struct intel_raid_map *mmap0, *mmap1;
2251	off_t sectorsize = 512, pos;
2252	const char *version, *cv;
2253	int vi, sdi, numdisks, len, state, stale;
2254
2255	sc = md->mdo_softc;
2256	mdi = (struct g_raid_md_intel_object *)md;
2257
2258	if (sc->sc_stopping == G_RAID_DESTROY_HARD)
2259		return (0);
2260
2261	/* Bump generation. Newly written metadata may differ from previous. */
2262	mdi->mdio_generation++;
2263
2264	/* Count number of disks. */
2265	numdisks = 0;
2266	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
2267		pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
2268		if (pd->pd_disk_pos < 0)
2269			continue;
2270		numdisks++;
2271		if (disk->d_state == G_RAID_DISK_S_ACTIVE) {
2272			pd->pd_disk_meta.flags =
2273			    INTEL_F_ONLINE | INTEL_F_ASSIGNED;
2274		} else if (disk->d_state == G_RAID_DISK_S_FAILED) {
2275			pd->pd_disk_meta.flags = INTEL_F_FAILED |
2276			    INTEL_F_ASSIGNED;
2277		} else if (disk->d_state == G_RAID_DISK_S_DISABLED) {
2278			pd->pd_disk_meta.flags = INTEL_F_FAILED |
2279			    INTEL_F_ASSIGNED | INTEL_F_DISABLED;
2280		} else {
2281			if (!(pd->pd_disk_meta.flags & INTEL_F_DISABLED))
2282				pd->pd_disk_meta.flags = INTEL_F_ASSIGNED;
2283			if (pd->pd_disk_meta.id != 0xffffffff) {
2284				pd->pd_disk_meta.id = 0xffffffff;
2285				len = strlen(pd->pd_disk_meta.serial);
2286				len = min(len, INTEL_SERIAL_LEN - 3);
2287				strcpy(pd->pd_disk_meta.serial + len, ":0");
2288			}
2289		}
2290	}
2291
2292	/* Fill anchor and disks. */
2293	meta = malloc(INTEL_MAX_MD_SIZE(numdisks),
2294	    M_MD_INTEL, M_WAITOK | M_ZERO);
2295	memcpy(&meta->intel_id[0], INTEL_MAGIC, sizeof(INTEL_MAGIC) - 1);
2296	meta->config_size = INTEL_MAX_MD_SIZE(numdisks);
2297	meta->config_id = mdi->mdio_config_id;
2298	meta->orig_config_id = mdi->mdio_orig_config_id;
2299	meta->generation = mdi->mdio_generation;
2300	meta->attributes = INTEL_ATTR_CHECKSUM;
2301	meta->total_disks = numdisks;
2302	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
2303		pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
2304		if (pd->pd_disk_pos < 0)
2305			continue;
2306		meta->disk[pd->pd_disk_pos] = pd->pd_disk_meta;
2307		if (pd->pd_disk_meta.sectors_hi != 0)
2308			meta->attributes |= INTEL_ATTR_2TB_DISK;
2309	}
2310
2311	/* Fill volumes and maps. */
2312	vi = 0;
2313	version = INTEL_VERSION_1000;
2314	TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
2315		pv = vol->v_md_data;
2316		if (vol->v_stopping)
2317			continue;
2318		mvol = intel_get_volume(meta, vi);
2319
2320		/* New metadata may have different volumes order. */
2321		pv->pv_volume_pos = vi;
2322
2323		for (sdi = 0; sdi < vol->v_disks_count; sdi++) {
2324			sd = &vol->v_subdisks[sdi];
2325			if (sd->sd_disk != NULL)
2326				break;
2327		}
2328		if (sdi >= vol->v_disks_count)
2329			panic("No any filled subdisk in volume");
2330		if (vol->v_mediasize >= 0x20000000000llu)
2331			meta->attributes |= INTEL_ATTR_2TB;
2332		if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0)
2333			meta->attributes |= INTEL_ATTR_RAID0;
2334		else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1)
2335			meta->attributes |= INTEL_ATTR_RAID1;
2336		else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5)
2337			meta->attributes |= INTEL_ATTR_RAID5;
2338		else if ((vol->v_disks_count & 1) == 0)
2339			meta->attributes |= INTEL_ATTR_RAID10;
2340		else
2341			meta->attributes |= INTEL_ATTR_RAID1E;
2342		if (pv->pv_cng)
2343			meta->attributes |= INTEL_ATTR_RAIDCNG;
2344		if (vol->v_strip_size > 131072)
2345			meta->attributes |= INTEL_ATTR_EXT_STRIP;
2346
2347		if (pv->pv_cng)
2348			cv = INTEL_VERSION_1206;
2349		else if (vol->v_disks_count > 4)
2350			cv = INTEL_VERSION_1204;
2351		else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5)
2352			cv = INTEL_VERSION_1202;
2353		else if (vol->v_disks_count > 2)
2354			cv = INTEL_VERSION_1201;
2355		else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1)
2356			cv = INTEL_VERSION_1100;
2357		else
2358			cv = INTEL_VERSION_1000;
2359		if (strcmp(cv, version) > 0)
2360			version = cv;
2361
2362		strlcpy(&mvol->name[0], vol->v_name, sizeof(mvol->name));
2363		mvol->total_sectors = vol->v_mediasize / sectorsize;
2364		mvol->state = (INTEL_ST_READ_COALESCING |
2365		    INTEL_ST_WRITE_COALESCING);
2366		mvol->tid = vol->v_global_id + 1;
2367		if (pv->pv_cng) {
2368			mvol->state |= INTEL_ST_CLONE_N_GO;
2369			if (pv->pv_cng_man_sync)
2370				mvol->state |= INTEL_ST_CLONE_MAN_SYNC;
2371			mvol->cng_master_disk = pv->pv_cng_master_disk;
2372			if (vol->v_subdisks[pv->pv_cng_master_disk].sd_state ==
2373			    G_RAID_SUBDISK_S_NONE)
2374				mvol->cng_state = INTEL_CNGST_MASTER_MISSING;
2375			else if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL)
2376				mvol->cng_state = INTEL_CNGST_NEEDS_UPDATE;
2377			else
2378				mvol->cng_state = INTEL_CNGST_UPDATED;
2379		}
2380
2381		/* Check for any recovery in progress. */
2382		state = G_RAID_SUBDISK_S_ACTIVE;
2383		pos = 0x7fffffffffffffffllu;
2384		stale = 0;
2385		for (sdi = 0; sdi < vol->v_disks_count; sdi++) {
2386			sd = &vol->v_subdisks[sdi];
2387			if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD)
2388				state = G_RAID_SUBDISK_S_REBUILD;
2389			else if (sd->sd_state == G_RAID_SUBDISK_S_RESYNC &&
2390			    state != G_RAID_SUBDISK_S_REBUILD)
2391				state = G_RAID_SUBDISK_S_RESYNC;
2392			else if (sd->sd_state == G_RAID_SUBDISK_S_STALE)
2393				stale = 1;
2394			if ((sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
2395			    sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
2396			     sd->sd_rebuild_pos < pos)
2397			        pos = sd->sd_rebuild_pos;
2398		}
2399		if (state == G_RAID_SUBDISK_S_REBUILD) {
2400			mvol->migr_state = 1;
2401			mvol->migr_type = INTEL_MT_REBUILD;
2402		} else if (state == G_RAID_SUBDISK_S_RESYNC) {
2403			mvol->migr_state = 1;
2404			/* mvol->migr_type = INTEL_MT_REPAIR; */
2405			mvol->migr_type = INTEL_MT_VERIFY;
2406			mvol->state |= INTEL_ST_VERIFY_AND_FIX;
2407		} else
2408			mvol->migr_state = 0;
2409		mvol->dirty = (vol->v_dirty || stale);
2410
2411		mmap0 = intel_get_map(mvol, 0);
2412
2413		/* Write map / common part of two maps. */
2414		intel_set_map_offset(mmap0, sd->sd_offset / sectorsize);
2415		intel_set_map_disk_sectors(mmap0, sd->sd_size / sectorsize);
2416		mmap0->strip_sectors = vol->v_strip_size / sectorsize;
2417		if (vol->v_state == G_RAID_VOLUME_S_BROKEN)
2418			mmap0->status = INTEL_S_FAILURE;
2419		else if (vol->v_state == G_RAID_VOLUME_S_DEGRADED)
2420			mmap0->status = INTEL_S_DEGRADED;
2421		else if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED)
2422		    == g_raid_nsubdisks(vol, -1))
2423			mmap0->status = INTEL_S_UNINITIALIZED;
2424		else
2425			mmap0->status = INTEL_S_READY;
2426		if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0)
2427			mmap0->type = INTEL_T_RAID0;
2428		else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
2429		    vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E)
2430			mmap0->type = INTEL_T_RAID1;
2431		else
2432			mmap0->type = INTEL_T_RAID5;
2433		mmap0->total_disks = vol->v_disks_count;
2434		if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1)
2435			mmap0->total_domains = vol->v_disks_count;
2436		else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E)
2437			mmap0->total_domains = 2;
2438		else
2439			mmap0->total_domains = 1;
2440		intel_set_map_stripe_count(mmap0,
2441		    sd->sd_size / vol->v_strip_size / mmap0->total_domains);
2442		mmap0->failed_disk_num = 0xff;
2443		mmap0->ddf = 1;
2444
2445		/* If there are two maps - copy common and update. */
2446		if (mvol->migr_state) {
2447			intel_set_vol_curr_migr_unit(mvol,
2448			    pos / vol->v_strip_size / mmap0->total_domains);
2449			mmap1 = intel_get_map(mvol, 1);
2450			memcpy(mmap1, mmap0, sizeof(struct intel_raid_map));
2451			mmap0->status = INTEL_S_READY;
2452		} else
2453			mmap1 = NULL;
2454
2455		/* Write disk indexes and put rebuild flags. */
2456		for (sdi = 0; sdi < vol->v_disks_count; sdi++) {
2457			sd = &vol->v_subdisks[sdi];
2458			pd = (struct g_raid_md_intel_perdisk *)
2459			    sd->sd_disk->d_md_data;
2460			mmap0->disk_idx[sdi] = pd->pd_disk_pos;
2461			if (mvol->migr_state)
2462				mmap1->disk_idx[sdi] = pd->pd_disk_pos;
2463			if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
2464			    sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
2465				mmap1->disk_idx[sdi] |= INTEL_DI_RBLD;
2466			} else if (sd->sd_state != G_RAID_SUBDISK_S_ACTIVE &&
2467			    sd->sd_state != G_RAID_SUBDISK_S_STALE &&
2468			    sd->sd_state != G_RAID_SUBDISK_S_UNINITIALIZED) {
2469				mmap0->disk_idx[sdi] |= INTEL_DI_RBLD;
2470				if (mvol->migr_state)
2471					mmap1->disk_idx[sdi] |= INTEL_DI_RBLD;
2472			}
2473			if ((sd->sd_state == G_RAID_SUBDISK_S_NONE ||
2474			     sd->sd_state == G_RAID_SUBDISK_S_FAILED ||
2475			     sd->sd_state == G_RAID_SUBDISK_S_REBUILD) &&
2476			    mmap0->failed_disk_num == 0xff) {
2477				mmap0->failed_disk_num = sdi;
2478				if (mvol->migr_state)
2479					mmap1->failed_disk_num = sdi;
2480			}
2481		}
2482		vi++;
2483	}
2484	meta->total_volumes = vi;
2485	if (vi > 1 || meta->attributes &
2486	     (INTEL_ATTR_EXT_STRIP | INTEL_ATTR_2TB_DISK | INTEL_ATTR_2TB))
2487		version = INTEL_VERSION_1300;
2488	if (strcmp(version, INTEL_VERSION_1300) < 0)
2489		meta->attributes &= INTEL_ATTR_CHECKSUM;
2490	memcpy(&meta->version[0], version, sizeof(INTEL_VERSION_1000) - 1);
2491
2492	/* We are done. Print meta data and store them to disks. */
2493	g_raid_md_intel_print(meta);
2494	if (mdi->mdio_meta != NULL)
2495		free(mdi->mdio_meta, M_MD_INTEL);
2496	mdi->mdio_meta = meta;
2497	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
2498		pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
2499		if (disk->d_state != G_RAID_DISK_S_ACTIVE)
2500			continue;
2501		if (pd->pd_meta != NULL) {
2502			free(pd->pd_meta, M_MD_INTEL);
2503			pd->pd_meta = NULL;
2504		}
2505		pd->pd_meta = intel_meta_copy(meta);
2506		intel_meta_write(disk->d_consumer, meta);
2507	}
2508	return (0);
2509}
2510
2511static int
2512g_raid_md_fail_disk_intel(struct g_raid_md_object *md,
2513    struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk)
2514{
2515	struct g_raid_softc *sc;
2516	struct g_raid_md_intel_object *mdi;
2517	struct g_raid_md_intel_perdisk *pd;
2518	struct g_raid_subdisk *sd;
2519
2520	sc = md->mdo_softc;
2521	mdi = (struct g_raid_md_intel_object *)md;
2522	pd = (struct g_raid_md_intel_perdisk *)tdisk->d_md_data;
2523
2524	/* We can't fail disk that is not a part of array now. */
2525	if (pd->pd_disk_pos < 0)
2526		return (-1);
2527
2528	/*
2529	 * Mark disk as failed in metadata and try to write that metadata
2530	 * to the disk itself to prevent it's later resurrection as STALE.
2531	 */
2532	mdi->mdio_meta->disk[pd->pd_disk_pos].flags = INTEL_F_FAILED;
2533	pd->pd_disk_meta.flags = INTEL_F_FAILED;
2534	g_raid_md_intel_print(mdi->mdio_meta);
2535	if (tdisk->d_consumer)
2536		intel_meta_write(tdisk->d_consumer, mdi->mdio_meta);
2537
2538	/* Change states. */
2539	g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED);
2540	TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) {
2541		g_raid_change_subdisk_state(sd,
2542		    G_RAID_SUBDISK_S_FAILED);
2543		g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED,
2544		    G_RAID_EVENT_SUBDISK);
2545	}
2546
2547	/* Write updated metadata to remaining disks. */
2548	g_raid_md_write_intel(md, NULL, NULL, tdisk);
2549
2550	/* Check if anything left except placeholders. */
2551	if (g_raid_ndisks(sc, -1) ==
2552	    g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE))
2553		g_raid_destroy_node(sc, 0);
2554	else
2555		g_raid_md_intel_refill(sc);
2556	return (0);
2557}
2558
2559static int
2560g_raid_md_free_disk_intel(struct g_raid_md_object *md,
2561    struct g_raid_disk *disk)
2562{
2563	struct g_raid_md_intel_perdisk *pd;
2564
2565	pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
2566	if (pd->pd_meta != NULL) {
2567		free(pd->pd_meta, M_MD_INTEL);
2568		pd->pd_meta = NULL;
2569	}
2570	free(pd, M_MD_INTEL);
2571	disk->d_md_data = NULL;
2572	return (0);
2573}
2574
2575static int
2576g_raid_md_free_volume_intel(struct g_raid_md_object *md,
2577    struct g_raid_volume *vol)
2578{
2579	struct g_raid_md_intel_pervolume *pv;
2580
2581	pv = (struct g_raid_md_intel_pervolume *)vol->v_md_data;
2582	free(pv, M_MD_INTEL);
2583	vol->v_md_data = NULL;
2584	return (0);
2585}
2586
2587static int
2588g_raid_md_free_intel(struct g_raid_md_object *md)
2589{
2590	struct g_raid_md_intel_object *mdi;
2591
2592	mdi = (struct g_raid_md_intel_object *)md;
2593	if (!mdi->mdio_started) {
2594		mdi->mdio_started = 0;
2595		callout_stop(&mdi->mdio_start_co);
2596		G_RAID_DEBUG1(1, md->mdo_softc,
2597		    "root_mount_rel %p", mdi->mdio_rootmount);
2598		root_mount_rel(mdi->mdio_rootmount);
2599		mdi->mdio_rootmount = NULL;
2600	}
2601	if (mdi->mdio_meta != NULL) {
2602		free(mdi->mdio_meta, M_MD_INTEL);
2603		mdi->mdio_meta = NULL;
2604	}
2605	return (0);
2606}
2607
2608G_RAID_MD_DECLARE(intel, "Intel");
2609