1/*-
2 * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
3 * Copyright (c) 2000 - 2008 Søren Schmidt <sos@FreeBSD.org>
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28#include <sys/cdefs.h>
29__FBSDID("$FreeBSD$");
30
31#include <sys/param.h>
32#include <sys/bio.h>
33#include <sys/endian.h>
34#include <sys/kernel.h>
35#include <sys/kobj.h>
36#include <sys/limits.h>
37#include <sys/lock.h>
38#include <sys/malloc.h>
39#include <sys/mutex.h>
40#include <sys/systm.h>
41#include <sys/taskqueue.h>
42#include <geom/geom.h>
43#include "geom/raid/g_raid.h"
44#include "g_raid_md_if.h"
45
46static MALLOC_DEFINE(M_MD_INTEL, "md_intel_data", "GEOM_RAID Intel metadata");
47
/*
 * On-disk map describing one layout of a volume.
 *
 * The structure is variable-length: disk_idx[] really holds total_disks
 * entries, and whatever follows (the second map or the next volume) starts
 * right after the last entry; see intel_get_map() and intel_get_volume().
 * Each *_hi field holds the upper 32 bits of the value whose lower 32 bits
 * are stored in the field of the same name without the suffix.
 */
struct intel_raid_map {
	uint32_t	offset;
	uint32_t	disk_sectors;
	uint32_t	stripe_count;
	uint16_t	strip_sectors;
	uint8_t		status;
#define INTEL_S_READY           0x00
#define INTEL_S_UNINITIALIZED   0x01
#define INTEL_S_DEGRADED        0x02
#define INTEL_S_FAILURE         0x03

	uint8_t		type;
#define INTEL_T_RAID0           0x00
#define INTEL_T_RAID1           0x01
#define INTEL_T_RAID5           0x05

	uint8_t		total_disks;
	uint8_t		total_domains;
	uint8_t		failed_disk_num;
	uint8_t		ddf;
	uint32_t	offset_hi;
	uint32_t	disk_sectors_hi;
	uint32_t	stripe_count_hi;
	uint32_t	filler_2[4];
	uint32_t	disk_idx[1];	/* total_disks entries. */
/* Low 24 bits of a disk_idx[] entry: index into intel_raid_conf.disk[]. */
#define INTEL_DI_IDX	0x00ffffff
/* Flag bit of a disk_idx[] entry, tested when choosing subdisk states. */
#define INTEL_DI_RBLD	0x01000000
} __packed;
76
/*
 * On-disk volume record.
 *
 * The record is variable-length: map[] holds one intel_raid_map, or two
 * when migr_state is non-zero, and every map is itself variable-length.
 * curr_migr_unit/curr_migr_unit_hi are the low and high 32 bits of a single
 * 64-bit value (see intel_get_vol_curr_migr_unit()).
 */
struct intel_raid_vol {
	uint8_t		name[16];
	u_int64_t	total_sectors __packed;
	uint32_t	state;
#define INTEL_ST_BOOTABLE		0x00000001
#define INTEL_ST_BOOT_DEVICE		0x00000002
#define INTEL_ST_READ_COALESCING	0x00000004
#define INTEL_ST_WRITE_COALESCING	0x00000008
#define INTEL_ST_LAST_SHUTDOWN_DIRTY	0x00000010
#define INTEL_ST_HIDDEN_AT_BOOT		0x00000020
#define INTEL_ST_CURRENTLY_HIDDEN	0x00000040
#define INTEL_ST_VERIFY_AND_FIX		0x00000080
#define INTEL_ST_MAP_STATE_UNINIT	0x00000100
#define INTEL_ST_NO_AUTO_RECOVERY	0x00000200
#define INTEL_ST_CLONE_N_GO		0x00000400
#define INTEL_ST_CLONE_MAN_SYNC		0x00000800
#define INTEL_ST_CNG_MASTER_DISK_NUM	0x00001000
	uint32_t	reserved;
	uint8_t		migr_priority;
	uint8_t		num_sub_vols;
	uint8_t		tid;
	uint8_t		cng_master_disk;
	uint16_t	cache_policy;
	uint8_t		cng_state;
#define INTEL_CNGST_UPDATED		0
#define INTEL_CNGST_NEEDS_UPDATE	1
#define INTEL_CNGST_MASTER_MISSING	2
	uint8_t		cng_sub_state;
	uint32_t	filler_0[10];

	uint32_t	curr_migr_unit;
	uint32_t	checkpoint_id;
	uint8_t		migr_state;	/* Non-zero: map[] has two entries. */
	uint8_t		migr_type;	/* One of INTEL_MT_*. */
#define INTEL_MT_INIT		0
#define INTEL_MT_REBUILD	1
#define INTEL_MT_VERIFY		2
#define INTEL_MT_GEN_MIGR	3
#define INTEL_MT_STATE_CHANGE	4
#define INTEL_MT_REPAIR		5
	uint8_t		dirty;
	uint8_t		fs_state;
	uint16_t	verify_errors;
	uint16_t	bad_blocks;
	uint32_t	curr_migr_unit_hi;
	uint32_t	filler_1[3];
	struct intel_raid_map map[1];	/* 2 entries if migr_state != 0. */
} __packed;
125
/*
 * On-disk per-disk record.
 *
 * Disks are matched by comparing the serial field (see
 * intel_meta_find_disk()).  sectors/sectors_hi are the low and high 32 bits
 * of one 64-bit sector count (see intel_get_disk_sectors()).
 */
struct intel_raid_disk {
#define INTEL_SERIAL_LEN	16
	uint8_t		serial[INTEL_SERIAL_LEN];
	uint32_t	sectors;
	uint32_t	id;
	uint32_t	flags;		/* Combination of INTEL_F_* bits. */
#define INTEL_F_SPARE		0x01
#define INTEL_F_ASSIGNED	0x02
#define INTEL_F_FAILED		0x04
#define INTEL_F_ONLINE		0x08
#define INTEL_F_DISABLED	0x80
	uint32_t	owner_cfg_num;
	uint32_t	sectors_hi;
	uint32_t	filler[3];
} __packed;
141
/*
 * On-disk metadata header.
 *
 * The whole metadata block is config_size bytes long.  disk[] really holds
 * total_disks entries and is followed by total_volumes variable-length
 * intel_raid_vol records (see intel_get_volume()).
 */
struct intel_raid_conf {
	uint8_t		intel_id[24];	/* Must match INTEL_MAGIC. */
#define INTEL_MAGIC             "Intel Raid ISM Cfg Sig. "

	uint8_t		version[6];	/* One of INTEL_VERSION_*, no NUL. */
#define INTEL_VERSION_1000	"1.0.00"	/* RAID0 */
#define INTEL_VERSION_1100	"1.1.00"	/* RAID1 */
#define INTEL_VERSION_1200	"1.2.00"	/* Many volumes */
#define INTEL_VERSION_1201	"1.2.01"	/* 3 or 4 disks */
#define INTEL_VERSION_1202	"1.2.02"	/* RAID5 */
#define INTEL_VERSION_1204	"1.2.04"	/* 5 or 6 disks */
#define INTEL_VERSION_1206	"1.2.06"	/* CNG */
#define INTEL_VERSION_1300	"1.3.00"	/* Attributes */

	uint8_t		dummy_0[2];
	/*
	 * Sum of all 32-bit words of the metadata, computed with this
	 * field taken as zero (see intel_meta_write()).
	 */
	uint32_t	checksum;
	uint32_t	config_size;	/* Metadata size in bytes. */
	uint32_t	config_id;
	uint32_t	generation;
	uint32_t	error_log_size;
	uint32_t	attributes;	/* Combination of INTEL_ATTR_* bits. */
#define INTEL_ATTR_RAID0	0x00000001
#define INTEL_ATTR_RAID1	0x00000002
#define INTEL_ATTR_RAID10	0x00000004
#define INTEL_ATTR_RAID1E	0x00000008
#define INTEL_ATTR_RAID5	0x00000010
#define INTEL_ATTR_RAIDCNG	0x00000020
#define INTEL_ATTR_EXT_STRIP	0x00000040
#define INTEL_ATTR_NVM_CACHE	0x02000000
#define INTEL_ATTR_2TB_DISK	0x04000000
#define INTEL_ATTR_BBM		0x08000000
#define INTEL_ATTR_NVM_CACHE2	0x10000000
#define INTEL_ATTR_2TB		0x20000000
#define INTEL_ATTR_PM		0x40000000
#define INTEL_ATTR_CHECKSUM	0x80000000

	uint8_t		total_disks;	/* Number of disk[] entries. */
	uint8_t		total_volumes;	/* Number of volume records. */
	uint8_t		error_log_pos;
	uint8_t		dummy_2[1];
	uint32_t	cache_size;
	uint32_t	orig_config_id;
	uint32_t	pwr_cycle_count;
	uint32_t	bbm_log_size;
	uint32_t	filler_0[35];
	struct intel_raid_disk	disk[1];	/* total_disks entries. */
	/* Here goes total_volumes of struct intel_raid_vol. */
} __packed;
190
/*
 * Attribute bits this module accepts.  intel_meta_read() rejects metadata
 * of version 1.3.00 that has any attribute bit outside of this mask set.
 */
#define INTEL_ATTR_SUPPORTED	( INTEL_ATTR_RAID0 | INTEL_ATTR_RAID1 |	\
    INTEL_ATTR_RAID10 | INTEL_ATTR_RAID1E | INTEL_ATTR_RAID5 |		\
    INTEL_ATTR_RAIDCNG | INTEL_ATTR_EXT_STRIP | INTEL_ATTR_2TB_DISK |	\
    INTEL_ATTR_2TB | INTEL_ATTR_PM | INTEL_ATTR_CHECKSUM )
195
/*
 * Upper bound, in bytes, of the metadata for an array of ndisks disks:
 * the header with ndisks disk records, two volumes with two maps each
 * (one map is already part of each volume record), and ndisks disk indexes
 * in each of the four maps.  The argument is parenthesized so that any
 * expression may be passed safely.
 */
#define INTEL_MAX_MD_SIZE(ndisks)				\
    (sizeof(struct intel_raid_conf) +				\
     sizeof(struct intel_raid_disk) * ((ndisks) - 1) +		\
     sizeof(struct intel_raid_vol) * 2 +			\
     sizeof(struct intel_raid_map) * 2 +			\
     sizeof(uint32_t) * ((ndisks) - 1) * 4)
202
/* Per-disk private data of this metadata module (disk->d_md_data). */
struct g_raid_md_intel_perdisk {
	/* Metadata kept for this disk -- presumably as read from it; confirm. */
	struct intel_raid_conf	*pd_meta;
	/*
	 * Position of the disk in the metadata disk array;
	 * g_raid_md_intel_start_disk() stores -2 here for a replaced disk.
	 */
	int			 pd_disk_pos;
	/* This disk's own record: serial, size and flags. */
	struct intel_raid_disk	 pd_disk_meta;
};
208
/* Per-volume private data of this metadata module (volume->v_md_data). */
struct g_raid_md_intel_pervolume {
	/* Index of the volume in the metadata, as used by intel_get_volume(). */
	int			 pv_volume_pos;
	/*
	 * CNG settings; the names mirror the INTEL_ST_CLONE_* state bits and
	 * the cng_master_disk volume field -- NOTE(review): confirm mapping.
	 */
	int			 pv_cng;
	int			 pv_cng_man_sync;
	int			 pv_cng_master_disk;
};
215
/*
 * Per-array state of this metadata module.  mdio_base must stay the first
 * member: the code casts between struct g_raid_md_object * and this type.
 */
struct g_raid_md_intel_object {
	struct g_raid_md_object	 mdio_base;
	uint32_t		 mdio_config_id;
	uint32_t		 mdio_orig_config_id;
	uint32_t		 mdio_generation;
	struct intel_raid_conf	*mdio_meta;	/* Array-wide metadata. */
	struct callout		 mdio_start_co;	/* STARTING state timer. */
	int			 mdio_disks_present;
	int			 mdio_started;	/* Non-zero once array started. */
	int			 mdio_incomplete;
	struct root_hold_token	*mdio_rootmount; /* Root mount delay token. */
};
228
/*
 * Forward declarations of the metadata class methods; they are registered
 * in the g_raid_md_intel_methods table.
 */
static g_raid_md_create_t g_raid_md_create_intel;
static g_raid_md_taste_t g_raid_md_taste_intel;
static g_raid_md_event_t g_raid_md_event_intel;
static g_raid_md_ctl_t g_raid_md_ctl_intel;
static g_raid_md_write_t g_raid_md_write_intel;
static g_raid_md_fail_disk_t g_raid_md_fail_disk_intel;
static g_raid_md_free_disk_t g_raid_md_free_disk_intel;
static g_raid_md_free_volume_t g_raid_md_free_volume_intel;
static g_raid_md_free_t g_raid_md_free_intel;
238
/* kobj method table binding the g_raid_md interface to this module. */
static kobj_method_t g_raid_md_intel_methods[] = {
	KOBJMETHOD(g_raid_md_create,	g_raid_md_create_intel),
	KOBJMETHOD(g_raid_md_taste,	g_raid_md_taste_intel),
	KOBJMETHOD(g_raid_md_event,	g_raid_md_event_intel),
	KOBJMETHOD(g_raid_md_ctl,	g_raid_md_ctl_intel),
	KOBJMETHOD(g_raid_md_write,	g_raid_md_write_intel),
	KOBJMETHOD(g_raid_md_fail_disk,	g_raid_md_fail_disk_intel),
	KOBJMETHOD(g_raid_md_free_disk,	g_raid_md_free_disk_intel),
	KOBJMETHOD(g_raid_md_free_volume,	g_raid_md_free_volume_intel),
	KOBJMETHOD(g_raid_md_free,	g_raid_md_free_intel),
	{ 0, 0 }	/* Table terminator. */
};
251
/*
 * Metadata class descriptor: class name, method table and the size of the
 * per-array object, followed by the enable flag and class priority.
 */
static struct g_raid_md_class g_raid_md_intel_class = {
	"Intel",
	g_raid_md_intel_methods,
	sizeof(struct g_raid_md_intel_object),
	.mdc_enable = 1,
	.mdc_priority = 100
};
259
260
261static struct intel_raid_map *
262intel_get_map(struct intel_raid_vol *mvol, int i)
263{
264	struct intel_raid_map *mmap;
265
266	if (i > (mvol->migr_state ? 1 : 0))
267		return (NULL);
268	mmap = &mvol->map[0];
269	for (; i > 0; i--) {
270		mmap = (struct intel_raid_map *)
271		    &mmap->disk_idx[mmap->total_disks];
272	}
273	return ((struct intel_raid_map *)mmap);
274}
275
276static struct intel_raid_vol *
277intel_get_volume(struct intel_raid_conf *meta, int i)
278{
279	struct intel_raid_vol *mvol;
280	struct intel_raid_map *mmap;
281
282	if (i > 1)
283		return (NULL);
284	mvol = (struct intel_raid_vol *)&meta->disk[meta->total_disks];
285	for (; i > 0; i--) {
286		mmap = intel_get_map(mvol, mvol->migr_state ? 1 : 0);
287		mvol = (struct intel_raid_vol *)
288		    &mmap->disk_idx[mmap->total_disks];
289	}
290	return (mvol);
291}
292
293static off_t
294intel_get_map_offset(struct intel_raid_map *mmap)
295{
296	off_t offset = (off_t)mmap->offset_hi << 32;
297
298	offset += mmap->offset;
299	return (offset);
300}
301
302static void
303intel_set_map_offset(struct intel_raid_map *mmap, off_t offset)
304{
305
306	mmap->offset = offset & 0xffffffff;
307	mmap->offset_hi = offset >> 32;
308}
309
310static off_t
311intel_get_map_disk_sectors(struct intel_raid_map *mmap)
312{
313	off_t disk_sectors = (off_t)mmap->disk_sectors_hi << 32;
314
315	disk_sectors += mmap->disk_sectors;
316	return (disk_sectors);
317}
318
319static void
320intel_set_map_disk_sectors(struct intel_raid_map *mmap, off_t disk_sectors)
321{
322
323	mmap->disk_sectors = disk_sectors & 0xffffffff;
324	mmap->disk_sectors_hi = disk_sectors >> 32;
325}
326
327static void
328intel_set_map_stripe_count(struct intel_raid_map *mmap, off_t stripe_count)
329{
330
331	mmap->stripe_count = stripe_count & 0xffffffff;
332	mmap->stripe_count_hi = stripe_count >> 32;
333}
334
335static off_t
336intel_get_disk_sectors(struct intel_raid_disk *disk)
337{
338	off_t sectors = (off_t)disk->sectors_hi << 32;
339
340	sectors += disk->sectors;
341	return (sectors);
342}
343
344static void
345intel_set_disk_sectors(struct intel_raid_disk *disk, off_t sectors)
346{
347
348	disk->sectors = sectors & 0xffffffff;
349	disk->sectors_hi = sectors >> 32;
350}
351
352static off_t
353intel_get_vol_curr_migr_unit(struct intel_raid_vol *vol)
354{
355	off_t curr_migr_unit = (off_t)vol->curr_migr_unit_hi << 32;
356
357	curr_migr_unit += vol->curr_migr_unit;
358	return (curr_migr_unit);
359}
360
361static void
362intel_set_vol_curr_migr_unit(struct intel_raid_vol *vol, off_t curr_migr_unit)
363{
364
365	vol->curr_migr_unit = curr_migr_unit & 0xffffffff;
366	vol->curr_migr_unit_hi = curr_migr_unit >> 32;
367}
368
369static void
370g_raid_md_intel_print(struct intel_raid_conf *meta)
371{
372	struct intel_raid_vol *mvol;
373	struct intel_raid_map *mmap;
374	int i, j, k;
375
376	if (g_raid_debug < 1)
377		return;
378
379	printf("********* ATA Intel MatrixRAID Metadata *********\n");
380	printf("intel_id            <%.24s>\n", meta->intel_id);
381	printf("version             <%.6s>\n", meta->version);
382	printf("checksum            0x%08x\n", meta->checksum);
383	printf("config_size         0x%08x\n", meta->config_size);
384	printf("config_id           0x%08x\n", meta->config_id);
385	printf("generation          0x%08x\n", meta->generation);
386	printf("error_log_size      %d\n", meta->error_log_size);
387	printf("attributes          0x%08x\n", meta->attributes);
388	printf("total_disks         %u\n", meta->total_disks);
389	printf("total_volumes       %u\n", meta->total_volumes);
390	printf("error_log_pos       %u\n", meta->error_log_pos);
391	printf("cache_size          %u\n", meta->cache_size);
392	printf("orig_config_id      0x%08x\n", meta->orig_config_id);
393	printf("pwr_cycle_count     %u\n", meta->pwr_cycle_count);
394	printf("bbm_log_size        %u\n", meta->bbm_log_size);
395	printf("DISK#   serial disk_sectors disk_sectors_hi disk_id flags owner\n");
396	for (i = 0; i < meta->total_disks; i++ ) {
397		printf("    %d   <%.16s> %u %u 0x%08x 0x%08x %08x\n", i,
398		    meta->disk[i].serial, meta->disk[i].sectors,
399		    meta->disk[i].sectors_hi, meta->disk[i].id,
400		    meta->disk[i].flags, meta->disk[i].owner_cfg_num);
401	}
402	for (i = 0; i < meta->total_volumes; i++) {
403		mvol = intel_get_volume(meta, i);
404		printf(" ****** Volume %d ******\n", i);
405		printf(" name               %.16s\n", mvol->name);
406		printf(" total_sectors      %ju\n", mvol->total_sectors);
407		printf(" state              0x%08x\n", mvol->state);
408		printf(" reserved           %u\n", mvol->reserved);
409		printf(" migr_priority      %u\n", mvol->migr_priority);
410		printf(" num_sub_vols       %u\n", mvol->num_sub_vols);
411		printf(" tid                %u\n", mvol->tid);
412		printf(" cng_master_disk    %u\n", mvol->cng_master_disk);
413		printf(" cache_policy       %u\n", mvol->cache_policy);
414		printf(" cng_state          %u\n", mvol->cng_state);
415		printf(" cng_sub_state      %u\n", mvol->cng_sub_state);
416		printf(" curr_migr_unit     %u\n", mvol->curr_migr_unit);
417		printf(" curr_migr_unit_hi  %u\n", mvol->curr_migr_unit_hi);
418		printf(" checkpoint_id      %u\n", mvol->checkpoint_id);
419		printf(" migr_state         %u\n", mvol->migr_state);
420		printf(" migr_type          %u\n", mvol->migr_type);
421		printf(" dirty              %u\n", mvol->dirty);
422		printf(" fs_state           %u\n", mvol->fs_state);
423		printf(" verify_errors      %u\n", mvol->verify_errors);
424		printf(" bad_blocks         %u\n", mvol->bad_blocks);
425
426		for (j = 0; j < (mvol->migr_state ? 2 : 1); j++) {
427			printf("  *** Map %d ***\n", j);
428			mmap = intel_get_map(mvol, j);
429			printf("  offset            %u\n", mmap->offset);
430			printf("  offset_hi         %u\n", mmap->offset_hi);
431			printf("  disk_sectors      %u\n", mmap->disk_sectors);
432			printf("  disk_sectors_hi   %u\n", mmap->disk_sectors_hi);
433			printf("  stripe_count      %u\n", mmap->stripe_count);
434			printf("  stripe_count_hi   %u\n", mmap->stripe_count_hi);
435			printf("  strip_sectors     %u\n", mmap->strip_sectors);
436			printf("  status            %u\n", mmap->status);
437			printf("  type              %u\n", mmap->type);
438			printf("  total_disks       %u\n", mmap->total_disks);
439			printf("  total_domains     %u\n", mmap->total_domains);
440			printf("  failed_disk_num   %u\n", mmap->failed_disk_num);
441			printf("  ddf               %u\n", mmap->ddf);
442			printf("  disk_idx         ");
443			for (k = 0; k < mmap->total_disks; k++)
444				printf(" 0x%08x", mmap->disk_idx[k]);
445			printf("\n");
446		}
447	}
448	printf("=================================================\n");
449}
450
451static struct intel_raid_conf *
452intel_meta_copy(struct intel_raid_conf *meta)
453{
454	struct intel_raid_conf *nmeta;
455
456	nmeta = malloc(meta->config_size, M_MD_INTEL, M_WAITOK);
457	memcpy(nmeta, meta, meta->config_size);
458	return (nmeta);
459}
460
461static int
462intel_meta_find_disk(struct intel_raid_conf *meta, char *serial)
463{
464	int pos;
465
466	for (pos = 0; pos < meta->total_disks; pos++) {
467		if (strncmp(meta->disk[pos].serial,
468		    serial, INTEL_SERIAL_LEN) == 0)
469			return (pos);
470	}
471	return (-1);
472}
473
474static struct intel_raid_conf *
475intel_meta_read(struct g_consumer *cp)
476{
477	struct g_provider *pp;
478	struct intel_raid_conf *meta;
479	struct intel_raid_vol *mvol;
480	struct intel_raid_map *mmap, *mmap1;
481	char *buf;
482	int error, i, j, k, left, size;
483	uint32_t checksum, *ptr;
484
485	pp = cp->provider;
486
487	/* Read the anchor sector. */
488	buf = g_read_data(cp,
489	    pp->mediasize - pp->sectorsize * 2, pp->sectorsize, &error);
490	if (buf == NULL) {
491		G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).",
492		    pp->name, error);
493		return (NULL);
494	}
495	meta = (struct intel_raid_conf *)buf;
496
497	/* Check if this is an Intel RAID struct */
498	if (strncmp(meta->intel_id, INTEL_MAGIC, strlen(INTEL_MAGIC))) {
499		G_RAID_DEBUG(1, "Intel signature check failed on %s", pp->name);
500		g_free(buf);
501		return (NULL);
502	}
503	if (meta->config_size > 65536 ||
504	    meta->config_size < sizeof(struct intel_raid_conf)) {
505		G_RAID_DEBUG(1, "Intel metadata size looks wrong: %d",
506		    meta->config_size);
507		g_free(buf);
508		return (NULL);
509	}
510	size = meta->config_size;
511	meta = malloc(size, M_MD_INTEL, M_WAITOK);
512	memcpy(meta, buf, min(size, pp->sectorsize));
513	g_free(buf);
514
515	/* Read all the rest, if needed. */
516	if (meta->config_size > pp->sectorsize) {
517		left = (meta->config_size - 1) / pp->sectorsize;
518		buf = g_read_data(cp,
519		    pp->mediasize - pp->sectorsize * (2 + left),
520		    pp->sectorsize * left, &error);
521		if (buf == NULL) {
522			G_RAID_DEBUG(1, "Cannot read remaining metadata"
523			    " part from %s (error=%d).",
524			    pp->name, error);
525			free(meta, M_MD_INTEL);
526			return (NULL);
527		}
528		memcpy(((char *)meta) + pp->sectorsize, buf,
529		    pp->sectorsize * left);
530		g_free(buf);
531	}
532
533	/* Check metadata checksum. */
534	for (checksum = 0, ptr = (uint32_t *)meta, i = 0;
535	    i < (meta->config_size / sizeof(uint32_t)); i++) {
536		checksum += *ptr++;
537	}
538	checksum -= meta->checksum;
539	if (checksum != meta->checksum) {
540		G_RAID_DEBUG(1, "Intel checksum check failed on %s", pp->name);
541		free(meta, M_MD_INTEL);
542		return (NULL);
543	}
544
545	/* Validate metadata size. */
546	size = sizeof(struct intel_raid_conf) +
547	    sizeof(struct intel_raid_disk) * (meta->total_disks - 1) +
548	    sizeof(struct intel_raid_vol) * meta->total_volumes;
549	if (size > meta->config_size) {
550badsize:
551		G_RAID_DEBUG(1, "Intel metadata size incorrect %d < %d",
552		    meta->config_size, size);
553		free(meta, M_MD_INTEL);
554		return (NULL);
555	}
556	for (i = 0; i < meta->total_volumes; i++) {
557		mvol = intel_get_volume(meta, i);
558		mmap = intel_get_map(mvol, 0);
559		size += 4 * (mmap->total_disks - 1);
560		if (size > meta->config_size)
561			goto badsize;
562		if (mvol->migr_state) {
563			size += sizeof(struct intel_raid_map);
564			if (size > meta->config_size)
565				goto badsize;
566			mmap = intel_get_map(mvol, 1);
567			size += 4 * (mmap->total_disks - 1);
568			if (size > meta->config_size)
569				goto badsize;
570		}
571	}
572
573	g_raid_md_intel_print(meta);
574
575	if (strncmp(meta->version, INTEL_VERSION_1300, 6) > 0) {
576		G_RAID_DEBUG(1, "Intel unsupported version: '%.6s'",
577		    meta->version);
578		free(meta, M_MD_INTEL);
579		return (NULL);
580	}
581
582	if (strncmp(meta->version, INTEL_VERSION_1300, 6) >= 0 &&
583	    (meta->attributes & ~INTEL_ATTR_SUPPORTED) != 0) {
584		G_RAID_DEBUG(1, "Intel unsupported attributes: 0x%08x",
585		    meta->attributes & ~INTEL_ATTR_SUPPORTED);
586		free(meta, M_MD_INTEL);
587		return (NULL);
588	}
589
590	/* Validate disk indexes. */
591	for (i = 0; i < meta->total_volumes; i++) {
592		mvol = intel_get_volume(meta, i);
593		for (j = 0; j < (mvol->migr_state ? 2 : 1); j++) {
594			mmap = intel_get_map(mvol, j);
595			for (k = 0; k < mmap->total_disks; k++) {
596				if ((mmap->disk_idx[k] & INTEL_DI_IDX) >
597				    meta->total_disks) {
598					G_RAID_DEBUG(1, "Intel metadata disk"
599					    " index %d too big (>%d)",
600					    mmap->disk_idx[k] & INTEL_DI_IDX,
601					    meta->total_disks);
602					free(meta, M_MD_INTEL);
603					return (NULL);
604				}
605			}
606		}
607	}
608
609	/* Validate migration types. */
610	for (i = 0; i < meta->total_volumes; i++) {
611		mvol = intel_get_volume(meta, i);
612		/* Deny unknown migration types. */
613		if (mvol->migr_state &&
614		    mvol->migr_type != INTEL_MT_INIT &&
615		    mvol->migr_type != INTEL_MT_REBUILD &&
616		    mvol->migr_type != INTEL_MT_VERIFY &&
617		    mvol->migr_type != INTEL_MT_GEN_MIGR &&
618		    mvol->migr_type != INTEL_MT_REPAIR) {
619			G_RAID_DEBUG(1, "Intel metadata has unsupported"
620			    " migration type %d", mvol->migr_type);
621			free(meta, M_MD_INTEL);
622			return (NULL);
623		}
624		/* Deny general migrations except SINGLE->RAID1. */
625		if (mvol->migr_state &&
626		    mvol->migr_type == INTEL_MT_GEN_MIGR) {
627			mmap = intel_get_map(mvol, 0);
628			mmap1 = intel_get_map(mvol, 1);
629			if (mmap1->total_disks != 1 ||
630			    mmap->type != INTEL_T_RAID1 ||
631			    mmap->total_disks != 2 ||
632			    mmap->offset != mmap1->offset ||
633			    mmap->disk_sectors != mmap1->disk_sectors ||
634			    mmap->total_domains != mmap->total_disks ||
635			    mmap->offset_hi != mmap1->offset_hi ||
636			    mmap->disk_sectors_hi != mmap1->disk_sectors_hi ||
637			    (mmap->disk_idx[0] != mmap1->disk_idx[0] &&
638			     mmap->disk_idx[0] != mmap1->disk_idx[1])) {
639				G_RAID_DEBUG(1, "Intel metadata has unsupported"
640				    " variant of general migration");
641				free(meta, M_MD_INTEL);
642				return (NULL);
643			}
644		}
645	}
646
647	return (meta);
648}
649
650static int
651intel_meta_write(struct g_consumer *cp, struct intel_raid_conf *meta)
652{
653	struct g_provider *pp;
654	char *buf;
655	int error, i, sectors;
656	uint32_t checksum, *ptr;
657
658	pp = cp->provider;
659
660	/* Recalculate checksum for case if metadata were changed. */
661	meta->checksum = 0;
662	for (checksum = 0, ptr = (uint32_t *)meta, i = 0;
663	    i < (meta->config_size / sizeof(uint32_t)); i++) {
664		checksum += *ptr++;
665	}
666	meta->checksum = checksum;
667
668	/* Create and fill buffer. */
669	sectors = (meta->config_size + pp->sectorsize - 1) / pp->sectorsize;
670	buf = malloc(sectors * pp->sectorsize, M_MD_INTEL, M_WAITOK | M_ZERO);
671	if (sectors > 1) {
672		memcpy(buf, ((char *)meta) + pp->sectorsize,
673		    (sectors - 1) * pp->sectorsize);
674	}
675	memcpy(buf + (sectors - 1) * pp->sectorsize, meta, pp->sectorsize);
676
677	error = g_write_data(cp,
678	    pp->mediasize - pp->sectorsize * (1 + sectors),
679	    buf, pp->sectorsize * sectors);
680	if (error != 0) {
681		G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).",
682		    pp->name, error);
683	}
684
685	free(buf, M_MD_INTEL);
686	return (error);
687}
688
689static int
690intel_meta_erase(struct g_consumer *cp)
691{
692	struct g_provider *pp;
693	char *buf;
694	int error;
695
696	pp = cp->provider;
697	buf = malloc(pp->sectorsize, M_MD_INTEL, M_WAITOK | M_ZERO);
698	error = g_write_data(cp,
699	    pp->mediasize - 2 * pp->sectorsize,
700	    buf, pp->sectorsize);
701	if (error != 0) {
702		G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).",
703		    pp->name, error);
704	}
705	free(buf, M_MD_INTEL);
706	return (error);
707}
708
709static int
710intel_meta_write_spare(struct g_consumer *cp, struct intel_raid_disk *d)
711{
712	struct intel_raid_conf *meta;
713	int error;
714
715	/* Fill anchor and single disk. */
716	meta = malloc(INTEL_MAX_MD_SIZE(1), M_MD_INTEL, M_WAITOK | M_ZERO);
717	memcpy(&meta->intel_id[0], INTEL_MAGIC, sizeof(INTEL_MAGIC) - 1);
718	memcpy(&meta->version[0], INTEL_VERSION_1000,
719	    sizeof(INTEL_VERSION_1000) - 1);
720	meta->config_size = INTEL_MAX_MD_SIZE(1);
721	meta->config_id = meta->orig_config_id = arc4random();
722	meta->generation = 1;
723	meta->total_disks = 1;
724	meta->disk[0] = *d;
725	error = intel_meta_write(cp, meta);
726	free(meta, M_MD_INTEL);
727	return (error);
728}
729
730static struct g_raid_disk *
731g_raid_md_intel_get_disk(struct g_raid_softc *sc, int id)
732{
733	struct g_raid_disk	*disk;
734	struct g_raid_md_intel_perdisk *pd;
735
736	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
737		pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
738		if (pd->pd_disk_pos == id)
739			break;
740	}
741	return (disk);
742}
743
744static int
745g_raid_md_intel_supported(int level, int qual, int disks, int force)
746{
747
748	switch (level) {
749	case G_RAID_VOLUME_RL_RAID0:
750		if (disks < 1)
751			return (0);
752		if (!force && (disks < 2 || disks > 6))
753			return (0);
754		break;
755	case G_RAID_VOLUME_RL_RAID1:
756		if (disks < 1)
757			return (0);
758		if (!force && (disks != 2))
759			return (0);
760		break;
761	case G_RAID_VOLUME_RL_RAID1E:
762		if (disks < 2)
763			return (0);
764		if (!force && (disks != 4))
765			return (0);
766		break;
767	case G_RAID_VOLUME_RL_RAID5:
768		if (disks < 3)
769			return (0);
770		if (!force && disks > 6)
771			return (0);
772		if (qual != G_RAID_VOLUME_RLQ_R5LA)
773			return (0);
774		break;
775	default:
776		return (0);
777	}
778	if (level != G_RAID_VOLUME_RL_RAID5 && qual != G_RAID_VOLUME_RLQ_NONE)
779		return (0);
780	return (1);
781}
782
783static struct g_raid_volume *
784g_raid_md_intel_get_volume(struct g_raid_softc *sc, int id)
785{
786	struct g_raid_volume	*mvol;
787	struct g_raid_md_intel_pervolume *pv;
788
789	TAILQ_FOREACH(mvol, &sc->sc_volumes, v_next) {
790		pv = mvol->v_md_data;
791		if (pv->pv_volume_pos == id)
792			break;
793	}
794	return (mvol);
795}
796
797static int
798g_raid_md_intel_start_disk(struct g_raid_disk *disk)
799{
800	struct g_raid_softc *sc;
801	struct g_raid_subdisk *sd, *tmpsd;
802	struct g_raid_disk *olddisk, *tmpdisk;
803	struct g_raid_md_object *md;
804	struct g_raid_md_intel_object *mdi;
805	struct g_raid_md_intel_pervolume *pv;
806	struct g_raid_md_intel_perdisk *pd, *oldpd;
807	struct intel_raid_conf *meta;
808	struct intel_raid_vol *mvol;
809	struct intel_raid_map *mmap0, *mmap1;
810	int disk_pos, resurrection = 0, migr_global, i;
811
812	sc = disk->d_softc;
813	md = sc->sc_md;
814	mdi = (struct g_raid_md_intel_object *)md;
815	meta = mdi->mdio_meta;
816	pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
817	olddisk = NULL;
818
819	/* Find disk position in metadata by it's serial. */
820	disk_pos = intel_meta_find_disk(meta, pd->pd_disk_meta.serial);
821	if (disk_pos < 0) {
822		G_RAID_DEBUG1(1, sc, "Unknown, probably new or stale disk");
823		/* Failed stale disk is useless for us. */
824		if ((pd->pd_disk_meta.flags & INTEL_F_FAILED) &&
825		    !(pd->pd_disk_meta.flags & INTEL_F_DISABLED)) {
826			g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE_FAILED);
827			return (0);
828		}
829		/* If we are in the start process, that's all for now. */
830		if (!mdi->mdio_started)
831			goto nofit;
832		/*
833		 * If we have already started - try to get use of the disk.
834		 * Try to replace OFFLINE disks first, then FAILED.
835		 */
836		TAILQ_FOREACH(tmpdisk, &sc->sc_disks, d_next) {
837			if (tmpdisk->d_state != G_RAID_DISK_S_OFFLINE &&
838			    tmpdisk->d_state != G_RAID_DISK_S_FAILED)
839				continue;
840			/* Make sure this disk is big enough. */
841			TAILQ_FOREACH(sd, &tmpdisk->d_subdisks, sd_next) {
842				off_t disk_sectors =
843				    intel_get_disk_sectors(&pd->pd_disk_meta);
844
845				if (sd->sd_offset + sd->sd_size + 4096 >
846				    disk_sectors * 512) {
847					G_RAID_DEBUG1(1, sc,
848					    "Disk too small (%llu < %llu)",
849					    (unsigned long long)
850					    disk_sectors * 512,
851					    (unsigned long long)
852					    sd->sd_offset + sd->sd_size + 4096);
853					break;
854				}
855			}
856			if (sd != NULL)
857				continue;
858			if (tmpdisk->d_state == G_RAID_DISK_S_OFFLINE) {
859				olddisk = tmpdisk;
860				break;
861			} else if (olddisk == NULL)
862				olddisk = tmpdisk;
863		}
864		if (olddisk == NULL) {
865nofit:
866			if (pd->pd_disk_meta.flags & INTEL_F_SPARE) {
867				g_raid_change_disk_state(disk,
868				    G_RAID_DISK_S_SPARE);
869				return (1);
870			} else {
871				g_raid_change_disk_state(disk,
872				    G_RAID_DISK_S_STALE);
873				return (0);
874			}
875		}
876		oldpd = (struct g_raid_md_intel_perdisk *)olddisk->d_md_data;
877		disk_pos = oldpd->pd_disk_pos;
878		resurrection = 1;
879	}
880
881	if (olddisk == NULL) {
882		/* Find placeholder by position. */
883		olddisk = g_raid_md_intel_get_disk(sc, disk_pos);
884		if (olddisk == NULL)
885			panic("No disk at position %d!", disk_pos);
886		if (olddisk->d_state != G_RAID_DISK_S_OFFLINE) {
887			G_RAID_DEBUG1(1, sc, "More then one disk for pos %d",
888			    disk_pos);
889			g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE);
890			return (0);
891		}
892		oldpd = (struct g_raid_md_intel_perdisk *)olddisk->d_md_data;
893	}
894
895	/* Replace failed disk or placeholder with new disk. */
896	TAILQ_FOREACH_SAFE(sd, &olddisk->d_subdisks, sd_next, tmpsd) {
897		TAILQ_REMOVE(&olddisk->d_subdisks, sd, sd_next);
898		TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
899		sd->sd_disk = disk;
900	}
901	oldpd->pd_disk_pos = -2;
902	pd->pd_disk_pos = disk_pos;
903
904	/* If it was placeholder -- destroy it. */
905	if (olddisk->d_state == G_RAID_DISK_S_OFFLINE) {
906		g_raid_destroy_disk(olddisk);
907	} else {
908		/* Otherwise, make it STALE_FAILED. */
909		g_raid_change_disk_state(olddisk, G_RAID_DISK_S_STALE_FAILED);
910		/* Update global metadata just in case. */
911		memcpy(&meta->disk[disk_pos], &pd->pd_disk_meta,
912		    sizeof(struct intel_raid_disk));
913	}
914
915	/* Welcome the new disk. */
916	if ((meta->disk[disk_pos].flags & INTEL_F_DISABLED) &&
917	    !(pd->pd_disk_meta.flags & INTEL_F_SPARE))
918		g_raid_change_disk_state(disk, G_RAID_DISK_S_DISABLED);
919	else if (resurrection)
920		g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);
921	else if (meta->disk[disk_pos].flags & INTEL_F_FAILED)
922		g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED);
923	else if (meta->disk[disk_pos].flags & INTEL_F_SPARE)
924		g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE);
925	else
926		g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);
927	TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
928		pv = sd->sd_volume->v_md_data;
929		mvol = intel_get_volume(meta, pv->pv_volume_pos);
930		mmap0 = intel_get_map(mvol, 0);
931		if (mvol->migr_state)
932			mmap1 = intel_get_map(mvol, 1);
933		else
934			mmap1 = mmap0;
935
936		migr_global = 1;
937		for (i = 0; i < mmap0->total_disks; i++) {
938			if ((mmap0->disk_idx[i] & INTEL_DI_RBLD) == 0 &&
939			    (mmap1->disk_idx[i] & INTEL_DI_RBLD) != 0)
940				migr_global = 0;
941		}
942
943		if ((meta->disk[disk_pos].flags & INTEL_F_DISABLED) &&
944		    !(pd->pd_disk_meta.flags & INTEL_F_SPARE)) {
945			/* Disabled disk, useless. */
946			g_raid_change_subdisk_state(sd,
947			    G_RAID_SUBDISK_S_NONE);
948		} else if (resurrection) {
949			/* Stale disk, almost same as new. */
950			g_raid_change_subdisk_state(sd,
951			    G_RAID_SUBDISK_S_NEW);
952		} else if (meta->disk[disk_pos].flags & INTEL_F_FAILED) {
953			/* Failed disk, almost useless. */
954			g_raid_change_subdisk_state(sd,
955			    G_RAID_SUBDISK_S_FAILED);
956		} else if (mvol->migr_state == 0) {
957			if (mmap0->status == INTEL_S_UNINITIALIZED &&
958			    (!pv->pv_cng || pv->pv_cng_master_disk != disk_pos)) {
959				/* Freshly created uninitialized volume. */
960				g_raid_change_subdisk_state(sd,
961				    G_RAID_SUBDISK_S_UNINITIALIZED);
962			} else if (mmap0->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) {
963				/* Freshly inserted disk. */
964				g_raid_change_subdisk_state(sd,
965				    G_RAID_SUBDISK_S_NEW);
966			} else if (mvol->dirty && (!pv->pv_cng ||
967			    pv->pv_cng_master_disk != disk_pos)) {
968				/* Dirty volume (unclean shutdown). */
969				g_raid_change_subdisk_state(sd,
970				    G_RAID_SUBDISK_S_STALE);
971			} else {
972				/* Up to date disk. */
973				g_raid_change_subdisk_state(sd,
974				    G_RAID_SUBDISK_S_ACTIVE);
975			}
976		} else if (mvol->migr_type == INTEL_MT_INIT ||
977			   mvol->migr_type == INTEL_MT_REBUILD) {
978			if (mmap0->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) {
979				/* Freshly inserted disk. */
980				g_raid_change_subdisk_state(sd,
981				    G_RAID_SUBDISK_S_NEW);
982			} else if (mmap1->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) {
983				/* Rebuilding disk. */
984				g_raid_change_subdisk_state(sd,
985				    G_RAID_SUBDISK_S_REBUILD);
986				if (mvol->dirty) {
987					sd->sd_rebuild_pos = 0;
988				} else {
989					sd->sd_rebuild_pos =
990					    intel_get_vol_curr_migr_unit(mvol) *
991					    sd->sd_volume->v_strip_size *
992					    mmap0->total_domains;
993				}
994			} else if (mvol->migr_type == INTEL_MT_INIT &&
995			    migr_global) {
996				/* Freshly created uninitialized volume. */
997				g_raid_change_subdisk_state(sd,
998				    G_RAID_SUBDISK_S_UNINITIALIZED);
999			} else if (mvol->dirty && (!pv->pv_cng ||
1000			    pv->pv_cng_master_disk != disk_pos)) {
1001				/* Dirty volume (unclean shutdown). */
1002				g_raid_change_subdisk_state(sd,
1003				    G_RAID_SUBDISK_S_STALE);
1004			} else {
1005				/* Up to date disk. */
1006				g_raid_change_subdisk_state(sd,
1007				    G_RAID_SUBDISK_S_ACTIVE);
1008			}
1009		} else if (mvol->migr_type == INTEL_MT_VERIFY ||
1010			   mvol->migr_type == INTEL_MT_REPAIR) {
1011			if (mmap0->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) {
1012				/* Freshly inserted disk. */
1013				g_raid_change_subdisk_state(sd,
1014				    G_RAID_SUBDISK_S_NEW);
1015			} else if ((mmap1->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) ||
1016			    migr_global) {
1017				/* Resyncing disk. */
1018				g_raid_change_subdisk_state(sd,
1019				    G_RAID_SUBDISK_S_RESYNC);
1020				if (mvol->dirty) {
1021					sd->sd_rebuild_pos = 0;
1022				} else {
1023					sd->sd_rebuild_pos =
1024					    intel_get_vol_curr_migr_unit(mvol) *
1025					    sd->sd_volume->v_strip_size *
1026					    mmap0->total_domains;
1027				}
1028			} else if (mvol->dirty) {
1029				/* Dirty volume (unclean shutdown). */
1030				g_raid_change_subdisk_state(sd,
1031				    G_RAID_SUBDISK_S_STALE);
1032			} else {
1033				/* Up to date disk. */
1034				g_raid_change_subdisk_state(sd,
1035				    G_RAID_SUBDISK_S_ACTIVE);
1036			}
1037		} else if (mvol->migr_type == INTEL_MT_GEN_MIGR) {
1038			if ((mmap1->disk_idx[0] & INTEL_DI_IDX) != disk_pos) {
1039				/* Freshly inserted disk. */
1040				g_raid_change_subdisk_state(sd,
1041				    G_RAID_SUBDISK_S_NEW);
1042			} else {
1043				/* Up to date disk. */
1044				g_raid_change_subdisk_state(sd,
1045				    G_RAID_SUBDISK_S_ACTIVE);
1046			}
1047		}
1048		g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
1049		    G_RAID_EVENT_SUBDISK);
1050	}
1051
1052	/* Update status of our need for spare. */
1053	if (mdi->mdio_started) {
1054		mdi->mdio_incomplete =
1055		    (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) +
1056		     g_raid_ndisks(sc, G_RAID_DISK_S_DISABLED) <
1057		     meta->total_disks);
1058	}
1059
1060	return (resurrection);
1061}
1062
1063static void
1064g_disk_md_intel_retaste(void *arg, int pending)
1065{
1066
1067	G_RAID_DEBUG(1, "Array is not complete, trying to retaste.");
1068	g_retaste(&g_raid_class);
1069	free(arg, M_MD_INTEL);
1070}
1071
1072static void
1073g_raid_md_intel_refill(struct g_raid_softc *sc)
1074{
1075	struct g_raid_md_object *md;
1076	struct g_raid_md_intel_object *mdi;
1077	struct intel_raid_conf *meta;
1078	struct g_raid_disk *disk;
1079	struct task *task;
1080	int update, na;
1081
1082	md = sc->sc_md;
1083	mdi = (struct g_raid_md_intel_object *)md;
1084	meta = mdi->mdio_meta;
1085	update = 0;
1086	do {
1087		/* Make sure we miss anything. */
1088		na = g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) +
1089		    g_raid_ndisks(sc, G_RAID_DISK_S_DISABLED);
1090		if (na == meta->total_disks)
1091			break;
1092
1093		G_RAID_DEBUG1(1, md->mdo_softc,
1094		    "Array is not complete (%d of %d), "
1095		    "trying to refill.", na, meta->total_disks);
1096
1097		/* Try to get use some of STALE disks. */
1098		TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1099			if (disk->d_state == G_RAID_DISK_S_STALE) {
1100				update += g_raid_md_intel_start_disk(disk);
1101				if (disk->d_state == G_RAID_DISK_S_ACTIVE ||
1102				    disk->d_state == G_RAID_DISK_S_DISABLED)
1103					break;
1104			}
1105		}
1106		if (disk != NULL)
1107			continue;
1108
1109		/* Try to get use some of SPARE disks. */
1110		TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1111			if (disk->d_state == G_RAID_DISK_S_SPARE) {
1112				update += g_raid_md_intel_start_disk(disk);
1113				if (disk->d_state == G_RAID_DISK_S_ACTIVE)
1114					break;
1115			}
1116		}
1117	} while (disk != NULL);
1118
1119	/* Write new metadata if we changed something. */
1120	if (update) {
1121		g_raid_md_write_intel(md, NULL, NULL, NULL);
1122		meta = mdi->mdio_meta;
1123	}
1124
1125	/* Update status of our need for spare. */
1126	mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) +
1127	    g_raid_ndisks(sc, G_RAID_DISK_S_DISABLED) < meta->total_disks);
1128
1129	/* Request retaste hoping to find spare. */
1130	if (mdi->mdio_incomplete) {
1131		task = malloc(sizeof(struct task),
1132		    M_MD_INTEL, M_WAITOK | M_ZERO);
1133		TASK_INIT(task, 0, g_disk_md_intel_retaste, task);
1134		taskqueue_enqueue(taskqueue_swi, task);
1135	}
1136}
1137
1138static void
1139g_raid_md_intel_start(struct g_raid_softc *sc)
1140{
1141	struct g_raid_md_object *md;
1142	struct g_raid_md_intel_object *mdi;
1143	struct g_raid_md_intel_pervolume *pv;
1144	struct g_raid_md_intel_perdisk *pd;
1145	struct intel_raid_conf *meta;
1146	struct intel_raid_vol *mvol;
1147	struct intel_raid_map *mmap;
1148	struct g_raid_volume *vol;
1149	struct g_raid_subdisk *sd;
1150	struct g_raid_disk *disk;
1151	int i, j, disk_pos;
1152
1153	md = sc->sc_md;
1154	mdi = (struct g_raid_md_intel_object *)md;
1155	meta = mdi->mdio_meta;
1156
1157	/* Create volumes and subdisks. */
1158	for (i = 0; i < meta->total_volumes; i++) {
1159		mvol = intel_get_volume(meta, i);
1160		mmap = intel_get_map(mvol, 0);
1161		vol = g_raid_create_volume(sc, mvol->name, mvol->tid - 1);
1162		pv = malloc(sizeof(*pv), M_MD_INTEL, M_WAITOK | M_ZERO);
1163		pv->pv_volume_pos = i;
1164		pv->pv_cng = (mvol->state & INTEL_ST_CLONE_N_GO) != 0;
1165		pv->pv_cng_man_sync = (mvol->state & INTEL_ST_CLONE_MAN_SYNC) != 0;
1166		if (mvol->cng_master_disk < mmap->total_disks)
1167			pv->pv_cng_master_disk = mvol->cng_master_disk;
1168		vol->v_md_data = pv;
1169		vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE;
1170		if (mmap->type == INTEL_T_RAID0)
1171			vol->v_raid_level = G_RAID_VOLUME_RL_RAID0;
1172		else if (mmap->type == INTEL_T_RAID1 &&
1173		    mmap->total_domains >= 2 &&
1174		    mmap->total_domains <= mmap->total_disks) {
1175			/* Assume total_domains is correct. */
1176			if (mmap->total_domains == mmap->total_disks)
1177				vol->v_raid_level = G_RAID_VOLUME_RL_RAID1;
1178			else
1179				vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E;
1180		} else if (mmap->type == INTEL_T_RAID1) {
1181			/* total_domains looks wrong. */
1182			if (mmap->total_disks <= 2)
1183				vol->v_raid_level = G_RAID_VOLUME_RL_RAID1;
1184			else
1185				vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E;
1186		} else if (mmap->type == INTEL_T_RAID5) {
1187			vol->v_raid_level = G_RAID_VOLUME_RL_RAID5;
1188			vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LA;
1189		} else
1190			vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN;
1191		vol->v_strip_size = (u_int)mmap->strip_sectors * 512; //ZZZ
1192		vol->v_disks_count = mmap->total_disks;
1193		vol->v_mediasize = (off_t)mvol->total_sectors * 512; //ZZZ
1194		vol->v_sectorsize = 512; //ZZZ
1195		for (j = 0; j < vol->v_disks_count; j++) {
1196			sd = &vol->v_subdisks[j];
1197			sd->sd_offset = intel_get_map_offset(mmap) * 512; //ZZZ
1198			sd->sd_size = intel_get_map_disk_sectors(mmap) * 512; //ZZZ
1199		}
1200		g_raid_start_volume(vol);
1201	}
1202
1203	/* Create disk placeholders to store data for later writing. */
1204	for (disk_pos = 0; disk_pos < meta->total_disks; disk_pos++) {
1205		pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO);
1206		pd->pd_disk_pos = disk_pos;
1207		pd->pd_disk_meta = meta->disk[disk_pos];
1208		disk = g_raid_create_disk(sc);
1209		disk->d_md_data = (void *)pd;
1210		disk->d_state = G_RAID_DISK_S_OFFLINE;
1211		for (i = 0; i < meta->total_volumes; i++) {
1212			mvol = intel_get_volume(meta, i);
1213			mmap = intel_get_map(mvol, 0);
1214			for (j = 0; j < mmap->total_disks; j++) {
1215				if ((mmap->disk_idx[j] & INTEL_DI_IDX) == disk_pos)
1216					break;
1217			}
1218			if (j == mmap->total_disks)
1219				continue;
1220			vol = g_raid_md_intel_get_volume(sc, i);
1221			sd = &vol->v_subdisks[j];
1222			sd->sd_disk = disk;
1223			TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
1224		}
1225	}
1226
1227	/* Make all disks found till the moment take their places. */
1228	do {
1229		TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1230			if (disk->d_state == G_RAID_DISK_S_NONE) {
1231				g_raid_md_intel_start_disk(disk);
1232				break;
1233			}
1234		}
1235	} while (disk != NULL);
1236
1237	mdi->mdio_started = 1;
1238	G_RAID_DEBUG1(0, sc, "Array started.");
1239	g_raid_md_write_intel(md, NULL, NULL, NULL);
1240
1241	/* Pickup any STALE/SPARE disks to refill array if needed. */
1242	g_raid_md_intel_refill(sc);
1243
1244	TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
1245		g_raid_event_send(vol, G_RAID_VOLUME_E_START,
1246		    G_RAID_EVENT_VOLUME);
1247	}
1248
1249	callout_stop(&mdi->mdio_start_co);
1250	G_RAID_DEBUG1(1, sc, "root_mount_rel %p", mdi->mdio_rootmount);
1251	root_mount_rel(mdi->mdio_rootmount);
1252	mdi->mdio_rootmount = NULL;
1253}
1254
1255static void
1256g_raid_md_intel_new_disk(struct g_raid_disk *disk)
1257{
1258	struct g_raid_softc *sc;
1259	struct g_raid_md_object *md;
1260	struct g_raid_md_intel_object *mdi;
1261	struct intel_raid_conf *pdmeta;
1262	struct g_raid_md_intel_perdisk *pd;
1263
1264	sc = disk->d_softc;
1265	md = sc->sc_md;
1266	mdi = (struct g_raid_md_intel_object *)md;
1267	pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
1268	pdmeta = pd->pd_meta;
1269
1270	if (mdi->mdio_started) {
1271		if (g_raid_md_intel_start_disk(disk))
1272			g_raid_md_write_intel(md, NULL, NULL, NULL);
1273	} else {
1274		/* If we haven't started yet - check metadata freshness. */
1275		if (mdi->mdio_meta == NULL ||
1276		    ((int32_t)(pdmeta->generation - mdi->mdio_generation)) > 0) {
1277			G_RAID_DEBUG1(1, sc, "Newer disk");
1278			if (mdi->mdio_meta != NULL)
1279				free(mdi->mdio_meta, M_MD_INTEL);
1280			mdi->mdio_meta = intel_meta_copy(pdmeta);
1281			mdi->mdio_generation = mdi->mdio_meta->generation;
1282			mdi->mdio_disks_present = 1;
1283		} else if (pdmeta->generation == mdi->mdio_generation) {
1284			mdi->mdio_disks_present++;
1285			G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)",
1286			    mdi->mdio_disks_present,
1287			    mdi->mdio_meta->total_disks);
1288		} else {
1289			G_RAID_DEBUG1(1, sc, "Older disk");
1290		}
1291		/* If we collected all needed disks - start array. */
1292		if (mdi->mdio_disks_present == mdi->mdio_meta->total_disks)
1293			g_raid_md_intel_start(sc);
1294	}
1295}
1296
1297static void
1298g_raid_intel_go(void *arg)
1299{
1300	struct g_raid_softc *sc;
1301	struct g_raid_md_object *md;
1302	struct g_raid_md_intel_object *mdi;
1303
1304	sc = arg;
1305	md = sc->sc_md;
1306	mdi = (struct g_raid_md_intel_object *)md;
1307	if (!mdi->mdio_started) {
1308		G_RAID_DEBUG1(0, sc, "Force array start due to timeout.");
1309		g_raid_event_send(sc, G_RAID_NODE_E_START, 0);
1310	}
1311}
1312
1313static int
1314g_raid_md_create_intel(struct g_raid_md_object *md, struct g_class *mp,
1315    struct g_geom **gp)
1316{
1317	struct g_raid_softc *sc;
1318	struct g_raid_md_intel_object *mdi;
1319	char name[16];
1320
1321	mdi = (struct g_raid_md_intel_object *)md;
1322	mdi->mdio_config_id = mdi->mdio_orig_config_id = arc4random();
1323	mdi->mdio_generation = 0;
1324	snprintf(name, sizeof(name), "Intel-%08x", mdi->mdio_config_id);
1325	sc = g_raid_create_node(mp, name, md);
1326	if (sc == NULL)
1327		return (G_RAID_MD_TASTE_FAIL);
1328	md->mdo_softc = sc;
1329	*gp = sc->sc_geom;
1330	return (G_RAID_MD_TASTE_NEW);
1331}
1332
/*
 * Return the last N characters of the serial label.  The Linux and
 * ataraid(7) code always uses the last 16 characters of the label to
 * store into the Intel meta format.  Generalize this to N characters
 * since that's easy.  Labels can be up to 20 characters for SATA drives
 * and up to 251 characters for SAS drives.  Since Intel controllers don't
 * support SAS drives, just stick with the SATA limits for stack friendliness.
 *
 * The label is fetched through the "GEOM::ident" attribute of cp.  On
 * success 0 is returned and exactly serlen bytes of serial are written;
 * otherwise the error from g_io_getattr() is returned and serial is left
 * untouched.
 *
 * NOTE: the copy is done with strncpy(), so serial is zero-padded when the
 * label is shorter than serlen and is NOT NUL-terminated when the label has
 * serlen characters or more.  Callers must treat it as a fixed-width field.
 */
static int
g_raid_md_get_label(struct g_consumer *cp, char *serial, int serlen)
{
	char serial_buffer[24];
	int len, error, skip;

	len = sizeof(serial_buffer);
	error = g_io_getattr("GEOM::ident", cp, &len, serial_buffer);
	if (error != 0)
		return (error);

	/* Number of leading characters to drop so that only the tail fits. */
	len = strlen(serial_buffer);
	skip = (len > serlen) ? len - serlen : 0;

	strncpy(serial, serial_buffer + skip, serlen);
	return (0);
}
1359
1360static int
1361g_raid_md_taste_intel(struct g_raid_md_object *md, struct g_class *mp,
1362                              struct g_consumer *cp, struct g_geom **gp)
1363{
1364	struct g_consumer *rcp;
1365	struct g_provider *pp;
1366	struct g_raid_md_intel_object *mdi, *mdi1;
1367	struct g_raid_softc *sc;
1368	struct g_raid_disk *disk;
1369	struct intel_raid_conf *meta;
1370	struct g_raid_md_intel_perdisk *pd;
1371	struct g_geom *geom;
1372	int error, disk_pos, result, spare, len;
1373	char serial[INTEL_SERIAL_LEN];
1374	char name[16];
1375	uint16_t vendor;
1376
1377	G_RAID_DEBUG(1, "Tasting Intel on %s", cp->provider->name);
1378	mdi = (struct g_raid_md_intel_object *)md;
1379	pp = cp->provider;
1380
1381	/* Read metadata from device. */
1382	meta = NULL;
1383	vendor = 0xffff;
1384	disk_pos = 0;
1385	if (g_access(cp, 1, 0, 0) != 0)
1386		return (G_RAID_MD_TASTE_FAIL);
1387	g_topology_unlock();
1388	error = g_raid_md_get_label(cp, serial, sizeof(serial));
1389	if (error != 0) {
1390		G_RAID_DEBUG(1, "Cannot get serial number from %s (error=%d).",
1391		    pp->name, error);
1392		goto fail2;
1393	}
1394	len = 2;
1395	if (pp->geom->rank == 1)
1396		g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor);
1397	meta = intel_meta_read(cp);
1398	g_topology_lock();
1399	g_access(cp, -1, 0, 0);
1400	if (meta == NULL) {
1401		if (g_raid_aggressive_spare) {
1402			if (vendor != 0x8086) {
1403				G_RAID_DEBUG(1,
1404				    "Intel vendor mismatch 0x%04x != 0x8086",
1405				    vendor);
1406			} else {
1407				G_RAID_DEBUG(1,
1408				    "No Intel metadata, forcing spare.");
1409				spare = 2;
1410				goto search;
1411			}
1412		}
1413		return (G_RAID_MD_TASTE_FAIL);
1414	}
1415
1416	/* Check this disk position in obtained metadata. */
1417	disk_pos = intel_meta_find_disk(meta, serial);
1418	if (disk_pos < 0) {
1419		G_RAID_DEBUG(1, "Intel serial '%s' not found", serial);
1420		goto fail1;
1421	}
1422	if (intel_get_disk_sectors(&meta->disk[disk_pos]) !=
1423	    (pp->mediasize / pp->sectorsize)) {
1424		G_RAID_DEBUG(1, "Intel size mismatch %ju != %ju",
1425		    intel_get_disk_sectors(&meta->disk[disk_pos]),
1426		    (off_t)(pp->mediasize / pp->sectorsize));
1427		goto fail1;
1428	}
1429
1430	G_RAID_DEBUG(1, "Intel disk position %d", disk_pos);
1431	spare = meta->disk[disk_pos].flags & INTEL_F_SPARE;
1432
1433search:
1434	/* Search for matching node. */
1435	sc = NULL;
1436	mdi1 = NULL;
1437	LIST_FOREACH(geom, &mp->geom, geom) {
1438		sc = geom->softc;
1439		if (sc == NULL)
1440			continue;
1441		if (sc->sc_stopping != 0)
1442			continue;
1443		if (sc->sc_md->mdo_class != md->mdo_class)
1444			continue;
1445		mdi1 = (struct g_raid_md_intel_object *)sc->sc_md;
1446		if (spare) {
1447			if (mdi1->mdio_incomplete)
1448				break;
1449		} else {
1450			if (mdi1->mdio_config_id == meta->config_id)
1451				break;
1452		}
1453	}
1454
1455	/* Found matching node. */
1456	if (geom != NULL) {
1457		G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name);
1458		result = G_RAID_MD_TASTE_EXISTING;
1459
1460	} else if (spare) { /* Not found needy node -- left for later. */
1461		G_RAID_DEBUG(1, "Spare is not needed at this time");
1462		goto fail1;
1463
1464	} else { /* Not found matching node -- create one. */
1465		result = G_RAID_MD_TASTE_NEW;
1466		mdi->mdio_config_id = meta->config_id;
1467		mdi->mdio_orig_config_id = meta->orig_config_id;
1468		snprintf(name, sizeof(name), "Intel-%08x", meta->config_id);
1469		sc = g_raid_create_node(mp, name, md);
1470		md->mdo_softc = sc;
1471		geom = sc->sc_geom;
1472		callout_init(&mdi->mdio_start_co, 1);
1473		callout_reset(&mdi->mdio_start_co, g_raid_start_timeout * hz,
1474		    g_raid_intel_go, sc);
1475		mdi->mdio_rootmount = root_mount_hold("GRAID-Intel");
1476		G_RAID_DEBUG1(1, sc, "root_mount_hold %p", mdi->mdio_rootmount);
1477	}
1478
1479	rcp = g_new_consumer(geom);
1480	g_attach(rcp, pp);
1481	if (g_access(rcp, 1, 1, 1) != 0)
1482		; //goto fail1;
1483
1484	g_topology_unlock();
1485	sx_xlock(&sc->sc_lock);
1486
1487	pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO);
1488	pd->pd_meta = meta;
1489	pd->pd_disk_pos = -1;
1490	if (spare == 2) {
1491		memcpy(&pd->pd_disk_meta.serial[0], serial, INTEL_SERIAL_LEN);
1492		intel_set_disk_sectors(&pd->pd_disk_meta,
1493		    pp->mediasize / pp->sectorsize);
1494		pd->pd_disk_meta.id = 0;
1495		pd->pd_disk_meta.flags = INTEL_F_SPARE;
1496	} else {
1497		pd->pd_disk_meta = meta->disk[disk_pos];
1498	}
1499	disk = g_raid_create_disk(sc);
1500	disk->d_md_data = (void *)pd;
1501	disk->d_consumer = rcp;
1502	rcp->private = disk;
1503
1504	g_raid_get_disk_info(disk);
1505
1506	g_raid_md_intel_new_disk(disk);
1507
1508	sx_xunlock(&sc->sc_lock);
1509	g_topology_lock();
1510	*gp = geom;
1511	return (result);
1512fail2:
1513	g_topology_lock();
1514	g_access(cp, -1, 0, 0);
1515fail1:
1516	free(meta, M_MD_INTEL);
1517	return (G_RAID_MD_TASTE_FAIL);
1518}
1519
1520static int
1521g_raid_md_event_intel(struct g_raid_md_object *md,
1522    struct g_raid_disk *disk, u_int event)
1523{
1524	struct g_raid_softc *sc;
1525	struct g_raid_subdisk *sd;
1526	struct g_raid_md_intel_object *mdi;
1527	struct g_raid_md_intel_perdisk *pd;
1528
1529	sc = md->mdo_softc;
1530	mdi = (struct g_raid_md_intel_object *)md;
1531	if (disk == NULL) {
1532		switch (event) {
1533		case G_RAID_NODE_E_START:
1534			if (!mdi->mdio_started)
1535				g_raid_md_intel_start(sc);
1536			return (0);
1537		}
1538		return (-1);
1539	}
1540	pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
1541	switch (event) {
1542	case G_RAID_DISK_E_DISCONNECTED:
1543		/* If disk was assigned, just update statuses. */
1544		if (pd->pd_disk_pos >= 0) {
1545			g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE);
1546			if (disk->d_consumer) {
1547				g_raid_kill_consumer(sc, disk->d_consumer);
1548				disk->d_consumer = NULL;
1549			}
1550			TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
1551				g_raid_change_subdisk_state(sd,
1552				    G_RAID_SUBDISK_S_NONE);
1553				g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED,
1554				    G_RAID_EVENT_SUBDISK);
1555			}
1556		} else {
1557			/* Otherwise -- delete. */
1558			g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE);
1559			g_raid_destroy_disk(disk);
1560		}
1561
1562		/* Write updated metadata to all disks. */
1563		g_raid_md_write_intel(md, NULL, NULL, NULL);
1564
1565		/* Check if anything left except placeholders. */
1566		if (g_raid_ndisks(sc, -1) ==
1567		    g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE))
1568			g_raid_destroy_node(sc, 0);
1569		else
1570			g_raid_md_intel_refill(sc);
1571		return (0);
1572	}
1573	return (-2);
1574}
1575
1576static int
1577g_raid_md_ctl_intel(struct g_raid_md_object *md,
1578    struct gctl_req *req)
1579{
1580	struct g_raid_softc *sc;
1581	struct g_raid_volume *vol, *vol1;
1582	struct g_raid_subdisk *sd;
1583	struct g_raid_disk *disk;
1584	struct g_raid_md_intel_object *mdi;
1585	struct g_raid_md_intel_pervolume *pv;
1586	struct g_raid_md_intel_perdisk *pd;
1587	struct g_consumer *cp;
1588	struct g_provider *pp;
1589	char arg[16], serial[INTEL_SERIAL_LEN];
1590	const char *nodename, *verb, *volname, *levelname, *diskname;
1591	char *tmp;
1592	int *nargs, *force;
1593	off_t off, size, sectorsize, strip, disk_sectors;
1594	intmax_t *sizearg, *striparg;
1595	int numdisks, i, len, level, qual, update;
1596	int error;
1597
1598	sc = md->mdo_softc;
1599	mdi = (struct g_raid_md_intel_object *)md;
1600	verb = gctl_get_param(req, "verb", NULL);
1601	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
1602	error = 0;
1603	if (strcmp(verb, "label") == 0) {
1604
1605		if (*nargs < 4) {
1606			gctl_error(req, "Invalid number of arguments.");
1607			return (-1);
1608		}
1609		volname = gctl_get_asciiparam(req, "arg1");
1610		if (volname == NULL) {
1611			gctl_error(req, "No volume name.");
1612			return (-2);
1613		}
1614		levelname = gctl_get_asciiparam(req, "arg2");
1615		if (levelname == NULL) {
1616			gctl_error(req, "No RAID level.");
1617			return (-3);
1618		}
1619		if (strcasecmp(levelname, "RAID5") == 0)
1620			levelname = "RAID5-LA";
1621		if (g_raid_volume_str2level(levelname, &level, &qual)) {
1622			gctl_error(req, "Unknown RAID level '%s'.", levelname);
1623			return (-4);
1624		}
1625		numdisks = *nargs - 3;
1626		force = gctl_get_paraml(req, "force", sizeof(*force));
1627		if (!g_raid_md_intel_supported(level, qual, numdisks,
1628		    force ? *force : 0)) {
1629			gctl_error(req, "Unsupported RAID level "
1630			    "(0x%02x/0x%02x), or number of disks (%d).",
1631			    level, qual, numdisks);
1632			return (-5);
1633		}
1634
1635		/* Search for disks, connect them and probe. */
1636		size = 0x7fffffffffffffffllu;
1637		sectorsize = 0;
1638		for (i = 0; i < numdisks; i++) {
1639			snprintf(arg, sizeof(arg), "arg%d", i + 3);
1640			diskname = gctl_get_asciiparam(req, arg);
1641			if (diskname == NULL) {
1642				gctl_error(req, "No disk name (%s).", arg);
1643				error = -6;
1644				break;
1645			}
1646			if (strcmp(diskname, "NONE") == 0) {
1647				cp = NULL;
1648				pp = NULL;
1649			} else {
1650				g_topology_lock();
1651				cp = g_raid_open_consumer(sc, diskname);
1652				if (cp == NULL) {
1653					gctl_error(req, "Can't open disk '%s'.",
1654					    diskname);
1655					g_topology_unlock();
1656					error = -7;
1657					break;
1658				}
1659				pp = cp->provider;
1660			}
1661			pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO);
1662			pd->pd_disk_pos = i;
1663			disk = g_raid_create_disk(sc);
1664			disk->d_md_data = (void *)pd;
1665			disk->d_consumer = cp;
1666			if (cp == NULL) {
1667				strcpy(&pd->pd_disk_meta.serial[0], "NONE");
1668				pd->pd_disk_meta.id = 0xffffffff;
1669				pd->pd_disk_meta.flags = INTEL_F_ASSIGNED;
1670				continue;
1671			}
1672			cp->private = disk;
1673			g_topology_unlock();
1674
1675			error = g_raid_md_get_label(cp,
1676			    &pd->pd_disk_meta.serial[0], INTEL_SERIAL_LEN);
1677			if (error != 0) {
1678				gctl_error(req,
1679				    "Can't get serial for provider '%s'.",
1680				    diskname);
1681				error = -8;
1682				break;
1683			}
1684
1685			g_raid_get_disk_info(disk);
1686
1687			intel_set_disk_sectors(&pd->pd_disk_meta,
1688			    pp->mediasize / pp->sectorsize);
1689			if (size > pp->mediasize)
1690				size = pp->mediasize;
1691			if (sectorsize < pp->sectorsize)
1692				sectorsize = pp->sectorsize;
1693			pd->pd_disk_meta.id = 0;
1694			pd->pd_disk_meta.flags = INTEL_F_ASSIGNED | INTEL_F_ONLINE;
1695		}
1696		if (error != 0)
1697			return (error);
1698
1699		if (sectorsize <= 0) {
1700			gctl_error(req, "Can't get sector size.");
1701			return (-8);
1702		}
1703
1704		/* Reserve some space for metadata. */
1705		size -= ((4096 + sectorsize - 1) / sectorsize) * sectorsize;
1706
1707		/* Handle size argument. */
1708		len = sizeof(*sizearg);
1709		sizearg = gctl_get_param(req, "size", &len);
1710		if (sizearg != NULL && len == sizeof(*sizearg) &&
1711		    *sizearg > 0) {
1712			if (*sizearg > size) {
1713				gctl_error(req, "Size too big %lld > %lld.",
1714				    (long long)*sizearg, (long long)size);
1715				return (-9);
1716			}
1717			size = *sizearg;
1718		}
1719
1720		/* Handle strip argument. */
1721		strip = 131072;
1722		len = sizeof(*striparg);
1723		striparg = gctl_get_param(req, "strip", &len);
1724		if (striparg != NULL && len == sizeof(*striparg) &&
1725		    *striparg > 0) {
1726			if (*striparg < sectorsize) {
1727				gctl_error(req, "Strip size too small.");
1728				return (-10);
1729			}
1730			if (*striparg % sectorsize != 0) {
1731				gctl_error(req, "Incorrect strip size.");
1732				return (-11);
1733			}
1734			if (strip > 65535 * sectorsize) {
1735				gctl_error(req, "Strip size too big.");
1736				return (-12);
1737			}
1738			strip = *striparg;
1739		}
1740
1741		/* Round size down to strip or sector. */
1742		if (level == G_RAID_VOLUME_RL_RAID1)
1743			size -= (size % sectorsize);
1744		else if (level == G_RAID_VOLUME_RL_RAID1E &&
1745		    (numdisks & 1) != 0)
1746			size -= (size % (2 * strip));
1747		else
1748			size -= (size % strip);
1749		if (size <= 0) {
1750			gctl_error(req, "Size too small.");
1751			return (-13);
1752		}
1753
1754		/* We have all we need, create things: volume, ... */
1755		mdi->mdio_started = 1;
1756		vol = g_raid_create_volume(sc, volname, -1);
1757		pv = malloc(sizeof(*pv), M_MD_INTEL, M_WAITOK | M_ZERO);
1758		pv->pv_volume_pos = 0;
1759		vol->v_md_data = pv;
1760		vol->v_raid_level = level;
1761		vol->v_raid_level_qualifier = qual;
1762		vol->v_strip_size = strip;
1763		vol->v_disks_count = numdisks;
1764		if (level == G_RAID_VOLUME_RL_RAID0)
1765			vol->v_mediasize = size * numdisks;
1766		else if (level == G_RAID_VOLUME_RL_RAID1)
1767			vol->v_mediasize = size;
1768		else if (level == G_RAID_VOLUME_RL_RAID5)
1769			vol->v_mediasize = size * (numdisks - 1);
1770		else { /* RAID1E */
1771			vol->v_mediasize = ((size * numdisks) / strip / 2) *
1772			    strip;
1773		}
1774		vol->v_sectorsize = sectorsize;
1775		g_raid_start_volume(vol);
1776
1777		/* , and subdisks. */
1778		TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1779			pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
1780			sd = &vol->v_subdisks[pd->pd_disk_pos];
1781			sd->sd_disk = disk;
1782			sd->sd_offset = 0;
1783			sd->sd_size = size;
1784			TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
1785			if (sd->sd_disk->d_consumer != NULL) {
1786				g_raid_change_disk_state(disk,
1787				    G_RAID_DISK_S_ACTIVE);
1788				if (level == G_RAID_VOLUME_RL_RAID5)
1789					g_raid_change_subdisk_state(sd,
1790					    G_RAID_SUBDISK_S_UNINITIALIZED);
1791				else
1792					g_raid_change_subdisk_state(sd,
1793					    G_RAID_SUBDISK_S_ACTIVE);
1794				g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
1795				    G_RAID_EVENT_SUBDISK);
1796			} else {
1797				g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE);
1798			}
1799		}
1800
1801		/* Write metadata based on created entities. */
1802		G_RAID_DEBUG1(0, sc, "Array started.");
1803		g_raid_md_write_intel(md, NULL, NULL, NULL);
1804
1805		/* Pickup any STALE/SPARE disks to refill array if needed. */
1806		g_raid_md_intel_refill(sc);
1807
1808		g_raid_event_send(vol, G_RAID_VOLUME_E_START,
1809		    G_RAID_EVENT_VOLUME);
1810		return (0);
1811	}
1812	if (strcmp(verb, "add") == 0) {
1813
1814		if (*nargs != 3) {
1815			gctl_error(req, "Invalid number of arguments.");
1816			return (-1);
1817		}
1818		volname = gctl_get_asciiparam(req, "arg1");
1819		if (volname == NULL) {
1820			gctl_error(req, "No volume name.");
1821			return (-2);
1822		}
1823		levelname = gctl_get_asciiparam(req, "arg2");
1824		if (levelname == NULL) {
1825			gctl_error(req, "No RAID level.");
1826			return (-3);
1827		}
1828		if (strcasecmp(levelname, "RAID5") == 0)
1829			levelname = "RAID5-LA";
1830		if (g_raid_volume_str2level(levelname, &level, &qual)) {
1831			gctl_error(req, "Unknown RAID level '%s'.", levelname);
1832			return (-4);
1833		}
1834
1835		/* Look for existing volumes. */
1836		i = 0;
1837		vol1 = NULL;
1838		TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
1839			vol1 = vol;
1840			i++;
1841		}
1842		if (i > 1) {
1843			gctl_error(req, "Maximum two volumes supported.");
1844			return (-6);
1845		}
1846		if (vol1 == NULL) {
1847			gctl_error(req, "At least one volume must exist.");
1848			return (-7);
1849		}
1850
1851		numdisks = vol1->v_disks_count;
1852		force = gctl_get_paraml(req, "force", sizeof(*force));
1853		if (!g_raid_md_intel_supported(level, qual, numdisks,
1854		    force ? *force : 0)) {
1855			gctl_error(req, "Unsupported RAID level "
1856			    "(0x%02x/0x%02x), or number of disks (%d).",
1857			    level, qual, numdisks);
1858			return (-5);
1859		}
1860
1861		/* Collect info about present disks. */
1862		size = 0x7fffffffffffffffllu;
1863		sectorsize = 512;
1864		for (i = 0; i < numdisks; i++) {
1865			disk = vol1->v_subdisks[i].sd_disk;
1866			pd = (struct g_raid_md_intel_perdisk *)
1867			    disk->d_md_data;
1868			disk_sectors =
1869			    intel_get_disk_sectors(&pd->pd_disk_meta);
1870
1871			if (disk_sectors * 512 < size)
1872				size = disk_sectors * 512;
1873			if (disk->d_consumer != NULL &&
1874			    disk->d_consumer->provider != NULL &&
1875			    disk->d_consumer->provider->sectorsize >
1876			     sectorsize) {
1877				sectorsize =
1878				    disk->d_consumer->provider->sectorsize;
1879			}
1880		}
1881
1882		/* Reserve some space for metadata. */
1883		size -= ((4096 + sectorsize - 1) / sectorsize) * sectorsize;
1884
1885		/* Decide insert before or after. */
1886		sd = &vol1->v_subdisks[0];
1887		if (sd->sd_offset >
1888		    size - (sd->sd_offset + sd->sd_size)) {
1889			off = 0;
1890			size = sd->sd_offset;
1891		} else {
1892			off = sd->sd_offset + sd->sd_size;
1893			size = size - (sd->sd_offset + sd->sd_size);
1894		}
1895
1896		/* Handle strip argument. */
1897		strip = 131072;
1898		len = sizeof(*striparg);
1899		striparg = gctl_get_param(req, "strip", &len);
1900		if (striparg != NULL && len == sizeof(*striparg) &&
1901		    *striparg > 0) {
1902			if (*striparg < sectorsize) {
1903				gctl_error(req, "Strip size too small.");
1904				return (-10);
1905			}
1906			if (*striparg % sectorsize != 0) {
1907				gctl_error(req, "Incorrect strip size.");
1908				return (-11);
1909			}
1910			if (strip > 65535 * sectorsize) {
1911				gctl_error(req, "Strip size too big.");
1912				return (-12);
1913			}
1914			strip = *striparg;
1915		}
1916
1917		/* Round offset up to strip. */
1918		if (off % strip != 0) {
1919			size -= strip - off % strip;
1920			off += strip - off % strip;
1921		}
1922
1923		/* Handle size argument. */
1924		len = sizeof(*sizearg);
1925		sizearg = gctl_get_param(req, "size", &len);
1926		if (sizearg != NULL && len == sizeof(*sizearg) &&
1927		    *sizearg > 0) {
1928			if (*sizearg > size) {
1929				gctl_error(req, "Size too big %lld > %lld.",
1930				    (long long)*sizearg, (long long)size);
1931				return (-9);
1932			}
1933			size = *sizearg;
1934		}
1935
1936		/* Round size down to strip or sector. */
1937		if (level == G_RAID_VOLUME_RL_RAID1)
1938			size -= (size % sectorsize);
1939		else
1940			size -= (size % strip);
1941		if (size <= 0) {
1942			gctl_error(req, "Size too small.");
1943			return (-13);
1944		}
1945		if (size > 0xffffffffllu * sectorsize) {
1946			gctl_error(req, "Size too big.");
1947			return (-14);
1948		}
1949
1950		/* We have all we need, create things: volume, ... */
1951		vol = g_raid_create_volume(sc, volname, -1);
1952		pv = malloc(sizeof(*pv), M_MD_INTEL, M_WAITOK | M_ZERO);
1953		pv->pv_volume_pos = i;
1954		vol->v_md_data = pv;
1955		vol->v_raid_level = level;
1956		vol->v_raid_level_qualifier = qual;
1957		vol->v_strip_size = strip;
1958		vol->v_disks_count = numdisks;
1959		if (level == G_RAID_VOLUME_RL_RAID0)
1960			vol->v_mediasize = size * numdisks;
1961		else if (level == G_RAID_VOLUME_RL_RAID1)
1962			vol->v_mediasize = size;
1963		else if (level == G_RAID_VOLUME_RL_RAID5)
1964			vol->v_mediasize = size * (numdisks - 1);
1965		else { /* RAID1E */
1966			vol->v_mediasize = ((size * numdisks) / strip / 2) *
1967			    strip;
1968		}
1969		vol->v_sectorsize = sectorsize;
1970		g_raid_start_volume(vol);
1971
1972		/* , and subdisks. */
1973		for (i = 0; i < numdisks; i++) {
1974			disk = vol1->v_subdisks[i].sd_disk;
1975			sd = &vol->v_subdisks[i];
1976			sd->sd_disk = disk;
1977			sd->sd_offset = off;
1978			sd->sd_size = size;
1979			TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
1980			if (disk->d_state == G_RAID_DISK_S_ACTIVE) {
1981				if (level == G_RAID_VOLUME_RL_RAID5)
1982					g_raid_change_subdisk_state(sd,
1983					    G_RAID_SUBDISK_S_UNINITIALIZED);
1984				else
1985					g_raid_change_subdisk_state(sd,
1986					    G_RAID_SUBDISK_S_ACTIVE);
1987				g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
1988				    G_RAID_EVENT_SUBDISK);
1989			}
1990		}
1991
1992		/* Write metadata based on created entities. */
1993		g_raid_md_write_intel(md, NULL, NULL, NULL);
1994
1995		g_raid_event_send(vol, G_RAID_VOLUME_E_START,
1996		    G_RAID_EVENT_VOLUME);
1997		return (0);
1998	}
1999	if (strcmp(verb, "delete") == 0) {
2000
2001		nodename = gctl_get_asciiparam(req, "arg0");
2002		if (nodename != NULL && strcasecmp(sc->sc_name, nodename) != 0)
2003			nodename = NULL;
2004
2005		/* Full node destruction. */
2006		if (*nargs == 1 && nodename != NULL) {
2007			/* Check if some volume is still open. */
2008			force = gctl_get_paraml(req, "force", sizeof(*force));
2009			if (force != NULL && *force == 0 &&
2010			    g_raid_nopens(sc) != 0) {
2011				gctl_error(req, "Some volume is still open.");
2012				return (-4);
2013			}
2014
2015			TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
2016				if (disk->d_consumer)
2017					intel_meta_erase(disk->d_consumer);
2018			}
2019			g_raid_destroy_node(sc, 0);
2020			return (0);
2021		}
2022
2023		/* Destroy specified volume. If it was last - all node. */
2024		if (*nargs > 2) {
2025			gctl_error(req, "Invalid number of arguments.");
2026			return (-1);
2027		}
2028		volname = gctl_get_asciiparam(req,
2029		    nodename != NULL ? "arg1" : "arg0");
2030		if (volname == NULL) {
2031			gctl_error(req, "No volume name.");
2032			return (-2);
2033		}
2034
2035		/* Search for volume. */
2036		TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
2037			if (strcmp(vol->v_name, volname) == 0)
2038				break;
2039			pp = vol->v_provider;
2040			if (pp == NULL)
2041				continue;
2042			if (strcmp(pp->name, volname) == 0)
2043				break;
2044			if (strncmp(pp->name, "raid/", 5) == 0 &&
2045			    strcmp(pp->name + 5, volname) == 0)
2046				break;
2047		}
2048		if (vol == NULL) {
2049			i = strtol(volname, &tmp, 10);
2050			if (verb != volname && tmp[0] == 0) {
2051				TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
2052					if (vol->v_global_id == i)
2053						break;
2054				}
2055			}
2056		}
2057		if (vol == NULL) {
2058			gctl_error(req, "Volume '%s' not found.", volname);
2059			return (-3);
2060		}
2061
2062		/* Check if volume is still open. */
2063		force = gctl_get_paraml(req, "force", sizeof(*force));
2064		if (force != NULL && *force == 0 &&
2065		    vol->v_provider_open != 0) {
2066			gctl_error(req, "Volume is still open.");
2067			return (-4);
2068		}
2069
2070		/* Destroy volume and potentially node. */
2071		i = 0;
2072		TAILQ_FOREACH(vol1, &sc->sc_volumes, v_next)
2073			i++;
2074		if (i >= 2) {
2075			g_raid_destroy_volume(vol);
2076			g_raid_md_write_intel(md, NULL, NULL, NULL);
2077		} else {
2078			TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
2079				if (disk->d_consumer)
2080					intel_meta_erase(disk->d_consumer);
2081			}
2082			g_raid_destroy_node(sc, 0);
2083		}
2084		return (0);
2085	}
2086	if (strcmp(verb, "remove") == 0 ||
2087	    strcmp(verb, "fail") == 0) {
2088		if (*nargs < 2) {
2089			gctl_error(req, "Invalid number of arguments.");
2090			return (-1);
2091		}
2092		for (i = 1; i < *nargs; i++) {
2093			snprintf(arg, sizeof(arg), "arg%d", i);
2094			diskname = gctl_get_asciiparam(req, arg);
2095			if (diskname == NULL) {
2096				gctl_error(req, "No disk name (%s).", arg);
2097				error = -2;
2098				break;
2099			}
2100			if (strncmp(diskname, "/dev/", 5) == 0)
2101				diskname += 5;
2102
2103			TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
2104				if (disk->d_consumer != NULL &&
2105				    disk->d_consumer->provider != NULL &&
2106				    strcmp(disk->d_consumer->provider->name,
2107				     diskname) == 0)
2108					break;
2109			}
2110			if (disk == NULL) {
2111				gctl_error(req, "Disk '%s' not found.",
2112				    diskname);
2113				error = -3;
2114				break;
2115			}
2116
2117			if (strcmp(verb, "fail") == 0) {
2118				g_raid_md_fail_disk_intel(md, NULL, disk);
2119				continue;
2120			}
2121
2122			pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
2123
2124			/* Erase metadata on deleting disk. */
2125			intel_meta_erase(disk->d_consumer);
2126
2127			/* If disk was assigned, just update statuses. */
2128			if (pd->pd_disk_pos >= 0) {
2129				g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE);
2130				g_raid_kill_consumer(sc, disk->d_consumer);
2131				disk->d_consumer = NULL;
2132				TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
2133					g_raid_change_subdisk_state(sd,
2134					    G_RAID_SUBDISK_S_NONE);
2135					g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED,
2136					    G_RAID_EVENT_SUBDISK);
2137				}
2138			} else {
2139				/* Otherwise -- delete. */
2140				g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE);
2141				g_raid_destroy_disk(disk);
2142			}
2143		}
2144
2145		/* Write updated metadata to remaining disks. */
2146		g_raid_md_write_intel(md, NULL, NULL, NULL);
2147
2148		/* Check if anything left except placeholders. */
2149		if (g_raid_ndisks(sc, -1) ==
2150		    g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE))
2151			g_raid_destroy_node(sc, 0);
2152		else
2153			g_raid_md_intel_refill(sc);
2154		return (error);
2155	}
2156	if (strcmp(verb, "insert") == 0) {
2157		if (*nargs < 2) {
2158			gctl_error(req, "Invalid number of arguments.");
2159			return (-1);
2160		}
2161		update = 0;
2162		for (i = 1; i < *nargs; i++) {
2163			/* Get disk name. */
2164			snprintf(arg, sizeof(arg), "arg%d", i);
2165			diskname = gctl_get_asciiparam(req, arg);
2166			if (diskname == NULL) {
2167				gctl_error(req, "No disk name (%s).", arg);
2168				error = -3;
2169				break;
2170			}
2171
2172			/* Try to find provider with specified name. */
2173			g_topology_lock();
2174			cp = g_raid_open_consumer(sc, diskname);
2175			if (cp == NULL) {
2176				gctl_error(req, "Can't open disk '%s'.",
2177				    diskname);
2178				g_topology_unlock();
2179				error = -4;
2180				break;
2181			}
2182			pp = cp->provider;
2183			g_topology_unlock();
2184
2185			/* Read disk serial. */
2186			error = g_raid_md_get_label(cp,
2187			    &serial[0], INTEL_SERIAL_LEN);
2188			if (error != 0) {
2189				gctl_error(req,
2190				    "Can't get serial for provider '%s'.",
2191				    diskname);
2192				g_raid_kill_consumer(sc, cp);
2193				error = -7;
2194				break;
2195			}
2196
2197			pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO);
2198			pd->pd_disk_pos = -1;
2199
2200			disk = g_raid_create_disk(sc);
2201			disk->d_consumer = cp;
2202			disk->d_md_data = (void *)pd;
2203			cp->private = disk;
2204
2205			g_raid_get_disk_info(disk);
2206
2207			memcpy(&pd->pd_disk_meta.serial[0], &serial[0],
2208			    INTEL_SERIAL_LEN);
2209			intel_set_disk_sectors(&pd->pd_disk_meta,
2210			    pp->mediasize / pp->sectorsize);
2211			pd->pd_disk_meta.id = 0;
2212			pd->pd_disk_meta.flags = INTEL_F_SPARE;
2213
2214			/* Welcome the "new" disk. */
2215			update += g_raid_md_intel_start_disk(disk);
2216			if (disk->d_state == G_RAID_DISK_S_SPARE) {
2217				intel_meta_write_spare(cp, &pd->pd_disk_meta);
2218				g_raid_destroy_disk(disk);
2219			} else if (disk->d_state != G_RAID_DISK_S_ACTIVE) {
2220				gctl_error(req, "Disk '%s' doesn't fit.",
2221				    diskname);
2222				g_raid_destroy_disk(disk);
2223				error = -8;
2224				break;
2225			}
2226		}
2227
2228		/* Write new metadata if we changed something. */
2229		if (update)
2230			g_raid_md_write_intel(md, NULL, NULL, NULL);
2231		return (error);
2232	}
2233	return (-100);
2234}
2235
2236static int
2237g_raid_md_write_intel(struct g_raid_md_object *md, struct g_raid_volume *tvol,
2238    struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk)
2239{
2240	struct g_raid_softc *sc;
2241	struct g_raid_volume *vol;
2242	struct g_raid_subdisk *sd;
2243	struct g_raid_disk *disk;
2244	struct g_raid_md_intel_object *mdi;
2245	struct g_raid_md_intel_pervolume *pv;
2246	struct g_raid_md_intel_perdisk *pd;
2247	struct intel_raid_conf *meta;
2248	struct intel_raid_vol *mvol;
2249	struct intel_raid_map *mmap0, *mmap1;
2250	off_t sectorsize = 512, pos;
2251	const char *version, *cv;
2252	int vi, sdi, numdisks, len, state, stale;
2253
2254	sc = md->mdo_softc;
2255	mdi = (struct g_raid_md_intel_object *)md;
2256
2257	if (sc->sc_stopping == G_RAID_DESTROY_HARD)
2258		return (0);
2259
2260	/* Bump generation. Newly written metadata may differ from previous. */
2261	mdi->mdio_generation++;
2262
2263	/* Count number of disks. */
2264	numdisks = 0;
2265	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
2266		pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
2267		if (pd->pd_disk_pos < 0)
2268			continue;
2269		numdisks++;
2270		if (disk->d_state == G_RAID_DISK_S_ACTIVE) {
2271			pd->pd_disk_meta.flags =
2272			    INTEL_F_ONLINE | INTEL_F_ASSIGNED;
2273		} else if (disk->d_state == G_RAID_DISK_S_FAILED) {
2274			pd->pd_disk_meta.flags = INTEL_F_FAILED |
2275			    INTEL_F_ASSIGNED;
2276		} else if (disk->d_state == G_RAID_DISK_S_DISABLED) {
2277			pd->pd_disk_meta.flags = INTEL_F_FAILED |
2278			    INTEL_F_ASSIGNED | INTEL_F_DISABLED;
2279		} else {
2280			if (!(pd->pd_disk_meta.flags & INTEL_F_DISABLED))
2281				pd->pd_disk_meta.flags = INTEL_F_ASSIGNED;
2282			if (pd->pd_disk_meta.id != 0xffffffff) {
2283				pd->pd_disk_meta.id = 0xffffffff;
2284				len = strlen(pd->pd_disk_meta.serial);
2285				len = min(len, INTEL_SERIAL_LEN - 3);
2286				strcpy(pd->pd_disk_meta.serial + len, ":0");
2287			}
2288		}
2289	}
2290
2291	/* Fill anchor and disks. */
2292	meta = malloc(INTEL_MAX_MD_SIZE(numdisks),
2293	    M_MD_INTEL, M_WAITOK | M_ZERO);
2294	memcpy(&meta->intel_id[0], INTEL_MAGIC, sizeof(INTEL_MAGIC) - 1);
2295	meta->config_size = INTEL_MAX_MD_SIZE(numdisks);
2296	meta->config_id = mdi->mdio_config_id;
2297	meta->orig_config_id = mdi->mdio_orig_config_id;
2298	meta->generation = mdi->mdio_generation;
2299	meta->attributes = INTEL_ATTR_CHECKSUM;
2300	meta->total_disks = numdisks;
2301	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
2302		pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
2303		if (pd->pd_disk_pos < 0)
2304			continue;
2305		meta->disk[pd->pd_disk_pos] = pd->pd_disk_meta;
2306		if (pd->pd_disk_meta.sectors_hi != 0)
2307			meta->attributes |= INTEL_ATTR_2TB_DISK;
2308	}
2309
2310	/* Fill volumes and maps. */
2311	vi = 0;
2312	version = INTEL_VERSION_1000;
2313	TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
2314		pv = vol->v_md_data;
2315		if (vol->v_stopping)
2316			continue;
2317		mvol = intel_get_volume(meta, vi);
2318
2319		/* New metadata may have different volumes order. */
2320		pv->pv_volume_pos = vi;
2321
2322		for (sdi = 0; sdi < vol->v_disks_count; sdi++) {
2323			sd = &vol->v_subdisks[sdi];
2324			if (sd->sd_disk != NULL)
2325				break;
2326		}
2327		if (sdi >= vol->v_disks_count)
2328			panic("No any filled subdisk in volume");
2329		if (vol->v_mediasize >= 0x20000000000llu)
2330			meta->attributes |= INTEL_ATTR_2TB;
2331		if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0)
2332			meta->attributes |= INTEL_ATTR_RAID0;
2333		else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1)
2334			meta->attributes |= INTEL_ATTR_RAID1;
2335		else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5)
2336			meta->attributes |= INTEL_ATTR_RAID5;
2337		else if ((vol->v_disks_count & 1) == 0)
2338			meta->attributes |= INTEL_ATTR_RAID10;
2339		else
2340			meta->attributes |= INTEL_ATTR_RAID1E;
2341		if (pv->pv_cng)
2342			meta->attributes |= INTEL_ATTR_RAIDCNG;
2343		if (vol->v_strip_size > 131072)
2344			meta->attributes |= INTEL_ATTR_EXT_STRIP;
2345
2346		if (pv->pv_cng)
2347			cv = INTEL_VERSION_1206;
2348		else if (vol->v_disks_count > 4)
2349			cv = INTEL_VERSION_1204;
2350		else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5)
2351			cv = INTEL_VERSION_1202;
2352		else if (vol->v_disks_count > 2)
2353			cv = INTEL_VERSION_1201;
2354		else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1)
2355			cv = INTEL_VERSION_1100;
2356		else
2357			cv = INTEL_VERSION_1000;
2358		if (strcmp(cv, version) > 0)
2359			version = cv;
2360
2361		strlcpy(&mvol->name[0], vol->v_name, sizeof(mvol->name));
2362		mvol->total_sectors = vol->v_mediasize / sectorsize;
2363		mvol->state = (INTEL_ST_READ_COALESCING |
2364		    INTEL_ST_WRITE_COALESCING);
2365		mvol->tid = vol->v_global_id + 1;
2366		if (pv->pv_cng) {
2367			mvol->state |= INTEL_ST_CLONE_N_GO;
2368			if (pv->pv_cng_man_sync)
2369				mvol->state |= INTEL_ST_CLONE_MAN_SYNC;
2370			mvol->cng_master_disk = pv->pv_cng_master_disk;
2371			if (vol->v_subdisks[pv->pv_cng_master_disk].sd_state ==
2372			    G_RAID_SUBDISK_S_NONE)
2373				mvol->cng_state = INTEL_CNGST_MASTER_MISSING;
2374			else if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL)
2375				mvol->cng_state = INTEL_CNGST_NEEDS_UPDATE;
2376			else
2377				mvol->cng_state = INTEL_CNGST_UPDATED;
2378		}
2379
2380		/* Check for any recovery in progress. */
2381		state = G_RAID_SUBDISK_S_ACTIVE;
2382		pos = 0x7fffffffffffffffllu;
2383		stale = 0;
2384		for (sdi = 0; sdi < vol->v_disks_count; sdi++) {
2385			sd = &vol->v_subdisks[sdi];
2386			if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD)
2387				state = G_RAID_SUBDISK_S_REBUILD;
2388			else if (sd->sd_state == G_RAID_SUBDISK_S_RESYNC &&
2389			    state != G_RAID_SUBDISK_S_REBUILD)
2390				state = G_RAID_SUBDISK_S_RESYNC;
2391			else if (sd->sd_state == G_RAID_SUBDISK_S_STALE)
2392				stale = 1;
2393			if ((sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
2394			    sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
2395			     sd->sd_rebuild_pos < pos)
2396			        pos = sd->sd_rebuild_pos;
2397		}
2398		if (state == G_RAID_SUBDISK_S_REBUILD) {
2399			mvol->migr_state = 1;
2400			mvol->migr_type = INTEL_MT_REBUILD;
2401		} else if (state == G_RAID_SUBDISK_S_RESYNC) {
2402			mvol->migr_state = 1;
2403			/* mvol->migr_type = INTEL_MT_REPAIR; */
2404			mvol->migr_type = INTEL_MT_VERIFY;
2405			mvol->state |= INTEL_ST_VERIFY_AND_FIX;
2406		} else
2407			mvol->migr_state = 0;
2408		mvol->dirty = (vol->v_dirty || stale);
2409
2410		mmap0 = intel_get_map(mvol, 0);
2411
2412		/* Write map / common part of two maps. */
2413		intel_set_map_offset(mmap0, sd->sd_offset / sectorsize);
2414		intel_set_map_disk_sectors(mmap0, sd->sd_size / sectorsize);
2415		mmap0->strip_sectors = vol->v_strip_size / sectorsize;
2416		if (vol->v_state == G_RAID_VOLUME_S_BROKEN)
2417			mmap0->status = INTEL_S_FAILURE;
2418		else if (vol->v_state == G_RAID_VOLUME_S_DEGRADED)
2419			mmap0->status = INTEL_S_DEGRADED;
2420		else if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED)
2421		    == g_raid_nsubdisks(vol, -1))
2422			mmap0->status = INTEL_S_UNINITIALIZED;
2423		else
2424			mmap0->status = INTEL_S_READY;
2425		if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0)
2426			mmap0->type = INTEL_T_RAID0;
2427		else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
2428		    vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E)
2429			mmap0->type = INTEL_T_RAID1;
2430		else
2431			mmap0->type = INTEL_T_RAID5;
2432		mmap0->total_disks = vol->v_disks_count;
2433		if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1)
2434			mmap0->total_domains = vol->v_disks_count;
2435		else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E)
2436			mmap0->total_domains = 2;
2437		else
2438			mmap0->total_domains = 1;
2439		intel_set_map_stripe_count(mmap0,
2440		    sd->sd_size / vol->v_strip_size / mmap0->total_domains);
2441		mmap0->failed_disk_num = 0xff;
2442		mmap0->ddf = 1;
2443
2444		/* If there are two maps - copy common and update. */
2445		if (mvol->migr_state) {
2446			intel_set_vol_curr_migr_unit(mvol,
2447			    pos / vol->v_strip_size / mmap0->total_domains);
2448			mmap1 = intel_get_map(mvol, 1);
2449			memcpy(mmap1, mmap0, sizeof(struct intel_raid_map));
2450			mmap0->status = INTEL_S_READY;
2451		} else
2452			mmap1 = NULL;
2453
2454		/* Write disk indexes and put rebuild flags. */
2455		for (sdi = 0; sdi < vol->v_disks_count; sdi++) {
2456			sd = &vol->v_subdisks[sdi];
2457			pd = (struct g_raid_md_intel_perdisk *)
2458			    sd->sd_disk->d_md_data;
2459			mmap0->disk_idx[sdi] = pd->pd_disk_pos;
2460			if (mvol->migr_state)
2461				mmap1->disk_idx[sdi] = pd->pd_disk_pos;
2462			if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
2463			    sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
2464				mmap1->disk_idx[sdi] |= INTEL_DI_RBLD;
2465			} else if (sd->sd_state != G_RAID_SUBDISK_S_ACTIVE &&
2466			    sd->sd_state != G_RAID_SUBDISK_S_STALE &&
2467			    sd->sd_state != G_RAID_SUBDISK_S_UNINITIALIZED) {
2468				mmap0->disk_idx[sdi] |= INTEL_DI_RBLD;
2469				if (mvol->migr_state)
2470					mmap1->disk_idx[sdi] |= INTEL_DI_RBLD;
2471			}
2472			if ((sd->sd_state == G_RAID_SUBDISK_S_NONE ||
2473			     sd->sd_state == G_RAID_SUBDISK_S_FAILED ||
2474			     sd->sd_state == G_RAID_SUBDISK_S_REBUILD) &&
2475			    mmap0->failed_disk_num == 0xff) {
2476				mmap0->failed_disk_num = sdi;
2477				if (mvol->migr_state)
2478					mmap1->failed_disk_num = sdi;
2479			}
2480		}
2481		vi++;
2482	}
2483	meta->total_volumes = vi;
2484	if (vi > 1 || meta->attributes &
2485	     (INTEL_ATTR_EXT_STRIP | INTEL_ATTR_2TB_DISK | INTEL_ATTR_2TB))
2486		version = INTEL_VERSION_1300;
2487	if (strcmp(version, INTEL_VERSION_1300) < 0)
2488		meta->attributes &= INTEL_ATTR_CHECKSUM;
2489	memcpy(&meta->version[0], version, sizeof(INTEL_VERSION_1000) - 1);
2490
2491	/* We are done. Print meta data and store them to disks. */
2492	g_raid_md_intel_print(meta);
2493	if (mdi->mdio_meta != NULL)
2494		free(mdi->mdio_meta, M_MD_INTEL);
2495	mdi->mdio_meta = meta;
2496	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
2497		pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
2498		if (disk->d_state != G_RAID_DISK_S_ACTIVE)
2499			continue;
2500		if (pd->pd_meta != NULL) {
2501			free(pd->pd_meta, M_MD_INTEL);
2502			pd->pd_meta = NULL;
2503		}
2504		pd->pd_meta = intel_meta_copy(meta);
2505		intel_meta_write(disk->d_consumer, meta);
2506	}
2507	return (0);
2508}
2509
2510static int
2511g_raid_md_fail_disk_intel(struct g_raid_md_object *md,
2512    struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk)
2513{
2514	struct g_raid_softc *sc;
2515	struct g_raid_md_intel_object *mdi;
2516	struct g_raid_md_intel_perdisk *pd;
2517	struct g_raid_subdisk *sd;
2518
2519	sc = md->mdo_softc;
2520	mdi = (struct g_raid_md_intel_object *)md;
2521	pd = (struct g_raid_md_intel_perdisk *)tdisk->d_md_data;
2522
2523	/* We can't fail disk that is not a part of array now. */
2524	if (pd->pd_disk_pos < 0)
2525		return (-1);
2526
2527	/*
2528	 * Mark disk as failed in metadata and try to write that metadata
2529	 * to the disk itself to prevent it's later resurrection as STALE.
2530	 */
2531	mdi->mdio_meta->disk[pd->pd_disk_pos].flags = INTEL_F_FAILED;
2532	pd->pd_disk_meta.flags = INTEL_F_FAILED;
2533	g_raid_md_intel_print(mdi->mdio_meta);
2534	if (tdisk->d_consumer)
2535		intel_meta_write(tdisk->d_consumer, mdi->mdio_meta);
2536
2537	/* Change states. */
2538	g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED);
2539	TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) {
2540		g_raid_change_subdisk_state(sd,
2541		    G_RAID_SUBDISK_S_FAILED);
2542		g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED,
2543		    G_RAID_EVENT_SUBDISK);
2544	}
2545
2546	/* Write updated metadata to remaining disks. */
2547	g_raid_md_write_intel(md, NULL, NULL, tdisk);
2548
2549	/* Check if anything left except placeholders. */
2550	if (g_raid_ndisks(sc, -1) ==
2551	    g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE))
2552		g_raid_destroy_node(sc, 0);
2553	else
2554		g_raid_md_intel_refill(sc);
2555	return (0);
2556}
2557
2558static int
2559g_raid_md_free_disk_intel(struct g_raid_md_object *md,
2560    struct g_raid_disk *disk)
2561{
2562	struct g_raid_md_intel_perdisk *pd;
2563
2564	pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
2565	if (pd->pd_meta != NULL) {
2566		free(pd->pd_meta, M_MD_INTEL);
2567		pd->pd_meta = NULL;
2568	}
2569	free(pd, M_MD_INTEL);
2570	disk->d_md_data = NULL;
2571	return (0);
2572}
2573
2574static int
2575g_raid_md_free_volume_intel(struct g_raid_md_object *md,
2576    struct g_raid_volume *vol)
2577{
2578	struct g_raid_md_intel_pervolume *pv;
2579
2580	pv = (struct g_raid_md_intel_pervolume *)vol->v_md_data;
2581	free(pv, M_MD_INTEL);
2582	vol->v_md_data = NULL;
2583	return (0);
2584}
2585
2586static int
2587g_raid_md_free_intel(struct g_raid_md_object *md)
2588{
2589	struct g_raid_md_intel_object *mdi;
2590
2591	mdi = (struct g_raid_md_intel_object *)md;
2592	if (!mdi->mdio_started) {
2593		mdi->mdio_started = 0;
2594		callout_stop(&mdi->mdio_start_co);
2595		G_RAID_DEBUG1(1, md->mdo_softc,
2596		    "root_mount_rel %p", mdi->mdio_rootmount);
2597		root_mount_rel(mdi->mdio_rootmount);
2598		mdi->mdio_rootmount = NULL;
2599	}
2600	if (mdi->mdio_meta != NULL) {
2601		free(mdi->mdio_meta, M_MD_INTEL);
2602		mdi->mdio_meta = NULL;
2603	}
2604	return (0);
2605}
2606
/*
 * Invoke G_RAID_MD_DECLARE with the identifier intel and the string "Intel".
 * NOTE(review): the macro is defined outside this chunk; presumably it
 * declares/registers this metadata module with GEOM RAID under that name --
 * confirm against the macro definition.
 */
G_RAID_MD_DECLARE(intel, "Intel");
2608