1/*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
 * Copyright (c) 2000 - 2008 Søren Schmidt <sos@FreeBSD.org>
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30#include <sys/param.h>
31#include <sys/bio.h>
32#include <sys/endian.h>
33#include <sys/kernel.h>
34#include <sys/kobj.h>
35#include <sys/limits.h>
36#include <sys/lock.h>
37#include <sys/malloc.h>
38#include <sys/mutex.h>
39#include <sys/systm.h>
40#include <sys/taskqueue.h>
41#include <sys/disk.h>
42#include <geom/geom.h>
43#include <geom/geom_dbg.h>
44#include "geom/raid/g_raid.h"
45#include "g_raid_md_if.h"
46
/* Malloc type used for the metadata buffers allocated in this file. */
static MALLOC_DEFINE(M_MD_INTEL, "md_intel_data", "GEOM_RAID Intel metadata");
48
/*
 * One RAID map of a volume, as laid out inside the Intel metadata.
 *
 * The structure is variable-length: disk_idx[] really carries total_disks
 * entries, and whatever follows it (a second map or the next volume) starts
 * immediately after the last of them; see intel_get_map() and
 * intel_get_volume().  Several quantities are split into a low 32-bit field
 * and a matching *_hi field holding the upper 32 bits.
 */
struct intel_raid_map {
	uint32_t	offset;		/* Low 32 bits; see offset_hi. */
	uint32_t	disk_sectors;	/* Low 32 bits; see disk_sectors_hi. */
	uint32_t	stripe_count;	/* Low 32 bits; see stripe_count_hi. */
	uint16_t	strip_sectors;
	uint8_t		status;		/* One of INTEL_S_*. */
#define INTEL_S_READY           0x00
#define INTEL_S_UNINITIALIZED   0x01
#define INTEL_S_DEGRADED        0x02
#define INTEL_S_FAILURE         0x03

	uint8_t		type;		/* One of INTEL_T_*. */
#define INTEL_T_RAID0           0x00
#define INTEL_T_RAID1           0x01
#define INTEL_T_RAID5           0x05

	uint8_t		total_disks;	/* Number of disk_idx[] entries. */
	uint8_t		total_domains;
	uint8_t		failed_disk_num;
	uint8_t		ddf;
	uint32_t	offset_hi;	/* Upper 32 bits of offset. */
	uint32_t	disk_sectors_hi; /* Upper 32 bits of disk_sectors. */
	uint32_t	stripe_count_hi; /* Upper 32 bits of stripe_count. */
	uint32_t	filler_2[4];
	uint32_t	disk_idx[1];	/* total_disks entries. */
/* Bits of a disk_idx[] entry holding the index into the disk table. */
#define INTEL_DI_IDX	0x00ffffff
/* Per-entry flag bit; presumably "rebuild in progress" -- TODO confirm. */
#define INTEL_DI_RBLD	0x01000000
} __packed;
77
/*
 * Description of one volume inside the Intel metadata.
 *
 * Volumes follow the disk table of struct intel_raid_conf and are
 * variable-length: each one ends with one map, or two maps while
 * migr_state is non-zero (see intel_get_map() and intel_get_volume()).
 */
struct intel_raid_vol {
	uint8_t		name[16];
	uint64_t	total_sectors __packed;
	uint32_t	state;		/* Bitmask of INTEL_ST_*. */
#define INTEL_ST_BOOTABLE		0x00000001
#define INTEL_ST_BOOT_DEVICE		0x00000002
#define INTEL_ST_READ_COALESCING	0x00000004
#define INTEL_ST_WRITE_COALESCING	0x00000008
#define INTEL_ST_LAST_SHUTDOWN_DIRTY	0x00000010
#define INTEL_ST_HIDDEN_AT_BOOT		0x00000020
#define INTEL_ST_CURRENTLY_HIDDEN	0x00000040
#define INTEL_ST_VERIFY_AND_FIX		0x00000080
#define INTEL_ST_MAP_STATE_UNINIT	0x00000100
#define INTEL_ST_NO_AUTO_RECOVERY	0x00000200
#define INTEL_ST_CLONE_N_GO		0x00000400
#define INTEL_ST_CLONE_MAN_SYNC		0x00000800
#define INTEL_ST_CNG_MASTER_DISK_NUM	0x00001000
	uint32_t	reserved;
	uint8_t		migr_priority;
	uint8_t		num_sub_vols;
	uint8_t		tid;
	uint8_t		cng_master_disk;
	uint16_t	cache_policy;
	uint8_t		cng_state;	/* One of INTEL_CNGST_*. */
#define INTEL_CNGST_UPDATED		0
#define INTEL_CNGST_NEEDS_UPDATE	1
#define INTEL_CNGST_MASTER_MISSING	2
	uint8_t		cng_sub_state;
	uint32_t	filler_0[10];

	uint32_t	curr_migr_unit;	/* Low 32 bits; see curr_migr_unit_hi. */
	uint32_t	checkpoint_id;
	uint8_t		migr_state;	/* Non-zero: a second map is present. */
	uint8_t		migr_type;	/* One of INTEL_MT_*. */
#define INTEL_MT_INIT		0
#define INTEL_MT_REBUILD	1
#define INTEL_MT_VERIFY		2
#define INTEL_MT_GEN_MIGR	3
#define INTEL_MT_STATE_CHANGE	4
#define INTEL_MT_REPAIR		5
	uint8_t		dirty;
	uint8_t		fs_state;
	uint16_t	verify_errors;
	uint16_t	bad_blocks;
	uint32_t	curr_migr_unit_hi; /* Upper 32 bits of curr_migr_unit. */
	uint32_t	filler_1[3];
	struct intel_raid_map map[1];	/* 2 entries if migr_state != 0. */
} __packed;
126
/*
 * One entry of the disk table in the Intel metadata.  Disks are matched
 * against this table by their serial (see intel_meta_find_disk()).
 */
struct intel_raid_disk {
#define INTEL_SERIAL_LEN	16
	uint8_t		serial[INTEL_SERIAL_LEN];
	uint32_t	sectors;	/* Low 32 bits; see sectors_hi. */
	uint32_t	id;
	uint32_t	flags;		/* Bitmask of INTEL_F_*. */
#define INTEL_F_SPARE		0x01
#define INTEL_F_ASSIGNED	0x02
#define INTEL_F_FAILED		0x04
#define INTEL_F_ONLINE		0x08
#define INTEL_F_DISABLED	0x80
	uint32_t	owner_cfg_num;
	uint32_t	sectors_hi;	/* Upper 32 bits of sectors. */
	uint32_t	filler[3];
} __packed;
142
/*
 * Header ("anchor") of the Intel metadata.
 *
 * The header is followed by total_disks disk entries and then by
 * total_volumes variable-length struct intel_raid_vol records; the whole
 * block is config_size bytes long.
 */
struct intel_raid_conf {
	uint8_t		intel_id[24];	/* Must start with INTEL_MAGIC. */
#define INTEL_MAGIC             "Intel Raid ISM Cfg Sig. "

	uint8_t		version[6];	/* One of INTEL_VERSION_*, no NUL. */
#define INTEL_VERSION_1000	"1.0.00"	/* RAID0 */
#define INTEL_VERSION_1100	"1.1.00"	/* RAID1 */
#define INTEL_VERSION_1200	"1.2.00"	/* Many volumes */
#define INTEL_VERSION_1201	"1.2.01"	/* 3 or 4 disks */
#define INTEL_VERSION_1202	"1.2.02"	/* RAID5 */
#define INTEL_VERSION_1204	"1.2.04"	/* 5 or 6 disks */
#define INTEL_VERSION_1206	"1.2.06"	/* CNG */
#define INTEL_VERSION_1300	"1.3.00"	/* Attributes */

	uint8_t		dummy_0[2];
	/*
	 * Sum of all 32-bit words of the config_size bytes of metadata,
	 * computed with this field itself counted as zero.
	 */
	uint32_t	checksum;
	uint32_t	config_size;	/* Size of the whole metadata, bytes. */
	uint32_t	config_id;
	uint32_t	generation;
	uint32_t	error_log_size;
	uint32_t	attributes;	/* Bitmask of INTEL_ATTR_*. */
#define INTEL_ATTR_RAID0	0x00000001
#define INTEL_ATTR_RAID1	0x00000002
#define INTEL_ATTR_RAID10	0x00000004
#define INTEL_ATTR_RAID1E	0x00000008
#define INTEL_ATTR_RAID5	0x00000010
#define INTEL_ATTR_RAIDCNG	0x00000020
#define INTEL_ATTR_EXT_STRIP	0x00000040
#define INTEL_ATTR_NVM_CACHE	0x02000000
#define INTEL_ATTR_2TB_DISK	0x04000000
#define INTEL_ATTR_BBM		0x08000000
#define INTEL_ATTR_NVM_CACHE2	0x10000000
#define INTEL_ATTR_2TB		0x20000000
#define INTEL_ATTR_PM		0x40000000
#define INTEL_ATTR_CHECKSUM	0x80000000

	uint8_t		total_disks;	/* Number of disk[] entries. */
	uint8_t		total_volumes;	/* Number of volumes after disk[]. */
	uint8_t		error_log_pos;
	uint8_t		dummy_2[1];
	uint32_t	cache_size;
	uint32_t	orig_config_id;
	uint32_t	pwr_cycle_count;
	uint32_t	bbm_log_size;
	uint32_t	filler_0[35];
	struct intel_raid_disk	disk[1];	/* total_disks entries. */
	/* Here goes total_volumes of struct intel_raid_vol. */
} __packed;
191
/*
 * Attribute bits this implementation accepts.  intel_meta_read() rejects
 * version 1.3.00 metadata that has any other attribute bit set.
 */
#define INTEL_ATTR_SUPPORTED	( INTEL_ATTR_RAID0 | INTEL_ATTR_RAID1 |	\
    INTEL_ATTR_RAID10 | INTEL_ATTR_RAID1E | INTEL_ATTR_RAID5 |		\
    INTEL_ATTR_RAIDCNG | INTEL_ATTR_EXT_STRIP | INTEL_ATTR_2TB_DISK |	\
    INTEL_ATTR_2TB | INTEL_ATTR_PM | INTEL_ATTR_CHECKSUM )
196
/*
 * Upper bound, in bytes, of the metadata for an array of ndisks disks:
 * the header (which already contains one disk entry), the remaining disk
 * entries, two volumes with two maps each (one map is already part of each
 * volume, hence two extra), and the remaining disk_idx[] slots of all four
 * maps.
 *
 * The argument is parenthesized so that expression arguments such as
 * "a ? b : c" or "n << 1" expand correctly.
 */
#define INTEL_MAX_MD_SIZE(ndisks)				\
    (sizeof(struct intel_raid_conf) +				\
     sizeof(struct intel_raid_disk) * ((ndisks) - 1) +		\
     sizeof(struct intel_raid_vol) * 2 +			\
     sizeof(struct intel_raid_map) * 2 +			\
     sizeof(uint32_t) * ((ndisks) - 1) * 4)
203
/* Per-disk private data of this metadata module (hangs off d_md_data). */
struct g_raid_md_intel_perdisk {
	/* NOTE(review): presumably the metadata read from this disk -- confirm. */
	struct intel_raid_conf	*pd_meta;
	/* Position of the disk in the metadata disk table. */
	int			 pd_disk_pos;
	/* This disk's own disk-table entry (serial, size, flags). */
	struct intel_raid_disk	 pd_disk_meta;
};
209
/* Per-volume private data of this metadata module (hangs off v_md_data). */
struct g_raid_md_intel_pervolume {
	/* Index of the volume inside the metadata (see intel_get_volume()). */
	int			 pv_volume_pos;
	/*
	 * NOTE(review): the pv_cng* fields presumably mirror the CLONE_N_GO
	 * state bits and cng_master_disk of struct intel_raid_vol -- confirm
	 * against the code that fills them in.
	 */
	int			 pv_cng;
	int			 pv_cng_man_sync;
	int			 pv_cng_master_disk;
};
216
/*
 * Per-array state of this metadata module.  mdio_base comes first so that
 * a struct g_raid_md_object pointer can be cast to this type.
 */
struct g_raid_md_intel_object {
	struct g_raid_md_object	 mdio_base;
	/*
	 * NOTE(review): the next three fields presumably mirror config_id,
	 * orig_config_id and generation of the metadata -- confirm.
	 */
	uint32_t		 mdio_config_id;
	uint32_t		 mdio_orig_config_id;
	uint32_t		 mdio_generation;
	struct intel_raid_conf	*mdio_meta;	/* Metadata of the array. */
	struct callout		 mdio_start_co;	/* STARTING state timer. */
	int			 mdio_disks_present;
	int			 mdio_started;	/* Non-zero once the array started. */
	int			 mdio_incomplete;
	struct root_hold_token	*mdio_rootmount; /* Root mount delay token. */
};
229
/*
 * Forward declarations of the metadata-class methods that are registered
 * in g_raid_md_intel_methods[] below.
 */
static g_raid_md_create_t g_raid_md_create_intel;
static g_raid_md_taste_t g_raid_md_taste_intel;
static g_raid_md_event_t g_raid_md_event_intel;
static g_raid_md_ctl_t g_raid_md_ctl_intel;
static g_raid_md_write_t g_raid_md_write_intel;
static g_raid_md_fail_disk_t g_raid_md_fail_disk_intel;
static g_raid_md_free_disk_t g_raid_md_free_disk_intel;
static g_raid_md_free_volume_t g_raid_md_free_volume_intel;
static g_raid_md_free_t g_raid_md_free_intel;
239
/* kobj method table binding the g_raid_md interface to this module. */
static kobj_method_t g_raid_md_intel_methods[] = {
	KOBJMETHOD(g_raid_md_create,	g_raid_md_create_intel),
	KOBJMETHOD(g_raid_md_taste,	g_raid_md_taste_intel),
	KOBJMETHOD(g_raid_md_event,	g_raid_md_event_intel),
	KOBJMETHOD(g_raid_md_ctl,	g_raid_md_ctl_intel),
	KOBJMETHOD(g_raid_md_write,	g_raid_md_write_intel),
	KOBJMETHOD(g_raid_md_fail_disk,	g_raid_md_fail_disk_intel),
	KOBJMETHOD(g_raid_md_free_disk,	g_raid_md_free_disk_intel),
	KOBJMETHOD(g_raid_md_free_volume,	g_raid_md_free_volume_intel),
	KOBJMETHOD(g_raid_md_free,	g_raid_md_free_intel),
	{ 0, 0 }	/* Table terminator. */
};
252
/*
 * Class descriptor of the "Intel" metadata format: class name, method
 * table and size of the per-array object, followed by the g_raid specific
 * fields.
 * NOTE(review): exact meaning of mdc_enable/mdc_priority is defined by
 * g_raid.h, not visible here -- confirm before changing the values.
 */
static struct g_raid_md_class g_raid_md_intel_class = {
	"Intel",
	g_raid_md_intel_methods,
	sizeof(struct g_raid_md_intel_object),
	.mdc_enable = 1,
	.mdc_priority = 100
};
260
261static struct intel_raid_map *
262intel_get_map(struct intel_raid_vol *mvol, int i)
263{
264	struct intel_raid_map *mmap;
265
266	if (i > (mvol->migr_state ? 1 : 0))
267		return (NULL);
268	mmap = &mvol->map[0];
269	for (; i > 0; i--) {
270		mmap = (struct intel_raid_map *)
271		    &mmap->disk_idx[mmap->total_disks];
272	}
273	return ((struct intel_raid_map *)mmap);
274}
275
276static struct intel_raid_vol *
277intel_get_volume(struct intel_raid_conf *meta, int i)
278{
279	struct intel_raid_vol *mvol;
280	struct intel_raid_map *mmap;
281
282	if (i > 1)
283		return (NULL);
284	mvol = (struct intel_raid_vol *)&meta->disk[meta->total_disks];
285	for (; i > 0; i--) {
286		mmap = intel_get_map(mvol, mvol->migr_state ? 1 : 0);
287		mvol = (struct intel_raid_vol *)
288		    &mmap->disk_idx[mmap->total_disks];
289	}
290	return (mvol);
291}
292
293static off_t
294intel_get_map_offset(struct intel_raid_map *mmap)
295{
296	off_t offset = (off_t)mmap->offset_hi << 32;
297
298	offset += mmap->offset;
299	return (offset);
300}
301
302static void
303intel_set_map_offset(struct intel_raid_map *mmap, off_t offset)
304{
305
306	mmap->offset = offset & 0xffffffff;
307	mmap->offset_hi = offset >> 32;
308}
309
310static off_t
311intel_get_map_disk_sectors(struct intel_raid_map *mmap)
312{
313	off_t disk_sectors = (off_t)mmap->disk_sectors_hi << 32;
314
315	disk_sectors += mmap->disk_sectors;
316	return (disk_sectors);
317}
318
319static void
320intel_set_map_disk_sectors(struct intel_raid_map *mmap, off_t disk_sectors)
321{
322
323	mmap->disk_sectors = disk_sectors & 0xffffffff;
324	mmap->disk_sectors_hi = disk_sectors >> 32;
325}
326
327static void
328intel_set_map_stripe_count(struct intel_raid_map *mmap, off_t stripe_count)
329{
330
331	mmap->stripe_count = stripe_count & 0xffffffff;
332	mmap->stripe_count_hi = stripe_count >> 32;
333}
334
335static off_t
336intel_get_disk_sectors(struct intel_raid_disk *disk)
337{
338	off_t sectors = (off_t)disk->sectors_hi << 32;
339
340	sectors += disk->sectors;
341	return (sectors);
342}
343
344static void
345intel_set_disk_sectors(struct intel_raid_disk *disk, off_t sectors)
346{
347
348	disk->sectors = sectors & 0xffffffff;
349	disk->sectors_hi = sectors >> 32;
350}
351
352static off_t
353intel_get_vol_curr_migr_unit(struct intel_raid_vol *vol)
354{
355	off_t curr_migr_unit = (off_t)vol->curr_migr_unit_hi << 32;
356
357	curr_migr_unit += vol->curr_migr_unit;
358	return (curr_migr_unit);
359}
360
361static void
362intel_set_vol_curr_migr_unit(struct intel_raid_vol *vol, off_t curr_migr_unit)
363{
364
365	vol->curr_migr_unit = curr_migr_unit & 0xffffffff;
366	vol->curr_migr_unit_hi = curr_migr_unit >> 32;
367}
368
369static char *
370intel_status2str(int status)
371{
372
373	switch (status) {
374	case INTEL_S_READY:
375		return ("READY");
376	case INTEL_S_UNINITIALIZED:
377		return ("UNINITIALIZED");
378	case INTEL_S_DEGRADED:
379		return ("DEGRADED");
380	case INTEL_S_FAILURE:
381		return ("FAILURE");
382	default:
383		return ("UNKNOWN");
384	}
385}
386
387static char *
388intel_type2str(int type)
389{
390
391	switch (type) {
392	case INTEL_T_RAID0:
393		return ("RAID0");
394	case INTEL_T_RAID1:
395		return ("RAID1");
396	case INTEL_T_RAID5:
397		return ("RAID5");
398	default:
399		return ("UNKNOWN");
400	}
401}
402
403static char *
404intel_cngst2str(int cng_state)
405{
406
407	switch (cng_state) {
408	case INTEL_CNGST_UPDATED:
409		return ("UPDATED");
410	case INTEL_CNGST_NEEDS_UPDATE:
411		return ("NEEDS_UPDATE");
412	case INTEL_CNGST_MASTER_MISSING:
413		return ("MASTER_MISSING");
414	default:
415		return ("UNKNOWN");
416	}
417}
418
419static char *
420intel_mt2str(int type)
421{
422
423	switch (type) {
424	case INTEL_MT_INIT:
425		return ("INIT");
426	case INTEL_MT_REBUILD:
427		return ("REBUILD");
428	case INTEL_MT_VERIFY:
429		return ("VERIFY");
430	case INTEL_MT_GEN_MIGR:
431		return ("GEN_MIGR");
432	case INTEL_MT_STATE_CHANGE:
433		return ("STATE_CHANGE");
434	case INTEL_MT_REPAIR:
435		return ("REPAIR");
436	default:
437		return ("UNKNOWN");
438	}
439}
440
441static void
442g_raid_md_intel_print(struct intel_raid_conf *meta)
443{
444	struct intel_raid_vol *mvol;
445	struct intel_raid_map *mmap;
446	int i, j, k;
447
448	if (g_raid_debug < 1)
449		return;
450
451	printf("********* ATA Intel MatrixRAID Metadata *********\n");
452	printf("intel_id            <%.24s>\n", meta->intel_id);
453	printf("version             <%.6s>\n", meta->version);
454	printf("checksum            0x%08x\n", meta->checksum);
455	printf("config_size         0x%08x\n", meta->config_size);
456	printf("config_id           0x%08x\n", meta->config_id);
457	printf("generation          0x%08x\n", meta->generation);
458	printf("error_log_size      %d\n", meta->error_log_size);
459	printf("attributes          0x%b\n", meta->attributes,
460		"\020"
461		"\001RAID0"
462		"\002RAID1"
463		"\003RAID10"
464		"\004RAID1E"
465		"\005RAID15"
466		"\006RAIDCNG"
467		"\007EXT_STRIP"
468		"\032NVM_CACHE"
469		"\0332TB_DISK"
470		"\034BBM"
471		"\035NVM_CACHE"
472		"\0362TB"
473		"\037PM"
474		"\040CHECKSUM");
475	printf("total_disks         %u\n", meta->total_disks);
476	printf("total_volumes       %u\n", meta->total_volumes);
477	printf("error_log_pos       %u\n", meta->error_log_pos);
478	printf("cache_size          %u\n", meta->cache_size);
479	printf("orig_config_id      0x%08x\n", meta->orig_config_id);
480	printf("pwr_cycle_count     %u\n", meta->pwr_cycle_count);
481	printf("bbm_log_size        %u\n", meta->bbm_log_size);
482	printf("Flags: S - Spare, A - Assigned, F - Failed, O - Online, D - Disabled\n");
483	printf("DISK#   serial disk_sectors disk_sectors_hi disk_id flags owner\n");
484	for (i = 0; i < meta->total_disks; i++ ) {
485		printf("    %d   <%.16s> %u %u 0x%08x 0x%b %08x\n", i,
486		    meta->disk[i].serial, meta->disk[i].sectors,
487		    meta->disk[i].sectors_hi, meta->disk[i].id,
488		    meta->disk[i].flags, "\20\01S\02A\03F\04O\05D",
489		    meta->disk[i].owner_cfg_num);
490	}
491	for (i = 0; i < meta->total_volumes; i++) {
492		mvol = intel_get_volume(meta, i);
493		printf(" ****** Volume %d ******\n", i);
494		printf(" name               %.16s\n", mvol->name);
495		printf(" total_sectors      %ju\n", mvol->total_sectors);
496		printf(" state              0x%b\n", mvol->state,
497			"\020"
498			"\001BOOTABLE"
499			"\002BOOT_DEVICE"
500			"\003READ_COALESCING"
501			"\004WRITE_COALESCING"
502			"\005LAST_SHUTDOWN_DIRTY"
503			"\006HIDDEN_AT_BOOT"
504			"\007CURRENTLY_HIDDEN"
505			"\010VERIFY_AND_FIX"
506			"\011MAP_STATE_UNINIT"
507			"\012NO_AUTO_RECOVERY"
508			"\013CLONE_N_GO"
509			"\014CLONE_MAN_SYNC"
510			"\015CNG_MASTER_DISK_NUM");
511		printf(" reserved           %u\n", mvol->reserved);
512		printf(" migr_priority      %u\n", mvol->migr_priority);
513		printf(" num_sub_vols       %u\n", mvol->num_sub_vols);
514		printf(" tid                %u\n", mvol->tid);
515		printf(" cng_master_disk    %u\n", mvol->cng_master_disk);
516		printf(" cache_policy       %u\n", mvol->cache_policy);
517		printf(" cng_state          %u (%s)\n", mvol->cng_state,
518			intel_cngst2str(mvol->cng_state));
519		printf(" cng_sub_state      %u\n", mvol->cng_sub_state);
520		printf(" curr_migr_unit     %u\n", mvol->curr_migr_unit);
521		printf(" curr_migr_unit_hi  %u\n", mvol->curr_migr_unit_hi);
522		printf(" checkpoint_id      %u\n", mvol->checkpoint_id);
523		printf(" migr_state         %u\n", mvol->migr_state);
524		printf(" migr_type          %u (%s)\n", mvol->migr_type,
525			intel_mt2str(mvol->migr_type));
526		printf(" dirty              %u\n", mvol->dirty);
527		printf(" fs_state           %u\n", mvol->fs_state);
528		printf(" verify_errors      %u\n", mvol->verify_errors);
529		printf(" bad_blocks         %u\n", mvol->bad_blocks);
530
531		for (j = 0; j < (mvol->migr_state ? 2 : 1); j++) {
532			printf("  *** Map %d ***\n", j);
533			mmap = intel_get_map(mvol, j);
534			printf("  offset            %u\n", mmap->offset);
535			printf("  offset_hi         %u\n", mmap->offset_hi);
536			printf("  disk_sectors      %u\n", mmap->disk_sectors);
537			printf("  disk_sectors_hi   %u\n", mmap->disk_sectors_hi);
538			printf("  stripe_count      %u\n", mmap->stripe_count);
539			printf("  stripe_count_hi   %u\n", mmap->stripe_count_hi);
540			printf("  strip_sectors     %u\n", mmap->strip_sectors);
541			printf("  status            %u (%s)\n", mmap->status,
542				intel_status2str(mmap->status));
543			printf("  type              %u (%s)\n", mmap->type,
544				intel_type2str(mmap->type));
545			printf("  total_disks       %u\n", mmap->total_disks);
546			printf("  total_domains     %u\n", mmap->total_domains);
547			printf("  failed_disk_num   %u\n", mmap->failed_disk_num);
548			printf("  ddf               %u\n", mmap->ddf);
549			printf("  disk_idx         ");
550			for (k = 0; k < mmap->total_disks; k++)
551				printf(" 0x%08x", mmap->disk_idx[k]);
552			printf("\n");
553		}
554	}
555	printf("=================================================\n");
556}
557
558static struct intel_raid_conf *
559intel_meta_copy(struct intel_raid_conf *meta)
560{
561	struct intel_raid_conf *nmeta;
562
563	nmeta = malloc(meta->config_size, M_MD_INTEL, M_WAITOK);
564	memcpy(nmeta, meta, meta->config_size);
565	return (nmeta);
566}
567
568static int
569intel_meta_find_disk(struct intel_raid_conf *meta, char *serial)
570{
571	int pos;
572
573	for (pos = 0; pos < meta->total_disks; pos++) {
574		if (strncmp(meta->disk[pos].serial,
575		    serial, INTEL_SERIAL_LEN) == 0)
576			return (pos);
577	}
578	return (-1);
579}
580
581static struct intel_raid_conf *
582intel_meta_read(struct g_consumer *cp)
583{
584	struct g_provider *pp;
585	struct intel_raid_conf *meta;
586	struct intel_raid_vol *mvol;
587	struct intel_raid_map *mmap, *mmap1;
588	char *buf;
589	int error, i, j, k, left, size;
590	uint32_t checksum, *ptr;
591
592	pp = cp->provider;
593	if (pp->sectorsize < sizeof(*meta))
594		return (NULL);
595	/* Read the anchor sector. */
596	buf = g_read_data(cp,
597	    pp->mediasize - pp->sectorsize * 2, pp->sectorsize, &error);
598	if (buf == NULL) {
599		G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).",
600		    pp->name, error);
601		return (NULL);
602	}
603	meta = (struct intel_raid_conf *)buf;
604
605	/* Check if this is an Intel RAID struct */
606	if (strncmp(meta->intel_id, INTEL_MAGIC, strlen(INTEL_MAGIC))) {
607		G_RAID_DEBUG(1, "Intel signature check failed on %s", pp->name);
608		g_free(buf);
609		return (NULL);
610	}
611	if (meta->config_size > 65536 ||
612	    meta->config_size < sizeof(struct intel_raid_conf)) {
613		G_RAID_DEBUG(1, "Intel metadata size looks wrong: %d",
614		    meta->config_size);
615		g_free(buf);
616		return (NULL);
617	}
618	size = meta->config_size;
619	meta = malloc(size, M_MD_INTEL, M_WAITOK);
620	memcpy(meta, buf, min(size, pp->sectorsize));
621	g_free(buf);
622
623	/* Read all the rest, if needed. */
624	if (meta->config_size > pp->sectorsize) {
625		left = (meta->config_size - 1) / pp->sectorsize;
626		buf = g_read_data(cp,
627		    pp->mediasize - pp->sectorsize * (2 + left),
628		    pp->sectorsize * left, &error);
629		if (buf == NULL) {
630			G_RAID_DEBUG(1, "Cannot read remaining metadata"
631			    " part from %s (error=%d).",
632			    pp->name, error);
633			free(meta, M_MD_INTEL);
634			return (NULL);
635		}
636		memcpy(((char *)meta) + pp->sectorsize, buf,
637		    pp->sectorsize * left);
638		g_free(buf);
639	}
640
641	/* Check metadata checksum. */
642	for (checksum = 0, ptr = (uint32_t *)meta, i = 0;
643	    i < (meta->config_size / sizeof(uint32_t)); i++) {
644		checksum += *ptr++;
645	}
646	checksum -= meta->checksum;
647	if (checksum != meta->checksum) {
648		G_RAID_DEBUG(1, "Intel checksum check failed on %s", pp->name);
649		free(meta, M_MD_INTEL);
650		return (NULL);
651	}
652
653	/* Validate metadata size. */
654	size = sizeof(struct intel_raid_conf) +
655	    sizeof(struct intel_raid_disk) * (meta->total_disks - 1) +
656	    sizeof(struct intel_raid_vol) * meta->total_volumes;
657	if (size > meta->config_size) {
658badsize:
659		G_RAID_DEBUG(1, "Intel metadata size incorrect %d < %d",
660		    meta->config_size, size);
661		free(meta, M_MD_INTEL);
662		return (NULL);
663	}
664	for (i = 0; i < meta->total_volumes; i++) {
665		mvol = intel_get_volume(meta, i);
666		mmap = intel_get_map(mvol, 0);
667		size += 4 * (mmap->total_disks - 1);
668		if (size > meta->config_size)
669			goto badsize;
670		if (mvol->migr_state) {
671			size += sizeof(struct intel_raid_map);
672			if (size > meta->config_size)
673				goto badsize;
674			mmap = intel_get_map(mvol, 1);
675			size += 4 * (mmap->total_disks - 1);
676			if (size > meta->config_size)
677				goto badsize;
678		}
679	}
680
681	g_raid_md_intel_print(meta);
682
683	if (strncmp(meta->version, INTEL_VERSION_1300, 6) > 0) {
684		G_RAID_DEBUG(1, "Intel unsupported version: '%.6s'",
685		    meta->version);
686		free(meta, M_MD_INTEL);
687		return (NULL);
688	}
689
690	if (strncmp(meta->version, INTEL_VERSION_1300, 6) >= 0 &&
691	    (meta->attributes & ~INTEL_ATTR_SUPPORTED) != 0) {
692		G_RAID_DEBUG(1, "Intel unsupported attributes: 0x%08x",
693		    meta->attributes & ~INTEL_ATTR_SUPPORTED);
694		free(meta, M_MD_INTEL);
695		return (NULL);
696	}
697
698	/* Validate disk indexes. */
699	for (i = 0; i < meta->total_volumes; i++) {
700		mvol = intel_get_volume(meta, i);
701		for (j = 0; j < (mvol->migr_state ? 2 : 1); j++) {
702			mmap = intel_get_map(mvol, j);
703			for (k = 0; k < mmap->total_disks; k++) {
704				if ((mmap->disk_idx[k] & INTEL_DI_IDX) >
705				    meta->total_disks) {
706					G_RAID_DEBUG(1, "Intel metadata disk"
707					    " index %d too big (>%d)",
708					    mmap->disk_idx[k] & INTEL_DI_IDX,
709					    meta->total_disks);
710					free(meta, M_MD_INTEL);
711					return (NULL);
712				}
713			}
714		}
715	}
716
717	/* Validate migration types. */
718	for (i = 0; i < meta->total_volumes; i++) {
719		mvol = intel_get_volume(meta, i);
720		/* Deny unknown migration types. */
721		if (mvol->migr_state &&
722		    mvol->migr_type != INTEL_MT_INIT &&
723		    mvol->migr_type != INTEL_MT_REBUILD &&
724		    mvol->migr_type != INTEL_MT_VERIFY &&
725		    mvol->migr_type != INTEL_MT_GEN_MIGR &&
726		    mvol->migr_type != INTEL_MT_REPAIR) {
727			G_RAID_DEBUG(1, "Intel metadata has unsupported"
728			    " migration type %d", mvol->migr_type);
729			free(meta, M_MD_INTEL);
730			return (NULL);
731		}
732		/* Deny general migrations except SINGLE->RAID1. */
733		if (mvol->migr_state &&
734		    mvol->migr_type == INTEL_MT_GEN_MIGR) {
735			mmap = intel_get_map(mvol, 0);
736			mmap1 = intel_get_map(mvol, 1);
737			if (mmap1->total_disks != 1 ||
738			    mmap->type != INTEL_T_RAID1 ||
739			    mmap->total_disks != 2 ||
740			    mmap->offset != mmap1->offset ||
741			    mmap->disk_sectors != mmap1->disk_sectors ||
742			    mmap->total_domains != mmap->total_disks ||
743			    mmap->offset_hi != mmap1->offset_hi ||
744			    mmap->disk_sectors_hi != mmap1->disk_sectors_hi ||
745			    (mmap->disk_idx[0] != mmap1->disk_idx[0] &&
746			     mmap->disk_idx[0] != mmap1->disk_idx[1])) {
747				G_RAID_DEBUG(1, "Intel metadata has unsupported"
748				    " variant of general migration");
749				free(meta, M_MD_INTEL);
750				return (NULL);
751			}
752		}
753	}
754
755	return (meta);
756}
757
758static int
759intel_meta_write(struct g_consumer *cp, struct intel_raid_conf *meta)
760{
761	struct g_provider *pp;
762	char *buf;
763	int error, i, sectors;
764	uint32_t checksum, *ptr;
765
766	pp = cp->provider;
767
768	/* Recalculate checksum for case if metadata were changed. */
769	meta->checksum = 0;
770	for (checksum = 0, ptr = (uint32_t *)meta, i = 0;
771	    i < (meta->config_size / sizeof(uint32_t)); i++) {
772		checksum += *ptr++;
773	}
774	meta->checksum = checksum;
775
776	/* Create and fill buffer. */
777	sectors = howmany(meta->config_size, pp->sectorsize);
778	buf = malloc(sectors * pp->sectorsize, M_MD_INTEL, M_WAITOK | M_ZERO);
779	if (sectors > 1) {
780		memcpy(buf, ((char *)meta) + pp->sectorsize,
781		    (sectors - 1) * pp->sectorsize);
782	}
783	memcpy(buf + (sectors - 1) * pp->sectorsize, meta, pp->sectorsize);
784
785	error = g_write_data(cp,
786	    pp->mediasize - pp->sectorsize * (1 + sectors),
787	    buf, pp->sectorsize * sectors);
788	if (error != 0) {
789		G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).",
790		    pp->name, error);
791	}
792
793	free(buf, M_MD_INTEL);
794	return (error);
795}
796
797static int
798intel_meta_erase(struct g_consumer *cp)
799{
800	struct g_provider *pp;
801	char *buf;
802	int error;
803
804	pp = cp->provider;
805	buf = malloc(pp->sectorsize, M_MD_INTEL, M_WAITOK | M_ZERO);
806	error = g_write_data(cp,
807	    pp->mediasize - 2 * pp->sectorsize,
808	    buf, pp->sectorsize);
809	if (error != 0) {
810		G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).",
811		    pp->name, error);
812	}
813	free(buf, M_MD_INTEL);
814	return (error);
815}
816
817static int
818intel_meta_write_spare(struct g_consumer *cp, struct intel_raid_disk *d)
819{
820	struct intel_raid_conf *meta;
821	int error;
822
823	/* Fill anchor and single disk. */
824	meta = malloc(INTEL_MAX_MD_SIZE(1), M_MD_INTEL, M_WAITOK | M_ZERO);
825	memcpy(&meta->intel_id[0], INTEL_MAGIC, sizeof(INTEL_MAGIC) - 1);
826	memcpy(&meta->version[0], INTEL_VERSION_1000,
827	    sizeof(INTEL_VERSION_1000) - 1);
828	meta->config_size = INTEL_MAX_MD_SIZE(1);
829	meta->config_id = meta->orig_config_id = arc4random();
830	meta->generation = 1;
831	meta->total_disks = 1;
832	meta->disk[0] = *d;
833	error = intel_meta_write(cp, meta);
834	free(meta, M_MD_INTEL);
835	return (error);
836}
837
838static struct g_raid_disk *
839g_raid_md_intel_get_disk(struct g_raid_softc *sc, int id)
840{
841	struct g_raid_disk	*disk;
842	struct g_raid_md_intel_perdisk *pd;
843
844	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
845		pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
846		if (pd->pd_disk_pos == id)
847			break;
848	}
849	return (disk);
850}
851
852static int
853g_raid_md_intel_supported(int level, int qual, int disks, int force)
854{
855
856	switch (level) {
857	case G_RAID_VOLUME_RL_RAID0:
858		if (disks < 1)
859			return (0);
860		if (!force && (disks < 2 || disks > 6))
861			return (0);
862		break;
863	case G_RAID_VOLUME_RL_RAID1:
864		if (disks < 1)
865			return (0);
866		if (!force && (disks != 2))
867			return (0);
868		break;
869	case G_RAID_VOLUME_RL_RAID1E:
870		if (disks < 2)
871			return (0);
872		if (!force && (disks != 4))
873			return (0);
874		break;
875	case G_RAID_VOLUME_RL_RAID5:
876		if (disks < 3)
877			return (0);
878		if (!force && disks > 6)
879			return (0);
880		if (qual != G_RAID_VOLUME_RLQ_R5LA)
881			return (0);
882		break;
883	default:
884		return (0);
885	}
886	if (level != G_RAID_VOLUME_RL_RAID5 && qual != G_RAID_VOLUME_RLQ_NONE)
887		return (0);
888	return (1);
889}
890
891static struct g_raid_volume *
892g_raid_md_intel_get_volume(struct g_raid_softc *sc, int id)
893{
894	struct g_raid_volume	*mvol;
895	struct g_raid_md_intel_pervolume *pv;
896
897	TAILQ_FOREACH(mvol, &sc->sc_volumes, v_next) {
898		pv = mvol->v_md_data;
899		if (pv->pv_volume_pos == id)
900			break;
901	}
902	return (mvol);
903}
904
905static int
906g_raid_md_intel_start_disk(struct g_raid_disk *disk)
907{
908	struct g_raid_softc *sc;
909	struct g_raid_subdisk *sd, *tmpsd;
910	struct g_raid_disk *olddisk, *tmpdisk;
911	struct g_raid_md_object *md;
912	struct g_raid_md_intel_object *mdi;
913	struct g_raid_md_intel_pervolume *pv;
914	struct g_raid_md_intel_perdisk *pd, *oldpd;
915	struct intel_raid_conf *meta;
916	struct intel_raid_vol *mvol;
917	struct intel_raid_map *mmap0, *mmap1;
918	int disk_pos, resurrection = 0, migr_global, i;
919
920	sc = disk->d_softc;
921	md = sc->sc_md;
922	mdi = (struct g_raid_md_intel_object *)md;
923	meta = mdi->mdio_meta;
924	pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
925	olddisk = NULL;
926
927	/* Find disk position in metadata by its serial. */
928	disk_pos = intel_meta_find_disk(meta, pd->pd_disk_meta.serial);
929	if (disk_pos < 0) {
930		G_RAID_DEBUG1(1, sc, "Unknown, probably new or stale disk");
931		/* Failed stale disk is useless for us. */
932		if ((pd->pd_disk_meta.flags & INTEL_F_FAILED) &&
933		    !(pd->pd_disk_meta.flags & INTEL_F_DISABLED)) {
934			g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE_FAILED);
935			return (0);
936		}
937		/* If we are in the start process, that's all for now. */
938		if (!mdi->mdio_started)
939			goto nofit;
940		/*
941		 * If we have already started - try to get use of the disk.
942		 * Try to replace OFFLINE disks first, then FAILED.
943		 */
944		TAILQ_FOREACH(tmpdisk, &sc->sc_disks, d_next) {
945			if (tmpdisk->d_state != G_RAID_DISK_S_OFFLINE &&
946			    tmpdisk->d_state != G_RAID_DISK_S_FAILED)
947				continue;
948			/* Make sure this disk is big enough. */
949			TAILQ_FOREACH(sd, &tmpdisk->d_subdisks, sd_next) {
950				off_t disk_sectors =
951				    intel_get_disk_sectors(&pd->pd_disk_meta);
952
953				if (sd->sd_offset + sd->sd_size + 4096 >
954				    disk_sectors * 512) {
955					G_RAID_DEBUG1(1, sc,
956					    "Disk too small (%llu < %llu)",
957					    (unsigned long long)
958					    disk_sectors * 512,
959					    (unsigned long long)
960					    sd->sd_offset + sd->sd_size + 4096);
961					break;
962				}
963			}
964			if (sd != NULL)
965				continue;
966			if (tmpdisk->d_state == G_RAID_DISK_S_OFFLINE) {
967				olddisk = tmpdisk;
968				break;
969			} else if (olddisk == NULL)
970				olddisk = tmpdisk;
971		}
972		if (olddisk == NULL) {
973nofit:
974			if (pd->pd_disk_meta.flags & INTEL_F_SPARE) {
975				g_raid_change_disk_state(disk,
976				    G_RAID_DISK_S_SPARE);
977				return (1);
978			} else {
979				g_raid_change_disk_state(disk,
980				    G_RAID_DISK_S_STALE);
981				return (0);
982			}
983		}
984		oldpd = (struct g_raid_md_intel_perdisk *)olddisk->d_md_data;
985		disk_pos = oldpd->pd_disk_pos;
986		resurrection = 1;
987	}
988
989	if (olddisk == NULL) {
990		/* Find placeholder by position. */
991		olddisk = g_raid_md_intel_get_disk(sc, disk_pos);
992		if (olddisk == NULL)
993			panic("No disk at position %d!", disk_pos);
994		if (olddisk->d_state != G_RAID_DISK_S_OFFLINE) {
995			G_RAID_DEBUG1(1, sc, "More than one disk for pos %d",
996			    disk_pos);
997			g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE);
998			return (0);
999		}
1000		oldpd = (struct g_raid_md_intel_perdisk *)olddisk->d_md_data;
1001	}
1002
1003	/* Replace failed disk or placeholder with new disk. */
1004	TAILQ_FOREACH_SAFE(sd, &olddisk->d_subdisks, sd_next, tmpsd) {
1005		TAILQ_REMOVE(&olddisk->d_subdisks, sd, sd_next);
1006		TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
1007		sd->sd_disk = disk;
1008	}
1009	oldpd->pd_disk_pos = -2;
1010	pd->pd_disk_pos = disk_pos;
1011
1012	/* If it was placeholder -- destroy it. */
1013	if (olddisk->d_state == G_RAID_DISK_S_OFFLINE) {
1014		g_raid_destroy_disk(olddisk);
1015	} else {
1016		/* Otherwise, make it STALE_FAILED. */
1017		g_raid_change_disk_state(olddisk, G_RAID_DISK_S_STALE_FAILED);
1018		/* Update global metadata just in case. */
1019		memcpy(&meta->disk[disk_pos], &pd->pd_disk_meta,
1020		    sizeof(struct intel_raid_disk));
1021	}
1022
1023	/* Welcome the new disk. */
1024	if ((meta->disk[disk_pos].flags & INTEL_F_DISABLED) &&
1025	    !(pd->pd_disk_meta.flags & INTEL_F_SPARE))
1026		g_raid_change_disk_state(disk, G_RAID_DISK_S_DISABLED);
1027	else if (resurrection)
1028		g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);
1029	else if (meta->disk[disk_pos].flags & INTEL_F_FAILED)
1030		g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED);
1031	else if (meta->disk[disk_pos].flags & INTEL_F_SPARE)
1032		g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE);
1033	else
1034		g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE);
1035	TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
1036		pv = sd->sd_volume->v_md_data;
1037		mvol = intel_get_volume(meta, pv->pv_volume_pos);
1038		mmap0 = intel_get_map(mvol, 0);
1039		if (mvol->migr_state)
1040			mmap1 = intel_get_map(mvol, 1);
1041		else
1042			mmap1 = mmap0;
1043
1044		migr_global = 1;
1045		for (i = 0; i < mmap0->total_disks; i++) {
1046			if ((mmap0->disk_idx[i] & INTEL_DI_RBLD) == 0 &&
1047			    (mmap1->disk_idx[i] & INTEL_DI_RBLD) != 0)
1048				migr_global = 0;
1049		}
1050
1051		if ((meta->disk[disk_pos].flags & INTEL_F_DISABLED) &&
1052		    !(pd->pd_disk_meta.flags & INTEL_F_SPARE)) {
1053			/* Disabled disk, useless. */
1054			g_raid_change_subdisk_state(sd,
1055			    G_RAID_SUBDISK_S_NONE);
1056		} else if (resurrection) {
1057			/* Stale disk, almost same as new. */
1058			g_raid_change_subdisk_state(sd,
1059			    G_RAID_SUBDISK_S_NEW);
1060		} else if (meta->disk[disk_pos].flags & INTEL_F_FAILED) {
1061			/* Failed disk, almost useless. */
1062			g_raid_change_subdisk_state(sd,
1063			    G_RAID_SUBDISK_S_FAILED);
1064		} else if (mvol->migr_state == 0) {
1065			if (mmap0->status == INTEL_S_UNINITIALIZED &&
1066			    (!pv->pv_cng || pv->pv_cng_master_disk != disk_pos)) {
1067				/* Freshly created uninitialized volume. */
1068				g_raid_change_subdisk_state(sd,
1069				    G_RAID_SUBDISK_S_UNINITIALIZED);
1070			} else if (mmap0->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) {
1071				/* Freshly inserted disk. */
1072				g_raid_change_subdisk_state(sd,
1073				    G_RAID_SUBDISK_S_NEW);
1074			} else if (mvol->dirty && (!pv->pv_cng ||
1075			    pv->pv_cng_master_disk != disk_pos)) {
1076				/* Dirty volume (unclean shutdown). */
1077				g_raid_change_subdisk_state(sd,
1078				    G_RAID_SUBDISK_S_STALE);
1079			} else {
1080				/* Up to date disk. */
1081				g_raid_change_subdisk_state(sd,
1082				    G_RAID_SUBDISK_S_ACTIVE);
1083			}
1084		} else if (mvol->migr_type == INTEL_MT_INIT ||
1085			   mvol->migr_type == INTEL_MT_REBUILD) {
1086			if (mmap0->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) {
1087				/* Freshly inserted disk. */
1088				g_raid_change_subdisk_state(sd,
1089				    G_RAID_SUBDISK_S_NEW);
1090			} else if (mmap1->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) {
1091				/* Rebuilding disk. */
1092				g_raid_change_subdisk_state(sd,
1093				    G_RAID_SUBDISK_S_REBUILD);
1094				if (mvol->dirty) {
1095					sd->sd_rebuild_pos = 0;
1096				} else {
1097					sd->sd_rebuild_pos =
1098					    intel_get_vol_curr_migr_unit(mvol) *
1099					    sd->sd_volume->v_strip_size *
1100					    mmap0->total_domains;
1101				}
1102			} else if (mvol->migr_type == INTEL_MT_INIT &&
1103			    migr_global) {
1104				/* Freshly created uninitialized volume. */
1105				g_raid_change_subdisk_state(sd,
1106				    G_RAID_SUBDISK_S_UNINITIALIZED);
1107			} else if (mvol->dirty && (!pv->pv_cng ||
1108			    pv->pv_cng_master_disk != disk_pos)) {
1109				/* Dirty volume (unclean shutdown). */
1110				g_raid_change_subdisk_state(sd,
1111				    G_RAID_SUBDISK_S_STALE);
1112			} else {
1113				/* Up to date disk. */
1114				g_raid_change_subdisk_state(sd,
1115				    G_RAID_SUBDISK_S_ACTIVE);
1116			}
1117		} else if (mvol->migr_type == INTEL_MT_VERIFY ||
1118			   mvol->migr_type == INTEL_MT_REPAIR) {
1119			if (mmap0->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) {
1120				/* Freshly inserted disk. */
1121				g_raid_change_subdisk_state(sd,
1122				    G_RAID_SUBDISK_S_NEW);
1123			} else if ((mmap1->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) ||
1124			    migr_global) {
1125				/* Resyncing disk. */
1126				g_raid_change_subdisk_state(sd,
1127				    G_RAID_SUBDISK_S_RESYNC);
1128				if (mvol->dirty) {
1129					sd->sd_rebuild_pos = 0;
1130				} else {
1131					sd->sd_rebuild_pos =
1132					    intel_get_vol_curr_migr_unit(mvol) *
1133					    sd->sd_volume->v_strip_size *
1134					    mmap0->total_domains;
1135				}
1136			} else if (mvol->dirty) {
1137				/* Dirty volume (unclean shutdown). */
1138				g_raid_change_subdisk_state(sd,
1139				    G_RAID_SUBDISK_S_STALE);
1140			} else {
1141				/* Up to date disk. */
1142				g_raid_change_subdisk_state(sd,
1143				    G_RAID_SUBDISK_S_ACTIVE);
1144			}
1145		} else if (mvol->migr_type == INTEL_MT_GEN_MIGR) {
1146			if ((mmap1->disk_idx[0] & INTEL_DI_IDX) != disk_pos) {
1147				/* Freshly inserted disk. */
1148				g_raid_change_subdisk_state(sd,
1149				    G_RAID_SUBDISK_S_NEW);
1150			} else {
1151				/* Up to date disk. */
1152				g_raid_change_subdisk_state(sd,
1153				    G_RAID_SUBDISK_S_ACTIVE);
1154			}
1155		}
1156		g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
1157		    G_RAID_EVENT_SUBDISK);
1158	}
1159
1160	/* Update status of our need for spare. */
1161	if (mdi->mdio_started) {
1162		mdi->mdio_incomplete =
1163		    (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) +
1164		     g_raid_ndisks(sc, G_RAID_DISK_S_DISABLED) <
1165		     meta->total_disks);
1166	}
1167
1168	return (resurrection);
1169}
1170
1171static void
1172g_disk_md_intel_retaste(void *arg, int pending)
1173{
1174
1175	G_RAID_DEBUG(1, "Array is not complete, trying to retaste.");
1176	g_retaste(&g_raid_class);
1177	free(arg, M_MD_INTEL);
1178}
1179
1180static void
1181g_raid_md_intel_refill(struct g_raid_softc *sc)
1182{
1183	struct g_raid_md_object *md;
1184	struct g_raid_md_intel_object *mdi;
1185	struct intel_raid_conf *meta;
1186	struct g_raid_disk *disk;
1187	struct task *task;
1188	int update, na;
1189
1190	md = sc->sc_md;
1191	mdi = (struct g_raid_md_intel_object *)md;
1192	meta = mdi->mdio_meta;
1193	update = 0;
1194	do {
1195		/* Make sure we miss anything. */
1196		na = g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) +
1197		    g_raid_ndisks(sc, G_RAID_DISK_S_DISABLED);
1198		if (na == meta->total_disks)
1199			break;
1200
1201		G_RAID_DEBUG1(1, md->mdo_softc,
1202		    "Array is not complete (%d of %d), "
1203		    "trying to refill.", na, meta->total_disks);
1204
1205		/* Try to get use some of STALE disks. */
1206		TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1207			if (disk->d_state == G_RAID_DISK_S_STALE) {
1208				update += g_raid_md_intel_start_disk(disk);
1209				if (disk->d_state == G_RAID_DISK_S_ACTIVE ||
1210				    disk->d_state == G_RAID_DISK_S_DISABLED)
1211					break;
1212			}
1213		}
1214		if (disk != NULL)
1215			continue;
1216
1217		/* Try to get use some of SPARE disks. */
1218		TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1219			if (disk->d_state == G_RAID_DISK_S_SPARE) {
1220				update += g_raid_md_intel_start_disk(disk);
1221				if (disk->d_state == G_RAID_DISK_S_ACTIVE)
1222					break;
1223			}
1224		}
1225	} while (disk != NULL);
1226
1227	/* Write new metadata if we changed something. */
1228	if (update) {
1229		g_raid_md_write_intel(md, NULL, NULL, NULL);
1230		meta = mdi->mdio_meta;
1231	}
1232
1233	/* Update status of our need for spare. */
1234	mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) +
1235	    g_raid_ndisks(sc, G_RAID_DISK_S_DISABLED) < meta->total_disks);
1236
1237	/* Request retaste hoping to find spare. */
1238	if (mdi->mdio_incomplete) {
1239		task = malloc(sizeof(struct task),
1240		    M_MD_INTEL, M_WAITOK | M_ZERO);
1241		TASK_INIT(task, 0, g_disk_md_intel_retaste, task);
1242		taskqueue_enqueue(taskqueue_swi, task);
1243	}
1244}
1245
/*
 * Start the array described by the metadata stored in mdio_meta.
 *
 * The steps are, in order:
 *  1. create a volume (with per-volume private data) for every volume in
 *     the metadata, translate the Intel map type into a RAID level and
 *     fill in the geometry of the volume and of each of its subdisks;
 *  2. create an OFFLINE placeholder disk for every disk slot in the
 *     metadata and attach to it the subdisks whose map entry refers to
 *     that slot;
 *  3. run g_raid_md_intel_start_disk() on every disk still in state NONE;
 *  4. mark the array started, write metadata, try to refill missing slots,
 *     send a START event to every volume, stop the start callout and
 *     release the root mount hold.
 *
 * Sector counts taken from the metadata are converted to bytes by
 * multiplying by 512.
 */
static void
g_raid_md_intel_start(struct g_raid_softc *sc)
{
	struct g_raid_md_object *md;
	struct g_raid_md_intel_object *mdi;
	struct g_raid_md_intel_pervolume *pv;
	struct g_raid_md_intel_perdisk *pd;
	struct intel_raid_conf *meta;
	struct intel_raid_vol *mvol;
	struct intel_raid_map *mmap;
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct g_raid_disk *disk;
	int i, j, disk_pos;

	md = sc->sc_md;
	mdi = (struct g_raid_md_intel_object *)md;
	meta = mdi->mdio_meta;

	/* Create volumes and subdisks. */
	for (i = 0; i < meta->total_volumes; i++) {
		mvol = intel_get_volume(meta, i);
		mmap = intel_get_map(mvol, 0);
		vol = g_raid_create_volume(sc, mvol->name, mvol->tid - 1);
		pv = malloc(sizeof(*pv), M_MD_INTEL, M_WAITOK | M_ZERO);
		pv->pv_volume_pos = i;
		pv->pv_cng = (mvol->state & INTEL_ST_CLONE_N_GO) != 0;
		pv->pv_cng_man_sync = (mvol->state & INTEL_ST_CLONE_MAN_SYNC) != 0;
		/*
		 * Only accept a master disk index that fits the map;
		 * otherwise the field keeps the zero from M_ZERO.
		 */
		if (mvol->cng_master_disk < mmap->total_disks)
			pv->pv_cng_master_disk = mvol->cng_master_disk;
		vol->v_md_data = pv;
		vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE;
		if (mmap->type == INTEL_T_RAID0)
			vol->v_raid_level = G_RAID_VOLUME_RL_RAID0;
		else if (mmap->type == INTEL_T_RAID1 &&
		    mmap->total_domains >= 2 &&
		    mmap->total_domains <= mmap->total_disks) {
			/* Assume total_domains is correct. */
			if (mmap->total_domains == mmap->total_disks)
				vol->v_raid_level = G_RAID_VOLUME_RL_RAID1;
			else
				vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E;
		} else if (mmap->type == INTEL_T_RAID1) {
			/* total_domains looks wrong; decide by disk count. */
			if (mmap->total_disks <= 2)
				vol->v_raid_level = G_RAID_VOLUME_RL_RAID1;
			else
				vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E;
		} else if (mmap->type == INTEL_T_RAID5) {
			vol->v_raid_level = G_RAID_VOLUME_RL_RAID5;
			vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LA;
		} else
			vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN;
		vol->v_strip_size = (u_int)mmap->strip_sectors * 512; //ZZZ
		vol->v_disks_count = mmap->total_disks;
		vol->v_mediasize = (off_t)mvol->total_sectors * 512; //ZZZ
		vol->v_sectorsize = 512; //ZZZ
		/* Every subdisk of the volume shares the map's offset/size. */
		for (j = 0; j < vol->v_disks_count; j++) {
			sd = &vol->v_subdisks[j];
			sd->sd_offset = intel_get_map_offset(mmap) * 512; //ZZZ
			sd->sd_size = intel_get_map_disk_sectors(mmap) * 512; //ZZZ
		}
		g_raid_start_volume(vol);
	}

	/* Create disk placeholders to store data for later writing. */
	for (disk_pos = 0; disk_pos < meta->total_disks; disk_pos++) {
		pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO);
		pd->pd_disk_pos = disk_pos;
		pd->pd_disk_meta = meta->disk[disk_pos];
		disk = g_raid_create_disk(sc);
		disk->d_md_data = (void *)pd;
		disk->d_state = G_RAID_DISK_S_OFFLINE;
		for (i = 0; i < meta->total_volumes; i++) {
			mvol = intel_get_volume(meta, i);
			mmap = intel_get_map(mvol, 0);
			/* Find this disk slot in the volume's first map. */
			for (j = 0; j < mmap->total_disks; j++) {
				if ((mmap->disk_idx[j] & INTEL_DI_IDX) == disk_pos)
					break;
			}
			/* Slot not referenced by this volume. */
			if (j == mmap->total_disks)
				continue;
			vol = g_raid_md_intel_get_volume(sc, i);
			sd = &vol->v_subdisks[j];
			sd->sd_disk = disk;
			TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
		}
	}

	/*
	 * Make all disks found till the moment take their places.
	 * The scan restarts from the head after each placement and ends
	 * when no disk in state NONE remains; this relies on
	 * g_raid_md_intel_start_disk() moving the disk out of NONE.
	 */
	do {
		TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
			if (disk->d_state == G_RAID_DISK_S_NONE) {
				g_raid_md_intel_start_disk(disk);
				break;
			}
		}
	} while (disk != NULL);

	mdi->mdio_started = 1;
	G_RAID_DEBUG1(0, sc, "Array started.");
	g_raid_md_write_intel(md, NULL, NULL, NULL);

	/* Pickup any STALE/SPARE disks to refill array if needed. */
	g_raid_md_intel_refill(sc);

	/* Tell every volume that it may start. */
	TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
		g_raid_event_send(vol, G_RAID_VOLUME_E_START,
		    G_RAID_EVENT_VOLUME);
	}

	/* The start timeout is no longer needed; release the root hold. */
	callout_stop(&mdi->mdio_start_co);
	G_RAID_DEBUG1(1, sc, "root_mount_rel %p", mdi->mdio_rootmount);
	root_mount_rel(mdi->mdio_rootmount);
	mdi->mdio_rootmount = NULL;
}
1362
1363static void
1364g_raid_md_intel_new_disk(struct g_raid_disk *disk)
1365{
1366	struct g_raid_softc *sc;
1367	struct g_raid_md_object *md;
1368	struct g_raid_md_intel_object *mdi;
1369	struct intel_raid_conf *pdmeta;
1370	struct g_raid_md_intel_perdisk *pd;
1371
1372	sc = disk->d_softc;
1373	md = sc->sc_md;
1374	mdi = (struct g_raid_md_intel_object *)md;
1375	pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
1376	pdmeta = pd->pd_meta;
1377
1378	if (mdi->mdio_started) {
1379		if (g_raid_md_intel_start_disk(disk))
1380			g_raid_md_write_intel(md, NULL, NULL, NULL);
1381	} else {
1382		/* If we haven't started yet - check metadata freshness. */
1383		if (mdi->mdio_meta == NULL ||
1384		    ((int32_t)(pdmeta->generation - mdi->mdio_generation)) > 0) {
1385			G_RAID_DEBUG1(1, sc, "Newer disk");
1386			if (mdi->mdio_meta != NULL)
1387				free(mdi->mdio_meta, M_MD_INTEL);
1388			mdi->mdio_meta = intel_meta_copy(pdmeta);
1389			mdi->mdio_generation = mdi->mdio_meta->generation;
1390			mdi->mdio_disks_present = 1;
1391		} else if (pdmeta->generation == mdi->mdio_generation) {
1392			mdi->mdio_disks_present++;
1393			G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)",
1394			    mdi->mdio_disks_present,
1395			    mdi->mdio_meta->total_disks);
1396		} else {
1397			G_RAID_DEBUG1(1, sc, "Older disk");
1398		}
1399		/* If we collected all needed disks - start array. */
1400		if (mdi->mdio_disks_present == mdi->mdio_meta->total_disks)
1401			g_raid_md_intel_start(sc);
1402	}
1403}
1404
1405static void
1406g_raid_intel_go(void *arg)
1407{
1408	struct g_raid_softc *sc;
1409	struct g_raid_md_object *md;
1410	struct g_raid_md_intel_object *mdi;
1411
1412	sc = arg;
1413	md = sc->sc_md;
1414	mdi = (struct g_raid_md_intel_object *)md;
1415	if (!mdi->mdio_started) {
1416		G_RAID_DEBUG1(0, sc, "Force array start due to timeout.");
1417		g_raid_event_send(sc, G_RAID_NODE_E_START, 0);
1418	}
1419}
1420
1421static int
1422g_raid_md_create_intel(struct g_raid_md_object *md, struct g_class *mp,
1423    struct g_geom **gp)
1424{
1425	struct g_raid_softc *sc;
1426	struct g_raid_md_intel_object *mdi;
1427	char name[16];
1428
1429	mdi = (struct g_raid_md_intel_object *)md;
1430	mdi->mdio_config_id = mdi->mdio_orig_config_id = arc4random();
1431	mdi->mdio_generation = 0;
1432	snprintf(name, sizeof(name), "Intel-%08x", mdi->mdio_config_id);
1433	sc = g_raid_create_node(mp, name, md);
1434	if (sc == NULL)
1435		return (G_RAID_MD_TASTE_FAIL);
1436	md->mdo_softc = sc;
1437	*gp = sc->sc_geom;
1438	return (G_RAID_MD_TASTE_NEW);
1439}
1440
1441/*
1442 * Return the last N characters of the serial label.  The Linux and
1443 * ataraid(7) code always uses the last 16 characters of the label to
1444 * store into the Intel meta format.  Generalize this to N characters
1445 * since that's easy.  Labels can be up to 20 characters for SATA drives
1446 * and up 251 characters for SAS drives.  Since intel controllers don't
1447 * support SAS drives, just stick with the SATA limits for stack friendliness.
1448 */
1449static int
1450g_raid_md_get_label(struct g_consumer *cp, char *serial, int serlen)
1451{
1452	char serial_buffer[DISK_IDENT_SIZE];
1453	int len, error;
1454
1455	len = sizeof(serial_buffer);
1456	error = g_io_getattr("GEOM::ident", cp, &len, serial_buffer);
1457	if (error != 0)
1458		return (error);
1459	len = strlen(serial_buffer);
1460	if (len > serlen)
1461		len -= serlen;
1462	else
1463		len = 0;
1464	strncpy(serial, serial_buffer + len, serlen);
1465	return (0);
1466}
1467
1468static int
1469g_raid_md_taste_intel(struct g_raid_md_object *md, struct g_class *mp,
1470                              struct g_consumer *cp, struct g_geom **gp)
1471{
1472	struct g_consumer *rcp;
1473	struct g_provider *pp;
1474	struct g_raid_md_intel_object *mdi, *mdi1;
1475	struct g_raid_softc *sc;
1476	struct g_raid_disk *disk;
1477	struct intel_raid_conf *meta;
1478	struct g_raid_md_intel_perdisk *pd;
1479	struct g_geom *geom;
1480	int error, disk_pos, result, spare, len;
1481	char serial[INTEL_SERIAL_LEN];
1482	char name[16];
1483	uint16_t vendor;
1484
1485	G_RAID_DEBUG(1, "Tasting Intel on %s", cp->provider->name);
1486	mdi = (struct g_raid_md_intel_object *)md;
1487	pp = cp->provider;
1488
1489	/* Read metadata from device. */
1490	meta = NULL;
1491	disk_pos = 0;
1492	g_topology_unlock();
1493	error = g_raid_md_get_label(cp, serial, sizeof(serial));
1494	if (error != 0) {
1495		G_RAID_DEBUG(1, "Cannot get serial number from %s (error=%d).",
1496		    pp->name, error);
1497		goto fail2;
1498	}
1499	vendor = 0xffff;
1500	len = sizeof(vendor);
1501	if (pp->geom->rank == 1)
1502		g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor);
1503	meta = intel_meta_read(cp);
1504	g_topology_lock();
1505	if (meta == NULL) {
1506		if (g_raid_aggressive_spare) {
1507			if (vendor != 0x8086) {
1508				G_RAID_DEBUG(1,
1509				    "Intel vendor mismatch 0x%04x != 0x8086",
1510				    vendor);
1511			} else {
1512				G_RAID_DEBUG(1,
1513				    "No Intel metadata, forcing spare.");
1514				spare = 2;
1515				goto search;
1516			}
1517		}
1518		return (G_RAID_MD_TASTE_FAIL);
1519	}
1520
1521	/* Check this disk position in obtained metadata. */
1522	disk_pos = intel_meta_find_disk(meta, serial);
1523	if (disk_pos < 0) {
1524		G_RAID_DEBUG(1, "Intel serial '%s' not found", serial);
1525		goto fail1;
1526	}
1527	if (intel_get_disk_sectors(&meta->disk[disk_pos]) !=
1528	    (pp->mediasize / pp->sectorsize)) {
1529		G_RAID_DEBUG(1, "Intel size mismatch %ju != %ju",
1530		    intel_get_disk_sectors(&meta->disk[disk_pos]),
1531		    (off_t)(pp->mediasize / pp->sectorsize));
1532		goto fail1;
1533	}
1534
1535	G_RAID_DEBUG(1, "Intel disk position %d", disk_pos);
1536	spare = meta->disk[disk_pos].flags & INTEL_F_SPARE;
1537
1538search:
1539	/* Search for matching node. */
1540	sc = NULL;
1541	mdi1 = NULL;
1542	LIST_FOREACH(geom, &mp->geom, geom) {
1543		sc = geom->softc;
1544		if (sc == NULL)
1545			continue;
1546		if (sc->sc_stopping != 0)
1547			continue;
1548		if (sc->sc_md->mdo_class != md->mdo_class)
1549			continue;
1550		mdi1 = (struct g_raid_md_intel_object *)sc->sc_md;
1551		if (spare) {
1552			if (mdi1->mdio_incomplete)
1553				break;
1554		} else {
1555			if (mdi1->mdio_config_id == meta->config_id)
1556				break;
1557		}
1558	}
1559
1560	/* Found matching node. */
1561	if (geom != NULL) {
1562		G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name);
1563		result = G_RAID_MD_TASTE_EXISTING;
1564
1565	} else if (spare) { /* Not found needy node -- left for later. */
1566		G_RAID_DEBUG(1, "Spare is not needed at this time");
1567		goto fail1;
1568
1569	} else { /* Not found matching node -- create one. */
1570		result = G_RAID_MD_TASTE_NEW;
1571		mdi->mdio_config_id = meta->config_id;
1572		mdi->mdio_orig_config_id = meta->orig_config_id;
1573		snprintf(name, sizeof(name), "Intel-%08x", meta->config_id);
1574		sc = g_raid_create_node(mp, name, md);
1575		md->mdo_softc = sc;
1576		geom = sc->sc_geom;
1577		callout_init(&mdi->mdio_start_co, 1);
1578		callout_reset(&mdi->mdio_start_co, g_raid_start_timeout * hz,
1579		    g_raid_intel_go, sc);
1580		mdi->mdio_rootmount = root_mount_hold("GRAID-Intel");
1581		G_RAID_DEBUG1(1, sc, "root_mount_hold %p", mdi->mdio_rootmount);
1582	}
1583
1584	/* There is no return after this point, so we close passed consumer. */
1585	g_access(cp, -1, 0, 0);
1586
1587	rcp = g_new_consumer(geom);
1588	rcp->flags |= G_CF_DIRECT_RECEIVE;
1589	g_attach(rcp, pp);
1590	if (g_access(rcp, 1, 1, 1) != 0)
1591		; //goto fail1;
1592
1593	g_topology_unlock();
1594	sx_xlock(&sc->sc_lock);
1595
1596	pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO);
1597	pd->pd_meta = meta;
1598	pd->pd_disk_pos = -1;
1599	if (spare == 2) {
1600		memcpy(&pd->pd_disk_meta.serial[0], serial, INTEL_SERIAL_LEN);
1601		intel_set_disk_sectors(&pd->pd_disk_meta,
1602		    pp->mediasize / pp->sectorsize);
1603		pd->pd_disk_meta.id = 0;
1604		pd->pd_disk_meta.flags = INTEL_F_SPARE;
1605	} else {
1606		pd->pd_disk_meta = meta->disk[disk_pos];
1607	}
1608	disk = g_raid_create_disk(sc);
1609	disk->d_md_data = (void *)pd;
1610	disk->d_consumer = rcp;
1611	rcp->private = disk;
1612
1613	g_raid_get_disk_info(disk);
1614
1615	g_raid_md_intel_new_disk(disk);
1616
1617	sx_xunlock(&sc->sc_lock);
1618	g_topology_lock();
1619	*gp = geom;
1620	return (result);
1621fail2:
1622	g_topology_lock();
1623fail1:
1624	free(meta, M_MD_INTEL);
1625	return (G_RAID_MD_TASTE_FAIL);
1626}
1627
1628static int
1629g_raid_md_event_intel(struct g_raid_md_object *md,
1630    struct g_raid_disk *disk, u_int event)
1631{
1632	struct g_raid_softc *sc;
1633	struct g_raid_subdisk *sd;
1634	struct g_raid_md_intel_object *mdi;
1635	struct g_raid_md_intel_perdisk *pd;
1636
1637	sc = md->mdo_softc;
1638	mdi = (struct g_raid_md_intel_object *)md;
1639	if (disk == NULL) {
1640		switch (event) {
1641		case G_RAID_NODE_E_START:
1642			if (!mdi->mdio_started)
1643				g_raid_md_intel_start(sc);
1644			return (0);
1645		}
1646		return (-1);
1647	}
1648	pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
1649	switch (event) {
1650	case G_RAID_DISK_E_DISCONNECTED:
1651		/* If disk was assigned, just update statuses. */
1652		if (pd->pd_disk_pos >= 0) {
1653			g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE);
1654			if (disk->d_consumer) {
1655				g_raid_kill_consumer(sc, disk->d_consumer);
1656				disk->d_consumer = NULL;
1657			}
1658			TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
1659				g_raid_change_subdisk_state(sd,
1660				    G_RAID_SUBDISK_S_NONE);
1661				g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED,
1662				    G_RAID_EVENT_SUBDISK);
1663			}
1664		} else {
1665			/* Otherwise -- delete. */
1666			g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE);
1667			g_raid_destroy_disk(disk);
1668		}
1669
1670		/* Write updated metadata to all disks. */
1671		g_raid_md_write_intel(md, NULL, NULL, NULL);
1672
1673		/* Check if anything left except placeholders. */
1674		if (g_raid_ndisks(sc, -1) ==
1675		    g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE))
1676			g_raid_destroy_node(sc, 0);
1677		else
1678			g_raid_md_intel_refill(sc);
1679		return (0);
1680	}
1681	return (-2);
1682}
1683
1684static int
1685g_raid_md_ctl_intel(struct g_raid_md_object *md,
1686    struct gctl_req *req)
1687{
1688	struct g_raid_softc *sc;
1689	struct g_raid_volume *vol, *vol1;
1690	struct g_raid_subdisk *sd;
1691	struct g_raid_disk *disk;
1692	struct g_raid_md_intel_object *mdi;
1693	struct g_raid_md_intel_pervolume *pv;
1694	struct g_raid_md_intel_perdisk *pd;
1695	struct g_consumer *cp;
1696	struct g_provider *pp;
1697	char arg[16], serial[INTEL_SERIAL_LEN];
1698	const char *nodename, *verb, *volname, *levelname, *diskname;
1699	char *tmp;
1700	int *nargs, *force;
1701	off_t off, size, sectorsize, strip, disk_sectors;
1702	intmax_t *sizearg, *striparg;
1703	int numdisks, i, len, level, qual, update;
1704	int error;
1705
1706	sc = md->mdo_softc;
1707	mdi = (struct g_raid_md_intel_object *)md;
1708	verb = gctl_get_param(req, "verb", NULL);
1709	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
1710	error = 0;
1711	if (strcmp(verb, "label") == 0) {
1712		if (*nargs < 4) {
1713			gctl_error(req, "Invalid number of arguments.");
1714			return (-1);
1715		}
1716		volname = gctl_get_asciiparam(req, "arg1");
1717		if (volname == NULL) {
1718			gctl_error(req, "No volume name.");
1719			return (-2);
1720		}
1721		levelname = gctl_get_asciiparam(req, "arg2");
1722		if (levelname == NULL) {
1723			gctl_error(req, "No RAID level.");
1724			return (-3);
1725		}
1726		if (strcasecmp(levelname, "RAID5") == 0)
1727			levelname = "RAID5-LA";
1728		if (g_raid_volume_str2level(levelname, &level, &qual)) {
1729			gctl_error(req, "Unknown RAID level '%s'.", levelname);
1730			return (-4);
1731		}
1732		numdisks = *nargs - 3;
1733		force = gctl_get_paraml(req, "force", sizeof(*force));
1734		if (!g_raid_md_intel_supported(level, qual, numdisks,
1735		    force ? *force : 0)) {
1736			gctl_error(req, "Unsupported RAID level "
1737			    "(0x%02x/0x%02x), or number of disks (%d).",
1738			    level, qual, numdisks);
1739			return (-5);
1740		}
1741
1742		/* Search for disks, connect them and probe. */
1743		size = 0x7fffffffffffffffllu;
1744		sectorsize = 0;
1745		for (i = 0; i < numdisks; i++) {
1746			snprintf(arg, sizeof(arg), "arg%d", i + 3);
1747			diskname = gctl_get_asciiparam(req, arg);
1748			if (diskname == NULL) {
1749				gctl_error(req, "No disk name (%s).", arg);
1750				error = -6;
1751				break;
1752			}
1753			if (strcmp(diskname, "NONE") == 0) {
1754				cp = NULL;
1755				pp = NULL;
1756			} else {
1757				g_topology_lock();
1758				cp = g_raid_open_consumer(sc, diskname);
1759				if (cp == NULL) {
1760					gctl_error(req, "Can't open disk '%s'.",
1761					    diskname);
1762					g_topology_unlock();
1763					error = -7;
1764					break;
1765				}
1766				pp = cp->provider;
1767			}
1768			pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO);
1769			pd->pd_disk_pos = i;
1770			disk = g_raid_create_disk(sc);
1771			disk->d_md_data = (void *)pd;
1772			disk->d_consumer = cp;
1773			if (cp == NULL) {
1774				strcpy(&pd->pd_disk_meta.serial[0], "NONE");
1775				pd->pd_disk_meta.id = 0xffffffff;
1776				pd->pd_disk_meta.flags = INTEL_F_ASSIGNED;
1777				continue;
1778			}
1779			cp->private = disk;
1780			g_topology_unlock();
1781
1782			error = g_raid_md_get_label(cp,
1783			    &pd->pd_disk_meta.serial[0], INTEL_SERIAL_LEN);
1784			if (error != 0) {
1785				gctl_error(req,
1786				    "Can't get serial for provider '%s'.",
1787				    diskname);
1788				error = -8;
1789				break;
1790			}
1791
1792			g_raid_get_disk_info(disk);
1793
1794			intel_set_disk_sectors(&pd->pd_disk_meta,
1795			    pp->mediasize / pp->sectorsize);
1796			if (size > pp->mediasize)
1797				size = pp->mediasize;
1798			if (sectorsize < pp->sectorsize)
1799				sectorsize = pp->sectorsize;
1800			pd->pd_disk_meta.id = 0;
1801			pd->pd_disk_meta.flags = INTEL_F_ASSIGNED | INTEL_F_ONLINE;
1802		}
1803		if (error != 0)
1804			return (error);
1805
1806		if (sectorsize <= 0) {
1807			gctl_error(req, "Can't get sector size.");
1808			return (-8);
1809		}
1810
1811		/* Reserve some space for metadata. */
1812		size -= ((4096 + sectorsize - 1) / sectorsize) * sectorsize;
1813
1814		/* Handle size argument. */
1815		len = sizeof(*sizearg);
1816		sizearg = gctl_get_param(req, "size", &len);
1817		if (sizearg != NULL && len == sizeof(*sizearg) &&
1818		    *sizearg > 0) {
1819			if (*sizearg > size) {
1820				gctl_error(req, "Size too big %lld > %lld.",
1821				    (long long)*sizearg, (long long)size);
1822				return (-9);
1823			}
1824			size = *sizearg;
1825		}
1826
1827		/* Handle strip argument. */
1828		strip = 131072;
1829		len = sizeof(*striparg);
1830		striparg = gctl_get_param(req, "strip", &len);
1831		if (striparg != NULL && len == sizeof(*striparg) &&
1832		    *striparg > 0) {
1833			if (*striparg < sectorsize) {
1834				gctl_error(req, "Strip size too small.");
1835				return (-10);
1836			}
1837			if (*striparg % sectorsize != 0) {
1838				gctl_error(req, "Incorrect strip size.");
1839				return (-11);
1840			}
1841			if (strip > 65535 * sectorsize) {
1842				gctl_error(req, "Strip size too big.");
1843				return (-12);
1844			}
1845			strip = *striparg;
1846		}
1847
1848		/* Round size down to strip or sector. */
1849		if (level == G_RAID_VOLUME_RL_RAID1)
1850			size -= (size % sectorsize);
1851		else if (level == G_RAID_VOLUME_RL_RAID1E &&
1852		    (numdisks & 1) != 0)
1853			size -= (size % (2 * strip));
1854		else
1855			size -= (size % strip);
1856		if (size <= 0) {
1857			gctl_error(req, "Size too small.");
1858			return (-13);
1859		}
1860
1861		/* We have all we need, create things: volume, ... */
1862		mdi->mdio_started = 1;
1863		vol = g_raid_create_volume(sc, volname, -1);
1864		pv = malloc(sizeof(*pv), M_MD_INTEL, M_WAITOK | M_ZERO);
1865		pv->pv_volume_pos = 0;
1866		vol->v_md_data = pv;
1867		vol->v_raid_level = level;
1868		vol->v_raid_level_qualifier = qual;
1869		vol->v_strip_size = strip;
1870		vol->v_disks_count = numdisks;
1871		if (level == G_RAID_VOLUME_RL_RAID0)
1872			vol->v_mediasize = size * numdisks;
1873		else if (level == G_RAID_VOLUME_RL_RAID1)
1874			vol->v_mediasize = size;
1875		else if (level == G_RAID_VOLUME_RL_RAID5)
1876			vol->v_mediasize = size * (numdisks - 1);
1877		else { /* RAID1E */
1878			vol->v_mediasize = ((size * numdisks) / strip / 2) *
1879			    strip;
1880		}
1881		vol->v_sectorsize = sectorsize;
1882		g_raid_start_volume(vol);
1883
1884		/* , and subdisks. */
1885		TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
1886			pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
1887			sd = &vol->v_subdisks[pd->pd_disk_pos];
1888			sd->sd_disk = disk;
1889			sd->sd_offset = 0;
1890			sd->sd_size = size;
1891			TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
1892			if (sd->sd_disk->d_consumer != NULL) {
1893				g_raid_change_disk_state(disk,
1894				    G_RAID_DISK_S_ACTIVE);
1895				if (level == G_RAID_VOLUME_RL_RAID5)
1896					g_raid_change_subdisk_state(sd,
1897					    G_RAID_SUBDISK_S_UNINITIALIZED);
1898				else
1899					g_raid_change_subdisk_state(sd,
1900					    G_RAID_SUBDISK_S_ACTIVE);
1901				g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
1902				    G_RAID_EVENT_SUBDISK);
1903			} else {
1904				g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE);
1905			}
1906		}
1907
1908		/* Write metadata based on created entities. */
1909		G_RAID_DEBUG1(0, sc, "Array started.");
1910		g_raid_md_write_intel(md, NULL, NULL, NULL);
1911
1912		/* Pickup any STALE/SPARE disks to refill array if needed. */
1913		g_raid_md_intel_refill(sc);
1914
1915		g_raid_event_send(vol, G_RAID_VOLUME_E_START,
1916		    G_RAID_EVENT_VOLUME);
1917		return (0);
1918	}
1919	if (strcmp(verb, "add") == 0) {
1920		if (*nargs != 3) {
1921			gctl_error(req, "Invalid number of arguments.");
1922			return (-1);
1923		}
1924		volname = gctl_get_asciiparam(req, "arg1");
1925		if (volname == NULL) {
1926			gctl_error(req, "No volume name.");
1927			return (-2);
1928		}
1929		levelname = gctl_get_asciiparam(req, "arg2");
1930		if (levelname == NULL) {
1931			gctl_error(req, "No RAID level.");
1932			return (-3);
1933		}
1934		if (strcasecmp(levelname, "RAID5") == 0)
1935			levelname = "RAID5-LA";
1936		if (g_raid_volume_str2level(levelname, &level, &qual)) {
1937			gctl_error(req, "Unknown RAID level '%s'.", levelname);
1938			return (-4);
1939		}
1940
1941		/* Look for existing volumes. */
1942		i = 0;
1943		vol1 = NULL;
1944		TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
1945			vol1 = vol;
1946			i++;
1947		}
1948		if (i > 1) {
1949			gctl_error(req, "Maximum two volumes supported.");
1950			return (-6);
1951		}
1952		if (vol1 == NULL) {
1953			gctl_error(req, "At least one volume must exist.");
1954			return (-7);
1955		}
1956
1957		numdisks = vol1->v_disks_count;
1958		force = gctl_get_paraml(req, "force", sizeof(*force));
1959		if (!g_raid_md_intel_supported(level, qual, numdisks,
1960		    force ? *force : 0)) {
1961			gctl_error(req, "Unsupported RAID level "
1962			    "(0x%02x/0x%02x), or number of disks (%d).",
1963			    level, qual, numdisks);
1964			return (-5);
1965		}
1966
1967		/* Collect info about present disks. */
1968		size = 0x7fffffffffffffffllu;
1969		sectorsize = 512;
1970		for (i = 0; i < numdisks; i++) {
1971			disk = vol1->v_subdisks[i].sd_disk;
1972			pd = (struct g_raid_md_intel_perdisk *)
1973			    disk->d_md_data;
1974			disk_sectors =
1975			    intel_get_disk_sectors(&pd->pd_disk_meta);
1976
1977			if (disk_sectors * 512 < size)
1978				size = disk_sectors * 512;
1979			if (disk->d_consumer != NULL &&
1980			    disk->d_consumer->provider != NULL &&
1981			    disk->d_consumer->provider->sectorsize >
1982			     sectorsize) {
1983				sectorsize =
1984				    disk->d_consumer->provider->sectorsize;
1985			}
1986		}
1987
1988		/* Reserve some space for metadata. */
1989		size -= ((4096 + sectorsize - 1) / sectorsize) * sectorsize;
1990
1991		/* Decide insert before or after. */
1992		sd = &vol1->v_subdisks[0];
1993		if (sd->sd_offset >
1994		    size - (sd->sd_offset + sd->sd_size)) {
1995			off = 0;
1996			size = sd->sd_offset;
1997		} else {
1998			off = sd->sd_offset + sd->sd_size;
1999			size = size - (sd->sd_offset + sd->sd_size);
2000		}
2001
2002		/* Handle strip argument. */
2003		strip = 131072;
2004		len = sizeof(*striparg);
2005		striparg = gctl_get_param(req, "strip", &len);
2006		if (striparg != NULL && len == sizeof(*striparg) &&
2007		    *striparg > 0) {
2008			if (*striparg < sectorsize) {
2009				gctl_error(req, "Strip size too small.");
2010				return (-10);
2011			}
2012			if (*striparg % sectorsize != 0) {
2013				gctl_error(req, "Incorrect strip size.");
2014				return (-11);
2015			}
2016			if (strip > 65535 * sectorsize) {
2017				gctl_error(req, "Strip size too big.");
2018				return (-12);
2019			}
2020			strip = *striparg;
2021		}
2022
2023		/* Round offset up to strip. */
2024		if (off % strip != 0) {
2025			size -= strip - off % strip;
2026			off += strip - off % strip;
2027		}
2028
2029		/* Handle size argument. */
2030		len = sizeof(*sizearg);
2031		sizearg = gctl_get_param(req, "size", &len);
2032		if (sizearg != NULL && len == sizeof(*sizearg) &&
2033		    *sizearg > 0) {
2034			if (*sizearg > size) {
2035				gctl_error(req, "Size too big %lld > %lld.",
2036				    (long long)*sizearg, (long long)size);
2037				return (-9);
2038			}
2039			size = *sizearg;
2040		}
2041
2042		/* Round size down to strip or sector. */
2043		if (level == G_RAID_VOLUME_RL_RAID1)
2044			size -= (size % sectorsize);
2045		else
2046			size -= (size % strip);
2047		if (size <= 0) {
2048			gctl_error(req, "Size too small.");
2049			return (-13);
2050		}
2051		if (size > 0xffffffffllu * sectorsize) {
2052			gctl_error(req, "Size too big.");
2053			return (-14);
2054		}
2055
2056		/* We have all we need, create things: volume, ... */
2057		vol = g_raid_create_volume(sc, volname, -1);
2058		pv = malloc(sizeof(*pv), M_MD_INTEL, M_WAITOK | M_ZERO);
2059		pv->pv_volume_pos = i;
2060		vol->v_md_data = pv;
2061		vol->v_raid_level = level;
2062		vol->v_raid_level_qualifier = qual;
2063		vol->v_strip_size = strip;
2064		vol->v_disks_count = numdisks;
2065		if (level == G_RAID_VOLUME_RL_RAID0)
2066			vol->v_mediasize = size * numdisks;
2067		else if (level == G_RAID_VOLUME_RL_RAID1)
2068			vol->v_mediasize = size;
2069		else if (level == G_RAID_VOLUME_RL_RAID5)
2070			vol->v_mediasize = size * (numdisks - 1);
2071		else { /* RAID1E */
2072			vol->v_mediasize = ((size * numdisks) / strip / 2) *
2073			    strip;
2074		}
2075		vol->v_sectorsize = sectorsize;
2076		g_raid_start_volume(vol);
2077
2078		/* , and subdisks. */
2079		for (i = 0; i < numdisks; i++) {
2080			disk = vol1->v_subdisks[i].sd_disk;
2081			sd = &vol->v_subdisks[i];
2082			sd->sd_disk = disk;
2083			sd->sd_offset = off;
2084			sd->sd_size = size;
2085			TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next);
2086			if (disk->d_state == G_RAID_DISK_S_ACTIVE) {
2087				if (level == G_RAID_VOLUME_RL_RAID5)
2088					g_raid_change_subdisk_state(sd,
2089					    G_RAID_SUBDISK_S_UNINITIALIZED);
2090				else
2091					g_raid_change_subdisk_state(sd,
2092					    G_RAID_SUBDISK_S_ACTIVE);
2093				g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW,
2094				    G_RAID_EVENT_SUBDISK);
2095			}
2096		}
2097
2098		/* Write metadata based on created entities. */
2099		g_raid_md_write_intel(md, NULL, NULL, NULL);
2100
2101		g_raid_event_send(vol, G_RAID_VOLUME_E_START,
2102		    G_RAID_EVENT_VOLUME);
2103		return (0);
2104	}
2105	if (strcmp(verb, "delete") == 0) {
2106		nodename = gctl_get_asciiparam(req, "arg0");
2107		if (nodename != NULL && strcasecmp(sc->sc_name, nodename) != 0)
2108			nodename = NULL;
2109
2110		/* Full node destruction. */
2111		if (*nargs == 1 && nodename != NULL) {
2112			/* Check if some volume is still open. */
2113			force = gctl_get_paraml(req, "force", sizeof(*force));
2114			if (force != NULL && *force == 0 &&
2115			    g_raid_nopens(sc) != 0) {
2116				gctl_error(req, "Some volume is still open.");
2117				return (-4);
2118			}
2119
2120			TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
2121				if (disk->d_consumer)
2122					intel_meta_erase(disk->d_consumer);
2123			}
2124			g_raid_destroy_node(sc, 0);
2125			return (0);
2126		}
2127
2128		/* Destroy specified volume. If it was last - all node. */
2129		if (*nargs > 2) {
2130			gctl_error(req, "Invalid number of arguments.");
2131			return (-1);
2132		}
2133		volname = gctl_get_asciiparam(req,
2134		    nodename != NULL ? "arg1" : "arg0");
2135		if (volname == NULL) {
2136			gctl_error(req, "No volume name.");
2137			return (-2);
2138		}
2139
2140		/* Search for volume. */
2141		TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
2142			if (strcmp(vol->v_name, volname) == 0)
2143				break;
2144			pp = vol->v_provider;
2145			if (pp == NULL)
2146				continue;
2147			if (strcmp(pp->name, volname) == 0)
2148				break;
2149			if (strncmp(pp->name, "raid/", 5) == 0 &&
2150			    strcmp(pp->name + 5, volname) == 0)
2151				break;
2152		}
2153		if (vol == NULL) {
2154			i = strtol(volname, &tmp, 10);
2155			if (verb != volname && tmp[0] == 0) {
2156				TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
2157					if (vol->v_global_id == i)
2158						break;
2159				}
2160			}
2161		}
2162		if (vol == NULL) {
2163			gctl_error(req, "Volume '%s' not found.", volname);
2164			return (-3);
2165		}
2166
2167		/* Check if volume is still open. */
2168		force = gctl_get_paraml(req, "force", sizeof(*force));
2169		if (force != NULL && *force == 0 &&
2170		    vol->v_provider_open != 0) {
2171			gctl_error(req, "Volume is still open.");
2172			return (-4);
2173		}
2174
2175		/* Destroy volume and potentially node. */
2176		i = 0;
2177		TAILQ_FOREACH(vol1, &sc->sc_volumes, v_next)
2178			i++;
2179		if (i >= 2) {
2180			g_raid_destroy_volume(vol);
2181			g_raid_md_write_intel(md, NULL, NULL, NULL);
2182		} else {
2183			TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
2184				if (disk->d_consumer)
2185					intel_meta_erase(disk->d_consumer);
2186			}
2187			g_raid_destroy_node(sc, 0);
2188		}
2189		return (0);
2190	}
2191	if (strcmp(verb, "remove") == 0 ||
2192	    strcmp(verb, "fail") == 0) {
2193		if (*nargs < 2) {
2194			gctl_error(req, "Invalid number of arguments.");
2195			return (-1);
2196		}
2197		for (i = 1; i < *nargs; i++) {
2198			snprintf(arg, sizeof(arg), "arg%d", i);
2199			diskname = gctl_get_asciiparam(req, arg);
2200			if (diskname == NULL) {
2201				gctl_error(req, "No disk name (%s).", arg);
2202				error = -2;
2203				break;
2204			}
2205			if (strncmp(diskname, _PATH_DEV, 5) == 0)
2206				diskname += 5;
2207
2208			TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
2209				if (disk->d_consumer != NULL &&
2210				    disk->d_consumer->provider != NULL &&
2211				    strcmp(disk->d_consumer->provider->name,
2212				     diskname) == 0)
2213					break;
2214			}
2215			if (disk == NULL) {
2216				gctl_error(req, "Disk '%s' not found.",
2217				    diskname);
2218				error = -3;
2219				break;
2220			}
2221
2222			if (strcmp(verb, "fail") == 0) {
2223				g_raid_md_fail_disk_intel(md, NULL, disk);
2224				continue;
2225			}
2226
2227			pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
2228
2229			/* Erase metadata on deleting disk. */
2230			intel_meta_erase(disk->d_consumer);
2231
2232			/* If disk was assigned, just update statuses. */
2233			if (pd->pd_disk_pos >= 0) {
2234				g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE);
2235				g_raid_kill_consumer(sc, disk->d_consumer);
2236				disk->d_consumer = NULL;
2237				TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
2238					g_raid_change_subdisk_state(sd,
2239					    G_RAID_SUBDISK_S_NONE);
2240					g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED,
2241					    G_RAID_EVENT_SUBDISK);
2242				}
2243			} else {
2244				/* Otherwise -- delete. */
2245				g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE);
2246				g_raid_destroy_disk(disk);
2247			}
2248		}
2249
2250		/* Write updated metadata to remaining disks. */
2251		g_raid_md_write_intel(md, NULL, NULL, NULL);
2252
2253		/* Check if anything left except placeholders. */
2254		if (g_raid_ndisks(sc, -1) ==
2255		    g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE))
2256			g_raid_destroy_node(sc, 0);
2257		else
2258			g_raid_md_intel_refill(sc);
2259		return (error);
2260	}
2261	if (strcmp(verb, "insert") == 0) {
2262		if (*nargs < 2) {
2263			gctl_error(req, "Invalid number of arguments.");
2264			return (-1);
2265		}
2266		update = 0;
2267		for (i = 1; i < *nargs; i++) {
2268			/* Get disk name. */
2269			snprintf(arg, sizeof(arg), "arg%d", i);
2270			diskname = gctl_get_asciiparam(req, arg);
2271			if (diskname == NULL) {
2272				gctl_error(req, "No disk name (%s).", arg);
2273				error = -3;
2274				break;
2275			}
2276
2277			/* Try to find provider with specified name. */
2278			g_topology_lock();
2279			cp = g_raid_open_consumer(sc, diskname);
2280			if (cp == NULL) {
2281				gctl_error(req, "Can't open disk '%s'.",
2282				    diskname);
2283				g_topology_unlock();
2284				error = -4;
2285				break;
2286			}
2287			pp = cp->provider;
2288			g_topology_unlock();
2289
2290			/* Read disk serial. */
2291			error = g_raid_md_get_label(cp,
2292			    &serial[0], INTEL_SERIAL_LEN);
2293			if (error != 0) {
2294				gctl_error(req,
2295				    "Can't get serial for provider '%s'.",
2296				    diskname);
2297				g_raid_kill_consumer(sc, cp);
2298				error = -7;
2299				break;
2300			}
2301
2302			pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO);
2303			pd->pd_disk_pos = -1;
2304
2305			disk = g_raid_create_disk(sc);
2306			disk->d_consumer = cp;
2307			disk->d_md_data = (void *)pd;
2308			cp->private = disk;
2309
2310			g_raid_get_disk_info(disk);
2311
2312			memcpy(&pd->pd_disk_meta.serial[0], &serial[0],
2313			    INTEL_SERIAL_LEN);
2314			intel_set_disk_sectors(&pd->pd_disk_meta,
2315			    pp->mediasize / pp->sectorsize);
2316			pd->pd_disk_meta.id = 0;
2317			pd->pd_disk_meta.flags = INTEL_F_SPARE;
2318
2319			/* Welcome the "new" disk. */
2320			update += g_raid_md_intel_start_disk(disk);
2321			if (disk->d_state == G_RAID_DISK_S_SPARE) {
2322				intel_meta_write_spare(cp, &pd->pd_disk_meta);
2323				g_raid_destroy_disk(disk);
2324			} else if (disk->d_state != G_RAID_DISK_S_ACTIVE) {
2325				gctl_error(req, "Disk '%s' doesn't fit.",
2326				    diskname);
2327				g_raid_destroy_disk(disk);
2328				error = -8;
2329				break;
2330			}
2331		}
2332
2333		/* Write new metadata if we changed something. */
2334		if (update)
2335			g_raid_md_write_intel(md, NULL, NULL, NULL);
2336		return (error);
2337	}
2338	return (-100);
2339}
2340
2341static int
2342g_raid_md_write_intel(struct g_raid_md_object *md, struct g_raid_volume *tvol,
2343    struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk)
2344{
2345	struct g_raid_softc *sc;
2346	struct g_raid_volume *vol;
2347	struct g_raid_subdisk *sd;
2348	struct g_raid_disk *disk;
2349	struct g_raid_md_intel_object *mdi;
2350	struct g_raid_md_intel_pervolume *pv;
2351	struct g_raid_md_intel_perdisk *pd;
2352	struct intel_raid_conf *meta;
2353	struct intel_raid_vol *mvol;
2354	struct intel_raid_map *mmap0, *mmap1;
2355	off_t sectorsize = 512, pos;
2356	const char *version, *cv;
2357	int vi, sdi, numdisks, len, state, stale;
2358
2359	sc = md->mdo_softc;
2360	mdi = (struct g_raid_md_intel_object *)md;
2361
2362	if (sc->sc_stopping == G_RAID_DESTROY_HARD)
2363		return (0);
2364
2365	/* Bump generation. Newly written metadata may differ from previous. */
2366	mdi->mdio_generation++;
2367
2368	/* Count number of disks. */
2369	numdisks = 0;
2370	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
2371		pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
2372		if (pd->pd_disk_pos < 0)
2373			continue;
2374		numdisks++;
2375		if (disk->d_state == G_RAID_DISK_S_ACTIVE) {
2376			pd->pd_disk_meta.flags =
2377			    INTEL_F_ONLINE | INTEL_F_ASSIGNED;
2378		} else if (disk->d_state == G_RAID_DISK_S_FAILED) {
2379			pd->pd_disk_meta.flags = INTEL_F_FAILED |
2380			    INTEL_F_ASSIGNED;
2381		} else if (disk->d_state == G_RAID_DISK_S_DISABLED) {
2382			pd->pd_disk_meta.flags = INTEL_F_FAILED |
2383			    INTEL_F_ASSIGNED | INTEL_F_DISABLED;
2384		} else {
2385			if (!(pd->pd_disk_meta.flags & INTEL_F_DISABLED))
2386				pd->pd_disk_meta.flags = INTEL_F_ASSIGNED;
2387			if (pd->pd_disk_meta.id != 0xffffffff) {
2388				pd->pd_disk_meta.id = 0xffffffff;
2389				len = strlen(pd->pd_disk_meta.serial);
2390				len = min(len, INTEL_SERIAL_LEN - 3);
2391				strcpy(pd->pd_disk_meta.serial + len, ":0");
2392			}
2393		}
2394	}
2395
2396	/* Fill anchor and disks. */
2397	meta = malloc(INTEL_MAX_MD_SIZE(numdisks),
2398	    M_MD_INTEL, M_WAITOK | M_ZERO);
2399	memcpy(&meta->intel_id[0], INTEL_MAGIC, sizeof(INTEL_MAGIC) - 1);
2400	meta->config_size = INTEL_MAX_MD_SIZE(numdisks);
2401	meta->config_id = mdi->mdio_config_id;
2402	meta->orig_config_id = mdi->mdio_orig_config_id;
2403	meta->generation = mdi->mdio_generation;
2404	meta->attributes = INTEL_ATTR_CHECKSUM;
2405	meta->total_disks = numdisks;
2406	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
2407		pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
2408		if (pd->pd_disk_pos < 0)
2409			continue;
2410		meta->disk[pd->pd_disk_pos] = pd->pd_disk_meta;
2411		if (pd->pd_disk_meta.sectors_hi != 0)
2412			meta->attributes |= INTEL_ATTR_2TB_DISK;
2413	}
2414
2415	/* Fill volumes and maps. */
2416	vi = 0;
2417	version = INTEL_VERSION_1000;
2418	TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
2419		pv = vol->v_md_data;
2420		if (vol->v_stopping)
2421			continue;
2422		mvol = intel_get_volume(meta, vi);
2423
2424		/* New metadata may have different volumes order. */
2425		pv->pv_volume_pos = vi;
2426
2427		for (sdi = 0; sdi < vol->v_disks_count; sdi++) {
2428			sd = &vol->v_subdisks[sdi];
2429			if (sd->sd_disk != NULL)
2430				break;
2431		}
2432		if (sdi >= vol->v_disks_count)
2433			panic("No any filled subdisk in volume");
2434		if (vol->v_mediasize >= 0x20000000000llu)
2435			meta->attributes |= INTEL_ATTR_2TB;
2436		if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0)
2437			meta->attributes |= INTEL_ATTR_RAID0;
2438		else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1)
2439			meta->attributes |= INTEL_ATTR_RAID1;
2440		else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5)
2441			meta->attributes |= INTEL_ATTR_RAID5;
2442		else if ((vol->v_disks_count & 1) == 0)
2443			meta->attributes |= INTEL_ATTR_RAID10;
2444		else
2445			meta->attributes |= INTEL_ATTR_RAID1E;
2446		if (pv->pv_cng)
2447			meta->attributes |= INTEL_ATTR_RAIDCNG;
2448		if (vol->v_strip_size > 131072)
2449			meta->attributes |= INTEL_ATTR_EXT_STRIP;
2450
2451		if (pv->pv_cng)
2452			cv = INTEL_VERSION_1206;
2453		else if (vol->v_disks_count > 4)
2454			cv = INTEL_VERSION_1204;
2455		else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5)
2456			cv = INTEL_VERSION_1202;
2457		else if (vol->v_disks_count > 2)
2458			cv = INTEL_VERSION_1201;
2459		else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1)
2460			cv = INTEL_VERSION_1100;
2461		else
2462			cv = INTEL_VERSION_1000;
2463		if (strcmp(cv, version) > 0)
2464			version = cv;
2465
2466		strlcpy(&mvol->name[0], vol->v_name, sizeof(mvol->name));
2467		mvol->total_sectors = vol->v_mediasize / sectorsize;
2468		mvol->state = (INTEL_ST_READ_COALESCING |
2469		    INTEL_ST_WRITE_COALESCING);
2470		mvol->tid = vol->v_global_id + 1;
2471		if (pv->pv_cng) {
2472			mvol->state |= INTEL_ST_CLONE_N_GO;
2473			if (pv->pv_cng_man_sync)
2474				mvol->state |= INTEL_ST_CLONE_MAN_SYNC;
2475			mvol->cng_master_disk = pv->pv_cng_master_disk;
2476			if (vol->v_subdisks[pv->pv_cng_master_disk].sd_state ==
2477			    G_RAID_SUBDISK_S_NONE)
2478				mvol->cng_state = INTEL_CNGST_MASTER_MISSING;
2479			else if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL)
2480				mvol->cng_state = INTEL_CNGST_NEEDS_UPDATE;
2481			else
2482				mvol->cng_state = INTEL_CNGST_UPDATED;
2483		}
2484
2485		/* Check for any recovery in progress. */
2486		state = G_RAID_SUBDISK_S_ACTIVE;
2487		pos = 0x7fffffffffffffffllu;
2488		stale = 0;
2489		for (sdi = 0; sdi < vol->v_disks_count; sdi++) {
2490			sd = &vol->v_subdisks[sdi];
2491			if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD)
2492				state = G_RAID_SUBDISK_S_REBUILD;
2493			else if (sd->sd_state == G_RAID_SUBDISK_S_RESYNC &&
2494			    state != G_RAID_SUBDISK_S_REBUILD)
2495				state = G_RAID_SUBDISK_S_RESYNC;
2496			else if (sd->sd_state == G_RAID_SUBDISK_S_STALE)
2497				stale = 1;
2498			if ((sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
2499			    sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
2500			     sd->sd_rebuild_pos < pos)
2501			        pos = sd->sd_rebuild_pos;
2502		}
2503		if (state == G_RAID_SUBDISK_S_REBUILD) {
2504			mvol->migr_state = 1;
2505			mvol->migr_type = INTEL_MT_REBUILD;
2506		} else if (state == G_RAID_SUBDISK_S_RESYNC) {
2507			mvol->migr_state = 1;
2508			/* mvol->migr_type = INTEL_MT_REPAIR; */
2509			mvol->migr_type = INTEL_MT_VERIFY;
2510			mvol->state |= INTEL_ST_VERIFY_AND_FIX;
2511		} else
2512			mvol->migr_state = 0;
2513		mvol->dirty = (vol->v_dirty || stale);
2514
2515		mmap0 = intel_get_map(mvol, 0);
2516
2517		/* Write map / common part of two maps. */
2518		intel_set_map_offset(mmap0, sd->sd_offset / sectorsize);
2519		intel_set_map_disk_sectors(mmap0, sd->sd_size / sectorsize);
2520		mmap0->strip_sectors = vol->v_strip_size / sectorsize;
2521		if (vol->v_state == G_RAID_VOLUME_S_BROKEN)
2522			mmap0->status = INTEL_S_FAILURE;
2523		else if (vol->v_state == G_RAID_VOLUME_S_DEGRADED)
2524			mmap0->status = INTEL_S_DEGRADED;
2525		else if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED)
2526		    == g_raid_nsubdisks(vol, -1))
2527			mmap0->status = INTEL_S_UNINITIALIZED;
2528		else
2529			mmap0->status = INTEL_S_READY;
2530		if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0)
2531			mmap0->type = INTEL_T_RAID0;
2532		else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
2533		    vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E)
2534			mmap0->type = INTEL_T_RAID1;
2535		else
2536			mmap0->type = INTEL_T_RAID5;
2537		mmap0->total_disks = vol->v_disks_count;
2538		if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1)
2539			mmap0->total_domains = vol->v_disks_count;
2540		else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E)
2541			mmap0->total_domains = 2;
2542		else
2543			mmap0->total_domains = 1;
2544		intel_set_map_stripe_count(mmap0,
2545		    sd->sd_size / vol->v_strip_size / mmap0->total_domains);
2546		mmap0->failed_disk_num = 0xff;
2547		mmap0->ddf = 1;
2548
2549		/* If there are two maps - copy common and update. */
2550		if (mvol->migr_state) {
2551			intel_set_vol_curr_migr_unit(mvol,
2552			    pos / vol->v_strip_size / mmap0->total_domains);
2553			mmap1 = intel_get_map(mvol, 1);
2554			memcpy(mmap1, mmap0, sizeof(struct intel_raid_map));
2555			mmap0->status = INTEL_S_READY;
2556		} else
2557			mmap1 = NULL;
2558
2559		/* Write disk indexes and put rebuild flags. */
2560		for (sdi = 0; sdi < vol->v_disks_count; sdi++) {
2561			sd = &vol->v_subdisks[sdi];
2562			pd = (struct g_raid_md_intel_perdisk *)
2563			    sd->sd_disk->d_md_data;
2564			mmap0->disk_idx[sdi] = pd->pd_disk_pos;
2565			if (mvol->migr_state)
2566				mmap1->disk_idx[sdi] = pd->pd_disk_pos;
2567			if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
2568			    sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
2569				mmap1->disk_idx[sdi] |= INTEL_DI_RBLD;
2570			} else if (sd->sd_state != G_RAID_SUBDISK_S_ACTIVE &&
2571			    sd->sd_state != G_RAID_SUBDISK_S_STALE &&
2572			    sd->sd_state != G_RAID_SUBDISK_S_UNINITIALIZED) {
2573				mmap0->disk_idx[sdi] |= INTEL_DI_RBLD;
2574				if (mvol->migr_state)
2575					mmap1->disk_idx[sdi] |= INTEL_DI_RBLD;
2576			}
2577			if ((sd->sd_state == G_RAID_SUBDISK_S_NONE ||
2578			     sd->sd_state == G_RAID_SUBDISK_S_FAILED ||
2579			     sd->sd_state == G_RAID_SUBDISK_S_REBUILD) &&
2580			    mmap0->failed_disk_num == 0xff) {
2581				mmap0->failed_disk_num = sdi;
2582				if (mvol->migr_state)
2583					mmap1->failed_disk_num = sdi;
2584			}
2585		}
2586		vi++;
2587	}
2588	meta->total_volumes = vi;
2589	if (vi > 1 || meta->attributes &
2590	     (INTEL_ATTR_EXT_STRIP | INTEL_ATTR_2TB_DISK | INTEL_ATTR_2TB))
2591		version = INTEL_VERSION_1300;
2592	if (strcmp(version, INTEL_VERSION_1300) < 0)
2593		meta->attributes &= INTEL_ATTR_CHECKSUM;
2594	memcpy(&meta->version[0], version, sizeof(INTEL_VERSION_1000) - 1);
2595
2596	/* We are done. Print meta data and store them to disks. */
2597	g_raid_md_intel_print(meta);
2598	if (mdi->mdio_meta != NULL)
2599		free(mdi->mdio_meta, M_MD_INTEL);
2600	mdi->mdio_meta = meta;
2601	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
2602		pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
2603		if (disk->d_state != G_RAID_DISK_S_ACTIVE)
2604			continue;
2605		if (pd->pd_meta != NULL) {
2606			free(pd->pd_meta, M_MD_INTEL);
2607			pd->pd_meta = NULL;
2608		}
2609		pd->pd_meta = intel_meta_copy(meta);
2610		intel_meta_write(disk->d_consumer, meta);
2611	}
2612	return (0);
2613}
2614
2615static int
2616g_raid_md_fail_disk_intel(struct g_raid_md_object *md,
2617    struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk)
2618{
2619	struct g_raid_softc *sc;
2620	struct g_raid_md_intel_object *mdi;
2621	struct g_raid_md_intel_perdisk *pd;
2622	struct g_raid_subdisk *sd;
2623
2624	sc = md->mdo_softc;
2625	mdi = (struct g_raid_md_intel_object *)md;
2626	pd = (struct g_raid_md_intel_perdisk *)tdisk->d_md_data;
2627
2628	/* We can't fail disk that is not a part of array now. */
2629	if (pd->pd_disk_pos < 0)
2630		return (-1);
2631
2632	/*
2633	 * Mark disk as failed in metadata and try to write that metadata
2634	 * to the disk itself to prevent it's later resurrection as STALE.
2635	 */
2636	mdi->mdio_meta->disk[pd->pd_disk_pos].flags = INTEL_F_FAILED;
2637	pd->pd_disk_meta.flags = INTEL_F_FAILED;
2638	g_raid_md_intel_print(mdi->mdio_meta);
2639	if (tdisk->d_consumer)
2640		intel_meta_write(tdisk->d_consumer, mdi->mdio_meta);
2641
2642	/* Change states. */
2643	g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED);
2644	TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) {
2645		g_raid_change_subdisk_state(sd,
2646		    G_RAID_SUBDISK_S_FAILED);
2647		g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED,
2648		    G_RAID_EVENT_SUBDISK);
2649	}
2650
2651	/* Write updated metadata to remaining disks. */
2652	g_raid_md_write_intel(md, NULL, NULL, tdisk);
2653
2654	/* Check if anything left except placeholders. */
2655	if (g_raid_ndisks(sc, -1) ==
2656	    g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE))
2657		g_raid_destroy_node(sc, 0);
2658	else
2659		g_raid_md_intel_refill(sc);
2660	return (0);
2661}
2662
2663static int
2664g_raid_md_free_disk_intel(struct g_raid_md_object *md,
2665    struct g_raid_disk *disk)
2666{
2667	struct g_raid_md_intel_perdisk *pd;
2668
2669	pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data;
2670	if (pd->pd_meta != NULL) {
2671		free(pd->pd_meta, M_MD_INTEL);
2672		pd->pd_meta = NULL;
2673	}
2674	free(pd, M_MD_INTEL);
2675	disk->d_md_data = NULL;
2676	return (0);
2677}
2678
2679static int
2680g_raid_md_free_volume_intel(struct g_raid_md_object *md,
2681    struct g_raid_volume *vol)
2682{
2683	struct g_raid_md_intel_pervolume *pv;
2684
2685	pv = (struct g_raid_md_intel_pervolume *)vol->v_md_data;
2686	free(pv, M_MD_INTEL);
2687	vol->v_md_data = NULL;
2688	return (0);
2689}
2690
2691static int
2692g_raid_md_free_intel(struct g_raid_md_object *md)
2693{
2694	struct g_raid_md_intel_object *mdi;
2695
2696	mdi = (struct g_raid_md_intel_object *)md;
2697	if (!mdi->mdio_started) {
2698		mdi->mdio_started = 0;
2699		callout_stop(&mdi->mdio_start_co);
2700		G_RAID_DEBUG1(1, md->mdo_softc,
2701		    "root_mount_rel %p", mdi->mdio_rootmount);
2702		root_mount_rel(mdi->mdio_rootmount);
2703		mdi->mdio_rootmount = NULL;
2704	}
2705	if (mdi->mdio_meta != NULL) {
2706		free(mdi->mdio_meta, M_MD_INTEL);
2707		mdi->mdio_meta = NULL;
2708	}
2709	return (0);
2710}
2711
/*
 * NOTE(review): presumably registers this metadata module with the GEOM RAID
 * core under the name "Intel" -- confirm against the definition of
 * G_RAID_MD_DECLARE in g_raid.h.
 */
G_RAID_MD_DECLARE(intel, "Intel");
2713