1/*-
2 * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD: stable/10/sys/geom/raid/g_raid.c 265669 2014-05-08 12:07:40Z mav $");
29
30#include <sys/param.h>
31#include <sys/systm.h>
32#include <sys/kernel.h>
33#include <sys/module.h>
34#include <sys/limits.h>
35#include <sys/lock.h>
36#include <sys/mutex.h>
37#include <sys/bio.h>
38#include <sys/sbuf.h>
39#include <sys/sysctl.h>
40#include <sys/malloc.h>
41#include <sys/eventhandler.h>
42#include <vm/uma.h>
43#include <geom/geom.h>
44#include <sys/proc.h>
45#include <sys/kthread.h>
46#include <sys/sched.h>
47#include <geom/raid/g_raid.h>
48#include "g_raid_md_if.h"
49#include "g_raid_tr_if.h"
50
51static MALLOC_DEFINE(M_RAID, "raid_data", "GEOM_RAID Data");
52
53SYSCTL_DECL(_kern_geom);
54SYSCTL_NODE(_kern_geom, OID_AUTO, raid, CTLFLAG_RW, 0, "GEOM_RAID stuff");
55int g_raid_enable = 1;
56TUNABLE_INT("kern.geom.raid.enable", &g_raid_enable);
57SYSCTL_INT(_kern_geom_raid, OID_AUTO, enable, CTLFLAG_RW,
58    &g_raid_enable, 0, "Enable on-disk metadata taste");
59u_int g_raid_aggressive_spare = 0;
60TUNABLE_INT("kern.geom.raid.aggressive_spare", &g_raid_aggressive_spare);
61SYSCTL_UINT(_kern_geom_raid, OID_AUTO, aggressive_spare, CTLFLAG_RW,
62    &g_raid_aggressive_spare, 0, "Use disks without metadata as spare");
63u_int g_raid_debug = 0;
64TUNABLE_INT("kern.geom.raid.debug", &g_raid_debug);
65SYSCTL_UINT(_kern_geom_raid, OID_AUTO, debug, CTLFLAG_RW, &g_raid_debug, 0,
66    "Debug level");
67int g_raid_read_err_thresh = 10;
68TUNABLE_INT("kern.geom.raid.read_err_thresh", &g_raid_read_err_thresh);
69SYSCTL_UINT(_kern_geom_raid, OID_AUTO, read_err_thresh, CTLFLAG_RW,
70    &g_raid_read_err_thresh, 0,
71    "Number of read errors equated to disk failure");
72u_int g_raid_start_timeout = 30;
73TUNABLE_INT("kern.geom.raid.start_timeout", &g_raid_start_timeout);
74SYSCTL_UINT(_kern_geom_raid, OID_AUTO, start_timeout, CTLFLAG_RW,
75    &g_raid_start_timeout, 0,
76    "Time to wait for all array components");
77static u_int g_raid_clean_time = 5;
78TUNABLE_INT("kern.geom.raid.clean_time", &g_raid_clean_time);
79SYSCTL_UINT(_kern_geom_raid, OID_AUTO, clean_time, CTLFLAG_RW,
80    &g_raid_clean_time, 0, "Mark volume as clean when idling");
81static u_int g_raid_disconnect_on_failure = 1;
82TUNABLE_INT("kern.geom.raid.disconnect_on_failure",
83    &g_raid_disconnect_on_failure);
84SYSCTL_UINT(_kern_geom_raid, OID_AUTO, disconnect_on_failure, CTLFLAG_RW,
85    &g_raid_disconnect_on_failure, 0, "Disconnect component on I/O failure.");
86static u_int g_raid_name_format = 0;
87TUNABLE_INT("kern.geom.raid.name_format", &g_raid_name_format);
88SYSCTL_UINT(_kern_geom_raid, OID_AUTO, name_format, CTLFLAG_RW,
89    &g_raid_name_format, 0, "Provider name format.");
90static u_int g_raid_idle_threshold = 1000000;
91TUNABLE_INT("kern.geom.raid.idle_threshold", &g_raid_idle_threshold);
92SYSCTL_UINT(_kern_geom_raid, OID_AUTO, idle_threshold, CTLFLAG_RW,
93    &g_raid_idle_threshold, 1000000,
94    "Time in microseconds to consider a volume idle.");
95static u_int ar_legacy_aliases = 1;
96TUNABLE_INT("kern.geom.raid.legacy_aliases", &ar_legacy_aliases);
97SYSCTL_UINT(_kern_geom_raid, OID_AUTO, legacy_aliases, CTLFLAG_RW,
98    &ar_legacy_aliases, 0, "Create legacy ataraid(4)-style device aliases.");
99
100
101#define	MSLEEP(rv, ident, mtx, priority, wmesg, timeout)	do {	\
102	G_RAID_DEBUG(4, "%s: Sleeping %p.", __func__, (ident));		\
103	rv = msleep((ident), (mtx), (priority), (wmesg), (timeout));	\
104	G_RAID_DEBUG(4, "%s: Woken up %p.", __func__, (ident));		\
105} while (0)
106
107LIST_HEAD(, g_raid_md_class) g_raid_md_classes =
108    LIST_HEAD_INITIALIZER(g_raid_md_classes);
109
110LIST_HEAD(, g_raid_tr_class) g_raid_tr_classes =
111    LIST_HEAD_INITIALIZER(g_raid_tr_classes);
112
113LIST_HEAD(, g_raid_volume) g_raid_volumes =
114    LIST_HEAD_INITIALIZER(g_raid_volumes);
115
116static eventhandler_tag g_raid_post_sync = NULL;
117static int g_raid_started = 0;
118static int g_raid_shutdown = 0;
119
120static int g_raid_destroy_geom(struct gctl_req *req, struct g_class *mp,
121    struct g_geom *gp);
122static g_taste_t g_raid_taste;
123static void g_raid_init(struct g_class *mp);
124static void g_raid_fini(struct g_class *mp);
125
126struct g_class g_raid_class = {
127	.name = G_RAID_CLASS_NAME,
128	.version = G_VERSION,
129	.ctlreq = g_raid_ctl,
130	.taste = g_raid_taste,
131	.destroy_geom = g_raid_destroy_geom,
132	.init = g_raid_init,
133	.fini = g_raid_fini
134};
135
136static void g_raid_destroy_provider(struct g_raid_volume *vol);
137static int g_raid_update_disk(struct g_raid_disk *disk, u_int event);
138static int g_raid_update_subdisk(struct g_raid_subdisk *subdisk, u_int event);
139static int g_raid_update_volume(struct g_raid_volume *vol, u_int event);
140static int g_raid_update_node(struct g_raid_softc *sc, u_int event);
141static void g_raid_dumpconf(struct sbuf *sb, const char *indent,
142    struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
143static void g_raid_start(struct bio *bp);
144static void g_raid_start_request(struct bio *bp);
145static void g_raid_disk_done(struct bio *bp);
146static void g_raid_poll(struct g_raid_softc *sc);
147
148static const char *
149g_raid_node_event2str(int event)
150{
151
152	switch (event) {
153	case G_RAID_NODE_E_WAKE:
154		return ("WAKE");
155	case G_RAID_NODE_E_START:
156		return ("START");
157	default:
158		return ("INVALID");
159	}
160}
161
162const char *
163g_raid_disk_state2str(int state)
164{
165
166	switch (state) {
167	case G_RAID_DISK_S_NONE:
168		return ("NONE");
169	case G_RAID_DISK_S_OFFLINE:
170		return ("OFFLINE");
171	case G_RAID_DISK_S_DISABLED:
172		return ("DISABLED");
173	case G_RAID_DISK_S_FAILED:
174		return ("FAILED");
175	case G_RAID_DISK_S_STALE_FAILED:
176		return ("STALE_FAILED");
177	case G_RAID_DISK_S_SPARE:
178		return ("SPARE");
179	case G_RAID_DISK_S_STALE:
180		return ("STALE");
181	case G_RAID_DISK_S_ACTIVE:
182		return ("ACTIVE");
183	default:
184		return ("INVALID");
185	}
186}
187
188static const char *
189g_raid_disk_event2str(int event)
190{
191
192	switch (event) {
193	case G_RAID_DISK_E_DISCONNECTED:
194		return ("DISCONNECTED");
195	default:
196		return ("INVALID");
197	}
198}
199
200const char *
201g_raid_subdisk_state2str(int state)
202{
203
204	switch (state) {
205	case G_RAID_SUBDISK_S_NONE:
206		return ("NONE");
207	case G_RAID_SUBDISK_S_FAILED:
208		return ("FAILED");
209	case G_RAID_SUBDISK_S_NEW:
210		return ("NEW");
211	case G_RAID_SUBDISK_S_REBUILD:
212		return ("REBUILD");
213	case G_RAID_SUBDISK_S_UNINITIALIZED:
214		return ("UNINITIALIZED");
215	case G_RAID_SUBDISK_S_STALE:
216		return ("STALE");
217	case G_RAID_SUBDISK_S_RESYNC:
218		return ("RESYNC");
219	case G_RAID_SUBDISK_S_ACTIVE:
220		return ("ACTIVE");
221	default:
222		return ("INVALID");
223	}
224}
225
226static const char *
227g_raid_subdisk_event2str(int event)
228{
229
230	switch (event) {
231	case G_RAID_SUBDISK_E_NEW:
232		return ("NEW");
233	case G_RAID_SUBDISK_E_FAILED:
234		return ("FAILED");
235	case G_RAID_SUBDISK_E_DISCONNECTED:
236		return ("DISCONNECTED");
237	default:
238		return ("INVALID");
239	}
240}
241
242const char *
243g_raid_volume_state2str(int state)
244{
245
246	switch (state) {
247	case G_RAID_VOLUME_S_STARTING:
248		return ("STARTING");
249	case G_RAID_VOLUME_S_BROKEN:
250		return ("BROKEN");
251	case G_RAID_VOLUME_S_DEGRADED:
252		return ("DEGRADED");
253	case G_RAID_VOLUME_S_SUBOPTIMAL:
254		return ("SUBOPTIMAL");
255	case G_RAID_VOLUME_S_OPTIMAL:
256		return ("OPTIMAL");
257	case G_RAID_VOLUME_S_UNSUPPORTED:
258		return ("UNSUPPORTED");
259	case G_RAID_VOLUME_S_STOPPED:
260		return ("STOPPED");
261	default:
262		return ("INVALID");
263	}
264}
265
266static const char *
267g_raid_volume_event2str(int event)
268{
269
270	switch (event) {
271	case G_RAID_VOLUME_E_UP:
272		return ("UP");
273	case G_RAID_VOLUME_E_DOWN:
274		return ("DOWN");
275	case G_RAID_VOLUME_E_START:
276		return ("START");
277	case G_RAID_VOLUME_E_STARTMD:
278		return ("STARTMD");
279	default:
280		return ("INVALID");
281	}
282}
283
284const char *
285g_raid_volume_level2str(int level, int qual)
286{
287
288	switch (level) {
289	case G_RAID_VOLUME_RL_RAID0:
290		return ("RAID0");
291	case G_RAID_VOLUME_RL_RAID1:
292		return ("RAID1");
293	case G_RAID_VOLUME_RL_RAID3:
294		if (qual == G_RAID_VOLUME_RLQ_R3P0)
295			return ("RAID3-P0");
296		if (qual == G_RAID_VOLUME_RLQ_R3PN)
297			return ("RAID3-PN");
298		return ("RAID3");
299	case G_RAID_VOLUME_RL_RAID4:
300		if (qual == G_RAID_VOLUME_RLQ_R4P0)
301			return ("RAID4-P0");
302		if (qual == G_RAID_VOLUME_RLQ_R4PN)
303			return ("RAID4-PN");
304		return ("RAID4");
305	case G_RAID_VOLUME_RL_RAID5:
306		if (qual == G_RAID_VOLUME_RLQ_R5RA)
307			return ("RAID5-RA");
308		if (qual == G_RAID_VOLUME_RLQ_R5RS)
309			return ("RAID5-RS");
310		if (qual == G_RAID_VOLUME_RLQ_R5LA)
311			return ("RAID5-LA");
312		if (qual == G_RAID_VOLUME_RLQ_R5LS)
313			return ("RAID5-LS");
314		return ("RAID5");
315	case G_RAID_VOLUME_RL_RAID6:
316		if (qual == G_RAID_VOLUME_RLQ_R6RA)
317			return ("RAID6-RA");
318		if (qual == G_RAID_VOLUME_RLQ_R6RS)
319			return ("RAID6-RS");
320		if (qual == G_RAID_VOLUME_RLQ_R6LA)
321			return ("RAID6-LA");
322		if (qual == G_RAID_VOLUME_RLQ_R6LS)
323			return ("RAID6-LS");
324		return ("RAID6");
325	case G_RAID_VOLUME_RL_RAIDMDF:
326		if (qual == G_RAID_VOLUME_RLQ_RMDFRA)
327			return ("RAIDMDF-RA");
328		if (qual == G_RAID_VOLUME_RLQ_RMDFRS)
329			return ("RAIDMDF-RS");
330		if (qual == G_RAID_VOLUME_RLQ_RMDFLA)
331			return ("RAIDMDF-LA");
332		if (qual == G_RAID_VOLUME_RLQ_RMDFLS)
333			return ("RAIDMDF-LS");
334		return ("RAIDMDF");
335	case G_RAID_VOLUME_RL_RAID1E:
336		if (qual == G_RAID_VOLUME_RLQ_R1EA)
337			return ("RAID1E-A");
338		if (qual == G_RAID_VOLUME_RLQ_R1EO)
339			return ("RAID1E-O");
340		return ("RAID1E");
341	case G_RAID_VOLUME_RL_SINGLE:
342		return ("SINGLE");
343	case G_RAID_VOLUME_RL_CONCAT:
344		return ("CONCAT");
345	case G_RAID_VOLUME_RL_RAID5E:
346		if (qual == G_RAID_VOLUME_RLQ_R5ERA)
347			return ("RAID5E-RA");
348		if (qual == G_RAID_VOLUME_RLQ_R5ERS)
349			return ("RAID5E-RS");
350		if (qual == G_RAID_VOLUME_RLQ_R5ELA)
351			return ("RAID5E-LA");
352		if (qual == G_RAID_VOLUME_RLQ_R5ELS)
353			return ("RAID5E-LS");
354		return ("RAID5E");
355	case G_RAID_VOLUME_RL_RAID5EE:
356		if (qual == G_RAID_VOLUME_RLQ_R5EERA)
357			return ("RAID5EE-RA");
358		if (qual == G_RAID_VOLUME_RLQ_R5EERS)
359			return ("RAID5EE-RS");
360		if (qual == G_RAID_VOLUME_RLQ_R5EELA)
361			return ("RAID5EE-LA");
362		if (qual == G_RAID_VOLUME_RLQ_R5EELS)
363			return ("RAID5EE-LS");
364		return ("RAID5EE");
365	case G_RAID_VOLUME_RL_RAID5R:
366		if (qual == G_RAID_VOLUME_RLQ_R5RRA)
367			return ("RAID5R-RA");
368		if (qual == G_RAID_VOLUME_RLQ_R5RRS)
369			return ("RAID5R-RS");
370		if (qual == G_RAID_VOLUME_RLQ_R5RLA)
371			return ("RAID5R-LA");
372		if (qual == G_RAID_VOLUME_RLQ_R5RLS)
373			return ("RAID5R-LS");
374		return ("RAID5R");
375	default:
376		return ("UNKNOWN");
377	}
378}
379
380int
381g_raid_volume_str2level(const char *str, int *level, int *qual)
382{
383
384	*level = G_RAID_VOLUME_RL_UNKNOWN;
385	*qual = G_RAID_VOLUME_RLQ_NONE;
386	if (strcasecmp(str, "RAID0") == 0)
387		*level = G_RAID_VOLUME_RL_RAID0;
388	else if (strcasecmp(str, "RAID1") == 0)
389		*level = G_RAID_VOLUME_RL_RAID1;
390	else if (strcasecmp(str, "RAID3-P0") == 0) {
391		*level = G_RAID_VOLUME_RL_RAID3;
392		*qual = G_RAID_VOLUME_RLQ_R3P0;
393	} else if (strcasecmp(str, "RAID3-PN") == 0 ||
394		   strcasecmp(str, "RAID3") == 0) {
395		*level = G_RAID_VOLUME_RL_RAID3;
396		*qual = G_RAID_VOLUME_RLQ_R3PN;
397	} else if (strcasecmp(str, "RAID4-P0") == 0) {
398		*level = G_RAID_VOLUME_RL_RAID4;
399		*qual = G_RAID_VOLUME_RLQ_R4P0;
400	} else if (strcasecmp(str, "RAID4-PN") == 0 ||
401		   strcasecmp(str, "RAID4") == 0) {
402		*level = G_RAID_VOLUME_RL_RAID4;
403		*qual = G_RAID_VOLUME_RLQ_R4PN;
404	} else if (strcasecmp(str, "RAID5-RA") == 0) {
405		*level = G_RAID_VOLUME_RL_RAID5;
406		*qual = G_RAID_VOLUME_RLQ_R5RA;
407	} else if (strcasecmp(str, "RAID5-RS") == 0) {
408		*level = G_RAID_VOLUME_RL_RAID5;
409		*qual = G_RAID_VOLUME_RLQ_R5RS;
410	} else if (strcasecmp(str, "RAID5") == 0 ||
411		   strcasecmp(str, "RAID5-LA") == 0) {
412		*level = G_RAID_VOLUME_RL_RAID5;
413		*qual = G_RAID_VOLUME_RLQ_R5LA;
414	} else if (strcasecmp(str, "RAID5-LS") == 0) {
415		*level = G_RAID_VOLUME_RL_RAID5;
416		*qual = G_RAID_VOLUME_RLQ_R5LS;
417	} else if (strcasecmp(str, "RAID6-RA") == 0) {
418		*level = G_RAID_VOLUME_RL_RAID6;
419		*qual = G_RAID_VOLUME_RLQ_R6RA;
420	} else if (strcasecmp(str, "RAID6-RS") == 0) {
421		*level = G_RAID_VOLUME_RL_RAID6;
422		*qual = G_RAID_VOLUME_RLQ_R6RS;
423	} else if (strcasecmp(str, "RAID6") == 0 ||
424		   strcasecmp(str, "RAID6-LA") == 0) {
425		*level = G_RAID_VOLUME_RL_RAID6;
426		*qual = G_RAID_VOLUME_RLQ_R6LA;
427	} else if (strcasecmp(str, "RAID6-LS") == 0) {
428		*level = G_RAID_VOLUME_RL_RAID6;
429		*qual = G_RAID_VOLUME_RLQ_R6LS;
430	} else if (strcasecmp(str, "RAIDMDF-RA") == 0) {
431		*level = G_RAID_VOLUME_RL_RAIDMDF;
432		*qual = G_RAID_VOLUME_RLQ_RMDFRA;
433	} else if (strcasecmp(str, "RAIDMDF-RS") == 0) {
434		*level = G_RAID_VOLUME_RL_RAIDMDF;
435		*qual = G_RAID_VOLUME_RLQ_RMDFRS;
436	} else if (strcasecmp(str, "RAIDMDF") == 0 ||
437		   strcasecmp(str, "RAIDMDF-LA") == 0) {
438		*level = G_RAID_VOLUME_RL_RAIDMDF;
439		*qual = G_RAID_VOLUME_RLQ_RMDFLA;
440	} else if (strcasecmp(str, "RAIDMDF-LS") == 0) {
441		*level = G_RAID_VOLUME_RL_RAIDMDF;
442		*qual = G_RAID_VOLUME_RLQ_RMDFLS;
443	} else if (strcasecmp(str, "RAID10") == 0 ||
444		   strcasecmp(str, "RAID1E") == 0 ||
445		   strcasecmp(str, "RAID1E-A") == 0) {
446		*level = G_RAID_VOLUME_RL_RAID1E;
447		*qual = G_RAID_VOLUME_RLQ_R1EA;
448	} else if (strcasecmp(str, "RAID1E-O") == 0) {
449		*level = G_RAID_VOLUME_RL_RAID1E;
450		*qual = G_RAID_VOLUME_RLQ_R1EO;
451	} else if (strcasecmp(str, "SINGLE") == 0)
452		*level = G_RAID_VOLUME_RL_SINGLE;
453	else if (strcasecmp(str, "CONCAT") == 0)
454		*level = G_RAID_VOLUME_RL_CONCAT;
455	else if (strcasecmp(str, "RAID5E-RA") == 0) {
456		*level = G_RAID_VOLUME_RL_RAID5E;
457		*qual = G_RAID_VOLUME_RLQ_R5ERA;
458	} else if (strcasecmp(str, "RAID5E-RS") == 0) {
459		*level = G_RAID_VOLUME_RL_RAID5E;
460		*qual = G_RAID_VOLUME_RLQ_R5ERS;
461	} else if (strcasecmp(str, "RAID5E") == 0 ||
462		   strcasecmp(str, "RAID5E-LA") == 0) {
463		*level = G_RAID_VOLUME_RL_RAID5E;
464		*qual = G_RAID_VOLUME_RLQ_R5ELA;
465	} else if (strcasecmp(str, "RAID5E-LS") == 0) {
466		*level = G_RAID_VOLUME_RL_RAID5E;
467		*qual = G_RAID_VOLUME_RLQ_R5ELS;
468	} else if (strcasecmp(str, "RAID5EE-RA") == 0) {
469		*level = G_RAID_VOLUME_RL_RAID5EE;
470		*qual = G_RAID_VOLUME_RLQ_R5EERA;
471	} else if (strcasecmp(str, "RAID5EE-RS") == 0) {
472		*level = G_RAID_VOLUME_RL_RAID5EE;
473		*qual = G_RAID_VOLUME_RLQ_R5EERS;
474	} else if (strcasecmp(str, "RAID5EE") == 0 ||
475		   strcasecmp(str, "RAID5EE-LA") == 0) {
476		*level = G_RAID_VOLUME_RL_RAID5EE;
477		*qual = G_RAID_VOLUME_RLQ_R5EELA;
478	} else if (strcasecmp(str, "RAID5EE-LS") == 0) {
479		*level = G_RAID_VOLUME_RL_RAID5EE;
480		*qual = G_RAID_VOLUME_RLQ_R5EELS;
481	} else if (strcasecmp(str, "RAID5R-RA") == 0) {
482		*level = G_RAID_VOLUME_RL_RAID5R;
483		*qual = G_RAID_VOLUME_RLQ_R5RRA;
484	} else if (strcasecmp(str, "RAID5R-RS") == 0) {
485		*level = G_RAID_VOLUME_RL_RAID5R;
486		*qual = G_RAID_VOLUME_RLQ_R5RRS;
487	} else if (strcasecmp(str, "RAID5R") == 0 ||
488		   strcasecmp(str, "RAID5R-LA") == 0) {
489		*level = G_RAID_VOLUME_RL_RAID5R;
490		*qual = G_RAID_VOLUME_RLQ_R5RLA;
491	} else if (strcasecmp(str, "RAID5R-LS") == 0) {
492		*level = G_RAID_VOLUME_RL_RAID5R;
493		*qual = G_RAID_VOLUME_RLQ_R5RLS;
494	} else
495		return (-1);
496	return (0);
497}
498
499const char *
500g_raid_get_diskname(struct g_raid_disk *disk)
501{
502
503	if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL)
504		return ("[unknown]");
505	return (disk->d_consumer->provider->name);
506}
507
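/*
 * Query capabilities of the disk's consumer: kernel dump support
 * (GEOM::kerneldump) and BIO_DELETE support (GEOM::candelete).
 */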
508void
509g_raid_get_disk_info(struct g_raid_disk *disk)
510{
511	struct g_consumer *cp = disk->d_consumer;
512	int error, len;
513
514	/* Read kernel dumping information. */
515	disk->d_kd.offset = 0;
516	disk->d_kd.length = OFF_MAX;
517	len = sizeof(disk->d_kd);
518	error = g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd);
519	if (error)
520		disk->d_kd.di.dumper = NULL;
521	if (disk->d_kd.di.dumper == NULL)
522		G_RAID_DEBUG1(2, disk->d_softc,
523		    "Dumping not supported by %s: %d.",
524		    cp->provider->name, error);
525
526	/* Read BIO_DELETE support. */
527	error = g_getattr("GEOM::candelete", cp, &disk->d_candelete);
528	if (error)
529		disk->d_candelete = 0;
530	if (!disk->d_candelete)
531		G_RAID_DEBUG1(2, disk->d_softc,
532		    "BIO_DELETE not supported by %s: %d.",
533		    cp->provider->name, error);
534}
535
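/*
 * Report the disk state to lower layers via the GEOM::setstate
 * attribute, derived from the states of the disk's subdisks.
 */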
536void
537g_raid_report_disk_state(struct g_raid_disk *disk)
538{
539	struct g_raid_subdisk *sd;
540	int len, state;
541	uint32_t s;
542
543	if (disk->d_consumer == NULL)
544		return;
545	if (disk->d_state == G_RAID_DISK_S_DISABLED) {
546		s = G_STATE_ACTIVE; /* XXX */
547	} else if (disk->d_state == G_RAID_DISK_S_FAILED ||
548	    disk->d_state == G_RAID_DISK_S_STALE_FAILED) {
549		s = G_STATE_FAILED;
550	} else {
551		state = G_RAID_SUBDISK_S_ACTIVE;
552		TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
553			if (sd->sd_state < state)
554				state = sd->sd_state;
555		}
556		if (state == G_RAID_SUBDISK_S_FAILED)
557			s = G_STATE_FAILED;
558		else if (state == G_RAID_SUBDISK_S_NEW ||
559		    state == G_RAID_SUBDISK_S_REBUILD)
560			s = G_STATE_REBUILD;
561		else if (state == G_RAID_SUBDISK_S_STALE ||
562		    state == G_RAID_SUBDISK_S_RESYNC)
563			s = G_STATE_RESYNC;
564		else
565			s = G_STATE_ACTIVE;
566	}
567	len = sizeof(s);
568	g_io_getattr("GEOM::setstate", disk->d_consumer, &len, &s);
569	G_RAID_DEBUG1(2, disk->d_softc, "Disk %s state reported as %d.",
570	    g_raid_get_diskname(disk), s);
571}
572
573void
574g_raid_change_disk_state(struct g_raid_disk *disk, int state)
575{
576
577	G_RAID_DEBUG1(0, disk->d_softc, "Disk %s state changed from %s to %s.",
578	    g_raid_get_diskname(disk),
579	    g_raid_disk_state2str(disk->d_state),
580	    g_raid_disk_state2str(state));
581	disk->d_state = state;
582	g_raid_report_disk_state(disk);
583}
584
585void
586g_raid_change_subdisk_state(struct g_raid_subdisk *sd, int state)
587{
588
589	G_RAID_DEBUG1(0, sd->sd_softc,
590	    "Subdisk %s:%d-%s state changed from %s to %s.",
591	    sd->sd_volume->v_name, sd->sd_pos,
592	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
593	    g_raid_subdisk_state2str(sd->sd_state),
594	    g_raid_subdisk_state2str(state));
595	sd->sd_state = state;
596	if (sd->sd_disk)
597		g_raid_report_disk_state(sd->sd_disk);
598}
599
600void
601g_raid_change_volume_state(struct g_raid_volume *vol, int state)
602{
603
604	G_RAID_DEBUG1(0, vol->v_softc,
605	    "Volume %s state changed from %s to %s.",
606	    vol->v_name,
607	    g_raid_volume_state2str(vol->v_state),
608	    g_raid_volume_state2str(state));
609	vol->v_state = state;
610}
611
612/*
613 * --- Events handling functions ---
614 * Events in geom_raid are used to apply subdisk and volume state
615 * changes from a single worker thread, which simplifies locking.
616 */
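/*
 * For example, g_raid_orphan() below posts
 *	g_raid_event_send(disk, G_RAID_DISK_E_DISCONNECTED, G_RAID_EVENT_DISK);
 * and the worker thread later applies it via g_raid_handle_event().
 */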
617static void
618g_raid_event_free(struct g_raid_event *ep)
619{
620
621	free(ep, M_RAID);
622}
623
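/*
 * Queue an event for the node, volume, disk or subdisk given by 'arg'
 * (selected by 'flags') and wake up the worker thread.  With
 * G_RAID_EVENT_WAIT the caller sleeps, with sc_lock dropped, until the
 * worker has processed the event, and the event's error is returned.
 */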
624int
625g_raid_event_send(void *arg, int event, int flags)
626{
627	struct g_raid_softc *sc;
628	struct g_raid_event *ep;
629	int error;
630
631	if ((flags & G_RAID_EVENT_VOLUME) != 0) {
632		sc = ((struct g_raid_volume *)arg)->v_softc;
633	} else if ((flags & G_RAID_EVENT_DISK) != 0) {
634		sc = ((struct g_raid_disk *)arg)->d_softc;
635	} else if ((flags & G_RAID_EVENT_SUBDISK) != 0) {
636		sc = ((struct g_raid_subdisk *)arg)->sd_softc;
637	} else {
638		sc = arg;
639	}
640	ep = malloc(sizeof(*ep), M_RAID,
641	    sx_xlocked(&sc->sc_lock) ? M_WAITOK : M_NOWAIT);
642	if (ep == NULL)
643		return (ENOMEM);
644	ep->e_tgt = arg;
645	ep->e_event = event;
646	ep->e_flags = flags;
647	ep->e_error = 0;
648	G_RAID_DEBUG1(4, sc, "Sending event %p. Waking up %p.", ep, sc);
649	mtx_lock(&sc->sc_queue_mtx);
650	TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next);
651	mtx_unlock(&sc->sc_queue_mtx);
652	wakeup(sc);
653
654	if ((flags & G_RAID_EVENT_WAIT) == 0)
655		return (0);
656
657	sx_assert(&sc->sc_lock, SX_XLOCKED);
658	G_RAID_DEBUG1(4, sc, "Sleeping on %p.", ep);
659	sx_xunlock(&sc->sc_lock);
660	while ((ep->e_flags & G_RAID_EVENT_DONE) == 0) {
661		mtx_lock(&sc->sc_queue_mtx);
662		MSLEEP(error, ep, &sc->sc_queue_mtx, PRIBIO | PDROP, "m:event",
663		    hz * 5);
664	}
665	error = ep->e_error;
666	g_raid_event_free(ep);
667	sx_xlock(&sc->sc_lock);
668	return (error);
669}
670
671static void
672g_raid_event_cancel(struct g_raid_softc *sc, void *tgt)
673{
674	struct g_raid_event *ep, *tmpep;
675
676	sx_assert(&sc->sc_lock, SX_XLOCKED);
677
678	mtx_lock(&sc->sc_queue_mtx);
679	TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) {
680		if (ep->e_tgt != tgt)
681			continue;
682		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
683		if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0)
684			g_raid_event_free(ep);
685		else {
686			ep->e_error = ECANCELED;
687			wakeup(ep);
688		}
689	}
690	mtx_unlock(&sc->sc_queue_mtx);
691}
692
693static int
694g_raid_event_check(struct g_raid_softc *sc, void *tgt)
695{
696	struct g_raid_event *ep;
697	int	res = 0;
698
699	sx_assert(&sc->sc_lock, SX_XLOCKED);
700
701	mtx_lock(&sc->sc_queue_mtx);
702	TAILQ_FOREACH(ep, &sc->sc_events, e_next) {
703		if (ep->e_tgt != tgt)
704			continue;
705		res = 1;
706		break;
707	}
708	mtx_unlock(&sc->sc_queue_mtx);
709	return (res);
710}
711
712/*
713 * Return the number of disks in given state.
714 * If state is equal to -1, count all connected disks.
715 */
716u_int
717g_raid_ndisks(struct g_raid_softc *sc, int state)
718{
719	struct g_raid_disk *disk;
720	u_int n;
721
722	sx_assert(&sc->sc_lock, SX_LOCKED);
723
724	n = 0;
725	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
726		if (disk->d_state == state || state == -1)
727			n++;
728	}
729	return (n);
730}
731
732/*
733 * Return the number of subdisks in given state.
734 * If state is equal to -1, count all connected subdisks.
735 */
736u_int
737g_raid_nsubdisks(struct g_raid_volume *vol, int state)
738{
739	struct g_raid_subdisk *subdisk;
740	struct g_raid_softc *sc;
741	u_int i, n;
742
743	sc = vol->v_softc;
744	sx_assert(&sc->sc_lock, SX_LOCKED);
745
746	n = 0;
747	for (i = 0; i < vol->v_disks_count; i++) {
748		subdisk = &vol->v_subdisks[i];
749		if ((state == -1 &&
750		     subdisk->sd_state != G_RAID_SUBDISK_S_NONE) ||
751		    subdisk->sd_state == state)
752			n++;
753	}
754	return (n);
755}
756
757/*
758 * Return the first subdisk in given state.
759 * If state is equal to -1, return the first connected subdisk.
760 */
761struct g_raid_subdisk *
762g_raid_get_subdisk(struct g_raid_volume *vol, int state)
763{
764	struct g_raid_subdisk *sd;
765	struct g_raid_softc *sc;
766	u_int i;
767
768	sc = vol->v_softc;
769	sx_assert(&sc->sc_lock, SX_LOCKED);
770
771	for (i = 0; i < vol->v_disks_count; i++) {
772		sd = &vol->v_subdisks[i];
773		if ((state == -1 &&
774		     sd->sd_state != G_RAID_SUBDISK_S_NONE) ||
775		    sd->sd_state == state)
776			return (sd);
777	}
778	return (NULL);
779}
780
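/*
 * Attach a new consumer to the named provider and open it for reading,
 * writing and with exclusive access (r1w1e1).  Returns NULL if the
 * provider does not exist or cannot be opened.
 */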
781struct g_consumer *
782g_raid_open_consumer(struct g_raid_softc *sc, const char *name)
783{
784	struct g_consumer *cp;
785	struct g_provider *pp;
786
787	g_topology_assert();
788
789	if (strncmp(name, "/dev/", 5) == 0)
790		name += 5;
791	pp = g_provider_by_name(name);
792	if (pp == NULL)
793		return (NULL);
794	cp = g_new_consumer(sc->sc_geom);
795	cp->flags |= G_CF_DIRECT_RECEIVE;
796	if (g_attach(cp, pp) != 0) {
797		g_destroy_consumer(cp);
798		return (NULL);
799	}
800	if (g_access(cp, 1, 1, 1) != 0) {
801		g_detach(cp);
802		g_destroy_consumer(cp);
803		return (NULL);
804	}
805	return (cp);
806}
807
808static u_int
809g_raid_nrequests(struct g_raid_softc *sc, struct g_consumer *cp)
810{
811	struct bio *bp;
812	u_int nreqs = 0;
813
814	mtx_lock(&sc->sc_queue_mtx);
815	TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
816		if (bp->bio_from == cp)
817			nreqs++;
818	}
819	mtx_unlock(&sc->sc_queue_mtx);
820	return (nreqs);
821}
822
823u_int
824g_raid_nopens(struct g_raid_softc *sc)
825{
826	struct g_raid_volume *vol;
827	u_int opens;
828
829	opens = 0;
830	TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
831		if (vol->v_provider_open != 0)
832			opens++;
833	}
834	return (opens);
835}
836
837static int
838g_raid_consumer_is_busy(struct g_raid_softc *sc, struct g_consumer *cp)
839{
840
841	if (cp->index > 0) {
842		G_RAID_DEBUG1(2, sc,
843		    "I/O requests for %s exist, can't destroy it now.",
844		    cp->provider->name);
845		return (1);
846	}
847	if (g_raid_nrequests(sc, cp) > 0) {
848		G_RAID_DEBUG1(2, sc,
849		    "I/O requests for %s in queue, can't destroy it now.",
850		    cp->provider->name);
851		return (1);
852	}
853	return (0);
854}
855
856static void
857g_raid_destroy_consumer(void *arg, int flags __unused)
858{
859	struct g_consumer *cp;
860
861	g_topology_assert();
862
863	cp = arg;
864	G_RAID_DEBUG(1, "Consumer %s destroyed.", cp->provider->name);
865	g_detach(cp);
866	g_destroy_consumer(cp);
867}
868
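/*
 * Close and destroy a consumer, unless it still has I/O in flight or
 * queued.  See the comment below for why destruction is deferred to an
 * event when the consumer was open for writing.
 */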
869void
870g_raid_kill_consumer(struct g_raid_softc *sc, struct g_consumer *cp)
871{
872	struct g_provider *pp;
873	int retaste_wait;
874
875	g_topology_assert_not();
876
877	g_topology_lock();
878	cp->private = NULL;
879	if (g_raid_consumer_is_busy(sc, cp))
880		goto out;
881	pp = cp->provider;
882	retaste_wait = 0;
883	if (cp->acw == 1) {
884		if ((pp->geom->flags & G_GEOM_WITHER) == 0)
885			retaste_wait = 1;
886	}
887	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
888		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
889	if (retaste_wait) {
890		/*
891		 * After the retaste event was sent (inside g_access()), we can
892		 * post an event to detach and destroy the consumer.
893		 * A class that still has a consumer attached to the given
894		 * provider will not receive a retaste event for that provider.
895		 * This is how retaste events are ignored when consumers opened
896		 * for write are closed: the consumer is detached and destroyed
897		 * only after the retaste event has been sent.
898		 */
899		g_post_event(g_raid_destroy_consumer, cp, M_WAITOK, NULL);
900		goto out;
901	}
902	G_RAID_DEBUG(1, "Consumer %s destroyed.", pp->name);
903	g_detach(cp);
904	g_destroy_consumer(cp);
905out:
906	g_topology_unlock();
907}
908
909static void
910g_raid_orphan(struct g_consumer *cp)
911{
912	struct g_raid_disk *disk;
913
914	g_topology_assert();
915
916	disk = cp->private;
917	if (disk == NULL)
918		return;
919	g_raid_event_send(disk, G_RAID_DISK_E_DISCONNECTED,
920	    G_RAID_EVENT_DISK);
921}
922
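/*
 * Mark the volume as clean and update metadata once it has seen no
 * writes for kern.geom.raid.clean_time seconds, or immediately while
 * shutting down.
 */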
923static void
924g_raid_clean(struct g_raid_volume *vol, int acw)
925{
926	struct g_raid_softc *sc;
927	int timeout;
928
929	sc = vol->v_softc;
930	g_topology_assert_not();
931	sx_assert(&sc->sc_lock, SX_XLOCKED);
932
933//	if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0)
934//		return;
935	if (!vol->v_dirty)
936		return;
937	if (vol->v_writes > 0)
938		return;
939	if (acw > 0 || (acw == -1 &&
940	    vol->v_provider != NULL && vol->v_provider->acw > 0)) {
941		timeout = g_raid_clean_time - (time_uptime - vol->v_last_write);
942		if (!g_raid_shutdown && timeout > 0)
943			return;
944	}
945	vol->v_dirty = 0;
946	G_RAID_DEBUG1(1, sc, "Volume %s marked as clean.",
947	    vol->v_name);
948	g_raid_write_metadata(sc, vol, NULL, NULL);
949}
950
951static void
952g_raid_dirty(struct g_raid_volume *vol)
953{
954	struct g_raid_softc *sc;
955
956	sc = vol->v_softc;
957	g_topology_assert_not();
958	sx_assert(&sc->sc_lock, SX_XLOCKED);
959
960//	if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0)
961//		return;
962	vol->v_dirty = 1;
963	G_RAID_DEBUG1(1, sc, "Volume %s marked as dirty.",
964	    vol->v_name);
965	g_raid_write_metadata(sc, vol, NULL, NULL);
966}
967
968void
969g_raid_tr_flush_common(struct g_raid_tr_object *tr, struct bio *bp)
970{
971	struct g_raid_softc *sc;
972	struct g_raid_volume *vol;
973	struct g_raid_subdisk *sd;
974	struct bio_queue_head queue;
975	struct bio *cbp;
976	int i;
977
978	vol = tr->tro_volume;
979	sc = vol->v_softc;
980
981	/*
982	 * Allocate all bios before sending any request, so we can return
983	 * ENOMEM in a nice and clean way.
984	 */
985	bioq_init(&queue);
986	for (i = 0; i < vol->v_disks_count; i++) {
987		sd = &vol->v_subdisks[i];
988		if (sd->sd_state == G_RAID_SUBDISK_S_NONE ||
989		    sd->sd_state == G_RAID_SUBDISK_S_FAILED)
990			continue;
991		cbp = g_clone_bio(bp);
992		if (cbp == NULL)
993			goto failure;
994		cbp->bio_caller1 = sd;
995		bioq_insert_tail(&queue, cbp);
996	}
997	while ((cbp = bioq_takefirst(&queue)) != NULL) {
998		sd = cbp->bio_caller1;
999		cbp->bio_caller1 = NULL;
1000		g_raid_subdisk_iostart(sd, cbp);
1001	}
1002	return;
1003failure:
1004	while ((cbp = bioq_takefirst(&queue)) != NULL)
1005		g_destroy_bio(cbp);
1006	if (bp->bio_error == 0)
1007		bp->bio_error = ENOMEM;
1008	g_raid_iodone(bp, bp->bio_error);
1009}
1010
1011static void
1012g_raid_tr_kerneldump_common_done(struct bio *bp)
1013{
1014
1015	bp->bio_flags |= BIO_DONE;
1016}
1017
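/*
 * Common transformation-layer kernel dump routine: issue a synchronous
 * BIO_WRITE through g_raid_start() and poll for completion, since
 * sleeping is not possible in the dumping context.
 */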
1018int
1019g_raid_tr_kerneldump_common(struct g_raid_tr_object *tr,
1020    void *virtual, vm_offset_t physical, off_t offset, size_t length)
1021{
1022	struct g_raid_softc *sc;
1023	struct g_raid_volume *vol;
1024	struct bio bp;
1025
1026	vol = tr->tro_volume;
1027	sc = vol->v_softc;
1028
1029	bzero(&bp, sizeof(bp));
1030	bp.bio_cmd = BIO_WRITE;
1031	bp.bio_done = g_raid_tr_kerneldump_common_done;
1032	bp.bio_attribute = NULL;
1033	bp.bio_offset = offset;
1034	bp.bio_length = length;
1035	bp.bio_data = virtual;
1036	bp.bio_to = vol->v_provider;
1037
1038	g_raid_start(&bp);
1039	while (!(bp.bio_flags & BIO_DONE)) {
1040		G_RAID_DEBUG1(4, sc, "Poll...");
1041		g_raid_poll(sc);
1042		DELAY(10);
1043	}
1044
1045	return (bp.bio_error != 0 ? EIO : 0);
1046}
1047
1048static int
1049g_raid_dump(void *arg,
1050    void *virtual, vm_offset_t physical, off_t offset, size_t length)
1051{
1052	struct g_raid_volume *vol;
1053	int error;
1054
1055	vol = (struct g_raid_volume *)arg;
1056	G_RAID_DEBUG1(3, vol->v_softc, "Dumping at off %llu len %llu.",
1057	    (long long unsigned)offset, (long long unsigned)length);
1058
1059	error = G_RAID_TR_KERNELDUMP(vol->v_tr,
1060	    virtual, physical, offset, length);
1061	return (error);
1062}
1063
1064static void
1065g_raid_kerneldump(struct g_raid_softc *sc, struct bio *bp)
1066{
1067	struct g_kerneldump *gkd;
1068	struct g_provider *pp;
1069	struct g_raid_volume *vol;
1070
1071	gkd = (struct g_kerneldump*)bp->bio_data;
1072	pp = bp->bio_to;
1073	vol = pp->private;
1074	g_trace(G_T_TOPOLOGY, "g_raid_kerneldump(%s, %jd, %jd)",
1075		pp->name, (intmax_t)gkd->offset, (intmax_t)gkd->length);
1076	gkd->di.dumper = g_raid_dump;
1077	gkd->di.priv = vol;
1078	gkd->di.blocksize = vol->v_sectorsize;
1079	gkd->di.maxiosize = DFLTPHYS;
1080	gkd->di.mediaoffset = gkd->offset;
1081	if ((gkd->offset + gkd->length) > vol->v_mediasize)
1082		gkd->length = vol->v_mediasize - gkd->offset;
1083	gkd->di.mediasize = gkd->length;
1084	g_io_deliver(bp, 0);
1085}
1086
1087static void
1088g_raid_candelete(struct g_raid_softc *sc, struct bio *bp)
1089{
1090	struct g_provider *pp;
1091	struct g_raid_volume *vol;
1092	struct g_raid_subdisk *sd;
1093	int *val;
1094	int i;
1095
1096	val = (int *)bp->bio_data;
1097	pp = bp->bio_to;
1098	vol = pp->private;
1099	*val = 0;
1100	for (i = 0; i < vol->v_disks_count; i++) {
1101		sd = &vol->v_subdisks[i];
1102		if (sd->sd_state == G_RAID_SUBDISK_S_NONE)
1103			continue;
1104		if (sd->sd_disk->d_candelete) {
1105			*val = 1;
1106			break;
1107		}
1108	}
1109	g_io_deliver(bp, 0);
1110}
1111
1112static void
1113g_raid_start(struct bio *bp)
1114{
1115	struct g_raid_softc *sc;
1116
1117	sc = bp->bio_to->geom->softc;
1118	/*
1119	 * If sc == NULL or there are no valid disks, provider's error
1120	 * should be set and g_raid_start() should not be called at all.
1121	 */
1122//	KASSERT(sc != NULL && sc->sc_state == G_RAID_VOLUME_S_RUNNING,
1123//	    ("Provider's error should be set (error=%d)(mirror=%s).",
1124//	    bp->bio_to->error, bp->bio_to->name));
1125	G_RAID_LOGREQ(3, bp, "Request received.");
1126
1127	switch (bp->bio_cmd) {
1128	case BIO_READ:
1129	case BIO_WRITE:
1130	case BIO_DELETE:
1131	case BIO_FLUSH:
1132		break;
1133	case BIO_GETATTR:
1134		if (!strcmp(bp->bio_attribute, "GEOM::candelete"))
1135			g_raid_candelete(sc, bp);
1136		else if (!strcmp(bp->bio_attribute, "GEOM::kerneldump"))
1137			g_raid_kerneldump(sc, bp);
1138		else
1139			g_io_deliver(bp, EOPNOTSUPP);
1140		return;
1141	default:
1142		g_io_deliver(bp, EOPNOTSUPP);
1143		return;
1144	}
1145	mtx_lock(&sc->sc_queue_mtx);
1146	bioq_disksort(&sc->sc_queue, bp);
1147	mtx_unlock(&sc->sc_queue_mtx);
1148	if (!dumping) {
1149		G_RAID_DEBUG1(4, sc, "Waking up %p.", sc);
1150		wakeup(sc);
1151	}
1152}
1153
1154static int
1155g_raid_bio_overlaps(const struct bio *bp, off_t lstart, off_t len)
1156{
1157	/*
1158	 * 5 cases:
1159	 * (1) bp entirely below NO
1160	 * (2) bp entirely above NO
1161	 * (3) bp start below, but end in range YES
1162	 * (4) bp entirely within YES
1163	 * (5) bp starts within, ends above YES
1164	 *
1165	 * lock range 10-19 (offset 10 length 10)
1166	 * (1) 1-5: first if kicks it out
1167	 * (2) 30-35: second if kicks it out
1168	 * (3) 5-15: passes both ifs
1169	 * (4) 12-14: passes both ifs
1170	 * (5) 19-20: passes both
1171	 */
1172	off_t lend = lstart + len - 1;
1173	off_t bstart = bp->bio_offset;
1174	off_t bend = bp->bio_offset + bp->bio_length - 1;
1175
1176	if (bend < lstart)
1177		return (0);
1178	if (lend < bstart)
1179		return (0);
1180	return (1);
1181}
1182
1183static int
1184g_raid_is_in_locked_range(struct g_raid_volume *vol, const struct bio *bp)
1185{
1186	struct g_raid_lock *lp;
1187
1188	sx_assert(&vol->v_softc->sc_lock, SX_LOCKED);
1189
1190	LIST_FOREACH(lp, &vol->v_locks, l_next) {
1191		if (g_raid_bio_overlaps(bp, lp->l_offset, lp->l_length))
1192			return (1);
1193	}
1194	return (0);
1195}
1196
1197static void
1198g_raid_start_request(struct bio *bp)
1199{
1200	struct g_raid_softc *sc;
1201	struct g_raid_volume *vol;
1202
1203	sc = bp->bio_to->geom->softc;
1204	sx_assert(&sc->sc_lock, SX_LOCKED);
1205	vol = bp->bio_to->private;
1206
1207	/*
1208	 * Check to see if this item is in a locked range.  If so,
1209	 * queue it to our locked queue and return.  We'll requeue
1210	 * it when the range is unlocked.  Internal I/O for the
1211	 * rebuild/rescan/recovery process is excluded from this
1212	 * check so we can actually do the recovery.
1213	 */
1214	if (!(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL) &&
1215	    g_raid_is_in_locked_range(vol, bp)) {
1216		G_RAID_LOGREQ(3, bp, "Defer request.");
1217		bioq_insert_tail(&vol->v_locked, bp);
1218		return;
1219	}
1220
1221	/*
1222	 * If we're actually going to do the write/delete, then
1223	 * update the idle stats for the volume.
1224	 */
1225	if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) {
1226		if (!vol->v_dirty)
1227			g_raid_dirty(vol);
1228		vol->v_writes++;
1229	}
1230
1231	/*
1232	 * Put request onto inflight queue, so we can check if new
1233	 * synchronization requests don't collide with it.  Then tell
1234	 * the transformation layer to start the I/O.
1235	 */
1236	bioq_insert_tail(&vol->v_inflight, bp);
1237	G_RAID_LOGREQ(4, bp, "Request started");
1238	G_RAID_TR_IOSTART(vol->v_tr, bp);
1239}
1240
1241static void
1242g_raid_finish_with_locked_ranges(struct g_raid_volume *vol, struct bio *bp)
1243{
1244	off_t off, len;
1245	struct bio *nbp;
1246	struct g_raid_lock *lp;
1247
1248	vol->v_pending_lock = 0;
1249	LIST_FOREACH(lp, &vol->v_locks, l_next) {
1250		if (lp->l_pending) {
1251			off = lp->l_offset;
1252			len = lp->l_length;
1253			lp->l_pending = 0;
1254			TAILQ_FOREACH(nbp, &vol->v_inflight.queue, bio_queue) {
1255				if (g_raid_bio_overlaps(nbp, off, len))
1256					lp->l_pending++;
1257			}
1258			if (lp->l_pending) {
1259				vol->v_pending_lock = 1;
1260				G_RAID_DEBUG1(4, vol->v_softc,
1261				    "Deferred lock(%jd, %jd) has %d pending",
1262				    (intmax_t)off, (intmax_t)(off + len),
1263				    lp->l_pending);
1264				continue;
1265			}
1266			G_RAID_DEBUG1(4, vol->v_softc,
1267			    "Deferred lock of %jd to %jd completed",
1268			    (intmax_t)off, (intmax_t)(off + len));
1269			G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg);
1270		}
1271	}
1272}
1273
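/*
 * Complete a request previously passed to g_raid_start_request():
 * update write statistics, remove it from the in-flight queue, let any
 * deferred range locks proceed and deliver the result to GEOM.
 */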
1274void
1275g_raid_iodone(struct bio *bp, int error)
1276{
1277	struct g_raid_softc *sc;
1278	struct g_raid_volume *vol;
1279
1280	sc = bp->bio_to->geom->softc;
1281	sx_assert(&sc->sc_lock, SX_LOCKED);
1282	vol = bp->bio_to->private;
1283	G_RAID_LOGREQ(3, bp, "Request done: %d.", error);
1284
1285	/* Update stats when a write/delete completes. */
1286	if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) {
1287		vol->v_writes--;
1288		vol->v_last_write = time_uptime;
1289	}
1290
1291	bioq_remove(&vol->v_inflight, bp);
1292	if (vol->v_pending_lock && g_raid_is_in_locked_range(vol, bp))
1293		g_raid_finish_with_locked_ranges(vol, bp);
1294	getmicrouptime(&vol->v_last_done);
1295	g_io_deliver(bp, error);
1296}
1297
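/*
 * Lock the byte range [off, off + len) of the volume for internal
 * (e.g. rebuild) I/O.  If requests overlapping the range are still in
 * flight, the lock is left pending and EBUSY is returned; the
 * G_RAID_TR_LOCKED() callback fires once the range becomes free.
 */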
1298int
1299g_raid_lock_range(struct g_raid_volume *vol, off_t off, off_t len,
1300    struct bio *ignore, void *argp)
1301{
1302	struct g_raid_softc *sc;
1303	struct g_raid_lock *lp;
1304	struct bio *bp;
1305
1306	sc = vol->v_softc;
1307	lp = malloc(sizeof(*lp), M_RAID, M_WAITOK | M_ZERO);
1308	LIST_INSERT_HEAD(&vol->v_locks, lp, l_next);
1309	lp->l_offset = off;
1310	lp->l_length = len;
1311	lp->l_callback_arg = argp;
1312
1313	lp->l_pending = 0;
1314	TAILQ_FOREACH(bp, &vol->v_inflight.queue, bio_queue) {
1315		if (bp != ignore && g_raid_bio_overlaps(bp, off, len))
1316			lp->l_pending++;
1317	}
1318
1319	/*
1320	 * If there are any writes that are pending, we return EBUSY.  All
1321	 * callers will have to wait until all pending writes clear.
1322	 */
1323	if (lp->l_pending > 0) {
1324		vol->v_pending_lock = 1;
1325		G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd deferred %d pend",
1326		    (intmax_t)off, (intmax_t)(off+len), lp->l_pending);
1327		return (EBUSY);
1328	}
1329	G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd",
1330	    (intmax_t)off, (intmax_t)(off+len));
1331	G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg);
1332	return (0);
1333}
1334
1335int
1336g_raid_unlock_range(struct g_raid_volume *vol, off_t off, off_t len)
1337{
1338	struct g_raid_lock *lp;
1339	struct g_raid_softc *sc;
1340	struct bio *bp;
1341
1342	sc = vol->v_softc;
1343	LIST_FOREACH(lp, &vol->v_locks, l_next) {
1344		if (lp->l_offset == off && lp->l_length == len) {
1345			LIST_REMOVE(lp, l_next);
1346			/* XXX
1347			 * Right now we just put them all back on the queue
1348			 * and hope for the best.  That is safe because any
1349			 * requests still covered by locked ranges will go
1350			 * right back on the locked list when the worker runs.
1351			 * XXX
1352			 */
1353			G_RAID_DEBUG1(4, sc, "Unlocked %jd to %jd",
1354			    (intmax_t)lp->l_offset,
1355			    (intmax_t)(lp->l_offset+lp->l_length));
1356			mtx_lock(&sc->sc_queue_mtx);
1357			while ((bp = bioq_takefirst(&vol->v_locked)) != NULL)
1358				bioq_disksort(&sc->sc_queue, bp);
1359			mtx_unlock(&sc->sc_queue_mtx);
1360			free(lp, M_RAID);
1361			return (0);
1362		}
1363	}
1364	return (EINVAL);
1365}
1366
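/*
 * Pass a request down to the disk backing the given subdisk: shift the
 * offset by sd_offset, account per-consumer load and either issue the
 * I/O normally or, while dumping, write synchronously via
 * g_raid_subdisk_kerneldump().
 */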
1367void
1368g_raid_subdisk_iostart(struct g_raid_subdisk *sd, struct bio *bp)
1369{
1370	struct g_consumer *cp;
1371	struct g_raid_disk *disk, *tdisk;
1372
1373	bp->bio_caller1 = sd;
1374
1375	/*
1376	 * Make sure that the disk is present. Generally it is a task of
1377	 * transformation layers to not send requests to absent disks, but
1378	 * it is better to be safe and report the situation than be sorry.
1379	 */
1380	if (sd->sd_disk == NULL) {
1381		G_RAID_LOGREQ(0, bp, "Warning! I/O request to an absent disk!");
1382nodisk:
1383		bp->bio_from = NULL;
1384		bp->bio_to = NULL;
1385		bp->bio_error = ENXIO;
1386		g_raid_disk_done(bp);
1387		return;
1388	}
1389	disk = sd->sd_disk;
1390	if (disk->d_state != G_RAID_DISK_S_ACTIVE &&
1391	    disk->d_state != G_RAID_DISK_S_FAILED) {
1392		G_RAID_LOGREQ(0, bp, "Warning! I/O request to a disk in a "
1393		    "wrong state (%s)!", g_raid_disk_state2str(disk->d_state));
1394		goto nodisk;
1395	}
1396
1397	cp = disk->d_consumer;
1398	bp->bio_from = cp;
1399	bp->bio_to = cp->provider;
1400	cp->index++;
1401
1402	/* Update average disks load. */
1403	TAILQ_FOREACH(tdisk, &sd->sd_softc->sc_disks, d_next) {
1404		if (tdisk->d_consumer == NULL)
1405			tdisk->d_load = 0;
1406		else
1407			tdisk->d_load = (tdisk->d_consumer->index *
1408			    G_RAID_SUBDISK_LOAD_SCALE + tdisk->d_load * 7) / 8;
1409	}
1410
1411	disk->d_last_offset = bp->bio_offset + bp->bio_length;
1412	if (dumping) {
1413		G_RAID_LOGREQ(3, bp, "Sending dumping request.");
1414		if (bp->bio_cmd == BIO_WRITE) {
1415			bp->bio_error = g_raid_subdisk_kerneldump(sd,
1416			    bp->bio_data, 0, bp->bio_offset, bp->bio_length);
1417		} else
1418			bp->bio_error = EOPNOTSUPP;
1419		g_raid_disk_done(bp);
1420	} else {
1421		bp->bio_done = g_raid_disk_done;
1422		bp->bio_offset += sd->sd_offset;
1423		G_RAID_LOGREQ(3, bp, "Sending request.");
1424		g_io_request(bp, cp);
1425	}
1426}
1427
1428int
1429g_raid_subdisk_kerneldump(struct g_raid_subdisk *sd,
1430    void *virtual, vm_offset_t physical, off_t offset, size_t length)
1431{
1432
1433	if (sd->sd_disk == NULL)
1434		return (ENXIO);
1435	if (sd->sd_disk->d_kd.di.dumper == NULL)
1436		return (EOPNOTSUPP);
1437	return (dump_write(&sd->sd_disk->d_kd.di,
1438	    virtual, physical,
1439	    sd->sd_disk->d_kd.di.mediaoffset + sd->sd_offset + offset,
1440	    length));
1441}
1442
1443static void
1444g_raid_disk_done(struct bio *bp)
1445{
1446	struct g_raid_softc *sc;
1447	struct g_raid_subdisk *sd;
1448
1449	sd = bp->bio_caller1;
1450	sc = sd->sd_softc;
1451	mtx_lock(&sc->sc_queue_mtx);
1452	bioq_disksort(&sc->sc_queue, bp);
1453	mtx_unlock(&sc->sc_queue_mtx);
1454	if (!dumping)
1455		wakeup(sc);
1456}
1457
1458static void
1459g_raid_disk_done_request(struct bio *bp)
1460{
1461	struct g_raid_softc *sc;
1462	struct g_raid_disk *disk;
1463	struct g_raid_subdisk *sd;
1464	struct g_raid_volume *vol;
1465
1466	g_topology_assert_not();
1467
1468	G_RAID_LOGREQ(3, bp, "Disk request done: %d.", bp->bio_error);
1469	sd = bp->bio_caller1;
1470	sc = sd->sd_softc;
1471	vol = sd->sd_volume;
1472	if (bp->bio_from != NULL) {
1473		bp->bio_from->index--;
1474		disk = bp->bio_from->private;
1475		if (disk == NULL)
1476			g_raid_kill_consumer(sc, bp->bio_from);
1477	}
1478	bp->bio_offset -= sd->sd_offset;
1479
1480	G_RAID_TR_IODONE(vol->v_tr, sd, bp);
1481}
1482
1483static void
1484g_raid_handle_event(struct g_raid_softc *sc, struct g_raid_event *ep)
1485{
1486
1487	if ((ep->e_flags & G_RAID_EVENT_VOLUME) != 0)
1488		ep->e_error = g_raid_update_volume(ep->e_tgt, ep->e_event);
1489	else if ((ep->e_flags & G_RAID_EVENT_DISK) != 0)
1490		ep->e_error = g_raid_update_disk(ep->e_tgt, ep->e_event);
1491	else if ((ep->e_flags & G_RAID_EVENT_SUBDISK) != 0)
1492		ep->e_error = g_raid_update_subdisk(ep->e_tgt, ep->e_event);
1493	else
1494		ep->e_error = g_raid_update_node(ep->e_tgt, ep->e_event);
1495	if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0) {
1496		KASSERT(ep->e_error == 0,
1497		    ("Error cannot be handled."));
1498		g_raid_event_free(ep);
1499	} else {
1500		ep->e_flags |= G_RAID_EVENT_DONE;
1501		G_RAID_DEBUG1(4, sc, "Waking up %p.", ep);
1502		mtx_lock(&sc->sc_queue_mtx);
1503		wakeup(ep);
1504		mtx_unlock(&sc->sc_queue_mtx);
1505	}
1506}
1507
1508/*
1509 * Worker thread.
1510 */
1511static void
1512g_raid_worker(void *arg)
1513{
1514	struct g_raid_softc *sc;
1515	struct g_raid_event *ep;
1516	struct g_raid_volume *vol;
1517	struct bio *bp;
1518	struct timeval now, t;
1519	int timeout, rv;
1520
1521	sc = arg;
1522	thread_lock(curthread);
1523	sched_prio(curthread, PRIBIO);
1524	thread_unlock(curthread);
1525
1526	sx_xlock(&sc->sc_lock);
1527	for (;;) {
1528		mtx_lock(&sc->sc_queue_mtx);
1529		/*
1530		 * First take a look at events.
1531		 * This is important to handle events before any I/O requests.
1532		 */
1533		bp = NULL;
1534		vol = NULL;
1535		rv = 0;
1536		ep = TAILQ_FIRST(&sc->sc_events);
1537		if (ep != NULL)
1538			TAILQ_REMOVE(&sc->sc_events, ep, e_next);
1539		else if ((bp = bioq_takefirst(&sc->sc_queue)) != NULL)
1540			;
1541		else {
1542			getmicrouptime(&now);
1543			t = now;
1544			TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
1545				if (bioq_first(&vol->v_inflight) == NULL &&
1546				    vol->v_tr &&
1547				    timevalcmp(&vol->v_last_done, &t, < ))
1548					t = vol->v_last_done;
1549			}
1550			timevalsub(&t, &now);
1551			timeout = g_raid_idle_threshold +
1552			    t.tv_sec * 1000000 + t.tv_usec;
1553			if (timeout > 0) {
1554				/*
1555				 * Two steps to avoid overflows at HZ=1000
1556				 * and idle timeouts > 2.1s.  Some rounding
1557				 * errors can occur, but they are < 1 tick,
1558				 * which is deemed to be close enough for
1559				 * this purpose.
1560				 */
1561				int micpertic = 1000000 / hz;
1562				timeout = (timeout + micpertic - 1) / micpertic;
1563				sx_xunlock(&sc->sc_lock);
1564				MSLEEP(rv, sc, &sc->sc_queue_mtx,
1565				    PRIBIO | PDROP, "-", timeout);
1566				sx_xlock(&sc->sc_lock);
1567				goto process;
1568			} else
1569				rv = EWOULDBLOCK;
1570		}
1571		mtx_unlock(&sc->sc_queue_mtx);
1572process:
1573		if (ep != NULL) {
1574			g_raid_handle_event(sc, ep);
1575		} else if (bp != NULL) {
1576			if (bp->bio_to != NULL &&
1577			    bp->bio_to->geom == sc->sc_geom)
1578				g_raid_start_request(bp);
1579			else
1580				g_raid_disk_done_request(bp);
1581		} else if (rv == EWOULDBLOCK) {
1582			TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
1583				g_raid_clean(vol, -1);
1584				if (bioq_first(&vol->v_inflight) == NULL &&
1585				    vol->v_tr) {
1586					t.tv_sec = g_raid_idle_threshold / 1000000;
1587					t.tv_usec = g_raid_idle_threshold % 1000000;
1588					timevaladd(&t, &vol->v_last_done);
1589					getmicrouptime(&now);
1590					if (timevalcmp(&t, &now, <= )) {
1591						G_RAID_TR_IDLE(vol->v_tr);
1592						vol->v_last_done = now;
1593					}
1594				}
1595			}
1596		}
1597		if (sc->sc_stopping == G_RAID_DESTROY_HARD)
1598			g_raid_destroy_node(sc, 1);	/* May not return. */
1599	}
1600}
1601
1602static void
1603g_raid_poll(struct g_raid_softc *sc)
1604{
1605	struct g_raid_event *ep;
1606	struct bio *bp;
1607
1608	sx_xlock(&sc->sc_lock);
1609	mtx_lock(&sc->sc_queue_mtx);
1610	/*
1611	 * First take a look at events.
1612	 * This is important to handle events before any I/O requests.
1613	 */
1614	ep = TAILQ_FIRST(&sc->sc_events);
1615	if (ep != NULL) {
1616		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
1617		mtx_unlock(&sc->sc_queue_mtx);
1618		g_raid_handle_event(sc, ep);
1619		goto out;
1620	}
1621	bp = bioq_takefirst(&sc->sc_queue);
1622	if (bp != NULL) {
1623		mtx_unlock(&sc->sc_queue_mtx);
1624		if (bp->bio_from == NULL ||
1625		    bp->bio_from->geom != sc->sc_geom)
1626			g_raid_start_request(bp);
1627		else
1628			g_raid_disk_done_request(bp);
1629	}
1630out:
1631	sx_xunlock(&sc->sc_lock);
1632}
1633
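/*
 * Create and announce the GEOM provider for a volume: pick a name
 * (raid/<volume name> or raid/r<id>), optionally publish the legacy
 * ar%d alias, and derive stripesize/stripeoffset hints from the
 * component providers.
 */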
1634static void
1635g_raid_launch_provider(struct g_raid_volume *vol)
1636{
1637	struct g_raid_disk *disk;
1638	struct g_raid_subdisk *sd;
1639	struct g_raid_softc *sc;
1640	struct g_provider *pp;
1641	char name[G_RAID_MAX_VOLUMENAME];
1642	char   announce_buf[80], buf1[32];
1643	off_t off;
1644	int i;
1645
1646	sc = vol->v_softc;
1647	sx_assert(&sc->sc_lock, SX_LOCKED);
1648
1649	g_topology_lock();
1650	/* Try to name provider with volume name. */
1651	snprintf(name, sizeof(name), "raid/%s", vol->v_name);
1652	if (g_raid_name_format == 0 || vol->v_name[0] == 0 ||
1653	    g_provider_by_name(name) != NULL) {
1654		/* Otherwise use sequential volume number. */
1655		snprintf(name, sizeof(name), "raid/r%d", vol->v_global_id);
1656	}
1657
1658	/*
1659	 * Create a /dev/ar%d that the old ataraid(4) stack once
1660	 * created as an alias for /dev/raid/r%d if requested.
1661	 * This eases migration from stable/7 ataraid devices to newer
1662	 * FreeBSD releases. sbruno 07 MAY 2013
1663	 */
1664
1665	if (ar_legacy_aliases) {
1666		snprintf(announce_buf, sizeof(announce_buf),
1667		    "kern.devalias.%s", name);
1668		snprintf(buf1, sizeof(buf1),
1669		    "ar%d", vol->v_global_id);
1670		setenv(announce_buf, buf1);
1671	}
1672
1673	pp = g_new_providerf(sc->sc_geom, "%s", name);
1674	pp->flags |= G_PF_DIRECT_RECEIVE;
1675	if (vol->v_tr->tro_class->trc_accept_unmapped) {
1676		pp->flags |= G_PF_ACCEPT_UNMAPPED;
1677		for (i = 0; i < vol->v_disks_count; i++) {
1678			sd = &vol->v_subdisks[i];
1679			if (sd->sd_state == G_RAID_SUBDISK_S_NONE)
1680				continue;
1681			if ((sd->sd_disk->d_consumer->provider->flags &
1682			    G_PF_ACCEPT_UNMAPPED) == 0)
1683				pp->flags &= ~G_PF_ACCEPT_UNMAPPED;
1684		}
1685	}
1686	pp->private = vol;
1687	pp->mediasize = vol->v_mediasize;
1688	pp->sectorsize = vol->v_sectorsize;
1689	pp->stripesize = 0;
1690	pp->stripeoffset = 0;
1691	if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
1692	    vol->v_raid_level == G_RAID_VOLUME_RL_RAID3 ||
1693	    vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE ||
1694	    vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT) {
1695		if ((disk = vol->v_subdisks[0].sd_disk) != NULL &&
1696		    disk->d_consumer != NULL &&
1697		    disk->d_consumer->provider != NULL) {
1698			pp->stripesize = disk->d_consumer->provider->stripesize;
1699			off = disk->d_consumer->provider->stripeoffset;
1700			pp->stripeoffset = off + vol->v_subdisks[0].sd_offset;
1701			if (off > 0)
1702				pp->stripeoffset %= off;
1703		}
1704		if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3) {
1705			pp->stripesize *= (vol->v_disks_count - 1);
1706			pp->stripeoffset *= (vol->v_disks_count - 1);
1707		}
1708	} else
1709		pp->stripesize = vol->v_strip_size;
1710	vol->v_provider = pp;
1711	g_error_provider(pp, 0);
1712	g_topology_unlock();
1713	G_RAID_DEBUG1(0, sc, "Provider %s for volume %s created.",
1714	    pp->name, vol->v_name);
1715}
1716
1717static void
1718g_raid_destroy_provider(struct g_raid_volume *vol)
1719{
1720	struct g_raid_softc *sc;
1721	struct g_provider *pp;
1722	struct bio *bp, *tmp;
1723
1724	g_topology_assert_not();
1725	sc = vol->v_softc;
1726	pp = vol->v_provider;
1727	KASSERT(pp != NULL, ("NULL provider (volume=%s).", vol->v_name));
1728
1729	g_topology_lock();
1730	g_error_provider(pp, ENXIO);
1731	mtx_lock(&sc->sc_queue_mtx);
1732	TAILQ_FOREACH_SAFE(bp, &sc->sc_queue.queue, bio_queue, tmp) {
1733		if (bp->bio_to != pp)
1734			continue;
1735		bioq_remove(&sc->sc_queue, bp);
1736		g_io_deliver(bp, ENXIO);
1737	}
1738	mtx_unlock(&sc->sc_queue_mtx);
1739	G_RAID_DEBUG1(0, sc, "Provider %s for volume %s destroyed.",
1740	    pp->name, vol->v_name);
1741	g_wither_provider(pp, ENXIO);
1742	g_topology_unlock();
1743	vol->v_provider = NULL;
1744}
1745
1746/*
1747 * Update volume state.
1748 */
1749static int
1750g_raid_update_volume(struct g_raid_volume *vol, u_int event)
1751{
1752	struct g_raid_softc *sc;
1753
1754	sc = vol->v_softc;
1755	sx_assert(&sc->sc_lock, SX_XLOCKED);
1756
1757	G_RAID_DEBUG1(2, sc, "Event %s for volume %s.",
1758	    g_raid_volume_event2str(event),
1759	    vol->v_name);
1760	switch (event) {
1761	case G_RAID_VOLUME_E_DOWN:
1762		if (vol->v_provider != NULL)
1763			g_raid_destroy_provider(vol);
1764		break;
1765	case G_RAID_VOLUME_E_UP:
1766		if (vol->v_provider == NULL)
1767			g_raid_launch_provider(vol);
1768		break;
1769	case G_RAID_VOLUME_E_START:
1770		if (vol->v_tr)
1771			G_RAID_TR_START(vol->v_tr);
1772		return (0);
1773	default:
1774		if (sc->sc_md)
1775			G_RAID_MD_VOLUME_EVENT(sc->sc_md, vol, event);
1776		return (0);
1777	}
1778
1779	/* Manage root mount release. */
1780	if (vol->v_starting) {
1781		vol->v_starting = 0;
1782		G_RAID_DEBUG1(1, sc, "root_mount_rel %p", vol->v_rootmount);
1783		root_mount_rel(vol->v_rootmount);
1784		vol->v_rootmount = NULL;
1785	}
1786	if (vol->v_stopping && vol->v_provider_open == 0)
1787		g_raid_destroy_volume(vol);
1788	return (0);
1789}
1790
1791/*
1792 * Update subdisk state.
1793 */
1794static int
1795g_raid_update_subdisk(struct g_raid_subdisk *sd, u_int event)
1796{
1797	struct g_raid_softc *sc;
1798	struct g_raid_volume *vol;
1799
1800	sc = sd->sd_softc;
1801	vol = sd->sd_volume;
1802	sx_assert(&sc->sc_lock, SX_XLOCKED);
1803
1804	G_RAID_DEBUG1(2, sc, "Event %s for subdisk %s:%d-%s.",
1805	    g_raid_subdisk_event2str(event),
1806	    vol->v_name, sd->sd_pos,
1807	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
1808	if (vol->v_tr)
1809		G_RAID_TR_EVENT(vol->v_tr, sd, event);
1810
1811	return (0);
1812}
1813
1814/*
1815 * Update disk state.
1816 */
1817static int
1818g_raid_update_disk(struct g_raid_disk *disk, u_int event)
1819{
1820	struct g_raid_softc *sc;
1821
1822	sc = disk->d_softc;
1823	sx_assert(&sc->sc_lock, SX_XLOCKED);
1824
1825	G_RAID_DEBUG1(2, sc, "Event %s for disk %s.",
1826	    g_raid_disk_event2str(event),
1827	    g_raid_get_diskname(disk));
1828
1829	if (sc->sc_md)
1830		G_RAID_MD_EVENT(sc->sc_md, disk, event);
1831	return (0);
1832}
1833
1834/*
1835 * Node event.
1836 */
1837static int
1838g_raid_update_node(struct g_raid_softc *sc, u_int event)
1839{
1840	sx_assert(&sc->sc_lock, SX_XLOCKED);
1841
1842	G_RAID_DEBUG1(2, sc, "Event %s for the array.",
1843	    g_raid_node_event2str(event));
1844
1845	if (event == G_RAID_NODE_E_WAKE)
1846		return (0);
1847	if (sc->sc_md)
1848		G_RAID_MD_EVENT(sc->sc_md, NULL, event);
1849	return (0);
1850}
1851
1852static int
1853g_raid_access(struct g_provider *pp, int acr, int acw, int ace)
1854{
1855	struct g_raid_volume *vol;
1856	struct g_raid_softc *sc;
1857	int dcw, opens, error = 0;
1858
1859	g_topology_assert();
1860	sc = pp->geom->softc;
1861	vol = pp->private;
1862	KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name));
1863	KASSERT(vol != NULL, ("NULL volume (provider=%s).", pp->name));
1864
1865	G_RAID_DEBUG1(2, sc, "Access request for %s: r%dw%de%d.", pp->name,
1866	    acr, acw, ace);
1867	dcw = pp->acw + acw;
1868
1869	g_topology_unlock();
1870	sx_xlock(&sc->sc_lock);
1871	/* Deny new opens while dying. */
1872	if (sc->sc_stopping != 0 && (acr > 0 || acw > 0 || ace > 0)) {
1873		error = ENXIO;
1874		goto out;
1875	}
1876	/* Deny write opens for read-only volumes. */
1877	if (vol->v_read_only && acw > 0) {
1878		error = EROFS;
1879		goto out;
1880	}
1881	if (dcw == 0)
1882		g_raid_clean(vol, dcw);
1883	vol->v_provider_open += acr + acw + ace;
1884	/* Handle delayed node destruction. */
1885	if (sc->sc_stopping == G_RAID_DESTROY_DELAYED &&
1886	    vol->v_provider_open == 0) {
1887		/* Count open volumes. */
1888		opens = g_raid_nopens(sc);
1889		if (opens == 0) {
1890			sc->sc_stopping = G_RAID_DESTROY_HARD;
1891			/* Wake up worker to make it self-destruct. */
1892			g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
1893		}
1894	}
1895	/* Handle open volume destruction. */
1896	if (vol->v_stopping && vol->v_provider_open == 0)
1897		g_raid_destroy_volume(vol);
1898out:
1899	sx_xunlock(&sc->sc_lock);
1900	g_topology_lock();
1901	return (error);
1902}
1903
1904struct g_raid_softc *
1905g_raid_create_node(struct g_class *mp,
1906    const char *name, struct g_raid_md_object *md)
1907{
1908	struct g_raid_softc *sc;
1909	struct g_geom *gp;
1910	int error;
1911
1912	g_topology_assert();
1913	G_RAID_DEBUG(1, "Creating array %s.", name);
1914
1915	gp = g_new_geomf(mp, "%s", name);
1916	sc = malloc(sizeof(*sc), M_RAID, M_WAITOK | M_ZERO);
1917	gp->start = g_raid_start;
1918	gp->orphan = g_raid_orphan;
1919	gp->access = g_raid_access;
1920	gp->dumpconf = g_raid_dumpconf;
1921
1922	sc->sc_md = md;
1923	sc->sc_geom = gp;
1924	sc->sc_flags = 0;
1925	TAILQ_INIT(&sc->sc_volumes);
1926	TAILQ_INIT(&sc->sc_disks);
1927	sx_init(&sc->sc_lock, "graid:lock");
1928	mtx_init(&sc->sc_queue_mtx, "graid:queue", NULL, MTX_DEF);
1929	TAILQ_INIT(&sc->sc_events);
1930	bioq_init(&sc->sc_queue);
1931	gp->softc = sc;
1932	error = kproc_create(g_raid_worker, sc, &sc->sc_worker, 0, 0,
1933	    "g_raid %s", name);
1934	if (error != 0) {
1935		G_RAID_DEBUG(0, "Cannot create kernel thread for %s.", name);
1936		mtx_destroy(&sc->sc_queue_mtx);
1937		sx_destroy(&sc->sc_lock);
1938		g_destroy_geom(sc->sc_geom);
1939		free(sc, M_RAID);
1940		return (NULL);
1941	}
1942
1943	G_RAID_DEBUG1(0, sc, "Array %s created.", name);
1944	return (sc);
1945}
1946
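/*
 * Create a new volume in the array, pick a free global ID for it and
 * hold root mount until the volume comes up.
 */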
1947struct g_raid_volume *
1948g_raid_create_volume(struct g_raid_softc *sc, const char *name, int id)
1949{
1950	struct g_raid_volume	*vol, *vol1;
1951	int i;
1952
1953	G_RAID_DEBUG1(1, sc, "Creating volume %s.", name);
1954	vol = malloc(sizeof(*vol), M_RAID, M_WAITOK | M_ZERO);
1955	vol->v_softc = sc;
1956	strlcpy(vol->v_name, name, G_RAID_MAX_VOLUMENAME);
1957	vol->v_state = G_RAID_VOLUME_S_STARTING;
1958	vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN;
1959	vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_UNKNOWN;
1960	vol->v_rotate_parity = 1;
1961	bioq_init(&vol->v_inflight);
1962	bioq_init(&vol->v_locked);
1963	LIST_INIT(&vol->v_locks);
1964	for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) {
1965		vol->v_subdisks[i].sd_softc = sc;
1966		vol->v_subdisks[i].sd_volume = vol;
1967		vol->v_subdisks[i].sd_pos = i;
1968		vol->v_subdisks[i].sd_state = G_RAID_DISK_S_NONE;
1969	}
1970
1971	/* Find free ID for this volume. */
1972	g_topology_lock();
1973	vol1 = vol;
1974	if (id >= 0) {
1975		LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) {
1976			if (vol1->v_global_id == id)
1977				break;
1978		}
1979	}
1980	if (vol1 != NULL) {
1981		for (id = 0; ; id++) {
1982			LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) {
1983				if (vol1->v_global_id == id)
1984					break;
1985			}
1986			if (vol1 == NULL)
1987				break;
1988		}
1989	}
1990	vol->v_global_id = id;
1991	LIST_INSERT_HEAD(&g_raid_volumes, vol, v_global_next);
1992	g_topology_unlock();
1993
1994	/* Delay root mounting. */
1995	vol->v_rootmount = root_mount_hold("GRAID");
1996	G_RAID_DEBUG1(1, sc, "root_mount_hold %p", vol->v_rootmount);
1997	vol->v_starting = 1;
1998	TAILQ_INSERT_TAIL(&sc->sc_volumes, vol, v_next);
1999	return (vol);
2000}
2001
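/*
 * Create a new disk and link it into the array.
 */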
2002struct g_raid_disk *
2003g_raid_create_disk(struct g_raid_softc *sc)
2004{
2005	struct g_raid_disk	*disk;
2006
2007	G_RAID_DEBUG1(1, sc, "Creating disk.");
2008	disk = malloc(sizeof(*disk), M_RAID, M_WAITOK | M_ZERO);
2009	disk->d_softc = sc;
2010	disk->d_state = G_RAID_DISK_S_NONE;
2011	TAILQ_INIT(&disk->d_subdisks);
2012	TAILQ_INSERT_TAIL(&sc->sc_disks, disk, d_next);
2013	return (disk);
2014}
2015
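/*
 * Start the volume: taste registered transformation (TR) classes and
 * attach the first one that accepts the volume.
 */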
2016int g_raid_start_volume(struct g_raid_volume *vol)
2017{
2018	struct g_raid_tr_class *class;
2019	struct g_raid_tr_object *obj;
2020	int status;
2021
2022	G_RAID_DEBUG1(2, vol->v_softc, "Starting volume %s.", vol->v_name);
2023	LIST_FOREACH(class, &g_raid_tr_classes, trc_list) {
2024		if (!class->trc_enable)
2025			continue;
2026		G_RAID_DEBUG1(2, vol->v_softc,
2027		    "Tasting volume %s for %s transformation.",
2028		    vol->v_name, class->name);
2029		obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
2030		    M_WAITOK);
2031		obj->tro_class = class;
2032		obj->tro_volume = vol;
2033		status = G_RAID_TR_TASTE(obj, vol);
2034		if (status != G_RAID_TR_TASTE_FAIL)
2035			break;
2036		kobj_delete((kobj_t)obj, M_RAID);
2037	}
2038	if (class == NULL) {
2039		G_RAID_DEBUG1(0, vol->v_softc,
2040		    "No transformation module found for %s.",
2041		    vol->v_name);
2042		vol->v_tr = NULL;
2043		g_raid_change_volume_state(vol, G_RAID_VOLUME_S_UNSUPPORTED);
2044		g_raid_event_send(vol, G_RAID_VOLUME_E_DOWN,
2045		    G_RAID_EVENT_VOLUME);
2046		return (-1);
2047	}
2048	G_RAID_DEBUG1(2, vol->v_softc,
2049	    "Transformation module %s chosen for %s.",
2050	    class->name, vol->v_name);
2051	vol->v_tr = obj;
2052	return (0);
2053}
2054
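/*
 * Destroy the whole node: tear down volumes and disks, free the metadata
 * object and wither the geom.  When called from the worker thread, also
 * terminate that thread.
 */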
2055int
2056g_raid_destroy_node(struct g_raid_softc *sc, int worker)
2057{
2058	struct g_raid_volume *vol, *tmpv;
2059	struct g_raid_disk *disk, *tmpd;
2060	int error = 0;
2061
2062	sc->sc_stopping = G_RAID_DESTROY_HARD;
2063	TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tmpv) {
2064		if (g_raid_destroy_volume(vol))
2065			error = EBUSY;
2066	}
2067	if (error)
2068		return (error);
2069	TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tmpd) {
2070		if (g_raid_destroy_disk(disk))
2071			error = EBUSY;
2072	}
2073	if (error)
2074		return (error);
2075	if (sc->sc_md) {
2076		G_RAID_MD_FREE(sc->sc_md);
2077		kobj_delete((kobj_t)sc->sc_md, M_RAID);
2078		sc->sc_md = NULL;
2079	}
2080	if (sc->sc_geom != NULL) {
2081		G_RAID_DEBUG1(0, sc, "Array %s destroyed.", sc->sc_name);
2082		g_topology_lock();
2083		sc->sc_geom->softc = NULL;
2084		g_wither_geom(sc->sc_geom, ENXIO);
2085		g_topology_unlock();
2086		sc->sc_geom = NULL;
2087	} else
2088		G_RAID_DEBUG(1, "Array destroyed.");
2089	if (worker) {
2090		g_raid_event_cancel(sc, sc);
2091		mtx_destroy(&sc->sc_queue_mtx);
2092		sx_xunlock(&sc->sc_lock);
2093		sx_destroy(&sc->sc_lock);
2094		wakeup(&sc->sc_stopping);
2095		free(sc, M_RAID);
2096		curthread->td_pflags &= ~TDP_GEOM;
2097		G_RAID_DEBUG(1, "Thread exiting.");
2098		kproc_exit(0);
2099	} else {
2100		/* Wake up worker to make it self-destruct. */
2101		g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
2102	}
2103	return (0);
2104}
2105
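/*
 * Destroy a volume.  Returns EBUSY if the volume still has pending
 * events, an attached provider or outstanding opens; destruction is then
 * retried later.
 */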
2106int
2107g_raid_destroy_volume(struct g_raid_volume *vol)
2108{
2109	struct g_raid_softc *sc;
2110	struct g_raid_disk *disk;
2111	int i;
2112
2113	sc = vol->v_softc;
2114	G_RAID_DEBUG1(2, sc, "Destroying volume %s.", vol->v_name);
2115	vol->v_stopping = 1;
2116	if (vol->v_state != G_RAID_VOLUME_S_STOPPED) {
2117		if (vol->v_tr) {
2118			G_RAID_TR_STOP(vol->v_tr);
2119			return (EBUSY);
2120		} else
2121			vol->v_state = G_RAID_VOLUME_S_STOPPED;
2122	}
2123	if (g_raid_event_check(sc, vol) != 0)
2124		return (EBUSY);
2125	if (vol->v_provider != NULL)
2126		return (EBUSY);
2127	if (vol->v_provider_open != 0)
2128		return (EBUSY);
2129	if (vol->v_tr) {
2130		G_RAID_TR_FREE(vol->v_tr);
2131		kobj_delete((kobj_t)vol->v_tr, M_RAID);
2132		vol->v_tr = NULL;
2133	}
2134	if (vol->v_rootmount)
2135		root_mount_rel(vol->v_rootmount);
2136	g_topology_lock();
2137	LIST_REMOVE(vol, v_global_next);
2138	g_topology_unlock();
2139	TAILQ_REMOVE(&sc->sc_volumes, vol, v_next);
2140	for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) {
2141		g_raid_event_cancel(sc, &vol->v_subdisks[i]);
2142		disk = vol->v_subdisks[i].sd_disk;
2143		if (disk == NULL)
2144			continue;
2145		TAILQ_REMOVE(&disk->d_subdisks, &vol->v_subdisks[i], sd_next);
2146	}
2147	G_RAID_DEBUG1(2, sc, "Volume %s destroyed.", vol->v_name);
2148	if (sc->sc_md)
2149		G_RAID_MD_FREE_VOLUME(sc->sc_md, vol);
2150	g_raid_event_cancel(sc, vol);
2151	free(vol, M_RAID);
2152	if (sc->sc_stopping == G_RAID_DESTROY_HARD) {
2153		/* Wake up worker to let it self-destruct. */
2154		g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
2155	}
2156	return (0);
2157}
2158
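/*
 * Destroy a disk: detach its consumer, disconnect its subdisks and free it.
 */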
2159int
2160g_raid_destroy_disk(struct g_raid_disk *disk)
2161{
2162	struct g_raid_softc *sc;
2163	struct g_raid_subdisk *sd, *tmp;
2164
2165	sc = disk->d_softc;
2166	G_RAID_DEBUG1(2, sc, "Destroying disk.");
2167	if (disk->d_consumer) {
2168		g_raid_kill_consumer(sc, disk->d_consumer);
2169		disk->d_consumer = NULL;
2170	}
2171	TAILQ_FOREACH_SAFE(sd, &disk->d_subdisks, sd_next, tmp) {
2172		g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE);
2173		g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED,
2174		    G_RAID_EVENT_SUBDISK);
2175		TAILQ_REMOVE(&disk->d_subdisks, sd, sd_next);
2176		sd->sd_disk = NULL;
2177	}
2178	TAILQ_REMOVE(&sc->sc_disks, disk, d_next);
2179	if (sc->sc_md)
2180		G_RAID_MD_FREE_DISK(sc->sc_md, disk);
2181	g_raid_event_cancel(sc, disk);
2182	free(disk, M_RAID);
2183	return (0);
2184}
2185
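/*
 * Request node destruction according to 'how' (soft, delayed or hard) and
 * wait for the worker thread to complete it.
 */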
2186int
2187g_raid_destroy(struct g_raid_softc *sc, int how)
2188{
2189	int error, opens;
2190
2191	g_topology_assert_not();
2192	if (sc == NULL)
2193		return (ENXIO);
2194	sx_assert(&sc->sc_lock, SX_XLOCKED);
2195
2196	/* Count open volumes. */
2197	opens = g_raid_nopens(sc);
2198
2199	/* React if some volumes are still open. */
2200	if (opens > 0) {
2201		switch (how) {
2202		case G_RAID_DESTROY_SOFT:
2203			G_RAID_DEBUG1(1, sc,
2204			    "%d volumes are still open.",
2205			    opens);
2206			sx_xunlock(&sc->sc_lock);
2207			return (EBUSY);
2208		case G_RAID_DESTROY_DELAYED:
2209			G_RAID_DEBUG1(1, sc,
2210			    "Array will be destroyed on last close.");
2211			sc->sc_stopping = G_RAID_DESTROY_DELAYED;
2212			sx_xunlock(&sc->sc_lock);
2213			return (EBUSY);
2214		case G_RAID_DESTROY_HARD:
2215			G_RAID_DEBUG1(1, sc,
2216			    "%d volumes are still open.",
2217			    opens);
2218		}
2219	}
2220
2221	/* Mark node for destruction. */
2222	sc->sc_stopping = G_RAID_DESTROY_HARD;
2223	/* Wake up worker to let it self-destruct. */
2224	g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
2225	/* Sleep until node destroyed. */
2226	error = sx_sleep(&sc->sc_stopping, &sc->sc_lock,
2227	    PRIBIO | PDROP, "r:destroy", hz * 3);
2228	return (error == EWOULDBLOCK ? EBUSY : 0);
2229}
2230
2231static void
2232g_raid_taste_orphan(struct g_consumer *cp)
2233{
2234
2235	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
2236	    cp->provider->name));
2237}
2238
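/*
 * GEOM taste method: offer the provider to every registered metadata
 * class until one of them recognizes its metadata.
 */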
2239static struct g_geom *
2240g_raid_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
2241{
2242	struct g_consumer *cp;
2243	struct g_geom *gp, *geom;
2244	struct g_raid_md_class *class;
2245	struct g_raid_md_object *obj;
2246	int status;
2247
2248	g_topology_assert();
2249	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
2250	if (!g_raid_enable)
2251		return (NULL);
2252	G_RAID_DEBUG(2, "Tasting provider %s.", pp->name);
2253
2254	geom = NULL;
2255	status = G_RAID_MD_TASTE_FAIL;
2256	gp = g_new_geomf(mp, "raid:taste");
2257	/*
2258	 * This orphan function should never be called.
2259	 */
2260	gp->orphan = g_raid_taste_orphan;
2261	cp = g_new_consumer(gp);
2262	cp->flags |= G_CF_DIRECT_RECEIVE;
2263	g_attach(cp, pp);
2264	if (g_access(cp, 1, 0, 0) != 0)
2265		goto ofail;
2266
2267	LIST_FOREACH(class, &g_raid_md_classes, mdc_list) {
2268		if (!class->mdc_enable)
2269			continue;
2270		G_RAID_DEBUG(2, "Tasting provider %s for %s metadata.",
2271		    pp->name, class->name);
2272		obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
2273		    M_WAITOK);
2274		obj->mdo_class = class;
2275		status = G_RAID_MD_TASTE(obj, mp, cp, &geom);
2276		if (status != G_RAID_MD_TASTE_NEW)
2277			kobj_delete((kobj_t)obj, M_RAID);
2278		if (status != G_RAID_MD_TASTE_FAIL)
2279			break;
2280	}
2281
2282	if (status == G_RAID_MD_TASTE_FAIL)
2283		(void)g_access(cp, -1, 0, 0);
2284ofail:
2285	g_detach(cp);
2286	g_destroy_consumer(cp);
2287	g_destroy_geom(gp);
2288	G_RAID_DEBUG(2, "Tasting provider %s done.", pp->name);
2289	return (geom);
2290}
2291
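/*
 * Create a new array for the metadata format given by name, passing the
 * gctl request to the matching metadata class.
 */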
2292int
2293g_raid_create_node_format(const char *format, struct gctl_req *req,
2294    struct g_geom **gp)
2295{
2296	struct g_raid_md_class *class;
2297	struct g_raid_md_object *obj;
2298	int status;
2299
2300	G_RAID_DEBUG(2, "Creating array for %s metadata.", format);
2301	LIST_FOREACH(class, &g_raid_md_classes, mdc_list) {
2302		if (strcasecmp(class->name, format) == 0)
2303			break;
2304	}
2305	if (class == NULL) {
2306		G_RAID_DEBUG(1, "No support for %s metadata.", format);
2307		return (G_RAID_MD_TASTE_FAIL);
2308	}
2309	obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
2310	    M_WAITOK);
2311	obj->mdo_class = class;
2312	status = G_RAID_MD_CREATE_REQ(obj, &g_raid_class, req, gp);
2313	if (status != G_RAID_MD_TASTE_NEW)
2314		kobj_delete((kobj_t)obj, M_RAID);
2315	return (status);
2316}
2317
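/*
 * GEOM destroy_geom method: perform a soft destroy of the node.
 */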
2318static int
2319g_raid_destroy_geom(struct gctl_req *req __unused,
2320    struct g_class *mp __unused, struct g_geom *gp)
2321{
2322	struct g_raid_softc *sc;
2323	int error;
2324
2325	g_topology_unlock();
2326	sc = gp->softc;
2327	sx_xlock(&sc->sc_lock);
2328	g_cancel_event(sc);
2329	error = g_raid_destroy(gp->softc, G_RAID_DESTROY_SOFT);
2330	g_topology_lock();
2331	return (error);
2332}
2333
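/*
 * Ask the metadata module to update on-disk metadata, unless the node is
 * already being hard-stopped.
 */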
2334void g_raid_write_metadata(struct g_raid_softc *sc, struct g_raid_volume *vol,
2335    struct g_raid_subdisk *sd, struct g_raid_disk *disk)
2336{
2337
2338	if (sc->sc_stopping == G_RAID_DESTROY_HARD)
2339		return;
2340	if (sc->sc_md)
2341		G_RAID_MD_WRITE(sc->sc_md, vol, sd, disk);
2342}
2343
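/*
 * Report a disk failure to the metadata module after validating that the
 * disk exists and is currently active.
 */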
2344void g_raid_fail_disk(struct g_raid_softc *sc,
2345    struct g_raid_subdisk *sd, struct g_raid_disk *disk)
2346{
2347
2348	if (disk == NULL)
2349		disk = sd->sd_disk;
2350	if (disk == NULL) {
2351		G_RAID_DEBUG1(0, sc, "Warning! Fail request to an absent disk!");
2352		return;
2353	}
2354	if (disk->d_state != G_RAID_DISK_S_ACTIVE) {
2355		G_RAID_DEBUG1(0, sc, "Warning! Fail request to a disk in a "
2356		    "wrong state (%s)!", g_raid_disk_state2str(disk->d_state));
2357		return;
2358	}
2359	if (sc->sc_md)
2360		G_RAID_MD_FAIL_DISK(sc->sc_md, sd, disk);
2361}
2362
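/*
 * GEOM dumpconf method: export volume, disk and node status into the
 * GEOM configuration XML.
 */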
2363static void
2364g_raid_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
2365    struct g_consumer *cp, struct g_provider *pp)
2366{
2367	struct g_raid_softc *sc;
2368	struct g_raid_volume *vol;
2369	struct g_raid_subdisk *sd;
2370	struct g_raid_disk *disk;
2371	int i, s;
2372
2373	g_topology_assert();
2374
2375	sc = gp->softc;
2376	if (sc == NULL)
2377		return;
2378	if (pp != NULL) {
2379		vol = pp->private;
2380		g_topology_unlock();
2381		sx_xlock(&sc->sc_lock);
2382		sbuf_printf(sb, "%s<descr>%s %s volume</descr>\n", indent,
2383		    sc->sc_md->mdo_class->name,
2384		    g_raid_volume_level2str(vol->v_raid_level,
2385		    vol->v_raid_level_qualifier));
2386		sbuf_printf(sb, "%s<Label>%s</Label>\n", indent,
2387		    vol->v_name);
2388		sbuf_printf(sb, "%s<RAIDLevel>%s</RAIDLevel>\n", indent,
2389		    g_raid_volume_level2str(vol->v_raid_level,
2390		    vol->v_raid_level_qualifier));
2391		sbuf_printf(sb,
2392		    "%s<Transformation>%s</Transformation>\n", indent,
2393		    vol->v_tr ? vol->v_tr->tro_class->name : "NONE");
2394		sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
2395		    vol->v_disks_count);
2396		sbuf_printf(sb, "%s<Strip>%u</Strip>\n", indent,
2397		    vol->v_strip_size);
2398		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
2399		    g_raid_volume_state2str(vol->v_state));
2400		sbuf_printf(sb, "%s<Dirty>%s</Dirty>\n", indent,
2401		    vol->v_dirty ? "Yes" : "No");
2402		sbuf_printf(sb, "%s<Subdisks>", indent);
2403		for (i = 0; i < vol->v_disks_count; i++) {
2404			sd = &vol->v_subdisks[i];
2405			if (sd->sd_disk != NULL &&
2406			    sd->sd_disk->d_consumer != NULL) {
2407				sbuf_printf(sb, "%s ",
2408				    g_raid_get_diskname(sd->sd_disk));
2409			} else {
2410				sbuf_printf(sb, "NONE ");
2411			}
2412			sbuf_printf(sb, "(%s",
2413			    g_raid_subdisk_state2str(sd->sd_state));
2414			if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
2415			    sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
2416				sbuf_printf(sb, " %d%%",
2417				    (int)(sd->sd_rebuild_pos * 100 /
2418				     sd->sd_size));
2419			}
2420			sbuf_printf(sb, ")");
2421			if (i + 1 < vol->v_disks_count)
2422				sbuf_printf(sb, ", ");
2423		}
2424		sbuf_printf(sb, "</Subdisks>\n");
2425		sx_xunlock(&sc->sc_lock);
2426		g_topology_lock();
2427	} else if (cp != NULL) {
2428		disk = cp->private;
2429		if (disk == NULL)
2430			return;
2431		g_topology_unlock();
2432		sx_xlock(&sc->sc_lock);
2433		sbuf_printf(sb, "%s<State>%s", indent,
2434		    g_raid_disk_state2str(disk->d_state));
2435		if (!TAILQ_EMPTY(&disk->d_subdisks)) {
2436			sbuf_printf(sb, " (");
2437			TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
2438				sbuf_printf(sb, "%s",
2439				    g_raid_subdisk_state2str(sd->sd_state));
2440				if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
2441				    sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
2442					sbuf_printf(sb, " %d%%",
2443					    (int)(sd->sd_rebuild_pos * 100 /
2444					     sd->sd_size));
2445				}
2446				if (TAILQ_NEXT(sd, sd_next))
2447					sbuf_printf(sb, ", ");
2448			}
2449			sbuf_printf(sb, ")");
2450		}
2451		sbuf_printf(sb, "</State>\n");
2452		sbuf_printf(sb, "%s<Subdisks>", indent);
2453		TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
2454			sbuf_printf(sb, "r%d(%s):%d@%ju",
2455			    sd->sd_volume->v_global_id,
2456			    sd->sd_volume->v_name,
2457			    sd->sd_pos, sd->sd_offset);
2458			if (TAILQ_NEXT(sd, sd_next))
2459				sbuf_printf(sb, ", ");
2460		}
2461		sbuf_printf(sb, "</Subdisks>\n");
2462		sbuf_printf(sb, "%s<ReadErrors>%d</ReadErrors>\n", indent,
2463		    disk->d_read_errs);
2464		sx_xunlock(&sc->sc_lock);
2465		g_topology_lock();
2466	} else {
2467		g_topology_unlock();
2468		sx_xlock(&sc->sc_lock);
2469		if (sc->sc_md) {
2470			sbuf_printf(sb, "%s<Metadata>%s</Metadata>\n", indent,
2471			    sc->sc_md->mdo_class->name);
2472		}
2473		if (!TAILQ_EMPTY(&sc->sc_volumes)) {
2474			s = 0xff;
2475			TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
2476				if (vol->v_state < s)
2477					s = vol->v_state;
2478			}
2479			sbuf_printf(sb, "%s<State>%s</State>\n", indent,
2480			    g_raid_volume_state2str(s));
2481		}
2482		sx_xunlock(&sc->sc_lock);
2483		g_topology_lock();
2484	}
2485}
2486
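/*
 * shutdown_post_sync handler: mark all volumes clean and schedule delayed
 * destruction of every array.
 */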
2487static void
2488g_raid_shutdown_post_sync(void *arg, int howto)
2489{
2490	struct g_class *mp;
2491	struct g_geom *gp, *gp2;
2492	struct g_raid_softc *sc;
2493	struct g_raid_volume *vol;
2494
2495	mp = arg;
2496	DROP_GIANT();
2497	g_topology_lock();
2498	g_raid_shutdown = 1;
2499	LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
2500		if ((sc = gp->softc) == NULL)
2501			continue;
2502		g_topology_unlock();
2503		sx_xlock(&sc->sc_lock);
2504		TAILQ_FOREACH(vol, &sc->sc_volumes, v_next)
2505			g_raid_clean(vol, -1);
2506		g_cancel_event(sc);
2507		g_raid_destroy(sc, G_RAID_DESTROY_DELAYED);
2508		g_topology_lock();
2509	}
2510	g_topology_unlock();
2511	PICKUP_GIANT();
2512}
2513
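/*
 * Class initialization: register the shutdown event handler.
 */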
2514static void
2515g_raid_init(struct g_class *mp)
2516{
2517
2518	g_raid_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync,
2519	    g_raid_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST);
2520	if (g_raid_post_sync == NULL)
2521		G_RAID_DEBUG(0, "Warning! Cannot register shutdown event.");
2522	g_raid_started = 1;
2523}
2524
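/*
 * Class teardown: deregister the shutdown event handler.
 */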
2525static void
2526g_raid_fini(struct g_class *mp)
2527{
2528
2529	if (g_raid_post_sync != NULL)
2530		EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_raid_post_sync);
2531	g_raid_started = 0;
2532}
2533
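/*
 * Module event handler for metadata classes: keep the class list sorted
 * by priority and retaste providers when a new class is loaded.
 */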
2534int
2535g_raid_md_modevent(module_t mod, int type, void *arg)
2536{
2537	struct g_raid_md_class *class, *c, *nc;
2538	int error;
2539
2540	error = 0;
2541	class = arg;
2542	switch (type) {
2543	case MOD_LOAD:
2544		c = LIST_FIRST(&g_raid_md_classes);
2545		if (c == NULL || c->mdc_priority > class->mdc_priority)
2546			LIST_INSERT_HEAD(&g_raid_md_classes, class, mdc_list);
2547		else {
2548			while ((nc = LIST_NEXT(c, mdc_list)) != NULL &&
2549			    nc->mdc_priority < class->mdc_priority)
2550				c = nc;
2551			LIST_INSERT_AFTER(c, class, mdc_list);
2552		}
2553		if (g_raid_started)
2554			g_retaste(&g_raid_class);
2555		break;
2556	case MOD_UNLOAD:
2557		LIST_REMOVE(class, mdc_list);
2558		break;
2559	default:
2560		error = EOPNOTSUPP;
2561		break;
2562	}
2563
2564	return (error);
2565}
2566
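/*
 * Module event handler for transformation classes: keep the class list
 * sorted by priority.
 */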
2567int
2568g_raid_tr_modevent(module_t mod, int type, void *arg)
2569{
2570	struct g_raid_tr_class *class, *c, *nc;
2571	int error;
2572
2573	error = 0;
2574	class = arg;
2575	switch (type) {
2576	case MOD_LOAD:
2577		c = LIST_FIRST(&g_raid_tr_classes);
2578		if (c == NULL || c->trc_priority > class->trc_priority)
2579			LIST_INSERT_HEAD(&g_raid_tr_classes, class, trc_list);
2580		else {
2581			while ((nc = LIST_NEXT(c, trc_list)) != NULL &&
2582			    nc->trc_priority < class->trc_priority)
2583				c = nc;
2584			LIST_INSERT_AFTER(c, class, trc_list);
2585		}
2586		break;
2587	case MOD_UNLOAD:
2588		LIST_REMOVE(class, trc_list);
2589		break;
2590	default:
2591		error = EOPNOTSUPP;
2592		break;
2593	}
2594
2595	return (error);
2596}
2597
2598/*
2599 * Use local implementation of DECLARE_GEOM_CLASS(g_raid_class, g_raid)
2600 * to reduce module priority, allowing submodules to register first.
2601 */
2602static moduledata_t g_raid_mod = {
2603	"g_raid",
2604	g_modevent,
2605	&g_raid_class
2606};
2607DECLARE_MODULE(g_raid, g_raid_mod, SI_SUB_DRIVERS, SI_ORDER_THIRD);
2608MODULE_VERSION(geom_raid, 0);
2609