/*-
 * Copyright (c) 2004-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/geom/mirror/g_mirror.c 309206 2016-11-27 05:59:17Z mav $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/bio.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/eventhandler.h>
#include <vm/uma.h>
#include <geom/geom.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/sched.h>
#include <geom/mirror/g_mirror.h>

FEATURE(geom_mirror, "GEOM mirroring support");

static MALLOC_DEFINE(M_MIRROR, "mirror_data", "GEOM_MIRROR Data");

SYSCTL_DECL(_kern_geom);
static SYSCTL_NODE(_kern_geom, OID_AUTO, mirror, CTLFLAG_RW, 0,
    "GEOM_MIRROR stuff");
u_int g_mirror_debug = 0;
TUNABLE_INT("kern.geom.mirror.debug", &g_mirror_debug);
SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, debug, CTLFLAG_RW, &g_mirror_debug, 0,
    "Debug level");
static u_int g_mirror_timeout = 4;
TUNABLE_INT("kern.geom.mirror.timeout", &g_mirror_timeout);
SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, timeout, CTLFLAG_RW, &g_mirror_timeout,
    0, "Time to wait on all mirror components");
static u_int g_mirror_idletime = 5;
TUNABLE_INT("kern.geom.mirror.idletime", &g_mirror_idletime);
SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, idletime, CTLFLAG_RW,
    &g_mirror_idletime, 0, "Mark components as clean when idling");
static u_int g_mirror_disconnect_on_failure = 1;
TUNABLE_INT("kern.geom.mirror.disconnect_on_failure",
    &g_mirror_disconnect_on_failure);
SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, disconnect_on_failure, CTLFLAG_RW,
    &g_mirror_disconnect_on_failure, 0, "Disconnect component on I/O failure.");
static u_int g_mirror_syncreqs = 2;
TUNABLE_INT("kern.geom.mirror.sync_requests", &g_mirror_syncreqs);
SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, sync_requests, CTLFLAG_RDTUN,
    &g_mirror_syncreqs, 0, "Parallel synchronization I/O requests.");

#define	MSLEEP(ident, mtx, priority, wmesg, timeout)	do {		\
	G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, (ident));	\
	msleep((ident), (mtx), (priority), (wmesg), (timeout));		\
	G_MIRROR_DEBUG(4, "%s: Woken up %p.", __func__, (ident));	\
} while (0)

static eventhandler_tag g_mirror_post_sync = NULL;
static int g_mirror_shutdown = 0;

static g_ctl_destroy_geom_t g_mirror_destroy_geom;
static g_taste_t g_mirror_taste;
static g_init_t g_mirror_init;
static g_fini_t g_mirror_fini;
static g_provgone_t g_mirror_providergone;
static g_resize_t g_mirror_resize;

struct g_class g_mirror_class = {
	.name = G_MIRROR_CLASS_NAME,
	.version = G_VERSION,
	.ctlreq = g_mirror_config,
	.taste = g_mirror_taste,
	.destroy_geom = g_mirror_destroy_geom,
	.init = g_mirror_init,
	.fini = g_mirror_fini,
	.providergone = g_mirror_providergone,
	.resize = g_mirror_resize
};


static void g_mirror_destroy_provider(struct g_mirror_softc *sc);
static int g_mirror_update_disk(struct g_mirror_disk *disk, u_int state);
static void g_mirror_update_device(struct g_mirror_softc *sc, boolean_t force);
static void g_mirror_dumpconf(struct sbuf *sb, const char *indent,
    struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
static void g_mirror_sync_stop(struct g_mirror_disk *disk, int type);
static void g_mirror_register_request(struct bio *bp);
static void g_mirror_sync_release(struct g_mirror_softc *sc);

static const char *
g_mirror_disk_state2str(int state)
{

	switch (state) {
	case G_MIRROR_DISK_STATE_NONE:
		return ("NONE");
	case G_MIRROR_DISK_STATE_NEW:
		return ("NEW");
	case G_MIRROR_DISK_STATE_ACTIVE:
		return ("ACTIVE");
	case G_MIRROR_DISK_STATE_STALE:
		return ("STALE");
	case G_MIRROR_DISK_STATE_SYNCHRONIZING:
		return ("SYNCHRONIZING");
	case G_MIRROR_DISK_STATE_DISCONNECTED:
		return ("DISCONNECTED");
	case G_MIRROR_DISK_STATE_DESTROY:
		return ("DESTROY");
	default:
		return ("INVALID");
	}
}

static const char *
g_mirror_device_state2str(int state)
{

	switch (state) {
	case G_MIRROR_DEVICE_STATE_STARTING:
		return ("STARTING");
	case G_MIRROR_DEVICE_STATE_RUNNING:
		return ("RUNNING");
	default:
		return ("INVALID");
	}
}

static const char *
g_mirror_get_diskname(struct g_mirror_disk *disk)
{

	if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL)
		return ("[unknown]");
	return (disk->d_name);
}

/*
 * --- Event handling functions ---
 * Events in geom_mirror are used to maintain the status of disks and of the
 * device from a single thread, which simplifies locking.
 */
static void
g_mirror_event_free(struct g_mirror_event *ep)
{

	free(ep, M_MIRROR);
}

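/*
 * Post an event to the worker thread.  For device-wide events
 * (G_MIRROR_EVENT_DEVICE) 'arg' is the softc, otherwise it is the disk.
 * Unless G_MIRROR_EVENT_DONTWAIT is set, drop sc_lock and sleep until the
 * worker has processed the event, then return its error.
 */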
int
g_mirror_event_send(void *arg, int state, int flags)
{
	struct g_mirror_softc *sc;
	struct g_mirror_disk *disk;
	struct g_mirror_event *ep;
	int error;

	ep = malloc(sizeof(*ep), M_MIRROR, M_WAITOK);
	G_MIRROR_DEBUG(4, "%s: Sending event %p.", __func__, ep);
	if ((flags & G_MIRROR_EVENT_DEVICE) != 0) {
		disk = NULL;
		sc = arg;
	} else {
		disk = arg;
		sc = disk->d_softc;
	}
	ep->e_disk = disk;
	ep->e_state = state;
	ep->e_flags = flags;
	ep->e_error = 0;
	mtx_lock(&sc->sc_events_mtx);
	TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next);
	mtx_unlock(&sc->sc_events_mtx);
	G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc);
	mtx_lock(&sc->sc_queue_mtx);
	wakeup(sc);
	mtx_unlock(&sc->sc_queue_mtx);
	if ((flags & G_MIRROR_EVENT_DONTWAIT) != 0)
		return (0);
	sx_assert(&sc->sc_lock, SX_XLOCKED);
	G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, ep);
	sx_xunlock(&sc->sc_lock);
	while ((ep->e_flags & G_MIRROR_EVENT_DONE) == 0) {
		mtx_lock(&sc->sc_events_mtx);
		MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "m:event",
		    hz * 5);
	}
	error = ep->e_error;
	g_mirror_event_free(ep);
	sx_xlock(&sc->sc_lock);
	return (error);
}

static struct g_mirror_event *
g_mirror_event_get(struct g_mirror_softc *sc)
{
	struct g_mirror_event *ep;

	mtx_lock(&sc->sc_events_mtx);
	ep = TAILQ_FIRST(&sc->sc_events);
	mtx_unlock(&sc->sc_events_mtx);
	return (ep);
}

static void
g_mirror_event_remove(struct g_mirror_softc *sc, struct g_mirror_event *ep)
{

	mtx_lock(&sc->sc_events_mtx);
	TAILQ_REMOVE(&sc->sc_events, ep, e_next);
	mtx_unlock(&sc->sc_events_mtx);
}

static void
g_mirror_event_cancel(struct g_mirror_disk *disk)
{
	struct g_mirror_softc *sc;
	struct g_mirror_event *ep, *tmpep;

	sc = disk->d_softc;
	sx_assert(&sc->sc_lock, SX_XLOCKED);

	mtx_lock(&sc->sc_events_mtx);
	TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) {
		if ((ep->e_flags & G_MIRROR_EVENT_DEVICE) != 0)
			continue;
		if (ep->e_disk != disk)
			continue;
		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
		if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0)
			g_mirror_event_free(ep);
		else {
			ep->e_error = ECANCELED;
			wakeup(ep);
		}
	}
	mtx_unlock(&sc->sc_events_mtx);
}

/*
 * Return the number of disks in the given state.
 * If state is equal to -1, count all connected disks.
 */
u_int
g_mirror_ndisks(struct g_mirror_softc *sc, int state)
{
	struct g_mirror_disk *disk;
	u_int n = 0;

	sx_assert(&sc->sc_lock, SX_LOCKED);

	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
		if (state == -1 || disk->d_state == state)
			n++;
	}
	return (n);
}

/*
 * Find a disk in the mirror by its disk ID.
 */
static struct g_mirror_disk *
g_mirror_id2disk(struct g_mirror_softc *sc, uint32_t id)
{
	struct g_mirror_disk *disk;

	sx_assert(&sc->sc_lock, SX_XLOCKED);

	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
		if (disk->d_id == id)
			return (disk);
	}
	return (NULL);
}

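/*
 * Count the bios in the main queue which originated from the given consumer.
 */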
static u_int
g_mirror_nrequests(struct g_mirror_softc *sc, struct g_consumer *cp)
{
	struct bio *bp;
	u_int nreqs = 0;

	mtx_lock(&sc->sc_queue_mtx);
	TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
		if (bp->bio_from == cp)
			nreqs++;
	}
	mtx_unlock(&sc->sc_queue_mtx);
	return (nreqs);
}

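/*
 * A consumer is busy when it has in-flight requests (index > 0) or requests
 * still sitting in the queue.
 */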
static int
g_mirror_is_busy(struct g_mirror_softc *sc, struct g_consumer *cp)
{

	if (cp->index > 0) {
		G_MIRROR_DEBUG(2,
		    "I/O requests for %s exist, can't destroy it now.",
		    cp->provider->name);
		return (1);
	}
	if (g_mirror_nrequests(sc, cp) > 0) {
		G_MIRROR_DEBUG(2,
		    "I/O requests for %s in queue, can't destroy it now.",
		    cp->provider->name);
		return (1);
	}
	return (0);
}

static void
g_mirror_destroy_consumer(void *arg, int flags __unused)
{
	struct g_consumer *cp;

	g_topology_assert();

	cp = arg;
	G_MIRROR_DEBUG(1, "Consumer %s destroyed.", cp->provider->name);
	g_detach(cp);
	g_destroy_consumer(cp);
}

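/*
 * Close and destroy the consumer, unless it is still busy; if it was open
 * for writing, defer the destruction until after the retaste event.
 */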
static void
g_mirror_kill_consumer(struct g_mirror_softc *sc, struct g_consumer *cp)
{
	struct g_provider *pp;
	int retaste_wait;

	g_topology_assert();

	cp->private = NULL;
	if (g_mirror_is_busy(sc, cp))
		return;
	pp = cp->provider;
	retaste_wait = 0;
	if (cp->acw == 1) {
		if ((pp->geom->flags & G_GEOM_WITHER) == 0)
			retaste_wait = 1;
	}
	G_MIRROR_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr,
	    -cp->acw, -cp->ace, 0);
	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
	if (retaste_wait) {
		/*
		 * After the retaste event has been sent (inside g_access()),
		 * we can send an event to detach and destroy the consumer.
		 * A class which has a consumer attached to the given provider
		 * will not receive a retaste event for that provider.
		 * This is how I ignore retaste events when I close consumers
		 * opened for write: I detach and destroy the consumer after
		 * the retaste event is sent.
		 */
		g_post_event(g_mirror_destroy_consumer, cp, M_WAITOK, NULL);
		return;
	}
	G_MIRROR_DEBUG(1, "Consumer %s destroyed.", pp->name);
	g_detach(cp);
	g_destroy_consumer(cp);
}

static int
g_mirror_connect_disk(struct g_mirror_disk *disk, struct g_provider *pp)
{
	struct g_consumer *cp;
	int error;

	g_topology_assert_not();
	KASSERT(disk->d_consumer == NULL,
	    ("Disk already connected (device %s).", disk->d_softc->sc_name));

	g_topology_lock();
	cp = g_new_consumer(disk->d_softc->sc_geom);
	cp->flags |= G_CF_DIRECT_RECEIVE;
	error = g_attach(cp, pp);
	if (error != 0) {
		g_destroy_consumer(cp);
		g_topology_unlock();
		return (error);
	}
	error = g_access(cp, 1, 1, 1);
	if (error != 0) {
		g_detach(cp);
		g_destroy_consumer(cp);
		g_topology_unlock();
		G_MIRROR_DEBUG(0, "Cannot open consumer %s (error=%d).",
		    pp->name, error);
		return (error);
	}
	g_topology_unlock();
	disk->d_consumer = cp;
	disk->d_consumer->private = disk;
	disk->d_consumer->index = 0;

	G_MIRROR_DEBUG(2, "Disk %s connected.", g_mirror_get_diskname(disk));
	return (0);
}

static void
g_mirror_disconnect_consumer(struct g_mirror_softc *sc, struct g_consumer *cp)
{

	g_topology_assert();

	if (cp == NULL)
		return;
	if (cp->provider != NULL)
		g_mirror_kill_consumer(sc, cp);
	else
		g_destroy_consumer(cp);
}

/*
 * Initialize a disk.  This means allocating memory, creating a consumer,
 * attaching it to the provider and opening access (r1w1e1) to it.
 */
static struct g_mirror_disk *
g_mirror_init_disk(struct g_mirror_softc *sc, struct g_provider *pp,
    struct g_mirror_metadata *md, int *errorp)
{
	struct g_mirror_disk *disk;
	int i, error;

	disk = malloc(sizeof(*disk), M_MIRROR, M_NOWAIT | M_ZERO);
	if (disk == NULL) {
		error = ENOMEM;
		goto fail;
	}
	disk->d_softc = sc;
	error = g_mirror_connect_disk(disk, pp);
	if (error != 0)
		goto fail;
	disk->d_id = md->md_did;
	disk->d_state = G_MIRROR_DISK_STATE_NONE;
	disk->d_priority = md->md_priority;
	disk->d_flags = md->md_dflags;
	error = g_getattr("GEOM::candelete", disk->d_consumer, &i);
	if (error == 0 && i != 0)
		disk->d_flags |= G_MIRROR_DISK_FLAG_CANDELETE;
	if (md->md_provider[0] != '\0')
		disk->d_flags |= G_MIRROR_DISK_FLAG_HARDCODED;
	disk->d_sync.ds_consumer = NULL;
	disk->d_sync.ds_offset = md->md_sync_offset;
	disk->d_sync.ds_offset_done = md->md_sync_offset;
	disk->d_genid = md->md_genid;
	disk->d_sync.ds_syncid = md->md_syncid;
	if (errorp != NULL)
		*errorp = 0;
	return (disk);
fail:
	if (errorp != NULL)
		*errorp = error;
	if (disk != NULL)
		free(disk, M_MIRROR);
	return (NULL);
}

static void
g_mirror_destroy_disk(struct g_mirror_disk *disk)
{
	struct g_mirror_softc *sc;

	g_topology_assert_not();
	sc = disk->d_softc;
	sx_assert(&sc->sc_lock, SX_XLOCKED);

	LIST_REMOVE(disk, d_next);
	g_mirror_event_cancel(disk);
	if (sc->sc_hint == disk)
		sc->sc_hint = NULL;
	switch (disk->d_state) {
	case G_MIRROR_DISK_STATE_SYNCHRONIZING:
		g_mirror_sync_stop(disk, 1);
		/* FALLTHROUGH */
	case G_MIRROR_DISK_STATE_NEW:
	case G_MIRROR_DISK_STATE_STALE:
	case G_MIRROR_DISK_STATE_ACTIVE:
		g_topology_lock();
		g_mirror_disconnect_consumer(sc, disk->d_consumer);
		g_topology_unlock();
		free(disk, M_MIRROR);
		break;
	default:
		KASSERT(0 == 1, ("Wrong disk state (%s, %s).",
		    g_mirror_get_diskname(disk),
		    g_mirror_disk_state2str(disk->d_state)));
	}
}

static void
g_mirror_free_device(struct g_mirror_softc *sc)
{

	mtx_destroy(&sc->sc_queue_mtx);
	mtx_destroy(&sc->sc_events_mtx);
	mtx_destroy(&sc->sc_done_mtx);
	sx_destroy(&sc->sc_lock);
	free(sc, M_MIRROR);
}

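/*
 * The provider is gone; drop our reference and free the softc once the last
 * reference is released.
 */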
static void
g_mirror_providergone(struct g_provider *pp)
{
	struct g_mirror_softc *sc = pp->private;

	if ((--sc->sc_refcnt) == 0)
		g_mirror_free_device(sc);
}

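/*
 * Tear the device down: destroy the provider and all disks, cancel pending
 * events and wither both geoms.  Called with sc_lock exclusively held; the
 * lock is dropped before the softc can be freed.
 */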
static void
g_mirror_destroy_device(struct g_mirror_softc *sc)
{
	struct g_mirror_disk *disk;
	struct g_mirror_event *ep;
	struct g_geom *gp;
	struct g_consumer *cp, *tmpcp;

	g_topology_assert_not();
	sx_assert(&sc->sc_lock, SX_XLOCKED);

	gp = sc->sc_geom;
	if (sc->sc_provider != NULL)
		g_mirror_destroy_provider(sc);
	for (disk = LIST_FIRST(&sc->sc_disks); disk != NULL;
	    disk = LIST_FIRST(&sc->sc_disks)) {
		disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
		g_mirror_update_metadata(disk);
		g_mirror_destroy_disk(disk);
	}
	while ((ep = g_mirror_event_get(sc)) != NULL) {
		g_mirror_event_remove(sc, ep);
		if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0)
			g_mirror_event_free(ep);
		else {
			ep->e_error = ECANCELED;
			ep->e_flags |= G_MIRROR_EVENT_DONE;
			G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, ep);
			mtx_lock(&sc->sc_events_mtx);
			wakeup(ep);
			mtx_unlock(&sc->sc_events_mtx);
		}
	}
	callout_drain(&sc->sc_callout);

	g_topology_lock();
	LIST_FOREACH_SAFE(cp, &sc->sc_sync.ds_geom->consumer, consumer, tmpcp) {
		g_mirror_disconnect_consumer(sc, cp);
	}
	g_wither_geom(sc->sc_sync.ds_geom, ENXIO);
	G_MIRROR_DEBUG(0, "Device %s destroyed.", gp->name);
	g_wither_geom(gp, ENXIO);
	sx_xunlock(&sc->sc_lock);
	if ((--sc->sc_refcnt) == 0)
		g_mirror_free_device(sc);
	g_topology_unlock();
}

static void
g_mirror_orphan(struct g_consumer *cp)
{
	struct g_mirror_disk *disk;

	g_topology_assert();

	disk = cp->private;
	if (disk == NULL)
		return;
	disk->d_softc->sc_bump_id |= G_MIRROR_BUMP_SYNCID;
	g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED,
	    G_MIRROR_EVENT_DONTWAIT);
}

/*
 * Return the next active disk on the list.
 * It is possible that it will be the same disk as the given one.
 * If there are no active disks on the list, NULL is returned.
 */
static __inline struct g_mirror_disk *
g_mirror_find_next(struct g_mirror_softc *sc, struct g_mirror_disk *disk)
{
	struct g_mirror_disk *dp;

	for (dp = LIST_NEXT(disk, d_next); dp != disk;
	    dp = LIST_NEXT(dp, d_next)) {
		if (dp == NULL)
			dp = LIST_FIRST(&sc->sc_disks);
		if (dp->d_state == G_MIRROR_DISK_STATE_ACTIVE)
			break;
	}
	if (dp->d_state != G_MIRROR_DISK_STATE_ACTIVE)
		return (NULL);
	return (dp);
}

static struct g_mirror_disk *
g_mirror_get_disk(struct g_mirror_softc *sc)
{
	struct g_mirror_disk *disk;

	if (sc->sc_hint == NULL) {
		sc->sc_hint = LIST_FIRST(&sc->sc_disks);
		if (sc->sc_hint == NULL)
			return (NULL);
	}
	disk = sc->sc_hint;
	if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) {
		disk = g_mirror_find_next(sc, disk);
		if (disk == NULL)
			return (NULL);
	}
	sc->sc_hint = g_mirror_find_next(sc, disk);
	return (disk);
}

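/*
 * Write the metadata (or an empty sector, if md is NULL) to the last sector
 * of the component.  On failure the disk may be disconnected from the mirror.
 */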
static int
g_mirror_write_metadata(struct g_mirror_disk *disk,
    struct g_mirror_metadata *md)
{
	struct g_mirror_softc *sc;
	struct g_consumer *cp;
	off_t offset, length;
	u_char *sector;
	int error = 0;

	g_topology_assert_not();
	sc = disk->d_softc;
	sx_assert(&sc->sc_lock, SX_LOCKED);

	cp = disk->d_consumer;
	KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name));
	KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name));
	KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
	    ("Consumer %s closed? (r%dw%de%d).", cp->provider->name, cp->acr,
	    cp->acw, cp->ace));
	length = cp->provider->sectorsize;
	offset = cp->provider->mediasize - length;
	sector = malloc((size_t)length, M_MIRROR, M_WAITOK | M_ZERO);
	if (md != NULL &&
	    (sc->sc_flags & G_MIRROR_DEVICE_FLAG_WIPE) == 0) {
		/*
		 * Handle the case when the size of the parent provider
		 * was reduced.
		 */
		if (offset < md->md_mediasize)
			error = ENOSPC;
		else
			mirror_metadata_encode(md, sector);
	}
	if (error == 0)
		error = g_write_data(cp, offset, sector, length);
	free(sector, M_MIRROR);
	if (error != 0) {
		if ((disk->d_flags & G_MIRROR_DISK_FLAG_BROKEN) == 0) {
			disk->d_flags |= G_MIRROR_DISK_FLAG_BROKEN;
			G_MIRROR_DEBUG(0, "Cannot write metadata on %s "
			    "(device=%s, error=%d).",
			    g_mirror_get_diskname(disk), sc->sc_name, error);
		} else {
			G_MIRROR_DEBUG(1, "Cannot write metadata on %s "
			    "(device=%s, error=%d).",
			    g_mirror_get_diskname(disk), sc->sc_name, error);
		}
		if (g_mirror_disconnect_on_failure &&
		    g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 1) {
			sc->sc_bump_id |= G_MIRROR_BUMP_GENID;
			g_mirror_event_send(disk,
			    G_MIRROR_DISK_STATE_DISCONNECTED,
			    G_MIRROR_EVENT_DONTWAIT);
		}
	}
	return (error);
}

static int
g_mirror_clear_metadata(struct g_mirror_disk *disk)
{
	int error;

	g_topology_assert_not();
	sx_assert(&disk->d_softc->sc_lock, SX_LOCKED);

	error = g_mirror_write_metadata(disk, NULL);
	if (error == 0) {
		G_MIRROR_DEBUG(2, "Metadata on %s cleared.",
		    g_mirror_get_diskname(disk));
	} else {
		G_MIRROR_DEBUG(0,
		    "Cannot clear metadata on disk %s (error=%d).",
		    g_mirror_get_diskname(disk), error);
	}
	return (error);
}

void
g_mirror_fill_metadata(struct g_mirror_softc *sc, struct g_mirror_disk *disk,
    struct g_mirror_metadata *md)
{

	strlcpy(md->md_magic, G_MIRROR_MAGIC, sizeof(md->md_magic));
	md->md_version = G_MIRROR_VERSION;
	strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name));
	md->md_mid = sc->sc_id;
	md->md_all = sc->sc_ndisks;
	md->md_slice = sc->sc_slice;
	md->md_balance = sc->sc_balance;
	md->md_genid = sc->sc_genid;
	md->md_mediasize = sc->sc_mediasize;
	md->md_sectorsize = sc->sc_sectorsize;
	md->md_mflags = (sc->sc_flags & G_MIRROR_DEVICE_FLAG_MASK);
	bzero(md->md_provider, sizeof(md->md_provider));
	if (disk == NULL) {
		md->md_did = arc4random();
		md->md_priority = 0;
		md->md_syncid = 0;
		md->md_dflags = 0;
		md->md_sync_offset = 0;
		md->md_provsize = 0;
	} else {
		md->md_did = disk->d_id;
		md->md_priority = disk->d_priority;
		md->md_syncid = disk->d_sync.ds_syncid;
		md->md_dflags = (disk->d_flags & G_MIRROR_DISK_FLAG_MASK);
		if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING)
			md->md_sync_offset = disk->d_sync.ds_offset_done;
		else
			md->md_sync_offset = 0;
		if ((disk->d_flags & G_MIRROR_DISK_FLAG_HARDCODED) != 0) {
			strlcpy(md->md_provider,
			    disk->d_consumer->provider->name,
			    sizeof(md->md_provider));
		}
		md->md_provsize = disk->d_consumer->provider->mediasize;
	}
}

void
g_mirror_update_metadata(struct g_mirror_disk *disk)
{
	struct g_mirror_softc *sc;
	struct g_mirror_metadata md;
	int error;

	g_topology_assert_not();
	sc = disk->d_softc;
	sx_assert(&sc->sc_lock, SX_LOCKED);

	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_WIPE) == 0)
		g_mirror_fill_metadata(sc, disk, &md);
	error = g_mirror_write_metadata(disk, &md);
	if (error == 0) {
		G_MIRROR_DEBUG(2, "Metadata on %s updated.",
		    g_mirror_get_diskname(disk));
	} else {
		G_MIRROR_DEBUG(0,
		    "Cannot update metadata on disk %s (error=%d).",
		    g_mirror_get_diskname(disk), error);
	}
}

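/*
 * Bump the synchronization ID and store it in the metadata of all active and
 * synchronizing disks.
 */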
static void
g_mirror_bump_syncid(struct g_mirror_softc *sc)
{
	struct g_mirror_disk *disk;

	g_topology_assert_not();
	sx_assert(&sc->sc_lock, SX_XLOCKED);
	KASSERT(g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 0,
	    ("%s called with no active disks (device=%s).", __func__,
	    sc->sc_name));

	sc->sc_syncid++;
	G_MIRROR_DEBUG(1, "Device %s: syncid bumped to %u.", sc->sc_name,
	    sc->sc_syncid);
	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
		if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE ||
		    disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) {
			disk->d_sync.ds_syncid = sc->sc_syncid;
			g_mirror_update_metadata(disk);
		}
	}
}

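/*
 * Bump the generation ID and store it in the metadata of all active and
 * synchronizing disks.
 */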
static void
g_mirror_bump_genid(struct g_mirror_softc *sc)
{
	struct g_mirror_disk *disk;

	g_topology_assert_not();
	sx_assert(&sc->sc_lock, SX_XLOCKED);
	KASSERT(g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 0,
	    ("%s called with no active disks (device=%s).", __func__,
	    sc->sc_name));

	sc->sc_genid++;
	G_MIRROR_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name,
	    sc->sc_genid);
	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
		if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE ||
		    disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) {
			disk->d_genid = sc->sc_genid;
			g_mirror_update_metadata(disk);
		}
	}
}

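/*
 * Mark all active components as clean once the device has seen no writes for
 * g_mirror_idletime seconds.  Returns the number of seconds left until the
 * next check, or 0 if there is nothing more to do.
 */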
static int
g_mirror_idle(struct g_mirror_softc *sc, int acw)
{
	struct g_mirror_disk *disk;
	int timeout;

	g_topology_assert_not();
	sx_assert(&sc->sc_lock, SX_XLOCKED);

	if (sc->sc_provider == NULL)
		return (0);
	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0)
		return (0);
	if (sc->sc_idle)
		return (0);
	if (sc->sc_writes > 0)
		return (0);
	if (acw > 0 || (acw == -1 && sc->sc_provider->acw > 0)) {
		timeout = g_mirror_idletime - (time_uptime - sc->sc_last_write);
		if (!g_mirror_shutdown && timeout > 0)
			return (timeout);
	}
	sc->sc_idle = 1;
	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
		if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE)
			continue;
		G_MIRROR_DEBUG(1, "Disk %s (device %s) marked as clean.",
		    g_mirror_get_diskname(disk), sc->sc_name);
		disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
		g_mirror_update_metadata(disk);
	}
	return (0);
}

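/*
 * Leave the idle state: mark all active components as dirty again.
 */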
static void
g_mirror_unidle(struct g_mirror_softc *sc)
{
	struct g_mirror_disk *disk;

	g_topology_assert_not();
	sx_assert(&sc->sc_lock, SX_XLOCKED);

	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0)
		return;
	sc->sc_idle = 0;
	sc->sc_last_write = time_uptime;
	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
		if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE)
			continue;
		G_MIRROR_DEBUG(1, "Disk %s (device %s) marked as dirty.",
		    g_mirror_get_diskname(disk), sc->sc_name);
		disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY;
		g_mirror_update_metadata(disk);
	}
}

static void
g_mirror_flush_done(struct bio *bp)
{
	struct g_mirror_softc *sc;
	struct bio *pbp;

	pbp = bp->bio_parent;
	sc = pbp->bio_to->private;
	mtx_lock(&sc->sc_done_mtx);
	if (pbp->bio_error == 0)
		pbp->bio_error = bp->bio_error;
	pbp->bio_completed += bp->bio_completed;
	pbp->bio_inbed++;
	if (pbp->bio_children == pbp->bio_inbed) {
		mtx_unlock(&sc->sc_done_mtx);
		g_io_deliver(pbp, pbp->bio_error);
	} else
		mtx_unlock(&sc->sc_done_mtx);
	g_destroy_bio(bp);
}

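/*
 * Completion callback for regular requests; hand the bio back to the worker
 * thread for processing.
 */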
static void
g_mirror_done(struct bio *bp)
{
	struct g_mirror_softc *sc;

	sc = bp->bio_from->geom->softc;
	bp->bio_cflags = G_MIRROR_BIO_FLAG_REGULAR;
	mtx_lock(&sc->sc_queue_mtx);
	bioq_insert_tail(&sc->sc_queue, bp);
	mtx_unlock(&sc->sc_queue_mtx);
	wakeup(sc);
}

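/*
 * Handle completion of a regular request cloned to a component.  Failed
 * reads are retried on another active disk; on write/delete failures the
 * failed clone is dropped from the parent's accounting and the component may
 * be disconnected.  The parent bio is delivered once all clones are in.
 */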
static void
g_mirror_regular_request(struct bio *bp)
{
	struct g_mirror_softc *sc;
	struct g_mirror_disk *disk;
	struct bio *pbp;

	g_topology_assert_not();

	pbp = bp->bio_parent;
	sc = pbp->bio_to->private;
	bp->bio_from->index--;
	if (bp->bio_cmd == BIO_WRITE)
		sc->sc_writes--;
	disk = bp->bio_from->private;
	if (disk == NULL) {
		g_topology_lock();
		g_mirror_kill_consumer(sc, bp->bio_from);
		g_topology_unlock();
	}

	pbp->bio_inbed++;
	KASSERT(pbp->bio_inbed <= pbp->bio_children,
	    ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed,
	    pbp->bio_children));
	if (bp->bio_error == 0 && pbp->bio_error == 0) {
		G_MIRROR_LOGREQ(3, bp, "Request delivered.");
		g_destroy_bio(bp);
		if (pbp->bio_children == pbp->bio_inbed) {
			G_MIRROR_LOGREQ(3, pbp, "Request delivered.");
			pbp->bio_completed = pbp->bio_length;
			if (pbp->bio_cmd == BIO_WRITE ||
			    pbp->bio_cmd == BIO_DELETE) {
				bioq_remove(&sc->sc_inflight, pbp);
				/* Release delayed sync requests if possible. */
				g_mirror_sync_release(sc);
			}
			g_io_deliver(pbp, pbp->bio_error);
		}
		return;
	} else if (bp->bio_error != 0) {
		if (pbp->bio_error == 0)
			pbp->bio_error = bp->bio_error;
		if (disk != NULL) {
			if ((disk->d_flags & G_MIRROR_DISK_FLAG_BROKEN) == 0) {
				disk->d_flags |= G_MIRROR_DISK_FLAG_BROKEN;
				G_MIRROR_LOGREQ(0, bp,
				    "Request failed (error=%d).",
				    bp->bio_error);
			} else {
				G_MIRROR_LOGREQ(1, bp,
				    "Request failed (error=%d).",
				    bp->bio_error);
			}
			if (g_mirror_disconnect_on_failure &&
			    g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 1)
			{
				sc->sc_bump_id |= G_MIRROR_BUMP_GENID;
				g_mirror_event_send(disk,
				    G_MIRROR_DISK_STATE_DISCONNECTED,
				    G_MIRROR_EVENT_DONTWAIT);
			}
		}
		switch (pbp->bio_cmd) {
		case BIO_DELETE:
		case BIO_WRITE:
			pbp->bio_inbed--;
			pbp->bio_children--;
			break;
		}
	}
	g_destroy_bio(bp);

	switch (pbp->bio_cmd) {
	case BIO_READ:
		if (pbp->bio_inbed < pbp->bio_children)
			break;
		if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) == 1)
			g_io_deliver(pbp, pbp->bio_error);
		else {
			pbp->bio_error = 0;
			mtx_lock(&sc->sc_queue_mtx);
			bioq_insert_tail(&sc->sc_queue, pbp);
			mtx_unlock(&sc->sc_queue_mtx);
			G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc);
			wakeup(sc);
		}
		break;
	case BIO_DELETE:
	case BIO_WRITE:
		if (pbp->bio_children == 0) {
			/*
			 * All requests failed.
			 */
		} else if (pbp->bio_inbed < pbp->bio_children) {
			/* Do nothing. */
			break;
		} else if (pbp->bio_children == pbp->bio_inbed) {
			/* Some requests succeeded. */
			pbp->bio_error = 0;
			pbp->bio_completed = pbp->bio_length;
		}
		bioq_remove(&sc->sc_inflight, pbp);
		/* Release delayed sync requests if possible. */
		g_mirror_sync_release(sc);
		g_io_deliver(pbp, pbp->bio_error);
		break;
	default:
		KASSERT(1 == 0, ("Invalid request: %u.", pbp->bio_cmd));
		break;
	}
}

static void
g_mirror_sync_done(struct bio *bp)
{
	struct g_mirror_softc *sc;

	G_MIRROR_LOGREQ(3, bp, "Synchronization request delivered.");
	sc = bp->bio_from->geom->softc;
	bp->bio_cflags = G_MIRROR_BIO_FLAG_SYNC;
	mtx_lock(&sc->sc_queue_mtx);
	bioq_insert_tail(&sc->sc_queue, bp);
	mtx_unlock(&sc->sc_queue_mtx);
	wakeup(sc);
}

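/*
 * GEOM::candelete: report whether any component supports BIO_DELETE.
 */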
static void
g_mirror_candelete(struct bio *bp)
{
	struct g_mirror_softc *sc;
	struct g_mirror_disk *disk;
	int *val;

	sc = bp->bio_to->private;
	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
		if (disk->d_flags & G_MIRROR_DISK_FLAG_CANDELETE)
			break;
	}
	val = (int *)bp->bio_data;
	*val = (disk != NULL);
	g_io_deliver(bp, 0);
}

static void
g_mirror_kernel_dump(struct bio *bp)
{
	struct g_mirror_softc *sc;
	struct g_mirror_disk *disk;
	struct bio *cbp;
	struct g_kerneldump *gkd;

	/*
	 * We configure dumping to the first component, because this component
	 * will be used for reading with the 'prefer' balance algorithm.
	 * If the component with the highest priority is currently disconnected
	 * we will not be able to read the dump after the reboot if it is
	 * connected and synchronized later.  Can we do something better?
	 */
	sc = bp->bio_to->private;
	disk = LIST_FIRST(&sc->sc_disks);

	gkd = (struct g_kerneldump *)bp->bio_data;
	if (gkd->length > bp->bio_to->mediasize)
		gkd->length = bp->bio_to->mediasize;
	cbp = g_clone_bio(bp);
	if (cbp == NULL) {
		g_io_deliver(bp, ENOMEM);
		return;
	}
	cbp->bio_done = g_std_done;
	g_io_request(cbp, disk->d_consumer);
	G_MIRROR_DEBUG(1, "Kernel dump will go to %s.",
	    g_mirror_get_diskname(disk));
}

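/*
 * Clone a BIO_FLUSH request to every active component; g_mirror_flush_done()
 * delivers the parent bio when the last clone completes.
 */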
static void
g_mirror_flush(struct g_mirror_softc *sc, struct bio *bp)
{
	struct bio_queue_head queue;
	struct g_mirror_disk *disk;
	struct g_consumer *cp;
	struct bio *cbp;

	bioq_init(&queue);
	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
		if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE)
			continue;
		cbp = g_clone_bio(bp);
		if (cbp == NULL) {
			while ((cbp = bioq_takefirst(&queue)) != NULL)
				g_destroy_bio(cbp);
			if (bp->bio_error == 0)
				bp->bio_error = ENOMEM;
			g_io_deliver(bp, bp->bio_error);
			return;
		}
		bioq_insert_tail(&queue, cbp);
		cbp->bio_done = g_mirror_flush_done;
		cbp->bio_caller1 = disk;
		cbp->bio_to = disk->d_consumer->provider;
	}
	while ((cbp = bioq_takefirst(&queue)) != NULL) {
		G_MIRROR_LOGREQ(3, cbp, "Sending request.");
		disk = cbp->bio_caller1;
		cbp->bio_caller1 = NULL;
		cp = disk->d_consumer;
		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
		    cp->acr, cp->acw, cp->ace));
		g_io_request(cbp, disk->d_consumer);
	}
}

static void
g_mirror_start(struct bio *bp)
{
	struct g_mirror_softc *sc;

	sc = bp->bio_to->private;
	/*
	 * If sc == NULL or there are no valid disks, provider's error
	 * should be set and g_mirror_start() should not be called at all.
	 */
	KASSERT(sc != NULL && sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
	    ("Provider's error should be set (error=%d)(mirror=%s).",
	    bp->bio_to->error, bp->bio_to->name));
	G_MIRROR_LOGREQ(3, bp, "Request received.");

	switch (bp->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
	case BIO_DELETE:
		break;
	case BIO_FLUSH:
		g_mirror_flush(sc, bp);
		return;
	case BIO_GETATTR:
		if (!strcmp(bp->bio_attribute, "GEOM::candelete")) {
			g_mirror_candelete(bp);
			return;
		} else if (strcmp("GEOM::kerneldump", bp->bio_attribute) == 0) {
			g_mirror_kernel_dump(bp);
			return;
		}
		/* FALLTHROUGH */
	default:
		g_io_deliver(bp, EOPNOTSUPP);
		return;
	}
	mtx_lock(&sc->sc_queue_mtx);
	bioq_insert_tail(&sc->sc_queue, bp);
	mtx_unlock(&sc->sc_queue_mtx);
	G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc);
	wakeup(sc);
}

/*
 * Return TRUE if the given request is colliding with an in-progress
 * synchronization request.
 */
static int
g_mirror_sync_collision(struct g_mirror_softc *sc, struct bio *bp)
{
	struct g_mirror_disk *disk;
	struct bio *sbp;
	off_t rstart, rend, sstart, send;
	u_int i;

	if (sc->sc_sync.ds_ndisks == 0)
		return (0);
	rstart = bp->bio_offset;
	rend = bp->bio_offset + bp->bio_length;
	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
		if (disk->d_state != G_MIRROR_DISK_STATE_SYNCHRONIZING)
			continue;
		for (i = 0; i < g_mirror_syncreqs; i++) {
			sbp = disk->d_sync.ds_bios[i];
			if (sbp == NULL)
				continue;
			sstart = sbp->bio_offset;
			send = sbp->bio_offset + sbp->bio_length;
			if (rend > sstart && rstart < send)
				return (1);
		}
	}
	return (0);
}

/*
 * Return TRUE if the given sync request is colliding with an in-progress
 * regular request.
 */
static int
g_mirror_regular_collision(struct g_mirror_softc *sc, struct bio *sbp)
{
	off_t rstart, rend, sstart, send;
	struct bio *bp;

	if (sc->sc_sync.ds_ndisks == 0)
		return (0);
	sstart = sbp->bio_offset;
	send = sbp->bio_offset + sbp->bio_length;
	TAILQ_FOREACH(bp, &sc->sc_inflight.queue, bio_queue) {
		rstart = bp->bio_offset;
		rend = bp->bio_offset + bp->bio_length;
		if (rend > sstart && rstart < send)
			return (1);
	}
	return (0);
}

/*
 * Put the request onto the delayed queue.
 */
static void
g_mirror_regular_delay(struct g_mirror_softc *sc, struct bio *bp)
{

	G_MIRROR_LOGREQ(2, bp, "Delaying request.");
	bioq_insert_head(&sc->sc_regular_delayed, bp);
}

/*
 * Put the synchronization request onto the delayed queue.
 */
static void
g_mirror_sync_delay(struct g_mirror_softc *sc, struct bio *bp)
{

	G_MIRROR_LOGREQ(2, bp, "Delaying synchronization request.");
	bioq_insert_tail(&sc->sc_sync_delayed, bp);
}

/*
 * Release delayed regular requests which no longer collide with
 * synchronization requests.
 */
static void
g_mirror_regular_release(struct g_mirror_softc *sc)
{
	struct bio *bp, *bp2;

	TAILQ_FOREACH_SAFE(bp, &sc->sc_regular_delayed.queue, bio_queue, bp2) {
		if (g_mirror_sync_collision(sc, bp))
			continue;
		bioq_remove(&sc->sc_regular_delayed, bp);
		G_MIRROR_LOGREQ(2, bp, "Releasing delayed request (%p).", bp);
		mtx_lock(&sc->sc_queue_mtx);
		bioq_insert_head(&sc->sc_queue, bp);
#if 0
		/*
		 * wakeup() is not needed, because this function is called from
		 * the worker thread.
		 */
		wakeup(&sc->sc_queue);
#endif
		mtx_unlock(&sc->sc_queue_mtx);
	}
}

/*
 * Release delayed sync requests which no longer collide with regular
 * requests.
 */
static void
g_mirror_sync_release(struct g_mirror_softc *sc)
{
	struct bio *bp, *bp2;

	TAILQ_FOREACH_SAFE(bp, &sc->sc_sync_delayed.queue, bio_queue, bp2) {
		if (g_mirror_regular_collision(sc, bp))
			continue;
		bioq_remove(&sc->sc_sync_delayed, bp);
		G_MIRROR_LOGREQ(2, bp,
		    "Releasing delayed synchronization request.");
		g_io_request(bp, bp->bio_from);
	}
}

/*
 * Handle synchronization requests.
 * Every synchronization request is a two-step process: first, a READ request
 * is sent to the active provider and then a WRITE request (with the data just
 * read) is sent to the provider being synchronized.  When the WRITE is
 * finished, a new synchronization request is sent.
 */
static void
g_mirror_sync_request(struct bio *bp)
{
	struct g_mirror_softc *sc;
	struct g_mirror_disk *disk;

	bp->bio_from->index--;
	sc = bp->bio_from->geom->softc;
	disk = bp->bio_from->private;
	if (disk == NULL) {
		sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */
		g_topology_lock();
		g_mirror_kill_consumer(sc, bp->bio_from);
		g_topology_unlock();
		free(bp->bio_data, M_MIRROR);
		g_destroy_bio(bp);
		sx_xlock(&sc->sc_lock);
		return;
	}

	/*
	 * Synchronization request.
	 */
	switch (bp->bio_cmd) {
	case BIO_READ:
	    {
		struct g_consumer *cp;

		if (bp->bio_error != 0) {
			G_MIRROR_LOGREQ(0, bp,
			    "Synchronization request failed (error=%d).",
			    bp->bio_error);
			g_destroy_bio(bp);
			return;
		}
		G_MIRROR_LOGREQ(3, bp,
		    "Synchronization request half-finished.");
		bp->bio_cmd = BIO_WRITE;
		bp->bio_cflags = 0;
		cp = disk->d_consumer;
		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
		    cp->acr, cp->acw, cp->ace));
		cp->index++;
		g_io_request(bp, cp);
		return;
	    }
	case BIO_WRITE:
	    {
		struct g_mirror_disk_sync *sync;
		off_t offset;
		void *data;
		int i;

		if (bp->bio_error != 0) {
			G_MIRROR_LOGREQ(0, bp,
			    "Synchronization request failed (error=%d).",
			    bp->bio_error);
			g_destroy_bio(bp);
			sc->sc_bump_id |= G_MIRROR_BUMP_GENID;
			g_mirror_event_send(disk,
			    G_MIRROR_DISK_STATE_DISCONNECTED,
			    G_MIRROR_EVENT_DONTWAIT);
			return;
		}
		G_MIRROR_LOGREQ(3, bp, "Synchronization request finished.");
		sync = &disk->d_sync;
		if (sync->ds_offset >= sc->sc_mediasize ||
		    sync->ds_consumer == NULL ||
		    (sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) {
			/* Don't send more synchronization requests. */
			sync->ds_inflight--;
			if (sync->ds_bios != NULL) {
				i = (int)(uintptr_t)bp->bio_caller1;
				sync->ds_bios[i] = NULL;
			}
			free(bp->bio_data, M_MIRROR);
			g_destroy_bio(bp);
			if (sync->ds_inflight > 0)
				return;
			if (sync->ds_consumer == NULL ||
			    (sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) {
				return;
			}
			/* Disk up-to-date, activate it. */
			g_mirror_event_send(disk, G_MIRROR_DISK_STATE_ACTIVE,
			    G_MIRROR_EVENT_DONTWAIT);
			return;
		}

		/* Send next synchronization request. */
		data = bp->bio_data;
		bzero(bp, sizeof(*bp));
		bp->bio_cmd = BIO_READ;
		bp->bio_offset = sync->ds_offset;
		bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset);
		sync->ds_offset += bp->bio_length;
		bp->bio_done = g_mirror_sync_done;
		bp->bio_data = data;
		bp->bio_from = sync->ds_consumer;
		bp->bio_to = sc->sc_provider;
		G_MIRROR_LOGREQ(3, bp, "Sending synchronization request.");
		sync->ds_consumer->index++;
		/*
		 * Delay the request if it is colliding with a regular request.
		 */
		if (g_mirror_regular_collision(sc, bp))
			g_mirror_sync_delay(sc, bp);
		else
			g_io_request(bp, sync->ds_consumer);

		/* Release delayed requests if possible. */
		g_mirror_regular_release(sc);

		/* Find the smallest offset */
		offset = sc->sc_mediasize;
		for (i = 0; i < g_mirror_syncreqs; i++) {
			bp = sync->ds_bios[i];
			if (bp->bio_offset < offset)
				offset = bp->bio_offset;
		}
		if (sync->ds_offset_done + (MAXPHYS * 100) < offset) {
			/* Update offset_done on every 100 blocks. */
			sync->ds_offset_done = offset;
			g_mirror_update_metadata(disk);
		}
		return;
	    }
	default:
		KASSERT(1 == 0, ("Invalid command here: %u (device=%s)",
		    bp->bio_cmd, sc->sc_name));
		break;
	}
}

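/*
 * The 'prefer' balance algorithm: send the read to the first active disk on
 * the list.
 */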
static void
g_mirror_request_prefer(struct g_mirror_softc *sc, struct bio *bp)
{
	struct g_mirror_disk *disk;
	struct g_consumer *cp;
	struct bio *cbp;

	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
		if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE)
			break;
	}
	if (disk == NULL) {
		if (bp->bio_error == 0)
			bp->bio_error = ENXIO;
		g_io_deliver(bp, bp->bio_error);
		return;
	}
	cbp = g_clone_bio(bp);
	if (cbp == NULL) {
		if (bp->bio_error == 0)
			bp->bio_error = ENOMEM;
		g_io_deliver(bp, bp->bio_error);
		return;
	}
	/*
	 * Fill in the component buf structure.
	 */
	cp = disk->d_consumer;
	cbp->bio_done = g_mirror_done;
	cbp->bio_to = cp->provider;
	G_MIRROR_LOGREQ(3, cbp, "Sending request.");
	KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
	    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr,
	    cp->acw, cp->ace));
	cp->index++;
	g_io_request(cbp, cp);
}

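/*
 * The 'round-robin' balance algorithm: rotate reads over the active disks
 * using sc_hint.
 */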
static void
g_mirror_request_round_robin(struct g_mirror_softc *sc, struct bio *bp)
{
	struct g_mirror_disk *disk;
	struct g_consumer *cp;
	struct bio *cbp;

	disk = g_mirror_get_disk(sc);
	if (disk == NULL) {
		if (bp->bio_error == 0)
			bp->bio_error = ENXIO;
		g_io_deliver(bp, bp->bio_error);
		return;
	}
	cbp = g_clone_bio(bp);
	if (cbp == NULL) {
		if (bp->bio_error == 0)
			bp->bio_error = ENOMEM;
		g_io_deliver(bp, bp->bio_error);
		return;
	}
	/*
	 * Fill in the component buf structure.
	 */
	cp = disk->d_consumer;
	cbp->bio_done = g_mirror_done;
	cbp->bio_to = cp->provider;
	G_MIRROR_LOGREQ(3, cbp, "Sending request.");
	KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
	    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr,
	    cp->acw, cp->ace));
	cp->index++;
	g_io_request(cbp, cp);
}

#define TRACK_SIZE  (1 * 1024 * 1024)
#define LOAD_SCALE	256
#define ABS(x)		(((x) >= 0) ? (x) : (-(x)))

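/*
 * The 'load' balance algorithm: pick the active disk with the lowest smoothed
 * load, strongly preferring a disk whose head is already at (or near) the
 * requested offset.
 */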
static void
g_mirror_request_load(struct g_mirror_softc *sc, struct bio *bp)
{
	struct g_mirror_disk *disk, *dp;
	struct g_consumer *cp;
	struct bio *cbp;
	int prio, best;

	/* Find a disk with the smallest load. */
	disk = NULL;
	best = INT_MAX;
	LIST_FOREACH(dp, &sc->sc_disks, d_next) {
		if (dp->d_state != G_MIRROR_DISK_STATE_ACTIVE)
			continue;
		prio = dp->load;
		/* If disk head is precisely in position - highly prefer it. */
		if (dp->d_last_offset == bp->bio_offset)
			prio -= 2 * LOAD_SCALE;
		else
		/* If disk head is close to position - prefer it. */
		if (ABS(dp->d_last_offset - bp->bio_offset) < TRACK_SIZE)
			prio -= 1 * LOAD_SCALE;
		if (prio <= best) {
			disk = dp;
			best = prio;
		}
	}
	KASSERT(disk != NULL, ("NULL disk for %s.", sc->sc_name));
	cbp = g_clone_bio(bp);
	if (cbp == NULL) {
		if (bp->bio_error == 0)
			bp->bio_error = ENOMEM;
		g_io_deliver(bp, bp->bio_error);
		return;
	}
	/*
	 * Fill in the component buf structure.
	 */
	cp = disk->d_consumer;
	cbp->bio_done = g_mirror_done;
	cbp->bio_to = cp->provider;
	G_MIRROR_LOGREQ(3, cbp, "Sending request.");
	KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
	    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr,
	    cp->acw, cp->ace));
	cp->index++;
	/* Remember last head position */
	disk->d_last_offset = bp->bio_offset + bp->bio_length;
	/* Update loads. */
	LIST_FOREACH(dp, &sc->sc_disks, d_next) {
		dp->load = (dp->d_consumer->index * LOAD_SCALE +
		    dp->load * 7) / 8;
	}
	g_io_request(cbp, cp);
}

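/*
 * The 'split' balance algorithm: split large reads into sector-aligned slices
 * and send one slice to each active component; requests not larger than
 * sc_slice fall back to round-robin.
 */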
static void
g_mirror_request_split(struct g_mirror_softc *sc, struct bio *bp)
{
	struct bio_queue_head queue;
	struct g_mirror_disk *disk;
	struct g_consumer *cp;
	struct bio *cbp;
	off_t left, mod, offset, slice;
	u_char *data;
	u_int ndisks;

	if (bp->bio_length <= sc->sc_slice) {
		g_mirror_request_round_robin(sc, bp);
		return;
	}
	ndisks = g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE);
	slice = bp->bio_length / ndisks;
	mod = slice % sc->sc_provider->sectorsize;
	if (mod != 0)
		slice += sc->sc_provider->sectorsize - mod;
	/*
	 * Allocate all bios before sending any request, so we can return
	 * ENOMEM in a nice and clean way.
	 */
	left = bp->bio_length;
	offset = bp->bio_offset;
	data = bp->bio_data;
	bioq_init(&queue);
	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
		if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE)
			continue;
		cbp = g_clone_bio(bp);
		if (cbp == NULL) {
			while ((cbp = bioq_takefirst(&queue)) != NULL)
				g_destroy_bio(cbp);
			if (bp->bio_error == 0)
				bp->bio_error = ENOMEM;
			g_io_deliver(bp, bp->bio_error);
			return;
		}
		bioq_insert_tail(&queue, cbp);
		cbp->bio_done = g_mirror_done;
		cbp->bio_caller1 = disk;
		cbp->bio_to = disk->d_consumer->provider;
		cbp->bio_offset = offset;
		cbp->bio_data = data;
		cbp->bio_length = MIN(left, slice);
		left -= cbp->bio_length;
		if (left == 0)
			break;
		offset += cbp->bio_length;
		data += cbp->bio_length;
	}
	while ((cbp = bioq_takefirst(&queue)) != NULL) {
		G_MIRROR_LOGREQ(3, cbp, "Sending request.");
		disk = cbp->bio_caller1;
		cbp->bio_caller1 = NULL;
		cp = disk->d_consumer;
		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
		    cp->acr, cp->acw, cp->ace));
		disk->d_consumer->index++;
		g_io_request(cbp, disk->d_consumer);
	}
}

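/*
 * Dispatch a regular request: reads go to the configured balance algorithm,
 * writes and deletes are mirrored to all suitable components.
 */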
static void
g_mirror_register_request(struct bio *bp)
{
	struct g_mirror_softc *sc;

	sc = bp->bio_to->private;
	switch (bp->bio_cmd) {
	case BIO_READ:
		switch (sc->sc_balance) {
		case G_MIRROR_BALANCE_LOAD:
			g_mirror_request_load(sc, bp);
			break;
		case G_MIRROR_BALANCE_PREFER:
			g_mirror_request_prefer(sc, bp);
			break;
		case G_MIRROR_BALANCE_ROUND_ROBIN:
			g_mirror_request_round_robin(sc, bp);
			break;
		case G_MIRROR_BALANCE_SPLIT:
			g_mirror_request_split(sc, bp);
			break;
		}
		return;
	case BIO_WRITE:
	case BIO_DELETE:
	    {
		struct g_mirror_disk *disk;
		struct g_mirror_disk_sync *sync;
		struct bio_queue_head queue;
		struct g_consumer *cp;
		struct bio *cbp;

		/*
		 * Delay the request if it is colliding with a synchronization
		 * request.
		 */
		if (g_mirror_sync_collision(sc, bp)) {
			g_mirror_regular_delay(sc, bp);
			return;
		}

		if (sc->sc_idle)
			g_mirror_unidle(sc);
		else
			sc->sc_last_write = time_uptime;

		/*
		 * Allocate all bios before sending any request, so we can
		 * return ENOMEM in a nice and clean way.
		 */
		bioq_init(&queue);
		LIST_FOREACH(disk, &sc->sc_disks, d_next) {
			sync = &disk->d_sync;
			switch (disk->d_state) {
			case G_MIRROR_DISK_STATE_ACTIVE:
				break;
			case G_MIRROR_DISK_STATE_SYNCHRONIZING:
				if (bp->bio_offset >= sync->ds_offset)
					continue;
				break;
			default:
				continue;
			}
			if (bp->bio_cmd == BIO_DELETE &&
			    (disk->d_flags & G_MIRROR_DISK_FLAG_CANDELETE) == 0)
				continue;
			cbp = g_clone_bio(bp);
			if (cbp == NULL) {
				while ((cbp = bioq_takefirst(&queue)) != NULL)
					g_destroy_bio(cbp);
				if (bp->bio_error == 0)
					bp->bio_error = ENOMEM;
				g_io_deliver(bp, bp->bio_error);
				return;
			}
			bioq_insert_tail(&queue, cbp);
			cbp->bio_done = g_mirror_done;
			cp = disk->d_consumer;
			cbp->bio_caller1 = cp;
			cbp->bio_to = cp->provider;
			KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
			    ("Consumer %s not opened (r%dw%de%d).",
			    cp->provider->name, cp->acr, cp->acw, cp->ace));
		}
		if (bioq_first(&queue) == NULL) {
			g_io_deliver(bp, EOPNOTSUPP);
			return;
		}
		while ((cbp = bioq_takefirst(&queue)) != NULL) {
			G_MIRROR_LOGREQ(3, cbp, "Sending request.");
			cp = cbp->bio_caller1;
			cbp->bio_caller1 = NULL;
			cp->index++;
			sc->sc_writes++;
			g_io_request(cbp, cp);
		}
		/*
		 * Put the request onto the inflight queue, so we can check if
		 * new synchronization requests don't collide with it.
		 */
		bioq_insert_tail(&sc->sc_inflight, bp);
		/*
		 * Bump syncid on first write.
		 */
		if ((sc->sc_bump_id & G_MIRROR_BUMP_SYNCID) != 0) {
			sc->sc_bump_id &= ~G_MIRROR_BUMP_SYNCID;
			g_mirror_bump_syncid(sc);
		}
		return;
	    }
	default:
		KASSERT(1 == 0, ("Invalid command here: %u (device=%s)",
		    bp->bio_cmd, sc->sc_name));
		break;
	}
}

static int
g_mirror_can_destroy(struct g_mirror_softc *sc)
{
	struct g_geom *gp;
	struct g_consumer *cp;

	g_topology_assert();
	gp = sc->sc_geom;
	if (gp->softc == NULL)
		return (1);
	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_TASTING) != 0)
		return (0);
	LIST_FOREACH(cp, &gp->consumer, consumer) {
		if (g_mirror_is_busy(sc, cp))
			return (0);
	}
	gp = sc->sc_sync.ds_geom;
	LIST_FOREACH(cp, &gp->consumer, consumer) {
		if (g_mirror_is_busy(sc, cp))
			return (0);
	}
	G_MIRROR_DEBUG(2, "No I/O requests for %s, it can be destroyed.",
	    sc->sc_name);
	return (1);
}

static int
g_mirror_try_destroy(struct g_mirror_softc *sc)
{

	if (sc->sc_rootmount != NULL) {
		G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
		    sc->sc_rootmount);
		root_mount_rel(sc->sc_rootmount);
		sc->sc_rootmount = NULL;
	}
	g_topology_lock();
	if (!g_mirror_can_destroy(sc)) {
		g_topology_unlock();
		return (0);
	}
	sc->sc_geom->softc = NULL;
	sc->sc_sync.ds_geom->softc = NULL;
	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_WAIT) != 0) {
		g_topology_unlock();
		G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__,
		    &sc->sc_worker);
		/* Unlock sc_lock here, as it can be destroyed after wakeup. */
		sx_xunlock(&sc->sc_lock);
		wakeup(&sc->sc_worker);
		sc->sc_worker = NULL;
	} else {
		g_topology_unlock();
		g_mirror_destroy_device(sc);
	}
	return (1);
}

/*
 * Worker thread.
 */
static void
g_mirror_worker(void *arg)
{
	struct g_mirror_softc *sc;
	struct g_mirror_event *ep;
	struct bio *bp;
	int timeout;

	sc = arg;
	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);

	sx_xlock(&sc->sc_lock);
	for (;;) {
		G_MIRROR_DEBUG(5, "%s: Let's see...", __func__);
		/*
		 * First take a look at events.
		 * This is important to handle events before any I/O requests.
		 */
		ep = g_mirror_event_get(sc);
		if (ep != NULL) {
			g_mirror_event_remove(sc, ep);
			if ((ep->e_flags & G_MIRROR_EVENT_DEVICE) != 0) {
				/* Update only device status. */
				G_MIRROR_DEBUG(3,
				    "Running event for device %s.",
				    sc->sc_name);
				ep->e_error = 0;
				g_mirror_update_device(sc, 1);
			} else {
				/* Update disk status. */
				G_MIRROR_DEBUG(3, "Running event for disk %s.",
				     g_mirror_get_diskname(ep->e_disk));
				ep->e_error = g_mirror_update_disk(ep->e_disk,
				    ep->e_state);
				if (ep->e_error == 0)
					g_mirror_update_device(sc, 0);
			}
			if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0) {
				KASSERT(ep->e_error == 0,
				    ("Error cannot be handled."));
				g_mirror_event_free(ep);
			} else {
				ep->e_flags |= G_MIRROR_EVENT_DONE;
				G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__,
				    ep);
				mtx_lock(&sc->sc_events_mtx);
				wakeup(ep);
				mtx_unlock(&sc->sc_events_mtx);
			}
			if ((sc->sc_flags &
			    G_MIRROR_DEVICE_FLAG_DESTROY) != 0) {
				if (g_mirror_try_destroy(sc)) {
					curthread->td_pflags &= ~TDP_GEOM;
					G_MIRROR_DEBUG(1, "Thread exiting.");
					kproc_exit(0);
				}
			}
			G_MIRROR_DEBUG(5, "%s: I'm here 1.", __func__);
			continue;
		}
		/*
		 * Check if we can mark the array as CLEAN and, if we can't,
		 * how many seconds we should wait.
		 */
		timeout = g_mirror_idle(sc, -1);
		/*
		 * Now I/O requests.
		 */
		/* Get first request from the queue. */
		mtx_lock(&sc->sc_queue_mtx);
		bp = bioq_takefirst(&sc->sc_queue);
		if (bp == NULL) {
			if ((sc->sc_flags &
			    G_MIRROR_DEVICE_FLAG_DESTROY) != 0) {
				mtx_unlock(&sc->sc_queue_mtx);
				if (g_mirror_try_destroy(sc)) {
					curthread->td_pflags &= ~TDP_GEOM;
					G_MIRROR_DEBUG(1, "Thread exiting.");
					kproc_exit(0);
				}
				mtx_lock(&sc->sc_queue_mtx);
			}
			sx_xunlock(&sc->sc_lock);
			/*
			 * XXX: We can miss an event here, because an event
			 *      can be added without the sx-device-lock and
			 *      without the mtx-queue-lock. Maybe I should just
			 *      stop using a dedicated mutex for event
			 *      synchronization and stick with the queue lock?
			 *      The event will hang here until the next I/O
			 *      request or the next event is received.
			 */
			MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "m:w1",
			    timeout * hz);
			sx_xlock(&sc->sc_lock);
			G_MIRROR_DEBUG(5, "%s: I'm here 4.", __func__);
			continue;
		}
		mtx_unlock(&sc->sc_queue_mtx);

		if (bp->bio_from->geom == sc->sc_sync.ds_geom &&
		    (bp->bio_cflags & G_MIRROR_BIO_FLAG_SYNC) != 0) {
			g_mirror_sync_request(bp);	/* READ */
		} else if (bp->bio_to != sc->sc_provider) {
			if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_REGULAR) != 0)
				g_mirror_regular_request(bp);
			else if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_SYNC) != 0)
				g_mirror_sync_request(bp);	/* WRITE */
			else {
				KASSERT(0,
				    ("Invalid request cflags=0x%hhx to=%s.",
				    bp->bio_cflags, bp->bio_to->name));
			}
		} else {
			g_mirror_register_request(bp);
		}
		G_MIRROR_DEBUG(5, "%s: I'm here 9.", __func__);
	}
}

1942static void
1943g_mirror_update_idle(struct g_mirror_softc *sc, struct g_mirror_disk *disk)
1944{
1945
1946	sx_assert(&sc->sc_lock, SX_LOCKED);
1947
1948	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0)
1949		return;
1950	if (!sc->sc_idle && (disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) == 0) {
1951		G_MIRROR_DEBUG(1, "Disk %s (device %s) marked as dirty.",
1952		    g_mirror_get_diskname(disk), sc->sc_name);
1953		disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY;
1954	} else if (sc->sc_idle &&
1955	    (disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) != 0) {
1956		G_MIRROR_DEBUG(1, "Disk %s (device %s) marked as clean.",
1957		    g_mirror_get_diskname(disk), sc->sc_name);
1958		disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
1959	}
1960}
1961
1962static void
1963g_mirror_sync_start(struct g_mirror_disk *disk)
1964{
1965	struct g_mirror_softc *sc;
1966	struct g_consumer *cp;
1967	struct bio *bp;
1968	int error, i;
1969
1970	g_topology_assert_not();
1971	sc = disk->d_softc;
1972	sx_assert(&sc->sc_lock, SX_LOCKED);
1973
1974	KASSERT(disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING,
1975	    ("Disk %s is not marked for synchronization.",
1976	    g_mirror_get_diskname(disk)));
1977	KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
1978	    ("Device not in RUNNING state (%s, %u).", sc->sc_name,
1979	    sc->sc_state));
1980
1981	sx_xunlock(&sc->sc_lock);
1982	g_topology_lock();
1983	cp = g_new_consumer(sc->sc_sync.ds_geom);
1984	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
1985	error = g_attach(cp, sc->sc_provider);
1986	KASSERT(error == 0,
1987	    ("Cannot attach to %s (error=%d).", sc->sc_name, error));
1988	error = g_access(cp, 1, 0, 0);
1989	KASSERT(error == 0, ("Cannot open %s (error=%d).", sc->sc_name, error));
1990	g_topology_unlock();
1991	sx_xlock(&sc->sc_lock);
1992
1993	G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name,
1994	    g_mirror_get_diskname(disk));
1995	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) == 0)
1996		disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY;
1997	KASSERT(disk->d_sync.ds_consumer == NULL,
1998	    ("Sync consumer already exists (device=%s, disk=%s).",
1999	    sc->sc_name, g_mirror_get_diskname(disk)));
2000
2001	disk->d_sync.ds_consumer = cp;
2002	disk->d_sync.ds_consumer->private = disk;
2003	disk->d_sync.ds_consumer->index = 0;
2004
2005	/*
2006	 * Allocate memory for synchronization bios and initialize them.
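	 * Each bio reads a MAXPHYS-sized chunk starting at the disk's
	 * current synchronization offset; g_mirror_syncreqs of them are
	 * kept in flight at once to pipeline the rebuild.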
2007	 */
2008	disk->d_sync.ds_bios = malloc(sizeof(struct bio *) * g_mirror_syncreqs,
2009	    M_MIRROR, M_WAITOK);
2010	for (i = 0; i < g_mirror_syncreqs; i++) {
2011		bp = g_alloc_bio();
2012		disk->d_sync.ds_bios[i] = bp;
2013		bp->bio_parent = NULL;
2014		bp->bio_cmd = BIO_READ;
2015		bp->bio_data = malloc(MAXPHYS, M_MIRROR, M_WAITOK);
2016		bp->bio_cflags = 0;
2017		bp->bio_offset = disk->d_sync.ds_offset;
2018		bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset);
2019		disk->d_sync.ds_offset += bp->bio_length;
2020		bp->bio_done = g_mirror_sync_done;
2021		bp->bio_from = disk->d_sync.ds_consumer;
2022		bp->bio_to = sc->sc_provider;
2023		bp->bio_caller1 = (void *)(uintptr_t)i;
2024	}
2025
2026	/* Increase the number of disks in SYNCHRONIZING state. */
2027	sc->sc_sync.ds_ndisks++;
2028	/* Set the number of in-flight synchronization requests. */
2029	disk->d_sync.ds_inflight = g_mirror_syncreqs;
2030
2031	/*
2032	 * Fire off first synchronization requests.
2033	 */
2034	for (i = 0; i < g_mirror_syncreqs; i++) {
2035		bp = disk->d_sync.ds_bios[i];
2036		G_MIRROR_LOGREQ(3, bp, "Sending synchronization request.");
2037		disk->d_sync.ds_consumer->index++;
2038		/*
2039		 * Delay the request if it is colliding with a regular request.
2040		 */
2041		if (g_mirror_regular_collision(sc, bp))
2042			g_mirror_sync_delay(sc, bp);
2043		else
2044			g_io_request(bp, disk->d_sync.ds_consumer);
2045	}
2046}
2047
2048/*
2049 * Stop synchronization process.
2050 * type: 0 - synchronization finished
2051 *       1 - synchronization stopped
2052 */
2053static void
2054g_mirror_sync_stop(struct g_mirror_disk *disk, int type)
2055{
2056	struct g_mirror_softc *sc;
2057	struct g_consumer *cp;
2058
2059	g_topology_assert_not();
2060	sc = disk->d_softc;
2061	sx_assert(&sc->sc_lock, SX_LOCKED);
2062
2063	KASSERT(disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING,
2064	    ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk),
2065	    g_mirror_disk_state2str(disk->d_state)));
2066	if (disk->d_sync.ds_consumer == NULL)
2067		return;
2068
2069	if (type == 0) {
2070		G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s finished.",
2071		    sc->sc_name, g_mirror_get_diskname(disk));
2072	} else /* if (type == 1) */ {
2073		G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s stopped.",
2074		    sc->sc_name, g_mirror_get_diskname(disk));
2075	}
2076	free(disk->d_sync.ds_bios, M_MIRROR);
2077	disk->d_sync.ds_bios = NULL;
2078	cp = disk->d_sync.ds_consumer;
2079	disk->d_sync.ds_consumer = NULL;
2080	disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
2081	sc->sc_sync.ds_ndisks--;
2082	sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */
2083	g_topology_lock();
2084	g_mirror_kill_consumer(sc, cp);
2085	g_topology_unlock();
2086	sx_xlock(&sc->sc_lock);
2087}
2088
2089static void
2090g_mirror_launch_provider(struct g_mirror_softc *sc)
2091{
2092	struct g_mirror_disk *disk;
2093	struct g_provider *pp, *dp;
2094
2095	sx_assert(&sc->sc_lock, SX_LOCKED);
2096
2097	g_topology_lock();
2098	pp = g_new_providerf(sc->sc_geom, "mirror/%s", sc->sc_name);
2099	pp->flags |= G_PF_DIRECT_RECEIVE;
2100	pp->mediasize = sc->sc_mediasize;
2101	pp->sectorsize = sc->sc_sectorsize;
2102	pp->stripesize = 0;
2103	pp->stripeoffset = 0;
2104
2105	/* Splitting of unmapped BIOs could work but isn't implemented now. */
2106	if (sc->sc_balance != G_MIRROR_BALANCE_SPLIT)
2107		pp->flags |= G_PF_ACCEPT_UNMAPPED;
2108
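	/*
	 * Inherit the largest stripe geometry found among the components,
	 * and withdraw unmapped-I/O support if any underlying provider
	 * lacks it.
	 */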
2109	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
2110		if (disk->d_consumer && disk->d_consumer->provider) {
2111			dp = disk->d_consumer->provider;
2112			if (dp->stripesize > pp->stripesize) {
2113				pp->stripesize = dp->stripesize;
2114				pp->stripeoffset = dp->stripeoffset;
2115			}
2116			/* A provider underneath us doesn't support unmapped I/O. */
2117			if ((dp->flags & G_PF_ACCEPT_UNMAPPED) == 0) {
2118				G_MIRROR_DEBUG(0, "Cancelling unmapped "
2119				    "because of %s.", dp->name);
2120				pp->flags &= ~G_PF_ACCEPT_UNMAPPED;
2121			}
2122		}
2123	}
2124	pp->private = sc;
2125	sc->sc_refcnt++;
2126	sc->sc_provider = pp;
2127	g_error_provider(pp, 0);
2128	g_topology_unlock();
2129	G_MIRROR_DEBUG(0, "Device %s launched (%u/%u).", pp->name,
2130	    g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE), sc->sc_ndisks);
2131	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
2132		if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING)
2133			g_mirror_sync_start(disk);
2134	}
2135}
2136
2137static void
2138g_mirror_destroy_provider(struct g_mirror_softc *sc)
2139{
2140	struct g_mirror_disk *disk;
2141	struct bio *bp;
2142
2143	g_topology_assert_not();
2144	KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).",
2145	    sc->sc_name));
2146
2147	g_topology_lock();
2148	g_error_provider(sc->sc_provider, ENXIO);
2149	mtx_lock(&sc->sc_queue_mtx);
2150	while ((bp = bioq_takefirst(&sc->sc_queue)) != NULL) {
2151		/*
2152		 * Abort any pending I/O that wasn't generated by us.
2153		 * Synchronization requests and requests destined for individual
2154		 * mirror components can be destroyed immediately.
2155		 */
2156		if (bp->bio_to == sc->sc_provider &&
2157		    bp->bio_from->geom != sc->sc_sync.ds_geom) {
2158			g_io_deliver(bp, ENXIO);
2159		} else {
2160			if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_SYNC) != 0)
2161				free(bp->bio_data, M_MIRROR);
2162			g_destroy_bio(bp);
2163		}
2164	}
2165	mtx_unlock(&sc->sc_queue_mtx);
2166	g_wither_provider(sc->sc_provider, ENXIO);
2167	sc->sc_provider = NULL;
2168	G_MIRROR_DEBUG(0, "Device %s: provider destroyed.", sc->sc_name);
2169	g_topology_unlock();
2170	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
2171		if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING)
2172			g_mirror_sync_stop(disk, 1);
2173	}
2174}
2175
2176static void
2177g_mirror_go(void *arg)
2178{
2179	struct g_mirror_softc *sc;
2180
2181	sc = arg;
2182	G_MIRROR_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name);
2183	g_mirror_event_send(sc, 0,
2184	    G_MIRROR_EVENT_DONTWAIT | G_MIRROR_EVENT_DEVICE);
2185}
2186
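/*
 * Derive the initial state for a disk from its syncid relative to the
 * device: an equal syncid yields ACTIVE (or SYNCHRONIZING/STALE when
 * the disk is flagged for synchronization), a smaller syncid means the
 * disk is behind and must be resynchronized from offset 0, and a
 * bigger syncid means the running mirror itself is stale, so the disk
 * is refused.
 */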
2187static u_int
2188g_mirror_determine_state(struct g_mirror_disk *disk)
2189{
2190	struct g_mirror_softc *sc;
2191	u_int state;
2192
2193	sc = disk->d_softc;
2194	if (sc->sc_syncid == disk->d_sync.ds_syncid) {
2195		if ((disk->d_flags &
2196		    G_MIRROR_DISK_FLAG_SYNCHRONIZING) == 0) {
2197			/* Disk does not need synchronization. */
2198			state = G_MIRROR_DISK_STATE_ACTIVE;
2199		} else {
2200			if ((sc->sc_flags &
2201			     G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
2202			    (disk->d_flags &
2203			     G_MIRROR_DISK_FLAG_FORCE_SYNC) != 0) {
2204				/*
2205				 * We can start synchronization from
2206				 * the stored offset.
2207				 */
2208				state = G_MIRROR_DISK_STATE_SYNCHRONIZING;
2209			} else {
2210				state = G_MIRROR_DISK_STATE_STALE;
2211			}
2212		}
2213	} else if (disk->d_sync.ds_syncid < sc->sc_syncid) {
2214		/*
2215		 * Reset all synchronization data for this disk,
2216		 * because even if it was synchronized, it was
2217		 * synchronized against disks with a different syncid.
2218		 */
2219		disk->d_flags |= G_MIRROR_DISK_FLAG_SYNCHRONIZING;
2220		disk->d_sync.ds_offset = 0;
2221		disk->d_sync.ds_offset_done = 0;
2222		disk->d_sync.ds_syncid = sc->sc_syncid;
2223		if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
2224		    (disk->d_flags & G_MIRROR_DISK_FLAG_FORCE_SYNC) != 0) {
2225			state = G_MIRROR_DISK_STATE_SYNCHRONIZING;
2226		} else {
2227			state = G_MIRROR_DISK_STATE_STALE;
2228		}
2229	} else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ {
2230		/*
2231		 * Not good, NOT GOOD!
2232		 * It means that the mirror was started on stale disks
2233		 * and a fresher disk has just arrived.
2234		 * If there were writes, the mirror is broken, sorry.
2235		 * I think the best choice here is not to touch
2236		 * this disk and to inform the user loudly.
2237		 */
2238		G_MIRROR_DEBUG(0, "Device %s was started before the freshest "
2239		    "disk (%s) arrived! It will not be connected to the "
2240		    "running device.", sc->sc_name,
2241		    g_mirror_get_diskname(disk));
2242		g_mirror_destroy_disk(disk);
2243		state = G_MIRROR_DISK_STATE_NONE;
2244		/* Return immediately, because disk was destroyed. */
2245		return (state);
2246	}
2247	G_MIRROR_DEBUG(3, "State for %s disk: %s.",
2248	    g_mirror_get_diskname(disk), g_mirror_disk_state2str(state));
2249	return (state);
2250}
2251
2252/*
2253 * Update device state.
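 *
 * In STARTING state this decides whether the device can go RUNNING,
 * electing the genid/syncid to use and choosing which disks need
 * synchronization; in RUNNING state it launches or destroys the
 * provider as disks come and go.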
2254 */
2255static void
2256g_mirror_update_device(struct g_mirror_softc *sc, boolean_t force)
2257{
2258	struct g_mirror_disk *disk;
2259	u_int state;
2260
2261	sx_assert(&sc->sc_lock, SX_XLOCKED);
2262
2263	switch (sc->sc_state) {
2264	case G_MIRROR_DEVICE_STATE_STARTING:
2265	    {
2266		struct g_mirror_disk *pdisk, *tdisk;
2267		u_int dirty, ndisks, genid, syncid;
2268
2269		KASSERT(sc->sc_provider == NULL,
2270		    ("Non-NULL provider in STARTING state (%s).", sc->sc_name));
2271		/*
2272		 * Are we ready? We are, if all disks are connected or
2273		 * if we have any disks and 'force' is true.
2274		 */
2275		ndisks = g_mirror_ndisks(sc, -1);
2276		if (sc->sc_ndisks == ndisks || (force && ndisks > 0)) {
2277			;
2278		} else if (ndisks == 0) {
2279			/*
2280			 * Disks went down in starting phase, so destroy
2281			 * device.
2282			 */
2283			callout_drain(&sc->sc_callout);
2284			sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY;
2285			G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
2286			    sc->sc_rootmount);
2287			root_mount_rel(sc->sc_rootmount);
2288			sc->sc_rootmount = NULL;
2289			return;
2290		} else {
2291			return;
2292		}
2293
2294		/*
2295		 * Activate all disks with the biggest syncid.
2296		 */
2297		if (force) {
2298			/*
2299			 * If 'force' is true, we have been called due to
2300			 * timeout, so don't bother canceling timeout.
2301			 */
2302			ndisks = 0;
2303			LIST_FOREACH(disk, &sc->sc_disks, d_next) {
2304				if ((disk->d_flags &
2305				    G_MIRROR_DISK_FLAG_SYNCHRONIZING) == 0) {
2306					ndisks++;
2307				}
2308			}
2309			if (ndisks == 0) {
2310				/* No valid disks found, destroy device. */
2311				sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY;
2312				G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p",
2313				    __LINE__, sc->sc_rootmount);
2314				root_mount_rel(sc->sc_rootmount);
2315				sc->sc_rootmount = NULL;
2316				return;
2317			}
2318		} else {
2319			/* Cancel timeout. */
2320			callout_drain(&sc->sc_callout);
2321		}
2322
2323		/*
2324		 * Find the biggest genid.
2325		 */
2326		genid = 0;
2327		LIST_FOREACH(disk, &sc->sc_disks, d_next) {
2328			if (disk->d_genid > genid)
2329				genid = disk->d_genid;
2330		}
2331		sc->sc_genid = genid;
2332		/*
2333		 * Remove all disks without the biggest genid.
2334		 */
2335		LIST_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) {
2336			if (disk->d_genid < genid) {
2337				G_MIRROR_DEBUG(0,
2338				    "Component %s (device %s) broken, skipping.",
2339				    g_mirror_get_diskname(disk), sc->sc_name);
2340				g_mirror_destroy_disk(disk);
2341			}
2342		}
2343
2344		/*
2345		 * Find the biggest syncid.
2346		 */
2347		syncid = 0;
2348		LIST_FOREACH(disk, &sc->sc_disks, d_next) {
2349			if (disk->d_sync.ds_syncid > syncid)
2350				syncid = disk->d_sync.ds_syncid;
2351		}
2352
2353		/*
2354		 * Here we need to look for dirty disks and, if all disks
2355		 * with the biggest syncid are dirty, choose the one with
2356		 * the biggest priority and rebuild the rest.
2357		 */
2358		/*
2359		 * Find the number of dirty disks with the biggest syncid.
2360		 * Find the number of disks with the biggest syncid.
2361		 * While here, find a disk with the biggest priority.
2362		 */
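		/*
		 * Illustrative example: after a crash, all disks at the top
		 * syncid are typically dirty; only the one with the highest
		 * d_priority is kept as the master, while the others get
		 * ds_syncid = 0 below and are resynchronized from it.
		 */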
2363		dirty = ndisks = 0;
2364		pdisk = NULL;
2365		LIST_FOREACH(disk, &sc->sc_disks, d_next) {
2366			if (disk->d_sync.ds_syncid != syncid)
2367				continue;
2368			if ((disk->d_flags &
2369			    G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) {
2370				continue;
2371			}
2372			ndisks++;
2373			if ((disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) != 0) {
2374				dirty++;
2375				if (pdisk == NULL ||
2376				    pdisk->d_priority < disk->d_priority) {
2377					pdisk = disk;
2378				}
2379			}
2380		}
2381		if (dirty == 0) {
2382			/* No dirty disks at all, great. */
2383		} else if (dirty == ndisks) {
2384			/*
2385			 * Force synchronization for all dirty disks except one
2386			 * with the biggest priority.
2387			 */
2388			KASSERT(pdisk != NULL, ("pdisk == NULL"));
2389			G_MIRROR_DEBUG(1, "Using disk %s (device %s) as a "
2390			    "master disk for synchronization.",
2391			    g_mirror_get_diskname(pdisk), sc->sc_name);
2392			LIST_FOREACH(disk, &sc->sc_disks, d_next) {
2393				if (disk->d_sync.ds_syncid != syncid)
2394					continue;
2395				if ((disk->d_flags &
2396				    G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) {
2397					continue;
2398				}
2399				KASSERT((disk->d_flags &
2400				    G_MIRROR_DISK_FLAG_DIRTY) != 0,
2401				    ("Disk %s isn't marked as dirty.",
2402				    g_mirror_get_diskname(disk)));
2403				/* Skip the disk with the biggest priority. */
2404				if (disk == pdisk)
2405					continue;
2406				disk->d_sync.ds_syncid = 0;
2407			}
2408		} else if (dirty < ndisks) {
2409			/*
2410			 * Force synchronization for all dirty disks.
2411			 * We have some non-dirty disks.
2412			 */
2413			LIST_FOREACH(disk, &sc->sc_disks, d_next) {
2414				if (disk->d_sync.ds_syncid != syncid)
2415					continue;
2416				if ((disk->d_flags &
2417				    G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) {
2418					continue;
2419				}
2420				if ((disk->d_flags &
2421				    G_MIRROR_DISK_FLAG_DIRTY) == 0) {
2422					continue;
2423				}
2424				disk->d_sync.ds_syncid = 0;
2425			}
2426		}
2427
2428		/* Reset hint. */
2429		sc->sc_hint = NULL;
2430		sc->sc_syncid = syncid;
2431		if (force) {
2432			/* Remember to bump syncid on first write. */
2433			sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID;
2434		}
2435		state = G_MIRROR_DEVICE_STATE_RUNNING;
2436		G_MIRROR_DEBUG(1, "Device %s state changed from %s to %s.",
2437		    sc->sc_name, g_mirror_device_state2str(sc->sc_state),
2438		    g_mirror_device_state2str(state));
2439		sc->sc_state = state;
2440		LIST_FOREACH(disk, &sc->sc_disks, d_next) {
2441			state = g_mirror_determine_state(disk);
2442			g_mirror_event_send(disk, state,
2443			    G_MIRROR_EVENT_DONTWAIT);
2444			if (state == G_MIRROR_DISK_STATE_STALE)
2445				sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID;
2446		}
2447		break;
2448	    }
2449	case G_MIRROR_DEVICE_STATE_RUNNING:
2450		if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) == 0 &&
2451		    g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_NEW) == 0) {
2452			/*
2453			 * No active disks or no disks at all,
2454			 * so destroy device.
2455			 */
2456			if (sc->sc_provider != NULL)
2457				g_mirror_destroy_provider(sc);
2458			sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY;
2459			break;
2460		} else if (g_mirror_ndisks(sc,
2461		    G_MIRROR_DISK_STATE_ACTIVE) > 0 &&
2462		    g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_NEW) == 0) {
2463			/*
2464			 * We have active disks, launch provider if it doesn't
2465			 * exist.
2466			 */
2467			if (sc->sc_provider == NULL)
2468				g_mirror_launch_provider(sc);
2469			if (sc->sc_rootmount != NULL) {
2470				G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p",
2471				    __LINE__, sc->sc_rootmount);
2472				root_mount_rel(sc->sc_rootmount);
2473				sc->sc_rootmount = NULL;
2474			}
2475		}
2476		/*
2477		 * Genid should be bumped immediately, so do it here.
2478		 */
2479		if ((sc->sc_bump_id & G_MIRROR_BUMP_GENID) != 0) {
2480			sc->sc_bump_id &= ~G_MIRROR_BUMP_GENID;
2481			g_mirror_bump_genid(sc);
2482		}
2483		break;
2484	default:
2485		KASSERT(1 == 0, ("Wrong device state (%s, %s).",
2486		    sc->sc_name, g_mirror_device_state2str(sc->sc_state)));
2487		break;
2488	}
2489}
2490
2491/*
2492 * Update disk state and device state if needed.
2493 */
2494#define	DISK_STATE_CHANGED()	G_MIRROR_DEBUG(1,			\
2495	"Disk %s state changed from %s to %s (device %s).",		\
2496	g_mirror_get_diskname(disk),					\
2497	g_mirror_disk_state2str(disk->d_state),				\
2498	g_mirror_disk_state2str(state), sc->sc_name)
2499static int
2500g_mirror_update_disk(struct g_mirror_disk *disk, u_int state)
2501{
2502	struct g_mirror_softc *sc;
2503
2504	sc = disk->d_softc;
2505	sx_assert(&sc->sc_lock, SX_XLOCKED);
2506
2507again:
2508	G_MIRROR_DEBUG(3, "Changing disk %s state from %s to %s.",
2509	    g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state),
2510	    g_mirror_disk_state2str(state));
2511	switch (state) {
2512	case G_MIRROR_DISK_STATE_NEW:
2513		/*
2514		 * Possible scenarios:
2515		 * 1. A new disk arrives.
2516		 */
2517		/* Previous state should be NONE. */
2518		KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NONE,
2519		    ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk),
2520		    g_mirror_disk_state2str(disk->d_state)));
2521		DISK_STATE_CHANGED();
2522
2523		disk->d_state = state;
2524		if (LIST_EMPTY(&sc->sc_disks))
2525			LIST_INSERT_HEAD(&sc->sc_disks, disk, d_next);
2526		else {
2527			struct g_mirror_disk *dp;
2528
2529			LIST_FOREACH(dp, &sc->sc_disks, d_next) {
2530				if (disk->d_priority >= dp->d_priority) {
2531					LIST_INSERT_BEFORE(dp, disk, d_next);
2532					dp = NULL;
2533					break;
2534				}
2535				if (LIST_NEXT(dp, d_next) == NULL)
2536					break;
2537			}
2538			if (dp != NULL)
2539				LIST_INSERT_AFTER(dp, disk, d_next);
2540		}
2541		G_MIRROR_DEBUG(1, "Device %s: provider %s detected.",
2542		    sc->sc_name, g_mirror_get_diskname(disk));
2543		if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING)
2544			break;
2545		KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
2546		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2547		    g_mirror_device_state2str(sc->sc_state),
2548		    g_mirror_get_diskname(disk),
2549		    g_mirror_disk_state2str(disk->d_state)));
2550		state = g_mirror_determine_state(disk);
2551		if (state != G_MIRROR_DISK_STATE_NONE)
2552			goto again;
2553		break;
2554	case G_MIRROR_DISK_STATE_ACTIVE:
2555		/*
2556		 * Possible scenarios:
2557		 * 1. New disk does not need synchronization.
2558		 * 2. Synchronization process finished successfully.
2559		 */
2560		KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
2561		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2562		    g_mirror_device_state2str(sc->sc_state),
2563		    g_mirror_get_diskname(disk),
2564		    g_mirror_disk_state2str(disk->d_state)));
2565		/* Previous state should be NEW or SYNCHRONIZING. */
2566		KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW ||
2567		    disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING,
2568		    ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk),
2569		    g_mirror_disk_state2str(disk->d_state)));
2570		DISK_STATE_CHANGED();
2571
2572		if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) {
2573			disk->d_flags &= ~G_MIRROR_DISK_FLAG_SYNCHRONIZING;
2574			disk->d_flags &= ~G_MIRROR_DISK_FLAG_FORCE_SYNC;
2575			g_mirror_sync_stop(disk, 0);
2576		}
2577		disk->d_state = state;
2578		disk->d_sync.ds_offset = 0;
2579		disk->d_sync.ds_offset_done = 0;
2580		g_mirror_update_idle(sc, disk);
2581		g_mirror_update_metadata(disk);
2582		G_MIRROR_DEBUG(1, "Device %s: provider %s activated.",
2583		    sc->sc_name, g_mirror_get_diskname(disk));
2584		break;
2585	case G_MIRROR_DISK_STATE_STALE:
2586		/*
2587		 * Possible scenarios:
2588		 * 1. Stale disk was connected.
2589		 */
2590		/* Previous state should be NEW. */
2591		KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW,
2592		    ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk),
2593		    g_mirror_disk_state2str(disk->d_state)));
2594		KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
2595		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2596		    g_mirror_device_state2str(sc->sc_state),
2597		    g_mirror_get_diskname(disk),
2598		    g_mirror_disk_state2str(disk->d_state)));
2599		/*
2600		 * STALE state is only possible if device is marked
2601		 * NOAUTOSYNC.
2602		 */
2603		KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) != 0,
2604		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2605		    g_mirror_device_state2str(sc->sc_state),
2606		    g_mirror_get_diskname(disk),
2607		    g_mirror_disk_state2str(disk->d_state)));
2608		DISK_STATE_CHANGED();
2609
2610		disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
2611		disk->d_state = state;
2612		g_mirror_update_metadata(disk);
2613		G_MIRROR_DEBUG(0, "Device %s: provider %s is stale.",
2614		    sc->sc_name, g_mirror_get_diskname(disk));
2615		break;
2616	case G_MIRROR_DISK_STATE_SYNCHRONIZING:
2617		/*
2618		 * Possible scenarios:
2619		 * 1. Disk which needs synchronization was connected.
2620		 */
2621		/* Previous state should be NEW. */
2622		KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW,
2623		    ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk),
2624		    g_mirror_disk_state2str(disk->d_state)));
2625		KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
2626		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2627		    g_mirror_device_state2str(sc->sc_state),
2628		    g_mirror_get_diskname(disk),
2629		    g_mirror_disk_state2str(disk->d_state)));
2630		DISK_STATE_CHANGED();
2631
2632		if (disk->d_state == G_MIRROR_DISK_STATE_NEW)
2633			disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
2634		disk->d_state = state;
2635		if (sc->sc_provider != NULL) {
2636			g_mirror_sync_start(disk);
2637			g_mirror_update_metadata(disk);
2638		}
2639		break;
2640	case G_MIRROR_DISK_STATE_DISCONNECTED:
2641		/*
2642		 * Possible scenarios:
2643		 * 1. Device wasn't running yet, but a disk disappeared.
2644		 * 2. Disk was active and disappeared.
2645		 * 3. Disk disappeared during the synchronization process.
2646		 */
2647		if (sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING) {
2648			/*
2649			 * Previous state should be ACTIVE, STALE or
2650			 * SYNCHRONIZING.
2651			 */
2652			KASSERT(disk->d_state == G_MIRROR_DISK_STATE_ACTIVE ||
2653			    disk->d_state == G_MIRROR_DISK_STATE_STALE ||
2654			    disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING,
2655			    ("Wrong disk state (%s, %s).",
2656			    g_mirror_get_diskname(disk),
2657			    g_mirror_disk_state2str(disk->d_state)));
2658		} else if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING) {
2659			/* Previous state should be NEW. */
2660			KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW,
2661			    ("Wrong disk state (%s, %s).",
2662			    g_mirror_get_diskname(disk),
2663			    g_mirror_disk_state2str(disk->d_state)));
2664			/*
2665			 * Reset bumping syncid if disk disappeared in STARTING
2666			 * state.
2667			 */
2668			if ((sc->sc_bump_id & G_MIRROR_BUMP_SYNCID) != 0)
2669				sc->sc_bump_id &= ~G_MIRROR_BUMP_SYNCID;
2670#ifdef	INVARIANTS
2671		} else {
2672			KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).",
2673			    sc->sc_name,
2674			    g_mirror_device_state2str(sc->sc_state),
2675			    g_mirror_get_diskname(disk),
2676			    g_mirror_disk_state2str(disk->d_state)));
2677#endif
2678		}
2679		DISK_STATE_CHANGED();
2680		G_MIRROR_DEBUG(0, "Device %s: provider %s disconnected.",
2681		    sc->sc_name, g_mirror_get_diskname(disk));
2682
2683		g_mirror_destroy_disk(disk);
2684		break;
2685	case G_MIRROR_DISK_STATE_DESTROY:
2686	    {
2687		int error;
2688
2689		error = g_mirror_clear_metadata(disk);
2690		if (error != 0) {
2691			G_MIRROR_DEBUG(0,
2692			    "Device %s: failed to clear metadata on %s: %d.",
2693			    sc->sc_name, g_mirror_get_diskname(disk), error);
2694			break;
2695		}
2696		DISK_STATE_CHANGED();
2697		G_MIRROR_DEBUG(0, "Device %s: provider %s destroyed.",
2698		    sc->sc_name, g_mirror_get_diskname(disk));
2699
2700		g_mirror_destroy_disk(disk);
2701		sc->sc_ndisks--;
2702		LIST_FOREACH(disk, &sc->sc_disks, d_next) {
2703			g_mirror_update_metadata(disk);
2704		}
2705		break;
2706	    }
2707	default:
2708		KASSERT(1 == 0, ("Unknown state (%u).", state));
2709		break;
2710	}
2711	return (0);
2712}
2713#undef	DISK_STATE_CHANGED
2714
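/*
 * Read and validate the mirror metadata stored in the last sector of
 * the consumer's provider; on success the decoded copy is left in *md.
 */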
2715int
2716g_mirror_read_metadata(struct g_consumer *cp, struct g_mirror_metadata *md)
2717{
2718	struct g_provider *pp;
2719	u_char *buf;
2720	int error;
2721
2722	g_topology_assert();
2723
2724	error = g_access(cp, 1, 0, 0);
2725	if (error != 0)
2726		return (error);
2727	pp = cp->provider;
2728	g_topology_unlock();
2729	/* Metadata are stored in the last sector. */
2730	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
2731	    &error);
2732	g_topology_lock();
2733	g_access(cp, -1, 0, 0);
2734	if (buf == NULL) {
2735		G_MIRROR_DEBUG(1, "Cannot read metadata from %s (error=%d).",
2736		    cp->provider->name, error);
2737		return (error);
2738	}
2739
2740	/* Decode metadata. */
2741	error = mirror_metadata_decode(buf, md);
2742	g_free(buf);
2743	if (strcmp(md->md_magic, G_MIRROR_MAGIC) != 0)
2744		return (EINVAL);
2745	if (md->md_version > G_MIRROR_VERSION) {
2746		G_MIRROR_DEBUG(0,
2747		    "Kernel module is too old to handle metadata from %s.",
2748		    cp->provider->name);
2749		return (EINVAL);
2750	}
2751	if (error != 0) {
2752		G_MIRROR_DEBUG(1, "MD5 metadata hash mismatch for provider %s.",
2753		    cp->provider->name);
2754		return (error);
2755	}
2756
2757	return (0);
2758}
2759
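/*
 * Sanity-check metadata from a candidate component against the device
 * it is joining: the disk id must be unused and the configuration
 * fields (component count, slice, balance algorithm, media and sector
 * sizes, flags) must match.
 */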
2760static int
2761g_mirror_check_metadata(struct g_mirror_softc *sc, struct g_provider *pp,
2762    struct g_mirror_metadata *md)
2763{
2764
2765	if (g_mirror_id2disk(sc, md->md_did) != NULL) {
2766		G_MIRROR_DEBUG(1, "Disk %s (id=%u) already exists, skipping.",
2767		    pp->name, md->md_did);
2768		return (EEXIST);
2769	}
2770	if (md->md_all != sc->sc_ndisks) {
2771		G_MIRROR_DEBUG(1,
2772		    "Invalid '%s' field on disk %s (device %s), skipping.",
2773		    "md_all", pp->name, sc->sc_name);
2774		return (EINVAL);
2775	}
2776	if (md->md_slice != sc->sc_slice) {
2777		G_MIRROR_DEBUG(1,
2778		    "Invalid '%s' field on disk %s (device %s), skipping.",
2779		    "md_slice", pp->name, sc->sc_name);
2780		return (EINVAL);
2781	}
2782	if (md->md_balance != sc->sc_balance) {
2783		G_MIRROR_DEBUG(1,
2784		    "Invalid '%s' field on disk %s (device %s), skipping.",
2785		    "md_balance", pp->name, sc->sc_name);
2786		return (EINVAL);
2787	}
2788#if 0
2789	if (md->md_mediasize != sc->sc_mediasize) {
2790		G_MIRROR_DEBUG(1,
2791		    "Invalid '%s' field on disk %s (device %s), skipping.",
2792		    "md_mediasize", pp->name, sc->sc_name);
2793		return (EINVAL);
2794	}
2795#endif
2796	if (sc->sc_mediasize > pp->mediasize) {
2797		G_MIRROR_DEBUG(1,
2798		    "Invalid size of disk %s (device %s), skipping.", pp->name,
2799		    sc->sc_name);
2800		return (EINVAL);
2801	}
2802	if (md->md_sectorsize != sc->sc_sectorsize) {
2803		G_MIRROR_DEBUG(1,
2804		    "Invalid '%s' field on disk %s (device %s), skipping.",
2805		    "md_sectorsize", pp->name, sc->sc_name);
2806		return (EINVAL);
2807	}
2808	if ((sc->sc_sectorsize % pp->sectorsize) != 0) {
2809		G_MIRROR_DEBUG(1,
2810		    "Invalid sector size of disk %s (device %s), skipping.",
2811		    pp->name, sc->sc_name);
2812		return (EINVAL);
2813	}
2814	if ((md->md_mflags & ~G_MIRROR_DEVICE_FLAG_MASK) != 0) {
2815		G_MIRROR_DEBUG(1,
2816		    "Invalid device flags on disk %s (device %s), skipping.",
2817		    pp->name, sc->sc_name);
2818		return (EINVAL);
2819	}
2820	if ((md->md_dflags & ~G_MIRROR_DISK_FLAG_MASK) != 0) {
2821		G_MIRROR_DEBUG(1,
2822		    "Invalid disk flags on disk %s (device %s), skipping.",
2823		    pp->name, sc->sc_name);
2824		return (EINVAL);
2825	}
2826	return (0);
2827}
2828
2829int
2830g_mirror_add_disk(struct g_mirror_softc *sc, struct g_provider *pp,
2831    struct g_mirror_metadata *md)
2832{
2833	struct g_mirror_disk *disk;
2834	int error;
2835
2836	g_topology_assert_not();
2837	G_MIRROR_DEBUG(2, "Adding disk %s.", pp->name);
2838
2839	error = g_mirror_check_metadata(sc, pp, md);
2840	if (error != 0)
2841		return (error);
2842	if (sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING &&
2843	    md->md_genid < sc->sc_genid) {
2844		G_MIRROR_DEBUG(0, "Component %s (device %s) broken, skipping.",
2845		    pp->name, sc->sc_name);
2846		return (EINVAL);
2847	}
2848	disk = g_mirror_init_disk(sc, pp, md, &error);
2849	if (disk == NULL)
2850		return (error);
2851	error = g_mirror_event_send(disk, G_MIRROR_DISK_STATE_NEW,
2852	    G_MIRROR_EVENT_WAIT);
2853	if (error != 0)
2854		return (error);
2855	if (md->md_version < G_MIRROR_VERSION) {
2856		G_MIRROR_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).",
2857		    pp->name, md->md_version, G_MIRROR_VERSION);
2858		g_mirror_update_metadata(disk);
2859	}
2860	return (0);
2861}
2862
2863static void
2864g_mirror_destroy_delayed(void *arg, int flag)
2865{
2866	struct g_mirror_softc *sc;
2867	int error;
2868
2869	if (flag == EV_CANCEL) {
2870		G_MIRROR_DEBUG(1, "Destroying canceled.");
2871		return;
2872	}
2873	sc = arg;
2874	g_topology_unlock();
2875	sx_xlock(&sc->sc_lock);
2876	KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) == 0,
2877	    ("DESTROY flag set on %s.", sc->sc_name));
2878	KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROYING) != 0,
2879	    ("DESTROYING flag not set on %s.", sc->sc_name));
2880	G_MIRROR_DEBUG(1, "Destroying %s (delayed).", sc->sc_name);
2881	error = g_mirror_destroy(sc, G_MIRROR_DESTROY_SOFT);
2882	if (error != 0) {
2883		G_MIRROR_DEBUG(0, "Cannot destroy %s (error=%d).",
2884		    sc->sc_name, error);
2885		sx_xunlock(&sc->sc_lock);
2886	}
2887	g_topology_lock();
2888}
2889
2890static int
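/*
 * Access method for the mirror provider.  All open counts are folded
 * into sc_provider_open; a device flagged DESTROYING is torn down once
 * the last reference goes away.
 */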
2891g_mirror_access(struct g_provider *pp, int acr, int acw, int ace)
2892{
2893	struct g_mirror_softc *sc;
2894	int error = 0;
2895
2896	g_topology_assert();
2897	G_MIRROR_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr,
2898	    acw, ace);
2899
2900	sc = pp->private;
2901	KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name));
2902
2903	g_topology_unlock();
2904	sx_xlock(&sc->sc_lock);
2905	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0 ||
2906	    (sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROYING) != 0 ||
2907	    LIST_EMPTY(&sc->sc_disks)) {
2908		if (acr > 0 || acw > 0 || ace > 0)
2909			error = ENXIO;
2910		goto end;
2911	}
2912	sc->sc_provider_open += acr + acw + ace;
2913	if (pp->acw + acw == 0)
2914		g_mirror_idle(sc, 0);
2915	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROYING) != 0 &&
2916	    sc->sc_provider_open == 0)
2917		g_post_event(g_mirror_destroy_delayed, sc, M_WAITOK, sc, NULL);
2918end:
2919	sx_xunlock(&sc->sc_lock);
2920	g_topology_lock();
2921	return (error);
2922}
2923
2924static struct g_geom *
2925g_mirror_create(struct g_class *mp, const struct g_mirror_metadata *md)
2926{
2927	struct g_mirror_softc *sc;
2928	struct g_geom *gp;
2929	int error, timeout;
2930
2931	g_topology_assert();
2932	G_MIRROR_DEBUG(1, "Creating device %s (id=%u).", md->md_name,
2933	    md->md_mid);
2934
2935	/* One disk is minimum. */
2936	if (md->md_all < 1)
2937		return (NULL);
2938	/*
2939	 * Action geom.
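	 * The action geom owns the mirror/<name> provider and carries the
	 * regular I/O path; the separate <name>.sync geom created below
	 * owns the consumers used for synchronization reads.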
2940	 */
2941	gp = g_new_geomf(mp, "%s", md->md_name);
2942	sc = malloc(sizeof(*sc), M_MIRROR, M_WAITOK | M_ZERO);
2943	gp->start = g_mirror_start;
2944	gp->orphan = g_mirror_orphan;
2945	gp->access = g_mirror_access;
2946	gp->dumpconf = g_mirror_dumpconf;
2947
2948	sc->sc_id = md->md_mid;
2949	sc->sc_slice = md->md_slice;
2950	sc->sc_balance = md->md_balance;
2951	sc->sc_mediasize = md->md_mediasize;
2952	sc->sc_sectorsize = md->md_sectorsize;
2953	sc->sc_ndisks = md->md_all;
2954	sc->sc_flags = md->md_mflags;
2955	sc->sc_bump_id = 0;
2956	sc->sc_idle = 1;
2957	sc->sc_last_write = time_uptime;
2958	sc->sc_writes = 0;
2959	sc->sc_refcnt = 1;
2960	sx_init(&sc->sc_lock, "gmirror:lock");
2961	bioq_init(&sc->sc_queue);
2962	mtx_init(&sc->sc_queue_mtx, "gmirror:queue", NULL, MTX_DEF);
2963	bioq_init(&sc->sc_regular_delayed);
2964	bioq_init(&sc->sc_inflight);
2965	bioq_init(&sc->sc_sync_delayed);
2966	LIST_INIT(&sc->sc_disks);
2967	TAILQ_INIT(&sc->sc_events);
2968	mtx_init(&sc->sc_events_mtx, "gmirror:events", NULL, MTX_DEF);
2969	callout_init(&sc->sc_callout, CALLOUT_MPSAFE);
2970	mtx_init(&sc->sc_done_mtx, "gmirror:done", NULL, MTX_DEF);
2971	sc->sc_state = G_MIRROR_DEVICE_STATE_STARTING;
2972	gp->softc = sc;
2973	sc->sc_geom = gp;
2974	sc->sc_provider = NULL;
2975	sc->sc_provider_open = 0;
2976	/*
2977	 * Synchronization geom.
2978	 */
2979	gp = g_new_geomf(mp, "%s.sync", md->md_name);
2980	gp->softc = sc;
2981	gp->orphan = g_mirror_orphan;
2982	sc->sc_sync.ds_geom = gp;
2983	sc->sc_sync.ds_ndisks = 0;
2984	error = kproc_create(g_mirror_worker, sc, &sc->sc_worker, 0, 0,
2985	    "g_mirror %s", md->md_name);
2986	if (error != 0) {
2987		G_MIRROR_DEBUG(1, "Cannot create kernel thread for %s.",
2988		    sc->sc_name);
2989		g_destroy_geom(sc->sc_sync.ds_geom);
2990		g_destroy_geom(sc->sc_geom);
2991		g_mirror_free_device(sc);
2992		return (NULL);
2993	}
2994
2995	G_MIRROR_DEBUG(1, "Device %s created (%u components, id=%u).",
2996	    sc->sc_name, sc->sc_ndisks, sc->sc_id);
2997
2998	sc->sc_rootmount = root_mount_hold("GMIRROR");
2999	G_MIRROR_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount);
3000	/*
3001	 * Run timeout.
3002	 */
3003	timeout = g_mirror_timeout * hz;
3004	callout_reset(&sc->sc_callout, timeout, g_mirror_go, sc);
3005	return (sc->sc_geom);
3006}
3007
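/*
 * Tear down a mirror device.  "how" selects the policy when the
 * provider is still open: SOFT fails with EBUSY, DELAYED defers the
 * destruction until last close, and HARD proceeds anyway.  The actual
 * cleanup is delegated to the worker thread via the DESTROY and WAIT
 * flags; we sleep until the worker clears sc_worker.
 */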
3008int
3009g_mirror_destroy(struct g_mirror_softc *sc, int how)
3010{
3011	struct g_mirror_disk *disk;
3012
3013	g_topology_assert_not();
3014	sx_assert(&sc->sc_lock, SX_XLOCKED);
3015
3016	if (sc->sc_provider_open != 0) {
3017		switch (how) {
3018		case G_MIRROR_DESTROY_SOFT:
3019			G_MIRROR_DEBUG(1,
3020			    "Device %s is still open (%d).", sc->sc_name,
3021			    sc->sc_provider_open);
3022			return (EBUSY);
3023		case G_MIRROR_DESTROY_DELAYED:
3024			G_MIRROR_DEBUG(1,
3025			    "Device %s will be destroyed on last close.",
3026			    sc->sc_name);
3027			LIST_FOREACH(disk, &sc->sc_disks, d_next) {
3028				if (disk->d_state ==
3029				    G_MIRROR_DISK_STATE_SYNCHRONIZING) {
3030					g_mirror_sync_stop(disk, 1);
3031				}
3032			}
3033			sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROYING;
3034			return (EBUSY);
3035		case G_MIRROR_DESTROY_HARD:
3036			G_MIRROR_DEBUG(1, "Device %s is still open, so it "
3037			    "can't be definitely removed.", sc->sc_name);
3038		}
3039	}
3040
3041	g_topology_lock();
3042	if (sc->sc_geom->softc == NULL) {
3043		g_topology_unlock();
3044		return (0);
3045	}
3046	sc->sc_geom->softc = NULL;
3047	sc->sc_sync.ds_geom->softc = NULL;
3048	g_topology_unlock();
3049
3050	sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY;
3051	sc->sc_flags |= G_MIRROR_DEVICE_FLAG_WAIT;
3052	G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc);
3053	sx_xunlock(&sc->sc_lock);
3054	mtx_lock(&sc->sc_queue_mtx);
3055	wakeup(sc);
3056	mtx_unlock(&sc->sc_queue_mtx);
3057	G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker);
3058	while (sc->sc_worker != NULL)
3059		tsleep(&sc->sc_worker, PRIBIO, "m:destroy", hz / 5);
3060	G_MIRROR_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker);
3061	sx_xlock(&sc->sc_lock);
3062	g_mirror_destroy_device(sc);
3063	return (0);
3064}
3065
3066static void
3067g_mirror_taste_orphan(struct g_consumer *cp)
3068{
3069
3070	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
3071	    cp->provider->name));
3072}
3073
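/*
 * Taste method: read metadata from the offered provider, find or
 * create the matching mirror device, and try to add the provider to it
 * as a component.
 */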
3074static struct g_geom *
3075g_mirror_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
3076{
3077	struct g_mirror_metadata md;
3078	struct g_mirror_softc *sc;
3079	struct g_consumer *cp;
3080	struct g_geom *gp;
3081	int error;
3082
3083	g_topology_assert();
3084	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
3085	G_MIRROR_DEBUG(2, "Tasting %s.", pp->name);
3086
3087	gp = g_new_geomf(mp, "mirror:taste");
3088	/*
3089	 * This orphan function should never be called.
3090	 */
3091	gp->orphan = g_mirror_taste_orphan;
3092	cp = g_new_consumer(gp);
3093	g_attach(cp, pp);
3094	error = g_mirror_read_metadata(cp, &md);
3095	g_detach(cp);
3096	g_destroy_consumer(cp);
3097	g_destroy_geom(gp);
3098	if (error != 0)
3099		return (NULL);
3100	gp = NULL;
3101
3102	if (md.md_provider[0] != '\0' &&
3103	    !g_compare_names(md.md_provider, pp->name))
3104		return (NULL);
3105	if (md.md_provsize != 0 && md.md_provsize != pp->mediasize)
3106		return (NULL);
3107	if ((md.md_dflags & G_MIRROR_DISK_FLAG_INACTIVE) != 0) {
3108		G_MIRROR_DEBUG(0,
3109		    "Device %s: provider %s marked as inactive, skipping.",
3110		    md.md_name, pp->name);
3111		return (NULL);
3112	}
3113	if (g_mirror_debug >= 2)
3114		mirror_metadata_dump(&md);
3115
3116	/*
3117	 * Let's check if device already exists.
3118	 */
3119	sc = NULL;
3120	LIST_FOREACH(gp, &mp->geom, geom) {
3121		sc = gp->softc;
3122		if (sc == NULL)
3123			continue;
3124		if (sc->sc_sync.ds_geom == gp)
3125			continue;
3126		if (strcmp(md.md_name, sc->sc_name) != 0)
3127			continue;
3128		if (md.md_mid != sc->sc_id) {
3129			G_MIRROR_DEBUG(0, "Device %s already configured.",
3130			    sc->sc_name);
3131			return (NULL);
3132		}
3133		break;
3134	}
3135	if (gp == NULL) {
3136		gp = g_mirror_create(mp, &md);
3137		if (gp == NULL) {
3138			G_MIRROR_DEBUG(0, "Cannot create device %s.",
3139			    md.md_name);
3140			return (NULL);
3141		}
3142		sc = gp->softc;
3143	}
3144	G_MIRROR_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
3145	g_topology_unlock();
3146	sx_xlock(&sc->sc_lock);
3147	sc->sc_flags |= G_MIRROR_DEVICE_FLAG_TASTING;
3148	error = g_mirror_add_disk(sc, pp, &md);
3149	if (error != 0) {
3150		G_MIRROR_DEBUG(0, "Cannot add disk %s to %s (error=%d).",
3151		    pp->name, gp->name, error);
3152		if (LIST_EMPTY(&sc->sc_disks)) {
3153			g_cancel_event(sc);
3154			g_mirror_destroy(sc, G_MIRROR_DESTROY_HARD);
3155			g_topology_lock();
3156			return (NULL);
3157		}
3158		gp = NULL;
3159	}
3160	sc->sc_flags &= ~G_MIRROR_DEVICE_FLAG_TASTING;
3161	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) {
3162		g_mirror_destroy(sc, G_MIRROR_DESTROY_HARD);
3163		g_topology_lock();
3164		return (NULL);
3165	}
3166	sx_xunlock(&sc->sc_lock);
3167	g_topology_lock();
3168	return (gp);
3169}
3170
3171static void
3172g_mirror_resize(struct g_consumer *cp)
3173{
3174	struct g_mirror_disk *disk;
3175
3176	g_topology_assert();
3177	g_trace(G_T_TOPOLOGY, "%s(%s)", __func__, cp->provider->name);
3178
3179	disk = cp->private;
3180	if (disk == NULL)
3181		return;
3182	g_topology_unlock();
3183	g_mirror_update_metadata(disk);
3184	g_topology_lock();
3185}
3186
3187static int
3188g_mirror_destroy_geom(struct gctl_req *req __unused,
3189    struct g_class *mp __unused, struct g_geom *gp)
3190{
3191	struct g_mirror_softc *sc;
3192	int error;
3193
3194	g_topology_unlock();
3195	sc = gp->softc;
3196	sx_xlock(&sc->sc_lock);
3197	g_cancel_event(sc);
3198	error = g_mirror_destroy(gp->softc, G_MIRROR_DESTROY_SOFT);
3199	if (error != 0)
3200		sx_xunlock(&sc->sc_lock);
3201	g_topology_lock();
3202	return (error);
3203}
3204
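/*
 * Emit the XML fragments seen in kern.geom.confxml.  Consumer sections
 * describe individual components (state, flags, sync progress); the
 * geom-level section describes the device itself, e.g. (illustrative)
 * <State>COMPLETE</State> when all components are ACTIVE.
 */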
3205static void
3206g_mirror_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
3207    struct g_consumer *cp, struct g_provider *pp)
3208{
3209	struct g_mirror_softc *sc;
3210
3211	g_topology_assert();
3212
3213	sc = gp->softc;
3214	if (sc == NULL)
3215		return;
3216	/* Skip synchronization geom. */
3217	if (gp == sc->sc_sync.ds_geom)
3218		return;
3219	if (pp != NULL) {
3220		/* Nothing here. */
3221	} else if (cp != NULL) {
3222		struct g_mirror_disk *disk;
3223
3224		disk = cp->private;
3225		if (disk == NULL)
3226			return;
3227		g_topology_unlock();
3228		sx_xlock(&sc->sc_lock);
3229		sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)disk->d_id);
3230		if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) {
3231			sbuf_printf(sb, "%s<Synchronized>", indent);
3232			if (disk->d_sync.ds_offset == 0)
3233				sbuf_printf(sb, "0%%");
3234			else {
3235				sbuf_printf(sb, "%u%%",
3236				    (u_int)((disk->d_sync.ds_offset * 100) /
3237				    sc->sc_provider->mediasize));
3238			}
3239			sbuf_printf(sb, "</Synchronized>\n");
3240			if (disk->d_sync.ds_offset > 0) {
3241				sbuf_printf(sb, "%s<BytesSynced>%jd"
3242				    "</BytesSynced>\n", indent,
3243				    (intmax_t)disk->d_sync.ds_offset);
3244			}
3245		}
3246		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent,
3247		    disk->d_sync.ds_syncid);
3248		sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent,
3249		    disk->d_genid);
3250		sbuf_printf(sb, "%s<Flags>", indent);
3251		if (disk->d_flags == 0)
3252			sbuf_printf(sb, "NONE");
3253		else {
3254			int first = 1;
3255
3256#define	ADD_FLAG(flag, name)	do {					\
3257	if ((disk->d_flags & (flag)) != 0) {				\
3258		if (!first)						\
3259			sbuf_printf(sb, ", ");				\
3260		else							\
3261			first = 0;					\
3262		sbuf_printf(sb, name);					\
3263	}								\
3264} while (0)
3265			ADD_FLAG(G_MIRROR_DISK_FLAG_DIRTY, "DIRTY");
3266			ADD_FLAG(G_MIRROR_DISK_FLAG_HARDCODED, "HARDCODED");
3267			ADD_FLAG(G_MIRROR_DISK_FLAG_INACTIVE, "INACTIVE");
3268			ADD_FLAG(G_MIRROR_DISK_FLAG_SYNCHRONIZING,
3269			    "SYNCHRONIZING");
3270			ADD_FLAG(G_MIRROR_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC");
3271			ADD_FLAG(G_MIRROR_DISK_FLAG_BROKEN, "BROKEN");
3272#undef	ADD_FLAG
3273		}
3274		sbuf_printf(sb, "</Flags>\n");
3275		sbuf_printf(sb, "%s<Priority>%u</Priority>\n", indent,
3276		    disk->d_priority);
3277		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
3278		    g_mirror_disk_state2str(disk->d_state));
3279		sx_xunlock(&sc->sc_lock);
3280		g_topology_lock();
3281	} else {
3282		g_topology_unlock();
3283		sx_xlock(&sc->sc_lock);
3284		sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
3285		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent, sc->sc_syncid);
3286		sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, sc->sc_genid);
3287		sbuf_printf(sb, "%s<Flags>", indent);
3288		if (sc->sc_flags == 0)
3289			sbuf_printf(sb, "NONE");
3290		else {
3291			int first = 1;
3292
3293#define	ADD_FLAG(flag, name)	do {					\
3294	if ((sc->sc_flags & (flag)) != 0) {				\
3295		if (!first)						\
3296			sbuf_printf(sb, ", ");				\
3297		else							\
3298			first = 0;					\
3299		sbuf_printf(sb, name);					\
3300	}								\
3301} while (0)
3302			ADD_FLAG(G_MIRROR_DEVICE_FLAG_NOFAILSYNC, "NOFAILSYNC");
3303			ADD_FLAG(G_MIRROR_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC");
3304#undef	ADD_FLAG
3305		}
3306		sbuf_printf(sb, "</Flags>\n");
3307		sbuf_printf(sb, "%s<Slice>%u</Slice>\n", indent,
3308		    (u_int)sc->sc_slice);
3309		sbuf_printf(sb, "%s<Balance>%s</Balance>\n", indent,
3310		    balance_name(sc->sc_balance));
3311		sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
3312		    sc->sc_ndisks);
3313		sbuf_printf(sb, "%s<State>", indent);
3314		if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING)
3315			sbuf_printf(sb, "%s", "STARTING");
3316		else if (sc->sc_ndisks ==
3317		    g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE))
3318			sbuf_printf(sb, "%s", "COMPLETE");
3319		else
3320			sbuf_printf(sb, "%s", "DEGRADED");
3321		sbuf_printf(sb, "</State>\n");
3322		sx_xunlock(&sc->sc_lock);
3323		g_topology_lock();
3324	}
3325}
3326
3327static void
3328g_mirror_shutdown_post_sync(void *arg, int howto)
3329{
3330	struct g_class *mp;
3331	struct g_geom *gp, *gp2;
3332	struct g_mirror_softc *sc;
3333	int error;
3334
3335	mp = arg;
3336	DROP_GIANT();
3337	g_topology_lock();
3338	g_mirror_shutdown = 1;
3339	LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
3340		if ((sc = gp->softc) == NULL)
3341			continue;
3342		/* Skip synchronization geom. */
3343		if (gp == sc->sc_sync.ds_geom)
3344			continue;
3345		g_topology_unlock();
3346		sx_xlock(&sc->sc_lock);
3347		g_mirror_idle(sc, -1);
3348		g_cancel_event(sc);
3349		error = g_mirror_destroy(sc, G_MIRROR_DESTROY_DELAYED);
3350		if (error != 0)
3351			sx_xunlock(&sc->sc_lock);
3352		g_topology_lock();
3353	}
3354	g_topology_unlock();
3355	PICKUP_GIANT();
3356}
3357
3358static void
3359g_mirror_init(struct g_class *mp)
3360{
3361
3362	g_mirror_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync,
3363	    g_mirror_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST);
3364	if (g_mirror_post_sync == NULL)
3365		G_MIRROR_DEBUG(0, "Warning! Cannot register shutdown event.");
3366}
3367
3368static void
3369g_mirror_fini(struct g_class *mp)
3370{
3371
3372	if (g_mirror_post_sync != NULL)
3373		EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_mirror_post_sync);
3374}
3375
3376DECLARE_GEOM_CLASS(g_mirror_class, g_mirror);
3377