vdev_geom.c revision 332525
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
23 * All rights reserved.
24 *
25 * Portions Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>
26 */
27
28#include <sys/zfs_context.h>
29#include <sys/param.h>
30#include <sys/kernel.h>
31#include <sys/bio.h>
32#include <sys/disk.h>
33#include <sys/spa.h>
34#include <sys/spa_impl.h>
35#include <sys/vdev_impl.h>
36#include <sys/fs/zfs.h>
37#include <sys/zio.h>
38#include <geom/geom.h>
39#include <geom/geom_int.h>
40
41/*
42 * Virtual device vector for GEOM.
43 */
44
45static g_attrchanged_t vdev_geom_attrchanged;
46struct g_class zfs_vdev_class = {
47	.name = "ZFS::VDEV",
48	.version = G_VERSION,
49	.attrchanged = vdev_geom_attrchanged,
50};
51
52struct consumer_vdev_elem {
53	SLIST_ENTRY(consumer_vdev_elem)	elems;
54	vdev_t				*vd;
55};
56
57SLIST_HEAD(consumer_priv_t, consumer_vdev_elem);
58_Static_assert(sizeof(((struct g_consumer*)NULL)->private)
59    == sizeof(struct consumer_priv_t*),
60    "consumer_priv_t* can't be stored in g_consumer.private");
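/*
 * The SLIST head generated above is a single pointer, so it can live
 * directly in the pointer-sized g_consumer.private field; the assertion
 * guarantees the sizes match before the casts below treat &cp->private as a
 * struct consumer_priv_t *.
 */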
61
62DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);
63
64SYSCTL_DECL(_vfs_zfs_vdev);
65/* Don't send BIO_FLUSH. */
66static int vdev_geom_bio_flush_disable;
67SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RWTUN,
68    &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");
69/* Don't send BIO_DELETE. */
70static int vdev_geom_bio_delete_disable;
71SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RWTUN,
72    &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");
73
74/* Declare local functions */
75static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read);
76
77/*
78 * Thread local storage used to indicate when a thread is probing geoms
79 * for their guids.  If NULL, this thread is not tasting geoms.  If non NULL,
80 * it is looking for a replacement for the vdev_t* that is its value.
81 */
82uint_t zfs_geom_probe_vdev_key;
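/*
 * The key is set around the taste in vdev_geom_open() (see the tsd_set()
 * calls there) and cleared once tasting is done.  It is intended to let
 * code further down the stack, e.g. the zvol open path outside this file,
 * call tsd_get() on it and refuse to open a zvol while a vdev probe is in
 * progress.
 */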
83
84static void
85vdev_geom_set_rotation_rate(vdev_t *vd, struct g_consumer *cp)
86{
87	int error;
88	uint16_t rate;
89
90	error = g_getattr("GEOM::rotation_rate", cp, &rate);
91	if (error == 0)
92		vd->vdev_rotation_rate = rate;
93	else
94		vd->vdev_rotation_rate = VDEV_RATE_UNKNOWN;
95}
96
97static void
98vdev_geom_set_physpath(vdev_t *vd, struct g_consumer *cp,
99		       boolean_t do_null_update)
100{
101	boolean_t needs_update = B_FALSE;
102	char *physpath;
103	int error, physpath_len;
104
105	physpath_len = MAXPATHLEN;
106	physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO);
107	error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath);
108	if (error == 0) {
109		char *old_physpath;
110
111		/* g_topology lock ensures that vdev has not been closed */
112		g_topology_assert();
113		old_physpath = vd->vdev_physpath;
114		vd->vdev_physpath = spa_strdup(physpath);
115
116		if (old_physpath != NULL) {
117			needs_update = (strcmp(old_physpath,
118						vd->vdev_physpath) != 0);
119			spa_strfree(old_physpath);
120		} else
121			needs_update = do_null_update;
122	}
123	g_free(physpath);
124
125	/*
126	 * If the physical path changed, update the config.
127	 * Only request an update for previously unset physpaths if
128	 * requested by the caller.
129	 */
130	if (needs_update)
131		spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE);
132
133}
134
135static void
136vdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
137{
138	char *old_physpath;
139	struct consumer_priv_t *priv;
140	struct consumer_vdev_elem *elem;
141	int error;
142
143	priv = (struct consumer_priv_t*)&cp->private;
144	if (SLIST_EMPTY(priv))
145		return;
146
147	SLIST_FOREACH(elem, priv, elems) {
148		vdev_t *vd = elem->vd;
149		if (strcmp(attr, "GEOM::rotation_rate") == 0) {
150			vdev_geom_set_rotation_rate(vd, cp);
151			return;
152		}
153		if (strcmp(attr, "GEOM::physpath") == 0) {
154			vdev_geom_set_physpath(vd, cp, /*null_update*/B_TRUE);
155			return;
156		}
157	}
158}
159
160static void
161vdev_geom_orphan(struct g_consumer *cp)
162{
163	struct consumer_priv_t *priv;
164	struct consumer_vdev_elem *elem;
165
166	g_topology_assert();
167
168	priv = (struct consumer_priv_t*)&cp->private;
169	if (SLIST_EMPTY(priv))
170		/* Vdev close in progress.  Ignore the event. */
171		return;
172
173	/*
174	 * Orphan callbacks occur from the GEOM event thread.
175	 * Concurrent with this call, new I/O requests may be
176	 * working their way through GEOM; they will only discover,
177	 * once executed by the g_down thread, that we've
178	 * been orphaned from our disk provider.  These I/Os
179	 * must be retired before we can detach our consumer.
180	 * This is most easily achieved by acquiring the
181	 * SPA ZIO configuration lock as a writer, but doing
182	 * so with the GEOM topology lock held would cause
183	 * a lock order reversal.  Instead, rely on the SPA's
184	 * async removal support to invoke a close on this
185	 * vdev once it is safe to do so.
186	 */
187	SLIST_FOREACH(elem, priv, elems) {
188		vdev_t *vd = elem->vd;
189
190		vd->vdev_remove_wanted = B_TRUE;
191		spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
192	}
193}
194
195static struct g_consumer *
196vdev_geom_attach(struct g_provider *pp, vdev_t *vd, boolean_t sanity)
197{
198	struct g_geom *gp;
199	struct g_consumer *cp;
200	int error;
201
202	g_topology_assert();
203
204	ZFS_LOG(1, "Attaching to %s.", pp->name);
205
206	if (sanity) {
207		if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) {
208			ZFS_LOG(1, "Failing attach of %s. "
209				   "Incompatible sectorsize %d\n",
210			    pp->name, pp->sectorsize);
211			return (NULL);
212		} else if (pp->mediasize < SPA_MINDEVSIZE) {
213			ZFS_LOG(1, "Failing attach of %s. "
214				   "Incompatible mediasize %ju\n",
215			    pp->name, pp->mediasize);
216			return (NULL);
217		}
218	}
219
220	/* Do we have geom already? No? Create one. */
221	LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
222		if (gp->flags & G_GEOM_WITHER)
223			continue;
224		if (strcmp(gp->name, "zfs::vdev") != 0)
225			continue;
226		break;
227	}
228	if (gp == NULL) {
229		gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
230		gp->orphan = vdev_geom_orphan;
231		gp->attrchanged = vdev_geom_attrchanged;
232		cp = g_new_consumer(gp);
233		error = g_attach(cp, pp);
234		if (error != 0) {
235			ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__,
236			    __LINE__, error);
237			vdev_geom_detach(cp, B_FALSE);
238			return (NULL);
239		}
240		error = g_access(cp, 1, 0, 1);
241		if (error != 0) {
242			ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__,
243			       __LINE__, error);
244			vdev_geom_detach(cp, B_FALSE);
245			return (NULL);
246		}
247		ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
248	} else {
249		/* Check if we are already connected to this provider. */
250		LIST_FOREACH(cp, &gp->consumer, consumer) {
251			if (cp->provider == pp) {
252				ZFS_LOG(1, "Found consumer for %s.", pp->name);
253				break;
254			}
255		}
256		if (cp == NULL) {
257			cp = g_new_consumer(gp);
258			error = g_attach(cp, pp);
259			if (error != 0) {
260				ZFS_LOG(1, "%s(%d): g_attach failed: %d\n",
261				    __func__, __LINE__, error);
262				vdev_geom_detach(cp, B_FALSE);
263				return (NULL);
264			}
265			error = g_access(cp, 1, 0, 1);
266			if (error != 0) {
267				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
268				    __func__, __LINE__, error);
269				vdev_geom_detach(cp, B_FALSE);
270				return (NULL);
271			}
272			ZFS_LOG(1, "Created consumer for %s.", pp->name);
273		} else {
274			error = g_access(cp, 1, 0, 1);
275			if (error != 0) {
276				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
277				    __func__, __LINE__, error);
278				return (NULL);
279			}
280			ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
281		}
282	}
283
284	if (vd != NULL)
285		vd->vdev_tsd = cp;
286
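	/*
	 * Opt in to GEOM direct dispatch: requests and completions for this
	 * consumer may be handed to us directly rather than always being
	 * queued through the g_down/g_up threads, reducing per-I/O latency.
	 */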
287	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
288	return (cp);
289}
290
291static void
292vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read)
293{
294	struct g_geom *gp;
295
296	g_topology_assert();
297
298	ZFS_LOG(1, "Detaching from %s.",
299	    cp->provider && cp->provider->name ? cp->provider->name : "NULL");
300
301	gp = cp->geom;
302	if (open_for_read)
303		g_access(cp, -1, 0, -1);
304	/* Destroy consumer on last close. */
305	if (cp->acr == 0 && cp->ace == 0) {
306		if (cp->acw > 0)
307			g_access(cp, 0, -cp->acw, 0);
308		if (cp->provider != NULL) {
309			ZFS_LOG(1, "Destroying consumer for %s.",
310			    cp->provider->name ? cp->provider->name : "NULL");
311			g_detach(cp);
312		}
313		g_destroy_consumer(cp);
314	}
315	/* Destroy geom if there are no consumers left. */
316	if (LIST_EMPTY(&gp->consumer)) {
317		ZFS_LOG(1, "Destroyed geom %s.", gp->name);
318		g_wither_geom(gp, ENXIO);
319	}
320}
321
322static void
323vdev_geom_close_locked(vdev_t *vd)
324{
325	struct g_consumer *cp;
326	struct consumer_priv_t *priv;
327	struct consumer_vdev_elem *elem, *elem_temp;
328
329	g_topology_assert();
330
331	cp = vd->vdev_tsd;
332	vd->vdev_delayed_close = B_FALSE;
333	if (cp == NULL)
334		return;
335
336	ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
337	KASSERT(cp->private != NULL, ("%s: cp->private is NULL", __func__));
338	priv = (struct consumer_priv_t*)&cp->private;
339	vd->vdev_tsd = NULL;
340	SLIST_FOREACH_SAFE(elem, priv, elems, elem_temp) {
341		if (elem->vd == vd) {
342			SLIST_REMOVE(priv, elem, consumer_vdev_elem, elems);
343			g_free(elem);
344		}
345	}
346
347	vdev_geom_detach(cp, B_TRUE);
348}
349
350/*
351 * Issue one or more bios to the vdev in parallel.
352 * cmds, datas, offsets, errors, and sizes are arrays of length ncmds.  Each IO
353 * operation is described by parallel entries from each array.  There may be
354 * more bios actually issued than entries in the arrays.
355 */
356static void
357vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets,
358    off_t *sizes, int *errors, int ncmds)
359{
360	struct bio **bios;
361	u_char *p;
362	off_t off, maxio, s, end;
363	int i, n_bios, j;
364	size_t bios_size;
365
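	/*
	 * maxio is the largest bio we will issue: MAXPHYS rounded down to a
	 * multiple of the provider's sector size.  Commands larger than this
	 * are split into several consecutive bios below.
	 */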
366	maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize);
367	n_bios = 0;
368
369	/* How many bios are required for all commands? */
370	for (i = 0; i < ncmds; i++)
371		n_bios += (sizes[i] + maxio - 1) / maxio;
372
373	/* Allocate memory for the bios */
374	bios_size = n_bios * sizeof(struct bio*);
375	bios = kmem_zalloc(bios_size, KM_SLEEP);
376
377	/* Prepare and issue all of the bios */
378	for (i = j = 0; i < ncmds; i++) {
379		off = offsets[i];
380		p = datas[i];
381		s = sizes[i];
382		end = off + s;
383		ASSERT((off % cp->provider->sectorsize) == 0);
384		ASSERT((s % cp->provider->sectorsize) == 0);
385
386		for (; off < end; off += maxio, p += maxio, s -= maxio, j++) {
387			bios[j] = g_alloc_bio();
388			bios[j]->bio_cmd = cmds[i];
389			bios[j]->bio_done = NULL;
390			bios[j]->bio_offset = off;
391			bios[j]->bio_length = MIN(s, maxio);
392			bios[j]->bio_data = p;
393			g_io_request(bios[j], cp);
394		}
395	}
396	ASSERT(j == n_bios);
397
398	/* Wait for all of the bios to complete, and clean them up */
399	for (i = j = 0; i < ncmds; i++) {
400		off = offsets[i];
401		s = sizes[i];
402		end = off + s;
403
404		for (; off < end; off += maxio, s -= maxio, j++) {
405			errors[i] = biowait(bios[j], "vdev_geom_io") || errors[i];
406			g_destroy_bio(bios[j]);
407		}
408	}
409	kmem_free(bios, bios_size);
410}
411
412/*
413 * Read the vdev config from a device.  Return the number of valid labels that
414 * were found.  The vdev config will be returned in config if and only if at
415 * least one valid label was found.
416 */
417static int
418vdev_geom_read_config(struct g_consumer *cp, nvlist_t **config)
419{
420	struct g_provider *pp;
421	vdev_phys_t *vdev_lists[VDEV_LABELS];
422	char *buf;
423	size_t buflen;
424	uint64_t psize, state, txg;
425	off_t offsets[VDEV_LABELS];
426	off_t size;
427	off_t sizes[VDEV_LABELS];
428	int cmds[VDEV_LABELS];
429	int errors[VDEV_LABELS];
430	int l, nlabels;
431
432	g_topology_assert_not();
433
434	pp = cp->provider;
435	ZFS_LOG(1, "Reading config from %s...", pp->name);
436
437	psize = pp->mediasize;
438	psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t));
439
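	/*
	 * Round the size of a vdev_phys_t up to a multiple of the provider's
	 * sector size so that the label reads below are properly aligned.
	 */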
440	size = sizeof(*vdev_lists[0]) + pp->sectorsize -
441	    ((sizeof(*vdev_lists[0]) - 1) % pp->sectorsize) - 1;
442
443	buflen = sizeof(vdev_lists[0]->vp_nvlist);
444
445	*config = NULL;
446	/* Create all of the IO requests */
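	/*
	 * Labels 0 and 1 are located at the front of the device and labels 2
	 * and 3 at the end (see vdev_label_offset()).  VDEV_SKIP_SIZE skips
	 * the padding (historically blank space and boot header) that
	 * precedes the vp_nvlist data within each label.
	 */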
447	for (l = 0; l < VDEV_LABELS; l++) {
448		cmds[l] = BIO_READ;
449		vdev_lists[l] = kmem_alloc(size, KM_SLEEP);
450		offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE;
451		sizes[l] = size;
452		errors[l] = 0;
453		ASSERT(offsets[l] % pp->sectorsize == 0);
454	}
455
456	/* Issue the IO requests */
457	vdev_geom_io(cp, cmds, (void**)vdev_lists, offsets, sizes, errors,
458	    VDEV_LABELS);
459
460	/* Parse the labels */
461	nlabels = 0;
462	for (l = 0; l < VDEV_LABELS; l++) {
463		if (errors[l] != 0)
464			continue;
465
466		buf = vdev_lists[l]->vp_nvlist;
467
468		if (nvlist_unpack(buf, buflen, config, 0) != 0)
469			continue;
470
471		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
472		    &state) != 0 || state > POOL_STATE_L2CACHE) {
473			nvlist_free(*config);
474			*config = NULL;
475			continue;
476		}
477
478		if (state != POOL_STATE_SPARE &&
479		    state != POOL_STATE_L2CACHE &&
480		    (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
481		    &txg) != 0 || txg == 0)) {
482			nvlist_free(*config);
483			*config = NULL;
484			continue;
485		}
486
487		nlabels++;
488	}
489
490	/* Free the label storage */
491	for (l = 0; l < VDEV_LABELS; l++)
492		kmem_free(vdev_lists[l], size);
493
494	return (nlabels);
495}
496
497static void
498resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id)
499{
500	nvlist_t **new_configs;
501	uint64_t i;
502
503	if (id < *count)
504		return;
505	new_configs = kmem_zalloc((id + 1) * sizeof(nvlist_t *),
506	    KM_SLEEP);
507	for (i = 0; i < *count; i++)
508		new_configs[i] = (*configs)[i];
509	if (*configs != NULL)
510		kmem_free(*configs, *count * sizeof(void *));
511	*configs = new_configs;
512	*count = id + 1;
513}
514
515static void
516process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg,
517    const char *name, uint64_t* known_pool_guid)
518{
519	nvlist_t *vdev_tree;
520	uint64_t pool_guid;
521	uint64_t vdev_guid, known_guid;
522	uint64_t id, txg, known_txg;
523	char *pname;
524	int i;
525
526	if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 ||
527	    strcmp(pname, name) != 0)
528		goto ignore;
529
530	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
531		goto ignore;
532
533	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0)
534		goto ignore;
535
536	if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0)
537		goto ignore;
538
539	if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0)
540		goto ignore;
541
542	VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
543
544	if (*known_pool_guid != 0) {
545		if (pool_guid != *known_pool_guid)
546			goto ignore;
547	} else
548		*known_pool_guid = pool_guid;
549
550	resize_configs(configs, count, id);
551
552	if ((*configs)[id] != NULL) {
553		VERIFY(nvlist_lookup_uint64((*configs)[id],
554		    ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0);
555		if (txg <= known_txg)
556			goto ignore;
557		nvlist_free((*configs)[id]);
558	}
559
560	(*configs)[id] = cfg;
561	return;
562
563ignore:
564	nvlist_free(cfg);
565}
566
567int
568vdev_geom_read_pool_label(const char *name,
569    nvlist_t ***configs, uint64_t *count)
570{
571	struct g_class *mp;
572	struct g_geom *gp;
573	struct g_provider *pp;
574	struct g_consumer *zcp;
575	nvlist_t *vdev_cfg;
576	uint64_t pool_guid;
577	int error, nlabels;
578
579	DROP_GIANT();
580	g_topology_lock();
581
582	*configs = NULL;
583	*count = 0;
584	pool_guid = 0;
585	LIST_FOREACH(mp, &g_classes, class) {
586		if (mp == &zfs_vdev_class)
587			continue;
588		LIST_FOREACH(gp, &mp->geom, geom) {
589			if (gp->flags & G_GEOM_WITHER)
590				continue;
591			LIST_FOREACH(pp, &gp->provider, provider) {
592				if (pp->flags & G_PF_WITHER)
593					continue;
594				zcp = vdev_geom_attach(pp, NULL, B_TRUE);
595				if (zcp == NULL)
596					continue;
597				g_topology_unlock();
598				nlabels = vdev_geom_read_config(zcp, &vdev_cfg);
599				g_topology_lock();
600				vdev_geom_detach(zcp, B_TRUE);
601				if (nlabels == 0)
602					continue;
603				ZFS_LOG(1, "successfully read vdev config");
604
605				process_vdev_config(configs, count,
606				    vdev_cfg, name, &pool_guid);
607			}
608		}
609	}
610	g_topology_unlock();
611	PICKUP_GIANT();
612
613	return (*count > 0 ? 0 : ENOENT);
614}
615
616enum match {
617	NO_MATCH = 0,		/* No matching labels found */
618	TOPGUID_MATCH = 1,	/* Labels match top guid, not vdev guid */
619	ZERO_MATCH = 1,		/* Should never be returned */
620	ONE_MATCH = 2,		/* 1 label matches the vdev_guid */
621	TWO_MATCH = 3,		/* 2 labels match the vdev_guid */
622	THREE_MATCH = 4,	/* 3 labels match the vdev_guid */
623	FULL_MATCH = 5		/* all labels match the vdev_guid */
624};
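/*
 * vdev_attach_ok() maps a label count onto this enum by returning
 * ZERO_MATCH + nlabels, so a device whose four labels all match the
 * vdev_guid yields FULL_MATCH.
 */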
625
626static enum match
627vdev_attach_ok(vdev_t *vd, struct g_provider *pp)
628{
629	nvlist_t *config;
630	uint64_t pool_guid, top_guid, vdev_guid;
631	struct g_consumer *cp;
632	int nlabels;
633
634	cp = vdev_geom_attach(pp, NULL, B_TRUE);
635	if (cp == NULL) {
636		ZFS_LOG(1, "Unable to attach tasting instance to %s.",
637		    pp->name);
638		return (NO_MATCH);
639	}
640	g_topology_unlock();
641	nlabels = vdev_geom_read_config(cp, &config);
642	g_topology_lock();
643	vdev_geom_detach(cp, B_TRUE);
644	if (nlabels == 0) {
645		ZFS_LOG(1, "Unable to read config from %s.", pp->name);
646		return (NO_MATCH);
647	}
648
649	pool_guid = 0;
650	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid);
651	top_guid = 0;
652	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid);
653	vdev_guid = 0;
654	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
655	nvlist_free(config);
656
657	/*
658	 * Check that the label's pool guid matches the desired guid.
659	 * Inactive spares and L2ARCs do not have any pool guid in the label.
660	 */
661	if (pool_guid != 0 && pool_guid != spa_guid(vd->vdev_spa)) {
662		ZFS_LOG(1, "pool guid mismatch for provider %s: %ju != %ju.",
663		    pp->name,
664		    (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)pool_guid);
665		return (NO_MATCH);
666	}
667
668	/*
669	 * Check that the label's vdev guid matches the desired guid.
670	 * The second condition handles a possible race on vdev detach, when the
671	 * remaining vdev receives the GUID of the destroyed top-level mirror vdev.
672	 */
673	if (vdev_guid == vd->vdev_guid) {
674		ZFS_LOG(1, "guids match for provider %s.", pp->name);
675		return (ZERO_MATCH + nlabels);
676	} else if (top_guid == vd->vdev_guid && vd == vd->vdev_top) {
677		ZFS_LOG(1, "top vdev guid match for provider %s.", pp->name);
678		return (TOPGUID_MATCH);
679	}
680	ZFS_LOG(1, "vdev guid mismatch for provider %s: %ju != %ju.",
681	    pp->name, (uintmax_t)vd->vdev_guid, (uintmax_t)vdev_guid);
682	return (NO_MATCH);
683}
684
685static struct g_consumer *
686vdev_geom_attach_by_guids(vdev_t *vd)
687{
688	struct g_class *mp;
689	struct g_geom *gp;
690	struct g_provider *pp, *best_pp;
691	struct g_consumer *cp;
692	enum match match, best_match;
693
694	g_topology_assert();
695
696	cp = NULL;
697	best_pp = NULL;
698	best_match = NO_MATCH;
699	LIST_FOREACH(mp, &g_classes, class) {
700		if (mp == &zfs_vdev_class)
701			continue;
702		LIST_FOREACH(gp, &mp->geom, geom) {
703			if (gp->flags & G_GEOM_WITHER)
704				continue;
705			LIST_FOREACH(pp, &gp->provider, provider) {
706				match = vdev_attach_ok(vd, pp);
707				if (match > best_match) {
708					best_match = match;
709					best_pp = pp;
710				}
711				if (match == FULL_MATCH)
712					goto out;
713			}
714		}
715	}
716
717out:
718	if (best_pp) {
719		cp = vdev_geom_attach(best_pp, vd, B_TRUE);
720		if (cp == NULL) {
721			printf("ZFS WARNING: Unable to attach to %s.\n",
722			    best_pp->name);
723		}
724	}
725	return (cp);
726}
727
728static struct g_consumer *
729vdev_geom_open_by_guids(vdev_t *vd)
730{
731	struct g_consumer *cp;
732	char *buf;
733	size_t len;
734
735	g_topology_assert();
736
737	ZFS_LOG(1, "Searching by guids [%ju:%ju].",
738		(uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid);
739	cp = vdev_geom_attach_by_guids(vd);
740	if (cp != NULL) {
741		len = strlen(cp->provider->name) + strlen("/dev/") + 1;
742		buf = kmem_alloc(len, KM_SLEEP);
743
744		snprintf(buf, len, "/dev/%s", cp->provider->name);
745		spa_strfree(vd->vdev_path);
746		vd->vdev_path = buf;
747
748		ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.",
749		    (uintmax_t)spa_guid(vd->vdev_spa),
750		    (uintmax_t)vd->vdev_guid, cp->provider->name);
751	} else {
752		ZFS_LOG(1, "Search by guid [%ju:%ju] failed.",
753		    (uintmax_t)spa_guid(vd->vdev_spa),
754		    (uintmax_t)vd->vdev_guid);
755	}
756
757	return (cp);
758}
759
760static struct g_consumer *
761vdev_geom_open_by_path(vdev_t *vd, int check_guid)
762{
763	struct g_provider *pp;
764	struct g_consumer *cp;
765
766	g_topology_assert();
767
768	cp = NULL;
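	/*
	 * sizeof("/dev/") counts the terminating NUL, so subtracting one
	 * skips exactly the "/dev/" prefix of vdev_path.
	 */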
769	pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1);
770	if (pp != NULL) {
771		ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
772		if (!check_guid || vdev_attach_ok(vd, pp) == FULL_MATCH)
773			cp = vdev_geom_attach(pp, vd, B_FALSE);
774	}
775
776	return (cp);
777}
778
779static int
780vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
781    uint64_t *logical_ashift, uint64_t *physical_ashift)
782{
783	struct g_provider *pp;
784	struct g_consumer *cp;
785	size_t bufsize;
786	int error;
787
788	/* Set the TLS to indicate down the stack that we should not access zvols. */
789	VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0);
790
791	/*
792	 * We must have a pathname, and it must be absolute.
793	 */
794	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
795		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
796		return (EINVAL);
797	}
798
799	/*
800	 * Reopen the device if it's not currently open. Otherwise,
801	 * just update the physical size of the device.
802	 */
803	if ((cp = vd->vdev_tsd) != NULL) {
804		ASSERT(vd->vdev_reopening);
805		goto skip_open;
806	}
807
808	DROP_GIANT();
809	g_topology_lock();
810	error = 0;
811
812	if (vd->vdev_spa->spa_splitting_newspa ||
813	    (vd->vdev_prevstate == VDEV_STATE_UNKNOWN &&
814	     vd->vdev_spa->spa_load_state == SPA_LOAD_NONE ||
815	     vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)) {
816		/*
817		 * We are dealing with a vdev that hasn't been previously
818		 * opened (since boot), and we are not loading an
819		 * existing pool configuration.  This looks like a
820		 * vdev add operation to a new or existing pool.
821		 * Assume the user knows what he/she is doing and find the
822		 * GEOM provider by its name, ignoring GUID mismatches.
823		 *
824		 * XXPOLICY: It would be safer to only allow a device
825		 *           that is unlabeled or labeled but missing
826		 *           GUID information to be opened in this fashion,
827		 *           unless we are doing a split, in which case we
828		 *           should allow any guid.
829		 */
830		cp = vdev_geom_open_by_path(vd, 0);
831	} else {
832		/*
833		 * Try using the recorded path for this device, but only
834		 * accept it if its label data contains the expected GUIDs.
835		 */
836		cp = vdev_geom_open_by_path(vd, 1);
837		if (cp == NULL) {
838			/*
839			 * The device at vd->vdev_path doesn't have the
840			 * expected GUIDs. The disks might have merely
841			 * moved around so try all other GEOM providers
842			 * to find one with the right GUIDs.
843			 */
844			cp = vdev_geom_open_by_guids(vd);
845		}
846	}
847
848	/* Clear the TLS now that tasting is done */
849	VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0);
850
851	if (cp == NULL) {
852		ZFS_LOG(1, "Vdev %s not found.", vd->vdev_path);
853		error = ENOENT;
854	} else {
855		struct consumer_priv_t *priv;
856		struct consumer_vdev_elem *elem;
857		int spamode;
858
859		priv = (struct consumer_priv_t*)&cp->private;
860		if (cp->private == NULL)
861			SLIST_INIT(priv);
862		elem = g_malloc(sizeof(*elem), M_WAITOK|M_ZERO);
863		elem->vd = vd;
864		SLIST_INSERT_HEAD(priv, elem, elems);
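		/*
		 * cp->private holds a list rather than a single back pointer
		 * because one GEOM consumer may be shared by more than one
		 * vdev_t; the element added here is removed again in
		 * vdev_geom_close_locked().
		 */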
865
866		spamode = spa_mode(vd->vdev_spa);
867		if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
868		    !ISP2(cp->provider->sectorsize)) {
869			ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
870			    cp->provider->name);
871
872			vdev_geom_close_locked(vd);
873			error = EINVAL;
874			cp = NULL;
875		} else if (cp->acw == 0 && (spamode & FWRITE) != 0) {
876			int i;
877
878			for (i = 0; i < 5; i++) {
879				error = g_access(cp, 0, 1, 0);
880				if (error == 0)
881					break;
882				g_topology_unlock();
883				tsleep(vd, 0, "vdev", hz / 2);
884				g_topology_lock();
885			}
886			if (error != 0) {
887				printf("ZFS WARNING: Unable to open %s for writing (error=%d).\n",
888				    cp->provider->name, error);
889				vdev_geom_close_locked(vd);
890				cp = NULL;
891			}
892		}
893	}
894
895	/* Fetch initial physical path information for this device. */
896	if (cp != NULL) {
897		vdev_geom_attrchanged(cp, "GEOM::physpath");
898
899		/* Set other GEOM characteristics */
900		vdev_geom_set_physpath(vd, cp, /*do_null_update*/B_FALSE);
901		vdev_geom_set_rotation_rate(vd, cp);
902	}
903
904	g_topology_unlock();
905	PICKUP_GIANT();
906	if (cp == NULL) {
907		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
908		return (error);
909	}
910skip_open:
911	pp = cp->provider;
912
913	/*
914	 * Determine the actual size of the device.
915	 */
916	*max_psize = *psize = pp->mediasize;
917
918	/*
919	 * Determine the device's minimum transfer size and preferred
920	 * transfer size.
921	 */
922	*logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
923	*physical_ashift = 0;
924	if (pp->stripesize > (1 << *logical_ashift) && ISP2(pp->stripesize) &&
925	    pp->stripesize <= (1 << SPA_MAXASHIFT) && pp->stripeoffset == 0)
926		*physical_ashift = highbit(pp->stripesize) - 1;
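	/*
	 * For example, an "Advanced Format" disk reporting a 512-byte
	 * logical sector and a 4096-byte stripe ends up with a logical
	 * ashift of 9 and a physical ashift of 12.
	 */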
927
928	/*
929	 * Clear the nowritecache settings, so that on a vdev_reopen()
930	 * we will try again.
931	 */
932	vd->vdev_nowritecache = B_FALSE;
933
934	return (0);
935}
936
937static void
938vdev_geom_close(vdev_t *vd)
939{
940	struct g_consumer *cp;
941
942	cp = vd->vdev_tsd;
943
944	DROP_GIANT();
945	g_topology_lock();
946
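	/*
	 * Keep the consumer open across a reopen of a healthy device; tear
	 * it down only on a real close, or when the consumer was orphaned or
	 * its provider has gone into an error state.
	 */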
947	if (!vd->vdev_reopening ||
948	    (cp != NULL && ((cp->flags & G_CF_ORPHAN) != 0 ||
949	    (cp->provider != NULL && cp->provider->error != 0))))
950		vdev_geom_close_locked(vd);
951
952	g_topology_unlock();
953	PICKUP_GIANT();
954}
955
956static void
957vdev_geom_io_intr(struct bio *bp)
958{
959	vdev_t *vd;
960	zio_t *zio;
961
962	zio = bp->bio_caller1;
963	vd = zio->io_vd;
964	zio->io_error = bp->bio_error;
965	if (zio->io_error == 0 && bp->bio_resid != 0)
966		zio->io_error = SET_ERROR(EIO);
967
968	switch(zio->io_error) {
969	case ENOTSUP:
970		/*
971		 * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know
972		 * that future attempts will never succeed. In this case
973		 * we set a persistent flag so that we don't bother with
974		 * requests in the future.
975		 */
976		switch(bp->bio_cmd) {
977		case BIO_FLUSH:
978			vd->vdev_nowritecache = B_TRUE;
979			break;
980		case BIO_DELETE:
981			vd->vdev_notrim = B_TRUE;
982			break;
983		}
984		break;
985	case ENXIO:
986		if (!vd->vdev_remove_wanted) {
987			/*
988			 * If provider's error is set we assume it is being
989			 * removed.
990			 */
991			if (bp->bio_to->error != 0) {
992				vd->vdev_remove_wanted = B_TRUE;
993				spa_async_request(zio->io_spa,
994				    SPA_ASYNC_REMOVE);
995			} else if (!vd->vdev_delayed_close) {
996				vd->vdev_delayed_close = B_TRUE;
997			}
998		}
999		break;
1000	}
1001
1002	/*
1003	 * We have to split bio freeing into two parts, because the ABD code
1004	 * cannot be called in this context and vdev_op_io_done is not called
1005	 * for ZIO_TYPE_IOCTL zios.
1006	 */
1007	if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) {
1008		g_destroy_bio(bp);
1009		zio->io_bio = NULL;
1010	}
1011	zio_delay_interrupt(zio);
1012}
1013
1014static void
1015vdev_geom_io_start(zio_t *zio)
1016{
1017	vdev_t *vd;
1018	struct g_consumer *cp;
1019	struct bio *bp;
1020	int error;
1021
1022	vd = zio->io_vd;
1023
1024	switch (zio->io_type) {
1025	case ZIO_TYPE_IOCTL:
1026		/* XXPOLICY */
1027		if (!vdev_readable(vd)) {
1028			zio->io_error = SET_ERROR(ENXIO);
1029			zio_interrupt(zio);
1030			return;
1031		} else {
1032			switch (zio->io_cmd) {
1033			case DKIOCFLUSHWRITECACHE:
1034				if (zfs_nocacheflush || vdev_geom_bio_flush_disable)
1035					break;
1036				if (vd->vdev_nowritecache) {
1037					zio->io_error = SET_ERROR(ENOTSUP);
1038					break;
1039				}
1040				goto sendreq;
1041			default:
1042				zio->io_error = SET_ERROR(ENOTSUP);
1043			}
1044		}
1045
1046		zio_execute(zio);
1047		return;
1048	case ZIO_TYPE_FREE:
1049		if (vd->vdev_notrim) {
1050			zio->io_error = SET_ERROR(ENOTSUP);
1051		} else if (!vdev_geom_bio_delete_disable) {
1052			goto sendreq;
1053		}
1054		zio_execute(zio);
1055		return;
1056	}
1057sendreq:
1058	ASSERT(zio->io_type == ZIO_TYPE_READ ||
1059	    zio->io_type == ZIO_TYPE_WRITE ||
1060	    zio->io_type == ZIO_TYPE_FREE ||
1061	    zio->io_type == ZIO_TYPE_IOCTL);
1062
1063	cp = vd->vdev_tsd;
1064	if (cp == NULL) {
1065		zio->io_error = SET_ERROR(ENXIO);
1066		zio_interrupt(zio);
1067		return;
1068	}
1069	bp = g_alloc_bio();
1070	bp->bio_caller1 = zio;
1071	switch (zio->io_type) {
1072	case ZIO_TYPE_READ:
1073	case ZIO_TYPE_WRITE:
1074		zio->io_target_timestamp = zio_handle_io_delay(zio);
1075		bp->bio_offset = zio->io_offset;
1076		bp->bio_length = zio->io_size;
1077		if (zio->io_type == ZIO_TYPE_READ) {
1078			bp->bio_cmd = BIO_READ;
1079			bp->bio_data =
1080			    abd_borrow_buf(zio->io_abd, zio->io_size);
1081		} else {
1082			bp->bio_cmd = BIO_WRITE;
1083			bp->bio_data =
1084			    abd_borrow_buf_copy(zio->io_abd, zio->io_size);
1085		}
1086		break;
1087	case ZIO_TYPE_FREE:
1088		bp->bio_cmd = BIO_DELETE;
1089		bp->bio_data = NULL;
1090		bp->bio_offset = zio->io_offset;
1091		bp->bio_length = zio->io_size;
1092		break;
1093	case ZIO_TYPE_IOCTL:
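		/*
		 * The only ioctl that reaches this point is
		 * DKIOCFLUSHWRITECACHE; it becomes a BIO_FLUSH, and
		 * BIO_ORDERED asks lower layers to preserve its ordering
		 * relative to other queued I/O.
		 */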
1094		bp->bio_cmd = BIO_FLUSH;
1095		bp->bio_flags |= BIO_ORDERED;
1096		bp->bio_data = NULL;
1097		bp->bio_offset = cp->provider->mediasize;
1098		bp->bio_length = 0;
1099		break;
1100	}
1101	bp->bio_done = vdev_geom_io_intr;
1102	zio->io_bio = bp;
1103
1104	g_io_request(bp, cp);
1105}
1106
1107static void
1108vdev_geom_io_done(zio_t *zio)
1109{
1110	struct bio *bp = zio->io_bio;
1111
1112	if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) {
1113		ASSERT(bp == NULL);
1114		return;
1115	}
1116
1117	if (bp == NULL) {
1118		ASSERT3S(zio->io_error, ==, ENXIO);
1119		return;
1120	}
1121
1122	if (zio->io_type == ZIO_TYPE_READ)
1123		abd_return_buf_copy(zio->io_abd, bp->bio_data, zio->io_size);
1124	else
1125		abd_return_buf(zio->io_abd, bp->bio_data, zio->io_size);
1126
1127	g_destroy_bio(bp);
1128	zio->io_bio = NULL;
1129}
1130
1131static void
1132vdev_geom_hold(vdev_t *vd)
1133{
1134}
1135
1136static void
1137vdev_geom_rele(vdev_t *vd)
1138{
1139}
1140
1141vdev_ops_t vdev_geom_ops = {
1142	vdev_geom_open,
1143	vdev_geom_close,
1144	vdev_default_asize,
1145	vdev_geom_io_start,
1146	vdev_geom_io_done,
1147	NULL,
1148	vdev_geom_hold,
1149	vdev_geom_rele,
1150	NULL,
1151	VDEV_TYPE_DISK,		/* name of this vdev type */
1152	B_TRUE			/* leaf vdev */
1153};
1154