vdev_geom.c revision 339034
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
23 * All rights reserved.
24 *
25 * Portions Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>
26 */
27
28#include <sys/zfs_context.h>
29#include <sys/param.h>
30#include <sys/kernel.h>
31#include <sys/bio.h>
32#include <sys/disk.h>
33#include <sys/spa.h>
34#include <sys/spa_impl.h>
35#include <sys/vdev_impl.h>
36#include <sys/fs/zfs.h>
37#include <sys/zio.h>
38#include <geom/geom.h>
39#include <geom/geom_int.h>
40
41/*
42 * Virtual device vector for GEOM.
43 */
44
45static g_attrchanged_t vdev_geom_attrchanged;
46struct g_class zfs_vdev_class = {
47	.name = "ZFS::VDEV",
48	.version = G_VERSION,
49	.attrchanged = vdev_geom_attrchanged,
50};
51
52struct consumer_vdev_elem {
53	SLIST_ENTRY(consumer_vdev_elem)	elems;
54	vdev_t				*vd;
55};
56
57SLIST_HEAD(consumer_priv_t, consumer_vdev_elem);
58_Static_assert(sizeof(((struct g_consumer*)NULL)->private)
59    == sizeof(struct consumer_priv_t*),
60    "consumer_priv_t* can't be stored in g_consumer.private");
61
62DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);
63
64SYSCTL_DECL(_vfs_zfs_vdev);
65/* Don't send BIO_FLUSH. */
66static int vdev_geom_bio_flush_disable;
67SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RWTUN,
68    &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");
69/* Don't send BIO_DELETE. */
70static int vdev_geom_bio_delete_disable;
71SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RWTUN,
72    &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");
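/*
 * These knobs appear as vfs.zfs.vdev.bio_flush_disable and
 * vfs.zfs.vdev.bio_delete_disable; CTLFLAG_RWTUN makes them settable both as
 * loader tunables and at run time via sysctl(8).
 */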
73
74/* Declare local functions */
75static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read);
76
77/*
78 * Thread local storage used to indicate when a thread is probing geoms
79 * for their guids.  If NULL, this thread is not tasting geoms.  If non-NULL,
80 * it is looking for a replacement for the vdev_t* that is its value.
81 */
82uint_t zfs_geom_probe_vdev_key;
83
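/*
 * Query the provider's GEOM::rotation_rate attribute and cache it in the
 * vdev, falling back to VDEV_RATE_UNKNOWN if the attribute is unavailable.
 */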
84static void
85vdev_geom_set_rotation_rate(vdev_t *vd, struct g_consumer *cp)
86{
87	int error;
88	uint16_t rate;
89
90	error = g_getattr("GEOM::rotation_rate", cp, &rate);
91	if (error == 0)
92		vd->vdev_rotation_rate = rate;
93	else
94		vd->vdev_rotation_rate = VDEV_RATE_UNKNOWN;
95}
96
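/*
 * Query the provider's GEOM::physpath attribute and store a copy in
 * vd->vdev_physpath.  Request a config update if the path changed, or if it
 * was previously unset and the caller passed do_null_update.
 */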
97static void
98vdev_geom_set_physpath(vdev_t *vd, struct g_consumer *cp,
99		       boolean_t do_null_update)
100{
101	boolean_t needs_update = B_FALSE;
102	char *physpath;
103	int error, physpath_len;
104
105	physpath_len = MAXPATHLEN;
106	physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO);
107	error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath);
108	if (error == 0) {
109		char *old_physpath;
110
111		/* g_topology lock ensures that vdev has not been closed */
112		g_topology_assert();
113		old_physpath = vd->vdev_physpath;
114		vd->vdev_physpath = spa_strdup(physpath);
115
116		if (old_physpath != NULL) {
117			needs_update = (strcmp(old_physpath,
118						vd->vdev_physpath) != 0);
119			spa_strfree(old_physpath);
120		} else
121			needs_update = do_null_update;
122	}
123	g_free(physpath);
124
125	/*
126	 * If the physical path changed, update the config.
127	 * Only request an update for previously unset physpaths if
128	 * requested by the caller.
129	 */
130	if (needs_update)
131		spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE);
132
133}
134
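/*
 * GEOM attribute-change callback: refresh the cached rotation rate or
 * physical path for the vdev(s) sharing this consumer.
 */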
135static void
136vdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
137{
138	char *old_physpath;
139	struct consumer_priv_t *priv;
140	struct consumer_vdev_elem *elem;
141	int error;
142
143	priv = (struct consumer_priv_t*)&cp->private;
144	if (SLIST_EMPTY(priv))
145		return;
146
147	SLIST_FOREACH(elem, priv, elems) {
148		vdev_t *vd = elem->vd;
149		if (strcmp(attr, "GEOM::rotation_rate") == 0) {
150			vdev_geom_set_rotation_rate(vd, cp);
151			return;
152		}
153		if (strcmp(attr, "GEOM::physpath") == 0) {
154			vdev_geom_set_physpath(vd, cp, /*null_update*/B_TRUE);
155			return;
156		}
157	}
158}
159
160static void
161vdev_geom_orphan(struct g_consumer *cp)
162{
163	struct consumer_priv_t *priv;
164	struct consumer_vdev_elem *elem;
165
166	g_topology_assert();
167
168	priv = (struct consumer_priv_t*)&cp->private;
169	if (SLIST_EMPTY(priv))
170		/* Vdev close in progress.  Ignore the event. */
171		return;
172
173	/*
174	 * Orphan callbacks occur from the GEOM event thread.
175	 * Concurrent with this call, new I/O requests may be
176	 * working their way through GEOM about to find out
177	 * (only once executed by the g_down thread) that we've
178	 * been orphaned from our disk provider.  These I/Os
179	 * must be retired before we can detach our consumer.
180	 * This is most easily achieved by acquiring the
181	 * SPA ZIO configuration lock as a writer, but doing
182	 * so with the GEOM topology lock held would cause
183	 * a lock order reversal.  Instead, rely on the SPA's
184	 * async removal support to invoke a close on this
185	 * vdev once it is safe to do so.
186	 */
187	SLIST_FOREACH(elem, priv, elems) {
188		vdev_t *vd = elem->vd;
189
190		vd->vdev_remove_wanted = B_TRUE;
191		spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
192	}
193}
194
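/*
 * Attach a consumer to the given provider, creating the "zfs::vdev" geom on
 * first use.  The consumer is opened with one read and one exclusive
 * reference.  When 'sanity' is set, providers with an unsupported sector size
 * or a media size below SPA_MINDEVSIZE are rejected.  Returns NULL on failure.
 */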
195static struct g_consumer *
196vdev_geom_attach(struct g_provider *pp, vdev_t *vd, boolean_t sanity)
197{
198	struct g_geom *gp;
199	struct g_consumer *cp;
200	int error;
201
202	g_topology_assert();
203
204	ZFS_LOG(1, "Attaching to %s.", pp->name);
205
206	if (sanity) {
207		if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) {
208			ZFS_LOG(1, "Failing attach of %s. "
209				   "Incompatible sectorsize %d\n",
210			    pp->name, pp->sectorsize);
211			return (NULL);
212		} else if (pp->mediasize < SPA_MINDEVSIZE) {
213			ZFS_LOG(1, "Failing attach of %s. "
214				   "Incompatible mediasize %ju\n",
215			    pp->name, pp->mediasize);
216			return (NULL);
217		}
218	}
219
220	/* Do we already have a geom?  If not, create one. */
221	LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
222		if (gp->flags & G_GEOM_WITHER)
223			continue;
224		if (strcmp(gp->name, "zfs::vdev") != 0)
225			continue;
226		break;
227	}
228	if (gp == NULL) {
229		gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
230		gp->orphan = vdev_geom_orphan;
231		gp->attrchanged = vdev_geom_attrchanged;
232		cp = g_new_consumer(gp);
233		error = g_attach(cp, pp);
234		if (error != 0) {
235			ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__,
236			    __LINE__, error);
237			vdev_geom_detach(cp, B_FALSE);
238			return (NULL);
239		}
240		error = g_access(cp, 1, 0, 1);
241		if (error != 0) {
242			ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__,
243			       __LINE__, error);
244			vdev_geom_detach(cp, B_FALSE);
245			return (NULL);
246		}
247		ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
248	} else {
249		/* Check if we are already connected to this provider. */
250		LIST_FOREACH(cp, &gp->consumer, consumer) {
251			if (cp->provider == pp) {
252				ZFS_LOG(1, "Found consumer for %s.", pp->name);
253				break;
254			}
255		}
256		if (cp == NULL) {
257			cp = g_new_consumer(gp);
258			error = g_attach(cp, pp);
259			if (error != 0) {
260				ZFS_LOG(1, "%s(%d): g_attach failed: %d\n",
261				    __func__, __LINE__, error);
262				vdev_geom_detach(cp, B_FALSE);
263				return (NULL);
264			}
265			error = g_access(cp, 1, 0, 1);
266			if (error != 0) {
267				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
268				    __func__, __LINE__, error);
269				vdev_geom_detach(cp, B_FALSE);
270				return (NULL);
271			}
272			ZFS_LOG(1, "Created consumer for %s.", pp->name);
273		} else {
274			error = g_access(cp, 1, 0, 1);
275			if (error != 0) {
276				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
277				    __func__, __LINE__, error);
278				return (NULL);
279			}
280			ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
281		}
282	}
283
284	if (vd != NULL)
285		vd->vdev_tsd = cp;
286
287	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
288	return (cp);
289}
290
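/*
 * Drop the read and exclusive references taken at attach time (when
 * open_for_read), destroy the consumer once all of its access counts reach
 * zero, and wither the geom when its last consumer goes away.
 */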
291static void
292vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read)
293{
294	struct g_geom *gp;
295
296	g_topology_assert();
297
298	ZFS_LOG(1, "Detaching from %s.",
299	    cp->provider && cp->provider->name ? cp->provider->name : "NULL");
300
301	gp = cp->geom;
302	if (open_for_read)
303		g_access(cp, -1, 0, -1);
304	/* Destroy consumer on last close. */
305	if (cp->acr == 0 && cp->ace == 0) {
306		if (cp->acw > 0)
307			g_access(cp, 0, -cp->acw, 0);
308		if (cp->provider != NULL) {
309			ZFS_LOG(1, "Destroying consumer for %s.",
310			    cp->provider->name ? cp->provider->name : "NULL");
311			g_detach(cp);
312		}
313		g_destroy_consumer(cp);
314	}
315	/* Destroy geom if there are no consumers left. */
316	if (LIST_EMPTY(&gp->consumer)) {
317		ZFS_LOG(1, "Destroyed geom %s.", gp->name);
318		g_wither_geom(gp, ENXIO);
319	}
320}
321
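/*
 * Close a vdev's consumer with the topology lock held: unlink the vdev from
 * the consumer's private list, clear vdev_tsd, and release our references.
 */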
322static void
323vdev_geom_close_locked(vdev_t *vd)
324{
325	struct g_consumer *cp;
326	struct consumer_priv_t *priv;
327	struct consumer_vdev_elem *elem, *elem_temp;
328
329	g_topology_assert();
330
331	cp = vd->vdev_tsd;
332	vd->vdev_delayed_close = B_FALSE;
333	if (cp == NULL)
334		return;
335
336	ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
337	KASSERT(cp->private != NULL, ("%s: cp->private is NULL", __func__));
338	priv = (struct consumer_priv_t*)&cp->private;
339	vd->vdev_tsd = NULL;
340	SLIST_FOREACH_SAFE(elem, priv, elems, elem_temp) {
341		if (elem->vd == vd) {
342			SLIST_REMOVE(priv, elem, consumer_vdev_elem, elems);
343			g_free(elem);
344		}
345	}
346
347	vdev_geom_detach(cp, B_TRUE);
348}
349
350/*
351 * Issue one or more bios to the vdev in parallel.  cmds, datas, offsets,
352 * errors, and sizes are arrays of length ncmds.  Each IO operation is
353 * described by parallel entries from each array.  More bios may actually be
354 * issued than there are entries in the arrays.
355 */
356static void
357vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets,
358    off_t *sizes, int *errors, int ncmds)
359{
360	struct bio **bios;
361	u_char *p;
362	off_t off, maxio, s, end;
363	int i, n_bios, j;
364	size_t bios_size;
365
366	maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize);
367	n_bios = 0;
368
369	/* How many bios are required for all commands? */
370	for (i = 0; i < ncmds; i++)
371		n_bios += (sizes[i] + maxio - 1) / maxio;
372
373	/* Allocate memory for the bios */
374	bios_size = n_bios * sizeof(struct bio*);
375	bios = kmem_zalloc(bios_size, KM_SLEEP);
376
377	/* Prepare and issue all of the bios */
378	for (i = j = 0; i < ncmds; i++) {
379		off = offsets[i];
380		p = datas[i];
381		s = sizes[i];
382		end = off + s;
383		ASSERT((off % cp->provider->sectorsize) == 0);
384		ASSERT((s % cp->provider->sectorsize) == 0);
385
386		for (; off < end; off += maxio, p += maxio, s -= maxio, j++) {
387			bios[j] = g_alloc_bio();
388			bios[j]->bio_cmd = cmds[i];
389			bios[j]->bio_done = NULL;
390			bios[j]->bio_offset = off;
391			bios[j]->bio_length = MIN(s, maxio);
392			bios[j]->bio_data = p;
393			g_io_request(bios[j], cp);
394		}
395	}
396	ASSERT(j == n_bios);
397
398	/* Wait for all of the bios to complete, and clean them up */
399	for (i = j = 0; i < ncmds; i++) {
400		off = offsets[i];
401		s = sizes[i];
402		end = off + s;
403
404		for (; off < end; off += maxio, s -= maxio, j++) {
405			errors[i] = biowait(bios[j], "vdev_geom_io") || errors[i];
406			g_destroy_bio(bios[j]);
407		}
408	}
409	kmem_free(bios, bios_size);
410}
411
412/*
413 * Read the vdev config from a device.  Return the number of valid labels that
414 * were found.  The vdev config will be returned in *configp if and only if
415 * at least one valid label was found.
416 */
417static int
418vdev_geom_read_config(struct g_consumer *cp, nvlist_t **configp)
419{
420	struct g_provider *pp;
421	nvlist_t *config;
422	vdev_phys_t *vdev_lists[VDEV_LABELS];
423	char *buf;
424	size_t buflen;
425	uint64_t psize, state, txg;
426	off_t offsets[VDEV_LABELS];
427	off_t size;
428	off_t sizes[VDEV_LABELS];
429	int cmds[VDEV_LABELS];
430	int errors[VDEV_LABELS];
431	int l, nlabels;
432
433	g_topology_assert_not();
434
435	pp = cp->provider;
436	ZFS_LOG(1, "Reading config from %s...", pp->name);
437
438	psize = pp->mediasize;
439	psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t));
440
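	/* Round sizeof (vdev_phys_t) up to a multiple of the sector size. */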
441	size = sizeof(*vdev_lists[0]) + pp->sectorsize -
442	    ((sizeof(*vdev_lists[0]) - 1) % pp->sectorsize) - 1;
443
444	buflen = sizeof(vdev_lists[0]->vp_nvlist);
445
446	/* Create all of the IO requests */
447	for (l = 0; l < VDEV_LABELS; l++) {
448		cmds[l] = BIO_READ;
449		vdev_lists[l] = kmem_alloc(size, KM_SLEEP);
450		offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE;
451		sizes[l] = size;
452		errors[l] = 0;
453		ASSERT(offsets[l] % pp->sectorsize == 0);
454	}
455
456	/* Issue the IO requests */
457	vdev_geom_io(cp, cmds, (void**)vdev_lists, offsets, sizes, errors,
458	    VDEV_LABELS);
459
460	/* Parse the labels */
461	config = *configp = NULL;
462	nlabels = 0;
463	for (l = 0; l < VDEV_LABELS; l++) {
464		if (errors[l] != 0)
465			continue;
466
467		buf = vdev_lists[l]->vp_nvlist;
468
469		if (nvlist_unpack(buf, buflen, &config, 0) != 0)
470			continue;
471
472		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
473		    &state) != 0 || state > POOL_STATE_L2CACHE) {
474			nvlist_free(config);
475			continue;
476		}
477
478		if (state != POOL_STATE_SPARE &&
479		    state != POOL_STATE_L2CACHE &&
480		    (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
481		    &txg) != 0 || txg == 0)) {
482			nvlist_free(config);
483			continue;
484		}
485
486		if (*configp != NULL)
487			nvlist_free(*configp);
488		*configp = config;
489
490		nlabels++;
491	}
492
493	/* Free the label storage */
494	for (l = 0; l < VDEV_LABELS; l++)
495		kmem_free(vdev_lists[l], size);
496
497	return (nlabels);
498}
499
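/*
 * Grow the configs array, if necessary, so that index 'id' is valid;
 * existing entries are preserved.
 */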
500static void
501resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id)
502{
503	nvlist_t **new_configs;
504	uint64_t i;
505
506	if (id < *count)
507		return;
508	new_configs = kmem_zalloc((id + 1) * sizeof(nvlist_t *),
509	    KM_SLEEP);
510	for (i = 0; i < *count; i++)
511		new_configs[i] = (*configs)[i];
512	if (*configs != NULL)
513		kmem_free(*configs, *count * sizeof(void *));
514	*configs = new_configs;
515	*count = id + 1;
516}
517
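/*
 * Fold one label config into the configs array, indexed by top-level vdev id.
 * Configs that belong to another pool or lack required fields are freed and
 * ignored; for each id only the config with the highest txg is kept.
 */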
518static void
519process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg,
520    const char *name, uint64_t* known_pool_guid)
521{
522	nvlist_t *vdev_tree;
523	uint64_t pool_guid;
524	uint64_t vdev_guid, known_guid;
525	uint64_t id, txg, known_txg;
526	char *pname;
527	int i;
528
529	if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 ||
530	    strcmp(pname, name) != 0)
531		goto ignore;
532
533	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
534		goto ignore;
535
536	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0)
537		goto ignore;
538
539	if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0)
540		goto ignore;
541
542	if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0)
543		goto ignore;
544
545	VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
546
547	if (*known_pool_guid != 0) {
548		if (pool_guid != *known_pool_guid)
549			goto ignore;
550	} else
551		*known_pool_guid = pool_guid;
552
553	resize_configs(configs, count, id);
554
555	if ((*configs)[id] != NULL) {
556		VERIFY(nvlist_lookup_uint64((*configs)[id],
557		    ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0);
558		if (txg <= known_txg)
559			goto ignore;
560		nvlist_free((*configs)[id]);
561	}
562
563	(*configs)[id] = cfg;
564	return;
565
566ignore:
567	nvlist_free(cfg);
568}
569
570int
571vdev_geom_read_pool_label(const char *name,
572    nvlist_t ***configs, uint64_t *count)
573{
574	struct g_class *mp;
575	struct g_geom *gp;
576	struct g_provider *pp;
577	struct g_consumer *zcp;
578	nvlist_t *vdev_cfg;
579	uint64_t pool_guid;
580	int error, nlabels;
581
582	DROP_GIANT();
583	g_topology_lock();
584
585	*configs = NULL;
586	*count = 0;
587	pool_guid = 0;
588	LIST_FOREACH(mp, &g_classes, class) {
589		if (mp == &zfs_vdev_class)
590			continue;
591		LIST_FOREACH(gp, &mp->geom, geom) {
592			if (gp->flags & G_GEOM_WITHER)
593				continue;
594			LIST_FOREACH(pp, &gp->provider, provider) {
595				if (pp->flags & G_PF_WITHER)
596					continue;
597				zcp = vdev_geom_attach(pp, NULL, B_TRUE);
598				if (zcp == NULL)
599					continue;
600				g_topology_unlock();
601				nlabels = vdev_geom_read_config(zcp, &vdev_cfg);
602				g_topology_lock();
603				vdev_geom_detach(zcp, B_TRUE);
604				if (nlabels == 0)
605					continue;
606				ZFS_LOG(1, "successfully read vdev config");
607
608				process_vdev_config(configs, count,
609				    vdev_cfg, name, &pool_guid);
610			}
611		}
612	}
613	g_topology_unlock();
614	PICKUP_GIANT();
615
616	return (*count > 0 ? 0 : ENOENT);
617}
618
619enum match {
620	NO_MATCH = 0,		/* No matching labels found */
621	TOPGUID_MATCH = 1,	/* Labels match top guid, not vdev guid */
622	ZERO_MATCH = 1,		/* Should never be returned */
623	ONE_MATCH = 2,		/* 1 label matching the vdev_guid */
624	TWO_MATCH = 3,		/* 2 labels matching the vdev_guid */
625	THREE_MATCH = 4,	/* 3 labels matching the vdev_guid */
626	FULL_MATCH = 5		/* all labels match the vdev_guid */
627};
628
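/*
 * Taste the provider: attach temporarily, read its labels, and grade how well
 * they match the vdev, either by vdev guid (scored by the number of labels
 * found) or only by the top-level vdev guid.
 */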
629static enum match
630vdev_attach_ok(vdev_t *vd, struct g_provider *pp)
631{
632	nvlist_t *config;
633	uint64_t pool_guid, top_guid, vdev_guid;
634	struct g_consumer *cp;
635	int nlabels;
636
637	cp = vdev_geom_attach(pp, NULL, B_TRUE);
638	if (cp == NULL) {
639		ZFS_LOG(1, "Unable to attach tasting instance to %s.",
640		    pp->name);
641		return (NO_MATCH);
642	}
643	g_topology_unlock();
644	nlabels = vdev_geom_read_config(cp, &config);
645	g_topology_lock();
646	vdev_geom_detach(cp, B_TRUE);
647	if (nlabels == 0) {
648		ZFS_LOG(1, "Unable to read config from %s.", pp->name);
649		return (NO_MATCH);
650	}
651
652	pool_guid = 0;
653	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid);
654	top_guid = 0;
655	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid);
656	vdev_guid = 0;
657	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
658	nvlist_free(config);
659
660	/*
661	 * Check that the label's pool guid matches the desired guid.
662	 * Inactive spares and L2ARCs do not have any pool guid in the label.
663	 */
664	if (pool_guid != 0 && pool_guid != spa_guid(vd->vdev_spa)) {
665		ZFS_LOG(1, "pool guid mismatch for provider %s: %ju != %ju.",
666		    pp->name,
667		    (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)pool_guid);
668		return (NO_MATCH);
669	}
670
671	/*
672	 * Check that the label's vdev guid matches the desired guid.
673	 * The second condition handles a possible race on vdev detach, when the
674	 * remaining vdev receives the GUID of the destroyed top-level mirror vdev.
675	 */
676	if (vdev_guid == vd->vdev_guid) {
677		ZFS_LOG(1, "guids match for provider %s.", pp->name);
678		return (ZERO_MATCH + nlabels);
679	} else if (top_guid == vd->vdev_guid && vd == vd->vdev_top) {
680		ZFS_LOG(1, "top vdev guid match for provider %s.", pp->name);
681		return (TOPGUID_MATCH);
682	}
683	ZFS_LOG(1, "vdev guid mismatch for provider %s: %ju != %ju.",
684	    pp->name, (uintmax_t)vd->vdev_guid, (uintmax_t)vdev_guid);
685	return (NO_MATCH);
686}
687
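/*
 * Scan all GEOM providers (other than our own class) and attach the vdev to
 * the provider whose labels match its guids best; a FULL_MATCH ends the
 * search early.
 */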
688static struct g_consumer *
689vdev_geom_attach_by_guids(vdev_t *vd)
690{
691	struct g_class *mp;
692	struct g_geom *gp;
693	struct g_provider *pp, *best_pp;
694	struct g_consumer *cp;
695	enum match match, best_match;
696
697	g_topology_assert();
698
699	cp = NULL;
700	best_pp = NULL;
701	best_match = NO_MATCH;
702	LIST_FOREACH(mp, &g_classes, class) {
703		if (mp == &zfs_vdev_class)
704			continue;
705		LIST_FOREACH(gp, &mp->geom, geom) {
706			if (gp->flags & G_GEOM_WITHER)
707				continue;
708			LIST_FOREACH(pp, &gp->provider, provider) {
709				match = vdev_attach_ok(vd, pp);
710				if (match > best_match) {
711					best_match = match;
712					best_pp = pp;
713				}
714				if (match == FULL_MATCH)
715					goto out;
716			}
717		}
718	}
719
720out:
721	if (best_pp) {
722		cp = vdev_geom_attach(best_pp, vd, B_TRUE);
723		if (cp == NULL) {
724			printf("ZFS WARNING: Unable to attach to %s.\n",
725			    best_pp->name);
726		}
727	}
728	return (cp);
729}
730
731static struct g_consumer *
732vdev_geom_open_by_guids(vdev_t *vd)
733{
734	struct g_consumer *cp;
735	char *buf;
736	size_t len;
737
738	g_topology_assert();
739
740	ZFS_LOG(1, "Searching by guids [%ju:%ju].",
741		(uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid);
742	cp = vdev_geom_attach_by_guids(vd);
743	if (cp != NULL) {
744		len = strlen(cp->provider->name) + strlen("/dev/") + 1;
745		buf = kmem_alloc(len, KM_SLEEP);
746
747		snprintf(buf, len, "/dev/%s", cp->provider->name);
748		spa_strfree(vd->vdev_path);
749		vd->vdev_path = buf;
750
751		ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.",
752		    (uintmax_t)spa_guid(vd->vdev_spa),
753		    (uintmax_t)vd->vdev_guid, cp->provider->name);
754	} else {
755		ZFS_LOG(1, "Search by guid [%ju:%ju] failed.",
756		    (uintmax_t)spa_guid(vd->vdev_spa),
757		    (uintmax_t)vd->vdev_guid);
758	}
759
760	return (cp);
761}
762
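/*
 * Attach using the provider named by vd->vdev_path (without the /dev/
 * prefix).  When check_guid is set, only accept a provider whose labels
 * fully match the vdev's guids.
 */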
763static struct g_consumer *
764vdev_geom_open_by_path(vdev_t *vd, int check_guid)
765{
766	struct g_provider *pp;
767	struct g_consumer *cp;
768
769	g_topology_assert();
770
771	cp = NULL;
772	pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1);
773	if (pp != NULL) {
774		ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
775		if (!check_guid || vdev_attach_ok(vd, pp) == FULL_MATCH)
776			cp = vdev_geom_attach(pp, vd, B_FALSE);
777	}
778
779	return (cp);
780}
781
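/*
 * Open entry point: locate a provider by path or, failing that, by label
 * guids; register the vdev on the consumer's private list; obtain write
 * access when the pool is writable; and report the media size and the
 * logical/physical ashift.
 */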
782static int
783vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
784    uint64_t *logical_ashift, uint64_t *physical_ashift)
785{
786	struct g_provider *pp;
787	struct g_consumer *cp;
788	size_t bufsize;
789	int error;
790
791	/* Set the TLS to indicate downstack that we should not access zvols. */
792	VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0);
793
794	/*
795	 * We must have a pathname, and it must be absolute.
796	 */
797	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
798		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
799		return (EINVAL);
800	}
801
802	/*
803	 * Reopen the device if it's not currently open. Otherwise,
804	 * just update the physical size of the device.
805	 */
806	if ((cp = vd->vdev_tsd) != NULL) {
807		ASSERT(vd->vdev_reopening);
808		goto skip_open;
809	}
810
811	DROP_GIANT();
812	g_topology_lock();
813	error = 0;
814
815	if (vd->vdev_spa->spa_splitting_newspa ||
816	    (vd->vdev_prevstate == VDEV_STATE_UNKNOWN &&
817	     vd->vdev_spa->spa_load_state == SPA_LOAD_NONE ||
818	     vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)) {
819		/*
820		 * We are dealing with a vdev that hasn't been previously
821		 * opened (since boot), and we are not loading an
822		 * existing pool configuration.  This looks like a
823		 * vdev add operation to a new or existing pool.
824		 * Assume the user knows what he/she is doing and find the
825		 * GEOM provider by its name, ignoring GUID mismatches.
826		 *
827		 * XXPOLICY: It would be safer to only allow a device
828		 *           that is unlabeled or labeled but missing
829		 *           GUID information to be opened in this fashion,
830		 *           unless we are doing a split, in which case we
831		 *           should allow any guid.
832		 */
833		cp = vdev_geom_open_by_path(vd, 0);
834	} else {
835		/*
836		 * Try using the recorded path for this device, but only
837		 * accept it if its label data contains the expected GUIDs.
838		 */
839		cp = vdev_geom_open_by_path(vd, 1);
840		if (cp == NULL) {
841			/*
842			 * The device at vd->vdev_path doesn't have the
843			 * expected GUIDs. The disks might have merely
844			 * moved around, so try all other GEOM providers
845			 * to find one with the right GUIDs.
846			 */
847			cp = vdev_geom_open_by_guids(vd);
848		}
849	}
850
851	/* Clear the TLS now that tasting is done */
852	VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0);
853
854	if (cp == NULL) {
855		ZFS_LOG(1, "Vdev %s not found.", vd->vdev_path);
856		error = ENOENT;
857	} else {
858		struct consumer_priv_t *priv;
859		struct consumer_vdev_elem *elem;
860		int spamode;
861
862		priv = (struct consumer_priv_t*)&cp->private;
863		if (cp->private == NULL)
864			SLIST_INIT(priv);
865		elem = g_malloc(sizeof(*elem), M_WAITOK|M_ZERO);
866		elem->vd = vd;
867		SLIST_INSERT_HEAD(priv, elem, elems);
868
869		spamode = spa_mode(vd->vdev_spa);
870		if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
871		    !ISP2(cp->provider->sectorsize)) {
872			ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
873			    cp->provider->name);
874
875			vdev_geom_close_locked(vd);
876			error = EINVAL;
877			cp = NULL;
878		} else if (cp->acw == 0 && (spamode & FWRITE) != 0) {
879			int i;
880
881			for (i = 0; i < 5; i++) {
882				error = g_access(cp, 0, 1, 0);
883				if (error == 0)
884					break;
885				g_topology_unlock();
886				tsleep(vd, 0, "vdev", hz / 2);
887				g_topology_lock();
888			}
889			if (error != 0) {
890				printf("ZFS WARNING: Unable to open %s for writing (error=%d).\n",
891				    cp->provider->name, error);
892				vdev_geom_close_locked(vd);
893				cp = NULL;
894			}
895		}
896	}
897
898	/* Fetch initial physical path information for this device. */
899	if (cp != NULL) {
900		vdev_geom_attrchanged(cp, "GEOM::physpath");
901
902		/* Set other GEOM characteristics */
903		vdev_geom_set_physpath(vd, cp, /*do_null_update*/B_FALSE);
904		vdev_geom_set_rotation_rate(vd, cp);
905	}
906
907	g_topology_unlock();
908	PICKUP_GIANT();
909	if (cp == NULL) {
910		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
911		vdev_dbgmsg(vd, "vdev_geom_open: failed to open [error=%d]",
912		    error);
913		return (error);
914	}
915skip_open:
916	pp = cp->provider;
917
918	/*
919	 * Determine the actual size of the device.
920	 */
921	*max_psize = *psize = pp->mediasize;
922
923	/*
924	 * Determine the device's minimum transfer size and preferred
925	 * transfer size.
926	 */
927	*logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
928	*physical_ashift = 0;
929	if (pp->stripesize > (1 << *logical_ashift) && ISP2(pp->stripesize) &&
930	    pp->stripesize <= (1 << SPA_MAXASHIFT) && pp->stripeoffset == 0)
931		*physical_ashift = highbit(pp->stripesize) - 1;
932
933	/*
934	 * Clear the nowritecache setting, so that on a vdev_reopen()
935	 * we will try again.
936	 */
937	vd->vdev_nowritecache = B_FALSE;
938
939	return (0);
940}
941
942static void
943vdev_geom_close(vdev_t *vd)
944{
945	struct g_consumer *cp;
946
947	cp = vd->vdev_tsd;
948
949	DROP_GIANT();
950	g_topology_lock();
951
952	if (!vd->vdev_reopening ||
953	    (cp != NULL && ((cp->flags & G_CF_ORPHAN) != 0 ||
954	    (cp->provider != NULL && cp->provider->error != 0))))
955		vdev_geom_close_locked(vd);
956
957	g_topology_unlock();
958	PICKUP_GIANT();
959}
960
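/*
 * bio completion callback: propagate the bio's error to the zio, latch
 * persistent ENOTSUP conditions (no cache flush / no TRIM), schedule removal
 * or a delayed close on ENXIO, free the bio for non-data zios, and complete
 * the zio.
 */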
961static void
962vdev_geom_io_intr(struct bio *bp)
963{
964	vdev_t *vd;
965	zio_t *zio;
966
967	zio = bp->bio_caller1;
968	vd = zio->io_vd;
969	zio->io_error = bp->bio_error;
970	if (zio->io_error == 0 && bp->bio_resid != 0)
971		zio->io_error = SET_ERROR(EIO);
972
973	switch(zio->io_error) {
974	case ENOTSUP:
975		/*
976		 * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE, we know
977		 * that future attempts will never succeed. In this case
978		 * we set a persistent flag so that we don't bother with
979		 * requests in the future.
980		 */
981		switch(bp->bio_cmd) {
982		case BIO_FLUSH:
983			vd->vdev_nowritecache = B_TRUE;
984			break;
985		case BIO_DELETE:
986			vd->vdev_notrim = B_TRUE;
987			break;
988		}
989		break;
990	case ENXIO:
991		if (!vd->vdev_remove_wanted) {
992			/*
993			 * If provider's error is set we assume it is being
994			 * If the provider's error is set, we assume it is being
995			 */
996			if (bp->bio_to->error != 0) {
997				vd->vdev_remove_wanted = B_TRUE;
998				spa_async_request(zio->io_spa,
999				    SPA_ASYNC_REMOVE);
1000			} else if (!vd->vdev_delayed_close) {
1001				vd->vdev_delayed_close = B_TRUE;
1002			}
1003		}
1004		break;
1005	}
1006
1007	/*
1008	 * We have to split bio freeing into two parts, because the ABD code
1009	 * cannot be called in this context and vdev_op_io_done is not called
1010	 * for ZIO_TYPE_IOCTL zios.
1011	 */
1012	if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) {
1013		g_destroy_bio(bp);
1014		zio->io_bio = NULL;
1015	}
1016	zio_delay_interrupt(zio);
1017}
1018
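/*
 * Start a zio: cache-flush ioctls become BIO_FLUSH (unless disabled or
 * unsupported), frees become BIO_DELETE, and reads/writes become
 * BIO_READ/BIO_WRITE on buffers borrowed from the zio's ABD.
 */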
1019static void
1020vdev_geom_io_start(zio_t *zio)
1021{
1022	vdev_t *vd;
1023	struct g_consumer *cp;
1024	struct bio *bp;
1025	int error;
1026
1027	vd = zio->io_vd;
1028
1029	switch (zio->io_type) {
1030	case ZIO_TYPE_IOCTL:
1031		/* XXPOLICY */
1032		if (!vdev_readable(vd)) {
1033			zio->io_error = SET_ERROR(ENXIO);
1034			zio_interrupt(zio);
1035			return;
1036		} else {
1037			switch (zio->io_cmd) {
1038			case DKIOCFLUSHWRITECACHE:
1039				if (zfs_nocacheflush || vdev_geom_bio_flush_disable)
1040					break;
1041				if (vd->vdev_nowritecache) {
1042					zio->io_error = SET_ERROR(ENOTSUP);
1043					break;
1044				}
1045				goto sendreq;
1046			default:
1047				zio->io_error = SET_ERROR(ENOTSUP);
1048			}
1049		}
1050
1051		zio_execute(zio);
1052		return;
1053	case ZIO_TYPE_FREE:
1054		if (vd->vdev_notrim) {
1055			zio->io_error = SET_ERROR(ENOTSUP);
1056		} else if (!vdev_geom_bio_delete_disable) {
1057			goto sendreq;
1058		}
1059		zio_execute(zio);
1060		return;
1061	}
1062sendreq:
1063	ASSERT(zio->io_type == ZIO_TYPE_READ ||
1064	    zio->io_type == ZIO_TYPE_WRITE ||
1065	    zio->io_type == ZIO_TYPE_FREE ||
1066	    zio->io_type == ZIO_TYPE_IOCTL);
1067
1068	cp = vd->vdev_tsd;
1069	if (cp == NULL) {
1070		zio->io_error = SET_ERROR(ENXIO);
1071		zio_interrupt(zio);
1072		return;
1073	}
1074	bp = g_alloc_bio();
1075	bp->bio_caller1 = zio;
1076	switch (zio->io_type) {
1077	case ZIO_TYPE_READ:
1078	case ZIO_TYPE_WRITE:
1079		zio->io_target_timestamp = zio_handle_io_delay(zio);
1080		bp->bio_offset = zio->io_offset;
1081		bp->bio_length = zio->io_size;
1082		if (zio->io_type == ZIO_TYPE_READ) {
1083			bp->bio_cmd = BIO_READ;
1084			bp->bio_data =
1085			    abd_borrow_buf(zio->io_abd, zio->io_size);
1086		} else {
1087			bp->bio_cmd = BIO_WRITE;
1088			bp->bio_data =
1089			    abd_borrow_buf_copy(zio->io_abd, zio->io_size);
1090		}
1091		break;
1092	case ZIO_TYPE_FREE:
1093		bp->bio_cmd = BIO_DELETE;
1094		bp->bio_data = NULL;
1095		bp->bio_offset = zio->io_offset;
1096		bp->bio_length = zio->io_size;
1097		break;
1098	case ZIO_TYPE_IOCTL:
1099		bp->bio_cmd = BIO_FLUSH;
1100		bp->bio_flags |= BIO_ORDERED;
1101		bp->bio_data = NULL;
1102		bp->bio_offset = cp->provider->mediasize;
1103		bp->bio_length = 0;
1104		break;
1105	}
1106	bp->bio_done = vdev_geom_io_intr;
1107	zio->io_bio = bp;
1108
1109	g_io_request(bp, cp);
1110}
1111
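/*
 * Return the buffer borrowed from the ABD (copying read data back) and free
 * the bio; non-data zios already freed theirs in vdev_geom_io_intr().
 */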
1112static void
1113vdev_geom_io_done(zio_t *zio)
1114{
1115	struct bio *bp = zio->io_bio;
1116
1117	if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) {
1118		ASSERT(bp == NULL);
1119		return;
1120	}
1121
1122	if (bp == NULL) {
1123		ASSERT3S(zio->io_error, ==, ENXIO);
1124		return;
1125	}
1126
1127	if (zio->io_type == ZIO_TYPE_READ)
1128		abd_return_buf_copy(zio->io_abd, bp->bio_data, zio->io_size);
1129	else
1130		abd_return_buf(zio->io_abd, bp->bio_data, zio->io_size);
1131
1132	g_destroy_bio(bp);
1133	zio->io_bio = NULL;
1134}
1135
1136static void
1137vdev_geom_hold(vdev_t *vd)
1138{
1139}
1140
1141static void
1142vdev_geom_rele(vdev_t *vd)
1143{
1144}
1145
1146vdev_ops_t vdev_geom_ops = {
1147	vdev_geom_open,
1148	vdev_geom_close,
1149	vdev_default_asize,
1150	vdev_geom_io_start,
1151	vdev_geom_io_done,
1152	NULL,
1153	NULL,
1154	vdev_geom_hold,
1155	vdev_geom_rele,
1156	NULL,
1157	VDEV_TYPE_DISK,		/* name of this vdev type */
1158	B_TRUE			/* leaf vdev */
1159};
1160