vdev_geom.c revision 325913
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Portions Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>
 */

#include <sys/zfs_context.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/disk.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <geom/geom.h>
#include <geom/geom_int.h>

/*
 * Virtual device vector for GEOM.
 */

static g_attrchanged_t vdev_geom_attrchanged;
struct g_class zfs_vdev_class = {
	.name = "ZFS::VDEV",
	.version = G_VERSION,
	.attrchanged = vdev_geom_attrchanged,
};

struct consumer_vdev_elem {
	SLIST_ENTRY(consumer_vdev_elem)	elems;
	vdev_t				*vd;
};

SLIST_HEAD(consumer_priv_t, consumer_vdev_elem);
_Static_assert(sizeof(((struct g_consumer*)NULL)->private)
    == sizeof(struct consumer_priv_t*),
    "consumer_priv_t* can't be stored in g_consumer.private");

DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);

SYSCTL_DECL(_vfs_zfs_vdev);
/* Don't send BIO_FLUSH. */
static int vdev_geom_bio_flush_disable = 0;
TUNABLE_INT("vfs.zfs.vdev.bio_flush_disable", &vdev_geom_bio_flush_disable);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RW,
    &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");
/* Don't send BIO_DELETE. */
static int vdev_geom_bio_delete_disable = 0;
TUNABLE_INT("vfs.zfs.vdev.bio_delete_disable", &vdev_geom_bio_delete_disable);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RW,
    &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");
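
/*
 * Example (illustrative): both knobs are plain integers and may be set at
 * boot via /boot/loader.conf or at runtime with sysctl(8):
 *
 *	vfs.zfs.vdev.bio_flush_disable=1	(in loader.conf)
 *	# sysctl vfs.zfs.vdev.bio_delete_disable=1
 */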

/* Declare local functions */
static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read);

/*
 * Thread local storage used to indicate when a thread is probing geoms
 * for their guids.  If NULL, this thread is not tasting geoms.  If non-NULL,
 * it is looking for a replacement for the vdev_t* that is its value.
 */
uint_t zfs_geom_probe_vdev_key;
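
/*
 * A minimal sketch of the consumer side (assumed; it lives outside this
 * file): the zvol open path can consult the key and refuse to open a
 * zvol-backed provider while tasting, so a pool can never be built on
 * top of its own zvols:
 *
 *	if (tsd_get(zfs_geom_probe_vdev_key) != NULL)
 *		return (SET_ERROR(EOPNOTSUPP));
 */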

static void
vdev_geom_set_rotation_rate(vdev_t *vd, struct g_consumer *cp)
{
	int error;
	uint16_t rate;

	error = g_getattr("GEOM::rotation_rate", cp, &rate);
	if (error == 0)
		vd->vdev_rotation_rate = rate;
	else
		vd->vdev_rotation_rate = VDEV_RATE_UNKNOWN;
}

static void
vdev_geom_set_physpath(vdev_t *vd, struct g_consumer *cp,
    boolean_t do_null_update)
{
	boolean_t needs_update = B_FALSE;
	char *physpath;
	int error, physpath_len;

	physpath_len = MAXPATHLEN;
	physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO);
	error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath);
	if (error == 0) {
		char *old_physpath;

		/* g_topology lock ensures that vdev has not been closed */
		g_topology_assert();
		old_physpath = vd->vdev_physpath;
		vd->vdev_physpath = spa_strdup(physpath);

		if (old_physpath != NULL) {
			needs_update = (strcmp(old_physpath,
						vd->vdev_physpath) != 0);
			spa_strfree(old_physpath);
		} else
			needs_update = do_null_update;
	}
	g_free(physpath);

	/*
	 * If the physical path changed, update the config.
	 * Only request an update for previously unset physpaths if
	 * requested by the caller.
	 */
	if (needs_update)
		spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE);
}

static void
vdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
{
	struct consumer_priv_t *priv;
	struct consumer_vdev_elem *elem;

	priv = (struct consumer_priv_t*)&cp->private;
	if (SLIST_EMPTY(priv))
		return;

	SLIST_FOREACH(elem, priv, elems) {
		vdev_t *vd = elem->vd;
		if (strcmp(attr, "GEOM::rotation_rate") == 0) {
			vdev_geom_set_rotation_rate(vd, cp);
			return;
		}
		if (strcmp(attr, "GEOM::physpath") == 0) {
			vdev_geom_set_physpath(vd, cp, /*null_update*/B_TRUE);
			return;
		}
	}
}

static void
vdev_geom_orphan(struct g_consumer *cp)
{
	struct consumer_priv_t *priv;
	struct consumer_vdev_elem *elem;

	g_topology_assert();

	priv = (struct consumer_priv_t*)&cp->private;
	if (SLIST_EMPTY(priv))
		/* Vdev close in progress.  Ignore the event. */
		return;

	/*
	 * Orphan callbacks occur from the GEOM event thread.
	 * Concurrent with this call, new I/O requests may still be
	 * working their way through GEOM; they will only discover,
	 * once executed by the g_down thread, that we've been
	 * orphaned from our disk provider.  These I/Os must be
	 * retired before we can detach our consumer.  This is most
	 * easily achieved by acquiring the SPA ZIO configuration
	 * lock as a writer, but doing so with the GEOM topology
	 * lock held would cause a lock order reversal.  Instead,
	 * rely on the SPA's async removal support to invoke a
	 * close on this vdev once it is safe to do so.
	 */
	SLIST_FOREACH(elem, priv, elems) {
		vdev_t *vd = elem->vd;

		vd->vdev_remove_wanted = B_TRUE;
		spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
	}
}

static struct g_consumer *
vdev_geom_attach(struct g_provider *pp, vdev_t *vd)
{
	struct g_geom *gp;
	struct g_consumer *cp;
	int error;

	g_topology_assert();

	ZFS_LOG(1, "Attaching to %s.", pp->name);

	if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) {
		ZFS_LOG(1, "Failing attach of %s. Incompatible sectorsize %d\n",
		    pp->name, pp->sectorsize);
		return (NULL);
	} else if (pp->mediasize < SPA_MINDEVSIZE) {
		ZFS_LOG(1, "Failing attach of %s. Incompatible mediasize %ju\n",
		    pp->name, pp->mediasize);
		return (NULL);
	}

	/* Do we have geom already? No? Create one. */
	LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
		if (gp->flags & G_GEOM_WITHER)
			continue;
		if (strcmp(gp->name, "zfs::vdev") != 0)
			continue;
		break;
	}
	if (gp == NULL) {
		gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
		gp->orphan = vdev_geom_orphan;
		gp->attrchanged = vdev_geom_attrchanged;
		cp = g_new_consumer(gp);
		error = g_attach(cp, pp);
		if (error != 0) {
			ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__,
			    __LINE__, error);
			vdev_geom_detach(cp, B_FALSE);
			return (NULL);
		}
		error = g_access(cp, 1, 0, 1);
		if (error != 0) {
			ZFS_LOG(1, "%s(%d): g_access failed: %d", __func__,
			    __LINE__, error);
			vdev_geom_detach(cp, B_FALSE);
			return (NULL);
		}
		ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
	} else {
		/* Check if we are already connected to this provider. */
		LIST_FOREACH(cp, &gp->consumer, consumer) {
			if (cp->provider == pp) {
				ZFS_LOG(1, "Found consumer for %s.", pp->name);
				break;
			}
		}
		if (cp == NULL) {
			cp = g_new_consumer(gp);
			error = g_attach(cp, pp);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_attach failed: %d\n",
				    __func__, __LINE__, error);
				vdev_geom_detach(cp, B_FALSE);
				return (NULL);
			}
			error = g_access(cp, 1, 0, 1);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
				    __func__, __LINE__, error);
				vdev_geom_detach(cp, B_FALSE);
				return (NULL);
			}
			ZFS_LOG(1, "Created consumer for %s.", pp->name);
		} else {
			error = g_access(cp, 1, 0, 1);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
				    __func__, __LINE__, error);
				return (NULL);
			}
			ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
		}
	}

	if (vd != NULL)
		vd->vdev_tsd = cp;

	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
	return (cp);
}

static void
vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read)
{
	struct g_geom *gp;

	g_topology_assert();

	ZFS_LOG(1, "Detaching from %s.",
	    cp->provider && cp->provider->name ? cp->provider->name : "NULL");

	gp = cp->geom;
	if (open_for_read)
		g_access(cp, -1, 0, -1);
	/* Destroy consumer on last close. */
	if (cp->acr == 0 && cp->ace == 0) {
		if (cp->acw > 0)
			g_access(cp, 0, -cp->acw, 0);
		if (cp->provider != NULL) {
			ZFS_LOG(1, "Destroying consumer for %s.",
			    cp->provider->name ? cp->provider->name : "NULL");
			g_detach(cp);
		}
		g_destroy_consumer(cp);
	}
	/* Destroy geom if there are no consumers left. */
	if (LIST_EMPTY(&gp->consumer)) {
		ZFS_LOG(1, "Destroyed geom %s.", gp->name);
		g_wither_geom(gp, ENXIO);
	}
}

static void
vdev_geom_close_locked(vdev_t *vd)
{
	struct g_consumer *cp;
	struct consumer_priv_t *priv;
	struct consumer_vdev_elem *elem, *elem_temp;

	g_topology_assert();

	cp = vd->vdev_tsd;
	vd->vdev_delayed_close = B_FALSE;
	if (cp == NULL)
		return;

	ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
	KASSERT(cp->private != NULL, ("%s: cp->private is NULL", __func__));
	priv = (struct consumer_priv_t*)&cp->private;
	vd->vdev_tsd = NULL;
	SLIST_FOREACH_SAFE(elem, priv, elems, elem_temp) {
		if (elem->vd == vd) {
			SLIST_REMOVE(priv, elem, consumer_vdev_elem, elems);
			g_free(elem);
		}
	}

	vdev_geom_detach(cp, B_TRUE);
}

/*
 * Issue one or more bios to the vdev in parallel.
 * cmds, datas, offsets, errors, and sizes are arrays of length ncmds.  Each
 * IO operation is described by parallel entries from each array.  There may
 * be more bios actually issued than entries in the array.
 */
static void
vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets,
    off_t *sizes, int *errors, int ncmds)
{
	struct bio **bios;
	u_char *p;
	off_t off, maxio, s, end;
	int i, n_bios, j;
	size_t bios_size;

	maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize);
	n_bios = 0;

	/* How many bios are required for all commands? */
	for (i = 0; i < ncmds; i++)
		n_bios += (sizes[i] + maxio - 1) / maxio;

	/* Allocate memory for the bios */
	bios_size = n_bios * sizeof(struct bio*);
	bios = kmem_zalloc(bios_size, KM_SLEEP);

	/* Prepare and issue all of the bios */
	for (i = j = 0; i < ncmds; i++) {
		off = offsets[i];
		p = datas[i];
		s = sizes[i];
		end = off + s;
		ASSERT((off % cp->provider->sectorsize) == 0);
		ASSERT((s % cp->provider->sectorsize) == 0);

		for (; off < end; off += maxio, p += maxio, s -= maxio, j++) {
			bios[j] = g_alloc_bio();
			bios[j]->bio_cmd = cmds[i];
			bios[j]->bio_done = NULL;
			bios[j]->bio_offset = off;
			bios[j]->bio_length = MIN(s, maxio);
			bios[j]->bio_data = p;
			g_io_request(bios[j], cp);
		}
	}
	ASSERT(j == n_bios);

	/* Wait for all of the bios to complete, and clean them up */
	for (i = j = 0; i < ncmds; i++) {
		off = offsets[i];
		s = sizes[i];
		end = off + s;

		for (; off < end; off += maxio, s -= maxio, j++) {
			errors[i] = biowait(bios[j], "vdev_geom_io") ||
			    errors[i];
			g_destroy_bio(bios[j]);
		}
	}
	kmem_free(bios, bios_size);
}
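
/*
 * Illustrative use of vdev_geom_io() (hypothetical buffers and sizes):
 * read two independent regions in parallel.  Each offset and size must be
 * a multiple of the provider's sector size, as asserted above.
 *
 *	int cmds[2] = { BIO_READ, BIO_READ };
 *	void *datas[2] = { buf0, buf1 };
 *	off_t offsets[2] = { 0, 1 << 20 };
 *	off_t sizes[2] = { 8192, 8192 };
 *	int errors[2] = { 0, 0 };
 *
 *	vdev_geom_io(cp, cmds, datas, offsets, sizes, errors, 2);
 *	if (errors[0] != 0 || errors[1] != 0)
 *		... at least one of the two commands failed ...
 */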

/*
 * Read the vdev config from a device.  Return the number of valid labels that
 * were found.  The vdev config will be returned in config if and only if at
 * least one valid label was found.
 */
static int
vdev_geom_read_config(struct g_consumer *cp, nvlist_t **config)
{
	struct g_provider *pp;
	vdev_phys_t *vdev_lists[VDEV_LABELS];
	char *buf;
	size_t buflen;
	uint64_t psize, state, txg;
	off_t offsets[VDEV_LABELS];
	off_t size;
	off_t sizes[VDEV_LABELS];
	int cmds[VDEV_LABELS];
	int errors[VDEV_LABELS];
	int l, nlabels;

	g_topology_assert_not();

	pp = cp->provider;
	ZFS_LOG(1, "Reading config from %s...", pp->name);

	psize = pp->mediasize;
	psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t));

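	/*
	 * Round the label payload up to a whole number of sectors; the
	 * expression below is equivalent to
	 * roundup(sizeof (vdev_phys_t), pp->sectorsize).
	 */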
	size = sizeof(*vdev_lists[0]) + pp->sectorsize -
	    ((sizeof(*vdev_lists[0]) - 1) % pp->sectorsize) - 1;

	buflen = sizeof(vdev_lists[0]->vp_nvlist);

	*config = NULL;
	/* Create all of the IO requests */
	for (l = 0; l < VDEV_LABELS; l++) {
		cmds[l] = BIO_READ;
		vdev_lists[l] = kmem_alloc(size, KM_SLEEP);
		offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE;
		sizes[l] = size;
		errors[l] = 0;
		ASSERT(offsets[l] % pp->sectorsize == 0);
	}

	/* Issue the IO requests */
	vdev_geom_io(cp, cmds, (void**)vdev_lists, offsets, sizes, errors,
	    VDEV_LABELS);

	/* Parse the labels */
	nlabels = 0;
	for (l = 0; l < VDEV_LABELS; l++) {
		if (errors[l] != 0)
			continue;

		buf = vdev_lists[l]->vp_nvlist;

		if (nvlist_unpack(buf, buflen, config, 0) != 0)
			continue;

		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0 || state > POOL_STATE_L2CACHE) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		if (state != POOL_STATE_SPARE &&
		    state != POOL_STATE_L2CACHE &&
		    (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
		    &txg) != 0 || txg == 0)) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		nlabels++;
	}

	/* Free the label storage */
	for (l = 0; l < VDEV_LABELS; l++)
		kmem_free(vdev_lists[l], size);

	return (nlabels);
}

static void
resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id)
{
	nvlist_t **new_configs;
	uint64_t i;

	if (id < *count)
		return;
	new_configs = kmem_zalloc((id + 1) * sizeof(nvlist_t *),
	    KM_SLEEP);
	for (i = 0; i < *count; i++)
		new_configs[i] = (*configs)[i];
	if (*configs != NULL)
		kmem_free(*configs, *count * sizeof(void *));
	*configs = new_configs;
	*count = id + 1;
}

static void
process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg,
    const char *name, uint64_t *known_pool_guid)
{
	nvlist_t *vdev_tree;
	uint64_t pool_guid;
	uint64_t vdev_guid;
	uint64_t id, txg, known_txg;
	char *pname;

	if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 ||
	    strcmp(pname, name) != 0)
		goto ignore;

	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
		goto ignore;

	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0)
		goto ignore;

	if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0)
		goto ignore;

	if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0)
		goto ignore;

	VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);

	if (*known_pool_guid != 0) {
		if (pool_guid != *known_pool_guid)
			goto ignore;
	} else
		*known_pool_guid = pool_guid;

	resize_configs(configs, count, id);

	if ((*configs)[id] != NULL) {
		VERIFY(nvlist_lookup_uint64((*configs)[id],
		    ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0);
		if (txg <= known_txg)
			goto ignore;
		nvlist_free((*configs)[id]);
	}

	(*configs)[id] = cfg;
	return;

ignore:
	nvlist_free(cfg);
}

int
vdev_geom_read_pool_label(const char *name,
    nvlist_t ***configs, uint64_t *count)
{
	struct g_class *mp;
	struct g_geom *gp;
	struct g_provider *pp;
	struct g_consumer *zcp;
	nvlist_t *vdev_cfg;
	uint64_t pool_guid;
	int nlabels;

	DROP_GIANT();
	g_topology_lock();

	*configs = NULL;
	*count = 0;
	pool_guid = 0;
	LIST_FOREACH(mp, &g_classes, class) {
		if (mp == &zfs_vdev_class)
			continue;
		LIST_FOREACH(gp, &mp->geom, geom) {
			if (gp->flags & G_GEOM_WITHER)
				continue;
			LIST_FOREACH(pp, &gp->provider, provider) {
				if (pp->flags & G_PF_WITHER)
					continue;
				zcp = vdev_geom_attach(pp, NULL);
				if (zcp == NULL)
					continue;
				g_topology_unlock();
				nlabels = vdev_geom_read_config(zcp, &vdev_cfg);
				g_topology_lock();
				vdev_geom_detach(zcp, B_TRUE);
				if (nlabels == 0)
					continue;
				ZFS_LOG(1, "successfully read vdev config");

				process_vdev_config(configs, count,
				    vdev_cfg, name, &pool_guid);
			}
		}
	}
	g_topology_unlock();
	PICKUP_GIANT();

	return (*count > 0 ? 0 : ENOENT);
}
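
/*
 * A minimal caller sketch (assumed; callers live outside this file).  On
 * success the caller owns the configs array and every nvlist in it; the
 * array is indexed by top-level vdev id and may contain NULL holes for
 * vdevs that were not found:
 *
 *	nvlist_t **configs;
 *	uint64_t count, i;
 *
 *	if (vdev_geom_read_pool_label("tank", &configs, &count) == 0) {
 *		for (i = 0; i < count; i++)
 *			if (configs[i] != NULL)
 *				nvlist_free(configs[i]);
 *		kmem_free(configs, count * sizeof (nvlist_t *));
 *	}
 */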

enum match {
	NO_MATCH = 0,		/* No matching labels found */
	TOPGUID_MATCH = 1,	/* Labels match top guid, not vdev guid */
	ZERO_MATCH = 1,		/* Should never be returned */
	ONE_MATCH = 2,		/* 1 label matching the vdev_guid */
	TWO_MATCH = 3,		/* 2 labels matching the vdev_guid */
	THREE_MATCH = 4,	/* 3 labels matching the vdev_guid */
	FULL_MATCH = 5		/* all labels match the vdev_guid */
};
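
/*
 * For example, a provider whose label data matches the vdev's guid in two
 * of its four labels scores TWO_MATCH (3), which outranks a provider that
 * only matches the top-level vdev guid (TOPGUID_MATCH, 1).
 * vdev_attach_ok() below returns ZERO_MATCH + nlabels for vdev guid
 * matches, so FULL_MATCH means all VDEV_LABELS labels agreed.
 */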

static enum match
vdev_attach_ok(vdev_t *vd, struct g_provider *pp)
{
	nvlist_t *config;
	uint64_t pool_guid, top_guid, vdev_guid;
	struct g_consumer *cp;
	int nlabels;

	cp = vdev_geom_attach(pp, NULL);
	if (cp == NULL) {
		ZFS_LOG(1, "Unable to attach tasting instance to %s.",
		    pp->name);
		return (NO_MATCH);
	}
	g_topology_unlock();
	nlabels = vdev_geom_read_config(cp, &config);
	if (nlabels == 0) {
		g_topology_lock();
		vdev_geom_detach(cp, B_TRUE);
		ZFS_LOG(1, "Unable to read config from %s.", pp->name);
		return (NO_MATCH);
	}
	g_topology_lock();
	vdev_geom_detach(cp, B_TRUE);

	pool_guid = 0;
	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid);
	top_guid = 0;
	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid);
	vdev_guid = 0;
	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
	nvlist_free(config);

	/*
	 * Check that the label's pool guid matches the desired guid.
	 * Inactive spares and L2ARCs do not have any pool guid in the label.
	 */
	if (pool_guid != 0 && pool_guid != spa_guid(vd->vdev_spa)) {
		ZFS_LOG(1, "pool guid mismatch for provider %s: %ju != %ju.",
		    pp->name,
		    (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)pool_guid);
		return (NO_MATCH);
	}

	/*
	 * Check that the label's vdev guid matches the desired guid.
	 * The second condition handles a possible race on vdev detach, when
	 * the remaining vdev inherits the GUID of the destroyed top-level
	 * mirror vdev.
	 */
	if (vdev_guid == vd->vdev_guid) {
		ZFS_LOG(1, "guids match for provider %s.", pp->name);
		return (ZERO_MATCH + nlabels);
	} else if (top_guid == vd->vdev_guid && vd == vd->vdev_top) {
		ZFS_LOG(1, "top vdev guid match for provider %s.", pp->name);
		return (TOPGUID_MATCH);
	}
	ZFS_LOG(1, "vdev guid mismatch for provider %s: %ju != %ju.",
	    pp->name, (uintmax_t)vd->vdev_guid, (uintmax_t)vdev_guid);
	return (NO_MATCH);
}

static struct g_consumer *
vdev_geom_attach_by_guids(vdev_t *vd)
{
	struct g_class *mp;
	struct g_geom *gp;
	struct g_provider *pp, *best_pp;
	struct g_consumer *cp;
	enum match match, best_match;

	g_topology_assert();

	cp = NULL;
	best_pp = NULL;
	best_match = NO_MATCH;
	LIST_FOREACH(mp, &g_classes, class) {
		if (mp == &zfs_vdev_class)
			continue;
		LIST_FOREACH(gp, &mp->geom, geom) {
			if (gp->flags & G_GEOM_WITHER)
				continue;
			LIST_FOREACH(pp, &gp->provider, provider) {
				match = vdev_attach_ok(vd, pp);
				if (match > best_match) {
					best_match = match;
					best_pp = pp;
				}
				if (match == FULL_MATCH)
					goto out;
			}
		}
	}

out:
	if (best_pp) {
		cp = vdev_geom_attach(best_pp, vd);
		if (cp == NULL) {
			printf("ZFS WARNING: Unable to attach to %s.\n",
			    best_pp->name);
		}
	}
	return (cp);
}

static struct g_consumer *
vdev_geom_open_by_guids(vdev_t *vd)
{
	struct g_consumer *cp;
	char *buf;
	size_t len;

	g_topology_assert();

	ZFS_LOG(1, "Searching by guids [%ju:%ju].",
	    (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid);
	cp = vdev_geom_attach_by_guids(vd);
	if (cp != NULL) {
		len = strlen(cp->provider->name) + strlen("/dev/") + 1;
		buf = kmem_alloc(len, KM_SLEEP);

		snprintf(buf, len, "/dev/%s", cp->provider->name);
		spa_strfree(vd->vdev_path);
		vd->vdev_path = buf;

		ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.",
		    (uintmax_t)spa_guid(vd->vdev_spa),
		    (uintmax_t)vd->vdev_guid, vd->vdev_path);
	} else {
		ZFS_LOG(1, "Search by guid [%ju:%ju] failed.",
		    (uintmax_t)spa_guid(vd->vdev_spa),
		    (uintmax_t)vd->vdev_guid);
	}

	return (cp);
}

static struct g_consumer *
vdev_geom_open_by_path(vdev_t *vd, int check_guid)
{
	struct g_provider *pp;
	struct g_consumer *cp;

	g_topology_assert();

	cp = NULL;
	pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1);
	if (pp != NULL) {
		ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
		if (!check_guid || vdev_attach_ok(vd, pp) == FULL_MATCH)
			cp = vdev_geom_attach(pp, vd);
	}

	return (cp);
}

static int
vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	struct g_provider *pp;
	struct g_consumer *cp;
	int error;

	/* Set the TLS to indicate downstack that we should not access zvols. */
	VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0);

	/*
	 * We must have a pathname, and it must be absolute.
	 */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (EINVAL);
	}

	/*
	 * Reopen the device if it's not currently open. Otherwise,
	 * just update the physical size of the device.
	 */
	if ((cp = vd->vdev_tsd) != NULL) {
		ASSERT(vd->vdev_reopening);
		goto skip_open;
	}

	DROP_GIANT();
	g_topology_lock();
	error = 0;

	if (vd->vdev_spa->spa_splitting_newspa ||
	    (vd->vdev_prevstate == VDEV_STATE_UNKNOWN &&
	    (vd->vdev_spa->spa_load_state == SPA_LOAD_NONE ||
	    vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE))) {
		/*
		 * We are dealing with a vdev that hasn't been previously
		 * opened (since boot), and we are not loading an
		 * existing pool configuration.  This looks like a
		 * vdev add operation to a new or existing pool.
		 * Assume the user knows what he/she is doing and find
		 * GEOM provider by its name, ignoring GUID mismatches.
		 *
		 * XXPOLICY: It would be safer to only allow a device
		 *           that is unlabeled or labeled but missing
		 *           GUID information to be opened in this fashion,
		 *           unless we are doing a split, in which case we
		 *           should allow any guid.
		 */
		cp = vdev_geom_open_by_path(vd, 0);
	} else {
		/*
		 * Try using the recorded path for this device, but only
		 * accept it if its label data contains the expected GUIDs.
		 */
		cp = vdev_geom_open_by_path(vd, 1);
		if (cp == NULL) {
			/*
			 * The device at vd->vdev_path doesn't have the
			 * expected GUIDs. The disks might have merely
			 * moved around so try all other GEOM providers
			 * to find one with the right GUIDs.
			 */
			cp = vdev_geom_open_by_guids(vd);
		}
	}

	/* Clear the TLS now that tasting is done */
	VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0);

	if (cp == NULL) {
		ZFS_LOG(1, "Provider %s not found.", vd->vdev_path);
		error = ENOENT;
	} else if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
	    !ISP2(cp->provider->sectorsize)) {
		ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
		    vd->vdev_path);

		vdev_geom_close_locked(vd);
		error = EINVAL;
		cp = NULL;
	} else if (cp->acw == 0 && (spa_mode(vd->vdev_spa) & FWRITE) != 0) {
		int i;

		for (i = 0; i < 5; i++) {
			error = g_access(cp, 0, 1, 0);
			if (error == 0)
				break;
			g_topology_unlock();
			tsleep(vd, 0, "vdev", hz / 2);
			g_topology_lock();
		}
		if (error != 0) {
			printf("ZFS WARNING: Unable to open %s for writing "
			    "(error=%d).\n", vd->vdev_path, error);
			vdev_geom_close_locked(vd);
			cp = NULL;
		}
	}
	if (cp != NULL) {
		struct consumer_priv_t *priv;
		struct consumer_vdev_elem *elem;

		priv = (struct consumer_priv_t*)&cp->private;
		if (cp->private == NULL)
			SLIST_INIT(priv);
		elem = g_malloc(sizeof(*elem), M_WAITOK|M_ZERO);
		elem->vd = vd;
		SLIST_INSERT_HEAD(priv, elem, elems);
	}

	/* Fetch initial physical path information for this device. */
	if (cp != NULL) {
		vdev_geom_set_physpath(vd, cp, /*do_null_update*/B_FALSE);

		/* Set other GEOM characteristics */
		vdev_geom_set_rotation_rate(vd, cp);
	}

	g_topology_unlock();
	PICKUP_GIANT();
	if (cp == NULL) {
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
		return (error);
	}
skip_open:
	pp = cp->provider;

	/*
	 * Determine the actual size of the device.
	 */
	*max_psize = *psize = pp->mediasize;

	/*
	 * Determine the device's minimum transfer size and preferred
	 * transfer size.
	 */
	*logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
	*physical_ashift = 0;
	if (pp->stripesize > (1 << *logical_ashift) && ISP2(pp->stripesize) &&
	    pp->stripesize <= (1 << SPA_MAXASHIFT) && pp->stripeoffset == 0)
		*physical_ashift = highbit(pp->stripesize) - 1;
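
	/*
	 * Worked example: a disk with 512-byte sectors yields a logical
	 * ashift of 9 (highbit(512) - 1); if it also reports a 4096-byte
	 * stripe at offset zero, the physical ashift becomes 12.
	 */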

	/*
	 * Clear the nowritecache settings, so that on a vdev_reopen()
	 * we will try again.
	 */
	vd->vdev_nowritecache = B_FALSE;

	return (0);
}

static void
vdev_geom_close(vdev_t *vd)
{
	struct g_consumer *cp;

	cp = vd->vdev_tsd;

	DROP_GIANT();
	g_topology_lock();

	if (!vd->vdev_reopening ||
	    (cp != NULL && ((cp->flags & G_CF_ORPHAN) != 0 ||
	    (cp->provider != NULL && cp->provider->error != 0))))
		vdev_geom_close_locked(vd);

	g_topology_unlock();
	PICKUP_GIANT();
}

static void
vdev_geom_io_intr(struct bio *bp)
{
	vdev_t *vd;
	zio_t *zio;

	zio = bp->bio_caller1;
	vd = zio->io_vd;
	zio->io_error = bp->bio_error;
	if (zio->io_error == 0 && bp->bio_resid != 0)
		zio->io_error = SET_ERROR(EIO);

	switch (zio->io_error) {
	case ENOTSUP:
		/*
		 * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know
		 * that future attempts will never succeed. In this case
		 * we set a persistent flag so that we don't bother with
		 * requests in the future.
		 */
		switch (bp->bio_cmd) {
		case BIO_FLUSH:
			vd->vdev_nowritecache = B_TRUE;
			break;
		case BIO_DELETE:
			vd->vdev_notrim = B_TRUE;
			break;
		}
		break;
	case ENXIO:
		if (!vd->vdev_remove_wanted) {
			/*
			 * If the provider's error is set we assume it is
			 * being removed.
			 */
			if (bp->bio_to->error != 0) {
				vd->vdev_remove_wanted = B_TRUE;
				spa_async_request(zio->io_spa,
				    SPA_ASYNC_REMOVE);
			} else if (!vd->vdev_delayed_close) {
				vd->vdev_delayed_close = B_TRUE;
			}
		}
		break;
	}
	g_destroy_bio(bp);
	zio_delay_interrupt(zio);
}

static void
vdev_geom_io_start(zio_t *zio)
{
	vdev_t *vd;
	struct g_consumer *cp;
	struct bio *bp;

	vd = zio->io_vd;

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:
		/* XXPOLICY */
		if (!vdev_readable(vd)) {
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		} else {
			switch (zio->io_cmd) {
			case DKIOCFLUSHWRITECACHE:
				if (zfs_nocacheflush ||
				    vdev_geom_bio_flush_disable)
					break;
				if (vd->vdev_nowritecache) {
					zio->io_error = SET_ERROR(ENOTSUP);
					break;
				}
				goto sendreq;
			default:
				zio->io_error = SET_ERROR(ENOTSUP);
			}
		}

		zio_execute(zio);
		return;
	case ZIO_TYPE_FREE:
		if (vd->vdev_notrim) {
			zio->io_error = SET_ERROR(ENOTSUP);
		} else if (!vdev_geom_bio_delete_disable) {
			goto sendreq;
		}
		zio_execute(zio);
		return;
	}
sendreq:
	ASSERT(zio->io_type == ZIO_TYPE_READ ||
	    zio->io_type == ZIO_TYPE_WRITE ||
	    zio->io_type == ZIO_TYPE_FREE ||
	    zio->io_type == ZIO_TYPE_IOCTL);

	cp = vd->vdev_tsd;
	if (cp == NULL) {
		zio->io_error = SET_ERROR(ENXIO);
		zio_interrupt(zio);
		return;
	}
	bp = g_alloc_bio();
	bp->bio_caller1 = zio;
	switch (zio->io_type) {
	case ZIO_TYPE_READ:
	case ZIO_TYPE_WRITE:
		zio->io_target_timestamp = zio_handle_io_delay(zio);
		bp->bio_cmd = zio->io_type == ZIO_TYPE_READ ?
		    BIO_READ : BIO_WRITE;
		bp->bio_data = zio->io_data;
		bp->bio_offset = zio->io_offset;
		bp->bio_length = zio->io_size;
		break;
	case ZIO_TYPE_FREE:
		bp->bio_cmd = BIO_DELETE;
		bp->bio_data = NULL;
		bp->bio_offset = zio->io_offset;
		bp->bio_length = zio->io_size;
		break;
	case ZIO_TYPE_IOCTL:
		bp->bio_cmd = BIO_FLUSH;
		bp->bio_flags |= BIO_ORDERED;
		bp->bio_data = NULL;
		bp->bio_offset = cp->provider->mediasize;
		bp->bio_length = 0;
		break;
	}
	bp->bio_done = vdev_geom_io_intr;

	g_io_request(bp, cp);
}

static void
vdev_geom_io_done(zio_t *zio)
{
}

static void
vdev_geom_hold(vdev_t *vd)
{
}

static void
vdev_geom_rele(vdev_t *vd)
{
}

vdev_ops_t vdev_geom_ops = {
	vdev_geom_open,
	vdev_geom_close,
	vdev_default_asize,
	vdev_geom_io_start,
	vdev_geom_io_done,
	NULL,
	vdev_geom_hold,
	vdev_geom_rele,
	VDEV_TYPE_DISK,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};
1114