vdev_geom.c revision 330524
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Portions Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>
 */

#include <sys/zfs_context.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/disk.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <geom/geom.h>
#include <geom/geom_int.h>

/*
 * Virtual device vector for GEOM.
 */

static g_attrchanged_t vdev_geom_attrchanged;
struct g_class zfs_vdev_class = {
	.name = "ZFS::VDEV",
	.version = G_VERSION,
	.attrchanged = vdev_geom_attrchanged,
};

struct consumer_vdev_elem {
	SLIST_ENTRY(consumer_vdev_elem)	elems;
	vdev_t				*vd;
};

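/*
 * cp->private is reinterpreted as the head of this list, so that a single
 * GEOM consumer can be shared by every vdev that references the same
 * provider; the assertion below guarantees the list head fits in the
 * pointer-sized private field.
 */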
SLIST_HEAD(consumer_priv_t, consumer_vdev_elem);
_Static_assert(sizeof(((struct g_consumer*)NULL)->private)
    == sizeof(struct consumer_priv_t*),
    "consumer_priv_t* can't be stored in g_consumer.private");

DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);

SYSCTL_DECL(_vfs_zfs_vdev);
/* Don't send BIO_FLUSH. */
static int vdev_geom_bio_flush_disable = 0;
TUNABLE_INT("vfs.zfs.vdev.bio_flush_disable", &vdev_geom_bio_flush_disable);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RW,
    &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");
/* Don't send BIO_DELETE. */
static int vdev_geom_bio_delete_disable = 0;
TUNABLE_INT("vfs.zfs.vdev.bio_delete_disable", &vdev_geom_bio_delete_disable);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RW,
    &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");
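/*
 * Example (informal): both knobs are plain integer tunables, so they can be
 * toggled at runtime, e.g. "sysctl vfs.zfs.vdev.bio_flush_disable=1", or set
 * at boot via loader tunables of the same names.
 */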

/* Declare local functions */
static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read);

/*
 * Thread local storage used to indicate when a thread is probing geoms
 * for their guids.  If NULL, this thread is not tasting geoms.  If non-NULL,
 * it is looking for a replacement for the vdev_t* that is its value.
 */
uint_t zfs_geom_probe_vdev_key;
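/* vdev_geom_open() sets this key around tasting and clears it afterwards. */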

static void
vdev_geom_set_rotation_rate(vdev_t *vd, struct g_consumer *cp)
{
	int error;
	uint16_t rate;

	error = g_getattr("GEOM::rotation_rate", cp, &rate);
	if (error == 0)
		vd->vdev_rotation_rate = rate;
	else
		vd->vdev_rotation_rate = VDEV_RATE_UNKNOWN;
}

static void
vdev_geom_set_physpath(vdev_t *vd, struct g_consumer *cp,
		       boolean_t do_null_update)
{
	boolean_t needs_update = B_FALSE;
	char *physpath;
	int error, physpath_len;

	physpath_len = MAXPATHLEN;
	physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO);
	error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath);
	if (error == 0) {
		char *old_physpath;

		/* g_topology lock ensures that vdev has not been closed */
		g_topology_assert();
		old_physpath = vd->vdev_physpath;
		vd->vdev_physpath = spa_strdup(physpath);

		if (old_physpath != NULL) {
			needs_update = (strcmp(old_physpath,
						vd->vdev_physpath) != 0);
			spa_strfree(old_physpath);
		} else
			needs_update = do_null_update;
	}
	g_free(physpath);

	/*
	 * If the physical path changed, update the config.
	 * Only request an update for previously unset physpaths if
	 * requested by the caller.
	 */
	if (needs_update)
		spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE);
}

static void
vdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
{
	struct consumer_priv_t *priv;
	struct consumer_vdev_elem *elem;

	priv = (struct consumer_priv_t*)&cp->private;
	if (SLIST_EMPTY(priv))
		return;

	SLIST_FOREACH(elem, priv, elems) {
		vdev_t *vd = elem->vd;
		if (strcmp(attr, "GEOM::rotation_rate") == 0) {
			vdev_geom_set_rotation_rate(vd, cp);
			return;
		}
		if (strcmp(attr, "GEOM::physpath") == 0) {
			vdev_geom_set_physpath(vd, cp, /*do_null_update*/B_TRUE);
			return;
		}
	}
}

static void
vdev_geom_orphan(struct g_consumer *cp)
{
	struct consumer_priv_t *priv;
	struct consumer_vdev_elem *elem;

	g_topology_assert();

	priv = (struct consumer_priv_t*)&cp->private;
	if (SLIST_EMPTY(priv))
		/* Vdev close in progress.  Ignore the event. */
		return;

	/*
	 * Orphan callbacks occur from the GEOM event thread.
	 * Concurrent with this call, new I/O requests may be
	 * working their way through GEOM about to find out
	 * (only once executed by the g_down thread) that we've
	 * been orphaned from our disk provider.  These I/Os
	 * must be retired before we can detach our consumer.
	 * This is most easily achieved by acquiring the
	 * SPA ZIO configuration lock as a writer, but doing
	 * so with the GEOM topology lock held would cause
	 * a lock order reversal.  Instead, rely on the SPA's
	 * async removal support to invoke a close on this
	 * vdev once it is safe to do so.
	 */
	SLIST_FOREACH(elem, priv, elems) {
		vdev_t *vd = elem->vd;

		vd->vdev_remove_wanted = B_TRUE;
		spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
	}
}

static struct g_consumer *
vdev_geom_attach(struct g_provider *pp, vdev_t *vd, boolean_t sanity)
{
	struct g_geom *gp;
	struct g_consumer *cp;
	int error;

	g_topology_assert();

	ZFS_LOG(1, "Attaching to %s.", pp->name);

	if (sanity) {
		if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) {
			ZFS_LOG(1, "Failing attach of %s. "
				   "Incompatible sectorsize %d\n",
			    pp->name, pp->sectorsize);
			return (NULL);
		} else if (pp->mediasize < SPA_MINDEVSIZE) {
			ZFS_LOG(1, "Failing attach of %s. "
				   "Incompatible mediasize %ju\n",
			    pp->name, pp->mediasize);
			return (NULL);
		}
	}

	/* Do we have geom already? No? Create one. */
	LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
		if (gp->flags & G_GEOM_WITHER)
			continue;
		if (strcmp(gp->name, "zfs::vdev") != 0)
			continue;
		break;
	}
	if (gp == NULL) {
		gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
		gp->orphan = vdev_geom_orphan;
		gp->attrchanged = vdev_geom_attrchanged;
		cp = g_new_consumer(gp);
		error = g_attach(cp, pp);
		if (error != 0) {
			ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__,
			    __LINE__, error);
			vdev_geom_detach(cp, B_FALSE);
			return (NULL);
		}
		error = g_access(cp, 1, 0, 1);
		if (error != 0) {
			ZFS_LOG(1, "%s(%d): g_access failed: %d", __func__,
			       __LINE__, error);
			vdev_geom_detach(cp, B_FALSE);
			return (NULL);
		}
		ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
	} else {
		/* Check if we are already connected to this provider. */
		LIST_FOREACH(cp, &gp->consumer, consumer) {
			if (cp->provider == pp) {
				ZFS_LOG(1, "Found consumer for %s.", pp->name);
				break;
			}
		}
		if (cp == NULL) {
			cp = g_new_consumer(gp);
			error = g_attach(cp, pp);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_attach failed: %d\n",
				    __func__, __LINE__, error);
				vdev_geom_detach(cp, B_FALSE);
				return (NULL);
			}
			error = g_access(cp, 1, 0, 1);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
				    __func__, __LINE__, error);
				vdev_geom_detach(cp, B_FALSE);
				return (NULL);
			}
			ZFS_LOG(1, "Created consumer for %s.", pp->name);
		} else {
			error = g_access(cp, 1, 0, 1);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
				    __func__, __LINE__, error);
				return (NULL);
			}
			ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
		}
	}

	if (vd != NULL)
		vd->vdev_tsd = cp;

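	/*
	 * Let GEOM dispatch this consumer's requests and completions in
	 * the caller's context, bypassing the g_down/g_up queues.
	 */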
	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
	return (cp);
}

static void
vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read)
{
	struct g_geom *gp;

	g_topology_assert();

	ZFS_LOG(1, "Detaching from %s.",
	    cp->provider && cp->provider->name ? cp->provider->name : "NULL");

	gp = cp->geom;
	if (open_for_read)
		g_access(cp, -1, 0, -1);
	/* Destroy consumer on last close. */
	if (cp->acr == 0 && cp->ace == 0) {
		if (cp->acw > 0)
			g_access(cp, 0, -cp->acw, 0);
		if (cp->provider != NULL) {
			ZFS_LOG(1, "Destroying consumer for %s.",
			    cp->provider->name ? cp->provider->name : "NULL");
			g_detach(cp);
		}
		g_destroy_consumer(cp);
	}
	/* Destroy geom if there are no consumers left. */
	if (LIST_EMPTY(&gp->consumer)) {
		ZFS_LOG(1, "Destroyed geom %s.", gp->name);
		g_wither_geom(gp, ENXIO);
	}
}

static void
vdev_geom_close_locked(vdev_t *vd)
{
	struct g_consumer *cp;
	struct consumer_priv_t *priv;
	struct consumer_vdev_elem *elem, *elem_temp;

	g_topology_assert();

	cp = vd->vdev_tsd;
	vd->vdev_delayed_close = B_FALSE;
	if (cp == NULL)
		return;

	ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
	KASSERT(cp->private != NULL, ("%s: cp->private is NULL", __func__));
	priv = (struct consumer_priv_t*)&cp->private;
	vd->vdev_tsd = NULL;
	SLIST_FOREACH_SAFE(elem, priv, elems, elem_temp) {
		if (elem->vd == vd) {
			SLIST_REMOVE(priv, elem, consumer_vdev_elem, elems);
			g_free(elem);
		}
	}

	vdev_geom_detach(cp, B_TRUE);
}

/*
 * Issue one or more bios to the vdev in parallel.
 * cmds, datas, offsets, errors, and sizes are arrays of length ncmds.
 * Each I/O operation is described by parallel entries from each array.
 * There may be more bios actually issued than entries in the arrays.
 */
static void
vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets,
    off_t *sizes, int *errors, int ncmds)
{
	struct bio **bios;
	u_char *p;
	off_t off, maxio, s, end;
	int i, n_bios, j;
	size_t bios_size;

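	/*
	 * maxio is MAXPHYS rounded down to a multiple of the provider's
	 * sector size; each command is split into ceil(size / maxio) bios.
	 */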
	maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize);
	n_bios = 0;

	/* How many bios are required for all commands ? */
	for (i = 0; i < ncmds; i++)
		n_bios += (sizes[i] + maxio - 1) / maxio;

	/* Allocate memory for the bios */
	bios_size = n_bios * sizeof(struct bio*);
	bios = kmem_zalloc(bios_size, KM_SLEEP);

	/* Prepare and issue all of the bios */
	for (i = j = 0; i < ncmds; i++) {
		off = offsets[i];
		p = datas[i];
		s = sizes[i];
		end = off + s;
		ASSERT((off % cp->provider->sectorsize) == 0);
		ASSERT((s % cp->provider->sectorsize) == 0);

		for (; off < end; off += maxio, p += maxio, s -= maxio, j++) {
			bios[j] = g_alloc_bio();
			bios[j]->bio_cmd = cmds[i];
			bios[j]->bio_done = NULL;
			bios[j]->bio_offset = off;
			bios[j]->bio_length = MIN(s, maxio);
			bios[j]->bio_data = p;
			g_io_request(bios[j], cp);
		}
	}
	ASSERT(j == n_bios);

	/* Wait for all of the bios to complete, and clean them up */
	for (i = j = 0; i < ncmds; i++) {
		off = offsets[i];
		s = sizes[i];
		end = off + s;

		for (; off < end; off += maxio, s -= maxio, j++) {
			errors[i] = biowait(bios[j], "vdev_geom_io") || errors[i];
			g_destroy_bio(bios[j]);
		}
	}
	kmem_free(bios, bios_size);
}

/*
 * Read the vdev config from a device.  Return the number of valid labels that
 * were found.  The vdev config will be returned in config if and only if at
 * least one valid label was found.
 */
static int
vdev_geom_read_config(struct g_consumer *cp, nvlist_t **config)
{
	struct g_provider *pp;
	vdev_phys_t *vdev_lists[VDEV_LABELS];
	char *buf;
	size_t buflen;
	uint64_t psize, state, txg;
	off_t offsets[VDEV_LABELS];
	off_t size;
	off_t sizes[VDEV_LABELS];
	int cmds[VDEV_LABELS];
	int errors[VDEV_LABELS];
	int l, nlabels;

	g_topology_assert_not();

	pp = cp->provider;
	ZFS_LOG(1, "Reading config from %s...", pp->name);

	psize = pp->mediasize;
	psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t));

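	/*
	 * size is sizeof(*vdev_lists[0]) rounded up to the nearest multiple
	 * of the sector size, so each label read below is sector-aligned.
	 */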
	size = sizeof(*vdev_lists[0]) + pp->sectorsize -
	    ((sizeof(*vdev_lists[0]) - 1) % pp->sectorsize) - 1;

	buflen = sizeof(vdev_lists[0]->vp_nvlist);

	*config = NULL;
	/* Create all of the IO requests */
	for (l = 0; l < VDEV_LABELS; l++) {
		cmds[l] = BIO_READ;
		vdev_lists[l] = kmem_alloc(size, KM_SLEEP);
		offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE;
		sizes[l] = size;
		errors[l] = 0;
		ASSERT(offsets[l] % pp->sectorsize == 0);
	}

	/* Issue the IO requests */
	vdev_geom_io(cp, cmds, (void**)vdev_lists, offsets, sizes, errors,
	    VDEV_LABELS);

	/* Parse the labels */
	nlabels = 0;
	for (l = 0; l < VDEV_LABELS; l++) {
		if (errors[l] != 0)
			continue;

		buf = vdev_lists[l]->vp_nvlist;

		if (nvlist_unpack(buf, buflen, config, 0) != 0)
			continue;

		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0 || state > POOL_STATE_L2CACHE) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		if (state != POOL_STATE_SPARE &&
		    state != POOL_STATE_L2CACHE &&
		    (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
		    &txg) != 0 || txg == 0)) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		nlabels++;
	}

	/* Free the label storage */
	for (l = 0; l < VDEV_LABELS; l++)
		kmem_free(vdev_lists[l], size);

	return (nlabels);
}

static void
resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id)
{
	nvlist_t **new_configs;
	uint64_t i;

	if (id < *count)
		return;
	new_configs = kmem_zalloc((id + 1) * sizeof(nvlist_t *),
	    KM_SLEEP);
	for (i = 0; i < *count; i++)
		new_configs[i] = (*configs)[i];
	if (*configs != NULL)
		kmem_free(*configs, *count * sizeof(void *));
	*configs = new_configs;
	*count = id + 1;
}

static void
process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg,
    const char *name, uint64_t* known_pool_guid)
{
	nvlist_t *vdev_tree;
	uint64_t pool_guid;
	uint64_t vdev_guid;
	uint64_t id, txg, known_txg;
	char *pname;

	if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 ||
	    strcmp(pname, name) != 0)
		goto ignore;

	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
		goto ignore;

	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0)
		goto ignore;

	if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0)
		goto ignore;

	if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0)
		goto ignore;

	VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);

	if (*known_pool_guid != 0) {
		if (pool_guid != *known_pool_guid)
			goto ignore;
	} else
		*known_pool_guid = pool_guid;

	resize_configs(configs, count, id);

	if ((*configs)[id] != NULL) {
		VERIFY(nvlist_lookup_uint64((*configs)[id],
		    ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0);
		if (txg <= known_txg)
			goto ignore;
		nvlist_free((*configs)[id]);
	}

	(*configs)[id] = cfg;
	return;

ignore:
	nvlist_free(cfg);
}

int
vdev_geom_read_pool_label(const char *name,
    nvlist_t ***configs, uint64_t *count)
{
	struct g_class *mp;
	struct g_geom *gp;
	struct g_provider *pp;
	struct g_consumer *zcp;
	nvlist_t *vdev_cfg;
	uint64_t pool_guid;
	int nlabels;

	DROP_GIANT();
	g_topology_lock();

	*configs = NULL;
	*count = 0;
	pool_guid = 0;
	LIST_FOREACH(mp, &g_classes, class) {
		if (mp == &zfs_vdev_class)
			continue;
		LIST_FOREACH(gp, &mp->geom, geom) {
			if (gp->flags & G_GEOM_WITHER)
				continue;
			LIST_FOREACH(pp, &gp->provider, provider) {
				if (pp->flags & G_PF_WITHER)
					continue;
				zcp = vdev_geom_attach(pp, NULL, B_TRUE);
				if (zcp == NULL)
					continue;
				g_topology_unlock();
				nlabels = vdev_geom_read_config(zcp, &vdev_cfg);
				g_topology_lock();
				vdev_geom_detach(zcp, B_TRUE);
				if (nlabels == 0)
					continue;
				ZFS_LOG(1, "successfully read vdev config");

				process_vdev_config(configs, count,
				    vdev_cfg, name, &pool_guid);
			}
		}
	}
	g_topology_unlock();
	PICKUP_GIANT();

	return (*count > 0 ? 0 : ENOENT);
}

enum match {
	NO_MATCH = 0,		/* No matching labels found */
	TOPGUID_MATCH = 1,	/* Labels match top guid, not vdev guid */
	ZERO_MATCH = 1,		/* Should never be returned */
	ONE_MATCH = 2,		/* 1 label matches the vdev_guid */
	TWO_MATCH = 3,		/* 2 labels match the vdev_guid */
	THREE_MATCH = 4,	/* 3 labels match the vdev_guid */
	FULL_MATCH = 5		/* All labels match the vdev_guid */
};
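
/*
 * vdev_attach_ok() returns ZERO_MATCH + nlabels when the vdev guid matches,
 * so ONE_MATCH through FULL_MATCH also rank providers by how many of the
 * labels were readable and consistent.
 */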

static enum match
vdev_attach_ok(vdev_t *vd, struct g_provider *pp)
{
	nvlist_t *config;
	uint64_t pool_guid, top_guid, vdev_guid;
	struct g_consumer *cp;
	int nlabels;

	cp = vdev_geom_attach(pp, NULL, B_TRUE);
	if (cp == NULL) {
		ZFS_LOG(1, "Unable to attach tasting instance to %s.",
		    pp->name);
		return (NO_MATCH);
	}
	g_topology_unlock();
	nlabels = vdev_geom_read_config(cp, &config);
	g_topology_lock();
	vdev_geom_detach(cp, B_TRUE);
	if (nlabels == 0) {
		ZFS_LOG(1, "Unable to read config from %s.", pp->name);
		return (NO_MATCH);
	}

	pool_guid = 0;
	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid);
	top_guid = 0;
	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid);
	vdev_guid = 0;
	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
	nvlist_free(config);

	/*
	 * Check that the label's pool guid matches the desired guid.
	 * Inactive spares and L2ARCs do not have any pool guid in the label.
	 */
	if (pool_guid != 0 && pool_guid != spa_guid(vd->vdev_spa)) {
		ZFS_LOG(1, "pool guid mismatch for provider %s: %ju != %ju.",
		    pp->name,
		    (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)pool_guid);
		return (NO_MATCH);
	}

	/*
	 * Check that the label's vdev guid matches the desired guid.
	 * The second condition handles possible race on vdev detach, when
	 * remaining vdev receives GUID of destroyed top level mirror vdev.
	 */
	if (vdev_guid == vd->vdev_guid) {
		ZFS_LOG(1, "guids match for provider %s.", pp->name);
		return (ZERO_MATCH + nlabels);
	} else if (top_guid == vd->vdev_guid && vd == vd->vdev_top) {
		ZFS_LOG(1, "top vdev guid match for provider %s.", pp->name);
		return (TOPGUID_MATCH);
	}
	ZFS_LOG(1, "vdev guid mismatch for provider %s: %ju != %ju.",
	    pp->name, (uintmax_t)vd->vdev_guid, (uintmax_t)vdev_guid);
	return (NO_MATCH);
}

static struct g_consumer *
vdev_geom_attach_by_guids(vdev_t *vd)
{
	struct g_class *mp;
	struct g_geom *gp;
	struct g_provider *pp, *best_pp;
	struct g_consumer *cp;
	enum match match, best_match;

	g_topology_assert();

	cp = NULL;
	best_pp = NULL;
	best_match = NO_MATCH;
	LIST_FOREACH(mp, &g_classes, class) {
		if (mp == &zfs_vdev_class)
			continue;
		LIST_FOREACH(gp, &mp->geom, geom) {
			if (gp->flags & G_GEOM_WITHER)
				continue;
			LIST_FOREACH(pp, &gp->provider, provider) {
				match = vdev_attach_ok(vd, pp);
				if (match > best_match) {
					best_match = match;
					best_pp = pp;
				}
				if (match == FULL_MATCH)
					goto out;
			}
		}
	}

out:
	if (best_pp) {
		cp = vdev_geom_attach(best_pp, vd, B_TRUE);
		if (cp == NULL) {
			printf("ZFS WARNING: Unable to attach to %s.\n",
			    best_pp->name);
		}
	}
	return (cp);
}

static struct g_consumer *
vdev_geom_open_by_guids(vdev_t *vd)
{
	struct g_consumer *cp;
	char *buf;
	size_t len;

	g_topology_assert();

	ZFS_LOG(1, "Searching by guids [%ju:%ju].",
		(uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid);
	cp = vdev_geom_attach_by_guids(vd);
	if (cp != NULL) {
		len = strlen(cp->provider->name) + strlen("/dev/") + 1;
		buf = kmem_alloc(len, KM_SLEEP);

		snprintf(buf, len, "/dev/%s", cp->provider->name);
		spa_strfree(vd->vdev_path);
		vd->vdev_path = buf;

		ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.",
		    (uintmax_t)spa_guid(vd->vdev_spa),
		    (uintmax_t)vd->vdev_guid, cp->provider->name);
	} else {
		ZFS_LOG(1, "Search by guid [%ju:%ju] failed.",
		    (uintmax_t)spa_guid(vd->vdev_spa),
		    (uintmax_t)vd->vdev_guid);
	}

	return (cp);
}

static struct g_consumer *
vdev_geom_open_by_path(vdev_t *vd, int check_guid)
{
	struct g_provider *pp;
	struct g_consumer *cp;

	g_topology_assert();

	cp = NULL;
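	/* Strip the "/dev/" prefix; GEOM providers are named without it. */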
	pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1);
	if (pp != NULL) {
		ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
		if (!check_guid || vdev_attach_ok(vd, pp) == FULL_MATCH)
			cp = vdev_geom_attach(pp, vd, B_FALSE);
	}

	return (cp);
}

static int
vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	struct g_provider *pp;
	struct g_consumer *cp;
	int error;

	/* Set the TLS to indicate downstack that we should not access zvols */
	VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0);

	/*
	 * We must have a pathname, and it must be absolute.
	 */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (EINVAL);
	}

	/*
	 * Reopen the device if it's not currently open. Otherwise,
	 * just update the physical size of the device.
	 */
	if ((cp = vd->vdev_tsd) != NULL) {
		ASSERT(vd->vdev_reopening);
		goto skip_open;
	}

	DROP_GIANT();
	g_topology_lock();
	error = 0;

	if (vd->vdev_spa->spa_splitting_newspa ||
	    ((vd->vdev_prevstate == VDEV_STATE_UNKNOWN &&
	      vd->vdev_spa->spa_load_state == SPA_LOAD_NONE) ||
	     vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)) {
		/*
		 * We are dealing with a vdev that hasn't been previously
		 * opened (since boot), and we are not loading an
		 * existing pool configuration.  This looks like a
		 * vdev add operation to a new or existing pool.
		 * Assume the user knows what he/she is doing and find
		 * GEOM provider by its name, ignoring GUID mismatches.
		 *
		 * XXPOLICY: It would be safer to only allow a device
		 *           that is unlabeled or labeled but missing
		 *           GUID information to be opened in this fashion,
		 *           unless we are doing a split, in which case we
		 *           should allow any guid.
		 */
		cp = vdev_geom_open_by_path(vd, 0);
	} else {
		/*
		 * Try using the recorded path for this device, but only
		 * accept it if its label data contains the expected GUIDs.
		 */
		cp = vdev_geom_open_by_path(vd, 1);
		if (cp == NULL) {
			/*
			 * The device at vd->vdev_path doesn't have the
			 * expected GUIDs. The disks might have merely
			 * moved around so try all other GEOM providers
			 * to find one with the right GUIDs.
			 */
			cp = vdev_geom_open_by_guids(vd);
		}
	}

	/* Clear the TLS now that tasting is done */
	VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0);

	if (cp == NULL) {
		ZFS_LOG(1, "Vdev %s not found.", vd->vdev_path);
		error = ENOENT;
	} else {
		struct consumer_priv_t *priv;
		struct consumer_vdev_elem *elem;
		int spamode;

		priv = (struct consumer_priv_t*)&cp->private;
		if (cp->private == NULL)
			SLIST_INIT(priv);
		elem = g_malloc(sizeof(*elem), M_WAITOK|M_ZERO);
		elem->vd = vd;
		SLIST_INSERT_HEAD(priv, elem, elems);

		spamode = spa_mode(vd->vdev_spa);
		if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
		    !ISP2(cp->provider->sectorsize)) {
			ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
			    cp->provider->name);

			vdev_geom_close_locked(vd);
			error = EINVAL;
			cp = NULL;
		} else if (cp->acw == 0 && (spamode & FWRITE) != 0) {
			int i;

			for (i = 0; i < 5; i++) {
				error = g_access(cp, 0, 1, 0);
				if (error == 0)
					break;
				g_topology_unlock();
				tsleep(vd, 0, "vdev", hz / 2);
				g_topology_lock();
			}
			if (error != 0) {
				printf("ZFS WARNING: Unable to open %s for writing (error=%d).\n",
				    cp->provider->name, error);
				vdev_geom_close_locked(vd);
				cp = NULL;
			}
		}
	}

	/* Fetch initial physical path information for this device. */
	if (cp != NULL) {
		vdev_geom_attrchanged(cp, "GEOM::physpath");

		/* Set other GEOM characteristics */
		vdev_geom_set_physpath(vd, cp, /*do_null_update*/B_FALSE);
		vdev_geom_set_rotation_rate(vd, cp);
	}

	g_topology_unlock();
	PICKUP_GIANT();
	if (cp == NULL) {
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
		return (error);
	}
skip_open:
	pp = cp->provider;

	/*
	 * Determine the actual size of the device.
	 */
	*max_psize = *psize = pp->mediasize;

	/*
	 * Determine the device's minimum transfer size and preferred
	 * transfer size.
	 */
	*logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
	*physical_ashift = 0;
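	/*
	 * Example (informal): a provider reporting 512-byte sectors and a
	 * 4096-byte stripe yields logical_ashift 9 and physical_ashift 12.
	 */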
	if (pp->stripesize > (1 << *logical_ashift) && ISP2(pp->stripesize) &&
	    pp->stripesize <= (1 << SPA_MAXASHIFT) && pp->stripeoffset == 0)
		*physical_ashift = highbit(pp->stripesize) - 1;

	/*
	 * Clear the nowritecache settings, so that on a vdev_reopen()
	 * we will try again.
	 */
	vd->vdev_nowritecache = B_FALSE;

	return (0);
}

static void
vdev_geom_close(vdev_t *vd)
{
	struct g_consumer *cp;

	cp = vd->vdev_tsd;

	DROP_GIANT();
	g_topology_lock();

	if (!vd->vdev_reopening ||
	    (cp != NULL && ((cp->flags & G_CF_ORPHAN) != 0 ||
	    (cp->provider != NULL && cp->provider->error != 0))))
		vdev_geom_close_locked(vd);

	g_topology_unlock();
	PICKUP_GIANT();
}

static void
vdev_geom_io_intr(struct bio *bp)
{
	vdev_t *vd;
	zio_t *zio;

	zio = bp->bio_caller1;
	vd = zio->io_vd;
	zio->io_error = bp->bio_error;
	if (zio->io_error == 0 && bp->bio_resid != 0)
		zio->io_error = SET_ERROR(EIO);

	switch(zio->io_error) {
	case ENOTSUP:
		/*
		 * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know
		 * that future attempts will never succeed. In this case
		 * we set a persistent flag so that we don't bother with
		 * requests in the future.
		 */
		switch(bp->bio_cmd) {
		case BIO_FLUSH:
			vd->vdev_nowritecache = B_TRUE;
			break;
		case BIO_DELETE:
			vd->vdev_notrim = B_TRUE;
			break;
		}
		break;
	case ENXIO:
		if (!vd->vdev_remove_wanted) {
			/*
			 * If provider's error is set we assume it is being
			 * removed.
			 */
			if (bp->bio_to->error != 0) {
				vd->vdev_remove_wanted = B_TRUE;
				spa_async_request(zio->io_spa,
				    SPA_ASYNC_REMOVE);
			} else if (!vd->vdev_delayed_close) {
				vd->vdev_delayed_close = B_TRUE;
			}
		}
		break;
	}
	g_destroy_bio(bp);
	zio_delay_interrupt(zio);
}

static void
vdev_geom_io_start(zio_t *zio)
{
	vdev_t *vd;
	struct g_consumer *cp;
	struct bio *bp;

	vd = zio->io_vd;

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:
		/* XXPOLICY */
		if (!vdev_readable(vd)) {
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		} else {
			switch (zio->io_cmd) {
			case DKIOCFLUSHWRITECACHE:
				if (zfs_nocacheflush || vdev_geom_bio_flush_disable)
					break;
				if (vd->vdev_nowritecache) {
					zio->io_error = SET_ERROR(ENOTSUP);
					break;
				}
				goto sendreq;
			default:
				zio->io_error = SET_ERROR(ENOTSUP);
			}
		}

		zio_execute(zio);
		return;
	case ZIO_TYPE_FREE:
		if (vd->vdev_notrim) {
			zio->io_error = SET_ERROR(ENOTSUP);
		} else if (!vdev_geom_bio_delete_disable) {
			goto sendreq;
		}
		zio_execute(zio);
		return;
	}
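	/* ZIO_TYPE_READ and ZIO_TYPE_WRITE fall through to here. */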
sendreq:
	ASSERT(zio->io_type == ZIO_TYPE_READ ||
	    zio->io_type == ZIO_TYPE_WRITE ||
	    zio->io_type == ZIO_TYPE_FREE ||
	    zio->io_type == ZIO_TYPE_IOCTL);

	cp = vd->vdev_tsd;
	if (cp == NULL) {
		zio->io_error = SET_ERROR(ENXIO);
		zio_interrupt(zio);
		return;
	}
	bp = g_alloc_bio();
	bp->bio_caller1 = zio;
	switch (zio->io_type) {
	case ZIO_TYPE_READ:
	case ZIO_TYPE_WRITE:
		zio->io_target_timestamp = zio_handle_io_delay(zio);
		bp->bio_cmd = zio->io_type == ZIO_TYPE_READ ? BIO_READ : BIO_WRITE;
		bp->bio_data = zio->io_data;
		bp->bio_offset = zio->io_offset;
		bp->bio_length = zio->io_size;
		break;
	case ZIO_TYPE_FREE:
		bp->bio_cmd = BIO_DELETE;
		bp->bio_data = NULL;
		bp->bio_offset = zio->io_offset;
		bp->bio_length = zio->io_size;
		break;
	case ZIO_TYPE_IOCTL:
		bp->bio_cmd = BIO_FLUSH;
		bp->bio_flags |= BIO_ORDERED;
		bp->bio_data = NULL;
		bp->bio_offset = cp->provider->mediasize;
		bp->bio_length = 0;
		break;
	}
	bp->bio_done = vdev_geom_io_intr;

	g_io_request(bp, cp);
}

static void
vdev_geom_io_done(zio_t *zio)
{
}

static void
vdev_geom_hold(vdev_t *vd)
{
}

static void
vdev_geom_rele(vdev_t *vd)
{
}

vdev_ops_t vdev_geom_ops = {
	vdev_geom_open,
	vdev_geom_close,
	vdev_default_asize,
	vdev_geom_io_start,
	vdev_geom_io_done,
	NULL,			/* vdev_op_state_change */
	vdev_geom_hold,
	vdev_geom_rele,
	VDEV_TYPE_DISK,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};