vdev_geom.c revision 308057
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Portions Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>
 */

#include <sys/zfs_context.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/disk.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <geom/geom.h>
#include <geom/geom_int.h>

/*
 * Virtual device vector for GEOM.
 */

static g_attrchanged_t vdev_geom_attrchanged;
struct g_class zfs_vdev_class = {
	.name = "ZFS::VDEV",
	.version = G_VERSION,
	.attrchanged = vdev_geom_attrchanged,
};

DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);

SYSCTL_DECL(_vfs_zfs_vdev);
/* Don't send BIO_FLUSH. */
static int vdev_geom_bio_flush_disable = 0;
TUNABLE_INT("vfs.zfs.vdev.bio_flush_disable", &vdev_geom_bio_flush_disable);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RW,
    &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");
/* Don't send BIO_DELETE. */
static int vdev_geom_bio_delete_disable = 0;
TUNABLE_INT("vfs.zfs.vdev.bio_delete_disable", &vdev_geom_bio_delete_disable);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RW,
    &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");

/*
 * Thread-local storage used to indicate when a thread is probing geoms
 * for their guids.  If NULL, this thread is not tasting geoms.  If non-NULL,
 * it is looking for a replacement for the vdev_t* that is its value.
 */
uint_t zfs_geom_probe_vdev_key;

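/*
 * Query the provider's reported rotation rate and cache it in the vdev.
 * A failed GEOM::rotation_rate lookup leaves the rate marked unknown.
 */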
static void
vdev_geom_set_rotation_rate(vdev_t *vd, struct g_consumer *cp)
{
	int error;
	uint16_t rate;

	error = g_getattr("GEOM::rotation_rate", cp, &rate);
	if (error == 0)
		vd->vdev_rotation_rate = rate;
	else
		vd->vdev_rotation_rate = VDEV_RATE_UNKNOWN;
}

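/*
 * GEOM attribute-change callback.  Keeps the vdev's cached rotation rate
 * and physical path in sync with what the provider currently reports.
 */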
static void
vdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
{
	vdev_t *vd;
	spa_t *spa;
	char *physpath;
	int error, physpath_len;

	vd = cp->private;
	if (vd == NULL)
		return;

	if (strcmp(attr, "GEOM::rotation_rate") == 0) {
		vdev_geom_set_rotation_rate(vd, cp);
		return;
	}

	if (strcmp(attr, "GEOM::physpath") != 0)
		return;

	if (g_access(cp, 1, 0, 0) != 0)
		return;

	/*
	 * Record/Update physical path information for this device.
	 */
	spa = vd->vdev_spa;
	physpath_len = MAXPATHLEN;
	physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO);
	error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath);
	g_access(cp, -1, 0, 0);
	if (error == 0) {
		char *old_physpath;

		/* g_topology lock ensures that vdev has not been closed */
		g_topology_assert();
		old_physpath = vd->vdev_physpath;
		vd->vdev_physpath = spa_strdup(physpath);
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);

		if (old_physpath != NULL)
			spa_strfree(old_physpath);
	}
	g_free(physpath);
}

static void
vdev_geom_orphan(struct g_consumer *cp)
{
	vdev_t *vd;

	g_topology_assert();

	vd = cp->private;
	if (vd == NULL) {
		/* Vdev close in progress.  Ignore the event. */
		return;
	}

	/*
	 * Orphan callbacks occur from the GEOM event thread.
	 * Concurrent with this call, new I/O requests may be
	 * working their way through GEOM, only to discover
	 * (once executed by the g_down thread) that we've
	 * been orphaned from our disk provider.  These I/Os
	 * must be retired before we can detach our consumer.
	 * This is most easily achieved by acquiring the
	 * SPA ZIO configuration lock as a writer, but doing
	 * so with the GEOM topology lock held would cause
	 * a lock order reversal.  Instead, rely on the SPA's
	 * async removal support to invoke a close on this
	 * vdev once it is safe to do so.
	 */
	vd->vdev_remove_wanted = B_TRUE;
	spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
}

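/*
 * Attach a consumer for the given provider to the shared "zfs::vdev" geom,
 * creating that geom on first use.  On success the consumer holds read and
 * exclusive access (r1w0e1) and is cross-linked with the vdev.
 */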
static struct g_consumer *
vdev_geom_attach(struct g_provider *pp, vdev_t *vd)
{
	struct g_geom *gp;
	struct g_consumer *cp;
	int error;

	g_topology_assert();

	ZFS_LOG(1, "Attaching to %s.", pp->name);
	/* Do we already have a geom?  If not, create one. */
	LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
		if (gp->flags & G_GEOM_WITHER)
			continue;
		if (strcmp(gp->name, "zfs::vdev") != 0)
			continue;
		break;
	}
	if (gp == NULL) {
		gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
		gp->orphan = vdev_geom_orphan;
		gp->attrchanged = vdev_geom_attrchanged;
		cp = g_new_consumer(gp);
		error = g_attach(cp, pp);
		if (error != 0) {
			ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__,
			    __LINE__, error);
			g_wither_geom(gp, ENXIO);
			return (NULL);
		}
		error = g_access(cp, 1, 0, 1);
		if (error != 0) {
			ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__,
			    __LINE__, error);
			g_wither_geom(gp, ENXIO);
			return (NULL);
		}
		ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
	} else {
		/* Check if we are already connected to this provider. */
		LIST_FOREACH(cp, &gp->consumer, consumer) {
			if (cp->provider == pp) {
				ZFS_LOG(1, "Found consumer for %s.", pp->name);
				break;
			}
		}
		if (cp == NULL) {
			cp = g_new_consumer(gp);
			error = g_attach(cp, pp);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_attach failed: %d\n",
				    __func__, __LINE__, error);
				g_destroy_consumer(cp);
				return (NULL);
			}
			error = g_access(cp, 1, 0, 1);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
				    __func__, __LINE__, error);
				g_detach(cp);
				g_destroy_consumer(cp);
				return (NULL);
			}
			ZFS_LOG(1, "Created consumer for %s.", pp->name);
		} else {
			error = g_access(cp, 1, 0, 1);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
				    __func__, __LINE__, error);
				return (NULL);
			}
			ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
		}
	}

	/*
	 * BUG: cp may already belong to a vdev.  This could happen if:
	 * 1) That vdev is a shared spare, or
	 * 2) We are trying to reopen a missing vdev and we are scanning by
	 *    guid.  In that case, we'll ultimately fail to open this consumer,
	 *    but not until after setting the private field.
	 * The solution is to:
	 * 1) Don't set the private field until after the open succeeds, and
	 * 2) Set it to a linked list of vdevs, not just a single vdev
	 */
	cp->private = vd;
	vd->vdev_tsd = cp;

	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
	return (cp);
}

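/*
 * Undo vdev_geom_attach() with the topology lock already held: drop this
 * vdev's references, destroy the consumer on last close, and wither the
 * shared geom once it has no consumers left.
 */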
static void
vdev_geom_close_locked(vdev_t *vd)
{
	struct g_geom *gp;
	struct g_consumer *cp;

	g_topology_assert();

	cp = vd->vdev_tsd;
	if (cp == NULL)
		return;

	ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
	KASSERT(vd->vdev_tsd == cp, ("%s: vdev_tsd is not cp", __func__));
	vd->vdev_tsd = NULL;
	vd->vdev_delayed_close = B_FALSE;
	cp->private = NULL;

	gp = cp->geom;
	g_access(cp, -1, 0, -1);
	/* Destroy consumer on last close. */
	if (cp->acr == 0 && cp->ace == 0) {
		if (cp->acw > 0)
			g_access(cp, 0, -cp->acw, 0);
		if (cp->provider != NULL) {
			ZFS_LOG(1, "Destroyed consumer to %s.",
			    cp->provider->name);
			g_detach(cp);
		}
		g_destroy_consumer(cp);
	}
	/* Destroy geom if there are no consumers left. */
	if (LIST_EMPTY(&gp->consumer)) {
		ZFS_LOG(1, "Destroyed geom %s.", gp->name);
		g_wither_geom(gp, ENXIO);
	}
}

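/*
 * Extract the pool and vdev guids from an unpacked label config.  Lookups
 * that fail leave the caller's values untouched, so callers pre-zero them.
 */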
static void
nvlist_get_guids(nvlist_t *list, uint64_t *pguid, uint64_t *vguid)
{

	(void) nvlist_lookup_uint64(list, ZPOOL_CONFIG_GUID, vguid);
	(void) nvlist_lookup_uint64(list, ZPOOL_CONFIG_POOL_GUID, pguid);
}

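/*
 * Synchronously issue cmd (BIO_READ or BIO_WRITE) to the consumer, splitting
 * the transfer into chunks no larger than MAXPHYS, rounded down to a
 * multiple of the provider's sector size.
 */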
static int
vdev_geom_io(struct g_consumer *cp, int cmd, void *data, off_t offset, off_t size)
{
	struct bio *bp;
	u_char *p;
	off_t off, maxio;
	int error;

	ASSERT((offset % cp->provider->sectorsize) == 0);
	ASSERT((size % cp->provider->sectorsize) == 0);

	bp = g_alloc_bio();
	off = offset;
	offset += size;
	p = data;
	maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize);
	error = 0;

	for (; off < offset; off += maxio, p += maxio, size -= maxio) {
		bzero(bp, sizeof(*bp));
		bp->bio_cmd = cmd;
		bp->bio_done = NULL;
		bp->bio_offset = off;
		bp->bio_length = MIN(size, maxio);
		bp->bio_data = p;
		g_io_request(bp, cp);
		error = biowait(bp, "vdev_geom_io");
		if (error != 0)
			break;
	}

	g_destroy_bio(bp);
	return (error);
}

static void
vdev_geom_taste_orphan(struct g_consumer *cp)
{
	ZFS_LOG(0, "WARNING: Orphan %s while tasting its VDev GUID.",
	    cp->provider->name);
}

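/*
 * Read the vdev config from the first label whose nvlist unpacks cleanly
 * and describes an active pool or a spare/L2ARC device.  Returns ENOENT
 * if none of the VDEV_LABELS labels qualifies.
 */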
static int
vdev_geom_read_config(struct g_consumer *cp, nvlist_t **config)
{
	struct g_provider *pp;
	vdev_label_t *label;
	char *p, *buf;
	size_t buflen;
	uint64_t psize;
	off_t offset, size;
	uint64_t state, txg;
	int error, l, len;

	g_topology_assert_not();

	pp = cp->provider;
	ZFS_LOG(1, "Reading config from %s...", pp->name);

	psize = pp->mediasize;
	psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t));

	size = sizeof(*label) + pp->sectorsize -
	    ((sizeof(*label) - 1) % pp->sectorsize) - 1;

	label = kmem_alloc(size, KM_SLEEP);
	buflen = sizeof(label->vl_vdev_phys.vp_nvlist);

	*config = NULL;
	for (l = 0; l < VDEV_LABELS; l++) {

		offset = vdev_label_offset(psize, l, 0);
		if ((offset % pp->sectorsize) != 0)
			continue;

		if (vdev_geom_io(cp, BIO_READ, label, offset, size) != 0)
			continue;
		buf = label->vl_vdev_phys.vp_nvlist;

		if (nvlist_unpack(buf, buflen, config, 0) != 0)
			continue;

		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0 || state > POOL_STATE_L2CACHE) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
		    (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
		    &txg) != 0 || txg == 0)) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		break;
	}

	kmem_free(label, size);
	return (*config == NULL ? ENOENT : 0);
}

static void
resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id)
{
	nvlist_t **new_configs;
	uint64_t i;

	if (id < *count)
		return;
	new_configs = kmem_zalloc((id + 1) * sizeof(nvlist_t *),
	    KM_SLEEP);
	for (i = 0; i < *count; i++)
		new_configs[i] = (*configs)[i];
	if (*configs != NULL)
		kmem_free(*configs, *count * sizeof(void *));
	*configs = new_configs;
	*count = id + 1;
}

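/*
 * Slot cfg into the configs array, indexed by top-level vdev id, if it
 * names the requested pool and carries a newer txg than any config already
 * recorded for that id.  Configs that don't qualify are freed.
 */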
static void
process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg,
    const char *name, uint64_t *known_pool_guid)
{
	nvlist_t *vdev_tree;
	uint64_t pool_guid;
	uint64_t vdev_guid, known_guid;
	uint64_t id, txg, known_txg;
	char *pname;
	int i;

	if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 ||
	    strcmp(pname, name) != 0)
		goto ignore;

	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
		goto ignore;

	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0)
		goto ignore;

	if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0)
		goto ignore;

	if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0)
		goto ignore;

	VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);

	if (*known_pool_guid != 0) {
		if (pool_guid != *known_pool_guid)
			goto ignore;
	} else
		*known_pool_guid = pool_guid;

	resize_configs(configs, count, id);

	if ((*configs)[id] != NULL) {
		VERIFY(nvlist_lookup_uint64((*configs)[id],
		    ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0);
		if (txg <= known_txg)
			goto ignore;
		nvlist_free((*configs)[id]);
	}

	(*configs)[id] = cfg;
	return;

ignore:
	nvlist_free(cfg);
}

static int
vdev_geom_attach_taster(struct g_consumer *cp, struct g_provider *pp)
{
	int error;

	if (pp->flags & G_PF_WITHER)
		return (EINVAL);
	g_attach(cp, pp);
	error = g_access(cp, 1, 0, 0);
	if (error == 0) {
		if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize))
			error = EINVAL;
		else if (pp->mediasize < SPA_MINDEVSIZE)
			error = EINVAL;
		if (error != 0)
			g_access(cp, -1, 0, 0);
	}
	if (error != 0)
		g_detach(cp);
	return (error);
}

static void
vdev_geom_detach_taster(struct g_consumer *cp)
{
	g_access(cp, -1, 0, 0);
	g_detach(cp);
}

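/*
 * Walk every provider of every GEOM class (other than our own) and collect
 * the best label config for each top-level vdev of the named pool.  The
 * result is an array indexed by vdev id; absent vdevs leave NULL holes.
 */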
int
vdev_geom_read_pool_label(const char *name,
    nvlist_t ***configs, uint64_t *count)
{
	struct g_class *mp;
	struct g_geom *gp, *zgp;
	struct g_provider *pp;
	struct g_consumer *zcp;
	nvlist_t *vdev_cfg;
	uint64_t pool_guid;
	int error;

	DROP_GIANT();
	g_topology_lock();

	zgp = g_new_geomf(&zfs_vdev_class, "zfs::vdev::taste");
	/* This orphan function should never be called. */
	zgp->orphan = vdev_geom_taste_orphan;
	zcp = g_new_consumer(zgp);

	*configs = NULL;
	*count = 0;
	pool_guid = 0;
	LIST_FOREACH(mp, &g_classes, class) {
		if (mp == &zfs_vdev_class)
			continue;
		LIST_FOREACH(gp, &mp->geom, geom) {
			if (gp->flags & G_GEOM_WITHER)
				continue;
			LIST_FOREACH(pp, &gp->provider, provider) {
				if (pp->flags & G_PF_WITHER)
					continue;
				if (vdev_geom_attach_taster(zcp, pp) != 0)
					continue;
				g_topology_unlock();
				error = vdev_geom_read_config(zcp, &vdev_cfg);
				g_topology_lock();
				vdev_geom_detach_taster(zcp);
				if (error)
					continue;
				ZFS_LOG(1, "successfully read vdev config");

				process_vdev_config(configs, count,
				    vdev_cfg, name, &pool_guid);
			}
		}
	}

	g_destroy_consumer(zcp);
	g_destroy_geom(zgp);
	g_topology_unlock();
	PICKUP_GIANT();

	return (*count > 0 ? 0 : ENOENT);
}

static void
vdev_geom_read_guids(struct g_consumer *cp, uint64_t *pguid, uint64_t *vguid)
{
	nvlist_t *config;

	g_topology_assert_not();

	*pguid = 0;
	*vguid = 0;
	if (vdev_geom_read_config(cp, &config) == 0) {
		nvlist_get_guids(config, pguid, vguid);
		nvlist_free(config);
	}
}

static struct g_consumer *
vdev_geom_attach_by_guids(vdev_t *vd)
{
	struct g_class *mp;
	struct g_geom *gp, *zgp;
	struct g_provider *pp;
	struct g_consumer *cp, *zcp;
	uint64_t pguid, vguid;

	g_topology_assert();

	zgp = g_new_geomf(&zfs_vdev_class, "zfs::vdev::taste");
	zgp->orphan = vdev_geom_taste_orphan;
	zcp = g_new_consumer(zgp);

	cp = NULL;
	LIST_FOREACH(mp, &g_classes, class) {
		if (mp == &zfs_vdev_class)
			continue;
		LIST_FOREACH(gp, &mp->geom, geom) {
			if (gp->flags & G_GEOM_WITHER)
				continue;
			LIST_FOREACH(pp, &gp->provider, provider) {
				if (vdev_geom_attach_taster(zcp, pp) != 0)
					continue;
				g_topology_unlock();
				vdev_geom_read_guids(zcp, &pguid, &vguid);
				g_topology_lock();
				vdev_geom_detach_taster(zcp);
				/*
				 * Check that the label's vdev guid matches the
				 * desired guid.  If the label has a pool guid,
				 * check that it matches too.  (Inactive spares
				 * and L2ARCs do not have any pool guid in the
				 * label.)
				 */
				if ((pguid != 0 &&
				     pguid != spa_guid(vd->vdev_spa)) ||
				    vguid != vd->vdev_guid)
					continue;
				cp = vdev_geom_attach(pp, vd);
				if (cp == NULL) {
					printf("ZFS WARNING: Unable to "
					    "attach to %s.\n", pp->name);
					continue;
				}
				break;
			}
			if (cp != NULL)
				break;
		}
		if (cp != NULL)
			break;
	}
	g_destroy_consumer(zcp);
	g_destroy_geom(zgp);
	return (cp);
}

static struct g_consumer *
vdev_geom_open_by_guids(vdev_t *vd)
{
	struct g_consumer *cp;
	char *buf;
	size_t len;

	g_topology_assert();

	ZFS_LOG(1, "Searching by guids [%ju:%ju].",
		(uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid);
	cp = vdev_geom_attach_by_guids(vd);
	if (cp != NULL) {
		len = strlen(cp->provider->name) + strlen("/dev/") + 1;
		buf = kmem_alloc(len, KM_SLEEP);

		snprintf(buf, len, "/dev/%s", cp->provider->name);
		spa_strfree(vd->vdev_path);
		vd->vdev_path = buf;

		ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.",
		    (uintmax_t)spa_guid(vd->vdev_spa),
		    (uintmax_t)vd->vdev_guid, vd->vdev_path);
	} else {
		ZFS_LOG(1, "Search by guid [%ju:%ju] failed.",
		    (uintmax_t)spa_guid(vd->vdev_spa),
		    (uintmax_t)vd->vdev_guid);
	}

	return (cp);
}

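/*
 * Open the provider named by vd->vdev_path.  With check_guid set, taste the
 * label and reject the device unless its guids match the vdev, mirroring
 * the checks done in vdev_geom_attach_by_guids().
 */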
static struct g_consumer *
vdev_geom_open_by_path(vdev_t *vd, int check_guid)
{
	struct g_provider *pp;
	struct g_consumer *cp;
	uint64_t pguid, vguid;

	g_topology_assert();

	cp = NULL;
	pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1);
	if (pp != NULL) {
		ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
		cp = vdev_geom_attach(pp, vd);
		if (cp != NULL && check_guid && ISP2(pp->sectorsize) &&
		    pp->sectorsize <= VDEV_PAD_SIZE) {
			g_topology_unlock();
			vdev_geom_read_guids(cp, &pguid, &vguid);
			g_topology_lock();
			/*
			 * Check that the label's vdev guid matches the
			 * desired guid.  If the label has a pool guid,
			 * check that it matches too.  (Inactive spares
			 * and L2ARCs do not have any pool guid in the
			 * label.)
			 */
			if ((pguid != 0 &&
			    pguid != spa_guid(vd->vdev_spa)) ||
			    vguid != vd->vdev_guid) {
				vdev_geom_close_locked(vd);
				cp = NULL;
				ZFS_LOG(1, "guid mismatch for provider %s: "
				    "%ju:%ju != %ju:%ju.", vd->vdev_path,
				    (uintmax_t)spa_guid(vd->vdev_spa),
				    (uintmax_t)vd->vdev_guid,
				    (uintmax_t)pguid, (uintmax_t)vguid);
			} else {
				ZFS_LOG(1, "guid match for provider %s.",
				    vd->vdev_path);
			}
		}
	}

	return (cp);
}

static int
vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	struct g_provider *pp;
	struct g_consumer *cp;
	size_t bufsize;
	int error;
	/*
	 * Set the TLS to indicate to code down the stack that we
	 * should not access zvols.
	 */
	VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0);

	/*
	 * We must have a pathname, and it must be absolute.
	 */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (EINVAL);
	}

	vd->vdev_tsd = NULL;

	DROP_GIANT();
	g_topology_lock();
	error = 0;

	if (vd->vdev_spa->spa_splitting_newspa ||
	    (vd->vdev_prevstate == VDEV_STATE_UNKNOWN &&
	     vd->vdev_spa->spa_load_state == SPA_LOAD_NONE ||
	     vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)) {
		/*
		 * We are dealing with a vdev that hasn't been previously
		 * opened (since boot), and we are not loading an
		 * existing pool configuration.  This looks like a
		 * vdev add operation to a new or existing pool.
		 * Assume the user knows what he/she is doing and find
		 * the GEOM provider by its name, ignoring GUID mismatches.
		 *
		 * XXPOLICY: It would be safer to only allow a device
		 *           that is unlabeled or labeled but missing
		 *           GUID information to be opened in this fashion,
		 *           unless we are doing a split, in which case we
		 *           should allow any guid.
		 */
		cp = vdev_geom_open_by_path(vd, 0);
	} else {
		/*
		 * Try using the recorded path for this device, but only
		 * accept it if its label data contains the expected GUIDs.
		 */
		cp = vdev_geom_open_by_path(vd, 1);
		if (cp == NULL) {
			/*
			 * The device at vd->vdev_path doesn't have the
			 * expected GUIDs.  The disks might have merely
			 * moved around, so try all other GEOM providers
			 * to find one with the right GUIDs.
			 */
			cp = vdev_geom_open_by_guids(vd);
		}
	}

	/* Clear the TLS now that tasting is done. */
	VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0);

	if (cp == NULL) {
		ZFS_LOG(1, "Provider %s not found.", vd->vdev_path);
		error = ENOENT;
	} else if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
	    !ISP2(cp->provider->sectorsize)) {
		ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
		    vd->vdev_path);

		vdev_geom_close_locked(vd);
		error = EINVAL;
		cp = NULL;
	} else if (cp->acw == 0 && (spa_mode(vd->vdev_spa) & FWRITE) != 0) {
		int i;

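		/*
		 * Opening for write may fail transiently, e.g. while the
		 * tasting code still holds the provider open.  Retry a few
		 * times (about 2.5 seconds total) before giving up.
		 */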
		for (i = 0; i < 5; i++) {
			error = g_access(cp, 0, 1, 0);
			if (error == 0)
				break;
			g_topology_unlock();
			tsleep(vd, 0, "vdev", hz / 2);
			g_topology_lock();
		}
		if (error != 0) {
			printf("ZFS WARNING: Unable to open %s for writing "
			    "(error=%d).\n", vd->vdev_path, error);
			vdev_geom_close_locked(vd);
			cp = NULL;
		}
	}

	/* Fetch initial physical path information for this device. */
	if (cp != NULL)
		vdev_geom_attrchanged(cp, "GEOM::physpath");

	g_topology_unlock();
	PICKUP_GIANT();
	if (cp == NULL) {
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
		return (error);
	}
	pp = cp->provider;

	/*
	 * Determine the actual size of the device.
	 */
	*max_psize = *psize = pp->mediasize;

	/*
	 * Determine the device's minimum transfer size and preferred
	 * transfer size.
	 */
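	/*
	 * For example, a 512e drive (512-byte logical, 4 KB physical
	 * sectors) reporting stripesize 4096 yields logical_ashift 9
	 * and physical_ashift 12.
	 */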
	*logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
	*physical_ashift = 0;
	if (pp->stripesize > (1 << *logical_ashift) && ISP2(pp->stripesize) &&
	    pp->stripesize <= (1 << SPA_MAXASHIFT) && pp->stripeoffset == 0)
		*physical_ashift = highbit(pp->stripesize) - 1;

	/*
	 * Clear the nowritecache settings, so that on a vdev_reopen()
	 * we will try again.
	 */
	vd->vdev_nowritecache = B_FALSE;

	/*
	 * Determine the device's rotation rate.
	 */
	vdev_geom_set_rotation_rate(vd, cp);

	return (0);
}

static void
vdev_geom_close(vdev_t *vd)
{

	DROP_GIANT();
	g_topology_lock();
	vdev_geom_close_locked(vd);
	g_topology_unlock();
	PICKUP_GIANT();
}

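/*
 * I/O completion callback.  Translate the bio's status into zio status and
 * latch persistent "don't try again" state for unsupported flush/trim
 * (ENOTSUP) and media removal (ENXIO).
 */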
static void
vdev_geom_io_intr(struct bio *bp)
{
	vdev_t *vd;
	zio_t *zio;

	zio = bp->bio_caller1;
	vd = zio->io_vd;
	zio->io_error = bp->bio_error;
	if (zio->io_error == 0 && bp->bio_resid != 0)
		zio->io_error = SET_ERROR(EIO);

	switch (zio->io_error) {
	case ENOTSUP:
		/*
		 * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know
		 * that future attempts will never succeed.  In this case
		 * we set a persistent flag so that we don't bother with
		 * requests in the future.
		 */
		switch (bp->bio_cmd) {
		case BIO_FLUSH:
			vd->vdev_nowritecache = B_TRUE;
			break;
		case BIO_DELETE:
			vd->vdev_notrim = B_TRUE;
			break;
		}
		break;
	case ENXIO:
		if (!vd->vdev_remove_wanted) {
			/*
			 * If the provider's error is set we assume it is
			 * being removed.
			 */
			if (bp->bio_to->error != 0) {
				vd->vdev_remove_wanted = B_TRUE;
				spa_async_request(zio->io_spa,
				    SPA_ASYNC_REMOVE);
			} else if (!vd->vdev_delayed_close) {
				vd->vdev_delayed_close = B_TRUE;
			}
		}
		break;
	}
	g_destroy_bio(bp);
	zio_delay_interrupt(zio);
}

static void
vdev_geom_io_start(zio_t *zio)
{
	vdev_t *vd;
	struct g_consumer *cp;
	struct bio *bp;
	int error;

	vd = zio->io_vd;

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:
		/* XXPOLICY */
		if (!vdev_readable(vd)) {
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		} else {
			switch (zio->io_cmd) {
			case DKIOCFLUSHWRITECACHE:
				if (zfs_nocacheflush ||
				    vdev_geom_bio_flush_disable)
					break;
				if (vd->vdev_nowritecache) {
					zio->io_error = SET_ERROR(ENOTSUP);
					break;
				}
				goto sendreq;
			default:
				zio->io_error = SET_ERROR(ENOTSUP);
			}
		}

		zio_execute(zio);
		return;
	case ZIO_TYPE_FREE:
		if (vd->vdev_notrim) {
			zio->io_error = SET_ERROR(ENOTSUP);
		} else if (!vdev_geom_bio_delete_disable) {
			goto sendreq;
		}
		zio_execute(zio);
		return;
	}
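	/* READ and WRITE fall through the switch above and are issued here. */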
sendreq:
	ASSERT(zio->io_type == ZIO_TYPE_READ ||
	    zio->io_type == ZIO_TYPE_WRITE ||
	    zio->io_type == ZIO_TYPE_FREE ||
	    zio->io_type == ZIO_TYPE_IOCTL);

	cp = vd->vdev_tsd;
	if (cp == NULL) {
		zio->io_error = SET_ERROR(ENXIO);
		zio_interrupt(zio);
		return;
	}
	bp = g_alloc_bio();
	bp->bio_caller1 = zio;
	switch (zio->io_type) {
	case ZIO_TYPE_READ:
	case ZIO_TYPE_WRITE:
		zio->io_target_timestamp = zio_handle_io_delay(zio);
		bp->bio_cmd = zio->io_type == ZIO_TYPE_READ ? BIO_READ : BIO_WRITE;
		bp->bio_data = zio->io_data;
		bp->bio_offset = zio->io_offset;
		bp->bio_length = zio->io_size;
		break;
	case ZIO_TYPE_FREE:
		bp->bio_cmd = BIO_DELETE;
		bp->bio_data = NULL;
		bp->bio_offset = zio->io_offset;
		bp->bio_length = zio->io_size;
		break;
	case ZIO_TYPE_IOCTL:
		bp->bio_cmd = BIO_FLUSH;
		bp->bio_flags |= BIO_ORDERED;
		bp->bio_data = NULL;
		bp->bio_offset = cp->provider->mediasize;
		bp->bio_length = 0;
		break;
	}
	bp->bio_done = vdev_geom_io_intr;

	g_io_request(bp, cp);
}

static void
vdev_geom_io_done(zio_t *zio)
{
}

static void
vdev_geom_hold(vdev_t *vd)
{
}

static void
vdev_geom_rele(vdev_t *vd)
{
}

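/*
 * Positional initializer: field order must match vdev_ops_t.  The NULL
 * entry is the optional state-change callback (vdev_op_state_change),
 * which GEOM vdevs don't need.
 */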
vdev_ops_t vdev_geom_ops = {
	vdev_geom_open,
	vdev_geom_close,
	vdev_default_asize,
	vdev_geom_io_start,
	vdev_geom_io_done,
	NULL,
	vdev_geom_hold,
	vdev_geom_rele,
	VDEV_TYPE_DISK,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};