vdev_geom.c revision 299536
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Portions Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>
 */

#include <sys/zfs_context.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/disk.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <geom/geom.h>
#include <geom/geom_int.h>

/*
 * Virtual device vector for GEOM.
 */

static g_attrchanged_t vdev_geom_attrchanged;
struct g_class zfs_vdev_class = {
	.name = "ZFS::VDEV",
	.version = G_VERSION,
	.attrchanged = vdev_geom_attrchanged,
};

DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);

SYSCTL_DECL(_vfs_zfs_vdev);
/* Don't send BIO_FLUSH. */
static int vdev_geom_bio_flush_disable = 0;
TUNABLE_INT("vfs.zfs.vdev.bio_flush_disable", &vdev_geom_bio_flush_disable);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RW,
    &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");
/* Don't send BIO_DELETE. */
static int vdev_geom_bio_delete_disable = 0;
TUNABLE_INT("vfs.zfs.vdev.bio_delete_disable", &vdev_geom_bio_delete_disable);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RW,
    &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");

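/*
 * Fetch the provider's GEOM::rotation_rate attribute and record it in the
 * vdev, falling back to VDEV_RATE_UNKNOWN if the attribute is unavailable.
 */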
static void
vdev_geom_set_rotation_rate(vdev_t *vd, struct g_consumer *cp)
{
	int error;
	uint16_t rate;

	error = g_getattr("GEOM::rotation_rate", cp, &rate);
	if (error == 0)
		vd->vdev_rotation_rate = rate;
	else
		vd->vdev_rotation_rate = VDEV_RATE_UNKNOWN;
}

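/*
 * GEOM attribute-change callback.  Updates the vdev's cached rotation rate
 * when GEOM::rotation_rate changes and refreshes the recorded physical
 * path when GEOM::physpath changes.  Does nothing if the consumer has no
 * associated vdev (close in progress).
 */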
static void
vdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
{
	vdev_t *vd;
	spa_t *spa;
	char *physpath;
	int error, physpath_len;

	vd = cp->private;
	if (vd == NULL)
		return;

	if (strcmp(attr, "GEOM::rotation_rate") == 0) {
		vdev_geom_set_rotation_rate(vd, cp);
		return;
	}

	if (strcmp(attr, "GEOM::physpath") != 0)
		return;

	if (g_access(cp, 1, 0, 0) != 0)
		return;

	/*
	 * Record/Update physical path information for this device.
	 */
	spa = vd->vdev_spa;
	physpath_len = MAXPATHLEN;
	physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO);
	error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath);
	g_access(cp, -1, 0, 0);
	if (error == 0) {
		char *old_physpath;

		/* g_topology lock ensures that vdev has not been closed */
		g_topology_assert();
		old_physpath = vd->vdev_physpath;
		vd->vdev_physpath = spa_strdup(physpath);
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);

		if (old_physpath != NULL)
			spa_strfree(old_physpath);
	}
	g_free(physpath);
}

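/*
 * GEOM orphan callback, invoked from the GEOM event thread when the
 * underlying provider has gone away.
 */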
static void
vdev_geom_orphan(struct g_consumer *cp)
{
	vdev_t *vd;

	g_topology_assert();

	vd = cp->private;
	if (vd == NULL) {
		/* Vdev close in progress.  Ignore the event. */
		return;
	}

	/*
	 * Orphan callbacks occur from the GEOM event thread.  Concurrent
	 * with this call, new I/O requests may still be working their way
	 * through GEOM; they will only discover, once executed by the
	 * g_down thread, that we have been orphaned from our disk
	 * provider.  These I/Os must be retired before we can detach our
	 * consumer.  That is most easily achieved by acquiring the SPA
	 * ZIO configuration lock as a writer, but doing so with the GEOM
	 * topology lock held would cause a lock order reversal.  Instead,
	 * rely on the SPA's async removal support to invoke a close on
	 * this vdev once it is safe to do so.
	 */
	vd->vdev_remove_wanted = B_TRUE;
	spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
}

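/*
 * Attach the given provider to the "zfs::vdev" geom, creating the geom
 * and/or consumer as needed, and open it with read and exclusive access.
 * Returns the consumer on success, or NULL on failure.
 */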
static struct g_consumer *
vdev_geom_attach(struct g_provider *pp, vdev_t *vd)
{
	struct g_geom *gp;
	struct g_consumer *cp;
	int error;

	g_topology_assert();

	ZFS_LOG(1, "Attaching to %s.", pp->name);
	/* Do we have geom already? No? Create one. */
	LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
		if (gp->flags & G_GEOM_WITHER)
			continue;
		if (strcmp(gp->name, "zfs::vdev") != 0)
			continue;
		break;
	}
	if (gp == NULL) {
		gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
		gp->orphan = vdev_geom_orphan;
		gp->attrchanged = vdev_geom_attrchanged;
		cp = g_new_consumer(gp);
		error = g_attach(cp, pp);
		if (error != 0) {
			ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__,
			    __LINE__, error);
			g_wither_geom(gp, ENXIO);
			return (NULL);
		}
		error = g_access(cp, 1, 0, 1);
		if (error != 0) {
			ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__,
			    __LINE__, error);
			g_wither_geom(gp, ENXIO);
			return (NULL);
		}
		ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
	} else {
		/* Check if we are already connected to this provider. */
		LIST_FOREACH(cp, &gp->consumer, consumer) {
			if (cp->provider == pp) {
				ZFS_LOG(1, "Found consumer for %s.", pp->name);
				break;
			}
		}
		if (cp == NULL) {
			cp = g_new_consumer(gp);
			error = g_attach(cp, pp);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_attach failed: %d\n",
				    __func__, __LINE__, error);
				g_destroy_consumer(cp);
				return (NULL);
			}
			error = g_access(cp, 1, 0, 1);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
				    __func__, __LINE__, error);
				g_detach(cp);
				g_destroy_consumer(cp);
				return (NULL);
			}
			ZFS_LOG(1, "Created consumer for %s.", pp->name);
		} else {
			error = g_access(cp, 1, 0, 1);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
				    __func__, __LINE__, error);
				return (NULL);
			}
			ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
		}
	}

	/*
	 * BUG: cp may already belong to a vdev.  This could happen if:
	 * 1) That vdev is a shared spare, or
	 * 2) We are trying to reopen a missing vdev and we are scanning by
	 *    guid.  In that case, we'll ultimately fail to open this consumer,
	 *    but not until after setting the private field.
	 * The solution is to:
	 * 1) Don't set the private field until after the open succeeds, and
	 * 2) Set it to a linked list of vdevs, not just a single vdev
	 */
	cp->private = vd;
	vd->vdev_tsd = cp;

	/* Fetch initial physical path information for this device. */
	vdev_geom_attrchanged(cp, "GEOM::physpath");

	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
	return (cp);
}

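/*
 * Drop this vdev's reference on its consumer, destroying the consumer on
 * last close and withering the geom once no consumers remain.  The
 * topology lock must be held.
 */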
static void
vdev_geom_close_locked(vdev_t *vd)
{
	struct g_geom *gp;
	struct g_consumer *cp;

	g_topology_assert();

	cp = vd->vdev_tsd;
	if (cp == NULL)
		return;

	ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
	KASSERT(vd->vdev_tsd == cp, ("%s: vdev_tsd is not cp", __func__));
	vd->vdev_tsd = NULL;
	vd->vdev_delayed_close = B_FALSE;
	cp->private = NULL;

	gp = cp->geom;
	g_access(cp, -1, 0, -1);
	/* Destroy consumer on last close. */
	if (cp->acr == 0 && cp->ace == 0) {
		if (cp->acw > 0)
			g_access(cp, 0, -cp->acw, 0);
		if (cp->provider != NULL) {
			ZFS_LOG(1, "Destroyed consumer to %s.",
			    cp->provider->name);
			g_detach(cp);
		}
		g_destroy_consumer(cp);
	}
	/* Destroy geom if there are no consumers left. */
	if (LIST_EMPTY(&gp->consumer)) {
		ZFS_LOG(1, "Destroyed geom %s.", gp->name);
		g_wither_geom(gp, ENXIO);
	}
}

static void
nvlist_get_guids(nvlist_t *list, uint64_t *pguid, uint64_t *vguid)
{

	(void) nvlist_lookup_uint64(list, ZPOOL_CONFIG_GUID, vguid);
	(void) nvlist_lookup_uint64(list, ZPOOL_CONFIG_POOL_GUID, pguid);
}

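/*
 * Synchronously issue one or more bios covering the given range,
 * splitting the request into chunks no larger than MAXPHYS rounded down
 * to the provider's sector size.  Returns 0, or the first error
 * encountered.
 */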
static int
vdev_geom_io(struct g_consumer *cp, int cmd, void *data, off_t offset, off_t size)
{
	struct bio *bp;
	u_char *p;
	off_t off, maxio;
	int error;

	ASSERT((offset % cp->provider->sectorsize) == 0);
	ASSERT((size % cp->provider->sectorsize) == 0);

	bp = g_alloc_bio();
	off = offset;
	offset += size;
	p = data;
	maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize);
	error = 0;

	for (; off < offset; off += maxio, p += maxio, size -= maxio) {
		bzero(bp, sizeof(*bp));
		bp->bio_cmd = cmd;
		bp->bio_done = NULL;
		bp->bio_offset = off;
		bp->bio_length = MIN(size, maxio);
		bp->bio_data = p;
		g_io_request(bp, cp);
		error = biowait(bp, "vdev_geom_io");
		if (error != 0)
			break;
	}

	g_destroy_bio(bp);
	return (error);
}

static void
vdev_geom_taste_orphan(struct g_consumer *cp)
{

	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
	    cp->provider->name));
}

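/*
 * Read the provider's vdev labels and unpack the first config nvlist that
 * passes sanity checks on the pool state and txg.  Returns 0 with *config
 * set, or ENOENT if no usable label is found.
 */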
static int
vdev_geom_read_config(struct g_consumer *cp, nvlist_t **config)
{
	struct g_provider *pp;
	vdev_label_t *label;
	char *p, *buf;
	size_t buflen;
	uint64_t psize;
	off_t offset, size;
	uint64_t state, txg;
	int error, l, len;

	g_topology_assert_not();

	pp = cp->provider;
	ZFS_LOG(1, "Reading config from %s...", pp->name);

	psize = pp->mediasize;
	psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t));

	size = sizeof(*label) + pp->sectorsize -
	    ((sizeof(*label) - 1) % pp->sectorsize) - 1;

	label = kmem_alloc(size, KM_SLEEP);
	buflen = sizeof(label->vl_vdev_phys.vp_nvlist);

	*config = NULL;
	for (l = 0; l < VDEV_LABELS; l++) {

		offset = vdev_label_offset(psize, l, 0);
		if ((offset % pp->sectorsize) != 0)
			continue;

		if (vdev_geom_io(cp, BIO_READ, label, offset, size) != 0)
			continue;
		buf = label->vl_vdev_phys.vp_nvlist;

		if (nvlist_unpack(buf, buflen, config, 0) != 0)
			continue;

		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0 || state > POOL_STATE_L2CACHE) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
		    (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
		    &txg) != 0 || txg == 0)) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		break;
	}

	kmem_free(label, size);
	return (*config == NULL ? ENOENT : 0);
}

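/*
 * Grow the configs array so that it has a slot for vdev index "id",
 * preserving any existing entries.  A no-op if the array is already large
 * enough.
 */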
static void
resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id)
{
	nvlist_t **new_configs;
	uint64_t i;

	if (id < *count)
		return;
	new_configs = kmem_zalloc((id + 1) * sizeof(nvlist_t *),
	    KM_SLEEP);
	for (i = 0; i < *count; i++)
		new_configs[i] = (*configs)[i];
	if (*configs != NULL)
		kmem_free(*configs, *count * sizeof(void *));
	*configs = new_configs;
	*count = id + 1;
}

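/*
 * If the given label config belongs to the named pool, file it in the
 * configs array under its top-level vdev id, keeping only the config with
 * the highest txg for each slot.  Configs that do not qualify are freed.
 */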
static void
process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg,
    const char *name, uint64_t *known_pool_guid)
{
	nvlist_t *vdev_tree;
	uint64_t pool_guid;
	uint64_t vdev_guid, known_guid;
	uint64_t id, txg, known_txg;
	char *pname;
	int i;

	if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 ||
	    strcmp(pname, name) != 0)
		goto ignore;

	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
		goto ignore;

	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0)
		goto ignore;

	if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0)
		goto ignore;

	if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0)
		goto ignore;

	VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);

	if (*known_pool_guid != 0) {
		if (pool_guid != *known_pool_guid)
			goto ignore;
	} else
		*known_pool_guid = pool_guid;

	resize_configs(configs, count, id);

	if ((*configs)[id] != NULL) {
		VERIFY(nvlist_lookup_uint64((*configs)[id],
		    ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0);
		if (txg <= known_txg)
			goto ignore;
		nvlist_free((*configs)[id]);
	}

	(*configs)[id] = cfg;
	return;

ignore:
	nvlist_free(cfg);
}

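/*
 * Temporarily attach the tasting consumer to a provider with read access,
 * rejecting providers that are withering, smaller than SPA_MINDEVSIZE, or
 * have an unsupported sector size.
 */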
static int
vdev_geom_attach_taster(struct g_consumer *cp, struct g_provider *pp)
{
	int error;

	if (pp->flags & G_PF_WITHER)
		return (EINVAL);
	g_attach(cp, pp);
	error = g_access(cp, 1, 0, 0);
	if (error == 0) {
		if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize))
			error = EINVAL;
		else if (pp->mediasize < SPA_MINDEVSIZE)
			error = EINVAL;
		if (error != 0)
			g_access(cp, -1, 0, 0);
	}
	if (error != 0)
		g_detach(cp);
	return (error);
}

static void
vdev_geom_detach_taster(struct g_consumer *cp)
{
	g_access(cp, -1, 0, 0);
	g_detach(cp);
}

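/*
 * Taste every eligible GEOM provider in the system, collecting the best
 * label config for each top-level vdev of the named pool.  Returns ENOENT
 * if no matching labels were found.
 */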
int
vdev_geom_read_pool_label(const char *name,
    nvlist_t ***configs, uint64_t *count)
{
	struct g_class *mp;
	struct g_geom *gp, *zgp;
	struct g_provider *pp;
	struct g_consumer *zcp;
	nvlist_t *vdev_cfg;
	uint64_t pool_guid;
	int error;

	DROP_GIANT();
	g_topology_lock();

	zgp = g_new_geomf(&zfs_vdev_class, "zfs::vdev::taste");
	/* This orphan function should never be called. */
	zgp->orphan = vdev_geom_taste_orphan;
	zcp = g_new_consumer(zgp);

	*configs = NULL;
	*count = 0;
	pool_guid = 0;
	LIST_FOREACH(mp, &g_classes, class) {
		if (mp == &zfs_vdev_class)
			continue;
		LIST_FOREACH(gp, &mp->geom, geom) {
			if (gp->flags & G_GEOM_WITHER)
				continue;
			LIST_FOREACH(pp, &gp->provider, provider) {
				if (pp->flags & G_PF_WITHER)
					continue;
				if (vdev_geom_attach_taster(zcp, pp) != 0)
					continue;
				g_topology_unlock();
				error = vdev_geom_read_config(zcp, &vdev_cfg);
				g_topology_lock();
				vdev_geom_detach_taster(zcp);
				if (error)
					continue;
				ZFS_LOG(1, "successfully read vdev config");

				process_vdev_config(configs, count,
				    vdev_cfg, name, &pool_guid);
			}
		}
	}

	g_destroy_consumer(zcp);
	g_destroy_geom(zgp);
	g_topology_unlock();
	PICKUP_GIANT();

	return (*count > 0 ? 0 : ENOENT);
}

static void
vdev_geom_read_guids(struct g_consumer *cp, uint64_t *pguid, uint64_t *vguid)
{
	nvlist_t *config;

	g_topology_assert_not();

	*pguid = 0;
	*vguid = 0;
	if (vdev_geom_read_config(cp, &config) == 0) {
		nvlist_get_guids(config, pguid, vguid);
		nvlist_free(config);
	}
}

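/*
 * Walk every GEOM provider in the system, tasting each one until a label
 * is found whose guids match this vdev, then attach to that provider.
 * Returns the attached consumer, or NULL if no provider matched.
 */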
static struct g_consumer *
vdev_geom_attach_by_guids(vdev_t *vd)
{
	struct g_class *mp;
	struct g_geom *gp, *zgp;
	struct g_provider *pp;
	struct g_consumer *cp, *zcp;
	uint64_t pguid, vguid;

	g_topology_assert();

	zgp = g_new_geomf(&zfs_vdev_class, "zfs::vdev::taste");
	/* This orphan function should never be called. */
	zgp->orphan = vdev_geom_taste_orphan;
	zcp = g_new_consumer(zgp);

	cp = NULL;
	LIST_FOREACH(mp, &g_classes, class) {
		if (mp == &zfs_vdev_class)
			continue;
		LIST_FOREACH(gp, &mp->geom, geom) {
			if (gp->flags & G_GEOM_WITHER)
				continue;
			LIST_FOREACH(pp, &gp->provider, provider) {
				if (vdev_geom_attach_taster(zcp, pp) != 0)
					continue;
				g_topology_unlock();
				vdev_geom_read_guids(zcp, &pguid, &vguid);
				g_topology_lock();
				vdev_geom_detach_taster(zcp);
				/*
				 * Check that the label's vdev guid matches the
				 * desired guid.  If the label has a pool guid,
				 * check that it matches too.  (Inactive spares
				 * and L2ARCs do not have any pool guid in the
				 * label.)
				 */
				if ((pguid != 0 &&
				     pguid != spa_guid(vd->vdev_spa)) ||
				    vguid != vd->vdev_guid)
					continue;
				cp = vdev_geom_attach(pp, vd);
				if (cp == NULL) {
					printf("ZFS WARNING: Unable to "
					    "attach to %s.\n", pp->name);
					continue;
				}
				break;
			}
			if (cp != NULL)
				break;
		}
		if (cp != NULL)
			break;
	}
	g_destroy_consumer(zcp);
	g_destroy_geom(zgp);
	return (cp);
}

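/*
 * Search all providers for one whose label matches this vdev's guids and
 * attach to it, updating vd->vdev_path to name the provider actually
 * found.
 */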
static struct g_consumer *
vdev_geom_open_by_guids(vdev_t *vd)
{
	struct g_consumer *cp;
	char *buf;
	size_t len;

	g_topology_assert();

	ZFS_LOG(1, "Searching by guids [%ju:%ju].",
		(uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid);
	cp = vdev_geom_attach_by_guids(vd);
	if (cp != NULL) {
		len = strlen(cp->provider->name) + strlen("/dev/") + 1;
		buf = kmem_alloc(len, KM_SLEEP);

		snprintf(buf, len, "/dev/%s", cp->provider->name);
		spa_strfree(vd->vdev_path);
		vd->vdev_path = buf;

		ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.",
		    (uintmax_t)spa_guid(vd->vdev_spa),
		    (uintmax_t)vd->vdev_guid, vd->vdev_path);
	} else {
		ZFS_LOG(1, "Search by guid [%ju:%ju] failed.",
		    (uintmax_t)spa_guid(vd->vdev_spa),
		    (uintmax_t)vd->vdev_guid);
	}

	return (cp);
}

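/*
 * Look up the provider named by vd->vdev_path and attach to it.  If
 * check_guid is set, verify that the provider's label matches the vdev's
 * guids and close the consumer again on a mismatch.
 */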
static struct g_consumer *
vdev_geom_open_by_path(vdev_t *vd, int check_guid)
{
	struct g_provider *pp;
	struct g_consumer *cp;
	uint64_t pguid, vguid;

	g_topology_assert();

	cp = NULL;
	pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1);
	if (pp != NULL) {
		ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
		cp = vdev_geom_attach(pp, vd);
		if (cp != NULL && check_guid && ISP2(pp->sectorsize) &&
		    pp->sectorsize <= VDEV_PAD_SIZE) {
			g_topology_unlock();
			vdev_geom_read_guids(cp, &pguid, &vguid);
			g_topology_lock();
			/*
			 * Check that the label's vdev guid matches the
			 * desired guid.  If the label has a pool guid,
			 * check that it matches too.  (Inactive spares
			 * and L2ARCs do not have any pool guid in the
			 * label.)
			 */
			if ((pguid != 0 &&
			    pguid != spa_guid(vd->vdev_spa)) ||
			    vguid != vd->vdev_guid) {
				vdev_geom_close_locked(vd);
				cp = NULL;
				ZFS_LOG(1, "guid mismatch for provider %s: "
				    "%ju:%ju != %ju:%ju.", vd->vdev_path,
				    (uintmax_t)spa_guid(vd->vdev_spa),
				    (uintmax_t)vd->vdev_guid,
				    (uintmax_t)pguid, (uintmax_t)vguid);
			} else {
				ZFS_LOG(1, "guid match for provider %s.",
				    vd->vdev_path);
			}
		}
	}

	return (cp);
}

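/*
 * Open the vdev: locate the backing provider by path or by guids, acquire
 * write access if the pool is opened writable, and report the device's
 * size and alignment constraints back to the vdev layer.
 */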
static int
vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	struct g_provider *pp;
	struct g_consumer *cp;
	size_t bufsize;
	int error;

	/*
	 * We must have a pathname, and it must be absolute.
	 */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (EINVAL);
	}

	vd->vdev_tsd = NULL;

	DROP_GIANT();
	g_topology_lock();
	error = 0;

	if (vd->vdev_spa->spa_splitting_newspa ||
	    (vd->vdev_prevstate == VDEV_STATE_UNKNOWN &&
	     vd->vdev_spa->spa_load_state == SPA_LOAD_NONE)) {
		/*
		 * We are dealing with a vdev that hasn't been previously
		 * opened (since boot), and we are not loading an
		 * existing pool configuration.  This looks like a
		 * vdev add operation to a new or existing pool.
		 * Assume the user knows what he/she is doing and find
		 * GEOM provider by its name, ignoring GUID mismatches.
		 *
		 * XXPOLICY: It would be safer to only allow a device
		 *           that is unlabeled or labeled but missing
		 *           GUID information to be opened in this fashion,
		 *           unless we are doing a split, in which case we
		 *           should allow any guid.
		 */
		cp = vdev_geom_open_by_path(vd, 0);
	} else {
		/*
		 * Try using the recorded path for this device, but only
		 * accept it if its label data contains the expected GUIDs.
		 */
		cp = vdev_geom_open_by_path(vd, 1);
		if (cp == NULL) {
			/*
			 * The device at vd->vdev_path doesn't have the
			 * expected GUIDs.  The disks might have merely
			 * moved around so try all other GEOM providers
			 * to find one with the right GUIDs.
			 */
			cp = vdev_geom_open_by_guids(vd);
		}
	}

	if (cp == NULL) {
		ZFS_LOG(1, "Provider %s not found.", vd->vdev_path);
		error = ENOENT;
	} else if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
	    !ISP2(cp->provider->sectorsize)) {
		ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
		    vd->vdev_path);

		vdev_geom_close_locked(vd);
		error = EINVAL;
		cp = NULL;
	} else if (cp->acw == 0 && (spa_mode(vd->vdev_spa) & FWRITE) != 0) {
		int i;

		for (i = 0; i < 5; i++) {
			error = g_access(cp, 0, 1, 0);
			if (error == 0)
				break;
			g_topology_unlock();
			tsleep(vd, 0, "vdev", hz / 2);
			g_topology_lock();
		}
		if (error != 0) {
			printf("ZFS WARNING: Unable to open %s for writing (error=%d).\n",
			    vd->vdev_path, error);
			vdev_geom_close_locked(vd);
			cp = NULL;
		}
	}

	g_topology_unlock();
	PICKUP_GIANT();
	if (cp == NULL) {
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
		return (error);
	}
	pp = cp->provider;

	/*
	 * Determine the actual size of the device.
	 */
	*max_psize = *psize = pp->mediasize;

	/*
	 * Determine the device's minimum transfer size and preferred
	 * transfer size.
	 */
	*logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
	*physical_ashift = 0;
	if (pp->stripesize > (1 << *logical_ashift) && ISP2(pp->stripesize) &&
	    pp->stripesize <= (1 << SPA_MAXASHIFT) && pp->stripeoffset == 0)
		*physical_ashift = highbit(pp->stripesize) - 1;

	/*
	 * Clear the nowritecache settings, so that on a vdev_reopen()
	 * we will try again.
	 */
	vd->vdev_nowritecache = B_FALSE;

	/*
	 * Determine the device's rotation rate.
	 */
	vdev_geom_set_rotation_rate(vd, cp);

	return (0);
}

static void
vdev_geom_close(vdev_t *vd)
{

	DROP_GIANT();
	g_topology_lock();
	vdev_geom_close_locked(vd);
	g_topology_unlock();
	PICKUP_GIANT();
}

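/*
 * Completion handler for bios issued by vdev_geom_io_start().  Copies the
 * bio's error into the zio, latches ENOTSUP from BIO_FLUSH/BIO_DELETE in
 * persistent vdev flags, and on ENXIO either requests asynchronous
 * removal (when the provider's error is set) or arranges a delayed close.
 */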
static void
vdev_geom_io_intr(struct bio *bp)
{
	vdev_t *vd;
	zio_t *zio;

	zio = bp->bio_caller1;
	vd = zio->io_vd;
	zio->io_error = bp->bio_error;
	if (zio->io_error == 0 && bp->bio_resid != 0)
		zio->io_error = SET_ERROR(EIO);

	switch(zio->io_error) {
	case ENOTSUP:
		/*
		 * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know
		 * that future attempts will never succeed.  In this case
		 * we set a persistent flag so that we don't bother with
		 * requests in the future.
		 */
		switch(bp->bio_cmd) {
		case BIO_FLUSH:
			vd->vdev_nowritecache = B_TRUE;
			break;
		case BIO_DELETE:
			vd->vdev_notrim = B_TRUE;
			break;
		}
		break;
	case ENXIO:
		if (!vd->vdev_remove_wanted) {
			/*
			 * If provider's error is set we assume it is being
			 * removed.
			 */
			if (bp->bio_to->error != 0) {
				vd->vdev_remove_wanted = B_TRUE;
				spa_async_request(zio->io_spa,
				    SPA_ASYNC_REMOVE);
			} else if (!vd->vdev_delayed_close) {
				vd->vdev_delayed_close = B_TRUE;
			}
		}
		break;
	}
	g_destroy_bio(bp);
	zio_delay_interrupt(zio);
}

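/*
 * Issue an I/O request: reads and writes map to BIO_READ and BIO_WRITE,
 * frees become BIO_DELETE, and the DKIOCFLUSHWRITECACHE ioctl becomes an
 * ordered BIO_FLUSH.  Unsupported operations fail with ENOTSUP, while
 * administratively disabled flush/trim requests complete immediately
 * without reaching the device.
 */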
static void
vdev_geom_io_start(zio_t *zio)
{
	vdev_t *vd;
	struct g_consumer *cp;
	struct bio *bp;
	int error;

	vd = zio->io_vd;

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:
		/* XXPOLICY */
		if (!vdev_readable(vd)) {
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		} else {
			switch (zio->io_cmd) {
			case DKIOCFLUSHWRITECACHE:
				if (zfs_nocacheflush || vdev_geom_bio_flush_disable)
					break;
				if (vd->vdev_nowritecache) {
					zio->io_error = SET_ERROR(ENOTSUP);
					break;
				}
				goto sendreq;
			default:
				zio->io_error = SET_ERROR(ENOTSUP);
			}
		}

		zio_execute(zio);
		return;
	case ZIO_TYPE_FREE:
		if (vd->vdev_notrim) {
			zio->io_error = SET_ERROR(ENOTSUP);
		} else if (!vdev_geom_bio_delete_disable) {
			goto sendreq;
		}
		zio_execute(zio);
		return;
	}
sendreq:
	ASSERT(zio->io_type == ZIO_TYPE_READ ||
	    zio->io_type == ZIO_TYPE_WRITE ||
	    zio->io_type == ZIO_TYPE_FREE ||
	    zio->io_type == ZIO_TYPE_IOCTL);

	cp = vd->vdev_tsd;
	if (cp == NULL) {
		zio->io_error = SET_ERROR(ENXIO);
		zio_interrupt(zio);
		return;
	}
	bp = g_alloc_bio();
	bp->bio_caller1 = zio;
	switch (zio->io_type) {
	case ZIO_TYPE_READ:
	case ZIO_TYPE_WRITE:
		zio->io_target_timestamp = zio_handle_io_delay(zio);
		bp->bio_cmd = zio->io_type == ZIO_TYPE_READ ? BIO_READ : BIO_WRITE;
		bp->bio_data = zio->io_data;
		bp->bio_offset = zio->io_offset;
		bp->bio_length = zio->io_size;
		break;
	case ZIO_TYPE_FREE:
		bp->bio_cmd = BIO_DELETE;
		bp->bio_data = NULL;
		bp->bio_offset = zio->io_offset;
		bp->bio_length = zio->io_size;
		break;
	case ZIO_TYPE_IOCTL:
		bp->bio_cmd = BIO_FLUSH;
		bp->bio_flags |= BIO_ORDERED;
		bp->bio_data = NULL;
		bp->bio_offset = cp->provider->mediasize;
		bp->bio_length = 0;
		break;
	}
	bp->bio_done = vdev_geom_io_intr;

	g_io_request(bp, cp);
}

static void
vdev_geom_io_done(zio_t *zio)
{
}

static void
vdev_geom_hold(vdev_t *vd)
{
}

static void
vdev_geom_rele(vdev_t *vd)
{
}

vdev_ops_t vdev_geom_ops = {
	vdev_geom_open,
	vdev_geom_close,
	vdev_default_asize,
	vdev_geom_io_start,
	vdev_geom_io_done,
	NULL,
	vdev_geom_hold,
	vdev_geom_rele,
	VDEV_TYPE_DISK,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};