vdev_geom.c revision 299376
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Portions Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>
 */

#include <sys/zfs_context.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/disk.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <geom/geom.h>
#include <geom/geom_int.h>

/*
 * Virtual device vector for GEOM.
 */

static g_attrchanged_t vdev_geom_attrchanged;
struct g_class zfs_vdev_class = {
	.name = "ZFS::VDEV",
	.version = G_VERSION,
	.attrchanged = vdev_geom_attrchanged,
};

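/*
 * Register the class with GEOM; this declares the kernel module that
 * hooks zfs_vdev_class into the GEOM framework at load time.
 */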
DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);

SYSCTL_DECL(_vfs_zfs_vdev);
/* Don't send BIO_FLUSH. */
static int vdev_geom_bio_flush_disable = 0;
TUNABLE_INT("vfs.zfs.vdev.bio_flush_disable", &vdev_geom_bio_flush_disable);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RW,
    &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");
/* Don't send BIO_DELETE. */
static int vdev_geom_bio_delete_disable = 0;
TUNABLE_INT("vfs.zfs.vdev.bio_delete_disable", &vdev_geom_bio_delete_disable);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RW,
    &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");
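/*
 * Both knobs can be set as loader tunables (e.g. in loader.conf:
 * vfs.zfs.vdev.bio_flush_disable="1") or changed at runtime with
 * sysctl(8), e.g. "sysctl vfs.zfs.vdev.bio_delete_disable=1".
 */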

static void
vdev_geom_set_rotation_rate(vdev_t *vd, struct g_consumer *cp)
{
	int error;
	uint16_t rate;

	error = g_getattr("GEOM::rotation_rate", cp, &rate);
	if (error == 0)
		vd->vdev_rotation_rate = rate;
	else
		vd->vdev_rotation_rate = VDEV_RATE_UNKNOWN;
}

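/*
 * Attribute-change callback for the consumers created below, registered
 * through zfs_vdev_class and the per-geom attrchanged hook.  It refreshes
 * the vdev's cached rotation rate and physical path when GEOM reports
 * that the corresponding attribute changed.
 */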
static void
vdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
{
	vdev_t *vd;
	spa_t *spa;
	char *physpath;
	int error, physpath_len;

	vd = cp->private;
	if (vd == NULL)
		return;

	if (strcmp(attr, "GEOM::rotation_rate") == 0) {
		vdev_geom_set_rotation_rate(vd, cp);
		return;
	}

	if (strcmp(attr, "GEOM::physpath") != 0)
		return;

	if (g_access(cp, 1, 0, 0) != 0)
		return;

	/*
	 * Record/update physical path information for this device.
	 */
	spa = vd->vdev_spa;
	physpath_len = MAXPATHLEN;
	physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO);
	error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath);
	g_access(cp, -1, 0, 0);
	if (error == 0) {
		char *old_physpath;

		/* The g_topology lock ensures the vdev has not been closed. */
		g_topology_assert();
		old_physpath = vd->vdev_physpath;
		vd->vdev_physpath = spa_strdup(physpath);
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);

		if (old_physpath != NULL)
			spa_strfree(old_physpath);
	}
	g_free(physpath);
}

static void
vdev_geom_orphan(struct g_consumer *cp)
{
	vdev_t *vd;

	g_topology_assert();

	vd = cp->private;
	if (vd == NULL) {
		/* Vdev close in progress.  Ignore the event. */
		return;
	}

	/*
	 * Orphan callbacks occur from the GEOM event thread.
	 * Concurrent with this call, new I/O requests may be
	 * working their way through GEOM about to find out
	 * (only once executed by the g_down thread) that we've
	 * been orphaned from our disk provider.  These I/Os
	 * must be retired before we can detach our consumer.
	 * This is most easily achieved by acquiring the
	 * SPA ZIO configuration lock as a writer, but doing
	 * so with the GEOM topology lock held would cause
	 * a lock order reversal.  Instead, rely on the SPA's
	 * async removal support to invoke a close on this
	 * vdev once it is safe to do so.
	 */
	vd->vdev_remove_wanted = B_TRUE;
	spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
}

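/*
 * Attach a consumer to the given provider on behalf of a vdev, creating
 * the shared "zfs::vdev" geom on first use.  Each successful call takes
 * one read and one exclusive reference on the consumer (g_access(cp, 1,
 * 0, 1)); vdev_geom_close_locked() drops them again.
 */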
static struct g_consumer *
vdev_geom_attach(struct g_provider *pp, vdev_t *vd)
{
	struct g_geom *gp;
	struct g_consumer *cp;

	g_topology_assert();

	ZFS_LOG(1, "Attaching to %s.", pp->name);
	/* Do we already have the geom?  If not, create one. */
	LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
		if (gp->flags & G_GEOM_WITHER)
			continue;
		if (strcmp(gp->name, "zfs::vdev") != 0)
			continue;
		break;
	}
	if (gp == NULL) {
		gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
		gp->orphan = vdev_geom_orphan;
		gp->attrchanged = vdev_geom_attrchanged;
		cp = g_new_consumer(gp);
		if (g_attach(cp, pp) != 0) {
			g_wither_geom(gp, ENXIO);
			return (NULL);
		}
		if (g_access(cp, 1, 0, 1) != 0) {
			g_wither_geom(gp, ENXIO);
			return (NULL);
		}
		ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
	} else {
		/* Check if we are already connected to this provider. */
		LIST_FOREACH(cp, &gp->consumer, consumer) {
			if (cp->provider == pp) {
				ZFS_LOG(1, "Found consumer for %s.", pp->name);
				break;
			}
		}
		if (cp == NULL) {
			cp = g_new_consumer(gp);
			if (g_attach(cp, pp) != 0) {
				g_destroy_consumer(cp);
				return (NULL);
			}
			if (g_access(cp, 1, 0, 1) != 0) {
				g_detach(cp);
				g_destroy_consumer(cp);
				return (NULL);
			}
			ZFS_LOG(1, "Created consumer for %s.", pp->name);
		} else {
			if (g_access(cp, 1, 0, 1) != 0)
				return (NULL);
			ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
		}
	}

	/*
	 * BUG: cp may already belong to a vdev.  This could happen if:
	 * 1) That vdev is a shared spare, or
	 * 2) We are trying to reopen a missing vdev and we are scanning by
	 *    guid.  In that case, we'll ultimately fail to open this consumer,
	 *    but not until after setting the private field.
	 * The solution is to:
	 * 1) Don't set the private field until after the open succeeds, and
	 * 2) Set it to a linked list of vdevs, not just a single vdev
	 */
	cp->private = vd;
	vd->vdev_tsd = cp;

	/* Fetch initial physical path information for this device. */
	vdev_geom_attrchanged(cp, "GEOM::physpath");

	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
	return (cp);
}

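/*
 * Release the references taken by vdev_geom_attach() and tear down the
 * consumer (and, if it was the last one, the geom) once all access
 * counts reach zero.  The caller must hold the GEOM topology lock.
 */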
static void
vdev_geom_close_locked(vdev_t *vd)
{
	struct g_geom *gp;
	struct g_consumer *cp;

	g_topology_assert();

	cp = vd->vdev_tsd;
	if (cp == NULL)
		return;

	ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
	KASSERT(vd->vdev_tsd == cp, ("%s: vdev_tsd is not cp", __func__));
	vd->vdev_tsd = NULL;
	vd->vdev_delayed_close = B_FALSE;
	cp->private = NULL;

	gp = cp->geom;
	g_access(cp, -1, 0, -1);
	/* Destroy consumer on last close. */
	if (cp->acr == 0 && cp->ace == 0) {
		if (cp->acw > 0)
			g_access(cp, 0, -cp->acw, 0);
		if (cp->provider != NULL) {
			ZFS_LOG(1, "Destroyed consumer to %s.",
			    cp->provider->name);
			g_detach(cp);
		}
		g_destroy_consumer(cp);
	}
	/* Destroy geom if there are no consumers left. */
	if (LIST_EMPTY(&gp->consumer)) {
		ZFS_LOG(1, "Destroyed geom %s.", gp->name);
		g_wither_geom(gp, ENXIO);
	}
}

static void
nvlist_get_guids(nvlist_t *list, uint64_t *pguid, uint64_t *vguid)
{

	(void) nvlist_lookup_uint64(list, ZPOOL_CONFIG_GUID, vguid);
	(void) nvlist_lookup_uint64(list, ZPOOL_CONFIG_POOL_GUID, pguid);
}

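/*
 * Synchronous I/O helper used for label reads while tasting.  The
 * request is split into MAXPHYS-sized chunks (rounded down to a multiple
 * of the provider's sector size); each chunk is issued with
 * g_io_request() and waited on with biowait().
 */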
static int
vdev_geom_io(struct g_consumer *cp, int cmd, void *data, off_t offset,
    off_t size)
{
	struct bio *bp;
	u_char *p;
	off_t off, maxio;
	int error;

	ASSERT((offset % cp->provider->sectorsize) == 0);
	ASSERT((size % cp->provider->sectorsize) == 0);

	bp = g_alloc_bio();
	off = offset;
	offset += size;
	p = data;
	maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize);
	error = 0;

	for (; off < offset; off += maxio, p += maxio, size -= maxio) {
		bzero(bp, sizeof(*bp));
		bp->bio_cmd = cmd;
		bp->bio_done = NULL;
		bp->bio_offset = off;
		bp->bio_length = MIN(size, maxio);
		bp->bio_data = p;
		g_io_request(bp, cp);
		error = biowait(bp, "vdev_geom_io");
		if (error != 0)
			break;
	}

	g_destroy_bio(bp);
	return (error);
}

static void
vdev_geom_taste_orphan(struct g_consumer *cp)
{

	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
	    cp->provider->name));
}

static int
vdev_geom_read_config(struct g_consumer *cp, nvlist_t **config)
{
	struct g_provider *pp;
	vdev_label_t *label;
	char *buf;
	size_t buflen;
	uint64_t psize;
	off_t offset, size;
	uint64_t state, txg;
	int l;

	g_topology_assert_not();

	pp = cp->provider;
	ZFS_LOG(1, "Reading config from %s...", pp->name);

	psize = pp->mediasize;
	psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t));

	size = sizeof(*label) + pp->sectorsize -
	    ((sizeof(*label) - 1) % pp->sectorsize) - 1;

	label = kmem_alloc(size, KM_SLEEP);
	buflen = sizeof(label->vl_vdev_phys.vp_nvlist);

	*config = NULL;
	for (l = 0; l < VDEV_LABELS; l++) {
		offset = vdev_label_offset(psize, l, 0);
		if ((offset % pp->sectorsize) != 0)
			continue;

		if (vdev_geom_io(cp, BIO_READ, label, offset, size) != 0)
			continue;
		buf = label->vl_vdev_phys.vp_nvlist;

		if (nvlist_unpack(buf, buflen, config, 0) != 0)
			continue;

		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0 || state > POOL_STATE_L2CACHE) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
		    (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
		    &txg) != 0 || txg == 0)) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		break;
	}

	kmem_free(label, size);
	return (*config == NULL ? ENOENT : 0);
}

static void
resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id)
{
	nvlist_t **new_configs;
	uint64_t i;

	if (id < *count)
		return;
	new_configs = kmem_zalloc((id + 1) * sizeof(nvlist_t *),
	    KM_SLEEP);
	for (i = 0; i < *count; i++)
		new_configs[i] = (*configs)[i];
	if (*configs != NULL)
		kmem_free(*configs, *count * sizeof(void *));
	*configs = new_configs;
	*count = id + 1;
}

static void
process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg,
    const char *name, uint64_t *known_pool_guid)
{
	nvlist_t *vdev_tree;
	uint64_t pool_guid;
	uint64_t vdev_guid;
	uint64_t id, txg, known_txg;
	char *pname;

	if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 ||
	    strcmp(pname, name) != 0)
		goto ignore;

	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
		goto ignore;

	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0)
		goto ignore;

	if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0)
		goto ignore;

	if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0)
		goto ignore;

	VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);

	if (*known_pool_guid != 0) {
		if (pool_guid != *known_pool_guid)
			goto ignore;
	} else
		*known_pool_guid = pool_guid;

	resize_configs(configs, count, id);

	if ((*configs)[id] != NULL) {
		VERIFY(nvlist_lookup_uint64((*configs)[id],
		    ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0);
		if (txg <= known_txg)
			goto ignore;
		nvlist_free((*configs)[id]);
	}

	(*configs)[id] = cfg;
	return;

ignore:
	nvlist_free(cfg);
}

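/*
 * "Tasting" helpers: temporarily attach a read-only consumer to a
 * provider so its ZFS label can be examined.  Providers that are
 * withering, have a non-power-of-2 or oversized sector size, or are
 * smaller than SPA_MINDEVSIZE are rejected up front.
 */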
static int
vdev_geom_attach_taster(struct g_consumer *cp, struct g_provider *pp)
{
	int error;

	if (pp->flags & G_PF_WITHER)
		return (EINVAL);
	g_attach(cp, pp);
	error = g_access(cp, 1, 0, 0);
	if (error == 0) {
		if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize))
			error = EINVAL;
		else if (pp->mediasize < SPA_MINDEVSIZE)
			error = EINVAL;
		if (error != 0)
			g_access(cp, -1, 0, 0);
	}
	if (error != 0)
		g_detach(cp);
	return (error);
}

static void
vdev_geom_detach_taster(struct g_consumer *cp)
{
	g_access(cp, -1, 0, 0);
	g_detach(cp);
}

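/*
 * Walk every provider of every non-ZFS GEOM class, taste each one, and
 * collect the newest (highest-txg) label config for each top-level vdev
 * of the named pool.  On success *configs is an array indexed by
 * top-level vdev id and *count is its length.
 */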
int
vdev_geom_read_pool_label(const char *name,
    nvlist_t ***configs, uint64_t *count)
{
	struct g_class *mp;
	struct g_geom *gp, *zgp;
	struct g_provider *pp;
	struct g_consumer *zcp;
	nvlist_t *vdev_cfg;
	uint64_t pool_guid;
	int error;

	DROP_GIANT();
	g_topology_lock();

	zgp = g_new_geomf(&zfs_vdev_class, "zfs::vdev::taste");
	/* This orphan function should never be called. */
	zgp->orphan = vdev_geom_taste_orphan;
	zcp = g_new_consumer(zgp);

	*configs = NULL;
	*count = 0;
	pool_guid = 0;
	LIST_FOREACH(mp, &g_classes, class) {
		if (mp == &zfs_vdev_class)
			continue;
		LIST_FOREACH(gp, &mp->geom, geom) {
			if (gp->flags & G_GEOM_WITHER)
				continue;
			LIST_FOREACH(pp, &gp->provider, provider) {
				if (pp->flags & G_PF_WITHER)
					continue;
				if (vdev_geom_attach_taster(zcp, pp) != 0)
					continue;
				g_topology_unlock();
				error = vdev_geom_read_config(zcp, &vdev_cfg);
				g_topology_lock();
				vdev_geom_detach_taster(zcp);
				if (error)
					continue;
				ZFS_LOG(1, "successfully read vdev config");

				process_vdev_config(configs, count,
				    vdev_cfg, name, &pool_guid);
			}
		}
	}

	g_destroy_consumer(zcp);
	g_destroy_geom(zgp);
	g_topology_unlock();
	PICKUP_GIANT();

	return (*count > 0 ? 0 : ENOENT);
}

static void
vdev_geom_read_guids(struct g_consumer *cp, uint64_t *pguid, uint64_t *vguid)
{
	nvlist_t *config;

	g_topology_assert_not();

	*pguid = 0;
	*vguid = 0;
	if (vdev_geom_read_config(cp, &config) == 0) {
		nvlist_get_guids(config, pguid, vguid);
		nvlist_free(config);
	}
}

static struct g_consumer *
vdev_geom_attach_by_guids(vdev_t *vd)
{
	struct g_class *mp;
	struct g_geom *gp, *zgp;
	struct g_provider *pp;
	struct g_consumer *cp, *zcp;
	uint64_t pguid, vguid;

	g_topology_assert();

	zgp = g_new_geomf(&zfs_vdev_class, "zfs::vdev::taste");
	/* This orphan function should never be called. */
	zgp->orphan = vdev_geom_taste_orphan;
	zcp = g_new_consumer(zgp);

	cp = NULL;
	LIST_FOREACH(mp, &g_classes, class) {
		if (mp == &zfs_vdev_class)
			continue;
		LIST_FOREACH(gp, &mp->geom, geom) {
			if (gp->flags & G_GEOM_WITHER)
				continue;
			LIST_FOREACH(pp, &gp->provider, provider) {
				if (vdev_geom_attach_taster(zcp, pp) != 0)
					continue;
				g_topology_unlock();
				vdev_geom_read_guids(zcp, &pguid, &vguid);
				g_topology_lock();
				vdev_geom_detach_taster(zcp);
				/*
				 * Check that the label's vdev guid matches the
				 * desired guid.  If the label has a pool guid,
				 * check that it matches too.  (Inactive spares
				 * and L2ARCs do not have any pool guid in the
				 * label.)
				 */
				if ((pguid != 0 &&
				     pguid != spa_guid(vd->vdev_spa)) ||
				    vguid != vd->vdev_guid)
					continue;
				cp = vdev_geom_attach(pp, vd);
				if (cp == NULL) {
					printf("ZFS WARNING: Unable to "
					    "attach to %s.\n", pp->name);
					continue;
				}
				break;
			}
			if (cp != NULL)
				break;
		}
		if (cp != NULL)
			break;
	}
	g_destroy_consumer(zcp);
	g_destroy_geom(zgp);
	return (cp);
}

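/*
 * Scan all providers for the vdev's guids (see above) and, on a match,
 * rewrite vd->vdev_path to point at the provider that was actually
 * found, since the device may have moved since the pool last saw it.
 */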
static struct g_consumer *
vdev_geom_open_by_guids(vdev_t *vd)
{
	struct g_consumer *cp;
	char *buf;
	size_t len;

	g_topology_assert();

	ZFS_LOG(1, "Searching by guid [%ju].", (uintmax_t)vd->vdev_guid);
	cp = vdev_geom_attach_by_guids(vd);
	if (cp != NULL) {
		len = strlen(cp->provider->name) + strlen("/dev/") + 1;
		buf = kmem_alloc(len, KM_SLEEP);

		snprintf(buf, len, "/dev/%s", cp->provider->name);
		spa_strfree(vd->vdev_path);
		vd->vdev_path = buf;

		ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.",
		    (uintmax_t)spa_guid(vd->vdev_spa),
		    (uintmax_t)vd->vdev_guid, vd->vdev_path);
	} else {
		ZFS_LOG(1, "Search by guid [%ju:%ju] failed.",
		    (uintmax_t)spa_guid(vd->vdev_spa),
		    (uintmax_t)vd->vdev_guid);
	}

	return (cp);
}

static struct g_consumer *
vdev_geom_open_by_path(vdev_t *vd, int check_guid)
{
	struct g_provider *pp;
	struct g_consumer *cp;
	uint64_t pguid, vguid;

	g_topology_assert();

	cp = NULL;
	pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1);
	if (pp != NULL) {
		ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
		cp = vdev_geom_attach(pp, vd);
		if (cp != NULL && check_guid && ISP2(pp->sectorsize) &&
		    pp->sectorsize <= VDEV_PAD_SIZE) {
			g_topology_unlock();
			vdev_geom_read_guids(cp, &pguid, &vguid);
			g_topology_lock();
			/*
			 * Check that the label's vdev guid matches the
			 * desired guid.  If the label has a pool guid,
			 * check that it matches too.  (Inactive spares
			 * and L2ARCs do not have any pool guid in the
			 * label.)
			 */
			if ((pguid != 0 &&
			    pguid != spa_guid(vd->vdev_spa)) ||
			    vguid != vd->vdev_guid) {
				vdev_geom_close_locked(vd);
				cp = NULL;
				ZFS_LOG(1, "guid mismatch for provider %s: "
				    "%ju:%ju != %ju:%ju.", vd->vdev_path,
				    (uintmax_t)spa_guid(vd->vdev_spa),
				    (uintmax_t)vd->vdev_guid,
				    (uintmax_t)pguid, (uintmax_t)vguid);
			} else {
				ZFS_LOG(1, "guid match for provider %s.",
				    vd->vdev_path);
			}
		}
	}

	return (cp);
}

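/*
 * Entry point for opening a leaf vdev.  On success it reports the
 * device's size and its logical/physical ashift (derived from the
 * provider's sectorsize and stripesize, respectively) back to the
 * common vdev code.
 */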
static int
vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	struct g_provider *pp;
	struct g_consumer *cp;
	int error;

	/*
	 * We must have a pathname, and it must be absolute.
	 */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (EINVAL);
	}

	vd->vdev_tsd = NULL;

	DROP_GIANT();
	g_topology_lock();
	error = 0;

	if (vd->vdev_spa->spa_splitting_newspa ||
	    (vd->vdev_prevstate == VDEV_STATE_UNKNOWN &&
	     vd->vdev_spa->spa_load_state == SPA_LOAD_NONE)) {
		/*
		 * We are dealing with a vdev that hasn't been previously
		 * opened (since boot), and we are not loading an
		 * existing pool configuration.  This looks like a
		 * vdev add operation to a new or existing pool.
		 * Assume the user knows what they are doing and find the
		 * GEOM provider by its name, ignoring GUID mismatches.
		 *
		 * XXPOLICY: It would be safer to only allow a device
		 *           that is unlabeled or labeled but missing
		 *           GUID information to be opened in this fashion,
		 *           unless we are doing a split, in which case we
		 *           should allow any guid.
		 */
		cp = vdev_geom_open_by_path(vd, 0);
	} else {
		/*
		 * Try using the recorded path for this device, but only
		 * accept it if its label data contains the expected GUIDs.
		 */
		cp = vdev_geom_open_by_path(vd, 1);
		if (cp == NULL) {
			/*
			 * The device at vd->vdev_path doesn't have the
			 * expected GUIDs.  The disks might have merely
			 * moved around so try all other GEOM providers
			 * to find one with the right GUIDs.
			 */
			cp = vdev_geom_open_by_guids(vd);
		}
	}

	if (cp == NULL) {
		ZFS_LOG(1, "Provider %s not found.", vd->vdev_path);
		error = ENOENT;
	} else if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
	    !ISP2(cp->provider->sectorsize)) {
		ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
		    vd->vdev_path);

		vdev_geom_close_locked(vd);
		error = EINVAL;
		cp = NULL;
	} else if (cp->acw == 0 && (spa_mode(vd->vdev_spa) & FWRITE) != 0) {
		int i;

		for (i = 0; i < 5; i++) {
			error = g_access(cp, 0, 1, 0);
			if (error == 0)
				break;
			g_topology_unlock();
			tsleep(vd, 0, "vdev", hz / 2);
			g_topology_lock();
		}
		if (error != 0) {
			printf("ZFS WARNING: Unable to open %s for writing "
			    "(error=%d).\n", vd->vdev_path, error);
			vdev_geom_close_locked(vd);
			cp = NULL;
		}
	}

	g_topology_unlock();
	PICKUP_GIANT();
	if (cp == NULL) {
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
		return (error);
	}
	pp = cp->provider;

	/*
	 * Determine the actual size of the device.
	 */
	*max_psize = *psize = pp->mediasize;

	/*
	 * Determine the device's minimum transfer size and preferred
	 * transfer size.
	 */
	*logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
	*physical_ashift = 0;
	if (pp->stripesize > (1 << *logical_ashift) && ISP2(pp->stripesize) &&
	    pp->stripesize <= (1 << SPA_MAXASHIFT) && pp->stripeoffset == 0)
		*physical_ashift = highbit(pp->stripesize) - 1;

	/*
	 * Clear the nowritecache settings, so that on a vdev_reopen()
	 * we will try again.
	 */
	vd->vdev_nowritecache = B_FALSE;

	/*
	 * Determine the device's rotation rate.
	 */
	vdev_geom_set_rotation_rate(vd, cp);

	return (0);
}

static void
vdev_geom_close(vdev_t *vd)
{

	DROP_GIANT();
	g_topology_lock();
	vdev_geom_close_locked(vd);
	g_topology_unlock();
	PICKUP_GIANT();
}

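/*
 * bio completion callback.  Because the consumer is created with
 * G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE, this may be invoked directly
 * rather than from the g_up thread.  It maps the bio error back onto
 * the zio and handles ENOTSUP and ENXIO specially.
 */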
static void
vdev_geom_io_intr(struct bio *bp)
{
	vdev_t *vd;
	zio_t *zio;

	zio = bp->bio_caller1;
	vd = zio->io_vd;
	zio->io_error = bp->bio_error;
	if (zio->io_error == 0 && bp->bio_resid != 0)
		zio->io_error = SET_ERROR(EIO);

	switch (zio->io_error) {
	case ENOTSUP:
		/*
		 * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know
		 * that future attempts will never succeed.  In this case
		 * we set a persistent flag so that we don't bother with
		 * such requests in the future.
		 */
		switch (bp->bio_cmd) {
		case BIO_FLUSH:
			vd->vdev_nowritecache = B_TRUE;
			break;
		case BIO_DELETE:
			vd->vdev_notrim = B_TRUE;
			break;
		}
		break;
	case ENXIO:
		if (!vd->vdev_remove_wanted) {
			/*
			 * If the provider's error is set we assume it is
			 * being removed.
			 */
			if (bp->bio_to->error != 0) {
				vd->vdev_remove_wanted = B_TRUE;
				spa_async_request(zio->io_spa,
				    SPA_ASYNC_REMOVE);
			} else if (!vd->vdev_delayed_close) {
				vd->vdev_delayed_close = B_TRUE;
			}
		}
		break;
	}
	g_destroy_bio(bp);
	zio_delay_interrupt(zio);
}

static void
vdev_geom_io_start(zio_t *zio)
{
	vdev_t *vd;
	struct g_consumer *cp;
	struct bio *bp;

	vd = zio->io_vd;

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:
		/* XXPOLICY */
		if (!vdev_readable(vd)) {
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		} else {
			switch (zio->io_cmd) {
			case DKIOCFLUSHWRITECACHE:
				if (zfs_nocacheflush ||
				    vdev_geom_bio_flush_disable)
					break;
				if (vd->vdev_nowritecache) {
					zio->io_error = SET_ERROR(ENOTSUP);
					break;
				}
				goto sendreq;
			default:
				zio->io_error = SET_ERROR(ENOTSUP);
			}
		}

		zio_execute(zio);
		return;
	case ZIO_TYPE_FREE:
		if (vd->vdev_notrim) {
			zio->io_error = SET_ERROR(ENOTSUP);
		} else if (!vdev_geom_bio_delete_disable) {
			goto sendreq;
		}
		zio_execute(zio);
		return;
	}
sendreq:
	ASSERT(zio->io_type == ZIO_TYPE_READ ||
	    zio->io_type == ZIO_TYPE_WRITE ||
	    zio->io_type == ZIO_TYPE_FREE ||
	    zio->io_type == ZIO_TYPE_IOCTL);

	cp = vd->vdev_tsd;
	if (cp == NULL) {
		zio->io_error = SET_ERROR(ENXIO);
		zio_interrupt(zio);
		return;
	}
	bp = g_alloc_bio();
	bp->bio_caller1 = zio;
	switch (zio->io_type) {
	case ZIO_TYPE_READ:
	case ZIO_TYPE_WRITE:
		zio->io_target_timestamp = zio_handle_io_delay(zio);
		bp->bio_cmd = zio->io_type == ZIO_TYPE_READ ?
		    BIO_READ : BIO_WRITE;
		bp->bio_data = zio->io_data;
		bp->bio_offset = zio->io_offset;
		bp->bio_length = zio->io_size;
		break;
	case ZIO_TYPE_FREE:
		bp->bio_cmd = BIO_DELETE;
		bp->bio_data = NULL;
		bp->bio_offset = zio->io_offset;
		bp->bio_length = zio->io_size;
		break;
	case ZIO_TYPE_IOCTL:
		bp->bio_cmd = BIO_FLUSH;
		bp->bio_flags |= BIO_ORDERED;
		bp->bio_data = NULL;
		bp->bio_offset = cp->provider->mediasize;
		bp->bio_length = 0;
		break;
	}
	bp->bio_done = vdev_geom_io_intr;

	g_io_request(bp, cp);
}

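/*
 * The remaining vdev_ops_t hooks are intentionally empty: I/O errors
 * are fully handled in vdev_geom_io_intr(), and hold/rele have nothing
 * extra to pin for a GEOM consumer.
 */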
static void
vdev_geom_io_done(zio_t *zio)
{
}

static void
vdev_geom_hold(vdev_t *vd)
{
}

static void
vdev_geom_rele(vdev_t *vd)
{
}

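/*
 * Positional initializer for vdev_ops_t: open, close, asize, io_start,
 * io_done, state_change (NULL, unused here), hold, rele, then the vdev
 * type name and the leaf flag.
 */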
vdev_ops_t vdev_geom_ops = {
	vdev_geom_open,
	vdev_geom_close,
	vdev_default_asize,
	vdev_geom_io_start,
	vdev_geom_io_done,
	NULL,
	vdev_geom_hold,
	vdev_geom_rele,
	VDEV_TYPE_DISK,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};
