vdev_geom.c revision 308059
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Portions Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>
 */

#include <sys/zfs_context.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/disk.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <geom/geom.h>
#include <geom/geom_int.h>

/*
 * Virtual device vector for GEOM.
 */

static g_attrchanged_t vdev_geom_attrchanged;
struct g_class zfs_vdev_class = {
	.name = "ZFS::VDEV",
	.version = G_VERSION,
	.attrchanged = vdev_geom_attrchanged,
};

DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);

SYSCTL_DECL(_vfs_zfs_vdev);
/* Don't send BIO_FLUSH. */
static int vdev_geom_bio_flush_disable = 0;
TUNABLE_INT("vfs.zfs.vdev.bio_flush_disable", &vdev_geom_bio_flush_disable);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RW,
    &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");
/* Don't send BIO_DELETE. */
static int vdev_geom_bio_delete_disable = 0;
TUNABLE_INT("vfs.zfs.vdev.bio_delete_disable", &vdev_geom_bio_delete_disable);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RW,
    &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");
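
/*
 * Both knobs are loader tunables as well as read/write sysctls.  For
 * example (illustrative administrator commands, not part of this file),
 * at runtime:
 *
 *	sysctl vfs.zfs.vdev.bio_flush_disable=1
 *
 * or at boot, in /boot/loader.conf:
 *
 *	vfs.zfs.vdev.bio_flush_disable="1"
 */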

/* Declare local functions */
static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read);

/*
 * Thread local storage used to indicate when a thread is probing geoms
 * for their guids.  If NULL, this thread is not tasting geoms.  If non-NULL,
 * it is looking for a replacement for the vdev_t* that is its value.
 */
uint_t zfs_geom_probe_vdev_key;
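
/*
 * A minimal sketch of how this key is meant to be used (illustrative;
 * both sides live outside this file).  The key is created once during
 * module initialization,
 *
 *	tsd_create(&zfs_geom_probe_vdev_key, NULL);
 *
 * and a driver that must not be tasted as a vdev (e.g. the zvol code)
 * can then refuse opens while a probe is in progress:
 *
 *	if (tsd_get(zfs_geom_probe_vdev_key) != NULL)
 *		return (EOPNOTSUPP);
 */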

static void
vdev_geom_set_rotation_rate(vdev_t *vd, struct g_consumer *cp)
{
	int error;
	uint16_t rate;

	error = g_getattr("GEOM::rotation_rate", cp, &rate);
	if (error == 0)
		vd->vdev_rotation_rate = rate;
	else
		vd->vdev_rotation_rate = VDEV_RATE_UNKNOWN;
}

static void
vdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
{
	vdev_t *vd;
	spa_t *spa;
	char *physpath;
	int error, physpath_len;

	vd = cp->private;
	if (vd == NULL)
		return;

	if (strcmp(attr, "GEOM::rotation_rate") == 0) {
		vdev_geom_set_rotation_rate(vd, cp);
		return;
	}

	if (strcmp(attr, "GEOM::physpath") != 0)
		return;

	if (g_access(cp, 1, 0, 0) != 0)
		return;

	/*
	 * Record/Update physical path information for this device.
	 */
	spa = vd->vdev_spa;
	physpath_len = MAXPATHLEN;
	physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO);
	error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath);
	g_access(cp, -1, 0, 0);
	if (error == 0) {
		char *old_physpath;

		/* g_topology lock ensures that vdev has not been closed */
		g_topology_assert();
		old_physpath = vd->vdev_physpath;
		vd->vdev_physpath = spa_strdup(physpath);
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);

		if (old_physpath != NULL)
			spa_strfree(old_physpath);
	}
	g_free(physpath);
}

static void
vdev_geom_orphan(struct g_consumer *cp)
{
	vdev_t *vd;

	g_topology_assert();

	vd = cp->private;
	if (vd == NULL) {
		/* Vdev close in progress.  Ignore the event. */
		return;
	}

	/*
	 * Orphan callbacks occur from the GEOM event thread.
	 * Concurrent with this call, new I/O requests may still be
	 * working their way through GEOM; they will only discover,
	 * once executed by the g_down thread, that we have been
	 * orphaned from our disk provider.  These I/Os must be
	 * retired before we can detach our consumer.  This is most
	 * easily achieved by acquiring the SPA ZIO configuration
	 * lock as a writer, but doing so with the GEOM topology
	 * lock held would cause a lock order reversal.  Instead,
	 * rely on the SPA's async removal support to invoke a close
	 * on this vdev once it is safe to do so.
	 */
	vd->vdev_remove_wanted = B_TRUE;
	spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
}

static struct g_consumer *
vdev_geom_attach(struct g_provider *pp, vdev_t *vd)
{
	struct g_geom *gp;
	struct g_consumer *cp;
	int error;

	g_topology_assert();

	ZFS_LOG(1, "Attaching to %s.", pp->name);

	if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) {
		ZFS_LOG(1, "Failing attach of %s. Incompatible sectorsize %d\n",
		    pp->name, pp->sectorsize);
		return (NULL);
	} else if (pp->mediasize < SPA_MINDEVSIZE) {
		ZFS_LOG(1, "Failing attach of %s. Incompatible mediasize %ju\n",
		    pp->name, pp->mediasize);
		return (NULL);
	}

	/* Do we have a geom already?  If not, create one. */
	LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
		if (gp->flags & G_GEOM_WITHER)
			continue;
		if (strcmp(gp->name, "zfs::vdev") != 0)
			continue;
		break;
	}
	if (gp == NULL) {
		gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
		gp->orphan = vdev_geom_orphan;
		gp->attrchanged = vdev_geom_attrchanged;
		cp = g_new_consumer(gp);
		error = g_attach(cp, pp);
		if (error != 0) {
			ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__,
			    __LINE__, error);
			vdev_geom_detach(cp, B_FALSE);
			return (NULL);
		}
		error = g_access(cp, 1, 0, 1);
		if (error != 0) {
			ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__,
			    __LINE__, error);
			vdev_geom_detach(cp, B_FALSE);
			return (NULL);
		}
		ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
	} else {
		/* Check if we are already connected to this provider. */
		LIST_FOREACH(cp, &gp->consumer, consumer) {
			if (cp->provider == pp) {
				ZFS_LOG(1, "Found consumer for %s.", pp->name);
				break;
			}
		}
		if (cp == NULL) {
			cp = g_new_consumer(gp);
			error = g_attach(cp, pp);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_attach failed: %d\n",
				    __func__, __LINE__, error);
				vdev_geom_detach(cp, B_FALSE);
				return (NULL);
			}
			error = g_access(cp, 1, 0, 1);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
				    __func__, __LINE__, error);
				vdev_geom_detach(cp, B_FALSE);
				return (NULL);
			}
			ZFS_LOG(1, "Created consumer for %s.", pp->name);
		} else {
			error = g_access(cp, 1, 0, 1);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
				    __func__, __LINE__, error);
				return (NULL);
			}
			ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
		}
	}

	/*
	 * BUG: cp may already belong to a vdev.  This could happen if:
	 * 1) That vdev is a shared spare, or
	 * 2) We are trying to reopen a missing vdev and we are scanning by
	 *    guid.  In that case, we'll ultimately fail to open this consumer,
	 *    but not until after setting the private field.
	 * The solution is to:
	 * 1) Don't set the private field until after the open succeeds, and
	 * 2) Set it to a linked list of vdevs, not just a single vdev.
	 */
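	/*
	 * A sketch of what fix (2) might look like (illustrative only; the
	 * list head and the vd_link field below are hypothetical and do not
	 * exist in this revision):
	 *
	 *	struct vdev_list *vl = cp->private;
	 *	SLIST_INSERT_HEAD(&vl->head, vd, vd_link);
	 */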
	cp->private = vd;
	if (vd != NULL)
		vd->vdev_tsd = cp;

	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
	return (cp);
}

static void
vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read)
{
	struct g_geom *gp;

	g_topology_assert();

	ZFS_LOG(1, "Detaching consumer. Provider %s.",
	    cp->provider && cp->provider->name ? cp->provider->name : "NULL");

	cp->private = NULL;

	gp = cp->geom;
	if (open_for_read)
		g_access(cp, -1, 0, -1);
	/* Destroy consumer on last close. */
	if (cp->acr == 0 && cp->ace == 0) {
		if (cp->acw > 0)
			g_access(cp, 0, -cp->acw, 0);
		if (cp->provider != NULL) {
			ZFS_LOG(1, "Destroying consumer to %s.",
			    cp->provider->name ? cp->provider->name : "NULL");
			g_detach(cp);
		}
		g_destroy_consumer(cp);
	}
	/* Destroy geom if there are no consumers left. */
	if (LIST_EMPTY(&gp->consumer)) {
		ZFS_LOG(1, "Destroyed geom %s.", gp->name);
		g_wither_geom(gp, ENXIO);
	}
}

static void
vdev_geom_close_locked(vdev_t *vd)
{
	struct g_consumer *cp;

	g_topology_assert();

	cp = vd->vdev_tsd;
	vd->vdev_tsd = NULL;
	vd->vdev_delayed_close = B_FALSE;
	if (cp == NULL)
		return;

	ZFS_LOG(1, "Closing access to %s.", cp->provider->name);

	vdev_geom_detach(cp, B_TRUE);
}

static void
nvlist_get_guids(nvlist_t *list, uint64_t *pguid, uint64_t *vguid)
{

	(void) nvlist_lookup_uint64(list, ZPOOL_CONFIG_GUID, vguid);
	(void) nvlist_lookup_uint64(list, ZPOOL_CONFIG_POOL_GUID, pguid);
}

static int
vdev_geom_io(struct g_consumer *cp, int cmd, void *data, off_t offset,
    off_t size)
{
	struct bio *bp;
	u_char *p;
	off_t off, maxio;
	int error;

	ASSERT((offset % cp->provider->sectorsize) == 0);
	ASSERT((size % cp->provider->sectorsize) == 0);

	bp = g_alloc_bio();
	off = offset;
	offset += size;
	p = data;
	maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize);
	error = 0;

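	/*
	 * Worked example (assuming the historical MAXPHYS of 128 kB and a
	 * 512-byte sector provider): maxio = 131072 - (131072 % 512) =
	 * 131072, so a 1 MB request is issued as eight sequential 128 kB
	 * bios below, reusing the same struct bio for each chunk.
	 */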
	for (; off < offset; off += maxio, p += maxio, size -= maxio) {
		bzero(bp, sizeof(*bp));
		bp->bio_cmd = cmd;
		bp->bio_done = NULL;
		bp->bio_offset = off;
		bp->bio_length = MIN(size, maxio);
		bp->bio_data = p;
		g_io_request(bp, cp);
		error = biowait(bp, "vdev_geom_io");
		if (error != 0)
			break;
	}

	g_destroy_bio(bp);
	return (error);
}

static int
vdev_geom_read_config(struct g_consumer *cp, nvlist_t **config)
{
	struct g_provider *pp;
	vdev_label_t *label;
	char *buf;
	size_t buflen;
	uint64_t psize;
	off_t offset, size;
	uint64_t state, txg;
	int l;

	g_topology_assert_not();

	pp = cp->provider;
	ZFS_LOG(1, "Reading config from %s...", pp->name);

	psize = pp->mediasize;
	psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t));

	size = sizeof(*label) + pp->sectorsize -
	    ((sizeof(*label) - 1) % pp->sectorsize) - 1;

	label = kmem_alloc(size, KM_SLEEP);
	buflen = sizeof(label->vl_vdev_phys.vp_nvlist);

	*config = NULL;
	for (l = 0; l < VDEV_LABELS; l++) {
		offset = vdev_label_offset(psize, l, 0);
		if ((offset % pp->sectorsize) != 0)
			continue;

		if (vdev_geom_io(cp, BIO_READ, label, offset, size) != 0)
			continue;
		buf = label->vl_vdev_phys.vp_nvlist;

		if (nvlist_unpack(buf, buflen, config, 0) != 0)
			continue;

		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0 || state > POOL_STATE_L2CACHE) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
		    (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
		    &txg) != 0 || txg == 0)) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		break;
	}

	kmem_free(label, size);
	return (*config == NULL ? ENOENT : 0);
}

static void
resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id)
{
	nvlist_t **new_configs;
	uint64_t i;

	if (id < *count)
		return;
	new_configs = kmem_zalloc((id + 1) * sizeof(nvlist_t *),
	    KM_SLEEP);
	for (i = 0; i < *count; i++)
		new_configs[i] = (*configs)[i];
	if (*configs != NULL)
		kmem_free(*configs, *count * sizeof(void *));
	*configs = new_configs;
	*count = id + 1;
}
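
/*
 * For example (illustrative numbers): with *count == 2, a call such as
 * resize_configs(&configs, &count, 5) allocates six zeroed slots, copies
 * the two existing pointers, frees the old array, and sets *count to 6;
 * slots 2 through 4 remain NULL holes until their ids are seen.
 */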

static void
process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg,
    const char *name, uint64_t *known_pool_guid)
{
	nvlist_t *vdev_tree;
	uint64_t pool_guid;
	uint64_t vdev_guid;
	uint64_t id, txg, known_txg;
	char *pname;

	if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 ||
	    strcmp(pname, name) != 0)
		goto ignore;

	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
		goto ignore;

	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0)
		goto ignore;

	if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0)
		goto ignore;

	if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0)
		goto ignore;

	VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);

	if (*known_pool_guid != 0) {
		if (pool_guid != *known_pool_guid)
			goto ignore;
	} else
		*known_pool_guid = pool_guid;

	resize_configs(configs, count, id);

	if ((*configs)[id] != NULL) {
		VERIFY(nvlist_lookup_uint64((*configs)[id],
		    ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0);
		if (txg <= known_txg)
			goto ignore;
		nvlist_free((*configs)[id]);
	}

	(*configs)[id] = cfg;
	return;

ignore:
	nvlist_free(cfg);
}

int
vdev_geom_read_pool_label(const char *name,
    nvlist_t ***configs, uint64_t *count)
{
	struct g_class *mp;
	struct g_geom *gp;
	struct g_provider *pp;
	struct g_consumer *zcp;
	nvlist_t *vdev_cfg;
	uint64_t pool_guid;
	int error;

	DROP_GIANT();
	g_topology_lock();

	*configs = NULL;
	*count = 0;
	pool_guid = 0;
	LIST_FOREACH(mp, &g_classes, class) {
		if (mp == &zfs_vdev_class)
			continue;
		LIST_FOREACH(gp, &mp->geom, geom) {
			if (gp->flags & G_GEOM_WITHER)
				continue;
			LIST_FOREACH(pp, &gp->provider, provider) {
				if (pp->flags & G_PF_WITHER)
					continue;
				zcp = vdev_geom_attach(pp, NULL);
				if (zcp == NULL)
					continue;
				g_topology_unlock();
				error = vdev_geom_read_config(zcp, &vdev_cfg);
				g_topology_lock();
				vdev_geom_detach(zcp, B_TRUE);
				if (error)
					continue;
				ZFS_LOG(1, "successfully read vdev config");

				process_vdev_config(configs, count,
				    vdev_cfg, name, &pool_guid);
			}
		}
	}
	g_topology_unlock();
	PICKUP_GIANT();

	return (*count > 0 ? 0 : ENOENT);
}
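
/*
 * Caller's view (an illustrative sketch, not code from this file): on
 * success, *configs holds *count slots indexed by top-level vdev id, any
 * of which may be NULL if that label was never seen.
 *
 *	nvlist_t **configs;
 *	uint64_t count, i;
 *
 *	if (vdev_geom_read_pool_label("tank", &configs, &count) == 0) {
 *		for (i = 0; i < count; i++)
 *			if (configs[i] != NULL)
 *				nvlist_free(configs[i]);
 *		kmem_free(configs, count * sizeof(nvlist_t *));
 *	}
 */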

static void
vdev_geom_read_guids(struct g_consumer *cp, uint64_t *pguid, uint64_t *vguid)
{
	nvlist_t *config;

	g_topology_assert_not();

	*pguid = 0;
	*vguid = 0;
	if (vdev_geom_read_config(cp, &config) == 0) {
		nvlist_get_guids(config, pguid, vguid);
		nvlist_free(config);
	}
}

static boolean_t
vdev_attach_ok(vdev_t *vd, struct g_provider *pp)
{
	uint64_t pool_guid;
	uint64_t vdev_guid;
	struct g_consumer *zcp;

	zcp = vdev_geom_attach(pp, NULL);
	if (zcp == NULL) {
		ZFS_LOG(1, "Unable to attach tasting instance to %s.",
		    pp->name);
		return (B_FALSE);
	}
	g_topology_unlock();
	vdev_geom_read_guids(zcp, &pool_guid, &vdev_guid);
	g_topology_lock();
	vdev_geom_detach(zcp, B_TRUE);

	/*
	 * Check that the label's vdev guid matches the desired guid.  If the
	 * label has a pool guid, check that it matches too.  (Inactive spares
	 * and L2ARCs do not have any pool guid in the label.)
	 */
	if ((pool_guid == 0 || pool_guid == spa_guid(vd->vdev_spa)) &&
	    vdev_guid == vd->vdev_guid) {
		ZFS_LOG(1, "guids match for provider %s.", vd->vdev_path);
		return (B_TRUE);
	} else {
		ZFS_LOG(1, "guid mismatch for provider %s: "
		    "%ju:%ju != %ju:%ju.", vd->vdev_path,
		    (uintmax_t)spa_guid(vd->vdev_spa),
		    (uintmax_t)vd->vdev_guid,
		    (uintmax_t)pool_guid, (uintmax_t)vdev_guid);
		return (B_FALSE);
	}
}

static struct g_consumer *
vdev_geom_attach_by_guids(vdev_t *vd)
{
	struct g_class *mp;
	struct g_geom *gp;
	struct g_provider *pp;
	struct g_consumer *cp;

	g_topology_assert();

	cp = NULL;
	LIST_FOREACH(mp, &g_classes, class) {
		if (mp == &zfs_vdev_class)
			continue;
		LIST_FOREACH(gp, &mp->geom, geom) {
			if (gp->flags & G_GEOM_WITHER)
				continue;
			LIST_FOREACH(pp, &gp->provider, provider) {
				if (!vdev_attach_ok(vd, pp))
					continue;
				cp = vdev_geom_attach(pp, vd);
				if (cp == NULL) {
					printf("ZFS WARNING: Unable to "
					    "attach to %s.\n", pp->name);
					continue;
				}
				break;
			}
			if (cp != NULL)
				break;
		}
		if (cp != NULL)
			break;
	}
	return (cp);
}

static struct g_consumer *
vdev_geom_open_by_guids(vdev_t *vd)
{
	struct g_consumer *cp;
	char *buf;
	size_t len;

	g_topology_assert();

	ZFS_LOG(1, "Searching by guids [%ju:%ju].",
	    (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid);
	cp = vdev_geom_attach_by_guids(vd);
	if (cp != NULL) {
		len = strlen(cp->provider->name) + strlen("/dev/") + 1;
		buf = kmem_alloc(len, KM_SLEEP);

		snprintf(buf, len, "/dev/%s", cp->provider->name);
		spa_strfree(vd->vdev_path);
		vd->vdev_path = buf;

		ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.",
		    (uintmax_t)spa_guid(vd->vdev_spa),
		    (uintmax_t)vd->vdev_guid, vd->vdev_path);
	} else {
		ZFS_LOG(1, "Search by guid [%ju:%ju] failed.",
		    (uintmax_t)spa_guid(vd->vdev_spa),
		    (uintmax_t)vd->vdev_guid);
	}

	return (cp);
}

static struct g_consumer *
vdev_geom_open_by_path(vdev_t *vd, int check_guid)
{
	struct g_provider *pp;
	struct g_consumer *cp;

	g_topology_assert();

	cp = NULL;
	/* Skip the leading "/dev/" (sizeof includes the NUL, hence -1). */
	pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1);
	if (pp != NULL) {
		ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
		if (!check_guid || vdev_attach_ok(vd, pp))
			cp = vdev_geom_attach(pp, vd);
	}

	return (cp);
}

static int
vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	struct g_provider *pp;
	struct g_consumer *cp;
	int error;

	/*
	 * We must have a pathname, and it must be absolute.
	 */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (EINVAL);
	}

	/* Set TLS to indicate downstack that we should not access zvols. */
	VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0);

	vd->vdev_tsd = NULL;

	DROP_GIANT();
	g_topology_lock();
	error = 0;

	if (vd->vdev_spa->spa_splitting_newspa ||
	    (vd->vdev_prevstate == VDEV_STATE_UNKNOWN &&
	    vd->vdev_spa->spa_load_state == SPA_LOAD_NONE) ||
	    vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE) {
		/*
		 * We are dealing with a vdev that hasn't been previously
		 * opened (since boot), and we are not loading an
		 * existing pool configuration.  This looks like a
		 * vdev add operation to a new or existing pool.
		 * Assume the user knows what he/she is doing and find
		 * GEOM provider by its name, ignoring GUID mismatches.
		 *
		 * XXPOLICY: It would be safer to only allow a device
		 *           that is unlabeled or labeled but missing
		 *           GUID information to be opened in this fashion,
		 *           unless we are doing a split, in which case we
		 *           should allow any guid.
		 */
		cp = vdev_geom_open_by_path(vd, 0);
	} else {
		/*
		 * Try using the recorded path for this device, but only
		 * accept it if its label data contains the expected GUIDs.
		 */
		cp = vdev_geom_open_by_path(vd, 1);
		if (cp == NULL) {
			/*
			 * The device at vd->vdev_path doesn't have the
			 * expected GUIDs.  The disks might have merely
			 * moved around so try all other GEOM providers
			 * to find one with the right GUIDs.
			 */
			cp = vdev_geom_open_by_guids(vd);
		}
	}

	/* Clear the TLS now that tasting is done. */
	VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0);

	if (cp == NULL) {
		ZFS_LOG(1, "Provider %s not found.", vd->vdev_path);
		error = ENOENT;
	} else if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
	    !ISP2(cp->provider->sectorsize)) {
		ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
		    vd->vdev_path);

		vdev_geom_close_locked(vd);
		error = EINVAL;
		cp = NULL;
	} else if (cp->acw == 0 && (spa_mode(vd->vdev_spa) & FWRITE) != 0) {
		int i;

		for (i = 0; i < 5; i++) {
			error = g_access(cp, 0, 1, 0);
			if (error == 0)
				break;
			g_topology_unlock();
			tsleep(vd, 0, "vdev", hz / 2);
			g_topology_lock();
		}
		if (error != 0) {
			printf("ZFS WARNING: Unable to open %s for writing "
			    "(error=%d).\n", vd->vdev_path, error);
			vdev_geom_close_locked(vd);
			cp = NULL;
		}
	}

	/* Fetch initial physical path information for this device. */
	if (cp != NULL)
		vdev_geom_attrchanged(cp, "GEOM::physpath");

	g_topology_unlock();
	PICKUP_GIANT();
	if (cp == NULL) {
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
		return (error);
	}
	pp = cp->provider;

	/*
	 * Determine the actual size of the device.
	 */
	*max_psize = *psize = pp->mediasize;

	/*
	 * Determine the device's minimum transfer size and preferred
	 * transfer size.
	 */
	*logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
	*physical_ashift = 0;
	if (pp->stripesize > (1 << *logical_ashift) && ISP2(pp->stripesize) &&
	    pp->stripesize <= (1 << SPA_MAXASHIFT) && pp->stripeoffset == 0)
		*physical_ashift = highbit(pp->stripesize) - 1;
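
	/*
	 * Worked example (illustrative numbers): a provider with 512-byte
	 * sectors gives *logical_ashift = highbit(512) - 1 = 9; if it also
	 * reports a 4096-byte stripesize at stripeoffset 0, then
	 * *physical_ashift = highbit(4096) - 1 = 12.
	 */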

	/*
	 * Clear the nowritecache settings, so that on a vdev_reopen()
	 * we will try again.
	 */
	vd->vdev_nowritecache = B_FALSE;

	/*
	 * Determine the device's rotation rate.
	 */
	vdev_geom_set_rotation_rate(vd, cp);

	return (0);
}

static void
vdev_geom_close(vdev_t *vd)
{

	DROP_GIANT();
	g_topology_lock();
	vdev_geom_close_locked(vd);
	g_topology_unlock();
	PICKUP_GIANT();
}

static void
vdev_geom_io_intr(struct bio *bp)
{
	vdev_t *vd;
	zio_t *zio;

	zio = bp->bio_caller1;
	vd = zio->io_vd;
	zio->io_error = bp->bio_error;
	if (zio->io_error == 0 && bp->bio_resid != 0)
		zio->io_error = SET_ERROR(EIO);

	switch (zio->io_error) {
	case ENOTSUP:
		/*
		 * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know
		 * that future attempts will never succeed.  In this case
		 * we set a persistent flag so that we don't bother with
		 * requests in the future.
		 */
		switch (bp->bio_cmd) {
		case BIO_FLUSH:
			vd->vdev_nowritecache = B_TRUE;
			break;
		case BIO_DELETE:
			vd->vdev_notrim = B_TRUE;
			break;
		}
		break;
	case ENXIO:
		if (!vd->vdev_remove_wanted) {
			/*
			 * If the provider's error is set we assume it is
			 * being removed.
			 */
			if (bp->bio_to->error != 0) {
				vd->vdev_remove_wanted = B_TRUE;
				spa_async_request(zio->io_spa,
				    SPA_ASYNC_REMOVE);
			} else if (!vd->vdev_delayed_close) {
				vd->vdev_delayed_close = B_TRUE;
			}
		}
		break;
	}
	g_destroy_bio(bp);
	zio_delay_interrupt(zio);
}

static void
vdev_geom_io_start(zio_t *zio)
{
	vdev_t *vd;
	struct g_consumer *cp;
	struct bio *bp;

	vd = zio->io_vd;

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:
		/* XXPOLICY */
		if (!vdev_readable(vd)) {
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		} else {
			switch (zio->io_cmd) {
			case DKIOCFLUSHWRITECACHE:
				if (zfs_nocacheflush ||
				    vdev_geom_bio_flush_disable)
					break;
				if (vd->vdev_nowritecache) {
					zio->io_error = SET_ERROR(ENOTSUP);
					break;
				}
				goto sendreq;
			default:
				zio->io_error = SET_ERROR(ENOTSUP);
			}
		}

		zio_execute(zio);
		return;
	case ZIO_TYPE_FREE:
		if (vd->vdev_notrim) {
			zio->io_error = SET_ERROR(ENOTSUP);
		} else if (!vdev_geom_bio_delete_disable) {
			goto sendreq;
		}
		zio_execute(zio);
		return;
	}
sendreq:
	ASSERT(zio->io_type == ZIO_TYPE_READ ||
	    zio->io_type == ZIO_TYPE_WRITE ||
	    zio->io_type == ZIO_TYPE_FREE ||
	    zio->io_type == ZIO_TYPE_IOCTL);

	cp = vd->vdev_tsd;
	if (cp == NULL) {
		zio->io_error = SET_ERROR(ENXIO);
		zio_interrupt(zio);
		return;
	}
	bp = g_alloc_bio();
	bp->bio_caller1 = zio;
	switch (zio->io_type) {
	case ZIO_TYPE_READ:
	case ZIO_TYPE_WRITE:
		zio->io_target_timestamp = zio_handle_io_delay(zio);
		bp->bio_cmd = zio->io_type == ZIO_TYPE_READ ?
		    BIO_READ : BIO_WRITE;
		bp->bio_data = zio->io_data;
		bp->bio_offset = zio->io_offset;
		bp->bio_length = zio->io_size;
		break;
	case ZIO_TYPE_FREE:
		bp->bio_cmd = BIO_DELETE;
		bp->bio_data = NULL;
		bp->bio_offset = zio->io_offset;
		bp->bio_length = zio->io_size;
		break;
	case ZIO_TYPE_IOCTL:
		bp->bio_cmd = BIO_FLUSH;
		bp->bio_flags |= BIO_ORDERED;
		bp->bio_data = NULL;
		bp->bio_offset = cp->provider->mediasize;
		bp->bio_length = 0;
		break;
	}
	bp->bio_done = vdev_geom_io_intr;

	g_io_request(bp, cp);
}

static void
vdev_geom_io_done(zio_t *zio)
{
}

static void
vdev_geom_hold(vdev_t *vd)
{
}

static void
vdev_geom_rele(vdev_t *vd)
{
}

vdev_ops_t vdev_geom_ops = {
	vdev_geom_open,
	vdev_geom_close,
	vdev_default_asize,
	vdev_geom_io_start,
	vdev_geom_io_done,
	NULL,			/* vdev_op_state_change */
	vdev_geom_hold,
	vdev_geom_rele,
	VDEV_TYPE_DISK,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};