vdev_geom.c revision 308060
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Portions Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>
 */

#include <sys/zfs_context.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/disk.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <geom/geom.h>
#include <geom/geom_int.h>

/*
 * Virtual device vector for GEOM.
 */

static g_attrchanged_t vdev_geom_attrchanged;
struct g_class zfs_vdev_class = {
	.name = "ZFS::VDEV",
	.version = G_VERSION,
	.attrchanged = vdev_geom_attrchanged,
};

DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);

SYSCTL_DECL(_vfs_zfs_vdev);
/* Don't send BIO_FLUSH. */
static int vdev_geom_bio_flush_disable = 0;
TUNABLE_INT("vfs.zfs.vdev.bio_flush_disable", &vdev_geom_bio_flush_disable);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RW,
    &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");
/* Don't send BIO_DELETE. */
static int vdev_geom_bio_delete_disable = 0;
TUNABLE_INT("vfs.zfs.vdev.bio_delete_disable", &vdev_geom_bio_delete_disable);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RW,
    &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");
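
/*
 * Example usage: both knobs are boot-time tunables and read/write
 * sysctls, so they may be set in /boot/loader.conf or adjusted on a
 * running system, e.g.:
 *
 *	vfs.zfs.vdev.bio_flush_disable="1"		(loader.conf)
 *	# sysctl vfs.zfs.vdev.bio_delete_disable=1	(command line)
 */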

/* Declare local functions */
static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read);

/*
 * Thread local storage used to indicate when a thread is probing geoms
 * for their guids.  If NULL, this thread is not tasting geoms.  If non-NULL,
 * it is looking for a replacement for the vdev_t* that is its value.
 */
uint_t zfs_geom_probe_vdev_key;
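
/*
 * Illustrative sketch of the protocol (see vdev_geom_open() below):
 *
 *	VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0);
 *	... taste GEOM providers ...
 *	VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0);
 *
 * Code that opens zvols can consult tsd_get(zfs_geom_probe_vdev_key)
 * to refuse opens issued while such a probe is in progress.
 */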

static void
vdev_geom_set_rotation_rate(vdev_t *vd, struct g_consumer *cp)
{
	int error;
	uint16_t rate;

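	/*
	 * The attribute uses the disk(9) convention: 0 means the rate
	 * is unknown, 1 means a non-rotating device (e.g. an SSD), and
	 * any other value is the nominal rotation rate in RPM.
	 */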
	error = g_getattr("GEOM::rotation_rate", cp, &rate);
	if (error == 0)
		vd->vdev_rotation_rate = rate;
	else
		vd->vdev_rotation_rate = VDEV_RATE_UNKNOWN;
}

static void
vdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
{
	vdev_t *vd;
	spa_t *spa;
	char *physpath;
	int error, physpath_len;

	vd = cp->private;
	if (vd == NULL)
		return;

	if (strcmp(attr, "GEOM::rotation_rate") == 0) {
		vdev_geom_set_rotation_rate(vd, cp);
		return;
	}

	if (strcmp(attr, "GEOM::physpath") != 0)
		return;

	if (g_access(cp, 1, 0, 0) != 0)
		return;

	/*
	 * Record/Update physical path information for this device.
	 */
	spa = vd->vdev_spa;
	physpath_len = MAXPATHLEN;
	physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO);
	error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath);
	g_access(cp, -1, 0, 0);
	if (error == 0) {
		char *old_physpath;

		/* g_topology lock ensures that vdev has not been closed */
		g_topology_assert();
		old_physpath = vd->vdev_physpath;
		vd->vdev_physpath = spa_strdup(physpath);
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);

		if (old_physpath != NULL)
			spa_strfree(old_physpath);
	}
	g_free(physpath);
}

static void
vdev_geom_orphan(struct g_consumer *cp)
{
	vdev_t *vd;

	g_topology_assert();

	vd = cp->private;
	if (vd == NULL) {
		/* Vdev close in progress.  Ignore the event. */
		return;
	}

	/*
	 * Orphan callbacks occur from the GEOM event thread.
	 * Concurrent with this call, new I/O requests may be
	 * working their way through GEOM about to find out
	 * (only once executed by the g_down thread) that we've
	 * been orphaned from our disk provider.  These I/Os
	 * must be retired before we can detach our consumer.
	 * This is most easily achieved by acquiring the
	 * SPA ZIO configuration lock as a writer, but doing
	 * so with the GEOM topology lock held would cause
	 * a lock order reversal.  Instead, rely on the SPA's
	 * async removal support to invoke a close on this
	 * vdev once it is safe to do so.
	 */
	vd->vdev_remove_wanted = B_TRUE;
	spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
}

static struct g_consumer *
vdev_geom_attach(struct g_provider *pp, vdev_t *vd)
{
	struct g_geom *gp;
	struct g_consumer *cp;
	int error;

	g_topology_assert();

	ZFS_LOG(1, "Attaching to %s.", pp->name);

	if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) {
		ZFS_LOG(1, "Failing attach of %s. Incompatible sectorsize %d\n",
		    pp->name, pp->sectorsize);
		return (NULL);
	} else if (pp->mediasize < SPA_MINDEVSIZE) {
		ZFS_LOG(1, "Failing attach of %s. Incompatible mediasize %ju\n",
		    pp->name, pp->mediasize);
		return (NULL);
	}

	/* Do we have geom already? No? Create one. */
	LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
		if (gp->flags & G_GEOM_WITHER)
			continue;
		if (strcmp(gp->name, "zfs::vdev") != 0)
			continue;
		break;
	}
	if (gp == NULL) {
		gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
		gp->orphan = vdev_geom_orphan;
		gp->attrchanged = vdev_geom_attrchanged;
		cp = g_new_consumer(gp);
		error = g_attach(cp, pp);
		if (error != 0) {
			ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__,
			    __LINE__, error);
			vdev_geom_detach(cp, B_FALSE);
			return (NULL);
		}
		error = g_access(cp, 1, 0, 1);
		if (error != 0) {
			ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__,
			    __LINE__, error);
			vdev_geom_detach(cp, B_FALSE);
			return (NULL);
		}
		ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
	} else {
		/* Check if we are already connected to this provider. */
		LIST_FOREACH(cp, &gp->consumer, consumer) {
			if (cp->provider == pp) {
				ZFS_LOG(1, "Found consumer for %s.", pp->name);
				break;
			}
		}
		if (cp == NULL) {
			cp = g_new_consumer(gp);
			error = g_attach(cp, pp);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_attach failed: %d\n",
				    __func__, __LINE__, error);
				vdev_geom_detach(cp, B_FALSE);
				return (NULL);
			}
			error = g_access(cp, 1, 0, 1);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
				    __func__, __LINE__, error);
				vdev_geom_detach(cp, B_FALSE);
				return (NULL);
			}
			ZFS_LOG(1, "Created consumer for %s.", pp->name);
		} else {
			error = g_access(cp, 1, 0, 1);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
				    __func__, __LINE__, error);
				return (NULL);
			}
			ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
		}
	}

	/*
	 * BUG: cp may already belong to a vdev.  This could happen if:
	 * 1) That vdev is a shared spare, or
	 * 2) We are trying to reopen a missing vdev and we are scanning by
	 *    guid.  In that case, we'll ultimately fail to open this consumer,
	 *    but not until after setting the private field.
	 * The solution is to:
	 * 1) Don't set the private field until after the open succeeds, and
	 * 2) Set it to a linked list of vdevs, not just a single vdev
	 */
	cp->private = vd;
	if (vd != NULL)
		vd->vdev_tsd = cp;

	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
	return (cp);
}

static void
vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read)
{
	struct g_geom *gp;
	vdev_t *vd;

	g_topology_assert();

	ZFS_LOG(1, "Detaching consumer. Provider %s.",
	    cp->provider && cp->provider->name ? cp->provider->name : "NULL");

	vd = cp->private;
	cp->private = NULL;

	gp = cp->geom;
	if (open_for_read)
		g_access(cp, -1, 0, -1);
	/* Destroy consumer on last close. */
	if (cp->acr == 0 && cp->ace == 0) {
		if (cp->acw > 0)
			g_access(cp, 0, -cp->acw, 0);
		if (cp->provider != NULL) {
			ZFS_LOG(1, "Destroying consumer to %s.",
			    cp->provider->name ? cp->provider->name : "NULL");
			g_detach(cp);
		}
		g_destroy_consumer(cp);
	}
	/* Destroy geom if there are no consumers left. */
	if (LIST_EMPTY(&gp->consumer)) {
		ZFS_LOG(1, "Destroyed geom %s.", gp->name);
		g_wither_geom(gp, ENXIO);
	}
}

static void
vdev_geom_close_locked(vdev_t *vd)
{
	struct g_consumer *cp;

	g_topology_assert();

	cp = vd->vdev_tsd;
	vd->vdev_tsd = NULL;
	vd->vdev_delayed_close = B_FALSE;
	if (cp == NULL)
		return;

	ZFS_LOG(1, "Closing access to %s.", cp->provider->name);

	vdev_geom_detach(cp, B_TRUE);
}

static void
nvlist_get_guids(nvlist_t *list, uint64_t *pguid, uint64_t *vguid)
{

	(void) nvlist_lookup_uint64(list, ZPOOL_CONFIG_GUID, vguid);
	(void) nvlist_lookup_uint64(list, ZPOOL_CONFIG_POOL_GUID, pguid);
}

/*
 * Issue one or more bios to the vdev in parallel.
 * cmds, datas, offsets, errors, and sizes are arrays of length ncmds.
 * Each I/O operation is described by parallel entries from each array.
 * There may be more bios actually issued than entries in the arrays.
 */
static void
vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets,
    off_t *sizes, int *errors, int ncmds)
{
	struct bio **bios;
	u_char *p;
	off_t off, maxio, s, end;
	int i, n_bios, j;
	size_t bios_size;

	maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize);
	n_bios = 0;

	/* How many bios are required for all commands? */
	for (i = 0; i < ncmds; i++)
		n_bios += (sizes[i] + maxio - 1) / maxio;

	/* Allocate memory for the bios */
	bios_size = n_bios * sizeof(struct bio *);
	bios = kmem_zalloc(bios_size, KM_SLEEP);

	/* Prepare and issue all of the bios */
	for (i = j = 0; i < ncmds; i++) {
		off = offsets[i];
		p = datas[i];
		s = sizes[i];
		end = off + s;
		ASSERT((off % cp->provider->sectorsize) == 0);
		ASSERT((s % cp->provider->sectorsize) == 0);

		for (; off < end; off += maxio, p += maxio, s -= maxio, j++) {
			bios[j] = g_alloc_bio();
			bios[j]->bio_cmd = cmds[i];
			bios[j]->bio_done = NULL;
			bios[j]->bio_offset = off;
			bios[j]->bio_length = MIN(s, maxio);
			bios[j]->bio_data = p;
			g_io_request(bios[j], cp);
		}
	}
	ASSERT(j == n_bios);

	/* Wait for all of the bios to complete, and clean them up */
	for (i = j = 0; i < ncmds; i++) {
		off = offsets[i];
		s = sizes[i];
		end = off + s;

		for (; off < end; off += maxio, s -= maxio, j++) {
			errors[i] = biowait(bios[j], "vdev_geom_io") ||
			    errors[i];
			g_destroy_bio(bios[j]);
		}
	}
	kmem_free(bios, bios_size);
}
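
/*
 * Illustrative call (a sketch): reading one sector-aligned block with
 * a single command would look like
 *
 *	int cmd = BIO_READ, err = 0;
 *	void *data = buf;
 *	off_t off = 0, size = SPA_MINBLOCKSIZE;
 *
 *	vdev_geom_io(cp, &cmd, &data, &off, &size, &err, 1);
 *
 * vdev_geom_read_config() below uses the same interface to fetch all
 * VDEV_LABELS labels in one parallel batch.
 */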

static int
vdev_geom_read_config(struct g_consumer *cp, nvlist_t **config)
{
	struct g_provider *pp;
	vdev_phys_t *vdev_lists[VDEV_LABELS];
	char *buf;
	size_t buflen;
	uint64_t psize, state, txg;
	off_t offsets[VDEV_LABELS];
	off_t size;
	off_t sizes[VDEV_LABELS];
	int cmds[VDEV_LABELS];
	int errors[VDEV_LABELS];
	int l;

	g_topology_assert_not();

	pp = cp->provider;
	ZFS_LOG(1, "Reading config from %s...", pp->name);

	psize = pp->mediasize;
	psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t));

	size = sizeof(*vdev_lists[0]) + pp->sectorsize -
	    ((sizeof(*vdev_lists[0]) - 1) % pp->sectorsize) - 1;

	buflen = sizeof(vdev_lists[0]->vp_nvlist);

	*config = NULL;
	/* Create all of the IO requests */
	for (l = 0; l < VDEV_LABELS; l++) {
		cmds[l] = BIO_READ;
		vdev_lists[l] = kmem_alloc(size, KM_SLEEP);
		offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE;
		sizes[l] = size;
		errors[l] = 0;
		ASSERT(offsets[l] % pp->sectorsize == 0);
	}

	/* Issue the IO requests */
	vdev_geom_io(cp, cmds, (void **)vdev_lists, offsets, sizes, errors,
	    VDEV_LABELS);

	/* Parse the labels */
	for (l = 0; l < VDEV_LABELS; l++) {
		if (errors[l] != 0)
			continue;

		buf = vdev_lists[l]->vp_nvlist;

		if (nvlist_unpack(buf, buflen, config, 0) != 0)
			continue;

		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0 || state > POOL_STATE_L2CACHE) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		if (state != POOL_STATE_SPARE &&
		    state != POOL_STATE_L2CACHE &&
		    (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
		    &txg) != 0 || txg == 0)) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		break;
	}

	/* Free the label storage */
	for (l = 0; l < VDEV_LABELS; l++)
		kmem_free(vdev_lists[l], size);

	return (*config == NULL ? ENOENT : 0);
}

static void
resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id)
{
	nvlist_t **new_configs;
	uint64_t i;

	if (id < *count)
		return;
	new_configs = kmem_zalloc((id + 1) * sizeof(nvlist_t *),
	    KM_SLEEP);
	for (i = 0; i < *count; i++)
		new_configs[i] = (*configs)[i];
	if (*configs != NULL)
		kmem_free(*configs, *count * sizeof(void *));
	*configs = new_configs;
	*count = id + 1;
}

static void
process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg,
    const char *name, uint64_t *known_pool_guid)
{
	nvlist_t *vdev_tree;
	uint64_t pool_guid;
	uint64_t vdev_guid;
	uint64_t id, txg, known_txg;
	char *pname;

	if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 ||
	    strcmp(pname, name) != 0)
		goto ignore;

	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
		goto ignore;

	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0)
		goto ignore;

	if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0)
		goto ignore;

	if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0)
		goto ignore;

	VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);

	if (*known_pool_guid != 0) {
		if (pool_guid != *known_pool_guid)
			goto ignore;
	} else
		*known_pool_guid = pool_guid;

	resize_configs(configs, count, id);

	if ((*configs)[id] != NULL) {
		VERIFY(nvlist_lookup_uint64((*configs)[id],
		    ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0);
		if (txg <= known_txg)
			goto ignore;
		nvlist_free((*configs)[id]);
	}

	(*configs)[id] = cfg;
	return;

ignore:
	nvlist_free(cfg);
}

int
vdev_geom_read_pool_label(const char *name,
    nvlist_t ***configs, uint64_t *count)
{
	struct g_class *mp;
	struct g_geom *gp;
	struct g_provider *pp;
	struct g_consumer *zcp;
	nvlist_t *vdev_cfg;
	uint64_t pool_guid;
	int error;

	DROP_GIANT();
	g_topology_lock();

	*configs = NULL;
	*count = 0;
	pool_guid = 0;
	LIST_FOREACH(mp, &g_classes, class) {
		if (mp == &zfs_vdev_class)
			continue;
		LIST_FOREACH(gp, &mp->geom, geom) {
			if (gp->flags & G_GEOM_WITHER)
				continue;
			LIST_FOREACH(pp, &gp->provider, provider) {
				if (pp->flags & G_PF_WITHER)
					continue;
				zcp = vdev_geom_attach(pp, NULL);
				if (zcp == NULL)
					continue;
				g_topology_unlock();
				error = vdev_geom_read_config(zcp, &vdev_cfg);
				g_topology_lock();
				vdev_geom_detach(zcp, B_TRUE);
				if (error)
					continue;
				ZFS_LOG(1, "successfully read vdev config");

				process_vdev_config(configs, count,
				    vdev_cfg, name, &pool_guid);
			}
		}
	}
	g_topology_unlock();
	PICKUP_GIANT();

	return (*count > 0 ? 0 : ENOENT);
}

static void
vdev_geom_read_guids(struct g_consumer *cp, uint64_t *pguid, uint64_t *vguid)
{
	nvlist_t *config;

	g_topology_assert_not();

	*pguid = 0;
	*vguid = 0;
	if (vdev_geom_read_config(cp, &config) == 0) {
		nvlist_get_guids(config, pguid, vguid);
		nvlist_free(config);
	}
}

static boolean_t
vdev_attach_ok(vdev_t *vd, struct g_provider *pp)
{
	uint64_t pool_guid;
	uint64_t vdev_guid;
	struct g_consumer *zcp;

	zcp = vdev_geom_attach(pp, NULL);
	if (zcp == NULL) {
		ZFS_LOG(1, "Unable to attach tasting instance to %s.",
		    pp->name);
		return (B_FALSE);
	}
	g_topology_unlock();
	vdev_geom_read_guids(zcp, &pool_guid, &vdev_guid);
	g_topology_lock();
	vdev_geom_detach(zcp, B_TRUE);

	/*
	 * Check that the label's vdev guid matches the desired guid.  If the
	 * label has a pool guid, check that it matches too. (Inactive spares
	 * and L2ARCs do not have any pool guid in the label.)
	 */
	if ((pool_guid == 0 || pool_guid == spa_guid(vd->vdev_spa)) &&
	    vdev_guid == vd->vdev_guid) {
		ZFS_LOG(1, "guids match for provider %s.", vd->vdev_path);
		return (B_TRUE);
	} else {
		ZFS_LOG(1, "guid mismatch for provider %s: "
		    "%ju:%ju != %ju:%ju.", vd->vdev_path,
		    (uintmax_t)spa_guid(vd->vdev_spa),
		    (uintmax_t)vd->vdev_guid,
		    (uintmax_t)pool_guid, (uintmax_t)vdev_guid);
		return (B_FALSE);
	}
}

static struct g_consumer *
vdev_geom_attach_by_guids(vdev_t *vd)
{
	struct g_class *mp;
	struct g_geom *gp;
	struct g_provider *pp;
	struct g_consumer *cp;

	g_topology_assert();

	cp = NULL;
	LIST_FOREACH(mp, &g_classes, class) {
		if (mp == &zfs_vdev_class)
			continue;
		LIST_FOREACH(gp, &mp->geom, geom) {
			if (gp->flags & G_GEOM_WITHER)
				continue;
			LIST_FOREACH(pp, &gp->provider, provider) {
				if (!vdev_attach_ok(vd, pp))
					continue;
				cp = vdev_geom_attach(pp, vd);
				if (cp == NULL) {
					printf("ZFS WARNING: Unable to "
					    "attach to %s.\n", pp->name);
					continue;
				}
				break;
			}
			if (cp != NULL)
				break;
		}
		if (cp != NULL)
			break;
	}
	return (cp);
}

static struct g_consumer *
vdev_geom_open_by_guids(vdev_t *vd)
{
	struct g_consumer *cp;
	char *buf;
	size_t len;

	g_topology_assert();

	ZFS_LOG(1, "Searching by guids [%ju:%ju].",
	    (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid);
	cp = vdev_geom_attach_by_guids(vd);
	if (cp != NULL) {
		len = strlen(cp->provider->name) + strlen("/dev/") + 1;
		buf = kmem_alloc(len, KM_SLEEP);

		snprintf(buf, len, "/dev/%s", cp->provider->name);
		spa_strfree(vd->vdev_path);
		vd->vdev_path = buf;

		ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.",
		    (uintmax_t)spa_guid(vd->vdev_spa),
		    (uintmax_t)vd->vdev_guid, vd->vdev_path);
	} else {
		ZFS_LOG(1, "Search by guid [%ju:%ju] failed.",
		    (uintmax_t)spa_guid(vd->vdev_spa),
		    (uintmax_t)vd->vdev_guid);
	}

	return (cp);
}

static struct g_consumer *
vdev_geom_open_by_path(vdev_t *vd, int check_guid)
{
	struct g_provider *pp;
	struct g_consumer *cp;

	g_topology_assert();

	cp = NULL;
	pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1);
	if (pp != NULL) {
		ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
		if (!check_guid || vdev_attach_ok(vd, pp))
			cp = vdev_geom_attach(pp, vd);
	}

	return (cp);
}

static int
vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	struct g_provider *pp;
	struct g_consumer *cp;
	int error;

	/* Set the TLS to indicate downstack that we should not access zvols. */
	VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0);

	/*
	 * We must have a pathname, and it must be absolute.
	 */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (EINVAL);
	}

	vd->vdev_tsd = NULL;

	DROP_GIANT();
	g_topology_lock();
	error = 0;

	if (vd->vdev_spa->spa_splitting_newspa ||
	    (vd->vdev_prevstate == VDEV_STATE_UNKNOWN &&
	    (vd->vdev_spa->spa_load_state == SPA_LOAD_NONE ||
	    vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE))) {
		/*
		 * We are dealing with a vdev that hasn't been previously
		 * opened (since boot), and we are not loading an
		 * existing pool configuration.  This looks like a
		 * vdev add operation to a new or existing pool.
		 * Assume the user knows what he/she is doing and find
		 * the GEOM provider by its name, ignoring GUID mismatches.
		 *
		 * XXPOLICY: It would be safer to only allow a device
		 *           that is unlabeled or labeled but missing
		 *           GUID information to be opened in this fashion,
		 *           unless we are doing a split, in which case we
		 *           should allow any guid.
		 */
		cp = vdev_geom_open_by_path(vd, 0);
	} else {
		/*
		 * Try using the recorded path for this device, but only
		 * accept it if its label data contains the expected GUIDs.
		 */
		cp = vdev_geom_open_by_path(vd, 1);
		if (cp == NULL) {
			/*
			 * The device at vd->vdev_path doesn't have the
			 * expected GUIDs. The disks might have merely
			 * moved around so try all other GEOM providers
			 * to find one with the right GUIDs.
			 */
			cp = vdev_geom_open_by_guids(vd);
		}
	}

	/* Clear the TLS now that tasting is done */
	VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0);

	if (cp == NULL) {
		ZFS_LOG(1, "Provider %s not found.", vd->vdev_path);
		error = ENOENT;
	} else if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
	    !ISP2(cp->provider->sectorsize)) {
		ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
		    vd->vdev_path);

		vdev_geom_close_locked(vd);
		error = EINVAL;
		cp = NULL;
	} else if (cp->acw == 0 && (spa_mode(vd->vdev_spa) & FWRITE) != 0) {
		int i;

		for (i = 0; i < 5; i++) {
			error = g_access(cp, 0, 1, 0);
			if (error == 0)
				break;
			g_topology_unlock();
			tsleep(vd, 0, "vdev", hz / 2);
			g_topology_lock();
		}
		if (error != 0) {
			printf("ZFS WARNING: Unable to open %s for writing "
			    "(error=%d).\n", vd->vdev_path, error);
			vdev_geom_close_locked(vd);
			cp = NULL;
		}
	}

	/* Fetch initial physical path information for this device. */
	if (cp != NULL)
		vdev_geom_attrchanged(cp, "GEOM::physpath");

	g_topology_unlock();
	PICKUP_GIANT();
	if (cp == NULL) {
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
		return (error);
	}
	pp = cp->provider;

	/*
	 * Determine the actual size of the device.
	 */
	*max_psize = *psize = pp->mediasize;

	/*
	 * Determine the device's minimum transfer size and preferred
	 * transfer size.
	 */
	*logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
	*physical_ashift = 0;
	if (pp->stripesize > (1 << *logical_ashift) && ISP2(pp->stripesize) &&
	    pp->stripesize <= (1 << SPA_MAXASHIFT) && pp->stripeoffset == 0)
		*physical_ashift = highbit(pp->stripesize) - 1;
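
	/*
	 * Worked example (for illustration): a drive that reports
	 * 512-byte sectors with a 4096-byte stripe gives
	 * logical_ashift = highbit(512) - 1 = 9 and
	 * physical_ashift = highbit(4096) - 1 = 12, letting ZFS align
	 * allocations to the 4 KiB physical boundary while retaining
	 * 512-byte addressing.
	 */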

	/*
	 * Clear the nowritecache settings, so that on a vdev_reopen()
	 * we will try again.
	 */
	vd->vdev_nowritecache = B_FALSE;

	/*
	 * Determine the device's rotation rate.
	 */
	vdev_geom_set_rotation_rate(vd, cp);

	return (0);
}

static void
vdev_geom_close(vdev_t *vd)
{

	DROP_GIANT();
	g_topology_lock();
	vdev_geom_close_locked(vd);
	g_topology_unlock();
	PICKUP_GIANT();
}

static void
vdev_geom_io_intr(struct bio *bp)
{
	vdev_t *vd;
	zio_t *zio;

	zio = bp->bio_caller1;
	vd = zio->io_vd;
	zio->io_error = bp->bio_error;
	if (zio->io_error == 0 && bp->bio_resid != 0)
		zio->io_error = SET_ERROR(EIO);

	switch (zio->io_error) {
	case ENOTSUP:
		/*
		 * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know
		 * that future attempts will never succeed. In this case
		 * we set a persistent flag so that we don't bother with
		 * such requests in the future.
		 */
		switch (bp->bio_cmd) {
		case BIO_FLUSH:
			vd->vdev_nowritecache = B_TRUE;
			break;
		case BIO_DELETE:
			vd->vdev_notrim = B_TRUE;
			break;
		}
		break;
	case ENXIO:
		if (!vd->vdev_remove_wanted) {
			/*
			 * If the provider's error is set we assume it is
			 * being removed.
			 */
			if (bp->bio_to->error != 0) {
				vd->vdev_remove_wanted = B_TRUE;
				spa_async_request(zio->io_spa,
				    SPA_ASYNC_REMOVE);
			} else if (!vd->vdev_delayed_close) {
				vd->vdev_delayed_close = B_TRUE;
			}
		}
		break;
	}
	g_destroy_bio(bp);
	zio_delay_interrupt(zio);
}

static void
vdev_geom_io_start(zio_t *zio)
{
	vdev_t *vd;
	struct g_consumer *cp;
	struct bio *bp;

	vd = zio->io_vd;

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:
		/* XXPOLICY */
		if (!vdev_readable(vd)) {
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		} else {
			switch (zio->io_cmd) {
			case DKIOCFLUSHWRITECACHE:
				if (zfs_nocacheflush ||
				    vdev_geom_bio_flush_disable)
					break;
				if (vd->vdev_nowritecache) {
					zio->io_error = SET_ERROR(ENOTSUP);
					break;
				}
				goto sendreq;
			default:
				zio->io_error = SET_ERROR(ENOTSUP);
			}
		}

		zio_execute(zio);
		return;
	case ZIO_TYPE_FREE:
		if (vd->vdev_notrim) {
			zio->io_error = SET_ERROR(ENOTSUP);
		} else if (!vdev_geom_bio_delete_disable) {
			goto sendreq;
		}
		zio_execute(zio);
		return;
	}
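	/* ZIO_TYPE_READ and ZIO_TYPE_WRITE requests fall through to here. */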
sendreq:
	ASSERT(zio->io_type == ZIO_TYPE_READ ||
	    zio->io_type == ZIO_TYPE_WRITE ||
	    zio->io_type == ZIO_TYPE_FREE ||
	    zio->io_type == ZIO_TYPE_IOCTL);

	cp = vd->vdev_tsd;
	if (cp == NULL) {
		zio->io_error = SET_ERROR(ENXIO);
		zio_interrupt(zio);
		return;
	}
	bp = g_alloc_bio();
	bp->bio_caller1 = zio;
	switch (zio->io_type) {
	case ZIO_TYPE_READ:
	case ZIO_TYPE_WRITE:
		zio->io_target_timestamp = zio_handle_io_delay(zio);
		bp->bio_cmd = zio->io_type == ZIO_TYPE_READ ?
		    BIO_READ : BIO_WRITE;
		bp->bio_data = zio->io_data;
		bp->bio_offset = zio->io_offset;
		bp->bio_length = zio->io_size;
		break;
	case ZIO_TYPE_FREE:
		bp->bio_cmd = BIO_DELETE;
		bp->bio_data = NULL;
		bp->bio_offset = zio->io_offset;
		bp->bio_length = zio->io_size;
		break;
	case ZIO_TYPE_IOCTL:
		bp->bio_cmd = BIO_FLUSH;
		bp->bio_flags |= BIO_ORDERED;
		bp->bio_data = NULL;
		bp->bio_offset = cp->provider->mediasize;
		bp->bio_length = 0;
		break;
	}
	bp->bio_done = vdev_geom_io_intr;

	g_io_request(bp, cp);
}

static void
vdev_geom_io_done(zio_t *zio)
{
}

static void
vdev_geom_hold(vdev_t *vd)
{
}

static void
vdev_geom_rele(vdev_t *vd)
{
}

vdev_ops_t vdev_geom_ops = {
	vdev_geom_open,
	vdev_geom_close,
	vdev_default_asize,
	vdev_geom_io_start,
	vdev_geom_io_done,
	NULL,
	vdev_geom_hold,
	vdev_geom_rele,
	VDEV_TYPE_DISK,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};