vdev_geom.c revision 338905
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Portions Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>
 */

#include <sys/zfs_context.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/disk.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <geom/geom.h>
#include <geom/geom_int.h>

/*
 * Virtual device vector for GEOM.
 */

static g_attrchanged_t vdev_geom_attrchanged;
struct g_class zfs_vdev_class = {
	.name = "ZFS::VDEV",
	.version = G_VERSION,
	.attrchanged = vdev_geom_attrchanged,
};

struct consumer_vdev_elem {
	SLIST_ENTRY(consumer_vdev_elem)	elems;
	vdev_t				*vd;
};

SLIST_HEAD(consumer_priv_t, consumer_vdev_elem);
_Static_assert(sizeof(((struct g_consumer*)NULL)->private)
    == sizeof(struct consumer_priv_t*),
    "consumer_priv_t* can't be stored in g_consumer.private");

DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);

SYSCTL_DECL(_vfs_zfs_vdev);
/* Don't send BIO_FLUSH. */
static int vdev_geom_bio_flush_disable = 0;
TUNABLE_INT("vfs.zfs.vdev.bio_flush_disable", &vdev_geom_bio_flush_disable);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RW,
    &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");
/* Don't send BIO_DELETE. */
static int vdev_geom_bio_delete_disable = 0;
TUNABLE_INT("vfs.zfs.vdev.bio_delete_disable", &vdev_geom_bio_delete_disable);
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RW,
    &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");

/* Declare local functions */
static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read);

/*
 * Thread-local storage used to indicate when a thread is probing geoms
 * for their guids.  If NULL, this thread is not tasting geoms.  If
 * non-NULL, it is looking for a replacement for the vdev_t* that is its
 * value.
 */
uint_t zfs_geom_probe_vdev_key;

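/*
 * Query the provider's rotation rate via the GEOM::rotation_rate
 * attribute and cache it in the vdev; mark the rate unknown if the
 * attribute is unavailable.
 */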
static void
vdev_geom_set_rotation_rate(vdev_t *vd, struct g_consumer *cp)
{
	int error;
	uint16_t rate;

	error = g_getattr("GEOM::rotation_rate", cp, &rate);
	if (error == 0)
		vd->vdev_rotation_rate = rate;
	else
		vd->vdev_rotation_rate = VDEV_RATE_UNKNOWN;
}

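/*
 * Refresh the vdev's cached copy of the provider's GEOM::physpath
 * attribute and, when it has changed, ask the SPA to update the pool
 * configuration.
 */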
static void
vdev_geom_set_physpath(vdev_t *vd, struct g_consumer *cp,
		       boolean_t do_null_update)
{
	boolean_t needs_update = B_FALSE;
	char *physpath;
	int error, physpath_len;

	physpath_len = MAXPATHLEN;
	physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO);
	error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath);
	if (error == 0) {
		char *old_physpath;

		/* g_topology lock ensures that vdev has not been closed */
		g_topology_assert();
		old_physpath = vd->vdev_physpath;
		vd->vdev_physpath = spa_strdup(physpath);

		if (old_physpath != NULL) {
			needs_update = (strcmp(old_physpath,
						vd->vdev_physpath) != 0);
			spa_strfree(old_physpath);
		} else
			needs_update = do_null_update;
	}
	g_free(physpath);

	/*
	 * If the physical path changed, update the config.
	 * Only request an update for previously unset physpaths if
	 * requested by the caller.
	 */
	if (needs_update)
		spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE);
}

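/*
 * GEOM attribute-change callback: on a rotation rate or physical path
 * change, refresh the corresponding cached value through the helpers
 * above.
 */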
static void
vdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
{
	struct consumer_priv_t *priv;
	struct consumer_vdev_elem *elem;

	priv = (struct consumer_priv_t*)&cp->private;
	if (SLIST_EMPTY(priv))
		return;

	SLIST_FOREACH(elem, priv, elems) {
		vdev_t *vd = elem->vd;
		if (strcmp(attr, "GEOM::rotation_rate") == 0) {
			vdev_geom_set_rotation_rate(vd, cp);
			return;
		}
		if (strcmp(attr, "GEOM::physpath") == 0) {
			vdev_geom_set_physpath(vd, cp, /*null_update*/B_TRUE);
			return;
		}
	}
}

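/*
 * Called on the GEOM event thread when the underlying provider goes
 * away; see the comment in the body for why the vdev is closed
 * asynchronously rather than here.
 */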
static void
vdev_geom_orphan(struct g_consumer *cp)
{
	struct consumer_priv_t *priv;
	struct consumer_vdev_elem *elem;

	g_topology_assert();

	priv = (struct consumer_priv_t*)&cp->private;
	if (SLIST_EMPTY(priv))
		/* Vdev close in progress.  Ignore the event. */
		return;

	/*
	 * Orphan callbacks occur from the GEOM event thread.
	 * Concurrent with this call, new I/O requests may be
	 * working their way through GEOM about to find out
	 * (only once executed by the g_down thread) that we've
	 * been orphaned from our disk provider.  These I/Os
	 * must be retired before we can detach our consumer.
	 * This is most easily achieved by acquiring the
	 * SPA ZIO configuration lock as a writer, but doing
	 * so with the GEOM topology lock held would cause
	 * a lock order reversal.  Instead, rely on the SPA's
	 * async removal support to invoke a close on this
	 * vdev once it is safe to do so.
	 */
	SLIST_FOREACH(elem, priv, elems) {
		vdev_t *vd = elem->vd;

		vd->vdev_remove_wanted = B_TRUE;
		spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
	}
}

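/*
 * Attach a consumer to the given provider, creating the shared
 * "zfs::vdev" geom on first use.  The consumer is opened for reading
 * with the exclusive bit held; write access is only requested later,
 * by vdev_geom_open(), for pools opened read-write.
 */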
static struct g_consumer *
vdev_geom_attach(struct g_provider *pp, vdev_t *vd, boolean_t sanity)
{
	struct g_geom *gp;
	struct g_consumer *cp;
	int error;

	g_topology_assert();

	ZFS_LOG(1, "Attaching to %s.", pp->name);

	if (sanity) {
		if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) {
			ZFS_LOG(1, "Failing attach of %s. "
				   "Incompatible sectorsize %d\n",
			    pp->name, pp->sectorsize);
			return (NULL);
		} else if (pp->mediasize < SPA_MINDEVSIZE) {
			ZFS_LOG(1, "Failing attach of %s. "
				   "Incompatible mediasize %ju\n",
			    pp->name, pp->mediasize);
			return (NULL);
		}
	}

	/* Do we already have a geom?  If not, create one. */
	LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
		if (gp->flags & G_GEOM_WITHER)
			continue;
		if (strcmp(gp->name, "zfs::vdev") != 0)
			continue;
		break;
	}
	if (gp == NULL) {
		gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
		gp->orphan = vdev_geom_orphan;
		gp->attrchanged = vdev_geom_attrchanged;
		cp = g_new_consumer(gp);
		error = g_attach(cp, pp);
		if (error != 0) {
			ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__,
			    __LINE__, error);
			vdev_geom_detach(cp, B_FALSE);
			return (NULL);
		}
		error = g_access(cp, 1, 0, 1);
		if (error != 0) {
			ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__,
			    __LINE__, error);
			vdev_geom_detach(cp, B_FALSE);
			return (NULL);
		}
		ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
	} else {
		/* Check if we are already connected to this provider. */
		LIST_FOREACH(cp, &gp->consumer, consumer) {
			if (cp->provider == pp) {
				ZFS_LOG(1, "Found consumer for %s.", pp->name);
				break;
			}
		}
		if (cp == NULL) {
			cp = g_new_consumer(gp);
			error = g_attach(cp, pp);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_attach failed: %d\n",
				    __func__, __LINE__, error);
				vdev_geom_detach(cp, B_FALSE);
				return (NULL);
			}
			error = g_access(cp, 1, 0, 1);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
				    __func__, __LINE__, error);
				vdev_geom_detach(cp, B_FALSE);
				return (NULL);
			}
			ZFS_LOG(1, "Created consumer for %s.", pp->name);
		} else {
			error = g_access(cp, 1, 0, 1);
			if (error != 0) {
				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
				    __func__, __LINE__, error);
				return (NULL);
			}
			ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
		}
	}

	if (vd != NULL)
		vd->vdev_tsd = cp;

	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
	return (cp);
}

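/*
 * Drop this reference to the consumer; once the last read/exclusive
 * reference is gone, detach and destroy the consumer, and wither the
 * geom if it has no consumers left.
 */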
static void
vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read)
{
	struct g_geom *gp;

	g_topology_assert();

	ZFS_LOG(1, "Detaching from %s.",
	    cp->provider && cp->provider->name ? cp->provider->name : "NULL");

	gp = cp->geom;
	if (open_for_read)
		g_access(cp, -1, 0, -1);
	/* Destroy consumer on last close. */
	if (cp->acr == 0 && cp->ace == 0) {
		if (cp->acw > 0)
			g_access(cp, 0, -cp->acw, 0);
		if (cp->provider != NULL) {
			ZFS_LOG(1, "Destroying consumer for %s.",
			    cp->provider->name ? cp->provider->name : "NULL");
			g_detach(cp);
		}
		g_destroy_consumer(cp);
	}
	/* Destroy geom if there are no consumers left. */
	if (LIST_EMPTY(&gp->consumer)) {
		ZFS_LOG(1, "Destroyed geom %s.", gp->name);
		g_wither_geom(gp, ENXIO);
	}
}

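/*
 * Close the vdev's consumer with the GEOM topology lock held: unlink
 * the vdev from the consumer's private list, then drop our reference.
 */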
static void
vdev_geom_close_locked(vdev_t *vd)
{
	struct g_consumer *cp;
	struct consumer_priv_t *priv;
	struct consumer_vdev_elem *elem, *elem_temp;

	g_topology_assert();

	cp = vd->vdev_tsd;
	vd->vdev_delayed_close = B_FALSE;
	if (cp == NULL)
		return;

	ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
	KASSERT(cp->private != NULL, ("%s: cp->private is NULL", __func__));
	priv = (struct consumer_priv_t*)&cp->private;
	vd->vdev_tsd = NULL;
	SLIST_FOREACH_SAFE(elem, priv, elems, elem_temp) {
		if (elem->vd == vd) {
			SLIST_REMOVE(priv, elem, consumer_vdev_elem, elems);
			g_free(elem);
		}
	}

	vdev_geom_detach(cp, B_TRUE);
}

/*
 * Issue one or more bios to the vdev in parallel.
 * cmds, datas, offsets, errors, and sizes are arrays of length ncmds.
 * Each IO operation is described by parallel entries from each array.
 * There may be more bios actually issued than entries in the array.
 */
static void
vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets,
    off_t *sizes, int *errors, int ncmds)
{
	struct bio **bios;
	u_char *p;
	off_t off, maxio, s, end;
	int i, n_bios, j;
	size_t bios_size;

	maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize);
	n_bios = 0;

	/* How many bios are required for all commands? */
	for (i = 0; i < ncmds; i++)
		n_bios += (sizes[i] + maxio - 1) / maxio;

	/* Allocate memory for the bios */
	bios_size = n_bios * sizeof(struct bio*);
	bios = kmem_zalloc(bios_size, KM_SLEEP);

	/* Prepare and issue all of the bios */
	for (i = j = 0; i < ncmds; i++) {
		off = offsets[i];
		p = datas[i];
		s = sizes[i];
		end = off + s;
		ASSERT((off % cp->provider->sectorsize) == 0);
		ASSERT((s % cp->provider->sectorsize) == 0);

		for (; off < end; off += maxio, p += maxio, s -= maxio, j++) {
			bios[j] = g_alloc_bio();
			bios[j]->bio_cmd = cmds[i];
			bios[j]->bio_done = NULL;
			bios[j]->bio_offset = off;
			bios[j]->bio_length = MIN(s, maxio);
			bios[j]->bio_data = p;
			g_io_request(bios[j], cp);
		}
	}
	ASSERT(j == n_bios);

	/* Wait for all of the bios to complete, and clean them up */
	for (i = j = 0; i < ncmds; i++) {
		off = offsets[i];
		s = sizes[i];
		end = off + s;

		for (; off < end; off += maxio, s -= maxio, j++) {
			errors[i] = biowait(bios[j], "vdev_geom_io") ||
			    errors[i];
			g_destroy_bio(bios[j]);
		}
	}
	kmem_free(bios, bios_size);
}

/*
 * Read the vdev config from a device.  Return the number of valid labels
 * that were found.  The vdev config will be returned in *configp if and
 * only if at least one valid label was found.
 */
static int
vdev_geom_read_config(struct g_consumer *cp, nvlist_t **configp)
{
	struct g_provider *pp;
	nvlist_t *config;
	vdev_phys_t *vdev_lists[VDEV_LABELS];
	char *buf;
	size_t buflen;
	uint64_t psize, state, txg;
	off_t offsets[VDEV_LABELS];
	off_t size;
	off_t sizes[VDEV_LABELS];
	int cmds[VDEV_LABELS];
	int errors[VDEV_LABELS];
	int l, nlabels;

	g_topology_assert_not();

	pp = cp->provider;
	ZFS_LOG(1, "Reading config from %s...", pp->name);

	psize = pp->mediasize;
	psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t));

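	/*
	 * Each label's packed config nvlist (vdev_phys_t) sits
	 * VDEV_SKIP_SIZE bytes into the label; read just that part,
	 * rounding the read size up to a whole number of sectors.
	 */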
	size = sizeof(*vdev_lists[0]) + pp->sectorsize -
	    ((sizeof(*vdev_lists[0]) - 1) % pp->sectorsize) - 1;

	buflen = sizeof(vdev_lists[0]->vp_nvlist);

	/* Create all of the IO requests */
	for (l = 0; l < VDEV_LABELS; l++) {
		cmds[l] = BIO_READ;
		vdev_lists[l] = kmem_alloc(size, KM_SLEEP);
		offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE;
		sizes[l] = size;
		errors[l] = 0;
		ASSERT(offsets[l] % pp->sectorsize == 0);
	}

	/* Issue the IO requests */
	vdev_geom_io(cp, cmds, (void**)vdev_lists, offsets, sizes, errors,
	    VDEV_LABELS);

	/* Parse the labels */
	config = *configp = NULL;
	nlabels = 0;
	for (l = 0; l < VDEV_LABELS; l++) {
		if (errors[l] != 0)
			continue;

		buf = vdev_lists[l]->vp_nvlist;

		if (nvlist_unpack(buf, buflen, &config, 0) != 0)
			continue;

		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0 || state > POOL_STATE_L2CACHE) {
			nvlist_free(config);
			continue;
		}

		if (state != POOL_STATE_SPARE &&
		    state != POOL_STATE_L2CACHE &&
		    (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
		    &txg) != 0 || txg == 0)) {
			nvlist_free(config);
			continue;
		}

		if (*configp != NULL)
			nvlist_free(*configp);
		*configp = config;

		nlabels++;
	}

	/* Free the label storage */
	for (l = 0; l < VDEV_LABELS; l++)
		kmem_free(vdev_lists[l], size);

	return (nlabels);
}

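/*
 * Grow the configs array, if necessary, so that index 'id' is valid,
 * preserving any entries already collected.
 */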
static void
resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id)
{
	nvlist_t **new_configs;
	uint64_t i;

	if (id < *count)
		return;
	new_configs = kmem_zalloc((id + 1) * sizeof(nvlist_t *),
	    KM_SLEEP);
	for (i = 0; i < *count; i++)
		new_configs[i] = (*configs)[i];
	if (*configs != NULL)
		kmem_free(*configs, *count * sizeof(void *));
	*configs = new_configs;
	*count = id + 1;
}

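/*
 * File a label's config under its top-level vdev id for the named
 * pool, keeping only the copy from the newest transaction group.
 */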
static void
process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg,
    const char *name, uint64_t *known_pool_guid)
{
	nvlist_t *vdev_tree;
	uint64_t pool_guid;
	uint64_t vdev_guid;
	uint64_t id, txg, known_txg;
	char *pname;

	if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 ||
	    strcmp(pname, name) != 0)
		goto ignore;

	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
		goto ignore;

	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0)
		goto ignore;

	if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0)
		goto ignore;

	if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0)
		goto ignore;

	VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);

	if (*known_pool_guid != 0) {
		if (pool_guid != *known_pool_guid)
			goto ignore;
	} else
		*known_pool_guid = pool_guid;

	resize_configs(configs, count, id);

	if ((*configs)[id] != NULL) {
		VERIFY(nvlist_lookup_uint64((*configs)[id],
		    ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0);
		if (txg <= known_txg)
			goto ignore;
		nvlist_free((*configs)[id]);
	}

	(*configs)[id] = cfg;
	return;

ignore:
	nvlist_free(cfg);
}

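/*
 * Taste all GEOM providers in the system for vdev labels belonging to
 * the named pool, collecting the newest config for each top-level
 * vdev.  Returns 0 if at least one config was found, ENOENT otherwise.
 */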
int
vdev_geom_read_pool_label(const char *name,
    nvlist_t ***configs, uint64_t *count)
{
	struct g_class *mp;
	struct g_geom *gp;
	struct g_provider *pp;
	struct g_consumer *zcp;
	nvlist_t *vdev_cfg;
	uint64_t pool_guid;
	int nlabels;

	DROP_GIANT();
	g_topology_lock();

	*configs = NULL;
	*count = 0;
	pool_guid = 0;
	LIST_FOREACH(mp, &g_classes, class) {
		if (mp == &zfs_vdev_class)
			continue;
		LIST_FOREACH(gp, &mp->geom, geom) {
			if (gp->flags & G_GEOM_WITHER)
				continue;
			LIST_FOREACH(pp, &gp->provider, provider) {
				if (pp->flags & G_PF_WITHER)
					continue;
				zcp = vdev_geom_attach(pp, NULL, B_TRUE);
				if (zcp == NULL)
					continue;
				g_topology_unlock();
				nlabels = vdev_geom_read_config(zcp, &vdev_cfg);
				g_topology_lock();
				vdev_geom_detach(zcp, B_TRUE);
				if (nlabels == 0)
					continue;
				ZFS_LOG(1, "successfully read vdev config");

				process_vdev_config(configs, count,
				    vdev_cfg, name, &pool_guid);
			}
		}
	}
	g_topology_unlock();
	PICKUP_GIANT();

	return (*count > 0 ? 0 : ENOENT);
}

enum match {
	NO_MATCH = 0,		/* No matching labels found */
	TOPGUID_MATCH = 1,	/* Labels match top guid, not vdev guid */
	ZERO_MATCH = 1,		/* Should never be returned */
	ONE_MATCH = 2,		/* 1 label matches the vdev_guid */
	TWO_MATCH = 3,		/* 2 labels match the vdev_guid */
	THREE_MATCH = 4,	/* 3 labels match the vdev_guid */
	FULL_MATCH = 5		/* all labels match the vdev_guid */
};

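/*
 * Taste the given provider and report how well its labels match the
 * vdev we are trying to open.  The tasting consumer is detached before
 * returning.
 */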
static enum match
vdev_attach_ok(vdev_t *vd, struct g_provider *pp)
{
	nvlist_t *config;
	uint64_t pool_guid, top_guid, vdev_guid;
	struct g_consumer *cp;
	int nlabels;

	cp = vdev_geom_attach(pp, NULL, B_TRUE);
	if (cp == NULL) {
		ZFS_LOG(1, "Unable to attach tasting instance to %s.",
		    pp->name);
		return (NO_MATCH);
	}
	g_topology_unlock();
	nlabels = vdev_geom_read_config(cp, &config);
	g_topology_lock();
	vdev_geom_detach(cp, B_TRUE);
	if (nlabels == 0) {
		ZFS_LOG(1, "Unable to read config from %s.", pp->name);
		return (NO_MATCH);
	}

	pool_guid = 0;
	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid);
	top_guid = 0;
	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid);
	vdev_guid = 0;
	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
	nvlist_free(config);

	/*
	 * Check that the label's pool guid matches the desired guid.
	 * Inactive spares and L2ARCs do not have any pool guid in the label.
	 */
	if (pool_guid != 0 && pool_guid != spa_guid(vd->vdev_spa)) {
		ZFS_LOG(1, "pool guid mismatch for provider %s: %ju != %ju.",
		    pp->name,
		    (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)pool_guid);
		return (NO_MATCH);
	}

	/*
	 * Check that the label's vdev guid matches the desired guid.
	 * The second condition handles a possible race on vdev detach, in
	 * which the remaining vdev receives the GUID of the destroyed
	 * top-level mirror vdev.
	 */
	if (vdev_guid == vd->vdev_guid) {
		ZFS_LOG(1, "guids match for provider %s.", pp->name);
		return (ZERO_MATCH + nlabels);
	} else if (top_guid == vd->vdev_guid && vd == vd->vdev_top) {
		ZFS_LOG(1, "top vdev guid match for provider %s.", pp->name);
		return (TOPGUID_MATCH);
	}
	ZFS_LOG(1, "vdev guid mismatch for provider %s: %ju != %ju.",
	    pp->name, (uintmax_t)vd->vdev_guid, (uintmax_t)vdev_guid);
	return (NO_MATCH);
}

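/*
 * Walk every provider in the system and attach to the one whose labels
 * best match this vdev, stopping early on a full match.
 */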
static struct g_consumer *
vdev_geom_attach_by_guids(vdev_t *vd)
{
	struct g_class *mp;
	struct g_geom *gp;
	struct g_provider *pp, *best_pp;
	struct g_consumer *cp;
	enum match match, best_match;

	g_topology_assert();

	cp = NULL;
	best_pp = NULL;
	best_match = NO_MATCH;
	LIST_FOREACH(mp, &g_classes, class) {
		if (mp == &zfs_vdev_class)
			continue;
		LIST_FOREACH(gp, &mp->geom, geom) {
			if (gp->flags & G_GEOM_WITHER)
				continue;
			LIST_FOREACH(pp, &gp->provider, provider) {
				match = vdev_attach_ok(vd, pp);
				if (match > best_match) {
					best_match = match;
					best_pp = pp;
				}
				if (match == FULL_MATCH)
					goto out;
			}
		}
	}

out:
	if (best_pp) {
		cp = vdev_geom_attach(best_pp, vd, B_TRUE);
		if (cp == NULL) {
			printf("ZFS WARNING: Unable to attach to %s.\n",
			    best_pp->name);
		}
	}
	return (cp);
}

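/*
 * Open the vdev by searching all providers for its guids, updating
 * vdev_path to point at the provider actually found.
 */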
static struct g_consumer *
vdev_geom_open_by_guids(vdev_t *vd)
{
	struct g_consumer *cp;
	char *buf;
	size_t len;

	g_topology_assert();

	ZFS_LOG(1, "Searching by guids [%ju:%ju].",
		(uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid);
	cp = vdev_geom_attach_by_guids(vd);
	if (cp != NULL) {
		len = strlen(cp->provider->name) + strlen("/dev/") + 1;
		buf = kmem_alloc(len, KM_SLEEP);

		snprintf(buf, len, "/dev/%s", cp->provider->name);
		spa_strfree(vd->vdev_path);
		vd->vdev_path = buf;

		ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.",
		    (uintmax_t)spa_guid(vd->vdev_spa),
		    (uintmax_t)vd->vdev_guid, cp->provider->name);
	} else {
		ZFS_LOG(1, "Search by guid [%ju:%ju] failed.",
		    (uintmax_t)spa_guid(vd->vdev_spa),
		    (uintmax_t)vd->vdev_guid);
	}

	return (cp);
}

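/*
 * Open the vdev by its recorded /dev path, optionally requiring that
 * the provider's labels fully match the vdev's guids.
 */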
static struct g_consumer *
vdev_geom_open_by_path(vdev_t *vd, int check_guid)
{
	struct g_provider *pp;
	struct g_consumer *cp;

	g_topology_assert();

	cp = NULL;
	pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1);
	if (pp != NULL) {
		ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
		if (!check_guid || vdev_attach_ok(vd, pp) == FULL_MATCH)
			cp = vdev_geom_attach(pp, vd, B_FALSE);
	}

	return (cp);
}

static int
vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	struct g_provider *pp;
	struct g_consumer *cp;
	int error;

	/*
	 * Set the TLS to indicate, down the stack, that we should not
	 * access zvols.
	 */
	VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0);

	/*
	 * We must have a pathname, and it must be absolute.
	 */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (EINVAL);
	}

	/*
	 * Reopen the device if it's not currently open. Otherwise,
	 * just update the physical size of the device.
	 */
	if ((cp = vd->vdev_tsd) != NULL) {
		ASSERT(vd->vdev_reopening);
		goto skip_open;
	}

	DROP_GIANT();
	g_topology_lock();
	error = 0;

	if (vd->vdev_spa->spa_splitting_newspa ||
	    (vd->vdev_prevstate == VDEV_STATE_UNKNOWN &&
	     (vd->vdev_spa->spa_load_state == SPA_LOAD_NONE ||
	      vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE))) {
		/*
		 * We are dealing with a vdev that hasn't been previously
		 * opened (since boot), and we are not loading an
		 * existing pool configuration.  This looks like a
		 * vdev add operation to a new or existing pool.
		 * Assume the user knows what he/she is doing and find
		 * GEOM provider by its name, ignoring GUID mismatches.
		 *
		 * XXPOLICY: It would be safer to only allow a device
		 *           that is unlabeled or labeled but missing
		 *           GUID information to be opened in this fashion,
		 *           unless we are doing a split, in which case we
		 *           should allow any guid.
		 */
		cp = vdev_geom_open_by_path(vd, 0);
	} else {
		/*
		 * Try using the recorded path for this device, but only
		 * accept it if its label data contains the expected GUIDs.
		 */
		cp = vdev_geom_open_by_path(vd, 1);
		if (cp == NULL) {
			/*
			 * The device at vd->vdev_path doesn't have the
			 * expected GUIDs. The disks might have merely
			 * moved around so try all other GEOM providers
			 * to find one with the right GUIDs.
			 */
			cp = vdev_geom_open_by_guids(vd);
		}
	}

	/* Clear the TLS now that tasting is done */
	VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0);

	if (cp == NULL) {
		ZFS_LOG(1, "Vdev %s not found.", vd->vdev_path);
		error = ENOENT;
	} else {
		struct consumer_priv_t *priv;
		struct consumer_vdev_elem *elem;
		int spamode;

		priv = (struct consumer_priv_t*)&cp->private;
		if (cp->private == NULL)
			SLIST_INIT(priv);
		elem = g_malloc(sizeof(*elem), M_WAITOK|M_ZERO);
		elem->vd = vd;
		SLIST_INSERT_HEAD(priv, elem, elems);

		spamode = spa_mode(vd->vdev_spa);
		if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
		    !ISP2(cp->provider->sectorsize)) {
			ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
			    cp->provider->name);

			vdev_geom_close_locked(vd);
			error = EINVAL;
			cp = NULL;
		} else if (cp->acw == 0 && (spamode & FWRITE) != 0) {
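			/*
			 * A request for write access can fail transiently,
			 * presumably while some other consumer still holds
			 * the provider open; retry a few times before
			 * giving up.
			 */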
			int i;

			for (i = 0; i < 5; i++) {
				error = g_access(cp, 0, 1, 0);
				if (error == 0)
					break;
				g_topology_unlock();
				tsleep(vd, 0, "vdev", hz / 2);
				g_topology_lock();
			}
			if (error != 0) {
				printf("ZFS WARNING: Unable to open %s for "
				    "writing (error=%d).\n",
				    cp->provider->name, error);
				vdev_geom_close_locked(vd);
				cp = NULL;
			}
		}
	}

	/* Fetch initial physical path information for this device. */
	if (cp != NULL) {
		vdev_geom_attrchanged(cp, "GEOM::physpath");

		/* Set other GEOM characteristics */
		vdev_geom_set_physpath(vd, cp, /*do_null_update*/B_FALSE);
		vdev_geom_set_rotation_rate(vd, cp);
	}

	g_topology_unlock();
	PICKUP_GIANT();
	if (cp == NULL) {
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
		return (error);
	}
skip_open:
	pp = cp->provider;

	/*
	 * Determine the actual size of the device.
	 */
	*max_psize = *psize = pp->mediasize;

	/*
	 * Determine the device's minimum transfer size and preferred
	 * transfer size.
	 */
	*logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
	*physical_ashift = 0;
	if (pp->stripesize > (1 << *logical_ashift) && ISP2(pp->stripesize) &&
	    pp->stripesize <= (1 << SPA_MAXASHIFT) && pp->stripeoffset == 0)
		*physical_ashift = highbit(pp->stripesize) - 1;

	/*
	 * Clear the nowritecache settings, so that on a vdev_reopen()
	 * we will try again.
	 */
	vd->vdev_nowritecache = B_FALSE;

	return (0);
}

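/*
 * Close the vdev, tearing the consumer down unless this is part of a
 * reopen of a still-healthy device.
 */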
static void
vdev_geom_close(vdev_t *vd)
{
	struct g_consumer *cp;

	cp = vd->vdev_tsd;

	DROP_GIANT();
	g_topology_lock();

	if (!vd->vdev_reopening ||
	    (cp != NULL && ((cp->flags & G_CF_ORPHAN) != 0 ||
	    (cp->provider != NULL && cp->provider->error != 0))))
		vdev_geom_close_locked(vd);

	g_topology_unlock();
	PICKUP_GIANT();
}

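/*
 * bio completion callback: translate the GEOM error into the ZIO, and
 * latch "no write cache"/"no TRIM" when the provider reports ENOTSUP.
 */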
static void
vdev_geom_io_intr(struct bio *bp)
{
	vdev_t *vd;
	zio_t *zio;

	zio = bp->bio_caller1;
	vd = zio->io_vd;
	zio->io_error = bp->bio_error;
	if (zio->io_error == 0 && bp->bio_resid != 0)
		zio->io_error = SET_ERROR(EIO);

	switch (zio->io_error) {
	case ENOTSUP:
		/*
		 * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know
		 * that future attempts will never succeed. In this case
		 * we set a persistent flag so that we don't bother with
		 * requests in the future.
		 */
		switch (bp->bio_cmd) {
		case BIO_FLUSH:
			vd->vdev_nowritecache = B_TRUE;
			break;
		case BIO_DELETE:
			vd->vdev_notrim = B_TRUE;
			break;
		}
		break;
	case ENXIO:
		if (!vd->vdev_remove_wanted) {
			/*
			 * If the provider's error is set we assume it is
			 * being removed.
			 */
			if (bp->bio_to->error != 0) {
				vd->vdev_remove_wanted = B_TRUE;
				spa_async_request(zio->io_spa,
				    SPA_ASYNC_REMOVE);
			} else if (!vd->vdev_delayed_close) {
				vd->vdev_delayed_close = B_TRUE;
			}
		}
		break;
	}
	g_destroy_bio(bp);
	zio_delay_interrupt(zio);
}

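/*
 * Start a ZIO, translating it into a bio for GEOM.  Cache flushes and
 * TRIMs may be short-circuited by the tunables and per-vdev flags
 * checked here.
 */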
static void
vdev_geom_io_start(zio_t *zio)
{
	vdev_t *vd;
	struct g_consumer *cp;
	struct bio *bp;

	vd = zio->io_vd;

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:
		/* XXPOLICY */
		if (!vdev_readable(vd)) {
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		} else {
			switch (zio->io_cmd) {
			case DKIOCFLUSHWRITECACHE:
				if (zfs_nocacheflush ||
				    vdev_geom_bio_flush_disable)
					break;
				if (vd->vdev_nowritecache) {
					zio->io_error = SET_ERROR(ENOTSUP);
					break;
				}
				goto sendreq;
			default:
				zio->io_error = SET_ERROR(ENOTSUP);
			}
		}

		zio_execute(zio);
		return;
	case ZIO_TYPE_FREE:
		if (vd->vdev_notrim) {
			zio->io_error = SET_ERROR(ENOTSUP);
		} else if (!vdev_geom_bio_delete_disable) {
			goto sendreq;
		}
		zio_execute(zio);
		return;
	}
sendreq:
	ASSERT(zio->io_type == ZIO_TYPE_READ ||
	    zio->io_type == ZIO_TYPE_WRITE ||
	    zio->io_type == ZIO_TYPE_FREE ||
	    zio->io_type == ZIO_TYPE_IOCTL);

	cp = vd->vdev_tsd;
	if (cp == NULL) {
		zio->io_error = SET_ERROR(ENXIO);
		zio_interrupt(zio);
		return;
	}
	bp = g_alloc_bio();
	bp->bio_caller1 = zio;
	switch (zio->io_type) {
	case ZIO_TYPE_READ:
	case ZIO_TYPE_WRITE:
		zio->io_target_timestamp = zio_handle_io_delay(zio);
		bp->bio_cmd = zio->io_type == ZIO_TYPE_READ ?
		    BIO_READ : BIO_WRITE;
		bp->bio_data = zio->io_data;
		bp->bio_offset = zio->io_offset;
		bp->bio_length = zio->io_size;
		break;
	case ZIO_TYPE_FREE:
		bp->bio_cmd = BIO_DELETE;
		bp->bio_data = NULL;
		bp->bio_offset = zio->io_offset;
		bp->bio_length = zio->io_size;
		break;
	case ZIO_TYPE_IOCTL:
		bp->bio_cmd = BIO_FLUSH;
		bp->bio_flags |= BIO_ORDERED;
		bp->bio_data = NULL;
		bp->bio_offset = cp->provider->mediasize;
		bp->bio_length = 0;
		break;
	}
	bp->bio_done = vdev_geom_io_intr;

	g_io_request(bp, cp);
}

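/*
 * No additional work is needed when a ZIO completes; errors were
 * already recorded by vdev_geom_io_intr().
 */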
static void
vdev_geom_io_done(zio_t *zio)
{
}

static void
vdev_geom_hold(vdev_t *vd)
{
}

static void
vdev_geom_rele(vdev_t *vd)
{
}

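/*
 * The vdev operations vector for GEOM-backed disks.  The NULL entry
 * is, by position, the state-change callback, which this vdev type
 * does not implement.
 */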
vdev_ops_t vdev_geom_ops = {
	vdev_geom_open,
	vdev_geom_close,
	vdev_default_asize,
	vdev_geom_io_start,
	vdev_geom_io_done,
	NULL,
	vdev_geom_hold,
	vdev_geom_rele,
	VDEV_TYPE_DISK,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};