vdev_geom.c revision 308058
1296781Sdes/*
2204861Sdes * CDDL HEADER START
3204861Sdes *
4204861Sdes * The contents of this file are subject to the terms of the
5204861Sdes * Common Development and Distribution License (the "License").
6204861Sdes * You may not use this file except in compliance with the License.
7204861Sdes *
8204861Sdes * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9204861Sdes * or http://www.opensolaris.org/os/licensing.
10204861Sdes * See the License for the specific language governing permissions
11204861Sdes * and limitations under the License.
12204861Sdes *
13204861Sdes * When distributing Covered Code, include this CDDL HEADER in each
14204861Sdes * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15204861Sdes * If applicable, add the following below this CDDL HEADER, with the
16204861Sdes * fields enclosed by brackets "[]" replaced with your own identifying
17204861Sdes * information: Portions Copyright [yyyy] [name of copyright owner]
18204861Sdes *
19204861Sdes * CDDL HEADER END
20204861Sdes */
21204861Sdes/*
22204861Sdes * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
23204861Sdes * All rights reserved.
24204861Sdes *
25204861Sdes * Portions Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>
26204861Sdes */
27204861Sdes
28204861Sdes#include <sys/zfs_context.h>
29204861Sdes#include <sys/param.h>
30204861Sdes#include <sys/kernel.h>
31204861Sdes#include <sys/bio.h>
32204861Sdes#include <sys/disk.h>
33204861Sdes#include <sys/spa.h>
34262566Sdes#include <sys/spa_impl.h>
35262566Sdes#include <sys/vdev_impl.h>
36204861Sdes#include <sys/fs/zfs.h>
37204861Sdes#include <sys/zio.h>
38204861Sdes#include <geom/geom.h>
39204861Sdes#include <geom/geom_int.h>
40204861Sdes
41295367Sdes/*
42204861Sdes * Virtual device vector for GEOM.
43204861Sdes */
44204861Sdes
45204861Sdesstatic g_attrchanged_t vdev_geom_attrchanged;
46204861Sdesstruct g_class zfs_vdev_class = {
47204861Sdes	.name = "ZFS::VDEV",
48204861Sdes	.version = G_VERSION,
49204861Sdes	.attrchanged = vdev_geom_attrchanged,
50204861Sdes};
51204861Sdes
52204861SdesDECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);
53204861Sdes
54204861SdesSYSCTL_DECL(_vfs_zfs_vdev);
55204861Sdes/* Don't send BIO_FLUSH. */
56204861Sdesstatic int vdev_geom_bio_flush_disable = 0;
57204861SdesTUNABLE_INT("vfs.zfs.vdev.bio_flush_disable", &vdev_geom_bio_flush_disable);
58204861SdesSYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RW,
59204861Sdes    &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");
60204861Sdes/* Don't send BIO_DELETE. */
61204861Sdesstatic int vdev_geom_bio_delete_disable = 0;
62204861SdesTUNABLE_INT("vfs.zfs.vdev.bio_delete_disable", &vdev_geom_bio_delete_disable);
63204861SdesSYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RW,
64204861Sdes    &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");
65204861Sdes
66204861Sdes/* Declare local functions */
67204861Sdesstatic void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read);
68204861Sdes
69204861Sdes/*
70204861Sdes * Thread local storage used to indicate when a thread is probing geoms
71204861Sdes * for their guids.  If NULL, this thread is not tasting geoms.  If non NULL,
72204861Sdes * it is looking for a replacement for the vdev_t* that is its value.
73204861Sdes */
74204861Sdesuint_t zfs_geom_probe_vdev_key;
75204861Sdes
76204861Sdesstatic void
77204861Sdesvdev_geom_set_rotation_rate(vdev_t *vd, struct g_consumer *cp)
78204861Sdes{
79204861Sdes	int error;
80204861Sdes	uint16_t rate;
81204861Sdes
82204861Sdes	error = g_getattr("GEOM::rotation_rate", cp, &rate);
83204861Sdes	if (error == 0)
84204861Sdes		vd->vdev_rotation_rate = rate;
85204861Sdes	else
86204861Sdes		vd->vdev_rotation_rate = VDEV_RATE_UNKNOWN;
87204861Sdes}
88204861Sdes
89204861Sdesstatic void
90204861Sdesvdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
91204861Sdes{
92204861Sdes	vdev_t *vd;
93204861Sdes	spa_t *spa;
94204861Sdes	char *physpath;
95204861Sdes	int error, physpath_len;
96204861Sdes
97204861Sdes	vd = cp->private;
98204861Sdes	if (vd == NULL)
99204861Sdes		return;
100204861Sdes
101204861Sdes	if (strcmp(attr, "GEOM::rotation_rate") == 0) {
102204861Sdes		vdev_geom_set_rotation_rate(vd, cp);
103204861Sdes		return;
104204861Sdes	}
105204861Sdes
106204861Sdes	if (strcmp(attr, "GEOM::physpath") != 0)
107204861Sdes		return;
108204861Sdes
109204861Sdes	if (g_access(cp, 1, 0, 0) != 0)
110204861Sdes		return;
111204861Sdes
112204861Sdes	/*
113204861Sdes	 * Record/Update physical path information for this device.
114204861Sdes	 */
115204861Sdes	spa = vd->vdev_spa;
116204861Sdes	physpath_len = MAXPATHLEN;
117204861Sdes	physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO);
118204861Sdes	error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath);
119204861Sdes	g_access(cp, -1, 0, 0);
120204861Sdes	if (error == 0) {
121204861Sdes		char *old_physpath;
122204861Sdes
123204861Sdes		/* g_topology lock ensures that vdev has not been closed */
124204861Sdes		g_topology_assert();
125255767Sdes		old_physpath = vd->vdev_physpath;
126255767Sdes		vd->vdev_physpath = spa_strdup(physpath);
127255767Sdes		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
128204861Sdes
129204861Sdes		if (old_physpath != NULL)
130204861Sdes			spa_strfree(old_physpath);
131204861Sdes	}
132204861Sdes	g_free(physpath);
133204861Sdes}
134204861Sdes
135204861Sdesstatic void
136204861Sdesvdev_geom_orphan(struct g_consumer *cp)
137204861Sdes{
138204861Sdes	vdev_t *vd;
139204861Sdes
140204861Sdes	g_topology_assert();
141204861Sdes
142204861Sdes	vd = cp->private;
143204861Sdes	if (vd == NULL) {
144204861Sdes		/* Vdev close in progress.  Ignore the event. */
145204861Sdes		return;
146204861Sdes	}
147204861Sdes
148204861Sdes	/*
149204861Sdes	 * Orphan callbacks occur from the GEOM event thread.
150204861Sdes	 * Concurrent with this call, new I/O requests may be
151204861Sdes	 * working their way through GEOM about to find out
152204861Sdes	 * (only once executed by the g_down thread) that we've
153204861Sdes	 * been orphaned from our disk provider.  These I/Os
154204861Sdes	 * must be retired before we can detach our consumer.
155204861Sdes	 * This is most easily achieved by acquiring the
156204861Sdes	 * SPA ZIO configuration lock as a writer, but doing
157204861Sdes	 * so with the GEOM topology lock held would cause
158204861Sdes	 * a lock order reversal.  Instead, rely on the SPA's
159204861Sdes	 * async removal support to invoke a close on this
160204861Sdes	 * vdev once it is safe to do so.
161204861Sdes	 */
162204861Sdes	vd->vdev_remove_wanted = B_TRUE;
163204861Sdes	spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
164204861Sdes}
165204861Sdes
166204861Sdesstatic struct g_consumer *
167204861Sdesvdev_geom_attach(struct g_provider *pp, vdev_t *vd)
168204861Sdes{
169204861Sdes	struct g_geom *gp;
170204861Sdes	struct g_consumer *cp;
171204861Sdes	int error;
172204861Sdes
173204861Sdes	g_topology_assert();
174204861Sdes
175204861Sdes	ZFS_LOG(1, "Attaching to %s.", pp->name);
176204861Sdes
177204861Sdes	if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) {
178204861Sdes		ZFS_LOG(1, "Failing attach of %s. Incompatible sectorsize %d\n",
179204861Sdes		    pp->name, pp->sectorsize);
180204861Sdes		return (NULL);
181204861Sdes	} else if (pp->mediasize < SPA_MINDEVSIZE) {
182204861Sdes		ZFS_LOG(1, "Failing attach of %s. Incompatible mediasize %ju\n",
183204861Sdes		    pp->name, pp->mediasize);
184204861Sdes		return (NULL);
185255767Sdes	}
186255767Sdes
187204861Sdes	/* Do we have geom already? No? Create one. */
188204861Sdes	LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
189204861Sdes		if (gp->flags & G_GEOM_WITHER)
190204861Sdes			continue;
191215116Sdes		if (strcmp(gp->name, "zfs::vdev") != 0)
192215116Sdes			continue;
193215116Sdes		break;
194215116Sdes	}
195215116Sdes	if (gp == NULL) {
196215116Sdes		gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
197215116Sdes		gp->orphan = vdev_geom_orphan;
198215116Sdes		gp->attrchanged = vdev_geom_attrchanged;
199215116Sdes		cp = g_new_consumer(gp);
200215116Sdes		error = g_attach(cp, pp);
201215116Sdes		if (error != 0) {
202215116Sdes			ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__,
203215116Sdes			    __LINE__, error);
204215116Sdes			vdev_geom_detach(cp, B_FALSE);
205215116Sdes			return (NULL);
206215116Sdes		}
207215116Sdes		error = g_access(cp, 1, 0, 1);
208215116Sdes		if (error != 0) {
209215116Sdes			ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__,
210215116Sdes			       __LINE__, error);
211215116Sdes			vdev_geom_detach(cp, B_FALSE);
212215116Sdes			return (NULL);
213215116Sdes		}
214215116Sdes		ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
215215116Sdes	} else {
216215116Sdes		/* Check if we are already connected to this provider. */
217215116Sdes		LIST_FOREACH(cp, &gp->consumer, consumer) {
218215116Sdes			if (cp->provider == pp) {
219204861Sdes				ZFS_LOG(1, "Found consumer for %s.", pp->name);
220204861Sdes				break;
221204861Sdes			}
222204861Sdes		}
223204861Sdes		if (cp == NULL) {
224204861Sdes			cp = g_new_consumer(gp);
225204861Sdes			error = g_attach(cp, pp);
226204861Sdes			if (error != 0) {
227204861Sdes				ZFS_LOG(1, "%s(%d): g_attach failed: %d\n",
228215116Sdes				    __func__, __LINE__, error);
229204861Sdes				vdev_geom_detach(cp, B_FALSE);
230262566Sdes				return (NULL);
231204861Sdes			}
232204861Sdes			error = g_access(cp, 1, 0, 1);
233204861Sdes			if (error != 0) {
234204861Sdes				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
235204861Sdes				    __func__, __LINE__, error);
236204861Sdes				vdev_geom_detach(cp, B_FALSE);
237204861Sdes				return (NULL);
238204861Sdes			}
239204861Sdes			ZFS_LOG(1, "Created consumer for %s.", pp->name);
240295367Sdes		} else {
241204861Sdes			error = g_access(cp, 1, 0, 1);
242204861Sdes			if (error != 0) {
243204861Sdes				ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
244204861Sdes				    __func__, __LINE__, error);
245204861Sdes				return (NULL);
246204861Sdes			}
247204861Sdes			ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
248204861Sdes		}
249204861Sdes	}
250204861Sdes
251204861Sdes	/*
252204861Sdes	 * BUG: cp may already belong to a vdev.  This could happen if:
253204861Sdes	 * 1) That vdev is a shared spare, or
254204861Sdes	 * 2) We are trying to reopen a missing vdev and we are scanning by
255204861Sdes	 *    guid.  In that case, we'll ultimately fail to open this consumer,
256204861Sdes	 *    but not until after setting the private field.
257204861Sdes	 * The solution is to:
258295367Sdes	 * 1) Don't set the private field until after the open succeeds, and
259295367Sdes	 * 2) Set it to a linked list of vdevs, not just a single vdev
260295367Sdes	 */
261204861Sdes	cp->private = vd;
262204861Sdes	if (vd != NULL)
263295367Sdes		vd->vdev_tsd = cp;
264295367Sdes
265295367Sdes	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
266295367Sdes	return (cp);
267295367Sdes}
268295367Sdes
269295367Sdesstatic void
270295367Sdesvdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read)
271295367Sdes{
272295367Sdes	struct g_geom *gp;
273295367Sdes	vdev_t *vd;
274295367Sdes
275295367Sdes	g_topology_assert();
276255767Sdes
277295367Sdes	ZFS_LOG(1, "Detaching consumer. Provider %s.",
278295367Sdes	    cp->provider && cp->provider->name ? cp->provider->name : "NULL");
279204861Sdes
280204861Sdes	vd = cp->private;
281204861Sdes	if (vd != NULL) {
282204861Sdes		vd->vdev_tsd = NULL;
283204861Sdes		vd->vdev_delayed_close = B_FALSE;
284204861Sdes	}
285204861Sdes	cp->private = NULL;
286215116Sdes
287215116Sdes	gp = cp->geom;
288215116Sdes	if (open_for_read)
289215116Sdes		g_access(cp, -1, 0, -1);
290204861Sdes	/* Destroy consumer on last close. */
291204861Sdes	if (cp->acr == 0 && cp->ace == 0) {
292204861Sdes		if (cp->acw > 0)
293204861Sdes			g_access(cp, 0, -cp->acw, 0);
294204861Sdes		if (cp->provider != NULL) {
295204861Sdes			ZFS_LOG(1, "Destroying consumer to %s.",
296204861Sdes			    cp->provider->name ? cp->provider->name : "NULL");
297204861Sdes			g_detach(cp);
298204861Sdes		}
299204861Sdes		g_destroy_consumer(cp);
300204861Sdes	}
301204861Sdes	/* Destroy geom if there are no consumers left. */
302204861Sdes	if (LIST_EMPTY(&gp->consumer)) {
303204861Sdes		ZFS_LOG(1, "Destroyed geom %s.", gp->name);
304204861Sdes		g_wither_geom(gp, ENXIO);
305204861Sdes	}
306204861Sdes}
307204861Sdes
308204861Sdesstatic void
309204861Sdesvdev_geom_close_locked(vdev_t *vd)
310204861Sdes{
311204861Sdes	struct g_consumer *cp;
312204861Sdes
313204861Sdes	g_topology_assert();
314204861Sdes
315204861Sdes	cp = vd->vdev_tsd;
316204861Sdes	if (cp == NULL)
317204861Sdes		return;
318204861Sdes
319204861Sdes	ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
320204861Sdes
321204861Sdes	vdev_geom_detach(cp, B_TRUE);
322204861Sdes}
323204861Sdes
324204861Sdesstatic void
325296781Sdesnvlist_get_guids(nvlist_t *list, uint64_t *pguid, uint64_t *vguid)
326296781Sdes{
327296781Sdes
328296781Sdes	(void) nvlist_lookup_uint64(list, ZPOOL_CONFIG_GUID, vguid);
329204861Sdes	(void) nvlist_lookup_uint64(list, ZPOOL_CONFIG_POOL_GUID, pguid);
330204861Sdes}
331204861Sdes
332204861Sdesstatic int
333204861Sdesvdev_geom_io(struct g_consumer *cp, int cmd, void *data, off_t offset, off_t size)
334204861Sdes{
335204861Sdes	struct bio *bp;
336204861Sdes	u_char *p;
337204861Sdes	off_t off, maxio;
338204861Sdes	int error;
339204861Sdes
340204861Sdes	ASSERT((offset % cp->provider->sectorsize) == 0);
341204861Sdes	ASSERT((size % cp->provider->sectorsize) == 0);
342255767Sdes
343204861Sdes	bp = g_alloc_bio();
344204861Sdes	off = offset;
345204861Sdes	offset += size;
346204861Sdes	p = data;
347204861Sdes	maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize);
348204861Sdes	error = 0;
349204861Sdes
350204861Sdes	for (; off < offset; off += maxio, p += maxio, size -= maxio) {
351204861Sdes		bzero(bp, sizeof(*bp));
352204861Sdes		bp->bio_cmd = cmd;
353204861Sdes		bp->bio_done = NULL;
354204861Sdes		bp->bio_offset = off;
355204861Sdes		bp->bio_length = MIN(size, maxio);
356204861Sdes		bp->bio_data = p;
357204861Sdes		g_io_request(bp, cp);
358204861Sdes		error = biowait(bp, "vdev_geom_io");
359204861Sdes		if (error != 0)
360204861Sdes			break;
361204861Sdes	}
362204861Sdes
363204861Sdes	g_destroy_bio(bp);
364204861Sdes	return (error);
365204861Sdes}
366204861Sdes
367204861Sdesstatic int
368204861Sdesvdev_geom_read_config(struct g_consumer *cp, nvlist_t **config)
369204861Sdes{
370204861Sdes	struct g_provider *pp;
371204861Sdes	vdev_label_t *label;
372204861Sdes	char *p, *buf;
373204861Sdes	size_t buflen;
374204861Sdes	uint64_t psize;
375204861Sdes	off_t offset, size;
376204861Sdes	uint64_t state, txg;
377204861Sdes	int error, l, len;
378204861Sdes
379204861Sdes	g_topology_assert_not();
380295367Sdes
381295367Sdes	pp = cp->provider;
382295367Sdes	ZFS_LOG(1, "Reading config from %s...", pp->name);
383204861Sdes
384204861Sdes	psize = pp->mediasize;
385204861Sdes	psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t));
386204861Sdes
387204861Sdes	size = sizeof(*label) + pp->sectorsize -
388204861Sdes	    ((sizeof(*label) - 1) % pp->sectorsize) - 1;
389204861Sdes
390204861Sdes	label = kmem_alloc(size, KM_SLEEP);
391204861Sdes	buflen = sizeof(label->vl_vdev_phys.vp_nvlist);
392204861Sdes
393204861Sdes	*config = NULL;
394204861Sdes	for (l = 0; l < VDEV_LABELS; l++) {
395204861Sdes
396204861Sdes		offset = vdev_label_offset(psize, l, 0);
397204861Sdes		if ((offset % pp->sectorsize) != 0)
398204861Sdes			continue;
399262566Sdes
400295367Sdes		if (vdev_geom_io(cp, BIO_READ, label, offset, size) != 0)
401262566Sdes			continue;
402262566Sdes		buf = label->vl_vdev_phys.vp_nvlist;
403204861Sdes
404262566Sdes		if (nvlist_unpack(buf, buflen, config, 0) != 0)
405295367Sdes			continue;
406204861Sdes
407262566Sdes		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
408262566Sdes		    &state) != 0 || state > POOL_STATE_L2CACHE) {
409262566Sdes			nvlist_free(*config);
410262566Sdes			*config = NULL;
411262566Sdes			continue;
412262566Sdes		}
413262566Sdes
414262566Sdes		if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
415262566Sdes		    (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
416262566Sdes		    &txg) != 0 || txg == 0)) {
417262566Sdes			nvlist_free(*config);
418262566Sdes			*config = NULL;
419262566Sdes			continue;
420262566Sdes		}
421262566Sdes
422262566Sdes		break;
423262566Sdes	}
424262566Sdes
425262566Sdes	kmem_free(label, size);
426262566Sdes	return (*config == NULL ? ENOENT : 0);
427262566Sdes}
428262566Sdes
429262566Sdesstatic void
430262566Sdesresize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id)
431262566Sdes{
432262566Sdes	nvlist_t **new_configs;
433262566Sdes	uint64_t i;
434262566Sdes
435262566Sdes	if (id < *count)
436262566Sdes		return;
437295367Sdes	new_configs = kmem_zalloc((id + 1) * sizeof(nvlist_t *),
438262566Sdes	    KM_SLEEP);
439262566Sdes	for (i = 0; i < *count; i++)
440262566Sdes		new_configs[i] = (*configs)[i];
441262566Sdes	if (*configs != NULL)
442295367Sdes		kmem_free(*configs, *count * sizeof(void *));
443262566Sdes	*configs = new_configs;
444262566Sdes	*count = id + 1;
445262566Sdes}
446262566Sdes
447262566Sdesstatic void
448262566Sdesprocess_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg,
449262566Sdes    const char *name, uint64_t* known_pool_guid)
450295367Sdes{
451262566Sdes	nvlist_t *vdev_tree;
452295367Sdes	uint64_t pool_guid;
453204861Sdes	uint64_t vdev_guid, known_guid;
454262566Sdes	uint64_t id, txg, known_txg;
455262566Sdes	char *pname;
456204861Sdes	int i;
457262566Sdes
458204861Sdes	if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 ||
459204861Sdes	    strcmp(pname, name) != 0)
460204861Sdes		goto ignore;
461204861Sdes
462204861Sdes	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
463204861Sdes		goto ignore;
464204861Sdes
465204861Sdes	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0)
466204861Sdes		goto ignore;
467262566Sdes
468204861Sdes	if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0)
469204861Sdes		goto ignore;
470204861Sdes
471204861Sdes	if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0)
472204861Sdes		goto ignore;
473204861Sdes
474204861Sdes	VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
475204861Sdes
476204861Sdes	if (*known_pool_guid != 0) {
477204861Sdes		if (pool_guid != *known_pool_guid)
478204861Sdes			goto ignore;
479204861Sdes	} else
480204861Sdes		*known_pool_guid = pool_guid;
481204861Sdes
482204861Sdes	resize_configs(configs, count, id);
483204861Sdes
484204861Sdes	if ((*configs)[id] != NULL) {
485204861Sdes		VERIFY(nvlist_lookup_uint64((*configs)[id],
486295367Sdes		    ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0);
487295367Sdes		if (txg <= known_txg)
488295367Sdes			goto ignore;
489295367Sdes		nvlist_free((*configs)[id]);
490295367Sdes	}
491295367Sdes
492215116Sdes	(*configs)[id] = cfg;
493215116Sdes	return;
494215116Sdes
495215116Sdesignore:
496295367Sdes	nvlist_free(cfg);
497295367Sdes}
498295367Sdes
499295367Sdesint
500295367Sdesvdev_geom_read_pool_label(const char *name,
501295367Sdes    nvlist_t ***configs, uint64_t *count)
502295367Sdes{
503262566Sdes	struct g_class *mp;
504262566Sdes	struct g_geom *gp;
505262566Sdes	struct g_provider *pp;
506262566Sdes	struct g_consumer *zcp;
507262566Sdes	nvlist_t *vdev_cfg;
508204861Sdes	uint64_t pool_guid;
509204861Sdes	int error;
510204861Sdes
511262566Sdes	DROP_GIANT();
512262566Sdes	g_topology_lock();
513262566Sdes
514262566Sdes	*configs = NULL;
515262566Sdes	*count = 0;
516262566Sdes	pool_guid = 0;
517262566Sdes	LIST_FOREACH(mp, &g_classes, class) {
518262566Sdes		if (mp == &zfs_vdev_class)
519262566Sdes			continue;
520204861Sdes		LIST_FOREACH(gp, &mp->geom, geom) {
521262566Sdes			if (gp->flags & G_GEOM_WITHER)
522262566Sdes				continue;
523262566Sdes			LIST_FOREACH(pp, &gp->provider, provider) {
524262566Sdes				if (pp->flags & G_PF_WITHER)
525262566Sdes					continue;
526262566Sdes				zcp = vdev_geom_attach(pp, NULL);
527262566Sdes				if (zcp == NULL)
528262566Sdes					continue;
529262566Sdes				g_topology_unlock();
530262566Sdes				error = vdev_geom_read_config(zcp, &vdev_cfg);
531262566Sdes				g_topology_lock();
532262566Sdes				vdev_geom_detach(zcp, B_TRUE);
533262566Sdes				if (error)
534262566Sdes					continue;
535262566Sdes				ZFS_LOG(1, "successfully read vdev config");
536262566Sdes
537262566Sdes				process_vdev_config(configs, count,
538262566Sdes				    vdev_cfg, name, &pool_guid);
539262566Sdes			}
540295367Sdes		}
541262566Sdes	}
542262566Sdes	g_topology_unlock();
543295367Sdes	PICKUP_GIANT();
544262566Sdes
545295367Sdes	return (*count > 0 ? 0 : ENOENT);
546262566Sdes}
547204861Sdes
548295367Sdesstatic void
549295367Sdesvdev_geom_read_guids(struct g_consumer *cp, uint64_t *pguid, uint64_t *vguid)
550204861Sdes{
551204861Sdes	nvlist_t *config;
552204861Sdes
553204861Sdes	g_topology_assert_not();
554262566Sdes
555262566Sdes	*pguid = 0;
556204861Sdes	*vguid = 0;
557204861Sdes	if (vdev_geom_read_config(cp, &config) == 0) {
558255767Sdes		nvlist_get_guids(config, pguid, vguid);
559204861Sdes		nvlist_free(config);
560204861Sdes	}
561204861Sdes}
562204861Sdes
563204861Sdesstatic boolean_t
564204861Sdesvdev_attach_ok(vdev_t *vd, struct g_provider *pp)
565204861Sdes{
566204861Sdes	uint64_t pool_guid;
567295367Sdes	uint64_t vdev_guid;
568204861Sdes	struct g_consumer *zcp;
569204861Sdes	boolean_t pool_ok;
570204861Sdes	boolean_t vdev_ok;
571204861Sdes
572204861Sdes	zcp = vdev_geom_attach(pp, NULL);
573204861Sdes	if (zcp == NULL) {
574204861Sdes		ZFS_LOG(1, "Unable to attach tasting instance to %s.",
575204861Sdes		    pp->name);
576204861Sdes		return (B_FALSE);
577204861Sdes	}
578204861Sdes	g_topology_unlock();
579204861Sdes	vdev_geom_read_guids(zcp, &pool_guid, &vdev_guid);
580204861Sdes	g_topology_lock();
581204861Sdes	vdev_geom_detach(zcp, B_TRUE);
582204861Sdes
583204861Sdes	/*
584204861Sdes	 * Check that the label's vdev guid matches the desired guid.  If the
585204861Sdes	 * label has a pool guid, check that it matches too. (Inactive spares
586204861Sdes	 * and L2ARCs do not have any pool guid in the label.)
587204861Sdes	 */
588204861Sdes	if ((pool_guid == 0 || pool_guid == spa_guid(vd->vdev_spa)) &&
589204861Sdes	    vdev_guid == vd->vdev_guid) {
590204861Sdes		ZFS_LOG(1, "guids match for provider %s.", vd->vdev_path);
591204861Sdes		return (B_TRUE);
592204861Sdes	} else {
593204861Sdes		ZFS_LOG(1, "guid mismatch for provider %s: "
594204861Sdes		    "%ju:%ju != %ju:%ju.", vd->vdev_path,
595204861Sdes		    (uintmax_t)spa_guid(vd->vdev_spa),
596204861Sdes		    (uintmax_t)vd->vdev_guid,
597204861Sdes		    (uintmax_t)pool_guid, (uintmax_t)vdev_guid);
598204861Sdes		return (B_FALSE);
599204861Sdes	}
600204861Sdes}
601204861Sdes
602204861Sdesstatic struct g_consumer *
603204861Sdesvdev_geom_attach_by_guids(vdev_t *vd)
604204861Sdes{
605204861Sdes	struct g_class *mp;
606204861Sdes	struct g_geom *gp;
607204861Sdes	struct g_provider *pp;
608204861Sdes	struct g_consumer *cp;
609204861Sdes
610204861Sdes	g_topology_assert();
611204861Sdes
612204861Sdes	cp = NULL;
613204861Sdes	LIST_FOREACH(mp, &g_classes, class) {
614204861Sdes		if (mp == &zfs_vdev_class)
615204861Sdes			continue;
616204861Sdes		LIST_FOREACH(gp, &mp->geom, geom) {
617204861Sdes			if (gp->flags & G_GEOM_WITHER)
618204861Sdes				continue;
619204861Sdes			LIST_FOREACH(pp, &gp->provider, provider) {
620204861Sdes				if (!vdev_attach_ok(vd, pp))
621204861Sdes					continue;
622204861Sdes				cp = vdev_geom_attach(pp, vd);
623204861Sdes				if (cp == NULL) {
624204861Sdes					printf("ZFS WARNING: Unable to "
625204861Sdes					    "attach to %s.\n", pp->name);
626204861Sdes					continue;
627204861Sdes				}
628204861Sdes				break;
629204861Sdes			}
630204861Sdes			if (cp != NULL)
631204861Sdes				break;
632204861Sdes		}
633204861Sdes		if (cp != NULL)
634204861Sdes			break;
635204861Sdes	}
636204861Sdesend:
637204861Sdes	return (cp);
638204861Sdes}
639204861Sdes
640204861Sdesstatic struct g_consumer *
641204861Sdesvdev_geom_open_by_guids(vdev_t *vd)
642204861Sdes{
643204861Sdes	struct g_consumer *cp;
644295367Sdes	char *buf;
645295367Sdes	size_t len;
646295367Sdes
647295367Sdes	g_topology_assert();
648295367Sdes
649204861Sdes	ZFS_LOG(1, "Searching by guids [%ju:%ju].",
650204861Sdes		(uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid);
651204861Sdes	cp = vdev_geom_attach_by_guids(vd);
652204861Sdes	if (cp != NULL) {
653204861Sdes		len = strlen(cp->provider->name) + strlen("/dev/") + 1;
654204861Sdes		buf = kmem_alloc(len, KM_SLEEP);
655204861Sdes
656204861Sdes		snprintf(buf, len, "/dev/%s", cp->provider->name);
657204861Sdes		spa_strfree(vd->vdev_path);
658204861Sdes		vd->vdev_path = buf;
659204861Sdes
660204861Sdes		ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.",
661204861Sdes		    (uintmax_t)spa_guid(vd->vdev_spa),
662204861Sdes		    (uintmax_t)vd->vdev_guid, vd->vdev_path);
663204861Sdes	} else {
664204861Sdes		ZFS_LOG(1, "Search by guid [%ju:%ju] failed.",
665204861Sdes		    (uintmax_t)spa_guid(vd->vdev_spa),
666204861Sdes		    (uintmax_t)vd->vdev_guid);
667204861Sdes	}
668204861Sdes
669204861Sdes	return (cp);
670204861Sdes}
671204861Sdes
672255767Sdesstatic struct g_consumer *
673255767Sdesvdev_geom_open_by_path(vdev_t *vd, int check_guid)
674255767Sdes{
675204861Sdes	struct g_provider *pp;
676204861Sdes	struct g_consumer *cp;
677204861Sdes
678204861Sdes	g_topology_assert();
679204861Sdes
680204861Sdes	cp = NULL;
681226046Sdes	pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1);
682226046Sdes	if (pp != NULL) {
683226046Sdes		ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
684226046Sdes		if (!check_guid || vdev_attach_ok(vd, pp))
685226046Sdes			cp = vdev_geom_attach(pp, vd);
686226046Sdes	}
687226046Sdes
688226046Sdes	return (cp);
689226046Sdes}
690226046Sdes
691226046Sdesstatic int
692226046Sdesvdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
693226046Sdes    uint64_t *logical_ashift, uint64_t *physical_ashift)
694226046Sdes{
695204861Sdes	struct g_provider *pp;
696	struct g_consumer *cp;
697	size_t bufsize;
698	int error;
699
700	/* Set the TLS to indicate downstack that we should not access zvols*/
701	VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0);
702
703	/*
704	 * We must have a pathname, and it must be absolute.
705	 */
706	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
707		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
708		return (EINVAL);
709	}
710
711	vd->vdev_tsd = NULL;
712
713	DROP_GIANT();
714	g_topology_lock();
715	error = 0;
716
717	if (vd->vdev_spa->spa_splitting_newspa ||
718	    (vd->vdev_prevstate == VDEV_STATE_UNKNOWN &&
719	     vd->vdev_spa->spa_load_state == SPA_LOAD_NONE ||
720	     vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)) {
721		/*
722		 * We are dealing with a vdev that hasn't been previously
723		 * opened (since boot), and we are not loading an
724		 * existing pool configuration.  This looks like a
725		 * vdev add operation to a new or existing pool.
726		 * Assume the user knows what he/she is doing and find
727		 * GEOM provider by its name, ignoring GUID mismatches.
728		 *
729		 * XXPOLICY: It would be safer to only allow a device
730		 *           that is unlabeled or labeled but missing
731		 *           GUID information to be opened in this fashion,
732		 *           unless we are doing a split, in which case we
733		 *           should allow any guid.
734		 */
735		cp = vdev_geom_open_by_path(vd, 0);
736	} else {
737		/*
738		 * Try using the recorded path for this device, but only
739		 * accept it if its label data contains the expected GUIDs.
740		 */
741		cp = vdev_geom_open_by_path(vd, 1);
742		if (cp == NULL) {
743			/*
744			 * The device at vd->vdev_path doesn't have the
745			 * expected GUIDs. The disks might have merely
746			 * moved around so try all other GEOM providers
747			 * to find one with the right GUIDs.
748			 */
749			cp = vdev_geom_open_by_guids(vd);
750		}
751	}
752
753	/* Clear the TLS now that tasting is done */
754	VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0);
755
756	if (cp == NULL) {
757		ZFS_LOG(1, "Provider %s not found.", vd->vdev_path);
758		error = ENOENT;
759	} else if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
760	    !ISP2(cp->provider->sectorsize)) {
761		ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
762		    vd->vdev_path);
763
764		vdev_geom_close_locked(vd);
765		error = EINVAL;
766		cp = NULL;
767	} else if (cp->acw == 0 && (spa_mode(vd->vdev_spa) & FWRITE) != 0) {
768		int i;
769
770		for (i = 0; i < 5; i++) {
771			error = g_access(cp, 0, 1, 0);
772			if (error == 0)
773				break;
774			g_topology_unlock();
775			tsleep(vd, 0, "vdev", hz / 2);
776			g_topology_lock();
777		}
778		if (error != 0) {
779			printf("ZFS WARNING: Unable to open %s for writing (error=%d).\n",
780			    vd->vdev_path, error);
781			vdev_geom_close_locked(vd);
782			cp = NULL;
783		}
784	}
785
786	/* Fetch initial physical path information for this device. */
787	if (cp != NULL)
788		vdev_geom_attrchanged(cp, "GEOM::physpath");
789
790	g_topology_unlock();
791	PICKUP_GIANT();
792	if (cp == NULL) {
793		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
794		return (error);
795	}
796	pp = cp->provider;
797
798	/*
799	 * Determine the actual size of the device.
800	 */
801	*max_psize = *psize = pp->mediasize;
802
803	/*
804	 * Determine the device's minimum transfer size and preferred
805	 * transfer size.
806	 */
807	*logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
808	*physical_ashift = 0;
809	if (pp->stripesize > (1 << *logical_ashift) && ISP2(pp->stripesize) &&
810	    pp->stripesize <= (1 << SPA_MAXASHIFT) && pp->stripeoffset == 0)
811		*physical_ashift = highbit(pp->stripesize) - 1;
812
813	/*
814	 * Clear the nowritecache settings, so that on a vdev_reopen()
815	 * we will try again.
816	 */
817	vd->vdev_nowritecache = B_FALSE;
818
819	/*
820	 * Determine the device's rotation rate.
821	 */
822	vdev_geom_set_rotation_rate(vd, cp);
823
824	return (0);
825}
826
827static void
828vdev_geom_close(vdev_t *vd)
829{
830
831	DROP_GIANT();
832	g_topology_lock();
833	vdev_geom_close_locked(vd);
834	g_topology_unlock();
835	PICKUP_GIANT();
836}
837
838static void
839vdev_geom_io_intr(struct bio *bp)
840{
841	vdev_t *vd;
842	zio_t *zio;
843
844	zio = bp->bio_caller1;
845	vd = zio->io_vd;
846	zio->io_error = bp->bio_error;
847	if (zio->io_error == 0 && bp->bio_resid != 0)
848		zio->io_error = SET_ERROR(EIO);
849
850	switch(zio->io_error) {
851	case ENOTSUP:
852		/*
853		 * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know
854		 * that future attempts will never succeed. In this case
855		 * we set a persistent flag so that we don't bother with
856		 * requests in the future.
857		 */
858		switch(bp->bio_cmd) {
859		case BIO_FLUSH:
860			vd->vdev_nowritecache = B_TRUE;
861			break;
862		case BIO_DELETE:
863			vd->vdev_notrim = B_TRUE;
864			break;
865		}
866		break;
867	case ENXIO:
868		if (!vd->vdev_remove_wanted) {
869			/*
870			 * If provider's error is set we assume it is being
871			 * removed.
872			 */
873			if (bp->bio_to->error != 0) {
874				vd->vdev_remove_wanted = B_TRUE;
875				spa_async_request(zio->io_spa,
876				    SPA_ASYNC_REMOVE);
877			} else if (!vd->vdev_delayed_close) {
878				vd->vdev_delayed_close = B_TRUE;
879			}
880		}
881		break;
882	}
883	g_destroy_bio(bp);
884	zio_delay_interrupt(zio);
885}
886
887static void
888vdev_geom_io_start(zio_t *zio)
889{
890	vdev_t *vd;
891	struct g_consumer *cp;
892	struct bio *bp;
893	int error;
894
895	vd = zio->io_vd;
896
897	switch (zio->io_type) {
898	case ZIO_TYPE_IOCTL:
899		/* XXPOLICY */
900		if (!vdev_readable(vd)) {
901			zio->io_error = SET_ERROR(ENXIO);
902			zio_interrupt(zio);
903			return;
904		} else {
905			switch (zio->io_cmd) {
906			case DKIOCFLUSHWRITECACHE:
907				if (zfs_nocacheflush || vdev_geom_bio_flush_disable)
908					break;
909				if (vd->vdev_nowritecache) {
910					zio->io_error = SET_ERROR(ENOTSUP);
911					break;
912				}
913				goto sendreq;
914			default:
915				zio->io_error = SET_ERROR(ENOTSUP);
916			}
917		}
918
919		zio_execute(zio);
920		return;
921	case ZIO_TYPE_FREE:
922		if (vd->vdev_notrim) {
923			zio->io_error = SET_ERROR(ENOTSUP);
924		} else if (!vdev_geom_bio_delete_disable) {
925			goto sendreq;
926		}
927		zio_execute(zio);
928		return;
929	}
930sendreq:
931	ASSERT(zio->io_type == ZIO_TYPE_READ ||
932	    zio->io_type == ZIO_TYPE_WRITE ||
933	    zio->io_type == ZIO_TYPE_FREE ||
934	    zio->io_type == ZIO_TYPE_IOCTL);
935
936	cp = vd->vdev_tsd;
937	if (cp == NULL) {
938		zio->io_error = SET_ERROR(ENXIO);
939		zio_interrupt(zio);
940		return;
941	}
942	bp = g_alloc_bio();
943	bp->bio_caller1 = zio;
944	switch (zio->io_type) {
945	case ZIO_TYPE_READ:
946	case ZIO_TYPE_WRITE:
947		zio->io_target_timestamp = zio_handle_io_delay(zio);
948		bp->bio_cmd = zio->io_type == ZIO_TYPE_READ ? BIO_READ : BIO_WRITE;
949		bp->bio_data = zio->io_data;
950		bp->bio_offset = zio->io_offset;
951		bp->bio_length = zio->io_size;
952		break;
953	case ZIO_TYPE_FREE:
954		bp->bio_cmd = BIO_DELETE;
955		bp->bio_data = NULL;
956		bp->bio_offset = zio->io_offset;
957		bp->bio_length = zio->io_size;
958		break;
959	case ZIO_TYPE_IOCTL:
960		bp->bio_cmd = BIO_FLUSH;
961		bp->bio_flags |= BIO_ORDERED;
962		bp->bio_data = NULL;
963		bp->bio_offset = cp->provider->mediasize;
964		bp->bio_length = 0;
965		break;
966	}
967	bp->bio_done = vdev_geom_io_intr;
968
969	g_io_request(bp, cp);
970}
971
972static void
973vdev_geom_io_done(zio_t *zio)
974{
975}
976
977static void
978vdev_geom_hold(vdev_t *vd)
979{
980}
981
982static void
983vdev_geom_rele(vdev_t *vd)
984{
985}
986
987vdev_ops_t vdev_geom_ops = {
988	vdev_geom_open,
989	vdev_geom_close,
990	vdev_default_asize,
991	vdev_geom_io_start,
992	vdev_geom_io_done,
993	NULL,
994	vdev_geom_hold,
995	vdev_geom_rele,
996	VDEV_TYPE_DISK,		/* name of this vdev type */
997	B_TRUE			/* leaf vdev */
998};
999