vdev_geom.c revision 297078
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
23 * All rights reserved.
24 *
25 * Portions Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>
26 */
27
28#include <sys/zfs_context.h>
29#include <sys/param.h>
30#include <sys/kernel.h>
31#include <sys/bio.h>
32#include <sys/disk.h>
33#include <sys/spa.h>
34#include <sys/spa_impl.h>
35#include <sys/vdev_impl.h>
36#include <sys/fs/zfs.h>
37#include <sys/zio.h>
38#include <geom/geom.h>
39#include <geom/geom_int.h>
40
41/*
42 * Virtual device vector for GEOM.
43 */
44
45static g_attrchanged_t vdev_geom_attrchanged;
46struct g_class zfs_vdev_class = {
47	.name = "ZFS::VDEV",
48	.version = G_VERSION,
49	.attrchanged = vdev_geom_attrchanged,
50};
51
52DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);
53
54SYSCTL_DECL(_vfs_zfs_vdev);
55/* Don't send BIO_FLUSH. */
56static int vdev_geom_bio_flush_disable = 0;
57TUNABLE_INT("vfs.zfs.vdev.bio_flush_disable", &vdev_geom_bio_flush_disable);
58SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RW,
59    &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");
60/* Don't send BIO_DELETE. */
61static int vdev_geom_bio_delete_disable = 0;
62TUNABLE_INT("vfs.zfs.vdev.bio_delete_disable", &vdev_geom_bio_delete_disable);
63SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RW,
64    &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");
65
66static void
67vdev_geom_set_rotation_rate(vdev_t *vd, struct g_consumer *cp)
68{
69	int error;
70	uint16_t rate;
71
72	error = g_getattr("GEOM::rotation_rate", cp, &rate);
73	if (error == 0)
74		vd->vdev_rotation_rate = rate;
75	else
76		vd->vdev_rotation_rate = VDEV_RATE_UNKNOWN;
77}
78
79static void
80vdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
81{
82	vdev_t *vd;
83
84	vd = cp->private;
85	if (vd == NULL)
86		return;
87
88	if (strcmp(attr, "GEOM::rotation_rate") == 0) {
89		vdev_geom_set_rotation_rate(vd, cp);
90		return;
91	}
92}
93
94static void
95vdev_geom_orphan(struct g_consumer *cp)
96{
97	vdev_t *vd;
98
99	g_topology_assert();
100
101	vd = cp->private;
102	if (vd == NULL)
103		return;
104
105	/*
106	 * Orphan callbacks occur from the GEOM event thread.
107	 * Concurrent with this call, new I/O requests may be
108	 * working their way through GEOM about to find out
109	 * (only once executed by the g_down thread) that we've
110	 * been orphaned from our disk provider.  These I/Os
111	 * must be retired before we can detach our consumer.
112	 * This is most easily achieved by acquiring the
113	 * SPA ZIO configuration lock as a writer, but doing
114	 * so with the GEOM topology lock held would cause
115	 * a lock order reversal.  Instead, rely on the SPA's
116	 * async removal support to invoke a close on this
117	 * vdev once it is safe to do so.
118	 */
119	vd->vdev_remove_wanted = B_TRUE;
120	spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
121}
122
123static struct g_consumer *
124vdev_geom_attach(struct g_provider *pp)
125{
126	struct g_geom *gp;
127	struct g_consumer *cp;
128
129	g_topology_assert();
130
131	ZFS_LOG(1, "Attaching to %s.", pp->name);
132	/* Do we have geom already? No? Create one. */
133	LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
134		if (gp->flags & G_GEOM_WITHER)
135			continue;
136		if (strcmp(gp->name, "zfs::vdev") != 0)
137			continue;
138		break;
139	}
140	if (gp == NULL) {
141		gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
142		gp->orphan = vdev_geom_orphan;
143		cp = g_new_consumer(gp);
144		if (g_attach(cp, pp) != 0) {
145			g_wither_geom(gp, ENXIO);
146			return (NULL);
147		}
148		if (g_access(cp, 1, 0, 1) != 0) {
149			g_wither_geom(gp, ENXIO);
150			return (NULL);
151		}
152		ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
153	} else {
154		/* Check if we are already connected to this provider. */
155		LIST_FOREACH(cp, &gp->consumer, consumer) {
156			if (cp->provider == pp) {
157				ZFS_LOG(1, "Found consumer for %s.", pp->name);
158				break;
159			}
160		}
161		if (cp == NULL) {
162			cp = g_new_consumer(gp);
163			if (g_attach(cp, pp) != 0) {
164				g_destroy_consumer(cp);
165				return (NULL);
166			}
167			if (g_access(cp, 1, 0, 1) != 0) {
168				g_detach(cp);
169				g_destroy_consumer(cp);
170				return (NULL);
171			}
172			ZFS_LOG(1, "Created consumer for %s.", pp->name);
173		} else {
174			if (g_access(cp, 1, 0, 1) != 0)
175				return (NULL);
176			ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
177		}
178	}
179	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
180	return (cp);
181}
182
183static void
184vdev_geom_detach(void *arg, int flag __unused)
185{
186	struct g_geom *gp;
187	struct g_consumer *cp;
188
189	g_topology_assert();
190	cp = arg;
191	gp = cp->geom;
192
193	ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
194	g_access(cp, -1, 0, -1);
195	/* Destroy consumer on last close. */
196	if (cp->acr == 0 && cp->ace == 0) {
197		ZFS_LOG(1, "Destroyed consumer to %s.", cp->provider->name);
198		if (cp->acw > 0)
199			g_access(cp, 0, -cp->acw, 0);
200		g_detach(cp);
201		g_destroy_consumer(cp);
202	}
203	/* Destroy geom if there are no consumers left. */
204	if (LIST_EMPTY(&gp->consumer)) {
205		ZFS_LOG(1, "Destroyed geom %s.", gp->name);
206		g_wither_geom(gp, ENXIO);
207	}
208}
209
210static void
211nvlist_get_guids(nvlist_t *list, uint64_t *pguid, uint64_t *vguid)
212{
213
214	(void) nvlist_lookup_uint64(list, ZPOOL_CONFIG_GUID, vguid);
215	(void) nvlist_lookup_uint64(list, ZPOOL_CONFIG_POOL_GUID, pguid);
216}
217
218static int
219vdev_geom_io(struct g_consumer *cp, int cmd, void *data, off_t offset, off_t size)
220{
221	struct bio *bp;
222	u_char *p;
223	off_t off, maxio;
224	int error;
225
226	ASSERT((offset % cp->provider->sectorsize) == 0);
227	ASSERT((size % cp->provider->sectorsize) == 0);
228
229	bp = g_alloc_bio();
230	off = offset;
231	offset += size;
232	p = data;
233	maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize);
234	error = 0;
235
236	for (; off < offset; off += maxio, p += maxio, size -= maxio) {
237		bzero(bp, sizeof(*bp));
238		bp->bio_cmd = cmd;
239		bp->bio_done = NULL;
240		bp->bio_offset = off;
241		bp->bio_length = MIN(size, maxio);
242		bp->bio_data = p;
243		g_io_request(bp, cp);
244		error = biowait(bp, "vdev_geom_io");
245		if (error != 0)
246			break;
247	}
248
249	g_destroy_bio(bp);
250	return (error);
251}
252
253static void
254vdev_geom_taste_orphan(struct g_consumer *cp)
255{
256
257	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
258	    cp->provider->name));
259}
260
261static int
262vdev_geom_read_config(struct g_consumer *cp, nvlist_t **config)
263{
264	struct g_provider *pp;
265	vdev_label_t *label;
266	char *p, *buf;
267	size_t buflen;
268	uint64_t psize;
269	off_t offset, size;
270	uint64_t state, txg;
271	int error, l, len;
272
273	g_topology_assert_not();
274
275	pp = cp->provider;
276	ZFS_LOG(1, "Reading config from %s...", pp->name);
277
278	psize = pp->mediasize;
279	psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t));
280
281	size = sizeof(*label) + pp->sectorsize -
282	    ((sizeof(*label) - 1) % pp->sectorsize) - 1;
283
284	label = kmem_alloc(size, KM_SLEEP);
285	buflen = sizeof(label->vl_vdev_phys.vp_nvlist);
286
287	*config = NULL;
288	for (l = 0; l < VDEV_LABELS; l++) {
289
290		offset = vdev_label_offset(psize, l, 0);
291		if ((offset % pp->sectorsize) != 0)
292			continue;
293
294		if (vdev_geom_io(cp, BIO_READ, label, offset, size) != 0)
295			continue;
296		buf = label->vl_vdev_phys.vp_nvlist;
297
298		if (nvlist_unpack(buf, buflen, config, 0) != 0)
299			continue;
300
301		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
302		    &state) != 0 || state > POOL_STATE_L2CACHE) {
303			nvlist_free(*config);
304			*config = NULL;
305			continue;
306		}
307
308		if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
309		    (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
310		    &txg) != 0 || txg == 0)) {
311			nvlist_free(*config);
312			*config = NULL;
313			continue;
314		}
315
316		break;
317	}
318
319	kmem_free(label, size);
320	return (*config == NULL ? ENOENT : 0);
321}
322
323static void
324resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id)
325{
326	nvlist_t **new_configs;
327	uint64_t i;
328
329	if (id < *count)
330		return;
331	new_configs = kmem_zalloc((id + 1) * sizeof(nvlist_t *),
332	    KM_SLEEP);
333	for (i = 0; i < *count; i++)
334		new_configs[i] = (*configs)[i];
335	if (*configs != NULL)
336		kmem_free(*configs, *count * sizeof(void *));
337	*configs = new_configs;
338	*count = id + 1;
339}
340
341static void
342process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg,
343    const char *name, uint64_t* known_pool_guid)
344{
345	nvlist_t *vdev_tree;
346	uint64_t pool_guid;
347	uint64_t vdev_guid, known_guid;
348	uint64_t id, txg, known_txg;
349	char *pname;
350	int i;
351
352	if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 ||
353	    strcmp(pname, name) != 0)
354		goto ignore;
355
356	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
357		goto ignore;
358
359	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0)
360		goto ignore;
361
362	if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0)
363		goto ignore;
364
365	if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0)
366		goto ignore;
367
368	VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
369
370	if (*known_pool_guid != 0) {
371		if (pool_guid != *known_pool_guid)
372			goto ignore;
373	} else
374		*known_pool_guid = pool_guid;
375
376	resize_configs(configs, count, id);
377
378	if ((*configs)[id] != NULL) {
379		VERIFY(nvlist_lookup_uint64((*configs)[id],
380		    ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0);
381		if (txg <= known_txg)
382			goto ignore;
383		nvlist_free((*configs)[id]);
384	}
385
386	(*configs)[id] = cfg;
387	return;
388
389ignore:
390	nvlist_free(cfg);
391}
392
393static int
394vdev_geom_attach_taster(struct g_consumer *cp, struct g_provider *pp)
395{
396	int error;
397
398	if (pp->flags & G_PF_WITHER)
399		return (EINVAL);
400	g_attach(cp, pp);
401	error = g_access(cp, 1, 0, 0);
402	if (error == 0) {
403		if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize))
404			error = EINVAL;
405		else if (pp->mediasize < SPA_MINDEVSIZE)
406			error = EINVAL;
407		if (error != 0)
408			g_access(cp, -1, 0, 0);
409	}
410	if (error != 0)
411		g_detach(cp);
412	return (error);
413}
414
/*
 * Reverse a successful vdev_geom_attach_taster(): give back the access
 * count taken there and detach the tasting consumer from its provider.
 */
static void
vdev_geom_detach_taster(struct g_consumer *cp)
{

	g_access(cp, -1, 0, 0);
	g_detach(cp);
}
421
422int
423vdev_geom_read_pool_label(const char *name,
424    nvlist_t ***configs, uint64_t *count)
425{
426	struct g_class *mp;
427	struct g_geom *gp, *zgp;
428	struct g_provider *pp;
429	struct g_consumer *zcp;
430	nvlist_t *vdev_cfg;
431	uint64_t pool_guid;
432	int error;
433
434	DROP_GIANT();
435	g_topology_lock();
436
437	zgp = g_new_geomf(&zfs_vdev_class, "zfs::vdev::taste");
438	/* This orphan function should be never called. */
439	zgp->orphan = vdev_geom_taste_orphan;
440	zcp = g_new_consumer(zgp);
441
442	*configs = NULL;
443	*count = 0;
444	pool_guid = 0;
445	LIST_FOREACH(mp, &g_classes, class) {
446		if (mp == &zfs_vdev_class)
447			continue;
448		LIST_FOREACH(gp, &mp->geom, geom) {
449			if (gp->flags & G_GEOM_WITHER)
450				continue;
451			LIST_FOREACH(pp, &gp->provider, provider) {
452				if (pp->flags & G_PF_WITHER)
453					continue;
454				if (vdev_geom_attach_taster(zcp, pp) != 0)
455					continue;
456				g_topology_unlock();
457				error = vdev_geom_read_config(zcp, &vdev_cfg);
458				g_topology_lock();
459				vdev_geom_detach_taster(zcp);
460				if (error)
461					continue;
462				ZFS_LOG(1, "successfully read vdev config");
463
464				process_vdev_config(configs, count,
465				    vdev_cfg, name, &pool_guid);
466			}
467		}
468	}
469
470	g_destroy_consumer(zcp);
471	g_destroy_geom(zgp);
472	g_topology_unlock();
473	PICKUP_GIANT();
474
475	return (*count > 0 ? 0 : ENOENT);
476}
477
478static void
479vdev_geom_read_guids(struct g_consumer *cp, uint64_t *pguid, uint64_t *vguid)
480{
481	nvlist_t *config;
482
483	g_topology_assert_not();
484
485	*pguid = 0;
486	*vguid = 0;
487	if (vdev_geom_read_config(cp, &config) == 0) {
488		nvlist_get_guids(config, pguid, vguid);
489		nvlist_free(config);
490	}
491}
492
493static struct g_consumer *
494vdev_geom_attach_by_guids(uint64_t pool_guid, uint64_t vdev_guid)
495{
496	struct g_class *mp;
497	struct g_geom *gp, *zgp;
498	struct g_provider *pp;
499	struct g_consumer *cp, *zcp;
500	uint64_t pguid, vguid;
501
502	g_topology_assert();
503
504	zgp = g_new_geomf(&zfs_vdev_class, "zfs::vdev::taste");
505	/* This orphan function should be never called. */
506	zgp->orphan = vdev_geom_taste_orphan;
507	zcp = g_new_consumer(zgp);
508
509	cp = NULL;
510	LIST_FOREACH(mp, &g_classes, class) {
511		if (mp == &zfs_vdev_class)
512			continue;
513		LIST_FOREACH(gp, &mp->geom, geom) {
514			if (gp->flags & G_GEOM_WITHER)
515				continue;
516			LIST_FOREACH(pp, &gp->provider, provider) {
517				if (vdev_geom_attach_taster(zcp, pp) != 0)
518					continue;
519				g_topology_unlock();
520				vdev_geom_read_guids(zcp, &pguid, &vguid);
521				g_topology_lock();
522				vdev_geom_detach_taster(zcp);
523				/*
524				 * Check that the label's vdev guid matches the
525				 * desired guid.  If the label has a pool guid,
526				 * check that it matches too. (Inactive spares
527				 * and L2ARCs do not have any pool guid in the
528				 * label.)
529				*/
530				if ((pguid != 0 &&
531				     pguid != pool_guid) ||
532				    vguid != vdev_guid)
533					continue;
534				cp = vdev_geom_attach(pp);
535				if (cp == NULL) {
536					printf("ZFS WARNING: Unable to "
537					    "attach to %s.\n", pp->name);
538					continue;
539				}
540				break;
541			}
542			if (cp != NULL)
543				break;
544		}
545		if (cp != NULL)
546			break;
547	}
548end:
549	g_destroy_consumer(zcp);
550	g_destroy_geom(zgp);
551	return (cp);
552}
553
554static struct g_consumer *
555vdev_geom_open_by_guids(vdev_t *vd)
556{
557	struct g_consumer *cp;
558	char *buf;
559	size_t len;
560
561	g_topology_assert();
562
563	ZFS_LOG(1, "Searching by guid [%ju].", (uintmax_t)vd->vdev_guid);
564	cp = vdev_geom_attach_by_guids(spa_guid(vd->vdev_spa), vd->vdev_guid);
565	if (cp != NULL) {
566		len = strlen(cp->provider->name) + strlen("/dev/") + 1;
567		buf = kmem_alloc(len, KM_SLEEP);
568
569		snprintf(buf, len, "/dev/%s", cp->provider->name);
570		spa_strfree(vd->vdev_path);
571		vd->vdev_path = buf;
572
573		ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.",
574		    (uintmax_t)spa_guid(vd->vdev_spa),
575		    (uintmax_t)vd->vdev_guid, vd->vdev_path);
576	} else {
577		ZFS_LOG(1, "Search by guid [%ju:%ju] failed.",
578		    (uintmax_t)spa_guid(vd->vdev_spa),
579		    (uintmax_t)vd->vdev_guid);
580	}
581
582	return (cp);
583}
584
585static struct g_consumer *
586vdev_geom_open_by_path(vdev_t *vd, int check_guid)
587{
588	struct g_provider *pp;
589	struct g_consumer *cp;
590	uint64_t pguid, vguid;
591
592	g_topology_assert();
593
594	cp = NULL;
595	pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1);
596	if (pp != NULL) {
597		ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
598		cp = vdev_geom_attach(pp);
599		if (cp != NULL && check_guid && ISP2(pp->sectorsize) &&
600		    pp->sectorsize <= VDEV_PAD_SIZE) {
601			g_topology_unlock();
602			vdev_geom_read_guids(cp, &pguid, &vguid);
603			g_topology_lock();
604			if (pguid != spa_guid(vd->vdev_spa) ||
605			    vguid != vd->vdev_guid) {
606				vdev_geom_detach(cp, 0);
607				cp = NULL;
608				ZFS_LOG(1, "guid mismatch for provider %s: "
609				    "%ju:%ju != %ju:%ju.", vd->vdev_path,
610				    (uintmax_t)spa_guid(vd->vdev_spa),
611				    (uintmax_t)vd->vdev_guid,
612				    (uintmax_t)pguid, (uintmax_t)vguid);
613			} else {
614				ZFS_LOG(1, "guid match for provider %s.",
615				    vd->vdev_path);
616			}
617		}
618	}
619
620	return (cp);
621}
622
623static int
624vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
625    uint64_t *logical_ashift, uint64_t *physical_ashift)
626{
627	struct g_provider *pp;
628	struct g_consumer *cp;
629	size_t bufsize;
630	int error;
631
632	/*
633	 * We must have a pathname, and it must be absolute.
634	 */
635	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
636		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
637		return (EINVAL);
638	}
639
640	vd->vdev_tsd = NULL;
641
642	DROP_GIANT();
643	g_topology_lock();
644	error = 0;
645
646	if (vd->vdev_spa->spa_splitting_newspa ||
647	    (vd->vdev_prevstate == VDEV_STATE_UNKNOWN &&
648	     vd->vdev_spa->spa_load_state == SPA_LOAD_NONE)) {
649		/*
650		 * We are dealing with a vdev that hasn't been previously
651		 * opened (since boot), and we are not loading an
652		 * existing pool configuration.  This looks like a
653		 * vdev add operation to a new or existing pool.
654		 * Assume the user knows what he/she is doing and find
655		 * GEOM provider by its name, ignoring GUID mismatches.
656		 *
657		 * XXPOLICY: It would be safer to only allow a device
658		 *           that is unlabeled or labeled but missing
659		 *           GUID information to be opened in this fashion,
660		 *           unless we are doing a split, in which case we
661		 *           should allow any guid.
662		 */
663		cp = vdev_geom_open_by_path(vd, 0);
664	} else {
665		/*
666		 * Try using the recorded path for this device, but only
667		 * accept it if its label data contains the expected GUIDs.
668		 */
669		cp = vdev_geom_open_by_path(vd, 1);
670		if (cp == NULL) {
671			/*
672			 * The device at vd->vdev_path doesn't have the
673			 * expected GUIDs. The disks might have merely
674			 * moved around so try all other GEOM providers
675			 * to find one with the right GUIDs.
676			 */
677			cp = vdev_geom_open_by_guids(vd);
678		}
679	}
680
681	if (cp == NULL) {
682		ZFS_LOG(1, "Provider %s not found.", vd->vdev_path);
683		error = ENOENT;
684	} else if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
685	    !ISP2(cp->provider->sectorsize)) {
686		ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
687		    vd->vdev_path);
688		vdev_geom_detach(cp, 0);
689		error = EINVAL;
690		cp = NULL;
691	} else if (cp->acw == 0 && (spa_mode(vd->vdev_spa) & FWRITE) != 0) {
692		int i;
693
694		for (i = 0; i < 5; i++) {
695			error = g_access(cp, 0, 1, 0);
696			if (error == 0)
697				break;
698			g_topology_unlock();
699			tsleep(vd, 0, "vdev", hz / 2);
700			g_topology_lock();
701		}
702		if (error != 0) {
703			printf("ZFS WARNING: Unable to open %s for writing (error=%d).\n",
704			    vd->vdev_path, error);
705			vdev_geom_detach(cp, 0);
706			cp = NULL;
707		}
708	}
709	g_topology_unlock();
710	PICKUP_GIANT();
711	if (cp == NULL) {
712		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
713		return (error);
714	}
715
716	cp->private = vd;
717	vd->vdev_tsd = cp;
718	pp = cp->provider;
719
720	/*
721	 * Determine the actual size of the device.
722	 */
723	*max_psize = *psize = pp->mediasize;
724
725	/*
726	 * Determine the device's minimum transfer size and preferred
727	 * transfer size.
728	 */
729	*logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
730	*physical_ashift = 0;
731	if (pp->stripesize)
732		*physical_ashift = highbit(pp->stripesize) - 1;
733
734	/*
735	 * Clear the nowritecache settings, so that on a vdev_reopen()
736	 * we will try again.
737	 */
738	vd->vdev_nowritecache = B_FALSE;
739
740	if (vd->vdev_physpath != NULL)
741		spa_strfree(vd->vdev_physpath);
742	bufsize = sizeof("/dev/") + strlen(pp->name);
743	vd->vdev_physpath = kmem_alloc(bufsize, KM_SLEEP);
744	snprintf(vd->vdev_physpath, bufsize, "/dev/%s", pp->name);
745
746	/*
747	 * Determine the device's rotation rate.
748	 */
749	vdev_geom_set_rotation_rate(vd, cp);
750
751	return (0);
752}
753
754static void
755vdev_geom_close(vdev_t *vd)
756{
757	struct g_consumer *cp;
758
759	cp = vd->vdev_tsd;
760	if (cp == NULL)
761		return;
762	vd->vdev_tsd = NULL;
763	vd->vdev_delayed_close = B_FALSE;
764	cp->private = NULL;	/* XXX locking */
765	g_post_event(vdev_geom_detach, cp, M_WAITOK, NULL);
766}
767
768static void
769vdev_geom_io_intr(struct bio *bp)
770{
771	vdev_t *vd;
772	zio_t *zio;
773
774	zio = bp->bio_caller1;
775	vd = zio->io_vd;
776	zio->io_error = bp->bio_error;
777	if (zio->io_error == 0 && bp->bio_resid != 0)
778		zio->io_error = SET_ERROR(EIO);
779
780	switch(zio->io_error) {
781	case ENOTSUP:
782		/*
783		 * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know
784		 * that future attempts will never succeed. In this case
785		 * we set a persistent flag so that we don't bother with
786		 * requests in the future.
787		 */
788		switch(bp->bio_cmd) {
789		case BIO_FLUSH:
790			vd->vdev_nowritecache = B_TRUE;
791			break;
792		case BIO_DELETE:
793			vd->vdev_notrim = B_TRUE;
794			break;
795		}
796		break;
797	case ENXIO:
798		if (!vd->vdev_remove_wanted) {
799			/*
800			 * If provider's error is set we assume it is being
801			 * removed.
802			 */
803			if (bp->bio_to->error != 0) {
804				vd->vdev_remove_wanted = B_TRUE;
805				spa_async_request(zio->io_spa,
806				    SPA_ASYNC_REMOVE);
807			} else if (!vd->vdev_delayed_close) {
808				vd->vdev_delayed_close = B_TRUE;
809			}
810		}
811		break;
812	}
813	g_destroy_bio(bp);
814	zio_interrupt(zio);
815}
816
817static void
818vdev_geom_io_start(zio_t *zio)
819{
820	vdev_t *vd;
821	struct g_consumer *cp;
822	struct bio *bp;
823	int error;
824
825	vd = zio->io_vd;
826
827	switch (zio->io_type) {
828	case ZIO_TYPE_IOCTL:
829		/* XXPOLICY */
830		if (!vdev_readable(vd)) {
831			zio->io_error = SET_ERROR(ENXIO);
832			zio_interrupt(zio);
833			return;
834		} else {
835			switch (zio->io_cmd) {
836			case DKIOCFLUSHWRITECACHE:
837				if (zfs_nocacheflush || vdev_geom_bio_flush_disable)
838					break;
839				if (vd->vdev_nowritecache) {
840					zio->io_error = SET_ERROR(ENOTSUP);
841					break;
842				}
843				goto sendreq;
844			default:
845				zio->io_error = SET_ERROR(ENOTSUP);
846			}
847		}
848
849		zio_execute(zio);
850		return;
851	case ZIO_TYPE_FREE:
852		if (vd->vdev_notrim) {
853			zio->io_error = SET_ERROR(ENOTSUP);
854		} else if (!vdev_geom_bio_delete_disable) {
855			goto sendreq;
856		}
857		zio_execute(zio);
858		return;
859	}
860sendreq:
861	ASSERT(zio->io_type == ZIO_TYPE_READ ||
862	    zio->io_type == ZIO_TYPE_WRITE ||
863	    zio->io_type == ZIO_TYPE_FREE ||
864	    zio->io_type == ZIO_TYPE_IOCTL);
865
866	cp = vd->vdev_tsd;
867	if (cp == NULL) {
868		zio->io_error = SET_ERROR(ENXIO);
869		zio_interrupt(zio);
870		return;
871	}
872	bp = g_alloc_bio();
873	bp->bio_caller1 = zio;
874	switch (zio->io_type) {
875	case ZIO_TYPE_READ:
876	case ZIO_TYPE_WRITE:
877		bp->bio_cmd = zio->io_type == ZIO_TYPE_READ ? BIO_READ : BIO_WRITE;
878		bp->bio_data = zio->io_data;
879		bp->bio_offset = zio->io_offset;
880		bp->bio_length = zio->io_size;
881		break;
882	case ZIO_TYPE_FREE:
883		bp->bio_cmd = BIO_DELETE;
884		bp->bio_data = NULL;
885		bp->bio_offset = zio->io_offset;
886		bp->bio_length = zio->io_size;
887		break;
888	case ZIO_TYPE_IOCTL:
889		bp->bio_cmd = BIO_FLUSH;
890		bp->bio_flags |= BIO_ORDERED;
891		bp->bio_data = NULL;
892		bp->bio_offset = cp->provider->mediasize;
893		bp->bio_length = 0;
894		break;
895	}
896	bp->bio_done = vdev_geom_io_intr;
897
898	g_io_request(bp, cp);
899}
900
901static void
902vdev_geom_io_done(zio_t *zio)
903{
904}
905
906static void
907vdev_geom_hold(vdev_t *vd)
908{
909}
910
911static void
912vdev_geom_rele(vdev_t *vd)
913{
914}
915
916vdev_ops_t vdev_geom_ops = {
917	vdev_geom_open,
918	vdev_geom_close,
919	vdev_default_asize,
920	vdev_geom_io_start,
921	vdev_geom_io_done,
922	NULL,
923	vdev_geom_hold,
924	vdev_geom_rele,
925	VDEV_TYPE_DISK,		/* name of this vdev type */
926	B_TRUE			/* leaf vdev */
927};
928