zvol.c revision 265677
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 *
24 * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
25 * All rights reserved.
26 * Copyright (c) 2013 by Delphix. All rights reserved.
27 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
28 *
29 * Portions Copyright 2010 Robert Milkowski
30 *
31 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
32 */
33
34/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
35
36/*
37 * ZFS volume emulation driver.
38 *
39 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
40 * Volumes are accessed through the symbolic links named:
41 *
42 * /dev/zvol/dsk/<pool_name>/<dataset_name>
43 * /dev/zvol/rdsk/<pool_name>/<dataset_name>
44 *
45 * These links are created by the /dev filesystem (sdev_zvolops.c).
46 * Volumes are persistent through reboot.  No user command needs to be
47 * run before opening and using a device.
48 *
49 * FreeBSD notes.
50 * On FreeBSD, ZVOLs are simply GEOM providers, like any other storage
51 * device in the system.
52 */
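/*
 * Illustrative example (not part of this file): a volume created with
 * "zfs create -V 10g tank/vol" shows up on FreeBSD as the GEOM provider
 * "zvol/tank/vol" and can be opened like any other disk device, e.g.:
 *
 *	int fd = open("/dev/zvol/tank/vol", O_RDWR);
 */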
53
54#include <sys/types.h>
55#include <sys/param.h>
56#include <sys/kernel.h>
57#include <sys/errno.h>
58#include <sys/uio.h>
59#include <sys/bio.h>
60#include <sys/buf.h>
61#include <sys/kmem.h>
62#include <sys/conf.h>
63#include <sys/cmn_err.h>
64#include <sys/stat.h>
65#include <sys/zap.h>
66#include <sys/spa.h>
67#include <sys/spa_impl.h>
68#include <sys/zio.h>
69#include <sys/dmu_traverse.h>
70#include <sys/dnode.h>
71#include <sys/dsl_dataset.h>
72#include <sys/dsl_prop.h>
73#include <sys/dkio.h>
74#include <sys/byteorder.h>
75#include <sys/sunddi.h>
76#include <sys/dirent.h>
77#include <sys/policy.h>
78#include <sys/fs/zfs.h>
79#include <sys/zfs_ioctl.h>
80#include <sys/zil.h>
81#include <sys/refcount.h>
82#include <sys/zfs_znode.h>
83#include <sys/zfs_rlock.h>
84#include <sys/vdev_impl.h>
85#include <sys/vdev_raidz.h>
86#include <sys/zvol.h>
87#include <sys/zil_impl.h>
88#include <sys/dbuf.h>
89#include <sys/dmu_tx.h>
90#include <sys/zfeature.h>
91#include <sys/zio_checksum.h>
92
93#include <geom/geom.h>
94
95#include "zfs_namecheck.h"
96
97struct g_class zfs_zvol_class = {
98	.name = "ZFS::ZVOL",
99	.version = G_VERSION,
100};
101
102DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
103
104void *zfsdev_state;
105static char *zvol_tag = "zvol_tag";
106
107#define	ZVOL_DUMPSIZE		"dumpsize"
108
109/*
110 * The spa_namespace_lock protects the zfsdev_state structure from being
111 * modified while it's being used, e.g. an open that comes in before a
112 * create finishes.  It also protects temporary opens of the dataset so that,
113 * e.g., an open doesn't get a spurious EBUSY.
114 */
115static uint32_t zvol_minors;
116
117typedef struct zvol_extent {
118	list_node_t	ze_node;
119	dva_t		ze_dva;		/* dva associated with this extent */
120	uint64_t	ze_nblks;	/* number of blocks in extent */
121} zvol_extent_t;
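/*
 * Illustrative example (assuming the default 8K volblocksize): a 1GB zvol
 * preallocated contiguously on a single vdev would be described by one
 * zvol_extent_t with ze_nblks == 131072 (1GB / 8K); any discontiguity in
 * the DVAs starts a new extent (see zvol_map_block() below).
 */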
122
123/*
124 * The in-core state of each volume.
125 */
126typedef struct zvol_state {
127	char		zv_name[MAXPATHLEN]; /* pool/dd name */
128	uint64_t	zv_volsize;	/* amount of space we advertise */
129	uint64_t	zv_volblocksize; /* volume block size */
130	struct g_provider *zv_provider;	/* GEOM provider */
131	uint8_t		zv_min_bs;	/* minimum addressable block shift */
132	uint8_t		zv_flags;	/* readonly, dumpified, etc. */
133	objset_t	*zv_objset;	/* objset handle */
134	uint32_t	zv_total_opens;	/* total open count */
135	zilog_t		*zv_zilog;	/* ZIL handle */
136	list_t		zv_extents;	/* List of extents for dump */
137	znode_t		zv_znode;	/* for range locking */
138	dmu_buf_t	*zv_dbuf;	/* bonus handle */
139	int		zv_state;
140	struct bio_queue_head zv_queue;
141	struct mtx	zv_queue_mtx;	/* zv_queue mutex */
142} zvol_state_t;
143
144/*
145 * zvol specific flags
146 */
147#define	ZVOL_RDONLY	0x1
148#define	ZVOL_DUMPIFIED	0x2
149#define	ZVOL_EXCL	0x4
150#define	ZVOL_WCE	0x8
151
152/*
153 * zvol maximum transfer in one DMU tx.
154 */
155int zvol_maxphys = DMU_MAX_ACCESS/2;
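/*
 * DMU_MAX_ACCESS/2 works out to 5MB here (see the "(5MB)" note in the
 * zvol_log_write() comment below), so a single zvol transfer moves at
 * most 5MB per DMU transaction.
 */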
156
157extern int zfs_set_prop_nvlist(const char *, zprop_source_t,
158    nvlist_t *, nvlist_t *);
159static void zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off,
160    uint64_t len, boolean_t sync);
161static int zvol_remove_zv(zvol_state_t *);
162static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio);
163static int zvol_dumpify(zvol_state_t *zv);
164static int zvol_dump_fini(zvol_state_t *zv);
165static int zvol_dump_init(zvol_state_t *zv, boolean_t resize);
166
167static zvol_state_t *zvol_geom_create(const char *name);
168static void zvol_geom_run(zvol_state_t *zv);
169static void zvol_geom_destroy(zvol_state_t *zv);
170static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
171static void zvol_geom_start(struct bio *bp);
172static void zvol_geom_worker(void *arg);
173
174static void
175zvol_size_changed(zvol_state_t *zv)
176{
177#ifdef sun
178	dev_t dev = makedevice(maj, min);
179
180	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
181	    "Size", volsize) == DDI_SUCCESS);
182	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
183	    "Nblocks", lbtodb(volsize)) == DDI_SUCCESS);
184
185	/* Notify specfs to invalidate the cached size */
186	spec_size_invalidate(dev, VBLK);
187	spec_size_invalidate(dev, VCHR);
188#else	/* !sun */
189	struct g_provider *pp;
190
191	pp = zv->zv_provider;
192	if (pp == NULL)
193		return;
194	g_topology_lock();
195	g_resize_provider(pp, zv->zv_volsize);
196	g_topology_unlock();
197#endif	/* !sun */
198}
199
200int
201zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
202{
203	if (volsize == 0)
204		return (SET_ERROR(EINVAL));
205
206	if (volsize % blocksize != 0)
207		return (SET_ERROR(EINVAL));
208
209#ifdef _ILP32
210	if (volsize - 1 > SPEC_MAXOFFSET_T)
211		return (SET_ERROR(EOVERFLOW));
212#endif
213	return (0);
214}
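/*
 * Illustrative example: with an 8K volblocksize, a 10GB volsize passes
 * (10GB is a multiple of 8K), while 10GB + 4K fails with EINVAL because
 * it is not a whole number of blocks.
 */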
215
216int
217zvol_check_volblocksize(uint64_t volblocksize)
218{
219	if (volblocksize < SPA_MINBLOCKSIZE ||
220	    volblocksize > SPA_MAXBLOCKSIZE ||
221	    !ISP2(volblocksize))
222		return (SET_ERROR(EDOM));
223
224	return (0);
225}
226
227int
228zvol_get_stats(objset_t *os, nvlist_t *nv)
229{
230	int error;
231	dmu_object_info_t doi;
232	uint64_t val;
233
234	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
235	if (error)
236		return (error);
237
238	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);
239
240	error = dmu_object_info(os, ZVOL_OBJ, &doi);
241
242	if (error == 0) {
243		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
244		    doi.doi_data_block_size);
245	}
246
247	return (error);
248}
249
250static zvol_state_t *
251zvol_minor_lookup(const char *name)
252{
253	struct g_provider *pp;
254	struct g_geom *gp;
255	zvol_state_t *zv = NULL;
256
257	ASSERT(MUTEX_HELD(&spa_namespace_lock));
258
259	g_topology_lock();
260	LIST_FOREACH(gp, &zfs_zvol_class.geom, geom) {
261		pp = LIST_FIRST(&gp->provider);
262		if (pp == NULL)
263			continue;
264		zv = pp->private;
265		if (zv == NULL)
266			continue;
267		if (strcmp(zv->zv_name, name) == 0)
268			break;
269	}
270	g_topology_unlock();
271
272	return (gp != NULL ? zv : NULL);
273}
274
275/* extent mapping arg */
276struct maparg {
277	zvol_state_t	*ma_zv;
278	uint64_t	ma_blks;
279};
280
281/*ARGSUSED*/
282static int
283zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
284    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
285{
286	struct maparg *ma = arg;
287	zvol_extent_t *ze;
288	int bs = ma->ma_zv->zv_volblocksize;
289
290	if (BP_IS_HOLE(bp) ||
291	    zb->zb_object != ZVOL_OBJ || zb->zb_level != 0)
292		return (0);
293
294	VERIFY3U(ma->ma_blks, ==, zb->zb_blkid);
295	ma->ma_blks++;
296
297	/* Abort immediately if we have encountered gang blocks */
298	if (BP_IS_GANG(bp))
299		return (SET_ERROR(EFRAGS));
300
301	/*
302	 * See if the block is at the end of the previous extent.
303	 */
304	ze = list_tail(&ma->ma_zv->zv_extents);
305	if (ze &&
306	    DVA_GET_VDEV(BP_IDENTITY(bp)) == DVA_GET_VDEV(&ze->ze_dva) &&
307	    DVA_GET_OFFSET(BP_IDENTITY(bp)) ==
308	    DVA_GET_OFFSET(&ze->ze_dva) + ze->ze_nblks * bs) {
309		ze->ze_nblks++;
310		return (0);
311	}
312
313	dprintf_bp(bp, "%s", "next blkptr:");
314
315	/* start a new extent */
316	ze = kmem_zalloc(sizeof (zvol_extent_t), KM_SLEEP);
317	ze->ze_dva = bp->blk_dva[0];	/* structure assignment */
318	ze->ze_nblks = 1;
319	list_insert_tail(&ma->ma_zv->zv_extents, ze);
320	return (0);
321}
322
323static void
324zvol_free_extents(zvol_state_t *zv)
325{
326	zvol_extent_t *ze;
327
328	while ((ze = list_head(&zv->zv_extents)) != NULL) {
329		list_remove(&zv->zv_extents, ze);
330		kmem_free(ze, sizeof (zvol_extent_t));
331	}
332}
333
334static int
335zvol_get_lbas(zvol_state_t *zv)
336{
337	objset_t *os = zv->zv_objset;
338	struct maparg	ma;
339	int		err;
340
341	ma.ma_zv = zv;
342	ma.ma_blks = 0;
343	zvol_free_extents(zv);
344
345	/* commit any in-flight changes before traversing the dataset */
346	txg_wait_synced(dmu_objset_pool(os), 0);
347	err = traverse_dataset(dmu_objset_ds(os), 0,
348	    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, zvol_map_block, &ma);
349	if (err || ma.ma_blks != (zv->zv_volsize / zv->zv_volblocksize)) {
350		zvol_free_extents(zv);
351		return (err ? err : EIO);
352	}
353
354	return (0);
355}
356
357/* ARGSUSED */
358void
359zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
360{
361	zfs_creat_t *zct = arg;
362	nvlist_t *nvprops = zct->zct_props;
363	int error;
364	uint64_t volblocksize, volsize;
365
366	VERIFY(nvlist_lookup_uint64(nvprops,
367	    zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
368	if (nvlist_lookup_uint64(nvprops,
369	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
370		volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
371
372	/*
373	 * These properties must be removed from the list so the generic
374	 * property setting step won't apply to them.
375	 */
376	VERIFY(nvlist_remove_all(nvprops,
377	    zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
378	(void) nvlist_remove_all(nvprops,
379	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));
380
381	error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
382	    DMU_OT_NONE, 0, tx);
383	ASSERT(error == 0);
384
385	error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
386	    DMU_OT_NONE, 0, tx);
387	ASSERT(error == 0);
388
389	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
390	ASSERT(error == 0);
391}
392
393/*
394 * Replay a TX_TRUNCATE ZIL transaction if asked.  TX_TRUNCATE is how we
395 * implement DKIOCFREE/free-long-range.
396 */
397static int
398zvol_replay_truncate(zvol_state_t *zv, lr_truncate_t *lr, boolean_t byteswap)
399{
400	uint64_t offset, length;
401
402	if (byteswap)
403		byteswap_uint64_array(lr, sizeof (*lr));
404
405	offset = lr->lr_offset;
406	length = lr->lr_length;
407
408	return (dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length));
409}
410
411/*
412 * Replay a TX_WRITE ZIL transaction that didn't get committed
413 * after a system failure.
414 */
415static int
416zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
417{
418	objset_t *os = zv->zv_objset;
419	char *data = (char *)(lr + 1);	/* data follows lr_write_t */
420	uint64_t offset, length;
421	dmu_tx_t *tx;
422	int error;
423
424	if (byteswap)
425		byteswap_uint64_array(lr, sizeof (*lr));
426
427	offset = lr->lr_offset;
428	length = lr->lr_length;
429
430	/* If it's a dmu_sync() block, write the whole block */
431	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
432		uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
433		if (length < blocksize) {
434			offset -= offset % blocksize;
435			length = blocksize;
436		}
437	}
438
439	tx = dmu_tx_create(os);
440	dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length);
441	error = dmu_tx_assign(tx, TXG_WAIT);
442	if (error) {
443		dmu_tx_abort(tx);
444	} else {
445		dmu_write(os, ZVOL_OBJ, offset, length, data, tx);
446		dmu_tx_commit(tx);
447	}
448
449	return (error);
450}
451
452/* ARGSUSED */
453static int
454zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap)
455{
456	return (SET_ERROR(ENOTSUP));
457}
458
459/*
460 * Callback vectors for replaying records.
461 * Only TX_WRITE and TX_TRUNCATE are needed for zvol.
462 */
463zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
464	zvol_replay_err,	/* 0 no such transaction type */
465	zvol_replay_err,	/* TX_CREATE */
466	zvol_replay_err,	/* TX_MKDIR */
467	zvol_replay_err,	/* TX_MKXATTR */
468	zvol_replay_err,	/* TX_SYMLINK */
469	zvol_replay_err,	/* TX_REMOVE */
470	zvol_replay_err,	/* TX_RMDIR */
471	zvol_replay_err,	/* TX_LINK */
472	zvol_replay_err,	/* TX_RENAME */
473	zvol_replay_write,	/* TX_WRITE */
474	zvol_replay_truncate,	/* TX_TRUNCATE */
475	zvol_replay_err,	/* TX_SETATTR */
476	zvol_replay_err,	/* TX_ACL */
477	zvol_replay_err,	/* TX_CREATE_ACL */
478	zvol_replay_err,	/* TX_CREATE_ATTR */
479	zvol_replay_err,	/* TX_CREATE_ACL_ATTR */
480	zvol_replay_err,	/* TX_MKDIR_ACL */
481	zvol_replay_err,	/* TX_MKDIR_ATTR */
482	zvol_replay_err,	/* TX_MKDIR_ACL_ATTR */
483	zvol_replay_err,	/* TX_WRITE2 */
484};
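/*
 * Conceptually, zil_replay(os, zv, zvol_replay_vector) dispatches each
 * log record through this table by transaction type, roughly:
 *
 *	error = zvol_replay_vector[lr->lrc_txtype](zv, lr, byteswap);
 *
 * so TX_WRITE records reach zvol_replay_write(), TX_TRUNCATE records
 * reach zvol_replay_truncate(), and everything else gets ENOTSUP from
 * zvol_replay_err().
 */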
485
486#ifdef sun
487int
488zvol_name2minor(const char *name, minor_t *minor)
489{
490	zvol_state_t *zv;
491
492	mutex_enter(&spa_namespace_lock);
493	zv = zvol_minor_lookup(name);
494	if (minor && zv)
495		*minor = zv->zv_minor;
496	mutex_exit(&spa_namespace_lock);
497	return (zv ? 0 : -1);
498}
499#endif	/* sun */
500
501/*
502 * Create a minor node (plus a whole lot more) for the specified volume.
503 */
504int
505zvol_create_minor(const char *name)
506{
507	zfs_soft_state_t *zs;
508	zvol_state_t *zv;
509	objset_t *os;
510	dmu_object_info_t doi;
511	uint64_t volsize;
512	int error;
513
514	ZFS_LOG(1, "Creating ZVOL %s...", name);
515
516	mutex_enter(&spa_namespace_lock);
517
518	if (zvol_minor_lookup(name) != NULL) {
519		mutex_exit(&spa_namespace_lock);
520		return (SET_ERROR(EEXIST));
521	}
522
523	/* lie and say we're read-only */
524	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os);
525
526	if (error) {
527		mutex_exit(&spa_namespace_lock);
528		return (error);
529	}
530
531#ifdef sun
532	if ((minor = zfsdev_minor_alloc()) == 0) {
533		dmu_objset_disown(os, FTAG);
534		mutex_exit(&spa_namespace_lock);
535		return (SET_ERROR(ENXIO));
536	}
537
538	if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) {
539		dmu_objset_disown(os, FTAG);
540		mutex_exit(&spa_namespace_lock);
541		return (SET_ERROR(EAGAIN));
542	}
543	(void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME,
544	    (char *)name);
545
546	(void) snprintf(chrbuf, sizeof (chrbuf), "%u,raw", minor);
547
548	if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR,
549	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
550		ddi_soft_state_free(zfsdev_state, minor);
551		dmu_objset_disown(os, FTAG);
552		mutex_exit(&spa_namespace_lock);
553		return (SET_ERROR(EAGAIN));
554	}
555
556	(void) snprintf(blkbuf, sizeof (blkbuf), "%u", minor);
557
558	if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK,
559	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
560		ddi_remove_minor_node(zfs_dip, chrbuf);
561		ddi_soft_state_free(zfsdev_state, minor);
562		dmu_objset_disown(os, FTAG);
563		mutex_exit(&spa_namespace_lock);
564		return (SET_ERROR(EAGAIN));
565	}
566
567	zs = ddi_get_soft_state(zfsdev_state, minor);
568	zs->zss_type = ZSST_ZVOL;
569	zv = zs->zss_data = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
570#else	/* !sun */
571
572	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
573	if (error) {
574		ASSERT(error == 0);
575		dmu_objset_disown(os, FTAG);	/* must match the tag passed to dmu_objset_own() */
576		mutex_exit(&spa_namespace_lock);
577		return (error);
578	}
579
580	DROP_GIANT();
581	g_topology_lock();
582	zv = zvol_geom_create(name);
583	zv->zv_volsize = volsize;
584	zv->zv_provider->mediasize = zv->zv_volsize;
585
586#endif	/* !sun */
587
588	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
589	zv->zv_min_bs = DEV_BSHIFT;
590	zv->zv_objset = os;
591	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
592		zv->zv_flags |= ZVOL_RDONLY;
593	mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
594	avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
595	    sizeof (rl_t), offsetof(rl_t, r_node));
596	list_create(&zv->zv_extents, sizeof (zvol_extent_t),
597	    offsetof(zvol_extent_t, ze_node));
598	/* get and cache the blocksize */
599	error = dmu_object_info(os, ZVOL_OBJ, &doi);
600	ASSERT(error == 0);
601	zv->zv_volblocksize = doi.doi_data_block_size;
602
603	if (spa_writeable(dmu_objset_spa(os))) {
604		if (zil_replay_disable)
605			zil_destroy(dmu_objset_zil(os), B_FALSE);
606		else
607			zil_replay(os, zv, zvol_replay_vector);
608	}
609	dmu_objset_disown(os, FTAG);
610	zv->zv_objset = NULL;
611
612	zvol_minors++;
613
614	mutex_exit(&spa_namespace_lock);
615
616	zvol_geom_run(zv);
617
618	g_topology_unlock();
619	PICKUP_GIANT();
620
621	ZFS_LOG(1, "ZVOL %s created.", name);
622
623	return (0);
624}
625
626/*
627 * Remove minor node for the specified volume.
628 */
629static int
630zvol_remove_zv(zvol_state_t *zv)
631{
632#ifdef sun
633	minor_t minor = zv->zv_minor;
634#endif
635
636	ASSERT(MUTEX_HELD(&spa_namespace_lock));
637	if (zv->zv_total_opens != 0)
638		return (SET_ERROR(EBUSY));
639
640	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
641
642#ifdef sun
643	(void) snprintf(nmbuf, sizeof (nmbuf), "%u,raw", minor);
644	ddi_remove_minor_node(zfs_dip, nmbuf);
645#endif	/* sun */
646
647	avl_destroy(&zv->zv_znode.z_range_avl);
648	mutex_destroy(&zv->zv_znode.z_range_lock);
649
650	zvol_geom_destroy(zv);
651
652	zvol_minors--;
653	return (0);
654}
655
656int
657zvol_remove_minor(const char *name)
658{
659	zvol_state_t *zv;
660	int rc;
661
662	mutex_enter(&spa_namespace_lock);
663	if ((zv = zvol_minor_lookup(name)) == NULL) {
664		mutex_exit(&spa_namespace_lock);
665		return (SET_ERROR(ENXIO));
666	}
667	g_topology_lock();
668	rc = zvol_remove_zv(zv);
669	g_topology_unlock();
670	mutex_exit(&spa_namespace_lock);
671	return (rc);
672}
673
674int
675zvol_first_open(zvol_state_t *zv)
676{
677	objset_t *os;
678	uint64_t volsize;
679	int error;
680	uint64_t readonly;
681
682	/* lie and say we're read-only */
683	error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, B_TRUE,
684	    zvol_tag, &os);
685	if (error)
686		return (error);
687
688	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
689	if (error) {
690		ASSERT(error == 0);
691		dmu_objset_disown(os, zvol_tag);
692		return (error);
693	}
694	zv->zv_objset = os;
695	error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf);
696	if (error) {
697		dmu_objset_disown(os, zvol_tag);
698		return (error);
699	}
700	zv->zv_volsize = volsize;
701	zv->zv_zilog = zil_open(os, zvol_get_data);
702	zvol_size_changed(zv);
703
704	VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &readonly,
705	    NULL) == 0);
706	if (readonly || dmu_objset_is_snapshot(os) ||
707	    !spa_writeable(dmu_objset_spa(os)))
708		zv->zv_flags |= ZVOL_RDONLY;
709	else
710		zv->zv_flags &= ~ZVOL_RDONLY;
711	return (error);
712}
713
714void
715zvol_last_close(zvol_state_t *zv)
716{
717	zil_close(zv->zv_zilog);
718	zv->zv_zilog = NULL;
719
720	dmu_buf_rele(zv->zv_dbuf, zvol_tag);
721	zv->zv_dbuf = NULL;
722
723	/*
724	 * Evict cached data
725	 */
726	if (dsl_dataset_is_dirty(dmu_objset_ds(zv->zv_objset)) &&
727	    !(zv->zv_flags & ZVOL_RDONLY))
728		txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
729	dmu_objset_evict_dbufs(zv->zv_objset);
730
731	dmu_objset_disown(zv->zv_objset, zvol_tag);
732	zv->zv_objset = NULL;
733}
734
735#ifdef sun
736int
737zvol_prealloc(zvol_state_t *zv)
738{
739	objset_t *os = zv->zv_objset;
740	dmu_tx_t *tx;
741	uint64_t refd, avail, usedobjs, availobjs;
742	uint64_t resid = zv->zv_volsize;
743	uint64_t off = 0;
744
745	/* Check the space usage before attempting to allocate the space */
746	dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs);
747	if (avail < zv->zv_volsize)
748		return (SET_ERROR(ENOSPC));
749
750	/* Free old extents if they exist */
751	zvol_free_extents(zv);
752
753	while (resid != 0) {
754		int error;
755		uint64_t bytes = MIN(resid, SPA_MAXBLOCKSIZE);
756
757		tx = dmu_tx_create(os);
758		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
759		error = dmu_tx_assign(tx, TXG_WAIT);
760		if (error) {
761			dmu_tx_abort(tx);
762			(void) dmu_free_long_range(os, ZVOL_OBJ, 0, off);
763			return (error);
764		}
765		dmu_prealloc(os, ZVOL_OBJ, off, bytes, tx);
766		dmu_tx_commit(tx);
767		off += bytes;
768		resid -= bytes;
769	}
770	txg_wait_synced(dmu_objset_pool(os), 0);
771
772	return (0);
773}
774#endif	/* sun */
775
776static int
777zvol_update_volsize(objset_t *os, uint64_t volsize)
778{
779	dmu_tx_t *tx;
780	int error;
781
782	ASSERT(MUTEX_HELD(&spa_namespace_lock));
783
784	tx = dmu_tx_create(os);
785	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
786	error = dmu_tx_assign(tx, TXG_WAIT);
787	if (error) {
788		dmu_tx_abort(tx);
789		return (error);
790	}
791
792	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1,
793	    &volsize, tx);
794	dmu_tx_commit(tx);
795
796	if (error == 0)
797		error = dmu_free_long_range(os,
798		    ZVOL_OBJ, volsize, DMU_OBJECT_END);
799	return (error);
800}
801
802void
803zvol_remove_minors(const char *name)
804{
805	struct g_geom *gp, *gptmp;
806	struct g_provider *pp;
807	zvol_state_t *zv;
808	size_t namelen;
809
810	namelen = strlen(name);
811
812	DROP_GIANT();
813	mutex_enter(&spa_namespace_lock);
814	g_topology_lock();
815
816	LIST_FOREACH_SAFE(gp, &zfs_zvol_class.geom, geom, gptmp) {
817		pp = LIST_FIRST(&gp->provider);
818		if (pp == NULL)
819			continue;
820		zv = pp->private;
821		if (zv == NULL)
822			continue;
823		if (strcmp(zv->zv_name, name) == 0 ||
824		    (strncmp(zv->zv_name, name, namelen) == 0 &&
825		     zv->zv_name[namelen] == '/')) {
826			(void) zvol_remove_zv(zv);
827		}
828	}
829
830	g_topology_unlock();
831	mutex_exit(&spa_namespace_lock);
832	PICKUP_GIANT();
833}
834
835int
836zvol_set_volsize(const char *name, major_t maj, uint64_t volsize)
837{
838	zvol_state_t *zv = NULL;
839	objset_t *os;
840	int error;
841	dmu_object_info_t doi;
842	uint64_t old_volsize = 0ULL;
843	uint64_t readonly;
844
845	mutex_enter(&spa_namespace_lock);
846	zv = zvol_minor_lookup(name);
847	if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
848		mutex_exit(&spa_namespace_lock);
849		return (error);
850	}
851
852	if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 ||
853	    (error = zvol_check_volsize(volsize,
854	    doi.doi_data_block_size)) != 0)
855		goto out;
856
857	VERIFY(dsl_prop_get_integer(name, "readonly", &readonly,
858	    NULL) == 0);
859	if (readonly) {
860		error = SET_ERROR(EROFS);
861		goto out;
862	}
863
864	error = zvol_update_volsize(os, volsize);
865	/*
866	 * Reinitialize the dump area to the new size. If we
867	 * failed to resize the dump area then restore it back to
868	 * its original size.
869	 */
870	if (zv && error == 0) {
871#ifdef ZVOL_DUMP
872		if (zv->zv_flags & ZVOL_DUMPIFIED) {
873			old_volsize = zv->zv_volsize;
874			zv->zv_volsize = volsize;
875			if ((error = zvol_dumpify(zv)) != 0 ||
876			    (error = dumpvp_resize()) != 0) {
877				(void) zvol_update_volsize(os, old_volsize);
878				zv->zv_volsize = old_volsize;
879				error = zvol_dumpify(zv);
880			}
881		}
882#endif	/* ZVOL_DUMP */
883		if (error == 0) {
884			zv->zv_volsize = volsize;
885			zvol_size_changed(zv);
886		}
887	}
888
889#ifdef sun
890	/*
891	 * Generate a LUN expansion event.
892	 */
893	if (zv && error == 0) {
894		sysevent_id_t eid;
895		nvlist_t *attr;
896		char *physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
897
898		(void) snprintf(physpath, MAXPATHLEN, "%s%u", ZVOL_PSEUDO_DEV,
899		    zv->zv_minor);
900
901		VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
902		VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
903
904		(void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
905		    ESC_DEV_DLE, attr, &eid, DDI_SLEEP);
906
907		nvlist_free(attr);
908		kmem_free(physpath, MAXPATHLEN);
909	}
910#endif	/* sun */
911
912out:
913	dmu_objset_rele(os, FTAG);
914
915	mutex_exit(&spa_namespace_lock);
916
917	return (error);
918}
919
920/*ARGSUSED*/
921static int
922zvol_open(struct g_provider *pp, int flag, int count)
923{
924	zvol_state_t *zv;
925	int err = 0;
926	boolean_t locked = B_FALSE;
927
928	/*
929	 * Protect against recursively entering spa_namespace_lock
930	 * when spa_open() is used for a pool on local ZVOLs.
931	 * This is needed because we replaced the upstream zfsdev_state_lock
932	 * with spa_namespace_lock in the ZVOL code.
933	 * We are using the same trick as spa_open().
934	 * Note that calls in zvol_first_open() which need to resolve
935	 * the pool name to a spa object will enter spa_open()
936	 * recursively, but that function already has all the
937	 * necessary protection.
938	 */
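	/*
	 * Concrete example of that recursion (illustrative): creating a
	 * pool on top of a local zvol, e.g.
	 * "zpool create tank2 /dev/zvol/tank/vol", reaches this open while
	 * the pool-creation path already holds spa_namespace_lock, so we
	 * must not block on it again.
	 */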
939	if (!MUTEX_HELD(&spa_namespace_lock)) {
940		mutex_enter(&spa_namespace_lock);
941		locked = B_TRUE;
942	}
943
944	zv = pp->private;
945	if (zv == NULL) {
946		if (locked)
947			mutex_exit(&spa_namespace_lock);
948		return (SET_ERROR(ENXIO));
949	}
950
951	if (zv->zv_total_opens == 0) {
952		err = zvol_first_open(zv);
953		if (err) {
954			if (locked)
955				mutex_exit(&spa_namespace_lock);
956			return (err);
957		}
958		pp->mediasize = zv->zv_volsize;
959		pp->stripeoffset = 0;
960		pp->stripesize = zv->zv_volblocksize;
961	}
962	if ((flag & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
963		err = SET_ERROR(EROFS);
964		goto out;
965	}
966	if (zv->zv_flags & ZVOL_EXCL) {
967		err = SET_ERROR(EBUSY);
968		goto out;
969	}
970#ifdef FEXCL
971	if (flag & FEXCL) {
972		if (zv->zv_total_opens != 0) {
973			err = SET_ERROR(EBUSY);
974			goto out;
975		}
976		zv->zv_flags |= ZVOL_EXCL;
977	}
978#endif
979
980	zv->zv_total_opens += count;
981	if (locked)
982		mutex_exit(&spa_namespace_lock);
983
984	return (err);
985out:
986	if (zv->zv_total_opens == 0)
987		zvol_last_close(zv);
988	if (locked)
989		mutex_exit(&spa_namespace_lock);
990	return (err);
991}
992
993/*ARGSUSED*/
994static int
995zvol_close(struct g_provider *pp, int flag, int count)
996{
997	zvol_state_t *zv;
998	int error = 0;
999	boolean_t locked = B_FALSE;
1000
1001	/* See comment in zvol_open(). */
1002	if (!MUTEX_HELD(&spa_namespace_lock)) {
1003		mutex_enter(&spa_namespace_lock);
1004		locked = B_TRUE;
1005	}
1006
1007	zv = pp->private;
1008	if (zv == NULL) {
1009		if (locked)
1010			mutex_exit(&spa_namespace_lock);
1011		return (SET_ERROR(ENXIO));
1012	}
1013
1014	if (zv->zv_flags & ZVOL_EXCL) {
1015		ASSERT(zv->zv_total_opens == 1);
1016		zv->zv_flags &= ~ZVOL_EXCL;
1017	}
1018
1019	/*
1020	 * If the open count is zero, this is a spurious close.
1021	 * That indicates a bug in the kernel / DDI framework.
1022	 */
1023	ASSERT(zv->zv_total_opens != 0);
1024
1025	/*
1026	 * You may get multiple opens, but only one close.
1027	 */
1028	zv->zv_total_opens -= count;
1029
1030	if (zv->zv_total_opens == 0)
1031		zvol_last_close(zv);
1032
1033	if (locked)
1034		mutex_exit(&spa_namespace_lock);
1035	return (error);
1036}
1037
1038static void
1039zvol_get_done(zgd_t *zgd, int error)
1040{
1041	if (zgd->zgd_db)
1042		dmu_buf_rele(zgd->zgd_db, zgd);
1043
1044	zfs_range_unlock(zgd->zgd_rl);
1045
1046	if (error == 0 && zgd->zgd_bp)
1047		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
1048
1049	kmem_free(zgd, sizeof (zgd_t));
1050}
1051
1052/*
1053 * Get data to generate a TX_WRITE intent log record.
1054 */
1055static int
1056zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
1057{
1058	zvol_state_t *zv = arg;
1059	objset_t *os = zv->zv_objset;
1060	uint64_t object = ZVOL_OBJ;
1061	uint64_t offset = lr->lr_offset;
1062	uint64_t size = lr->lr_length;	/* length of user data */
1063	blkptr_t *bp = &lr->lr_blkptr;
1064	dmu_buf_t *db;
1065	zgd_t *zgd;
1066	int error;
1067
1068	ASSERT(zio != NULL);
1069	ASSERT(size != 0);
1070
1071	zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
1072	zgd->zgd_zilog = zv->zv_zilog;
1073	zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);
1074
1075	/*
1076	 * Write records come in two flavors: immediate and indirect.
1077	 * For small writes it's cheaper to store the data with the
1078	 * log record (immediate); for large writes it's cheaper to
1079	 * sync the data and get a pointer to it (indirect) so that
1080	 * we don't have to write the data twice.
1081	 */
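	/*
	 * Rough numbers (illustrative): a small synchronous write is simply
	 * copied into "buf" above via dmu_read(), while an indirect write
	 * holds the whole volblocksize-sized block via dmu_buf_hold() and
	 * lets dmu_sync() point the log record at it.
	 */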
1082	if (buf != NULL) {	/* immediate write */
1083		error = dmu_read(os, object, offset, size, buf,
1084		    DMU_READ_NO_PREFETCH);
1085	} else {
1086		size = zv->zv_volblocksize;
1087		offset = P2ALIGN(offset, size);
1088		error = dmu_buf_hold(os, object, offset, zgd, &db,
1089		    DMU_READ_NO_PREFETCH);
1090		if (error == 0) {
1091			blkptr_t *obp = dmu_buf_get_blkptr(db);
1092			if (obp) {
1093				ASSERT(BP_IS_HOLE(bp));
1094				*bp = *obp;
1095			}
1096
1097			zgd->zgd_db = db;
1098			zgd->zgd_bp = bp;
1099
1100			ASSERT(db->db_offset == offset);
1101			ASSERT(db->db_size == size);
1102
1103			error = dmu_sync(zio, lr->lr_common.lrc_txg,
1104			    zvol_get_done, zgd);
1105
1106			if (error == 0)
1107				return (0);
1108		}
1109	}
1110
1111	zvol_get_done(zgd, error);
1112
1113	return (error);
1114}
1115
1116/*
1117 * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
1118 *
1119 * We store data in the log buffers if it's small enough.
1120 * Otherwise we will later flush the data out via dmu_sync().
1121 */
1122ssize_t zvol_immediate_write_sz = 32768;
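/*
 * Illustrative summary of the policy in zvol_log_write() below: with the
 * default 32K zvol_immediate_write_sz and logbias=latency, small
 * synchronous writes are copied into the log record itself (WR_COPIED);
 * with logbias=throughput the threshold drops to 0, so aligned full-block
 * writes are logged indirectly through dmu_sync() (WR_INDIRECT).
 */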
1123
1124static void
1125zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
1126    boolean_t sync)
1127{
1128	uint32_t blocksize = zv->zv_volblocksize;
1129	zilog_t *zilog = zv->zv_zilog;
1130	boolean_t slogging;
1131	ssize_t immediate_write_sz;
1132
1133	if (zil_replaying(zilog, tx))
1134		return;
1135
1136	immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
1137	    ? 0 : zvol_immediate_write_sz;
1138
1139	slogging = spa_has_slogs(zilog->zl_spa) &&
1140	    (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
1141
1142	while (resid) {
1143		itx_t *itx;
1144		lr_write_t *lr;
1145		ssize_t len;
1146		itx_wr_state_t write_state;
1147
1148		/*
1149		 * Unlike zfs_log_write(), we can be called with writes of
1150		 * up to DMU_MAX_ACCESS/2 (5MB).
1151		 */
1152		if (blocksize > immediate_write_sz && !slogging &&
1153		    resid >= blocksize && off % blocksize == 0) {
1154			write_state = WR_INDIRECT; /* uses dmu_sync */
1155			len = blocksize;
1156		} else if (sync) {
1157			write_state = WR_COPIED;
1158			len = MIN(ZIL_MAX_LOG_DATA, resid);
1159		} else {
1160			write_state = WR_NEED_COPY;
1161			len = MIN(ZIL_MAX_LOG_DATA, resid);
1162		}
1163
1164		itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
1165		    (write_state == WR_COPIED ? len : 0));
1166		lr = (lr_write_t *)&itx->itx_lr;
1167		if (write_state == WR_COPIED && dmu_read(zv->zv_objset,
1168		    ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
1169			zil_itx_destroy(itx);
1170			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
1171			lr = (lr_write_t *)&itx->itx_lr;
1172			write_state = WR_NEED_COPY;
1173		}
1174
1175		itx->itx_wr_state = write_state;
1176		if (write_state == WR_NEED_COPY)
1177			itx->itx_sod += len;
1178		lr->lr_foid = ZVOL_OBJ;
1179		lr->lr_offset = off;
1180		lr->lr_length = len;
1181		lr->lr_blkoff = 0;
1182		BP_ZERO(&lr->lr_blkptr);
1183
1184		itx->itx_private = zv;
1185		itx->itx_sync = sync;
1186
1187		zil_itx_assign(zilog, itx, tx);
1188
1189		off += len;
1190		resid -= len;
1191	}
1192}
1193
1194#ifdef sun
1195static int
1196zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t origoffset,
1197    uint64_t size, boolean_t doread, boolean_t isdump)
1198{
1199	vdev_disk_t *dvd;
1200	int c;
1201	int numerrors = 0;
1202
1203	if (vd->vdev_ops == &vdev_mirror_ops ||
1204	    vd->vdev_ops == &vdev_replacing_ops ||
1205	    vd->vdev_ops == &vdev_spare_ops) {
1206		for (c = 0; c < vd->vdev_children; c++) {
1207			int err = zvol_dumpio_vdev(vd->vdev_child[c],
1208			    addr, offset, origoffset, size, doread, isdump);
1209			if (err != 0) {
1210				numerrors++;
1211			} else if (doread) {
1212				break;
1213			}
1214		}
1215	}
1216
1217	if (!vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_raidz_ops)
1218		return (numerrors < vd->vdev_children ? 0 : EIO);
1219
1220	if (doread && !vdev_readable(vd))
1221		return (SET_ERROR(EIO));
1222	else if (!doread && !vdev_writeable(vd))
1223		return (SET_ERROR(EIO));
1224
1225	if (vd->vdev_ops == &vdev_raidz_ops) {
1226		return (vdev_raidz_physio(vd,
1227		    addr, size, offset, origoffset, doread, isdump));
1228	}
1229
1230	offset += VDEV_LABEL_START_SIZE;
1231
1232	if (ddi_in_panic() || isdump) {
1233		ASSERT(!doread);
1234		if (doread)
1235			return (SET_ERROR(EIO));
1236		dvd = vd->vdev_tsd;
1237		ASSERT3P(dvd, !=, NULL);
1238		return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset),
1239		    lbtodb(size)));
1240	} else {
1241		dvd = vd->vdev_tsd;
1242		ASSERT3P(dvd, !=, NULL);
1243		return (vdev_disk_ldi_physio(dvd->vd_lh, addr, size,
1244		    offset, doread ? B_READ : B_WRITE));
1245	}
1246}
1247
1248static int
1249zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size,
1250    boolean_t doread, boolean_t isdump)
1251{
1252	vdev_t *vd;
1253	int error;
1254	zvol_extent_t *ze;
1255	spa_t *spa = dmu_objset_spa(zv->zv_objset);
1256
1257	/* Must be sector-aligned, and must not straddle a block boundary. */
1258	if (P2PHASE(offset, DEV_BSIZE) || P2PHASE(size, DEV_BSIZE) ||
1259	    P2BOUNDARY(offset, size, zv->zv_volblocksize)) {
1260		return (SET_ERROR(EINVAL));
1261	}
1262	ASSERT(size <= zv->zv_volblocksize);
1263
1264	/* Locate the extent this belongs to */
1265	ze = list_head(&zv->zv_extents);
1266	while (offset >= ze->ze_nblks * zv->zv_volblocksize) {
1267		offset -= ze->ze_nblks * zv->zv_volblocksize;
1268		ze = list_next(&zv->zv_extents, ze);
1269	}
1270
1271	if (ze == NULL)
1272		return (SET_ERROR(EINVAL));
1273
1274	if (!ddi_in_panic())
1275		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
1276
1277	vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva));
1278	offset += DVA_GET_OFFSET(&ze->ze_dva);
1279	error = zvol_dumpio_vdev(vd, addr, offset, DVA_GET_OFFSET(&ze->ze_dva),
1280	    size, doread, isdump);
1281
1282	if (!ddi_in_panic())
1283		spa_config_exit(spa, SCL_STATE, FTAG);
1284
1285	return (error);
1286}
1287#endif	/* sun */
1288
1289int
1290zvol_strategy(struct bio *bp)
1291{
1292	zvol_state_t *zv = bp->bio_to->private;
1293	uint64_t off, volsize;
1294	size_t resid;
1295	char *addr;
1296	objset_t *os;
1297	rl_t *rl;
1298	int error = 0;
1299	boolean_t doread = (bp->bio_cmd == BIO_READ);
1300	boolean_t is_dumpified;
1301	boolean_t sync;
1302
1303	if (zv == NULL) {
1304		g_io_deliver(bp, ENXIO);
1305		return (0);
1306	}
1307
1308	if (bp->bio_cmd != BIO_READ && (zv->zv_flags & ZVOL_RDONLY)) {
1309		g_io_deliver(bp, EROFS);
1310		return (0);
1311	}
1312
1313	off = bp->bio_offset;
1314	volsize = zv->zv_volsize;
1315
1316	os = zv->zv_objset;
1317	ASSERT(os != NULL);
1318
1319	addr = bp->bio_data;
1320	resid = bp->bio_length;
1321
1322	if (resid > 0 && (off < 0 || off >= volsize)) {
1323		g_io_deliver(bp, EIO);
1324		return (0);
1325	}
1326
1327#ifdef illumos
1328	is_dumpified = zv->zv_flags & ZVOL_DUMPIFIED;
1329#else
1330	is_dumpified = B_FALSE;
1331#endif
1332	sync = !doread && !is_dumpified &&
1333	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
1334
1335	/*
1336	 * There must be no buffer changes when doing a dmu_sync() because
1337	 * we can't change the data whilst calculating the checksum.
1338	 */
1339	rl = zfs_range_lock(&zv->zv_znode, off, resid,
1340	    doread ? RL_READER : RL_WRITER);
1341
1342	if (bp->bio_cmd == BIO_DELETE) {
1343		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
1344		error = dmu_tx_assign(tx, TXG_WAIT);
1345		if (error != 0) {
1346			dmu_tx_abort(tx);
1347		} else {
1348			zvol_log_truncate(zv, tx, off, resid, B_TRUE);
1349			dmu_tx_commit(tx);
1350			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
1351			    off, resid);
1352			resid = 0;
1353		}
1354		goto unlock;
1355	}
1356
1357	while (resid != 0 && off < volsize) {
1358		size_t size = MIN(resid, zvol_maxphys);
1359#ifdef illumos
1360		if (is_dumpified) {
1361			size = MIN(size, P2END(off, zv->zv_volblocksize) - off);
1362			error = zvol_dumpio(zv, addr, off, size,
1363			    doread, B_FALSE);
1364		} else if (doread) {
1365#else
1366		if (doread) {
1367#endif
1368			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
1369			    DMU_READ_PREFETCH);
1370		} else {
1371			dmu_tx_t *tx = dmu_tx_create(os);
1372			dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
1373			error = dmu_tx_assign(tx, TXG_WAIT);
1374			if (error) {
1375				dmu_tx_abort(tx);
1376			} else {
1377				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
1378				zvol_log_write(zv, tx, off, size, sync);
1379				dmu_tx_commit(tx);
1380			}
1381		}
1382		if (error) {
1383			/* convert checksum errors into IO errors */
1384			if (error == ECKSUM)
1385				error = SET_ERROR(EIO);
1386			break;
1387		}
1388		off += size;
1389		addr += size;
1390		resid -= size;
1391	}
1392unlock:
1393	zfs_range_unlock(rl);
1394
1395	bp->bio_completed = bp->bio_length - resid;
1396	if (bp->bio_completed < bp->bio_length)
1397		bp->bio_error = (off > volsize ? EINVAL : error);
1398
1399	if (sync)
1400		zil_commit(zv->zv_zilog, ZVOL_OBJ);
1401	g_io_deliver(bp, 0);
1402
1403	return (0);
1404}
1405
1406#ifdef sun
1407/*
1408 * Set the buffer count to the zvol maximum transfer.
1409 * Using our own routine instead of the default minphys()
1410 * means that for larger writes we write bigger buffers on X86
1411 * (128K instead of 56K) and flush the disk write cache less often
1412 * (every zvol_maxphys - currently 5MB) instead of minphys (currently
1413 * 56K on X86 and 128K on sparc).
1414 */
1415void
1416zvol_minphys(struct buf *bp)
1417{
1418	if (bp->b_bcount > zvol_maxphys)
1419		bp->b_bcount = zvol_maxphys;
1420}
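/*
 * zvol_minphys() is handed to physio() by zvol_read() and zvol_write()
 * below, e.g.:
 *
 *	error = physio(zvol_strategy, NULL, dev, B_READ, zvol_minphys, uio);
 *
 * so every buf reaching zvol_strategy() is clamped to zvol_maxphys bytes.
 */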
1421
1422int
1423zvol_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblocks)
1424{
1425	minor_t minor = getminor(dev);
1426	zvol_state_t *zv;
1427	int error = 0;
1428	uint64_t size;
1429	uint64_t boff;
1430	uint64_t resid;
1431
1432	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1433	if (zv == NULL)
1434		return (SET_ERROR(ENXIO));
1435
1436	if ((zv->zv_flags & ZVOL_DUMPIFIED) == 0)
1437		return (SET_ERROR(EINVAL));
1438
1439	boff = ldbtob(blkno);
1440	resid = ldbtob(nblocks);
1441
1442	VERIFY3U(boff + resid, <=, zv->zv_volsize);
1443
1444	while (resid) {
1445		size = MIN(resid, P2END(boff, zv->zv_volblocksize) - boff);
1446		error = zvol_dumpio(zv, addr, boff, size, B_FALSE, B_TRUE);
1447		if (error)
1448			break;
1449		boff += size;
1450		addr += size;
1451		resid -= size;
1452	}
1453
1454	return (error);
1455}
1456
1457/*ARGSUSED*/
1458int
1459zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
1460{
1461	minor_t minor = getminor(dev);
1462	zvol_state_t *zv;
1463	uint64_t volsize;
1464	rl_t *rl;
1465	int error = 0;
1466
1467	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1468	if (zv == NULL)
1469		return (SET_ERROR(ENXIO));
1470
1471	volsize = zv->zv_volsize;
1472	if (uio->uio_resid > 0 &&
1473	    (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
1474		return (SET_ERROR(EIO));
1475
1476	if (zv->zv_flags & ZVOL_DUMPIFIED) {
1477		error = physio(zvol_strategy, NULL, dev, B_READ,
1478		    zvol_minphys, uio);
1479		return (error);
1480	}
1481
1482	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
1483	    RL_READER);
1484	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
1485		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
1486
1487		/* don't read past the end */
1488		if (bytes > volsize - uio->uio_loffset)
1489			bytes = volsize - uio->uio_loffset;
1490
1491		error = dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes);
1492		if (error) {
1493			/* convert checksum errors into IO errors */
1494			if (error == ECKSUM)
1495				error = SET_ERROR(EIO);
1496			break;
1497		}
1498	}
1499	zfs_range_unlock(rl);
1500	return (error);
1501}
1502
1503/*ARGSUSED*/
1504int
1505zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
1506{
1507	minor_t minor = getminor(dev);
1508	zvol_state_t *zv;
1509	uint64_t volsize;
1510	rl_t *rl;
1511	int error = 0;
1512	boolean_t sync;
1513
1514	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1515	if (zv == NULL)
1516		return (SET_ERROR(ENXIO));
1517
1518	volsize = zv->zv_volsize;
1519	if (uio->uio_resid > 0 &&
1520	    (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
1521		return (SET_ERROR(EIO));
1522
1523	if (zv->zv_flags & ZVOL_DUMPIFIED) {
1524		error = physio(zvol_strategy, NULL, dev, B_WRITE,
1525		    zvol_minphys, uio);
1526		return (error);
1527	}
1528
1529	sync = !(zv->zv_flags & ZVOL_WCE) ||
1530	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
1531
1532	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
1533	    RL_WRITER);
1534	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
1535		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
1536		uint64_t off = uio->uio_loffset;
1537		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
1538
1539		if (bytes > volsize - off)	/* don't write past the end */
1540			bytes = volsize - off;
1541
1542		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
1543		error = dmu_tx_assign(tx, TXG_WAIT);
1544		if (error) {
1545			dmu_tx_abort(tx);
1546			break;
1547		}
1548		error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx);
1549		if (error == 0)
1550			zvol_log_write(zv, tx, off, bytes, sync);
1551		dmu_tx_commit(tx);
1552
1553		if (error)
1554			break;
1555	}
1556	zfs_range_unlock(rl);
1557	if (sync)
1558		zil_commit(zv->zv_zilog, ZVOL_OBJ);
1559	return (error);
1560}
1561
1562int
1563zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs)
1564{
1565	struct uuid uuid = EFI_RESERVED;
1566	efi_gpe_t gpe = { 0 };
1567	uint32_t crc;
1568	dk_efi_t efi;
1569	int length;
1570	char *ptr;
1571
1572	if (ddi_copyin(arg, &efi, sizeof (dk_efi_t), flag))
1573		return (SET_ERROR(EFAULT));
1574	ptr = (char *)(uintptr_t)efi.dki_data_64;
1575	length = efi.dki_length;
1576	/*
1577	 * Some clients may attempt to request a PMBR for the
1578	 * zvol.  Currently this interface will return EINVAL to
1579	 * such requests.  These requests could be supported by
1580	 * adding a check for lba == 0 and consing up an appropriate
1581	 * PMBR.
1582	 */
1583	if (efi.dki_lba < 1 || efi.dki_lba > 2 || length <= 0)
1584		return (SET_ERROR(EINVAL));
1585
1586	gpe.efi_gpe_StartingLBA = LE_64(34ULL);
1587	gpe.efi_gpe_EndingLBA = LE_64((vs >> bs) - 1);
1588	UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid);
1589
1590	if (efi.dki_lba == 1) {
1591		efi_gpt_t gpt = { 0 };
1592
1593		gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE);
1594		gpt.efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT);
1595		gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt));
1596		gpt.efi_gpt_MyLBA = LE_64(1ULL);
1597		gpt.efi_gpt_FirstUsableLBA = LE_64(34ULL);
1598		gpt.efi_gpt_LastUsableLBA = LE_64((vs >> bs) - 1);
1599		gpt.efi_gpt_PartitionEntryLBA = LE_64(2ULL);
1600		gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1);
1601		gpt.efi_gpt_SizeOfPartitionEntry =
1602		    LE_32(sizeof (efi_gpe_t));
1603		CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table);
1604		gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
1605		CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table);
1606		gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
1607		if (ddi_copyout(&gpt, ptr, MIN(sizeof (gpt), length),
1608		    flag))
1609			return (SET_ERROR(EFAULT));
1610		ptr += sizeof (gpt);
1611		length -= sizeof (gpt);
1612	}
1613	if (length > 0 && ddi_copyout(&gpe, ptr, MIN(sizeof (gpe),
1614	    length), flag))
1615		return (SET_ERROR(EFAULT));
1616	return (0);
1617}
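/*
 * The label synthesized above is deliberately minimal: LBA 1 carries the
 * GPT header and LBA 2 a single EFI_RESERVED partition entry spanning
 * LBA 34 through (vs >> bs) - 1, with CRC32s computed over both
 * structures.
 */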
1618
1619/*
1620 * BEGIN entry points to allow external callers access to the volume.
1621 */
1622/*
1623 * Return the volume parameters needed for access from an external caller.
1624 * These values are invariant as long as the volume is held open.
1625 */
1626int
1627zvol_get_volume_params(minor_t minor, uint64_t *blksize,
1628    uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl,
1629    void **rl_hdl, void **bonus_hdl)
1630{
1631	zvol_state_t *zv;
1632
1633	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1634	if (zv == NULL)
1635		return (SET_ERROR(ENXIO));
1636	if (zv->zv_flags & ZVOL_DUMPIFIED)
1637		return (SET_ERROR(ENXIO));
1638
1639	ASSERT(blksize && max_xfer_len && minor_hdl &&
1640	    objset_hdl && zil_hdl && rl_hdl && bonus_hdl);
1641
1642	*blksize = zv->zv_volblocksize;
1643	*max_xfer_len = (uint64_t)zvol_maxphys;
1644	*minor_hdl = zv;
1645	*objset_hdl = zv->zv_objset;
1646	*zil_hdl = zv->zv_zilog;
1647	*rl_hdl = &zv->zv_znode;
1648	*bonus_hdl = zv->zv_dbuf;
1649	return (0);
1650}
1651
1652/*
1653 * Return the current volume size to an external caller.
1654 * The size can change while the volume is open.
1655 */
1656uint64_t
1657zvol_get_volume_size(void *minor_hdl)
1658{
1659	zvol_state_t *zv = minor_hdl;
1660
1661	return (zv->zv_volsize);
1662}
1663
1664/*
1665 * Return the current WCE setting to an external caller.
1666 * The WCE setting can change while the volume is open.
1667 */
1668int
1669zvol_get_volume_wce(void *minor_hdl)
1670{
1671	zvol_state_t *zv = minor_hdl;
1672
1673	return ((zv->zv_flags & ZVOL_WCE) ? 1 : 0);
1674}
1675
1676/*
1677 * Entry point for external callers to zvol_log_write
1678 */
1679void
1680zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off, ssize_t resid,
1681    boolean_t sync)
1682{
1683	zvol_state_t *zv = minor_hdl;
1684
1685	zvol_log_write(zv, tx, off, resid, sync);
1686}
1687/*
1688 * END entry points to allow external callers access to the volume.
1689 */
1690#endif	/* sun */
1691
1692/*
1693 * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE.
1694 */
1695static void
1696zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len,
1697    boolean_t sync)
1698{
1699	itx_t *itx;
1700	lr_truncate_t *lr;
1701	zilog_t *zilog = zv->zv_zilog;
1702
1703	if (zil_replaying(zilog, tx))
1704		return;
1705
1706	itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
1707	lr = (lr_truncate_t *)&itx->itx_lr;
1708	lr->lr_foid = ZVOL_OBJ;
1709	lr->lr_offset = off;
1710	lr->lr_length = len;
1711
1712	itx->itx_sync = sync;
1713	zil_itx_assign(zilog, itx, tx);
1714}
1715
1716#ifdef sun
1717/*
1718 * Dirtbag ioctls to support mkfs(1M) for UFS filesystems.  See dkio(7I).
1719 * Also a dirtbag dkio ioctl for unmap/free-block functionality.
1720 */
1721/*ARGSUSED*/
1722int
1723zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
1724{
1725	zvol_state_t *zv;
1726	struct dk_callback *dkc;
1727	int error = 0;
1728	rl_t *rl;
1729
1730	mutex_enter(&spa_namespace_lock);
1731
1732	zv = zfsdev_get_soft_state(getminor(dev), ZSST_ZVOL);
1733
1734	if (zv == NULL) {
1735		mutex_exit(&spa_namespace_lock);
1736		return (SET_ERROR(ENXIO));
1737	}
1738	ASSERT(zv->zv_total_opens > 0);
1739
1740	switch (cmd) {
1741
1742	case DKIOCINFO:
1743	{
1744		struct dk_cinfo dki;
1745
1746		bzero(&dki, sizeof (dki));
1747		(void) strcpy(dki.dki_cname, "zvol");
1748		(void) strcpy(dki.dki_dname, "zvol");
1749		dki.dki_ctype = DKC_UNKNOWN;
1750		dki.dki_unit = getminor(dev);
1751		dki.dki_maxtransfer = 1 << (SPA_MAXBLOCKSHIFT - zv->zv_min_bs);
1752		mutex_exit(&spa_namespace_lock);
1753		if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag))
1754			error = SET_ERROR(EFAULT);
1755		return (error);
1756	}
1757
1758	case DKIOCGMEDIAINFO:
1759	{
1760		struct dk_minfo dkm;
1761
1762		bzero(&dkm, sizeof (dkm));
1763		dkm.dki_lbsize = 1U << zv->zv_min_bs;
1764		dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
1765		dkm.dki_media_type = DK_UNKNOWN;
1766		mutex_exit(&spa_namespace_lock);
1767		if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag))
1768			error = SET_ERROR(EFAULT);
1769		return (error);
1770	}
1771
1772	case DKIOCGMEDIAINFOEXT:
1773	{
1774		struct dk_minfo_ext dkmext;
1775
1776		bzero(&dkmext, sizeof (dkmext));
1777		dkmext.dki_lbsize = 1U << zv->zv_min_bs;
1778		dkmext.dki_pbsize = zv->zv_volblocksize;
1779		dkmext.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
1780		dkmext.dki_media_type = DK_UNKNOWN;
1781		mutex_exit(&spa_namespace_lock);
1782		if (ddi_copyout(&dkmext, (void *)arg, sizeof (dkmext), flag))
1783			error = SET_ERROR(EFAULT);
1784		return (error);
1785	}
1786
1787	case DKIOCGETEFI:
1788	{
1789		uint64_t vs = zv->zv_volsize;
1790		uint8_t bs = zv->zv_min_bs;
1791
1792		mutex_exit(&spa_namespace_lock);
1793		error = zvol_getefi((void *)arg, flag, vs, bs);
1794		return (error);
1795	}
1796
1797	case DKIOCFLUSHWRITECACHE:
1798		dkc = (struct dk_callback *)arg;
1799		mutex_exit(&spa_namespace_lock);
1800		zil_commit(zv->zv_zilog, ZVOL_OBJ);
1801		if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) {
1802			(*dkc->dkc_callback)(dkc->dkc_cookie, error);
1803			error = 0;
1804		}
1805		return (error);
1806
1807	case DKIOCGETWCE:
1808	{
1809		int wce = (zv->zv_flags & ZVOL_WCE) ? 1 : 0;
1810		if (ddi_copyout(&wce, (void *)arg, sizeof (int),
1811		    flag))
1812			error = SET_ERROR(EFAULT);
1813		break;
1814	}
1815	case DKIOCSETWCE:
1816	{
1817		int wce;
1818		if (ddi_copyin((void *)arg, &wce, sizeof (int),
1819		    flag)) {
1820			error = SET_ERROR(EFAULT);
1821			break;
1822		}
1823		if (wce) {
1824			zv->zv_flags |= ZVOL_WCE;
1825			mutex_exit(&spa_namespace_lock);
1826		} else {
1827			zv->zv_flags &= ~ZVOL_WCE;
1828			mutex_exit(&spa_namespace_lock);
1829			zil_commit(zv->zv_zilog, ZVOL_OBJ);
1830		}
1831		return (0);
1832	}
1833
1834	case DKIOCGGEOM:
1835	case DKIOCGVTOC:
1836		/*
1837		 * commands using these (like prtvtoc) expect ENOTSUP
1838		 * since we're emulating an EFI label
1839		 */
1840		error = SET_ERROR(ENOTSUP);
1841		break;
1842
1843	case DKIOCDUMPINIT:
1844		rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
1845		    RL_WRITER);
1846		error = zvol_dumpify(zv);
1847		zfs_range_unlock(rl);
1848		break;
1849
1850	case DKIOCDUMPFINI:
1851		if (!(zv->zv_flags & ZVOL_DUMPIFIED))
1852			break;
1853		rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
1854		    RL_WRITER);
1855		error = zvol_dump_fini(zv);
1856		zfs_range_unlock(rl);
1857		break;
1858
1859	case DKIOCFREE:
1860	{
1861		dkioc_free_t df;
1862		dmu_tx_t *tx;
1863
1864		if (ddi_copyin((void *)arg, &df, sizeof (df), flag)) {
1865			error = SET_ERROR(EFAULT);
1866			break;
1867		}
1868
1869		/*
1870		 * Apply Postel's Law to length-checking.  If they overshoot,
1871		 * just blank out until the end, if there's a need to blank
1872		 * out anything.
1873		 */
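		/*
		 * E.g. (illustrative): a DKIOCFREE of 1MB starting 512K
		 * before the end of the volume is clamped to free only to
		 * EOV, and a request starting at or beyond EOV is silently
		 * ignored.
		 */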
1874		if (df.df_start >= zv->zv_volsize)
1875			break;	/* No need to do anything... */
1876		if (df.df_start + df.df_length > zv->zv_volsize)
1877			df.df_length = DMU_OBJECT_END;
1878
1879		rl = zfs_range_lock(&zv->zv_znode, df.df_start, df.df_length,
1880		    RL_WRITER);
1881		tx = dmu_tx_create(zv->zv_objset);
1882		error = dmu_tx_assign(tx, TXG_WAIT);
1883		if (error != 0) {
1884			dmu_tx_abort(tx);
1885		} else {
1886			zvol_log_truncate(zv, tx, df.df_start,
1887			    df.df_length, B_TRUE);
1888			dmu_tx_commit(tx);
1889			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
1890			    df.df_start, df.df_length);
1891		}
1892
1893		zfs_range_unlock(rl);
1894
1895		if (error == 0) {
1896			/*
1897			 * If the write-cache is disabled or 'sync' property
1898			 * is set to 'always' then treat this as a synchronous
1899			 * operation (i.e. commit to zil).
1900			 */
1901			if (!(zv->zv_flags & ZVOL_WCE) ||
1902			    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS))
1903				zil_commit(zv->zv_zilog, ZVOL_OBJ);
1904
1905			/*
1906			 * If the caller really wants synchronous writes, and
1907			 * can't wait for them, don't return until the write
1908			 * is done.
1909			 */
1910			if (df.df_flags & DF_WAIT_SYNC) {
1911				txg_wait_synced(
1912				    dmu_objset_pool(zv->zv_objset), 0);
1913			}
1914		}
1915		break;
1916	}
1917
1918	default:
1919		error = SET_ERROR(ENOTTY);
1920		break;
1921
1922	}
1923	mutex_exit(&spa_namespace_lock);
1924	return (error);
1925}
1926#endif	/* sun */
1927
1928int
1929zvol_busy(void)
1930{
1931	return (zvol_minors != 0);
1932}
1933
1934void
1935zvol_init(void)
1936{
1937	VERIFY(ddi_soft_state_init(&zfsdev_state, sizeof (zfs_soft_state_t),
1938	    1) == 0);
1939	ZFS_LOG(1, "ZVOL Initialized.");
1940}
1941
1942void
1943zvol_fini(void)
1944{
1945	ddi_soft_state_fini(&zfsdev_state);
1946	ZFS_LOG(1, "ZVOL Deinitialized.");
1947}
1948
1949#ifdef sun
1950/*ARGSUSED*/
1951static int
1952zfs_mvdev_dump_feature_check(void *arg, dmu_tx_t *tx)
1953{
1954	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
1955
1956	if (spa_feature_is_active(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP))
1957		return (1);
1958	return (0);
1959}
1960
1961/*ARGSUSED*/
1962static void
1963zfs_mvdev_dump_activate_feature_sync(void *arg, dmu_tx_t *tx)
1964{
1965	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
1966
1967	spa_feature_incr(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, tx);
1968}
1969
1970static int
1971zvol_dump_init(zvol_state_t *zv, boolean_t resize)
1972{
1973	dmu_tx_t *tx;
1974	int error;
1975	objset_t *os = zv->zv_objset;
1976	spa_t *spa = dmu_objset_spa(os);
1977	vdev_t *vd = spa->spa_root_vdev;
1978	nvlist_t *nv = NULL;
1979	uint64_t version = spa_version(spa);
1980	enum zio_checksum checksum;
1981
1982	ASSERT(MUTEX_HELD(&spa_namespace_lock));
1983	ASSERT(vd->vdev_ops == &vdev_root_ops);
1984
1985	error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 0,
1986	    DMU_OBJECT_END);
1987	/* wait for dmu_free_long_range to actually free the blocks */
1988	txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
1989
1990	/*
1991	 * If the pool on which the dump device is being initialized has more
1992	 * than one child vdev, check that the MULTI_VDEV_CRASH_DUMP feature is
1993	 * enabled.  If so, bump that feature's counter to indicate that the
1994	 * feature is active. We also check the vdev type to handle the
1995	 * following case:
1996	 *   # zpool create test raidz disk1 disk2 disk3
1997	 *   Now have spa_root_vdev->vdev_children == 1 (the raidz vdev),
1998	 *   the raidz vdev itself has 3 children.
1999	 */
2000	if (vd->vdev_children > 1 || vd->vdev_ops == &vdev_raidz_ops) {
2001		if (!spa_feature_is_enabled(spa,
2002		    SPA_FEATURE_MULTI_VDEV_CRASH_DUMP))
2003			return (SET_ERROR(ENOTSUP));
2004		(void) dsl_sync_task(spa_name(spa),
2005		    zfs_mvdev_dump_feature_check,
2006		    zfs_mvdev_dump_activate_feature_sync, NULL, 2);
2007	}
2008
2009	tx = dmu_tx_create(os);
2010	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
2011	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
2012	error = dmu_tx_assign(tx, TXG_WAIT);
2013	if (error) {
2014		dmu_tx_abort(tx);
2015		return (error);
2016	}
2017
2018	/*
2019	 * If MULTI_VDEV_CRASH_DUMP is active, use the NOPARITY checksum
2020	 * function.  Otherwise, use the old default -- OFF.
2021	 */
2022	checksum = spa_feature_is_active(spa,
2023	    SPA_FEATURE_MULTI_VDEV_CRASH_DUMP) ? ZIO_CHECKSUM_NOPARITY :
2024	    ZIO_CHECKSUM_OFF;
2025
2026	/*
2027	 * If we are resizing the dump device then we only need to
2028	 * update the refreservation to match the newly updated
2029	 * zvol size.  Otherwise, we save off the zvol's original properties
2030	 * so that we can restore them if the zvol is ever undumpified.
2031	 */
	if (resize) {
		error = zap_update(os, ZVOL_ZAP_OBJ,
		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
		    &zv->zv_volsize, tx);
	} else {
		uint64_t checksum, compress, refresrv, vbs, dedup;

		error = dsl_prop_get_integer(zv->zv_name,
		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL);
		error = error ? error : dsl_prop_get_integer(zv->zv_name,
		    zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum, NULL);
		error = error ? error : dsl_prop_get_integer(zv->zv_name,
		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &refresrv, NULL);
		error = error ? error : dsl_prop_get_integer(zv->zv_name,
		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs, NULL);
		if (version >= SPA_VERSION_DEDUP) {
			error = error ? error :
			    dsl_prop_get_integer(zv->zv_name,
			    zfs_prop_to_name(ZFS_PROP_DEDUP), &dedup, NULL);
		}

		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1,
		    &compress, tx);
		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
		    zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum, tx);
		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
		    &refresrv, tx);
		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
		    &vbs, tx);
		error = error ? error : dmu_object_set_blocksize(
		    os, ZVOL_OBJ, SPA_MAXBLOCKSIZE, 0, tx);
		if (version >= SPA_VERSION_DEDUP) {
			error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
			    zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1,
			    &dedup, tx);
		}
		if (error == 0)
			zv->zv_volblocksize = SPA_MAXBLOCKSIZE;
	}
	dmu_tx_commit(tx);

	/*
	 * We only need to update the zvol's properties if we are
	 * initializing the dump area for the first time.
	 */
	if (!resize) {
		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_uint64(nv,
		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 0) == 0);
		VERIFY(nvlist_add_uint64(nv,
		    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
		    ZIO_COMPRESS_OFF) == 0);
		VERIFY(nvlist_add_uint64(nv,
		    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
		    checksum) == 0);
		if (version >= SPA_VERSION_DEDUP) {
			VERIFY(nvlist_add_uint64(nv,
			    zfs_prop_to_name(ZFS_PROP_DEDUP),
			    ZIO_CHECKSUM_OFF) == 0);
		}

		error = zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
		    nv, NULL);
		nvlist_free(nv);

		if (error)
			return (error);
	}

	/* Allocate the space for the dump */
	error = zvol_prealloc(zv);
	return (error);
}

static int
zvol_dumpify(zvol_state_t *zv)
{
	int error = 0;
	uint64_t dumpsize = 0;
	dmu_tx_t *tx;
	objset_t *os = zv->zv_objset;

	if (zv->zv_flags & ZVOL_RDONLY)
		return (SET_ERROR(EROFS));

	if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE,
	    8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) {
		boolean_t resize = (dumpsize > 0);

		if ((error = zvol_dump_init(zv, resize)) != 0) {
			(void) zvol_dump_fini(zv);
			return (error);
		}
	}

	/*
	 * Build up our lba mapping.
	 */
	error = zvol_get_lbas(zv);
	if (error) {
		(void) zvol_dump_fini(zv);
		return (error);
	}

	tx = dmu_tx_create(os);
	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		(void) zvol_dump_fini(zv);
		return (error);
	}

	zv->zv_flags |= ZVOL_DUMPIFIED;
	error = zap_update(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, 8, 1,
	    &zv->zv_volsize, tx);
	dmu_tx_commit(tx);

	if (error) {
		(void) zvol_dump_fini(zv);
		return (error);
	}

	txg_wait_synced(dmu_objset_pool(os), 0);
	return (0);
}

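/*
 * Usage note (not from the original source): on Solaris/illumos a zvol
 * is typically dumpified by configuring it as a dump device, e.g.
 *   # dumpadm -d /dev/zvol/dsk/rpool/dump
 * which is expected to reach zvol_dumpify() above via the DKIOCDUMPINIT
 * ioctl; DKIOCDUMPFINI likewise leads to zvol_dump_fini() below.
 */
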
static int
zvol_dump_fini(zvol_state_t *zv)
{
	dmu_tx_t *tx;
	objset_t *os = zv->zv_objset;
	nvlist_t *nv;
	int error = 0;
	uint64_t checksum, compress, refresrv, vbs, dedup;
	uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset));

	/*
	 * Attempt to restore the zvol back to its pre-dumpified state.
	 * This is best effort, as it is possible that not all of these
	 * properties were initialized during the dumpify process (e.g.
	 * if there was an error in zvol_dump_init).
	 */

	tx = dmu_tx_create(os);
	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		return (error);
	}
	(void) zap_remove(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, tx);
	dmu_tx_commit(tx);

	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
	    zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum);
	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
	    zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &compress);
	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &refresrv);
	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, &vbs);

	VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	(void) nvlist_add_uint64(nv,
	    zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum);
	(void) nvlist_add_uint64(nv,
	    zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress);
	(void) nvlist_add_uint64(nv,
	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv);
	if (version >= SPA_VERSION_DEDUP &&
	    zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
	    zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1, &dedup) == 0) {
		(void) nvlist_add_uint64(nv,
		    zfs_prop_to_name(ZFS_PROP_DEDUP), dedup);
	}
	(void) zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
	    nv, NULL);
	nvlist_free(nv);

	zvol_free_extents(zv);
	zv->zv_flags &= ~ZVOL_DUMPIFIED;
	(void) dmu_free_long_range(os, ZVOL_OBJ, 0, DMU_OBJECT_END);
	/* wait for dmu_free_long_range to actually free the blocks */
	txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
	tx = dmu_tx_create(os);
	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		return (error);
	}
	if (dmu_object_set_blocksize(os, ZVOL_OBJ, vbs, 0, tx) == 0)
		zv->zv_volblocksize = vbs;
	dmu_tx_commit(tx);

	return (0);
}
#endif	/* sun */

static zvol_state_t *
zvol_geom_create(const char *name)
{
	struct g_provider *pp;
	struct g_geom *gp;
	zvol_state_t *zv;

	gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
	gp->start = zvol_geom_start;
	gp->access = zvol_geom_access;
	pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
	pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
	pp->sectorsize = DEV_BSIZE;

	zv = kmem_zalloc(sizeof(*zv), KM_SLEEP);
	zv->zv_provider = pp;
	zv->zv_state = 0;
	bioq_init(&zv->zv_queue);
	mtx_init(&zv->zv_queue_mtx, "zvol", NULL, MTX_DEF);

	pp->private = zv;

	return (zv);
}
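
/*
 * Since ZVOL_DRIVER is "zvol", the provider created above appears in
 * userland as /dev/zvol/<pool>/<dataset>.  Note that pp->mediasize is
 * not set here; the caller is expected to fill it in from the volume
 * size before the provider is started by zvol_geom_run().
 */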

static void
zvol_geom_run(zvol_state_t *zv)
{
	struct g_provider *pp;

	pp = zv->zv_provider;
	g_error_provider(pp, 0);

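	/*
	 * pp->name is "zvol/<dataset>"; sizeof(ZVOL_DRIVER) counts the
	 * terminating NUL of "zvol", which conveniently also skips the
	 * '/' separator, so the worker thread's title carries just the
	 * dataset name.
	 */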
	kproc_kthread_add(zvol_geom_worker, zv, &zfsproc, NULL, 0, 0,
	    "zfskern", "zvol %s", pp->name + sizeof(ZVOL_DRIVER));
}

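/*
 * Teardown handshake with zvol_geom_worker(): zv_state 0 means running,
 * 1 asks the worker to exit, and the worker stores 2 just before calling
 * kthread_exit(), which is what the msleep() loop below waits for.
 */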
static void
zvol_geom_destroy(zvol_state_t *zv)
{
	struct g_provider *pp;

	g_topology_assert();

	mtx_lock(&zv->zv_queue_mtx);
	zv->zv_state = 1;
	wakeup_one(&zv->zv_queue);
	while (zv->zv_state != 2)
		msleep(&zv->zv_state, &zv->zv_queue_mtx, 0, "zvol:w", 0);
	mtx_destroy(&zv->zv_queue_mtx);

	pp = zv->zv_provider;
	zv->zv_provider = NULL;
	pp->private = NULL;
	g_wither_geom(pp->geom, ENXIO);

	kmem_free(zv, sizeof(*zv));
}

static int
zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
{
	int count, error, flags;

	g_topology_assert();

	/*
	 * To keep things simple we expect either an open or a close, but
	 * not both at the same time.
	 */
	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
	    (acr <= 0 && acw <= 0 && ace <= 0),
	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
	    pp->name, acr, acw, ace));

	if (pp->private == NULL) {
		if (acr <= 0 && acw <= 0 && ace <= 0)
			return (0);
		return (pp->error);
	}

	/*
	 * We don't pass the FEXCL flag to zvol_open()/zvol_close() if
	 * ace != 0, because GEOM already handles that and handles it a bit
	 * differently.  GEOM allows for multiple read/exclusive consumers,
	 * while ZFS allows only one exclusive consumer, no matter whether
	 * it is a reader or a writer.  I prefer the way GEOM works, so
	 * I'll leave it to GEOM to decide what to do.
	 */

	count = acr + acw + ace;
	if (count == 0)
		return (0);

	flags = 0;
	if (acr != 0 || ace != 0)
		flags |= FREAD;
	if (acw != 0)
		flags |= FWRITE;

	g_topology_unlock();
	if (count > 0)
		error = zvol_open(pp, flags, count);
	else
		error = zvol_close(pp, flags, -count);
	g_topology_lock();
	return (error);
}
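
/*
 * Illustrative example (not from the original source): a first open(2)
 * of the device for reading and writing arrives here as acr=1, acw=1,
 * ace=0, so count is 2, flags become FREAD|FWRITE and we end up in
 * zvol_open(pp, FREAD|FWRITE, 2).  The matching close passes the same
 * deltas negated and lands in zvol_close(pp, FREAD|FWRITE, 2).
 */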
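/*
 * With G_PF_DIRECT_RECEIVE/G_PF_DIRECT_SEND set on the provider, this
 * start routine may be entered from a context that is not allowed to
 * sleep; such requests are queued for the worker thread instead of
 * being handled inline.
 */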
static void
zvol_geom_start(struct bio *bp)
{
	zvol_state_t *zv;
	boolean_t first;

	zv = bp->bio_to->private;
	ASSERT(zv != NULL);
	switch (bp->bio_cmd) {
	case BIO_FLUSH:
		if (!THREAD_CAN_SLEEP())
			goto enqueue;
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
		g_io_deliver(bp, 0);
		break;
	case BIO_READ:
	case BIO_WRITE:
	case BIO_DELETE:
		if (!THREAD_CAN_SLEEP())
			goto enqueue;
		zvol_strategy(bp);
		break;
	case BIO_GETATTR:
		if (g_handleattr_int(bp, "GEOM::candelete", 1))
			return;
		/* FALLTHROUGH */
	default:
		g_io_deliver(bp, EOPNOTSUPP);
		break;
	}
	return;

enqueue:
	mtx_lock(&zv->zv_queue_mtx);
	first = (bioq_first(&zv->zv_queue) == NULL);
	bioq_insert_tail(&zv->zv_queue, bp);
	mtx_unlock(&zv->zv_queue_mtx);
	if (first)
		wakeup_one(&zv->zv_queue);
}

static void
zvol_geom_worker(void *arg)
{
	zvol_state_t *zv;
	struct bio *bp;

	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);

	zv = arg;
	for (;;) {
		mtx_lock(&zv->zv_queue_mtx);
		bp = bioq_takefirst(&zv->zv_queue);
		if (bp == NULL) {
			if (zv->zv_state == 1) {
				zv->zv_state = 2;
				wakeup(&zv->zv_state);
				mtx_unlock(&zv->zv_queue_mtx);
				kthread_exit();
			}
			msleep(&zv->zv_queue, &zv->zv_queue_mtx, PRIBIO | PDROP,
			    "zvol:io", 0);
			continue;
		}
		mtx_unlock(&zv->zv_queue_mtx);
		switch (bp->bio_cmd) {
		case BIO_FLUSH:
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
			g_io_deliver(bp, 0);
			break;
		case BIO_READ:
		case BIO_WRITE:
		case BIO_DELETE:
			/*
			 * BIO_DELETE belongs here too: zvol_geom_start()
			 * queues it when the calling thread cannot sleep,
			 * so the worker must dispatch it as well.
			 */
			zvol_strategy(bp);
			break;
		}
	}
}

extern boolean_t dataset_name_hidden(const char *name);

static int
zvol_create_snapshots(objset_t *os, const char *name)
{
	uint64_t cookie, obj;
	char *sname;
	int error, len;

	cookie = obj = 0;
	sname = kmem_alloc(MAXPATHLEN, KM_SLEEP);

#if 0
	(void) dmu_objset_find(name, dmu_objset_prefetch, NULL,
	    DS_FIND_SNAPSHOTS);
#endif

	for (;;) {
		len = snprintf(sname, MAXPATHLEN, "%s@", name);
		if (len >= MAXPATHLEN) {
			dmu_objset_rele(os, FTAG);
			error = ENAMETOOLONG;
			break;
		}

		dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
		error = dmu_snapshot_list_next(os, MAXPATHLEN - len,
		    sname + len, &obj, &cookie, NULL);
		dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
		if (error != 0) {
			if (error == ENOENT)
				error = 0;
			break;
		}

		if ((error = zvol_create_minor(sname)) != 0) {
			printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n",
			    sname, error);
			break;
		}
	}

	kmem_free(sname, MAXPATHLEN);
	return (error);
}

int
zvol_create_minors(const char *name)
{
	uint64_t cookie;
	objset_t *os;
	char *osname, *p;
	int error, len;

	if (dataset_name_hidden(name))
		return (0);

	if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
		printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n",
		    name, error);
		return (error);
	}
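	/*
	 * For a zvol dataset, swap the pool-config hold for a long hold,
	 * which should keep the dataset from going away while minors for
	 * it and its snapshots are created below.
	 */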
	if (dmu_objset_type(os) == DMU_OST_ZVOL) {
		dsl_dataset_long_hold(os->os_dsl_dataset, FTAG);
		dsl_pool_rele(dmu_objset_pool(os), FTAG);
		if ((error = zvol_create_minor(name)) == 0)
			error = zvol_create_snapshots(os, name);
		else {
			printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n",
			    name, error);
		}
		dsl_dataset_long_rele(os->os_dsl_dataset, FTAG);
		dsl_dataset_rele(os->os_dsl_dataset, FTAG);
		return (error);
	}
	if (dmu_objset_type(os) != DMU_OST_ZFS) {
		dmu_objset_rele(os, FTAG);
		return (0);
	}

	osname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
	if (snprintf(osname, MAXPATHLEN, "%s/", name) >= MAXPATHLEN) {
		dmu_objset_rele(os, FTAG);
		kmem_free(osname, MAXPATHLEN);
		return (ENOENT);
	}
	p = osname + strlen(osname);
	len = MAXPATHLEN - (p - osname);

#if 0
	/* Prefetch the datasets. */
	cookie = 0;
	while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0) {
		if (!dataset_name_hidden(osname))
			(void) dmu_objset_prefetch(osname, NULL);
	}
#endif

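	/*
	 * Recurse into each child dataset.  The hold on the parent objset
	 * is dropped across the recursive call and re-taken afterwards,
	 * since zvol_create_minors() acquires holds of its own.
	 */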
	cookie = 0;
	while (dmu_dir_list_next(os, MAXPATHLEN - (p - osname), p, NULL,
	    &cookie) == 0) {
		dmu_objset_rele(os, FTAG);
		(void) zvol_create_minors(osname);
		if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
			printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n",
			    name, error);
			/* Don't leak osname on the error path. */
			kmem_free(osname, MAXPATHLEN);
			return (error);
		}
	}

	dmu_objset_rele(os, FTAG);
	kmem_free(osname, MAXPATHLEN);
	return (0);
}

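/*
 * GEOM providers cannot be renamed in place, so a rename is performed
 * by withering the old provider and creating a fresh one with the new
 * name on the same geom.
 */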
static void
zvol_rename_minor(struct g_geom *gp, const char *newname)
{
	struct g_provider *pp;
	zvol_state_t *zv;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	g_topology_assert();

	pp = LIST_FIRST(&gp->provider);
	ASSERT(pp != NULL);
	zv = pp->private;
	ASSERT(zv != NULL);

	zv->zv_provider = NULL;
	g_wither_provider(pp, ENXIO);

	pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
	pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
	pp->sectorsize = DEV_BSIZE;
	pp->mediasize = zv->zv_volsize;
	pp->private = zv;
	zv->zv_provider = pp;
	strlcpy(zv->zv_name, newname, sizeof(zv->zv_name));
	g_error_provider(pp, 0);
}

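/*
 * Invoked when a dataset is renamed, e.g.
 *   # zfs rename tank/vol tank/newvol
 * Renames the provider of the dataset itself as well as the providers
 * of all descendants and snapshots (tank/vol@snap -> tank/newvol@snap).
 */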
void
zvol_rename_minors(const char *oldname, const char *newname)
{
	char name[MAXPATHLEN];
	struct g_provider *pp;
	struct g_geom *gp;
	size_t oldnamelen, newnamelen;
	zvol_state_t *zv;

	oldnamelen = strlen(oldname);
	newnamelen = strlen(newname);

	DROP_GIANT();
	mutex_enter(&spa_namespace_lock);
	g_topology_lock();

	LIST_FOREACH(gp, &zfs_zvol_class.geom, geom) {
		pp = LIST_FIRST(&gp->provider);
		if (pp == NULL)
			continue;
		zv = pp->private;
		if (zv == NULL)
			continue;
		if (strcmp(zv->zv_name, oldname) == 0) {
			zvol_rename_minor(gp, newname);
		} else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 &&
		    (zv->zv_name[oldnamelen] == '/' ||
		     zv->zv_name[oldnamelen] == '@')) {
			snprintf(name, sizeof(name), "%s%c%s", newname,
			    zv->zv_name[oldnamelen],
			    zv->zv_name + oldnamelen + 1);
			zvol_rename_minor(gp, name);
		}
	}

	g_topology_unlock();
	mutex_exit(&spa_namespace_lock);
	PICKUP_GIANT();
}
