zvol.c revision 263390
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 *
 * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 * Copyright (c) 2013 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 */

/* Portions Copyright 2010 Robert Milkowski */
/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */

/*
 * ZFS volume emulation driver.
 *
 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
 * Volumes are accessed through the symbolic links named:
 *
 * /dev/zvol/dsk/<pool_name>/<dataset_name>
 * /dev/zvol/rdsk/<pool_name>/<dataset_name>
 *
 * These links are created by the /dev filesystem (sdev_zvolops.c).
 * Volumes are persistent through reboot.  No user command needs to be
 * run before opening and using a device.
 *
 * FreeBSD notes.
 * On FreeBSD ZVOLs are simply GEOM providers like any other storage device
 * in the system.
 */
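
/*
 * Illustrative note (not from this file): after "zfs create -V 10g
 * tank/vol", FreeBSD exposes the volume as the GEOM provider
 * /dev/zvol/tank/vol; the dsk/rdsk links above describe the illumos
 * naming scheme.
 */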

#include <sys/types.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/errno.h>
#include <sys/uio.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/zap.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/dmu_traverse.h>
#include <sys/dnode.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dkio.h>
#include <sys/byteorder.h>
#include <sys/sunddi.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_ioctl.h>
#include <sys/zil.h>
#include <sys/refcount.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_rlock.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_raidz.h>
#include <sys/zvol.h>
#include <sys/zil_impl.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/zfeature.h>
#include <sys/zio_checksum.h>

#include <geom/geom.h>

#include "zfs_namecheck.h"

struct g_class zfs_zvol_class = {
	.name = "ZFS::ZVOL",
	.version = G_VERSION,
};

DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);

void *zfsdev_state;
static char *zvol_tag = "zvol_tag";

#define	ZVOL_DUMPSIZE		"dumpsize"

/*
 * The spa_namespace_lock protects the zfsdev_state structure from being
 * modified while it's being used, e.g. an open that comes in before a
 * create finishes.  It also protects temporary opens of the dataset so that,
 * e.g., an open doesn't get a spurious EBUSY.
 */
static uint32_t zvol_minors;

typedef struct zvol_extent {
	list_node_t	ze_node;
	dva_t		ze_dva;		/* dva associated with this extent */
	uint64_t	ze_nblks;	/* number of blocks in extent */
} zvol_extent_t;

/*
 * The in-core state of each volume.
 */
typedef struct zvol_state {
	char		zv_name[MAXPATHLEN]; /* pool/dd name */
	uint64_t	zv_volsize;	/* amount of space we advertise */
	uint64_t	zv_volblocksize; /* volume block size */
	struct g_provider *zv_provider;	/* GEOM provider */
	uint8_t		zv_min_bs;	/* minimum addressable block shift */
	uint8_t		zv_flags;	/* readonly, dumpified, etc. */
	objset_t	*zv_objset;	/* objset handle */
	uint32_t	zv_total_opens;	/* total open count */
	zilog_t		*zv_zilog;	/* ZIL handle */
	list_t		zv_extents;	/* List of extents for dump */
	znode_t		zv_znode;	/* for range locking */
	dmu_buf_t	*zv_dbuf;	/* bonus handle */
	int		zv_state;
	struct bio_queue_head zv_queue;
	struct mtx	zv_queue_mtx;	/* zv_queue mutex */
} zvol_state_t;

/*
 * zvol specific flags
 */
#define	ZVOL_RDONLY	0x1
#define	ZVOL_DUMPIFIED	0x2
#define	ZVOL_EXCL	0x4
#define	ZVOL_WCE	0x8

/*
 * zvol maximum transfer in one DMU tx.
 */
int zvol_maxphys = DMU_MAX_ACCESS/2;
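
/*
 * Note: zvol_maxphys bounds the size of each individual dmu_read()/
 * dmu_write() issued from the transfer loop in zvol_strategy() below.
 */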

extern int zfs_set_prop_nvlist(const char *, zprop_source_t,
    nvlist_t *, nvlist_t *);
static int zvol_remove_zv(zvol_state_t *);
static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio);
static int zvol_dumpify(zvol_state_t *zv);
static int zvol_dump_fini(zvol_state_t *zv);
static int zvol_dump_init(zvol_state_t *zv, boolean_t resize);

static zvol_state_t *zvol_geom_create(const char *name);
static void zvol_geom_run(zvol_state_t *zv);
static void zvol_geom_destroy(zvol_state_t *zv);
static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
static void zvol_geom_start(struct bio *bp);
static void zvol_geom_worker(void *arg);

static void
zvol_size_changed(zvol_state_t *zv)
{
#ifdef sun
	dev_t dev = makedevice(maj, min);

	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
	    "Size", volsize) == DDI_SUCCESS);
	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
	    "Nblocks", lbtodb(volsize)) == DDI_SUCCESS);

	/* Notify specfs to invalidate the cached size */
	spec_size_invalidate(dev, VBLK);
	spec_size_invalidate(dev, VCHR);
#else	/* !sun */
	struct g_provider *pp;

	pp = zv->zv_provider;
	if (pp == NULL)
		return;
	g_topology_lock();
	g_resize_provider(pp, zv->zv_volsize);
	g_topology_unlock();
#endif	/* !sun */
}

int
zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
{
	if (volsize == 0)
		return (SET_ERROR(EINVAL));

	if (volsize % blocksize != 0)
		return (SET_ERROR(EINVAL));

#ifdef _ILP32
	if (volsize - 1 > SPEC_MAXOFFSET_T)
		return (SET_ERROR(EOVERFLOW));
#endif
	return (0);
}
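
/*
 * Illustrative: with an 8 KiB block size, zvol_check_volsize(1ULL << 30,
 * 8192) returns 0, while zvol_check_volsize((1ULL << 30) + 512, 8192)
 * fails with EINVAL because the size is not a multiple of the block size.
 */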

int
zvol_check_volblocksize(uint64_t volblocksize)
{
	if (volblocksize < SPA_MINBLOCKSIZE ||
	    volblocksize > SPA_MAXBLOCKSIZE ||
	    !ISP2(volblocksize))
		return (SET_ERROR(EDOM));

	return (0);
}
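
/*
 * Illustrative: any power of two between SPA_MINBLOCKSIZE and
 * SPA_MAXBLOCKSIZE (e.g. 512, 4096, 8192) is accepted; a value such as
 * 3000 fails with EDOM because it is not a power of two.
 */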

int
zvol_get_stats(objset_t *os, nvlist_t *nv)
{
	int error;
	dmu_object_info_t doi;
	uint64_t val;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
	if (error)
		return (error);

	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);

	error = dmu_object_info(os, ZVOL_OBJ, &doi);

	if (error == 0) {
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
		    doi.doi_data_block_size);
	}

	return (error);
}

static zvol_state_t *
zvol_minor_lookup(const char *name)
{
	struct g_provider *pp;
	struct g_geom *gp;
	zvol_state_t *zv = NULL;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	g_topology_lock();
	LIST_FOREACH(gp, &zfs_zvol_class.geom, geom) {
		pp = LIST_FIRST(&gp->provider);
		if (pp == NULL)
			continue;
		zv = pp->private;
		if (zv == NULL)
			continue;
		if (strcmp(zv->zv_name, name) == 0)
			break;
	}
	g_topology_unlock();

	return (gp != NULL ? zv : NULL);
}

/* extent mapping arg */
struct maparg {
	zvol_state_t	*ma_zv;
	uint64_t	ma_blks;
};

/*ARGSUSED*/
static int
zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
	struct maparg *ma = arg;
	zvol_extent_t *ze;
	int bs = ma->ma_zv->zv_volblocksize;

	if (bp == NULL || zb->zb_object != ZVOL_OBJ || zb->zb_level != 0)
		return (0);

	VERIFY3U(ma->ma_blks, ==, zb->zb_blkid);
	ma->ma_blks++;

	/* Abort immediately if we have encountered gang blocks */
	if (BP_IS_GANG(bp))
		return (SET_ERROR(EFRAGS));

	/*
	 * See if the block is at the end of the previous extent.
	 */
	ze = list_tail(&ma->ma_zv->zv_extents);
	if (ze &&
	    DVA_GET_VDEV(BP_IDENTITY(bp)) == DVA_GET_VDEV(&ze->ze_dva) &&
	    DVA_GET_OFFSET(BP_IDENTITY(bp)) ==
	    DVA_GET_OFFSET(&ze->ze_dva) + ze->ze_nblks * bs) {
		ze->ze_nblks++;
		return (0);
	}

	dprintf_bp(bp, "%s", "next blkptr:");

	/* start a new extent */
	ze = kmem_zalloc(sizeof (zvol_extent_t), KM_SLEEP);
	ze->ze_dva = bp->blk_dva[0];	/* structure assignment */
	ze->ze_nblks = 1;
	list_insert_tail(&ma->ma_zv->zv_extents, ze);
	return (0);
}
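
/*
 * Illustrative: if block N maps to (vdev 0, offset X) and block N+1 maps
 * to (vdev 0, offset X + volblocksize), the second block simply extends
 * the tail extent (ze_nblks++); any discontiguity starts a new extent.
 */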

static void
zvol_free_extents(zvol_state_t *zv)
{
	zvol_extent_t *ze;

	while ((ze = list_head(&zv->zv_extents)) != NULL) {
		list_remove(&zv->zv_extents, ze);
		kmem_free(ze, sizeof (zvol_extent_t));
	}
}

static int
zvol_get_lbas(zvol_state_t *zv)
{
	objset_t *os = zv->zv_objset;
	struct maparg	ma;
	int		err;

	ma.ma_zv = zv;
	ma.ma_blks = 0;
	zvol_free_extents(zv);

	/* commit any in-flight changes before traversing the dataset */
	txg_wait_synced(dmu_objset_pool(os), 0);
	err = traverse_dataset(dmu_objset_ds(os), 0,
	    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, zvol_map_block, &ma);
	if (err || ma.ma_blks != (zv->zv_volsize / zv->zv_volblocksize)) {
		zvol_free_extents(zv);
		return (err ? err : EIO);
	}

	return (0);
}

/* ARGSUSED */
void
zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
{
	zfs_creat_t *zct = arg;
	nvlist_t *nvprops = zct->zct_props;
	int error;
	uint64_t volblocksize, volsize;

	VERIFY(nvlist_lookup_uint64(nvprops,
	    zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
	if (nvlist_lookup_uint64(nvprops,
	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
		volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);

	/*
	 * These properties must be removed from the list so the generic
	 * property setting step won't apply to them.
	 */
	VERIFY(nvlist_remove_all(nvprops,
	    zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
	(void) nvlist_remove_all(nvprops,
	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));

	error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
	    DMU_OT_NONE, 0, tx);
	ASSERT(error == 0);

	error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
	    DMU_OT_NONE, 0, tx);
	ASSERT(error == 0);

	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
	ASSERT(error == 0);
}

/*
 * Replay a TX_WRITE ZIL transaction that didn't get committed
 * after a system failure
 */
static int
zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
{
	objset_t *os = zv->zv_objset;
	char *data = (char *)(lr + 1);	/* data follows lr_write_t */
	uint64_t offset, length;
	dmu_tx_t *tx;
	int error;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	offset = lr->lr_offset;
	length = lr->lr_length;

	/* If it's a dmu_sync() block, write the whole block */
	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
		uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
		if (length < blocksize) {
			offset -= offset % blocksize;
			length = blocksize;
		}
	}

	tx = dmu_tx_create(os);
	dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
	} else {
		dmu_write(os, ZVOL_OBJ, offset, length, data, tx);
		dmu_tx_commit(tx);
	}

	return (error);
}
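
/*
 * Illustrative: if an indirect record describes less than a full block,
 * the rounding above rewrites the whole block; e.g. a 512-byte record at
 * offset 12800 on an 8 KiB-block volume is replayed as the full 8 KiB
 * block at offset 8192.
 */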

/* ARGSUSED */
static int
zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap)
{
	return (SET_ERROR(ENOTSUP));
}

/*
 * Callback vectors for replaying records.
 * Only TX_WRITE is needed for zvol.
 */
zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
	zvol_replay_err,	/* 0 no such transaction type */
	zvol_replay_err,	/* TX_CREATE */
	zvol_replay_err,	/* TX_MKDIR */
	zvol_replay_err,	/* TX_MKXATTR */
	zvol_replay_err,	/* TX_SYMLINK */
	zvol_replay_err,	/* TX_REMOVE */
	zvol_replay_err,	/* TX_RMDIR */
	zvol_replay_err,	/* TX_LINK */
	zvol_replay_err,	/* TX_RENAME */
	zvol_replay_write,	/* TX_WRITE */
	zvol_replay_err,	/* TX_TRUNCATE */
	zvol_replay_err,	/* TX_SETATTR */
	zvol_replay_err,	/* TX_ACL */
	zvol_replay_err,	/* TX_CREATE_ACL */
	zvol_replay_err,	/* TX_CREATE_ATTR */
	zvol_replay_err,	/* TX_CREATE_ACL_ATTR */
	zvol_replay_err,	/* TX_MKDIR_ACL */
	zvol_replay_err,	/* TX_MKDIR_ATTR */
	zvol_replay_err,	/* TX_MKDIR_ACL_ATTR */
	zvol_replay_err,	/* TX_WRITE2 */
};

#ifdef sun
int
zvol_name2minor(const char *name, minor_t *minor)
{
	zvol_state_t *zv;

	mutex_enter(&spa_namespace_lock);
	zv = zvol_minor_lookup(name);
	if (minor && zv)
		*minor = zv->zv_minor;
	mutex_exit(&spa_namespace_lock);
	return (zv ? 0 : -1);
}
#endif	/* sun */

/*
 * Create a minor node (plus a whole lot more) for the specified volume.
 */
int
zvol_create_minor(const char *name)
{
	zfs_soft_state_t *zs;
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t doi;
	uint64_t volsize;
	int error;

	ZFS_LOG(1, "Creating ZVOL %s...", name);

	mutex_enter(&spa_namespace_lock);

	if (zvol_minor_lookup(name) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (SET_ERROR(EEXIST));
	}

	/* lie and say we're read-only */
	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os);

	if (error) {
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

#ifdef sun
	if ((minor = zfsdev_minor_alloc()) == 0) {
		dmu_objset_disown(os, FTAG);
		mutex_exit(&spa_namespace_lock);
		return (SET_ERROR(ENXIO));
	}

	if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) {
		dmu_objset_disown(os, FTAG);
		mutex_exit(&spa_namespace_lock);
		return (SET_ERROR(EAGAIN));
	}
	(void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME,
	    (char *)name);

	(void) snprintf(chrbuf, sizeof (chrbuf), "%u,raw", minor);

	if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR,
	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
		ddi_soft_state_free(zfsdev_state, minor);
		dmu_objset_disown(os, FTAG);
		mutex_exit(&spa_namespace_lock);
		return (SET_ERROR(EAGAIN));
	}

	(void) snprintf(blkbuf, sizeof (blkbuf), "%u", minor);

	if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK,
	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
		ddi_remove_minor_node(zfs_dip, chrbuf);
		ddi_soft_state_free(zfsdev_state, minor);
		dmu_objset_disown(os, FTAG);
		mutex_exit(&spa_namespace_lock);
		return (SET_ERROR(EAGAIN));
	}

	zs = ddi_get_soft_state(zfsdev_state, minor);
	zs->zss_type = ZSST_ZVOL;
	zv = zs->zss_data = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
#else	/* !sun */

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error) {
		ASSERT(error == 0);
		dmu_objset_disown(os, zvol_tag);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	DROP_GIANT();
	g_topology_lock();
	zv = zvol_geom_create(name);
	zv->zv_volsize = volsize;
	zv->zv_provider->mediasize = zv->zv_volsize;

#endif	/* !sun */

	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
	zv->zv_min_bs = DEV_BSHIFT;
	zv->zv_objset = os;
	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
		zv->zv_flags |= ZVOL_RDONLY;
	mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
	    sizeof (rl_t), offsetof(rl_t, r_node));
	list_create(&zv->zv_extents, sizeof (zvol_extent_t),
	    offsetof(zvol_extent_t, ze_node));
	/* get and cache the blocksize */
	error = dmu_object_info(os, ZVOL_OBJ, &doi);
	ASSERT(error == 0);
	zv->zv_volblocksize = doi.doi_data_block_size;

	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			zil_destroy(dmu_objset_zil(os), B_FALSE);
		else
			zil_replay(os, zv, zvol_replay_vector);
	}
	dmu_objset_disown(os, FTAG);
	zv->zv_objset = NULL;

	zvol_minors++;

	mutex_exit(&spa_namespace_lock);

	zvol_geom_run(zv);

	g_topology_unlock();
	PICKUP_GIANT();

	ZFS_LOG(1, "ZVOL %s created.", name);

	return (0);
}

/*
 * Remove minor node for the specified volume.
 */
static int
zvol_remove_zv(zvol_state_t *zv)
{
#ifdef sun
	minor_t minor = zv->zv_minor;
#endif

	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	if (zv->zv_total_opens != 0)
		return (SET_ERROR(EBUSY));

	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);

#ifdef sun
	(void) snprintf(nmbuf, sizeof (nmbuf), "%u,raw", minor);
	ddi_remove_minor_node(zfs_dip, nmbuf);
#endif	/* sun */

	avl_destroy(&zv->zv_znode.z_range_avl);
	mutex_destroy(&zv->zv_znode.z_range_lock);

	zvol_geom_destroy(zv);

	zvol_minors--;
	return (0);
}

int
zvol_remove_minor(const char *name)
{
	zvol_state_t *zv;
	int rc;

	mutex_enter(&spa_namespace_lock);
	if ((zv = zvol_minor_lookup(name)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (SET_ERROR(ENXIO));
	}
	g_topology_lock();
	rc = zvol_remove_zv(zv);
	g_topology_unlock();
	mutex_exit(&spa_namespace_lock);
	return (rc);
}

int
zvol_first_open(zvol_state_t *zv)
{
	objset_t *os;
	uint64_t volsize;
	int error;
	uint64_t readonly;

	/* lie and say we're read-only */
	error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, B_TRUE,
	    zvol_tag, &os);
	if (error)
		return (error);

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error) {
		ASSERT(error == 0);
		dmu_objset_disown(os, zvol_tag);
		return (error);
	}
	zv->zv_objset = os;
	error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf);
	if (error) {
		dmu_objset_disown(os, zvol_tag);
		return (error);
	}
	zv->zv_volsize = volsize;
	zv->zv_zilog = zil_open(os, zvol_get_data);
	zvol_size_changed(zv);

	VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &readonly,
	    NULL) == 0);
	if (readonly || dmu_objset_is_snapshot(os) ||
	    !spa_writeable(dmu_objset_spa(os)))
		zv->zv_flags |= ZVOL_RDONLY;
	else
		zv->zv_flags &= ~ZVOL_RDONLY;
	return (error);
}

void
zvol_last_close(zvol_state_t *zv)
{
	zil_close(zv->zv_zilog);
	zv->zv_zilog = NULL;

	dmu_buf_rele(zv->zv_dbuf, zvol_tag);
	zv->zv_dbuf = NULL;

	/*
	 * Evict cached data
	 */
	if (dsl_dataset_is_dirty(dmu_objset_ds(zv->zv_objset)) &&
	    !(zv->zv_flags & ZVOL_RDONLY))
		txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
	dmu_objset_evict_dbufs(zv->zv_objset);

	dmu_objset_disown(zv->zv_objset, zvol_tag);
	zv->zv_objset = NULL;
}

#ifdef sun
int
zvol_prealloc(zvol_state_t *zv)
{
	objset_t *os = zv->zv_objset;
	dmu_tx_t *tx;
	uint64_t refd, avail, usedobjs, availobjs;
	uint64_t resid = zv->zv_volsize;
	uint64_t off = 0;

	/* Check the space usage before attempting to allocate the space */
	dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs);
	if (avail < zv->zv_volsize)
		return (SET_ERROR(ENOSPC));

	/* Free old extents if they exist */
	zvol_free_extents(zv);

	while (resid != 0) {
		int error;
		uint64_t bytes = MIN(resid, SPA_MAXBLOCKSIZE);

		tx = dmu_tx_create(os);
		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			(void) dmu_free_long_range(os, ZVOL_OBJ, 0, off);
			return (error);
		}
		dmu_prealloc(os, ZVOL_OBJ, off, bytes, tx);
		dmu_tx_commit(tx);
		off += bytes;
		resid -= bytes;
	}
	txg_wait_synced(dmu_objset_pool(os), 0);

	return (0);
}
#endif	/* sun */

static int
zvol_update_volsize(objset_t *os, uint64_t volsize)
{
	dmu_tx_t *tx;
	int error;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	tx = dmu_tx_create(os);
	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		return (error);
	}

	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1,
	    &volsize, tx);
	dmu_tx_commit(tx);

	if (error == 0)
		error = dmu_free_long_range(os,
		    ZVOL_OBJ, volsize, DMU_OBJECT_END);
	return (error);
}
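
/*
 * Note: on a shrink, the dmu_free_long_range() call above releases any
 * blocks beyond the new end of the volume once the new "size" entry has
 * been committed.
 */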

void
zvol_remove_minors(const char *name)
{
	struct g_geom *gp, *gptmp;
	struct g_provider *pp;
	zvol_state_t *zv;
	size_t namelen;

	namelen = strlen(name);

	DROP_GIANT();
	mutex_enter(&spa_namespace_lock);
	g_topology_lock();

	LIST_FOREACH_SAFE(gp, &zfs_zvol_class.geom, geom, gptmp) {
		pp = LIST_FIRST(&gp->provider);
		if (pp == NULL)
			continue;
		zv = pp->private;
		if (zv == NULL)
			continue;
		if (strcmp(zv->zv_name, name) == 0 ||
		    (strncmp(zv->zv_name, name, namelen) == 0 &&
		     zv->zv_name[namelen] == '/')) {
			(void) zvol_remove_zv(zv);
		}
	}

	g_topology_unlock();
	mutex_exit(&spa_namespace_lock);
	PICKUP_GIANT();
}

int
zvol_set_volsize(const char *name, major_t maj, uint64_t volsize)
{
	zvol_state_t *zv = NULL;
	objset_t *os;
	int error;
	dmu_object_info_t doi;
	uint64_t old_volsize = 0ULL;
	uint64_t readonly;

	mutex_enter(&spa_namespace_lock);
	zv = zvol_minor_lookup(name);
	if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 ||
	    (error = zvol_check_volsize(volsize,
	    doi.doi_data_block_size)) != 0)
		goto out;

	VERIFY(dsl_prop_get_integer(name, "readonly", &readonly,
	    NULL) == 0);
	if (readonly) {
		error = EROFS;
		goto out;
	}

	error = zvol_update_volsize(os, volsize);
	/*
	 * Reinitialize the dump area to the new size. If we
	 * failed to resize the dump area then restore it back to
	 * its original size.
	 */
	if (zv && error == 0) {
#ifdef ZVOL_DUMP
		if (zv->zv_flags & ZVOL_DUMPIFIED) {
			old_volsize = zv->zv_volsize;
			zv->zv_volsize = volsize;
			if ((error = zvol_dumpify(zv)) != 0 ||
			    (error = dumpvp_resize()) != 0) {
				(void) zvol_update_volsize(os, old_volsize);
				zv->zv_volsize = old_volsize;
				error = zvol_dumpify(zv);
			}
		}
#endif	/* ZVOL_DUMP */
		if (error == 0) {
			zv->zv_volsize = volsize;
			zvol_size_changed(zv);
		}
	}

#ifdef sun
	/*
	 * Generate a LUN expansion event.
	 */
	if (zv && error == 0) {
		sysevent_id_t eid;
		nvlist_t *attr;
		char *physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);

		(void) snprintf(physpath, MAXPATHLEN, "%s%u", ZVOL_PSEUDO_DEV,
		    zv->zv_minor);

		VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);

		(void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
		    ESC_DEV_DLE, attr, &eid, DDI_SLEEP);

		nvlist_free(attr);
		kmem_free(physpath, MAXPATHLEN);
	}
#endif	/* sun */

out:
	dmu_objset_rele(os, FTAG);

	mutex_exit(&spa_namespace_lock);

	return (error);
}

/*ARGSUSED*/
static int
zvol_open(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	int err = 0;
	boolean_t locked = B_FALSE;

	/*
	 * Protect against recursively entering spa_namespace_lock
	 * when spa_open() is used for a pool backed by a (local) ZVOL.
	 * This is needed since we replaced the upstream zfsdev_state_lock
	 * with spa_namespace_lock in the ZVOL code.
	 * We are using the same trick as spa_open().
	 * Note that calls in zvol_first_open which need to resolve
	 * the pool name to a spa object will enter spa_open()
	 * recursively, but that function already has all the
	 * necessary protection.
	 */
	if (!MUTEX_HELD(&spa_namespace_lock)) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	zv = pp->private;
	if (zv == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (SET_ERROR(ENXIO));
	}

	if (zv->zv_total_opens == 0)
		err = zvol_first_open(zv);
	if (err) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (err);
	}
	if ((flag & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
		err = SET_ERROR(EROFS);
		goto out;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out;
	}
#ifdef FEXCL
	if (flag & FEXCL) {
		if (zv->zv_total_opens != 0) {
			err = SET_ERROR(EBUSY);
			goto out;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}
#endif

	zv->zv_total_opens += count;
	if (locked)
		mutex_exit(&spa_namespace_lock);

	return (err);
out:
	if (zv->zv_total_opens == 0)
		zvol_last_close(zv);
	if (locked)
		mutex_exit(&spa_namespace_lock);
	return (err);
}

/*ARGSUSED*/
static int
zvol_close(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	int error = 0;
	boolean_t locked = B_FALSE;

	/* See comment in zvol_open(). */
	if (!MUTEX_HELD(&spa_namespace_lock)) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	zv = pp->private;
	if (zv == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (SET_ERROR(ENXIO));
	}

	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT(zv->zv_total_opens == 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT(zv->zv_total_opens != 0);

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_total_opens -= count;

	if (zv->zv_total_opens == 0)
		zvol_last_close(zv);

	if (locked)
		mutex_exit(&spa_namespace_lock);
	return (error);
}

static void
zvol_get_done(zgd_t *zgd, int error)
{
	if (zgd->zgd_db)
		dmu_buf_rele(zgd->zgd_db, zgd);

	zfs_range_unlock(zgd->zgd_rl);

	if (error == 0 && zgd->zgd_bp)
		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);

	kmem_free(zgd, sizeof (zgd_t));
}

/*
 * Get data to generate a TX_WRITE intent log record.
 */
static int
zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
	zvol_state_t *zv = arg;
	objset_t *os = zv->zv_objset;
	uint64_t object = ZVOL_OBJ;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;	/* length of user data */
	blkptr_t *bp = &lr->lr_blkptr;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error;

	ASSERT(zio != NULL);
	ASSERT(size != 0);

	zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_zilog = zv->zv_zilog;
	zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) {	/* immediate write */
		error = dmu_read(os, object, offset, size, buf,
		    DMU_READ_NO_PREFETCH);
	} else {
		size = zv->zv_volblocksize;
		offset = P2ALIGN(offset, size);
		error = dmu_buf_hold(os, object, offset, zgd, &db,
		    DMU_READ_NO_PREFETCH);
		if (error == 0) {
			blkptr_t *obp = dmu_buf_get_blkptr(db);
			if (obp) {
				ASSERT(BP_IS_HOLE(bp));
				*bp = *obp;
			}

			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zvol_get_done, zgd);

			if (error == 0)
				return (0);
		}
	}

	zvol_get_done(zgd, error);

	return (error);
}

/*
 * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
 *
 * We store data in the log buffers if it's small enough.
 * Otherwise we will later flush the data out via dmu_sync().
 */
ssize_t zvol_immediate_write_sz = 32768;
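
/*
 * Note: the loop in zvol_log_write() picks a write state per chunk: when
 * the volume block size exceeds the immediate-write threshold, no slog is
 * in use, and the chunk covers a whole aligned block, it goes WR_INDIRECT
 * (flushed later via dmu_sync()); otherwise synchronous writes copy the
 * data into the itx (WR_COPIED) and asynchronous ones defer the copy
 * (WR_NEED_COPY).
 */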

static void
zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
    boolean_t sync)
{
	uint32_t blocksize = zv->zv_volblocksize;
	zilog_t *zilog = zv->zv_zilog;
	boolean_t slogging;
	ssize_t immediate_write_sz;

	if (zil_replaying(zilog, tx))
		return;

	immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
	    ? 0 : zvol_immediate_write_sz;

	slogging = spa_has_slogs(zilog->zl_spa) &&
	    (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);

	while (resid) {
		itx_t *itx;
		lr_write_t *lr;
		ssize_t len;
		itx_wr_state_t write_state;

		/*
		 * Unlike zfs_log_write(), we can be called with
		 * up to DMU_MAX_ACCESS/2 (5MB) writes.
		 */
		if (blocksize > immediate_write_sz && !slogging &&
		    resid >= blocksize && off % blocksize == 0) {
			write_state = WR_INDIRECT; /* uses dmu_sync */
			len = blocksize;
		} else if (sync) {
			write_state = WR_COPIED;
			len = MIN(ZIL_MAX_LOG_DATA, resid);
		} else {
			write_state = WR_NEED_COPY;
			len = MIN(ZIL_MAX_LOG_DATA, resid);
		}

		itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
		    (write_state == WR_COPIED ? len : 0));
		lr = (lr_write_t *)&itx->itx_lr;
		if (write_state == WR_COPIED && dmu_read(zv->zv_objset,
		    ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
			zil_itx_destroy(itx);
			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
			lr = (lr_write_t *)&itx->itx_lr;
			write_state = WR_NEED_COPY;
		}

		itx->itx_wr_state = write_state;
		if (write_state == WR_NEED_COPY)
			itx->itx_sod += len;
		lr->lr_foid = ZVOL_OBJ;
		lr->lr_offset = off;
		lr->lr_length = len;
		lr->lr_blkoff = 0;
		BP_ZERO(&lr->lr_blkptr);

		itx->itx_private = zv;
		itx->itx_sync = sync;

		zil_itx_assign(zilog, itx, tx);

		off += len;
		resid -= len;
	}
}
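
/*
 * Illustrative: on a 64 KiB-block volume with no separate log device and
 * default logbias, an aligned 128 KiB synchronous write is logged as two
 * 64 KiB WR_INDIRECT itxs (64 KiB > zvol_immediate_write_sz); on an
 * 8 KiB-block volume the block size is under the threshold, so the same
 * write is copied into the log as WR_COPIED records instead.
 */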

#ifdef sun
static int
zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t origoffset,
    uint64_t size, boolean_t doread, boolean_t isdump)
{
	vdev_disk_t *dvd;
	int c;
	int numerrors = 0;

	if (vd->vdev_ops == &vdev_mirror_ops ||
	    vd->vdev_ops == &vdev_replacing_ops ||
	    vd->vdev_ops == &vdev_spare_ops) {
		for (c = 0; c < vd->vdev_children; c++) {
			int err = zvol_dumpio_vdev(vd->vdev_child[c],
			    addr, offset, origoffset, size, doread, isdump);
			if (err != 0) {
				numerrors++;
			} else if (doread) {
				break;
			}
		}
	}

	if (!vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_raidz_ops)
		return (numerrors < vd->vdev_children ? 0 : EIO);

	if (doread && !vdev_readable(vd))
		return (SET_ERROR(EIO));
	else if (!doread && !vdev_writeable(vd))
		return (SET_ERROR(EIO));

	if (vd->vdev_ops == &vdev_raidz_ops) {
		return (vdev_raidz_physio(vd,
		    addr, size, offset, origoffset, doread, isdump));
	}

	offset += VDEV_LABEL_START_SIZE;

	if (ddi_in_panic() || isdump) {
		ASSERT(!doread);
		if (doread)
			return (SET_ERROR(EIO));
		dvd = vd->vdev_tsd;
		ASSERT3P(dvd, !=, NULL);
		return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset),
		    lbtodb(size)));
	} else {
		dvd = vd->vdev_tsd;
		ASSERT3P(dvd, !=, NULL);
		return (vdev_disk_ldi_physio(dvd->vd_lh, addr, size,
		    offset, doread ? B_READ : B_WRITE));
	}
}

static int
zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size,
    boolean_t doread, boolean_t isdump)
{
	vdev_t *vd;
	int error;
	zvol_extent_t *ze;
	spa_t *spa = dmu_objset_spa(zv->zv_objset);

	/* Must be sector aligned, and not straddle a block boundary. */
	if (P2PHASE(offset, DEV_BSIZE) || P2PHASE(size, DEV_BSIZE) ||
	    P2BOUNDARY(offset, size, zv->zv_volblocksize)) {
		return (SET_ERROR(EINVAL));
	}
	ASSERT(size <= zv->zv_volblocksize);

	/* Locate the extent this belongs to */
	ze = list_head(&zv->zv_extents);
	while (offset >= ze->ze_nblks * zv->zv_volblocksize) {
		offset -= ze->ze_nblks * zv->zv_volblocksize;
		ze = list_next(&zv->zv_extents, ze);
	}

	if (ze == NULL)
		return (SET_ERROR(EINVAL));

	if (!ddi_in_panic())
		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

	vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva));
	offset += DVA_GET_OFFSET(&ze->ze_dva);
	error = zvol_dumpio_vdev(vd, addr, offset, DVA_GET_OFFSET(&ze->ze_dva),
	    size, doread, isdump);

	if (!ddi_in_panic())
		spa_config_exit(spa, SCL_STATE, FTAG);

	return (error);
}
#endif	/* sun */

int
zvol_strategy(struct bio *bp)
{
	zvol_state_t *zv = bp->bio_to->private;
	uint64_t off, volsize;
	size_t resid;
	char *addr;
	objset_t *os;
	rl_t *rl;
	int error = 0;
	boolean_t doread = (bp->bio_cmd == BIO_READ);
	boolean_t is_dumpified;
	boolean_t sync;

	if (zv == NULL) {
		g_io_deliver(bp, ENXIO);
		return (0);
	}

	if (bp->bio_cmd != BIO_READ && (zv->zv_flags & ZVOL_RDONLY)) {
		g_io_deliver(bp, EROFS);
		return (0);
	}

	off = bp->bio_offset;
	volsize = zv->zv_volsize;

	os = zv->zv_objset;
	ASSERT(os != NULL);

	addr = bp->bio_data;
	resid = bp->bio_length;

	if (resid > 0 && (off < 0 || off >= volsize)) {
		g_io_deliver(bp, EIO);
		return (0);
	}

#ifdef illumos
	is_dumpified = zv->zv_flags & ZVOL_DUMPIFIED;
#else
	is_dumpified = B_FALSE;
#endif
	sync = !doread && !is_dumpified &&
	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	/*
	 * There must be no buffer changes when doing a dmu_sync() because
	 * we can't change the data whilst calculating the checksum.
	 */
	rl = zfs_range_lock(&zv->zv_znode, off, resid,
	    doread ? RL_READER : RL_WRITER);

	while (resid != 0 && off < volsize) {
		size_t size = MIN(resid, zvol_maxphys);
#ifdef illumos
		if (is_dumpified) {
			size = MIN(size, P2END(off, zv->zv_volblocksize) - off);
			error = zvol_dumpio(zv, addr, off, size,
			    doread, B_FALSE);
		} else if (doread) {
#else
		if (doread) {
#endif
			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
			    DMU_READ_PREFETCH);
		} else {
			dmu_tx_t *tx = dmu_tx_create(os);
			dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
			error = dmu_tx_assign(tx, TXG_WAIT);
			if (error) {
				dmu_tx_abort(tx);
			} else {
				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
				zvol_log_write(zv, tx, off, size, sync);
				dmu_tx_commit(tx);
			}
		}
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
		off += size;
		addr += size;
		resid -= size;
	}
	zfs_range_unlock(rl);

	bp->bio_completed = bp->bio_length - resid;
	if (bp->bio_completed < bp->bio_length)
		bp->bio_error = (off > volsize ? EINVAL : error);

	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	g_io_deliver(bp, 0);

	return (0);
}
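
/*
 * Note: when the volume's sync property is "always", each write bio is
 * both logged through zvol_log_write() and forced to stable storage by
 * the zil_commit() above before the bio is delivered.
 */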

#ifdef sun
/*
 * Set the buffer count to the zvol maximum transfer.
 * Using our own routine instead of the default minphys()
 * means that for larger writes we write bigger buffers on X86
 * (128K instead of 56K) and flush the disk write cache less often
 * (every zvol_maxphys - currently 1MB) instead of minphys (currently
 * 56K on X86 and 128K on sparc).
 */
void
zvol_minphys(struct buf *bp)
{
	if (bp->b_bcount > zvol_maxphys)
		bp->b_bcount = zvol_maxphys;
}

int
zvol_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblocks)
{
	minor_t minor = getminor(dev);
	zvol_state_t *zv;
	int error = 0;
	uint64_t size;
	uint64_t boff;
	uint64_t resid;

	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
	if (zv == NULL)
		return (SET_ERROR(ENXIO));

	if ((zv->zv_flags & ZVOL_DUMPIFIED) == 0)
		return (SET_ERROR(EINVAL));

	boff = ldbtob(blkno);
	resid = ldbtob(nblocks);

	VERIFY3U(boff + resid, <=, zv->zv_volsize);

	while (resid) {
		size = MIN(resid, P2END(boff, zv->zv_volblocksize) - boff);
		error = zvol_dumpio(zv, addr, boff, size, B_FALSE, B_TRUE);
		if (error)
			break;
		boff += size;
		addr += size;
		resid -= size;
	}

	return (error);
}

/*ARGSUSED*/
int
zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
{
	minor_t minor = getminor(dev);
	zvol_state_t *zv;
	uint64_t volsize;
	rl_t *rl;
	int error = 0;

	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
	if (zv == NULL)
		return (SET_ERROR(ENXIO));

	volsize = zv->zv_volsize;
	if (uio->uio_resid > 0 &&
	    (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
		return (SET_ERROR(EIO));

	if (zv->zv_flags & ZVOL_DUMPIFIED) {
		error = physio(zvol_strategy, NULL, dev, B_READ,
		    zvol_minphys, uio);
		return (error);
	}

	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
	    RL_READER);
	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);

		/* don't read past the end */
		if (bytes > volsize - uio->uio_loffset)
			bytes = volsize - uio->uio_loffset;

		error = dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes);
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_range_unlock(rl);
	return (error);
}

/*ARGSUSED*/
int
zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
{
	minor_t minor = getminor(dev);
	zvol_state_t *zv;
	uint64_t volsize;
	rl_t *rl;
	int error = 0;
	boolean_t sync;

	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
	if (zv == NULL)
		return (SET_ERROR(ENXIO));

	volsize = zv->zv_volsize;
	if (uio->uio_resid > 0 &&
	    (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
		return (SET_ERROR(EIO));

	if (zv->zv_flags & ZVOL_DUMPIFIED) {
		error = physio(zvol_strategy, NULL, dev, B_WRITE,
		    zvol_minphys, uio);
		return (error);
	}

	sync = !(zv->zv_flags & ZVOL_WCE) ||
	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);

	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
	    RL_WRITER);
	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
		uint64_t off = uio->uio_loffset;
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* don't write past the end */
			bytes = volsize - off;

		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx);
		if (error == 0)
			zvol_log_write(zv, tx, off, bytes, sync);
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_range_unlock(rl);
	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	return (error);
}

int
zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs)
{
	struct uuid uuid = EFI_RESERVED;
	efi_gpe_t gpe = { 0 };
	uint32_t crc;
	dk_efi_t efi;
	int length;
	char *ptr;

	if (ddi_copyin(arg, &efi, sizeof (dk_efi_t), flag))
		return (SET_ERROR(EFAULT));
	ptr = (char *)(uintptr_t)efi.dki_data_64;
	length = efi.dki_length;
	/*
	 * Some clients may attempt to request a PMBR for the
	 * zvol.  Currently this interface will return EINVAL to
	 * such requests.  These requests could be supported by
	 * adding a check for lba == 0 and consing up an appropriate
	 * PMBR.
	 */
	if (efi.dki_lba < 1 || efi.dki_lba > 2 || length <= 0)
		return (SET_ERROR(EINVAL));

	gpe.efi_gpe_StartingLBA = LE_64(34ULL);
	gpe.efi_gpe_EndingLBA = LE_64((vs >> bs) - 1);
	UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid);

	if (efi.dki_lba == 1) {
		efi_gpt_t gpt = { 0 };

		gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE);
		gpt.efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT);
		gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt));
		gpt.efi_gpt_MyLBA = LE_64(1ULL);
		gpt.efi_gpt_FirstUsableLBA = LE_64(34ULL);
		gpt.efi_gpt_LastUsableLBA = LE_64((vs >> bs) - 1);
		gpt.efi_gpt_PartitionEntryLBA = LE_64(2ULL);
		gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1);
		gpt.efi_gpt_SizeOfPartitionEntry =
		    LE_32(sizeof (efi_gpe_t));
		CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table);
		gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
		CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table);
		gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
		if (ddi_copyout(&gpt, ptr, MIN(sizeof (gpt), length),
		    flag))
			return (SET_ERROR(EFAULT));
		ptr += sizeof (gpt);
		length -= sizeof (gpt);
	}
	if (length > 0 && ddi_copyout(&gpe, ptr, MIN(sizeof (gpe),
	    length), flag))
		return (SET_ERROR(EFAULT));
	return (0);
}
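
/*
 * Illustrative: the emulated label places the GPT header at LBA 1 and a
 * single partition entry at LBA 2; the one partition spans LBA 34 through
 * (vs >> bs) - 1, i.e. the whole volume past the conventional 34-sector
 * label area.
 */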

/*
 * BEGIN entry points to allow external callers access to the volume.
 */
/*
 * Return the volume parameters needed for access from an external caller.
 * These values are invariant as long as the volume is held open.
 */
int
zvol_get_volume_params(minor_t minor, uint64_t *blksize,
    uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl,
    void **rl_hdl, void **bonus_hdl)
{
	zvol_state_t *zv;

	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
	if (zv == NULL)
		return (SET_ERROR(ENXIO));
	if (zv->zv_flags & ZVOL_DUMPIFIED)
		return (SET_ERROR(ENXIO));

	ASSERT(blksize && max_xfer_len && minor_hdl &&
	    objset_hdl && zil_hdl && rl_hdl && bonus_hdl);

	*blksize = zv->zv_volblocksize;
	*max_xfer_len = (uint64_t)zvol_maxphys;
	*minor_hdl = zv;
	*objset_hdl = zv->zv_objset;
	*zil_hdl = zv->zv_zilog;
	*rl_hdl = &zv->zv_znode;
	*bonus_hdl = zv->zv_dbuf;
	return (0);
}

/*
 * Return the current volume size to an external caller.
 * The size can change while the volume is open.
 */
uint64_t
zvol_get_volume_size(void *minor_hdl)
{
	zvol_state_t *zv = minor_hdl;

	return (zv->zv_volsize);
}

/*
 * Return the current WCE setting to an external caller.
 * The WCE setting can change while the volume is open.
 */
int
zvol_get_volume_wce(void *minor_hdl)
{
	zvol_state_t *zv = minor_hdl;

	return ((zv->zv_flags & ZVOL_WCE) ? 1 : 0);
}

/*
 * Entry point for external callers to zvol_log_write
 */
void
zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off, ssize_t resid,
    boolean_t sync)
{
	zvol_state_t *zv = minor_hdl;

	zvol_log_write(zv, tx, off, resid, sync);
}
/*
 * END entry points to allow external callers access to the volume.
 */

/*
 * Dirtbag ioctls to support mkfs(1M) for UFS filesystems.  See dkio(7I).
 */
/*ARGSUSED*/
int
zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
{
	zvol_state_t *zv;
	struct dk_cinfo dki;
	struct dk_minfo dkm;
	struct dk_callback *dkc;
	int error = 0;
	rl_t *rl;

	mutex_enter(&spa_namespace_lock);

	zv = zfsdev_get_soft_state(getminor(dev), ZSST_ZVOL);

	if (zv == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (SET_ERROR(ENXIO));
	}
	ASSERT(zv->zv_total_opens > 0);

	switch (cmd) {

	case DKIOCINFO:
		bzero(&dki, sizeof (dki));
		(void) strcpy(dki.dki_cname, "zvol");
		(void) strcpy(dki.dki_dname, "zvol");
		dki.dki_ctype = DKC_UNKNOWN;
		dki.dki_unit = getminor(dev);
		dki.dki_maxtransfer = 1 << (SPA_MAXBLOCKSHIFT - zv->zv_min_bs);
		mutex_exit(&spa_namespace_lock);
		if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag))
			error = SET_ERROR(EFAULT);
		return (error);

	case DKIOCGMEDIAINFO:
		bzero(&dkm, sizeof (dkm));
		dkm.dki_lbsize = 1U << zv->zv_min_bs;
		dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
		dkm.dki_media_type = DK_UNKNOWN;
		mutex_exit(&spa_namespace_lock);
		if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag))
			error = SET_ERROR(EFAULT);
		return (error);

	case DKIOCGETEFI:
		{
			uint64_t vs = zv->zv_volsize;
			uint8_t bs = zv->zv_min_bs;

			mutex_exit(&spa_namespace_lock);
			error = zvol_getefi((void *)arg, flag, vs, bs);
			return (error);
		}

	case DKIOCFLUSHWRITECACHE:
		dkc = (struct dk_callback *)arg;
		mutex_exit(&spa_namespace_lock);
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
		if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) {
			(*dkc->dkc_callback)(dkc->dkc_cookie, error);
			error = 0;
		}
		return (error);

	case DKIOCGETWCE:
		{
			int wce = (zv->zv_flags & ZVOL_WCE) ? 1 : 0;
			if (ddi_copyout(&wce, (void *)arg, sizeof (int),
			    flag))
				error = SET_ERROR(EFAULT);
			break;
		}
	case DKIOCSETWCE:
		{
			int wce;
			if (ddi_copyin((void *)arg, &wce, sizeof (int),
			    flag)) {
				error = SET_ERROR(EFAULT);
				break;
			}
			if (wce) {
				zv->zv_flags |= ZVOL_WCE;
				mutex_exit(&spa_namespace_lock);
			} else {
				zv->zv_flags &= ~ZVOL_WCE;
				mutex_exit(&spa_namespace_lock);
				zil_commit(zv->zv_zilog, ZVOL_OBJ);
			}
			return (0);
		}

	case DKIOCGGEOM:
	case DKIOCGVTOC:
		/*
		 * commands using these (like prtvtoc) expect ENOTSUP
		 * since we're emulating an EFI label
		 */
		error = SET_ERROR(ENOTSUP);
		break;

	case DKIOCDUMPINIT:
		rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
		    RL_WRITER);
		error = zvol_dumpify(zv);
		zfs_range_unlock(rl);
		break;

	case DKIOCDUMPFINI:
		if (!(zv->zv_flags & ZVOL_DUMPIFIED))
			break;
		rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
		    RL_WRITER);
		error = zvol_dump_fini(zv);
		zfs_range_unlock(rl);
		break;

	case DKIOCFREE:
	{
		dkioc_free_t df;
		dmu_tx_t *tx;

		if (ddi_copyin((void *)arg, &df, sizeof (df), flag)) {
			error = SET_ERROR(EFAULT);
			break;
		}

		/*
		 * Apply Postel's Law to length-checking.  If they overshoot,
		 * just blank out until the end, if there's a need to blank
		 * out anything.
		 */
		if (df.df_start >= zv->zv_volsize)
			break;	/* No need to do anything... */
		if (df.df_start + df.df_length > zv->zv_volsize)
			df.df_length = DMU_OBJECT_END;

		rl = zfs_range_lock(&zv->zv_znode, df.df_start, df.df_length,
		    RL_WRITER);
		tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
		} else {
			zvol_log_truncate(zv, tx, df.df_start,
			    df.df_length, B_TRUE);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    df.df_start, df.df_length);
		}

		zfs_range_unlock(rl);

		if (error == 0) {
			/*
			 * If the write-cache is disabled or 'sync' property
			 * is set to 'always' then treat this as a synchronous
			 * operation (i.e. commit to zil).
			 */
			if (!(zv->zv_flags & ZVOL_WCE) ||
			    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS))
				zil_commit(zv->zv_zilog, ZVOL_OBJ);

			/*
			 * If the caller really wants synchronous writes, and
			 * can't wait for them, don't return until the write
			 * is done.
			 */
			if (df.df_flags & DF_WAIT_SYNC) {
				txg_wait_synced(
				    dmu_objset_pool(zv->zv_objset), 0);
			}
		}
		break;
	}

	default:
		error = SET_ERROR(ENOTTY);
		break;

	}
	mutex_exit(&spa_namespace_lock);
	return (error);
}
#endif	/* sun */

int
zvol_busy(void)
{
	return (zvol_minors != 0);
}

void
zvol_init(void)
{
	VERIFY(ddi_soft_state_init(&zfsdev_state, sizeof (zfs_soft_state_t),
	    1) == 0);
	ZFS_LOG(1, "ZVOL Initialized.");
}

void
zvol_fini(void)
{
	ddi_soft_state_fini(&zfsdev_state);
	ZFS_LOG(1, "ZVOL Deinitialized.");
}

#ifdef sun
/*ARGSUSED*/
static int
zfs_mvdev_dump_feature_check(void *arg, dmu_tx_t *tx)
{
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;

	if (spa_feature_is_active(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP))
		return (1);
	return (0);
}

/*ARGSUSED*/
static void
zfs_mvdev_dump_activate_feature_sync(void *arg, dmu_tx_t *tx)
{
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;

	spa_feature_incr(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, tx);
}

static int
zvol_dump_init(zvol_state_t *zv, boolean_t resize)
{
	dmu_tx_t *tx;
	int error;
	objset_t *os = zv->zv_objset;
	spa_t *spa = dmu_objset_spa(os);
	vdev_t *vd = spa->spa_root_vdev;
	nvlist_t *nv = NULL;
	uint64_t version = spa_version(spa);
	enum zio_checksum checksum;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	ASSERT(vd->vdev_ops == &vdev_root_ops);

	error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 0,
	    DMU_OBJECT_END);
	/* wait for dmu_free_long_range to actually free the blocks */
	txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);

	/*
	 * If the pool on which the dump device is being initialized has more
	 * than one child vdev, check that the MULTI_VDEV_CRASH_DUMP feature is
	 * enabled.  If so, bump that feature's counter to indicate that the
	 * feature is active. We also check the vdev type to handle the
	 * following case:
	 *   # zpool create test raidz disk1 disk2 disk3
	 *   Now have spa_root_vdev->vdev_children == 1 (the raidz vdev),
	 *   the raidz vdev itself has 3 children.
	 */
	if (vd->vdev_children > 1 || vd->vdev_ops == &vdev_raidz_ops) {
		if (!spa_feature_is_enabled(spa,
		    SPA_FEATURE_MULTI_VDEV_CRASH_DUMP))
			return (SET_ERROR(ENOTSUP));
		(void) dsl_sync_task(spa_name(spa),
		    zfs_mvdev_dump_feature_check,
		    zfs_mvdev_dump_activate_feature_sync, NULL, 2);
	}

	tx = dmu_tx_create(os);
	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		return (error);
	}

	/*
	 * If MULTI_VDEV_CRASH_DUMP is active, use the NOPARITY checksum
	 * function.  Otherwise, use the old default -- OFF.
	 */
	checksum = spa_feature_is_active(spa,
	    SPA_FEATURE_MULTI_VDEV_CRASH_DUMP) ? ZIO_CHECKSUM_NOPARITY :
	    ZIO_CHECKSUM_OFF;

	/*
	 * If we are resizing the dump device then we only need to
	 * update the refreservation to match the newly updated
	 * zvol size. Otherwise, we save off the original state of the
	 * zvol so that we can restore it if the zvol is ever undumpified.
	 */
	if (resize) {
		error = zap_update(os, ZVOL_ZAP_OBJ,
		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
		    &zv->zv_volsize, tx);
	} else {
		uint64_t checksum, compress, refresrv, vbs, dedup;

		error = dsl_prop_get_integer(zv->zv_name,
		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL);
		error = error ? error : dsl_prop_get_integer(zv->zv_name,
		    zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum, NULL);
		error = error ? error : dsl_prop_get_integer(zv->zv_name,
		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &refresrv, NULL);
		error = error ? error : dsl_prop_get_integer(zv->zv_name,
		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs, NULL);
		if (version >= SPA_VERSION_DEDUP) {
			error = error ? error :
			    dsl_prop_get_integer(zv->zv_name,
			    zfs_prop_to_name(ZFS_PROP_DEDUP), &dedup, NULL);
		}

		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1,
		    &compress, tx);
		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
		    zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum, tx);
		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
		    &refresrv, tx);
		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
		    &vbs, tx);
		error = error ? error : dmu_object_set_blocksize(
		    os, ZVOL_OBJ, SPA_MAXBLOCKSIZE, 0, tx);
		if (version >= SPA_VERSION_DEDUP) {
			error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
			    zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1,
			    &dedup, tx);
		}
		if (error == 0)
			zv->zv_volblocksize = SPA_MAXBLOCKSIZE;
	}
	dmu_tx_commit(tx);

	/*
	 * We only need to update the zvol's properties if we are
	 * initializing the dump area for the first time.
	 */
	if (!resize) {
		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_uint64(nv,
		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 0) == 0);
		VERIFY(nvlist_add_uint64(nv,
		    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
		    ZIO_COMPRESS_OFF) == 0);
		VERIFY(nvlist_add_uint64(nv,
		    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
		    checksum) == 0);
		if (version >= SPA_VERSION_DEDUP) {
			VERIFY(nvlist_add_uint64(nv,
			    zfs_prop_to_name(ZFS_PROP_DEDUP),
			    ZIO_CHECKSUM_OFF) == 0);
		}

		error = zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
		    nv, NULL);
		nvlist_free(nv);

		if (error)
			return (error);
	}

	/* Allocate the space for the dump */
	error = zvol_prealloc(zv);
	return (error);
}

static int
zvol_dumpify(zvol_state_t *zv)
{
	int error = 0;
	uint64_t dumpsize = 0;
	dmu_tx_t *tx;
	objset_t *os = zv->zv_objset;

	if (zv->zv_flags & ZVOL_RDONLY)
		return (SET_ERROR(EROFS));

	if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE,
	    8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) {
		boolean_t resize = (dumpsize > 0);

		if ((error = zvol_dump_init(zv, resize)) != 0) {
			(void) zvol_dump_fini(zv);
			return (error);
		}
	}

	/*
	 * Build up our lba mapping.
	 */
	error = zvol_get_lbas(zv);
	if (error) {
		(void) zvol_dump_fini(zv);
		return (error);
	}

	tx = dmu_tx_create(os);
	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		(void) zvol_dump_fini(zv);
		return (error);
	}

	zv->zv_flags |= ZVOL_DUMPIFIED;
	error = zap_update(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, 8, 1,
	    &zv->zv_volsize, tx);
	dmu_tx_commit(tx);

	if (error) {
		(void) zvol_dump_fini(zv);
		return (error);
	}

	txg_wait_synced(dmu_objset_pool(os), 0);
	return (0);
}
2069
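/*
 * The transaction dance above follows the standard DMU write pattern:
 * dmu_tx_create() starts a transaction, dmu_tx_hold_*() declares what will
 * be dirtied, dmu_tx_assign(tx, TXG_WAIT) blocks until the tx can join a
 * transaction group, and then exactly one of dmu_tx_commit() or
 * dmu_tx_abort() must follow.  A minimal sketch of the shape (names other
 * than the DMU calls are placeholders):
 *
 *	tx = dmu_tx_create(os);
 *	dmu_tx_hold_zap(tx, some_obj, TRUE, NULL);
 *	error = dmu_tx_assign(tx, TXG_WAIT);
 *	if (error) {
 *		dmu_tx_abort(tx);
 *		return (error);
 *	}
 *	... make the declared modifications ...
 *	dmu_tx_commit(tx);
 */
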
static int
zvol_dump_fini(zvol_state_t *zv)
{
	dmu_tx_t *tx;
	objset_t *os = zv->zv_objset;
	nvlist_t *nv;
	int error = 0;
	uint64_t checksum, compress, refresrv, vbs, dedup;
	uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset));

	/*
	 * Attempt to restore the zvol back to its pre-dumpified state.
	 * This is a best-effort attempt, as it's possible that not all
	 * of these properties were initialized during the dumpify process
	 * (e.g., due to an error in zvol_dump_init).
	 */

	tx = dmu_tx_create(os);
	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		return (error);
	}
	(void) zap_remove(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, tx);
	dmu_tx_commit(tx);

	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
	    zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum);
	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
	    zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &compress);
	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &refresrv);
	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, &vbs);

	VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	(void) nvlist_add_uint64(nv,
	    zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum);
	(void) nvlist_add_uint64(nv,
	    zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress);
	(void) nvlist_add_uint64(nv,
	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv);
	if (version >= SPA_VERSION_DEDUP &&
	    zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
	    zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1, &dedup) == 0) {
		(void) nvlist_add_uint64(nv,
		    zfs_prop_to_name(ZFS_PROP_DEDUP), dedup);
	}
	(void) zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
	    nv, NULL);
	nvlist_free(nv);

	zvol_free_extents(zv);
	zv->zv_flags &= ~ZVOL_DUMPIFIED;
	(void) dmu_free_long_range(os, ZVOL_OBJ, 0, DMU_OBJECT_END);
	/* Wait for dmu_free_long_range() to actually free the blocks. */
	txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
	tx = dmu_tx_create(os);
	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		return (error);
	}
	if (dmu_object_set_blocksize(os, ZVOL_OBJ, vbs, 0, tx) == 0)
		zv->zv_volblocksize = vbs;
	dmu_tx_commit(tx);

	return (0);
}
#endif	/* sun */

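/*
 * For context: on the Solaris/illumos side guarded by the #ifdef sun block
 * above, zvol_dumpify() and zvol_dump_fini() are driven by the dump-device
 * dkio ioctls when an administrator points crash dumps at a zvol, roughly:
 *
 *	# dumpadm -d /dev/zvol/dsk/rpool/dump
 *
 * The exact ioctl plumbing (e.g. DKIOCDUMPINIT/DKIOCDUMPFINI) lives in the
 * Solaris-only ioctl handler; this is mentioned only as a pointer, not as
 * FreeBSD behavior.
 */
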
static zvol_state_t *
zvol_geom_create(const char *name)
{
	struct g_provider *pp;
	struct g_geom *gp;
	zvol_state_t *zv;

	gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
	gp->start = zvol_geom_start;
	gp->access = zvol_geom_access;
	pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
	pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
	pp->sectorsize = DEV_BSIZE;

	zv = kmem_zalloc(sizeof(*zv), KM_SLEEP);
	zv->zv_provider = pp;
	zv->zv_state = 0;
	bioq_init(&zv->zv_queue);
	mtx_init(&zv->zv_queue_mtx, "zvol", NULL, MTX_DEF);

	pp->private = zv;

	return (zv);
}

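/*
 * Example of the naming above: ZVOL_DRIVER is "zvol", so creating a minor
 * for the dataset "tank/vol" (a hypothetical name, for illustration) yields
 * the GEOM provider "zvol/tank/vol", which geom_dev in turn exposes as the
 * device node /dev/zvol/tank/vol.
 */
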
static void
zvol_geom_run(zvol_state_t *zv)
{
	struct g_provider *pp;

	pp = zv->zv_provider;
	g_error_provider(pp, 0);

	kproc_kthread_add(zvol_geom_worker, zv, &zfsproc, NULL, 0, 0,
	    "zfskern", "zvol %s", pp->name + sizeof(ZVOL_DRIVER));
}

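/*
 * Teardown handshake with the worker thread: zv_state is 0 while the worker
 * runs, zvol_geom_destroy() sets it to 1 and sleeps, and the worker
 * acknowledges by setting it to 2 just before exiting.  (The state values
 * are not named constants in this file; the meanings are inferred from the
 * code below and in zvol_geom_worker().)
 */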
static void
zvol_geom_destroy(zvol_state_t *zv)
{
	struct g_provider *pp;

	g_topology_assert();

	mtx_lock(&zv->zv_queue_mtx);
	zv->zv_state = 1;
	wakeup_one(&zv->zv_queue);
	while (zv->zv_state != 2)
		msleep(&zv->zv_state, &zv->zv_queue_mtx, 0, "zvol:w", 0);
	mtx_destroy(&zv->zv_queue_mtx);

	pp = zv->zv_provider;
	zv->zv_provider = NULL;
	pp->private = NULL;
	g_wither_geom(pp->geom, ENXIO);

	kmem_free(zv, sizeof(*zv));
}

static int
zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
{
	int count, error, flags;

	g_topology_assert();

	/*
	 * To keep things simple, we expect either an open or a close, but
	 * not both at the same time.
	 */
	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
	    (acr <= 0 && acw <= 0 && ace <= 0),
	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
	    pp->name, acr, acw, ace));

	if (pp->private == NULL) {
		if (acr <= 0 && acw <= 0 && ace <= 0)
			return (0);
		return (pp->error);
	}

	/*
	 * We don't pass the FEXCL flag to zvol_open()/zvol_close() if
	 * ace != 0, because GEOM already handles that, and handles it a bit
	 * differently.  GEOM allows for multiple read/exclusive consumers,
	 * whereas ZFS allows only one exclusive consumer, no matter whether
	 * it is a reader or a writer.  The GEOM model works better here, so
	 * we leave it to GEOM to decide what to do.
	 */

	count = acr + acw + ace;
	if (count == 0)
		return (0);

	flags = 0;
	if (acr != 0 || ace != 0)
		flags |= FREAD;
	if (acw != 0)
		flags |= FWRITE;

	g_topology_unlock();
	if (count > 0)
		error = zvol_open(pp, flags, count);
	else
		error = zvol_close(pp, flags, -count);
	g_topology_lock();
	return (error);
}

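/*
 * Worked example of the accounting above (hypothetical consumer, for
 * illustration): a consumer opening read/write calls
 * g_access(cp, 1, 1, 0), which arrives here as acr=1, acw=1, ace=0, so
 * count = 2 and flags = FREAD | FWRITE, and we call
 * zvol_open(pp, FREAD | FWRITE, 2).  The matching g_access(cp, -1, -1, 0)
 * gives count = -2 and ends up in zvol_close(pp, FREAD | FWRITE, 2).
 */
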
static void
zvol_geom_start(struct bio *bp)
{
	zvol_state_t *zv;
	boolean_t first;

	zv = bp->bio_to->private;
	ASSERT(zv != NULL);
	switch (bp->bio_cmd) {
	case BIO_FLUSH:
		if (!THREAD_CAN_SLEEP())
			goto enqueue;
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
		g_io_deliver(bp, 0);
		break;
	case BIO_READ:
	case BIO_WRITE:
		if (!THREAD_CAN_SLEEP())
			goto enqueue;
		zvol_strategy(bp);
		break;
	case BIO_GETATTR:
	case BIO_DELETE:
	default:
		g_io_deliver(bp, EOPNOTSUPP);
		break;
	}
	return;

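	/*
	 * Deferred path: bios that arrive in a context that must not sleep
	 * (THREAD_CAN_SLEEP() is false, e.g. direct dispatch from the GEOM
	 * g_up/g_down threads) are queued for the per-volume worker instead.
	 * The worker only sleeps once it has drained the queue, so a wakeup
	 * is needed only when the queue transitions from empty to non-empty.
	 */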
enqueue:
	mtx_lock(&zv->zv_queue_mtx);
	first = (bioq_first(&zv->zv_queue) == NULL);
	bioq_insert_tail(&zv->zv_queue, bp);
	mtx_unlock(&zv->zv_queue_mtx);
	if (first)
		wakeup_one(&zv->zv_queue);
}

static void
zvol_geom_worker(void *arg)
{
	zvol_state_t *zv;
	struct bio *bp;

	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);

	zv = arg;
	for (;;) {
		mtx_lock(&zv->zv_queue_mtx);
		bp = bioq_takefirst(&zv->zv_queue);
		if (bp == NULL) {
			if (zv->zv_state == 1) {
				zv->zv_state = 2;
				wakeup(&zv->zv_state);
				mtx_unlock(&zv->zv_queue_mtx);
				kthread_exit();
			}
			msleep(&zv->zv_queue, &zv->zv_queue_mtx, PRIBIO | PDROP,
			    "zvol:io", 0);
			continue;
		}
		mtx_unlock(&zv->zv_queue_mtx);
		switch (bp->bio_cmd) {
		case BIO_FLUSH:
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
			g_io_deliver(bp, 0);
			break;
		case BIO_READ:
		case BIO_WRITE:
			zvol_strategy(bp);
			break;
		}
	}
}

extern boolean_t dataset_name_hidden(const char *name);

static int
zvol_create_snapshots(objset_t *os, const char *name)
{
	uint64_t cookie, obj;
	char *sname;
	int error, len;

	cookie = obj = 0;
	sname = kmem_alloc(MAXPATHLEN, KM_SLEEP);

#if 0
	(void) dmu_objset_find(name, dmu_objset_prefetch, NULL,
	    DS_FIND_SNAPSHOTS);
#endif

	for (;;) {
		len = snprintf(sname, MAXPATHLEN, "%s@", name);
		if (len >= MAXPATHLEN) {
			dmu_objset_rele(os, FTAG);
			error = ENAMETOOLONG;
			break;
		}

		dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
		error = dmu_snapshot_list_next(os, MAXPATHLEN - len,
		    sname + len, &obj, &cookie, NULL);
		dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
		if (error != 0) {
			if (error == ENOENT)
				error = 0;
			break;
		}

		if ((error = zvol_create_minor(sname)) != 0) {
			printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n",
			    sname, error);
			break;
		}
	}

	kmem_free(sname, MAXPATHLEN);
	return (error);
}

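/*
 * Concretely (dataset names hypothetical): for name "tank/vol" the loop
 * above rewrites sname to "tank/vol@" on every pass,
 * dmu_snapshot_list_next() appends the next snapshot component after the
 * '@', and each resulting name such as "tank/vol@backup" gets its own
 * minor via zvol_create_minor().
 */
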
int
zvol_create_minors(const char *name)
{
	uint64_t cookie;
	objset_t *os;
	char *osname, *p;
	int error, len;

	if (dataset_name_hidden(name))
		return (0);

	if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
		printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n",
		    name, error);
		return (error);
	}
	if (dmu_objset_type(os) == DMU_OST_ZVOL) {
		dsl_dataset_long_hold(os->os_dsl_dataset, FTAG);
		dsl_pool_rele(dmu_objset_pool(os), FTAG);
		if ((error = zvol_create_minor(name)) == 0)
			error = zvol_create_snapshots(os, name);
		else {
			printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n",
			    name, error);
		}
		dsl_dataset_long_rele(os->os_dsl_dataset, FTAG);
		dsl_dataset_rele(os->os_dsl_dataset, FTAG);
		return (error);
	}
	if (dmu_objset_type(os) != DMU_OST_ZFS) {
		dmu_objset_rele(os, FTAG);
		return (0);
	}

	osname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
	if (snprintf(osname, MAXPATHLEN, "%s/", name) >= MAXPATHLEN) {
		dmu_objset_rele(os, FTAG);
		kmem_free(osname, MAXPATHLEN);
		return (ENOENT);
	}
	p = osname + strlen(osname);
	len = MAXPATHLEN - (p - osname);

#if 0
	/* Prefetch the datasets. */
	cookie = 0;
	while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0) {
		if (!dataset_name_hidden(osname))
			(void) dmu_objset_prefetch(osname, NULL);
	}
#endif

	cookie = 0;
	while (dmu_dir_list_next(os, MAXPATHLEN - (p - osname), p, NULL,
	    &cookie) == 0) {
		dmu_objset_rele(os, FTAG);
		(void) zvol_create_minors(osname);
		if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
			printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n",
			    name, error);
			return (error);
		}
	}

	dmu_objset_rele(os, FTAG);
	kmem_free(osname, MAXPATHLEN);
	return (0);
}

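/*
 * The recursion above walks the dataset tree depth-first (child names are
 * hypothetical): zvol_create_minors("tank") holds "tank", sees a ZPL
 * dataset, then iterates children via dmu_dir_list_next(), recursing into
 * "tank/a", "tank/a/vol", and so on, creating a minor (and snapshot minors)
 * for every zvol it encounters.  The objset hold is dropped around each
 * recursive call and re-acquired afterwards.
 */
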
static void
zvol_rename_minor(struct g_geom *gp, const char *newname)
{
	struct g_provider *pp;
	zvol_state_t *zv;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	g_topology_assert();

	pp = LIST_FIRST(&gp->provider);
	ASSERT(pp != NULL);
	zv = pp->private;
	ASSERT(zv != NULL);

	zv->zv_provider = NULL;
	g_wither_provider(pp, ENXIO);

	pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
	pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
	pp->sectorsize = DEV_BSIZE;
	pp->mediasize = zv->zv_volsize;
	pp->private = zv;
	zv->zv_provider = pp;
	strlcpy(zv->zv_name, newname, sizeof(zv->zv_name));
	g_error_provider(pp, 0);
}

void
zvol_rename_minors(const char *oldname, const char *newname)
{
	char name[MAXPATHLEN];
	struct g_provider *pp;
	struct g_geom *gp;
	size_t oldnamelen, newnamelen;
	zvol_state_t *zv;
	char *namebuf;

	oldnamelen = strlen(oldname);
	newnamelen = strlen(newname);

	DROP_GIANT();
	mutex_enter(&spa_namespace_lock);
	g_topology_lock();

	LIST_FOREACH(gp, &zfs_zvol_class.geom, geom) {
		pp = LIST_FIRST(&gp->provider);
		if (pp == NULL)
			continue;
		zv = pp->private;
		if (zv == NULL)
			continue;
		if (strcmp(zv->zv_name, oldname) == 0) {
			zvol_rename_minor(gp, newname);
		} else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 &&
		    (zv->zv_name[oldnamelen] == '/' ||
		     zv->zv_name[oldnamelen] == '@')) {
			snprintf(name, sizeof(name), "%s%c%s", newname,
			    zv->zv_name[oldnamelen],
			    zv->zv_name + oldnamelen + 1);
			zvol_rename_minor(gp, name);
		}
	}

	g_topology_unlock();
	mutex_exit(&spa_namespace_lock);
	PICKUP_GIANT();
}

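/*
 * Worked examples (names hypothetical): renaming "tank/vol" to
 * "tank/newvol" hits the exact-match case and recreates the provider
 * "zvol/tank/vol" as "zvol/tank/newvol".  The prefix case preserves the
 * separator character, so the snapshot minor "tank/vol@snap" becomes
 * "tank/newvol@snap", and when a parent filesystem is renamed, a child
 * such as "tank/dir/vol" follows it to "tank/newdir/vol".
 */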