zvol.c revision 325132
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 *
 * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Portions Copyright 2010 Robert Milkowski
 *
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */

/*
 * ZFS volume emulation driver.
 *
 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
 * Volumes are accessed through the symbolic links named:
 *
 * /dev/zvol/dsk/<pool_name>/<dataset_name>
 * /dev/zvol/rdsk/<pool_name>/<dataset_name>
 *
 * These links are created by the /dev filesystem (sdev_zvolops.c).
 * Volumes are persistent through reboot.  No user command needs to be
 * run before opening and using a device.
 *
 * FreeBSD notes.
 * On FreeBSD ZVOLs are simply GEOM providers like any other storage device
 * in the system.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/errno.h>
#include <sys/uio.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/zap.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/disk.h>
#include <sys/dmu_traverse.h>
#include <sys/dnode.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dkio.h>
#include <sys/byteorder.h>
#include <sys/sunddi.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/queue.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_ioctl.h>
#include <sys/zil.h>
#include <sys/refcount.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_rlock.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_raidz.h>
#include <sys/zvol.h>
#include <sys/zil_impl.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/zfeature.h>
#include <sys/zio_checksum.h>
#include <sys/filio.h>

#include <geom/geom.h>

#include "zfs_namecheck.h"

#ifndef illumos
struct g_class zfs_zvol_class = {
	.name = "ZFS::ZVOL",
	.version = G_VERSION,
};

DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);

#endif
void *zfsdev_state;
static char *zvol_tag = "zvol_tag";

#define	ZVOL_DUMPSIZE		"dumpsize"

/*
 * This lock protects the zfsdev_state structure from being modified
 * while it's being used, e.g. an open that comes in before a create
 * finishes.  It also protects temporary opens of the dataset so that,
 * e.g., an open doesn't get a spurious EBUSY.
 */
#ifdef illumos
kmutex_t zfsdev_state_lock;
#else
/*
 * In FreeBSD we've replaced the upstream zfsdev_state_lock with the
 * spa_namespace_lock in the ZVOL code.
 */
#define	zfsdev_state_lock spa_namespace_lock
#endif
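/* Number of zvol minor devices currently instantiated. */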
static uint32_t zvol_minors;

#ifndef illumos
SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
static int	volmode = ZFS_VOLMODE_GEOM;
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &volmode, 0,
    "Expose as GEOM providers (1), device files (2) or neither");
static boolean_t zpool_on_zvol = B_FALSE;
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
    "Allow zpools to use zvols as vdevs (DANGEROUS)");

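/*
 * The knobs above are CTLFLAG_RWTUN, so they may be set as loader
 * tunables or changed at runtime, e.g.:
 *
 *	sysctl vfs.zfs.vol.mode=2	# expose new zvols as plain cdevs
 *	sysctl vfs.zfs.vol.recursive=1	# allow zpools on zvols (DANGEROUS)
 */
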
143#endif
typedef struct zvol_extent {
	list_node_t	ze_node;
	dva_t		ze_dva;		/* dva associated with this extent */
	uint64_t	ze_nblks;	/* number of blocks in extent */
} zvol_extent_t;

/*
 * The in-core state of each volume.
 */
typedef struct zvol_state {
#ifndef illumos
	LIST_ENTRY(zvol_state)	zv_links;
#endif
	char		zv_name[MAXPATHLEN]; /* pool/dd name */
	uint64_t	zv_volsize;	/* amount of space we advertise */
	uint64_t	zv_volblocksize; /* volume block size */
#ifdef illumos
	minor_t		zv_minor;	/* minor number */
#else
	struct cdev	*zv_dev;	/* non-GEOM device */
	struct g_provider *zv_provider;	/* GEOM provider */
#endif
	uint8_t		zv_min_bs;	/* minimum addressable block shift */
	uint8_t		zv_flags;	/* readonly, dumpified, etc. */
	objset_t	*zv_objset;	/* objset handle */
#ifdef illumos
	uint32_t	zv_open_count[OTYPCNT];	/* open counts */
#endif
	uint32_t	zv_total_opens;	/* total open count */
	uint32_t	zv_sync_cnt;	/* synchronous open count */
	zilog_t		*zv_zilog;	/* ZIL handle */
	list_t		zv_extents;	/* List of extents for dump */
	znode_t		zv_znode;	/* for range locking */
	dmu_buf_t	*zv_dbuf;	/* bonus handle */
#ifndef illumos
	int		zv_state;
	int		zv_volmode;	/* Provide GEOM or cdev */
	struct bio_queue_head zv_queue;
	struct mtx	zv_queue_mtx;	/* zv_queue mutex */
#endif
} zvol_state_t;

#ifndef illumos
static LIST_HEAD(, zvol_state) all_zvols;
#endif
/*
 * zvol specific flags
 */
#define	ZVOL_RDONLY	0x1
#define	ZVOL_DUMPIFIED	0x2
#define	ZVOL_EXCL	0x4
#define	ZVOL_WCE	0x8

/*
 * zvol maximum transfer in one DMU tx.
 */
int zvol_maxphys = DMU_MAX_ACCESS/2;

/*
 * Toggle unmap functionality.
 */
boolean_t zvol_unmap_enabled = B_TRUE;

/*
 * If true, unmaps requested as synchronous are executed synchronously,
 * otherwise all unmaps are asynchronous.
 */
boolean_t zvol_unmap_sync_enabled = B_FALSE;

#ifndef illumos
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
    &zvol_unmap_enabled, 0,
    "Enable UNMAP functionality");

SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_sync_enabled, CTLFLAG_RWTUN,
    &zvol_unmap_sync_enabled, 0,
    "UNMAPs requested as sync are executed synchronously");

static d_open_t		zvol_d_open;
static d_close_t	zvol_d_close;
static d_read_t		zvol_read;
static d_write_t	zvol_write;
static d_ioctl_t	zvol_d_ioctl;
static d_strategy_t	zvol_strategy;

static struct cdevsw zvol_cdevsw = {
	.d_version =	D_VERSION,
	.d_open =	zvol_d_open,
	.d_close =	zvol_d_close,
	.d_read =	zvol_read,
	.d_write =	zvol_write,
	.d_ioctl =	zvol_d_ioctl,
	.d_strategy =	zvol_strategy,
	.d_name =	"zvol",
	.d_flags =	D_DISK | D_TRACKCLOSE,
};

static void zvol_geom_run(zvol_state_t *zv);
static void zvol_geom_destroy(zvol_state_t *zv);
static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
static void zvol_geom_start(struct bio *bp);
static void zvol_geom_worker(void *arg);
static void zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off,
    uint64_t len, boolean_t sync);
#endif	/* !illumos */

extern int zfs_set_prop_nvlist(const char *, zprop_source_t,
    nvlist_t *, nvlist_t *);
static int zvol_remove_zv(zvol_state_t *);
static int zvol_get_data(void *arg, lr_write_t *lr, char *buf,
    struct lwb *lwb, zio_t *zio);
static int zvol_dumpify(zvol_state_t *zv);
static int zvol_dump_fini(zvol_state_t *zv);
static int zvol_dump_init(zvol_state_t *zv, boolean_t resize);

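/*
 * Update the in-core volume size and propagate it to the platform device
 * layer: devfs "Size"/"Nblocks" properties on illumos, the GEOM provider's
 * mediasize on FreeBSD.
 */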
static void
zvol_size_changed(zvol_state_t *zv, uint64_t volsize)
{
#ifdef illumos
	dev_t dev = makedevice(ddi_driver_major(zfs_dip), zv->zv_minor);

	zv->zv_volsize = volsize;
	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
	    "Size", volsize) == DDI_SUCCESS);
	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
	    "Nblocks", lbtodb(volsize)) == DDI_SUCCESS);

	/* Notify specfs to invalidate the cached size */
	spec_size_invalidate(dev, VBLK);
	spec_size_invalidate(dev, VCHR);
#else	/* !illumos */
	zv->zv_volsize = volsize;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct g_provider *pp;

		pp = zv->zv_provider;
		if (pp == NULL)
			return;
		g_topology_lock();

		/*
		 * Do not invoke a resize event when the initial size was
		 * zero.  ZVOL initializes the size on first open; this is
		 * not a real resize.
		 */
		if (pp->mediasize == 0)
			pp->mediasize = zv->zv_volsize;
		else
			g_resize_provider(pp, zv->zv_volsize);
		g_topology_unlock();
	}
#endif	/* illumos */
}

int
zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
{
	if (volsize == 0)
		return (SET_ERROR(EINVAL));

	if (volsize % blocksize != 0)
		return (SET_ERROR(EINVAL));

#ifdef _ILP32
	if (volsize - 1 > SPEC_MAXOFFSET_T)
		return (SET_ERROR(EOVERFLOW));
#endif
	return (0);
}

int
zvol_check_volblocksize(uint64_t volblocksize)
{
	if (volblocksize < SPA_MINBLOCKSIZE ||
	    volblocksize > SPA_OLD_MAXBLOCKSIZE ||
	    !ISP2(volblocksize))
		return (SET_ERROR(EDOM));

	return (0);
}

int
zvol_get_stats(objset_t *os, nvlist_t *nv)
{
	int error;
	dmu_object_info_t doi;
	uint64_t val;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
	if (error)
		return (error);

	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);

	error = dmu_object_info(os, ZVOL_OBJ, &doi);

	if (error == 0) {
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
		    doi.doi_data_block_size);
	}

	return (error);
}

static zvol_state_t *
zvol_minor_lookup(const char *name)
{
#ifdef illumos
	minor_t minor;
#endif
	zvol_state_t *zv;

	ASSERT(MUTEX_HELD(&zfsdev_state_lock));

#ifdef illumos
	for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) {
		zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
		if (zv == NULL)
			continue;
#else
	LIST_FOREACH(zv, &all_zvols, zv_links) {
#endif
		if (strcmp(zv->zv_name, name) == 0)
			return (zv);
	}

	return (NULL);
}

/* extent mapping arg */
struct maparg {
	zvol_state_t	*ma_zv;
	uint64_t	ma_blks;
};

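/*
 * traverse_dataset() callback: record the DVA of each level-0 block of
 * ZVOL_OBJ, merging physically contiguous blocks into a single extent on
 * zv_extents.  Gang blocks cannot be mapped this way, so they abort the
 * walk with EFRAGS.
 */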
/*ARGSUSED*/
static int
zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
	struct maparg *ma = arg;
	zvol_extent_t *ze;
	int bs = ma->ma_zv->zv_volblocksize;

	if (bp == NULL || BP_IS_HOLE(bp) ||
	    zb->zb_object != ZVOL_OBJ || zb->zb_level != 0)
		return (0);

	VERIFY(!BP_IS_EMBEDDED(bp));

	VERIFY3U(ma->ma_blks, ==, zb->zb_blkid);
	ma->ma_blks++;

	/* Abort immediately if we have encountered gang blocks */
	if (BP_IS_GANG(bp))
		return (SET_ERROR(EFRAGS));

	/*
	 * See if the block is at the end of the previous extent.
	 */
	ze = list_tail(&ma->ma_zv->zv_extents);
	if (ze &&
	    DVA_GET_VDEV(BP_IDENTITY(bp)) == DVA_GET_VDEV(&ze->ze_dva) &&
	    DVA_GET_OFFSET(BP_IDENTITY(bp)) ==
	    DVA_GET_OFFSET(&ze->ze_dva) + ze->ze_nblks * bs) {
		ze->ze_nblks++;
		return (0);
	}

	dprintf_bp(bp, "%s", "next blkptr:");

	/* start a new extent */
	ze = kmem_zalloc(sizeof (zvol_extent_t), KM_SLEEP);
	ze->ze_dva = bp->blk_dva[0];	/* structure assignment */
	ze->ze_nblks = 1;
	list_insert_tail(&ma->ma_zv->zv_extents, ze);
	return (0);
}

static void
zvol_free_extents(zvol_state_t *zv)
{
	zvol_extent_t *ze;

	while ((ze = list_head(&zv->zv_extents)) != NULL) {
		list_remove(&zv->zv_extents, ze);
		kmem_free(ze, sizeof (zvol_extent_t));
	}
}

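/*
 * Rebuild the extent list for the whole volume.  The traversal must
 * account for every block of the volume, so any hole or error
 * invalidates the map.
 */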
static int
zvol_get_lbas(zvol_state_t *zv)
{
	objset_t *os = zv->zv_objset;
	struct maparg	ma;
	int		err;

	ma.ma_zv = zv;
	ma.ma_blks = 0;
	zvol_free_extents(zv);

	/* commit any in-flight changes before traversing the dataset */
	txg_wait_synced(dmu_objset_pool(os), 0);
	err = traverse_dataset(dmu_objset_ds(os), 0,
	    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, zvol_map_block, &ma);
	if (err || ma.ma_blks != (zv->zv_volsize / zv->zv_volblocksize)) {
		zvol_free_extents(zv);
		return (err ? err : EIO);
	}

	return (0);
}

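/*
 * Dataset-creation callback: claim the DMU object that backs the volume
 * data and the ZAP object that holds its properties, then record the
 * initial size.  The volsize/volblocksize properties are consumed here
 * so the generic property-setting code does not see them.
 */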
/* ARGSUSED */
void
zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
{
	zfs_creat_t *zct = arg;
	nvlist_t *nvprops = zct->zct_props;
	int error;
	uint64_t volblocksize, volsize;

	VERIFY(nvlist_lookup_uint64(nvprops,
	    zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
	if (nvlist_lookup_uint64(nvprops,
	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
		volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);

	/*
	 * These properties must be removed from the list so the generic
	 * property setting step won't apply to them.
	 */
	VERIFY(nvlist_remove_all(nvprops,
	    zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
	(void) nvlist_remove_all(nvprops,
	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));

	error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
	    DMU_OT_NONE, 0, tx);
	ASSERT(error == 0);

	error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
	    DMU_OT_NONE, 0, tx);
	ASSERT(error == 0);

	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
	ASSERT(error == 0);
}

/*
 * Replay a TX_TRUNCATE ZIL transaction if asked.  TX_TRUNCATE is how we
 * implement DKIOCFREE/free-long-range.
 */
static int
zvol_replay_truncate(zvol_state_t *zv, lr_truncate_t *lr, boolean_t byteswap)
{
	uint64_t offset, length;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	offset = lr->lr_offset;
	length = lr->lr_length;

	return (dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length));
}

/*
 * Replay a TX_WRITE ZIL transaction that didn't get committed
 * after a system failure
 */
static int
zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
{
	objset_t *os = zv->zv_objset;
	char *data = (char *)(lr + 1);	/* data follows lr_write_t */
	uint64_t offset, length;
	dmu_tx_t *tx;
	int error;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	offset = lr->lr_offset;
	length = lr->lr_length;

	/* If it's a dmu_sync() block, write the whole block */
	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
		uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
		if (length < blocksize) {
			offset -= offset % blocksize;
			length = blocksize;
		}
	}

	tx = dmu_tx_create(os);
	dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
	} else {
		dmu_write(os, ZVOL_OBJ, offset, length, data, tx);
		dmu_tx_commit(tx);
	}

	return (error);
}

/* ARGSUSED */
static int
zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap)
{
	return (SET_ERROR(ENOTSUP));
}

/*
 * Callback vectors for replaying records.
 * Only TX_WRITE and TX_TRUNCATE are needed for zvol.
 */
zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
	zvol_replay_err,	/* 0 no such transaction type */
	zvol_replay_err,	/* TX_CREATE */
	zvol_replay_err,	/* TX_MKDIR */
	zvol_replay_err,	/* TX_MKXATTR */
	zvol_replay_err,	/* TX_SYMLINK */
	zvol_replay_err,	/* TX_REMOVE */
	zvol_replay_err,	/* TX_RMDIR */
	zvol_replay_err,	/* TX_LINK */
	zvol_replay_err,	/* TX_RENAME */
	zvol_replay_write,	/* TX_WRITE */
	zvol_replay_truncate,	/* TX_TRUNCATE */
	zvol_replay_err,	/* TX_SETATTR */
	zvol_replay_err,	/* TX_ACL */
	zvol_replay_err,	/* TX_CREATE_ACL */
	zvol_replay_err,	/* TX_CREATE_ATTR */
	zvol_replay_err,	/* TX_CREATE_ACL_ATTR */
	zvol_replay_err,	/* TX_MKDIR_ACL */
	zvol_replay_err,	/* TX_MKDIR_ATTR */
	zvol_replay_err,	/* TX_MKDIR_ACL_ATTR */
	zvol_replay_err,	/* TX_WRITE2 */
};

#ifdef illumos
int
zvol_name2minor(const char *name, minor_t *minor)
{
	zvol_state_t *zv;

	mutex_enter(&zfsdev_state_lock);
	zv = zvol_minor_lookup(name);
	if (minor && zv)
		*minor = zv->zv_minor;
	mutex_exit(&zfsdev_state_lock);
	return (zv ? 0 : -1);
}
#endif	/* illumos */

/*
 * Create a minor node (plus a whole lot more) for the specified volume.
 */
int
zvol_create_minor(const char *name)
{
	zfs_soft_state_t *zs;
	zvol_state_t *zv;
	objset_t *os;
#ifdef illumos
	dmu_object_info_t doi;
	minor_t minor = 0;
	char chrbuf[30], blkbuf[30];
#else
	struct g_provider *pp;
	struct g_geom *gp;
	uint64_t mode;
#endif
	int error;

#ifndef illumos
	ZFS_LOG(1, "Creating ZVOL %s...", name);
#endif

	mutex_enter(&zfsdev_state_lock);

	if (zvol_minor_lookup(name) != NULL) {
		mutex_exit(&zfsdev_state_lock);
		return (SET_ERROR(EEXIST));
	}

	/* lie and say we're read-only */
	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os);

	if (error) {
		mutex_exit(&zfsdev_state_lock);
		return (error);
	}

#ifdef illumos
	if ((minor = zfsdev_minor_alloc()) == 0) {
		dmu_objset_disown(os, FTAG);
		mutex_exit(&zfsdev_state_lock);
		return (SET_ERROR(ENXIO));
	}

	if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) {
		dmu_objset_disown(os, FTAG);
		mutex_exit(&zfsdev_state_lock);
		return (SET_ERROR(EAGAIN));
	}
	(void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME,
	    (char *)name);

	(void) snprintf(chrbuf, sizeof (chrbuf), "%u,raw", minor);

	if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR,
	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
		ddi_soft_state_free(zfsdev_state, minor);
		dmu_objset_disown(os, FTAG);
		mutex_exit(&zfsdev_state_lock);
		return (SET_ERROR(EAGAIN));
	}

	(void) snprintf(blkbuf, sizeof (blkbuf), "%u", minor);

	if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK,
	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
		ddi_remove_minor_node(zfs_dip, chrbuf);
		ddi_soft_state_free(zfsdev_state, minor);
		dmu_objset_disown(os, FTAG);
		mutex_exit(&zfsdev_state_lock);
		return (SET_ERROR(EAGAIN));
	}

	zs = ddi_get_soft_state(zfsdev_state, minor);
	zs->zss_type = ZSST_ZVOL;
	zv = zs->zss_data = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
#else	/* !illumos */

	zv = kmem_zalloc(sizeof(*zv), KM_SLEEP);
	zv->zv_state = 0;
	error = dsl_prop_get_integer(name,
	    zfs_prop_to_name(ZFS_PROP_VOLMODE), &mode, NULL);
	if (error != 0 || mode == ZFS_VOLMODE_DEFAULT)
		mode = volmode;

	DROP_GIANT();
	zv->zv_volmode = mode;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		g_topology_lock();
		gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
		gp->start = zvol_geom_start;
		gp->access = zvol_geom_access;
		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = 0;
		pp->private = zv;

		zv->zv_provider = pp;
		bioq_init(&zv->zv_queue);
		mtx_init(&zv->zv_queue_mtx, "zvol", NULL, MTX_DEF);
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct make_dev_args args;

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		error = make_dev_s(&args, &zv->zv_dev,
		    "%s/%s", ZVOL_DRIVER, name);
		if (error != 0) {
			kmem_free(zv, sizeof(*zv));
			dmu_objset_disown(os, FTAG);
			mutex_exit(&zfsdev_state_lock);
			return (error);
		}
		zv->zv_dev->si_iosize_max = MAXPHYS;
	}
	LIST_INSERT_HEAD(&all_zvols, zv, zv_links);
#endif	/* illumos */

	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
	zv->zv_min_bs = DEV_BSHIFT;
#ifdef illumos
	zv->zv_minor = minor;
#endif
	zv->zv_objset = os;
	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
		zv->zv_flags |= ZVOL_RDONLY;
	mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
	    sizeof (rl_t), offsetof(rl_t, r_node));
	list_create(&zv->zv_extents, sizeof (zvol_extent_t),
	    offsetof(zvol_extent_t, ze_node));
#ifdef illumos
	/* get and cache the blocksize */
	error = dmu_object_info(os, ZVOL_OBJ, &doi);
	ASSERT(error == 0);
	zv->zv_volblocksize = doi.doi_data_block_size;
#endif

	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			zil_destroy(dmu_objset_zil(os), B_FALSE);
		else
			zil_replay(os, zv, zvol_replay_vector);
	}
	dmu_objset_disown(os, FTAG);
	zv->zv_objset = NULL;

	zvol_minors++;

	mutex_exit(&zfsdev_state_lock);
#ifndef illumos
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		zvol_geom_run(zv);
		g_topology_unlock();
	}
	PICKUP_GIANT();

	ZFS_LOG(1, "ZVOL %s created.", name);
#endif

	return (0);
}

/*
 * Remove minor node for the specified volume.
 */
static int
zvol_remove_zv(zvol_state_t *zv)
{
#ifdef illumos
	char nmbuf[20];
	minor_t minor = zv->zv_minor;
#endif

	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
	if (zv->zv_total_opens != 0)
		return (SET_ERROR(EBUSY));

#ifdef illumos
	(void) snprintf(nmbuf, sizeof (nmbuf), "%u,raw", minor);
	ddi_remove_minor_node(zfs_dip, nmbuf);

	(void) snprintf(nmbuf, sizeof (nmbuf), "%u", minor);
	ddi_remove_minor_node(zfs_dip, nmbuf);
#else
	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);

	LIST_REMOVE(zv, zv_links);
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		g_topology_lock();
		zvol_geom_destroy(zv);
		g_topology_unlock();
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		if (zv->zv_dev != NULL)
			destroy_dev(zv->zv_dev);
	}
#endif

	avl_destroy(&zv->zv_znode.z_range_avl);
	mutex_destroy(&zv->zv_znode.z_range_lock);

	kmem_free(zv, sizeof (zvol_state_t));
#ifdef illumos
	ddi_soft_state_free(zfsdev_state, minor);
#endif
	zvol_minors--;
	return (0);
}

int
zvol_remove_minor(const char *name)
{
	zvol_state_t *zv;
	int rc;

	mutex_enter(&zfsdev_state_lock);
	if ((zv = zvol_minor_lookup(name)) == NULL) {
		mutex_exit(&zfsdev_state_lock);
		return (SET_ERROR(ENXIO));
	}
	rc = zvol_remove_zv(zv);
	mutex_exit(&zfsdev_state_lock);
	return (rc);
}

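/*
 * First open of a volume: own the objset, cache the block size, hold the
 * bonus buffer, and open the ZIL.  Called with zfsdev_state_lock held.
 */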
int
zvol_first_open(zvol_state_t *zv)
{
	dmu_object_info_t doi;
	objset_t *os;
	uint64_t volsize;
	int error;
	uint64_t readonly;

	/* lie and say we're read-only */
	error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, B_TRUE,
	    zvol_tag, &os);
	if (error)
		return (error);

	zv->zv_objset = os;
	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error) {
		ASSERT(error == 0);
		dmu_objset_disown(os, zvol_tag);
		return (error);
	}

	/* get and cache the blocksize */
	error = dmu_object_info(os, ZVOL_OBJ, &doi);
	if (error) {
		ASSERT(error == 0);
		dmu_objset_disown(os, zvol_tag);
		return (error);
	}
	zv->zv_volblocksize = doi.doi_data_block_size;

	error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf);
	if (error) {
		dmu_objset_disown(os, zvol_tag);
		return (error);
	}

	zvol_size_changed(zv, volsize);
	zv->zv_zilog = zil_open(os, zvol_get_data);

	VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &readonly,
	    NULL) == 0);
	if (readonly || dmu_objset_is_snapshot(os) ||
	    !spa_writeable(dmu_objset_spa(os)))
		zv->zv_flags |= ZVOL_RDONLY;
	else
		zv->zv_flags &= ~ZVOL_RDONLY;
	return (error);
}

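/*
 * Last close of a volume: tear down the state set up by
 * zvol_first_open(), syncing out any dirty data before the objset is
 * disowned.
 */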
void
zvol_last_close(zvol_state_t *zv)
{
	zil_close(zv->zv_zilog);
	zv->zv_zilog = NULL;

	dmu_buf_rele(zv->zv_dbuf, zvol_tag);
	zv->zv_dbuf = NULL;

	/*
	 * Evict cached data
	 */
	if (dsl_dataset_is_dirty(dmu_objset_ds(zv->zv_objset)) &&
	    !(zv->zv_flags & ZVOL_RDONLY))
		txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
	dmu_objset_evict_dbufs(zv->zv_objset);

	dmu_objset_disown(zv->zv_objset, zvol_tag);
	zv->zv_objset = NULL;
}

#ifdef illumos
int
zvol_prealloc(zvol_state_t *zv)
{
	objset_t *os = zv->zv_objset;
	dmu_tx_t *tx;
	uint64_t refd, avail, usedobjs, availobjs;
	uint64_t resid = zv->zv_volsize;
	uint64_t off = 0;

	/* Check the space usage before attempting to allocate the space */
	dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs);
	if (avail < zv->zv_volsize)
		return (SET_ERROR(ENOSPC));

	/* Free old extents if they exist */
	zvol_free_extents(zv);

	while (resid != 0) {
		int error;
		uint64_t bytes = MIN(resid, SPA_OLD_MAXBLOCKSIZE);

		tx = dmu_tx_create(os);
		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			(void) dmu_free_long_range(os, ZVOL_OBJ, 0, off);
			return (error);
		}
		dmu_prealloc(os, ZVOL_OBJ, off, bytes, tx);
		dmu_tx_commit(tx);
		off += bytes;
		resid -= bytes;
	}
	txg_wait_synced(dmu_objset_pool(os), 0);

	return (0);
}
#endif	/* illumos */

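/*
 * Persist a new volume size in the ZAP and free any blocks that now lie
 * beyond the end of the volume.
 */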
static int
zvol_update_volsize(objset_t *os, uint64_t volsize)
{
	dmu_tx_t *tx;
	int error;

	ASSERT(MUTEX_HELD(&zfsdev_state_lock));

	tx = dmu_tx_create(os);
	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		return (error);
	}

	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1,
	    &volsize, tx);
	dmu_tx_commit(tx);

	if (error == 0)
		error = dmu_free_long_range(os,
		    ZVOL_OBJ, volsize, DMU_OBJECT_END);
	return (error);
}

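/*
 * Remove the minors for a dataset and for all of its children and
 * snapshots (names that extend it with '/' or '@').
 */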
void
zvol_remove_minors(const char *name)
{
#ifdef illumos
	zvol_state_t *zv;
	char *namebuf;
	minor_t minor;

	namebuf = kmem_zalloc(strlen(name) + 2, KM_SLEEP);
	(void) strncpy(namebuf, name, strlen(name));
	(void) strcat(namebuf, "/");
	mutex_enter(&zfsdev_state_lock);
	for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) {
		zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
		if (zv == NULL)
			continue;
		if (strncmp(namebuf, zv->zv_name, strlen(namebuf)) == 0)
			(void) zvol_remove_zv(zv);
	}
	kmem_free(namebuf, strlen(name) + 2);

	mutex_exit(&zfsdev_state_lock);
#else	/* !illumos */
	zvol_state_t *zv, *tzv;
	size_t namelen;

	namelen = strlen(name);

	DROP_GIANT();
	mutex_enter(&zfsdev_state_lock);

	LIST_FOREACH_SAFE(zv, &all_zvols, zv_links, tzv) {
		if (strcmp(zv->zv_name, name) == 0 ||
		    (strncmp(zv->zv_name, name, namelen) == 0 &&
		    strlen(zv->zv_name) > namelen &&
		    (zv->zv_name[namelen] == '/' ||
		    zv->zv_name[namelen] == '@'))) {
			(void) zvol_remove_zv(zv);
		}
	}

	mutex_exit(&zfsdev_state_lock);
	PICKUP_GIANT();
#endif	/* illumos */
}

static int
zvol_update_live_volsize(zvol_state_t *zv, uint64_t volsize)
{
	uint64_t old_volsize = 0ULL;
	int error = 0;

	ASSERT(MUTEX_HELD(&zfsdev_state_lock));

	/*
	 * Reinitialize the dump area to the new size.  If we
	 * failed to resize the dump area then restore it back to
	 * its original size.  We must set the new volsize prior
	 * to calling dumpvp_resize() to ensure that the device's
	 * old size(9P) is not visible to the dump subsystem.
	 */
	old_volsize = zv->zv_volsize;
	zvol_size_changed(zv, volsize);

#ifdef ZVOL_DUMP
	if (zv->zv_flags & ZVOL_DUMPIFIED) {
		if ((error = zvol_dumpify(zv)) != 0 ||
		    (error = dumpvp_resize()) != 0) {
			int dumpify_error;

			(void) zvol_update_volsize(zv->zv_objset, old_volsize);
			zvol_size_changed(zv, old_volsize);
			dumpify_error = zvol_dumpify(zv);
			error = dumpify_error ? dumpify_error : error;
		}
	}
#endif	/* ZVOL_DUMP */

#ifdef illumos
	/*
	 * Generate a LUN expansion event.
	 */
	if (error == 0) {
		sysevent_id_t eid;
		nvlist_t *attr;
		char *physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);

		(void) snprintf(physpath, MAXPATHLEN, "%s%u", ZVOL_PSEUDO_DEV,
		    zv->zv_minor);

		VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);

		(void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
		    ESC_DEV_DLE, attr, &eid, DDI_SLEEP);

		nvlist_free(attr);
		kmem_free(physpath, MAXPATHLEN);
	}
#endif	/* illumos */
	return (error);
}

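/*
 * Public entry point for resizing a volume.  Updates the on-disk size
 * and, if a minor exists, the live in-core state as well.
 */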
int
zvol_set_volsize(const char *name, uint64_t volsize)
{
	zvol_state_t *zv = NULL;
	objset_t *os;
	int error;
	dmu_object_info_t doi;
	uint64_t readonly;
	boolean_t owned = B_FALSE;

	error = dsl_prop_get_integer(name,
	    zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL);
	if (error != 0)
		return (error);
	if (readonly)
		return (SET_ERROR(EROFS));

	mutex_enter(&zfsdev_state_lock);
	zv = zvol_minor_lookup(name);

	if (zv == NULL || zv->zv_objset == NULL) {
		if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE,
		    FTAG, &os)) != 0) {
			mutex_exit(&zfsdev_state_lock);
			return (error);
		}
		owned = B_TRUE;
		if (zv != NULL)
			zv->zv_objset = os;
	} else {
		os = zv->zv_objset;
	}

	if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 ||
	    (error = zvol_check_volsize(volsize, doi.doi_data_block_size)) != 0)
		goto out;

	error = zvol_update_volsize(os, volsize);

	if (error == 0 && zv != NULL)
		error = zvol_update_live_volsize(zv, volsize);
out:
	if (owned) {
		dmu_objset_disown(os, FTAG);
		if (zv != NULL)
			zv->zv_objset = NULL;
	}
	mutex_exit(&zfsdev_state_lock);
	return (error);
}

/*ARGSUSED*/
#ifdef illumos
int
zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr)
#else
static int
zvol_open(struct g_provider *pp, int flag, int count)
#endif
{
	zvol_state_t *zv;
	int err = 0;
#ifdef illumos

	mutex_enter(&zfsdev_state_lock);

	zv = zfsdev_get_soft_state(getminor(*devp), ZSST_ZVOL);
	if (zv == NULL) {
		mutex_exit(&zfsdev_state_lock);
		return (SET_ERROR(ENXIO));
	}

	if (zv->zv_total_opens == 0)
		err = zvol_first_open(zv);
	if (err) {
		mutex_exit(&zfsdev_state_lock);
		return (err);
	}
#else	/* !illumos */
	boolean_t locked = B_FALSE;

	if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
		/*
		 * If zfs_geom_probe_vdev_key is set, ZFS is attempting to
		 * probe geom providers while looking for a replacement for
		 * a missing VDEV.  In this case, the spa_namespace_lock
		 * will not be held, but it is still illegal to use a zvol
		 * as a vdev.  Deadlocks can result if another thread has
		 * spa_namespace_lock.
		 */
		return (EOPNOTSUPP);
	}
	/*
	 * Protect against recursively entering spa_namespace_lock
	 * when spa_open() is used for a pool backed by a (local) zvol.
	 * This is needed since we replaced upstream zfsdev_state_lock
	 * with spa_namespace_lock in the ZVOL code.
	 * We are using the same trick as spa_open().
	 * Note that calls in zvol_first_open which need to resolve a
	 * pool name to a spa object will enter spa_open()
	 * recursively, but that function already has all the
	 * necessary protection.
	 */
	if (!MUTEX_HELD(&zfsdev_state_lock)) {
		mutex_enter(&zfsdev_state_lock);
		locked = B_TRUE;
	}

	zv = pp->private;
	if (zv == NULL) {
		if (locked)
			mutex_exit(&zfsdev_state_lock);
		return (SET_ERROR(ENXIO));
	}

	if (zv->zv_total_opens == 0) {
		err = zvol_first_open(zv);
		if (err) {
			if (locked)
				mutex_exit(&zfsdev_state_lock);
			return (err);
		}
		pp->mediasize = zv->zv_volsize;
		pp->stripeoffset = 0;
		pp->stripesize = zv->zv_volblocksize;
	}
#endif	/* illumos */
	if ((flag & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
		err = SET_ERROR(EROFS);
		goto out;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out;
	}
#ifdef FEXCL
	if (flag & FEXCL) {
		if (zv->zv_total_opens != 0) {
			err = SET_ERROR(EBUSY);
			goto out;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}
#endif

#ifdef illumos
	if (zv->zv_open_count[otyp] == 0 || otyp == OTYP_LYR) {
		zv->zv_open_count[otyp]++;
		zv->zv_total_opens++;
	}
	mutex_exit(&zfsdev_state_lock);
#else
	zv->zv_total_opens += count;
	if (locked)
		mutex_exit(&zfsdev_state_lock);
#endif

	return (err);
out:
	if (zv->zv_total_opens == 0)
		zvol_last_close(zv);
#ifdef illumos
	mutex_exit(&zfsdev_state_lock);
#else
	if (locked)
		mutex_exit(&zfsdev_state_lock);
#endif
	return (err);
}

/*ARGSUSED*/
#ifdef illumos
int
zvol_close(dev_t dev, int flag, int otyp, cred_t *cr)
{
	minor_t minor = getminor(dev);
	zvol_state_t *zv;
	int error = 0;

	mutex_enter(&zfsdev_state_lock);

	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
	if (zv == NULL) {
		mutex_exit(&zfsdev_state_lock);
#else	/* !illumos */
static int
zvol_close(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	int error = 0;
	boolean_t locked = B_FALSE;

	/* See comment in zvol_open(). */
	if (!MUTEX_HELD(&zfsdev_state_lock)) {
		mutex_enter(&zfsdev_state_lock);
		locked = B_TRUE;
	}

	zv = pp->private;
	if (zv == NULL) {
		if (locked)
			mutex_exit(&zfsdev_state_lock);
#endif	/* illumos */
		return (SET_ERROR(ENXIO));
	}

	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT(zv->zv_total_opens == 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
#ifdef illumos
	ASSERT(zv->zv_open_count[otyp] != 0);
#endif
	ASSERT(zv->zv_total_opens != 0);

	/*
	 * You may get multiple opens, but only one close.
	 */
#ifdef illumos
	zv->zv_open_count[otyp]--;
	zv->zv_total_opens--;
#else
	zv->zv_total_opens -= count;
#endif

	if (zv->zv_total_opens == 0)
		zvol_last_close(zv);

#ifdef illumos
	mutex_exit(&zfsdev_state_lock);
#else
	if (locked)
		mutex_exit(&zfsdev_state_lock);
#endif
	return (error);
}

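/*
 * Completion callback used by zvol_get_data(): release the dbuf and the
 * range lock and, on success, record the written block in the lwb.
 */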
static void
zvol_get_done(zgd_t *zgd, int error)
{
	if (zgd->zgd_db)
		dmu_buf_rele(zgd->zgd_db, zgd);

	zfs_range_unlock(zgd->zgd_rl);

	if (error == 0 && zgd->zgd_bp)
		zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);

	kmem_free(zgd, sizeof (zgd_t));
}

/*
 * Get data to generate a TX_WRITE intent log record.
 */
static int
zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
{
	zvol_state_t *zv = arg;
	objset_t *os = zv->zv_objset;
	uint64_t object = ZVOL_OBJ;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;	/* length of user data */
	dmu_buf_t *db;
	zgd_t *zgd;
	int error;

	ASSERT3P(lwb, !=, NULL);
	ASSERT3P(zio, !=, NULL);
	ASSERT3U(size, !=, 0);

	zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_lwb = lwb;

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) { /* immediate write */
		zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size,
		    RL_READER);
		error = dmu_read(os, object, offset, size, buf,
		    DMU_READ_NO_PREFETCH);
	} else { /* indirect write */
		/*
		 * We have to lock the whole block so that no one can change
		 * the data while it is being written out and its checksum
		 * is being calculated.  Contrary to zfs_get_data(), we need
		 * not re-check the blocksize after taking the lock because
		 * it cannot change on a zvol.
		 */
		size = zv->zv_volblocksize;
		offset = P2ALIGN(offset, size);
		zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size,
		    RL_READER);
		error = dmu_buf_hold(os, object, offset, zgd, &db,
		    DMU_READ_NO_PREFETCH);
		if (error == 0) {
			blkptr_t *bp = &lr->lr_blkptr;

			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zvol_get_done, zgd);

			if (error == 0)
				return (0);
		}
	}

	zvol_get_done(zgd, error);

	return (error);
}

/*
 * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
 *
 * We store data in the log buffers if it's small enough.
 * Otherwise we will later flush the data out via dmu_sync().
 */
ssize_t zvol_immediate_write_sz = 32768;
#ifdef _KERNEL
SYSCTL_LONG(_vfs_zfs_vol, OID_AUTO, immediate_write_sz, CTLFLAG_RWTUN,
    &zvol_immediate_write_sz, 0, "Minimal size for indirect log write");
#endif

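/*
 * Each chunk of a write is logged with one of three itx write states:
 * WR_INDIRECT syncs the data via dmu_sync() and logs only a block
 * pointer, WR_COPIED copies the data into the log record right away,
 * and WR_NEED_COPY defers the copy until the itx is committed.
 */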
static void
zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
    boolean_t sync)
{
	uint32_t blocksize = zv->zv_volblocksize;
	zilog_t *zilog = zv->zv_zilog;
	itx_wr_state_t write_state;

	if (zil_replaying(zilog, tx))
		return;

	if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
		write_state = WR_INDIRECT;
	else if (!spa_has_slogs(zilog->zl_spa) &&
	    resid >= blocksize && blocksize > zvol_immediate_write_sz)
		write_state = WR_INDIRECT;
	else if (sync)
		write_state = WR_COPIED;
	else
		write_state = WR_NEED_COPY;

	while (resid) {
		itx_t *itx;
		lr_write_t *lr;
		itx_wr_state_t wr_state = write_state;
		ssize_t len = resid;

		if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA)
			wr_state = WR_NEED_COPY;
		else if (wr_state == WR_INDIRECT)
			len = MIN(blocksize - P2PHASE(off, blocksize), resid);

		itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
		    (wr_state == WR_COPIED ? len : 0));
		lr = (lr_write_t *)&itx->itx_lr;
		if (wr_state == WR_COPIED && dmu_read(zv->zv_objset,
		    ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
			zil_itx_destroy(itx);
			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
			lr = (lr_write_t *)&itx->itx_lr;
			wr_state = WR_NEED_COPY;
		}

		itx->itx_wr_state = wr_state;
		lr->lr_foid = ZVOL_OBJ;
		lr->lr_offset = off;
		lr->lr_length = len;
		lr->lr_blkoff = 0;
		BP_ZERO(&lr->lr_blkptr);

		itx->itx_private = zv;

		if (!sync && (zv->zv_sync_cnt == 0))
			itx->itx_sync = B_FALSE;

		zil_itx_assign(zilog, itx, tx);

		off += len;
		resid -= len;
	}
}

#ifdef illumos
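/*
 * Raw I/O for dump support, bypassing the normal ZIO pipeline: recurse
 * through mirror-like vdevs, hand raidz vdevs to vdev_raidz_physio(), and
 * issue physical reads/writes (or ldi_dump() when panicking) on leaves.
 */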
static int
zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t origoffset,
    uint64_t size, boolean_t doread, boolean_t isdump)
{
	vdev_disk_t *dvd;
	int c;
	int numerrors = 0;

	if (vd->vdev_ops == &vdev_mirror_ops ||
	    vd->vdev_ops == &vdev_replacing_ops ||
	    vd->vdev_ops == &vdev_spare_ops) {
		for (c = 0; c < vd->vdev_children; c++) {
			int err = zvol_dumpio_vdev(vd->vdev_child[c],
			    addr, offset, origoffset, size, doread, isdump);
			if (err != 0) {
				numerrors++;
			} else if (doread) {
				break;
			}
		}
	}

	if (!vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_raidz_ops)
		return (numerrors < vd->vdev_children ? 0 : EIO);

	if (doread && !vdev_readable(vd))
		return (SET_ERROR(EIO));
	else if (!doread && !vdev_writeable(vd))
		return (SET_ERROR(EIO));

	if (vd->vdev_ops == &vdev_raidz_ops) {
		return (vdev_raidz_physio(vd,
		    addr, size, offset, origoffset, doread, isdump));
	}

	offset += VDEV_LABEL_START_SIZE;

	if (ddi_in_panic() || isdump) {
		ASSERT(!doread);
		if (doread)
			return (SET_ERROR(EIO));
		dvd = vd->vdev_tsd;
		ASSERT3P(dvd, !=, NULL);
		return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset),
		    lbtodb(size)));
	} else {
		dvd = vd->vdev_tsd;
		ASSERT3P(dvd, !=, NULL);
		return (vdev_disk_ldi_physio(dvd->vd_lh, addr, size,
		    offset, doread ? B_READ : B_WRITE));
	}
}

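/*
 * Translate a logical volume offset to its physical location using the
 * extent list built by zvol_get_lbas() and issue the raw I/O through the
 * top-level vdev.
 */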
static int
zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size,
    boolean_t doread, boolean_t isdump)
{
	vdev_t *vd;
	int error;
	zvol_extent_t *ze;
	spa_t *spa = dmu_objset_spa(zv->zv_objset);

	/* Must be sector aligned, and not straddle a block boundary. */
	if (P2PHASE(offset, DEV_BSIZE) || P2PHASE(size, DEV_BSIZE) ||
	    P2BOUNDARY(offset, size, zv->zv_volblocksize)) {
		return (SET_ERROR(EINVAL));
	}
	ASSERT(size <= zv->zv_volblocksize);

	/* Locate the extent this belongs to */
	ze = list_head(&zv->zv_extents);
	while (offset >= ze->ze_nblks * zv->zv_volblocksize) {
		offset -= ze->ze_nblks * zv->zv_volblocksize;
		ze = list_next(&zv->zv_extents, ze);
	}

	if (ze == NULL)
		return (SET_ERROR(EINVAL));

	if (!ddi_in_panic())
		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

	vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva));
	offset += DVA_GET_OFFSET(&ze->ze_dva);
	error = zvol_dumpio_vdev(vd, addr, offset, DVA_GET_OFFSET(&ze->ze_dva),
	    size, doread, isdump);

	if (!ddi_in_panic())
		spa_config_exit(spa, SCL_STATE, FTAG);

	return (error);
}

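/*
 * Main block I/O entry point: the strategy(9E) routine on illumos; on
 * FreeBSD it is shared by the GEOM start routine and the cdev, and also
 * handles BIO_FLUSH and BIO_DELETE requests.
 */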
int
zvol_strategy(buf_t *bp)
{
	zfs_soft_state_t *zs = NULL;
#else	/* !illumos */
void
zvol_strategy(struct bio *bp)
{
#endif	/* illumos */
	zvol_state_t *zv;
	uint64_t off, volsize;
	size_t resid;
	char *addr;
	objset_t *os;
	rl_t *rl;
	int error = 0;
#ifdef illumos
	boolean_t doread = bp->b_flags & B_READ;
#else
	boolean_t doread = 0;
#endif
	boolean_t is_dumpified;
	boolean_t sync;

#ifdef illumos
	if (getminor(bp->b_edev) == 0) {
		error = SET_ERROR(EINVAL);
	} else {
		zs = ddi_get_soft_state(zfsdev_state, getminor(bp->b_edev));
		if (zs == NULL)
			error = SET_ERROR(ENXIO);
		else if (zs->zss_type != ZSST_ZVOL)
			error = SET_ERROR(EINVAL);
	}

	if (error) {
		bioerror(bp, error);
		biodone(bp);
		return (0);
	}

	zv = zs->zss_data;

	if (!(bp->b_flags & B_READ) && (zv->zv_flags & ZVOL_RDONLY)) {
		bioerror(bp, EROFS);
		biodone(bp);
		return (0);
	}

	off = ldbtob(bp->b_blkno);
#else	/* !illumos */
	if (bp->bio_to)
		zv = bp->bio_to->private;
	else
		zv = bp->bio_dev->si_drv2;

	if (zv == NULL) {
		error = SET_ERROR(ENXIO);
		goto out;
	}

	if (bp->bio_cmd != BIO_READ && (zv->zv_flags & ZVOL_RDONLY)) {
		error = SET_ERROR(EROFS);
		goto out;
	}

	switch (bp->bio_cmd) {
	case BIO_FLUSH:
		goto sync;
	case BIO_READ:
		doread = 1;
		/* FALLTHROUGH */
	case BIO_WRITE:
	case BIO_DELETE:
		break;
	default:
		error = EOPNOTSUPP;
		goto out;
	}

	off = bp->bio_offset;
#endif	/* illumos */
	volsize = zv->zv_volsize;

	os = zv->zv_objset;
	ASSERT(os != NULL);

#ifdef illumos
	bp_mapin(bp);
	addr = bp->b_un.b_addr;
	resid = bp->b_bcount;

	if (resid > 0 && (off < 0 || off >= volsize)) {
		bioerror(bp, EIO);
		biodone(bp);
		return (0);
	}

	is_dumpified = zv->zv_flags & ZVOL_DUMPIFIED;
	sync = ((!(bp->b_flags & B_ASYNC) &&
	    !(zv->zv_flags & ZVOL_WCE)) ||
	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)) &&
	    !doread && !is_dumpified;
#else	/* !illumos */
	addr = bp->bio_data;
	resid = bp->bio_length;

	if (resid > 0 && (off < 0 || off >= volsize)) {
		error = SET_ERROR(EIO);
		goto out;
	}

	is_dumpified = B_FALSE;
	sync = !doread && !is_dumpified &&
	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
#endif	/* illumos */

	/*
	 * There must be no buffer changes when doing a dmu_sync() because
	 * we can't change the data whilst calculating the checksum.
	 */
	rl = zfs_range_lock(&zv->zv_znode, off, resid,
	    doread ? RL_READER : RL_WRITER);

#ifndef illumos
	if (bp->bio_cmd == BIO_DELETE) {
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
		} else {
			zvol_log_truncate(zv, tx, off, resid, sync);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    off, resid);
			resid = 0;
		}
		goto unlock;
	}
#endif
	while (resid != 0 && off < volsize) {
		size_t size = MIN(resid, zvol_maxphys);
#ifdef illumos
		if (is_dumpified) {
			size = MIN(size, P2END(off, zv->zv_volblocksize) - off);
			error = zvol_dumpio(zv, addr, off, size,
			    doread, B_FALSE);
		} else if (doread) {
#else
		if (doread) {
#endif
			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
			    DMU_READ_PREFETCH);
		} else {
			dmu_tx_t *tx = dmu_tx_create(os);
			dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
			error = dmu_tx_assign(tx, TXG_WAIT);
			if (error) {
				dmu_tx_abort(tx);
			} else {
				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
				zvol_log_write(zv, tx, off, size, sync);
				dmu_tx_commit(tx);
			}
		}
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
		off += size;
		addr += size;
		resid -= size;
	}
#ifndef illumos
unlock:
#endif
	zfs_range_unlock(rl);

#ifdef illumos
	if ((bp->b_resid = resid) == bp->b_bcount)
		bioerror(bp, off > volsize ? EINVAL : error);

	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	biodone(bp);

	return (0);
#else	/* !illumos */
	bp->bio_completed = bp->bio_length - resid;
	if (bp->bio_completed < bp->bio_length && off > volsize)
		error = EINVAL;

	if (sync) {
sync:
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	}
out:
	if (bp->bio_to)
		g_io_deliver(bp, error);
	else
		biofinish(bp, NULL, error);
#endif	/* illumos */
}

#ifdef illumos
/*
 * Set the buffer count to the zvol maximum transfer.
 * Using our own routine instead of the default minphys()
 * means that for larger writes we write bigger buffers on X86
 * (128K instead of 56K) and flush the disk write cache less often
 * (every zvol_maxphys - currently 1MB) instead of minphys (currently
 * 56K on X86 and 128K on sparc).
 */
void
zvol_minphys(struct buf *bp)
{
	if (bp->b_bcount > zvol_maxphys)
		bp->b_bcount = zvol_maxphys;
}

int
zvol_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblocks)
{
	minor_t minor = getminor(dev);
	zvol_state_t *zv;
	int error = 0;
	uint64_t size;
	uint64_t boff;
	uint64_t resid;

	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
	if (zv == NULL)
		return (SET_ERROR(ENXIO));

	if ((zv->zv_flags & ZVOL_DUMPIFIED) == 0)
		return (SET_ERROR(EINVAL));

	boff = ldbtob(blkno);
	resid = ldbtob(nblocks);

	VERIFY3U(boff + resid, <=, zv->zv_volsize);

	while (resid) {
		size = MIN(resid, P2END(boff, zv->zv_volblocksize) - boff);
		error = zvol_dumpio(zv, addr, boff, size, B_FALSE, B_TRUE);
		if (error)
			break;
		boff += size;
		addr += size;
		resid -= size;
	}

	return (error);
}

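/*
 * Character-device read entry point.  Reads are clamped to the volume
 * size and performed through the cached bonus-buffer handle.
 */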
/*ARGSUSED*/
int
zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
{
	minor_t minor = getminor(dev);
#else	/* !illumos */
int
zvol_read(struct cdev *dev, struct uio *uio, int ioflag)
{
#endif	/* illumos */
	zvol_state_t *zv;
	uint64_t volsize;
	rl_t *rl;
	int error = 0;

#ifdef illumos
	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
	if (zv == NULL)
		return (SET_ERROR(ENXIO));
#else
	zv = dev->si_drv2;
#endif

	volsize = zv->zv_volsize;
	/*
	 * uio_loffset == volsize isn't an error; it's required for
	 * EOF processing.
	 */
	if (uio->uio_resid > 0 &&
	    (uio->uio_loffset < 0 || uio->uio_loffset > volsize))
		return (SET_ERROR(EIO));

#ifdef illumos
	if (zv->zv_flags & ZVOL_DUMPIFIED) {
		error = physio(zvol_strategy, NULL, dev, B_READ,
		    zvol_minphys, uio);
		return (error);
	}
#endif

	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
	    RL_READER);
	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);

		/* don't read past the end */
		if (bytes > volsize - uio->uio_loffset)
			bytes = volsize - uio->uio_loffset;

		error = dmu_read_uio_dbuf(zv->zv_dbuf, uio, bytes);
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_range_unlock(rl);
	return (error);
}

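/*
 * Character-device write entry point.  Each chunk is written in its own
 * transaction and logged; the ZIL is committed on return if the write is
 * synchronous.
 */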
#ifdef illumos
/*ARGSUSED*/
int
zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
{
	minor_t minor = getminor(dev);
#else	/* !illumos */
int
zvol_write(struct cdev *dev, struct uio *uio, int ioflag)
{
#endif	/* illumos */
	zvol_state_t *zv;
	uint64_t volsize;
	rl_t *rl;
	int error = 0;
	boolean_t sync;

#ifdef illumos
	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
	if (zv == NULL)
		return (SET_ERROR(ENXIO));
#else
	zv = dev->si_drv2;
#endif

	volsize = zv->zv_volsize;
	/*
	 * uio_loffset == volsize isn't an error; it's required for
	 * EOF processing.
	 */
	if (uio->uio_resid > 0 &&
	    (uio->uio_loffset < 0 || uio->uio_loffset > volsize))
		return (SET_ERROR(EIO));

#ifdef illumos
	if (zv->zv_flags & ZVOL_DUMPIFIED) {
		error = physio(zvol_strategy, NULL, dev, B_WRITE,
		    zvol_minphys, uio);
		return (error);
	}

	sync = !(zv->zv_flags & ZVOL_WCE) ||
#else
	sync = (ioflag & IO_SYNC) ||
#endif
	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);

	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
	    RL_WRITER);
	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
		uint64_t off = uio->uio_loffset;
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* don't write past the end */
			bytes = volsize - off;

		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx);
		if (error == 0)
			zvol_log_write(zv, tx, off, bytes, sync);
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_range_unlock(rl);
	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	return (error);
}

#ifdef illumos
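/*
 * Synthesize a minimal GPT for the volume: a header at LBA 1 and a single
 * EFI_RESERVED partition entry at LBA 2 spanning the usable LBAs.
 */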
1960int
1961zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs)
1962{
1963	struct uuid uuid = EFI_RESERVED;
1964	efi_gpe_t gpe = { 0 };
1965	uint32_t crc;
1966	dk_efi_t efi;
1967	int length;
1968	char *ptr;
1969
1970	if (ddi_copyin(arg, &efi, sizeof (dk_efi_t), flag))
1971		return (SET_ERROR(EFAULT));
1972	ptr = (char *)(uintptr_t)efi.dki_data_64;
1973	length = efi.dki_length;
1974	/*
1975	 * Some clients may attempt to request a PMBR for the
1976	 * zvol.  Currently this interface will return EINVAL to
1977	 * such requests.  These requests could be supported by
1978	 * adding a check for lba == 0 and consing up an appropriate
1979	 * PMBR.
1980	 */
1981	if (efi.dki_lba < 1 || efi.dki_lba > 2 || length <= 0)
1982		return (SET_ERROR(EINVAL));
1983
1984	gpe.efi_gpe_StartingLBA = LE_64(34ULL);
1985	gpe.efi_gpe_EndingLBA = LE_64((vs >> bs) - 1);
1986	UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid);
1987
1988	if (efi.dki_lba == 1) {
1989		efi_gpt_t gpt = { 0 };
1990
1991		gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE);
1992		gpt.efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT);
1993		gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt));
1994		gpt.efi_gpt_MyLBA = LE_64(1ULL);
1995		gpt.efi_gpt_FirstUsableLBA = LE_64(34ULL);
1996		gpt.efi_gpt_LastUsableLBA = LE_64((vs >> bs) - 1);
1997		gpt.efi_gpt_PartitionEntryLBA = LE_64(2ULL);
1998		gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1);
1999		gpt.efi_gpt_SizeOfPartitionEntry =
2000		    LE_32(sizeof (efi_gpe_t));
2001		CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table);
2002		gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
2003		CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table);
2004		gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
2005		if (ddi_copyout(&gpt, ptr, MIN(sizeof (gpt), length),
2006		    flag))
2007			return (SET_ERROR(EFAULT));
2008		ptr += sizeof (gpt);
2009		length -= sizeof (gpt);
2010	}
2011	if (length > 0 && ddi_copyout(&gpe, ptr, MIN(sizeof (gpe),
2012	    length), flag))
2013		return (SET_ERROR(EFAULT));
2014	return (0);
2015}
2016
2017/*
2018 * BEGIN entry points to allow external callers access to the volume.
2019 */
2020/*
2021 * Return the volume parameters needed for access from an external caller.
2022 * These values are invariant as long as the volume is held open.
2023 */
2024int
2025zvol_get_volume_params(minor_t minor, uint64_t *blksize,
2026    uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl,
2027    void **rl_hdl, void **bonus_hdl)
2028{
2029	zvol_state_t *zv;
2030
2031	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
2032	if (zv == NULL)
2033		return (SET_ERROR(ENXIO));
2034	if (zv->zv_flags & ZVOL_DUMPIFIED)
2035		return (SET_ERROR(ENXIO));
2036
2037	ASSERT(blksize && max_xfer_len && minor_hdl &&
2038	    objset_hdl && zil_hdl && rl_hdl && bonus_hdl);
2039
2040	*blksize = zv->zv_volblocksize;
2041	*max_xfer_len = (uint64_t)zvol_maxphys;
2042	*minor_hdl = zv;
2043	*objset_hdl = zv->zv_objset;
2044	*zil_hdl = zv->zv_zilog;
2045	*rl_hdl = &zv->zv_znode;
2046	*bonus_hdl = zv->zv_dbuf;
2047	return (0);
2048}
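/*
 * A minimal sketch of how an external caller might consume these
 * handles (the variable names here are purely illustrative, not part
 * of any real consumer):
 *
 *	uint64_t blksize, maxxfer;
 *	void *mhdl, *oshdl, *zilhdl, *rlhdl, *bhdl;
 *
 *	if (zvol_get_volume_params(minor, &blksize, &maxxfer,
 *	    &mhdl, &oshdl, &zilhdl, &rlhdl, &bhdl) == 0) {
 *		uint64_t size = zvol_get_volume_size(mhdl);
 *		int wce = zvol_get_volume_wce(mhdl);
 *		... issue I/O in multiples of blksize, at most
 *		    maxxfer bytes at a time ...
 *	}
 *
 * The handles remain valid for as long as the volume is held open.
 */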
2049
2050/*
2051 * Return the current volume size to an external caller.
2052 * The size can change while the volume is open.
2053 */
2054uint64_t
2055zvol_get_volume_size(void *minor_hdl)
2056{
2057	zvol_state_t *zv = minor_hdl;
2058
2059	return (zv->zv_volsize);
2060}
2061
2062/*
2063 * Return the current WCE setting to an external caller.
2064 * The WCE setting can change while the volume is open.
2065 */
2066int
2067zvol_get_volume_wce(void *minor_hdl)
2068{
2069	zvol_state_t *zv = minor_hdl;
2070
2071	return ((zv->zv_flags & ZVOL_WCE) ? 1 : 0);
2072}
2073
2074/*
2075 * Entry point for external callers to zvol_log_write
2076 */
2077void
2078zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off, ssize_t resid,
2079    boolean_t sync)
2080{
2081	zvol_state_t *zv = minor_hdl;
2082
2083	zvol_log_write(zv, tx, off, resid, sync);
2084}
2085/*
2086 * END entry points to allow external callers access to the volume.
2087 */
2088#endif	/* illumos */
2089
2090/*
2091 * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE.
2092 */
2093static void
2094zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len,
2095    boolean_t sync)
2096{
2097	itx_t *itx;
2098	lr_truncate_t *lr;
2099	zilog_t *zilog = zv->zv_zilog;
2100
2101	if (zil_replaying(zilog, tx))
2102		return;
2103
2104	itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
2105	lr = (lr_truncate_t *)&itx->itx_lr;
2106	lr->lr_foid = ZVOL_OBJ;
2107	lr->lr_offset = off;
2108	lr->lr_length = len;
2109
2110	itx->itx_sync = (sync || zv->zv_sync_cnt != 0);
2111	zil_itx_assign(zilog, itx, tx);
2112}
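/*
 * As with writes, the truncate itx above is only marked synchronous
 * when the caller requests it or when the volume has synchronous
 * opens outstanding (zv_sync_cnt != 0); durability still requires a
 * later zil_commit() on ZVOL_OBJ by the caller.
 */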
2113
2114#ifdef illumos
2115/*
2116 * Dirtbag ioctls to support mkfs(1M) for UFS filesystems.  See dkio(7I).
2117 * Also a dirtbag dkio ioctl for unmap/free-block functionality.
2118 */
2119/*ARGSUSED*/
2120int
2121zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
2122{
2123	zvol_state_t *zv;
2124	struct dk_callback *dkc;
2125	int error = 0;
2126	rl_t *rl;
2127
2128	mutex_enter(&zfsdev_state_lock);
2129
2130	zv = zfsdev_get_soft_state(getminor(dev), ZSST_ZVOL);
2131
2132	if (zv == NULL) {
2133		mutex_exit(&zfsdev_state_lock);
2134		return (SET_ERROR(ENXIO));
2135	}
2136	ASSERT(zv->zv_total_opens > 0);
2137
2138	switch (cmd) {
2139
2140	case DKIOCINFO:
2141	{
2142		struct dk_cinfo dki;
2143
2144		bzero(&dki, sizeof (dki));
2145		(void) strcpy(dki.dki_cname, "zvol");
2146		(void) strcpy(dki.dki_dname, "zvol");
2147		dki.dki_ctype = DKC_UNKNOWN;
2148		dki.dki_unit = getminor(dev);
2149		dki.dki_maxtransfer =
2150		    1 << (SPA_OLD_MAXBLOCKSHIFT - zv->zv_min_bs);
2151		mutex_exit(&zfsdev_state_lock);
2152		if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag))
2153			error = SET_ERROR(EFAULT);
2154		return (error);
2155	}
2156
2157	case DKIOCGMEDIAINFO:
2158	{
2159		struct dk_minfo dkm;
2160
2161		bzero(&dkm, sizeof (dkm));
2162		dkm.dki_lbsize = 1U << zv->zv_min_bs;
2163		dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
2164		dkm.dki_media_type = DK_UNKNOWN;
2165		mutex_exit(&zfsdev_state_lock);
2166		if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag))
2167			error = SET_ERROR(EFAULT);
2168		return (error);
2169	}
2170
2171	case DKIOCGMEDIAINFOEXT:
2172	{
2173		struct dk_minfo_ext dkmext;
2174
2175		bzero(&dkmext, sizeof (dkmext));
2176		dkmext.dki_lbsize = 1U << zv->zv_min_bs;
2177		dkmext.dki_pbsize = zv->zv_volblocksize;
2178		dkmext.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
2179		dkmext.dki_media_type = DK_UNKNOWN;
2180		mutex_exit(&zfsdev_state_lock);
2181		if (ddi_copyout(&dkmext, (void *)arg, sizeof (dkmext), flag))
2182			error = SET_ERROR(EFAULT);
2183		return (error);
2184	}
2185
2186	case DKIOCGETEFI:
2187	{
2188		uint64_t vs = zv->zv_volsize;
2189		uint8_t bs = zv->zv_min_bs;
2190
2191		mutex_exit(&zfsdev_state_lock);
2192		error = zvol_getefi((void *)arg, flag, vs, bs);
2193		return (error);
2194	}
2195
2196	case DKIOCFLUSHWRITECACHE:
2197		dkc = (struct dk_callback *)arg;
2198		mutex_exit(&zfsdev_state_lock);
2199		zil_commit(zv->zv_zilog, ZVOL_OBJ);
2200		if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) {
2201			(*dkc->dkc_callback)(dkc->dkc_cookie, error);
2202			error = 0;
2203		}
2204		return (error);
2205
2206	case DKIOCGETWCE:
2207	{
2208		int wce = (zv->zv_flags & ZVOL_WCE) ? 1 : 0;
2209		if (ddi_copyout(&wce, (void *)arg, sizeof (int),
2210		    flag))
2211			error = SET_ERROR(EFAULT);
2212		break;
2213	}
2214	case DKIOCSETWCE:
2215	{
2216		int wce;
2217		if (ddi_copyin((void *)arg, &wce, sizeof (int),
2218		    flag)) {
2219			error = SET_ERROR(EFAULT);
2220			break;
2221		}
2222		if (wce) {
2223			zv->zv_flags |= ZVOL_WCE;
2224			mutex_exit(&zfsdev_state_lock);
2225		} else {
2226			zv->zv_flags &= ~ZVOL_WCE;
2227			mutex_exit(&zfsdev_state_lock);
2228			zil_commit(zv->zv_zilog, ZVOL_OBJ);
2229		}
2230		return (0);
2231	}
2232
2233	case DKIOCGGEOM:
2234	case DKIOCGVTOC:
2235		/*
2236		 * commands using these (like prtvtoc) expect ENOTSUP
2237		 * since we're emulating an EFI label
2238		 */
2239		error = SET_ERROR(ENOTSUP);
2240		break;
2241
2242	case DKIOCDUMPINIT:
2243		rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
2244		    RL_WRITER);
2245		error = zvol_dumpify(zv);
2246		zfs_range_unlock(rl);
2247		break;
2248
2249	case DKIOCDUMPFINI:
2250		if (!(zv->zv_flags & ZVOL_DUMPIFIED))
2251			break;
2252		rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
2253		    RL_WRITER);
2254		error = zvol_dump_fini(zv);
2255		zfs_range_unlock(rl);
2256		break;
2257
2258	case DKIOCFREE:
2259	{
2260		dkioc_free_t df;
2261		dmu_tx_t *tx;
2262
2263		if (!zvol_unmap_enabled)
2264			break;
2265
2266		if (ddi_copyin((void *)arg, &df, sizeof (df), flag)) {
2267			error = SET_ERROR(EFAULT);
2268			break;
2269		}
2270
2271		/*
2272		 * Apply Postel's Law to length-checking.  If the caller
2273		 * overshoots, just free up to the end of the volume, if
2274		 * there is anything left to free at all.
2275		 */
2276		if (df.df_start >= zv->zv_volsize)
2277			break;	/* No need to do anything... */
2278
2279		mutex_exit(&zfsdev_state_lock);
2280
2281		rl = zfs_range_lock(&zv->zv_znode, df.df_start, df.df_length,
2282		    RL_WRITER);
2283		tx = dmu_tx_create(zv->zv_objset);
2284		dmu_tx_mark_netfree(tx);
2285		error = dmu_tx_assign(tx, TXG_WAIT);
2286		if (error != 0) {
2287			dmu_tx_abort(tx);
2288		} else {
2289			zvol_log_truncate(zv, tx, df.df_start,
2290			    df.df_length, B_TRUE);
2291			dmu_tx_commit(tx);
2292			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
2293			    df.df_start, df.df_length);
2294		}
2295
2296		zfs_range_unlock(rl);
2297
2298		/*
2299		 * If the write cache is disabled, if the 'sync' property
2300		 * is set to 'always', or if the caller asked for a
2301		 * synchronous free, commit this operation to the zil.
2302		 * This will sync any previous uncommitted writes to the
2303		 * zvol object.  All of this can be disabled via the
2304		 * zvol_unmap_sync_enabled tunable.
2305		 */
2306		if ((error == 0) && zvol_unmap_sync_enabled &&
2307		    (!(zv->zv_flags & ZVOL_WCE) ||
2308		    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS) ||
2309		    (df.df_flags & DF_WAIT_SYNC))) {
2310			zil_commit(zv->zv_zilog, ZVOL_OBJ);
2311		}
2312
2313		return (error);
2314	}
2315
2316	default:
2317		error = SET_ERROR(ENOTTY);
2318		break;
2319
2320	}
2321	mutex_exit(&zfsdev_state_lock);
2322	return (error);
2323}
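/*
 * Locking note for zvol_ioctl(): zfsdev_state_lock is held on entry,
 * and most cases that can block for a long time (copyouts of the
 * larger structures, zil_commit(), the DKIOCFREE free path) drop it
 * explicitly before doing so and return without reacquiring it; the
 * cases that merely 'break' rely on the mutex_exit() before the
 * final return.
 */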
2324#endif	/* illumos */
2325
2326int
2327zvol_busy(void)
2328{
2329	return (zvol_minors != 0);
2330}
2331
2332void
2333zvol_init(void)
2334{
2335	VERIFY(ddi_soft_state_init(&zfsdev_state, sizeof (zfs_soft_state_t),
2336	    1) == 0);
2337#ifdef illumos
2338	mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL);
2339#else
2340	ZFS_LOG(1, "ZVOL Initialized.");
2341#endif
2342}
2343
2344void
2345zvol_fini(void)
2346{
2347#ifdef illumos
2348	mutex_destroy(&zfsdev_state_lock);
2349#endif
2350	ddi_soft_state_fini(&zfsdev_state);
2351	ZFS_LOG(1, "ZVOL Deinitialized.");
2352}
2353
2354#ifdef illumos
2355/*ARGSUSED*/
2356static int
2357zfs_mvdev_dump_feature_check(void *arg, dmu_tx_t *tx)
2358{
2359	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
2360
2361	if (spa_feature_is_active(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP))
2362		return (1);
2363	return (0);
2364}
2365
2366/*ARGSUSED*/
2367static void
2368zfs_mvdev_dump_activate_feature_sync(void *arg, dmu_tx_t *tx)
2369{
2370	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
2371
2372	spa_feature_incr(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, tx);
2373}
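/*
 * The two callbacks above are used with dsl_sync_task() in
 * zvol_dump_init(): the check function runs in syncing context and
 * returns non-zero once MULTI_VDEV_CRASH_DUMP is already active,
 * which aborts the task, so the sync function bumps the feature
 * refcount at most once.
 */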
2374
2375static int
2376zvol_dump_init(zvol_state_t *zv, boolean_t resize)
2377{
2378	dmu_tx_t *tx;
2379	int error;
2380	objset_t *os = zv->zv_objset;
2381	spa_t *spa = dmu_objset_spa(os);
2382	vdev_t *vd = spa->spa_root_vdev;
2383	nvlist_t *nv = NULL;
2384	uint64_t version = spa_version(spa);
2385	uint64_t checksum, compress, refresrv, vbs, dedup;
2386
2387	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
2388	ASSERT(vd->vdev_ops == &vdev_root_ops);
2389
2390	error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 0,
2391	    DMU_OBJECT_END);
2392	if (error != 0)
2393		return (error);
2394	/* wait for dmu_free_long_range to actually free the blocks */
2395	txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
2396
2397	/*
2398	 * If the pool on which the dump device is being initialized has more
2399	 * than one child vdev, check that the MULTI_VDEV_CRASH_DUMP feature is
2400	 * enabled.  If so, bump that feature's counter to indicate that the
2401	 * feature is active.  We also check the vdev type to handle the
2402	 * following case:
2403	 *   # zpool create test raidz disk1 disk2 disk3
2404	 *   Here spa_root_vdev->vdev_children == 1 (the raidz vdev), but
2405	 *   the raidz vdev itself has 3 children.
2406	 */
2407	if (vd->vdev_children > 1 || vd->vdev_ops == &vdev_raidz_ops) {
2408		if (!spa_feature_is_enabled(spa,
2409		    SPA_FEATURE_MULTI_VDEV_CRASH_DUMP))
2410			return (SET_ERROR(ENOTSUP));
2411		(void) dsl_sync_task(spa_name(spa),
2412		    zfs_mvdev_dump_feature_check,
2413		    zfs_mvdev_dump_activate_feature_sync, NULL,
2414		    2, ZFS_SPACE_CHECK_RESERVED);
2415	}
2416
2417	if (!resize) {
2418		error = dsl_prop_get_integer(zv->zv_name,
2419		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL);
2420		if (error == 0) {
2421			error = dsl_prop_get_integer(zv->zv_name,
2422			    zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum,
2423			    NULL);
2424		}
2425		if (error == 0) {
2426			error = dsl_prop_get_integer(zv->zv_name,
2427			    zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
2428			    &refresrv, NULL);
2429		}
2430		if (error == 0) {
2431			error = dsl_prop_get_integer(zv->zv_name,
2432			    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs,
2433			    NULL);
2434		}
2435		if (version >= SPA_VERSION_DEDUP && error == 0) {
2436			error = dsl_prop_get_integer(zv->zv_name,
2437			    zfs_prop_to_name(ZFS_PROP_DEDUP), &dedup, NULL);
2438		}
2439	}
2440	if (error != 0)
2441		return (error);
2442
2443	tx = dmu_tx_create(os);
2444	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
2445	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
2446	error = dmu_tx_assign(tx, TXG_WAIT);
2447	if (error != 0) {
2448		dmu_tx_abort(tx);
2449		return (error);
2450	}
2451
2452	/*
2453	 * If we are resizing the dump device then we only need to
2454	 * update the refreservation to match the newly updated
2455	 * zvol size.  Otherwise, save off the zvol's original property
2456	 * values so that we can restore them if it is ever undumpified.
2457	 */
2458	if (resize) {
2459		error = zap_update(os, ZVOL_ZAP_OBJ,
2460		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
2461		    &zv->zv_volsize, tx);
2462	} else {
2463		error = zap_update(os, ZVOL_ZAP_OBJ,
2464		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1,
2465		    &compress, tx);
2466		if (error == 0) {
2467			error = zap_update(os, ZVOL_ZAP_OBJ,
2468			    zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1,
2469			    &checksum, tx);
2470		}
2471		if (error == 0) {
2472			error = zap_update(os, ZVOL_ZAP_OBJ,
2473			    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
2474			    &refresrv, tx);
2475		}
2476		if (error == 0) {
2477			error = zap_update(os, ZVOL_ZAP_OBJ,
2478			    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
2479			    &vbs, tx);
2480		}
2481		if (error == 0) {
2482			error = dmu_object_set_blocksize(
2483			    os, ZVOL_OBJ, SPA_OLD_MAXBLOCKSIZE, 0, tx);
2484		}
2485		if (version >= SPA_VERSION_DEDUP && error == 0) {
2486			error = zap_update(os, ZVOL_ZAP_OBJ,
2487			    zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1,
2488			    &dedup, tx);
2489		}
2490		if (error == 0)
2491			zv->zv_volblocksize = SPA_OLD_MAXBLOCKSIZE;
2492	}
2493	dmu_tx_commit(tx);
2494
2495	/*
2496	 * We only need to update the zvol's properties if we are
2497	 * initializing the dump area for the first time.
2498	 */
2499	if (error == 0 && !resize) {
2500		/*
2501		 * If MULTI_VDEV_CRASH_DUMP is active, use the NOPARITY checksum
2502		 * function.  Otherwise, use the old default -- OFF.
2503		 */
2504		checksum = spa_feature_is_active(spa,
2505		    SPA_FEATURE_MULTI_VDEV_CRASH_DUMP) ? ZIO_CHECKSUM_NOPARITY :
2506		    ZIO_CHECKSUM_OFF;
2507
2508		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2509		VERIFY(nvlist_add_uint64(nv,
2510		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 0) == 0);
2511		VERIFY(nvlist_add_uint64(nv,
2512		    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
2513		    ZIO_COMPRESS_OFF) == 0);
2514		VERIFY(nvlist_add_uint64(nv,
2515		    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
2516		    checksum) == 0);
2517		if (version >= SPA_VERSION_DEDUP) {
2518			VERIFY(nvlist_add_uint64(nv,
2519			    zfs_prop_to_name(ZFS_PROP_DEDUP),
2520			    ZIO_CHECKSUM_OFF) == 0);
2521		}
2522
2523		error = zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
2524		    nv, NULL);
2525		nvlist_free(nv);
2526	}
2527
2528	/* Allocate the space for the dump */
2529	if (error == 0)
2530		error = zvol_prealloc(zv);
2531	return (error);
2532}
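/*
 * The ZAP updates above stash the original compression, checksum,
 * refreservation and volblocksize values (plus dedup on pools that
 * support it) in ZVOL_ZAP_OBJ, which is what allows zvol_dump_fini()
 * to restore them when the zvol is un-dumpified.
 */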
2533
2534static int
2535zvol_dumpify(zvol_state_t *zv)
2536{
2537	int error = 0;
2538	uint64_t dumpsize = 0;
2539	dmu_tx_t *tx;
2540	objset_t *os = zv->zv_objset;
2541
2542	if (zv->zv_flags & ZVOL_RDONLY)
2543		return (SET_ERROR(EROFS));
2544
2545	if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE,
2546	    8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) {
2547		boolean_t resize = (dumpsize > 0);
2548
2549		if ((error = zvol_dump_init(zv, resize)) != 0) {
2550			(void) zvol_dump_fini(zv);
2551			return (error);
2552		}
2553	}
2554
2555	/*
2556	 * Build up our lba mapping.
2557	 */
2558	error = zvol_get_lbas(zv);
2559	if (error) {
2560		(void) zvol_dump_fini(zv);
2561		return (error);
2562	}
2563
2564	tx = dmu_tx_create(os);
2565	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
2566	error = dmu_tx_assign(tx, TXG_WAIT);
2567	if (error) {
2568		dmu_tx_abort(tx);
2569		(void) zvol_dump_fini(zv);
2570		return (error);
2571	}
2572
2573	zv->zv_flags |= ZVOL_DUMPIFIED;
2574	error = zap_update(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, 8, 1,
2575	    &zv->zv_volsize, tx);
2576	dmu_tx_commit(tx);
2577
2578	if (error) {
2579		(void) zvol_dump_fini(zv);
2580		return (error);
2581	}
2582
2583	txg_wait_synced(dmu_objset_pool(os), 0);
2584	return (0);
2585}
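/*
 * Dumpify flow, in short: if ZVOL_DUMPSIZE is absent or stale,
 * (re)initialize the dump area via zvol_dump_init(); build the LBA
 * mapping with zvol_get_lbas(); set ZVOL_DUMPIFIED and record the new
 * ZVOL_DUMPSIZE in the ZAP; then txg_wait_synced() so all of this is
 * on disk before the volume is used as a dump device.
 */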
2586
2587static int
2588zvol_dump_fini(zvol_state_t *zv)
2589{
2590	dmu_tx_t *tx;
2591	objset_t *os = zv->zv_objset;
2592	nvlist_t *nv;
2593	int error = 0;
2594	uint64_t checksum, compress, refresrv, vbs, dedup;
2595	uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset));
2596
2597	/*
2598	 * Attempt to restore the zvol back to its pre-dumpified state.
2599	 * This is best-effort, as it's possible that not all of these
2600	 * properties were initialized during the dumpify process (e.g.,
2601	 * if there was an error during zvol_dump_init()).
2602	 */
2603
2604	tx = dmu_tx_create(os);
2605	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
2606	error = dmu_tx_assign(tx, TXG_WAIT);
2607	if (error) {
2608		dmu_tx_abort(tx);
2609		return (error);
2610	}
2611	(void) zap_remove(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, tx);
2612	dmu_tx_commit(tx);
2613
2614	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2615	    zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum);
2616	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2617	    zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &compress);
2618	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2619	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &refresrv);
2620	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2621	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, &vbs);
2622
2623	VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2624	(void) nvlist_add_uint64(nv,
2625	    zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum);
2626	(void) nvlist_add_uint64(nv,
2627	    zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress);
2628	(void) nvlist_add_uint64(nv,
2629	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv);
2630	if (version >= SPA_VERSION_DEDUP &&
2631	    zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2632	    zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1, &dedup) == 0) {
2633		(void) nvlist_add_uint64(nv,
2634		    zfs_prop_to_name(ZFS_PROP_DEDUP), dedup);
2635	}
2636	(void) zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
2637	    nv, NULL);
2638	nvlist_free(nv);
2639
2640	zvol_free_extents(zv);
2641	zv->zv_flags &= ~ZVOL_DUMPIFIED;
2642	(void) dmu_free_long_range(os, ZVOL_OBJ, 0, DMU_OBJECT_END);
2643	/* wait for dmu_free_long_range to actually free the blocks */
2644	txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
2645	tx = dmu_tx_create(os);
2646	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
2647	error = dmu_tx_assign(tx, TXG_WAIT);
2648	if (error) {
2649		dmu_tx_abort(tx);
2650		return (error);
2651	}
2652	if (dmu_object_set_blocksize(os, ZVOL_OBJ, vbs, 0, tx) == 0)
2653		zv->zv_volblocksize = vbs;
2654	dmu_tx_commit(tx);
2655
2656	return (0);
2657}
2658#else	/* !illumos */
2659
2660static void
2661zvol_geom_run(zvol_state_t *zv)
2662{
2663	struct g_provider *pp;
2664
2665	pp = zv->zv_provider;
2666	g_error_provider(pp, 0);
2667
2668	kproc_kthread_add(zvol_geom_worker, zv, &zfsproc, NULL, 0, 0,
2669	    "zfskern", "zvol %s", pp->name + sizeof(ZVOL_DRIVER));
2670}
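/*
 * Provider names have the form "zvol/<dataset>" (see
 * zvol_rename_minor() below), so the pp->name + sizeof(ZVOL_DRIVER)
 * above skips the "zvol/" prefix and names the worker kthread after
 * the dataset alone.
 */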
2671
2672static void
2673zvol_geom_destroy(zvol_state_t *zv)
2674{
2675	struct g_provider *pp;
2676
2677	g_topology_assert();
2678
2679	mtx_lock(&zv->zv_queue_mtx);
2680	zv->zv_state = 1;
2681	wakeup_one(&zv->zv_queue);
2682	while (zv->zv_state != 2)
2683		msleep(&zv->zv_state, &zv->zv_queue_mtx, 0, "zvol:w", 0);
2684	mtx_destroy(&zv->zv_queue_mtx);
2685
2686	pp = zv->zv_provider;
2687	zv->zv_provider = NULL;
2688	pp->private = NULL;
2689	g_wither_geom(pp->geom, ENXIO);
2690}
2691
2692static int
2693zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
2694{
2695	int count, error, flags;
2696
2697	g_topology_assert();
2698
2699	/*
2700	 * To keep it simple we expect either an open or a close, but
2701	 * not both at the same time.
2702	 */
2703	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
2704	    (acr <= 0 && acw <= 0 && ace <= 0),
2705	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
2706	    pp->name, acr, acw, ace));
2707
2708	if (pp->private == NULL) {
2709		if (acr <= 0 && acw <= 0 && ace <= 0)
2710			return (0);
2711		return (pp->error);
2712	}
2713
2714	/*
2715	 * We don't pass the FEXCL flag to zvol_open()/zvol_close() if
2716	 * ace != 0, because GEOM already handles that, and handles it a
2717	 * bit differently.  GEOM allows multiple read/exclusive consumers,
2718	 * while ZFS allows only one exclusive consumer, no matter whether
2719	 * it is a reader or a writer.  The GEOM model works better here,
2720	 * so we leave the decision to GEOM.
2721	 */
2722
2723	count = acr + acw + ace;
2724	if (count == 0)
2725		return (0);
2726
2727	flags = 0;
2728	if (acr != 0 || ace != 0)
2729		flags |= FREAD;
2730	if (acw != 0)
2731		flags |= FWRITE;
2732
2733	g_topology_unlock();
2734	if (count > 0)
2735		error = zvol_open(pp, flags, count);
2736	else
2737		error = zvol_close(pp, flags, -count);
2738	g_topology_lock();
2739	return (error);
2740}
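/*
 * Example: a consumer opening read-only passes (acr, acw, ace) =
 * (1, 0, 0), so count == 1, flags == FREAD and zvol_open() runs;
 * the matching close passes (-1, 0, 0) and reaches zvol_close()
 * with count 1.
 */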
2741
2742static void
2743zvol_geom_start(struct bio *bp)
2744{
2745	zvol_state_t *zv;
2746	boolean_t first;
2747
2748	zv = bp->bio_to->private;
2749	ASSERT(zv != NULL);
2750	switch (bp->bio_cmd) {
2751	case BIO_FLUSH:
2752		if (!THREAD_CAN_SLEEP())
2753			goto enqueue;
2754		zil_commit(zv->zv_zilog, ZVOL_OBJ);
2755		g_io_deliver(bp, 0);
2756		break;
2757	case BIO_READ:
2758	case BIO_WRITE:
2759	case BIO_DELETE:
2760		if (!THREAD_CAN_SLEEP())
2761			goto enqueue;
2762		zvol_strategy(bp);
2763		break;
2764	case BIO_GETATTR: {
2765		spa_t *spa = dmu_objset_spa(zv->zv_objset);
2766		uint64_t refd, avail, usedobjs, availobjs, val;
2767
2768		if (g_handleattr_int(bp, "GEOM::candelete", 1))
2769			return;
2770		if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
2771			dmu_objset_space(zv->zv_objset, &refd, &avail,
2772			    &usedobjs, &availobjs);
2773			if (g_handleattr_off_t(bp, "blocksavail",
2774			    avail / DEV_BSIZE))
2775				return;
2776		} else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
2777			dmu_objset_space(zv->zv_objset, &refd, &avail,
2778			    &usedobjs, &availobjs);
2779			if (g_handleattr_off_t(bp, "blocksused",
2780			    refd / DEV_BSIZE))
2781				return;
2782		} else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
2783			avail = metaslab_class_get_space(spa_normal_class(spa));
2784			avail -= metaslab_class_get_alloc(spa_normal_class(spa));
2785			if (g_handleattr_off_t(bp, "poolblocksavail",
2786			    avail / DEV_BSIZE))
2787				return;
2788		} else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
2789			refd = metaslab_class_get_alloc(spa_normal_class(spa));
2790			if (g_handleattr_off_t(bp, "poolblocksused",
2791			    refd / DEV_BSIZE))
2792				return;
2793		}
2794		/* FALLTHROUGH */
2795	}
2796	default:
2797		g_io_deliver(bp, EOPNOTSUPP);
2798		break;
2799	}
2800	return;
2801
2802enqueue:
2803	mtx_lock(&zv->zv_queue_mtx);
2804	first = (bioq_first(&zv->zv_queue) == NULL);
2805	bioq_insert_tail(&zv->zv_queue, bp);
2806	mtx_unlock(&zv->zv_queue_mtx);
2807	if (first)
2808		wakeup_one(&zv->zv_queue);
2809}
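/*
 * The enqueue path above exists because zvol_geom_start() can be
 * entered from a context that must not sleep (e.g. direct dispatch
 * from the GEOM layer); such bios are queued and serviced instead by
 * zvol_geom_worker() below, which runs at PRIBIO and may block.
 */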
2810
2811static void
2812zvol_geom_worker(void *arg)
2813{
2814	zvol_state_t *zv;
2815	struct bio *bp;
2816
2817	thread_lock(curthread);
2818	sched_prio(curthread, PRIBIO);
2819	thread_unlock(curthread);
2820
2821	zv = arg;
2822	for (;;) {
2823		mtx_lock(&zv->zv_queue_mtx);
2824		bp = bioq_takefirst(&zv->zv_queue);
2825		if (bp == NULL) {
2826			if (zv->zv_state == 1) {
2827				zv->zv_state = 2;
2828				wakeup(&zv->zv_state);
2829				mtx_unlock(&zv->zv_queue_mtx);
2830				kthread_exit();
2831			}
2832			msleep(&zv->zv_queue, &zv->zv_queue_mtx, PRIBIO | PDROP,
2833			    "zvol:io", 0);
2834			continue;
2835		}
2836		mtx_unlock(&zv->zv_queue_mtx);
2837		switch (bp->bio_cmd) {
2838		case BIO_FLUSH:
2839			zil_commit(zv->zv_zilog, ZVOL_OBJ);
2840			g_io_deliver(bp, 0);
2841			break;
2842		case BIO_READ:
2843		case BIO_WRITE:
2844		case BIO_DELETE:
2845			zvol_strategy(bp);
2846			break;
2847		default:
2848			g_io_deliver(bp, EOPNOTSUPP);
2849			break;
2850		}
2851	}
2852}
2853
2854extern boolean_t dataset_name_hidden(const char *name);
2855
2856static int
2857zvol_create_snapshots(objset_t *os, const char *name)
2858{
2859	uint64_t cookie, obj;
2860	char *sname;
2861	int error, len;
2862
2863	cookie = obj = 0;
2864	sname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
2865
2866#if 0
2867	(void) dmu_objset_find(name, dmu_objset_prefetch, NULL,
2868	    DS_FIND_SNAPSHOTS);
2869#endif
2870
2871	for (;;) {
2872		len = snprintf(sname, MAXPATHLEN, "%s@", name);
2873		if (len >= MAXPATHLEN) {
2874			dmu_objset_rele(os, FTAG);
2875			error = ENAMETOOLONG;
2876			break;
2877		}
2878
2879		dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
2880		error = dmu_snapshot_list_next(os, MAXPATHLEN - len,
2881		    sname + len, &obj, &cookie, NULL);
2882		dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
2883		if (error != 0) {
2884			if (error == ENOENT)
2885				error = 0;
2886			break;
2887		}
2888
2889		error = zvol_create_minor(sname);
2890		if (error != 0 && error != EEXIST) {
2891			printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n",
2892			    sname, error);
2893			break;
2894		}
2895	}
2896
2897	kmem_free(sname, MAXPATHLEN);
2898	return (error);
2899}
2900
2901int
2902zvol_create_minors(const char *name)
2903{
2904	uint64_t cookie;
2905	objset_t *os;
2906	char *osname, *p;
2907	int error, len;
2908
2909	if (dataset_name_hidden(name))
2910		return (0);
2911
2912	if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
2913		printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n",
2914		    name, error);
2915		return (error);
2916	}
2917	if (dmu_objset_type(os) == DMU_OST_ZVOL) {
2918		dsl_dataset_long_hold(os->os_dsl_dataset, FTAG);
2919		dsl_pool_rele(dmu_objset_pool(os), FTAG);
2920		error = zvol_create_minor(name);
2921		if (error == 0 || error == EEXIST) {
2922			error = zvol_create_snapshots(os, name);
2923		} else {
2924			printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n",
2925			    name, error);
2926		}
2927		dsl_dataset_long_rele(os->os_dsl_dataset, FTAG);
2928		dsl_dataset_rele(os->os_dsl_dataset, FTAG);
2929		return (error);
2930	}
2931	if (dmu_objset_type(os) != DMU_OST_ZFS) {
2932		dmu_objset_rele(os, FTAG);
2933		return (0);
2934	}
2935
2936	osname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
2937	if (snprintf(osname, MAXPATHLEN, "%s/", name) >= MAXPATHLEN) {
2938		dmu_objset_rele(os, FTAG);
2939		kmem_free(osname, MAXPATHLEN);
2940		return (ENOENT);
2941	}
2942	p = osname + strlen(osname);
2943	len = MAXPATHLEN - (p - osname);
2944
2945#if 0
2946	/* Prefetch the datasets. */
2947	cookie = 0;
2948	while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0) {
2949		if (!dataset_name_hidden(osname))
2950			(void) dmu_objset_prefetch(osname, NULL);
2951	}
2952#endif
2953
2954	cookie = 0;
2955	while (dmu_dir_list_next(os, MAXPATHLEN - (p - osname), p, NULL,
2956	    &cookie) == 0) {
2957		dmu_objset_rele(os, FTAG);
2958		(void) zvol_create_minors(osname);
2959		if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
2960			printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n",
2961			    name, error);
2962			return (error);
2963		}
2964	}
2965
2966	dmu_objset_rele(os, FTAG);
2967	kmem_free(osname, MAXPATHLEN);
2968	return (0);
2969}
2970
2971static void
2972zvol_rename_minor(zvol_state_t *zv, const char *newname)
2973{
2974	struct g_geom *gp;
2975	struct g_provider *pp;
2976	struct cdev *dev;
2977
2978	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
2979
2980	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
2981		g_topology_lock();
2982		pp = zv->zv_provider;
2983		ASSERT(pp != NULL);
2984		gp = pp->geom;
2985		ASSERT(gp != NULL);
2986
2987		zv->zv_provider = NULL;
2988		g_wither_provider(pp, ENXIO);
2989
2990		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
2991		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
2992		pp->sectorsize = DEV_BSIZE;
2993		pp->mediasize = zv->zv_volsize;
2994		pp->private = zv;
2995		zv->zv_provider = pp;
2996		g_error_provider(pp, 0);
2997		g_topology_unlock();
2998	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
2999		struct make_dev_args args;
3000
3001		if ((dev = zv->zv_dev) != NULL) {
3002			zv->zv_dev = NULL;
3003			destroy_dev(dev);
3004			if (zv->zv_total_opens > 0) {
3005				zv->zv_flags &= ~ZVOL_EXCL;
3006				zv->zv_total_opens = 0;
3007				zvol_last_close(zv);
3008			}
3009		}
3010
3011		make_dev_args_init(&args);
3012		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
3013		args.mda_devsw = &zvol_cdevsw;
3014		args.mda_cr = NULL;
3015		args.mda_uid = UID_ROOT;
3016		args.mda_gid = GID_OPERATOR;
3017		args.mda_mode = 0640;
3018		args.mda_si_drv2 = zv;
3019		if (make_dev_s(&args, &zv->zv_dev,
3020		    "%s/%s", ZVOL_DRIVER, newname) == 0)
3021			zv->zv_dev->si_iosize_max = MAXPHYS;
3022	}
3023	strlcpy(zv->zv_name, newname, sizeof(zv->zv_name));
3024}
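/*
 * Note that a rename is a destroy-and-recreate in both volmodes: for
 * GEOM the old provider is withered and a new one is created on the
 * same geom; for DEV the cdev is destroyed (forcibly dropping any
 * remaining opens first) and recreated under the new name.
 */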
3025
3026void
3027zvol_rename_minors(const char *oldname, const char *newname)
3028{
3029	char name[MAXPATHLEN];
3030	struct g_provider *pp;
3031	struct g_geom *gp;
3032	size_t oldnamelen, newnamelen;
3033	zvol_state_t *zv;
3034	char *namebuf;
3035	boolean_t locked = B_FALSE;
3036
3037	oldnamelen = strlen(oldname);
3038	newnamelen = strlen(newname);
3039
3040	DROP_GIANT();
3041	/* See comment in zvol_open(). */
3042	if (!MUTEX_HELD(&zfsdev_state_lock)) {
3043		mutex_enter(&zfsdev_state_lock);
3044		locked = B_TRUE;
3045	}
3046
3047	LIST_FOREACH(zv, &all_zvols, zv_links) {
3048		if (strcmp(zv->zv_name, oldname) == 0) {
3049			zvol_rename_minor(zv, newname);
3050		} else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 &&
3051		    (zv->zv_name[oldnamelen] == '/' ||
3052		     zv->zv_name[oldnamelen] == '@')) {
3053			snprintf(name, sizeof(name), "%s%c%s", newname,
3054			    zv->zv_name[oldnamelen],
3055			    zv->zv_name + oldnamelen + 1);
3056			zvol_rename_minor(zv, name);
3057		}
3058	}
3059
3060	if (locked)
3061		mutex_exit(&zfsdev_state_lock);
3062	PICKUP_GIANT();
3063}
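/*
 * For example, renaming "tank/vol" to "tank/newvol" also rewrites
 * "tank/vol@snap" and "tank/vol/child" to "tank/newvol@snap" and
 * "tank/newvol/child".  Only names followed by '/' or '@' match, so
 * an unrelated sibling such as "tank/volume" is left untouched.
 */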
3064
3065static int
3066zvol_d_open(struct cdev *dev, int flags, int fmt, struct thread *td)
3067{
3068	zvol_state_t *zv = dev->si_drv2;
3069	int err = 0;
3070
3071	mutex_enter(&zfsdev_state_lock);
3072	if (zv->zv_total_opens == 0)
3073		err = zvol_first_open(zv);
3074	if (err) {
3075		mutex_exit(&zfsdev_state_lock);
3076		return (err);
3077	}
3078	if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
3079		err = SET_ERROR(EROFS);
3080		goto out;
3081	}
3082	if (zv->zv_flags & ZVOL_EXCL) {
3083		err = SET_ERROR(EBUSY);
3084		goto out;
3085	}
3086#ifdef FEXCL
3087	if (flags & FEXCL) {
3088		if (zv->zv_total_opens != 0) {
3089			err = SET_ERROR(EBUSY);
3090			goto out;
3091		}
3092		zv->zv_flags |= ZVOL_EXCL;
3093	}
3094#endif
3095
3096	zv->zv_total_opens++;
3097	if (flags & (FSYNC | FDSYNC)) {
3098		zv->zv_sync_cnt++;
3099		if (zv->zv_sync_cnt == 1)
3100			zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ);
3101	}
3102	mutex_exit(&zfsdev_state_lock);
3103	return (err);
3104out:
3105	if (zv->zv_total_opens == 0)
3106		zvol_last_close(zv);
3107	mutex_exit(&zfsdev_state_lock);
3108	return (err);
3109}
3110
3111static int
3112zvol_d_close(struct cdev *dev, int flags, int fmt, struct thread *td)
3113{
3114	zvol_state_t *zv = dev->si_drv2;
3115
3116	mutex_enter(&zfsdev_state_lock);
3117	if (zv->zv_flags & ZVOL_EXCL) {
3118		ASSERT(zv->zv_total_opens == 1);
3119		zv->zv_flags &= ~ZVOL_EXCL;
3120	}
3121
3122	/*
3123	 * If the open count is zero, this is a spurious close.
3124	 * That indicates a bug in the kernel / DDI framework.
3125	 */
3126	ASSERT(zv->zv_total_opens != 0);
3127
3128	/*
3129	 * You may get multiple opens, but only one close.
3130	 */
3131	zv->zv_total_opens--;
3132	if (flags & (FSYNC | FDSYNC))
3133		zv->zv_sync_cnt--;
3134
3135	if (zv->zv_total_opens == 0)
3136		zvol_last_close(zv);
3137
3138	mutex_exit(&zfsdev_state_lock);
3139	return (0);
3140}
3141
3142static int
3143zvol_d_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td)
3144{
3145	zvol_state_t *zv;
3146	rl_t *rl;
3147	off_t offset, length;
3148	int i, error;
3149	boolean_t sync;
3150
3151	zv = dev->si_drv2;
3152
3153	error = 0;
3154	KASSERT(zv->zv_total_opens > 0,
3155	    ("Device with zero access count in zvol_d_ioctl"));
3156
3157	i = IOCPARM_LEN(cmd);
3158	switch (cmd) {
3159	case DIOCGSECTORSIZE:
3160		*(u_int *)data = DEV_BSIZE;
3161		break;
3162	case DIOCGMEDIASIZE:
3163		*(off_t *)data = zv->zv_volsize;
3164		break;
3165	case DIOCGFLUSH:
3166		zil_commit(zv->zv_zilog, ZVOL_OBJ);
3167		break;
3168	case DIOCGDELETE:
3169		if (!zvol_unmap_enabled)
3170			break;
3171
3172		offset = ((off_t *)data)[0];
3173		length = ((off_t *)data)[1];
3174		if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
3175		    offset < 0 || offset >= zv->zv_volsize ||
3176		    length <= 0) {
3177			printf("%s: offset=%jd length=%jd\n", __func__, offset,
3178			    length);
3179			error = EINVAL;
3180			break;
3181		}
3182
3183		rl = zfs_range_lock(&zv->zv_znode, offset, length, RL_WRITER);
3184		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
3185		error = dmu_tx_assign(tx, TXG_WAIT);
3186		if (error != 0) {
3187			sync = FALSE;
3188			dmu_tx_abort(tx);
3189		} else {
3190			sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
3191			zvol_log_truncate(zv, tx, offset, length, sync);
3192			dmu_tx_commit(tx);
3193			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
3194			    offset, length);
3195		}
3196		zfs_range_unlock(rl);
3197		if (sync)
3198			zil_commit(zv->zv_zilog, ZVOL_OBJ);
3199		break;
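	/*
	 * The DIOCGDELETE path above mirrors the illumos DKIOCFREE
	 * handler: log a TX_TRUNCATE record and commit the tx before
	 * freeing the range, then zil_commit() afterwards when the
	 * dataset is sync=always.
	 */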
3200	case DIOCGSTRIPESIZE:
3201		*(off_t *)data = zv->zv_volblocksize;
3202		break;
3203	case DIOCGSTRIPEOFFSET:
3204		*(off_t *)data = 0;
3205		break;
3206	case DIOCGATTR: {
3207		spa_t *spa = dmu_objset_spa(zv->zv_objset);
3208		struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
3209		uint64_t refd, avail, usedobjs, availobjs;
3210
3211		if (strcmp(arg->name, "GEOM::candelete") == 0)
3212			arg->value.i = 1;
3213		else if (strcmp(arg->name, "blocksavail") == 0) {
3214			dmu_objset_space(zv->zv_objset, &refd, &avail,
3215			    &usedobjs, &availobjs);
3216			arg->value.off = avail / DEV_BSIZE;
3217		} else if (strcmp(arg->name, "blocksused") == 0) {
3218			dmu_objset_space(zv->zv_objset, &refd, &avail,
3219			    &usedobjs, &availobjs);
3220			arg->value.off = refd / DEV_BSIZE;
3221		} else if (strcmp(arg->name, "poolblocksavail") == 0) {
3222			avail = metaslab_class_get_space(spa_normal_class(spa));
3223			avail -= metaslab_class_get_alloc(spa_normal_class(spa));
3224			arg->value.off = avail / DEV_BSIZE;
3225		} else if (strcmp(arg->name, "poolblocksused") == 0) {
3226			refd = metaslab_class_get_alloc(spa_normal_class(spa));
3227			arg->value.off = refd / DEV_BSIZE;
3228		} else
3229			error = ENOIOCTL;
3230		break;
3231	}
3232	case FIOSEEKHOLE:
3233	case FIOSEEKDATA: {
3234		off_t *off = (off_t *)data;
3235		uint64_t noff;
3236		boolean_t hole;
3237
3238		hole = (cmd == FIOSEEKHOLE);
3239		noff = *off;
3240		error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
3241		*off = noff;
3242		break;
3243	}
3244	default:
3245		error = ENOIOCTL;
3246	}
3247
3248	return (error);
3249}
3250#endif	/* illumos */
3251