zvol.c revision 263397
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 *
 * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 * Copyright (c) 2013 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 */

/* Portions Copyright 2010 Robert Milkowski */
/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */

/*
 * ZFS volume emulation driver.
 *
 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
 * Volumes are accessed through the symbolic links named:
 *
 * /dev/zvol/dsk/<pool_name>/<dataset_name>
 * /dev/zvol/rdsk/<pool_name>/<dataset_name>
 *
 * These links are created by the /dev filesystem (sdev_zvolops.c).
 * Volumes are persistent through reboot.  No user command needs to be
 * run before opening and using a device.
 *
 * FreeBSD notes.
 * On FreeBSD ZVOLs are simply GEOM providers like any other storage device
 * in the system.
 */
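
/*
 * Example (FreeBSD): "zfs create -V 10g tank/vol" creates a volume that
 * appears as the GEOM provider /dev/zvol/tank/vol and can be used like
 * any other disk device.
 */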

#include <sys/types.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/errno.h>
#include <sys/uio.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/zap.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/dmu_traverse.h>
#include <sys/dnode.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dkio.h>
#include <sys/byteorder.h>
#include <sys/sunddi.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_ioctl.h>
#include <sys/zil.h>
#include <sys/refcount.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_rlock.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_raidz.h>
#include <sys/zvol.h>
#include <sys/zil_impl.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/zfeature.h>
#include <sys/zio_checksum.h>

#include <geom/geom.h>

#include "zfs_namecheck.h"

struct g_class zfs_zvol_class = {
	.name = "ZFS::ZVOL",
	.version = G_VERSION,
};

DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);

void *zfsdev_state;
static char *zvol_tag = "zvol_tag";

#define	ZVOL_DUMPSIZE		"dumpsize"

/*
 * The spa_namespace_lock protects the zfsdev_state structure from being
 * modified while it's being used, e.g. an open that comes in before a
 * create finishes.  It also protects temporary opens of the dataset so that,
 * e.g., an open doesn't get a spurious EBUSY.
 */
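/* Count of zvol minors (GEOM providers) currently created. */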
static uint32_t zvol_minors;

typedef struct zvol_extent {
	list_node_t	ze_node;
	dva_t		ze_dva;		/* dva associated with this extent */
	uint64_t	ze_nblks;	/* number of blocks in extent */
} zvol_extent_t;

/*
 * The in-core state of each volume.
 */
typedef struct zvol_state {
	char		zv_name[MAXPATHLEN]; /* pool/dd name */
	uint64_t	zv_volsize;	/* amount of space we advertise */
	uint64_t	zv_volblocksize; /* volume block size */
	struct g_provider *zv_provider;	/* GEOM provider */
	uint8_t		zv_min_bs;	/* minimum addressable block shift */
	uint8_t		zv_flags;	/* readonly, dumpified, etc. */
	objset_t	*zv_objset;	/* objset handle */
	uint32_t	zv_total_opens;	/* total open count */
	zilog_t		*zv_zilog;	/* ZIL handle */
	list_t		zv_extents;	/* List of extents for dump */
	znode_t		zv_znode;	/* for range locking */
	dmu_buf_t	*zv_dbuf;	/* bonus handle */
	int		zv_state;
	struct bio_queue_head zv_queue;
	struct mtx	zv_queue_mtx;	/* zv_queue mutex */
} zvol_state_t;

/*
 * zvol specific flags
 */
#define	ZVOL_RDONLY	0x1
#define	ZVOL_DUMPIFIED	0x2
#define	ZVOL_EXCL	0x4
#define	ZVOL_WCE	0x8
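
/*
 * ZVOL_RDONLY: no writes allowed; ZVOL_DUMPIFIED: set up as a dump
 * device; ZVOL_EXCL: opened exclusively (FEXCL); ZVOL_WCE: write cache
 * enabled, i.e. writes are only committed to the ZIL on an explicit
 * cache flush or when sync semantics demand it.
 */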

/*
 * zvol maximum transfer in one DMU tx.
 */
int zvol_maxphys = DMU_MAX_ACCESS/2;

extern int zfs_set_prop_nvlist(const char *, zprop_source_t,
    nvlist_t *, nvlist_t *);
static int zvol_remove_zv(zvol_state_t *);
static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio);
static int zvol_dumpify(zvol_state_t *zv);
static int zvol_dump_fini(zvol_state_t *zv);
static int zvol_dump_init(zvol_state_t *zv, boolean_t resize);

static zvol_state_t *zvol_geom_create(const char *name);
static void zvol_geom_run(zvol_state_t *zv);
static void zvol_geom_destroy(zvol_state_t *zv);
static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
static void zvol_geom_start(struct bio *bp);
static void zvol_geom_worker(void *arg);

static void
zvol_size_changed(zvol_state_t *zv)
{
#ifdef sun
	dev_t dev = makedevice(maj, min);

	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
	    "Size", volsize) == DDI_SUCCESS);
	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
	    "Nblocks", lbtodb(volsize)) == DDI_SUCCESS);

	/* Notify specfs to invalidate the cached size */
	spec_size_invalidate(dev, VBLK);
	spec_size_invalidate(dev, VCHR);
#else	/* !sun */
	struct g_provider *pp;

	pp = zv->zv_provider;
	if (pp == NULL)
		return;
	g_topology_lock();
	g_resize_provider(pp, zv->zv_volsize);
	g_topology_unlock();
#endif	/* !sun */
}

int
zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
{
	if (volsize == 0)
		return (SET_ERROR(EINVAL));

	if (volsize % blocksize != 0)
		return (SET_ERROR(EINVAL));

#ifdef _ILP32
	if (volsize - 1 > SPEC_MAXOFFSET_T)
		return (SET_ERROR(EOVERFLOW));
#endif
	return (0);
}

int
zvol_check_volblocksize(uint64_t volblocksize)
{
	if (volblocksize < SPA_MINBLOCKSIZE ||
	    volblocksize > SPA_MAXBLOCKSIZE ||
	    !ISP2(volblocksize))
		return (SET_ERROR(EDOM));

	return (0);
}

int
zvol_get_stats(objset_t *os, nvlist_t *nv)
{
	int error;
	dmu_object_info_t doi;
	uint64_t val;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
	if (error)
		return (error);

	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);

	error = dmu_object_info(os, ZVOL_OBJ, &doi);

	if (error == 0) {
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
		    doi.doi_data_block_size);
	}

	return (error);
}

static zvol_state_t *
zvol_minor_lookup(const char *name)
{
	struct g_provider *pp;
	struct g_geom *gp;
	zvol_state_t *zv = NULL;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	g_topology_lock();
	LIST_FOREACH(gp, &zfs_zvol_class.geom, geom) {
		pp = LIST_FIRST(&gp->provider);
		if (pp == NULL)
			continue;
		zv = pp->private;
		if (zv == NULL)
			continue;
		if (strcmp(zv->zv_name, name) == 0)
			break;
	}
	g_topology_unlock();

	return (gp != NULL ? zv : NULL);
}

/* extent mapping arg */
struct maparg {
	zvol_state_t	*ma_zv;
	uint64_t	ma_blks;
};

/*ARGSUSED*/
static int
zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
	struct maparg *ma = arg;
	zvol_extent_t *ze;
	int bs = ma->ma_zv->zv_volblocksize;

	if (BP_IS_HOLE(bp) ||
	    zb->zb_object != ZVOL_OBJ || zb->zb_level != 0)
		return (0);

	VERIFY3U(ma->ma_blks, ==, zb->zb_blkid);
	ma->ma_blks++;

	/* Abort immediately if we have encountered gang blocks */
	if (BP_IS_GANG(bp))
		return (SET_ERROR(EFRAGS));

	/*
	 * See if the block is at the end of the previous extent.
	 */
	ze = list_tail(&ma->ma_zv->zv_extents);
	if (ze &&
	    DVA_GET_VDEV(BP_IDENTITY(bp)) == DVA_GET_VDEV(&ze->ze_dva) &&
	    DVA_GET_OFFSET(BP_IDENTITY(bp)) ==
	    DVA_GET_OFFSET(&ze->ze_dva) + ze->ze_nblks * bs) {
		ze->ze_nblks++;
		return (0);
	}

	dprintf_bp(bp, "%s", "next blkptr:");

	/* start a new extent */
	ze = kmem_zalloc(sizeof (zvol_extent_t), KM_SLEEP);
	ze->ze_dva = bp->blk_dva[0];	/* structure assignment */
	ze->ze_nblks = 1;
	list_insert_tail(&ma->ma_zv->zv_extents, ze);
	return (0);
}

static void
zvol_free_extents(zvol_state_t *zv)
{
	zvol_extent_t *ze;

	while ((ze = list_head(&zv->zv_extents)) != NULL) {
		list_remove(&zv->zv_extents, ze);
		kmem_free(ze, sizeof (zvol_extent_t));
	}
}

static int
zvol_get_lbas(zvol_state_t *zv)
{
	objset_t *os = zv->zv_objset;
	struct maparg	ma;
	int		err;

	ma.ma_zv = zv;
	ma.ma_blks = 0;
	zvol_free_extents(zv);

	/* commit any in-flight changes before traversing the dataset */
	txg_wait_synced(dmu_objset_pool(os), 0);
	err = traverse_dataset(dmu_objset_ds(os), 0,
	    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, zvol_map_block, &ma);
	if (err || ma.ma_blks != (zv->zv_volsize / zv->zv_volblocksize)) {
		zvol_free_extents(zv);
		return (err ? err : EIO);
	}

	return (0);
}

/* ARGSUSED */
void
zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
{
	zfs_creat_t *zct = arg;
	nvlist_t *nvprops = zct->zct_props;
	int error;
	uint64_t volblocksize, volsize;

	VERIFY(nvlist_lookup_uint64(nvprops,
	    zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
	if (nvlist_lookup_uint64(nvprops,
	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
		volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);

	/*
	 * These properties must be removed from the list so the generic
	 * property setting step won't apply to them.
	 */
	VERIFY(nvlist_remove_all(nvprops,
	    zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
	(void) nvlist_remove_all(nvprops,
	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));

	error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
	    DMU_OT_NONE, 0, tx);
	ASSERT(error == 0);

	error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
	    DMU_OT_NONE, 0, tx);
	ASSERT(error == 0);

	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
	ASSERT(error == 0);
}

/*
 * Replay a TX_WRITE ZIL transaction that didn't get committed
 * after a system failure
 */
static int
zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
{
	objset_t *os = zv->zv_objset;
	char *data = (char *)(lr + 1);	/* data follows lr_write_t */
	uint64_t offset, length;
	dmu_tx_t *tx;
	int error;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	offset = lr->lr_offset;
	length = lr->lr_length;

	/* If it's a dmu_sync() block, write the whole block */
	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
		uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
		if (length < blocksize) {
			offset -= offset % blocksize;
			length = blocksize;
		}
	}

	tx = dmu_tx_create(os);
	dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
	} else {
		dmu_write(os, ZVOL_OBJ, offset, length, data, tx);
		dmu_tx_commit(tx);
	}

	return (error);
}

/* ARGSUSED */
static int
zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap)
{
	return (SET_ERROR(ENOTSUP));
}

/*
 * Callback vectors for replaying records.
 * Only TX_WRITE is needed for zvol.
 */
zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
	zvol_replay_err,	/* 0 no such transaction type */
	zvol_replay_err,	/* TX_CREATE */
	zvol_replay_err,	/* TX_MKDIR */
	zvol_replay_err,	/* TX_MKXATTR */
	zvol_replay_err,	/* TX_SYMLINK */
	zvol_replay_err,	/* TX_REMOVE */
	zvol_replay_err,	/* TX_RMDIR */
	zvol_replay_err,	/* TX_LINK */
	zvol_replay_err,	/* TX_RENAME */
	zvol_replay_write,	/* TX_WRITE */
	zvol_replay_err,	/* TX_TRUNCATE */
	zvol_replay_err,	/* TX_SETATTR */
	zvol_replay_err,	/* TX_ACL */
	zvol_replay_err,	/* TX_CREATE_ACL */
	zvol_replay_err,	/* TX_CREATE_ATTR */
	zvol_replay_err,	/* TX_CREATE_ACL_ATTR */
	zvol_replay_err,	/* TX_MKDIR_ACL */
	zvol_replay_err,	/* TX_MKDIR_ATTR */
	zvol_replay_err,	/* TX_MKDIR_ACL_ATTR */
	zvol_replay_err,	/* TX_WRITE2 */
};

#ifdef sun
int
zvol_name2minor(const char *name, minor_t *minor)
{
	zvol_state_t *zv;

	mutex_enter(&spa_namespace_lock);
	zv = zvol_minor_lookup(name);
	if (minor && zv)
		*minor = zv->zv_minor;
	mutex_exit(&spa_namespace_lock);
	return (zv ? 0 : -1);
}
#endif	/* sun */

/*
 * Create a minor node (plus a whole lot more) for the specified volume.
 */
int
zvol_create_minor(const char *name)
{
	zfs_soft_state_t *zs;
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t doi;
	uint64_t volsize;
	int error;

	ZFS_LOG(1, "Creating ZVOL %s...", name);

	mutex_enter(&spa_namespace_lock);

	if (zvol_minor_lookup(name) != NULL) {
		mutex_exit(&spa_namespace_lock);
		return (SET_ERROR(EEXIST));
	}

	/* lie and say we're read-only */
	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os);

	if (error) {
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

#ifdef sun
	if ((minor = zfsdev_minor_alloc()) == 0) {
		dmu_objset_disown(os, FTAG);
		mutex_exit(&spa_namespace_lock);
		return (SET_ERROR(ENXIO));
	}

	if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) {
		dmu_objset_disown(os, FTAG);
		mutex_exit(&spa_namespace_lock);
		return (SET_ERROR(EAGAIN));
	}
	(void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME,
	    (char *)name);

	(void) snprintf(chrbuf, sizeof (chrbuf), "%u,raw", minor);

	if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR,
	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
		ddi_soft_state_free(zfsdev_state, minor);
		dmu_objset_disown(os, FTAG);
		mutex_exit(&spa_namespace_lock);
		return (SET_ERROR(EAGAIN));
	}

	(void) snprintf(blkbuf, sizeof (blkbuf), "%u", minor);

	if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK,
	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
		ddi_remove_minor_node(zfs_dip, chrbuf);
		ddi_soft_state_free(zfsdev_state, minor);
		dmu_objset_disown(os, FTAG);
		mutex_exit(&spa_namespace_lock);
		return (SET_ERROR(EAGAIN));
	}

	zs = ddi_get_soft_state(zfsdev_state, minor);
	zs->zss_type = ZSST_ZVOL;
	zv = zs->zss_data = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
#else	/* !sun */

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error) {
		ASSERT(error == 0);
		dmu_objset_disown(os, FTAG);
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	DROP_GIANT();
	g_topology_lock();
	zv = zvol_geom_create(name);
	zv->zv_volsize = volsize;
	zv->zv_provider->mediasize = zv->zv_volsize;

#endif	/* !sun */

	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
	zv->zv_min_bs = DEV_BSHIFT;
	zv->zv_objset = os;
	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
		zv->zv_flags |= ZVOL_RDONLY;
	mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
	    sizeof (rl_t), offsetof(rl_t, r_node));
	list_create(&zv->zv_extents, sizeof (zvol_extent_t),
	    offsetof(zvol_extent_t, ze_node));
	/* get and cache the blocksize */
	error = dmu_object_info(os, ZVOL_OBJ, &doi);
	ASSERT(error == 0);
	zv->zv_volblocksize = doi.doi_data_block_size;

	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			zil_destroy(dmu_objset_zil(os), B_FALSE);
		else
			zil_replay(os, zv, zvol_replay_vector);
	}
	dmu_objset_disown(os, FTAG);
	zv->zv_objset = NULL;

	zvol_minors++;

	mutex_exit(&spa_namespace_lock);

	zvol_geom_run(zv);

	g_topology_unlock();
	PICKUP_GIANT();

	ZFS_LOG(1, "ZVOL %s created.", name);

	return (0);
}

/*
 * Remove minor node for the specified volume.
 */
static int
zvol_remove_zv(zvol_state_t *zv)
{
#ifdef sun
	minor_t minor = zv->zv_minor;
#endif

	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	if (zv->zv_total_opens != 0)
		return (SET_ERROR(EBUSY));

	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);

#ifdef sun
	(void) snprintf(nmbuf, sizeof (nmbuf), "%u,raw", minor);
	ddi_remove_minor_node(zfs_dip, nmbuf);
#endif	/* sun */

	avl_destroy(&zv->zv_znode.z_range_avl);
	mutex_destroy(&zv->zv_znode.z_range_lock);

	zvol_geom_destroy(zv);

	zvol_minors--;
	return (0);
}

int
zvol_remove_minor(const char *name)
{
	zvol_state_t *zv;
	int rc;

	mutex_enter(&spa_namespace_lock);
	if ((zv = zvol_minor_lookup(name)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (SET_ERROR(ENXIO));
	}
	g_topology_lock();
	rc = zvol_remove_zv(zv);
	g_topology_unlock();
	mutex_exit(&spa_namespace_lock);
	return (rc);
}

int
zvol_first_open(zvol_state_t *zv)
{
	objset_t *os;
	uint64_t volsize;
	int error;
	uint64_t readonly;

	/* lie and say we're read-only */
	error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, B_TRUE,
	    zvol_tag, &os);
	if (error)
		return (error);

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error) {
		ASSERT(error == 0);
		dmu_objset_disown(os, zvol_tag);
		return (error);
	}
	zv->zv_objset = os;
	error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf);
	if (error) {
		dmu_objset_disown(os, zvol_tag);
		return (error);
	}
	zv->zv_volsize = volsize;
	zv->zv_zilog = zil_open(os, zvol_get_data);
	zvol_size_changed(zv);

	VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &readonly,
	    NULL) == 0);
	if (readonly || dmu_objset_is_snapshot(os) ||
	    !spa_writeable(dmu_objset_spa(os)))
		zv->zv_flags |= ZVOL_RDONLY;
	else
		zv->zv_flags &= ~ZVOL_RDONLY;
	return (error);
}

void
zvol_last_close(zvol_state_t *zv)
{
	zil_close(zv->zv_zilog);
	zv->zv_zilog = NULL;

	dmu_buf_rele(zv->zv_dbuf, zvol_tag);
	zv->zv_dbuf = NULL;

	/*
	 * Evict cached data
	 */
	if (dsl_dataset_is_dirty(dmu_objset_ds(zv->zv_objset)) &&
	    !(zv->zv_flags & ZVOL_RDONLY))
		txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
	dmu_objset_evict_dbufs(zv->zv_objset);

	dmu_objset_disown(zv->zv_objset, zvol_tag);
	zv->zv_objset = NULL;
}

#ifdef sun
int
zvol_prealloc(zvol_state_t *zv)
{
	objset_t *os = zv->zv_objset;
	dmu_tx_t *tx;
	uint64_t refd, avail, usedobjs, availobjs;
	uint64_t resid = zv->zv_volsize;
	uint64_t off = 0;

	/* Check the space usage before attempting to allocate the space */
	dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs);
	if (avail < zv->zv_volsize)
		return (SET_ERROR(ENOSPC));

	/* Free old extents if they exist */
	zvol_free_extents(zv);

	while (resid != 0) {
		int error;
		uint64_t bytes = MIN(resid, SPA_MAXBLOCKSIZE);

		tx = dmu_tx_create(os);
		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			(void) dmu_free_long_range(os, ZVOL_OBJ, 0, off);
			return (error);
		}
		dmu_prealloc(os, ZVOL_OBJ, off, bytes, tx);
		dmu_tx_commit(tx);
		off += bytes;
		resid -= bytes;
	}
	txg_wait_synced(dmu_objset_pool(os), 0);

	return (0);
}
#endif	/* sun */

static int
zvol_update_volsize(objset_t *os, uint64_t volsize)
{
	dmu_tx_t *tx;
	int error;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	tx = dmu_tx_create(os);
	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		return (error);
	}

	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1,
	    &volsize, tx);
	dmu_tx_commit(tx);

	if (error == 0)
		error = dmu_free_long_range(os,
		    ZVOL_OBJ, volsize, DMU_OBJECT_END);
	return (error);
}

void
zvol_remove_minors(const char *name)
{
	struct g_geom *gp, *gptmp;
	struct g_provider *pp;
	zvol_state_t *zv;
	size_t namelen;

	namelen = strlen(name);

	DROP_GIANT();
	mutex_enter(&spa_namespace_lock);
	g_topology_lock();

	LIST_FOREACH_SAFE(gp, &zfs_zvol_class.geom, geom, gptmp) {
		pp = LIST_FIRST(&gp->provider);
		if (pp == NULL)
			continue;
		zv = pp->private;
		if (zv == NULL)
			continue;
		if (strcmp(zv->zv_name, name) == 0 ||
		    (strncmp(zv->zv_name, name, namelen) == 0 &&
		     zv->zv_name[namelen] == '/')) {
			(void) zvol_remove_zv(zv);
		}
	}

	g_topology_unlock();
	mutex_exit(&spa_namespace_lock);
	PICKUP_GIANT();
}

int
zvol_set_volsize(const char *name, major_t maj, uint64_t volsize)
{
	zvol_state_t *zv = NULL;
	objset_t *os;
	int error;
	dmu_object_info_t doi;
	uint64_t old_volsize = 0ULL;
	uint64_t readonly;

	mutex_enter(&spa_namespace_lock);
	zv = zvol_minor_lookup(name);
	if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
		mutex_exit(&spa_namespace_lock);
		return (error);
	}

	if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 ||
	    (error = zvol_check_volsize(volsize,
	    doi.doi_data_block_size)) != 0)
		goto out;

	VERIFY(dsl_prop_get_integer(name, "readonly", &readonly,
	    NULL) == 0);
	if (readonly) {
		error = SET_ERROR(EROFS);
		goto out;
	}

	error = zvol_update_volsize(os, volsize);
	/*
	 * Reinitialize the dump area to the new size. If we
	 * failed to resize the dump area then restore it back to
	 * its original size.
	 */
	if (zv && error == 0) {
#ifdef ZVOL_DUMP
		if (zv->zv_flags & ZVOL_DUMPIFIED) {
			old_volsize = zv->zv_volsize;
			zv->zv_volsize = volsize;
			if ((error = zvol_dumpify(zv)) != 0 ||
			    (error = dumpvp_resize()) != 0) {
				(void) zvol_update_volsize(os, old_volsize);
				zv->zv_volsize = old_volsize;
				error = zvol_dumpify(zv);
			}
		}
#endif	/* ZVOL_DUMP */
		if (error == 0) {
			zv->zv_volsize = volsize;
			zvol_size_changed(zv);
		}
	}

#ifdef sun
	/*
	 * Generate a LUN expansion event.
	 */
	if (zv && error == 0) {
		sysevent_id_t eid;
		nvlist_t *attr;
		char *physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);

		(void) snprintf(physpath, MAXPATHLEN, "%s%u", ZVOL_PSEUDO_DEV,
		    zv->zv_minor);

		VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);

		(void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
		    ESC_DEV_DLE, attr, &eid, DDI_SLEEP);

		nvlist_free(attr);
		kmem_free(physpath, MAXPATHLEN);
	}
#endif	/* sun */

out:
	dmu_objset_rele(os, FTAG);

	mutex_exit(&spa_namespace_lock);

	return (error);
}

/*ARGSUSED*/
static int
zvol_open(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	int err = 0;
	boolean_t locked = B_FALSE;

	/*
	 * Protect against recursively entering spa_namespace_lock
	 * when spa_open() is used for a pool backed by (local) ZVOLs.
	 * This is needed since we replaced the upstream zfsdev_state_lock
	 * with spa_namespace_lock in the ZVOL code.
	 * We are using the same trick as spa_open().
	 * Note that calls in zvol_first_open that need to resolve
	 * the pool name to a spa object will enter spa_open()
	 * recursively, but that function already has all the
	 * necessary protection.
	 */
	if (!MUTEX_HELD(&spa_namespace_lock)) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	zv = pp->private;
	if (zv == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (SET_ERROR(ENXIO));
	}

	if (zv->zv_total_opens == 0)
		err = zvol_first_open(zv);
	if (err) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (err);
	}
	if ((flag & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
		err = SET_ERROR(EROFS);
		goto out;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out;
	}
#ifdef FEXCL
	if (flag & FEXCL) {
		if (zv->zv_total_opens != 0) {
			err = SET_ERROR(EBUSY);
			goto out;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}
#endif

	zv->zv_total_opens += count;
	if (locked)
		mutex_exit(&spa_namespace_lock);

	return (err);
out:
	if (zv->zv_total_opens == 0)
		zvol_last_close(zv);
	if (locked)
		mutex_exit(&spa_namespace_lock);
	return (err);
}

/*ARGSUSED*/
static int
zvol_close(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	int error = 0;
	boolean_t locked = B_FALSE;

	/* See comment in zvol_open(). */
	if (!MUTEX_HELD(&spa_namespace_lock)) {
		mutex_enter(&spa_namespace_lock);
		locked = B_TRUE;
	}

	zv = pp->private;
	if (zv == NULL) {
		if (locked)
			mutex_exit(&spa_namespace_lock);
		return (SET_ERROR(ENXIO));
	}

	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT(zv->zv_total_opens == 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT(zv->zv_total_opens != 0);

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_total_opens -= count;

	if (zv->zv_total_opens == 0)
		zvol_last_close(zv);

	if (locked)
		mutex_exit(&spa_namespace_lock);
	return (error);
}

static void
zvol_get_done(zgd_t *zgd, int error)
{
	if (zgd->zgd_db)
		dmu_buf_rele(zgd->zgd_db, zgd);

	zfs_range_unlock(zgd->zgd_rl);

	if (error == 0 && zgd->zgd_bp)
		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);

	kmem_free(zgd, sizeof (zgd_t));
}

/*
 * Get data to generate a TX_WRITE intent log record.
 */
static int
zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
	zvol_state_t *zv = arg;
	objset_t *os = zv->zv_objset;
	uint64_t object = ZVOL_OBJ;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;	/* length of user data */
	blkptr_t *bp = &lr->lr_blkptr;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error;

	ASSERT(zio != NULL);
	ASSERT(size != 0);

	zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_zilog = zv->zv_zilog;
	zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) {	/* immediate write */
		error = dmu_read(os, object, offset, size, buf,
		    DMU_READ_NO_PREFETCH);
	} else {
		size = zv->zv_volblocksize;
		offset = P2ALIGN(offset, size);
		error = dmu_buf_hold(os, object, offset, zgd, &db,
		    DMU_READ_NO_PREFETCH);
		if (error == 0) {
			blkptr_t *obp = dmu_buf_get_blkptr(db);
			if (obp) {
				ASSERT(BP_IS_HOLE(bp));
				*bp = *obp;
			}

			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zvol_get_done, zgd);

			if (error == 0)
				return (0);
		}
	}

	zvol_get_done(zgd, error);

	return (error);
}

/*
 * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
 *
 * We store data in the log buffers if it's small enough.
 * Otherwise we will later flush the data out via dmu_sync().
 */
ssize_t zvol_immediate_write_sz = 32768;

static void
zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
    boolean_t sync)
{
	uint32_t blocksize = zv->zv_volblocksize;
	zilog_t *zilog = zv->zv_zilog;
	boolean_t slogging;
	ssize_t immediate_write_sz;

	if (zil_replaying(zilog, tx))
		return;

	immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
	    ? 0 : zvol_immediate_write_sz;

	slogging = spa_has_slogs(zilog->zl_spa) &&
	    (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);

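	/*
	 * Each chunk of the write is logged in one of three ways:
	 * WR_INDIRECT (dmu_sync() will write the data, only a block
	 * pointer goes into the log record), WR_COPIED (the data is
	 * copied into the itx right here), or WR_NEED_COPY (the data
	 * is copied from the DMU when the itx is committed).
	 */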
	while (resid) {
		itx_t *itx;
		lr_write_t *lr;
		ssize_t len;
		itx_wr_state_t write_state;

		/*
		 * Unlike zfs_log_write() we can be called with
		 * up to DMU_MAX_ACCESS/2 (5MB) writes.
		 */
		if (blocksize > immediate_write_sz && !slogging &&
		    resid >= blocksize && off % blocksize == 0) {
			write_state = WR_INDIRECT; /* uses dmu_sync */
			len = blocksize;
		} else if (sync) {
			write_state = WR_COPIED;
			len = MIN(ZIL_MAX_LOG_DATA, resid);
		} else {
			write_state = WR_NEED_COPY;
			len = MIN(ZIL_MAX_LOG_DATA, resid);
		}

		itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
		    (write_state == WR_COPIED ? len : 0));
		lr = (lr_write_t *)&itx->itx_lr;
		if (write_state == WR_COPIED && dmu_read(zv->zv_objset,
		    ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
			zil_itx_destroy(itx);
			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
			lr = (lr_write_t *)&itx->itx_lr;
			write_state = WR_NEED_COPY;
		}

		itx->itx_wr_state = write_state;
		if (write_state == WR_NEED_COPY)
			itx->itx_sod += len;
		lr->lr_foid = ZVOL_OBJ;
		lr->lr_offset = off;
		lr->lr_length = len;
		lr->lr_blkoff = 0;
		BP_ZERO(&lr->lr_blkptr);

		itx->itx_private = zv;
		itx->itx_sync = sync;

		zil_itx_assign(zilog, itx, tx);

		off += len;
		resid -= len;
	}
}

#ifdef sun
static int
zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t origoffset,
    uint64_t size, boolean_t doread, boolean_t isdump)
{
	vdev_disk_t *dvd;
	int c;
	int numerrors = 0;

	if (vd->vdev_ops == &vdev_mirror_ops ||
	    vd->vdev_ops == &vdev_replacing_ops ||
	    vd->vdev_ops == &vdev_spare_ops) {
		for (c = 0; c < vd->vdev_children; c++) {
			int err = zvol_dumpio_vdev(vd->vdev_child[c],
			    addr, offset, origoffset, size, doread, isdump);
			if (err != 0) {
				numerrors++;
			} else if (doread) {
				break;
			}
		}
	}

	if (!vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_raidz_ops)
		return (numerrors < vd->vdev_children ? 0 : EIO);

	if (doread && !vdev_readable(vd))
		return (SET_ERROR(EIO));
	else if (!doread && !vdev_writeable(vd))
		return (SET_ERROR(EIO));

	if (vd->vdev_ops == &vdev_raidz_ops) {
		return (vdev_raidz_physio(vd,
		    addr, size, offset, origoffset, doread, isdump));
	}

	offset += VDEV_LABEL_START_SIZE;

	if (ddi_in_panic() || isdump) {
		ASSERT(!doread);
		if (doread)
			return (SET_ERROR(EIO));
		dvd = vd->vdev_tsd;
		ASSERT3P(dvd, !=, NULL);
		return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset),
		    lbtodb(size)));
	} else {
		dvd = vd->vdev_tsd;
		ASSERT3P(dvd, !=, NULL);
		return (vdev_disk_ldi_physio(dvd->vd_lh, addr, size,
		    offset, doread ? B_READ : B_WRITE));
	}
}

static int
zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size,
    boolean_t doread, boolean_t isdump)
{
	vdev_t *vd;
	int error;
	zvol_extent_t *ze;
	spa_t *spa = dmu_objset_spa(zv->zv_objset);

	/* Must be sector aligned, and not straddle a block boundary. */
	if (P2PHASE(offset, DEV_BSIZE) || P2PHASE(size, DEV_BSIZE) ||
	    P2BOUNDARY(offset, size, zv->zv_volblocksize)) {
		return (SET_ERROR(EINVAL));
	}
	ASSERT(size <= zv->zv_volblocksize);

	/* Locate the extent this belongs to */
	ze = list_head(&zv->zv_extents);
	while (ze != NULL && offset >= ze->ze_nblks * zv->zv_volblocksize) {
		offset -= ze->ze_nblks * zv->zv_volblocksize;
		ze = list_next(&zv->zv_extents, ze);
	}

	if (ze == NULL)
		return (SET_ERROR(EINVAL));

	if (!ddi_in_panic())
		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

	vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva));
	offset += DVA_GET_OFFSET(&ze->ze_dva);
	error = zvol_dumpio_vdev(vd, addr, offset, DVA_GET_OFFSET(&ze->ze_dva),
	    size, doread, isdump);

	if (!ddi_in_panic())
		spa_config_exit(spa, SCL_STATE, FTAG);

	return (error);
}
#endif	/* sun */

int
zvol_strategy(struct bio *bp)
{
	zvol_state_t *zv = bp->bio_to->private;
	uint64_t off, volsize;
	size_t resid;
	char *addr;
	objset_t *os;
	rl_t *rl;
	int error = 0;
	boolean_t doread = (bp->bio_cmd == BIO_READ);
	boolean_t is_dumpified;
	boolean_t sync;

	if (zv == NULL) {
		g_io_deliver(bp, ENXIO);
		return (0);
	}

	if (bp->bio_cmd != BIO_READ && (zv->zv_flags & ZVOL_RDONLY)) {
		g_io_deliver(bp, EROFS);
		return (0);
	}

	off = bp->bio_offset;
	volsize = zv->zv_volsize;

	os = zv->zv_objset;
	ASSERT(os != NULL);

	addr = bp->bio_data;
	resid = bp->bio_length;

	if (resid > 0 && (off < 0 || off >= volsize)) {
		g_io_deliver(bp, EIO);
		return (0);
	}

#ifdef illumos
	is_dumpified = zv->zv_flags & ZVOL_DUMPIFIED;
#else
	is_dumpified = B_FALSE;
#endif
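	/*
	 * Writes are committed to the ZIL before the bio completes
	 * only when the dataset's sync property is set to always.
	 */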
	sync = !doread && !is_dumpified &&
	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	/*
	 * There must be no buffer changes when doing a dmu_sync() because
	 * we can't change the data whilst calculating the checksum.
	 */
	rl = zfs_range_lock(&zv->zv_znode, off, resid,
	    doread ? RL_READER : RL_WRITER);

	while (resid != 0 && off < volsize) {
		size_t size = MIN(resid, zvol_maxphys);
#ifdef illumos
		if (is_dumpified) {
			size = MIN(size, P2END(off, zv->zv_volblocksize) - off);
			error = zvol_dumpio(zv, addr, off, size,
			    doread, B_FALSE);
		} else if (doread) {
#else
		if (doread) {
#endif
			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
			    DMU_READ_PREFETCH);
		} else {
			dmu_tx_t *tx = dmu_tx_create(os);
			dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
			error = dmu_tx_assign(tx, TXG_WAIT);
			if (error) {
				dmu_tx_abort(tx);
			} else {
				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
				zvol_log_write(zv, tx, off, size, sync);
				dmu_tx_commit(tx);
			}
		}
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
		off += size;
		addr += size;
		resid -= size;
	}
	zfs_range_unlock(rl);

	bp->bio_completed = bp->bio_length - resid;
	if (bp->bio_completed < bp->bio_length)
		bp->bio_error = (off > volsize ? EINVAL : error);

	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	g_io_deliver(bp, 0);

	return (0);
}

#ifdef sun
/*
 * Set the buffer count to the zvol maximum transfer.
 * Using our own routine instead of the default minphys()
 * means that for larger writes we write bigger buffers on X86
 * (128K instead of 56K) and flush the disk write cache less often
 * (every zvol_maxphys - currently 1MB) instead of minphys (currently
 * 56K on X86 and 128K on sparc).
 */
void
zvol_minphys(struct buf *bp)
{
	if (bp->b_bcount > zvol_maxphys)
		bp->b_bcount = zvol_maxphys;
}

int
zvol_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblocks)
{
	minor_t minor = getminor(dev);
	zvol_state_t *zv;
	int error = 0;
	uint64_t size;
	uint64_t boff;
	uint64_t resid;

	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
	if (zv == NULL)
		return (SET_ERROR(ENXIO));

	if ((zv->zv_flags & ZVOL_DUMPIFIED) == 0)
		return (SET_ERROR(EINVAL));

	boff = ldbtob(blkno);
	resid = ldbtob(nblocks);

	VERIFY3U(boff + resid, <=, zv->zv_volsize);

	while (resid) {
		size = MIN(resid, P2END(boff, zv->zv_volblocksize) - boff);
		error = zvol_dumpio(zv, addr, boff, size, B_FALSE, B_TRUE);
		if (error)
			break;
		boff += size;
		addr += size;
		resid -= size;
	}

	return (error);
}

/*ARGSUSED*/
int
zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
{
	minor_t minor = getminor(dev);
	zvol_state_t *zv;
	uint64_t volsize;
	rl_t *rl;
	int error = 0;

	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
	if (zv == NULL)
		return (SET_ERROR(ENXIO));

	volsize = zv->zv_volsize;
	if (uio->uio_resid > 0 &&
	    (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
		return (SET_ERROR(EIO));

	if (zv->zv_flags & ZVOL_DUMPIFIED) {
		error = physio(zvol_strategy, NULL, dev, B_READ,
		    zvol_minphys, uio);
		return (error);
	}

	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
	    RL_READER);
	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);

		/* don't read past the end */
		if (bytes > volsize - uio->uio_loffset)
			bytes = volsize - uio->uio_loffset;

		error = dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes);
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_range_unlock(rl);
	return (error);
}

/*ARGSUSED*/
int
zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
{
	minor_t minor = getminor(dev);
	zvol_state_t *zv;
	uint64_t volsize;
	rl_t *rl;
	int error = 0;
	boolean_t sync;

	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
	if (zv == NULL)
		return (SET_ERROR(ENXIO));

	volsize = zv->zv_volsize;
	if (uio->uio_resid > 0 &&
	    (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
		return (SET_ERROR(EIO));

	if (zv->zv_flags & ZVOL_DUMPIFIED) {
		error = physio(zvol_strategy, NULL, dev, B_WRITE,
		    zvol_minphys, uio);
		return (error);
	}

	sync = !(zv->zv_flags & ZVOL_WCE) ||
	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);

	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
	    RL_WRITER);
	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
		uint64_t off = uio->uio_loffset;
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* don't write past the end */
			bytes = volsize - off;

		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx);
		if (error == 0)
			zvol_log_write(zv, tx, off, bytes, sync);
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_range_unlock(rl);
	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	return (error);
}

int
zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs)
{
	struct uuid uuid = EFI_RESERVED;
	efi_gpe_t gpe = { 0 };
	uint32_t crc;
	dk_efi_t efi;
	int length;
	char *ptr;

	if (ddi_copyin(arg, &efi, sizeof (dk_efi_t), flag))
		return (SET_ERROR(EFAULT));
	ptr = (char *)(uintptr_t)efi.dki_data_64;
	length = efi.dki_length;
	/*
	 * Some clients may attempt to request a PMBR for the
	 * zvol.  Currently this interface will return EINVAL to
	 * such requests.  These requests could be supported by
	 * adding a check for lba == 0 and consing up an appropriate
	 * PMBR.
	 */
	if (efi.dki_lba < 1 || efi.dki_lba > 2 || length <= 0)
		return (SET_ERROR(EINVAL));
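
	/*
	 * The emulated label describes a single EFI_RESERVED partition
	 * spanning LBA 34 through the last LBA of the volume.
	 */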
	gpe.efi_gpe_StartingLBA = LE_64(34ULL);
	gpe.efi_gpe_EndingLBA = LE_64((vs >> bs) - 1);
	UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid);

	if (efi.dki_lba == 1) {
		efi_gpt_t gpt = { 0 };

		gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE);
		gpt.efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT);
		gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt));
		gpt.efi_gpt_MyLBA = LE_64(1ULL);
		gpt.efi_gpt_FirstUsableLBA = LE_64(34ULL);
		gpt.efi_gpt_LastUsableLBA = LE_64((vs >> bs) - 1);
		gpt.efi_gpt_PartitionEntryLBA = LE_64(2ULL);
		gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1);
		gpt.efi_gpt_SizeOfPartitionEntry =
		    LE_32(sizeof (efi_gpe_t));
		CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table);
		gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
		CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table);
		gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
		if (ddi_copyout(&gpt, ptr, MIN(sizeof (gpt), length),
		    flag))
			return (SET_ERROR(EFAULT));
		ptr += sizeof (gpt);
		length -= sizeof (gpt);
	}
	if (length > 0 && ddi_copyout(&gpe, ptr, MIN(sizeof (gpe),
	    length), flag))
		return (SET_ERROR(EFAULT));
	return (0);
}

/*
 * BEGIN entry points to allow external callers access to the volume.
 */
/*
 * Return the volume parameters needed for access from an external caller.
 * These values are invariant as long as the volume is held open.
 */
int
zvol_get_volume_params(minor_t minor, uint64_t *blksize,
    uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl,
    void **rl_hdl, void **bonus_hdl)
{
	zvol_state_t *zv;

	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
	if (zv == NULL)
		return (SET_ERROR(ENXIO));
	if (zv->zv_flags & ZVOL_DUMPIFIED)
		return (SET_ERROR(ENXIO));

	ASSERT(blksize && max_xfer_len && minor_hdl &&
	    objset_hdl && zil_hdl && rl_hdl && bonus_hdl);

	*blksize = zv->zv_volblocksize;
	*max_xfer_len = (uint64_t)zvol_maxphys;
	*minor_hdl = zv;
	*objset_hdl = zv->zv_objset;
	*zil_hdl = zv->zv_zilog;
	*rl_hdl = &zv->zv_znode;
	*bonus_hdl = zv->zv_dbuf;
	return (0);
}

/*
 * Return the current volume size to an external caller.
 * The size can change while the volume is open.
 */
uint64_t
zvol_get_volume_size(void *minor_hdl)
{
	zvol_state_t *zv = minor_hdl;

	return (zv->zv_volsize);
}

/*
 * Return the current WCE setting to an external caller.
 * The WCE setting can change while the volume is open.
 */
int
zvol_get_volume_wce(void *minor_hdl)
{
	zvol_state_t *zv = minor_hdl;

	return ((zv->zv_flags & ZVOL_WCE) ? 1 : 0);
}

/*
 * Entry point for external callers to zvol_log_write
 */
void
zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off, ssize_t resid,
    boolean_t sync)
{
	zvol_state_t *zv = minor_hdl;

	zvol_log_write(zv, tx, off, resid, sync);
}
/*
 * END entry points to allow external callers access to the volume.
 */

/*
 * Dirtbag ioctls to support mkfs(1M) for UFS filesystems.  See dkio(7I).
 */
/*ARGSUSED*/
int
zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
{
	zvol_state_t *zv;
	struct dk_cinfo dki;
	struct dk_minfo dkm;
	struct dk_callback *dkc;
	int error = 0;
	rl_t *rl;

	mutex_enter(&spa_namespace_lock);

	zv = zfsdev_get_soft_state(getminor(dev), ZSST_ZVOL);

	if (zv == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (SET_ERROR(ENXIO));
	}
	ASSERT(zv->zv_total_opens > 0);

	switch (cmd) {

	case DKIOCINFO:
		bzero(&dki, sizeof (dki));
		(void) strcpy(dki.dki_cname, "zvol");
		(void) strcpy(dki.dki_dname, "zvol");
		dki.dki_ctype = DKC_UNKNOWN;
		dki.dki_unit = getminor(dev);
		dki.dki_maxtransfer = 1 << (SPA_MAXBLOCKSHIFT - zv->zv_min_bs);
		mutex_exit(&spa_namespace_lock);
		if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag))
			error = SET_ERROR(EFAULT);
		return (error);

	case DKIOCGMEDIAINFO:
		bzero(&dkm, sizeof (dkm));
		dkm.dki_lbsize = 1U << zv->zv_min_bs;
		dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
		dkm.dki_media_type = DK_UNKNOWN;
		mutex_exit(&spa_namespace_lock);
		if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag))
			error = SET_ERROR(EFAULT);
		return (error);

	case DKIOCGETEFI:
		{
			uint64_t vs = zv->zv_volsize;
			uint8_t bs = zv->zv_min_bs;

			mutex_exit(&spa_namespace_lock);
			error = zvol_getefi((void *)arg, flag, vs, bs);
			return (error);
		}

	case DKIOCFLUSHWRITECACHE:
		dkc = (struct dk_callback *)arg;
		mutex_exit(&spa_namespace_lock);
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
		if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) {
			(*dkc->dkc_callback)(dkc->dkc_cookie, error);
			error = 0;
		}
		return (error);

	case DKIOCGETWCE:
		{
			int wce = (zv->zv_flags & ZVOL_WCE) ? 1 : 0;
			if (ddi_copyout(&wce, (void *)arg, sizeof (int),
			    flag))
				error = SET_ERROR(EFAULT);
			break;
		}
	case DKIOCSETWCE:
		{
			int wce;
			if (ddi_copyin((void *)arg, &wce, sizeof (int),
			    flag)) {
				error = SET_ERROR(EFAULT);
				break;
			}
			if (wce) {
				zv->zv_flags |= ZVOL_WCE;
				mutex_exit(&spa_namespace_lock);
			} else {
				zv->zv_flags &= ~ZVOL_WCE;
				mutex_exit(&spa_namespace_lock);
				zil_commit(zv->zv_zilog, ZVOL_OBJ);
			}
			return (0);
		}

	case DKIOCGGEOM:
	case DKIOCGVTOC:
		/*
		 * commands using these (like prtvtoc) expect ENOTSUP
		 * since we're emulating an EFI label
		 */
		error = SET_ERROR(ENOTSUP);
		break;

	case DKIOCDUMPINIT:
		rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
		    RL_WRITER);
		error = zvol_dumpify(zv);
		zfs_range_unlock(rl);
		break;

	case DKIOCDUMPFINI:
		if (!(zv->zv_flags & ZVOL_DUMPIFIED))
			break;
		rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
		    RL_WRITER);
		error = zvol_dump_fini(zv);
		zfs_range_unlock(rl);
		break;

	case DKIOCFREE:
	{
		dkioc_free_t df;
		dmu_tx_t *tx;

		if (ddi_copyin((void *)arg, &df, sizeof (df), flag)) {
			error = SET_ERROR(EFAULT);
			break;
		}

		/*
		 * Apply Postel's Law to length-checking.  If they overshoot,
		 * just blank out until the end, if there's a need to blank
		 * out anything.
		 */
		if (df.df_start >= zv->zv_volsize)
			break;	/* No need to do anything... */
		if (df.df_start + df.df_length > zv->zv_volsize)
			df.df_length = DMU_OBJECT_END;

		rl = zfs_range_lock(&zv->zv_znode, df.df_start, df.df_length,
		    RL_WRITER);
		tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
		} else {
			zvol_log_truncate(zv, tx, df.df_start,
			    df.df_length, B_TRUE);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    df.df_start, df.df_length);
		}

		zfs_range_unlock(rl);

		if (error == 0) {
			/*
			 * If the write-cache is disabled or 'sync' property
			 * is set to 'always' then treat this as a synchronous
			 * operation (i.e. commit to zil).
			 */
			if (!(zv->zv_flags & ZVOL_WCE) ||
			    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS))
				zil_commit(zv->zv_zilog, ZVOL_OBJ);

			/*
			 * If the caller really wants synchronous writes, and
			 * can't wait for them, don't return until the write
			 * is done.
			 */
			if (df.df_flags & DF_WAIT_SYNC) {
				txg_wait_synced(
				    dmu_objset_pool(zv->zv_objset), 0);
			}
		}
		break;
	}

	default:
		error = SET_ERROR(ENOTTY);
		break;

	}
	mutex_exit(&spa_namespace_lock);
	return (error);
}
#endif	/* sun */

int
zvol_busy(void)
{
	return (zvol_minors != 0);
}

void
zvol_init(void)
{
	VERIFY(ddi_soft_state_init(&zfsdev_state, sizeof (zfs_soft_state_t),
	    1) == 0);
	ZFS_LOG(1, "ZVOL Initialized.");
}

void
zvol_fini(void)
{
	ddi_soft_state_fini(&zfsdev_state);
	ZFS_LOG(1, "ZVOL Deinitialized.");
}

#ifdef sun
/*ARGSUSED*/
static int
zfs_mvdev_dump_feature_check(void *arg, dmu_tx_t *tx)
{
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;

	if (spa_feature_is_active(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP))
		return (1);
	return (0);
}

/*ARGSUSED*/
static void
zfs_mvdev_dump_activate_feature_sync(void *arg, dmu_tx_t *tx)
{
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;

	spa_feature_incr(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, tx);
}

static int
zvol_dump_init(zvol_state_t *zv, boolean_t resize)
{
	dmu_tx_t *tx;
	int error;
	objset_t *os = zv->zv_objset;
	spa_t *spa = dmu_objset_spa(os);
	vdev_t *vd = spa->spa_root_vdev;
	nvlist_t *nv = NULL;
	uint64_t version = spa_version(spa);
	enum zio_checksum checksum;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	ASSERT(vd->vdev_ops == &vdev_root_ops);

	error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 0,
	    DMU_OBJECT_END);
	/* wait for dmu_free_long_range to actually free the blocks */
	txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);

	/*
	 * If the pool on which the dump device is being initialized has more
	 * than one child vdev, check that the MULTI_VDEV_CRASH_DUMP feature is
	 * enabled.  If so, bump that feature's counter to indicate that the
	 * feature is active. We also check the vdev type to handle the
	 * following case:
	 *   # zpool create test raidz disk1 disk2 disk3
	 *   Now have spa_root_vdev->vdev_children == 1 (the raidz vdev),
	 *   the raidz vdev itself has 3 children.
	 */
	if (vd->vdev_children > 1 || vd->vdev_ops == &vdev_raidz_ops) {
		if (!spa_feature_is_enabled(spa,
		    SPA_FEATURE_MULTI_VDEV_CRASH_DUMP))
			return (SET_ERROR(ENOTSUP));
		(void) dsl_sync_task(spa_name(spa),
		    zfs_mvdev_dump_feature_check,
		    zfs_mvdev_dump_activate_feature_sync, NULL, 2);
	}

	tx = dmu_tx_create(os);
	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		return (error);
	}

	/*
	 * If MULTI_VDEV_CRASH_DUMP is active, use the NOPARITY checksum
	 * function.  Otherwise, use the old default -- OFF.
	 */
	checksum = spa_feature_is_active(spa,
	    SPA_FEATURE_MULTI_VDEV_CRASH_DUMP) ? ZIO_CHECKSUM_NOPARITY :
	    ZIO_CHECKSUM_OFF;

	/*
	 * If we are resizing the dump device then we only need to
	 * update the refreservation to match the new zvol size.
	 * Otherwise, we save off the original state of the zvol so
	 * that we can restore it if the zvol is ever undumpified.
	 */
	if (resize) {
		error = zap_update(os, ZVOL_ZAP_OBJ,
		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
		    &zv->zv_volsize, tx);
	} else {
		uint64_t checksum, compress, refresrv, vbs, dedup;

		error = dsl_prop_get_integer(zv->zv_name,
		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL);
		error = error ? error : dsl_prop_get_integer(zv->zv_name,
		    zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum, NULL);
		error = error ? error : dsl_prop_get_integer(zv->zv_name,
		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &refresrv, NULL);
		error = error ? error : dsl_prop_get_integer(zv->zv_name,
		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs, NULL);
		if (version >= SPA_VERSION_DEDUP) {
			error = error ? error :
			    dsl_prop_get_integer(zv->zv_name,
			    zfs_prop_to_name(ZFS_PROP_DEDUP), &dedup, NULL);
		}

		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1,
		    &compress, tx);
		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
		    zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum, tx);
		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
		    &refresrv, tx);
		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
		    &vbs, tx);
		error = error ? error : dmu_object_set_blocksize(
		    os, ZVOL_OBJ, SPA_MAXBLOCKSIZE, 0, tx);
		if (version >= SPA_VERSION_DEDUP) {
			error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
			    zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1,
			    &dedup, tx);
		}
		if (error == 0)
			zv->zv_volblocksize = SPA_MAXBLOCKSIZE;
	}
	dmu_tx_commit(tx);

	/*
	 * We only need to update the zvol's properties if we are
	 * initializing the dump area for the first time.
	 */
1989	if (!resize) {
1990		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
1991		VERIFY(nvlist_add_uint64(nv,
1992		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 0) == 0);
1993		VERIFY(nvlist_add_uint64(nv,
1994		    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
1995		    ZIO_COMPRESS_OFF) == 0);
1996		VERIFY(nvlist_add_uint64(nv,
1997		    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
1998		    checksum) == 0);
1999		if (version >= SPA_VERSION_DEDUP) {
2000			VERIFY(nvlist_add_uint64(nv,
2001			    zfs_prop_to_name(ZFS_PROP_DEDUP),
2002			    ZIO_CHECKSUM_OFF) == 0);
2003		}
2004
2005		error = zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
2006		    nv, NULL);
2007		nvlist_free(nv);
2008
2009		if (error)
2010			return (error);
2011	}
2012
2013	/* Allocate the space for the dump */
2014	error = zvol_prealloc(zv);
2015	return (error);
2016}
2017
2018static int
2019zvol_dumpify(zvol_state_t *zv)
2020{
2021	int error = 0;
2022	uint64_t dumpsize = 0;
2023	dmu_tx_t *tx;
2024	objset_t *os = zv->zv_objset;
2025
2026	if (zv->zv_flags & ZVOL_RDONLY)
2027		return (SET_ERROR(EROFS));
2028
2029	if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE,
2030	    8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) {
2031		boolean_t resize = (dumpsize > 0);
2032
2033		if ((error = zvol_dump_init(zv, resize)) != 0) {
2034			(void) zvol_dump_fini(zv);
2035			return (error);
2036		}
2037	}
2038
	/*
	 * Build up our LBA mapping.
	 */
2042	error = zvol_get_lbas(zv);
2043	if (error) {
2044		(void) zvol_dump_fini(zv);
2045		return (error);
2046	}
2047
2048	tx = dmu_tx_create(os);
2049	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
2050	error = dmu_tx_assign(tx, TXG_WAIT);
2051	if (error) {
2052		dmu_tx_abort(tx);
2053		(void) zvol_dump_fini(zv);
2054		return (error);
2055	}
2056
2057	zv->zv_flags |= ZVOL_DUMPIFIED;
2058	error = zap_update(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, 8, 1,
2059	    &zv->zv_volsize, tx);
2060	dmu_tx_commit(tx);
2061
2062	if (error) {
2063		(void) zvol_dump_fini(zv);
2064		return (error);
2065	}
2066
2067	txg_wait_synced(dmu_objset_pool(os), 0);
2068	return (0);
2069}
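
/*
 * Illustrative sketch (not compiled): the ZVOL_DUMPSIZE ZAP entry
 * written above can be consulted to decide whether a zvol is already
 * dumpified for its current size.  The helper name is hypothetical and
 * only reuses calls that already appear in zvol_dumpify().
 */
#if 0
static boolean_t
zvol_is_dumpified(zvol_state_t *zv)
{
	uint64_t dumpsize = 0;

	/* The entry exists and matches only after a successful dumpify. */
	if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE,
	    8, 1, &dumpsize) != 0)
		return (B_FALSE);
	return (dumpsize == zv->zv_volsize);
}
#endif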
2070
2071static int
2072zvol_dump_fini(zvol_state_t *zv)
2073{
2074	dmu_tx_t *tx;
2075	objset_t *os = zv->zv_objset;
2076	nvlist_t *nv;
2077	int error = 0;
2078	uint64_t checksum, compress, refresrv, vbs, dedup;
2079	uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset));
2080
	/*
	 * Attempt to restore the zvol back to its pre-dumpified state.
	 * This is best-effort, as it's possible that not all of these
	 * properties were initialized during the dumpify process (e.g.
	 * if zvol_dump_init() failed partway through).
	 */
2087
2088	tx = dmu_tx_create(os);
2089	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
2090	error = dmu_tx_assign(tx, TXG_WAIT);
2091	if (error) {
2092		dmu_tx_abort(tx);
2093		return (error);
2094	}
2095	(void) zap_remove(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, tx);
2096	dmu_tx_commit(tx);
2097
2098	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2099	    zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum);
2100	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2101	    zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &compress);
2102	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2103	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &refresrv);
2104	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2105	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, &vbs);
2106
2107	VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2108	(void) nvlist_add_uint64(nv,
2109	    zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum);
2110	(void) nvlist_add_uint64(nv,
2111	    zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress);
2112	(void) nvlist_add_uint64(nv,
2113	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv);
2114	if (version >= SPA_VERSION_DEDUP &&
2115	    zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2116	    zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1, &dedup) == 0) {
2117		(void) nvlist_add_uint64(nv,
2118		    zfs_prop_to_name(ZFS_PROP_DEDUP), dedup);
2119	}
2120	(void) zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
2121	    nv, NULL);
2122	nvlist_free(nv);
2123
2124	zvol_free_extents(zv);
2125	zv->zv_flags &= ~ZVOL_DUMPIFIED;
2126	(void) dmu_free_long_range(os, ZVOL_OBJ, 0, DMU_OBJECT_END);
	/* Wait for dmu_free_long_range() to actually free the blocks. */
2128	txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
2129	tx = dmu_tx_create(os);
2130	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
2131	error = dmu_tx_assign(tx, TXG_WAIT);
2132	if (error) {
2133		dmu_tx_abort(tx);
2134		return (error);
2135	}
2136	if (dmu_object_set_blocksize(os, ZVOL_OBJ, vbs, 0, tx) == 0)
2137		zv->zv_volblocksize = vbs;
2138	dmu_tx_commit(tx);
2139
2140	return (0);
2141}
2142#endif	/* sun */
2143
2144static zvol_state_t *
2145zvol_geom_create(const char *name)
2146{
2147	struct g_provider *pp;
2148	struct g_geom *gp;
2149	zvol_state_t *zv;
2150
2151	gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
2152	gp->start = zvol_geom_start;
2153	gp->access = zvol_geom_access;
2154	pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
2155	pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
2156	pp->sectorsize = DEV_BSIZE;
2157
2158	zv = kmem_zalloc(sizeof(*zv), KM_SLEEP);
2159	zv->zv_provider = pp;
2160	zv->zv_state = 0;
2161	bioq_init(&zv->zv_queue);
2162	mtx_init(&zv->zv_queue_mtx, "zvol", NULL, MTX_DEF);
2163
2164	pp->private = zv;
2165
2166	return (zv);
2167}
2168
2169static void
2170zvol_geom_run(zvol_state_t *zv)
2171{
2172	struct g_provider *pp;
2173
2174	pp = zv->zv_provider;
2175	g_error_provider(pp, 0);
2176
2177	kproc_kthread_add(zvol_geom_worker, zv, &zfsproc, NULL, 0, 0,
2178	    "zfskern", "zvol %s", pp->name + sizeof(ZVOL_DRIVER));
2179}
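
/*
 * Illustrative sketch (not compiled): the expected pairing of
 * zvol_geom_create() and zvol_geom_run().  The caller is assumed to
 * hold the GEOM topology lock, fill in the remaining zvol state
 * (objset, volume size, ...) and only then announce the provider and
 * start the worker thread.
 */
#if 0
	zvol_state_t *zv;

	g_topology_lock();
	zv = zvol_geom_create(name);
	/* ... set up zv->zv_objset, zv->zv_volsize, etc. ... */
	zv->zv_provider->mediasize = zv->zv_volsize;
	zvol_geom_run(zv);
	g_topology_unlock();
#endif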
2180
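/*
 * Tear down the GEOM state of a zvol.  zv_state implements a small
 * handshake with the worker thread: setting it to 1 asks the worker to
 * exit, and the worker acknowledges by setting it to 2 (see
 * zvol_geom_worker()) before the queue mutex is destroyed.
 */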
2181static void
2182zvol_geom_destroy(zvol_state_t *zv)
2183{
2184	struct g_provider *pp;
2185
2186	g_topology_assert();
2187
2188	mtx_lock(&zv->zv_queue_mtx);
2189	zv->zv_state = 1;
2190	wakeup_one(&zv->zv_queue);
2191	while (zv->zv_state != 2)
2192		msleep(&zv->zv_state, &zv->zv_queue_mtx, 0, "zvol:w", 0);
2193	mtx_destroy(&zv->zv_queue_mtx);
2194
2195	pp = zv->zv_provider;
2196	zv->zv_provider = NULL;
2197	pp->private = NULL;
2198	g_wither_geom(pp->geom, ENXIO);
2199
2200	kmem_free(zv, sizeof(*zv));
2201}
2202
2203static int
2204zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
2205{
2206	int count, error, flags;
2207
2208	g_topology_assert();
2209
	/*
	 * To keep things simple we expect either an open or a close, i.e.
	 * all access-count deltas of the same sign, but not a mix of the
	 * two in one request.
	 */
2214	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
2215	    (acr <= 0 && acw <= 0 && ace <= 0),
2216	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
2217	    pp->name, acr, acw, ace));
2218
2219	if (pp->private == NULL) {
2220		if (acr <= 0 && acw <= 0 && ace <= 0)
2221			return (0);
2222		return (pp->error);
2223	}
2224
	/*
	 * We don't pass the FEXCL flag to zvol_open()/zvol_close() when
	 * ace != 0, because GEOM already handles exclusive access and
	 * handles it a bit differently.  GEOM allows multiple
	 * read/exclusive consumers, while ZFS allows only one exclusive
	 * consumer, no matter whether it is a reader or a writer.  The
	 * GEOM semantics are preferable here, so exclusive access is
	 * left for GEOM to arbitrate.
	 */
2233
2234	count = acr + acw + ace;
2235	if (count == 0)
2236		return (0);
2237
2238	flags = 0;
2239	if (acr != 0 || ace != 0)
2240		flags |= FREAD;
2241	if (acw != 0)
2242		flags |= FWRITE;
2243
2244	g_topology_unlock();
2245	if (count > 0)
2246		error = zvol_open(pp, flags, count);
2247	else
2248		error = zvol_close(pp, flags, -count);
2249	g_topology_lock();
2250	return (error);
2251}
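
/*
 * Illustrative sketch (not compiled): how consumer-side access deltas
 * arrive here.  The consumer cp is assumed to be attached to a zvol
 * provider; g_access() is the standard GEOM entry point.
 */
#if 0
	/* count = 1 > 0, so zvol_open(pp, FREAD, 1) is called. */
	error = g_access(cp, 1, 0, 0);
	/* count = -1 < 0, so zvol_close(pp, FREAD, 1) is called. */
	(void) g_access(cp, -1, 0, 0);
#endif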
2252
2253static void
2254zvol_geom_start(struct bio *bp)
2255{
2256	zvol_state_t *zv;
2257	boolean_t first;
2258
2259	zv = bp->bio_to->private;
2260	ASSERT(zv != NULL);
2261	switch (bp->bio_cmd) {
2262	case BIO_FLUSH:
2263		if (!THREAD_CAN_SLEEP())
2264			goto enqueue;
2265		zil_commit(zv->zv_zilog, ZVOL_OBJ);
2266		g_io_deliver(bp, 0);
2267		break;
2268	case BIO_READ:
2269	case BIO_WRITE:
2270		if (!THREAD_CAN_SLEEP())
2271			goto enqueue;
2272		zvol_strategy(bp);
2273		break;
2274	case BIO_GETATTR:
2275	case BIO_DELETE:
2276	default:
2277		g_io_deliver(bp, EOPNOTSUPP);
2278		break;
2279	}
2280	return;
2281
2282enqueue:
2283	mtx_lock(&zv->zv_queue_mtx);
2284	first = (bioq_first(&zv->zv_queue) == NULL);
2285	bioq_insert_tail(&zv->zv_queue, bp);
2286	mtx_unlock(&zv->zv_queue_mtx);
2287	if (first)
2288		wakeup_one(&zv->zv_queue);
2289}
2290
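/*
 * Worker thread that services bios which zvol_geom_start() could not
 * handle in its calling context and queued instead (see the enqueue
 * label above).
 */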
2291static void
2292zvol_geom_worker(void *arg)
2293{
2294	zvol_state_t *zv;
2295	struct bio *bp;
2296
2297	thread_lock(curthread);
2298	sched_prio(curthread, PRIBIO);
2299	thread_unlock(curthread);
2300
2301	zv = arg;
2302	for (;;) {
2303		mtx_lock(&zv->zv_queue_mtx);
2304		bp = bioq_takefirst(&zv->zv_queue);
2305		if (bp == NULL) {
2306			if (zv->zv_state == 1) {
2307				zv->zv_state = 2;
2308				wakeup(&zv->zv_state);
2309				mtx_unlock(&zv->zv_queue_mtx);
2310				kthread_exit();
2311			}
2312			msleep(&zv->zv_queue, &zv->zv_queue_mtx, PRIBIO | PDROP,
2313			    "zvol:io", 0);
2314			continue;
2315		}
2316		mtx_unlock(&zv->zv_queue_mtx);
2317		switch (bp->bio_cmd) {
2318		case BIO_FLUSH:
2319			zil_commit(zv->zv_zilog, ZVOL_OBJ);
2320			g_io_deliver(bp, 0);
2321			break;
2322		case BIO_READ:
2323		case BIO_WRITE:
2324			zvol_strategy(bp);
2325			break;
2326		}
2327	}
2328}
2329
2330extern boolean_t dataset_name_hidden(const char *name);
2331
2332static int
2333zvol_create_snapshots(objset_t *os, const char *name)
2334{
2335	uint64_t cookie, obj;
2336	char *sname;
2337	int error, len;
2338
2339	cookie = obj = 0;
2340	sname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
2341
2342#if 0
2343	(void) dmu_objset_find(name, dmu_objset_prefetch, NULL,
2344	    DS_FIND_SNAPSHOTS);
2345#endif
2346
2347	for (;;) {
2348		len = snprintf(sname, MAXPATHLEN, "%s@", name);
		if (len >= MAXPATHLEN) {
			/*
			 * Note: the objset hold belongs to the caller;
			 * releasing it here as well would over-release it.
			 */
			error = SET_ERROR(ENAMETOOLONG);
			break;
		}
2354
2355		dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
2356		error = dmu_snapshot_list_next(os, MAXPATHLEN - len,
2357		    sname + len, &obj, &cookie, NULL);
2358		dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
2359		if (error != 0) {
2360			if (error == ENOENT)
2361				error = 0;
2362			break;
2363		}
2364
2365		if ((error = zvol_create_minor(sname)) != 0) {
2366			printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n",
2367			    sname, error);
2368			break;
2369		}
2370	}
2371
2372	kmem_free(sname, MAXPATHLEN);
2373	return (error);
2374}
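
/*
 * For example, for a dataset "tank/vol" with snapshots "s1" and "s2",
 * the loop above hands "tank/vol@s1" and "tank/vol@s2" to
 * zvol_create_minor().
 */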
2375
2376int
2377zvol_create_minors(const char *name)
2378{
2379	uint64_t cookie;
2380	objset_t *os;
2381	char *osname, *p;
2382	int error, len;
2383
2384	if (dataset_name_hidden(name))
2385		return (0);
2386
2387	if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
2388		printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n",
2389		    name, error);
2390		return (error);
2391	}
2392	if (dmu_objset_type(os) == DMU_OST_ZVOL) {
2393		dsl_dataset_long_hold(os->os_dsl_dataset, FTAG);
2394		dsl_pool_rele(dmu_objset_pool(os), FTAG);
		if ((error = zvol_create_minor(name)) == 0) {
			error = zvol_create_snapshots(os, name);
		} else {
			printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n",
			    name, error);
		}
2401		dsl_dataset_long_rele(os->os_dsl_dataset, FTAG);
2402		dsl_dataset_rele(os->os_dsl_dataset, FTAG);
2403		return (error);
2404	}
2405	if (dmu_objset_type(os) != DMU_OST_ZFS) {
2406		dmu_objset_rele(os, FTAG);
2407		return (0);
2408	}
2409
2410	osname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
2411	if (snprintf(osname, MAXPATHLEN, "%s/", name) >= MAXPATHLEN) {
2412		dmu_objset_rele(os, FTAG);
2413		kmem_free(osname, MAXPATHLEN);
		return (SET_ERROR(ENAMETOOLONG));
2415	}
2416	p = osname + strlen(osname);
2417	len = MAXPATHLEN - (p - osname);
2418
2419#if 0
2420	/* Prefetch the datasets. */
2421	cookie = 0;
2422	while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0) {
2423		if (!dataset_name_hidden(osname))
2424			(void) dmu_objset_prefetch(osname, NULL);
2425	}
2426#endif
2427
	cookie = 0;
	while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0) {
		dmu_objset_rele(os, FTAG);
		(void) zvol_create_minors(osname);
		if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
			printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n",
			    name, error);
			/* Don't leak the name buffer on the error path. */
			kmem_free(osname, MAXPATHLEN);
			return (error);
		}
	}
2439
2440	dmu_objset_rele(os, FTAG);
2441	kmem_free(osname, MAXPATHLEN);
2442	return (0);
2443}
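
/*
 * Illustrative sketch (not compiled): zvol_create_minors() walks the
 * dataset hierarchy recursively.  Assuming a pool laid out as
 *
 *	tank			(filesystem)
 *	tank/vol		(volume)
 *	tank/vol@snap		(snapshot)
 *	tank/fs			(filesystem)
 *	tank/fs/vol2		(volume)
 *
 * the single call below creates minors for tank/vol, tank/vol@snap and
 * tank/fs/vol2, while the filesystems themselves are skipped.
 */
#if 0
	(void) zvol_create_minors("tank");
#endif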
2444
2445static void
2446zvol_rename_minor(struct g_geom *gp, const char *newname)
2447{
2448	struct g_provider *pp;
2449	zvol_state_t *zv;
2450
2451	ASSERT(MUTEX_HELD(&spa_namespace_lock));
2452	g_topology_assert();
2453
2454	pp = LIST_FIRST(&gp->provider);
2455	ASSERT(pp != NULL);
2456	zv = pp->private;
2457	ASSERT(zv != NULL);
2458
2459	zv->zv_provider = NULL;
2460	g_wither_provider(pp, ENXIO);
2461
2462	pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
2463	pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
2464	pp->sectorsize = DEV_BSIZE;
2465	pp->mediasize = zv->zv_volsize;
2466	pp->private = zv;
2467	zv->zv_provider = pp;
2468	strlcpy(zv->zv_name, newname, sizeof(zv->zv_name));
2469	g_error_provider(pp, 0);
2470}
2471
2472void
2473zvol_rename_minors(const char *oldname, const char *newname)
2474{
2475	char name[MAXPATHLEN];
2476	struct g_provider *pp;
2477	struct g_geom *gp;
2478	size_t oldnamelen, newnamelen;
2479	zvol_state_t *zv;
2480	char *namebuf;
2481
2482	oldnamelen = strlen(oldname);
2483	newnamelen = strlen(newname);
2484
2485	DROP_GIANT();
2486	mutex_enter(&spa_namespace_lock);
2487	g_topology_lock();
2488
2489	LIST_FOREACH(gp, &zfs_zvol_class.geom, geom) {
2490		pp = LIST_FIRST(&gp->provider);
2491		if (pp == NULL)
2492			continue;
2493		zv = pp->private;
2494		if (zv == NULL)
2495			continue;
2496		if (strcmp(zv->zv_name, oldname) == 0) {
2497			zvol_rename_minor(gp, newname);
2498		} else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 &&
2499		    (zv->zv_name[oldnamelen] == '/' ||
2500		     zv->zv_name[oldnamelen] == '@')) {
2501			snprintf(name, sizeof(name), "%s%c%s", newname,
2502			    zv->zv_name[oldnamelen],
2503			    zv->zv_name + oldnamelen + 1);
2504			zvol_rename_minor(gp, name);
2505		}
2506	}
2507
2508	g_topology_unlock();
2509	mutex_exit(&spa_namespace_lock);
2510	PICKUP_GIANT();
2511}
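
/*
 * Illustrative sketch (not compiled): renaming a dataset renames the
 * matching minor and every minor below it that is separated by '/' or
 * '@'.  For example, assuming minors exist for tank/fs/vol and
 * tank/fs/vol@s1, the call below rewrites their providers as
 *
 *	tank/fs/vol	-> tank/fs2/vol
 *	tank/fs/vol@s1	-> tank/fs2/vol@s1
 */
#if 0
	zvol_rename_minors("tank/fs", "tank/fs2");
#endif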
2512