zvol.c revision 264732
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 *
24 * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
25 * All rights reserved.
26 * Copyright (c) 2013 by Delphix. All rights reserved.
27 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
28 *
29 * Portions Copyright 2010 Robert Milkowski
30 *
31 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
32 */
33
34/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
35
36/*
37 * ZFS volume emulation driver.
38 *
39 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
40 * Volumes are accessed through the symbolic links named:
41 *
42 * /dev/zvol/dsk/<pool_name>/<dataset_name>
43 * /dev/zvol/rdsk/<pool_name>/<dataset_name>
44 *
45 * These links are created by the /dev filesystem (sdev_zvolops.c).
46 * Volumes are persistent through reboot.  No user command needs to be
47 * run before opening and using a device.
48 *
49 * FreeBSD notes.
50 * On FreeBSD ZVOLs are simply GEOM providers like any other storage device
51 * in the system.
52 */
53
54#include <sys/types.h>
55#include <sys/param.h>
56#include <sys/kernel.h>
57#include <sys/errno.h>
58#include <sys/uio.h>
59#include <sys/bio.h>
60#include <sys/buf.h>
61#include <sys/kmem.h>
62#include <sys/conf.h>
63#include <sys/cmn_err.h>
64#include <sys/stat.h>
65#include <sys/zap.h>
66#include <sys/spa.h>
67#include <sys/spa_impl.h>
68#include <sys/zio.h>
69#include <sys/dmu_traverse.h>
70#include <sys/dnode.h>
71#include <sys/dsl_dataset.h>
72#include <sys/dsl_prop.h>
73#include <sys/dkio.h>
74#include <sys/byteorder.h>
75#include <sys/sunddi.h>
76#include <sys/dirent.h>
77#include <sys/policy.h>
78#include <sys/fs/zfs.h>
79#include <sys/zfs_ioctl.h>
80#include <sys/zil.h>
81#include <sys/refcount.h>
82#include <sys/zfs_znode.h>
83#include <sys/zfs_rlock.h>
84#include <sys/vdev_impl.h>
85#include <sys/vdev_raidz.h>
86#include <sys/zvol.h>
87#include <sys/zil_impl.h>
88#include <sys/dbuf.h>
89#include <sys/dmu_tx.h>
90#include <sys/zfeature.h>
91#include <sys/zio_checksum.h>
92
93#include <geom/geom.h>
94
95#include "zfs_namecheck.h"
96
97struct g_class zfs_zvol_class = {
98	.name = "ZFS::ZVOL",
99	.version = G_VERSION,
100};
101
102DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
103
104void *zfsdev_state;
105static char *zvol_tag = "zvol_tag";
106
107#define	ZVOL_DUMPSIZE		"dumpsize"
108
109/*
110 * The spa_namespace_lock protects the zfsdev_state structure from being
111 * modified while it's being used, e.g. an open that comes in before a
112 * create finishes.  It also protects temporary opens of the dataset so that,
113 * e.g., an open doesn't get a spurious EBUSY.
114 */
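/*
 * A minimal sketch of the locking pattern this implies (the real
 * consumers are zvol_create_minor() and zvol_open() below):
 *
 *	mutex_enter(&spa_namespace_lock);
 *	zv = zvol_minor_lookup(name);	(stable while the lock is held)
 *	... use or tear down zv ...
 *	mutex_exit(&spa_namespace_lock);
 */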
115static uint32_t zvol_minors;
116
117typedef struct zvol_extent {
118	list_node_t	ze_node;
119	dva_t		ze_dva;		/* dva associated with this extent */
120	uint64_t	ze_nblks;	/* number of blocks in extent */
121} zvol_extent_t;
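/*
 * Example: a physically contiguous 1GB volume with an 8K volblocksize
 * is described by a single extent with ze_nblks == 131072 (1GB / 8K);
 * fragmentation adds one list entry per discontiguous run.
 */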
122
123/*
124 * The in-core state of each volume.
125 */
126typedef struct zvol_state {
127	char		zv_name[MAXPATHLEN]; /* pool/dd name */
128	uint64_t	zv_volsize;	/* amount of space we advertise */
129	uint64_t	zv_volblocksize; /* volume block size */
130	struct g_provider *zv_provider;	/* GEOM provider */
131	uint8_t		zv_min_bs;	/* minimum addressable block shift */
132	uint8_t		zv_flags;	/* readonly, dumpified, etc. */
133	objset_t	*zv_objset;	/* objset handle */
134	uint32_t	zv_total_opens;	/* total open count */
135	zilog_t		*zv_zilog;	/* ZIL handle */
136	list_t		zv_extents;	/* List of extents for dump */
137	znode_t		zv_znode;	/* for range locking */
138	dmu_buf_t	*zv_dbuf;	/* bonus handle */
139	int		zv_state;
140	struct bio_queue_head zv_queue;
141	struct mtx	zv_queue_mtx;	/* zv_queue mutex */
142} zvol_state_t;
143
144/*
145 * zvol specific flags
146 */
147#define	ZVOL_RDONLY	0x1
148#define	ZVOL_DUMPIFIED	0x2
149#define	ZVOL_EXCL	0x4
150#define	ZVOL_WCE	0x8
151
152/*
153 * zvol maximum transfer in one DMU tx.
154 */
155int zvol_maxphys = DMU_MAX_ACCESS/2;
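/*
 * With the stock 10MB DMU_MAX_ACCESS (an assumption about the headers at
 * this vintage), this works out to 5MB per tx; see the matching "(5MB)"
 * note in zvol_log_write().
 */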
156
157extern int zfs_set_prop_nvlist(const char *, zprop_source_t,
158    nvlist_t *, nvlist_t *);
159static void zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off,
160    uint64_t len, boolean_t sync);
161static int zvol_remove_zv(zvol_state_t *);
162static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio);
163static int zvol_dumpify(zvol_state_t *zv);
164static int zvol_dump_fini(zvol_state_t *zv);
165static int zvol_dump_init(zvol_state_t *zv, boolean_t resize);
166
167static zvol_state_t *zvol_geom_create(const char *name);
168static void zvol_geom_run(zvol_state_t *zv);
169static void zvol_geom_destroy(zvol_state_t *zv);
170static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
171static void zvol_geom_start(struct bio *bp);
172static void zvol_geom_worker(void *arg);
173
174static void
175zvol_size_changed(zvol_state_t *zv)
176{
177#ifdef sun
178	dev_t dev = makedevice(maj, min);
179
180	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
181	    "Size", volsize) == DDI_SUCCESS);
182	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
183	    "Nblocks", lbtodb(volsize)) == DDI_SUCCESS);
184
185	/* Notify specfs to invalidate the cached size */
186	spec_size_invalidate(dev, VBLK);
187	spec_size_invalidate(dev, VCHR);
188#else	/* !sun */
189	struct g_provider *pp;
190
191	pp = zv->zv_provider;
192	if (pp == NULL)
193		return;
194	g_topology_lock();
195	g_resize_provider(pp, zv->zv_volsize);
196	g_topology_unlock();
197#endif	/* !sun */
198}
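/*
 * A sketch of the mechanism: g_resize_provider() updates pp->mediasize
 * and notifies every attached GEOM consumer of the new size, so stacked
 * classes (labels, partitions) and open devices observe the resize
 * without a reopen.
 */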
199
200int
201zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
202{
203	if (volsize == 0)
204		return (SET_ERROR(EINVAL));
205
206	if (volsize % blocksize != 0)
207		return (SET_ERROR(EINVAL));
208
209#ifdef _ILP32
210	if (volsize - 1 > SPEC_MAXOFFSET_T)
211		return (SET_ERROR(EOVERFLOW));
212#endif
213	return (0);
214}
215
216int
217zvol_check_volblocksize(uint64_t volblocksize)
218{
219	if (volblocksize < SPA_MINBLOCKSIZE ||
220	    volblocksize > SPA_MAXBLOCKSIZE ||
221	    !ISP2(volblocksize))
222		return (SET_ERROR(EDOM));
223
224	return (0);
225}
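/*
 * Example: with SPA_MINBLOCKSIZE of 512 and (at this vintage)
 * SPA_MAXBLOCKSIZE of 128K, volblocksize values of 4K or 8K pass the
 * check above, while 0, 600 or 3K fail with EDOM.
 */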
226
227int
228zvol_get_stats(objset_t *os, nvlist_t *nv)
229{
230	int error;
231	dmu_object_info_t doi;
232	uint64_t val;
233
234	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
235	if (error)
236		return (error);
237
238	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);
239
240	error = dmu_object_info(os, ZVOL_OBJ, &doi);
241
242	if (error == 0) {
243		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
244		    doi.doi_data_block_size);
245	}
246
247	return (error);
248}
249
250static zvol_state_t *
251zvol_minor_lookup(const char *name)
252{
253	struct g_provider *pp;
254	struct g_geom *gp;
255	zvol_state_t *zv = NULL;
256
257	ASSERT(MUTEX_HELD(&spa_namespace_lock));
258
259	g_topology_lock();
260	LIST_FOREACH(gp, &zfs_zvol_class.geom, geom) {
261		pp = LIST_FIRST(&gp->provider);
262		if (pp == NULL)
263			continue;
264		zv = pp->private;
265		if (zv == NULL)
266			continue;
267		if (strcmp(zv->zv_name, name) == 0)
268			break;
269	}
270	g_topology_unlock();
271
272	return (gp != NULL ? zv : NULL);
273}
274
275/* extent mapping arg */
276struct maparg {
277	zvol_state_t	*ma_zv;
278	uint64_t	ma_blks;
279};
280
281/*ARGSUSED*/
282static int
283zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
284    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
285{
286	struct maparg *ma = arg;
287	zvol_extent_t *ze;
288	int bs = ma->ma_zv->zv_volblocksize;
289
290	if (BP_IS_HOLE(bp) ||
291	    zb->zb_object != ZVOL_OBJ || zb->zb_level != 0)
292		return (0);
293
294	VERIFY3U(ma->ma_blks, ==, zb->zb_blkid);
295	ma->ma_blks++;
296
297	/* Abort immediately if we have encountered gang blocks */
298	if (BP_IS_GANG(bp))
299		return (SET_ERROR(EFRAGS));
300
301	/*
302	 * See if the block is at the end of the previous extent.
303	 */
304	ze = list_tail(&ma->ma_zv->zv_extents);
305	if (ze &&
306	    DVA_GET_VDEV(BP_IDENTITY(bp)) == DVA_GET_VDEV(&ze->ze_dva) &&
307	    DVA_GET_OFFSET(BP_IDENTITY(bp)) ==
308	    DVA_GET_OFFSET(&ze->ze_dva) + ze->ze_nblks * bs) {
309		ze->ze_nblks++;
310		return (0);
311	}
312
313	dprintf_bp(bp, "%s", "next blkptr:");
314
315	/* start a new extent */
316	ze = kmem_zalloc(sizeof (zvol_extent_t), KM_SLEEP);
317	ze->ze_dva = bp->blk_dva[0];	/* structure assignment */
318	ze->ze_nblks = 1;
319	list_insert_tail(&ma->ma_zv->zv_extents, ze);
320	return (0);
321}
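/*
 * Example of the merge above: two consecutive ZVOL blocks whose DVAs
 * sit on the same vdev at physically adjacent offsets collapse into a
 * single extent with ze_nblks == 2; a jump to a different vdev or
 * offset starts a new extent.
 */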
322
323static void
324zvol_free_extents(zvol_state_t *zv)
325{
326	zvol_extent_t *ze;
327
328	while ((ze = list_head(&zv->zv_extents)) != NULL) {
329		list_remove(&zv->zv_extents, ze);
330		kmem_free(ze, sizeof (zvol_extent_t));
331	}
332}
333
334static int
335zvol_get_lbas(zvol_state_t *zv)
336{
337	objset_t *os = zv->zv_objset;
338	struct maparg	ma;
339	int		err;
340
341	ma.ma_zv = zv;
342	ma.ma_blks = 0;
343	zvol_free_extents(zv);
344
345	/* commit any in-flight changes before traversing the dataset */
346	txg_wait_synced(dmu_objset_pool(os), 0);
347	err = traverse_dataset(dmu_objset_ds(os), 0,
348	    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, zvol_map_block, &ma);
349	if (err || ma.ma_blks != (zv->zv_volsize / zv->zv_volblocksize)) {
350		zvol_free_extents(zv);
351		return (err ? err : SET_ERROR(EIO));
352	}
353
354	return (0);
355}
356
357/* ARGSUSED */
358void
359zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
360{
361	zfs_creat_t *zct = arg;
362	nvlist_t *nvprops = zct->zct_props;
363	int error;
364	uint64_t volblocksize, volsize;
365
366	VERIFY(nvlist_lookup_uint64(nvprops,
367	    zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
368	if (nvlist_lookup_uint64(nvprops,
369	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
370		volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
371
372	/*
373	 * These properties must be removed from the list so the generic
374	 * property setting step won't apply to them.
375	 */
376	VERIFY(nvlist_remove_all(nvprops,
377	    zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
378	(void) nvlist_remove_all(nvprops,
379	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));
380
381	error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
382	    DMU_OT_NONE, 0, tx);
383	ASSERT(error == 0);
384
385	error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
386	    DMU_OT_NONE, 0, tx);
387	ASSERT(error == 0);
388
389	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
390	ASSERT(error == 0);
391}
392
393/*
394 * Replay a TX_TRUNCATE ZIL transaction if asked.  TX_TRUNCATE is how we
395 * implement DKIOCFREE/free-long-range.
396 */
397static int
398zvol_replay_truncate(zvol_state_t *zv, lr_truncate_t *lr, boolean_t byteswap)
399{
400	uint64_t offset, length;
401
402	if (byteswap)
403		byteswap_uint64_array(lr, sizeof (*lr));
404
405	offset = lr->lr_offset;
406	length = lr->lr_length;
407
408	return (dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length));
409}
410
411/*
412 * Replay a TX_WRITE ZIL transaction that didn't get committed
413 * after a system failure
414 */
415static int
416zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
417{
418	objset_t *os = zv->zv_objset;
419	char *data = (char *)(lr + 1);	/* data follows lr_write_t */
420	uint64_t offset, length;
421	dmu_tx_t *tx;
422	int error;
423
424	if (byteswap)
425		byteswap_uint64_array(lr, sizeof (*lr));
426
427	offset = lr->lr_offset;
428	length = lr->lr_length;
429
430	/* If it's a dmu_sync() block, write the whole block */
431	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
432		uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
433		if (length < blocksize) {
434			offset -= offset % blocksize;
435			length = blocksize;
436		}
437	}
438
439	tx = dmu_tx_create(os);
440	dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length);
441	error = dmu_tx_assign(tx, TXG_WAIT);
442	if (error) {
443		dmu_tx_abort(tx);
444	} else {
445		dmu_write(os, ZVOL_OBJ, offset, length, data, tx);
446		dmu_tx_commit(tx);
447	}
448
449	return (error);
450}
451
452/* ARGSUSED */
453static int
454zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap)
455{
456	return (SET_ERROR(ENOTSUP));
457}
458
459/*
460 * Callback vectors for replaying records.
461 * Only TX_WRITE and TX_TRUNCATE are needed for zvol.
462 */
463zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
464	zvol_replay_err,	/* 0 no such transaction type */
465	zvol_replay_err,	/* TX_CREATE */
466	zvol_replay_err,	/* TX_MKDIR */
467	zvol_replay_err,	/* TX_MKXATTR */
468	zvol_replay_err,	/* TX_SYMLINK */
469	zvol_replay_err,	/* TX_REMOVE */
470	zvol_replay_err,	/* TX_RMDIR */
471	zvol_replay_err,	/* TX_LINK */
472	zvol_replay_err,	/* TX_RENAME */
473	zvol_replay_write,	/* TX_WRITE */
474	zvol_replay_truncate,	/* TX_TRUNCATE */
475	zvol_replay_err,	/* TX_SETATTR */
476	zvol_replay_err,	/* TX_ACL */
477	zvol_replay_err,	/* TX_CREATE_ACL */
478	zvol_replay_err,	/* TX_CREATE_ATTR */
479	zvol_replay_err,	/* TX_CREATE_ACL_ATTR */
480	zvol_replay_err,	/* TX_MKDIR_ACL */
481	zvol_replay_err,	/* TX_MKDIR_ATTR */
482	zvol_replay_err,	/* TX_MKDIR_ACL_ATTR */
483	zvol_replay_err,	/* TX_WRITE2 */
484};
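/*
 * A sketch of how the table is consumed: generic ZIL replay dispatches
 * each log record through its transaction type, roughly
 *
 *	error = zvol_replay_vector[lr->lrc_txtype](zv, lr, byteswap);
 *
 * so every slot must be populated, with zvol_replay_err (ENOTSUP)
 * covering record types a volume never generates.
 */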
485
486#ifdef sun
487int
488zvol_name2minor(const char *name, minor_t *minor)
489{
490	zvol_state_t *zv;
491
492	mutex_enter(&spa_namespace_lock);
493	zv = zvol_minor_lookup(name);
494	if (minor && zv)
495		*minor = zv->zv_minor;
496	mutex_exit(&spa_namespace_lock);
497	return (zv ? 0 : -1);
498}
499#endif	/* sun */
500
501/*
502 * Create a minor node (plus a whole lot more) for the specified volume.
503 */
504int
505zvol_create_minor(const char *name)
506{
507	zfs_soft_state_t *zs;
508	zvol_state_t *zv;
509	objset_t *os;
510	dmu_object_info_t doi;
511	uint64_t volsize;
512	int error;
513
514	ZFS_LOG(1, "Creating ZVOL %s...", name);
515
516	mutex_enter(&spa_namespace_lock);
517
518	if (zvol_minor_lookup(name) != NULL) {
519		mutex_exit(&spa_namespace_lock);
520		return (SET_ERROR(EEXIST));
521	}
522
523	/* lie and say we're read-only */
524	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os);
525
526	if (error) {
527		mutex_exit(&spa_namespace_lock);
528		return (error);
529	}
530
531#ifdef sun
532	if ((minor = zfsdev_minor_alloc()) == 0) {
533		dmu_objset_disown(os, FTAG);
534		mutex_exit(&spa_namespace_lock);
535		return (SET_ERROR(ENXIO));
536	}
537
538	if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) {
539		dmu_objset_disown(os, FTAG);
540		mutex_exit(&spa_namespace_lock);
541		return (SET_ERROR(EAGAIN));
542	}
543	(void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME,
544	    (char *)name);
545
546	(void) snprintf(chrbuf, sizeof (chrbuf), "%u,raw", minor);
547
548	if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR,
549	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
550		ddi_soft_state_free(zfsdev_state, minor);
551		dmu_objset_disown(os, FTAG);
552		mutex_exit(&spa_namespace_lock);
553		return (SET_ERROR(EAGAIN));
554	}
555
556	(void) snprintf(blkbuf, sizeof (blkbuf), "%u", minor);
557
558	if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK,
559	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
560		ddi_remove_minor_node(zfs_dip, chrbuf);
561		ddi_soft_state_free(zfsdev_state, minor);
562		dmu_objset_disown(os, FTAG);
563		mutex_exit(&spa_namespace_lock);
564		return (SET_ERROR(EAGAIN));
565	}
566
567	zs = ddi_get_soft_state(zfsdev_state, minor);
568	zs->zss_type = ZSST_ZVOL;
569	zv = zs->zss_data = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
570#else	/* !sun */
571
572	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
573	if (error) {
574		ASSERT(error == 0);
575		dmu_objset_disown(os, FTAG);	/* owned with FTAG above */
576		mutex_exit(&spa_namespace_lock);
577		return (error);
578	}
579
580	DROP_GIANT();
581	g_topology_lock();
582	zv = zvol_geom_create(name);
583	zv->zv_volsize = volsize;
584	zv->zv_provider->mediasize = zv->zv_volsize;
585
586#endif	/* !sun */
587
588	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
589	zv->zv_min_bs = DEV_BSHIFT;
590	zv->zv_objset = os;
591	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
592		zv->zv_flags |= ZVOL_RDONLY;
593	mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
594	avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
595	    sizeof (rl_t), offsetof(rl_t, r_node));
596	list_create(&zv->zv_extents, sizeof (zvol_extent_t),
597	    offsetof(zvol_extent_t, ze_node));
598	/* get and cache the blocksize */
599	error = dmu_object_info(os, ZVOL_OBJ, &doi);
600	ASSERT(error == 0);
601	zv->zv_volblocksize = doi.doi_data_block_size;
602
603	if (spa_writeable(dmu_objset_spa(os))) {
604		if (zil_replay_disable)
605			zil_destroy(dmu_objset_zil(os), B_FALSE);
606		else
607			zil_replay(os, zv, zvol_replay_vector);
608	}
609	dmu_objset_disown(os, FTAG);
610	zv->zv_objset = NULL;
611
612	zvol_minors++;
613
614	mutex_exit(&spa_namespace_lock);
615
616	zvol_geom_run(zv);
617
618	g_topology_unlock();
619	PICKUP_GIANT();
620
621	ZFS_LOG(1, "ZVOL %s created.", name);
622
623	return (0);
624}
625
626/*
627 * Remove minor node for the specified volume.
628 */
629static int
630zvol_remove_zv(zvol_state_t *zv)
631{
632#ifdef sun
633	minor_t minor = zv->zv_minor;
634#endif
635
636	ASSERT(MUTEX_HELD(&spa_namespace_lock));
637	if (zv->zv_total_opens != 0)
638		return (SET_ERROR(EBUSY));
639
640	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
641
642#ifdef sun
643	(void) snprintf(nmbuf, sizeof (nmbuf), "%u,raw", minor);
644	ddi_remove_minor_node(zfs_dip, nmbuf);
645#endif	/* sun */
646
647	avl_destroy(&zv->zv_znode.z_range_avl);
648	mutex_destroy(&zv->zv_znode.z_range_lock);
649
650	zvol_geom_destroy(zv);
651
652	zvol_minors--;
653	return (0);
654}
655
656int
657zvol_remove_minor(const char *name)
658{
659	zvol_state_t *zv;
660	int rc;
661
662	mutex_enter(&spa_namespace_lock);
663	if ((zv = zvol_minor_lookup(name)) == NULL) {
664		mutex_exit(&spa_namespace_lock);
665		return (SET_ERROR(ENXIO));
666	}
667	g_topology_lock();
668	rc = zvol_remove_zv(zv);
669	g_topology_unlock();
670	mutex_exit(&spa_namespace_lock);
671	return (rc);
672}
673
674int
675zvol_first_open(zvol_state_t *zv)
676{
677	objset_t *os;
678	uint64_t volsize;
679	int error;
680	uint64_t readonly;
681
682	/* lie and say we're read-only */
683	error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, B_TRUE,
684	    zvol_tag, &os);
685	if (error)
686		return (error);
687
688	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
689	if (error) {
690		ASSERT(error == 0);
691		dmu_objset_disown(os, zvol_tag);
692		return (error);
693	}
694	zv->zv_objset = os;
695	error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf);
696	if (error) {
697		dmu_objset_disown(os, zvol_tag);
698		return (error);
699	}
700	zv->zv_volsize = volsize;
701	zv->zv_zilog = zil_open(os, zvol_get_data);
702	zvol_size_changed(zv);
703
704	VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &readonly,
705	    NULL) == 0);
706	if (readonly || dmu_objset_is_snapshot(os) ||
707	    !spa_writeable(dmu_objset_spa(os)))
708		zv->zv_flags |= ZVOL_RDONLY;
709	else
710		zv->zv_flags &= ~ZVOL_RDONLY;
711	return (error);
712}
713
714void
715zvol_last_close(zvol_state_t *zv)
716{
717	zil_close(zv->zv_zilog);
718	zv->zv_zilog = NULL;
719
720	dmu_buf_rele(zv->zv_dbuf, zvol_tag);
721	zv->zv_dbuf = NULL;
722
723	/*
724	 * Evict cached data
725	 */
726	if (dsl_dataset_is_dirty(dmu_objset_ds(zv->zv_objset)) &&
727	    !(zv->zv_flags & ZVOL_RDONLY))
728		txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
729	dmu_objset_evict_dbufs(zv->zv_objset);
730
731	dmu_objset_disown(zv->zv_objset, zvol_tag);
732	zv->zv_objset = NULL;
733}
734
735#ifdef sun
736int
737zvol_prealloc(zvol_state_t *zv)
738{
739	objset_t *os = zv->zv_objset;
740	dmu_tx_t *tx;
741	uint64_t refd, avail, usedobjs, availobjs;
742	uint64_t resid = zv->zv_volsize;
743	uint64_t off = 0;
744
745	/* Check the space usage before attempting to allocate the space */
746	dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs);
747	if (avail < zv->zv_volsize)
748		return (SET_ERROR(ENOSPC));
749
750	/* Free old extents if they exist */
751	zvol_free_extents(zv);
752
753	while (resid != 0) {
754		int error;
755		uint64_t bytes = MIN(resid, SPA_MAXBLOCKSIZE);
756
757		tx = dmu_tx_create(os);
758		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
759		error = dmu_tx_assign(tx, TXG_WAIT);
760		if (error) {
761			dmu_tx_abort(tx);
762			(void) dmu_free_long_range(os, ZVOL_OBJ, 0, off);
763			return (error);
764		}
765		dmu_prealloc(os, ZVOL_OBJ, off, bytes, tx);
766		dmu_tx_commit(tx);
767		off += bytes;
768		resid -= bytes;
769	}
770	txg_wait_synced(dmu_objset_pool(os), 0);
771
772	return (0);
773}
774#endif	/* sun */
775
776static int
777zvol_update_volsize(objset_t *os, uint64_t volsize)
778{
779	dmu_tx_t *tx;
780	int error;
781
782	ASSERT(MUTEX_HELD(&spa_namespace_lock));
783
784	tx = dmu_tx_create(os);
785	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
786	error = dmu_tx_assign(tx, TXG_WAIT);
787	if (error) {
788		dmu_tx_abort(tx);
789		return (error);
790	}
791
792	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1,
793	    &volsize, tx);
794	dmu_tx_commit(tx);
795
796	if (error == 0)
797		error = dmu_free_long_range(os,
798		    ZVOL_OBJ, volsize, DMU_OBJECT_END);
799	return (error);
800}
801
802void
803zvol_remove_minors(const char *name)
804{
805	struct g_geom *gp, *gptmp;
806	struct g_provider *pp;
807	zvol_state_t *zv;
808	size_t namelen;
809
810	namelen = strlen(name);
811
812	DROP_GIANT();
813	mutex_enter(&spa_namespace_lock);
814	g_topology_lock();
815
816	LIST_FOREACH_SAFE(gp, &zfs_zvol_class.geom, geom, gptmp) {
817		pp = LIST_FIRST(&gp->provider);
818		if (pp == NULL)
819			continue;
820		zv = pp->private;
821		if (zv == NULL)
822			continue;
823		if (strcmp(zv->zv_name, name) == 0 ||
824		    (strncmp(zv->zv_name, name, namelen) == 0 &&
825		     zv->zv_name[namelen] == '/')) {
826			(void) zvol_remove_zv(zv);
827		}
828	}
829
830	g_topology_unlock();
831	mutex_exit(&spa_namespace_lock);
832	PICKUP_GIANT();
833}
834
835int
836zvol_set_volsize(const char *name, major_t maj, uint64_t volsize)
837{
838	zvol_state_t *zv = NULL;
839	objset_t *os;
840	int error;
841	dmu_object_info_t doi;
842	uint64_t old_volsize = 0ULL;
843	uint64_t readonly;
844
845	mutex_enter(&spa_namespace_lock);
846	zv = zvol_minor_lookup(name);
847	if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
848		mutex_exit(&spa_namespace_lock);
849		return (error);
850	}
851
852	if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 ||
853	    (error = zvol_check_volsize(volsize,
854	    doi.doi_data_block_size)) != 0)
855		goto out;
856
857	VERIFY(dsl_prop_get_integer(name, "readonly", &readonly,
858	    NULL) == 0);
859	if (readonly) {
860		error = SET_ERROR(EROFS);
861		goto out;
862	}
863
864	error = zvol_update_volsize(os, volsize);
865	/*
866	 * Reinitialize the dump area to the new size. If we
867	 * fail to resize the dump area, restore it to its
868	 * original size.
869	 */
870	if (zv && error == 0) {
871#ifdef ZVOL_DUMP
872		if (zv->zv_flags & ZVOL_DUMPIFIED) {
873			old_volsize = zv->zv_volsize;
874			zv->zv_volsize = volsize;
875			if ((error = zvol_dumpify(zv)) != 0 ||
876			    (error = dumpvp_resize()) != 0) {
877				(void) zvol_update_volsize(os, old_volsize);
878				zv->zv_volsize = old_volsize;
879				error = zvol_dumpify(zv);
880			}
881		}
882#endif	/* ZVOL_DUMP */
883		if (error == 0) {
884			zv->zv_volsize = volsize;
885			zvol_size_changed(zv);
886		}
887	}
888
889#ifdef sun
890	/*
891	 * Generate a LUN expansion event.
892	 */
893	if (zv && error == 0) {
894		sysevent_id_t eid;
895		nvlist_t *attr;
896		char *physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
897
898		(void) snprintf(physpath, MAXPATHLEN, "%s%u", ZVOL_PSEUDO_DEV,
899		    zv->zv_minor);
900
901		VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
902		VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
903
904		(void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
905		    ESC_DEV_DLE, attr, &eid, DDI_SLEEP);
906
907		nvlist_free(attr);
908		kmem_free(physpath, MAXPATHLEN);
909	}
910#endif	/* sun */
911
912out:
913	dmu_objset_rele(os, FTAG);
914
915	mutex_exit(&spa_namespace_lock);
916
917	return (error);
918}
919
920/*ARGSUSED*/
921static int
922zvol_open(struct g_provider *pp, int flag, int count)
923{
924	zvol_state_t *zv;
925	int err = 0;
926	boolean_t locked = B_FALSE;
927
928	/*
929	 * Protect against recursively entering spa_namespace_lock
930	 * when spa_open() is used for a pool backed by a (local) ZVOL.
931	 * This is needed since we replaced the upstream zfsdev_state_lock
932	 * with spa_namespace_lock in the ZVOL code.
933	 * We are using the same trick as spa_open().
934	 * Note that calls in zvol_first_open() which need to resolve
935	 * the pool name to a spa object will enter spa_open()
936	 * recursively, but that function already has all the
937	 * necessary protection.
938	 */
939	if (!MUTEX_HELD(&spa_namespace_lock)) {
940		mutex_enter(&spa_namespace_lock);
941		locked = B_TRUE;
942	}
943
944	zv = pp->private;
945	if (zv == NULL) {
946		if (locked)
947			mutex_exit(&spa_namespace_lock);
948		return (SET_ERROR(ENXIO));
949	}
950
951	if (zv->zv_total_opens == 0) {
952		err = zvol_first_open(zv);
953		if (err) {
954			if (locked)
955				mutex_exit(&spa_namespace_lock);
956			return (err);
957		}
958		pp->mediasize = zv->zv_volsize;
959		pp->stripeoffset = 0;
960		pp->stripesize = zv->zv_volblocksize;
961	}
962	if ((flag & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
963		err = SET_ERROR(EROFS);
964		goto out;
965	}
966	if (zv->zv_flags & ZVOL_EXCL) {
967		err = SET_ERROR(EBUSY);
968		goto out;
969	}
970#ifdef FEXCL
971	if (flag & FEXCL) {
972		if (zv->zv_total_opens != 0) {
973			err = SET_ERROR(EBUSY);
974			goto out;
975		}
976		zv->zv_flags |= ZVOL_EXCL;
977	}
978#endif
979
980	zv->zv_total_opens += count;
981	if (locked)
982		mutex_exit(&spa_namespace_lock);
983
984	return (err);
985out:
986	if (zv->zv_total_opens == 0)
987		zvol_last_close(zv);
988	if (locked)
989		mutex_exit(&spa_namespace_lock);
990	return (err);
991}
992
993/*ARGSUSED*/
994static int
995zvol_close(struct g_provider *pp, int flag, int count)
996{
997	zvol_state_t *zv;
998	int error = 0;
999	boolean_t locked = B_FALSE;
1000
1001	/* See comment in zvol_open(). */
1002	if (!MUTEX_HELD(&spa_namespace_lock)) {
1003		mutex_enter(&spa_namespace_lock);
1004		locked = B_TRUE;
1005	}
1006
1007	zv = pp->private;
1008	if (zv == NULL) {
1009		if (locked)
1010			mutex_exit(&spa_namespace_lock);
1011		return (SET_ERROR(ENXIO));
1012	}
1013
1014	if (zv->zv_flags & ZVOL_EXCL) {
1015		ASSERT(zv->zv_total_opens == 1);
1016		zv->zv_flags &= ~ZVOL_EXCL;
1017	}
1018
1019	/*
1020	 * If the open count is zero, this is a spurious close.
1021	 * That indicates a bug in the kernel / DDI framework.
1022	 */
1023	ASSERT(zv->zv_total_opens != 0);
1024
1025	/*
1026	 * You may get multiple opens, but only one close.
1027	 */
1028	zv->zv_total_opens -= count;
1029
1030	if (zv->zv_total_opens == 0)
1031		zvol_last_close(zv);
1032
1033	if (locked)
1034		mutex_exit(&spa_namespace_lock);
1035	return (error);
1036}
1037
1038static void
1039zvol_get_done(zgd_t *zgd, int error)
1040{
1041	if (zgd->zgd_db)
1042		dmu_buf_rele(zgd->zgd_db, zgd);
1043
1044	zfs_range_unlock(zgd->zgd_rl);
1045
1046	if (error == 0 && zgd->zgd_bp)
1047		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
1048
1049	kmem_free(zgd, sizeof (zgd_t));
1050}
1051
1052/*
1053 * Get data to generate a TX_WRITE intent log record.
1054 */
1055static int
1056zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
1057{
1058	zvol_state_t *zv = arg;
1059	objset_t *os = zv->zv_objset;
1060	uint64_t object = ZVOL_OBJ;
1061	uint64_t offset = lr->lr_offset;
1062	uint64_t size = lr->lr_length;	/* length of user data */
1063	blkptr_t *bp = &lr->lr_blkptr;
1064	dmu_buf_t *db;
1065	zgd_t *zgd;
1066	int error;
1067
1068	ASSERT(zio != NULL);
1069	ASSERT(size != 0);
1070
1071	zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
1072	zgd->zgd_zilog = zv->zv_zilog;
1073	zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);
1074
1075	/*
1076	 * Write records come in two flavors: immediate and indirect.
1077	 * For small writes it's cheaper to store the data with the
1078	 * log record (immediate); for large writes it's cheaper to
1079	 * sync the data and get a pointer to it (indirect) so that
1080	 * we don't have to write the data twice.
1081	 */
1082	if (buf != NULL) {	/* immediate write */
1083		error = dmu_read(os, object, offset, size, buf,
1084		    DMU_READ_NO_PREFETCH);
1085	} else {
1086		size = zv->zv_volblocksize;
1087		offset = P2ALIGN(offset, size);
1088		error = dmu_buf_hold(os, object, offset, zgd, &db,
1089		    DMU_READ_NO_PREFETCH);
1090		if (error == 0) {
1091			blkptr_t *obp = dmu_buf_get_blkptr(db);
1092			if (obp) {
1093				ASSERT(BP_IS_HOLE(bp));
1094				*bp = *obp;
1095			}
1096
1097			zgd->zgd_db = db;
1098			zgd->zgd_bp = bp;
1099
1100			ASSERT(db->db_offset == offset);
1101			ASSERT(db->db_size == size);
1102
1103			error = dmu_sync(zio, lr->lr_common.lrc_txg,
1104			    zvol_get_done, zgd);
1105
1106			if (error == 0)
1107				return (0);
1108		}
1109	}
1110
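	/*
	 * If dmu_sync() returned 0 above, it took ownership of zgd and
	 * will invoke zvol_get_done() itself on I/O completion.  Every
	 * other path -- immediate writes (success or failure) and
	 * dmu_buf_hold()/dmu_sync() errors -- cleans up here.
	 */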
1111	zvol_get_done(zgd, error);
1112
1113	return (error);
1114}
1115
1116/*
1117 * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
1118 *
1119 * We store data in the log buffers if it's small enough.
1120 * Otherwise we will later flush the data out via dmu_sync().
1121 */
1122ssize_t zvol_immediate_write_sz = 32768;
1123
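/*
 * Summary of the write-state policy implemented below (a restatement of
 * the code, not an extra contract):
 *
 *	WR_INDIRECT	whole aligned block, no slog in latency mode,
 *			block larger than the immediate-write cutoff:
 *			leave the data in place and let dmu_sync()
 *			point the log record at it.
 *	WR_COPIED	synchronous write below the cutoff: copy the
 *			data into the itx immediately.
 *	WR_NEED_COPY	asynchronous write below the cutoff: copy the
 *			data only if the itx actually gets committed.
 */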
1124static void
1125zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
1126    boolean_t sync)
1127{
1128	uint32_t blocksize = zv->zv_volblocksize;
1129	zilog_t *zilog = zv->zv_zilog;
1130	boolean_t slogging;
1131	ssize_t immediate_write_sz;
1132
1133	if (zil_replaying(zilog, tx))
1134		return;
1135
1136	immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
1137	    ? 0 : zvol_immediate_write_sz;
1138
1139	slogging = spa_has_slogs(zilog->zl_spa) &&
1140	    (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
1141
1142	while (resid) {
1143		itx_t *itx;
1144		lr_write_t *lr;
1145		ssize_t len;
1146		itx_wr_state_t write_state;
1147
1148		/*
1149		 * Unlike zfs_log_write(), we can be called with
1150		 * up to DMU_MAX_ACCESS/2 (5MB) writes.
1151		 */
1152		if (blocksize > immediate_write_sz && !slogging &&
1153		    resid >= blocksize && off % blocksize == 0) {
1154			write_state = WR_INDIRECT; /* uses dmu_sync */
1155			len = blocksize;
1156		} else if (sync) {
1157			write_state = WR_COPIED;
1158			len = MIN(ZIL_MAX_LOG_DATA, resid);
1159		} else {
1160			write_state = WR_NEED_COPY;
1161			len = MIN(ZIL_MAX_LOG_DATA, resid);
1162		}
1163
1164		itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
1165		    (write_state == WR_COPIED ? len : 0));
1166		lr = (lr_write_t *)&itx->itx_lr;
1167		if (write_state == WR_COPIED && dmu_read(zv->zv_objset,
1168		    ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
1169			zil_itx_destroy(itx);
1170			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
1171			lr = (lr_write_t *)&itx->itx_lr;
1172			write_state = WR_NEED_COPY;
1173		}
1174
1175		itx->itx_wr_state = write_state;
1176		if (write_state == WR_NEED_COPY)
1177			itx->itx_sod += len;
1178		lr->lr_foid = ZVOL_OBJ;
1179		lr->lr_offset = off;
1180		lr->lr_length = len;
1181		lr->lr_blkoff = 0;
1182		BP_ZERO(&lr->lr_blkptr);
1183
1184		itx->itx_private = zv;
1185		itx->itx_sync = sync;
1186
1187		zil_itx_assign(zilog, itx, tx);
1188
1189		off += len;
1190		resid -= len;
1191	}
1192}
1193
1194#ifdef sun
1195static int
1196zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t origoffset,
1197    uint64_t size, boolean_t doread, boolean_t isdump)
1198{
1199	vdev_disk_t *dvd;
1200	int c;
1201	int numerrors = 0;
1202
1203	if (vd->vdev_ops == &vdev_mirror_ops ||
1204	    vd->vdev_ops == &vdev_replacing_ops ||
1205	    vd->vdev_ops == &vdev_spare_ops) {
1206		for (c = 0; c < vd->vdev_children; c++) {
1207			int err = zvol_dumpio_vdev(vd->vdev_child[c],
1208			    addr, offset, origoffset, size, doread, isdump);
1209			if (err != 0) {
1210				numerrors++;
1211			} else if (doread) {
1212				break;
1213			}
1214		}
1215	}
1216
1217	if (!vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_raidz_ops)
1218		return (numerrors < vd->vdev_children ? 0 : EIO);
1219
1220	if (doread && !vdev_readable(vd))
1221		return (SET_ERROR(EIO));
1222	else if (!doread && !vdev_writeable(vd))
1223		return (SET_ERROR(EIO));
1224
1225	if (vd->vdev_ops == &vdev_raidz_ops) {
1226		return (vdev_raidz_physio(vd,
1227		    addr, size, offset, origoffset, doread, isdump));
1228	}
1229
1230	offset += VDEV_LABEL_START_SIZE;
1231
1232	if (ddi_in_panic() || isdump) {
1233		ASSERT(!doread);
1234		if (doread)
1235			return (SET_ERROR(EIO));
1236		dvd = vd->vdev_tsd;
1237		ASSERT3P(dvd, !=, NULL);
1238		return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset),
1239		    lbtodb(size)));
1240	} else {
1241		dvd = vd->vdev_tsd;
1242		ASSERT3P(dvd, !=, NULL);
1243		return (vdev_disk_ldi_physio(dvd->vd_lh, addr, size,
1244		    offset, doread ? B_READ : B_WRITE));
1245	}
1246}
1247
1248static int
1249zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size,
1250    boolean_t doread, boolean_t isdump)
1251{
1252	vdev_t *vd;
1253	int error;
1254	zvol_extent_t *ze;
1255	spa_t *spa = dmu_objset_spa(zv->zv_objset);
1256
1257	/* Must be sector aligned, and not straddle a block boundary. */
1258	if (P2PHASE(offset, DEV_BSIZE) || P2PHASE(size, DEV_BSIZE) ||
1259	    P2BOUNDARY(offset, size, zv->zv_volblocksize)) {
1260		return (SET_ERROR(EINVAL));
1261	}
1262	ASSERT(size <= zv->zv_volblocksize);
1263
1264	/* Locate the extent this belongs to */
1265	ze = list_head(&zv->zv_extents);
1266	while (ze != NULL && offset >= ze->ze_nblks * zv->zv_volblocksize) {
1267		offset -= ze->ze_nblks * zv->zv_volblocksize;
1268		ze = list_next(&zv->zv_extents, ze);
1269	}
1270
1271	if (ze == NULL)
1272		return (SET_ERROR(EINVAL));
1273
1274	if (!ddi_in_panic())
1275		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
1276
1277	vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva));
1278	offset += DVA_GET_OFFSET(&ze->ze_dva);
1279	error = zvol_dumpio_vdev(vd, addr, offset, DVA_GET_OFFSET(&ze->ze_dva),
1280	    size, doread, isdump);
1281
1282	if (!ddi_in_panic())
1283		spa_config_exit(spa, SCL_STATE, FTAG);
1284
1285	return (error);
1286}
1287#endif	/* sun */
1288
1289int
1290zvol_strategy(struct bio *bp)
1291{
1292	zvol_state_t *zv = bp->bio_to->private;
1293	uint64_t off, volsize;
1294	size_t resid;
1295	char *addr;
1296	objset_t *os;
1297	rl_t *rl;
1298	int error = 0;
1299	boolean_t doread = (bp->bio_cmd == BIO_READ);
1300	boolean_t is_dumpified;
1301	boolean_t sync;
1302
1303	if (zv == NULL) {
1304		g_io_deliver(bp, ENXIO);
1305		return (0);
1306	}
1307
1308	if (bp->bio_cmd != BIO_READ && (zv->zv_flags & ZVOL_RDONLY)) {
1309		g_io_deliver(bp, EROFS);
1310		return (0);
1311	}
1312
1313	off = bp->bio_offset;
1314	volsize = zv->zv_volsize;
1315
1316	os = zv->zv_objset;
1317	ASSERT(os != NULL);
1318
1319	addr = bp->bio_data;
1320	resid = bp->bio_length;
1321
1322	if (resid > 0 && (off < 0 || off >= volsize)) {
1323		g_io_deliver(bp, EIO);
1324		return (0);
1325	}
1326
1327#ifdef illumos
1328	is_dumpified = zv->zv_flags & ZVOL_DUMPIFIED;
1329#else
1330	is_dumpified = B_FALSE;
1331#endif
1332	sync = !doread && !is_dumpified &&
1333	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
1334
1335	/*
1336	 * There must be no buffer changes when doing a dmu_sync() because
1337	 * we can't change the data whilst calculating the checksum.
1338	 */
1339	rl = zfs_range_lock(&zv->zv_znode, off, resid,
1340	    doread ? RL_READER : RL_WRITER);
1341
1342	if (bp->bio_cmd == BIO_DELETE) {
1343		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
1344		error = dmu_tx_assign(tx, TXG_WAIT);
1345		if (error != 0) {
1346			dmu_tx_abort(tx);
1347		} else {
1348			zvol_log_truncate(zv, tx, off, resid, B_TRUE);
1349			dmu_tx_commit(tx);
1350			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
1351			    off, resid);
1352			resid = 0;
1353		}
1354		goto unlock;
1355	}
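	/*
	 * Note on BIO_DELETE above: the TX_TRUNCATE record is committed
	 * before dmu_free_long_range() runs, so a crash in the middle of
	 * the free is repaired by ZIL replay re-running the free for the
	 * whole range (see zvol_replay_truncate()).
	 */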
1356
1357	while (resid != 0 && off < volsize) {
1358		size_t size = MIN(resid, zvol_maxphys);
1359#ifdef illumos
1360		if (is_dumpified) {
1361			size = MIN(size, P2END(off, zv->zv_volblocksize) - off);
1362			error = zvol_dumpio(zv, addr, off, size,
1363			    doread, B_FALSE);
1364		} else if (doread) {
1365#else
1366		if (doread) {
1367#endif
1368			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
1369			    DMU_READ_PREFETCH);
1370		} else {
1371			dmu_tx_t *tx = dmu_tx_create(os);
1372			dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
1373			error = dmu_tx_assign(tx, TXG_WAIT);
1374			if (error) {
1375				dmu_tx_abort(tx);
1376			} else {
1377				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
1378				zvol_log_write(zv, tx, off, size, sync);
1379				dmu_tx_commit(tx);
1380			}
1381		}
1382		if (error) {
1383			/* convert checksum errors into IO errors */
1384			if (error == ECKSUM)
1385				error = SET_ERROR(EIO);
1386			break;
1387		}
1388		off += size;
1389		addr += size;
1390		resid -= size;
1391	}
1392unlock:
1393	zfs_range_unlock(rl);
1394
1395	bp->bio_completed = bp->bio_length - resid;
1396	if (bp->bio_completed < bp->bio_length)
1397		bp->bio_error = (off > volsize ? EINVAL : error);
1398
1399	if (sync)
1400		zil_commit(zv->zv_zilog, ZVOL_OBJ);
1401	g_io_deliver(bp, bp->bio_error);	/* preserve any short-I/O error set above */
1402
1403	return (0);
1404}
1405
1406#ifdef sun
1407/*
1408 * Set the buffer count to the zvol maximum transfer.
1409 * Using our own routine instead of the default minphys()
1410 * means that for larger writes we write bigger buffers on X86
1411 * (128K instead of 56K) and flush the disk write cache less often
1412 * (every zvol_maxphys - currently DMU_MAX_ACCESS/2, i.e. 5MB) instead
1413 * of minphys (currently 56K on X86 and 128K on sparc).
1414 */
1415void
1416zvol_minphys(struct buf *bp)
1417{
1418	if (bp->b_bcount > zvol_maxphys)
1419		bp->b_bcount = zvol_maxphys;
1420}
1421
1422int
1423zvol_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblocks)
1424{
1425	minor_t minor = getminor(dev);
1426	zvol_state_t *zv;
1427	int error = 0;
1428	uint64_t size;
1429	uint64_t boff;
1430	uint64_t resid;
1431
1432	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1433	if (zv == NULL)
1434		return (SET_ERROR(ENXIO));
1435
1436	if ((zv->zv_flags & ZVOL_DUMPIFIED) == 0)
1437		return (SET_ERROR(EINVAL));
1438
1439	boff = ldbtob(blkno);
1440	resid = ldbtob(nblocks);
1441
1442	VERIFY3U(boff + resid, <=, zv->zv_volsize);
1443
1444	while (resid) {
1445		size = MIN(resid, P2END(boff, zv->zv_volblocksize) - boff);
1446		error = zvol_dumpio(zv, addr, boff, size, B_FALSE, B_TRUE);
1447		if (error)
1448			break;
1449		boff += size;
1450		addr += size;
1451		resid -= size;
1452	}
1453
1454	return (error);
1455}
1456
1457/*ARGSUSED*/
1458int
1459zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
1460{
1461	minor_t minor = getminor(dev);
1462	zvol_state_t *zv;
1463	uint64_t volsize;
1464	rl_t *rl;
1465	int error = 0;
1466
1467	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1468	if (zv == NULL)
1469		return (SET_ERROR(ENXIO));
1470
1471	volsize = zv->zv_volsize;
1472	if (uio->uio_resid > 0 &&
1473	    (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
1474		return (SET_ERROR(EIO));
1475
1476	if (zv->zv_flags & ZVOL_DUMPIFIED) {
1477		error = physio(zvol_strategy, NULL, dev, B_READ,
1478		    zvol_minphys, uio);
1479		return (error);
1480	}
1481
1482	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
1483	    RL_READER);
1484	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
1485		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
1486
1487		/* don't read past the end */
1488		if (bytes > volsize - uio->uio_loffset)
1489			bytes = volsize - uio->uio_loffset;
1490
1491		error =  dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes);
1492		if (error) {
1493			/* convert checksum errors into IO errors */
1494			if (error == ECKSUM)
1495				error = SET_ERROR(EIO);
1496			break;
1497		}
1498	}
1499	zfs_range_unlock(rl);
1500	return (error);
1501}
1502
1503/*ARGSUSED*/
1504int
1505zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
1506{
1507	minor_t minor = getminor(dev);
1508	zvol_state_t *zv;
1509	uint64_t volsize;
1510	rl_t *rl;
1511	int error = 0;
1512	boolean_t sync;
1513
1514	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1515	if (zv == NULL)
1516		return (SET_ERROR(ENXIO));
1517
1518	volsize = zv->zv_volsize;
1519	if (uio->uio_resid > 0 &&
1520	    (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
1521		return (SET_ERROR(EIO));
1522
1523	if (zv->zv_flags & ZVOL_DUMPIFIED) {
1524		error = physio(zvol_strategy, NULL, dev, B_WRITE,
1525		    zvol_minphys, uio);
1526		return (error);
1527	}
1528
1529	sync = !(zv->zv_flags & ZVOL_WCE) ||
1530	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
1531
1532	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
1533	    RL_WRITER);
1534	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
1535		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
1536		uint64_t off = uio->uio_loffset;
1537		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
1538
1539		if (bytes > volsize - off)	/* don't write past the end */
1540			bytes = volsize - off;
1541
1542		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
1543		error = dmu_tx_assign(tx, TXG_WAIT);
1544		if (error) {
1545			dmu_tx_abort(tx);
1546			break;
1547		}
1548		error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx);
1549		if (error == 0)
1550			zvol_log_write(zv, tx, off, bytes, sync);
1551		dmu_tx_commit(tx);
1552
1553		if (error)
1554			break;
1555	}
1556	zfs_range_unlock(rl);
1557	if (sync)
1558		zil_commit(zv->zv_zilog, ZVOL_OBJ);
1559	return (error);
1560}
1561
1562int
1563zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs)
1564{
1565	struct uuid uuid = EFI_RESERVED;
1566	efi_gpe_t gpe = { 0 };
1567	uint32_t crc;
1568	dk_efi_t efi;
1569	int length;
1570	char *ptr;
1571
1572	if (ddi_copyin(arg, &efi, sizeof (dk_efi_t), flag))
1573		return (SET_ERROR(EFAULT));
1574	ptr = (char *)(uintptr_t)efi.dki_data_64;
1575	length = efi.dki_length;
1576	/*
1577	 * Some clients may attempt to request a PMBR for the
1578	 * zvol.  Currently this interface will return EINVAL to
1579	 * such requests.  These requests could be supported by
1580	 * adding a check for lba == 0 and consing up an appropriate
1581	 * PMBR.
1582	 */
1583	if (efi.dki_lba < 1 || efi.dki_lba > 2 || length <= 0)
1584		return (SET_ERROR(EINVAL));
1585
1586	gpe.efi_gpe_StartingLBA = LE_64(34ULL);
1587	gpe.efi_gpe_EndingLBA = LE_64((vs >> bs) - 1);
1588	UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid);
1589
1590	if (efi.dki_lba == 1) {
1591		efi_gpt_t gpt = { 0 };
1592
1593		gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE);
1594		gpt.efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT);
1595		gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt));
1596		gpt.efi_gpt_MyLBA = LE_64(1ULL);
1597		gpt.efi_gpt_FirstUsableLBA = LE_64(34ULL);
1598		gpt.efi_gpt_LastUsableLBA = LE_64((vs >> bs) - 1);
1599		gpt.efi_gpt_PartitionEntryLBA = LE_64(2ULL);
1600		gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1);
1601		gpt.efi_gpt_SizeOfPartitionEntry =
1602		    LE_32(sizeof (efi_gpe_t));
1603		CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table);
1604		gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
1605		CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table);
1606		gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
1607		if (ddi_copyout(&gpt, ptr, MIN(sizeof (gpt), length),
1608		    flag))
1609			return (SET_ERROR(EFAULT));
1610		ptr += sizeof (gpt);
1611		length -= sizeof (gpt);
1612	}
1613	if (length > 0 && ddi_copyout(&gpe, ptr, MIN(sizeof (gpe),
1614	    length), flag))
1615		return (SET_ERROR(EFAULT));
1616	return (0);
1617}
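/*
 * Shape of the label emulated above (a summary, not normative):
 *
 *	LBA 0	PMBR -- not provided; such requests get EINVAL
 *	LBA 1	GPT header advertising one partition entry, CRC-protected
 *	LBA 2	partition entry array: a single EFI_RESERVED entry
 *		covering LBA 34 through the last usable LBA
 */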
1618
1619/*
1620 * BEGIN entry points to allow external callers access to the volume.
1621 */
1622/*
1623 * Return the volume parameters needed for access from an external caller.
1624 * These values are invariant as long as the volume is held open.
1625 */
1626int
1627zvol_get_volume_params(minor_t minor, uint64_t *blksize,
1628    uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl,
1629    void **rl_hdl, void **bonus_hdl)
1630{
1631	zvol_state_t *zv;
1632
1633	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1634	if (zv == NULL)
1635		return (SET_ERROR(ENXIO));
1636	if (zv->zv_flags & ZVOL_DUMPIFIED)
1637		return (SET_ERROR(ENXIO));
1638
1639	ASSERT(blksize && max_xfer_len && minor_hdl &&
1640	    objset_hdl && zil_hdl && rl_hdl && bonus_hdl);
1641
1642	*blksize = zv->zv_volblocksize;
1643	*max_xfer_len = (uint64_t)zvol_maxphys;
1644	*minor_hdl = zv;
1645	*objset_hdl = zv->zv_objset;
1646	*zil_hdl = zv->zv_zilog;
1647	*rl_hdl = &zv->zv_znode;
1648	*bonus_hdl = zv->zv_dbuf;
1649	return (0);
1650}
1651
1652/*
1653 * Return the current volume size to an external caller.
1654 * The size can change while the volume is open.
1655 */
1656uint64_t
1657zvol_get_volume_size(void *minor_hdl)
1658{
1659	zvol_state_t *zv = minor_hdl;
1660
1661	return (zv->zv_volsize);
1662}
1663
1664/*
1665 * Return the current WCE setting to an external caller.
1666 * The WCE setting can change while the volume is open.
1667 */
1668int
1669zvol_get_volume_wce(void *minor_hdl)
1670{
1671	zvol_state_t *zv = minor_hdl;
1672
1673	return ((zv->zv_flags & ZVOL_WCE) ? 1 : 0);
1674}
1675
1676/*
1677 * Entry point for external callers to zvol_log_write
1678 */
1679void
1680zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off, ssize_t resid,
1681    boolean_t sync)
1682{
1683	zvol_state_t *zv = minor_hdl;
1684
1685	zvol_log_write(zv, tx, off, resid, sync);
1686}
1687/*
1688 * END entry points to allow external callers access to the volume.
1689 */
1690#endif	/* sun */
1691
1692/*
1693 * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE.
1694 */
1695static void
1696zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len,
1697    boolean_t sync)
1698{
1699	itx_t *itx;
1700	lr_truncate_t *lr;
1701	zilog_t *zilog = zv->zv_zilog;
1702
1703	if (zil_replaying(zilog, tx))
1704		return;
1705
1706	itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
1707	lr = (lr_truncate_t *)&itx->itx_lr;
1708	lr->lr_foid = ZVOL_OBJ;
1709	lr->lr_offset = off;
1710	lr->lr_length = len;
1711
1712	itx->itx_sync = sync;
1713	zil_itx_assign(zilog, itx, tx);
1714}
1715
1716#ifdef sun
1717/*
1718 * Dirtbag ioctls to support mkfs(1M) for UFS filesystems.  See dkio(7I).
1719 * Also a dirtbag dkio ioctl for unmap/free-block functionality.
1720 */
1721/*ARGSUSED*/
1722int
1723zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
1724{
1725	zvol_state_t *zv;
1726	struct dk_cinfo dki;
1727	struct dk_minfo dkm;
1728	struct dk_callback *dkc;
1729	int error = 0;
1730	rl_t *rl;
1731
1732	mutex_enter(&spa_namespace_lock);
1733
1734	zv = zfsdev_get_soft_state(getminor(dev), ZSST_ZVOL);
1735
1736	if (zv == NULL) {
1737		mutex_exit(&spa_namespace_lock);
1738		return (SET_ERROR(ENXIO));
1739	}
1740	ASSERT(zv->zv_total_opens > 0);
1741
1742	switch (cmd) {
1743
1744	case DKIOCINFO:
1745		bzero(&dki, sizeof (dki));
1746		(void) strcpy(dki.dki_cname, "zvol");
1747		(void) strcpy(dki.dki_dname, "zvol");
1748		dki.dki_ctype = DKC_UNKNOWN;
1749		dki.dki_unit = getminor(dev);
1750		dki.dki_maxtransfer = 1 << (SPA_MAXBLOCKSHIFT - zv->zv_min_bs);
1751		mutex_exit(&spa_namespace_lock);
1752		if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag))
1753			error = SET_ERROR(EFAULT);
1754		return (error);
1755
1756	case DKIOCGMEDIAINFO:
1757		bzero(&dkm, sizeof (dkm));
1758		dkm.dki_lbsize = 1U << zv->zv_min_bs;
1759		dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
1760		dkm.dki_media_type = DK_UNKNOWN;
1761		mutex_exit(&spa_namespace_lock);
1762		if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag))
1763			error = SET_ERROR(EFAULT);
1764		return (error);
1765
1766	case DKIOCGETEFI:
1767		{
1768			uint64_t vs = zv->zv_volsize;
1769			uint8_t bs = zv->zv_min_bs;
1770
1771			mutex_exit(&spa_namespace_lock);
1772			error = zvol_getefi((void *)arg, flag, vs, bs);
1773			return (error);
1774		}
1775
1776	case DKIOCFLUSHWRITECACHE:
1777		dkc = (struct dk_callback *)arg;
1778		mutex_exit(&spa_namespace_lock);
1779		zil_commit(zv->zv_zilog, ZVOL_OBJ);
1780		if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) {
1781			(*dkc->dkc_callback)(dkc->dkc_cookie, error);
1782			error = 0;
1783		}
1784		return (error);
1785
1786	case DKIOCGETWCE:
1787		{
1788			int wce = (zv->zv_flags & ZVOL_WCE) ? 1 : 0;
1789			if (ddi_copyout(&wce, (void *)arg, sizeof (int),
1790			    flag))
1791				error = SET_ERROR(EFAULT);
1792			break;
1793		}
1794	case DKIOCSETWCE:
1795		{
1796			int wce;
1797			if (ddi_copyin((void *)arg, &wce, sizeof (int),
1798			    flag)) {
1799				error = SET_ERROR(EFAULT);
1800				break;
1801			}
1802			if (wce) {
1803				zv->zv_flags |= ZVOL_WCE;
1804				mutex_exit(&spa_namespace_lock);
1805			} else {
1806				zv->zv_flags &= ~ZVOL_WCE;
1807				mutex_exit(&spa_namespace_lock);
1808				zil_commit(zv->zv_zilog, ZVOL_OBJ);
1809			}
1810			return (0);
1811		}
1812
1813	case DKIOCGGEOM:
1814	case DKIOCGVTOC:
1815		/*
1816		 * commands using these (like prtvtoc) expect ENOTSUP
1817		 * since we're emulating an EFI label
1818		 */
1819		error = SET_ERROR(ENOTSUP);
1820		break;
1821
1822	case DKIOCDUMPINIT:
1823		rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
1824		    RL_WRITER);
1825		error = zvol_dumpify(zv);
1826		zfs_range_unlock(rl);
1827		break;
1828
1829	case DKIOCDUMPFINI:
1830		if (!(zv->zv_flags & ZVOL_DUMPIFIED))
1831			break;
1832		rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
1833		    RL_WRITER);
1834		error = zvol_dump_fini(zv);
1835		zfs_range_unlock(rl);
1836		break;
1837
1838	case DKIOCFREE:
1839	{
1840		dkioc_free_t df;
1841		dmu_tx_t *tx;
1842
1843		if (ddi_copyin((void *)arg, &df, sizeof (df), flag)) {
1844			error = SET_ERROR(EFAULT);
1845			break;
1846		}
1847
1848		/*
1849		 * Apply Postel's Law to length-checking.  If they overshoot,
1850		 * just blank out until the end, if there's a need to blank
1851		 * out anything.
1852		 */
1853		if (df.df_start >= zv->zv_volsize)
1854			break;	/* No need to do anything... */
1855		if (df.df_start + df.df_length > zv->zv_volsize)
1856			df.df_length = DMU_OBJECT_END;
1857
1858		rl = zfs_range_lock(&zv->zv_znode, df.df_start, df.df_length,
1859		    RL_WRITER);
1860		tx = dmu_tx_create(zv->zv_objset);
1861		error = dmu_tx_assign(tx, TXG_WAIT);
1862		if (error != 0) {
1863			dmu_tx_abort(tx);
1864		} else {
1865			zvol_log_truncate(zv, tx, df.df_start,
1866			    df.df_length, B_TRUE);
1867			dmu_tx_commit(tx);
1868			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
1869			    df.df_start, df.df_length);
1870		}
1871
1872		zfs_range_unlock(rl);
1873
1874		if (error == 0) {
1875			/*
1876			 * If the write-cache is disabled or 'sync' property
1877			 * is set to 'always' then treat this as a synchronous
1878			 * operation (i.e. commit to zil).
1879			 */
1880			if (!(zv->zv_flags & ZVOL_WCE) ||
1881			    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS))
1882				zil_commit(zv->zv_zilog, ZVOL_OBJ);
1883
1884			/*
1885			 * If the caller really wants synchronous writes, and
1886			 * can't wait for them, don't return until the write
1887			 * is done.
1888			 */
1889			if (df.df_flags & DF_WAIT_SYNC) {
1890				txg_wait_synced(
1891				    dmu_objset_pool(zv->zv_objset), 0);
1892			}
1893		}
1894		break;
1895	}
1896
1897	default:
1898		error = SET_ERROR(ENOTTY);
1899		break;
1900
1901	}
1902	mutex_exit(&spa_namespace_lock);
1903	return (error);
1904}
1905#endif	/* sun */
1906
1907int
1908zvol_busy(void)
1909{
1910	return (zvol_minors != 0);
1911}
1912
1913void
1914zvol_init(void)
1915{
1916	VERIFY(ddi_soft_state_init(&zfsdev_state, sizeof (zfs_soft_state_t),
1917	    1) == 0);
1918	ZFS_LOG(1, "ZVOL Initialized.");
1919}
1920
1921void
1922zvol_fini(void)
1923{
1924	ddi_soft_state_fini(&zfsdev_state);
1925	ZFS_LOG(1, "ZVOL Deinitialized.");
1926}
1927
1928#ifdef sun
1929/*ARGSUSED*/
1930static int
1931zfs_mvdev_dump_feature_check(void *arg, dmu_tx_t *tx)
1932{
1933	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
1934
1935	if (spa_feature_is_active(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP))
1936		return (1);
1937	return (0);
1938}
1939
1940/*ARGSUSED*/
1941static void
1942zfs_mvdev_dump_activate_feature_sync(void *arg, dmu_tx_t *tx)
1943{
1944	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
1945
1946	spa_feature_incr(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, tx);
1947}
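/*
 * The check/sync pair above is consumed by dsl_sync_task() in
 * zvol_dump_init(): once the feature is already active the check
 * callback returns nonzero, dsl_sync_task() skips the sync callback,
 * and the feature refcount is not bumped a second time.
 */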
1948
1949static int
1950zvol_dump_init(zvol_state_t *zv, boolean_t resize)
1951{
1952	dmu_tx_t *tx;
1953	int error;
1954	objset_t *os = zv->zv_objset;
1955	spa_t *spa = dmu_objset_spa(os);
1956	vdev_t *vd = spa->spa_root_vdev;
1957	nvlist_t *nv = NULL;
1958	uint64_t version = spa_version(spa);
1959	enum zio_checksum checksum;
1960
1961	ASSERT(MUTEX_HELD(&spa_namespace_lock));
1962	ASSERT(vd->vdev_ops == &vdev_root_ops);
1963
1964	error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 0,
1965	    DMU_OBJECT_END);
1966	/* wait for dmu_free_long_range to actually free the blocks */
1967	txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
1968
1969	/*
1970	 * If the pool on which the dump device is being initialized has more
1971	 * than one child vdev, check that the MULTI_VDEV_CRASH_DUMP feature is
1972	 * enabled.  If so, bump that feature's counter to indicate that the
1973	 * feature is active. We also check the vdev type to handle the
1974	 * following case:
1975	 *   # zpool create test raidz disk1 disk2 disk3
1976	 *   Now have spa_root_vdev->vdev_children == 1 (the raidz vdev),
1977	 *   the raidz vdev itself has 3 children.
1978	 */
1979	if (vd->vdev_children > 1 || vd->vdev_ops == &vdev_raidz_ops) {
1980		if (!spa_feature_is_enabled(spa,
1981		    SPA_FEATURE_MULTI_VDEV_CRASH_DUMP))
1982			return (SET_ERROR(ENOTSUP));
1983		(void) dsl_sync_task(spa_name(spa),
1984		    zfs_mvdev_dump_feature_check,
1985		    zfs_mvdev_dump_activate_feature_sync, NULL, 2);
1986	}
1987
1988	tx = dmu_tx_create(os);
1989	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
1990	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
1991	error = dmu_tx_assign(tx, TXG_WAIT);
1992	if (error) {
1993		dmu_tx_abort(tx);
1994		return (error);
1995	}
1996
1997	/*
1998	 * If MULTI_VDEV_CRASH_DUMP is active, use the NOPARITY checksum
1999	 * function.  Otherwise, use the old default -- OFF.
2000	 */
2001	checksum = spa_feature_is_active(spa,
2002	    SPA_FEATURE_MULTI_VDEV_CRASH_DUMP) ? ZIO_CHECKSUM_NOPARITY :
2003	    ZIO_CHECKSUM_OFF;
2004
2005	/*
2006	 * If we are resizing the dump device then we only need to
2007	 * update the refreservation to match the newly updated
2008	 * zvol size. Otherwise, we save off the zvol's original
2009	 * properties so that we can restore them if it is ever undumpified.
2010	 */
2011	if (resize) {
2012		error = zap_update(os, ZVOL_ZAP_OBJ,
2013		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
2014		    &zv->zv_volsize, tx);
2015	} else {
2016		uint64_t checksum, compress, refresrv, vbs, dedup;
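		/*
		 * Note: this "checksum" (the saved property value)
		 * deliberately shadows the enum zio_checksum computed
		 * above; the enum is consumed only after this block,
		 * when the nvlist for the !resize case is built.
		 */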
2017
2018		error = dsl_prop_get_integer(zv->zv_name,
2019		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL);
2020		error = error ? error : dsl_prop_get_integer(zv->zv_name,
2021		    zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum, NULL);
2022		error = error ? error : dsl_prop_get_integer(zv->zv_name,
2023		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &refresrv, NULL);
2024		error = error ? error : dsl_prop_get_integer(zv->zv_name,
2025		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs, NULL);
2026		if (version >= SPA_VERSION_DEDUP) {
2027			error = error ? error :
2028			    dsl_prop_get_integer(zv->zv_name,
2029			    zfs_prop_to_name(ZFS_PROP_DEDUP), &dedup, NULL);
2030		}
2031
2032		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
2033		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1,
2034		    &compress, tx);
2035		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
2036		    zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum, tx);
2037		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
2038		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
2039		    &refresrv, tx);
2040		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
2041		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
2042		    &vbs, tx);
2043		error = error ? error : dmu_object_set_blocksize(
2044		    os, ZVOL_OBJ, SPA_MAXBLOCKSIZE, 0, tx);
2045		if (version >= SPA_VERSION_DEDUP) {
2046			error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
2047			    zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1,
2048			    &dedup, tx);
2049		}
2050		if (error == 0)
2051			zv->zv_volblocksize = SPA_MAXBLOCKSIZE;
2052	}
2053	dmu_tx_commit(tx);
2054
2055	/*
2056	 * We only need update the zvol's property if we are initializing
2057	 * the dump area for the first time.
2058	 */
2059	if (!resize) {
2060		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2061		VERIFY(nvlist_add_uint64(nv,
2062		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 0) == 0);
2063		VERIFY(nvlist_add_uint64(nv,
2064		    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
2065		    ZIO_COMPRESS_OFF) == 0);
2066		VERIFY(nvlist_add_uint64(nv,
2067		    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
2068		    checksum) == 0);
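		/*
		 * The dedup property shares the zio_checksum enum space,
		 * so ZIO_CHECKSUM_OFF here simply turns dedup off.
		 */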
		if (version >= SPA_VERSION_DEDUP) {
			VERIFY(nvlist_add_uint64(nv,
			    zfs_prop_to_name(ZFS_PROP_DEDUP),
			    ZIO_CHECKSUM_OFF) == 0);
		}

		error = zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
		    nv, NULL);
		nvlist_free(nv);

		if (error)
			return (error);
	}

	/* Allocate the space for the dump */
	error = zvol_prealloc(zv);
	return (error);
}

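/*
 * Configure the zvol as a crash dump device: (re)initialize the dump
 * area if its recorded size does not match the current volume size,
 * build the LBA map used by the dump path, and record the new dump
 * size in the zvol's ZAP object.
 */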
static int
zvol_dumpify(zvol_state_t *zv)
{
	int error = 0;
	uint64_t dumpsize = 0;
	dmu_tx_t *tx;
	objset_t *os = zv->zv_objset;

	if (zv->zv_flags & ZVOL_RDONLY)
		return (SET_ERROR(EROFS));

	if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE,
	    8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) {
		boolean_t resize = (dumpsize > 0);

		if ((error = zvol_dump_init(zv, resize)) != 0) {
			(void) zvol_dump_fini(zv);
			return (error);
		}
	}

	/*
	 * Build up our LBA mapping.
	 */
	error = zvol_get_lbas(zv);
	if (error) {
		(void) zvol_dump_fini(zv);
		return (error);
	}

	tx = dmu_tx_create(os);
	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		(void) zvol_dump_fini(zv);
		return (error);
	}

	zv->zv_flags |= ZVOL_DUMPIFIED;
	error = zap_update(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, 8, 1,
	    &zv->zv_volsize, tx);
	dmu_tx_commit(tx);

	if (error) {
		(void) zvol_dump_fini(zv);
		return (error);
	}

	txg_wait_synced(dmu_objset_pool(os), 0);
	return (0);
}

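/*
 * Undo zvol_dumpify(): restore the property values saved in the zvol's
 * ZAP object, free the preallocated dump blocks, and reset the block
 * size to the original volblocksize.
 */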
static int
zvol_dump_fini(zvol_state_t *zv)
{
	dmu_tx_t *tx;
	objset_t *os = zv->zv_objset;
	nvlist_t *nv;
	int error = 0;
	uint64_t checksum, compress, refresrv, vbs, dedup;
	uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset));

	/*
	 * Attempt to restore the zvol back to its pre-dumpified state.
	 * This is a best-effort attempt, as it's possible that not all
	 * of these properties were initialized during the dumpify process
	 * (e.g. on an error during zvol_dump_init).
	 */

	tx = dmu_tx_create(os);
	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		return (error);
	}
	(void) zap_remove(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, tx);
	dmu_tx_commit(tx);

	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
	    zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum);
	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
	    zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &compress);
	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &refresrv);
	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, &vbs);

	VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	(void) nvlist_add_uint64(nv,
	    zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum);
	(void) nvlist_add_uint64(nv,
	    zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress);
	(void) nvlist_add_uint64(nv,
	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv);
	if (version >= SPA_VERSION_DEDUP &&
	    zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
	    zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1, &dedup) == 0) {
		(void) nvlist_add_uint64(nv,
		    zfs_prop_to_name(ZFS_PROP_DEDUP), dedup);
	}
	(void) zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
	    nv, NULL);
	nvlist_free(nv);

	zvol_free_extents(zv);
	zv->zv_flags &= ~ZVOL_DUMPIFIED;
	(void) dmu_free_long_range(os, ZVOL_OBJ, 0, DMU_OBJECT_END);
	/* wait for dmu_free_long_range to actually free the blocks */
	txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
	tx = dmu_tx_create(os);
	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		return (error);
	}
	if (dmu_object_set_blocksize(os, ZVOL_OBJ, vbs, 0, tx) == 0)
		zv->zv_volblocksize = vbs;
	dmu_tx_commit(tx);

	return (0);
}
#endif	/* sun */

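/*
 * Allocate a zvol_state_t and attach it to a new GEOM geom/provider
 * pair.  The provider is created without a media size; the caller
 * (see zvol_create_minor()) sets it once the volume size is known and
 * then makes the provider usable via zvol_geom_run().
 */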
static zvol_state_t *
zvol_geom_create(const char *name)
{
	struct g_provider *pp;
	struct g_geom *gp;
	zvol_state_t *zv;

	gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
	gp->start = zvol_geom_start;
	gp->access = zvol_geom_access;
	pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
	pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
	pp->sectorsize = DEV_BSIZE;

	zv = kmem_zalloc(sizeof(*zv), KM_SLEEP);
	zv->zv_provider = pp;
	zv->zv_state = 0;
	bioq_init(&zv->zv_queue);
	mtx_init(&zv->zv_queue_mtx, "zvol", NULL, MTX_DEF);

	pp->private = zv;

	return (zv);
}

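/*
 * Mark the provider as usable (error 0) and start the per-volume
 * worker thread that services requests queued by zvol_geom_start().
 */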
static void
zvol_geom_run(zvol_state_t *zv)
{
	struct g_provider *pp;

	pp = zv->zv_provider;
	g_error_provider(pp, 0);

	kproc_kthread_add(zvol_geom_worker, zv, &zfsproc, NULL, 0, 0,
	    "zfskern", "zvol %s", pp->name + sizeof(ZVOL_DRIVER));
}

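/*
 * Tear down a zvol created by zvol_geom_create().  Setting zv_state to
 * 1 asks the worker thread to exit; the worker acknowledges by setting
 * it to 2 before calling kthread_exit(), at which point it is safe to
 * destroy the queue mutex and wither the geom.
 */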
static void
zvol_geom_destroy(zvol_state_t *zv)
{
	struct g_provider *pp;

	g_topology_assert();

	mtx_lock(&zv->zv_queue_mtx);
	zv->zv_state = 1;
	wakeup_one(&zv->zv_queue);
	while (zv->zv_state != 2)
		msleep(&zv->zv_state, &zv->zv_queue_mtx, 0, "zvol:w", 0);
	mtx_destroy(&zv->zv_queue_mtx);

	pp = zv->zv_provider;
	zv->zv_provider = NULL;
	pp->private = NULL;
	g_wither_geom(pp->geom, ENXIO);

	kmem_free(zv, sizeof(*zv));
}

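/*
 * GEOM access method.  The acr/acw/ace arguments are deltas to the
 * provider's read, write and exclusive access counts; a positive sum
 * translates into a zvol_open() and a negative sum into a
 * zvol_close().
 */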
static int
zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
{
	int count, error, flags;

	g_topology_assert();

	/*
	 * To keep things simple, we expect either an open or a close, but
	 * not both at the same time.
	 */
	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
	    (acr <= 0 && acw <= 0 && ace <= 0),
	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
	    pp->name, acr, acw, ace));

	if (pp->private == NULL) {
		if (acr <= 0 && acw <= 0 && ace <= 0)
			return (0);
		return (pp->error);
	}

	/*
	 * We don't pass the FEXCL flag to zvol_open()/zvol_close() if
	 * ace != 0, because GEOM already handles that and handles it a bit
	 * differently.  GEOM allows for multiple read/exclusive consumers,
	 * while ZFS allows only one exclusive consumer, no matter whether
	 * it is a reader or a writer.  I prefer the way GEOM works, so I'll
	 * leave it to GEOM to decide what to do.
	 */

	count = acr + acw + ace;
	if (count == 0)
		return (0);

	flags = 0;
	if (acr != 0 || ace != 0)
		flags |= FREAD;
	if (acw != 0)
		flags |= FWRITE;

	g_topology_unlock();
	if (count > 0)
		error = zvol_open(pp, flags, count);
	else
		error = zvol_close(pp, flags, -count);
	g_topology_lock();
	return (error);
}

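/*
 * GEOM start method.  BIO_FLUSH, BIO_READ, BIO_WRITE and BIO_DELETE
 * are handled inline when the calling thread is allowed to sleep;
 * otherwise they are queued for the worker thread.  Everything else
 * is rejected with EOPNOTSUPP.
 */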
static void
zvol_geom_start(struct bio *bp)
{
	zvol_state_t *zv;
	boolean_t first;

	zv = bp->bio_to->private;
	ASSERT(zv != NULL);
	switch (bp->bio_cmd) {
	case BIO_FLUSH:
		if (!THREAD_CAN_SLEEP())
			goto enqueue;
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
		g_io_deliver(bp, 0);
		break;
	case BIO_READ:
	case BIO_WRITE:
	case BIO_DELETE:
		if (!THREAD_CAN_SLEEP())
			goto enqueue;
		zvol_strategy(bp);
		break;
	case BIO_GETATTR:
	default:
		g_io_deliver(bp, EOPNOTSUPP);
		break;
	}
	return;

enqueue:
	mtx_lock(&zv->zv_queue_mtx);
	first = (bioq_first(&zv->zv_queue) == NULL);
	bioq_insert_tail(&zv->zv_queue, bp);
	mtx_unlock(&zv->zv_queue_mtx);
	if (first)
		wakeup_one(&zv->zv_queue);
}

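/*
 * Per-volume worker thread.  Runs at PRIBIO priority, draining bios
 * queued by zvol_geom_start() until zvol_geom_destroy() requests an
 * exit by setting zv_state to 1.
 */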
static void
zvol_geom_worker(void *arg)
{
	zvol_state_t *zv;
	struct bio *bp;

	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);

	zv = arg;
	for (;;) {
		mtx_lock(&zv->zv_queue_mtx);
		bp = bioq_takefirst(&zv->zv_queue);
		if (bp == NULL) {
			if (zv->zv_state == 1) {
				zv->zv_state = 2;
				wakeup(&zv->zv_state);
				mtx_unlock(&zv->zv_queue_mtx);
				kthread_exit();
			}
			msleep(&zv->zv_queue, &zv->zv_queue_mtx, PRIBIO | PDROP,
			    "zvol:io", 0);
			continue;
		}
		mtx_unlock(&zv->zv_queue_mtx);
		switch (bp->bio_cmd) {
		case BIO_FLUSH:
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
			g_io_deliver(bp, 0);
			break;
		case BIO_READ:
		case BIO_WRITE:
			zvol_strategy(bp);
			break;
		}
	}
}

extern boolean_t dataset_name_hidden(const char *name);

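/*
 * Create device minors for all snapshots of the given volume.  The
 * snapshot names are enumerated with dmu_snapshot_list_next() under
 * the pool config lock, and each "<name>@<snap>" gets its own minor.
 */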
static int
zvol_create_snapshots(objset_t *os, const char *name)
{
	uint64_t cookie, obj;
	char *sname;
	int error, len;

	cookie = obj = 0;
	sname = kmem_alloc(MAXPATHLEN, KM_SLEEP);

#if 0
	(void) dmu_objset_find(name, dmu_objset_prefetch, NULL,
	    DS_FIND_SNAPSHOTS);
#endif

	for (;;) {
		len = snprintf(sname, MAXPATHLEN, "%s@", name);
		if (len >= MAXPATHLEN) {
			dmu_objset_rele(os, FTAG);
			error = ENAMETOOLONG;
			break;
		}

		dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
		error = dmu_snapshot_list_next(os, MAXPATHLEN - len,
		    sname + len, &obj, &cookie, NULL);
		dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
		if (error != 0) {
			if (error == ENOENT)
				error = 0;
			break;
		}

		if ((error = zvol_create_minor(sname)) != 0) {
			printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n",
			    sname, error);
			break;
		}
	}

	kmem_free(sname, MAXPATHLEN);
	return (error);
}

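/*
 * Recursively create device minors for the named dataset and all of
 * its children.  For a ZVOL dataset, the minor is created along with
 * the minors for its snapshots; for a filesystem, the child datasets
 * are walked with dmu_dir_list_next() and visited recursively.  Note
 * that the objset hold is dropped around each recursive call and
 * re-acquired afterwards.
 */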
int
zvol_create_minors(const char *name)
{
	uint64_t cookie;
	objset_t *os;
	char *osname, *p;
	int error, len;

	if (dataset_name_hidden(name))
		return (0);

	if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
		printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n",
		    name, error);
		return (error);
	}
	if (dmu_objset_type(os) == DMU_OST_ZVOL) {
		dsl_dataset_long_hold(os->os_dsl_dataset, FTAG);
		dsl_pool_rele(dmu_objset_pool(os), FTAG);
		if ((error = zvol_create_minor(name)) == 0)
			error = zvol_create_snapshots(os, name);
		else {
			printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n",
			    name, error);
		}
		dsl_dataset_long_rele(os->os_dsl_dataset, FTAG);
		dsl_dataset_rele(os->os_dsl_dataset, FTAG);
		return (error);
	}
	if (dmu_objset_type(os) != DMU_OST_ZFS) {
		dmu_objset_rele(os, FTAG);
		return (0);
	}

	osname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
	if (snprintf(osname, MAXPATHLEN, "%s/", name) >= MAXPATHLEN) {
		dmu_objset_rele(os, FTAG);
		kmem_free(osname, MAXPATHLEN);
		return (ENOENT);
	}
	p = osname + strlen(osname);
	len = MAXPATHLEN - (p - osname);

#if 0
	/* Prefetch the datasets. */
	cookie = 0;
	while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0) {
		if (!dataset_name_hidden(osname))
			(void) dmu_objset_prefetch(osname, NULL);
	}
#endif

	cookie = 0;
	while (dmu_dir_list_next(os, MAXPATHLEN - (p - osname), p, NULL,
	    &cookie) == 0) {
		dmu_objset_rele(os, FTAG);
		(void) zvol_create_minors(osname);
		if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
			printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n",
			    name, error);
			kmem_free(osname, MAXPATHLEN);
			return (error);
		}
	}

	dmu_objset_rele(os, FTAG);
	kmem_free(osname, MAXPATHLEN);
	return (0);
}

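/*
 * Rename a single minor: wither the old provider and hang a fresh one
 * with the new name off the existing geom, carrying over the zvol
 * state and media size.
 */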
static void
zvol_rename_minor(struct g_geom *gp, const char *newname)
{
	struct g_provider *pp;
	zvol_state_t *zv;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	g_topology_assert();

	pp = LIST_FIRST(&gp->provider);
	ASSERT(pp != NULL);
	zv = pp->private;
	ASSERT(zv != NULL);

	zv->zv_provider = NULL;
	g_wither_provider(pp, ENXIO);

	pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
	pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
	pp->sectorsize = DEV_BSIZE;
	pp->mediasize = zv->zv_volsize;
	pp->private = zv;
	zv->zv_provider = pp;
	strlcpy(zv->zv_name, newname, sizeof(zv->zv_name));
	g_error_provider(pp, 0);
}

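/*
 * Rename every minor whose name is either exactly oldname or a child
 * or snapshot of it (i.e. oldname followed by '/' or '@'), replacing
 * the oldname prefix with newname.
 */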
void
zvol_rename_minors(const char *oldname, const char *newname)
{
	char name[MAXPATHLEN];
	struct g_provider *pp;
	struct g_geom *gp;
	size_t oldnamelen;
	zvol_state_t *zv;

	oldnamelen = strlen(oldname);

	DROP_GIANT();
	mutex_enter(&spa_namespace_lock);
	g_topology_lock();

	LIST_FOREACH(gp, &zfs_zvol_class.geom, geom) {
		pp = LIST_FIRST(&gp->provider);
		if (pp == NULL)
			continue;
		zv = pp->private;
		if (zv == NULL)
			continue;
		if (strcmp(zv->zv_name, oldname) == 0) {
			zvol_rename_minor(gp, newname);
		} else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 &&
		    (zv->zv_name[oldnamelen] == '/' ||
		     zv->zv_name[oldnamelen] == '@')) {
			snprintf(name, sizeof(name), "%s%c%s", newname,
			    zv->zv_name[oldnamelen],
			    zv->zv_name + oldnamelen + 1);
			zvol_rename_minor(gp, name);
		}
	}

	g_topology_unlock();
	mutex_exit(&spa_namespace_lock);
	PICKUP_GIANT();
}
