zvol.c revision 263987
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 *
24 * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
25 * All rights reserved.
26 * Copyright (c) 2013 by Delphix. All rights reserved.
27 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
28 */
29
30/* Portions Copyright 2010 Robert Milkowski */
31/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
32
33/*
34 * ZFS volume emulation driver.
35 *
36 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
37 * Volumes are accessed through the symbolic links named:
38 *
39 * /dev/zvol/dsk/<pool_name>/<dataset_name>
40 * /dev/zvol/rdsk/<pool_name>/<dataset_name>
41 *
42 * These links are created by the /dev filesystem (sdev_zvolops.c).
43 * Volumes are persistent through reboot.  No user command needs to be
44 * run before opening and using a device.
45 *
46 * FreeBSD notes.
47 * On FreeBSD ZVOLs are simply GEOM providers like any other storage device
48 * in the system.
49 */
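/*
 * Example (illustrative): "zfs create -V 10G tank/vol" creates a
 * volume that appears as /dev/zvol/{dsk,rdsk}/tank/vol on illumos
 * and as the GEOM provider /dev/zvol/tank/vol on FreeBSD.
 */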
50
51#include <sys/types.h>
52#include <sys/param.h>
53#include <sys/kernel.h>
54#include <sys/errno.h>
55#include <sys/uio.h>
56#include <sys/bio.h>
57#include <sys/buf.h>
58#include <sys/kmem.h>
59#include <sys/conf.h>
60#include <sys/cmn_err.h>
61#include <sys/stat.h>
62#include <sys/zap.h>
63#include <sys/spa.h>
64#include <sys/spa_impl.h>
65#include <sys/zio.h>
66#include <sys/dmu_traverse.h>
67#include <sys/dnode.h>
68#include <sys/dsl_dataset.h>
69#include <sys/dsl_prop.h>
70#include <sys/dkio.h>
71#include <sys/byteorder.h>
72#include <sys/sunddi.h>
73#include <sys/dirent.h>
74#include <sys/policy.h>
75#include <sys/fs/zfs.h>
76#include <sys/zfs_ioctl.h>
77#include <sys/zil.h>
78#include <sys/refcount.h>
79#include <sys/zfs_znode.h>
80#include <sys/zfs_rlock.h>
81#include <sys/vdev_impl.h>
82#include <sys/vdev_raidz.h>
83#include <sys/zvol.h>
84#include <sys/zil_impl.h>
85#include <sys/dbuf.h>
86#include <sys/dmu_tx.h>
87#include <sys/zfeature.h>
88#include <sys/zio_checksum.h>
89
90#include <geom/geom.h>
91
92#include "zfs_namecheck.h"
93
94struct g_class zfs_zvol_class = {
95	.name = "ZFS::ZVOL",
96	.version = G_VERSION,
97};
98
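/* Register the class with GEOM when the module loads. */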
99DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
100
101void *zfsdev_state;
102static char *zvol_tag = "zvol_tag";
103
104#define	ZVOL_DUMPSIZE		"dumpsize"
105
106/*
107 * The spa_namespace_lock protects the zfsdev_state structure from being
108 * modified while it's being used, e.g. an open that comes in before a
109 * create finishes.  It also protects temporary opens of the dataset so that,
110 * e.g., an open doesn't get a spurious EBUSY.
111 */
112static uint32_t zvol_minors;
113
114typedef struct zvol_extent {
115	list_node_t	ze_node;
116	dva_t		ze_dva;		/* dva associated with this extent */
117	uint64_t	ze_nblks;	/* number of blocks in extent */
118} zvol_extent_t;
119
120/*
121 * The in-core state of each volume.
122 */
123typedef struct zvol_state {
124	char		zv_name[MAXPATHLEN]; /* pool/dd name */
125	uint64_t	zv_volsize;	/* amount of space we advertise */
126	uint64_t	zv_volblocksize; /* volume block size */
127	struct g_provider *zv_provider;	/* GEOM provider */
128	uint8_t		zv_min_bs;	/* minimum addressable block shift */
129	uint8_t		zv_flags;	/* readonly, dumpified, etc. */
130	objset_t	*zv_objset;	/* objset handle */
131	uint32_t	zv_total_opens;	/* total open count */
132	zilog_t		*zv_zilog;	/* ZIL handle */
133	list_t		zv_extents;	/* List of extents for dump */
134	znode_t		zv_znode;	/* for range locking */
135	dmu_buf_t	*zv_dbuf;	/* bonus handle */
136	int		zv_state;	/* GEOM worker thread state */
137	struct bio_queue_head zv_queue;	/* queue of pending bios */
138	struct mtx	zv_queue_mtx;	/* zv_queue mutex */
139} zvol_state_t;
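/*
 * Lifecycle sketch (FreeBSD): zvol_geom_create() allocates the state
 * and its provider, zvol_first_open()/zvol_last_close() take and drop
 * the objset hold as zv_total_opens moves between 0 and 1, and
 * zvol_geom_destroy() tears the provider down again.
 */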
140
141/*
142 * zvol specific flags
143 */
144#define	ZVOL_RDONLY	0x1
145#define	ZVOL_DUMPIFIED	0x2
146#define	ZVOL_EXCL	0x4
147#define	ZVOL_WCE	0x8
148
149/*
150 * zvol maximum transfer in one DMU tx.
151 */
152int zvol_maxphys = DMU_MAX_ACCESS/2;
153
154extern int zfs_set_prop_nvlist(const char *, zprop_source_t,
155    nvlist_t *, nvlist_t *);
156static int zvol_remove_zv(zvol_state_t *);
157static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio);
158static int zvol_dumpify(zvol_state_t *zv);
159static int zvol_dump_fini(zvol_state_t *zv);
160static int zvol_dump_init(zvol_state_t *zv, boolean_t resize);
161
162static zvol_state_t *zvol_geom_create(const char *name);
163static void zvol_geom_run(zvol_state_t *zv);
164static void zvol_geom_destroy(zvol_state_t *zv);
165static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
166static void zvol_geom_start(struct bio *bp);
167static void zvol_geom_worker(void *arg);
168
169static void
170zvol_size_changed(zvol_state_t *zv)
171{
172#ifdef sun
173	dev_t dev = makedevice(maj, min);
174
175	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
176	    "Size", volsize) == DDI_SUCCESS);
177	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
178	    "Nblocks", lbtodb(volsize)) == DDI_SUCCESS);
179
180	/* Notify specfs to invalidate the cached size */
181	spec_size_invalidate(dev, VBLK);
182	spec_size_invalidate(dev, VCHR);
183#else	/* !sun */
184	struct g_provider *pp;
185
186	pp = zv->zv_provider;
187	if (pp == NULL)
188		return;
189	g_topology_lock();
190	g_resize_provider(pp, zv->zv_volsize);
191	g_topology_unlock();
192#endif	/* !sun */
193}
194
195int
196zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
197{
198	if (volsize == 0)
199		return (SET_ERROR(EINVAL));
200
201	if (volsize % blocksize != 0)
202		return (SET_ERROR(EINVAL));
203
204#ifdef _ILP32
205	if (volsize - 1 > SPEC_MAXOFFSET_T)
206		return (SET_ERROR(EOVERFLOW));
207#endif
208	return (0);
209}
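/*
 * E.g., with an 8K volblocksize, a 1G volsize (131072 blocks) passes
 * the checks above, while 1G + 512 fails with EINVAL.
 */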
210
211int
212zvol_check_volblocksize(uint64_t volblocksize)
213{
214	if (volblocksize < SPA_MINBLOCKSIZE ||
215	    volblocksize > SPA_MAXBLOCKSIZE ||
216	    !ISP2(volblocksize))
217		return (SET_ERROR(EDOM));
218
219	return (0);
220}
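/*
 * I.e., volblocksize must be a power of two between SPA_MINBLOCKSIZE
 * (512) and SPA_MAXBLOCKSIZE (128K here): 4096 is accepted, while
 * 3000 and 256K get EDOM.
 */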
221
222int
223zvol_get_stats(objset_t *os, nvlist_t *nv)
224{
225	int error;
226	dmu_object_info_t doi;
227	uint64_t val;
228
229	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
230	if (error)
231		return (error);
232
233	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);
234
235	error = dmu_object_info(os, ZVOL_OBJ, &doi);
236
237	if (error == 0) {
238		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
239		    doi.doi_data_block_size);
240	}
241
242	return (error);
243}
244
245static zvol_state_t *
246zvol_minor_lookup(const char *name)
247{
248	struct g_provider *pp;
249	struct g_geom *gp;
250	zvol_state_t *zv = NULL;
251
252	ASSERT(MUTEX_HELD(&spa_namespace_lock));
253
254	g_topology_lock();
255	LIST_FOREACH(gp, &zfs_zvol_class.geom, geom) {
256		pp = LIST_FIRST(&gp->provider);
257		if (pp == NULL)
258			continue;
259		zv = pp->private;
260		if (zv == NULL)
261			continue;
262		if (strcmp(zv->zv_name, name) == 0)
263			break;
264	}
265	g_topology_unlock();
266
267	return (gp != NULL ? zv : NULL);
268}
269
270/* extent mapping arg */
271struct maparg {
272	zvol_state_t	*ma_zv;
273	uint64_t	ma_blks;
274};
275
276/*ARGSUSED*/
277static int
278zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
279    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
280{
281	struct maparg *ma = arg;
282	zvol_extent_t *ze;
283	int bs = ma->ma_zv->zv_volblocksize;
284
285	if (BP_IS_HOLE(bp) ||
286	    zb->zb_object != ZVOL_OBJ || zb->zb_level != 0)
287		return (0);
288
289	VERIFY3U(ma->ma_blks, ==, zb->zb_blkid);
290	ma->ma_blks++;
291
292	/* Abort immediately if we have encountered gang blocks */
293	if (BP_IS_GANG(bp))
294		return (SET_ERROR(EFRAGS));
295
296	/*
297	 * See if the block is at the end of the previous extent.
298	 */
299	ze = list_tail(&ma->ma_zv->zv_extents);
300	if (ze &&
301	    DVA_GET_VDEV(BP_IDENTITY(bp)) == DVA_GET_VDEV(&ze->ze_dva) &&
302	    DVA_GET_OFFSET(BP_IDENTITY(bp)) ==
303	    DVA_GET_OFFSET(&ze->ze_dva) + ze->ze_nblks * bs) {
304		ze->ze_nblks++;
305		return (0);
306	}
307
308	dprintf_bp(bp, "%s", "next blkptr:");
309
310	/* start a new extent */
311	ze = kmem_zalloc(sizeof (zvol_extent_t), KM_SLEEP);
312	ze->ze_dva = bp->blk_dva[0];	/* structure assignment */
313	ze->ze_nblks = 1;
314	list_insert_tail(&ma->ma_zv->zv_extents, ze);
315	return (0);
316}
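/*
 * Illustration: three blocks at DVA offsets O, O + bs and O + 2 * bs
 * on the same vdev collapse into a single extent { ze_dva = O,
 * ze_nblks = 3 }; a gap or a change of vdev starts a new extent.
 */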
317
318static void
319zvol_free_extents(zvol_state_t *zv)
320{
321	zvol_extent_t *ze;
322
323	while ((ze = list_head(&zv->zv_extents)) != NULL) {
324		list_remove(&zv->zv_extents, ze);
325		kmem_free(ze, sizeof (zvol_extent_t));
326	}
327}
328
329static int
330zvol_get_lbas(zvol_state_t *zv)
331{
332	objset_t *os = zv->zv_objset;
333	struct maparg	ma;
334	int		err;
335
336	ma.ma_zv = zv;
337	ma.ma_blks = 0;
338	zvol_free_extents(zv);
339
340	/* commit any in-flight changes before traversing the dataset */
341	txg_wait_synced(dmu_objset_pool(os), 0);
342	err = traverse_dataset(dmu_objset_ds(os), 0,
343	    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, zvol_map_block, &ma);
344	if (err || ma.ma_blks != (zv->zv_volsize / zv->zv_volblocksize)) {
345		zvol_free_extents(zv);
346		return (err ? err : EIO);
347	}
348
349	return (0);
350}
351
352/* ARGSUSED */
353void
354zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
355{
356	zfs_creat_t *zct = arg;
357	nvlist_t *nvprops = zct->zct_props;
358	int error;
359	uint64_t volblocksize, volsize;
360
361	VERIFY(nvlist_lookup_uint64(nvprops,
362	    zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
363	if (nvlist_lookup_uint64(nvprops,
364	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
365		volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
366
367	/*
368	 * These properties must be removed from the list so the generic
369	 * property setting step won't apply to them.
370	 */
371	VERIFY(nvlist_remove_all(nvprops,
372	    zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
373	(void) nvlist_remove_all(nvprops,
374	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));
375
376	error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
377	    DMU_OT_NONE, 0, tx);
378	ASSERT(error == 0);
379
380	error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
381	    DMU_OT_NONE, 0, tx);
382	ASSERT(error == 0);
383
384	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
385	ASSERT(error == 0);
386}
387
388/*
389 * Replay a TX_WRITE ZIL transaction that didn't get committed
390 * after a system failure
391 */
392static int
393zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
394{
395	objset_t *os = zv->zv_objset;
396	char *data = (char *)(lr + 1);	/* data follows lr_write_t */
397	uint64_t offset, length;
398	dmu_tx_t *tx;
399	int error;
400
401	if (byteswap)
402		byteswap_uint64_array(lr, sizeof (*lr));
403
404	offset = lr->lr_offset;
405	length = lr->lr_length;
406
407	/* If it's a dmu_sync() block, write the whole block */
408	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
409		uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
410		if (length < blocksize) {
411			offset -= offset % blocksize;
412			length = blocksize;
413		}
414	}
415
416	tx = dmu_tx_create(os);
417	dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length);
418	error = dmu_tx_assign(tx, TXG_WAIT);
419	if (error) {
420		dmu_tx_abort(tx);
421	} else {
422		dmu_write(os, ZVOL_OBJ, offset, length, data, tx);
423		dmu_tx_commit(tx);
424	}
425
426	return (error);
427}
428
429/* ARGSUSED */
430static int
431zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap)
432{
433	return (SET_ERROR(ENOTSUP));
434}
435
436/*
437 * Callback vectors for replaying records.
438 * Only TX_WRITE is needed for zvol.
439 */
440zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
441	zvol_replay_err,	/* 0 no such transaction type */
442	zvol_replay_err,	/* TX_CREATE */
443	zvol_replay_err,	/* TX_MKDIR */
444	zvol_replay_err,	/* TX_MKXATTR */
445	zvol_replay_err,	/* TX_SYMLINK */
446	zvol_replay_err,	/* TX_REMOVE */
447	zvol_replay_err,	/* TX_RMDIR */
448	zvol_replay_err,	/* TX_LINK */
449	zvol_replay_err,	/* TX_RENAME */
450	zvol_replay_write,	/* TX_WRITE */
451	zvol_replay_err,	/* TX_TRUNCATE */
452	zvol_replay_err,	/* TX_SETATTR */
453	zvol_replay_err,	/* TX_ACL */
454	zvol_replay_err,	/* TX_CREATE_ACL */
455	zvol_replay_err,	/* TX_CREATE_ATTR */
456	zvol_replay_err,	/* TX_CREATE_ACL_ATTR */
457	zvol_replay_err,	/* TX_MKDIR_ACL */
458	zvol_replay_err,	/* TX_MKDIR_ATTR */
459	zvol_replay_err,	/* TX_MKDIR_ACL_ATTR */
460	zvol_replay_err,	/* TX_WRITE2 */
461};
462
463#ifdef sun
464int
465zvol_name2minor(const char *name, minor_t *minor)
466{
467	zvol_state_t *zv;
468
469	mutex_enter(&spa_namespace_lock);
470	zv = zvol_minor_lookup(name);
471	if (minor && zv)
472		*minor = zv->zv_minor;
473	mutex_exit(&spa_namespace_lock);
474	return (zv ? 0 : -1);
475}
476#endif	/* sun */
477
478/*
479 * Create a minor node (plus a whole lot more) for the specified volume.
480 */
481int
482zvol_create_minor(const char *name)
483{
484	zfs_soft_state_t *zs;
485	zvol_state_t *zv;
486	objset_t *os;
487	dmu_object_info_t doi;
488	uint64_t volsize;
489	int error;
490
491	ZFS_LOG(1, "Creating ZVOL %s...", name);
492
493	mutex_enter(&spa_namespace_lock);
494
495	if (zvol_minor_lookup(name) != NULL) {
496		mutex_exit(&spa_namespace_lock);
497		return (SET_ERROR(EEXIST));
498	}
499
500	/* lie and say we're read-only */
501	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os);
502
503	if (error) {
504		mutex_exit(&spa_namespace_lock);
505		return (error);
506	}
507
508#ifdef sun
509	if ((minor = zfsdev_minor_alloc()) == 0) {
510		dmu_objset_disown(os, FTAG);
511		mutex_exit(&spa_namespace_lock);
512		return (SET_ERROR(ENXIO));
513	}
514
515	if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) {
516		dmu_objset_disown(os, FTAG);
517		mutex_exit(&spa_namespace_lock);
518		return (SET_ERROR(EAGAIN));
519	}
520	(void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME,
521	    (char *)name);
522
523	(void) snprintf(chrbuf, sizeof (chrbuf), "%u,raw", minor);
524
525	if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR,
526	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
527		ddi_soft_state_free(zfsdev_state, minor);
528		dmu_objset_disown(os, FTAG);
529		mutex_exit(&spa_namespace_lock);
530		return (SET_ERROR(EAGAIN));
531	}
532
533	(void) snprintf(blkbuf, sizeof (blkbuf), "%u", minor);
534
535	if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK,
536	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
537		ddi_remove_minor_node(zfs_dip, chrbuf);
538		ddi_soft_state_free(zfsdev_state, minor);
539		dmu_objset_disown(os, FTAG);
540		mutex_exit(&spa_namespace_lock);
541		return (SET_ERROR(EAGAIN));
542	}
543
544	zs = ddi_get_soft_state(zfsdev_state, minor);
545	zs->zss_type = ZSST_ZVOL;
546	zv = zs->zss_data = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
547#else	/* !sun */
548
549	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
550	if (error) {
551		ASSERT(error == 0);
552		dmu_objset_disown(os, FTAG);
553		mutex_exit(&spa_namespace_lock);
554		return (error);
555	}
556
557	DROP_GIANT();
558	g_topology_lock();
559	zv = zvol_geom_create(name);
560	zv->zv_volsize = volsize;
561	zv->zv_provider->mediasize = zv->zv_volsize;
562
563#endif	/* !sun */
564
565	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
566	zv->zv_min_bs = DEV_BSHIFT;
567	zv->zv_objset = os;
568	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
569		zv->zv_flags |= ZVOL_RDONLY;
570	mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
571	avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
572	    sizeof (rl_t), offsetof(rl_t, r_node));
573	list_create(&zv->zv_extents, sizeof (zvol_extent_t),
574	    offsetof(zvol_extent_t, ze_node));
575	/* get and cache the blocksize */
576	error = dmu_object_info(os, ZVOL_OBJ, &doi);
577	ASSERT(error == 0);
578	zv->zv_volblocksize = doi.doi_data_block_size;
579
580	if (spa_writeable(dmu_objset_spa(os))) {
581		if (zil_replay_disable)
582			zil_destroy(dmu_objset_zil(os), B_FALSE);
583		else
584			zil_replay(os, zv, zvol_replay_vector);
585	}
586	dmu_objset_disown(os, FTAG);
587	zv->zv_objset = NULL;
588
589	zvol_minors++;
590
591	mutex_exit(&spa_namespace_lock);
592
593	zvol_geom_run(zv);
594
595	g_topology_unlock();
596	PICKUP_GIANT();
597
598	ZFS_LOG(1, "ZVOL %s created.", name);
599
600	return (0);
601}
602
603/*
604 * Remove minor node for the specified volume.
605 */
606static int
607zvol_remove_zv(zvol_state_t *zv)
608{
609#ifdef sun
610	minor_t minor = zv->zv_minor;
611#endif
612
613	ASSERT(MUTEX_HELD(&spa_namespace_lock));
614	if (zv->zv_total_opens != 0)
615		return (SET_ERROR(EBUSY));
616
617	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
618
619#ifdef sun
620	(void) snprintf(nmbuf, sizeof (nmbuf), "%u,raw", minor);
621	ddi_remove_minor_node(zfs_dip, nmbuf);
622#endif	/* sun */
623
624	avl_destroy(&zv->zv_znode.z_range_avl);
625	mutex_destroy(&zv->zv_znode.z_range_lock);
626
627	zvol_geom_destroy(zv);
628
629	zvol_minors--;
630	return (0);
631}
632
633int
634zvol_remove_minor(const char *name)
635{
636	zvol_state_t *zv;
637	int rc;
638
639	mutex_enter(&spa_namespace_lock);
640	if ((zv = zvol_minor_lookup(name)) == NULL) {
641		mutex_exit(&spa_namespace_lock);
642		return (SET_ERROR(ENXIO));
643	}
644	g_topology_lock();
645	rc = zvol_remove_zv(zv);
646	g_topology_unlock();
647	mutex_exit(&spa_namespace_lock);
648	return (rc);
649}
650
651int
652zvol_first_open(zvol_state_t *zv)
653{
654	objset_t *os;
655	uint64_t volsize;
656	int error;
657	uint64_t readonly;
658
659	/* lie and say we're read-only */
660	error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, B_TRUE,
661	    zvol_tag, &os);
662	if (error)
663		return (error);
664
665	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
666	if (error) {
667		ASSERT(error == 0);
668		dmu_objset_disown(os, zvol_tag);
669		return (error);
670	}
671	zv->zv_objset = os;
672	error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf);
673	if (error) {
674		dmu_objset_disown(os, zvol_tag);
675		return (error);
676	}
677	zv->zv_volsize = volsize;
678	zv->zv_zilog = zil_open(os, zvol_get_data);
679	zvol_size_changed(zv);
680
681	VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &readonly,
682	    NULL) == 0);
683	if (readonly || dmu_objset_is_snapshot(os) ||
684	    !spa_writeable(dmu_objset_spa(os)))
685		zv->zv_flags |= ZVOL_RDONLY;
686	else
687		zv->zv_flags &= ~ZVOL_RDONLY;
688	return (error);
689}
690
691void
692zvol_last_close(zvol_state_t *zv)
693{
694	zil_close(zv->zv_zilog);
695	zv->zv_zilog = NULL;
696
697	dmu_buf_rele(zv->zv_dbuf, zvol_tag);
698	zv->zv_dbuf = NULL;
699
700	/*
701	 * Evict cached data
702	 */
703	if (dsl_dataset_is_dirty(dmu_objset_ds(zv->zv_objset)) &&
704	    !(zv->zv_flags & ZVOL_RDONLY))
705		txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
706	dmu_objset_evict_dbufs(zv->zv_objset);
707
708	dmu_objset_disown(zv->zv_objset, zvol_tag);
709	zv->zv_objset = NULL;
710}
711
712#ifdef sun
713int
714zvol_prealloc(zvol_state_t *zv)
715{
716	objset_t *os = zv->zv_objset;
717	dmu_tx_t *tx;
718	uint64_t refd, avail, usedobjs, availobjs;
719	uint64_t resid = zv->zv_volsize;
720	uint64_t off = 0;
721
722	/* Check the space usage before attempting to allocate the space */
723	dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs);
724	if (avail < zv->zv_volsize)
725		return (SET_ERROR(ENOSPC));
726
727	/* Free old extents if they exist */
728	zvol_free_extents(zv);
729
730	while (resid != 0) {
731		int error;
732		uint64_t bytes = MIN(resid, SPA_MAXBLOCKSIZE);
733
734		tx = dmu_tx_create(os);
735		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
736		error = dmu_tx_assign(tx, TXG_WAIT);
737		if (error) {
738			dmu_tx_abort(tx);
739			(void) dmu_free_long_range(os, ZVOL_OBJ, 0, off);
740			return (error);
741		}
742		dmu_prealloc(os, ZVOL_OBJ, off, bytes, tx);
743		dmu_tx_commit(tx);
744		off += bytes;
745		resid -= bytes;
746	}
747	txg_wait_synced(dmu_objset_pool(os), 0);
748
749	return (0);
750}
751#endif	/* sun */
752
753static int
754zvol_update_volsize(objset_t *os, uint64_t volsize)
755{
756	dmu_tx_t *tx;
757	int error;
758
759	ASSERT(MUTEX_HELD(&spa_namespace_lock));
760
761	tx = dmu_tx_create(os);
762	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
763	error = dmu_tx_assign(tx, TXG_WAIT);
764	if (error) {
765		dmu_tx_abort(tx);
766		return (error);
767	}
768
769	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1,
770	    &volsize, tx);
771	dmu_tx_commit(tx);
772
773	if (error == 0)
774		error = dmu_free_long_range(os,
775		    ZVOL_OBJ, volsize, DMU_OBJECT_END);
776	return (error);
777}
778
779void
780zvol_remove_minors(const char *name)
781{
782	struct g_geom *gp, *gptmp;
783	struct g_provider *pp;
784	zvol_state_t *zv;
785	size_t namelen;
786
787	namelen = strlen(name);
788
789	DROP_GIANT();
790	mutex_enter(&spa_namespace_lock);
791	g_topology_lock();
792
793	LIST_FOREACH_SAFE(gp, &zfs_zvol_class.geom, geom, gptmp) {
794		pp = LIST_FIRST(&gp->provider);
795		if (pp == NULL)
796			continue;
797		zv = pp->private;
798		if (zv == NULL)
799			continue;
800		if (strcmp(zv->zv_name, name) == 0 ||
801		    (strncmp(zv->zv_name, name, namelen) == 0 &&
802		     zv->zv_name[namelen] == '/')) {
803			(void) zvol_remove_zv(zv);
804		}
805	}
806
807	g_topology_unlock();
808	mutex_exit(&spa_namespace_lock);
809	PICKUP_GIANT();
810}
811
812int
813zvol_set_volsize(const char *name, major_t maj, uint64_t volsize)
814{
815	zvol_state_t *zv = NULL;
816	objset_t *os;
817	int error;
818	dmu_object_info_t doi;
819	uint64_t old_volsize = 0ULL;
820	uint64_t readonly;
821
822	mutex_enter(&spa_namespace_lock);
823	zv = zvol_minor_lookup(name);
824	if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
825		mutex_exit(&spa_namespace_lock);
826		return (error);
827	}
828
829	if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 ||
830	    (error = zvol_check_volsize(volsize,
831	    doi.doi_data_block_size)) != 0)
832		goto out;
833
834	VERIFY(dsl_prop_get_integer(name, "readonly", &readonly,
835	    NULL) == 0);
836	if (readonly) {
837		error = EROFS;
838		goto out;
839	}
840
841	error = zvol_update_volsize(os, volsize);
842	/*
843	 * Reinitialize the dump area to the new size. If we
844	 * fail to resize the dump area, restore it to
845	 * its original size.
846	 */
847	if (zv && error == 0) {
848#ifdef ZVOL_DUMP
849		if (zv->zv_flags & ZVOL_DUMPIFIED) {
850			old_volsize = zv->zv_volsize;
851			zv->zv_volsize = volsize;
852			if ((error = zvol_dumpify(zv)) != 0 ||
853			    (error = dumpvp_resize()) != 0) {
854				(void) zvol_update_volsize(os, old_volsize);
855				zv->zv_volsize = old_volsize;
856				error = zvol_dumpify(zv);
857			}
858		}
859#endif	/* ZVOL_DUMP */
860		if (error == 0) {
861			zv->zv_volsize = volsize;
862			zvol_size_changed(zv);
863		}
864	}
865
866#ifdef sun
867	/*
868	 * Generate a LUN expansion event.
869	 */
870	if (zv && error == 0) {
871		sysevent_id_t eid;
872		nvlist_t *attr;
873		char *physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
874
875		(void) snprintf(physpath, MAXPATHLEN, "%s%u", ZVOL_PSEUDO_DEV,
876		    zv->zv_minor);
877
878		VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
879		VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
880
881		(void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
882		    ESC_DEV_DLE, attr, &eid, DDI_SLEEP);
883
884		nvlist_free(attr);
885		kmem_free(physpath, MAXPATHLEN);
886	}
887#endif	/* sun */
888
889out:
890	dmu_objset_rele(os, FTAG);
891
892	mutex_exit(&spa_namespace_lock);
893
894	return (error);
895}
896
897/*ARGSUSED*/
898static int
899zvol_open(struct g_provider *pp, int flag, int count)
900{
901	zvol_state_t *zv;
902	int err = 0;
903	boolean_t locked = B_FALSE;
904
905	/*
906	 * Protect against recursive entry of spa_namespace_lock
907	 * when spa_open() is used for a pool backed by (local) ZVOLs.
908	 * This is needed since we replaced the upstream zfsdev_state_lock
909	 * with spa_namespace_lock in the ZVOL code.
910	 * We are using the same trick as spa_open().
911	 * Note that calls in zvol_first_open() which need to resolve
912	 * the pool name to a spa object will enter spa_open()
913	 * recursively, but that function already has all the
914	 * necessary protection.
915	 */
916	if (!MUTEX_HELD(&spa_namespace_lock)) {
917		mutex_enter(&spa_namespace_lock);
918		locked = B_TRUE;
919	}
920
921	zv = pp->private;
922	if (zv == NULL) {
923		if (locked)
924			mutex_exit(&spa_namespace_lock);
925		return (SET_ERROR(ENXIO));
926	}
927
928	if (zv->zv_total_opens == 0) {
929		err = zvol_first_open(zv);
930		if (err) {
931			if (locked)
932				mutex_exit(&spa_namespace_lock);
933			return (err);
934		}
935		pp->mediasize = zv->zv_volsize;
936		pp->stripeoffset = 0;
937		pp->stripesize = zv->zv_volblocksize;
938	}
939	if ((flag & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
940		err = SET_ERROR(EROFS);
941		goto out;
942	}
943	if (zv->zv_flags & ZVOL_EXCL) {
944		err = SET_ERROR(EBUSY);
945		goto out;
946	}
947#ifdef FEXCL
948	if (flag & FEXCL) {
949		if (zv->zv_total_opens != 0) {
950			err = SET_ERROR(EBUSY);
951			goto out;
952		}
953		zv->zv_flags |= ZVOL_EXCL;
954	}
955#endif
956
957	zv->zv_total_opens += count;
958	if (locked)
959		mutex_exit(&spa_namespace_lock);
960
961	return (err);
962out:
963	if (zv->zv_total_opens == 0)
964		zvol_last_close(zv);
965	if (locked)
966		mutex_exit(&spa_namespace_lock);
967	return (err);
968}
969
970/*ARGSUSED*/
971static int
972zvol_close(struct g_provider *pp, int flag, int count)
973{
974	zvol_state_t *zv;
975	int error = 0;
976	boolean_t locked = B_FALSE;
977
978	/* See comment in zvol_open(). */
979	if (!MUTEX_HELD(&spa_namespace_lock)) {
980		mutex_enter(&spa_namespace_lock);
981		locked = B_TRUE;
982	}
983
984	zv = pp->private;
985	if (zv == NULL) {
986		if (locked)
987			mutex_exit(&spa_namespace_lock);
988		return (SET_ERROR(ENXIO));
989	}
990
991	if (zv->zv_flags & ZVOL_EXCL) {
992		ASSERT(zv->zv_total_opens == 1);
993		zv->zv_flags &= ~ZVOL_EXCL;
994	}
995
996	/*
997	 * If the open count is zero, this is a spurious close.
998	 * That indicates a bug in the kernel / DDI framework.
999	 */
1000	ASSERT(zv->zv_total_opens != 0);
1001
1002	/*
1003	 * You may get multiple opens, but only one close.
1004	 */
1005	zv->zv_total_opens -= count;
1006
1007	if (zv->zv_total_opens == 0)
1008		zvol_last_close(zv);
1009
1010	if (locked)
1011		mutex_exit(&spa_namespace_lock);
1012	return (error);
1013}
1014
1015static void
1016zvol_get_done(zgd_t *zgd, int error)
1017{
1018	if (zgd->zgd_db)
1019		dmu_buf_rele(zgd->zgd_db, zgd);
1020
1021	zfs_range_unlock(zgd->zgd_rl);
1022
1023	if (error == 0 && zgd->zgd_bp)
1024		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
1025
1026	kmem_free(zgd, sizeof (zgd_t));
1027}
1028
1029/*
1030 * Get data to generate a TX_WRITE intent log record.
1031 */
1032static int
1033zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
1034{
1035	zvol_state_t *zv = arg;
1036	objset_t *os = zv->zv_objset;
1037	uint64_t object = ZVOL_OBJ;
1038	uint64_t offset = lr->lr_offset;
1039	uint64_t size = lr->lr_length;	/* length of user data */
1040	blkptr_t *bp = &lr->lr_blkptr;
1041	dmu_buf_t *db;
1042	zgd_t *zgd;
1043	int error;
1044
1045	ASSERT(zio != NULL);
1046	ASSERT(size != 0);
1047
1048	zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
1049	zgd->zgd_zilog = zv->zv_zilog;
1050	zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);
1051
1052	/*
1053	 * Write records come in two flavors: immediate and indirect.
1054	 * For small writes it's cheaper to store the data with the
1055	 * log record (immediate); for large writes it's cheaper to
1056	 * sync the data and get a pointer to it (indirect) so that
1057	 * we don't have to write the data twice.
1058	 */
1059	if (buf != NULL) {	/* immediate write */
1060		error = dmu_read(os, object, offset, size, buf,
1061		    DMU_READ_NO_PREFETCH);
1062	} else {
1063		size = zv->zv_volblocksize;
1064		offset = P2ALIGN(offset, size);
1065		error = dmu_buf_hold(os, object, offset, zgd, &db,
1066		    DMU_READ_NO_PREFETCH);
1067		if (error == 0) {
1068			blkptr_t *obp = dmu_buf_get_blkptr(db);
1069			if (obp) {
1070				ASSERT(BP_IS_HOLE(bp));
1071				*bp = *obp;
1072			}
1073
1074			zgd->zgd_db = db;
1075			zgd->zgd_bp = bp;
1076
1077			ASSERT(db->db_offset == offset);
1078			ASSERT(db->db_size == size);
1079
1080			error = dmu_sync(zio, lr->lr_common.lrc_txg,
1081			    zvol_get_done, zgd);
1082
1083			if (error == 0)
1084				return (0);
1085		}
1086	}
1087
1088	zvol_get_done(zgd, error);
1089
1090	return (error);
1091}
1092
1093/*
1094 * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
1095 *
1096 * We store data in the log buffers if it's small enough.
1097 * Otherwise we will later flush the data out via dmu_sync().
1098 */
1099ssize_t zvol_immediate_write_sz = 32768;
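/*
 * Summary of the record types chosen in the loop below:
 *
 *	WR_INDIRECT	whole-block writes with no slog and a block
 *			size above the immediate-write threshold; the
 *			data is synced via dmu_sync() and the record
 *			carries only a block pointer.
 *	WR_COPIED	synchronous writes; the data is copied into
 *			the log record itself.
 *	WR_NEED_COPY	asynchronous writes; the data is copied later,
 *			if and when the itx is committed.
 */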
1100
1101static void
1102zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
1103    boolean_t sync)
1104{
1105	uint32_t blocksize = zv->zv_volblocksize;
1106	zilog_t *zilog = zv->zv_zilog;
1107	boolean_t slogging;
1108	ssize_t immediate_write_sz;
1109
1110	if (zil_replaying(zilog, tx))
1111		return;
1112
1113	immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
1114	    ? 0 : zvol_immediate_write_sz;
1115
1116	slogging = spa_has_slogs(zilog->zl_spa) &&
1117	    (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
1118
1119	while (resid) {
1120		itx_t *itx;
1121		lr_write_t *lr;
1122		ssize_t len;
1123		itx_wr_state_t write_state;
1124
1125		/*
1126		 * Unlike zfs_log_write(), we can be called with
1127		 * up to DMU_MAX_ACCESS/2 (5MB) writes.
1128		 */
1129		if (blocksize > immediate_write_sz && !slogging &&
1130		    resid >= blocksize && off % blocksize == 0) {
1131			write_state = WR_INDIRECT; /* uses dmu_sync */
1132			len = blocksize;
1133		} else if (sync) {
1134			write_state = WR_COPIED;
1135			len = MIN(ZIL_MAX_LOG_DATA, resid);
1136		} else {
1137			write_state = WR_NEED_COPY;
1138			len = MIN(ZIL_MAX_LOG_DATA, resid);
1139		}
1140
1141		itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
1142		    (write_state == WR_COPIED ? len : 0));
1143		lr = (lr_write_t *)&itx->itx_lr;
1144		if (write_state == WR_COPIED && dmu_read(zv->zv_objset,
1145		    ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
1146			zil_itx_destroy(itx);
1147			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
1148			lr = (lr_write_t *)&itx->itx_lr;
1149			write_state = WR_NEED_COPY;
1150		}
1151
1152		itx->itx_wr_state = write_state;
1153		if (write_state == WR_NEED_COPY)
1154			itx->itx_sod += len;
1155		lr->lr_foid = ZVOL_OBJ;
1156		lr->lr_offset = off;
1157		lr->lr_length = len;
1158		lr->lr_blkoff = 0;
1159		BP_ZERO(&lr->lr_blkptr);
1160
1161		itx->itx_private = zv;
1162		itx->itx_sync = sync;
1163
1164		zil_itx_assign(zilog, itx, tx);
1165
1166		off += len;
1167		resid -= len;
1168	}
1169}
1170
1171#ifdef sun
1172static int
1173zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t origoffset,
1174    uint64_t size, boolean_t doread, boolean_t isdump)
1175{
1176	vdev_disk_t *dvd;
1177	int c;
1178	int numerrors = 0;
1179
1180	if (vd->vdev_ops == &vdev_mirror_ops ||
1181	    vd->vdev_ops == &vdev_replacing_ops ||
1182	    vd->vdev_ops == &vdev_spare_ops) {
1183		for (c = 0; c < vd->vdev_children; c++) {
1184			int err = zvol_dumpio_vdev(vd->vdev_child[c],
1185			    addr, offset, origoffset, size, doread, isdump);
1186			if (err != 0) {
1187				numerrors++;
1188			} else if (doread) {
1189				break;
1190			}
1191		}
1192	}
1193
1194	if (!vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_raidz_ops)
1195		return (numerrors < vd->vdev_children ? 0 : EIO);
1196
1197	if (doread && !vdev_readable(vd))
1198		return (SET_ERROR(EIO));
1199	else if (!doread && !vdev_writeable(vd))
1200		return (SET_ERROR(EIO));
1201
1202	if (vd->vdev_ops == &vdev_raidz_ops) {
1203		return (vdev_raidz_physio(vd,
1204		    addr, size, offset, origoffset, doread, isdump));
1205	}
1206
1207	offset += VDEV_LABEL_START_SIZE;
1208
1209	if (ddi_in_panic() || isdump) {
1210		ASSERT(!doread);
1211		if (doread)
1212			return (SET_ERROR(EIO));
1213		dvd = vd->vdev_tsd;
1214		ASSERT3P(dvd, !=, NULL);
1215		return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset),
1216		    lbtodb(size)));
1217	} else {
1218		dvd = vd->vdev_tsd;
1219		ASSERT3P(dvd, !=, NULL);
1220		return (vdev_disk_ldi_physio(dvd->vd_lh, addr, size,
1221		    offset, doread ? B_READ : B_WRITE));
1222	}
1223}
1224
1225static int
1226zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size,
1227    boolean_t doread, boolean_t isdump)
1228{
1229	vdev_t *vd;
1230	int error;
1231	zvol_extent_t *ze;
1232	spa_t *spa = dmu_objset_spa(zv->zv_objset);
1233
1234	/* Must be sector-aligned, and must not straddle a block boundary. */
1235	if (P2PHASE(offset, DEV_BSIZE) || P2PHASE(size, DEV_BSIZE) ||
1236	    P2BOUNDARY(offset, size, zv->zv_volblocksize)) {
1237		return (SET_ERROR(EINVAL));
1238	}
1239	ASSERT(size <= zv->zv_volblocksize);
1240
1241	/* Locate the extent this belongs to */
1242	ze = list_head(&zv->zv_extents);
1243	while (offset >= ze->ze_nblks * zv->zv_volblocksize) {
1244		offset -= ze->ze_nblks * zv->zv_volblocksize;
1245		ze = list_next(&zv->zv_extents, ze);
1246	}
1247
1248	if (ze == NULL)
1249		return (SET_ERROR(EINVAL));
1250
1251	if (!ddi_in_panic())
1252		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
1253
1254	vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva));
1255	offset += DVA_GET_OFFSET(&ze->ze_dva);
1256	error = zvol_dumpio_vdev(vd, addr, offset, DVA_GET_OFFSET(&ze->ze_dva),
1257	    size, doread, isdump);
1258
1259	if (!ddi_in_panic())
1260		spa_config_exit(spa, SCL_STATE, FTAG);
1261
1262	return (error);
1263}
1264#endif	/* sun */
1265
1266int
1267zvol_strategy(struct bio *bp)
1268{
1269	zvol_state_t *zv = bp->bio_to->private;
1270	uint64_t off, volsize;
1271	size_t resid;
1272	char *addr;
1273	objset_t *os;
1274	rl_t *rl;
1275	int error = 0;
1276	boolean_t doread = (bp->bio_cmd == BIO_READ);
1277	boolean_t is_dumpified;
1278	boolean_t sync;
1279
1280	if (zv == NULL) {
1281		g_io_deliver(bp, ENXIO);
1282		return (0);
1283	}
1284
1285	if (bp->bio_cmd != BIO_READ && (zv->zv_flags & ZVOL_RDONLY)) {
1286		g_io_deliver(bp, EROFS);
1287		return (0);
1288	}
1289
1290	off = bp->bio_offset;
1291	volsize = zv->zv_volsize;
1292
1293	os = zv->zv_objset;
1294	ASSERT(os != NULL);
1295
1296	addr = bp->bio_data;
1297	resid = bp->bio_length;
1298
1299	if (resid > 0 && (off < 0 || off >= volsize)) {
1300		g_io_deliver(bp, EIO);
1301		return (0);
1302	}
1303
1304#ifdef illumos
1305	is_dumpified = zv->zv_flags & ZVOL_DUMPIFIED;
1306#else
1307	is_dumpified = B_FALSE;
1308#endif
1309	sync = !doread && !is_dumpified &&
1310	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
1311
1312	/*
1313	 * There must be no buffer changes when doing a dmu_sync() because
1314	 * we can't change the data whilst calculating the checksum.
1315	 */
1316	rl = zfs_range_lock(&zv->zv_znode, off, resid,
1317	    doread ? RL_READER : RL_WRITER);
1318
1319	while (resid != 0 && off < volsize) {
1320		size_t size = MIN(resid, zvol_maxphys);
1321#ifdef illumos
1322		if (is_dumpified) {
1323			size = MIN(size, P2END(off, zv->zv_volblocksize) - off);
1324			error = zvol_dumpio(zv, addr, off, size,
1325			    doread, B_FALSE);
1326		} else if (doread) {
1327#else
1328		if (doread) {
1329#endif
1330			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
1331			    DMU_READ_PREFETCH);
1332		} else {
1333			dmu_tx_t *tx = dmu_tx_create(os);
1334			dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
1335			error = dmu_tx_assign(tx, TXG_WAIT);
1336			if (error) {
1337				dmu_tx_abort(tx);
1338			} else {
1339				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
1340				zvol_log_write(zv, tx, off, size, sync);
1341				dmu_tx_commit(tx);
1342			}
1343		}
1344		if (error) {
1345			/* convert checksum errors into IO errors */
1346			if (error == ECKSUM)
1347				error = SET_ERROR(EIO);
1348			break;
1349		}
1350		off += size;
1351		addr += size;
1352		resid -= size;
1353	}
1354	zfs_range_unlock(rl);
1355
1356	bp->bio_completed = bp->bio_length - resid;
1357	if (bp->bio_completed < bp->bio_length)
1358		bp->bio_error = (off > volsize ? EINVAL : error);
1359
1360	if (sync)
1361		zil_commit(zv->zv_zilog, ZVOL_OBJ);
1362	g_io_deliver(bp, 0);
1363
1364	return (0);
1365}
1366
1367#ifdef sun
1368/*
1369 * Set the buffer count to the zvol maximum transfer.
1370 * Using our own routine instead of the default minphys()
1371 * means that for larger writes we write bigger buffers on X86
1372 * (128K instead of 56K) and flush the disk write cache less often
1373 * (every zvol_maxphys - currently 5MB) instead of minphys (currently
1374 * 56K on X86 and 128K on sparc).
1375 */
1376void
1377zvol_minphys(struct buf *bp)
1378{
1379	if (bp->b_bcount > zvol_maxphys)
1380		bp->b_bcount = zvol_maxphys;
1381}
1382
1383int
1384zvol_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblocks)
1385{
1386	minor_t minor = getminor(dev);
1387	zvol_state_t *zv;
1388	int error = 0;
1389	uint64_t size;
1390	uint64_t boff;
1391	uint64_t resid;
1392
1393	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1394	if (zv == NULL)
1395		return (SET_ERROR(ENXIO));
1396
1397	if ((zv->zv_flags & ZVOL_DUMPIFIED) == 0)
1398		return (SET_ERROR(EINVAL));
1399
1400	boff = ldbtob(blkno);
1401	resid = ldbtob(nblocks);
1402
1403	VERIFY3U(boff + resid, <=, zv->zv_volsize);
1404
1405	while (resid) {
1406		size = MIN(resid, P2END(boff, zv->zv_volblocksize) - boff);
1407		error = zvol_dumpio(zv, addr, boff, size, B_FALSE, B_TRUE);
1408		if (error)
1409			break;
1410		boff += size;
1411		addr += size;
1412		resid -= size;
1413	}
1414
1415	return (error);
1416}
1417
1418/*ARGSUSED*/
1419int
1420zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
1421{
1422	minor_t minor = getminor(dev);
1423	zvol_state_t *zv;
1424	uint64_t volsize;
1425	rl_t *rl;
1426	int error = 0;
1427
1428	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1429	if (zv == NULL)
1430		return (SET_ERROR(ENXIO));
1431
1432	volsize = zv->zv_volsize;
1433	if (uio->uio_resid > 0 &&
1434	    (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
1435		return (SET_ERROR(EIO));
1436
1437	if (zv->zv_flags & ZVOL_DUMPIFIED) {
1438		error = physio(zvol_strategy, NULL, dev, B_READ,
1439		    zvol_minphys, uio);
1440		return (error);
1441	}
1442
1443	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
1444	    RL_READER);
1445	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
1446		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
1447
1448		/* don't read past the end */
1449		if (bytes > volsize - uio->uio_loffset)
1450			bytes = volsize - uio->uio_loffset;
1451
1452		error = dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes);
1453		if (error) {
1454			/* convert checksum errors into IO errors */
1455			if (error == ECKSUM)
1456				error = SET_ERROR(EIO);
1457			break;
1458		}
1459	}
1460	zfs_range_unlock(rl);
1461	return (error);
1462}
1463
1464/*ARGSUSED*/
1465int
1466zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
1467{
1468	minor_t minor = getminor(dev);
1469	zvol_state_t *zv;
1470	uint64_t volsize;
1471	rl_t *rl;
1472	int error = 0;
1473	boolean_t sync;
1474
1475	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1476	if (zv == NULL)
1477		return (SET_ERROR(ENXIO));
1478
1479	volsize = zv->zv_volsize;
1480	if (uio->uio_resid > 0 &&
1481	    (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
1482		return (SET_ERROR(EIO));
1483
1484	if (zv->zv_flags & ZVOL_DUMPIFIED) {
1485		error = physio(zvol_strategy, NULL, dev, B_WRITE,
1486		    zvol_minphys, uio);
1487		return (error);
1488	}
1489
1490	sync = !(zv->zv_flags & ZVOL_WCE) ||
1491	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
1492
1493	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
1494	    RL_WRITER);
1495	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
1496		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
1497		uint64_t off = uio->uio_loffset;
1498		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
1499
1500		if (bytes > volsize - off)	/* don't write past the end */
1501			bytes = volsize - off;
1502
1503		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
1504		error = dmu_tx_assign(tx, TXG_WAIT);
1505		if (error) {
1506			dmu_tx_abort(tx);
1507			break;
1508		}
1509		error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx);
1510		if (error == 0)
1511			zvol_log_write(zv, tx, off, bytes, sync);
1512		dmu_tx_commit(tx);
1513
1514		if (error)
1515			break;
1516	}
1517	zfs_range_unlock(rl);
1518	if (sync)
1519		zil_commit(zv->zv_zilog, ZVOL_OBJ);
1520	return (error);
1521}
1522
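/*
 * Layout of the fabricated EFI label (illustrative): LBA 1 holds the
 * GPT header, LBA 2 the single reserved-partition entry, and the
 * partition itself spans LBA 34 through (volsize >> bs) - 1.
 */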
1523int
1524zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs)
1525{
1526	struct uuid uuid = EFI_RESERVED;
1527	efi_gpe_t gpe = { 0 };
1528	uint32_t crc;
1529	dk_efi_t efi;
1530	int length;
1531	char *ptr;
1532
1533	if (ddi_copyin(arg, &efi, sizeof (dk_efi_t), flag))
1534		return (SET_ERROR(EFAULT));
1535	ptr = (char *)(uintptr_t)efi.dki_data_64;
1536	length = efi.dki_length;
1537	/*
1538	 * Some clients may attempt to request a PMBR for the
1539	 * zvol.  Currently this interface will return EINVAL to
1540	 * such requests.  These requests could be supported by
1541	 * adding a check for lba == 0 and consing up an appropriate
1542	 * PMBR.
1543	 */
1544	if (efi.dki_lba < 1 || efi.dki_lba > 2 || length <= 0)
1545		return (SET_ERROR(EINVAL));
1546
1547	gpe.efi_gpe_StartingLBA = LE_64(34ULL);
1548	gpe.efi_gpe_EndingLBA = LE_64((vs >> bs) - 1);
1549	UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid);
1550
1551	if (efi.dki_lba == 1) {
1552		efi_gpt_t gpt = { 0 };
1553
1554		gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE);
1555		gpt.efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT);
1556		gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt));
1557		gpt.efi_gpt_MyLBA = LE_64(1ULL);
1558		gpt.efi_gpt_FirstUsableLBA = LE_64(34ULL);
1559		gpt.efi_gpt_LastUsableLBA = LE_64((vs >> bs) - 1);
1560		gpt.efi_gpt_PartitionEntryLBA = LE_64(2ULL);
1561		gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1);
1562		gpt.efi_gpt_SizeOfPartitionEntry =
1563		    LE_32(sizeof (efi_gpe_t));
1564		CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table);
1565		gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
1566		CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table);
1567		gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
1568		if (ddi_copyout(&gpt, ptr, MIN(sizeof (gpt), length),
1569		    flag))
1570			return (SET_ERROR(EFAULT));
1571		ptr += sizeof (gpt);
1572		length -= sizeof (gpt);
1573	}
1574	if (length > 0 && ddi_copyout(&gpe, ptr, MIN(sizeof (gpe),
1575	    length), flag))
1576		return (SET_ERROR(EFAULT));
1577	return (0);
1578}
1579
1580/*
1581 * BEGIN entry points to allow external callers access to the volume.
1582 */
1583/*
1584 * Return the volume parameters needed for access from an external caller.
1585 * These values are invariant as long as the volume is held open.
1586 */
1587int
1588zvol_get_volume_params(minor_t minor, uint64_t *blksize,
1589    uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl,
1590    void **rl_hdl, void **bonus_hdl)
1591{
1592	zvol_state_t *zv;
1593
1594	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
1595	if (zv == NULL)
1596		return (SET_ERROR(ENXIO));
1597	if (zv->zv_flags & ZVOL_DUMPIFIED)
1598		return (SET_ERROR(ENXIO));
1599
1600	ASSERT(blksize && max_xfer_len && minor_hdl &&
1601	    objset_hdl && zil_hdl && rl_hdl && bonus_hdl);
1602
1603	*blksize = zv->zv_volblocksize;
1604	*max_xfer_len = (uint64_t)zvol_maxphys;
1605	*minor_hdl = zv;
1606	*objset_hdl = zv->zv_objset;
1607	*zil_hdl = zv->zv_zilog;
1608	*rl_hdl = &zv->zv_znode;
1609	*bonus_hdl = zv->zv_dbuf;
1610	return (0);
1611}
1612
1613/*
1614 * Return the current volume size to an external caller.
1615 * The size can change while the volume is open.
1616 */
1617uint64_t
1618zvol_get_volume_size(void *minor_hdl)
1619{
1620	zvol_state_t *zv = minor_hdl;
1621
1622	return (zv->zv_volsize);
1623}
1624
1625/*
1626 * Return the current WCE setting to an external caller.
1627 * The WCE setting can change while the volume is open.
1628 */
1629int
1630zvol_get_volume_wce(void *minor_hdl)
1631{
1632	zvol_state_t *zv = minor_hdl;
1633
1634	return ((zv->zv_flags & ZVOL_WCE) ? 1 : 0);
1635}
1636
1637/*
1638 * Entry point for external callers to zvol_log_write
1639 */
1640void
1641zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off, ssize_t resid,
1642    boolean_t sync)
1643{
1644	zvol_state_t *zv = minor_hdl;
1645
1646	zvol_log_write(zv, tx, off, resid, sync);
1647}
1648/*
1649 * END entry points to allow external callers access to the volume.
1650 */
1651
1652/*
1653 * Dirtbag ioctls to support mkfs(1M) for UFS filesystems.  See dkio(7I).
1654 */
1655/*ARGSUSED*/
1656int
1657zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
1658{
1659	zvol_state_t *zv;
1660	struct dk_cinfo dki;
1661	struct dk_minfo dkm;
1662	struct dk_callback *dkc;
1663	int error = 0;
1664	rl_t *rl;
1665
1666	mutex_enter(&spa_namespace_lock);
1667
1668	zv = zfsdev_get_soft_state(getminor(dev), ZSST_ZVOL);
1669
1670	if (zv == NULL) {
1671		mutex_exit(&spa_namespace_lock);
1672		return (SET_ERROR(ENXIO));
1673	}
1674	ASSERT(zv->zv_total_opens > 0);
1675
1676	switch (cmd) {
1677
1678	case DKIOCINFO:
1679		bzero(&dki, sizeof (dki));
1680		(void) strcpy(dki.dki_cname, "zvol");
1681		(void) strcpy(dki.dki_dname, "zvol");
1682		dki.dki_ctype = DKC_UNKNOWN;
1683		dki.dki_unit = getminor(dev);
1684		dki.dki_maxtransfer = 1 << (SPA_MAXBLOCKSHIFT - zv->zv_min_bs);
1685		mutex_exit(&spa_namespace_lock);
1686		if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag))
1687			error = SET_ERROR(EFAULT);
1688		return (error);
1689
1690	case DKIOCGMEDIAINFO:
1691		bzero(&dkm, sizeof (dkm));
1692		dkm.dki_lbsize = 1U << zv->zv_min_bs;
1693		dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
1694		dkm.dki_media_type = DK_UNKNOWN;
1695		mutex_exit(&spa_namespace_lock);
1696		if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag))
1697			error = SET_ERROR(EFAULT);
1698		return (error);
1699
1700	case DKIOCGETEFI:
1701		{
1702			uint64_t vs = zv->zv_volsize;
1703			uint8_t bs = zv->zv_min_bs;
1704
1705			mutex_exit(&spa_namespace_lock);
1706			error = zvol_getefi((void *)arg, flag, vs, bs);
1707			return (error);
1708		}
1709
1710	case DKIOCFLUSHWRITECACHE:
1711		dkc = (struct dk_callback *)arg;
1712		mutex_exit(&spa_namespace_lock);
1713		zil_commit(zv->zv_zilog, ZVOL_OBJ);
1714		if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) {
1715			(*dkc->dkc_callback)(dkc->dkc_cookie, error);
1716			error = 0;
1717		}
1718		return (error);
1719
1720	case DKIOCGETWCE:
1721		{
1722			int wce = (zv->zv_flags & ZVOL_WCE) ? 1 : 0;
1723			if (ddi_copyout(&wce, (void *)arg, sizeof (int),
1724			    flag))
1725				error = SET_ERROR(EFAULT);
1726			break;
1727		}
1728	case DKIOCSETWCE:
1729		{
1730			int wce;
1731			if (ddi_copyin((void *)arg, &wce, sizeof (int),
1732			    flag)) {
1733				error = SET_ERROR(EFAULT);
1734				break;
1735			}
1736			if (wce) {
1737				zv->zv_flags |= ZVOL_WCE;
1738				mutex_exit(&spa_namespace_lock);
1739			} else {
1740				zv->zv_flags &= ~ZVOL_WCE;
1741				mutex_exit(&spa_namespace_lock);
1742				zil_commit(zv->zv_zilog, ZVOL_OBJ);
1743			}
1744			return (0);
1745		}
1746
1747	case DKIOCGGEOM:
1748	case DKIOCGVTOC:
1749		/*
1750		 * commands using these (like prtvtoc) expect ENOTSUP
1751		 * since we're emulating an EFI label
1752		 */
1753		error = SET_ERROR(ENOTSUP);
1754		break;
1755
1756	case DKIOCDUMPINIT:
1757		rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
1758		    RL_WRITER);
1759		error = zvol_dumpify(zv);
1760		zfs_range_unlock(rl);
1761		break;
1762
1763	case DKIOCDUMPFINI:
1764		if (!(zv->zv_flags & ZVOL_DUMPIFIED))
1765			break;
1766		rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
1767		    RL_WRITER);
1768		error = zvol_dump_fini(zv);
1769		zfs_range_unlock(rl);
1770		break;
1771
1772	case DKIOCFREE:
1773	{
1774		dkioc_free_t df;
1775		dmu_tx_t *tx;
1776
1777		if (ddi_copyin((void *)arg, &df, sizeof (df), flag)) {
1778			error = SET_ERROR(EFAULT);
1779			break;
1780		}
1781
1782		/*
1783		 * Apply Postel's Law to length-checking.  If they overshoot,
1784		 * just blank out until the end, if there's a need to blank
1785		 * out anything.
1786		 */
1787		if (df.df_start >= zv->zv_volsize)
1788			break;	/* No need to do anything... */
1789		if (df.df_start + df.df_length > zv->zv_volsize)
1790			df.df_length = DMU_OBJECT_END;
1791
1792		rl = zfs_range_lock(&zv->zv_znode, df.df_start, df.df_length,
1793		    RL_WRITER);
1794		tx = dmu_tx_create(zv->zv_objset);
1795		error = dmu_tx_assign(tx, TXG_WAIT);
1796		if (error != 0) {
1797			dmu_tx_abort(tx);
1798		} else {
1799			zvol_log_truncate(zv, tx, df.df_start,
1800			    df.df_length, B_TRUE);
1801			dmu_tx_commit(tx);
1802			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
1803			    df.df_start, df.df_length);
1804		}
1805
1806		zfs_range_unlock(rl);
1807
1808		if (error == 0) {
1809			/*
1810			 * If the write-cache is disabled or 'sync' property
1811			 * is set to 'always' then treat this as a synchronous
1812			 * operation (i.e. commit to zil).
1813			 */
1814			if (!(zv->zv_flags & ZVOL_WCE) ||
1815			    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS))
1816				zil_commit(zv->zv_zilog, ZVOL_OBJ);
1817
1818			/*
1819			 * If the caller really wants synchronous writes, and
1820			 * can't wait for them, don't return until the write
1821			 * is done.
1822			 */
1823			if (df.df_flags & DF_WAIT_SYNC) {
1824				txg_wait_synced(
1825				    dmu_objset_pool(zv->zv_objset), 0);
1826			}
1827		}
1828		break;
1829	}
1830
1831	default:
1832		error = SET_ERROR(ENOTTY);
1833		break;
1834
1835	}
1836	mutex_exit(&spa_namespace_lock);
1837	return (error);
1838}
1839#endif	/* sun */
1840
1841int
1842zvol_busy(void)
1843{
1844	return (zvol_minors != 0);
1845}
1846
1847void
1848zvol_init(void)
1849{
1850	VERIFY(ddi_soft_state_init(&zfsdev_state, sizeof (zfs_soft_state_t),
1851	    1) == 0);
1852	ZFS_LOG(1, "ZVOL Initialized.");
1853}
1854
1855void
1856zvol_fini(void)
1857{
1858	ddi_soft_state_fini(&zfsdev_state);
1859	ZFS_LOG(1, "ZVOL Deinitialized.");
1860}
1861
1862#ifdef sun
1863/*ARGSUSED*/
1864static int
1865zfs_mvdev_dump_feature_check(void *arg, dmu_tx_t *tx)
1866{
1867	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
1868
1869	if (spa_feature_is_active(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP))
1870		return (1);
1871	return (0);
1872}
1873
1874/*ARGSUSED*/
1875static void
1876zfs_mvdev_dump_activate_feature_sync(void *arg, dmu_tx_t *tx)
1877{
1878	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
1879
1880	spa_feature_incr(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, tx);
1881}
1882
1883static int
1884zvol_dump_init(zvol_state_t *zv, boolean_t resize)
1885{
1886	dmu_tx_t *tx;
1887	int error;
1888	objset_t *os = zv->zv_objset;
1889	spa_t *spa = dmu_objset_spa(os);
1890	vdev_t *vd = spa->spa_root_vdev;
1891	nvlist_t *nv = NULL;
1892	uint64_t version = spa_version(spa);
1893	enum zio_checksum checksum;
1894
1895	ASSERT(MUTEX_HELD(&spa_namespace_lock));
1896	ASSERT(vd->vdev_ops == &vdev_root_ops);
1897
1898	error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 0,
1899	    DMU_OBJECT_END);
1900	/* wait for dmu_free_long_range to actually free the blocks */
1901	txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
1902
1903	/*
1904	 * If the pool on which the dump device is being initialized has more
1905	 * than one child vdev, check that the MULTI_VDEV_CRASH_DUMP feature is
1906	 * enabled.  If so, bump that feature's counter to indicate that the
1907	 * feature is active. We also check the vdev type to handle the
1908	 * following case:
1909	 *   # zpool create test raidz disk1 disk2 disk3
1910	 *   Now spa_root_vdev->vdev_children == 1 (the raidz vdev),
1911	 *   while the raidz vdev itself has 3 children.
1912	 */
1913	if (vd->vdev_children > 1 || vd->vdev_ops == &vdev_raidz_ops) {
1914		if (!spa_feature_is_enabled(spa,
1915		    SPA_FEATURE_MULTI_VDEV_CRASH_DUMP))
1916			return (SET_ERROR(ENOTSUP));
1917		(void) dsl_sync_task(spa_name(spa),
1918		    zfs_mvdev_dump_feature_check,
1919		    zfs_mvdev_dump_activate_feature_sync, NULL, 2);
1920	}
1921
1922	tx = dmu_tx_create(os);
1923	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
1924	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
1925	error = dmu_tx_assign(tx, TXG_WAIT);
1926	if (error) {
1927		dmu_tx_abort(tx);
1928		return (error);
1929	}
1930
1931	/*
1932	 * If MULTI_VDEV_CRASH_DUMP is active, use the NOPARITY checksum
1933	 * function.  Otherwise, use the old default -- OFF.
1934	 */
1935	checksum = spa_feature_is_active(spa,
1936	    SPA_FEATURE_MULTI_VDEV_CRASH_DUMP) ? ZIO_CHECKSUM_NOPARITY :
1937	    ZIO_CHECKSUM_OFF;
1938
1939	/*
1940	 * If we are resizing the dump device then we only need to
1941	 * update the refreservation to match the newly updated
1942	 * zvolsize. Otherwise, we save off the original state of the
1943	 * zvol size. Otherwise, we save off the original state of the
1944	 * zvol so that we can restore it if the zvol is ever undumpified.
1945	if (resize) {
1946		error = zap_update(os, ZVOL_ZAP_OBJ,
1947		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
1948		    &zv->zv_volsize, tx);
1949	} else {
1950		uint64_t checksum, compress, refresrv, vbs, dedup;
1951
1952		error = dsl_prop_get_integer(zv->zv_name,
1953		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL);
1954		error = error ? error : dsl_prop_get_integer(zv->zv_name,
1955		    zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum, NULL);
1956		error = error ? error : dsl_prop_get_integer(zv->zv_name,
1957		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &refresrv, NULL);
1958		error = error ? error : dsl_prop_get_integer(zv->zv_name,
1959		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs, NULL);
1960		if (version >= SPA_VERSION_DEDUP) {
1961			error = error ? error :
1962			    dsl_prop_get_integer(zv->zv_name,
1963			    zfs_prop_to_name(ZFS_PROP_DEDUP), &dedup, NULL);
1964		}
1965
1966		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
1967		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1,
1968		    &compress, tx);
1969		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
1970		    zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum, tx);
1971		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
1972		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
1973		    &refresrv, tx);
1974		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
1975		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
1976		    &vbs, tx);
1977		error = error ? error : dmu_object_set_blocksize(
1978		    os, ZVOL_OBJ, SPA_MAXBLOCKSIZE, 0, tx);
1979		if (version >= SPA_VERSION_DEDUP) {
1980			error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
1981			    zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1,
1982			    &dedup, tx);
1983		}
1984		if (error == 0)
1985			zv->zv_volblocksize = SPA_MAXBLOCKSIZE;
1986	}
1987	dmu_tx_commit(tx);
1988
1989	/*
1990	 * We only need to update the zvol's properties if we are
1991	 * initializing the dump area for the first time.
1992	 */
1993	if (!resize) {
1994		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
1995		VERIFY(nvlist_add_uint64(nv,
1996		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 0) == 0);
1997		VERIFY(nvlist_add_uint64(nv,
1998		    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
1999		    ZIO_COMPRESS_OFF) == 0);
2000		VERIFY(nvlist_add_uint64(nv,
2001		    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
2002		    checksum) == 0);
2003		if (version >= SPA_VERSION_DEDUP) {
2004			VERIFY(nvlist_add_uint64(nv,
2005			    zfs_prop_to_name(ZFS_PROP_DEDUP),
2006			    ZIO_CHECKSUM_OFF) == 0);
2007		}
2008
2009		error = zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
2010		    nv, NULL);
2011		nvlist_free(nv);
2012
2013		if (error)
2014			return (error);
2015	}
2016
2017	/* Allocate the space for the dump */
2018	error = zvol_prealloc(zv);
2019	return (error);
2020}
2021
2022static int
2023zvol_dumpify(zvol_state_t *zv)
2024{
2025	int error = 0;
2026	uint64_t dumpsize = 0;
2027	dmu_tx_t *tx;
2028	objset_t *os = zv->zv_objset;
2029
2030	if (zv->zv_flags & ZVOL_RDONLY)
2031		return (SET_ERROR(EROFS));
2032
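	/*
	 * If a dump size is already recorded but does not match the
	 * current volume size, we are resizing an existing dump device
	 * rather than configuring a new one.
	 */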
2033	if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE,
2034	    8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) {
2035		boolean_t resize = (dumpsize > 0);
2036
2037		if ((error = zvol_dump_init(zv, resize)) != 0) {
2038			(void) zvol_dump_fini(zv);
2039			return (error);
2040		}
2041	}
2042
2043	/*
2044	 * Build up our lba mapping.
2045	 * Build up our LBA mapping.
2046	error = zvol_get_lbas(zv);
2047	if (error) {
2048		(void) zvol_dump_fini(zv);
2049		return (error);
2050	}
2051
2052	tx = dmu_tx_create(os);
2053	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
2054	error = dmu_tx_assign(tx, TXG_WAIT);
2055	if (error) {
2056		dmu_tx_abort(tx);
2057		(void) zvol_dump_fini(zv);
2058		return (error);
2059	}
2060
2061	zv->zv_flags |= ZVOL_DUMPIFIED;
2062	error = zap_update(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, 8, 1,
2063	    &zv->zv_volsize, tx);
2064	dmu_tx_commit(tx);
2065
2066	if (error) {
2067		(void) zvol_dump_fini(zv);
2068		return (error);
2069	}
2070
2071	txg_wait_synced(dmu_objset_pool(os), 0);
2072	return (0);
2073}
2074
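/*
 * Undo zvol_dumpify(): remove the dump size marker, restore the saved
 * checksum, compression, refreservation and dedup properties, free the
 * preallocated blocks and restore the original volume block size.
 */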
2075static int
2076zvol_dump_fini(zvol_state_t *zv)
2077{
2078	dmu_tx_t *tx;
2079	objset_t *os = zv->zv_objset;
2080	nvlist_t *nv;
2081	int error = 0;
2082	uint64_t checksum, compress, refresrv, vbs, dedup;
2083	uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset));
2084
2085	/*
2086	 * Attempt to restore the zvol back to its pre-dumpified state.
2087	 * This is a best-effort attempt as it's possible that not all
2088	 * of these properties were initialized during the dumpify process
2089	 * (i.e. error during zvol_dump_init).
2090	 * (e.g., due to an error during zvol_dump_init()).
2091
2092	tx = dmu_tx_create(os);
2093	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
2094	error = dmu_tx_assign(tx, TXG_WAIT);
2095	if (error) {
2096		dmu_tx_abort(tx);
2097		return (error);
2098	}
2099	(void) zap_remove(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, tx);
2100	dmu_tx_commit(tx);
2101
2102	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2103	    zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum);
2104	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2105	    zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &compress);
2106	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2107	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &refresrv);
2108	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2109	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, &vbs);
2110
2111	VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2112	(void) nvlist_add_uint64(nv,
2113	    zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum);
2114	(void) nvlist_add_uint64(nv,
2115	    zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress);
2116	(void) nvlist_add_uint64(nv,
2117	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv);
2118	if (version >= SPA_VERSION_DEDUP &&
2119	    zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
2120	    zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1, &dedup) == 0) {
2121		(void) nvlist_add_uint64(nv,
2122		    zfs_prop_to_name(ZFS_PROP_DEDUP), dedup);
2123	}
2124	(void) zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
2125	    nv, NULL);
2126	nvlist_free(nv);
2127
2128	zvol_free_extents(zv);
2129	zv->zv_flags &= ~ZVOL_DUMPIFIED;
2130	(void) dmu_free_long_range(os, ZVOL_OBJ, 0, DMU_OBJECT_END);
2131	/* wait for dmu_free_long_range to actually free the blocks */
2132	txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
2133	tx = dmu_tx_create(os);
2134	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
2135	error = dmu_tx_assign(tx, TXG_WAIT);
2136	if (error) {
2137		dmu_tx_abort(tx);
2138		return (error);
2139	}
2140	if (dmu_object_set_blocksize(os, ZVOL_OBJ, vbs, 0, tx) == 0)
2141		zv->zv_volblocksize = vbs;
2142	dmu_tx_commit(tx);
2143
2144	return (0);
2145}
2146#endif	/* sun */
2147
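/*
 * Allocate a zvol_state_t and create the GEOM geom and provider that
 * represent the named zvol.  The provider does not become usable until
 * zvol_geom_run() clears its error state.
 */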
2148static zvol_state_t *
2149zvol_geom_create(const char *name)
2150{
2151	struct g_provider *pp;
2152	struct g_geom *gp;
2153	zvol_state_t *zv;
2154
2155	gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
2156	gp->start = zvol_geom_start;
2157	gp->access = zvol_geom_access;
2158	pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
2159	pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
2160	pp->sectorsize = DEV_BSIZE;
2161
2162	zv = kmem_zalloc(sizeof(*zv), KM_SLEEP);
2163	zv->zv_provider = pp;
2164	zv->zv_state = 0;
2165	bioq_init(&zv->zv_queue);
2166	mtx_init(&zv->zv_queue_mtx, "zvol", NULL, MTX_DEF);
2167
2168	pp->private = zv;
2169
2170	return (zv);
2171}
2172
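/*
 * Announce the provider and start the worker thread that services bios
 * which cannot be handled from a non-sleepable context.
 */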
2173static void
2174zvol_geom_run(zvol_state_t *zv)
2175{
2176	struct g_provider *pp;
2177
2178	pp = zv->zv_provider;
2179	g_error_provider(pp, 0);
2180
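	/*
	 * sizeof(ZVOL_DRIVER) includes the terminating NUL, so the offset
	 * below also skips the '/' separator in the provider name.
	 */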
2181	kproc_kthread_add(zvol_geom_worker, zv, &zfsproc, NULL, 0, 0,
2182	    "zfskern", "zvol %s", pp->name + sizeof(ZVOL_DRIVER));
2183}
2184
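/*
 * Tear down a zvol's GEOM state: stop the worker thread, detach the
 * provider and free the zvol_state_t.
 */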
2185static void
2186zvol_geom_destroy(zvol_state_t *zv)
2187{
2188	struct g_provider *pp;
2189
2190	g_topology_assert();
2191
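	/*
	 * Ask the worker thread to exit (zv_state == 1) and wait for it
	 * to acknowledge (zv_state == 2) before destroying the queue lock.
	 */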
2192	mtx_lock(&zv->zv_queue_mtx);
2193	zv->zv_state = 1;
2194	wakeup_one(&zv->zv_queue);
2195	while (zv->zv_state != 2)
2196		msleep(&zv->zv_state, &zv->zv_queue_mtx, 0, "zvol:w", 0);
2197	mtx_destroy(&zv->zv_queue_mtx);
2198
2199	pp = zv->zv_provider;
2200	zv->zv_provider = NULL;
2201	pp->private = NULL;
2202	g_wither_geom(pp->geom, ENXIO);
2203
2204	kmem_free(zv, sizeof(*zv));
2205}
2206
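/*
 * GEOM access method: translate the access count deltas (acr/acw/ace)
 * into zvol_open()/zvol_close() calls with matching FREAD/FWRITE flags.
 */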
2207static int
2208zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
2209{
2210	int count, error, flags;
2211
2212	g_topology_assert();
2213
2214	/*
2215	 * To keep things simple, we expect either an open or a close, but
2216	 * not both at the same time.
2217	 */
2218	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
2219	    (acr <= 0 && acw <= 0 && ace <= 0),
2220	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
2221	    pp->name, acr, acw, ace));
2222
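	/*
	 * An orphaned provider (e.g. one whose zvol has been destroyed or
	 * renamed) may still see its access counts drop to zero; allow
	 * that, but fail any new opens.
	 */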
2223	if (pp->private == NULL) {
2224		if (acr <= 0 && acw <= 0 && ace <= 0)
2225			return (0);
2226		return (pp->error);
2227	}
2228
2229	/*
2230	 * We don't pass the FEXCL flag to zvol_open()/zvol_close() when
2231	 * ace != 0, because GEOM already handles exclusive access, and
2232	 * handles it a bit differently.  GEOM allows for multiple
2233	 * read/exclusive consumers, whereas ZFS allows only one exclusive
2234	 * consumer, no matter whether it is a reader or a writer.  GEOM's
2235	 * behavior is preferable, so leave the decision to GEOM.
2236	 */
2237
2238	count = acr + acw + ace;
2239	if (count == 0)
2240		return (0);
2241
2242	flags = 0;
2243	if (acr != 0 || ace != 0)
2244		flags |= FREAD;
2245	if (acw != 0)
2246		flags |= FWRITE;
2247
2248	g_topology_unlock();
2249	if (count > 0)
2250		error = zvol_open(pp, flags, count);
2251	else
2252		error = zvol_close(pp, flags, -count);
2253	g_topology_lock();
2254	return (error);
2255}
2256
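/*
 * GEOM start method.  Bios that arrive in a context which may not
 * sleep (e.g. from the g_down thread) are queued for the worker
 * thread; all others are handled inline.
 */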
2257static void
2258zvol_geom_start(struct bio *bp)
2259{
2260	zvol_state_t *zv;
2261	boolean_t first;
2262
2263	zv = bp->bio_to->private;
2264	ASSERT(zv != NULL);
2265	switch (bp->bio_cmd) {
2266	case BIO_FLUSH:
2267		if (!THREAD_CAN_SLEEP())
2268			goto enqueue;
2269		zil_commit(zv->zv_zilog, ZVOL_OBJ);
2270		g_io_deliver(bp, 0);
2271		break;
2272	case BIO_READ:
2273	case BIO_WRITE:
2274		if (!THREAD_CAN_SLEEP())
2275			goto enqueue;
2276		zvol_strategy(bp);
2277		break;
2278	case BIO_GETATTR:
2279	case BIO_DELETE:
2280	default:
2281		g_io_deliver(bp, EOPNOTSUPP);
2282		break;
2283	}
2284	return;
2285
2286enqueue:
2287	mtx_lock(&zv->zv_queue_mtx);
2288	first = (bioq_first(&zv->zv_queue) == NULL);
2289	bioq_insert_tail(&zv->zv_queue, bp);
2290	mtx_unlock(&zv->zv_queue_mtx);
2291	if (first)
2292		wakeup_one(&zv->zv_queue);
2293}
2294
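/*
 * Worker thread that services bios queued by zvol_geom_start(); it
 * exits once zvol_geom_destroy() sets zv_state to 1 and the queue is
 * drained.
 */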
2295static void
2296zvol_geom_worker(void *arg)
2297{
2298	zvol_state_t *zv;
2299	struct bio *bp;
2300
2301	thread_lock(curthread);
2302	sched_prio(curthread, PRIBIO);
2303	thread_unlock(curthread);
2304
2305	zv = arg;
2306	for (;;) {
2307		mtx_lock(&zv->zv_queue_mtx);
2308		bp = bioq_takefirst(&zv->zv_queue);
2309		if (bp == NULL) {
2310			if (zv->zv_state == 1) {
2311				zv->zv_state = 2;
2312				wakeup(&zv->zv_state);
2313				mtx_unlock(&zv->zv_queue_mtx);
2314				kthread_exit();
2315			}
2316			msleep(&zv->zv_queue, &zv->zv_queue_mtx, PRIBIO | PDROP,
2317			    "zvol:io", 0);
2318			continue;
2319		}
2320		mtx_unlock(&zv->zv_queue_mtx);
2321		switch (bp->bio_cmd) {
2322		case BIO_FLUSH:
2323			zil_commit(zv->zv_zilog, ZVOL_OBJ);
2324			g_io_deliver(bp, 0);
2325			break;
2326		case BIO_READ:
2327		case BIO_WRITE:
2328			zvol_strategy(bp);
2329			break;
2330		}
2331	}
2332}
2333
2334extern boolean_t dataset_name_hidden(const char *name);
2335
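/*
 * Create device minors for every snapshot of the given zvol dataset.
 * The caller passes a held objset and remains responsible for
 * releasing it.
 */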
2336static int
2337zvol_create_snapshots(objset_t *os, const char *name)
2338{
2339	uint64_t cookie, obj;
2340	char *sname;
2341	int error, len;
2342
2343	cookie = obj = 0;
2344	sname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
2345
2346#if 0
2347	(void) dmu_objset_find(name, dmu_objset_prefetch, NULL,
2348	    DS_FIND_SNAPSHOTS);
2349#endif
2350
2351	for (;;) {
2352		len = snprintf(sname, MAXPATHLEN, "%s@", name);
2353		if (len >= MAXPATHLEN) {
			/*
			 * Don't release "os" here: the caller still holds
			 * it and will release it after we return.
			 */
2355			error = SET_ERROR(ENAMETOOLONG);
2356			break;
2357		}
2358
2359		dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
2360		error = dmu_snapshot_list_next(os, MAXPATHLEN - len,
2361		    sname + len, &obj, &cookie, NULL);
2362		dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
2363		if (error != 0) {
2364			if (error == ENOENT)
2365				error = 0;
2366			break;
2367		}
2368
2369		if ((error = zvol_create_minor(sname)) != 0) {
2370			printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n",
2371			    sname, error);
2372			break;
2373		}
2374	}
2375
2376	kmem_free(sname, MAXPATHLEN);
2377	return (error);
2378}
2379
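/*
 * Recursively create device minors for the named dataset: a zvol gets
 * a minor for itself and for each of its snapshots, while a filesystem
 * is walked for child datasets to recurse into.
 */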
2380int
2381zvol_create_minors(const char *name)
2382{
2383	uint64_t cookie;
2384	objset_t *os;
2385	char *osname, *p;
2386	int error, len;
2387
2388	if (dataset_name_hidden(name))
2389		return (0);
2390
2391	if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
2392		printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n",
2393		    name, error);
2394		return (error);
2395	}
2396	if (dmu_objset_type(os) == DMU_OST_ZVOL) {
2397		dsl_dataset_long_hold(os->os_dsl_dataset, FTAG);
2398		dsl_pool_rele(dmu_objset_pool(os), FTAG);
2399		if ((error = zvol_create_minor(name)) == 0)
2400			error = zvol_create_snapshots(os, name);
2401		else {
2402			printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n",
2403			    name, error);
2404		}
2405		dsl_dataset_long_rele(os->os_dsl_dataset, FTAG);
2406		dsl_dataset_rele(os->os_dsl_dataset, FTAG);
2407		return (error);
2408	}
2409	if (dmu_objset_type(os) != DMU_OST_ZFS) {
2410		dmu_objset_rele(os, FTAG);
2411		return (0);
2412	}
2413
2414	osname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
2415	if (snprintf(osname, MAXPATHLEN, "%s/", name) >= MAXPATHLEN) {
2416		dmu_objset_rele(os, FTAG);
2417		kmem_free(osname, MAXPATHLEN);
2418		return (SET_ERROR(ENAMETOOLONG));
2419	}
2420	p = osname + strlen(osname);
2421	len = MAXPATHLEN - (p - osname);
2422
2423#if 0
2424	/* Prefetch the datasets. */
2425	cookie = 0;
2426	while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0) {
2427		if (!dataset_name_hidden(osname))
2428			(void) dmu_objset_prefetch(osname, NULL);
2429	}
2430#endif
2431
2432	cookie = 0;
2433	while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0) {
2435		dmu_objset_rele(os, FTAG);
2436		(void) zvol_create_minors(osname);
2437		if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
2438			printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n",
2439			    name, error);
			/* Don't leak the name buffer on the error path. */
			kmem_free(osname, MAXPATHLEN);
2440			return (error);
2441		}
2442	}
2443
2444	dmu_objset_rele(os, FTAG);
2445	kmem_free(osname, MAXPATHLEN);
2446	return (0);
2447}
2448
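/*
 * Rename a single zvol minor by withering the old GEOM provider and
 * creating a replacement under the new name.
 */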
2449static void
2450zvol_rename_minor(struct g_geom *gp, const char *newname)
2451{
2452	struct g_provider *pp;
2453	zvol_state_t *zv;
2454
2455	ASSERT(MUTEX_HELD(&spa_namespace_lock));
2456	g_topology_assert();
2457
2458	pp = LIST_FIRST(&gp->provider);
2459	ASSERT(pp != NULL);
2460	zv = pp->private;
2461	ASSERT(zv != NULL);
2462
2463	zv->zv_provider = NULL;
2464	g_wither_provider(pp, ENXIO);
2465
2466	pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
2467	pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
2468	pp->sectorsize = DEV_BSIZE;
2469	pp->mediasize = zv->zv_volsize;
2470	pp->private = zv;
2471	zv->zv_provider = pp;
2472	strlcpy(zv->zv_name, newname, sizeof(zv->zv_name));
2473	g_error_provider(pp, 0);
2474}
2475
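/*
 * Rename the minors of the given dataset and of all of its descendants
 * and snapshots, i.e. every minor whose name matches the old name or
 * starts with it followed by '/' or '@'.
 */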
2476void
2477zvol_rename_minors(const char *oldname, const char *newname)
2478{
2479	char name[MAXPATHLEN];
2480	struct g_provider *pp;
2481	struct g_geom *gp;
2482	size_t oldnamelen, newnamelen;
2483	zvol_state_t *zv;
2485
2486	oldnamelen = strlen(oldname);
2487	newnamelen = strlen(newname);
2488
2489	DROP_GIANT();
2490	mutex_enter(&spa_namespace_lock);
2491	g_topology_lock();
2492
2493	LIST_FOREACH(gp, &zfs_zvol_class.geom, geom) {
2494		pp = LIST_FIRST(&gp->provider);
2495		if (pp == NULL)
2496			continue;
2497		zv = pp->private;
2498		if (zv == NULL)
2499			continue;
2500		if (strcmp(zv->zv_name, oldname) == 0) {
2501			zvol_rename_minor(gp, newname);
2502		} else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 &&
2503		    (zv->zv_name[oldnamelen] == '/' ||
2504		     zv->zv_name[oldnamelen] == '@')) {
2505			snprintf(name, sizeof(name), "%s%c%s", newname,
2506			    zv->zv_name[oldnamelen],
2507			    zv->zv_name + oldnamelen + 1);
2508			zvol_rename_minor(gp, name);
2509		}
2510	}
2511
2512	g_topology_unlock();
2513	mutex_exit(&spa_namespace_lock);
2514	PICKUP_GIANT();
2515}
2516