/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
 */
/* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */
/* Copyright (c) 2013, Joyent, Inc. All rights reserved. */
/* Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved. */

#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dbuf.h>
#include <sys/dnode.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/dsl_prop.h>
#include <sys/dmu_zfetch.h>
#include <sys/zfs_ioctl.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/sa.h>
#include <sys/zfeature.h>
#ifdef _KERNEL
#include <sys/vm.h>
#include <sys/zfs_znode.h>
#endif

/*
 * Enable/disable nopwrite feature.
 */
int zfs_nopwrite_enabled = 1;
SYSCTL_DECL(_vfs_zfs);
TUNABLE_INT("vfs.zfs.nopwrite_enabled", &zfs_nopwrite_enabled);
SYSCTL_INT(_vfs_zfs, OID_AUTO, nopwrite_enabled, CTLFLAG_RDTUN,
    &zfs_nopwrite_enabled, 0, "Enable nopwrite feature");

const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
	{	DMU_BSWAP_UINT8,	TRUE,	"unallocated"		},
	{	DMU_BSWAP_ZAP,		TRUE,	"object directory"	},
	{	DMU_BSWAP_UINT64,	TRUE,	"object array"		},
	{	DMU_BSWAP_UINT8,	TRUE,	"packed nvlist"		},
	{	DMU_BSWAP_UINT64,	TRUE,	"packed nvlist size"	},
	{	DMU_BSWAP_UINT64,	TRUE,	"bpobj"			},
	{	DMU_BSWAP_UINT64,	TRUE,	"bpobj header"		},
	{	DMU_BSWAP_UINT64,	TRUE,	"SPA space map header"	},
	{	DMU_BSWAP_UINT64,	TRUE,	"SPA space map"		},
	{	DMU_BSWAP_UINT64,	TRUE,	"ZIL intent log"	},
	{	DMU_BSWAP_DNODE,	TRUE,	"DMU dnode"		},
	{	DMU_BSWAP_OBJSET,	TRUE,	"DMU objset"		},
	{	DMU_BSWAP_UINT64,	TRUE,	"DSL directory"		},
	{	DMU_BSWAP_ZAP,		TRUE,	"DSL directory child map"},
	{	DMU_BSWAP_ZAP,		TRUE,	"DSL dataset snap map"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"DSL props"		},
	{	DMU_BSWAP_UINT64,	TRUE,	"DSL dataset"		},
	{	DMU_BSWAP_ZNODE,	TRUE,	"ZFS znode"		},
	{	DMU_BSWAP_OLDACL,	TRUE,	"ZFS V0 ACL"		},
	{	DMU_BSWAP_UINT8,	FALSE,	"ZFS plain file"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS directory"		},
	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS master node"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS delete queue"	},
	{	DMU_BSWAP_UINT8,	FALSE,	"zvol object"		},
	{	DMU_BSWAP_ZAP,		TRUE,	"zvol prop"		},
	{	DMU_BSWAP_UINT8,	FALSE,	"other uint8[]"		},
	{	DMU_BSWAP_UINT64,	FALSE,	"other uint64[]"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"other ZAP"		},
	{	DMU_BSWAP_ZAP,		TRUE,	"persistent error log"	},
	{	DMU_BSWAP_UINT8,	TRUE,	"SPA history"		},
	{	DMU_BSWAP_UINT64,	TRUE,	"SPA history offsets"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"Pool properties"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"DSL permissions"	},
	{	DMU_BSWAP_ACL,		TRUE,	"ZFS ACL"		},
	{	DMU_BSWAP_UINT8,	TRUE,	"ZFS SYSACL"		},
	{	DMU_BSWAP_UINT8,	TRUE,	"FUID table"		},
	{	DMU_BSWAP_UINT64,	TRUE,	"FUID table size"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"DSL dataset next clones"},
	{	DMU_BSWAP_ZAP,		TRUE,	"scan work queue"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS user/group used"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS user/group quota"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"snapshot refcount tags"},
	{	DMU_BSWAP_ZAP,		TRUE,	"DDT ZAP algorithm"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"DDT statistics"	},
	{	DMU_BSWAP_UINT8,	TRUE,	"System attributes"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"SA master node"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"SA attr registration"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"SA attr layouts"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"scan translations"	},
	{	DMU_BSWAP_UINT8,	FALSE,	"deduplicated block"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"DSL deadlist map"	},
	{	DMU_BSWAP_UINT64,	TRUE,	"DSL deadlist map hdr"	},
	{	DMU_BSWAP_ZAP,		TRUE,	"DSL dir clones"	},
	{	DMU_BSWAP_UINT64,	TRUE,	"bpobj subobj"		}
};

const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
	{	byteswap_uint8_array,	"uint8"		},
	{	byteswap_uint16_array,	"uint16"	},
	{	byteswap_uint32_array,	"uint32"	},
	{	byteswap_uint64_array,	"uint64"	},
	{	zap_byteswap,		"zap"		},
	{	dnode_buf_byteswap,	"dnode"		},
	{	dmu_objset_byteswap,	"objset"	},
	{	zfs_znode_byteswap,	"znode"		},
	{	zfs_oldacl_byteswap,	"oldacl"	},
	{	zfs_acl_byteswap,	"acl"		}
};

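/*
 * Hold the dbuf that covers the given offset of the given object,
 * without reading its contents in.  On success the dbuf is returned in
 * *dbp with a hold for "tag"; the caller must release it with
 * dmu_buf_rele().
 */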
int
dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
    void *tag, dmu_buf_t **dbp)
{
	dnode_t *dn;
	uint64_t blkid;
	dmu_buf_impl_t *db;
	int err;

	err = dnode_hold(os, object, FTAG, &dn);
	if (err)
		return (err);
	blkid = dbuf_whichblock(dn, offset);
	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	db = dbuf_hold(dn, blkid, tag);
	rw_exit(&dn->dn_struct_rwlock);
	dnode_rele(dn, FTAG);

	if (db == NULL) {
		*dbp = NULL;
		return (SET_ERROR(EIO));
	}

	*dbp = &db->db;
	return (err);
}

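/*
 * As dmu_buf_hold_noread(), but also read the dbuf's contents in.
 * DMU_READ_NO_PREFETCH in "flags" suppresses prefetch of neighboring
 * blocks.
 */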
int
dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
    void *tag, dmu_buf_t **dbp, int flags)
{
	int err;
	int db_flags = DB_RF_CANFAIL;

	if (flags & DMU_READ_NO_PREFETCH)
		db_flags |= DB_RF_NOPREFETCH;

	err = dmu_buf_hold_noread(os, object, offset, tag, dbp);
	if (err == 0) {
		dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
		err = dbuf_read(db, NULL, db_flags);
		if (err != 0) {
			dbuf_rele(db, tag);
			*dbp = NULL;
		}
	}

	return (err);
}

int
dmu_bonus_max(void)
{
	return (DN_MAX_BONUSLEN);
}

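/*
 * Change the size of the dnode's bonus buffer.  "db_fake" must be the
 * bonus dbuf itself and "newsize" must fit within it; otherwise this
 * fails with EINVAL.
 */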
int
dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	dnode_t *dn;
	int error;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	if (dn->dn_bonus != db) {
		error = SET_ERROR(EINVAL);
	} else if (newsize < 0 || newsize > db_fake->db_size) {
		error = SET_ERROR(EINVAL);
	} else {
		dnode_setbonuslen(dn, newsize, tx);
		error = 0;
	}

	DB_DNODE_EXIT(db);
	return (error);
}

int
dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	dnode_t *dn;
	int error;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	if (!DMU_OT_IS_VALID(type)) {
		error = SET_ERROR(EINVAL);
	} else if (dn->dn_bonus != db) {
		error = SET_ERROR(EINVAL);
	} else {
		dnode_setbonus_type(dn, type, tx);
		error = 0;
	}

	DB_DNODE_EXIT(db);
	return (error);
}

dmu_object_type_t
dmu_get_bonustype(dmu_buf_t *db_fake)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	dnode_t *dn;
	dmu_object_type_t type;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	type = dn->dn_bonustype;
	DB_DNODE_EXIT(db);

	return (type);
}

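/*
 * Remove the object's spill block: clear any cached spill dbuf and then
 * drop the spill blkptr from the dnode itself.
 */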
int
dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
	dnode_t *dn;
	int error;

	error = dnode_hold(os, object, FTAG, &dn);
	if (error)
		return (error);
	dbuf_rm_spill(dn, tx);
	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	dnode_rm_spill(dn, tx);
	rw_exit(&dn->dn_struct_rwlock);
	dnode_rele(dn, FTAG);
	return (error);
}

/*
 * returns ENOENT, EIO, or 0.
 */
int
dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
{
	dnode_t *dn;
	dmu_buf_impl_t *db;
	int error;

	error = dnode_hold(os, object, FTAG, &dn);
	if (error)
		return (error);

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	if (dn->dn_bonus == NULL) {
		rw_exit(&dn->dn_struct_rwlock);
		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
		if (dn->dn_bonus == NULL)
			dbuf_create_bonus(dn);
	}
	db = dn->dn_bonus;

	/* as long as the bonus buf is held, the dnode will be held */
	if (refcount_add(&db->db_holds, tag) == 1) {
		VERIFY(dnode_add_ref(dn, db));
		atomic_inc_32(&dn->dn_dbufs_count);
	}

	/*
	 * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's
	 * hold and incrementing the dbuf count to ensure that dnode_move() sees
	 * a dnode hold for every dbuf.
	 */
	rw_exit(&dn->dn_struct_rwlock);

	dnode_rele(dn, FTAG);

	VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH));

	*dbp = &db->db;
	return (0);
}

/*
 * returns ENOENT, EIO, or 0.
 *
 * This interface will allocate a blank spill dbuf when a spill blk
 * doesn't already exist on the dnode.
 *
 * if you only want to find an already existing spill db, then
 * dmu_spill_hold_existing() should be used.
 */
int
dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp)
{
	dmu_buf_impl_t *db = NULL;
	int err;

	if ((flags & DB_RF_HAVESTRUCT) == 0)
		rw_enter(&dn->dn_struct_rwlock, RW_READER);

	db = dbuf_hold(dn, DMU_SPILL_BLKID, tag);

	if ((flags & DB_RF_HAVESTRUCT) == 0)
		rw_exit(&dn->dn_struct_rwlock);

	ASSERT(db != NULL);
	err = dbuf_read(db, NULL, flags);
	if (err == 0)
		*dbp = &db->db;
	else
		dbuf_rele(db, tag);
	return (err);
}

int
dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
	dnode_t *dn;
	int err;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) {
		err = SET_ERROR(EINVAL);
	} else {
		rw_enter(&dn->dn_struct_rwlock, RW_READER);

		if (!dn->dn_have_spill) {
			err = SET_ERROR(ENOENT);
		} else {
			err = dmu_spill_hold_by_dnode(dn,
			    DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp);
		}

		rw_exit(&dn->dn_struct_rwlock);
	}

	DB_DNODE_EXIT(db);
	return (err);
}

int
dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
	dnode_t *dn;
	int err;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp);
	DB_DNODE_EXIT(db);

	return (err);
}

/*
 * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
 * to take a held dnode rather than <os, object> -- the lookup is wasteful,
 * and can induce severe lock contention when writing to several files
 * whose dnodes are in the same block.
 */
static int
dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
    int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
{
	dmu_buf_t **dbp;
	uint64_t blkid, nblks, i;
	uint32_t dbuf_flags;
	int err;
	zio_t *zio;

	ASSERT(length <= DMU_MAX_ACCESS);

	dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT;
	if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz)
		dbuf_flags |= DB_RF_NOPREFETCH;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	if (dn->dn_datablkshift) {
		int blkshift = dn->dn_datablkshift;
		nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) -
		    P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
	} else {
		if (offset + length > dn->dn_datablksz) {
			zfs_panic_recover("zfs: accessing past end of object "
			    "%llx/%llx (size=%u access=%llu+%llu)",
			    (longlong_t)dn->dn_objset->
			    os_dsl_dataset->ds_object,
			    (longlong_t)dn->dn_object, dn->dn_datablksz,
			    (longlong_t)offset, (longlong_t)length);
			rw_exit(&dn->dn_struct_rwlock);
			return (SET_ERROR(EIO));
		}
		nblks = 1;
	}
	dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);

	zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
	blkid = dbuf_whichblock(dn, offset);
	for (i = 0; i < nblks; i++) {
		dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
		if (db == NULL) {
			rw_exit(&dn->dn_struct_rwlock);
			dmu_buf_rele_array(dbp, nblks, tag);
			zio_nowait(zio);
			return (SET_ERROR(EIO));
		}
		/* initiate async i/o */
		if (read)
			(void) dbuf_read(db, zio, dbuf_flags);
#ifdef _KERNEL
		else
			curthread->td_ru.ru_oublock++;
#endif
		dbp[i] = &db->db;
	}
	rw_exit(&dn->dn_struct_rwlock);

	/* wait for async i/o */
	err = zio_wait(zio);
	if (err) {
		dmu_buf_rele_array(dbp, nblks, tag);
		return (err);
	}

	/* wait for other io to complete */
	if (read) {
		for (i = 0; i < nblks; i++) {
			dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
			mutex_enter(&db->db_mtx);
			while (db->db_state == DB_READ ||
			    db->db_state == DB_FILL)
				cv_wait(&db->db_changed, &db->db_mtx);
			if (db->db_state == DB_UNCACHED)
				err = SET_ERROR(EIO);
			mutex_exit(&db->db_mtx);
			if (err) {
				dmu_buf_rele_array(dbp, nblks, tag);
				return (err);
			}
		}
	}

	*numbufsp = nblks;
	*dbpp = dbp;
	return (0);
}

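/*
 * Hold (and, for reads, read in) all the dbufs covering the
 * [offset, offset + length) range of the object.  The held dbufs are
 * returned in *dbpp with the count in *numbufsp; release them with
 * dmu_buf_rele_array().
 */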
static int
dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
    uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
{
	dnode_t *dn;
	int err;

	err = dnode_hold(os, object, FTAG, &dn);
	if (err)
		return (err);

	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
	    numbufsp, dbpp, DMU_READ_PREFETCH);

	dnode_rele(dn, FTAG);

	return (err);
}

int
dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
    uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	dnode_t *dn;
	int err;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
	    numbufsp, dbpp, DMU_READ_PREFETCH);
	DB_DNODE_EXIT(db);

	return (err);
}

void
dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
{
	int i;
	dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;

	if (numbufs == 0)
		return;

	for (i = 0; i < numbufs; i++) {
		if (dbp[i])
			dbuf_rele(dbp[i], tag);
	}

	kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
}

/*
 * Issue prefetch i/os for the given blocks.
 *
 * Note: The assumption is that we *know* these blocks will be needed
 * almost immediately.  Therefore, the prefetch i/os will be issued at
 * ZIO_PRIORITY_SYNC_READ.
 *
 * Note: indirect blocks and other metadata will be read synchronously,
 * causing this function to block if they are not already cached.
 */
void
dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
{
	dnode_t *dn;
	uint64_t blkid;
	int nblks, err;

	if (zfs_prefetch_disable)
		return;

	if (len == 0) {  /* they're interested in the bonus buffer */
		dn = DMU_META_DNODE(os);

		if (object == 0 || object >= DN_MAX_OBJECT)
			return;

		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
		dbuf_prefetch(dn, blkid, ZIO_PRIORITY_SYNC_READ);
		rw_exit(&dn->dn_struct_rwlock);
		return;
	}

	/*
	 * XXX - Note, if the dnode for the requested object is not
	 * already cached, we will do a *synchronous* read in the
	 * dnode_hold() call.  The same is true for any indirects.
	 */
	err = dnode_hold(os, object, FTAG, &dn);
	if (err != 0)
		return;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	if (dn->dn_datablkshift) {
		int blkshift = dn->dn_datablkshift;
		nblks = (P2ROUNDUP(offset + len, 1 << blkshift) -
		    P2ALIGN(offset, 1 << blkshift)) >> blkshift;
	} else {
		nblks = (offset < dn->dn_datablksz);
	}

	if (nblks != 0) {
		blkid = dbuf_whichblock(dn, offset);
		for (int i = 0; i < nblks; i++)
			dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_SYNC_READ);
	}

	rw_exit(&dn->dn_struct_rwlock);

	dnode_rele(dn, FTAG);
}

/*
 * Get the next "chunk" of file data to free.  We traverse the file from
 * the end so that the file gets shorter over time (if we crash in the
 * middle, this will leave us in a better state).  We find allocated file
 * data by simply searching the allocated level 1 indirects.
 *
 * On input, *start should be the first offset that does not need to be
 * freed (e.g. "offset + length").  On return, *start will be the first
 * offset that should be freed.
 */
static int
get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum)
{
	uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1);
	/* bytes of data covered by a level-1 indirect block */
	uint64_t iblkrange =
	    dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);

	ASSERT3U(minimum, <=, *start);

	if (*start - minimum <= iblkrange * maxblks) {
		*start = minimum;
		return (0);
	}
	ASSERT(ISP2(iblkrange));

	for (uint64_t blks = 0; *start > minimum && blks < maxblks; blks++) {
		int err;

		/*
		 * dnode_next_offset(BACKWARDS) will find an allocated L1
		 * indirect block at or before the input offset.  We must
		 * decrement *start so that it is at the end of the region
		 * to search.
		 */
		(*start)--;
		err = dnode_next_offset(dn,
		    DNODE_FIND_BACKWARDS, start, 2, 1, 0);

		/* if there are no indirect blocks before start, we are done */
		if (err == ESRCH) {
			*start = minimum;
			break;
		} else if (err != 0) {
			return (err);
		}

		/* set start to the beginning of this L1 indirect */
		*start = P2ALIGN(*start, iblkrange);
	}
	if (*start < minimum)
		*start = minimum;
	return (0);
}

static int
dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
    uint64_t length)
{
	uint64_t object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
	int err;

	if (offset >= object_size)
		return (0);

	if (length == DMU_OBJECT_END || offset + length > object_size)
		length = object_size - offset;

	while (length != 0) {
		uint64_t chunk_end, chunk_begin;

		chunk_end = chunk_begin = offset + length;

		/* move chunk_begin backwards to the beginning of this chunk */
		err = get_next_chunk(dn, &chunk_begin, offset);
		if (err)
			return (err);
		ASSERT3U(chunk_begin, >=, offset);
		ASSERT3U(chunk_begin, <=, chunk_end);

		dmu_tx_t *tx = dmu_tx_create(os);
		dmu_tx_hold_free(tx, dn->dn_object,
		    chunk_begin, chunk_end - chunk_begin);

		/*
		 * Mark this transaction as typically resulting in a net
		 * reduction in space used.
		 */
		dmu_tx_mark_netfree(tx);
		err = dmu_tx_assign(tx, TXG_WAIT);
		if (err) {
			dmu_tx_abort(tx);
			return (err);
		}
		dnode_free_range(dn, chunk_begin, chunk_end - chunk_begin, tx);
		dmu_tx_commit(tx);

		length -= chunk_end - chunk_begin;
	}
	return (0);
}

int
dmu_free_long_range(objset_t *os, uint64_t object,
    uint64_t offset, uint64_t length)
{
	dnode_t *dn;
	int err;

	err = dnode_hold(os, object, FTAG, &dn);
	if (err != 0)
		return (err);
	err = dmu_free_long_range_impl(os, dn, offset, length);

	/*
	 * It is important to zero out the maxblkid when freeing the entire
	 * file, so that (a) subsequent calls to dmu_free_long_range_impl()
	 * will take the fast path, and (b) dnode_reallocate() can verify
	 * that the entire file has been freed.
	 */
	if (err == 0 && offset == 0 && length == DMU_OBJECT_END)
		dn->dn_maxblkid = 0;

	dnode_rele(dn, FTAG);
	return (err);
}

int
dmu_free_long_object(objset_t *os, uint64_t object)
{
	dmu_tx_t *tx;
	int err;

	err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END);
	if (err != 0)
		return (err);

	tx = dmu_tx_create(os);
	dmu_tx_hold_bonus(tx, object);
	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
	dmu_tx_mark_netfree(tx);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err == 0) {
		err = dmu_object_free(os, object, tx);
		dmu_tx_commit(tx);
	} else {
		dmu_tx_abort(tx);
	}

	return (err);
}

int
dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
    uint64_t size, dmu_tx_t *tx)
{
	dnode_t *dn;
	int err = dnode_hold(os, object, FTAG, &dn);
	if (err)
		return (err);
	ASSERT(offset < UINT64_MAX);
	ASSERT(size == -1ULL || size <= UINT64_MAX - offset);
	dnode_free_range(dn, offset, size, tx);
	dnode_rele(dn, FTAG);
	return (0);
}

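/*
 * Read "size" bytes starting at "offset" of the given object into "buf".
 * For single-block objects, any portion of the request that lies past
 * the end of the data is zero-filled.
 */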
int
dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    void *buf, uint32_t flags)
{
	dnode_t *dn;
	dmu_buf_t **dbp;
	int numbufs, err;

	err = dnode_hold(os, object, FTAG, &dn);
	if (err)
		return (err);

	/*
	 * Deal with odd block sizes, where there can't be data past the first
	 * block.  If we ever do the tail block optimization, we will need to
	 * handle that here as well.
	 */
	if (dn->dn_maxblkid == 0) {
		int newsz = offset > dn->dn_datablksz ? 0 :
		    MIN(size, dn->dn_datablksz - offset);
		bzero((char *)buf + newsz, size - newsz);
		size = newsz;
	}

	while (size > 0) {
		uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
		int i;

		/*
		 * NB: we could do this block-at-a-time, but it's nice
		 * to be reading in parallel.
		 */
		err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
		    TRUE, FTAG, &numbufs, &dbp, flags);
		if (err)
			break;

		for (i = 0; i < numbufs; i++) {
			int tocpy;
			int bufoff;
			dmu_buf_t *db = dbp[i];

			ASSERT(size > 0);

			bufoff = offset - db->db_offset;
			tocpy = (int)MIN(db->db_size - bufoff, size);

			bcopy((char *)db->db_data + bufoff, buf, tocpy);

			offset += tocpy;
			size -= tocpy;
			buf = (char *)buf + tocpy;
		}
		dmu_buf_rele_array(dbp, numbufs, FTAG);
	}
	dnode_rele(dn, FTAG);
	return (err);
}

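/*
 * Write "size" bytes from "buf" to the given object at "offset" as part
 * of transaction "tx".  Fully overwritten dbufs are filled in place;
 * partially covered dbufs are dirtied (read-modify-write) via
 * dmu_buf_will_dirty().
 */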
void
dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    const void *buf, dmu_tx_t *tx)
{
	dmu_buf_t **dbp;
	int numbufs, i;

	if (size == 0)
		return;

	VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
	    FALSE, FTAG, &numbufs, &dbp));

	for (i = 0; i < numbufs; i++) {
		int tocpy;
		int bufoff;
		dmu_buf_t *db = dbp[i];

		ASSERT(size > 0);

		bufoff = offset - db->db_offset;
		tocpy = (int)MIN(db->db_size - bufoff, size);

		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);

		if (tocpy == db->db_size)
			dmu_buf_will_fill(db, tx);
		else
			dmu_buf_will_dirty(db, tx);

		bcopy(buf, (char *)db->db_data + bufoff, tocpy);

		if (tocpy == db->db_size)
			dmu_buf_fill_done(db, tx);

		offset += tocpy;
		size -= tocpy;
		buf = (char *)buf + tocpy;
	}
	dmu_buf_rele_array(dbp, numbufs, FTAG);
}

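/*
 * Mark the given range of the object as preallocated: each covered dbuf
 * is dirtied with dmu_buf_will_not_fill(), so no data is copied in.
 */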
void
dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    dmu_tx_t *tx)
{
	dmu_buf_t **dbp;
	int numbufs, i;

	if (size == 0)
		return;

	VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
	    FALSE, FTAG, &numbufs, &dbp));

	for (i = 0; i < numbufs; i++) {
		dmu_buf_t *db = dbp[i];

		dmu_buf_will_not_fill(db, tx);
	}
	dmu_buf_rele_array(dbp, numbufs, FTAG);
}

void
dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
    void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
    int compressed_size, int byteorder, dmu_tx_t *tx)
{
	dmu_buf_t *db;

	ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES);
	ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
	VERIFY0(dmu_buf_hold_noread(os, object, offset,
	    FTAG, &db));

	dmu_buf_write_embedded(db,
	    data, (bp_embedded_type_t)etype, (enum zio_compress)comp,
	    uncompressed_size, compressed_size, byteorder, tx);

	dmu_buf_rele(db, FTAG);
}

/*
 * DMU support for xuio
 */
kstat_t *xuio_ksp = NULL;

int
dmu_xuio_init(xuio_t *xuio, int nblk)
{
	dmu_xuio_t *priv;
	uio_t *uio = &xuio->xu_uio;

	uio->uio_iovcnt = nblk;
	uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP);

	priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP);
	priv->cnt = nblk;
	priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP);
	priv->iovp = uio->uio_iov;
	XUIO_XUZC_PRIV(xuio) = priv;

	if (XUIO_XUZC_RW(xuio) == UIO_READ)
		XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk);
	else
		XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk);

	return (0);
}

void
dmu_xuio_fini(xuio_t *xuio)
{
	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
	int nblk = priv->cnt;

	kmem_free(priv->iovp, nblk * sizeof (iovec_t));
	kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *));
	kmem_free(priv, sizeof (dmu_xuio_t));

	if (XUIO_XUZC_RW(xuio) == UIO_READ)
		XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk);
	else
		XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk);
}

/*
 * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf }
 * and increase priv->next by 1.
 */
int
dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n)
{
	struct iovec *iov;
	uio_t *uio = &xuio->xu_uio;
	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
	int i = priv->next++;

	ASSERT(i < priv->cnt);
	ASSERT(off + n <= arc_buf_size(abuf));
	iov = uio->uio_iov + i;
	iov->iov_base = (char *)abuf->b_data + off;
	iov->iov_len = n;
	priv->bufs[i] = abuf;
	return (0);
}

int
dmu_xuio_cnt(xuio_t *xuio)
{
	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
	return (priv->cnt);
}

arc_buf_t *
dmu_xuio_arcbuf(xuio_t *xuio, int i)
{
	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);

	ASSERT(i < priv->cnt);
	return (priv->bufs[i]);
}

void
dmu_xuio_clear(xuio_t *xuio, int i)
{
	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);

	ASSERT(i < priv->cnt);
	priv->bufs[i] = NULL;
}

static void
xuio_stat_init(void)
{
	xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc",
	    KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);
	if (xuio_ksp != NULL) {
		xuio_ksp->ks_data = &xuio_stats;
		kstat_install(xuio_ksp);
	}
}

static void
xuio_stat_fini(void)
{
	if (xuio_ksp != NULL) {
		kstat_delete(xuio_ksp);
		xuio_ksp = NULL;
	}
}

void
xuio_stat_wbuf_copied()
{
	XUIOSTAT_BUMP(xuiostat_wbuf_copied);
}

void
xuio_stat_wbuf_nocopy()
{
	XUIOSTAT_BUMP(xuiostat_wbuf_nocopy);
}

#ifdef _KERNEL
int
dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
{
	dmu_buf_t **dbp;
	int numbufs, i, err;
	xuio_t *xuio = NULL;

	/*
	 * NB: we could do this block-at-a-time, but it's nice
	 * to be reading in parallel.
	 */
	err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG,
	    &numbufs, &dbp);
	if (err)
		return (err);

#ifdef UIO_XUIO
	if (uio->uio_extflg == UIO_XUIO)
		xuio = (xuio_t *)uio;
#endif

	for (i = 0; i < numbufs; i++) {
		int tocpy;
		int bufoff;
		dmu_buf_t *db = dbp[i];

		ASSERT(size > 0);

		bufoff = uio->uio_loffset - db->db_offset;
		tocpy = (int)MIN(db->db_size - bufoff, size);

		if (xuio) {
			dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
			arc_buf_t *dbuf_abuf = dbi->db_buf;
			arc_buf_t *abuf = dbuf_loan_arcbuf(dbi);
			err = dmu_xuio_add(xuio, abuf, bufoff, tocpy);
			if (!err) {
				uio->uio_resid -= tocpy;
				uio->uio_loffset += tocpy;
			}

			if (abuf == dbuf_abuf)
				XUIOSTAT_BUMP(xuiostat_rbuf_nocopy);
			else
				XUIOSTAT_BUMP(xuiostat_rbuf_copied);
		} else {
			err = uiomove((char *)db->db_data + bufoff, tocpy,
			    UIO_READ, uio);
		}
		if (err)
			break;

		size -= tocpy;
	}
	dmu_buf_rele_array(dbp, numbufs, FTAG);

	return (err);
}

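/*
 * Common worker for dmu_write_uio() and dmu_write_uio_dbuf(): copy
 * "size" bytes from the uio into the held dnode's dbufs.
 */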
static int
dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx)
{
	dmu_buf_t **dbp;
	int numbufs;
	int err = 0;
	int i;

	err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
	    FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
	if (err)
		return (err);

	for (i = 0; i < numbufs; i++) {
		int tocpy;
		int bufoff;
		dmu_buf_t *db = dbp[i];

		ASSERT(size > 0);

		bufoff = uio->uio_loffset - db->db_offset;
		tocpy = (int)MIN(db->db_size - bufoff, size);

		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);

		if (tocpy == db->db_size)
			dmu_buf_will_fill(db, tx);
		else
			dmu_buf_will_dirty(db, tx);

		/*
		 * XXX uiomove could block forever (eg. nfs-backed
		 * pages).  There needs to be a uiolockdown() function
		 * to lock the pages in memory, so that uiomove won't
		 * block.
		 */
		err = uiomove((char *)db->db_data + bufoff, tocpy,
		    UIO_WRITE, uio);

		if (tocpy == db->db_size)
			dmu_buf_fill_done(db, tx);

		if (err)
			break;

		size -= tocpy;
	}

	dmu_buf_rele_array(dbp, numbufs, FTAG);
	return (err);
}

int
dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size,
    dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
	dnode_t *dn;
	int err;

	if (size == 0)
		return (0);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	err = dmu_write_uio_dnode(dn, uio, size, tx);
	DB_DNODE_EXIT(db);

	return (err);
}

int
dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
    dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	if (size == 0)
		return (0);

	err = dnode_hold(os, object, FTAG, &dn);
	if (err)
		return (err);

	err = dmu_write_uio_dnode(dn, uio, size, tx);

	dnode_rele(dn, FTAG);

	return (err);
}

#ifdef sun
int
dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    page_t *pp, dmu_tx_t *tx)
{
	dmu_buf_t **dbp;
	int numbufs, i;
	int err;

	if (size == 0)
		return (0);

	err = dmu_buf_hold_array(os, object, offset, size,
	    FALSE, FTAG, &numbufs, &dbp);
	if (err)
		return (err);

	for (i = 0; i < numbufs; i++) {
		int tocpy, copied, thiscpy;
		int bufoff;
		dmu_buf_t *db = dbp[i];
		caddr_t va;

		ASSERT(size > 0);
		ASSERT3U(db->db_size, >=, PAGESIZE);

		bufoff = offset - db->db_offset;
		tocpy = (int)MIN(db->db_size - bufoff, size);

		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);

		if (tocpy == db->db_size)
			dmu_buf_will_fill(db, tx);
		else
			dmu_buf_will_dirty(db, tx);

		for (copied = 0; copied < tocpy; copied += PAGESIZE) {
			ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff);
			thiscpy = MIN(PAGESIZE, tocpy - copied);
			va = zfs_map_page(pp, S_READ);
			bcopy(va, (char *)db->db_data + bufoff, thiscpy);
			zfs_unmap_page(pp, va);
			pp = pp->p_next;
			bufoff += PAGESIZE;
		}

		if (tocpy == db->db_size)
			dmu_buf_fill_done(db, tx);

		offset += tocpy;
		size -= tocpy;
	}
	dmu_buf_rele_array(dbp, numbufs, FTAG);
	return (err);
}

#else

int
dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    vm_page_t *ma, dmu_tx_t *tx)
{
	dmu_buf_t **dbp;
	struct sf_buf *sf;
	int numbufs, i;
	int err;

	if (size == 0)
		return (0);

	err = dmu_buf_hold_array(os, object, offset, size,
	    FALSE, FTAG, &numbufs, &dbp);
	if (err)
		return (err);

	for (i = 0; i < numbufs; i++) {
		int tocpy, copied, thiscpy;
		int bufoff;
		dmu_buf_t *db = dbp[i];
		caddr_t va;

		ASSERT(size > 0);
		ASSERT3U(db->db_size, >=, PAGESIZE);

		bufoff = offset - db->db_offset;
		tocpy = (int)MIN(db->db_size - bufoff, size);

		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);

		if (tocpy == db->db_size)
			dmu_buf_will_fill(db, tx);
		else
			dmu_buf_will_dirty(db, tx);

		for (copied = 0; copied < tocpy; copied += PAGESIZE) {
			ASSERT3U(ptoa((*ma)->pindex), ==, db->db_offset + bufoff);
			thiscpy = MIN(PAGESIZE, tocpy - copied);
			va = zfs_map_page(*ma, &sf);
			bcopy(va, (char *)db->db_data + bufoff, thiscpy);
			zfs_unmap_page(sf);
			ma += 1;
			bufoff += PAGESIZE;
		}

		if (tocpy == db->db_size)
			dmu_buf_fill_done(db, tx);

		offset += tocpy;
		size -= tocpy;
	}
	dmu_buf_rele_array(dbp, numbufs, FTAG);
	return (err);
}
#endif	/* sun */
#endif

/*
 * Allocate a loaned anonymous arc buffer.
 */
arc_buf_t *
dmu_request_arcbuf(dmu_buf_t *handle, int size)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;

	return (arc_loan_buf(db->db_objset->os_spa, size));
}

/*
 * Free a loaned arc buffer.
 */
void
dmu_return_arcbuf(arc_buf_t *buf)
{
	arc_return_buf(buf, FTAG);
	VERIFY(arc_buf_remove_ref(buf, FTAG));
}

/*
 * When possible, directly assign the passed loaned arc buffer to a dbuf.
 * If this is not possible, copy the contents of the passed arc buf via
 * dmu_write().
 */
void
dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
    dmu_tx_t *tx)
{
	dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle;
	dnode_t *dn;
	dmu_buf_impl_t *db;
	uint32_t blksz = (uint32_t)arc_buf_size(buf);
	uint64_t blkid;

	DB_DNODE_ENTER(dbuf);
	dn = DB_DNODE(dbuf);
	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	blkid = dbuf_whichblock(dn, offset);
	VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
	rw_exit(&dn->dn_struct_rwlock);
	DB_DNODE_EXIT(dbuf);

	if (offset == db->db.db_offset && blksz == db->db.db_size) {
		dbuf_assign_arcbuf(db, buf, tx);
		dbuf_rele(db, FTAG);
	} else {
		objset_t *os;
		uint64_t object;

		DB_DNODE_ENTER(dbuf);
		dn = DB_DNODE(dbuf);
		os = dn->dn_objset;
		object = dn->dn_object;
		DB_DNODE_EXIT(dbuf);

		dbuf_rele(db, FTAG);
		dmu_write(os, object, offset, blksz, buf->b_data, tx);
		dmu_return_arcbuf(buf);
		XUIOSTAT_BUMP(xuiostat_wbuf_copied);
	}
}

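/*
 * State passed through the dmu_sync() write pipeline to its ready/done
 * callbacks.
 */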
typedef struct {
	dbuf_dirty_record_t	*dsa_dr;
	dmu_sync_cb_t		*dsa_done;
	zgd_t			*dsa_zgd;
	dmu_tx_t		*dsa_tx;
} dmu_sync_arg_t;

/* ARGSUSED */
static void
dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
{
	dmu_sync_arg_t *dsa = varg;
	dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
	blkptr_t *bp = zio->io_bp;

	if (zio->io_error == 0) {
		if (BP_IS_HOLE(bp)) {
			/*
			 * A block of zeros may compress to a hole, but the
			 * block size still needs to be known for replay.
			 */
			BP_SET_LSIZE(bp, db->db_size);
		} else if (!BP_IS_EMBEDDED(bp)) {
			ASSERT(BP_GET_LEVEL(bp) == 0);
			bp->blk_fill = 1;
		}
	}
}

static void
dmu_sync_late_arrival_ready(zio_t *zio)
{
	dmu_sync_ready(zio, NULL, zio->io_private);
}

/* ARGSUSED */
static void
dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
{
	dmu_sync_arg_t *dsa = varg;
	dbuf_dirty_record_t *dr = dsa->dsa_dr;
	dmu_buf_impl_t *db = dr->dr_dbuf;

	mutex_enter(&db->db_mtx);
	ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
	if (zio->io_error == 0) {
		dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE);
		if (dr->dt.dl.dr_nopwrite) {
			blkptr_t *bp = zio->io_bp;
			blkptr_t *bp_orig = &zio->io_bp_orig;
			uint8_t chksum = BP_GET_CHECKSUM(bp_orig);

			ASSERT(BP_EQUAL(bp, bp_orig));
			ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF);
			ASSERT(zio_checksum_table[chksum].ci_dedup);
		}
		dr->dt.dl.dr_overridden_by = *zio->io_bp;
		dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
		dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
		if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by))
			BP_ZERO(&dr->dt.dl.dr_overridden_by);
	} else {
		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
	}
	cv_broadcast(&db->db_changed);
	mutex_exit(&db->db_mtx);

	dsa->dsa_done(dsa->dsa_zgd, zio->io_error);

	kmem_free(dsa, sizeof (*dsa));
}

static void
dmu_sync_late_arrival_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	dmu_sync_arg_t *dsa = zio->io_private;
	blkptr_t *bp_orig = &zio->io_bp_orig;

	if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
		/*
		 * If we didn't allocate a new block (i.e. ZIO_FLAG_NOPWRITE)
		 * then there is nothing to do here. Otherwise, free the
		 * newly allocated block in this txg.
		 */
		if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
			ASSERT(BP_EQUAL(bp, bp_orig));
		} else {
			ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
			ASSERT(zio->io_bp->blk_birth == zio->io_txg);
			ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
			zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
		}
	}

	dmu_tx_commit(dsa->dsa_tx);

	dsa->dsa_done(dsa->dsa_zgd, zio->io_error);

	kmem_free(dsa, sizeof (*dsa));
}

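/*
 * Handle a dmu_sync() request for a txg that is already syncing (or for
 * a frozen pool): assign a new transaction and write the log data to a
 * freshly allocated block instead of touching the dirty record.
 */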
static int
dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
    zio_prop_t *zp, zbookmark_phys_t *zb)
{
	dmu_sync_arg_t *dsa;
	dmu_tx_t *tx;

	tx = dmu_tx_create(os);
	dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
	if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
		dmu_tx_abort(tx);
		/* Make zl_get_data do txg_wait_synced() */
		return (SET_ERROR(EIO));
	}

	dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
	dsa->dsa_dr = NULL;
	dsa->dsa_done = done;
	dsa->dsa_zgd = zgd;
	dsa->dsa_tx = tx;

	zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
	    zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp,
	    dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done, dsa,
	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));

	return (0);
}

/*
 * Intent log support: sync the block associated with db to disk.
 * N.B. and XXX: the caller is responsible for making sure that the
 * data isn't changing while dmu_sync() is writing it.
 *
 * Return values:
 *
 *	EEXIST: this txg has already been synced, so there's nothing to do.
 *		The caller should not log the write.
 *
 *	ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
 *		The caller should not log the write.
 *
 *	EALREADY: this block is already in the process of being synced.
 *		The caller should track its progress (somehow).
 *
 *	EIO: could not do the I/O.
 *		The caller should do a txg_wait_synced().
 *
 *	0: the I/O has been initiated.
 *		The caller should log this blkptr in the done callback.
 *		It is possible that the I/O will fail, in which case
 *		the error will be reported to the done callback and
 *		propagated to pio from zio_done().
 */
int
dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
{
	blkptr_t *bp = zgd->zgd_bp;
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db;
	objset_t *os = db->db_objset;
	dsl_dataset_t *ds = os->os_dsl_dataset;
	dbuf_dirty_record_t *dr;
	dmu_sync_arg_t *dsa;
	zbookmark_phys_t zb;
	zio_prop_t zp;
	dnode_t *dn;

	ASSERT(pio != NULL);
	ASSERT(txg != 0);

	SET_BOOKMARK(&zb, ds->ds_object,
	    db->db.db_object, db->db_level, db->db_blkid);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);
	DB_DNODE_EXIT(db);

	/*
	 * If we're frozen (running ziltest), we always need to generate a bp.
	 */
	if (txg > spa_freeze_txg(os->os_spa))
		return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));

	/*
	 * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf()
	 * and us.  If we determine that this txg is not yet syncing,
	 * but it begins to sync a moment later, that's OK because the
	 * sync thread will block in dbuf_sync_leaf() until we drop db_mtx.
	 */
	mutex_enter(&db->db_mtx);

	if (txg <= spa_last_synced_txg(os->os_spa)) {
		/*
		 * This txg has already synced.  There's nothing to do.
		 */
		mutex_exit(&db->db_mtx);
		return (SET_ERROR(EEXIST));
	}

	if (txg <= spa_syncing_txg(os->os_spa)) {
		/*
		 * This txg is currently syncing, so we can't mess with
		 * the dirty record anymore; just write a new log block.
		 */
		mutex_exit(&db->db_mtx);
		return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
	}

	dr = db->db_last_dirty;
	while (dr && dr->dr_txg != txg)
		dr = dr->dr_next;

	if (dr == NULL) {
		/*
		 * There's no dr for this dbuf, so it must have been freed.
		 * There's no need to log writes to freed blocks, so we're done.
		 */
		mutex_exit(&db->db_mtx);
		return (SET_ERROR(ENOENT));
	}

1580243524Smm	ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg);
1581243524Smm
1582243524Smm	/*
1583243524Smm	 * Assume the on-disk data is X, the current syncing data is Y,
1584243524Smm	 * and the current in-memory data is Z (currently in dmu_sync).
	 * X and Z are identical but Y has been modified. Normally,
	 * when X and Z are the same we will perform a nopwrite but if Y
	 * is different we must disable nopwrite since the resulting write
	 * of Y to disk can free the block containing X. If we allowed a
	 * nopwrite to occur the block pointing to Z would reference a freed
	 * block. Since this is a rare case we simplify this by disabling
	 * nopwrite if the current dmu_sync-ing dbuf has been modified in
	 * a previous transaction.
	 */
	if (dr->dr_next)
		zp.zp_nopwrite = B_FALSE;

	ASSERT(dr->dr_txg == txg);
	if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
		/*
		 * We have already issued a sync write for this buffer,
		 * or this buffer has already been synced.  It could not
		 * have been dirtied since, or we would have cleared the state.
		 */
		mutex_exit(&db->db_mtx);
		return (SET_ERROR(EALREADY));
	}

	ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
	dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
	mutex_exit(&db->db_mtx);

	dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
	dsa->dsa_dr = dr;
	dsa->dsa_done = done;
	dsa->dsa_zgd = zgd;
	dsa->dsa_tx = NULL;

	zio_nowait(arc_write(pio, os->os_spa, txg,
	    bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
	    DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready,
	    NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE,
	    ZIO_FLAG_CANFAIL, &zb));

	return (0);
}
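
/*
 * A hedged sketch of an intent-log writer consuming the contract above.
 * The names example_sync_done() and example_log_block() are illustrative
 * only; the real consumers are zfs_get_data() and zil_lwb_commit().
 */
static void
example_sync_done(zgd_t *zgd, int error)
{
	/*
	 * Runs when the dmu_sync() I/O completes.  On success, zgd->zgd_bp
	 * points at the synced data and can be copied into the log record
	 * before the log block is issued.
	 */
}

static int
example_log_block(zio_t *pio, objset_t *os, uint64_t txg, zgd_t *zgd)
{
	int error = dmu_sync(pio, txg, example_sync_done, zgd);

	switch (error) {
	case 0:
		/* I/O initiated; the done callback will log the bp. */
		return (0);
	case EIO:
		/* Couldn't do the I/O; push the whole txg out instead. */
		txg_wait_synced(dmu_objset_pool(os), txg);
		return (0);
	case EEXIST:
	case ENOENT:
	case EALREADY:
		/* Synced, freed, or already in flight: nothing to log. */
		return (0);
	default:
		return (error);
	}
}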

int
dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
	dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	err = dnode_hold(os, object, FTAG, &dn);
	if (err)
		return (err);
	err = dnode_set_blksz(dn, size, ibs, tx);
	dnode_rele(dn, FTAG);
	return (err);
}

void
dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
	dmu_tx_t *tx)
{
	dnode_t *dn;

	/*
	 * Send streams include each object's checksum function.  This
	 * check ensures that the receiving system can understand the
	 * checksum function transmitted.
	 */
	ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS);

	VERIFY0(dnode_hold(os, object, FTAG, &dn));
	ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS);
	dn->dn_checksum = checksum;
	dnode_setdirty(dn, tx);
	dnode_rele(dn, FTAG);
}

void
dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
	dmu_tx_t *tx)
{
	dnode_t *dn;

	/*
	 * Send streams include each object's compression function.  This
	 * check ensures that the receiving system can understand the
	 * compression function transmitted.
	 */
	ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS);

	VERIFY0(dnode_hold(os, object, FTAG, &dn));
	dn->dn_compress = compress;
	dnode_setdirty(dn, tx);
	dnode_rele(dn, FTAG);
}
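
/*
 * A minimal sketch of the transaction protocol around these setters,
 * mirroring what the send-stream receive path does; the wrapper name
 * and the TXG_WAIT policy are illustrative only.
 */
static int
example_set_compress(objset_t *os, uint64_t object, uint8_t compress)
{
	dmu_tx_t *tx = dmu_tx_create(os);
	int err;

	/* Declare the dnode we intend to dirty before assigning a txg. */
	dmu_tx_hold_bonus(tx, object);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}
	dmu_object_set_compress(os, object, compress, tx);
	dmu_tx_commit(tx);
	return (0);
}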

int zfs_mdcomp_disable = 0;
TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable);
SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RW,
    &zfs_mdcomp_disable, 0, "Disable metadata compression");

/*
 * When the "redundant_metadata" property is set to "most", only indirect
 * blocks of this level and higher will have an additional ditto block.
 * With the default of 2, for example, a plain file's level-1 indirect
 * blocks keep the normal number of copies while level-2 and higher gain
 * one extra copy.
 */
int zfs_redundant_metadata_most_ditto_level = 2;

void
dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
{
	dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
	boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) ||
	    (wp & WP_SPILL));
	enum zio_checksum checksum = os->os_checksum;
	enum zio_compress compress = os->os_compress;
	enum zio_checksum dedup_checksum = os->os_dedup_checksum;
	boolean_t dedup = B_FALSE;
	boolean_t nopwrite = B_FALSE;
	boolean_t dedup_verify = os->os_dedup_verify;
	int copies = os->os_copies;

	/*
	 * We maintain different write policies for each of the following
	 * types of data:
	 *	 1. metadata
	 *	 2. preallocated blocks (i.e. level-0 blocks of a dump device)
	 *	 3. all other level 0 blocks
	 */
	if (ismd) {
		/*
		 * XXX -- we should design a compression algorithm
		 * that specializes in arrays of bps.
		 */
		boolean_t lz4_ac = spa_feature_is_active(os->os_spa,
		    SPA_FEATURE_LZ4_COMPRESS);

		if (zfs_mdcomp_disable) {
			compress = ZIO_COMPRESS_EMPTY;
		} else if (lz4_ac) {
			compress = ZIO_COMPRESS_LZ4;
		} else {
			compress = ZIO_COMPRESS_LZJB;
		}

		/*
		 * Metadata always gets checksummed.  If the data
		 * checksum is multi-bit correctable, and it's not a
		 * ZBT-style checksum, then it's suitable for metadata
		 * as well.  Otherwise, the metadata checksum defaults
		 * to fletcher4.
		 */
		if (zio_checksum_table[checksum].ci_correctable < 1 ||
		    zio_checksum_table[checksum].ci_eck)
			checksum = ZIO_CHECKSUM_FLETCHER_4;

		if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL ||
		    (os->os_redundant_metadata ==
		    ZFS_REDUNDANT_METADATA_MOST &&
		    (level >= zfs_redundant_metadata_most_ditto_level ||
		    DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))))
			copies++;
	} else if (wp & WP_NOFILL) {
		ASSERT(level == 0);

		/*
		 * If we're writing preallocated blocks, we aren't actually
		 * writing them so don't set any policy properties.  These
		 * blocks are currently only used by an external subsystem
		 * outside of zfs (i.e. dump) and not written by the zio
		 * pipeline.
		 */
		compress = ZIO_COMPRESS_OFF;
		checksum = ZIO_CHECKSUM_NOPARITY;
	} else {
		compress = zio_compress_select(dn->dn_compress, compress);

		checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ?
		    zio_checksum_select(dn->dn_checksum, checksum) :
		    dedup_checksum;

		/*
		 * Determine dedup setting.  If we are in dmu_sync(),
		 * we won't actually dedup now because that's all
		 * done in syncing context; but we do want to use the
		 * dedup checksum.  If the checksum is not strong
		 * enough to ensure unique signatures, force
		 * dedup_verify.
		 */
		if (dedup_checksum != ZIO_CHECKSUM_OFF) {
			dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE;
			if (!zio_checksum_table[checksum].ci_dedup)
				dedup_verify = B_TRUE;
		}

		/*
		 * Enable nopwrite if we have a cryptographically secure
		 * checksum that has no known collisions (i.e. SHA-256)
		 * and compression is enabled.  We don't enable nopwrite if
		 * dedup is enabled as the two features are mutually exclusive.
		 */
		nopwrite = (!dedup && zio_checksum_table[checksum].ci_dedup &&
		    compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
	}

	zp->zp_checksum = checksum;
	zp->zp_compress = compress;
	zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
	zp->zp_level = level;
	zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
	zp->zp_dedup = dedup;
	zp->zp_dedup_verify = dedup && dedup_verify;
	zp->zp_nopwrite = nopwrite;
}
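
/*
 * A short sketch of the two common call patterns above; the wrapper
 * name is illustrative.  A held dbuf supplies the dnode for ordinary
 * writes, while a NULL dnode (as in dmu_objset_sync()) yields the
 * policy for the objset block itself.
 */
static void
example_write_policies(objset_t *os, dmu_buf_impl_t *db)
{
	zio_prop_t zp_data, zp_objset;
	dnode_t *dn;

	/* Policy for an ordinary write of this dbuf's block. */
	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	dmu_write_policy(os, dn, db->db_level, 0, &zp_data);
	DB_DNODE_EXIT(db);

	/* Policy for the objset block itself (dn == NULL, level 0). */
	dmu_write_policy(os, NULL, 0, 0, &zp_objset);
}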

int
dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
{
	dnode_t *dn;
	int i, err;

	err = dnode_hold(os, object, FTAG, &dn);
	if (err)
		return (err);
	/*
	 * Sync any current changes before
	 * we go trundling through the block pointers.
	 */
	for (i = 0; i < TXG_SIZE; i++) {
		if (list_link_active(&dn->dn_dirty_link[i]))
			break;
	}
	if (i != TXG_SIZE) {
		dnode_rele(dn, FTAG);
		txg_wait_synced(dmu_objset_pool(os), 0);
		err = dnode_hold(os, object, FTAG, &dn);
		if (err)
			return (err);
	}

	err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
	dnode_rele(dn, FTAG);

	return (err);
}
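
/*
 * A hedged sketch of the SEEK_HOLE/SEEK_DATA logic built on
 * dmu_offset_next() (compare zfs_holey() in zfs_vnops.c); the wrapper
 * name and file_size parameter are illustrative only.
 */
static int
example_holey(objset_t *os, uint64_t object, uint64_t file_size,
    boolean_t hole, uint64_t *off)
{
	int error = dmu_offset_next(os, object, hole, off);

	if (error == ESRCH || (error == 0 && *off > file_size)) {
		if (hole) {
			/* Report the implicit hole at end of file. */
			*off = file_size;
			return (0);
		}
		/* Seeking for data past the last data block. */
		return (SET_ERROR(ENXIO));
	}
	return (error);
}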

void
dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
{
	dnode_phys_t *dnp;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	mutex_enter(&dn->dn_mtx);

	dnp = dn->dn_phys;

	doi->doi_data_block_size = dn->dn_datablksz;
	doi->doi_metadata_block_size = dn->dn_indblkshift ?
	    1ULL << dn->dn_indblkshift : 0;
	doi->doi_type = dn->dn_type;
	doi->doi_bonus_type = dn->dn_bonustype;
	doi->doi_bonus_size = dn->dn_bonuslen;
	doi->doi_indirection = dn->dn_nlevels;
	doi->doi_checksum = dn->dn_checksum;
	doi->doi_compress = dn->dn_compress;
	doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9;
	doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
	doi->doi_fill_count = 0;
	for (int i = 0; i < dnp->dn_nblkptr; i++)
		doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]);

	mutex_exit(&dn->dn_mtx);
	rw_exit(&dn->dn_struct_rwlock);
}

/*
 * Get information on a DMU object.
 * If doi is NULL, just indicates whether the object exists.
 */
int
dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
{
	dnode_t *dn;
	int err = dnode_hold(os, object, FTAG, &dn);

	if (err)
		return (err);

	if (doi != NULL)
		dmu_object_info_from_dnode(dn, doi);

	dnode_rele(dn, FTAG);
	return (0);
}
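
/*
 * A minimal sketch of the existence-check idiom described above; the
 * wrapper name is illustrative.
 */
static boolean_t
example_object_exists(objset_t *os, uint64_t object)
{
	/* A NULL doi just tests whether the dnode can be held. */
	return (dmu_object_info(os, object, NULL) == 0);
}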

/*
 * As above, but faster; can be used when you have a held dbuf in hand.
 */
void
dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	DB_DNODE_ENTER(db);
	dmu_object_info_from_dnode(DB_DNODE(db), doi);
	DB_DNODE_EXIT(db);
}

/*
 * Faster still when you only care about the size.
 * This is specifically optimized for zfs_getattr().
 */
void
dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize,
    u_longlong_t *nblk512)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	dnode_t *dn;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	*blksize = dn->dn_datablksz;
	/*
	 * Round the space used to the nearest 512-byte block and add 1
	 * for the dnode itself; e.g. 3072 bytes used reports
	 * ((3072 + 256) >> 9) + 1 == 7 blocks.
	 */
	*nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
	    SPA_MINBLOCKSHIFT) + 1;
	DB_DNODE_EXIT(db);
}

void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	int i;

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}

void
byteswap_uint32_array(void *vbuf, size_t size)
{
	uint32_t *buf = vbuf;
	size_t count = size >> 2;
	int i;

	ASSERT((size & 3) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_32(buf[i]);
}

void
byteswap_uint16_array(void *vbuf, size_t size)
{
	uint16_t *buf = vbuf;
	size_t count = size >> 1;
	int i;

	ASSERT((size & 1) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_16(buf[i]);
}

/* ARGSUSED */
void
byteswap_uint8_array(void *vbuf, size_t size)
{
}

void
dmu_init(void)
{
	zfs_dbgmsg_init();
	sa_cache_init();
	xuio_stat_init();
	dmu_objset_init();
	dnode_init();
	dbuf_init();
	zfetch_init();
	zio_compress_init();
	l2arc_init();
	arc_init();
}

void
dmu_fini(void)
{
	arc_fini(); /* arc depends on l2arc, so arc must go first */
	l2arc_fini();
	zfetch_fini();
	zio_compress_fini();
	dbuf_fini();
	dnode_fini();
	dmu_objset_fini();
	xuio_stat_fini();
	sa_cache_fini();
	zfs_dbgmsg_fini();
}