zdb.c revision 268653
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
25 */
26
27#include <stdio.h>
28#include <unistd.h>
29#include <stdio_ext.h>
30#include <stdlib.h>
31#include <ctype.h>
32#include <sys/zfs_context.h>
33#include <sys/spa.h>
34#include <sys/spa_impl.h>
35#include <sys/dmu.h>
36#include <sys/zap.h>
37#include <sys/fs/zfs.h>
38#include <sys/zfs_znode.h>
39#include <sys/zfs_sa.h>
40#include <sys/sa.h>
41#include <sys/sa_impl.h>
42#include <sys/vdev.h>
43#include <sys/vdev_impl.h>
44#include <sys/metaslab_impl.h>
45#include <sys/dmu_objset.h>
46#include <sys/dsl_dir.h>
47#include <sys/dsl_dataset.h>
48#include <sys/dsl_pool.h>
49#include <sys/dbuf.h>
50#include <sys/zil.h>
51#include <sys/zil_impl.h>
52#include <sys/stat.h>
53#include <sys/resource.h>
54#include <sys/dmu_traverse.h>
55#include <sys/zio_checksum.h>
56#include <sys/zio_compress.h>
57#include <sys/zfs_fuid.h>
58#include <sys/arc.h>
59#include <sys/ddt.h>
60#include <sys/zfeature.h>
61#include <zfs_comutil.h>
62#undef ZFS_MAXNAMELEN
63#undef verify
64#include <libzfs.h>
65
66#define	ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ?	\
67	zio_compress_table[(idx)].ci_name : "UNKNOWN")
68#define	ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ?	\
69	zio_checksum_table[(idx)].ci_name : "UNKNOWN")
70#define	ZDB_OT_NAME(idx) ((idx) < DMU_OT_NUMTYPES ?	\
71	dmu_ot[(idx)].ot_name : DMU_OT_IS_VALID(idx) ?	\
72	dmu_ot_byteswap[DMU_OT_BYTESWAP(idx)].ob_name : "UNKNOWN")
73#define	ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) :		\
74	(((idx) == DMU_OTN_ZAP_DATA || (idx) == DMU_OTN_ZAP_METADATA) ?	\
75	DMU_OT_ZAP_OTHER : DMU_OT_NUMTYPES))
76
77#ifndef lint
78extern boolean_t zfs_recover;
79#else
80boolean_t zfs_recover;
81#endif
82
83const char cmdname[] = "zdb";
84uint8_t dump_opt[256];
85
86typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size);
87
88extern void dump_intent_log(zilog_t *);
89uint64_t *zopt_object = NULL;
90int zopt_objects = 0;
91libzfs_handle_t *g_zfs;
92uint64_t max_inflight = 200;
93
94/*
95 * These libumem hooks provide a reasonable set of defaults for the allocator's
96 * debugging facilities.
97 */
/*
 * libumem debug hook: returns the default $UMEM_DEBUG setting.
 *
 * Declared with an explicit (void) parameter list so that the definition
 * is a real prototype, consistent with _umem_logging_init().
 */
const char *
_umem_debug_init(void)
{
	return ("default,verbose"); /* $UMEM_DEBUG setting */
}
103
/*
 * libumem logging hook: returns the default $UMEM_LOGGING setting.
 */
const char *
_umem_logging_init(void)
{
	static const char logging_defaults[] = "fail,contents";

	return (logging_defaults);
}
109
110static void
111usage(void)
112{
113	(void) fprintf(stderr,
114	    "Usage: %s [-CumdibcsDvhLXFPA] [-t txg] [-e [-p path...]] "
115	    "[-U config] [-M inflight I/Os] [-x dumpdir] poolname [object...]\n"
116	    "       %s [-divPA] [-e -p path...] [-U config] dataset "
117	    "[object...]\n"
118	    "       %s -m [-LXFPA] [-t txg] [-e [-p path...]] [-U config] "
119	    "poolname [vdev [metaslab...]]\n"
120	    "       %s -R [-A] [-e [-p path...]] poolname "
121	    "vdev:offset:size[:flags]\n"
122	    "       %s -S [-PA] [-e [-p path...]] [-U config] poolname\n"
123	    "       %s -l [-uA] device\n"
124	    "       %s -C [-A] [-U config]\n\n",
125	    cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname);
126
127	(void) fprintf(stderr, "    Dataset name must include at least one "
128	    "separator character '/' or '@'\n");
129	(void) fprintf(stderr, "    If dataset name is specified, only that "
130	    "dataset is dumped\n");
131	(void) fprintf(stderr, "    If object numbers are specified, only "
132	    "those objects are dumped\n\n");
133	(void) fprintf(stderr, "    Options to control amount of output:\n");
134	(void) fprintf(stderr, "        -u uberblock\n");
135	(void) fprintf(stderr, "        -d dataset(s)\n");
136	(void) fprintf(stderr, "        -i intent logs\n");
137	(void) fprintf(stderr, "        -C config (or cachefile if alone)\n");
138	(void) fprintf(stderr, "        -h pool history\n");
139	(void) fprintf(stderr, "        -b block statistics\n");
140	(void) fprintf(stderr, "        -m metaslabs\n");
141	(void) fprintf(stderr, "        -c checksum all metadata (twice for "
142	    "all data) blocks\n");
143	(void) fprintf(stderr, "        -s report stats on zdb's I/O\n");
144	(void) fprintf(stderr, "        -D dedup statistics\n");
145	(void) fprintf(stderr, "        -S simulate dedup to measure effect\n");
146	(void) fprintf(stderr, "        -v verbose (applies to all others)\n");
147	(void) fprintf(stderr, "        -l dump label contents\n");
148	(void) fprintf(stderr, "        -L disable leak tracking (do not "
149	    "load spacemaps)\n");
150	(void) fprintf(stderr, "        -R read and display block from a "
151	    "device\n\n");
152	(void) fprintf(stderr, "    Below options are intended for use "
153	    "with other options:\n");
154	(void) fprintf(stderr, "        -A ignore assertions (-A), enable "
155	    "panic recovery (-AA) or both (-AAA)\n");
156	(void) fprintf(stderr, "        -F attempt automatic rewind within "
157	    "safe range of transaction groups\n");
158	(void) fprintf(stderr, "        -U <cachefile_path> -- use alternate "
159	    "cachefile\n");
160	(void) fprintf(stderr, "        -X attempt extreme rewind (does not "
161	    "work with dataset)\n");
162	(void) fprintf(stderr, "        -e pool is exported/destroyed/"
163	    "has altroot/not in a cachefile\n");
164	(void) fprintf(stderr, "        -p <path> -- use one or more with "
165	    "-e to specify path to vdev dir\n");
166	(void) fprintf(stderr, "        -x <dumpdir> -- "
167	    "dump all read blocks into specified directory\n");
168	(void) fprintf(stderr, "        -P print numbers in parseable form\n");
169	(void) fprintf(stderr, "        -t <txg> -- highest txg to use when "
170	    "searching for uberblocks\n");
171	(void) fprintf(stderr, "        -M <number of inflight I/Os> -- "
172	    "specify the maximum number of "
173	    "checksumming I/Os [default is 200]\n");
174	(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
175	    "to make only that option verbose\n");
176	(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
177	exit(1);
178}
179
180/*
181 * Called for usage errors that are discovered after a call to spa_open(),
182 * dmu_bonus_hold(), or pool_match().  abort() is called for other errors.
183 */
184
185static void
186fatal(const char *fmt, ...)
187{
188	va_list ap;
189
190	va_start(ap, fmt);
191	(void) fprintf(stderr, "%s: ", cmdname);
192	(void) vfprintf(stderr, fmt, ap);
193	va_end(ap);
194	(void) fprintf(stderr, "\n");
195
196	exit(1);
197}
198
199/* ARGSUSED */
200static void
201dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
202{
203	nvlist_t *nv;
204	size_t nvsize = *(uint64_t *)data;
205	char *packed = umem_alloc(nvsize, UMEM_NOFAIL);
206
207	VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH));
208
209	VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0);
210
211	umem_free(packed, nvsize);
212
213	dump_nvlist(nv, 8);
214
215	nvlist_free(nv);
216}
217
218/* ARGSUSED */
219static void
220dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size)
221{
222	spa_history_phys_t *shp = data;
223
224	if (shp == NULL)
225		return;
226
227	(void) printf("\t\tpool_create_len = %llu\n",
228	    (u_longlong_t)shp->sh_pool_create_len);
229	(void) printf("\t\tphys_max_off = %llu\n",
230	    (u_longlong_t)shp->sh_phys_max_off);
231	(void) printf("\t\tbof = %llu\n",
232	    (u_longlong_t)shp->sh_bof);
233	(void) printf("\t\teof = %llu\n",
234	    (u_longlong_t)shp->sh_eof);
235	(void) printf("\t\trecords_lost = %llu\n",
236	    (u_longlong_t)shp->sh_records_lost);
237}
238
239static void
240zdb_nicenum(uint64_t num, char *buf)
241{
242	if (dump_opt['P'])
243		(void) sprintf(buf, "%llu", (longlong_t)num);
244	else
245		nicenum(num, buf);
246}
247
248const char histo_stars[] = "****************************************";
249const int histo_width = sizeof (histo_stars) - 1;
250
251static void
252dump_histogram(const uint64_t *histo, int size, int offset)
253{
254	int i;
255	int minidx = size - 1;
256	int maxidx = 0;
257	uint64_t max = 0;
258
259	for (i = 0; i < size; i++) {
260		if (histo[i] > max)
261			max = histo[i];
262		if (histo[i] > 0 && i > maxidx)
263			maxidx = i;
264		if (histo[i] > 0 && i < minidx)
265			minidx = i;
266	}
267
268	if (max < histo_width)
269		max = histo_width;
270
271	for (i = minidx; i <= maxidx; i++) {
272		(void) printf("\t\t\t%3u: %6llu %s\n",
273		    i + offset, (u_longlong_t)histo[i],
274		    &histo_stars[(max - histo[i]) * histo_width / max]);
275	}
276}
277
278static void
279dump_zap_stats(objset_t *os, uint64_t object)
280{
281	int error;
282	zap_stats_t zs;
283
284	error = zap_get_stats(os, object, &zs);
285	if (error)
286		return;
287
288	if (zs.zs_ptrtbl_len == 0) {
289		ASSERT(zs.zs_num_blocks == 1);
290		(void) printf("\tmicrozap: %llu bytes, %llu entries\n",
291		    (u_longlong_t)zs.zs_blocksize,
292		    (u_longlong_t)zs.zs_num_entries);
293		return;
294	}
295
296	(void) printf("\tFat ZAP stats:\n");
297
298	(void) printf("\t\tPointer table:\n");
299	(void) printf("\t\t\t%llu elements\n",
300	    (u_longlong_t)zs.zs_ptrtbl_len);
301	(void) printf("\t\t\tzt_blk: %llu\n",
302	    (u_longlong_t)zs.zs_ptrtbl_zt_blk);
303	(void) printf("\t\t\tzt_numblks: %llu\n",
304	    (u_longlong_t)zs.zs_ptrtbl_zt_numblks);
305	(void) printf("\t\t\tzt_shift: %llu\n",
306	    (u_longlong_t)zs.zs_ptrtbl_zt_shift);
307	(void) printf("\t\t\tzt_blks_copied: %llu\n",
308	    (u_longlong_t)zs.zs_ptrtbl_blks_copied);
309	(void) printf("\t\t\tzt_nextblk: %llu\n",
310	    (u_longlong_t)zs.zs_ptrtbl_nextblk);
311
312	(void) printf("\t\tZAP entries: %llu\n",
313	    (u_longlong_t)zs.zs_num_entries);
314	(void) printf("\t\tLeaf blocks: %llu\n",
315	    (u_longlong_t)zs.zs_num_leafs);
316	(void) printf("\t\tTotal blocks: %llu\n",
317	    (u_longlong_t)zs.zs_num_blocks);
318	(void) printf("\t\tzap_block_type: 0x%llx\n",
319	    (u_longlong_t)zs.zs_block_type);
320	(void) printf("\t\tzap_magic: 0x%llx\n",
321	    (u_longlong_t)zs.zs_magic);
322	(void) printf("\t\tzap_salt: 0x%llx\n",
323	    (u_longlong_t)zs.zs_salt);
324
325	(void) printf("\t\tLeafs with 2^n pointers:\n");
326	dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE, 0);
327
328	(void) printf("\t\tBlocks with n*5 entries:\n");
329	dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE, 0);
330
331	(void) printf("\t\tBlocks n/10 full:\n");
332	dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE, 0);
333
334	(void) printf("\t\tEntries with n chunks:\n");
335	dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE, 0);
336
337	(void) printf("\t\tBuckets with n entries:\n");
338	dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE, 0);
339}
340
341/*ARGSUSED*/
342static void
343dump_none(objset_t *os, uint64_t object, void *data, size_t size)
344{
345}
346
347/*ARGSUSED*/
348static void
349dump_unknown(objset_t *os, uint64_t object, void *data, size_t size)
350{
351	(void) printf("\tUNKNOWN OBJECT TYPE\n");
352}
353
354/*ARGSUSED*/
355void
356dump_uint8(objset_t *os, uint64_t object, void *data, size_t size)
357{
358}
359
360/*ARGSUSED*/
361static void
362dump_uint64(objset_t *os, uint64_t object, void *data, size_t size)
363{
364}
365
366/*ARGSUSED*/
367static void
368dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
369{
370	zap_cursor_t zc;
371	zap_attribute_t attr;
372	void *prop;
373	int i;
374
375	dump_zap_stats(os, object);
376	(void) printf("\n");
377
378	for (zap_cursor_init(&zc, os, object);
379	    zap_cursor_retrieve(&zc, &attr) == 0;
380	    zap_cursor_advance(&zc)) {
381		(void) printf("\t\t%s = ", attr.za_name);
382		if (attr.za_num_integers == 0) {
383			(void) printf("\n");
384			continue;
385		}
386		prop = umem_zalloc(attr.za_num_integers *
387		    attr.za_integer_length, UMEM_NOFAIL);
388		(void) zap_lookup(os, object, attr.za_name,
389		    attr.za_integer_length, attr.za_num_integers, prop);
390		if (attr.za_integer_length == 1) {
391			(void) printf("%s", (char *)prop);
392		} else {
393			for (i = 0; i < attr.za_num_integers; i++) {
394				switch (attr.za_integer_length) {
395				case 2:
396					(void) printf("%u ",
397					    ((uint16_t *)prop)[i]);
398					break;
399				case 4:
400					(void) printf("%u ",
401					    ((uint32_t *)prop)[i]);
402					break;
403				case 8:
404					(void) printf("%lld ",
405					    (u_longlong_t)((int64_t *)prop)[i]);
406					break;
407				}
408			}
409		}
410		(void) printf("\n");
411		umem_free(prop, attr.za_num_integers * attr.za_integer_length);
412	}
413	zap_cursor_fini(&zc);
414}
415
416/*ARGSUSED*/
417static void
418dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size)
419{
420	dump_zap_stats(os, object);
421	/* contents are printed elsewhere, properly decoded */
422}
423
424/*ARGSUSED*/
425static void
426dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size)
427{
428	zap_cursor_t zc;
429	zap_attribute_t attr;
430
431	dump_zap_stats(os, object);
432	(void) printf("\n");
433
434	for (zap_cursor_init(&zc, os, object);
435	    zap_cursor_retrieve(&zc, &attr) == 0;
436	    zap_cursor_advance(&zc)) {
437		(void) printf("\t\t%s = ", attr.za_name);
438		if (attr.za_num_integers == 0) {
439			(void) printf("\n");
440			continue;
441		}
442		(void) printf(" %llx : [%d:%d:%d]\n",
443		    (u_longlong_t)attr.za_first_integer,
444		    (int)ATTR_LENGTH(attr.za_first_integer),
445		    (int)ATTR_BSWAP(attr.za_first_integer),
446		    (int)ATTR_NUM(attr.za_first_integer));
447	}
448	zap_cursor_fini(&zc);
449}
450
451/*ARGSUSED*/
452static void
453dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size)
454{
455	zap_cursor_t zc;
456	zap_attribute_t attr;
457	uint16_t *layout_attrs;
458	int i;
459
460	dump_zap_stats(os, object);
461	(void) printf("\n");
462
463	for (zap_cursor_init(&zc, os, object);
464	    zap_cursor_retrieve(&zc, &attr) == 0;
465	    zap_cursor_advance(&zc)) {
466		(void) printf("\t\t%s = [", attr.za_name);
467		if (attr.za_num_integers == 0) {
468			(void) printf("\n");
469			continue;
470		}
471
472		VERIFY(attr.za_integer_length == 2);
473		layout_attrs = umem_zalloc(attr.za_num_integers *
474		    attr.za_integer_length, UMEM_NOFAIL);
475
476		VERIFY(zap_lookup(os, object, attr.za_name,
477		    attr.za_integer_length,
478		    attr.za_num_integers, layout_attrs) == 0);
479
480		for (i = 0; i != attr.za_num_integers; i++)
481			(void) printf(" %d ", (int)layout_attrs[i]);
482		(void) printf("]\n");
483		umem_free(layout_attrs,
484		    attr.za_num_integers * attr.za_integer_length);
485	}
486	zap_cursor_fini(&zc);
487}
488
489/*ARGSUSED*/
490static void
491dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size)
492{
493	zap_cursor_t zc;
494	zap_attribute_t attr;
495	const char *typenames[] = {
496		/* 0 */ "not specified",
497		/* 1 */ "FIFO",
498		/* 2 */ "Character Device",
499		/* 3 */ "3 (invalid)",
500		/* 4 */ "Directory",
501		/* 5 */ "5 (invalid)",
502		/* 6 */ "Block Device",
503		/* 7 */ "7 (invalid)",
504		/* 8 */ "Regular File",
505		/* 9 */ "9 (invalid)",
506		/* 10 */ "Symbolic Link",
507		/* 11 */ "11 (invalid)",
508		/* 12 */ "Socket",
509		/* 13 */ "Door",
510		/* 14 */ "Event Port",
511		/* 15 */ "15 (invalid)",
512	};
513
514	dump_zap_stats(os, object);
515	(void) printf("\n");
516
517	for (zap_cursor_init(&zc, os, object);
518	    zap_cursor_retrieve(&zc, &attr) == 0;
519	    zap_cursor_advance(&zc)) {
520		(void) printf("\t\t%s = %lld (type: %s)\n",
521		    attr.za_name, ZFS_DIRENT_OBJ(attr.za_first_integer),
522		    typenames[ZFS_DIRENT_TYPE(attr.za_first_integer)]);
523	}
524	zap_cursor_fini(&zc);
525}
526
527int
528get_dtl_refcount(vdev_t *vd)
529{
530	int refcount = 0;
531
532	if (vd->vdev_ops->vdev_op_leaf) {
533		space_map_t *sm = vd->vdev_dtl_sm;
534
535		if (sm != NULL &&
536		    sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
537			return (1);
538		return (0);
539	}
540
541	for (int c = 0; c < vd->vdev_children; c++)
542		refcount += get_dtl_refcount(vd->vdev_child[c]);
543	return (refcount);
544}
545
546int
547get_metaslab_refcount(vdev_t *vd)
548{
549	int refcount = 0;
550
551	if (vd->vdev_top == vd) {
552		for (int m = 0; m < vd->vdev_ms_count; m++) {
553			space_map_t *sm = vd->vdev_ms[m]->ms_sm;
554
555			if (sm != NULL &&
556			    sm->sm_dbuf->db_size == sizeof (space_map_phys_t))
557				refcount++;
558		}
559	}
560	for (int c = 0; c < vd->vdev_children; c++)
561		refcount += get_metaslab_refcount(vd->vdev_child[c]);
562
563	return (refcount);
564}
565
566static int
567verify_spacemap_refcounts(spa_t *spa)
568{
569	uint64_t expected_refcount = 0;
570	uint64_t actual_refcount;
571
572	(void) feature_get_refcount(spa,
573	    &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM],
574	    &expected_refcount);
575	actual_refcount = get_dtl_refcount(spa->spa_root_vdev);
576	actual_refcount += get_metaslab_refcount(spa->spa_root_vdev);
577
578	if (expected_refcount != actual_refcount) {
579		(void) printf("space map refcount mismatch: expected %lld != "
580		    "actual %lld\n",
581		    (longlong_t)expected_refcount,
582		    (longlong_t)actual_refcount);
583		return (2);
584	}
585	return (0);
586}
587
588static void
589dump_spacemap(objset_t *os, space_map_t *sm)
590{
591	uint64_t alloc, offset, entry;
592	char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
593			    "INVALID", "INVALID", "INVALID", "INVALID" };
594
595	if (sm == NULL)
596		return;
597
598	/*
599	 * Print out the freelist entries in both encoded and decoded form.
600	 */
601	alloc = 0;
602	for (offset = 0; offset < space_map_length(sm);
603	    offset += sizeof (entry)) {
604		uint8_t mapshift = sm->sm_shift;
605
606		VERIFY0(dmu_read(os, space_map_object(sm), offset,
607		    sizeof (entry), &entry, DMU_READ_PREFETCH));
608		if (SM_DEBUG_DECODE(entry)) {
609
610			(void) printf("\t    [%6llu] %s: txg %llu, pass %llu\n",
611			    (u_longlong_t)(offset / sizeof (entry)),
612			    ddata[SM_DEBUG_ACTION_DECODE(entry)],
613			    (u_longlong_t)SM_DEBUG_TXG_DECODE(entry),
614			    (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(entry));
615		} else {
616			(void) printf("\t    [%6llu]    %c  range:"
617			    " %010llx-%010llx  size: %06llx\n",
618			    (u_longlong_t)(offset / sizeof (entry)),
619			    SM_TYPE_DECODE(entry) == SM_ALLOC ? 'A' : 'F',
620			    (u_longlong_t)((SM_OFFSET_DECODE(entry) <<
621			    mapshift) + sm->sm_start),
622			    (u_longlong_t)((SM_OFFSET_DECODE(entry) <<
623			    mapshift) + sm->sm_start +
624			    (SM_RUN_DECODE(entry) << mapshift)),
625			    (u_longlong_t)(SM_RUN_DECODE(entry) << mapshift));
626			if (SM_TYPE_DECODE(entry) == SM_ALLOC)
627				alloc += SM_RUN_DECODE(entry) << mapshift;
628			else
629				alloc -= SM_RUN_DECODE(entry) << mapshift;
630		}
631	}
632	if (alloc != space_map_allocated(sm)) {
633		(void) printf("space_map_object alloc (%llu) INCONSISTENT "
634		    "with space map summary (%llu)\n",
635		    (u_longlong_t)space_map_allocated(sm), (u_longlong_t)alloc);
636	}
637}
638
639static void
640dump_metaslab_stats(metaslab_t *msp)
641{
642	char maxbuf[32];
643	range_tree_t *rt = msp->ms_tree;
644	avl_tree_t *t = &msp->ms_size_tree;
645	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
646
647	zdb_nicenum(metaslab_block_maxsize(msp), maxbuf);
648
649	(void) printf("\t %25s %10lu   %7s  %6s   %4s %4d%%\n",
650	    "segments", avl_numnodes(t), "maxsize", maxbuf,
651	    "freepct", free_pct);
652	(void) printf("\tIn-memory histogram:\n");
653	dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
654}
655
656static void
657dump_metaslab(metaslab_t *msp)
658{
659	vdev_t *vd = msp->ms_group->mg_vd;
660	spa_t *spa = vd->vdev_spa;
661	space_map_t *sm = msp->ms_sm;
662	char freebuf[32];
663
664	zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf);
665
666	(void) printf(
667	    "\tmetaslab %6llu   offset %12llx   spacemap %6llu   free    %5s\n",
668	    (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start,
669	    (u_longlong_t)space_map_object(sm), freebuf);
670
671	if (dump_opt['m'] > 2 && !dump_opt['L']) {
672		mutex_enter(&msp->ms_lock);
673		metaslab_load_wait(msp);
674		if (!msp->ms_loaded) {
675			VERIFY0(metaslab_load(msp));
676			range_tree_stat_verify(msp->ms_tree);
677		}
678		dump_metaslab_stats(msp);
679		metaslab_unload(msp);
680		mutex_exit(&msp->ms_lock);
681	}
682
683	if (dump_opt['m'] > 1 && sm != NULL &&
684	    spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
685		/*
686		 * The space map histogram represents free space in chunks
687		 * of sm_shift (i.e. bucket 0 refers to 2^sm_shift).
688		 */
689		(void) printf("\tOn-disk histogram:\n");
690		dump_histogram(sm->sm_phys->smp_histogram,
691		    SPACE_MAP_HISTOGRAM_SIZE(sm), sm->sm_shift);
692	}
693
694	if (dump_opt['d'] > 5 || dump_opt['m'] > 3) {
695		ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift));
696
697		mutex_enter(&msp->ms_lock);
698		dump_spacemap(spa->spa_meta_objset, msp->ms_sm);
699		mutex_exit(&msp->ms_lock);
700	}
701}
702
703static void
704print_vdev_metaslab_header(vdev_t *vd)
705{
706	(void) printf("\tvdev %10llu\n\t%-10s%5llu   %-19s   %-15s   %-10s\n",
707	    (u_longlong_t)vd->vdev_id,
708	    "metaslabs", (u_longlong_t)vd->vdev_ms_count,
709	    "offset", "spacemap", "free");
710	(void) printf("\t%15s   %19s   %15s   %10s\n",
711	    "---------------", "-------------------",
712	    "---------------", "-------------");
713}
714
715static void
716dump_metaslabs(spa_t *spa)
717{
718	vdev_t *vd, *rvd = spa->spa_root_vdev;
719	uint64_t m, c = 0, children = rvd->vdev_children;
720
721	(void) printf("\nMetaslabs:\n");
722
723	if (!dump_opt['d'] && zopt_objects > 0) {
724		c = zopt_object[0];
725
726		if (c >= children)
727			(void) fatal("bad vdev id: %llu", (u_longlong_t)c);
728
729		if (zopt_objects > 1) {
730			vd = rvd->vdev_child[c];
731			print_vdev_metaslab_header(vd);
732
733			for (m = 1; m < zopt_objects; m++) {
734				if (zopt_object[m] < vd->vdev_ms_count)
735					dump_metaslab(
736					    vd->vdev_ms[zopt_object[m]]);
737				else
738					(void) fprintf(stderr, "bad metaslab "
739					    "number %llu\n",
740					    (u_longlong_t)zopt_object[m]);
741			}
742			(void) printf("\n");
743			return;
744		}
745		children = c + 1;
746	}
747	for (; c < children; c++) {
748		vd = rvd->vdev_child[c];
749		print_vdev_metaslab_header(vd);
750
751		for (m = 0; m < vd->vdev_ms_count; m++)
752			dump_metaslab(vd->vdev_ms[m]);
753		(void) printf("\n");
754	}
755}
756
757static void
758dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index)
759{
760	const ddt_phys_t *ddp = dde->dde_phys;
761	const ddt_key_t *ddk = &dde->dde_key;
762	char *types[4] = { "ditto", "single", "double", "triple" };
763	char blkbuf[BP_SPRINTF_LEN];
764	blkptr_t blk;
765
766	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
767		if (ddp->ddp_phys_birth == 0)
768			continue;
769		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
770		snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk);
771		(void) printf("index %llx refcnt %llu %s %s\n",
772		    (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
773		    types[p], blkbuf);
774	}
775}
776
777static void
778dump_dedup_ratio(const ddt_stat_t *dds)
779{
780	double rL, rP, rD, D, dedup, compress, copies;
781
782	if (dds->dds_blocks == 0)
783		return;
784
785	rL = (double)dds->dds_ref_lsize;
786	rP = (double)dds->dds_ref_psize;
787	rD = (double)dds->dds_ref_dsize;
788	D = (double)dds->dds_dsize;
789
790	dedup = rD / D;
791	compress = rL / rP;
792	copies = rD / rP;
793
794	(void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, "
795	    "dedup * compress / copies = %.2f\n\n",
796	    dedup, compress, copies, dedup * compress / copies);
797}
798
799static void
800dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
801{
802	char name[DDT_NAMELEN];
803	ddt_entry_t dde;
804	uint64_t walk = 0;
805	dmu_object_info_t doi;
806	uint64_t count, dspace, mspace;
807	int error;
808
809	error = ddt_object_info(ddt, type, class, &doi);
810
811	if (error == ENOENT)
812		return;
813	ASSERT(error == 0);
814
815	error = ddt_object_count(ddt, type, class, &count);
816	ASSERT(error == 0);
817	if (count == 0)
818		return;
819
820	dspace = doi.doi_physical_blocks_512 << 9;
821	mspace = doi.doi_fill_count * doi.doi_data_block_size;
822
823	ddt_object_name(ddt, type, class, name);
824
825	(void) printf("%s: %llu entries, size %llu on disk, %llu in core\n",
826	    name,
827	    (u_longlong_t)count,
828	    (u_longlong_t)(dspace / count),
829	    (u_longlong_t)(mspace / count));
830
831	if (dump_opt['D'] < 3)
832		return;
833
834	zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]);
835
836	if (dump_opt['D'] < 4)
837		return;
838
839	if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE)
840		return;
841
842	(void) printf("%s contents:\n\n", name);
843
844	while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0)
845		dump_dde(ddt, &dde, walk);
846
847	ASSERT(error == ENOENT);
848
849	(void) printf("\n");
850}
851
852static void
853dump_all_ddts(spa_t *spa)
854{
855	ddt_histogram_t ddh_total = { 0 };
856	ddt_stat_t dds_total = { 0 };
857
858	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
859		ddt_t *ddt = spa->spa_ddt[c];
860		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
861			for (enum ddt_class class = 0; class < DDT_CLASSES;
862			    class++) {
863				dump_ddt(ddt, type, class);
864			}
865		}
866	}
867
868	ddt_get_dedup_stats(spa, &dds_total);
869
870	if (dds_total.dds_blocks == 0) {
871		(void) printf("All DDTs are empty\n");
872		return;
873	}
874
875	(void) printf("\n");
876
877	if (dump_opt['D'] > 1) {
878		(void) printf("DDT histogram (aggregated over all DDTs):\n");
879		ddt_get_dedup_histogram(spa, &ddh_total);
880		zpool_dump_ddt(&dds_total, &ddh_total);
881	}
882
883	dump_dedup_ratio(&dds_total);
884}
885
886static void
887dump_dtl_seg(void *arg, uint64_t start, uint64_t size)
888{
889	char *prefix = arg;
890
891	(void) printf("%s [%llu,%llu) length %llu\n",
892	    prefix,
893	    (u_longlong_t)start,
894	    (u_longlong_t)(start + size),
895	    (u_longlong_t)(size));
896}
897
898static void
899dump_dtl(vdev_t *vd, int indent)
900{
901	spa_t *spa = vd->vdev_spa;
902	boolean_t required;
903	char *name[DTL_TYPES] = { "missing", "partial", "scrub", "outage" };
904	char prefix[256];
905
906	spa_vdev_state_enter(spa, SCL_NONE);
907	required = vdev_dtl_required(vd);
908	(void) spa_vdev_state_exit(spa, NULL, 0);
909
910	if (indent == 0)
911		(void) printf("\nDirty time logs:\n\n");
912
913	(void) printf("\t%*s%s [%s]\n", indent, "",
914	    vd->vdev_path ? vd->vdev_path :
915	    vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa),
916	    required ? "DTL-required" : "DTL-expendable");
917
918	for (int t = 0; t < DTL_TYPES; t++) {
919		range_tree_t *rt = vd->vdev_dtl[t];
920		if (range_tree_space(rt) == 0)
921			continue;
922		(void) snprintf(prefix, sizeof (prefix), "\t%*s%s",
923		    indent + 2, "", name[t]);
924		mutex_enter(rt->rt_lock);
925		range_tree_walk(rt, dump_dtl_seg, prefix);
926		mutex_exit(rt->rt_lock);
927		if (dump_opt['d'] > 5 && vd->vdev_children == 0)
928			dump_spacemap(spa->spa_meta_objset, vd->vdev_dtl_sm);
929	}
930
931	for (int c = 0; c < vd->vdev_children; c++)
932		dump_dtl(vd->vdev_child[c], indent + 4);
933}
934
935/* from spa_history.c: spa_history_create_obj() */
936#define	HIS_BUF_LEN_DEF	(128 << 10)
937#define	HIS_BUF_LEN_MAX	(1 << 30)
938
939static void
940dump_history(spa_t *spa)
941{
942	nvlist_t **events = NULL;
943	char *buf = NULL;
944	uint64_t bufsize = HIS_BUF_LEN_DEF;
945	uint64_t resid, len, off = 0;
946	uint_t num = 0;
947	int error;
948	time_t tsec;
949	struct tm t;
950	char tbuf[30];
951	char internalstr[MAXPATHLEN];
952
953	if ((buf = malloc(bufsize)) == NULL)
954		(void) fprintf(stderr, "Unable to read history: "
955		    "out of memory\n");
956	do {
957		len = bufsize;
958
959		if ((error = spa_history_get(spa, &off, &len, buf)) != 0) {
960			(void) fprintf(stderr, "Unable to read history: "
961			    "error %d\n", error);
962			return;
963		}
964
965		if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0)
966			break;
967		off -= resid;
968
969		/*
970		 * If the history block is too big, double the buffer
971		 * size and try again.
972		 */
973		if (resid == len) {
974			free(buf);
975			buf = NULL;
976
977			bufsize <<= 1;
978			if ((bufsize >= HIS_BUF_LEN_MAX) ||
979			    ((buf = malloc(bufsize)) == NULL)) {
980				(void) fprintf(stderr, "Unable to read history: "
981				    "out of memory\n");
982				return;
983			}
984		}
985	} while (len != 0);
986	free(buf);
987
988	(void) printf("\nHistory:\n");
989	for (int i = 0; i < num; i++) {
990		uint64_t time, txg, ievent;
991		char *cmd, *intstr;
992		boolean_t printed = B_FALSE;
993
994		if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME,
995		    &time) != 0)
996			goto next;
997		if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD,
998		    &cmd) != 0) {
999			if (nvlist_lookup_uint64(events[i],
1000			    ZPOOL_HIST_INT_EVENT, &ievent) != 0)
1001				goto next;
1002			verify(nvlist_lookup_uint64(events[i],
1003			    ZPOOL_HIST_TXG, &txg) == 0);
1004			verify(nvlist_lookup_string(events[i],
1005			    ZPOOL_HIST_INT_STR, &intstr) == 0);
1006			if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS)
1007				goto next;
1008
1009			(void) snprintf(internalstr,
1010			    sizeof (internalstr),
1011			    "[internal %s txg:%lld] %s",
1012			    zfs_history_event_names[ievent], txg,
1013			    intstr);
1014			cmd = internalstr;
1015		}
1016		tsec = time;
1017		(void) localtime_r(&tsec, &t);
1018		(void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t);
1019		(void) printf("%s %s\n", tbuf, cmd);
1020		printed = B_TRUE;
1021
1022next:
1023		if (dump_opt['h'] > 1) {
1024			if (!printed)
1025				(void) printf("unrecognized record:\n");
1026			dump_nvlist(events[i], 2);
1027		}
1028	}
1029}
1030
1031/*ARGSUSED*/
1032static void
1033dump_dnode(objset_t *os, uint64_t object, void *data, size_t size)
1034{
1035}
1036
1037static uint64_t
1038blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_t *zb)
1039{
1040	if (dnp == NULL) {
1041		ASSERT(zb->zb_level < 0);
1042		if (zb->zb_object == 0)
1043			return (zb->zb_blkid);
1044		return (zb->zb_blkid * BP_GET_LSIZE(bp));
1045	}
1046
1047	ASSERT(zb->zb_level >= 0);
1048
1049	return ((zb->zb_blkid <<
1050	    (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) *
1051	    dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
1052}
1053
1054static void
1055snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp)
1056{
1057	const dva_t *dva = bp->blk_dva;
1058	int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1;
1059
1060	if (dump_opt['b'] >= 6) {
1061		snprintf_blkptr(blkbuf, buflen, bp);
1062		return;
1063	}
1064
1065	if (BP_IS_EMBEDDED(bp)) {
1066		(void) sprintf(blkbuf,
1067		    "EMBEDDED et=%u %llxL/%llxP B=%llu",
1068		    (int)BPE_GET_ETYPE(bp),
1069		    (u_longlong_t)BPE_GET_LSIZE(bp),
1070		    (u_longlong_t)BPE_GET_PSIZE(bp),
1071		    (u_longlong_t)bp->blk_birth);
1072		return;
1073	}
1074
1075	blkbuf[0] = '\0';
1076	for (int i = 0; i < ndvas; i++)
1077		(void) snprintf(blkbuf + strlen(blkbuf),
1078		    buflen - strlen(blkbuf), "%llu:%llx:%llx ",
1079		    (u_longlong_t)DVA_GET_VDEV(&dva[i]),
1080		    (u_longlong_t)DVA_GET_OFFSET(&dva[i]),
1081		    (u_longlong_t)DVA_GET_ASIZE(&dva[i]));
1082
1083	if (BP_IS_HOLE(bp)) {
1084		(void) snprintf(blkbuf + strlen(blkbuf),
1085		    buflen - strlen(blkbuf), "B=%llu",
1086		    (u_longlong_t)bp->blk_birth);
1087	} else {
1088		(void) snprintf(blkbuf + strlen(blkbuf),
1089		    buflen - strlen(blkbuf),
1090		    "%llxL/%llxP F=%llu B=%llu/%llu",
1091		    (u_longlong_t)BP_GET_LSIZE(bp),
1092		    (u_longlong_t)BP_GET_PSIZE(bp),
1093		    (u_longlong_t)BP_GET_FILL(bp),
1094		    (u_longlong_t)bp->blk_birth,
1095		    (u_longlong_t)BP_PHYSICAL_BIRTH(bp));
1096	}
1097}
1098
1099static void
1100print_indirect(blkptr_t *bp, const zbookmark_t *zb,
1101    const dnode_phys_t *dnp)
1102{
1103	char blkbuf[BP_SPRINTF_LEN];
1104	int l;
1105
1106	if (!BP_IS_EMBEDDED(bp)) {
1107		ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
1108		ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
1109	}
1110
1111	(void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb));
1112
1113	ASSERT(zb->zb_level >= 0);
1114
1115	for (l = dnp->dn_nlevels - 1; l >= -1; l--) {
1116		if (l == zb->zb_level) {
1117			(void) printf("L%llx", (u_longlong_t)zb->zb_level);
1118		} else {
1119			(void) printf(" ");
1120		}
1121	}
1122
1123	snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp);
1124	(void) printf("%s\n", blkbuf);
1125}
1126
1127static int
1128visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
1129    blkptr_t *bp, const zbookmark_t *zb)
1130{
1131	int err = 0;
1132
1133	if (bp->blk_birth == 0)
1134		return (0);
1135
1136	print_indirect(bp, zb, dnp);
1137
1138	if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) {
1139		uint32_t flags = ARC_WAIT;
1140		int i;
1141		blkptr_t *cbp;
1142		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
1143		arc_buf_t *buf;
1144		uint64_t fill = 0;
1145
1146		err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
1147		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
1148		if (err)
1149			return (err);
1150		ASSERT(buf->b_data);
1151
1152		/* recursively visit blocks below this */
1153		cbp = buf->b_data;
1154		for (i = 0; i < epb; i++, cbp++) {
1155			zbookmark_t czb;
1156
1157			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
1158			    zb->zb_level - 1,
1159			    zb->zb_blkid * epb + i);
1160			err = visit_indirect(spa, dnp, cbp, &czb);
1161			if (err)
1162				break;
1163			fill += BP_GET_FILL(cbp);
1164		}
1165		if (!err)
1166			ASSERT3U(fill, ==, BP_GET_FILL(bp));
1167		(void) arc_buf_remove_ref(buf, &buf);
1168	}
1169
1170	return (err);
1171}
1172
1173/*ARGSUSED*/
1174static void
1175dump_indirect(dnode_t *dn)
1176{
1177	dnode_phys_t *dnp = dn->dn_phys;
1178	int j;
1179	zbookmark_t czb;
1180
1181	(void) printf("Indirect blocks:\n");
1182
1183	SET_BOOKMARK(&czb, dmu_objset_id(dn->dn_objset),
1184	    dn->dn_object, dnp->dn_nlevels - 1, 0);
1185	for (j = 0; j < dnp->dn_nblkptr; j++) {
1186		czb.zb_blkid = j;
1187		(void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp,
1188		    &dnp->dn_blkptr[j], &czb);
1189	}
1190
1191	(void) printf("\n");
1192}
1193
1194/*ARGSUSED*/
1195static void
1196dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size)
1197{
1198	dsl_dir_phys_t *dd = data;
1199	time_t crtime;
1200	char nice[32];
1201
1202	if (dd == NULL)
1203		return;
1204
1205	ASSERT3U(size, >=, sizeof (dsl_dir_phys_t));
1206
1207	crtime = dd->dd_creation_time;
1208	(void) printf("\t\tcreation_time = %s", ctime(&crtime));
1209	(void) printf("\t\thead_dataset_obj = %llu\n",
1210	    (u_longlong_t)dd->dd_head_dataset_obj);
1211	(void) printf("\t\tparent_dir_obj = %llu\n",
1212	    (u_longlong_t)dd->dd_parent_obj);
1213	(void) printf("\t\torigin_obj = %llu\n",
1214	    (u_longlong_t)dd->dd_origin_obj);
1215	(void) printf("\t\tchild_dir_zapobj = %llu\n",
1216	    (u_longlong_t)dd->dd_child_dir_zapobj);
1217	zdb_nicenum(dd->dd_used_bytes, nice);
1218	(void) printf("\t\tused_bytes = %s\n", nice);
1219	zdb_nicenum(dd->dd_compressed_bytes, nice);
1220	(void) printf("\t\tcompressed_bytes = %s\n", nice);
1221	zdb_nicenum(dd->dd_uncompressed_bytes, nice);
1222	(void) printf("\t\tuncompressed_bytes = %s\n", nice);
1223	zdb_nicenum(dd->dd_quota, nice);
1224	(void) printf("\t\tquota = %s\n", nice);
1225	zdb_nicenum(dd->dd_reserved, nice);
1226	(void) printf("\t\treserved = %s\n", nice);
1227	(void) printf("\t\tprops_zapobj = %llu\n",
1228	    (u_longlong_t)dd->dd_props_zapobj);
1229	(void) printf("\t\tdeleg_zapobj = %llu\n",
1230	    (u_longlong_t)dd->dd_deleg_zapobj);
1231	(void) printf("\t\tflags = %llx\n",
1232	    (u_longlong_t)dd->dd_flags);
1233
1234#define	DO(which) \
1235	zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice); \
1236	(void) printf("\t\tused_breakdown[" #which "] = %s\n", nice)
1237	DO(HEAD);
1238	DO(SNAP);
1239	DO(CHILD);
1240	DO(CHILD_RSRV);
1241	DO(REFRSRV);
1242#undef DO
1243}
1244
1245/*ARGSUSED*/
1246static void
1247dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size)
1248{
1249	dsl_dataset_phys_t *ds = data;
1250	time_t crtime;
1251	char used[32], compressed[32], uncompressed[32], unique[32];
1252	char blkbuf[BP_SPRINTF_LEN];
1253
1254	if (ds == NULL)
1255		return;
1256
1257	ASSERT(size == sizeof (*ds));
1258	crtime = ds->ds_creation_time;
1259	zdb_nicenum(ds->ds_referenced_bytes, used);
1260	zdb_nicenum(ds->ds_compressed_bytes, compressed);
1261	zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed);
1262	zdb_nicenum(ds->ds_unique_bytes, unique);
1263	snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp);
1264
1265	(void) printf("\t\tdir_obj = %llu\n",
1266	    (u_longlong_t)ds->ds_dir_obj);
1267	(void) printf("\t\tprev_snap_obj = %llu\n",
1268	    (u_longlong_t)ds->ds_prev_snap_obj);
1269	(void) printf("\t\tprev_snap_txg = %llu\n",
1270	    (u_longlong_t)ds->ds_prev_snap_txg);
1271	(void) printf("\t\tnext_snap_obj = %llu\n",
1272	    (u_longlong_t)ds->ds_next_snap_obj);
1273	(void) printf("\t\tsnapnames_zapobj = %llu\n",
1274	    (u_longlong_t)ds->ds_snapnames_zapobj);
1275	(void) printf("\t\tnum_children = %llu\n",
1276	    (u_longlong_t)ds->ds_num_children);
1277	(void) printf("\t\tuserrefs_obj = %llu\n",
1278	    (u_longlong_t)ds->ds_userrefs_obj);
1279	(void) printf("\t\tcreation_time = %s", ctime(&crtime));
1280	(void) printf("\t\tcreation_txg = %llu\n",
1281	    (u_longlong_t)ds->ds_creation_txg);
1282	(void) printf("\t\tdeadlist_obj = %llu\n",
1283	    (u_longlong_t)ds->ds_deadlist_obj);
1284	(void) printf("\t\tused_bytes = %s\n", used);
1285	(void) printf("\t\tcompressed_bytes = %s\n", compressed);
1286	(void) printf("\t\tuncompressed_bytes = %s\n", uncompressed);
1287	(void) printf("\t\tunique = %s\n", unique);
1288	(void) printf("\t\tfsid_guid = %llu\n",
1289	    (u_longlong_t)ds->ds_fsid_guid);
1290	(void) printf("\t\tguid = %llu\n",
1291	    (u_longlong_t)ds->ds_guid);
1292	(void) printf("\t\tflags = %llx\n",
1293	    (u_longlong_t)ds->ds_flags);
1294	(void) printf("\t\tnext_clones_obj = %llu\n",
1295	    (u_longlong_t)ds->ds_next_clones_obj);
1296	(void) printf("\t\tprops_obj = %llu\n",
1297	    (u_longlong_t)ds->ds_props_obj);
1298	(void) printf("\t\tbp = %s\n", blkbuf);
1299}
1300
1301/* ARGSUSED */
1302static int
1303dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
1304{
1305	char blkbuf[BP_SPRINTF_LEN];
1306
1307	if (bp->blk_birth != 0) {
1308		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
1309		(void) printf("\t%s\n", blkbuf);
1310	}
1311	return (0);
1312}
1313
1314static void
1315dump_bptree(objset_t *os, uint64_t obj, char *name)
1316{
1317	char bytes[32];
1318	bptree_phys_t *bt;
1319	dmu_buf_t *db;
1320
1321	if (dump_opt['d'] < 3)
1322		return;
1323
1324	VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
1325	bt = db->db_data;
1326	zdb_nicenum(bt->bt_bytes, bytes);
1327	(void) printf("\n    %s: %llu datasets, %s\n",
1328	    name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes);
1329	dmu_buf_rele(db, FTAG);
1330
1331	if (dump_opt['d'] < 5)
1332		return;
1333
1334	(void) printf("\n");
1335
1336	(void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb, NULL, NULL);
1337}
1338
1339/* ARGSUSED */
1340static int
1341dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
1342{
1343	char blkbuf[BP_SPRINTF_LEN];
1344
1345	ASSERT(bp->blk_birth != 0);
1346	snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp);
1347	(void) printf("\t%s\n", blkbuf);
1348	return (0);
1349}
1350
1351static void
1352dump_bpobj(bpobj_t *bpo, char *name, int indent)
1353{
1354	char bytes[32];
1355	char comp[32];
1356	char uncomp[32];
1357
1358	if (dump_opt['d'] < 3)
1359		return;
1360
1361	zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes);
1362	if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
1363		zdb_nicenum(bpo->bpo_phys->bpo_comp, comp);
1364		zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp);
1365		(void) printf("    %*s: object %llu, %llu local blkptrs, "
1366		    "%llu subobjs, %s (%s/%s comp)\n",
1367		    indent * 8, name,
1368		    (u_longlong_t)bpo->bpo_object,
1369		    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
1370		    (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
1371		    bytes, comp, uncomp);
1372
1373		for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) {
1374			uint64_t subobj;
1375			bpobj_t subbpo;
1376			int error;
1377			VERIFY0(dmu_read(bpo->bpo_os,
1378			    bpo->bpo_phys->bpo_subobjs,
1379			    i * sizeof (subobj), sizeof (subobj), &subobj, 0));
1380			error = bpobj_open(&subbpo, bpo->bpo_os, subobj);
1381			if (error != 0) {
1382				(void) printf("ERROR %u while trying to open "
1383				    "subobj id %llu\n",
1384				    error, (u_longlong_t)subobj);
1385				continue;
1386			}
1387			dump_bpobj(&subbpo, "subobj", indent + 1);
1388			bpobj_close(&subbpo);
1389		}
1390	} else {
1391		(void) printf("    %*s: object %llu, %llu blkptrs, %s\n",
1392		    indent * 8, name,
1393		    (u_longlong_t)bpo->bpo_object,
1394		    (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
1395		    bytes);
1396	}
1397
1398	if (dump_opt['d'] < 5)
1399		return;
1400
1401
1402	if (indent == 0) {
1403		(void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL);
1404		(void) printf("\n");
1405	}
1406}
1407
1408static void
1409dump_deadlist(dsl_deadlist_t *dl)
1410{
1411	dsl_deadlist_entry_t *dle;
1412	uint64_t unused;
1413	char bytes[32];
1414	char comp[32];
1415	char uncomp[32];
1416
1417	if (dump_opt['d'] < 3)
1418		return;
1419
1420	zdb_nicenum(dl->dl_phys->dl_used, bytes);
1421	zdb_nicenum(dl->dl_phys->dl_comp, comp);
1422	zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp);
1423	(void) printf("\n    Deadlist: %s (%s/%s comp)\n",
1424	    bytes, comp, uncomp);
1425
1426	if (dump_opt['d'] < 4)
1427		return;
1428
1429	(void) printf("\n");
1430
1431	/* force the tree to be loaded */
1432	dsl_deadlist_space_range(dl, 0, UINT64_MAX, &unused, &unused, &unused);
1433
1434	for (dle = avl_first(&dl->dl_tree); dle;
1435	    dle = AVL_NEXT(&dl->dl_tree, dle)) {
1436		if (dump_opt['d'] >= 5) {
1437			char buf[128];
1438			(void) snprintf(buf, sizeof (buf), "mintxg %llu -> ",
1439			    (longlong_t)dle->dle_mintxg,
1440			    (longlong_t)dle->dle_bpobj.bpo_object);
1441
1442			dump_bpobj(&dle->dle_bpobj, buf, 0);
1443		} else {
1444			(void) printf("mintxg %llu -> obj %llu\n",
1445			    (longlong_t)dle->dle_mintxg,
1446			    (longlong_t)dle->dle_bpobj.bpo_object);
1447
1448		}
1449	}
1450}
1451
/*
 * State cached while dumping the znodes of one dataset.
 *
 * idx_tree and domain_tree are the FUID tables created and loaded by
 * dump_uidgid(); fuid_table_loaded records that they exist so that
 * fuid_table_destroy() knows whether there is anything to tear down.
 *
 * sa_loaded records that dump_znode() has already run sa_setup(), which
 * fills in sa_attr_table.  dump_one_dir() clears it after each dataset.
 */
static avl_tree_t idx_tree;
static avl_tree_t domain_tree;
static boolean_t fuid_table_loaded;
static boolean_t sa_loaded;
sa_attr_type_t *sa_attr_table;
1457
1458static void
1459fuid_table_destroy()
1460{
1461	if (fuid_table_loaded) {
1462		zfs_fuid_table_destroy(&idx_tree, &domain_tree);
1463		fuid_table_loaded = B_FALSE;
1464	}
1465}
1466
1467/*
1468 * print uid or gid information.
1469 * For normal POSIX id just the id is printed in decimal format.
1470 * For CIFS files with FUID the fuid is printed in hex followed by
1471 * the domain-rid string.
1472 */
1473static void
1474print_idstr(uint64_t id, const char *id_type)
1475{
1476	if (FUID_INDEX(id)) {
1477		char *domain;
1478
1479		domain = zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id));
1480		(void) printf("\t%s     %llx [%s-%d]\n", id_type,
1481		    (u_longlong_t)id, domain, (int)FUID_RID(id));
1482	} else {
1483		(void) printf("\t%s     %llu\n", id_type, (u_longlong_t)id);
1484	}
1485
1486}
1487
1488static void
1489dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid)
1490{
1491	uint32_t uid_idx, gid_idx;
1492
1493	uid_idx = FUID_INDEX(uid);
1494	gid_idx = FUID_INDEX(gid);
1495
1496	/* Load domain table, if not already loaded */
1497	if (!fuid_table_loaded && (uid_idx || gid_idx)) {
1498		uint64_t fuid_obj;
1499
1500		/* first find the fuid object.  It lives in the master node */
1501		VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES,
1502		    8, 1, &fuid_obj) == 0);
1503		zfs_fuid_avl_tree_create(&idx_tree, &domain_tree);
1504		(void) zfs_fuid_table_load(os, fuid_obj,
1505		    &idx_tree, &domain_tree);
1506		fuid_table_loaded = B_TRUE;
1507	}
1508
1509	print_idstr(uid, "uid");
1510	print_idstr(gid, "gid");
1511}
1512
1513/*ARGSUSED*/
1514static void
1515dump_znode(objset_t *os, uint64_t object, void *data, size_t size)
1516{
1517	char path[MAXPATHLEN * 2];	/* allow for xattr and failure prefix */
1518	sa_handle_t *hdl;
1519	uint64_t xattr, rdev, gen;
1520	uint64_t uid, gid, mode, fsize, parent, links;
1521	uint64_t pflags;
1522	uint64_t acctm[2], modtm[2], chgtm[2], crtm[2];
1523	time_t z_crtime, z_atime, z_mtime, z_ctime;
1524	sa_bulk_attr_t bulk[12];
1525	int idx = 0;
1526	int error;
1527
1528	if (!sa_loaded) {
1529		uint64_t sa_attrs = 0;
1530		uint64_t version;
1531
1532		VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
1533		    8, 1, &version) == 0);
1534		if (version >= ZPL_VERSION_SA) {
1535			VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS,
1536			    8, 1, &sa_attrs) == 0);
1537		}
1538		if ((error = sa_setup(os, sa_attrs, zfs_attr_table,
1539		    ZPL_END, &sa_attr_table)) != 0) {
1540			(void) printf("sa_setup failed errno %d, can't "
1541			    "display znode contents\n", error);
1542			return;
1543		}
1544		sa_loaded = B_TRUE;
1545	}
1546
1547	if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) {
1548		(void) printf("Failed to get handle for SA znode\n");
1549		return;
1550	}
1551
1552	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8);
1553	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8);
1554	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL,
1555	    &links, 8);
1556	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8);
1557	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL,
1558	    &mode, 8);
1559	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT],
1560	    NULL, &parent, 8);
1561	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL,
1562	    &fsize, 8);
1563	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL,
1564	    acctm, 16);
1565	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL,
1566	    modtm, 16);
1567	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL,
1568	    crtm, 16);
1569	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL,
1570	    chgtm, 16);
1571	SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL,
1572	    &pflags, 8);
1573
1574	if (sa_bulk_lookup(hdl, bulk, idx)) {
1575		(void) sa_handle_destroy(hdl);
1576		return;
1577	}
1578
1579	error = zfs_obj_to_path(os, object, path, sizeof (path));
1580	if (error != 0) {
1581		(void) snprintf(path, sizeof (path), "\?\?\?<object#%llu>",
1582		    (u_longlong_t)object);
1583	}
1584	if (dump_opt['d'] < 3) {
1585		(void) printf("\t%s\n", path);
1586		(void) sa_handle_destroy(hdl);
1587		return;
1588	}
1589
1590	z_crtime = (time_t)crtm[0];
1591	z_atime = (time_t)acctm[0];
1592	z_mtime = (time_t)modtm[0];
1593	z_ctime = (time_t)chgtm[0];
1594
1595	(void) printf("\tpath	%s\n", path);
1596	dump_uidgid(os, uid, gid);
1597	(void) printf("\tatime	%s", ctime(&z_atime));
1598	(void) printf("\tmtime	%s", ctime(&z_mtime));
1599	(void) printf("\tctime	%s", ctime(&z_ctime));
1600	(void) printf("\tcrtime	%s", ctime(&z_crtime));
1601	(void) printf("\tgen	%llu\n", (u_longlong_t)gen);
1602	(void) printf("\tmode	%llo\n", (u_longlong_t)mode);
1603	(void) printf("\tsize	%llu\n", (u_longlong_t)fsize);
1604	(void) printf("\tparent	%llu\n", (u_longlong_t)parent);
1605	(void) printf("\tlinks	%llu\n", (u_longlong_t)links);
1606	(void) printf("\tpflags	%llx\n", (u_longlong_t)pflags);
1607	if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr,
1608	    sizeof (uint64_t)) == 0)
1609		(void) printf("\txattr	%llu\n", (u_longlong_t)xattr);
1610	if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev,
1611	    sizeof (uint64_t)) == 0)
1612		(void) printf("\trdev	0x%016llx\n", (u_longlong_t)rdev);
1613	sa_handle_destroy(hdl);
1614}
1615
/*
 * Object viewer registered for the ZFS ACL object types in the
 * object_viewer table.  Deliberately empty: it ignores all of its
 * arguments and prints nothing.
 */
/*ARGSUSED*/
static void
dump_acl(objset_t *os, uint64_t object, void *data, size_t size)
{
}
1621
/*
 * Object viewer registered for DMU objset objects in the object_viewer
 * table.  Deliberately empty: it ignores all of its arguments and prints
 * nothing.
 */
/*ARGSUSED*/
static void
dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size)
{
}
1627
/*
 * Per-object-type dump routines.  dump_object() indexes this table with
 * ZDB_OT_TYPE() of an object's type and of its bonus type.  The comment on
 * each entry names the object type that slot is meant for, so the order of
 * the entries matters.  The table has one slot more than DMU_OT_NUMTYPES;
 * the extra, final slot holds dump_unknown.
 */
static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = {
	dump_none,		/* unallocated			*/
	dump_zap,		/* object directory		*/
	dump_uint64,		/* object array			*/
	dump_none,		/* packed nvlist		*/
	dump_packed_nvlist,	/* packed nvlist size		*/
	dump_none,		/* bplist			*/
	dump_none,		/* bplist header		*/
	dump_none,		/* SPA space map header		*/
	dump_none,		/* SPA space map		*/
	dump_none,		/* ZIL intent log		*/
	dump_dnode,		/* DMU dnode			*/
	dump_dmu_objset,	/* DMU objset			*/
	dump_dsl_dir,		/* DSL directory		*/
	dump_zap,		/* DSL directory child map	*/
	dump_zap,		/* DSL dataset snap map		*/
	dump_zap,		/* DSL props			*/
	dump_dsl_dataset,	/* DSL dataset			*/
	dump_znode,		/* ZFS znode			*/
	dump_acl,		/* ZFS V0 ACL			*/
	dump_uint8,		/* ZFS plain file		*/
	dump_zpldir,		/* ZFS directory		*/
	dump_zap,		/* ZFS master node		*/
	dump_zap,		/* ZFS delete queue		*/
	dump_uint8,		/* zvol object			*/
	dump_zap,		/* zvol prop			*/
	dump_uint8,		/* other uint8[]		*/
	dump_uint64,		/* other uint64[]		*/
	dump_zap,		/* other ZAP			*/
	dump_zap,		/* persistent error log		*/
	dump_uint8,		/* SPA history			*/
	dump_history_offsets,	/* SPA history offsets		*/
	dump_zap,		/* Pool properties		*/
	dump_zap,		/* DSL permissions		*/
	dump_acl,		/* ZFS ACL			*/
	dump_uint8,		/* ZFS SYSACL			*/
	dump_none,		/* FUID nvlist			*/
	dump_packed_nvlist,	/* FUID nvlist size		*/
	dump_zap,		/* DSL dataset next clones	*/
	dump_zap,		/* DSL scrub queue		*/
	dump_zap,		/* ZFS user/group used		*/
	dump_zap,		/* ZFS user/group quota		*/
	dump_zap,		/* snapshot refcount tags	*/
	dump_ddt_zap,		/* DDT ZAP object		*/
	dump_zap,		/* DDT statistics		*/
	dump_znode,		/* SA object			*/
	dump_zap,		/* SA Master Node		*/
	dump_sa_attrs,		/* SA attribute registration	*/
	dump_sa_layouts,	/* SA attribute layouts		*/
	dump_zap,		/* DSL scrub translations	*/
	dump_none,		/* fake dedup BP		*/
	dump_zap,		/* deadlist			*/
	dump_none,		/* deadlist hdr			*/
	dump_zap,		/* dsl clones			*/
	dump_none,		/* bpobj subobjs		*/
	dump_unknown,		/* Unknown type, must be last	*/
};
1685
1686static void
1687dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
1688{
1689	dmu_buf_t *db = NULL;
1690	dmu_object_info_t doi;
1691	dnode_t *dn;
1692	void *bonus = NULL;
1693	size_t bsize = 0;
1694	char iblk[32], dblk[32], lsize[32], asize[32], fill[32];
1695	char bonus_size[32];
1696	char aux[50];
1697	int error;
1698
1699	if (*print_header) {
1700		(void) printf("\n%10s  %3s  %5s  %5s  %5s  %5s  %6s  %s\n",
1701		    "Object", "lvl", "iblk", "dblk", "dsize", "lsize",
1702		    "%full", "type");
1703		*print_header = 0;
1704	}
1705
1706	if (object == 0) {
1707		dn = DMU_META_DNODE(os);
1708	} else {
1709		error = dmu_bonus_hold(os, object, FTAG, &db);
1710		if (error)
1711			fatal("dmu_bonus_hold(%llu) failed, errno %u",
1712			    object, error);
1713		bonus = db->db_data;
1714		bsize = db->db_size;
1715		dn = DB_DNODE((dmu_buf_impl_t *)db);
1716	}
1717	dmu_object_info_from_dnode(dn, &doi);
1718
1719	zdb_nicenum(doi.doi_metadata_block_size, iblk);
1720	zdb_nicenum(doi.doi_data_block_size, dblk);
1721	zdb_nicenum(doi.doi_max_offset, lsize);
1722	zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize);
1723	zdb_nicenum(doi.doi_bonus_size, bonus_size);
1724	(void) sprintf(fill, "%6.2f", 100.0 * doi.doi_fill_count *
1725	    doi.doi_data_block_size / (object == 0 ? DNODES_PER_BLOCK : 1) /
1726	    doi.doi_max_offset);
1727
1728	aux[0] = '\0';
1729
1730	if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) {
1731		(void) snprintf(aux + strlen(aux), sizeof (aux), " (K=%s)",
1732		    ZDB_CHECKSUM_NAME(doi.doi_checksum));
1733	}
1734
1735	if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) {
1736		(void) snprintf(aux + strlen(aux), sizeof (aux), " (Z=%s)",
1737		    ZDB_COMPRESS_NAME(doi.doi_compress));
1738	}
1739
1740	(void) printf("%10lld  %3u  %5s  %5s  %5s  %5s  %6s  %s%s\n",
1741	    (u_longlong_t)object, doi.doi_indirection, iblk, dblk,
1742	    asize, lsize, fill, ZDB_OT_NAME(doi.doi_type), aux);
1743
1744	if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) {
1745		(void) printf("%10s  %3s  %5s  %5s  %5s  %5s  %6s  %s\n",
1746		    "", "", "", "", "", bonus_size, "bonus",
1747		    ZDB_OT_NAME(doi.doi_bonus_type));
1748	}
1749
1750	if (verbosity >= 4) {
1751		(void) printf("\tdnode flags: %s%s%s\n",
1752		    (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ?
1753		    "USED_BYTES " : "",
1754		    (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ?
1755		    "USERUSED_ACCOUNTED " : "",
1756		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ?
1757		    "SPILL_BLKPTR" : "");
1758		(void) printf("\tdnode maxblkid: %llu\n",
1759		    (longlong_t)dn->dn_phys->dn_maxblkid);
1760
1761		object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os, object,
1762		    bonus, bsize);
1763		object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object, NULL, 0);
1764		*print_header = 1;
1765	}
1766
1767	if (verbosity >= 5)
1768		dump_indirect(dn);
1769
1770	if (verbosity >= 5) {
1771		/*
1772		 * Report the list of segments that comprise the object.
1773		 */
1774		uint64_t start = 0;
1775		uint64_t end;
1776		uint64_t blkfill = 1;
1777		int minlvl = 1;
1778
1779		if (dn->dn_type == DMU_OT_DNODE) {
1780			minlvl = 0;
1781			blkfill = DNODES_PER_BLOCK;
1782		}
1783
1784		for (;;) {
1785			char segsize[32];
1786			error = dnode_next_offset(dn,
1787			    0, &start, minlvl, blkfill, 0);
1788			if (error)
1789				break;
1790			end = start;
1791			error = dnode_next_offset(dn,
1792			    DNODE_FIND_HOLE, &end, minlvl, blkfill, 0);
1793			zdb_nicenum(end - start, segsize);
1794			(void) printf("\t\tsegment [%016llx, %016llx)"
1795			    " size %5s\n", (u_longlong_t)start,
1796			    (u_longlong_t)end, segsize);
1797			if (error)
1798				break;
1799			start = end;
1800		}
1801	}
1802
1803	if (db != NULL)
1804		dmu_buf_rele(db, FTAG);
1805}
1806
/*
 * Display names for objset types.  dump_dir() indexes this array with
 * dds_type after checking that it is below DMU_OST_NUMTYPES.
 */
static char *objset_types[DMU_OST_NUMTYPES] = {
	"NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" };
1809
1810static void
1811dump_dir(objset_t *os)
1812{
1813	dmu_objset_stats_t dds;
1814	uint64_t object, object_count;
1815	uint64_t refdbytes, usedobjs, scratch;
1816	char numbuf[32];
1817	char blkbuf[BP_SPRINTF_LEN + 20];
1818	char osname[MAXNAMELEN];
1819	char *type = "UNKNOWN";
1820	int verbosity = dump_opt['d'];
1821	int print_header = 1;
1822	int i, error;
1823
1824	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
1825	dmu_objset_fast_stat(os, &dds);
1826	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
1827
1828	if (dds.dds_type < DMU_OST_NUMTYPES)
1829		type = objset_types[dds.dds_type];
1830
1831	if (dds.dds_type == DMU_OST_META) {
1832		dds.dds_creation_txg = TXG_INITIAL;
1833		usedobjs = BP_GET_FILL(os->os_rootbp);
1834		refdbytes = os->os_spa->spa_dsl_pool->
1835		    dp_mos_dir->dd_phys->dd_used_bytes;
1836	} else {
1837		dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch);
1838	}
1839
1840	ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp));
1841
1842	zdb_nicenum(refdbytes, numbuf);
1843
1844	if (verbosity >= 4) {
1845		(void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp ");
1846		(void) snprintf_blkptr(blkbuf + strlen(blkbuf),
1847		    sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp);
1848	} else {
1849		blkbuf[0] = '\0';
1850	}
1851
1852	dmu_objset_name(os, osname);
1853
1854	(void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, "
1855	    "%s, %llu objects%s\n",
1856	    osname, type, (u_longlong_t)dmu_objset_id(os),
1857	    (u_longlong_t)dds.dds_creation_txg,
1858	    numbuf, (u_longlong_t)usedobjs, blkbuf);
1859
1860	if (zopt_objects != 0) {
1861		for (i = 0; i < zopt_objects; i++)
1862			dump_object(os, zopt_object[i], verbosity,
1863			    &print_header);
1864		(void) printf("\n");
1865		return;
1866	}
1867
1868	if (dump_opt['i'] != 0 || verbosity >= 2)
1869		dump_intent_log(dmu_objset_zil(os));
1870
1871	if (dmu_objset_ds(os) != NULL)
1872		dump_deadlist(&dmu_objset_ds(os)->ds_deadlist);
1873
1874	if (verbosity < 2)
1875		return;
1876
1877	if (BP_IS_HOLE(os->os_rootbp))
1878		return;
1879
1880	dump_object(os, 0, verbosity, &print_header);
1881	object_count = 0;
1882	if (DMU_USERUSED_DNODE(os) != NULL &&
1883	    DMU_USERUSED_DNODE(os)->dn_type != 0) {
1884		dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header);
1885		dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header);
1886	}
1887
1888	object = 0;
1889	while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) {
1890		dump_object(os, object, verbosity, &print_header);
1891		object_count++;
1892	}
1893
1894	ASSERT3U(object_count, ==, usedobjs);
1895
1896	(void) printf("\n");
1897
1898	if (error != ESRCH) {
1899		(void) fprintf(stderr, "dmu_object_next() = %d\n", error);
1900		abort();
1901	}
1902}
1903
1904static void
1905dump_uberblock(uberblock_t *ub, const char *header, const char *footer)
1906{
1907	time_t timestamp = ub->ub_timestamp;
1908
1909	(void) printf(header ? header : "");
1910	(void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic);
1911	(void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version);
1912	(void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg);
1913	(void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum);
1914	(void) printf("\ttimestamp = %llu UTC = %s",
1915	    (u_longlong_t)ub->ub_timestamp, asctime(localtime(&timestamp)));
1916	if (dump_opt['u'] >= 3) {
1917		char blkbuf[BP_SPRINTF_LEN];
1918		snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp);
1919		(void) printf("\trootbp = %s\n", blkbuf);
1920	}
1921	(void) printf(footer ? footer : "");
1922}
1923
1924static void
1925dump_config(spa_t *spa)
1926{
1927	dmu_buf_t *db;
1928	size_t nvsize = 0;
1929	int error = 0;
1930
1931
1932	error = dmu_bonus_hold(spa->spa_meta_objset,
1933	    spa->spa_config_object, FTAG, &db);
1934
1935	if (error == 0) {
1936		nvsize = *(uint64_t *)db->db_data;
1937		dmu_buf_rele(db, FTAG);
1938
1939		(void) printf("\nMOS Configuration:\n");
1940		dump_packed_nvlist(spa->spa_meta_objset,
1941		    spa->spa_config_object, (void *)&nvsize, 1);
1942	} else {
1943		(void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d",
1944		    (u_longlong_t)spa->spa_config_object, error);
1945	}
1946}
1947
1948static void
1949dump_cachefile(const char *cachefile)
1950{
1951	int fd;
1952	struct stat64 statbuf;
1953	char *buf;
1954	nvlist_t *config;
1955
1956	if ((fd = open64(cachefile, O_RDONLY)) < 0) {
1957		(void) printf("cannot open '%s': %s\n", cachefile,
1958		    strerror(errno));
1959		exit(1);
1960	}
1961
1962	if (fstat64(fd, &statbuf) != 0) {
1963		(void) printf("failed to stat '%s': %s\n", cachefile,
1964		    strerror(errno));
1965		exit(1);
1966	}
1967
1968	if ((buf = malloc(statbuf.st_size)) == NULL) {
1969		(void) fprintf(stderr, "failed to allocate %llu bytes\n",
1970		    (u_longlong_t)statbuf.st_size);
1971		exit(1);
1972	}
1973
1974	if (read(fd, buf, statbuf.st_size) != statbuf.st_size) {
1975		(void) fprintf(stderr, "failed to read %llu bytes\n",
1976		    (u_longlong_t)statbuf.st_size);
1977		exit(1);
1978	}
1979
1980	(void) close(fd);
1981
1982	if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) {
1983		(void) fprintf(stderr, "failed to unpack nvlist\n");
1984		exit(1);
1985	}
1986
1987	free(buf);
1988
1989	dump_nvlist(config, 0);
1990
1991	nvlist_free(config);
1992}
1993
1994#define	ZDB_MAX_UB_HEADER_SIZE 32
1995
1996static void
1997dump_label_uberblocks(vdev_label_t *lbl, uint64_t ashift)
1998{
1999	vdev_t vd;
2000	vdev_t *vdp = &vd;
2001	char header[ZDB_MAX_UB_HEADER_SIZE];
2002
2003	vd.vdev_ashift = ashift;
2004	vdp->vdev_top = vdp;
2005
2006	for (int i = 0; i < VDEV_UBERBLOCK_COUNT(vdp); i++) {
2007		uint64_t uoff = VDEV_UBERBLOCK_OFFSET(vdp, i);
2008		uberblock_t *ub = (void *)((char *)lbl + uoff);
2009
2010		if (uberblock_verify(ub))
2011			continue;
2012		(void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE,
2013		    "Uberblock[%d]\n", i);
2014		dump_uberblock(ub, header, "");
2015	}
2016}
2017
2018static void
2019dump_label(const char *dev)
2020{
2021	int fd;
2022	vdev_label_t label;
2023	char *path, *buf = label.vl_vdev_phys.vp_nvlist;
2024	size_t buflen = sizeof (label.vl_vdev_phys.vp_nvlist);
2025	struct stat64 statbuf;
2026	uint64_t psize, ashift;
2027	int len = strlen(dev) + 1;
2028
2029	if (strncmp(dev, "/dev/dsk/", 9) == 0) {
2030		len++;
2031		path = malloc(len);
2032		(void) snprintf(path, len, "%s%s", "/dev/rdsk/", dev + 9);
2033	} else {
2034		path = strdup(dev);
2035	}
2036
2037	if ((fd = open64(path, O_RDONLY)) < 0) {
2038		(void) printf("cannot open '%s': %s\n", path, strerror(errno));
2039		free(path);
2040		exit(1);
2041	}
2042
2043	if (fstat64(fd, &statbuf) != 0) {
2044		(void) printf("failed to stat '%s': %s\n", path,
2045		    strerror(errno));
2046		free(path);
2047		(void) close(fd);
2048		exit(1);
2049	}
2050
2051	if (S_ISBLK(statbuf.st_mode)) {
2052		(void) printf("cannot use '%s': character device required\n",
2053		    path);
2054		free(path);
2055		(void) close(fd);
2056		exit(1);
2057	}
2058
2059	psize = statbuf.st_size;
2060	psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));
2061
2062	for (int l = 0; l < VDEV_LABELS; l++) {
2063		nvlist_t *config = NULL;
2064
2065		(void) printf("--------------------------------------------\n");
2066		(void) printf("LABEL %d\n", l);
2067		(void) printf("--------------------------------------------\n");
2068
2069		if (pread64(fd, &label, sizeof (label),
2070		    vdev_label_offset(psize, l, 0)) != sizeof (label)) {
2071			(void) printf("failed to read label %d\n", l);
2072			continue;
2073		}
2074
2075		if (nvlist_unpack(buf, buflen, &config, 0) != 0) {
2076			(void) printf("failed to unpack label %d\n", l);
2077			ashift = SPA_MINBLOCKSHIFT;
2078		} else {
2079			nvlist_t *vdev_tree = NULL;
2080
2081			dump_nvlist(config, 4);
2082			if ((nvlist_lookup_nvlist(config,
2083			    ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) ||
2084			    (nvlist_lookup_uint64(vdev_tree,
2085			    ZPOOL_CONFIG_ASHIFT, &ashift) != 0))
2086				ashift = SPA_MINBLOCKSHIFT;
2087			nvlist_free(config);
2088		}
2089		if (dump_opt['u'])
2090			dump_label_uberblocks(&label, ashift);
2091	}
2092
2093	free(path);
2094	(void) close(fd);
2095}
2096
2097/*ARGSUSED*/
2098static int
2099dump_one_dir(const char *dsname, void *arg)
2100{
2101	int error;
2102	objset_t *os;
2103
2104	error = dmu_objset_own(dsname, DMU_OST_ANY, B_TRUE, FTAG, &os);
2105	if (error) {
2106		(void) printf("Could not open %s, error %d\n", dsname, error);
2107		return (0);
2108	}
2109	dump_dir(os);
2110	dmu_objset_disown(os, FTAG);
2111	fuid_table_destroy();
2112	sa_loaded = B_FALSE;
2113	return (0);
2114}
2115
2116/*
2117 * Block statistics.
2118 */
2119#define	PSIZE_HISTO_SIZE (SPA_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1)
2120typedef struct zdb_blkstats {
2121	uint64_t zb_asize;
2122	uint64_t zb_lsize;
2123	uint64_t zb_psize;
2124	uint64_t zb_count;
2125	uint64_t zb_gangs;
2126	uint64_t zb_ditto_samevdev;
2127	uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE];
2128} zdb_blkstats_t;
2129
2130/*
2131 * Extended object types to report deferred frees and dedup auto-ditto blocks.
2132 */
2133#define	ZDB_OT_DEFERRED	(DMU_OT_NUMTYPES + 0)
2134#define	ZDB_OT_DITTO	(DMU_OT_NUMTYPES + 1)
2135#define	ZDB_OT_OTHER	(DMU_OT_NUMTYPES + 2)
2136#define	ZDB_OT_TOTAL	(DMU_OT_NUMTYPES + 3)
2137
2138static char *zdb_ot_extname[] = {
2139	"deferred free",
2140	"dedup ditto",
2141	"other",
2142	"Total",
2143};
2144
2145#define	ZB_TOTAL	DN_MAX_LEVELS
2146
2147typedef struct zdb_cb {
2148	zdb_blkstats_t	zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
2149	uint64_t	zcb_dedup_asize;
2150	uint64_t	zcb_dedup_blocks;
2151	uint64_t	zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES];
2152	uint64_t	zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES]
2153	    [BPE_PAYLOAD_SIZE];
2154	uint64_t	zcb_start;
2155	uint64_t	zcb_lastprint;
2156	uint64_t	zcb_totalasize;
2157	uint64_t	zcb_errors[256];
2158	int		zcb_readfails;
2159	int		zcb_haderrors;
2160	spa_t		*zcb_spa;
2161} zdb_cb_t;
2162
2163static void
2164zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
2165    dmu_object_type_t type)
2166{
2167	uint64_t refcnt = 0;
2168
2169	ASSERT(type < ZDB_OT_TOTAL);
2170
2171	if (zilog && zil_bp_tree_add(zilog, bp) != 0)
2172		return;
2173
2174	for (int i = 0; i < 4; i++) {
2175		int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL;
2176		int t = (i & 1) ? type : ZDB_OT_TOTAL;
2177		int equal;
2178		zdb_blkstats_t *zb = &zcb->zcb_type[l][t];
2179
2180		zb->zb_asize += BP_GET_ASIZE(bp);
2181		zb->zb_lsize += BP_GET_LSIZE(bp);
2182		zb->zb_psize += BP_GET_PSIZE(bp);
2183		zb->zb_count++;
2184		zb->zb_psize_histogram[BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT]++;
2185
2186		zb->zb_gangs += BP_COUNT_GANG(bp);
2187
2188		switch (BP_GET_NDVAS(bp)) {
2189		case 2:
2190			if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
2191			    DVA_GET_VDEV(&bp->blk_dva[1]))
2192				zb->zb_ditto_samevdev++;
2193			break;
2194		case 3:
2195			equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
2196			    DVA_GET_VDEV(&bp->blk_dva[1])) +
2197			    (DVA_GET_VDEV(&bp->blk_dva[0]) ==
2198			    DVA_GET_VDEV(&bp->blk_dva[2])) +
2199			    (DVA_GET_VDEV(&bp->blk_dva[1]) ==
2200			    DVA_GET_VDEV(&bp->blk_dva[2]));
2201			if (equal != 0)
2202				zb->zb_ditto_samevdev++;
2203			break;
2204		}
2205
2206	}
2207
2208	if (BP_IS_EMBEDDED(bp)) {
2209		zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++;
2210		zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)]
2211		    [BPE_GET_PSIZE(bp)]++;
2212		return;
2213	}
2214
2215	if (dump_opt['L'])
2216		return;
2217
2218	if (BP_GET_DEDUP(bp)) {
2219		ddt_t *ddt;
2220		ddt_entry_t *dde;
2221
2222		ddt = ddt_select(zcb->zcb_spa, bp);
2223		ddt_enter(ddt);
2224		dde = ddt_lookup(ddt, bp, B_FALSE);
2225
2226		if (dde == NULL) {
2227			refcnt = 0;
2228		} else {
2229			ddt_phys_t *ddp = ddt_phys_select(dde, bp);
2230			ddt_phys_decref(ddp);
2231			refcnt = ddp->ddp_refcnt;
2232			if (ddt_phys_total_refcnt(dde) == 0)
2233				ddt_remove(ddt, dde);
2234		}
2235		ddt_exit(ddt);
2236	}
2237
2238	VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa,
2239	    refcnt ? 0 : spa_first_txg(zcb->zcb_spa),
2240	    bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
2241}
2242
2243/* ARGSUSED */
2244static void
2245zdb_blkptr_done(zio_t *zio)
2246{
2247	spa_t *spa = zio->io_spa;
2248	blkptr_t *bp = zio->io_bp;
2249	int ioerr = zio->io_error;
2250	zdb_cb_t *zcb = zio->io_private;
2251	zbookmark_t *zb = &zio->io_bookmark;
2252
2253	zio_data_buf_free(zio->io_data, zio->io_size);
2254
2255	mutex_enter(&spa->spa_scrub_lock);
2256	spa->spa_scrub_inflight--;
2257	cv_broadcast(&spa->spa_scrub_io_cv);
2258
2259	if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2260		char blkbuf[BP_SPRINTF_LEN];
2261
2262		zcb->zcb_haderrors = 1;
2263		zcb->zcb_errors[ioerr]++;
2264
2265		if (dump_opt['b'] >= 2)
2266			snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
2267		else
2268			blkbuf[0] = '\0';
2269
2270		(void) printf("zdb_blkptr_cb: "
2271		    "Got error %d reading "
2272		    "<%llu, %llu, %lld, %llx> %s -- skipping\n",
2273		    ioerr,
2274		    (u_longlong_t)zb->zb_objset,
2275		    (u_longlong_t)zb->zb_object,
2276		    (u_longlong_t)zb->zb_level,
2277		    (u_longlong_t)zb->zb_blkid,
2278		    blkbuf);
2279	}
2280	mutex_exit(&spa->spa_scrub_lock);
2281}
2282
2283/* ARGSUSED */
2284static int
2285zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
2286    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
2287{
2288	zdb_cb_t *zcb = arg;
2289	dmu_object_type_t type;
2290	boolean_t is_metadata;
2291
2292	if (dump_opt['b'] >= 5 && bp->blk_birth > 0) {
2293		char blkbuf[BP_SPRINTF_LEN];
2294		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
2295		(void) printf("objset %llu object %llu "
2296		    "level %lld offset 0x%llx %s\n",
2297		    (u_longlong_t)zb->zb_objset,
2298		    (u_longlong_t)zb->zb_object,
2299		    (longlong_t)zb->zb_level,
2300		    (u_longlong_t)blkid2offset(dnp, bp, zb),
2301		    blkbuf);
2302	}
2303
2304	if (BP_IS_HOLE(bp))
2305		return (0);
2306
2307	type = BP_GET_TYPE(bp);
2308
2309	zdb_count_block(zcb, zilog, bp,
2310	    (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type);
2311
2312	is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type));
2313
2314	if (!BP_IS_EMBEDDED(bp) &&
2315	    (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) {
2316		size_t size = BP_GET_PSIZE(bp);
2317		void *data = zio_data_buf_alloc(size);
2318		int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;
2319
2320		/* If it's an intent log block, failure is expected. */
2321		if (zb->zb_level == ZB_ZIL_LEVEL)
2322			flags |= ZIO_FLAG_SPECULATIVE;
2323
2324		mutex_enter(&spa->spa_scrub_lock);
2325		while (spa->spa_scrub_inflight > max_inflight)
2326			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
2327		spa->spa_scrub_inflight++;
2328		mutex_exit(&spa->spa_scrub_lock);
2329
2330		zio_nowait(zio_read(NULL, spa, bp, data, size,
2331		    zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb));
2332	}
2333
2334	zcb->zcb_readfails = 0;
2335
2336	if (dump_opt['b'] < 5 && isatty(STDERR_FILENO) &&
2337	    gethrtime() > zcb->zcb_lastprint + NANOSEC) {
2338		uint64_t now = gethrtime();
2339		char buf[10];
2340		uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize;
2341		int kb_per_sec =
2342		    1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000));
2343		int sec_remaining =
2344		    (zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec;
2345
2346		zfs_nicenum(bytes, buf, sizeof (buf));
2347		(void) fprintf(stderr,
2348		    "\r%5s completed (%4dMB/s) "
2349		    "estimated time remaining: %uhr %02umin %02usec        ",
2350		    buf, kb_per_sec / 1024,
2351		    sec_remaining / 60 / 60,
2352		    sec_remaining / 60 % 60,
2353		    sec_remaining % 60);
2354
2355		zcb->zcb_lastprint = now;
2356	}
2357
2358	return (0);
2359}
2360
2361static void
2362zdb_leak(void *arg, uint64_t start, uint64_t size)
2363{
2364	vdev_t *vd = arg;
2365
2366	(void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n",
2367	    (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size);
2368}
2369
2370static metaslab_ops_t zdb_metaslab_ops = {
2371	NULL,	/* alloc */
2372	NULL	/* fragmented */
2373};
2374
2375static void
2376zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
2377{
2378	ddt_bookmark_t ddb = { 0 };
2379	ddt_entry_t dde;
2380	int error;
2381
2382	while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
2383		blkptr_t blk;
2384		ddt_phys_t *ddp = dde.dde_phys;
2385
2386		if (ddb.ddb_class == DDT_CLASS_UNIQUE)
2387			return;
2388
2389		ASSERT(ddt_phys_total_refcnt(&dde) > 1);
2390
2391		for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
2392			if (ddp->ddp_phys_birth == 0)
2393				continue;
2394			ddt_bp_create(ddb.ddb_checksum,
2395			    &dde.dde_key, ddp, &blk);
2396			if (p == DDT_PHYS_DITTO) {
2397				zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
2398			} else {
2399				zcb->zcb_dedup_asize +=
2400				    BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
2401				zcb->zcb_dedup_blocks++;
2402			}
2403		}
2404		if (!dump_opt['L']) {
2405			ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
2406			ddt_enter(ddt);
2407			VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
2408			ddt_exit(ddt);
2409		}
2410	}
2411
2412	ASSERT(error == ENOENT);
2413}
2414
2415static void
2416zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
2417{
2418	zcb->zcb_spa = spa;
2419
2420	if (!dump_opt['L']) {
2421		vdev_t *rvd = spa->spa_root_vdev;
2422		for (int c = 0; c < rvd->vdev_children; c++) {
2423			vdev_t *vd = rvd->vdev_child[c];
2424			for (int m = 0; m < vd->vdev_ms_count; m++) {
2425				metaslab_t *msp = vd->vdev_ms[m];
2426				mutex_enter(&msp->ms_lock);
2427				metaslab_unload(msp);
2428
2429				/*
2430				 * For leak detection, we overload the metaslab
2431				 * ms_tree to contain allocated segments
2432				 * instead of free segments. As a result,
2433				 * we can't use the normal metaslab_load/unload
2434				 * interfaces.
2435				 */
2436				if (msp->ms_sm != NULL) {
2437					msp->ms_ops = &zdb_metaslab_ops;
2438					VERIFY0(space_map_load(msp->ms_sm,
2439					    msp->ms_tree, SM_ALLOC));
2440					msp->ms_loaded = B_TRUE;
2441				}
2442				mutex_exit(&msp->ms_lock);
2443			}
2444		}
2445	}
2446
2447	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
2448
2449	zdb_ddt_leak_init(spa, zcb);
2450
2451	spa_config_exit(spa, SCL_CONFIG, FTAG);
2452}
2453
2454static void
2455zdb_leak_fini(spa_t *spa)
2456{
2457	if (!dump_opt['L']) {
2458		vdev_t *rvd = spa->spa_root_vdev;
2459		for (int c = 0; c < rvd->vdev_children; c++) {
2460			vdev_t *vd = rvd->vdev_child[c];
2461			for (int m = 0; m < vd->vdev_ms_count; m++) {
2462				metaslab_t *msp = vd->vdev_ms[m];
2463				mutex_enter(&msp->ms_lock);
2464
2465				/*
2466				 * The ms_tree has been overloaded to
2467				 * contain allocated segments. Now that we
2468				 * finished traversing all blocks, any
2469				 * block that remains in the ms_tree
2470				 * represents an allocated block that we
2471				 * did not claim during the traversal.
2472				 * Claimed blocks would have been removed
2473				 * from the ms_tree.
2474				 */
2475				range_tree_vacate(msp->ms_tree, zdb_leak, vd);
2476				msp->ms_loaded = B_FALSE;
2477
2478				mutex_exit(&msp->ms_lock);
2479			}
2480		}
2481	}
2482}
2483
2484/* ARGSUSED */
2485static int
2486count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
2487{
2488	zdb_cb_t *zcb = arg;
2489
2490	if (dump_opt['b'] >= 5) {
2491		char blkbuf[BP_SPRINTF_LEN];
2492		snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
2493		(void) printf("[%s] %s\n",
2494		    "deferred free", blkbuf);
2495	}
2496	zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED);
2497	return (0);
2498}
2499
2500static int
2501dump_block_stats(spa_t *spa)
2502{
2503	zdb_cb_t zcb = { 0 };
2504	zdb_blkstats_t *zb, *tzb;
2505	uint64_t norm_alloc, norm_space, total_alloc, total_found;
2506	int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_HARD;
2507	boolean_t leaks = B_FALSE;
2508
2509	(void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n",
2510	    (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
2511	    (dump_opt['c'] == 1) ? "metadata " : "",
2512	    dump_opt['c'] ? "checksums " : "",
2513	    (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "",
2514	    !dump_opt['L'] ? "nothing leaked " : "");
2515
2516	/*
2517	 * Load all space maps as SM_ALLOC maps, then traverse the pool
2518	 * claiming each block we discover.  If the pool is perfectly
2519	 * consistent, the space maps will be empty when we're done.
2520	 * Anything left over is a leak; any block we can't claim (because
2521	 * it's not part of any space map) is a double allocation,
2522	 * reference to a freed block, or an unclaimed log block.
2523	 */
2524	zdb_leak_init(spa, &zcb);
2525
2526	/*
2527	 * If there's a deferred-free bplist, process that first.
2528	 */
2529	(void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj,
2530	    count_block_cb, &zcb, NULL);
2531	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
2532		(void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
2533		    count_block_cb, &zcb, NULL);
2534	}
2535	if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
2536		VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset,
2537		    spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb,
2538		    &zcb, NULL));
2539	}
2540
2541	if (dump_opt['c'] > 1)
2542		flags |= TRAVERSE_PREFETCH_DATA;
2543
2544	zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa));
2545	zcb.zcb_start = zcb.zcb_lastprint = gethrtime();
2546	zcb.zcb_haderrors |= traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb);
2547
2548	/*
2549	 * If we've traversed the data blocks then we need to wait for those
2550	 * I/Os to complete. We leverage "The Godfather" zio to wait on
2551	 * all async I/Os to complete.
2552	 */
2553	if (dump_opt['c']) {
2554		(void) zio_wait(spa->spa_async_zio_root);
2555		spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
2556		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
2557		    ZIO_FLAG_GODFATHER);
2558	}
2559
2560	if (zcb.zcb_haderrors) {
2561		(void) printf("\nError counts:\n\n");
2562		(void) printf("\t%5s  %s\n", "errno", "count");
2563		for (int e = 0; e < 256; e++) {
2564			if (zcb.zcb_errors[e] != 0) {
2565				(void) printf("\t%5d  %llu\n",
2566				    e, (u_longlong_t)zcb.zcb_errors[e]);
2567			}
2568		}
2569	}
2570
2571	/*
2572	 * Report any leaked segments.
2573	 */
2574	zdb_leak_fini(spa);
2575
2576	tzb = &zcb.zcb_type[ZB_TOTAL][ZDB_OT_TOTAL];
2577
2578	norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa));
2579	norm_space = metaslab_class_get_space(spa_normal_class(spa));
2580
2581	total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa));
2582	total_found = tzb->zb_asize - zcb.zcb_dedup_asize;
2583
2584	if (total_found == total_alloc) {
2585		if (!dump_opt['L'])
2586			(void) printf("\n\tNo leaks (block sum matches space"
2587			    " maps exactly)\n");
2588	} else {
2589		(void) printf("block traversal size %llu != alloc %llu "
2590		    "(%s %lld)\n",
2591		    (u_longlong_t)total_found,
2592		    (u_longlong_t)total_alloc,
2593		    (dump_opt['L']) ? "unreachable" : "leaked",
2594		    (longlong_t)(total_alloc - total_found));
2595		leaks = B_TRUE;
2596	}
2597
2598	if (tzb->zb_count == 0)
2599		return (2);
2600
2601	(void) printf("\n");
2602	(void) printf("\tbp count:      %10llu\n",
2603	    (u_longlong_t)tzb->zb_count);
2604	(void) printf("\tganged count:  %10llu\n",
2605	    (longlong_t)tzb->zb_gangs);
2606	(void) printf("\tbp logical:    %10llu      avg: %6llu\n",
2607	    (u_longlong_t)tzb->zb_lsize,
2608	    (u_longlong_t)(tzb->zb_lsize / tzb->zb_count));
2609	(void) printf("\tbp physical:   %10llu      avg:"
2610	    " %6llu     compression: %6.2f\n",
2611	    (u_longlong_t)tzb->zb_psize,
2612	    (u_longlong_t)(tzb->zb_psize / tzb->zb_count),
2613	    (double)tzb->zb_lsize / tzb->zb_psize);
2614	(void) printf("\tbp allocated:  %10llu      avg:"
2615	    " %6llu     compression: %6.2f\n",
2616	    (u_longlong_t)tzb->zb_asize,
2617	    (u_longlong_t)(tzb->zb_asize / tzb->zb_count),
2618	    (double)tzb->zb_lsize / tzb->zb_asize);
2619	(void) printf("\tbp deduped:    %10llu    ref>1:"
2620	    " %6llu   deduplication: %6.2f\n",
2621	    (u_longlong_t)zcb.zcb_dedup_asize,
2622	    (u_longlong_t)zcb.zcb_dedup_blocks,
2623	    (double)zcb.zcb_dedup_asize / tzb->zb_asize + 1.0);
2624	(void) printf("\tSPA allocated: %10llu     used: %5.2f%%\n",
2625	    (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);
2626
2627	for (bp_embedded_type_t i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) {
2628		if (zcb.zcb_embedded_blocks[i] == 0)
2629			continue;
2630		(void) printf("\n");
2631		(void) printf("\tadditional, non-pointer bps of type %u: "
2632		    "%10llu\n",
2633		    i, (u_longlong_t)zcb.zcb_embedded_blocks[i]);
2634
2635		if (dump_opt['b'] >= 3) {
2636			(void) printf("\t number of (compressed) bytes:  "
2637			    "number of bps\n");
2638			dump_histogram(zcb.zcb_embedded_histogram[i],
2639			    sizeof (zcb.zcb_embedded_histogram[i]) /
2640			    sizeof (zcb.zcb_embedded_histogram[i][0]), 0);
2641		}
2642	}
2643
2644	if (tzb->zb_ditto_samevdev != 0) {
2645		(void) printf("\tDittoed blocks on same vdev: %llu\n",
2646		    (longlong_t)tzb->zb_ditto_samevdev);
2647	}
2648
2649	if (dump_opt['b'] >= 2) {
2650		int l, t, level;
2651		(void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE"
2652		    "\t  avg\t comp\t%%Total\tType\n");
2653
2654		for (t = 0; t <= ZDB_OT_TOTAL; t++) {
2655			char csize[32], lsize[32], psize[32], asize[32];
2656			char avg[32], gang[32];
2657			char *typename;
2658
2659			if (t < DMU_OT_NUMTYPES)
2660				typename = dmu_ot[t].ot_name;
2661			else
2662				typename = zdb_ot_extname[t - DMU_OT_NUMTYPES];
2663
2664			if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) {
2665				(void) printf("%6s\t%5s\t%5s\t%5s"
2666				    "\t%5s\t%5s\t%6s\t%s\n",
2667				    "-",
2668				    "-",
2669				    "-",
2670				    "-",
2671				    "-",
2672				    "-",
2673				    "-",
2674				    typename);
2675				continue;
2676			}
2677
2678			for (l = ZB_TOTAL - 1; l >= -1; l--) {
2679				level = (l == -1 ? ZB_TOTAL : l);
2680				zb = &zcb.zcb_type[level][t];
2681
2682				if (zb->zb_asize == 0)
2683					continue;
2684
2685				if (dump_opt['b'] < 3 && level != ZB_TOTAL)
2686					continue;
2687
2688				if (level == 0 && zb->zb_asize ==
2689				    zcb.zcb_type[ZB_TOTAL][t].zb_asize)
2690					continue;
2691
2692				zdb_nicenum(zb->zb_count, csize);
2693				zdb_nicenum(zb->zb_lsize, lsize);
2694				zdb_nicenum(zb->zb_psize, psize);
2695				zdb_nicenum(zb->zb_asize, asize);
2696				zdb_nicenum(zb->zb_asize / zb->zb_count, avg);
2697				zdb_nicenum(zb->zb_gangs, gang);
2698
2699				(void) printf("%6s\t%5s\t%5s\t%5s\t%5s"
2700				    "\t%5.2f\t%6.2f\t",
2701				    csize, lsize, psize, asize, avg,
2702				    (double)zb->zb_lsize / zb->zb_psize,
2703				    100.0 * zb->zb_asize / tzb->zb_asize);
2704
2705				if (level == ZB_TOTAL)
2706					(void) printf("%s\n", typename);
2707				else
2708					(void) printf("    L%d %s\n",
2709					    level, typename);
2710
2711				if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) {
2712					(void) printf("\t number of ganged "
2713					    "blocks: %s\n", gang);
2714				}
2715
2716				if (dump_opt['b'] >= 4) {
2717					(void) printf("psize "
2718					    "(in 512-byte sectors): "
2719					    "number of blocks\n");
2720					dump_histogram(zb->zb_psize_histogram,
2721					    PSIZE_HISTO_SIZE, 0);
2722				}
2723			}
2724		}
2725	}
2726
2727	(void) printf("\n");
2728
2729	if (leaks)
2730		return (2);
2731
2732	if (zcb.zcb_haderrors)
2733		return (3);
2734
2735	return (0);
2736}
2737
2738typedef struct zdb_ddt_entry {
2739	ddt_key_t	zdde_key;
2740	uint64_t	zdde_ref_blocks;
2741	uint64_t	zdde_ref_lsize;
2742	uint64_t	zdde_ref_psize;
2743	uint64_t	zdde_ref_dsize;
2744	avl_node_t	zdde_node;
2745} zdb_ddt_entry_t;
2746
2747/* ARGSUSED */
2748static int
2749zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
2750    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
2751{
2752	avl_tree_t *t = arg;
2753	avl_index_t where;
2754	zdb_ddt_entry_t *zdde, zdde_search;
2755
2756	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
2757		return (0);
2758
2759	if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) {
2760		(void) printf("traversing objset %llu, %llu objects, "
2761		    "%lu blocks so far\n",
2762		    (u_longlong_t)zb->zb_objset,
2763		    (u_longlong_t)BP_GET_FILL(bp),
2764		    avl_numnodes(t));
2765	}
2766
2767	if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF ||
2768	    BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))
2769		return (0);
2770
2771	ddt_key_fill(&zdde_search.zdde_key, bp);
2772
2773	zdde = avl_find(t, &zdde_search, &where);
2774
2775	if (zdde == NULL) {
2776		zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL);
2777		zdde->zdde_key = zdde_search.zdde_key;
2778		avl_insert(t, zdde, where);
2779	}
2780
2781	zdde->zdde_ref_blocks += 1;
2782	zdde->zdde_ref_lsize += BP_GET_LSIZE(bp);
2783	zdde->zdde_ref_psize += BP_GET_PSIZE(bp);
2784	zdde->zdde_ref_dsize += bp_get_dsize_sync(spa, bp);
2785
2786	return (0);
2787}
2788
2789static void
2790dump_simulated_ddt(spa_t *spa)
2791{
2792	avl_tree_t t;
2793	void *cookie = NULL;
2794	zdb_ddt_entry_t *zdde;
2795	ddt_histogram_t ddh_total = { 0 };
2796	ddt_stat_t dds_total = { 0 };
2797
2798	avl_create(&t, ddt_entry_compare,
2799	    sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node));
2800
2801	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
2802
2803	(void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
2804	    zdb_ddt_add_cb, &t);
2805
2806	spa_config_exit(spa, SCL_CONFIG, FTAG);
2807
2808	while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) {
2809		ddt_stat_t dds;
2810		uint64_t refcnt = zdde->zdde_ref_blocks;
2811		ASSERT(refcnt != 0);
2812
2813		dds.dds_blocks = zdde->zdde_ref_blocks / refcnt;
2814		dds.dds_lsize = zdde->zdde_ref_lsize / refcnt;
2815		dds.dds_psize = zdde->zdde_ref_psize / refcnt;
2816		dds.dds_dsize = zdde->zdde_ref_dsize / refcnt;
2817
2818		dds.dds_ref_blocks = zdde->zdde_ref_blocks;
2819		dds.dds_ref_lsize = zdde->zdde_ref_lsize;
2820		dds.dds_ref_psize = zdde->zdde_ref_psize;
2821		dds.dds_ref_dsize = zdde->zdde_ref_dsize;
2822
2823		ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1],
2824		    &dds, 0);
2825
2826		umem_free(zdde, sizeof (*zdde));
2827	}
2828
2829	avl_destroy(&t);
2830
2831	ddt_histogram_stat(&dds_total, &ddh_total);
2832
2833	(void) printf("Simulated DDT histogram:\n");
2834
2835	zpool_dump_ddt(&dds_total, &ddh_total);
2836
2837	dump_dedup_ratio(&dds_total);
2838}
2839
2840static void
2841dump_zpool(spa_t *spa)
2842{
2843	dsl_pool_t *dp = spa_get_dsl(spa);
2844	int rc = 0;
2845
2846	if (dump_opt['S']) {
2847		dump_simulated_ddt(spa);
2848		return;
2849	}
2850
2851	if (!dump_opt['e'] && dump_opt['C'] > 1) {
2852		(void) printf("\nCached configuration:\n");
2853		dump_nvlist(spa->spa_config, 8);
2854	}
2855
2856	if (dump_opt['C'])
2857		dump_config(spa);
2858
2859	if (dump_opt['u'])
2860		dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n");
2861
2862	if (dump_opt['D'])
2863		dump_all_ddts(spa);
2864
2865	if (dump_opt['d'] > 2 || dump_opt['m'])
2866		dump_metaslabs(spa);
2867
2868	if (dump_opt['d'] || dump_opt['i']) {
2869		dump_dir(dp->dp_meta_objset);
2870		if (dump_opt['d'] >= 3) {
2871			dump_bpobj(&spa->spa_deferred_bpobj,
2872			    "Deferred frees", 0);
2873			if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
2874				dump_bpobj(&spa->spa_dsl_pool->dp_free_bpobj,
2875				    "Pool snapshot frees", 0);
2876			}
2877
2878			if (spa_feature_is_active(spa,
2879			    SPA_FEATURE_ASYNC_DESTROY)) {
2880				dump_bptree(spa->spa_meta_objset,
2881				    spa->spa_dsl_pool->dp_bptree_obj,
2882				    "Pool dataset frees");
2883			}
2884			dump_dtl(spa->spa_root_vdev, 0);
2885		}
2886		(void) dmu_objset_find(spa_name(spa), dump_one_dir,
2887		    NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN);
2888	}
2889	if (dump_opt['b'] || dump_opt['c'])
2890		rc = dump_block_stats(spa);
2891
2892	if (rc == 0)
2893		rc = verify_spacemap_refcounts(spa);
2894
2895	if (dump_opt['s'])
2896		show_pool_stats(spa);
2897
2898	if (dump_opt['h'])
2899		dump_history(spa);
2900
2901	if (rc != 0)
2902		exit(rc);
2903}
2904
/*
 * Option bits for the block-read command (zdb_read_block() and the dump
 * helpers below).  Each bit corresponds to one flag character of the
 * block descriptor.
 */
#define	ZDB_FLAG_CHECKSUM	0x0001	/* calculate and display checksums */
#define	ZDB_FLAG_DECOMPRESS	0x0002	/* decompress before dumping */
#define	ZDB_FLAG_BSWAP		0x0004	/* byteswap before dumping */
#define	ZDB_FLAG_GBH		0x0008	/* treat as a gang block header */
#define	ZDB_FLAG_INDIRECT	0x0010	/* treat as an indirect block */
#define	ZDB_FLAG_PHYS		0x0020	/* do I/O to physical offset */
#define	ZDB_FLAG_RAW		0x0040	/* dump raw data to stdout */
#define	ZDB_FLAG_PRINT_BLKPTR	0x0080	/* decode a blkptr within the block */

/*
 * Maps a flag character (used as an unsigned index) to its ZDB_FLAG_* bit;
 * 0 means the character is not a valid flag.  Populated elsewhere in this
 * file.
 */
int flagbits[256];
2915
2916static void
2917zdb_print_blkptr(blkptr_t *bp, int flags)
2918{
2919	char blkbuf[BP_SPRINTF_LEN];
2920
2921	if (flags & ZDB_FLAG_BSWAP)
2922		byteswap_uint64_array((void *)bp, sizeof (blkptr_t));
2923
2924	snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
2925	(void) printf("%s\n", blkbuf);
2926}
2927
2928static void
2929zdb_dump_indirect(blkptr_t *bp, int nbps, int flags)
2930{
2931	int i;
2932
2933	for (i = 0; i < nbps; i++)
2934		zdb_print_blkptr(&bp[i], flags);
2935}
2936
2937static void
2938zdb_dump_gbh(void *buf, int flags)
2939{
2940	zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags);
2941}
2942
2943static void
2944zdb_dump_block_raw(void *buf, uint64_t size, int flags)
2945{
2946	if (flags & ZDB_FLAG_BSWAP)
2947		byteswap_uint64_array(buf, size);
2948	(void) write(1, buf, size);
2949}
2950
2951static void
2952zdb_dump_block(char *label, void *buf, uint64_t size, int flags)
2953{
2954	uint64_t *d = (uint64_t *)buf;
2955	int nwords = size / sizeof (uint64_t);
2956	int do_bswap = !!(flags & ZDB_FLAG_BSWAP);
2957	int i, j;
2958	char *hdr, *c;
2959
2960
2961	if (do_bswap)
2962		hdr = " 7 6 5 4 3 2 1 0   f e d c b a 9 8";
2963	else
2964		hdr = " 0 1 2 3 4 5 6 7   8 9 a b c d e f";
2965
2966	(void) printf("\n%s\n%6s   %s  0123456789abcdef\n", label, "", hdr);
2967
2968	for (i = 0; i < nwords; i += 2) {
2969		(void) printf("%06llx:  %016llx  %016llx  ",
2970		    (u_longlong_t)(i * sizeof (uint64_t)),
2971		    (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]),
2972		    (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1]));
2973
2974		c = (char *)&d[i];
2975		for (j = 0; j < 2 * sizeof (uint64_t); j++)
2976			(void) printf("%c", isprint(c[j]) ? c[j] : '.');
2977		(void) printf("\n");
2978	}
2979}
2980
2981/*
2982 * There are two acceptable formats:
2983 *	leaf_name	  - For example: c1t0d0 or /tmp/ztest.0a
2984 *	child[.child]*    - For example: 0.1.1
2985 *
2986 * The second form can be used to specify arbitrary vdevs anywhere
2987 * in the heirarchy.  For example, in a pool with a mirror of
2988 * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 .
2989 */
2990static vdev_t *
2991zdb_vdev_lookup(vdev_t *vdev, char *path)
2992{
2993	char *s, *p, *q;
2994	int i;
2995
2996	if (vdev == NULL)
2997		return (NULL);
2998
2999	/* First, assume the x.x.x.x format */
3000	i = (int)strtoul(path, &s, 10);
3001	if (s == path || (s && *s != '.' && *s != '\0'))
3002		goto name;
3003	if (i < 0 || i >= vdev->vdev_children)
3004		return (NULL);
3005
3006	vdev = vdev->vdev_child[i];
3007	if (*s == '\0')
3008		return (vdev);
3009	return (zdb_vdev_lookup(vdev, s+1));
3010
3011name:
3012	for (i = 0; i < vdev->vdev_children; i++) {
3013		vdev_t *vc = vdev->vdev_child[i];
3014
3015		if (vc->vdev_path == NULL) {
3016			vc = zdb_vdev_lookup(vc, path);
3017			if (vc == NULL)
3018				continue;
3019			else
3020				return (vc);
3021		}
3022
3023		p = strrchr(vc->vdev_path, '/');
3024		p = p ? p + 1 : vc->vdev_path;
3025		q = &vc->vdev_path[strlen(vc->vdev_path) - 2];
3026
3027		if (strcmp(vc->vdev_path, path) == 0)
3028			return (vc);
3029		if (strcmp(p, path) == 0)
3030			return (vc);
3031		if (strcmp(q, "s0") == 0 && strncmp(p, path, q - p) == 0)
3032			return (vc);
3033	}
3034
3035	return (NULL);
3036}
3037
3038/*
3039 * Read a block from a pool and print it out.  The syntax of the
3040 * block descriptor is:
3041 *
3042 *	pool:vdev_specifier:offset:size[:flags]
3043 *
3044 *	pool           - The name of the pool you wish to read from
3045 *	vdev_specifier - Which vdev (see comment for zdb_vdev_lookup)
3046 *	offset         - offset, in hex, in bytes
3047 *	size           - Amount of data to read, in hex, in bytes
3048 *	flags          - A string of characters specifying options
3049 *		 b: Decode a blkptr at given offset within block
3050 *		*c: Calculate and display checksums
3051 *		 d: Decompress data before dumping
3052 *		 e: Byteswap data before dumping
3053 *		 g: Display data as a gang block header
3054 *		 i: Display as an indirect block
3055 *		 p: Do I/O to physical offset
3056 *		 r: Dump raw data to stdout
3057 *
3058 *              * = not yet implemented
3059 */
3060static void
3061zdb_read_block(char *thing, spa_t *spa)
3062{
3063	blkptr_t blk, *bp = &blk;
3064	dva_t *dva = bp->blk_dva;
3065	int flags = 0;
3066	uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0;
3067	zio_t *zio;
3068	vdev_t *vd;
3069	void *pbuf, *lbuf, *buf;
3070	char *s, *p, *dup, *vdev, *flagstr;
3071	int i, error;
3072
3073	dup = strdup(thing);
3074	s = strtok(dup, ":");
3075	vdev = s ? s : "";
3076	s = strtok(NULL, ":");
3077	offset = strtoull(s ? s : "", NULL, 16);
3078	s = strtok(NULL, ":");
3079	size = strtoull(s ? s : "", NULL, 16);
3080	s = strtok(NULL, ":");
3081	flagstr = s ? s : "";
3082
3083	s = NULL;
3084	if (size == 0)
3085		s = "size must not be zero";
3086	if (!IS_P2ALIGNED(size, DEV_BSIZE))
3087		s = "size must be a multiple of sector size";
3088	if (!IS_P2ALIGNED(offset, DEV_BSIZE))
3089		s = "offset must be a multiple of sector size";
3090	if (s) {
3091		(void) printf("Invalid block specifier: %s  - %s\n", thing, s);
3092		free(dup);
3093		return;
3094	}
3095
3096	for (s = strtok(flagstr, ":"); s; s = strtok(NULL, ":")) {
3097		for (i = 0; flagstr[i]; i++) {
3098			int bit = flagbits[(uchar_t)flagstr[i]];
3099
3100			if (bit == 0) {
3101				(void) printf("***Invalid flag: %c\n",
3102				    flagstr[i]);
3103				continue;
3104			}
3105			flags |= bit;
3106
3107			/* If it's not something with an argument, keep going */
3108			if ((bit & (ZDB_FLAG_CHECKSUM |
3109			    ZDB_FLAG_PRINT_BLKPTR)) == 0)
3110				continue;
3111
3112			p = &flagstr[i + 1];
3113			if (bit == ZDB_FLAG_PRINT_BLKPTR)
3114				blkptr_offset = strtoull(p, &p, 16);
3115			if (*p != ':' && *p != '\0') {
3116				(void) printf("***Invalid flag arg: '%s'\n", s);
3117				free(dup);
3118				return;
3119			}
3120			i += p - &flagstr[i + 1]; /* skip over the number */
3121		}
3122	}
3123
3124	vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev);
3125	if (vd == NULL) {
3126		(void) printf("***Invalid vdev: %s\n", vdev);
3127		free(dup);
3128		return;
3129	} else {
3130		if (vd->vdev_path)
3131			(void) fprintf(stderr, "Found vdev: %s\n",
3132			    vd->vdev_path);
3133		else
3134			(void) fprintf(stderr, "Found vdev type: %s\n",
3135			    vd->vdev_ops->vdev_op_type);
3136	}
3137
3138	psize = size;
3139	lsize = size;
3140
3141	pbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
3142	lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
3143
3144	BP_ZERO(bp);
3145
3146	DVA_SET_VDEV(&dva[0], vd->vdev_id);
3147	DVA_SET_OFFSET(&dva[0], offset);
3148	DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH));
3149	DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize));
3150
3151	BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
3152
3153	BP_SET_LSIZE(bp, lsize);
3154	BP_SET_PSIZE(bp, psize);
3155	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
3156	BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
3157	BP_SET_TYPE(bp, DMU_OT_NONE);
3158	BP_SET_LEVEL(bp, 0);
3159	BP_SET_DEDUP(bp, 0);
3160	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
3161
3162	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
3163	zio = zio_root(spa, NULL, NULL, 0);
3164
3165	if (vd == vd->vdev_top) {
3166		/*
3167		 * Treat this as a normal block read.
3168		 */
3169		zio_nowait(zio_read(zio, spa, bp, pbuf, psize, NULL, NULL,
3170		    ZIO_PRIORITY_SYNC_READ,
3171		    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL));
3172	} else {
3173		/*
3174		 * Treat this as a vdev child I/O.
3175		 */
3176		zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pbuf, psize,
3177		    ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
3178		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE |
3179		    ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
3180		    ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL, NULL));
3181	}
3182
3183	error = zio_wait(zio);
3184	spa_config_exit(spa, SCL_STATE, FTAG);
3185
3186	if (error) {
3187		(void) printf("Read of %s failed, error: %d\n", thing, error);
3188		goto out;
3189	}
3190
3191	if (flags & ZDB_FLAG_DECOMPRESS) {
3192		/*
3193		 * We don't know how the data was compressed, so just try
3194		 * every decompress function at every inflated blocksize.
3195		 */
3196		enum zio_compress c;
3197		void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
3198		void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
3199
3200		bcopy(pbuf, pbuf2, psize);
3201
3202		VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf + psize,
3203		    SPA_MAXBLOCKSIZE - psize) == 0);
3204
3205		VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize,
3206		    SPA_MAXBLOCKSIZE - psize) == 0);
3207
3208		for (lsize = SPA_MAXBLOCKSIZE; lsize > psize;
3209		    lsize -= SPA_MINBLOCKSIZE) {
3210			for (c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) {
3211				if (zio_decompress_data(c, pbuf, lbuf,
3212				    psize, lsize) == 0 &&
3213				    zio_decompress_data(c, pbuf2, lbuf2,
3214				    psize, lsize) == 0 &&
3215				    bcmp(lbuf, lbuf2, lsize) == 0)
3216					break;
3217			}
3218			if (c != ZIO_COMPRESS_FUNCTIONS)
3219				break;
3220			lsize -= SPA_MINBLOCKSIZE;
3221		}
3222
3223		umem_free(pbuf2, SPA_MAXBLOCKSIZE);
3224		umem_free(lbuf2, SPA_MAXBLOCKSIZE);
3225
3226		if (lsize <= psize) {
3227			(void) printf("Decompress of %s failed\n", thing);
3228			goto out;
3229		}
3230		buf = lbuf;
3231		size = lsize;
3232	} else {
3233		buf = pbuf;
3234		size = psize;
3235	}
3236
3237	if (flags & ZDB_FLAG_PRINT_BLKPTR)
3238		zdb_print_blkptr((blkptr_t *)(void *)
3239		    ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags);
3240	else if (flags & ZDB_FLAG_RAW)
3241		zdb_dump_block_raw(buf, size, flags);
3242	else if (flags & ZDB_FLAG_INDIRECT)
3243		zdb_dump_indirect((blkptr_t *)buf, size / sizeof (blkptr_t),
3244		    flags);
3245	else if (flags & ZDB_FLAG_GBH)
3246		zdb_dump_gbh(buf, flags);
3247	else
3248		zdb_dump_block(thing, buf, size, flags);
3249
3250out:
3251	umem_free(pbuf, SPA_MAXBLOCKSIZE);
3252	umem_free(lbuf, SPA_MAXBLOCKSIZE);
3253	free(dup);
3254}
3255
3256static boolean_t
3257pool_match(nvlist_t *cfg, char *tgt)
3258{
3259	uint64_t v, guid = strtoull(tgt, NULL, 0);
3260	char *s;
3261
3262	if (guid != 0) {
3263		if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &v) == 0)
3264			return (v == guid);
3265	} else {
3266		if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &s) == 0)
3267			return (strcmp(s, tgt) == 0);
3268	}
3269	return (B_FALSE);
3270}
3271
3272static char *
3273find_zpool(char **target, nvlist_t **configp, int dirc, char **dirv)
3274{
3275	nvlist_t *pools;
3276	nvlist_t *match = NULL;
3277	char *name = NULL;
3278	char *sepp = NULL;
3279	char sep;
3280	int count = 0;
3281	importargs_t args = { 0 };
3282
3283	args.paths = dirc;
3284	args.path = dirv;
3285	args.can_be_active = B_TRUE;
3286
3287	if ((sepp = strpbrk(*target, "/@")) != NULL) {
3288		sep = *sepp;
3289		*sepp = '\0';
3290	}
3291
3292	pools = zpool_search_import(g_zfs, &args);
3293
3294	if (pools != NULL) {
3295		nvpair_t *elem = NULL;
3296		while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) {
3297			verify(nvpair_value_nvlist(elem, configp) == 0);
3298			if (pool_match(*configp, *target)) {
3299				count++;
3300				if (match != NULL) {
3301					/* print previously found config */
3302					if (name != NULL) {
3303						(void) printf("%s\n", name);
3304						dump_nvlist(match, 8);
3305						name = NULL;
3306					}
3307					(void) printf("%s\n",
3308					    nvpair_name(elem));
3309					dump_nvlist(*configp, 8);
3310				} else {
3311					match = *configp;
3312					name = nvpair_name(elem);
3313				}
3314			}
3315		}
3316	}
3317	if (count > 1)
3318		(void) fatal("\tMatched %d pools - use pool GUID "
3319		    "instead of pool name or \n"
3320		    "\tpool name part of a dataset name to select pool", count);
3321
3322	if (sepp)
3323		*sepp = sep;
3324	/*
3325	 * If pool GUID was specified for pool id, replace it with pool name
3326	 */
3327	if (name && (strstr(*target, name) != *target)) {
3328		int sz = 1 + strlen(name) + ((sepp) ? strlen(sepp) : 0);
3329
3330		*target = umem_alloc(sz, UMEM_NOFAIL);
3331		(void) snprintf(*target, sz, "%s%s", name, sepp ? sepp : "");
3332	}
3333
3334	*configp = name ? match : NULL;
3335
3336	return (name);
3337}
3338
3339int
3340main(int argc, char **argv)
3341{
3342	int i, c;
3343	struct rlimit rl = { 1024, 1024 };
3344	spa_t *spa = NULL;
3345	objset_t *os = NULL;
3346	int dump_all = 1;
3347	int verbose = 0;
3348	int error = 0;
3349	char **searchdirs = NULL;
3350	int nsearch = 0;
3351	char *target;
3352	nvlist_t *policy = NULL;
3353	uint64_t max_txg = UINT64_MAX;
3354	int rewind = ZPOOL_NEVER_REWIND;
3355
3356	(void) setrlimit(RLIMIT_NOFILE, &rl);
3357	(void) enable_extended_FILE_stdio(-1, -1);
3358
3359	dprintf_setup(&argc, argv);
3360
3361	while ((c = getopt(argc, argv,
3362	    "bcdhilmM:suCDRSAFLXx:evp:t:U:P")) != -1) {
3363		switch (c) {
3364		case 'b':
3365		case 'c':
3366		case 'd':
3367		case 'h':
3368		case 'i':
3369		case 'l':
3370		case 'm':
3371		case 's':
3372		case 'u':
3373		case 'C':
3374		case 'D':
3375		case 'R':
3376		case 'S':
3377			dump_opt[c]++;
3378			dump_all = 0;
3379			break;
3380		case 'A':
3381		case 'F':
3382		case 'L':
3383		case 'X':
3384		case 'e':
3385		case 'P':
3386			dump_opt[c]++;
3387			break;
3388		case 'v':
3389			verbose++;
3390			break;
3391		case 'M':
3392			max_inflight = strtoull(optarg, NULL, 0);
3393			if (max_inflight == 0) {
3394				(void) fprintf(stderr, "maximum number "
3395				    "of inflight I/Os must be greater "
3396				    "than 0\n");
3397				usage();
3398			}
3399			break;
3400		case 'p':
3401			if (searchdirs == NULL) {
3402				searchdirs = umem_alloc(sizeof (char *),
3403				    UMEM_NOFAIL);
3404			} else {
3405				char **tmp = umem_alloc((nsearch + 1) *
3406				    sizeof (char *), UMEM_NOFAIL);
3407				bcopy(searchdirs, tmp, nsearch *
3408				    sizeof (char *));
3409				umem_free(searchdirs,
3410				    nsearch * sizeof (char *));
3411				searchdirs = tmp;
3412			}
3413			searchdirs[nsearch++] = optarg;
3414			break;
3415		case 'x':
3416			vn_dumpdir = optarg;
3417			break;
3418		case 't':
3419			max_txg = strtoull(optarg, NULL, 0);
3420			if (max_txg < TXG_INITIAL) {
3421				(void) fprintf(stderr, "incorrect txg "
3422				    "specified: %s\n", optarg);
3423				usage();
3424			}
3425			break;
3426		case 'U':
3427			spa_config_path = optarg;
3428			break;
3429		default:
3430			usage();
3431			break;
3432		}
3433	}
3434
3435	if (!dump_opt['e'] && searchdirs != NULL) {
3436		(void) fprintf(stderr, "-p option requires use of -e\n");
3437		usage();
3438	}
3439
3440	kernel_init(FREAD);
3441	g_zfs = libzfs_init();
3442	ASSERT(g_zfs != NULL);
3443
3444	if (dump_all)
3445		verbose = MAX(verbose, 1);
3446
3447	for (c = 0; c < 256; c++) {
3448		if (dump_all && !strchr("elAFLRSXP", c))
3449			dump_opt[c] = 1;
3450		if (dump_opt[c])
3451			dump_opt[c] += verbose;
3452	}
3453
3454	aok = (dump_opt['A'] == 1) || (dump_opt['A'] > 2);
3455	zfs_recover = (dump_opt['A'] > 1);
3456
3457	argc -= optind;
3458	argv += optind;
3459
3460	if (argc < 2 && dump_opt['R'])
3461		usage();
3462	if (argc < 1) {
3463		if (!dump_opt['e'] && dump_opt['C']) {
3464			dump_cachefile(spa_config_path);
3465			return (0);
3466		}
3467		usage();
3468	}
3469
3470	if (dump_opt['l']) {
3471		dump_label(argv[0]);
3472		return (0);
3473	}
3474
3475	if (dump_opt['X'] || dump_opt['F'])
3476		rewind = ZPOOL_DO_REWIND |
3477		    (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0);
3478
3479	if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 ||
3480	    nvlist_add_uint64(policy, ZPOOL_REWIND_REQUEST_TXG, max_txg) != 0 ||
3481	    nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, rewind) != 0)
3482		fatal("internal error: %s", strerror(ENOMEM));
3483
3484	error = 0;
3485	target = argv[0];
3486
3487	if (dump_opt['e']) {
3488		nvlist_t *cfg = NULL;
3489		char *name = find_zpool(&target, &cfg, nsearch, searchdirs);
3490
3491		error = ENOENT;
3492		if (name) {
3493			if (dump_opt['C'] > 1) {
3494				(void) printf("\nConfiguration for import:\n");
3495				dump_nvlist(cfg, 8);
3496			}
3497			if (nvlist_add_nvlist(cfg,
3498			    ZPOOL_REWIND_POLICY, policy) != 0) {
3499				fatal("can't open '%s': %s",
3500				    target, strerror(ENOMEM));
3501			}
3502			if ((error = spa_import(name, cfg, NULL,
3503			    ZFS_IMPORT_MISSING_LOG)) != 0) {
3504				error = spa_import(name, cfg, NULL,
3505				    ZFS_IMPORT_VERBATIM);
3506			}
3507		}
3508	}
3509
3510	if (error == 0) {
3511		if (strpbrk(target, "/@") == NULL || dump_opt['R']) {
3512			error = spa_open_rewind(target, &spa, FTAG, policy,
3513			    NULL);
3514			if (error) {
3515				/*
3516				 * If we're missing the log device then
3517				 * try opening the pool after clearing the
3518				 * log state.
3519				 */
3520				mutex_enter(&spa_namespace_lock);
3521				if ((spa = spa_lookup(target)) != NULL &&
3522				    spa->spa_log_state == SPA_LOG_MISSING) {
3523					spa->spa_log_state = SPA_LOG_CLEAR;
3524					error = 0;
3525				}
3526				mutex_exit(&spa_namespace_lock);
3527
3528				if (!error) {
3529					error = spa_open_rewind(target, &spa,
3530					    FTAG, policy, NULL);
3531				}
3532			}
3533		} else {
3534			error = dmu_objset_own(target, DMU_OST_ANY,
3535			    B_TRUE, FTAG, &os);
3536		}
3537	}
3538	nvlist_free(policy);
3539
3540	if (error)
3541		fatal("can't open '%s': %s", target, strerror(error));
3542
3543	argv++;
3544	argc--;
3545	if (!dump_opt['R']) {
3546		if (argc > 0) {
3547			zopt_objects = argc;
3548			zopt_object = calloc(zopt_objects, sizeof (uint64_t));
3549			for (i = 0; i < zopt_objects; i++) {
3550				errno = 0;
3551				zopt_object[i] = strtoull(argv[i], NULL, 0);
3552				if (zopt_object[i] == 0 && errno != 0)
3553					fatal("bad number %s: %s",
3554					    argv[i], strerror(errno));
3555			}
3556		}
3557		if (os != NULL) {
3558			dump_dir(os);
3559		} else if (zopt_objects > 0 && !dump_opt['m']) {
3560			dump_dir(spa->spa_meta_objset);
3561		} else {
3562			dump_zpool(spa);
3563		}
3564	} else {
3565		flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR;
3566		flagbits['c'] = ZDB_FLAG_CHECKSUM;
3567		flagbits['d'] = ZDB_FLAG_DECOMPRESS;
3568		flagbits['e'] = ZDB_FLAG_BSWAP;
3569		flagbits['g'] = ZDB_FLAG_GBH;
3570		flagbits['i'] = ZDB_FLAG_INDIRECT;
3571		flagbits['p'] = ZDB_FLAG_PHYS;
3572		flagbits['r'] = ZDB_FLAG_RAW;
3573
3574		for (i = 0; i < argc; i++)
3575			zdb_read_block(argv[i], spa);
3576	}
3577
3578	(os != NULL) ? dmu_objset_disown(os, FTAG) : spa_close(spa, FTAG);
3579
3580	fuid_table_destroy();
3581	sa_loaded = B_FALSE;
3582
3583	libzfs_fini(g_zfs);
3584	kernel_fini();
3585
3586	return (0);
3587}
3588