zfsimpl.c revision 268649
1/*-
2 * Copyright (c) 2007 Doug Rabson
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <sys/cdefs.h>
28__FBSDID("$FreeBSD: stable/10/sys/boot/zfs/zfsimpl.c 268649 2014-07-15 04:53:34Z delphij $");
29
30/*
31 *	Stand-alone ZFS file reader.
32 */
33
34#include <sys/stat.h>
35#include <sys/stdint.h>
36
37#include "zfsimpl.h"
38#include "zfssubr.c"
39
40
41struct zfsmount {
42	const spa_t	*spa;
43	objset_phys_t	objset;
44	uint64_t	rootobj;
45};
46
47/*
48 * List of all vdevs, chained through v_alllink.
49 */
50static vdev_list_t zfs_vdevs;
51
52 /*
53 * List of ZFS features supported for read
54 */
55static const char *features_for_read[] = {
56	"org.illumos:lz4_compress",
57	"com.delphix:hole_birth",
58	"com.delphix:extensible_dataset",
59	"com.delphix:embedded_data",
60	NULL
61};
62
63/*
64 * List of all pools, chained through spa_link.
65 */
66static spa_list_t zfs_pools;
67
68static uint64_t zfs_crc64_table[256];
69static const dnode_phys_t *dnode_cache_obj = 0;
70static uint64_t dnode_cache_bn;
71static char *dnode_cache_buf;
72static char *zap_scratch;
73static char *zfs_temp_buf, *zfs_temp_end, *zfs_temp_ptr;
74
75#define TEMP_SIZE	(1024 * 1024)
76
77static int zio_read(const spa_t *spa, const blkptr_t *bp, void *buf);
78static int zfs_get_root(const spa_t *spa, uint64_t *objid);
79static int zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result);
80
81static void
82zfs_init(void)
83{
84	STAILQ_INIT(&zfs_vdevs);
85	STAILQ_INIT(&zfs_pools);
86
87	zfs_temp_buf = malloc(TEMP_SIZE);
88	zfs_temp_end = zfs_temp_buf + TEMP_SIZE;
89	zfs_temp_ptr = zfs_temp_buf;
90	dnode_cache_buf = malloc(SPA_MAXBLOCKSIZE);
91	zap_scratch = malloc(SPA_MAXBLOCKSIZE);
92
93	zfs_init_crc();
94}
95
96static void *
97zfs_alloc(size_t size)
98{
99	char *ptr;
100
101	if (zfs_temp_ptr + size > zfs_temp_end) {
102		printf("ZFS: out of temporary buffer space\n");
103		for (;;) ;
104	}
105	ptr = zfs_temp_ptr;
106	zfs_temp_ptr += size;
107
108	return (ptr);
109}
110
111static void
112zfs_free(void *ptr, size_t size)
113{
114
115	zfs_temp_ptr -= size;
116	if (zfs_temp_ptr != ptr) {
117		printf("ZFS: zfs_alloc()/zfs_free() mismatch\n");
118		for (;;) ;
119	}
120}
121
/*
 * Decode one big-endian 32-bit integer from the XDR stream at *xdr,
 * store it in *ip and advance *xdr by four bytes.  Always returns 0.
 */
static int
xdr_int(const unsigned char **xdr, int *ip)
{
	const unsigned char *p = *xdr;
	uint32_t v;

	/*
	 * Assemble the value in an unsigned type: shifting a promoted
	 * (signed int) byte >= 0x80 left by 24 overflows.
	 */
	v = ((uint32_t)p[0] << 24)
		| ((uint32_t)p[1] << 16)
		| ((uint32_t)p[2] << 8)
		| ((uint32_t)p[3] << 0);
	*ip = (int)v;
	*xdr = p + 4;
	return (0);
}
132
/*
 * Decode one big-endian unsigned 32-bit integer from the XDR stream at
 * *xdr, store it in *ip and advance *xdr by four bytes.  Always
 * returns 0.
 */
static int
xdr_u_int(const unsigned char **xdr, u_int *ip)
{
	const unsigned char *p = *xdr;
	uint32_t v;

	/*
	 * Assemble the value in an unsigned type: shifting a promoted
	 * (signed int) byte >= 0x80 left by 24 overflows.
	 */
	v = ((uint32_t)p[0] << 24)
		| ((uint32_t)p[1] << 16)
		| ((uint32_t)p[2] << 8)
		| ((uint32_t)p[3] << 0);
	*ip = v;
	*xdr = p + 4;
	return (0);
}
143
144static int
145xdr_uint64_t(const unsigned char **xdr, uint64_t *lp)
146{
147	u_int hi, lo;
148
149	xdr_u_int(xdr, &hi);
150	xdr_u_int(xdr, &lo);
151	*lp = (((uint64_t) hi) << 32) | lo;
152	return (0);
153}
154
155static int
156nvlist_find(const unsigned char *nvlist, const char *name, int type,
157	    int* elementsp, void *valuep)
158{
159	const unsigned char *p, *pair;
160	int junk;
161	int encoded_size, decoded_size;
162
163	p = nvlist;
164	xdr_int(&p, &junk);
165	xdr_int(&p, &junk);
166
167	pair = p;
168	xdr_int(&p, &encoded_size);
169	xdr_int(&p, &decoded_size);
170	while (encoded_size && decoded_size) {
171		int namelen, pairtype, elements;
172		const char *pairname;
173
174		xdr_int(&p, &namelen);
175		pairname = (const char*) p;
176		p += roundup(namelen, 4);
177		xdr_int(&p, &pairtype);
178
179		if (!memcmp(name, pairname, namelen) && type == pairtype) {
180			xdr_int(&p, &elements);
181			if (elementsp)
182				*elementsp = elements;
183			if (type == DATA_TYPE_UINT64) {
184				xdr_uint64_t(&p, (uint64_t *) valuep);
185				return (0);
186			} else if (type == DATA_TYPE_STRING) {
187				int len;
188				xdr_int(&p, &len);
189				(*(const char**) valuep) = (const char*) p;
190				return (0);
191			} else if (type == DATA_TYPE_NVLIST
192				   || type == DATA_TYPE_NVLIST_ARRAY) {
193				(*(const unsigned char**) valuep) =
194					 (const unsigned char*) p;
195				return (0);
196			} else {
197				return (EIO);
198			}
199		} else {
200			/*
201			 * Not the pair we are looking for, skip to the next one.
202			 */
203			p = pair + encoded_size;
204		}
205
206		pair = p;
207		xdr_int(&p, &encoded_size);
208		xdr_int(&p, &decoded_size);
209	}
210
211	return (EIO);
212}
213
214static int
215nvlist_check_features_for_read(const unsigned char *nvlist)
216{
217	const unsigned char *p, *pair;
218	int junk;
219	int encoded_size, decoded_size;
220	int rc;
221
222	rc = 0;
223
224	p = nvlist;
225	xdr_int(&p, &junk);
226	xdr_int(&p, &junk);
227
228	pair = p;
229	xdr_int(&p, &encoded_size);
230	xdr_int(&p, &decoded_size);
231	while (encoded_size && decoded_size) {
232		int namelen, pairtype;
233		const char *pairname;
234		int i, found;
235
236		found = 0;
237
238		xdr_int(&p, &namelen);
239		pairname = (const char*) p;
240		p += roundup(namelen, 4);
241		xdr_int(&p, &pairtype);
242
243		for (i = 0; features_for_read[i] != NULL; i++) {
244			if (!memcmp(pairname, features_for_read[i], namelen)) {
245				found = 1;
246				break;
247			}
248		}
249
250		if (!found) {
251			printf("ZFS: unsupported feature: %s\n", pairname);
252			rc = EIO;
253		}
254
255		p = pair + encoded_size;
256
257		pair = p;
258		xdr_int(&p, &encoded_size);
259		xdr_int(&p, &decoded_size);
260	}
261
262	return (rc);
263}
264
/*
 * Return the next nvlist in an nvlist array.
 *
 * Skips the two header words and then every pair of the given list;
 * the result points just past the two size words of the terminating
 * (zero-sized) pair.
 */
static const unsigned char *
nvlist_next(const unsigned char *nvlist)
{
	const unsigned char *cursor, *pair_start;
	int unused;
	int enc_len, dec_len;

	cursor = nvlist;
	xdr_int(&cursor, &unused);
	xdr_int(&cursor, &unused);

	for (;;) {
		pair_start = cursor;
		xdr_int(&cursor, &enc_len);
		xdr_int(&cursor, &dec_len);
		if (enc_len == 0 || dec_len == 0)
			break;
		cursor = pair_start + enc_len;
	}

	return (cursor);
}
292
293#ifdef TEST
294
295static const unsigned char *
296nvlist_print(const unsigned char *nvlist, unsigned int indent)
297{
298	static const char* typenames[] = {
299		"DATA_TYPE_UNKNOWN",
300		"DATA_TYPE_BOOLEAN",
301		"DATA_TYPE_BYTE",
302		"DATA_TYPE_INT16",
303		"DATA_TYPE_UINT16",
304		"DATA_TYPE_INT32",
305		"DATA_TYPE_UINT32",
306		"DATA_TYPE_INT64",
307		"DATA_TYPE_UINT64",
308		"DATA_TYPE_STRING",
309		"DATA_TYPE_BYTE_ARRAY",
310		"DATA_TYPE_INT16_ARRAY",
311		"DATA_TYPE_UINT16_ARRAY",
312		"DATA_TYPE_INT32_ARRAY",
313		"DATA_TYPE_UINT32_ARRAY",
314		"DATA_TYPE_INT64_ARRAY",
315		"DATA_TYPE_UINT64_ARRAY",
316		"DATA_TYPE_STRING_ARRAY",
317		"DATA_TYPE_HRTIME",
318		"DATA_TYPE_NVLIST",
319		"DATA_TYPE_NVLIST_ARRAY",
320		"DATA_TYPE_BOOLEAN_VALUE",
321		"DATA_TYPE_INT8",
322		"DATA_TYPE_UINT8",
323		"DATA_TYPE_BOOLEAN_ARRAY",
324		"DATA_TYPE_INT8_ARRAY",
325		"DATA_TYPE_UINT8_ARRAY"
326	};
327
328	unsigned int i, j;
329	const unsigned char *p, *pair;
330	int junk;
331	int encoded_size, decoded_size;
332
333	p = nvlist;
334	xdr_int(&p, &junk);
335	xdr_int(&p, &junk);
336
337	pair = p;
338	xdr_int(&p, &encoded_size);
339	xdr_int(&p, &decoded_size);
340	while (encoded_size && decoded_size) {
341		int namelen, pairtype, elements;
342		const char *pairname;
343
344		xdr_int(&p, &namelen);
345		pairname = (const char*) p;
346		p += roundup(namelen, 4);
347		xdr_int(&p, &pairtype);
348
349		for (i = 0; i < indent; i++)
350			printf(" ");
351		printf("%s %s", typenames[pairtype], pairname);
352
353		xdr_int(&p, &elements);
354		switch (pairtype) {
355		case DATA_TYPE_UINT64: {
356			uint64_t val;
357			xdr_uint64_t(&p, &val);
358			printf(" = 0x%jx\n", (uintmax_t)val);
359			break;
360		}
361
362		case DATA_TYPE_STRING: {
363			int len;
364			xdr_int(&p, &len);
365			printf(" = \"%s\"\n", p);
366			break;
367		}
368
369		case DATA_TYPE_NVLIST:
370			printf("\n");
371			nvlist_print(p, indent + 1);
372			break;
373
374		case DATA_TYPE_NVLIST_ARRAY:
375			for (j = 0; j < elements; j++) {
376				printf("[%d]\n", j);
377				p = nvlist_print(p, indent + 1);
378				if (j != elements - 1) {
379					for (i = 0; i < indent; i++)
380						printf(" ");
381					printf("%s %s", typenames[pairtype], pairname);
382				}
383			}
384			break;
385
386		default:
387			printf("\n");
388		}
389
390		p = pair + encoded_size;
391
392		pair = p;
393		xdr_int(&p, &encoded_size);
394		xdr_int(&p, &decoded_size);
395	}
396
397	return p;
398}
399
400#endif
401
402static int
403vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf,
404    off_t offset, size_t size)
405{
406	size_t psize;
407	int rc;
408
409	if (!vdev->v_phys_read)
410		return (EIO);
411
412	if (bp) {
413		psize = BP_GET_PSIZE(bp);
414	} else {
415		psize = size;
416	}
417
418	/*printf("ZFS: reading %d bytes at 0x%jx to %p\n", psize, (uintmax_t)offset, buf);*/
419	rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize);
420	if (rc)
421		return (rc);
422	if (bp && zio_checksum_verify(bp, buf))
423		return (EIO);
424
425	return (0);
426}
427
428static int
429vdev_disk_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
430    off_t offset, size_t bytes)
431{
432
433	return (vdev_read_phys(vdev, bp, buf,
434		offset + VDEV_LABEL_START_SIZE, bytes));
435}
436
437
438static int
439vdev_mirror_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
440    off_t offset, size_t bytes)
441{
442	vdev_t *kid;
443	int rc;
444
445	rc = EIO;
446	STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
447		if (kid->v_state != VDEV_STATE_HEALTHY)
448			continue;
449		rc = kid->v_read(kid, bp, buf, offset, bytes);
450		if (!rc)
451			return (0);
452	}
453
454	return (rc);
455}
456
457static int
458vdev_replacing_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
459    off_t offset, size_t bytes)
460{
461	vdev_t *kid;
462
463	/*
464	 * Here we should have two kids:
465	 * First one which is the one we are replacing and we can trust
466	 * only this one to have valid data, but it might not be present.
467	 * Second one is that one we are replacing with. It is most likely
468	 * healthy, but we can't trust it has needed data, so we won't use it.
469	 */
470	kid = STAILQ_FIRST(&vdev->v_children);
471	if (kid == NULL)
472		return (EIO);
473	if (kid->v_state != VDEV_STATE_HEALTHY)
474		return (EIO);
475	return (kid->v_read(kid, bp, buf, offset, bytes));
476}
477
478static vdev_t *
479vdev_find(uint64_t guid)
480{
481	vdev_t *vdev;
482
483	STAILQ_FOREACH(vdev, &zfs_vdevs, v_alllink)
484		if (vdev->v_guid == guid)
485			return (vdev);
486
487	return (0);
488}
489
490static vdev_t *
491vdev_create(uint64_t guid, vdev_read_t *read)
492{
493	vdev_t *vdev;
494
495	vdev = malloc(sizeof(vdev_t));
496	memset(vdev, 0, sizeof(vdev_t));
497	STAILQ_INIT(&vdev->v_children);
498	vdev->v_guid = guid;
499	vdev->v_state = VDEV_STATE_OFFLINE;
500	vdev->v_read = read;
501	vdev->v_phys_read = 0;
502	vdev->v_read_priv = 0;
503	STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink);
504
505	return (vdev);
506}
507
508static int
509vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t *pvdev,
510    vdev_t **vdevp, int is_newer)
511{
512	int rc;
513	uint64_t guid, id, ashift, nparity;
514	const char *type;
515	const char *path;
516	vdev_t *vdev, *kid;
517	const unsigned char *kids;
518	int nkids, i, is_new;
519	uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present;
520
521	if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID,
522			DATA_TYPE_UINT64, 0, &guid)
523	    || nvlist_find(nvlist, ZPOOL_CONFIG_ID,
524			   DATA_TYPE_UINT64, 0, &id)
525	    || nvlist_find(nvlist, ZPOOL_CONFIG_TYPE,
526			   DATA_TYPE_STRING, 0, &type)) {
527		printf("ZFS: can't find vdev details\n");
528		return (ENOENT);
529	}
530
531	if (strcmp(type, VDEV_TYPE_MIRROR)
532	    && strcmp(type, VDEV_TYPE_DISK)
533#ifdef ZFS_TEST
534	    && strcmp(type, VDEV_TYPE_FILE)
535#endif
536	    && strcmp(type, VDEV_TYPE_RAIDZ)
537	    && strcmp(type, VDEV_TYPE_REPLACING)) {
538		printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n");
539		return (EIO);
540	}
541
542	is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0;
543
544	nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, 0,
545			&is_offline);
546	nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, 0,
547			&is_removed);
548	nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, 0,
549			&is_faulted);
550	nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64, 0,
551			&is_degraded);
552	nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64, 0,
553			&isnt_present);
554
555	vdev = vdev_find(guid);
556	if (!vdev) {
557		is_new = 1;
558
559		if (!strcmp(type, VDEV_TYPE_MIRROR))
560			vdev = vdev_create(guid, vdev_mirror_read);
561		else if (!strcmp(type, VDEV_TYPE_RAIDZ))
562			vdev = vdev_create(guid, vdev_raidz_read);
563		else if (!strcmp(type, VDEV_TYPE_REPLACING))
564			vdev = vdev_create(guid, vdev_replacing_read);
565		else
566			vdev = vdev_create(guid, vdev_disk_read);
567
568		vdev->v_id = id;
569		vdev->v_top = pvdev != NULL ? pvdev : vdev;
570		if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
571			DATA_TYPE_UINT64, 0, &ashift) == 0)
572			vdev->v_ashift = ashift;
573		else
574			vdev->v_ashift = 0;
575		if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
576			DATA_TYPE_UINT64, 0, &nparity) == 0)
577			vdev->v_nparity = nparity;
578		else
579			vdev->v_nparity = 0;
580		if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
581				DATA_TYPE_STRING, 0, &path) == 0) {
582			if (strncmp(path, "/dev/", 5) == 0)
583				path += 5;
584			vdev->v_name = strdup(path);
585		} else {
586			if (!strcmp(type, "raidz")) {
587				if (vdev->v_nparity == 1)
588					vdev->v_name = "raidz1";
589				else if (vdev->v_nparity == 2)
590					vdev->v_name = "raidz2";
591				else if (vdev->v_nparity == 3)
592					vdev->v_name = "raidz3";
593				else {
594					printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n");
595					return (EIO);
596				}
597			} else {
598				vdev->v_name = strdup(type);
599			}
600		}
601	} else {
602		is_new = 0;
603	}
604
605	if (is_new || is_newer) {
606		/*
607		 * This is either new vdev or we've already seen this vdev,
608		 * but from an older vdev label, so let's refresh its state
609		 * from the newer label.
610		 */
611		if (is_offline)
612			vdev->v_state = VDEV_STATE_OFFLINE;
613		else if (is_removed)
614			vdev->v_state = VDEV_STATE_REMOVED;
615		else if (is_faulted)
616			vdev->v_state = VDEV_STATE_FAULTED;
617		else if (is_degraded)
618			vdev->v_state = VDEV_STATE_DEGRADED;
619		else if (isnt_present)
620			vdev->v_state = VDEV_STATE_CANT_OPEN;
621	}
622
623	rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN,
624			 DATA_TYPE_NVLIST_ARRAY, &nkids, &kids);
625	/*
626	 * Its ok if we don't have any kids.
627	 */
628	if (rc == 0) {
629		vdev->v_nchildren = nkids;
630		for (i = 0; i < nkids; i++) {
631			rc = vdev_init_from_nvlist(kids, vdev, &kid, is_newer);
632			if (rc)
633				return (rc);
634			if (is_new)
635				STAILQ_INSERT_TAIL(&vdev->v_children, kid,
636						   v_childlink);
637			kids = nvlist_next(kids);
638		}
639	} else {
640		vdev->v_nchildren = 0;
641	}
642
643	if (vdevp)
644		*vdevp = vdev;
645	return (0);
646}
647
648static void
649vdev_set_state(vdev_t *vdev)
650{
651	vdev_t *kid;
652	int good_kids;
653	int bad_kids;
654
655	/*
656	 * A mirror or raidz is healthy if all its kids are healthy. A
657	 * mirror is degraded if any of its kids is healthy; a raidz
658	 * is degraded if at most nparity kids are offline.
659	 */
660	if (STAILQ_FIRST(&vdev->v_children)) {
661		good_kids = 0;
662		bad_kids = 0;
663		STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
664			if (kid->v_state == VDEV_STATE_HEALTHY)
665				good_kids++;
666			else
667				bad_kids++;
668		}
669		if (bad_kids == 0) {
670			vdev->v_state = VDEV_STATE_HEALTHY;
671		} else {
672			if (vdev->v_read == vdev_mirror_read) {
673				if (good_kids) {
674					vdev->v_state = VDEV_STATE_DEGRADED;
675				} else {
676					vdev->v_state = VDEV_STATE_OFFLINE;
677				}
678			} else if (vdev->v_read == vdev_raidz_read) {
679				if (bad_kids > vdev->v_nparity) {
680					vdev->v_state = VDEV_STATE_OFFLINE;
681				} else {
682					vdev->v_state = VDEV_STATE_DEGRADED;
683				}
684			}
685		}
686	}
687}
688
689static spa_t *
690spa_find_by_guid(uint64_t guid)
691{
692	spa_t *spa;
693
694	STAILQ_FOREACH(spa, &zfs_pools, spa_link)
695		if (spa->spa_guid == guid)
696			return (spa);
697
698	return (0);
699}
700
701static spa_t *
702spa_find_by_name(const char *name)
703{
704	spa_t *spa;
705
706	STAILQ_FOREACH(spa, &zfs_pools, spa_link)
707		if (!strcmp(spa->spa_name, name))
708			return (spa);
709
710	return (0);
711}
712
713#ifdef BOOT2
714static spa_t *
715spa_get_primary(void)
716{
717
718	return (STAILQ_FIRST(&zfs_pools));
719}
720
721static vdev_t *
722spa_get_primary_vdev(const spa_t *spa)
723{
724	vdev_t *vdev;
725	vdev_t *kid;
726
727	if (spa == NULL)
728		spa = spa_get_primary();
729	if (spa == NULL)
730		return (NULL);
731	vdev = STAILQ_FIRST(&spa->spa_vdevs);
732	if (vdev == NULL)
733		return (NULL);
734	for (kid = STAILQ_FIRST(&vdev->v_children); kid != NULL;
735	     kid = STAILQ_FIRST(&vdev->v_children))
736		vdev = kid;
737	return (vdev);
738}
739#endif
740
741static spa_t *
742spa_create(uint64_t guid)
743{
744	spa_t *spa;
745
746	spa = malloc(sizeof(spa_t));
747	memset(spa, 0, sizeof(spa_t));
748	STAILQ_INIT(&spa->spa_vdevs);
749	spa->spa_guid = guid;
750	STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link);
751
752	return (spa);
753}
754
755static const char *
756state_name(vdev_state_t state)
757{
758	static const char* names[] = {
759		"UNKNOWN",
760		"CLOSED",
761		"OFFLINE",
762		"REMOVED",
763		"CANT_OPEN",
764		"FAULTED",
765		"DEGRADED",
766		"ONLINE"
767	};
768	return names[state];
769}
770
771#ifdef BOOT2
772
773#define pager_printf printf
774
775#else
776
/*
 * Format a line and send it through the pager.
 *
 * The output is limited to the size of the line buffer; anything longer
 * is truncated.  The callers in this file print pool, dataset and
 * device names, which can be longer than the buffer, so an unbounded
 * vsprintf() here could write past the end of `line'.
 *
 * NOTE(review): relies on vsnprintf() being provided alongside
 * vsprintf() in the stand-alone environment - confirm.
 */
static void
pager_printf(const char *fmt, ...)
{
	char line[80];
	va_list args;

	va_start(args, fmt);
	vsnprintf(line, sizeof(line), fmt, args);
	va_end(args);
	pager_output(line);
}
788
789#endif
790
791#define STATUS_FORMAT	"        %s %s\n"
792
793static void
794print_state(int indent, const char *name, vdev_state_t state)
795{
796	int i;
797	char buf[512];
798
799	buf[0] = 0;
800	for (i = 0; i < indent; i++)
801		strcat(buf, "  ");
802	strcat(buf, name);
803	pager_printf(STATUS_FORMAT, buf, state_name(state));
804
805}
806
807static void
808vdev_status(vdev_t *vdev, int indent)
809{
810	vdev_t *kid;
811	print_state(indent, vdev->v_name, vdev->v_state);
812
813	STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
814		vdev_status(kid, indent + 1);
815	}
816}
817
818static void
819spa_status(spa_t *spa)
820{
821	static char bootfs[ZFS_MAXNAMELEN];
822	uint64_t rootid;
823	vdev_t *vdev;
824	int good_kids, bad_kids, degraded_kids;
825	vdev_state_t state;
826
827	pager_printf("  pool: %s\n", spa->spa_name);
828	if (zfs_get_root(spa, &rootid) == 0 &&
829	    zfs_rlookup(spa, rootid, bootfs) == 0) {
830		if (bootfs[0] == '\0')
831			pager_printf("bootfs: %s\n", spa->spa_name);
832		else
833			pager_printf("bootfs: %s/%s\n", spa->spa_name, bootfs);
834	}
835	pager_printf("config:\n\n");
836	pager_printf(STATUS_FORMAT, "NAME", "STATE");
837
838	good_kids = 0;
839	degraded_kids = 0;
840	bad_kids = 0;
841	STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
842		if (vdev->v_state == VDEV_STATE_HEALTHY)
843			good_kids++;
844		else if (vdev->v_state == VDEV_STATE_DEGRADED)
845			degraded_kids++;
846		else
847			bad_kids++;
848	}
849
850	state = VDEV_STATE_CLOSED;
851	if (good_kids > 0 && (degraded_kids + bad_kids) == 0)
852		state = VDEV_STATE_HEALTHY;
853	else if ((good_kids + degraded_kids) > 0)
854		state = VDEV_STATE_DEGRADED;
855
856	print_state(0, spa->spa_name, state);
857	STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
858		vdev_status(vdev, 1);
859	}
860}
861
862static void
863spa_all_status(void)
864{
865	spa_t *spa;
866	int first = 1;
867
868	STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
869		if (!first)
870			pager_printf("\n");
871		first = 0;
872		spa_status(spa);
873	}
874}
875
876static int
877vdev_probe(vdev_phys_read_t *read, void *read_priv, spa_t **spap)
878{
879	vdev_t vtmp;
880	vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch;
881	spa_t *spa;
882	vdev_t *vdev, *top_vdev, *pool_vdev;
883	off_t off;
884	blkptr_t bp;
885	const unsigned char *nvlist;
886	uint64_t val;
887	uint64_t guid;
888	uint64_t pool_txg, pool_guid;
889	uint64_t is_log;
890	const char *pool_name;
891	const unsigned char *vdevs;
892	const unsigned char *features;
893	int i, rc, is_newer;
894	char *upbuf;
895	const struct uberblock *up;
896
897	/*
898	 * Load the vdev label and figure out which
899	 * uberblock is most current.
900	 */
901	memset(&vtmp, 0, sizeof(vtmp));
902	vtmp.v_phys_read = read;
903	vtmp.v_read_priv = read_priv;
904	off = offsetof(vdev_label_t, vl_vdev_phys);
905	BP_ZERO(&bp);
906	BP_SET_LSIZE(&bp, sizeof(vdev_phys_t));
907	BP_SET_PSIZE(&bp, sizeof(vdev_phys_t));
908	BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
909	BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
910	DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
911	ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
912	if (vdev_read_phys(&vtmp, &bp, vdev_label, off, 0))
913		return (EIO);
914
915	if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR) {
916		return (EIO);
917	}
918
919	nvlist = (const unsigned char *) vdev_label->vp_nvlist + 4;
920
921	if (nvlist_find(nvlist,
922			ZPOOL_CONFIG_VERSION,
923			DATA_TYPE_UINT64, 0, &val)) {
924		return (EIO);
925	}
926
927	if (!SPA_VERSION_IS_SUPPORTED(val)) {
928		printf("ZFS: unsupported ZFS version %u (should be %u)\n",
929		    (unsigned) val, (unsigned) SPA_VERSION);
930		return (EIO);
931	}
932
933	/* Check ZFS features for read */
934	if (nvlist_find(nvlist,
935			ZPOOL_CONFIG_FEATURES_FOR_READ,
936			DATA_TYPE_NVLIST, 0, &features) == 0
937	    && nvlist_check_features_for_read(features) != 0)
938		return (EIO);
939
940	if (nvlist_find(nvlist,
941			ZPOOL_CONFIG_POOL_STATE,
942			DATA_TYPE_UINT64, 0, &val)) {
943		return (EIO);
944	}
945
946	if (val == POOL_STATE_DESTROYED) {
947		/* We don't boot only from destroyed pools. */
948		return (EIO);
949	}
950
951	if (nvlist_find(nvlist,
952			ZPOOL_CONFIG_POOL_TXG,
953			DATA_TYPE_UINT64, 0, &pool_txg)
954	    || nvlist_find(nvlist,
955			   ZPOOL_CONFIG_POOL_GUID,
956			   DATA_TYPE_UINT64, 0, &pool_guid)
957	    || nvlist_find(nvlist,
958			   ZPOOL_CONFIG_POOL_NAME,
959			   DATA_TYPE_STRING, 0, &pool_name)) {
960		/*
961		 * Cache and spare devices end up here - just ignore
962		 * them.
963		 */
964		/*printf("ZFS: can't find pool details\n");*/
965		return (EIO);
966	}
967
968	is_log = 0;
969	(void) nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, 0,
970	    &is_log);
971	if (is_log)
972		return (EIO);
973
974	/*
975	 * Create the pool if this is the first time we've seen it.
976	 */
977	spa = spa_find_by_guid(pool_guid);
978	if (!spa) {
979		spa = spa_create(pool_guid);
980		spa->spa_name = strdup(pool_name);
981	}
982	if (pool_txg > spa->spa_txg) {
983		spa->spa_txg = pool_txg;
984		is_newer = 1;
985	} else
986		is_newer = 0;
987
988	/*
989	 * Get the vdev tree and create our in-core copy of it.
990	 * If we already have a vdev with this guid, this must
991	 * be some kind of alias (overlapping slices, dangerously dedicated
992	 * disks etc).
993	 */
994	if (nvlist_find(nvlist,
995			ZPOOL_CONFIG_GUID,
996			DATA_TYPE_UINT64, 0, &guid)) {
997		return (EIO);
998	}
999	vdev = vdev_find(guid);
1000	if (vdev && vdev->v_phys_read)	/* Has this vdev already been inited? */
1001		return (EIO);
1002
1003	if (nvlist_find(nvlist,
1004			ZPOOL_CONFIG_VDEV_TREE,
1005			DATA_TYPE_NVLIST, 0, &vdevs)) {
1006		return (EIO);
1007	}
1008
1009	rc = vdev_init_from_nvlist(vdevs, NULL, &top_vdev, is_newer);
1010	if (rc)
1011		return (rc);
1012
1013	/*
1014	 * Add the toplevel vdev to the pool if its not already there.
1015	 */
1016	STAILQ_FOREACH(pool_vdev, &spa->spa_vdevs, v_childlink)
1017		if (top_vdev == pool_vdev)
1018			break;
1019	if (!pool_vdev && top_vdev)
1020		STAILQ_INSERT_TAIL(&spa->spa_vdevs, top_vdev, v_childlink);
1021
1022	/*
1023	 * We should already have created an incomplete vdev for this
1024	 * vdev. Find it and initialise it with our read proc.
1025	 */
1026	vdev = vdev_find(guid);
1027	if (vdev) {
1028		vdev->v_phys_read = read;
1029		vdev->v_read_priv = read_priv;
1030		vdev->v_state = VDEV_STATE_HEALTHY;
1031	} else {
1032		printf("ZFS: inconsistent nvlist contents\n");
1033		return (EIO);
1034	}
1035
1036	/*
1037	 * Re-evaluate top-level vdev state.
1038	 */
1039	vdev_set_state(top_vdev);
1040
1041	/*
1042	 * Ok, we are happy with the pool so far. Lets find
1043	 * the best uberblock and then we can actually access
1044	 * the contents of the pool.
1045	 */
1046	upbuf = zfs_alloc(VDEV_UBERBLOCK_SIZE(vdev));
1047	up = (const struct uberblock *)upbuf;
1048	for (i = 0;
1049	     i < VDEV_UBERBLOCK_COUNT(vdev);
1050	     i++) {
1051		off = VDEV_UBERBLOCK_OFFSET(vdev, i);
1052		BP_ZERO(&bp);
1053		DVA_SET_OFFSET(&bp.blk_dva[0], off);
1054		BP_SET_LSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
1055		BP_SET_PSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
1056		BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
1057		BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
1058		ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
1059
1060		if (vdev_read_phys(vdev, &bp, upbuf, off, 0))
1061			continue;
1062
1063		if (up->ub_magic != UBERBLOCK_MAGIC)
1064			continue;
1065		if (up->ub_txg < spa->spa_txg)
1066			continue;
1067		if (up->ub_txg > spa->spa_uberblock.ub_txg) {
1068			spa->spa_uberblock = *up;
1069		} else if (up->ub_txg == spa->spa_uberblock.ub_txg) {
1070			if (up->ub_timestamp > spa->spa_uberblock.ub_timestamp)
1071				spa->spa_uberblock = *up;
1072		}
1073	}
1074	zfs_free(upbuf, VDEV_UBERBLOCK_SIZE(vdev));
1075
1076	if (spap)
1077		*spap = spa;
1078	return (0);
1079}
1080
/*
 * Return the base-2 logarithm of `n' if it is a positive power of two,
 * or -1 otherwise (including zero and negative values).
 */
static int
ilog2(int n)
{
	int v;

	if (n <= 0)
		return (-1);

	/*
	 * Only bits 0..30 can be set in a positive int.  The shift is
	 * done on an unsigned 1 and stops short of bit 31, because
	 * shifting a signed 1 into the sign bit is undefined.
	 */
	for (v = 0; v < 31; v++)
		if ((unsigned int)n == (1U << v))
			return (v);
	return (-1);
}
1091
1092static int
1093zio_read_gang(const spa_t *spa, const blkptr_t *bp, void *buf)
1094{
1095	blkptr_t gbh_bp;
1096	zio_gbh_phys_t zio_gb;
1097	char *pbuf;
1098	int i;
1099
1100	/* Artificial BP for gang block header. */
1101	gbh_bp = *bp;
1102	BP_SET_PSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
1103	BP_SET_LSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
1104	BP_SET_CHECKSUM(&gbh_bp, ZIO_CHECKSUM_GANG_HEADER);
1105	BP_SET_COMPRESS(&gbh_bp, ZIO_COMPRESS_OFF);
1106	for (i = 0; i < SPA_DVAS_PER_BP; i++)
1107		DVA_SET_GANG(&gbh_bp.blk_dva[i], 0);
1108
1109	/* Read gang header block using the artificial BP. */
1110	if (zio_read(spa, &gbh_bp, &zio_gb))
1111		return (EIO);
1112
1113	pbuf = buf;
1114	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
1115		blkptr_t *gbp = &zio_gb.zg_blkptr[i];
1116
1117		if (BP_IS_HOLE(gbp))
1118			continue;
1119		if (zio_read(spa, gbp, pbuf))
1120			return (EIO);
1121		pbuf += BP_GET_PSIZE(gbp);
1122	}
1123
1124	if (zio_checksum_verify(bp, buf))
1125		return (EIO);
1126	return (0);
1127}
1128
1129static int
1130zio_read(const spa_t *spa, const blkptr_t *bp, void *buf)
1131{
1132	int cpfunc = BP_GET_COMPRESS(bp);
1133	uint64_t align, size;
1134	void *pbuf;
1135	int i, error;
1136
1137	/*
1138	 * Process data embedded in block pointer
1139	 */
1140	if (BP_IS_EMBEDDED(bp)) {
1141		ASSERT(BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
1142
1143		size = BPE_GET_PSIZE(bp);
1144		ASSERT(size <= BPE_PAYLOAD_SIZE);
1145
1146		if (cpfunc != ZIO_COMPRESS_OFF)
1147			pbuf = zfs_alloc(size);
1148		else
1149			pbuf = buf;
1150
1151		decode_embedded_bp_compressed(bp, pbuf);
1152		error = 0;
1153
1154		if (cpfunc != ZIO_COMPRESS_OFF) {
1155			error = zio_decompress_data(cpfunc, pbuf,
1156			    size, buf, BP_GET_LSIZE(bp));
1157			zfs_free(pbuf, size);
1158		}
1159		if (error != 0)
1160			printf("ZFS: i/o error - unable to decompress block pointer data, error %d\n",
1161			    error);
1162		return (error);
1163	}
1164
1165	error = EIO;
1166
1167	for (i = 0; i < SPA_DVAS_PER_BP; i++) {
1168		const dva_t *dva = &bp->blk_dva[i];
1169		vdev_t *vdev;
1170		int vdevid;
1171		off_t offset;
1172
1173		if (!dva->dva_word[0] && !dva->dva_word[1])
1174			continue;
1175
1176		vdevid = DVA_GET_VDEV(dva);
1177		offset = DVA_GET_OFFSET(dva);
1178		STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
1179			if (vdev->v_id == vdevid)
1180				break;
1181		}
1182		if (!vdev || !vdev->v_read)
1183			continue;
1184
1185		size = BP_GET_PSIZE(bp);
1186		if (vdev->v_read == vdev_raidz_read) {
1187			align = 1ULL << vdev->v_top->v_ashift;
1188			if (P2PHASE(size, align) != 0)
1189				size = P2ROUNDUP(size, align);
1190		}
1191		if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF)
1192			pbuf = zfs_alloc(size);
1193		else
1194			pbuf = buf;
1195
1196		if (DVA_GET_GANG(dva))
1197			error = zio_read_gang(spa, bp, pbuf);
1198		else
1199			error = vdev->v_read(vdev, bp, pbuf, offset, size);
1200		if (error == 0) {
1201			if (cpfunc != ZIO_COMPRESS_OFF)
1202				error = zio_decompress_data(cpfunc, pbuf,
1203				    BP_GET_PSIZE(bp), buf, BP_GET_LSIZE(bp));
1204			else if (size != BP_GET_PSIZE(bp))
1205				bcopy(pbuf, buf, BP_GET_PSIZE(bp));
1206		}
1207		if (buf != pbuf)
1208			zfs_free(pbuf, size);
1209		if (error == 0)
1210			break;
1211	}
1212	if (error != 0)
1213		printf("ZFS: i/o error - all block copies unavailable\n");
1214	return (error);
1215}
1216
1217static int
1218dnode_read(const spa_t *spa, const dnode_phys_t *dnode, off_t offset, void *buf, size_t buflen)
1219{
1220	int ibshift = dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
1221	int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1222	int nlevels = dnode->dn_nlevels;
1223	int i, rc;
1224
1225	/*
1226	 * Note: bsize may not be a power of two here so we need to do an
1227	 * actual divide rather than a bitshift.
1228	 */
1229	while (buflen > 0) {
1230		uint64_t bn = offset / bsize;
1231		int boff = offset % bsize;
1232		int ibn;
1233		const blkptr_t *indbp;
1234		blkptr_t bp;
1235
1236		if (bn > dnode->dn_maxblkid)
1237			return (EIO);
1238
1239		if (dnode == dnode_cache_obj && bn == dnode_cache_bn)
1240			goto cached;
1241
1242		indbp = dnode->dn_blkptr;
1243		for (i = 0; i < nlevels; i++) {
1244			/*
1245			 * Copy the bp from the indirect array so that
1246			 * we can re-use the scratch buffer for multi-level
1247			 * objects.
1248			 */
1249			ibn = bn >> ((nlevels - i - 1) * ibshift);
1250			ibn &= ((1 << ibshift) - 1);
1251			bp = indbp[ibn];
1252			rc = zio_read(spa, &bp, dnode_cache_buf);
1253			if (rc)
1254				return (rc);
1255			indbp = (const blkptr_t *) dnode_cache_buf;
1256		}
1257		dnode_cache_obj = dnode;
1258		dnode_cache_bn = bn;
1259	cached:
1260
1261		/*
1262		 * The buffer contains our data block. Copy what we
1263		 * need from it and loop.
1264		 */
1265		i = bsize - boff;
1266		if (i > buflen) i = buflen;
1267		memcpy(buf, &dnode_cache_buf[boff], i);
1268		buf = ((char*) buf) + i;
1269		offset += i;
1270		buflen -= i;
1271	}
1272
1273	return (0);
1274}
1275
1276/*
1277 * Lookup a value in a microzap directory. Assumes that the zap
1278 * scratch buffer contains the directory contents.
1279 */
1280static int
1281mzap_lookup(const dnode_phys_t *dnode, const char *name, uint64_t *value)
1282{
1283	const mzap_phys_t *mz;
1284	const mzap_ent_phys_t *mze;
1285	size_t size;
1286	int chunks, i;
1287
1288	/*
1289	 * Microzap objects use exactly one block. Read the whole
1290	 * thing.
1291	 */
1292	size = dnode->dn_datablkszsec * 512;
1293
1294	mz = (const mzap_phys_t *) zap_scratch;
1295	chunks = size / MZAP_ENT_LEN - 1;
1296
1297	for (i = 0; i < chunks; i++) {
1298		mze = &mz->mz_chunk[i];
1299		if (!strcmp(mze->mze_name, name)) {
1300			*value = mze->mze_value;
1301			return (0);
1302		}
1303	}
1304
1305	return (ENOENT);
1306}
1307
1308/*
1309 * Compare a name with a zap leaf entry. Return non-zero if the name
1310 * matches.
1311 */
1312static int
1313fzap_name_equal(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, const char *name)
1314{
1315	size_t namelen;
1316	const zap_leaf_chunk_t *nc;
1317	const char *p;
1318
1319	namelen = zc->l_entry.le_name_numints;
1320
1321	nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
1322	p = name;
1323	while (namelen > 0) {
1324		size_t len;
1325		len = namelen;
1326		if (len > ZAP_LEAF_ARRAY_BYTES)
1327			len = ZAP_LEAF_ARRAY_BYTES;
1328		if (memcmp(p, nc->l_array.la_array, len))
1329			return (0);
1330		p += len;
1331		namelen -= len;
1332		nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
1333	}
1334
1335	return 1;
1336}
1337
1338/*
1339 * Extract a uint64_t value from a zap leaf entry.
1340 */
1341static uint64_t
1342fzap_leaf_value(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc)
1343{
1344	const zap_leaf_chunk_t *vc;
1345	int i;
1346	uint64_t value;
1347	const uint8_t *p;
1348
1349	vc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_value_chunk);
1350	for (i = 0, value = 0, p = vc->l_array.la_array; i < 8; i++) {
1351		value = (value << 8) | p[i];
1352	}
1353
1354	return value;
1355}
1356
1357/*
1358 * Lookup a value in a fatzap directory. Assumes that the zap scratch
1359 * buffer contains the directory header.
1360 */
1361static int
1362fzap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
1363{
1364	int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1365	zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1366	fat_zap_t z;
1367	uint64_t *ptrtbl;
1368	uint64_t hash;
1369	int rc;
1370
1371	if (zh.zap_magic != ZAP_MAGIC)
1372		return (EIO);
1373
1374	z.zap_block_shift = ilog2(bsize);
1375	z.zap_phys = (zap_phys_t *) zap_scratch;
1376
1377	/*
1378	 * Figure out where the pointer table is and read it in if necessary.
1379	 */
1380	if (zh.zap_ptrtbl.zt_blk) {
1381		rc = dnode_read(spa, dnode, zh.zap_ptrtbl.zt_blk * bsize,
1382			       zap_scratch, bsize);
1383		if (rc)
1384			return (rc);
1385		ptrtbl = (uint64_t *) zap_scratch;
1386	} else {
1387		ptrtbl = &ZAP_EMBEDDED_PTRTBL_ENT(&z, 0);
1388	}
1389
1390	hash = zap_hash(zh.zap_salt, name);
1391
1392	zap_leaf_t zl;
1393	zl.l_bs = z.zap_block_shift;
1394
1395	off_t off = ptrtbl[hash >> (64 - zh.zap_ptrtbl.zt_shift)] << zl.l_bs;
1396	zap_leaf_chunk_t *zc;
1397
1398	rc = dnode_read(spa, dnode, off, zap_scratch, bsize);
1399	if (rc)
1400		return (rc);
1401
1402	zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1403
1404	/*
1405	 * Make sure this chunk matches our hash.
1406	 */
1407	if (zl.l_phys->l_hdr.lh_prefix_len > 0
1408	    && zl.l_phys->l_hdr.lh_prefix
1409	    != hash >> (64 - zl.l_phys->l_hdr.lh_prefix_len))
1410		return (ENOENT);
1411
1412	/*
1413	 * Hash within the chunk to find our entry.
1414	 */
1415	int shift = (64 - ZAP_LEAF_HASH_SHIFT(&zl) - zl.l_phys->l_hdr.lh_prefix_len);
1416	int h = (hash >> shift) & ((1 << ZAP_LEAF_HASH_SHIFT(&zl)) - 1);
1417	h = zl.l_phys->l_hash[h];
1418	if (h == 0xffff)
1419		return (ENOENT);
1420	zc = &ZAP_LEAF_CHUNK(&zl, h);
1421	while (zc->l_entry.le_hash != hash) {
1422		if (zc->l_entry.le_next == 0xffff) {
1423			zc = 0;
1424			break;
1425		}
1426		zc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_next);
1427	}
1428	if (fzap_name_equal(&zl, zc, name)) {
1429		if (zc->l_entry.le_value_intlen * zc->l_entry.le_value_numints > 8)
1430			return (E2BIG);
1431		*value = fzap_leaf_value(&zl, zc);
1432		return (0);
1433	}
1434
1435	return (ENOENT);
1436}
1437
1438/*
1439 * Lookup a name in a zap object and return its value as a uint64_t.
1440 */
1441static int
1442zap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
1443{
1444	int rc;
1445	uint64_t zap_type;
1446	size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1447
1448	rc = dnode_read(spa, dnode, 0, zap_scratch, size);
1449	if (rc)
1450		return (rc);
1451
1452	zap_type = *(uint64_t *) zap_scratch;
1453	if (zap_type == ZBT_MICRO)
1454		return mzap_lookup(dnode, name, value);
1455	else if (zap_type == ZBT_HEADER)
1456		return fzap_lookup(spa, dnode, name, value);
1457	printf("ZFS: invalid zap_type=%d\n", (int)zap_type);
1458	return (EIO);
1459}
1460
1461/*
1462 * List a microzap directory. Assumes that the zap scratch buffer contains
1463 * the directory contents.
1464 */
1465static int
1466mzap_list(const dnode_phys_t *dnode)
1467{
1468	const mzap_phys_t *mz;
1469	const mzap_ent_phys_t *mze;
1470	size_t size;
1471	int chunks, i;
1472
1473	/*
1474	 * Microzap objects use exactly one block. Read the whole
1475	 * thing.
1476	 */
1477	size = dnode->dn_datablkszsec * 512;
1478	mz = (const mzap_phys_t *) zap_scratch;
1479	chunks = size / MZAP_ENT_LEN - 1;
1480
1481	for (i = 0; i < chunks; i++) {
1482		mze = &mz->mz_chunk[i];
1483		if (mze->mze_name[0])
1484			//printf("%-32s 0x%jx\n", mze->mze_name, (uintmax_t)mze->mze_value);
1485			printf("%s\n", mze->mze_name);
1486	}
1487
1488	return (0);
1489}
1490
1491/*
1492 * List a fatzap directory. Assumes that the zap scratch buffer contains
1493 * the directory header.
1494 */
1495static int
1496fzap_list(const spa_t *spa, const dnode_phys_t *dnode)
1497{
1498	int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1499	zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1500	fat_zap_t z;
1501	int i, j;
1502
1503	if (zh.zap_magic != ZAP_MAGIC)
1504		return (EIO);
1505
1506	z.zap_block_shift = ilog2(bsize);
1507	z.zap_phys = (zap_phys_t *) zap_scratch;
1508
1509	/*
1510	 * This assumes that the leaf blocks start at block 1. The
1511	 * documentation isn't exactly clear on this.
1512	 */
1513	zap_leaf_t zl;
1514	zl.l_bs = z.zap_block_shift;
1515	for (i = 0; i < zh.zap_num_leafs; i++) {
1516		off_t off = (i + 1) << zl.l_bs;
1517		char name[256], *p;
1518		uint64_t value;
1519
1520		if (dnode_read(spa, dnode, off, zap_scratch, bsize))
1521			return (EIO);
1522
1523		zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1524
1525		for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
1526			zap_leaf_chunk_t *zc, *nc;
1527			int namelen;
1528
1529			zc = &ZAP_LEAF_CHUNK(&zl, j);
1530			if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
1531				continue;
1532			namelen = zc->l_entry.le_name_numints;
1533			if (namelen > sizeof(name))
1534				namelen = sizeof(name);
1535
1536			/*
1537			 * Paste the name back together.
1538			 */
1539			nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk);
1540			p = name;
1541			while (namelen > 0) {
1542				int len;
1543				len = namelen;
1544				if (len > ZAP_LEAF_ARRAY_BYTES)
1545					len = ZAP_LEAF_ARRAY_BYTES;
1546				memcpy(p, nc->l_array.la_array, len);
1547				p += len;
1548				namelen -= len;
1549				nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next);
1550			}
1551
1552			/*
1553			 * Assume the first eight bytes of the value are
1554			 * a uint64_t.
1555			 */
1556			value = fzap_leaf_value(&zl, zc);
1557
1558			//printf("%s 0x%jx\n", name, (uintmax_t)value);
1559			printf("%s\n", name);
1560		}
1561	}
1562
1563	return (0);
1564}
1565
1566/*
1567 * List a zap directory.
1568 */
1569static int
1570zap_list(const spa_t *spa, const dnode_phys_t *dnode)
1571{
1572	uint64_t zap_type;
1573	size_t size = dnode->dn_datablkszsec * 512;
1574
1575	if (dnode_read(spa, dnode, 0, zap_scratch, size))
1576		return (EIO);
1577
1578	zap_type = *(uint64_t *) zap_scratch;
1579	if (zap_type == ZBT_MICRO)
1580		return mzap_list(dnode);
1581	else
1582		return fzap_list(spa, dnode);
1583}
1584
1585static int
1586objset_get_dnode(const spa_t *spa, const objset_phys_t *os, uint64_t objnum, dnode_phys_t *dnode)
1587{
1588	off_t offset;
1589
1590	offset = objnum * sizeof(dnode_phys_t);
1591	return dnode_read(spa, &os->os_meta_dnode, offset,
1592		dnode, sizeof(dnode_phys_t));
1593}
1594
1595static int
1596mzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value)
1597{
1598	const mzap_phys_t *mz;
1599	const mzap_ent_phys_t *mze;
1600	size_t size;
1601	int chunks, i;
1602
1603	/*
1604	 * Microzap objects use exactly one block. Read the whole
1605	 * thing.
1606	 */
1607	size = dnode->dn_datablkszsec * 512;
1608
1609	mz = (const mzap_phys_t *) zap_scratch;
1610	chunks = size / MZAP_ENT_LEN - 1;
1611
1612	for (i = 0; i < chunks; i++) {
1613		mze = &mz->mz_chunk[i];
1614		if (value == mze->mze_value) {
1615			strcpy(name, mze->mze_name);
1616			return (0);
1617		}
1618	}
1619
1620	return (ENOENT);
1621}
1622
1623static void
1624fzap_name_copy(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, char *name)
1625{
1626	size_t namelen;
1627	const zap_leaf_chunk_t *nc;
1628	char *p;
1629
1630	namelen = zc->l_entry.le_name_numints;
1631
1632	nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
1633	p = name;
1634	while (namelen > 0) {
1635		size_t len;
1636		len = namelen;
1637		if (len > ZAP_LEAF_ARRAY_BYTES)
1638			len = ZAP_LEAF_ARRAY_BYTES;
1639		memcpy(p, nc->l_array.la_array, len);
1640		p += len;
1641		namelen -= len;
1642		nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
1643	}
1644
1645	*p = '\0';
1646}
1647
1648static int
1649fzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value)
1650{
1651	int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1652	zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1653	fat_zap_t z;
1654	int i, j;
1655
1656	if (zh.zap_magic != ZAP_MAGIC)
1657		return (EIO);
1658
1659	z.zap_block_shift = ilog2(bsize);
1660	z.zap_phys = (zap_phys_t *) zap_scratch;
1661
1662	/*
1663	 * This assumes that the leaf blocks start at block 1. The
1664	 * documentation isn't exactly clear on this.
1665	 */
1666	zap_leaf_t zl;
1667	zl.l_bs = z.zap_block_shift;
1668	for (i = 0; i < zh.zap_num_leafs; i++) {
1669		off_t off = (i + 1) << zl.l_bs;
1670
1671		if (dnode_read(spa, dnode, off, zap_scratch, bsize))
1672			return (EIO);
1673
1674		zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1675
1676		for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
1677			zap_leaf_chunk_t *zc;
1678
1679			zc = &ZAP_LEAF_CHUNK(&zl, j);
1680			if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
1681				continue;
1682			if (zc->l_entry.le_value_intlen != 8 ||
1683			    zc->l_entry.le_value_numints != 1)
1684				continue;
1685
1686			if (fzap_leaf_value(&zl, zc) == value) {
1687				fzap_name_copy(&zl, zc, name);
1688				return (0);
1689			}
1690		}
1691	}
1692
1693	return (ENOENT);
1694}
1695
1696static int
1697zap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value)
1698{
1699	int rc;
1700	uint64_t zap_type;
1701	size_t size = dnode->dn_datablkszsec * 512;
1702
1703	rc = dnode_read(spa, dnode, 0, zap_scratch, size);
1704	if (rc)
1705		return (rc);
1706
1707	zap_type = *(uint64_t *) zap_scratch;
1708	if (zap_type == ZBT_MICRO)
1709		return mzap_rlookup(spa, dnode, name, value);
1710	else
1711		return fzap_rlookup(spa, dnode, name, value);
1712}
1713
1714static int
1715zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result)
1716{
1717	char name[256];
1718	char component[256];
1719	uint64_t dir_obj, parent_obj, child_dir_zapobj;
1720	dnode_phys_t child_dir_zap, dataset, dir, parent;
1721	dsl_dir_phys_t *dd;
1722	dsl_dataset_phys_t *ds;
1723	char *p;
1724	int len;
1725
1726	p = &name[sizeof(name) - 1];
1727	*p = '\0';
1728
1729	if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
1730		printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
1731		return (EIO);
1732	}
1733	ds = (dsl_dataset_phys_t *)&dataset.dn_bonus;
1734	dir_obj = ds->ds_dir_obj;
1735
1736	for (;;) {
1737		if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir) != 0)
1738			return (EIO);
1739		dd = (dsl_dir_phys_t *)&dir.dn_bonus;
1740
1741		/* Actual loop condition. */
1742		parent_obj  = dd->dd_parent_obj;
1743		if (parent_obj == 0)
1744			break;
1745
1746		if (objset_get_dnode(spa, &spa->spa_mos, parent_obj, &parent) != 0)
1747			return (EIO);
1748		dd = (dsl_dir_phys_t *)&parent.dn_bonus;
1749		child_dir_zapobj = dd->dd_child_dir_zapobj;
1750		if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0)
1751			return (EIO);
1752		if (zap_rlookup(spa, &child_dir_zap, component, dir_obj) != 0)
1753			return (EIO);
1754
1755		len = strlen(component);
1756		p -= len;
1757		memcpy(p, component, len);
1758		--p;
1759		*p = '/';
1760
1761		/* Actual loop iteration. */
1762		dir_obj = parent_obj;
1763	}
1764
1765	if (*p != '\0')
1766		++p;
1767	strcpy(result, p);
1768
1769	return (0);
1770}
1771
1772static int
1773zfs_lookup_dataset(const spa_t *spa, const char *name, uint64_t *objnum)
1774{
1775	char element[256];
1776	uint64_t dir_obj, child_dir_zapobj;
1777	dnode_phys_t child_dir_zap, dir;
1778	dsl_dir_phys_t *dd;
1779	const char *p, *q;
1780
1781	if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir))
1782		return (EIO);
1783	if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, &dir_obj))
1784		return (EIO);
1785
1786	p = name;
1787	for (;;) {
1788		if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir))
1789			return (EIO);
1790		dd = (dsl_dir_phys_t *)&dir.dn_bonus;
1791
1792		while (*p == '/')
1793			p++;
1794		/* Actual loop condition #1. */
1795		if (*p == '\0')
1796			break;
1797
1798		q = strchr(p, '/');
1799		if (q) {
1800			memcpy(element, p, q - p);
1801			element[q - p] = '\0';
1802			p = q + 1;
1803		} else {
1804			strcpy(element, p);
1805			p += strlen(p);
1806		}
1807
1808		child_dir_zapobj = dd->dd_child_dir_zapobj;
1809		if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0)
1810			return (EIO);
1811
1812		/* Actual loop condition #2. */
1813		if (zap_lookup(spa, &child_dir_zap, element, &dir_obj) != 0)
1814			return (ENOENT);
1815	}
1816
1817	*objnum = dd->dd_head_dataset_obj;
1818	return (0);
1819}
1820
1821#ifndef BOOT2
1822static int
1823zfs_list_dataset(const spa_t *spa, uint64_t objnum/*, int pos, char *entry*/)
1824{
1825	uint64_t dir_obj, child_dir_zapobj;
1826	dnode_phys_t child_dir_zap, dir, dataset;
1827	dsl_dataset_phys_t *ds;
1828	dsl_dir_phys_t *dd;
1829
1830	if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
1831		printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
1832		return (EIO);
1833	}
1834	ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
1835	dir_obj = ds->ds_dir_obj;
1836
1837	if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir)) {
1838		printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj);
1839		return (EIO);
1840	}
1841	dd = (dsl_dir_phys_t *)&dir.dn_bonus;
1842
1843	child_dir_zapobj = dd->dd_child_dir_zapobj;
1844	if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0) {
1845		printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj);
1846		return (EIO);
1847	}
1848
1849	return (zap_list(spa, &child_dir_zap) != 0);
1850}
1851#endif
1852
1853/*
1854 * Find the object set given the object number of its dataset object
1855 * and return its details in *objset
1856 */
1857static int
1858zfs_mount_dataset(const spa_t *spa, uint64_t objnum, objset_phys_t *objset)
1859{
1860	dnode_phys_t dataset;
1861	dsl_dataset_phys_t *ds;
1862
1863	if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
1864		printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
1865		return (EIO);
1866	}
1867
1868	ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
1869	if (zio_read(spa, &ds->ds_bp, objset)) {
1870		printf("ZFS: can't read object set for dataset %ju\n",
1871		    (uintmax_t)objnum);
1872		return (EIO);
1873	}
1874
1875	return (0);
1876}
1877
1878/*
1879 * Find the object set pointed to by the BOOTFS property or the root
1880 * dataset if there is none and return its details in *objset
1881 */
1882static int
1883zfs_get_root(const spa_t *spa, uint64_t *objid)
1884{
1885	dnode_phys_t dir, propdir;
1886	uint64_t props, bootfs, root;
1887
1888	*objid = 0;
1889
1890	/*
1891	 * Start with the MOS directory object.
1892	 */
1893	if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir)) {
1894		printf("ZFS: can't read MOS object directory\n");
1895		return (EIO);
1896	}
1897
1898	/*
1899	 * Lookup the pool_props and see if we can find a bootfs.
1900	 */
1901	if (zap_lookup(spa, &dir, DMU_POOL_PROPS, &props) == 0
1902	     && objset_get_dnode(spa, &spa->spa_mos, props, &propdir) == 0
1903	     && zap_lookup(spa, &propdir, "bootfs", &bootfs) == 0
1904	     && bootfs != 0)
1905	{
1906		*objid = bootfs;
1907		return (0);
1908	}
1909	/*
1910	 * Lookup the root dataset directory
1911	 */
1912	if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, &root)
1913	    || objset_get_dnode(spa, &spa->spa_mos, root, &dir)) {
1914		printf("ZFS: can't find root dsl_dir\n");
1915		return (EIO);
1916	}
1917
1918	/*
1919	 * Use the information from the dataset directory's bonus buffer
1920	 * to find the dataset object and from that the object set itself.
1921	 */
1922	dsl_dir_phys_t *dd = (dsl_dir_phys_t *) &dir.dn_bonus;
1923	*objid = dd->dd_head_dataset_obj;
1924	return (0);
1925}
1926
1927static int
1928zfs_mount(const spa_t *spa, uint64_t rootobj, struct zfsmount *mount)
1929{
1930
1931	mount->spa = spa;
1932
1933	/*
1934	 * Find the root object set if not explicitly provided
1935	 */
1936	if (rootobj == 0 && zfs_get_root(spa, &rootobj)) {
1937		printf("ZFS: can't find root filesystem\n");
1938		return (EIO);
1939	}
1940
1941	if (zfs_mount_dataset(spa, rootobj, &mount->objset)) {
1942		printf("ZFS: can't open root filesystem\n");
1943		return (EIO);
1944	}
1945
1946	mount->rootobj = rootobj;
1947
1948	return (0);
1949}
1950
1951static int
1952zfs_spa_init(spa_t *spa)
1953{
1954
1955	if (zio_read(spa, &spa->spa_uberblock.ub_rootbp, &spa->spa_mos)) {
1956		printf("ZFS: can't read MOS of pool %s\n", spa->spa_name);
1957		return (EIO);
1958	}
1959	if (spa->spa_mos.os_type != DMU_OST_META) {
1960		printf("ZFS: corrupted MOS of pool %s\n", spa->spa_name);
1961		return (EIO);
1962	}
1963	return (0);
1964}
1965
1966static int
1967zfs_dnode_stat(const spa_t *spa, dnode_phys_t *dn, struct stat *sb)
1968{
1969
1970	if (dn->dn_bonustype != DMU_OT_SA) {
1971		znode_phys_t *zp = (znode_phys_t *)dn->dn_bonus;
1972
1973		sb->st_mode = zp->zp_mode;
1974		sb->st_uid = zp->zp_uid;
1975		sb->st_gid = zp->zp_gid;
1976		sb->st_size = zp->zp_size;
1977	} else {
1978		sa_hdr_phys_t *sahdrp;
1979		int hdrsize;
1980		size_t size = 0;
1981		void *buf = NULL;
1982
1983		if (dn->dn_bonuslen != 0)
1984			sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn);
1985		else {
1986			if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0) {
1987				blkptr_t *bp = &dn->dn_spill;
1988				int error;
1989
1990				size = BP_GET_LSIZE(bp);
1991				buf = zfs_alloc(size);
1992				error = zio_read(spa, bp, buf);
1993				if (error != 0) {
1994					zfs_free(buf, size);
1995					return (error);
1996				}
1997				sahdrp = buf;
1998			} else {
1999				return (EIO);
2000			}
2001		}
2002		hdrsize = SA_HDR_SIZE(sahdrp);
2003		sb->st_mode = *(uint64_t *)((char *)sahdrp + hdrsize +
2004		    SA_MODE_OFFSET);
2005		sb->st_uid = *(uint64_t *)((char *)sahdrp + hdrsize +
2006		    SA_UID_OFFSET);
2007		sb->st_gid = *(uint64_t *)((char *)sahdrp + hdrsize +
2008		    SA_GID_OFFSET);
2009		sb->st_size = *(uint64_t *)((char *)sahdrp + hdrsize +
2010		    SA_SIZE_OFFSET);
2011		if (buf != NULL)
2012			zfs_free(buf, size);
2013	}
2014
2015	return (0);
2016}
2017
2018/*
2019 * Lookup a file and return its dnode.
2020 */
2021static int
2022zfs_lookup(const struct zfsmount *mount, const char *upath, dnode_phys_t *dnode)
2023{
2024	int rc;
2025	uint64_t objnum, rootnum, parentnum;
2026	const spa_t *spa;
2027	dnode_phys_t dn;
2028	const char *p, *q;
2029	char element[256];
2030	char path[1024];
2031	int symlinks_followed = 0;
2032	struct stat sb;
2033
2034	spa = mount->spa;
2035	if (mount->objset.os_type != DMU_OST_ZFS) {
2036		printf("ZFS: unexpected object set type %ju\n",
2037		    (uintmax_t)mount->objset.os_type);
2038		return (EIO);
2039	}
2040
2041	/*
2042	 * Get the root directory dnode.
2043	 */
2044	rc = objset_get_dnode(spa, &mount->objset, MASTER_NODE_OBJ, &dn);
2045	if (rc)
2046		return (rc);
2047
2048	rc = zap_lookup(spa, &dn, ZFS_ROOT_OBJ, &rootnum);
2049	if (rc)
2050		return (rc);
2051
2052	rc = objset_get_dnode(spa, &mount->objset, rootnum, &dn);
2053	if (rc)
2054		return (rc);
2055
2056	objnum = rootnum;
2057	p = upath;
2058	while (p && *p) {
2059		while (*p == '/')
2060			p++;
2061		if (!*p)
2062			break;
2063		q = strchr(p, '/');
2064		if (q) {
2065			memcpy(element, p, q - p);
2066			element[q - p] = 0;
2067			p = q;
2068		} else {
2069			strcpy(element, p);
2070			p = 0;
2071		}
2072
2073		rc = zfs_dnode_stat(spa, &dn, &sb);
2074		if (rc)
2075			return (rc);
2076		if (!S_ISDIR(sb.st_mode))
2077			return (ENOTDIR);
2078
2079		parentnum = objnum;
2080		rc = zap_lookup(spa, &dn, element, &objnum);
2081		if (rc)
2082			return (rc);
2083		objnum = ZFS_DIRENT_OBJ(objnum);
2084
2085		rc = objset_get_dnode(spa, &mount->objset, objnum, &dn);
2086		if (rc)
2087			return (rc);
2088
2089		/*
2090		 * Check for symlink.
2091		 */
2092		rc = zfs_dnode_stat(spa, &dn, &sb);
2093		if (rc)
2094			return (rc);
2095		if (S_ISLNK(sb.st_mode)) {
2096			if (symlinks_followed > 10)
2097				return (EMLINK);
2098			symlinks_followed++;
2099
2100			/*
2101			 * Read the link value and copy the tail of our
2102			 * current path onto the end.
2103			 */
2104			if (p)
2105				strcpy(&path[sb.st_size], p);
2106			else
2107				path[sb.st_size] = 0;
2108			if (sb.st_size + sizeof(znode_phys_t) <= dn.dn_bonuslen) {
2109				memcpy(path, &dn.dn_bonus[sizeof(znode_phys_t)],
2110					sb.st_size);
2111			} else {
2112				rc = dnode_read(spa, &dn, 0, path, sb.st_size);
2113				if (rc)
2114					return (rc);
2115			}
2116
2117			/*
2118			 * Restart with the new path, starting either at
2119			 * the root or at the parent depending whether or
2120			 * not the link is relative.
2121			 */
2122			p = path;
2123			if (*p == '/')
2124				objnum = rootnum;
2125			else
2126				objnum = parentnum;
2127			objset_get_dnode(spa, &mount->objset, objnum, &dn);
2128		}
2129	}
2130
2131	*dnode = dn;
2132	return (0);
2133}
2134