zfsimpl.c revision 263397
1/*- 2 * Copyright (c) 2007 Doug Rabson 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27#include <sys/cdefs.h> 28__FBSDID("$FreeBSD: stable/10/sys/boot/zfs/zfsimpl.c 263397 2014-03-19 23:55:03Z delphij $"); 29 30/* 31 * Stand-alone ZFS file reader. 32 */ 33 34#include <sys/stat.h> 35#include <sys/stdint.h> 36 37#include "zfsimpl.h" 38#include "zfssubr.c" 39 40 41struct zfsmount { 42 const spa_t *spa; 43 objset_phys_t objset; 44 uint64_t rootobj; 45}; 46 47/* 48 * List of all vdevs, chained through v_alllink. 49 */ 50static vdev_list_t zfs_vdevs; 51 52 /* 53 * List of ZFS features supported for read 54 */ 55static const char *features_for_read[] = { 56 "org.illumos:lz4_compress", 57 "com.delphix:hole_birth", 58 "com.delphix:extensible_dataset", 59 NULL 60}; 61 62/* 63 * List of all pools, chained through spa_link. 64 */ 65static spa_list_t zfs_pools; 66 67static uint64_t zfs_crc64_table[256]; 68static const dnode_phys_t *dnode_cache_obj = 0; 69static uint64_t dnode_cache_bn; 70static char *dnode_cache_buf; 71static char *zap_scratch; 72static char *zfs_temp_buf, *zfs_temp_end, *zfs_temp_ptr; 73 74#define TEMP_SIZE (1024 * 1024) 75 76static int zio_read(const spa_t *spa, const blkptr_t *bp, void *buf); 77static int zfs_get_root(const spa_t *spa, uint64_t *objid); 78static int zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result); 79 80static void 81zfs_init(void) 82{ 83 STAILQ_INIT(&zfs_vdevs); 84 STAILQ_INIT(&zfs_pools); 85 86 zfs_temp_buf = malloc(TEMP_SIZE); 87 zfs_temp_end = zfs_temp_buf + TEMP_SIZE; 88 zfs_temp_ptr = zfs_temp_buf; 89 dnode_cache_buf = malloc(SPA_MAXBLOCKSIZE); 90 zap_scratch = malloc(SPA_MAXBLOCKSIZE); 91 92 zfs_init_crc(); 93} 94 95static void * 96zfs_alloc(size_t size) 97{ 98 char *ptr; 99 100 if (zfs_temp_ptr + size > zfs_temp_end) { 101 printf("ZFS: out of temporary buffer space\n"); 102 for (;;) ; 103 } 104 ptr = zfs_temp_ptr; 105 zfs_temp_ptr += size; 106 107 return (ptr); 108} 109 110static void 111zfs_free(void *ptr, size_t size) 112{ 113 114 zfs_temp_ptr -= size; 115 if (zfs_temp_ptr != ptr) { 116 printf("ZFS: zfs_alloc()/zfs_free() mismatch\n"); 117 for (;;) ; 118 } 119} 120 121static int 122xdr_int(const unsigned char **xdr, int *ip) 123{ 124 *ip = ((*xdr)[0] << 24) 125 | ((*xdr)[1] << 16) 126 | ((*xdr)[2] << 8) 127 | ((*xdr)[3] << 0); 128 (*xdr) += 4; 129 return (0); 130} 131 132static int 133xdr_u_int(const unsigned char **xdr, u_int *ip) 134{ 135 *ip = ((*xdr)[0] << 24) 136 | ((*xdr)[1] << 16) 137 | ((*xdr)[2] << 8) 138 | ((*xdr)[3] << 0); 139 (*xdr) += 4; 140 return (0); 141} 142 143static int 144xdr_uint64_t(const unsigned char **xdr, uint64_t *lp) 145{ 146 u_int hi, lo; 147 148 xdr_u_int(xdr, &hi); 149 xdr_u_int(xdr, &lo); 150 *lp = (((uint64_t) hi) << 32) | lo; 151 return (0); 152} 153 154static int 155nvlist_find(const unsigned char *nvlist, const char *name, int type, 156 int* elementsp, void *valuep) 157{ 158 const unsigned char *p, *pair; 159 int junk; 160 int encoded_size, decoded_size; 161 162 p = nvlist; 163 xdr_int(&p, &junk); 164 xdr_int(&p, &junk); 165 166 pair = p; 167 xdr_int(&p, &encoded_size); 168 xdr_int(&p, &decoded_size); 169 while (encoded_size && decoded_size) { 170 int namelen, pairtype, elements; 171 const char *pairname; 172 173 xdr_int(&p, &namelen); 174 pairname = (const char*) p; 175 p += roundup(namelen, 4); 176 xdr_int(&p, &pairtype); 177 178 if (!memcmp(name, pairname, namelen) && type == pairtype) { 179 xdr_int(&p, &elements); 180 if (elementsp) 181 *elementsp = elements; 182 if (type == DATA_TYPE_UINT64) { 183 xdr_uint64_t(&p, (uint64_t *) valuep); 184 return (0); 185 } else if (type == DATA_TYPE_STRING) { 186 int len; 187 xdr_int(&p, &len); 188 (*(const char**) valuep) = (const char*) p; 189 return (0); 190 } else if (type == DATA_TYPE_NVLIST 191 || type == DATA_TYPE_NVLIST_ARRAY) { 192 (*(const unsigned char**) valuep) = 193 (const unsigned char*) p; 194 return (0); 195 } else { 196 return (EIO); 197 } 198 } else { 199 /* 200 * Not the pair we are looking for, skip to the next one. 201 */ 202 p = pair + encoded_size; 203 } 204 205 pair = p; 206 xdr_int(&p, &encoded_size); 207 xdr_int(&p, &decoded_size); 208 } 209 210 return (EIO); 211} 212 213static int 214nvlist_check_features_for_read(const unsigned char *nvlist) 215{ 216 const unsigned char *p, *pair; 217 int junk; 218 int encoded_size, decoded_size; 219 int rc; 220 221 rc = 0; 222 223 p = nvlist; 224 xdr_int(&p, &junk); 225 xdr_int(&p, &junk); 226 227 pair = p; 228 xdr_int(&p, &encoded_size); 229 xdr_int(&p, &decoded_size); 230 while (encoded_size && decoded_size) { 231 int namelen, pairtype; 232 const char *pairname; 233 int i, found; 234 235 found = 0; 236 237 xdr_int(&p, &namelen); 238 pairname = (const char*) p; 239 p += roundup(namelen, 4); 240 xdr_int(&p, &pairtype); 241 242 for (i = 0; features_for_read[i] != NULL; i++) { 243 if (!memcmp(pairname, features_for_read[i], namelen)) { 244 found = 1; 245 break; 246 } 247 } 248 249 if (!found) { 250 printf("ZFS: unsupported feature: %s\n", pairname); 251 rc = EIO; 252 } 253 254 p = pair + encoded_size; 255 256 pair = p; 257 xdr_int(&p, &encoded_size); 258 xdr_int(&p, &decoded_size); 259 } 260 261 return (rc); 262} 263 264/* 265 * Return the next nvlist in an nvlist array. 266 */ 267static const unsigned char * 268nvlist_next(const unsigned char *nvlist) 269{ 270 const unsigned char *p, *pair; 271 int junk; 272 int encoded_size, decoded_size; 273 274 p = nvlist; 275 xdr_int(&p, &junk); 276 xdr_int(&p, &junk); 277 278 pair = p; 279 xdr_int(&p, &encoded_size); 280 xdr_int(&p, &decoded_size); 281 while (encoded_size && decoded_size) { 282 p = pair + encoded_size; 283 284 pair = p; 285 xdr_int(&p, &encoded_size); 286 xdr_int(&p, &decoded_size); 287 } 288 289 return p; 290} 291 292#ifdef TEST 293 294static const unsigned char * 295nvlist_print(const unsigned char *nvlist, unsigned int indent) 296{ 297 static const char* typenames[] = { 298 "DATA_TYPE_UNKNOWN", 299 "DATA_TYPE_BOOLEAN", 300 "DATA_TYPE_BYTE", 301 "DATA_TYPE_INT16", 302 "DATA_TYPE_UINT16", 303 "DATA_TYPE_INT32", 304 "DATA_TYPE_UINT32", 305 "DATA_TYPE_INT64", 306 "DATA_TYPE_UINT64", 307 "DATA_TYPE_STRING", 308 "DATA_TYPE_BYTE_ARRAY", 309 "DATA_TYPE_INT16_ARRAY", 310 "DATA_TYPE_UINT16_ARRAY", 311 "DATA_TYPE_INT32_ARRAY", 312 "DATA_TYPE_UINT32_ARRAY", 313 "DATA_TYPE_INT64_ARRAY", 314 "DATA_TYPE_UINT64_ARRAY", 315 "DATA_TYPE_STRING_ARRAY", 316 "DATA_TYPE_HRTIME", 317 "DATA_TYPE_NVLIST", 318 "DATA_TYPE_NVLIST_ARRAY", 319 "DATA_TYPE_BOOLEAN_VALUE", 320 "DATA_TYPE_INT8", 321 "DATA_TYPE_UINT8", 322 "DATA_TYPE_BOOLEAN_ARRAY", 323 "DATA_TYPE_INT8_ARRAY", 324 "DATA_TYPE_UINT8_ARRAY" 325 }; 326 327 unsigned int i, j; 328 const unsigned char *p, *pair; 329 int junk; 330 int encoded_size, decoded_size; 331 332 p = nvlist; 333 xdr_int(&p, &junk); 334 xdr_int(&p, &junk); 335 336 pair = p; 337 xdr_int(&p, &encoded_size); 338 xdr_int(&p, &decoded_size); 339 while (encoded_size && decoded_size) { 340 int namelen, pairtype, elements; 341 const char *pairname; 342 343 xdr_int(&p, &namelen); 344 pairname = (const char*) p; 345 p += roundup(namelen, 4); 346 xdr_int(&p, &pairtype); 347 348 for (i = 0; i < indent; i++) 349 printf(" "); 350 printf("%s %s", typenames[pairtype], pairname); 351 352 xdr_int(&p, &elements); 353 switch (pairtype) { 354 case DATA_TYPE_UINT64: { 355 uint64_t val; 356 xdr_uint64_t(&p, &val); 357 printf(" = 0x%jx\n", (uintmax_t)val); 358 break; 359 } 360 361 case DATA_TYPE_STRING: { 362 int len; 363 xdr_int(&p, &len); 364 printf(" = \"%s\"\n", p); 365 break; 366 } 367 368 case DATA_TYPE_NVLIST: 369 printf("\n"); 370 nvlist_print(p, indent + 1); 371 break; 372 373 case DATA_TYPE_NVLIST_ARRAY: 374 for (j = 0; j < elements; j++) { 375 printf("[%d]\n", j); 376 p = nvlist_print(p, indent + 1); 377 if (j != elements - 1) { 378 for (i = 0; i < indent; i++) 379 printf(" "); 380 printf("%s %s", typenames[pairtype], pairname); 381 } 382 } 383 break; 384 385 default: 386 printf("\n"); 387 } 388 389 p = pair + encoded_size; 390 391 pair = p; 392 xdr_int(&p, &encoded_size); 393 xdr_int(&p, &decoded_size); 394 } 395 396 return p; 397} 398 399#endif 400 401static int 402vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf, 403 off_t offset, size_t size) 404{ 405 size_t psize; 406 int rc; 407 408 if (!vdev->v_phys_read) 409 return (EIO); 410 411 if (bp) { 412 psize = BP_GET_PSIZE(bp); 413 } else { 414 psize = size; 415 } 416 417 /*printf("ZFS: reading %d bytes at 0x%jx to %p\n", psize, (uintmax_t)offset, buf);*/ 418 rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize); 419 if (rc) 420 return (rc); 421 if (bp && zio_checksum_verify(bp, buf)) 422 return (EIO); 423 424 return (0); 425} 426 427static int 428vdev_disk_read(vdev_t *vdev, const blkptr_t *bp, void *buf, 429 off_t offset, size_t bytes) 430{ 431 432 return (vdev_read_phys(vdev, bp, buf, 433 offset + VDEV_LABEL_START_SIZE, bytes)); 434} 435 436 437static int 438vdev_mirror_read(vdev_t *vdev, const blkptr_t *bp, void *buf, 439 off_t offset, size_t bytes) 440{ 441 vdev_t *kid; 442 int rc; 443 444 rc = EIO; 445 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) { 446 if (kid->v_state != VDEV_STATE_HEALTHY) 447 continue; 448 rc = kid->v_read(kid, bp, buf, offset, bytes); 449 if (!rc) 450 return (0); 451 } 452 453 return (rc); 454} 455 456static int 457vdev_replacing_read(vdev_t *vdev, const blkptr_t *bp, void *buf, 458 off_t offset, size_t bytes) 459{ 460 vdev_t *kid; 461 462 /* 463 * Here we should have two kids: 464 * First one which is the one we are replacing and we can trust 465 * only this one to have valid data, but it might not be present. 466 * Second one is that one we are replacing with. It is most likely 467 * healthy, but we can't trust it has needed data, so we won't use it. 468 */ 469 kid = STAILQ_FIRST(&vdev->v_children); 470 if (kid == NULL) 471 return (EIO); 472 if (kid->v_state != VDEV_STATE_HEALTHY) 473 return (EIO); 474 return (kid->v_read(kid, bp, buf, offset, bytes)); 475} 476 477static vdev_t * 478vdev_find(uint64_t guid) 479{ 480 vdev_t *vdev; 481 482 STAILQ_FOREACH(vdev, &zfs_vdevs, v_alllink) 483 if (vdev->v_guid == guid) 484 return (vdev); 485 486 return (0); 487} 488 489static vdev_t * 490vdev_create(uint64_t guid, vdev_read_t *read) 491{ 492 vdev_t *vdev; 493 494 vdev = malloc(sizeof(vdev_t)); 495 memset(vdev, 0, sizeof(vdev_t)); 496 STAILQ_INIT(&vdev->v_children); 497 vdev->v_guid = guid; 498 vdev->v_state = VDEV_STATE_OFFLINE; 499 vdev->v_read = read; 500 vdev->v_phys_read = 0; 501 vdev->v_read_priv = 0; 502 STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink); 503 504 return (vdev); 505} 506 507static int 508vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t *pvdev, 509 vdev_t **vdevp, int is_newer) 510{ 511 int rc; 512 uint64_t guid, id, ashift, nparity; 513 const char *type; 514 const char *path; 515 vdev_t *vdev, *kid; 516 const unsigned char *kids; 517 int nkids, i, is_new; 518 uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present; 519 520 if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID, 521 DATA_TYPE_UINT64, 0, &guid) 522 || nvlist_find(nvlist, ZPOOL_CONFIG_ID, 523 DATA_TYPE_UINT64, 0, &id) 524 || nvlist_find(nvlist, ZPOOL_CONFIG_TYPE, 525 DATA_TYPE_STRING, 0, &type)) { 526 printf("ZFS: can't find vdev details\n"); 527 return (ENOENT); 528 } 529 530 if (strcmp(type, VDEV_TYPE_MIRROR) 531 && strcmp(type, VDEV_TYPE_DISK) 532#ifdef ZFS_TEST 533 && strcmp(type, VDEV_TYPE_FILE) 534#endif 535 && strcmp(type, VDEV_TYPE_RAIDZ) 536 && strcmp(type, VDEV_TYPE_REPLACING)) { 537 printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n"); 538 return (EIO); 539 } 540 541 is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0; 542 543 nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, 0, 544 &is_offline); 545 nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, 0, 546 &is_removed); 547 nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, 0, 548 &is_faulted); 549 nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64, 0, 550 &is_degraded); 551 nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64, 0, 552 &isnt_present); 553 554 vdev = vdev_find(guid); 555 if (!vdev) { 556 is_new = 1; 557 558 if (!strcmp(type, VDEV_TYPE_MIRROR)) 559 vdev = vdev_create(guid, vdev_mirror_read); 560 else if (!strcmp(type, VDEV_TYPE_RAIDZ)) 561 vdev = vdev_create(guid, vdev_raidz_read); 562 else if (!strcmp(type, VDEV_TYPE_REPLACING)) 563 vdev = vdev_create(guid, vdev_replacing_read); 564 else 565 vdev = vdev_create(guid, vdev_disk_read); 566 567 vdev->v_id = id; 568 vdev->v_top = pvdev != NULL ? pvdev : vdev; 569 if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT, 570 DATA_TYPE_UINT64, 0, &ashift) == 0) 571 vdev->v_ashift = ashift; 572 else 573 vdev->v_ashift = 0; 574 if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY, 575 DATA_TYPE_UINT64, 0, &nparity) == 0) 576 vdev->v_nparity = nparity; 577 else 578 vdev->v_nparity = 0; 579 if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH, 580 DATA_TYPE_STRING, 0, &path) == 0) { 581 if (strncmp(path, "/dev/", 5) == 0) 582 path += 5; 583 vdev->v_name = strdup(path); 584 } else { 585 if (!strcmp(type, "raidz")) { 586 if (vdev->v_nparity == 1) 587 vdev->v_name = "raidz1"; 588 else if (vdev->v_nparity == 2) 589 vdev->v_name = "raidz2"; 590 else if (vdev->v_nparity == 3) 591 vdev->v_name = "raidz3"; 592 else { 593 printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n"); 594 return (EIO); 595 } 596 } else { 597 vdev->v_name = strdup(type); 598 } 599 } 600 } else { 601 is_new = 0; 602 } 603 604 if (is_new || is_newer) { 605 /* 606 * This is either new vdev or we've already seen this vdev, 607 * but from an older vdev label, so let's refresh its state 608 * from the newer label. 609 */ 610 if (is_offline) 611 vdev->v_state = VDEV_STATE_OFFLINE; 612 else if (is_removed) 613 vdev->v_state = VDEV_STATE_REMOVED; 614 else if (is_faulted) 615 vdev->v_state = VDEV_STATE_FAULTED; 616 else if (is_degraded) 617 vdev->v_state = VDEV_STATE_DEGRADED; 618 else if (isnt_present) 619 vdev->v_state = VDEV_STATE_CANT_OPEN; 620 } 621 622 rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, 623 DATA_TYPE_NVLIST_ARRAY, &nkids, &kids); 624 /* 625 * Its ok if we don't have any kids. 626 */ 627 if (rc == 0) { 628 vdev->v_nchildren = nkids; 629 for (i = 0; i < nkids; i++) { 630 rc = vdev_init_from_nvlist(kids, vdev, &kid, is_newer); 631 if (rc) 632 return (rc); 633 if (is_new) 634 STAILQ_INSERT_TAIL(&vdev->v_children, kid, 635 v_childlink); 636 kids = nvlist_next(kids); 637 } 638 } else { 639 vdev->v_nchildren = 0; 640 } 641 642 if (vdevp) 643 *vdevp = vdev; 644 return (0); 645} 646 647static void 648vdev_set_state(vdev_t *vdev) 649{ 650 vdev_t *kid; 651 int good_kids; 652 int bad_kids; 653 654 /* 655 * A mirror or raidz is healthy if all its kids are healthy. A 656 * mirror is degraded if any of its kids is healthy; a raidz 657 * is degraded if at most nparity kids are offline. 658 */ 659 if (STAILQ_FIRST(&vdev->v_children)) { 660 good_kids = 0; 661 bad_kids = 0; 662 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) { 663 if (kid->v_state == VDEV_STATE_HEALTHY) 664 good_kids++; 665 else 666 bad_kids++; 667 } 668 if (bad_kids == 0) { 669 vdev->v_state = VDEV_STATE_HEALTHY; 670 } else { 671 if (vdev->v_read == vdev_mirror_read) { 672 if (good_kids) { 673 vdev->v_state = VDEV_STATE_DEGRADED; 674 } else { 675 vdev->v_state = VDEV_STATE_OFFLINE; 676 } 677 } else if (vdev->v_read == vdev_raidz_read) { 678 if (bad_kids > vdev->v_nparity) { 679 vdev->v_state = VDEV_STATE_OFFLINE; 680 } else { 681 vdev->v_state = VDEV_STATE_DEGRADED; 682 } 683 } 684 } 685 } 686} 687 688static spa_t * 689spa_find_by_guid(uint64_t guid) 690{ 691 spa_t *spa; 692 693 STAILQ_FOREACH(spa, &zfs_pools, spa_link) 694 if (spa->spa_guid == guid) 695 return (spa); 696 697 return (0); 698} 699 700static spa_t * 701spa_find_by_name(const char *name) 702{ 703 spa_t *spa; 704 705 STAILQ_FOREACH(spa, &zfs_pools, spa_link) 706 if (!strcmp(spa->spa_name, name)) 707 return (spa); 708 709 return (0); 710} 711 712#ifdef BOOT2 713static spa_t * 714spa_get_primary(void) 715{ 716 717 return (STAILQ_FIRST(&zfs_pools)); 718} 719 720static vdev_t * 721spa_get_primary_vdev(const spa_t *spa) 722{ 723 vdev_t *vdev; 724 vdev_t *kid; 725 726 if (spa == NULL) 727 spa = spa_get_primary(); 728 if (spa == NULL) 729 return (NULL); 730 vdev = STAILQ_FIRST(&spa->spa_vdevs); 731 if (vdev == NULL) 732 return (NULL); 733 for (kid = STAILQ_FIRST(&vdev->v_children); kid != NULL; 734 kid = STAILQ_FIRST(&vdev->v_children)) 735 vdev = kid; 736 return (vdev); 737} 738#endif 739 740static spa_t * 741spa_create(uint64_t guid) 742{ 743 spa_t *spa; 744 745 spa = malloc(sizeof(spa_t)); 746 memset(spa, 0, sizeof(spa_t)); 747 STAILQ_INIT(&spa->spa_vdevs); 748 spa->spa_guid = guid; 749 STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link); 750 751 return (spa); 752} 753 754static const char * 755state_name(vdev_state_t state) 756{ 757 static const char* names[] = { 758 "UNKNOWN", 759 "CLOSED", 760 "OFFLINE", 761 "REMOVED", 762 "CANT_OPEN", 763 "FAULTED", 764 "DEGRADED", 765 "ONLINE" 766 }; 767 return names[state]; 768} 769 770#ifdef BOOT2 771 772#define pager_printf printf 773 774#else 775 776static void 777pager_printf(const char *fmt, ...) 778{ 779 char line[80]; 780 va_list args; 781 782 va_start(args, fmt); 783 vsprintf(line, fmt, args); 784 va_end(args); 785 pager_output(line); 786} 787 788#endif 789 790#define STATUS_FORMAT " %s %s\n" 791 792static void 793print_state(int indent, const char *name, vdev_state_t state) 794{ 795 int i; 796 char buf[512]; 797 798 buf[0] = 0; 799 for (i = 0; i < indent; i++) 800 strcat(buf, " "); 801 strcat(buf, name); 802 pager_printf(STATUS_FORMAT, buf, state_name(state)); 803 804} 805 806static void 807vdev_status(vdev_t *vdev, int indent) 808{ 809 vdev_t *kid; 810 print_state(indent, vdev->v_name, vdev->v_state); 811 812 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) { 813 vdev_status(kid, indent + 1); 814 } 815} 816 817static void 818spa_status(spa_t *spa) 819{ 820 static char bootfs[ZFS_MAXNAMELEN]; 821 uint64_t rootid; 822 vdev_t *vdev; 823 int good_kids, bad_kids, degraded_kids; 824 vdev_state_t state; 825 826 pager_printf(" pool: %s\n", spa->spa_name); 827 if (zfs_get_root(spa, &rootid) == 0 && 828 zfs_rlookup(spa, rootid, bootfs) == 0) { 829 if (bootfs[0] == '\0') 830 pager_printf("bootfs: %s\n", spa->spa_name); 831 else 832 pager_printf("bootfs: %s/%s\n", spa->spa_name, bootfs); 833 } 834 pager_printf("config:\n\n"); 835 pager_printf(STATUS_FORMAT, "NAME", "STATE"); 836 837 good_kids = 0; 838 degraded_kids = 0; 839 bad_kids = 0; 840 STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) { 841 if (vdev->v_state == VDEV_STATE_HEALTHY) 842 good_kids++; 843 else if (vdev->v_state == VDEV_STATE_DEGRADED) 844 degraded_kids++; 845 else 846 bad_kids++; 847 } 848 849 state = VDEV_STATE_CLOSED; 850 if (good_kids > 0 && (degraded_kids + bad_kids) == 0) 851 state = VDEV_STATE_HEALTHY; 852 else if ((good_kids + degraded_kids) > 0) 853 state = VDEV_STATE_DEGRADED; 854 855 print_state(0, spa->spa_name, state); 856 STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) { 857 vdev_status(vdev, 1); 858 } 859} 860 861static void 862spa_all_status(void) 863{ 864 spa_t *spa; 865 int first = 1; 866 867 STAILQ_FOREACH(spa, &zfs_pools, spa_link) { 868 if (!first) 869 pager_printf("\n"); 870 first = 0; 871 spa_status(spa); 872 } 873} 874 875static int 876vdev_probe(vdev_phys_read_t *read, void *read_priv, spa_t **spap) 877{ 878 vdev_t vtmp; 879 vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch; 880 spa_t *spa; 881 vdev_t *vdev, *top_vdev, *pool_vdev; 882 off_t off; 883 blkptr_t bp; 884 const unsigned char *nvlist; 885 uint64_t val; 886 uint64_t guid; 887 uint64_t pool_txg, pool_guid; 888 uint64_t is_log; 889 const char *pool_name; 890 const unsigned char *vdevs; 891 const unsigned char *features; 892 int i, rc, is_newer; 893 char *upbuf; 894 const struct uberblock *up; 895 896 /* 897 * Load the vdev label and figure out which 898 * uberblock is most current. 899 */ 900 memset(&vtmp, 0, sizeof(vtmp)); 901 vtmp.v_phys_read = read; 902 vtmp.v_read_priv = read_priv; 903 off = offsetof(vdev_label_t, vl_vdev_phys); 904 BP_ZERO(&bp); 905 BP_SET_LSIZE(&bp, sizeof(vdev_phys_t)); 906 BP_SET_PSIZE(&bp, sizeof(vdev_phys_t)); 907 BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL); 908 BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF); 909 DVA_SET_OFFSET(BP_IDENTITY(&bp), off); 910 ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0); 911 if (vdev_read_phys(&vtmp, &bp, vdev_label, off, 0)) 912 return (EIO); 913 914 if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR) { 915 return (EIO); 916 } 917 918 nvlist = (const unsigned char *) vdev_label->vp_nvlist + 4; 919 920 if (nvlist_find(nvlist, 921 ZPOOL_CONFIG_VERSION, 922 DATA_TYPE_UINT64, 0, &val)) { 923 return (EIO); 924 } 925 926 if (!SPA_VERSION_IS_SUPPORTED(val)) { 927 printf("ZFS: unsupported ZFS version %u (should be %u)\n", 928 (unsigned) val, (unsigned) SPA_VERSION); 929 return (EIO); 930 } 931 932 /* Check ZFS features for read */ 933 if (nvlist_find(nvlist, 934 ZPOOL_CONFIG_FEATURES_FOR_READ, 935 DATA_TYPE_NVLIST, 0, &features) == 0 936 && nvlist_check_features_for_read(features) != 0) 937 return (EIO); 938 939 if (nvlist_find(nvlist, 940 ZPOOL_CONFIG_POOL_STATE, 941 DATA_TYPE_UINT64, 0, &val)) { 942 return (EIO); 943 } 944 945 if (val == POOL_STATE_DESTROYED) { 946 /* We don't boot only from destroyed pools. */ 947 return (EIO); 948 } 949 950 if (nvlist_find(nvlist, 951 ZPOOL_CONFIG_POOL_TXG, 952 DATA_TYPE_UINT64, 0, &pool_txg) 953 || nvlist_find(nvlist, 954 ZPOOL_CONFIG_POOL_GUID, 955 DATA_TYPE_UINT64, 0, &pool_guid) 956 || nvlist_find(nvlist, 957 ZPOOL_CONFIG_POOL_NAME, 958 DATA_TYPE_STRING, 0, &pool_name)) { 959 /* 960 * Cache and spare devices end up here - just ignore 961 * them. 962 */ 963 /*printf("ZFS: can't find pool details\n");*/ 964 return (EIO); 965 } 966 967 is_log = 0; 968 (void) nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, 0, 969 &is_log); 970 if (is_log) 971 return (EIO); 972 973 /* 974 * Create the pool if this is the first time we've seen it. 975 */ 976 spa = spa_find_by_guid(pool_guid); 977 if (!spa) { 978 spa = spa_create(pool_guid); 979 spa->spa_name = strdup(pool_name); 980 } 981 if (pool_txg > spa->spa_txg) { 982 spa->spa_txg = pool_txg; 983 is_newer = 1; 984 } else 985 is_newer = 0; 986 987 /* 988 * Get the vdev tree and create our in-core copy of it. 989 * If we already have a vdev with this guid, this must 990 * be some kind of alias (overlapping slices, dangerously dedicated 991 * disks etc). 992 */ 993 if (nvlist_find(nvlist, 994 ZPOOL_CONFIG_GUID, 995 DATA_TYPE_UINT64, 0, &guid)) { 996 return (EIO); 997 } 998 vdev = vdev_find(guid); 999 if (vdev && vdev->v_phys_read) /* Has this vdev already been inited? */ 1000 return (EIO); 1001 1002 if (nvlist_find(nvlist, 1003 ZPOOL_CONFIG_VDEV_TREE, 1004 DATA_TYPE_NVLIST, 0, &vdevs)) { 1005 return (EIO); 1006 } 1007 1008 rc = vdev_init_from_nvlist(vdevs, NULL, &top_vdev, is_newer); 1009 if (rc) 1010 return (rc); 1011 1012 /* 1013 * Add the toplevel vdev to the pool if its not already there. 1014 */ 1015 STAILQ_FOREACH(pool_vdev, &spa->spa_vdevs, v_childlink) 1016 if (top_vdev == pool_vdev) 1017 break; 1018 if (!pool_vdev && top_vdev) 1019 STAILQ_INSERT_TAIL(&spa->spa_vdevs, top_vdev, v_childlink); 1020 1021 /* 1022 * We should already have created an incomplete vdev for this 1023 * vdev. Find it and initialise it with our read proc. 1024 */ 1025 vdev = vdev_find(guid); 1026 if (vdev) { 1027 vdev->v_phys_read = read; 1028 vdev->v_read_priv = read_priv; 1029 vdev->v_state = VDEV_STATE_HEALTHY; 1030 } else { 1031 printf("ZFS: inconsistent nvlist contents\n"); 1032 return (EIO); 1033 } 1034 1035 /* 1036 * Re-evaluate top-level vdev state. 1037 */ 1038 vdev_set_state(top_vdev); 1039 1040 /* 1041 * Ok, we are happy with the pool so far. Lets find 1042 * the best uberblock and then we can actually access 1043 * the contents of the pool. 1044 */ 1045 upbuf = zfs_alloc(VDEV_UBERBLOCK_SIZE(vdev)); 1046 up = (const struct uberblock *)upbuf; 1047 for (i = 0; 1048 i < VDEV_UBERBLOCK_COUNT(vdev); 1049 i++) { 1050 off = VDEV_UBERBLOCK_OFFSET(vdev, i); 1051 BP_ZERO(&bp); 1052 DVA_SET_OFFSET(&bp.blk_dva[0], off); 1053 BP_SET_LSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev)); 1054 BP_SET_PSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev)); 1055 BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL); 1056 BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF); 1057 ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0); 1058 1059 if (vdev_read_phys(vdev, &bp, upbuf, off, 0)) 1060 continue; 1061 1062 if (up->ub_magic != UBERBLOCK_MAGIC) 1063 continue; 1064 if (up->ub_txg < spa->spa_txg) 1065 continue; 1066 if (up->ub_txg > spa->spa_uberblock.ub_txg) { 1067 spa->spa_uberblock = *up; 1068 } else if (up->ub_txg == spa->spa_uberblock.ub_txg) { 1069 if (up->ub_timestamp > spa->spa_uberblock.ub_timestamp) 1070 spa->spa_uberblock = *up; 1071 } 1072 } 1073 zfs_free(upbuf, VDEV_UBERBLOCK_SIZE(vdev)); 1074 1075 if (spap) 1076 *spap = spa; 1077 return (0); 1078} 1079 1080static int 1081ilog2(int n) 1082{ 1083 int v; 1084 1085 for (v = 0; v < 32; v++) 1086 if (n == (1 << v)) 1087 return v; 1088 return -1; 1089} 1090 1091static int 1092zio_read_gang(const spa_t *spa, const blkptr_t *bp, void *buf) 1093{ 1094 blkptr_t gbh_bp; 1095 zio_gbh_phys_t zio_gb; 1096 char *pbuf; 1097 int i; 1098 1099 /* Artificial BP for gang block header. */ 1100 gbh_bp = *bp; 1101 BP_SET_PSIZE(&gbh_bp, SPA_GANGBLOCKSIZE); 1102 BP_SET_LSIZE(&gbh_bp, SPA_GANGBLOCKSIZE); 1103 BP_SET_CHECKSUM(&gbh_bp, ZIO_CHECKSUM_GANG_HEADER); 1104 BP_SET_COMPRESS(&gbh_bp, ZIO_COMPRESS_OFF); 1105 for (i = 0; i < SPA_DVAS_PER_BP; i++) 1106 DVA_SET_GANG(&gbh_bp.blk_dva[i], 0); 1107 1108 /* Read gang header block using the artificial BP. */ 1109 if (zio_read(spa, &gbh_bp, &zio_gb)) 1110 return (EIO); 1111 1112 pbuf = buf; 1113 for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { 1114 blkptr_t *gbp = &zio_gb.zg_blkptr[i]; 1115 1116 if (BP_IS_HOLE(gbp)) 1117 continue; 1118 if (zio_read(spa, gbp, pbuf)) 1119 return (EIO); 1120 pbuf += BP_GET_PSIZE(gbp); 1121 } 1122 1123 if (zio_checksum_verify(bp, buf)) 1124 return (EIO); 1125 return (0); 1126} 1127 1128static int 1129zio_read(const spa_t *spa, const blkptr_t *bp, void *buf) 1130{ 1131 int cpfunc = BP_GET_COMPRESS(bp); 1132 uint64_t align, size; 1133 void *pbuf; 1134 int i, error; 1135 1136 error = EIO; 1137 1138 for (i = 0; i < SPA_DVAS_PER_BP; i++) { 1139 const dva_t *dva = &bp->blk_dva[i]; 1140 vdev_t *vdev; 1141 int vdevid; 1142 off_t offset; 1143 1144 if (!dva->dva_word[0] && !dva->dva_word[1]) 1145 continue; 1146 1147 vdevid = DVA_GET_VDEV(dva); 1148 offset = DVA_GET_OFFSET(dva); 1149 STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) { 1150 if (vdev->v_id == vdevid) 1151 break; 1152 } 1153 if (!vdev || !vdev->v_read) 1154 continue; 1155 1156 size = BP_GET_PSIZE(bp); 1157 if (vdev->v_read == vdev_raidz_read) { 1158 align = 1ULL << vdev->v_top->v_ashift; 1159 if (P2PHASE(size, align) != 0) 1160 size = P2ROUNDUP(size, align); 1161 } 1162 if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF) 1163 pbuf = zfs_alloc(size); 1164 else 1165 pbuf = buf; 1166 1167 if (DVA_GET_GANG(dva)) 1168 error = zio_read_gang(spa, bp, pbuf); 1169 else 1170 error = vdev->v_read(vdev, bp, pbuf, offset, size); 1171 if (error == 0) { 1172 if (cpfunc != ZIO_COMPRESS_OFF) 1173 error = zio_decompress_data(cpfunc, pbuf, 1174 BP_GET_PSIZE(bp), buf, BP_GET_LSIZE(bp)); 1175 else if (size != BP_GET_PSIZE(bp)) 1176 bcopy(pbuf, buf, BP_GET_PSIZE(bp)); 1177 } 1178 if (buf != pbuf) 1179 zfs_free(pbuf, size); 1180 if (error == 0) 1181 break; 1182 } 1183 if (error != 0) 1184 printf("ZFS: i/o error - all block copies unavailable\n"); 1185 return (error); 1186} 1187 1188static int 1189dnode_read(const spa_t *spa, const dnode_phys_t *dnode, off_t offset, void *buf, size_t buflen) 1190{ 1191 int ibshift = dnode->dn_indblkshift - SPA_BLKPTRSHIFT; 1192 int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; 1193 int nlevels = dnode->dn_nlevels; 1194 int i, rc; 1195 1196 /* 1197 * Note: bsize may not be a power of two here so we need to do an 1198 * actual divide rather than a bitshift. 1199 */ 1200 while (buflen > 0) { 1201 uint64_t bn = offset / bsize; 1202 int boff = offset % bsize; 1203 int ibn; 1204 const blkptr_t *indbp; 1205 blkptr_t bp; 1206 1207 if (bn > dnode->dn_maxblkid) 1208 return (EIO); 1209 1210 if (dnode == dnode_cache_obj && bn == dnode_cache_bn) 1211 goto cached; 1212 1213 indbp = dnode->dn_blkptr; 1214 for (i = 0; i < nlevels; i++) { 1215 /* 1216 * Copy the bp from the indirect array so that 1217 * we can re-use the scratch buffer for multi-level 1218 * objects. 1219 */ 1220 ibn = bn >> ((nlevels - i - 1) * ibshift); 1221 ibn &= ((1 << ibshift) - 1); 1222 bp = indbp[ibn]; 1223 rc = zio_read(spa, &bp, dnode_cache_buf); 1224 if (rc) 1225 return (rc); 1226 indbp = (const blkptr_t *) dnode_cache_buf; 1227 } 1228 dnode_cache_obj = dnode; 1229 dnode_cache_bn = bn; 1230 cached: 1231 1232 /* 1233 * The buffer contains our data block. Copy what we 1234 * need from it and loop. 1235 */ 1236 i = bsize - boff; 1237 if (i > buflen) i = buflen; 1238 memcpy(buf, &dnode_cache_buf[boff], i); 1239 buf = ((char*) buf) + i; 1240 offset += i; 1241 buflen -= i; 1242 } 1243 1244 return (0); 1245} 1246 1247/* 1248 * Lookup a value in a microzap directory. Assumes that the zap 1249 * scratch buffer contains the directory contents. 1250 */ 1251static int 1252mzap_lookup(const dnode_phys_t *dnode, const char *name, uint64_t *value) 1253{ 1254 const mzap_phys_t *mz; 1255 const mzap_ent_phys_t *mze; 1256 size_t size; 1257 int chunks, i; 1258 1259 /* 1260 * Microzap objects use exactly one block. Read the whole 1261 * thing. 1262 */ 1263 size = dnode->dn_datablkszsec * 512; 1264 1265 mz = (const mzap_phys_t *) zap_scratch; 1266 chunks = size / MZAP_ENT_LEN - 1; 1267 1268 for (i = 0; i < chunks; i++) { 1269 mze = &mz->mz_chunk[i]; 1270 if (!strcmp(mze->mze_name, name)) { 1271 *value = mze->mze_value; 1272 return (0); 1273 } 1274 } 1275 1276 return (ENOENT); 1277} 1278 1279/* 1280 * Compare a name with a zap leaf entry. Return non-zero if the name 1281 * matches. 1282 */ 1283static int 1284fzap_name_equal(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, const char *name) 1285{ 1286 size_t namelen; 1287 const zap_leaf_chunk_t *nc; 1288 const char *p; 1289 1290 namelen = zc->l_entry.le_name_numints; 1291 1292 nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk); 1293 p = name; 1294 while (namelen > 0) { 1295 size_t len; 1296 len = namelen; 1297 if (len > ZAP_LEAF_ARRAY_BYTES) 1298 len = ZAP_LEAF_ARRAY_BYTES; 1299 if (memcmp(p, nc->l_array.la_array, len)) 1300 return (0); 1301 p += len; 1302 namelen -= len; 1303 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next); 1304 } 1305 1306 return 1; 1307} 1308 1309/* 1310 * Extract a uint64_t value from a zap leaf entry. 1311 */ 1312static uint64_t 1313fzap_leaf_value(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc) 1314{ 1315 const zap_leaf_chunk_t *vc; 1316 int i; 1317 uint64_t value; 1318 const uint8_t *p; 1319 1320 vc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_value_chunk); 1321 for (i = 0, value = 0, p = vc->l_array.la_array; i < 8; i++) { 1322 value = (value << 8) | p[i]; 1323 } 1324 1325 return value; 1326} 1327 1328/* 1329 * Lookup a value in a fatzap directory. Assumes that the zap scratch 1330 * buffer contains the directory header. 1331 */ 1332static int 1333fzap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value) 1334{ 1335 int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; 1336 zap_phys_t zh = *(zap_phys_t *) zap_scratch; 1337 fat_zap_t z; 1338 uint64_t *ptrtbl; 1339 uint64_t hash; 1340 int rc; 1341 1342 if (zh.zap_magic != ZAP_MAGIC) 1343 return (EIO); 1344 1345 z.zap_block_shift = ilog2(bsize); 1346 z.zap_phys = (zap_phys_t *) zap_scratch; 1347 1348 /* 1349 * Figure out where the pointer table is and read it in if necessary. 1350 */ 1351 if (zh.zap_ptrtbl.zt_blk) { 1352 rc = dnode_read(spa, dnode, zh.zap_ptrtbl.zt_blk * bsize, 1353 zap_scratch, bsize); 1354 if (rc) 1355 return (rc); 1356 ptrtbl = (uint64_t *) zap_scratch; 1357 } else { 1358 ptrtbl = &ZAP_EMBEDDED_PTRTBL_ENT(&z, 0); 1359 } 1360 1361 hash = zap_hash(zh.zap_salt, name); 1362 1363 zap_leaf_t zl; 1364 zl.l_bs = z.zap_block_shift; 1365 1366 off_t off = ptrtbl[hash >> (64 - zh.zap_ptrtbl.zt_shift)] << zl.l_bs; 1367 zap_leaf_chunk_t *zc; 1368 1369 rc = dnode_read(spa, dnode, off, zap_scratch, bsize); 1370 if (rc) 1371 return (rc); 1372 1373 zl.l_phys = (zap_leaf_phys_t *) zap_scratch; 1374 1375 /* 1376 * Make sure this chunk matches our hash. 1377 */ 1378 if (zl.l_phys->l_hdr.lh_prefix_len > 0 1379 && zl.l_phys->l_hdr.lh_prefix 1380 != hash >> (64 - zl.l_phys->l_hdr.lh_prefix_len)) 1381 return (ENOENT); 1382 1383 /* 1384 * Hash within the chunk to find our entry. 1385 */ 1386 int shift = (64 - ZAP_LEAF_HASH_SHIFT(&zl) - zl.l_phys->l_hdr.lh_prefix_len); 1387 int h = (hash >> shift) & ((1 << ZAP_LEAF_HASH_SHIFT(&zl)) - 1); 1388 h = zl.l_phys->l_hash[h]; 1389 if (h == 0xffff) 1390 return (ENOENT); 1391 zc = &ZAP_LEAF_CHUNK(&zl, h); 1392 while (zc->l_entry.le_hash != hash) { 1393 if (zc->l_entry.le_next == 0xffff) { 1394 zc = 0; 1395 break; 1396 } 1397 zc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_next); 1398 } 1399 if (fzap_name_equal(&zl, zc, name)) { 1400 if (zc->l_entry.le_value_intlen * zc->l_entry.le_value_numints > 8) 1401 return (E2BIG); 1402 *value = fzap_leaf_value(&zl, zc); 1403 return (0); 1404 } 1405 1406 return (ENOENT); 1407} 1408 1409/* 1410 * Lookup a name in a zap object and return its value as a uint64_t. 1411 */ 1412static int 1413zap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value) 1414{ 1415 int rc; 1416 uint64_t zap_type; 1417 size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; 1418 1419 rc = dnode_read(spa, dnode, 0, zap_scratch, size); 1420 if (rc) 1421 return (rc); 1422 1423 zap_type = *(uint64_t *) zap_scratch; 1424 if (zap_type == ZBT_MICRO) 1425 return mzap_lookup(dnode, name, value); 1426 else if (zap_type == ZBT_HEADER) 1427 return fzap_lookup(spa, dnode, name, value); 1428 printf("ZFS: invalid zap_type=%d\n", (int)zap_type); 1429 return (EIO); 1430} 1431 1432/* 1433 * List a microzap directory. Assumes that the zap scratch buffer contains 1434 * the directory contents. 1435 */ 1436static int 1437mzap_list(const dnode_phys_t *dnode) 1438{ 1439 const mzap_phys_t *mz; 1440 const mzap_ent_phys_t *mze; 1441 size_t size; 1442 int chunks, i; 1443 1444 /* 1445 * Microzap objects use exactly one block. Read the whole 1446 * thing. 1447 */ 1448 size = dnode->dn_datablkszsec * 512; 1449 mz = (const mzap_phys_t *) zap_scratch; 1450 chunks = size / MZAP_ENT_LEN - 1; 1451 1452 for (i = 0; i < chunks; i++) { 1453 mze = &mz->mz_chunk[i]; 1454 if (mze->mze_name[0]) 1455 //printf("%-32s 0x%jx\n", mze->mze_name, (uintmax_t)mze->mze_value); 1456 printf("%s\n", mze->mze_name); 1457 } 1458 1459 return (0); 1460} 1461 1462/* 1463 * List a fatzap directory. Assumes that the zap scratch buffer contains 1464 * the directory header. 1465 */ 1466static int 1467fzap_list(const spa_t *spa, const dnode_phys_t *dnode) 1468{ 1469 int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; 1470 zap_phys_t zh = *(zap_phys_t *) zap_scratch; 1471 fat_zap_t z; 1472 int i, j; 1473 1474 if (zh.zap_magic != ZAP_MAGIC) 1475 return (EIO); 1476 1477 z.zap_block_shift = ilog2(bsize); 1478 z.zap_phys = (zap_phys_t *) zap_scratch; 1479 1480 /* 1481 * This assumes that the leaf blocks start at block 1. The 1482 * documentation isn't exactly clear on this. 1483 */ 1484 zap_leaf_t zl; 1485 zl.l_bs = z.zap_block_shift; 1486 for (i = 0; i < zh.zap_num_leafs; i++) { 1487 off_t off = (i + 1) << zl.l_bs; 1488 char name[256], *p; 1489 uint64_t value; 1490 1491 if (dnode_read(spa, dnode, off, zap_scratch, bsize)) 1492 return (EIO); 1493 1494 zl.l_phys = (zap_leaf_phys_t *) zap_scratch; 1495 1496 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) { 1497 zap_leaf_chunk_t *zc, *nc; 1498 int namelen; 1499 1500 zc = &ZAP_LEAF_CHUNK(&zl, j); 1501 if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY) 1502 continue; 1503 namelen = zc->l_entry.le_name_numints; 1504 if (namelen > sizeof(name)) 1505 namelen = sizeof(name); 1506 1507 /* 1508 * Paste the name back together. 1509 */ 1510 nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk); 1511 p = name; 1512 while (namelen > 0) { 1513 int len; 1514 len = namelen; 1515 if (len > ZAP_LEAF_ARRAY_BYTES) 1516 len = ZAP_LEAF_ARRAY_BYTES; 1517 memcpy(p, nc->l_array.la_array, len); 1518 p += len; 1519 namelen -= len; 1520 nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next); 1521 } 1522 1523 /* 1524 * Assume the first eight bytes of the value are 1525 * a uint64_t. 1526 */ 1527 value = fzap_leaf_value(&zl, zc); 1528 1529 //printf("%s 0x%jx\n", name, (uintmax_t)value); 1530 printf("%s\n", name); 1531 } 1532 } 1533 1534 return (0); 1535} 1536 1537/* 1538 * List a zap directory. 1539 */ 1540static int 1541zap_list(const spa_t *spa, const dnode_phys_t *dnode) 1542{ 1543 uint64_t zap_type; 1544 size_t size = dnode->dn_datablkszsec * 512; 1545 1546 if (dnode_read(spa, dnode, 0, zap_scratch, size)) 1547 return (EIO); 1548 1549 zap_type = *(uint64_t *) zap_scratch; 1550 if (zap_type == ZBT_MICRO) 1551 return mzap_list(dnode); 1552 else 1553 return fzap_list(spa, dnode); 1554} 1555 1556static int 1557objset_get_dnode(const spa_t *spa, const objset_phys_t *os, uint64_t objnum, dnode_phys_t *dnode) 1558{ 1559 off_t offset; 1560 1561 offset = objnum * sizeof(dnode_phys_t); 1562 return dnode_read(spa, &os->os_meta_dnode, offset, 1563 dnode, sizeof(dnode_phys_t)); 1564} 1565 1566static int 1567mzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value) 1568{ 1569 const mzap_phys_t *mz; 1570 const mzap_ent_phys_t *mze; 1571 size_t size; 1572 int chunks, i; 1573 1574 /* 1575 * Microzap objects use exactly one block. Read the whole 1576 * thing. 1577 */ 1578 size = dnode->dn_datablkszsec * 512; 1579 1580 mz = (const mzap_phys_t *) zap_scratch; 1581 chunks = size / MZAP_ENT_LEN - 1; 1582 1583 for (i = 0; i < chunks; i++) { 1584 mze = &mz->mz_chunk[i]; 1585 if (value == mze->mze_value) { 1586 strcpy(name, mze->mze_name); 1587 return (0); 1588 } 1589 } 1590 1591 return (ENOENT); 1592} 1593 1594static void 1595fzap_name_copy(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, char *name) 1596{ 1597 size_t namelen; 1598 const zap_leaf_chunk_t *nc; 1599 char *p; 1600 1601 namelen = zc->l_entry.le_name_numints; 1602 1603 nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk); 1604 p = name; 1605 while (namelen > 0) { 1606 size_t len; 1607 len = namelen; 1608 if (len > ZAP_LEAF_ARRAY_BYTES) 1609 len = ZAP_LEAF_ARRAY_BYTES; 1610 memcpy(p, nc->l_array.la_array, len); 1611 p += len; 1612 namelen -= len; 1613 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next); 1614 } 1615 1616 *p = '\0'; 1617} 1618 1619static int 1620fzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value) 1621{ 1622 int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; 1623 zap_phys_t zh = *(zap_phys_t *) zap_scratch; 1624 fat_zap_t z; 1625 int i, j; 1626 1627 if (zh.zap_magic != ZAP_MAGIC) 1628 return (EIO); 1629 1630 z.zap_block_shift = ilog2(bsize); 1631 z.zap_phys = (zap_phys_t *) zap_scratch; 1632 1633 /* 1634 * This assumes that the leaf blocks start at block 1. The 1635 * documentation isn't exactly clear on this. 1636 */ 1637 zap_leaf_t zl; 1638 zl.l_bs = z.zap_block_shift; 1639 for (i = 0; i < zh.zap_num_leafs; i++) { 1640 off_t off = (i + 1) << zl.l_bs; 1641 1642 if (dnode_read(spa, dnode, off, zap_scratch, bsize)) 1643 return (EIO); 1644 1645 zl.l_phys = (zap_leaf_phys_t *) zap_scratch; 1646 1647 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) { 1648 zap_leaf_chunk_t *zc; 1649 1650 zc = &ZAP_LEAF_CHUNK(&zl, j); 1651 if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY) 1652 continue; 1653 if (zc->l_entry.le_value_intlen != 8 || 1654 zc->l_entry.le_value_numints != 1) 1655 continue; 1656 1657 if (fzap_leaf_value(&zl, zc) == value) { 1658 fzap_name_copy(&zl, zc, name); 1659 return (0); 1660 } 1661 } 1662 } 1663 1664 return (ENOENT); 1665} 1666 1667static int 1668zap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value) 1669{ 1670 int rc; 1671 uint64_t zap_type; 1672 size_t size = dnode->dn_datablkszsec * 512; 1673 1674 rc = dnode_read(spa, dnode, 0, zap_scratch, size); 1675 if (rc) 1676 return (rc); 1677 1678 zap_type = *(uint64_t *) zap_scratch; 1679 if (zap_type == ZBT_MICRO) 1680 return mzap_rlookup(spa, dnode, name, value); 1681 else 1682 return fzap_rlookup(spa, dnode, name, value); 1683} 1684 1685static int 1686zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result) 1687{ 1688 char name[256]; 1689 char component[256]; 1690 uint64_t dir_obj, parent_obj, child_dir_zapobj; 1691 dnode_phys_t child_dir_zap, dataset, dir, parent; 1692 dsl_dir_phys_t *dd; 1693 dsl_dataset_phys_t *ds; 1694 char *p; 1695 int len; 1696 1697 p = &name[sizeof(name) - 1]; 1698 *p = '\0'; 1699 1700 if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) { 1701 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum); 1702 return (EIO); 1703 } 1704 ds = (dsl_dataset_phys_t *)&dataset.dn_bonus; 1705 dir_obj = ds->ds_dir_obj; 1706 1707 for (;;) { 1708 if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir) != 0) 1709 return (EIO); 1710 dd = (dsl_dir_phys_t *)&dir.dn_bonus; 1711 1712 /* Actual loop condition. */ 1713 parent_obj = dd->dd_parent_obj; 1714 if (parent_obj == 0) 1715 break; 1716 1717 if (objset_get_dnode(spa, &spa->spa_mos, parent_obj, &parent) != 0) 1718 return (EIO); 1719 dd = (dsl_dir_phys_t *)&parent.dn_bonus; 1720 child_dir_zapobj = dd->dd_child_dir_zapobj; 1721 if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0) 1722 return (EIO); 1723 if (zap_rlookup(spa, &child_dir_zap, component, dir_obj) != 0) 1724 return (EIO); 1725 1726 len = strlen(component); 1727 p -= len; 1728 memcpy(p, component, len); 1729 --p; 1730 *p = '/'; 1731 1732 /* Actual loop iteration. */ 1733 dir_obj = parent_obj; 1734 } 1735 1736 if (*p != '\0') 1737 ++p; 1738 strcpy(result, p); 1739 1740 return (0); 1741} 1742 1743static int 1744zfs_lookup_dataset(const spa_t *spa, const char *name, uint64_t *objnum) 1745{ 1746 char element[256]; 1747 uint64_t dir_obj, child_dir_zapobj; 1748 dnode_phys_t child_dir_zap, dir; 1749 dsl_dir_phys_t *dd; 1750 const char *p, *q; 1751 1752 if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir)) 1753 return (EIO); 1754 if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, &dir_obj)) 1755 return (EIO); 1756 1757 p = name; 1758 for (;;) { 1759 if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir)) 1760 return (EIO); 1761 dd = (dsl_dir_phys_t *)&dir.dn_bonus; 1762 1763 while (*p == '/') 1764 p++; 1765 /* Actual loop condition #1. */ 1766 if (*p == '\0') 1767 break; 1768 1769 q = strchr(p, '/'); 1770 if (q) { 1771 memcpy(element, p, q - p); 1772 element[q - p] = '\0'; 1773 p = q + 1; 1774 } else { 1775 strcpy(element, p); 1776 p += strlen(p); 1777 } 1778 1779 child_dir_zapobj = dd->dd_child_dir_zapobj; 1780 if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0) 1781 return (EIO); 1782 1783 /* Actual loop condition #2. */ 1784 if (zap_lookup(spa, &child_dir_zap, element, &dir_obj) != 0) 1785 return (ENOENT); 1786 } 1787 1788 *objnum = dd->dd_head_dataset_obj; 1789 return (0); 1790} 1791 1792#ifndef BOOT2 1793static int 1794zfs_list_dataset(const spa_t *spa, uint64_t objnum/*, int pos, char *entry*/) 1795{ 1796 uint64_t dir_obj, child_dir_zapobj; 1797 dnode_phys_t child_dir_zap, dir, dataset; 1798 dsl_dataset_phys_t *ds; 1799 dsl_dir_phys_t *dd; 1800 1801 if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) { 1802 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum); 1803 return (EIO); 1804 } 1805 ds = (dsl_dataset_phys_t *) &dataset.dn_bonus; 1806 dir_obj = ds->ds_dir_obj; 1807 1808 if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir)) { 1809 printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj); 1810 return (EIO); 1811 } 1812 dd = (dsl_dir_phys_t *)&dir.dn_bonus; 1813 1814 child_dir_zapobj = dd->dd_child_dir_zapobj; 1815 if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0) { 1816 printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj); 1817 return (EIO); 1818 } 1819 1820 return (zap_list(spa, &child_dir_zap) != 0); 1821} 1822#endif 1823 1824/* 1825 * Find the object set given the object number of its dataset object 1826 * and return its details in *objset 1827 */ 1828static int 1829zfs_mount_dataset(const spa_t *spa, uint64_t objnum, objset_phys_t *objset) 1830{ 1831 dnode_phys_t dataset; 1832 dsl_dataset_phys_t *ds; 1833 1834 if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) { 1835 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum); 1836 return (EIO); 1837 } 1838 1839 ds = (dsl_dataset_phys_t *) &dataset.dn_bonus; 1840 if (zio_read(spa, &ds->ds_bp, objset)) { 1841 printf("ZFS: can't read object set for dataset %ju\n", 1842 (uintmax_t)objnum); 1843 return (EIO); 1844 } 1845 1846 return (0); 1847} 1848 1849/* 1850 * Find the object set pointed to by the BOOTFS property or the root 1851 * dataset if there is none and return its details in *objset 1852 */ 1853static int 1854zfs_get_root(const spa_t *spa, uint64_t *objid) 1855{ 1856 dnode_phys_t dir, propdir; 1857 uint64_t props, bootfs, root; 1858 1859 *objid = 0; 1860 1861 /* 1862 * Start with the MOS directory object. 1863 */ 1864 if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir)) { 1865 printf("ZFS: can't read MOS object directory\n"); 1866 return (EIO); 1867 } 1868 1869 /* 1870 * Lookup the pool_props and see if we can find a bootfs. 1871 */ 1872 if (zap_lookup(spa, &dir, DMU_POOL_PROPS, &props) == 0 1873 && objset_get_dnode(spa, &spa->spa_mos, props, &propdir) == 0 1874 && zap_lookup(spa, &propdir, "bootfs", &bootfs) == 0 1875 && bootfs != 0) 1876 { 1877 *objid = bootfs; 1878 return (0); 1879 } 1880 /* 1881 * Lookup the root dataset directory 1882 */ 1883 if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, &root) 1884 || objset_get_dnode(spa, &spa->spa_mos, root, &dir)) { 1885 printf("ZFS: can't find root dsl_dir\n"); 1886 return (EIO); 1887 } 1888 1889 /* 1890 * Use the information from the dataset directory's bonus buffer 1891 * to find the dataset object and from that the object set itself. 1892 */ 1893 dsl_dir_phys_t *dd = (dsl_dir_phys_t *) &dir.dn_bonus; 1894 *objid = dd->dd_head_dataset_obj; 1895 return (0); 1896} 1897 1898static int 1899zfs_mount(const spa_t *spa, uint64_t rootobj, struct zfsmount *mount) 1900{ 1901 1902 mount->spa = spa; 1903 1904 /* 1905 * Find the root object set if not explicitly provided 1906 */ 1907 if (rootobj == 0 && zfs_get_root(spa, &rootobj)) { 1908 printf("ZFS: can't find root filesystem\n"); 1909 return (EIO); 1910 } 1911 1912 if (zfs_mount_dataset(spa, rootobj, &mount->objset)) { 1913 printf("ZFS: can't open root filesystem\n"); 1914 return (EIO); 1915 } 1916 1917 mount->rootobj = rootobj; 1918 1919 return (0); 1920} 1921 1922static int 1923zfs_spa_init(spa_t *spa) 1924{ 1925 1926 if (zio_read(spa, &spa->spa_uberblock.ub_rootbp, &spa->spa_mos)) { 1927 printf("ZFS: can't read MOS of pool %s\n", spa->spa_name); 1928 return (EIO); 1929 } 1930 if (spa->spa_mos.os_type != DMU_OST_META) { 1931 printf("ZFS: corrupted MOS of pool %s\n", spa->spa_name); 1932 return (EIO); 1933 } 1934 return (0); 1935} 1936 1937static int 1938zfs_dnode_stat(const spa_t *spa, dnode_phys_t *dn, struct stat *sb) 1939{ 1940 1941 if (dn->dn_bonustype != DMU_OT_SA) { 1942 znode_phys_t *zp = (znode_phys_t *)dn->dn_bonus; 1943 1944 sb->st_mode = zp->zp_mode; 1945 sb->st_uid = zp->zp_uid; 1946 sb->st_gid = zp->zp_gid; 1947 sb->st_size = zp->zp_size; 1948 } else { 1949 sa_hdr_phys_t *sahdrp; 1950 int hdrsize; 1951 size_t size = 0; 1952 void *buf = NULL; 1953 1954 if (dn->dn_bonuslen != 0) 1955 sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn); 1956 else { 1957 if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0) { 1958 blkptr_t *bp = &dn->dn_spill; 1959 int error; 1960 1961 size = BP_GET_LSIZE(bp); 1962 buf = zfs_alloc(size); 1963 error = zio_read(spa, bp, buf); 1964 if (error != 0) { 1965 zfs_free(buf, size); 1966 return (error); 1967 } 1968 sahdrp = buf; 1969 } else { 1970 return (EIO); 1971 } 1972 } 1973 hdrsize = SA_HDR_SIZE(sahdrp); 1974 sb->st_mode = *(uint64_t *)((char *)sahdrp + hdrsize + 1975 SA_MODE_OFFSET); 1976 sb->st_uid = *(uint64_t *)((char *)sahdrp + hdrsize + 1977 SA_UID_OFFSET); 1978 sb->st_gid = *(uint64_t *)((char *)sahdrp + hdrsize + 1979 SA_GID_OFFSET); 1980 sb->st_size = *(uint64_t *)((char *)sahdrp + hdrsize + 1981 SA_SIZE_OFFSET); 1982 if (buf != NULL) 1983 zfs_free(buf, size); 1984 } 1985 1986 return (0); 1987} 1988 1989/* 1990 * Lookup a file and return its dnode. 1991 */ 1992static int 1993zfs_lookup(const struct zfsmount *mount, const char *upath, dnode_phys_t *dnode) 1994{ 1995 int rc; 1996 uint64_t objnum, rootnum, parentnum; 1997 const spa_t *spa; 1998 dnode_phys_t dn; 1999 const char *p, *q; 2000 char element[256]; 2001 char path[1024]; 2002 int symlinks_followed = 0; 2003 struct stat sb; 2004 2005 spa = mount->spa; 2006 if (mount->objset.os_type != DMU_OST_ZFS) { 2007 printf("ZFS: unexpected object set type %ju\n", 2008 (uintmax_t)mount->objset.os_type); 2009 return (EIO); 2010 } 2011 2012 /* 2013 * Get the root directory dnode. 2014 */ 2015 rc = objset_get_dnode(spa, &mount->objset, MASTER_NODE_OBJ, &dn); 2016 if (rc) 2017 return (rc); 2018 2019 rc = zap_lookup(spa, &dn, ZFS_ROOT_OBJ, &rootnum); 2020 if (rc) 2021 return (rc); 2022 2023 rc = objset_get_dnode(spa, &mount->objset, rootnum, &dn); 2024 if (rc) 2025 return (rc); 2026 2027 objnum = rootnum; 2028 p = upath; 2029 while (p && *p) { 2030 while (*p == '/') 2031 p++; 2032 if (!*p) 2033 break; 2034 q = strchr(p, '/'); 2035 if (q) { 2036 memcpy(element, p, q - p); 2037 element[q - p] = 0; 2038 p = q; 2039 } else { 2040 strcpy(element, p); 2041 p = 0; 2042 } 2043 2044 rc = zfs_dnode_stat(spa, &dn, &sb); 2045 if (rc) 2046 return (rc); 2047 if (!S_ISDIR(sb.st_mode)) 2048 return (ENOTDIR); 2049 2050 parentnum = objnum; 2051 rc = zap_lookup(spa, &dn, element, &objnum); 2052 if (rc) 2053 return (rc); 2054 objnum = ZFS_DIRENT_OBJ(objnum); 2055 2056 rc = objset_get_dnode(spa, &mount->objset, objnum, &dn); 2057 if (rc) 2058 return (rc); 2059 2060 /* 2061 * Check for symlink. 2062 */ 2063 rc = zfs_dnode_stat(spa, &dn, &sb); 2064 if (rc) 2065 return (rc); 2066 if (S_ISLNK(sb.st_mode)) { 2067 if (symlinks_followed > 10) 2068 return (EMLINK); 2069 symlinks_followed++; 2070 2071 /* 2072 * Read the link value and copy the tail of our 2073 * current path onto the end. 2074 */ 2075 if (p) 2076 strcpy(&path[sb.st_size], p); 2077 else 2078 path[sb.st_size] = 0; 2079 if (sb.st_size + sizeof(znode_phys_t) <= dn.dn_bonuslen) { 2080 memcpy(path, &dn.dn_bonus[sizeof(znode_phys_t)], 2081 sb.st_size); 2082 } else { 2083 rc = dnode_read(spa, &dn, 0, path, sb.st_size); 2084 if (rc) 2085 return (rc); 2086 } 2087 2088 /* 2089 * Restart with the new path, starting either at 2090 * the root or at the parent depending whether or 2091 * not the link is relative. 2092 */ 2093 p = path; 2094 if (*p == '/') 2095 objnum = rootnum; 2096 else 2097 objnum = parentnum; 2098 objset_get_dnode(spa, &mount->objset, objnum, &dn); 2099 } 2100 } 2101 2102 *dnode = dn; 2103 return (0); 2104} 2105