/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright 2017 Nexenta Systems, Inc.
 * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright 2016 Toomas Soome <tsoome@me.com>
 * Copyright 2017 Joyent, Inc.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/bpobj.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_dir.h>
#include <sys/vdev_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/space_map.h>
#include <sys/space_reftree.h>
#include <sys/zio.h>
#include <sys/zap.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/zil.h>
#include <sys/dsl_scan.h>
#include <sys/abd.h>
#include <sys/trim_map.h>
#include <sys/vdev_initialize.h>

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV");

/*
 * Virtual device management.
 */

/*
 * The limit for ZFS to automatically increase a top-level vdev's ashift
 * from logical ashift to physical ashift.
 *
 * Example: one or more 512B emulation child vdevs
 *	child->vdev_ashift = 9 (512 bytes)
 *	child->vdev_physical_ashift = 12 (4096 bytes)
 *	zfs_max_auto_ashift = 11 (2048 bytes)
 *	zfs_min_auto_ashift = 9 (512 bytes)
 *
 * On pool creation or the addition of a new top-level vdev, ZFS will
 * increase the ashift of the top-level vdev to 2048 as limited by
 * zfs_max_auto_ashift.
 *
 * Example: one or more 512B emulation child vdevs
 *	child->vdev_ashift = 9 (512 bytes)
 *	child->vdev_physical_ashift = 12 (4096 bytes)
 *	zfs_max_auto_ashift = 13 (8192 bytes)
 *	zfs_min_auto_ashift = 9 (512 bytes)
 *
 * On pool creation or the addition of a new top-level vdev, ZFS will
 * increase the ashift of the top-level vdev to 4096 to match the
 * max vdev_physical_ashift.
 *
 * Example: one or more 512B emulation child vdevs
 *	child->vdev_ashift = 9 (512 bytes)
 *	child->vdev_physical_ashift = 9 (512 bytes)
 *	zfs_max_auto_ashift = 13 (8192 bytes)
 *	zfs_min_auto_ashift = 12 (4096 bytes)
 *
 * On pool creation or the addition of a new top-level vdev, ZFS will
 * increase the ashift of the top-level vdev to 4096 to match the
 * zfs_min_auto_ashift.
 */
static uint64_t zfs_max_auto_ashift = SPA_MAXASHIFT;
static uint64_t zfs_min_auto_ashift = SPA_MINASHIFT;

static int
sysctl_vfs_zfs_max_auto_ashift(SYSCTL_HANDLER_ARGS)
{
	uint64_t val;
	int err;

	val = zfs_max_auto_ashift;
	err = sysctl_handle_64(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (val > SPA_MAXASHIFT || val < zfs_min_auto_ashift)
		return (EINVAL);

	zfs_max_auto_ashift = val;

	return (0);
}
SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift,
    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
    sysctl_vfs_zfs_max_auto_ashift, "QU",
    "Max ashift used when optimising for logical -> physical sectors size on "
    "new top-level vdevs.");

static int
sysctl_vfs_zfs_min_auto_ashift(SYSCTL_HANDLER_ARGS)
{
	uint64_t val;
	int err;

	val = zfs_min_auto_ashift;
	err = sysctl_handle_64(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (val < SPA_MINASHIFT || val > zfs_max_auto_ashift)
		return (EINVAL);

	zfs_min_auto_ashift = val;

	return (0);
}
SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift,
    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
    sysctl_vfs_zfs_min_auto_ashift, "QU",
    "Min ashift used when creating new top-level vdevs.");

static vdev_ops_t *vdev_ops_table[] = {
	&vdev_root_ops,
	&vdev_raidz_ops,
	&vdev_mirror_ops,
	&vdev_replacing_ops,
	&vdev_spare_ops,
#ifdef _KERNEL
	&vdev_geom_ops,
#else
	&vdev_disk_ops,
#endif
	&vdev_file_ops,
	&vdev_missing_ops,
	&vdev_hole_ops,
	&vdev_indirect_ops,
	NULL
};


/* target number of metaslabs per top-level vdev */
int vdev_max_ms_count = 200;
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_count, CTLFLAG_RWTUN,
    &vdev_max_ms_count, 0,
    "Target number of metaslabs per top-level vdev");

/* minimum number of metaslabs per top-level vdev */
int vdev_min_ms_count = 16;
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, min_ms_count, CTLFLAG_RWTUN,
    &vdev_min_ms_count, 0,
    "Minimum number of metaslabs per top-level vdev");

/* practical upper limit of total metaslabs per top-level vdev */
int vdev_ms_count_limit = 1ULL << 17;
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_count_limit, CTLFLAG_RWTUN,
    &vdev_ms_count_limit, 0,
    "Maximum number of metaslabs per top-level vdev");

/* lower limit for metaslab size (512M) */
int vdev_default_ms_shift = 29;
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, default_ms_shift, CTLFLAG_RWTUN,
    &vdev_default_ms_shift, 0,
    "Default shift between vdev size and number of metaslabs");

/* upper limit for metaslab size (256G) */
int vdev_max_ms_shift = 38;
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_shift, CTLFLAG_RWTUN,
    &vdev_max_ms_shift, 0,
    "Maximum shift between vdev size and number of metaslabs");

boolean_t vdev_validate_skip = B_FALSE;
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, validate_skip, CTLFLAG_RWTUN,
    &vdev_validate_skip, 0,
    "Bypass vdev validation");

/*
 * Since the DTL space map of a vdev is not expected to have a lot of
 * entries, we default its block size to 4K.
 */
int vdev_dtl_sm_blksz = (1 << 12);
SYSCTL_INT(_vfs_zfs, OID_AUTO, dtl_sm_blksz, CTLFLAG_RDTUN,
    &vdev_dtl_sm_blksz, 0,
    "Block size for DTL space map. Power of 2 and greater than 4096.");

/*
 * vdev-wide space maps that have lots of entries written to them at
 * the end of each transaction can benefit from a higher I/O bandwidth
 * (e.g. vdev_obsolete_sm), thus we default their block size to 128K.
 */
int vdev_standard_sm_blksz = (1 << 17);
SYSCTL_INT(_vfs_zfs, OID_AUTO, standard_sm_blksz, CTLFLAG_RDTUN,
    &vdev_standard_sm_blksz, 0,
    "Block size for standard space map. Power of 2 and greater than 4096.");

/*PRINTFLIKE2*/
void
vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
{
	va_list adx;
	char buf[256];

	va_start(adx, fmt);
	(void) vsnprintf(buf, sizeof (buf), fmt, adx);
	va_end(adx);

	if (vd->vdev_path != NULL) {
		zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type,
		    vd->vdev_path, buf);
	} else {
		zfs_dbgmsg("%s-%llu vdev (guid %llu): %s",
		    vd->vdev_ops->vdev_op_type,
		    (u_longlong_t)vd->vdev_id,
		    (u_longlong_t)vd->vdev_guid, buf);
	}
}

void
vdev_dbgmsg_print_tree(vdev_t *vd, int indent)
{
	char state[20];

	if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) {
		zfs_dbgmsg("%*svdev %u: %s", indent, "", vd->vdev_id,
		    vd->vdev_ops->vdev_op_type);
		return;
	}

	switch (vd->vdev_state) {
	case VDEV_STATE_UNKNOWN:
		(void) snprintf(state, sizeof (state), "unknown");
		break;
	case VDEV_STATE_CLOSED:
		(void) snprintf(state, sizeof (state), "closed");
		break;
	case VDEV_STATE_OFFLINE:
		(void) snprintf(state, sizeof (state), "offline");
		break;
	case VDEV_STATE_REMOVED:
		(void) snprintf(state, sizeof (state), "removed");
		break;
	case VDEV_STATE_CANT_OPEN:
		(void) snprintf(state, sizeof (state), "can't open");
		break;
	case VDEV_STATE_FAULTED:
		(void) snprintf(state, sizeof (state), "faulted");
		break;
	case VDEV_STATE_DEGRADED:
		(void) snprintf(state, sizeof (state), "degraded");
		break;
	case VDEV_STATE_HEALTHY:
		(void) snprintf(state, sizeof (state), "healthy");
		break;
	default:
		(void) snprintf(state, sizeof (state), "<state %u>",
		    (uint_t)vd->vdev_state);
	}

	zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent,
	    "", (int)vd->vdev_id, vd->vdev_ops->vdev_op_type,
	    vd->vdev_islog ? " (log)" : "",
	    (u_longlong_t)vd->vdev_guid,
	    vd->vdev_path ? vd->vdev_path : "N/A", state);

	for (uint64_t i = 0; i < vd->vdev_children; i++)
		vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2);
}

/*
 * Given a vdev type, return the appropriate ops vector.
 */
static vdev_ops_t *
vdev_getops(const char *type)
{
	vdev_ops_t *ops, **opspp;

	for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
		if (strcmp(ops->vdev_op_type, type) == 0)
			break;

	return (ops);
}

/* ARGSUSED */
void
vdev_default_xlate(vdev_t *vd, const range_seg_t *in, range_seg_t *res)
{
	res->rs_start = in->rs_start;
	res->rs_end = in->rs_end;
}

/*
 * Default asize function: return the MAX of psize with the asize of
 * all children. This is what's used by anything other than RAID-Z.
 */
uint64_t
vdev_default_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
	uint64_t csize;

	for (int c = 0; c < vd->vdev_children; c++) {
		csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
		asize = MAX(asize, csize);
	}

	return (asize);
}

/*
 * Get the minimum allocatable size. We define the allocatable size as
 * the vdev's asize rounded to the nearest metaslab. This allows us to
 * replace or attach devices which don't have the same physical size but
 * can still satisfy the same number of allocations.
 */
uint64_t
vdev_get_min_asize(vdev_t *vd)
{
	vdev_t *pvd = vd->vdev_parent;

	/*
	 * If our parent is NULL (inactive spare or cache) or is the root,
	 * just return our own asize.
	 */
	if (pvd == NULL)
		return (vd->vdev_asize);

	/*
	 * The top-level vdev just returns the allocatable size rounded
	 * to the nearest metaslab.
	 */
	if (vd == vd->vdev_top)
		return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));

	/*
	 * The allocatable space for a raidz vdev is N * sizeof(smallest child),
	 * so each child must provide at least 1/Nth of its asize.
	 */
	if (pvd->vdev_ops == &vdev_raidz_ops)
		return ((pvd->vdev_min_asize + pvd->vdev_children - 1) /
		    pvd->vdev_children);

	return (pvd->vdev_min_asize);
}

void
vdev_set_min_asize(vdev_t *vd)
{
	vd->vdev_min_asize = vdev_get_min_asize(vd);

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_set_min_asize(vd->vdev_child[c]);
}

vdev_t *
vdev_lookup_top(spa_t *spa, uint64_t vdev)
{
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

	if (vdev < rvd->vdev_children) {
		ASSERT(rvd->vdev_child[vdev] != NULL);
		return (rvd->vdev_child[vdev]);
	}

	return (NULL);
}

vdev_t *
vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
{
	vdev_t *mvd;

	if (vd->vdev_guid == guid)
		return (vd);

	for (int c = 0; c < vd->vdev_children; c++)
		if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
		    NULL)
			return (mvd);

	return (NULL);
}

static int
vdev_count_leaves_impl(vdev_t *vd)
{
	int n = 0;

	if (vd->vdev_ops->vdev_op_leaf)
		return (1);

	for (int c = 0; c < vd->vdev_children; c++)
		n += vdev_count_leaves_impl(vd->vdev_child[c]);

	return (n);
}

int
vdev_count_leaves(spa_t *spa)
{
	return (vdev_count_leaves_impl(spa->spa_root_vdev));
}
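
/*
 * Link cvd into pvd's child array at index cvd->vdev_id, growing the
 * array as needed, and fold cvd's guid sum into every ancestor's.
 */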
void
vdev_add_child(vdev_t *pvd, vdev_t *cvd)
{
	size_t oldsize, newsize;
	uint64_t id = cvd->vdev_id;
	vdev_t **newchild;
	spa_t *spa = cvd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
	ASSERT(cvd->vdev_parent == NULL);

	cvd->vdev_parent = pvd;

	if (pvd == NULL)
		return;

	ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);

	oldsize = pvd->vdev_children * sizeof (vdev_t *);
	pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
	newsize = pvd->vdev_children * sizeof (vdev_t *);

	newchild = kmem_zalloc(newsize, KM_SLEEP);
	if (pvd->vdev_child != NULL) {
		bcopy(pvd->vdev_child, newchild, oldsize);
		kmem_free(pvd->vdev_child, oldsize);
	}

	pvd->vdev_child = newchild;
	pvd->vdev_child[id] = cvd;

	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top : cvd);
	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);

	/*
	 * Walk up all ancestors to update guid sum.
	 */
	for (; pvd != NULL; pvd = pvd->vdev_parent)
		pvd->vdev_guid_sum += cvd->vdev_guid_sum;
}

void
vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
{
	int c;
	uint_t id = cvd->vdev_id;

	ASSERT(cvd->vdev_parent == pvd);

	if (pvd == NULL)
		return;

	ASSERT(id < pvd->vdev_children);
	ASSERT(pvd->vdev_child[id] == cvd);

	pvd->vdev_child[id] = NULL;
	cvd->vdev_parent = NULL;

	for (c = 0; c < pvd->vdev_children; c++)
		if (pvd->vdev_child[c])
			break;

	if (c == pvd->vdev_children) {
		kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
		pvd->vdev_child = NULL;
		pvd->vdev_children = 0;
	}

	/*
	 * Walk up all ancestors to update guid sum.
	 */
	for (; pvd != NULL; pvd = pvd->vdev_parent)
		pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
}

/*
 * Remove any holes in the child array.
 */
void
vdev_compact_children(vdev_t *pvd)
{
	vdev_t **newchild, *cvd;
	int oldc = pvd->vdev_children;
	int newc;

	ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (oldc == 0)
		return;

	for (int c = newc = 0; c < oldc; c++)
		if (pvd->vdev_child[c])
			newc++;

	if (newc > 0) {
		newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);

		for (int c = newc = 0; c < oldc; c++) {
			if ((cvd = pvd->vdev_child[c]) != NULL) {
				newchild[newc] = cvd;
				cvd->vdev_id = newc++;
			}
		}
	} else {
		newchild = NULL;
	}

	kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
	pvd->vdev_child = newchild;
	pvd->vdev_children = newc;
}

/*
 * Allocate and minimally initialize a vdev_t.
 */
vdev_t *
vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
{
	vdev_t *vd;
	vdev_indirect_config_t *vic;

	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
	vic = &vd->vdev_indirect_config;

	if (spa->spa_root_vdev == NULL) {
		ASSERT(ops == &vdev_root_ops);
		spa->spa_root_vdev = vd;
		spa->spa_load_guid = spa_generate_guid(NULL);
	}

	if (guid == 0 && ops != &vdev_hole_ops) {
		if (spa->spa_root_vdev == vd) {
			/*
			 * The root vdev's guid will also be the pool guid,
			 * which must be unique among all pools.
			 */
			guid = spa_generate_guid(NULL);
		} else {
			/*
			 * Any other vdev's guid must be unique within the pool.
			 */
			guid = spa_generate_guid(spa);
		}
		ASSERT(!spa_guid_exists(spa_guid(spa), guid));
	}

	vd->vdev_spa = spa;
	vd->vdev_id = id;
	vd->vdev_guid = guid;
	vd->vdev_guid_sum = guid;
	vd->vdev_ops = ops;
	vd->vdev_state = VDEV_STATE_CLOSED;
	vd->vdev_ishole = (ops == &vdev_hole_ops);
	vic->vic_prev_indirect_vdev = UINT64_MAX;

	rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL);
	mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL);
	vd->vdev_obsolete_segments = range_tree_create(NULL, NULL);

	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);

	for (int t = 0; t < DTL_TYPES; t++) {
		vd->vdev_dtl[t] = range_tree_create(NULL, NULL);
	}
	txg_list_create(&vd->vdev_ms_list, spa,
	    offsetof(struct metaslab, ms_txg_node));
	txg_list_create(&vd->vdev_dtl_list, spa,
	    offsetof(struct vdev, vdev_dtl_node));
	vd->vdev_stat.vs_timestamp = gethrtime();
	vdev_queue_init(vd);
	vdev_cache_init(vd);

	return (vd);
}

/*
 * Allocate a new vdev. The 'alloctype' is used to control whether we are
 * creating a new vdev or loading an existing one - the behavior is slightly
 * different for each case.
 */
int
vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
    int alloctype)
{
	vdev_ops_t *ops;
	char *type;
	uint64_t guid = 0, islog, nparity;
	vdev_t *vd;
	vdev_indirect_config_t *vic;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
		return (SET_ERROR(EINVAL));

	if ((ops = vdev_getops(type)) == NULL)
		return (SET_ERROR(EINVAL));

	/*
	 * If this is a load, get the vdev guid from the nvlist.
	 * Otherwise, vdev_alloc_common() will generate one for us.
	 */
	if (alloctype == VDEV_ALLOC_LOAD) {
		uint64_t label_id;

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
		    label_id != id)
			return (SET_ERROR(EINVAL));

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (SET_ERROR(EINVAL));
	} else if (alloctype == VDEV_ALLOC_SPARE) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (SET_ERROR(EINVAL));
	} else if (alloctype == VDEV_ALLOC_L2CACHE) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (SET_ERROR(EINVAL));
	} else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (SET_ERROR(EINVAL));
	}

	/*
	 * The first allocated vdev must be of type 'root'.
	 */
	if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
		return (SET_ERROR(EINVAL));

	/*
	 * Determine whether we're a log vdev.
	 */
	islog = 0;
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
	if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
		return (SET_ERROR(ENOTSUP));

	if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
		return (SET_ERROR(ENOTSUP));

	/*
	 * Set the nparity property for RAID-Z vdevs.
	 */
	nparity = -1ULL;
	if (ops == &vdev_raidz_ops) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
		    &nparity) == 0) {
			if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
				return (SET_ERROR(EINVAL));
			/*
			 * Previous versions could only support 1 or 2 parity
			 * devices.
			 */
			if (nparity > 1 &&
			    spa_version(spa) < SPA_VERSION_RAIDZ2)
				return (SET_ERROR(ENOTSUP));
			if (nparity > 2 &&
			    spa_version(spa) < SPA_VERSION_RAIDZ3)
				return (SET_ERROR(ENOTSUP));
		} else {
			/*
			 * We require the parity to be specified for SPAs that
			 * support multiple parity levels.
			 */
			if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
				return (SET_ERROR(EINVAL));
			/*
			 * Otherwise, we default to 1 parity device for RAID-Z.
			 */
			nparity = 1;
		}
	} else {
		nparity = 0;
	}
	ASSERT(nparity != -1ULL);

	vd = vdev_alloc_common(spa, id, guid, ops);
	vic = &vd->vdev_indirect_config;

	vd->vdev_islog = islog;
	vd->vdev_nparity = nparity;

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
		vd->vdev_path = spa_strdup(vd->vdev_path);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
		vd->vdev_devid = spa_strdup(vd->vdev_devid);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
	    &vd->vdev_physpath) == 0)
		vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
		vd->vdev_fru = spa_strdup(vd->vdev_fru);

	/*
	 * Set the whole_disk property. If it's not specified, leave the value
	 * as -1.
	 */
	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
	    &vd->vdev_wholedisk) != 0)
		vd->vdev_wholedisk = -1ULL;

	ASSERT0(vic->vic_mapping_object);
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
	    &vic->vic_mapping_object);
	ASSERT0(vic->vic_births_object);
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS,
	    &vic->vic_births_object);
	ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX);
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
	    &vic->vic_prev_indirect_vdev);

	/*
	 * Look for the 'not present' flag. This will only be set if the device
	 * was not present at the time of import.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
	    &vd->vdev_not_present);

	/*
	 * Get the alignment requirement.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);

	/*
	 * Retrieve the vdev creation time.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
	    &vd->vdev_crtxg);

	/*
	 * If we're a top-level vdev, try to load the allocation parameters.
	 */
	if (parent && !parent->vdev_parent &&
	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
		    &vd->vdev_ms_array);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
		    &vd->vdev_ms_shift);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
		    &vd->vdev_asize);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
		    &vd->vdev_removing);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
		    &vd->vdev_top_zap);
	} else {
		ASSERT0(vd->vdev_top_zap);
	}

	if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) {
		ASSERT(alloctype == VDEV_ALLOC_LOAD ||
		    alloctype == VDEV_ALLOC_ADD ||
		    alloctype == VDEV_ALLOC_SPLIT ||
		    alloctype == VDEV_ALLOC_ROOTPOOL);
		vd->vdev_mg = metaslab_group_create(islog ?
		    spa_log_class(spa) : spa_normal_class(spa), vd,
		    spa->spa_alloc_count);
	}

	if (vd->vdev_ops->vdev_op_leaf &&
	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
		(void) nvlist_lookup_uint64(nv,
		    ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap);
	} else {
		ASSERT0(vd->vdev_leaf_zap);
	}

	/*
	 * If we're a leaf vdev, try to load the DTL object and other state.
	 */

	if (vd->vdev_ops->vdev_op_leaf &&
	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
	    alloctype == VDEV_ALLOC_ROOTPOOL)) {
		if (alloctype == VDEV_ALLOC_LOAD) {
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
			    &vd->vdev_dtl_object);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
			    &vd->vdev_unspare);
		}

		if (alloctype == VDEV_ALLOC_ROOTPOOL) {
			uint64_t spare = 0;

			if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
			    &spare) == 0 && spare)
				spa_spare_add(vd);
		}

		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
		    &vd->vdev_offline);

		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
		    &vd->vdev_resilver_txg);

		/*
		 * When importing a pool, we want to ignore the persistent fault
		 * state, as the diagnosis made on another system may not be
		 * valid in the current context. Local vdevs will
		 * remain in the faulted state.
		 */
		if (spa_load_state(spa) == SPA_LOAD_OPEN) {
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
			    &vd->vdev_faulted);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
			    &vd->vdev_degraded);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
			    &vd->vdev_removed);

			if (vd->vdev_faulted || vd->vdev_degraded) {
				char *aux;

				vd->vdev_label_aux =
				    VDEV_AUX_ERR_EXCEEDED;
				if (nvlist_lookup_string(nv,
				    ZPOOL_CONFIG_AUX_STATE, &aux) == 0 &&
				    strcmp(aux, "external") == 0)
					vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
			}
		}
	}

	/*
	 * Add ourselves to the parent's list of children.
	 */
	vdev_add_child(parent, vd);

	*vdp = vd;

	return (0);
}
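
/*
 * Tear down a vdev: close it, free all of its children, detach it from
 * its parent, and release the locks, trees, and memory associated with it.
 */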
void
vdev_free(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);

	/*
	 * Scan queues are normally destroyed at the end of a scan. If the
	 * queue exists here, that implies the vdev is being removed while
	 * the scan is still running.
	 */
	if (vd->vdev_scan_io_queue != NULL) {
		mutex_enter(&vd->vdev_scan_io_queue_lock);
		dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue);
		vd->vdev_scan_io_queue = NULL;
		mutex_exit(&vd->vdev_scan_io_queue_lock);
	}

	/*
	 * vdev_free() implies closing the vdev first. This is simpler than
	 * trying to ensure complicated semantics for all callers.
	 */
	vdev_close(vd);

	ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
	ASSERT(!list_link_active(&vd->vdev_state_dirty_node));

	/*
	 * Free all children.
	 */
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_free(vd->vdev_child[c]);

	ASSERT(vd->vdev_child == NULL);
	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
	ASSERT(vd->vdev_initialize_thread == NULL);

	/*
	 * Discard allocation state.
	 */
	if (vd->vdev_mg != NULL) {
		vdev_metaslab_fini(vd);
		metaslab_group_destroy(vd->vdev_mg);
	}

	ASSERT0(vd->vdev_stat.vs_space);
	ASSERT0(vd->vdev_stat.vs_dspace);
	ASSERT0(vd->vdev_stat.vs_alloc);

	/*
	 * Remove this vdev from its parent's child list.
	 */
	vdev_remove_child(vd->vdev_parent, vd);

	ASSERT(vd->vdev_parent == NULL);

	/*
	 * Clean up vdev structure.
	 */
	vdev_queue_fini(vd);
	vdev_cache_fini(vd);

	if (vd->vdev_path)
		spa_strfree(vd->vdev_path);
	if (vd->vdev_devid)
		spa_strfree(vd->vdev_devid);
	if (vd->vdev_physpath)
		spa_strfree(vd->vdev_physpath);
	if (vd->vdev_fru)
		spa_strfree(vd->vdev_fru);

	if (vd->vdev_isspare)
		spa_spare_remove(vd);
	if (vd->vdev_isl2cache)
		spa_l2cache_remove(vd);

	txg_list_destroy(&vd->vdev_ms_list);
	txg_list_destroy(&vd->vdev_dtl_list);

	mutex_enter(&vd->vdev_dtl_lock);
	space_map_close(vd->vdev_dtl_sm);
	for (int t = 0; t < DTL_TYPES; t++) {
		range_tree_vacate(vd->vdev_dtl[t], NULL, NULL);
		range_tree_destroy(vd->vdev_dtl[t]);
	}
	mutex_exit(&vd->vdev_dtl_lock);

	EQUIV(vd->vdev_indirect_births != NULL,
	    vd->vdev_indirect_mapping != NULL);
	if (vd->vdev_indirect_births != NULL) {
		vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
		vdev_indirect_births_close(vd->vdev_indirect_births);
	}

	if (vd->vdev_obsolete_sm != NULL) {
		ASSERT(vd->vdev_removing ||
		    vd->vdev_ops == &vdev_indirect_ops);
		space_map_close(vd->vdev_obsolete_sm);
		vd->vdev_obsolete_sm = NULL;
	}
	range_tree_destroy(vd->vdev_obsolete_segments);
	rw_destroy(&vd->vdev_indirect_rwlock);
	mutex_destroy(&vd->vdev_obsolete_lock);

	mutex_destroy(&vd->vdev_queue_lock);
	mutex_destroy(&vd->vdev_dtl_lock);
	mutex_destroy(&vd->vdev_stat_lock);
	mutex_destroy(&vd->vdev_probe_lock);
	mutex_destroy(&vd->vdev_scan_io_queue_lock);
	mutex_destroy(&vd->vdev_initialize_lock);
	mutex_destroy(&vd->vdev_initialize_io_lock);
	cv_destroy(&vd->vdev_initialize_io_cv);
	cv_destroy(&vd->vdev_initialize_cv);

	if (vd == spa->spa_root_vdev)
		spa->spa_root_vdev = NULL;

	kmem_free(vd, sizeof (vdev_t));
}

/*
 * Transfer top-level vdev state from svd to tvd.
 */
static void
vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
{
	spa_t *spa = svd->vdev_spa;
	metaslab_t *msp;
	vdev_t *vd;
	int t;

	ASSERT(tvd == tvd->vdev_top);

	tvd->vdev_ms_array = svd->vdev_ms_array;
	tvd->vdev_ms_shift = svd->vdev_ms_shift;
	tvd->vdev_ms_count = svd->vdev_ms_count;
	tvd->vdev_top_zap = svd->vdev_top_zap;

	svd->vdev_ms_array = 0;
	svd->vdev_ms_shift = 0;
	svd->vdev_ms_count = 0;
	svd->vdev_top_zap = 0;

	if (tvd->vdev_mg)
		ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
	tvd->vdev_mg = svd->vdev_mg;
	tvd->vdev_ms = svd->vdev_ms;

	svd->vdev_mg = NULL;
	svd->vdev_ms = NULL;

	if (tvd->vdev_mg != NULL)
		tvd->vdev_mg->mg_vd = tvd;

	tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm;
	svd->vdev_checkpoint_sm = NULL;

	tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
	tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
	tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;

	svd->vdev_stat.vs_alloc = 0;
	svd->vdev_stat.vs_space = 0;
	svd->vdev_stat.vs_dspace = 0;

	/*
	 * State which may be set on a top-level vdev that's in the
	 * process of being removed.
	 */
	ASSERT0(tvd->vdev_indirect_config.vic_births_object);
	ASSERT0(tvd->vdev_indirect_config.vic_mapping_object);
	ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL);
	ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL);
	ASSERT3P(tvd->vdev_indirect_births, ==, NULL);
	ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL);
	ASSERT0(tvd->vdev_removing);
	tvd->vdev_removing = svd->vdev_removing;
	tvd->vdev_indirect_config = svd->vdev_indirect_config;
	tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping;
	tvd->vdev_indirect_births = svd->vdev_indirect_births;
	range_tree_swap(&svd->vdev_obsolete_segments,
	    &tvd->vdev_obsolete_segments);
	tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm;
	svd->vdev_indirect_config.vic_mapping_object = 0;
	svd->vdev_indirect_config.vic_births_object = 0;
	svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL;
	svd->vdev_indirect_mapping = NULL;
	svd->vdev_indirect_births = NULL;
	svd->vdev_obsolete_sm = NULL;
	svd->vdev_removing = 0;

	for (t = 0; t < TXG_SIZE; t++) {
		while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
		while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
		if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
			(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
	}

	if (list_link_active(&svd->vdev_config_dirty_node)) {
		vdev_config_clean(svd);
		vdev_config_dirty(tvd);
	}

	if (list_link_active(&svd->vdev_state_dirty_node)) {
		vdev_state_clean(svd);
		vdev_state_dirty(tvd);
	}

	tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
	svd->vdev_deflate_ratio = 0;

	tvd->vdev_islog = svd->vdev_islog;
	svd->vdev_islog = 0;

	dsl_scan_io_queue_vdev_xfer(svd, tvd);
}

static void
vdev_top_update(vdev_t *tvd, vdev_t *vd)
{
	if (vd == NULL)
		return;

	vd->vdev_top = tvd;

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_top_update(tvd, vd->vdev_child[c]);
}

/*
 * Add a mirror/replacing vdev above an existing vdev.
 */
vdev_t *
vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
{
	spa_t *spa = cvd->vdev_spa;
	vdev_t *pvd = cvd->vdev_parent;
	vdev_t *mvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);

	mvd->vdev_asize = cvd->vdev_asize;
	mvd->vdev_min_asize = cvd->vdev_min_asize;
	mvd->vdev_max_asize = cvd->vdev_max_asize;
	mvd->vdev_psize = cvd->vdev_psize;
	mvd->vdev_ashift = cvd->vdev_ashift;
	mvd->vdev_logical_ashift = cvd->vdev_logical_ashift;
	mvd->vdev_physical_ashift = cvd->vdev_physical_ashift;
	mvd->vdev_state = cvd->vdev_state;
	mvd->vdev_crtxg = cvd->vdev_crtxg;

	vdev_remove_child(pvd, cvd);
	vdev_add_child(pvd, mvd);
	cvd->vdev_id = mvd->vdev_children;
	vdev_add_child(mvd, cvd);
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	if (mvd == mvd->vdev_top)
		vdev_top_transfer(cvd, mvd);

	return (mvd);
}

/*
 * Remove a 1-way mirror/replacing vdev from the tree.
 */
void
vdev_remove_parent(vdev_t *cvd)
{
	vdev_t *mvd = cvd->vdev_parent;
	vdev_t *pvd = mvd->vdev_parent;

	ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	ASSERT(mvd->vdev_children == 1);
	ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
	    mvd->vdev_ops == &vdev_replacing_ops ||
	    mvd->vdev_ops == &vdev_spare_ops);
	cvd->vdev_ashift = mvd->vdev_ashift;
	cvd->vdev_logical_ashift = mvd->vdev_logical_ashift;
	cvd->vdev_physical_ashift = mvd->vdev_physical_ashift;

	vdev_remove_child(mvd, cvd);
	vdev_remove_child(pvd, mvd);

	/*
	 * If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
	 * Otherwise, we could have detached an offline device, and when we
	 * go to import the pool we'll think we have two top-level vdevs,
	 * instead of a different version of the same top-level vdev.
	 */
	if (mvd->vdev_top == mvd) {
		uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
		cvd->vdev_orig_guid = cvd->vdev_guid;
		cvd->vdev_guid += guid_delta;
		cvd->vdev_guid_sum += guid_delta;
	}
	cvd->vdev_id = mvd->vdev_id;
	vdev_add_child(pvd, cvd);
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	if (cvd == cvd->vdev_top)
		vdev_top_transfer(mvd, cvd);

	ASSERT(mvd->vdev_children == 0);
	vdev_free(mvd);
}

int
vdev_metaslab_init(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa->spa_meta_objset;
	uint64_t m;
	uint64_t oldc = vd->vdev_ms_count;
	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
	metaslab_t **mspp;
	int error;

	ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));

	/*
	 * This vdev is not being allocated from yet or is a hole.
	 */
	if (vd->vdev_ms_shift == 0)
		return (0);

	ASSERT(!vd->vdev_ishole);

	ASSERT(oldc <= newc);

	mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);

	if (oldc != 0) {
		bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
		kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
	}

	vd->vdev_ms = mspp;
	vd->vdev_ms_count = newc;
	for (m = oldc; m < newc; m++) {
		uint64_t object = 0;

		/*
		 * vdev_ms_array may be 0 if we are creating the "fake"
		 * metaslabs for an indirect vdev for zdb's leak detection.
		 * See zdb_leak_init().
		 */
		if (txg == 0 && vd->vdev_ms_array != 0) {
			error = dmu_read(mos, vd->vdev_ms_array,
			    m * sizeof (uint64_t), sizeof (uint64_t), &object,
			    DMU_READ_PREFETCH);
			if (error != 0) {
				vdev_dbgmsg(vd, "unable to read the metaslab "
				    "array [error=%d]", error);
				return (error);
			}
		}

		error = metaslab_init(vd->vdev_mg, m, object, txg,
		    &(vd->vdev_ms[m]));
		if (error != 0) {
			vdev_dbgmsg(vd, "metaslab_init failed [error=%d]",
			    error);
			return (error);
		}
	}

	if (txg == 0)
		spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);

	/*
	 * If the vdev is being removed we don't activate
	 * the metaslabs since we want to ensure that no new
	 * allocations are performed on this device.
	 */
	if (oldc == 0 && !vd->vdev_removing)
		metaslab_group_activate(vd->vdev_mg);

	if (txg == 0)
		spa_config_exit(spa, SCL_ALLOC, FTAG);

	return (0);
}

void
vdev_metaslab_fini(vdev_t *vd)
{
	if (vd->vdev_checkpoint_sm != NULL) {
		ASSERT(spa_feature_is_active(vd->vdev_spa,
		    SPA_FEATURE_POOL_CHECKPOINT));
		space_map_close(vd->vdev_checkpoint_sm);
		/*
		 * Even though we close the space map, we need to set its
		 * pointer to NULL. The reason is that vdev_metaslab_fini()
		 * may be called multiple times for certain operations
		 * (i.e. when destroying a pool) so we need to ensure that
		 * this clause never executes twice. This logic is similar
		 * to the one used for the vdev_ms clause below.
		 */
		vd->vdev_checkpoint_sm = NULL;
	}

	if (vd->vdev_ms != NULL) {
		uint64_t count = vd->vdev_ms_count;

		metaslab_group_passivate(vd->vdev_mg);
		for (uint64_t m = 0; m < count; m++) {
			metaslab_t *msp = vd->vdev_ms[m];

			if (msp != NULL)
				metaslab_fini(msp);
		}
		kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
		vd->vdev_ms = NULL;

		vd->vdev_ms_count = 0;
	}
	ASSERT0(vd->vdev_ms_count);
}

typedef struct vdev_probe_stats {
	boolean_t vps_readable;
	boolean_t vps_writeable;
	int vps_flags;
} vdev_probe_stats_t;

static void
vdev_probe_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	vdev_t *vd = zio->io_vd;
	vdev_probe_stats_t *vps = zio->io_private;

	ASSERT(vd->vdev_probe_zio != NULL);

	if (zio->io_type == ZIO_TYPE_READ) {
		if (zio->io_error == 0)
			vps->vps_readable = 1;
		if (zio->io_error == 0 && spa_writeable(spa)) {
			zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
			    zio->io_offset, zio->io_size, zio->io_abd,
			    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
			    ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
		} else {
			abd_free(zio->io_abd);
		}
	} else if (zio->io_type == ZIO_TYPE_WRITE) {
		if (zio->io_error == 0)
			vps->vps_writeable = 1;
		abd_free(zio->io_abd);
	} else if (zio->io_type == ZIO_TYPE_NULL) {
		zio_t *pio;

		vd->vdev_cant_read |= !vps->vps_readable;
		vd->vdev_cant_write |= !vps->vps_writeable;

		if (vdev_readable(vd) &&
		    (vdev_writeable(vd) || !spa_writeable(spa))) {
			zio->io_error = 0;
		} else {
			ASSERT(zio->io_error != 0);
			vdev_dbgmsg(vd, "failed probe");
			zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
			    spa, vd, NULL, 0, 0);
			zio->io_error = SET_ERROR(ENXIO);
		}

		mutex_enter(&vd->vdev_probe_lock);
		ASSERT(vd->vdev_probe_zio == zio);
		vd->vdev_probe_zio = NULL;
		mutex_exit(&vd->vdev_probe_lock);

		zio_link_t *zl = NULL;
		while ((pio = zio_walk_parents(zio, &zl)) != NULL)
			if (!vdev_accessible(vd, pio))
				pio->io_error = SET_ERROR(ENXIO);

		kmem_free(vps, sizeof (*vps));
	}
}

/*
 * Determine whether this device is accessible.
 *
 * Read and write to several known locations: the pad regions of each
 * vdev label but the first, which we leave alone in case it contains
 * a VTOC.
 */
zio_t *
vdev_probe(vdev_t *vd, zio_t *zio)
{
	spa_t *spa = vd->vdev_spa;
	vdev_probe_stats_t *vps = NULL;
	zio_t *pio;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	/*
	 * Don't probe the probe.
	 */
	if (zio && (zio->io_flags & ZIO_FLAG_PROBE))
		return (NULL);

	/*
	 * To prevent 'probe storms' when a device fails, we create
	 * just one probe i/o at a time. All zios that want to probe
	 * this vdev will become parents of the probe io.
	 */
	mutex_enter(&vd->vdev_probe_lock);

	if ((pio = vd->vdev_probe_zio) == NULL) {
		vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);

		vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
		    ZIO_FLAG_TRYHARD;

		if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
			/*
			 * vdev_cant_read and vdev_cant_write can only
			 * transition from TRUE to FALSE when we have the
			 * SCL_ZIO lock as writer; otherwise they can only
			 * transition from FALSE to TRUE. This ensures that
			 * any zio looking at these values can assume that
			 * failures persist for the life of the I/O. That's
			 * important because when a device has intermittent
			 * connectivity problems, we want to ensure that
			 * they're ascribed to the device (ENXIO) and not
			 * the zio (EIO).
			 *
			 * Since we hold SCL_ZIO as writer here, clear both
			 * values so the probe can reevaluate from first
			 * principles.
			 */
			vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
			vd->vdev_cant_read = B_FALSE;
			vd->vdev_cant_write = B_FALSE;
		}

		vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
		    vdev_probe_done, vps,
		    vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);

		/*
		 * We can't change the vdev state in this context, so we
		 * kick off an async task to do it on our behalf.
		 */
		if (zio != NULL) {
			vd->vdev_probe_wanted = B_TRUE;
			spa_async_request(spa, SPA_ASYNC_PROBE);
		}
	}

	if (zio != NULL)
		zio_add_child(zio, pio);

	mutex_exit(&vd->vdev_probe_lock);

	if (vps == NULL) {
		ASSERT(zio != NULL);
		return (NULL);
	}

	for (int l = 1; l < VDEV_LABELS; l++) {
		zio_nowait(zio_read_phys(pio, vd,
		    vdev_label_offset(vd->vdev_psize, l,
		    offsetof(vdev_label_t, vl_pad2)), VDEV_PAD_SIZE,
		    abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE),
		    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
		    ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
	}

	if (zio == NULL)
		return (pio);

	zio_nowait(pio);
	return (NULL);
}

static void
vdev_open_child(void *arg)
{
	vdev_t *vd = arg;

	vd->vdev_open_thread = curthread;
	vd->vdev_open_error = vdev_open(vd);
	vd->vdev_open_thread = NULL;
}

boolean_t
vdev_uses_zvols(vdev_t *vd)
{
	if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR,
	    strlen(ZVOL_DIR)) == 0)
		return (B_TRUE);
	for (int c = 0; c < vd->vdev_children; c++)
		if (vdev_uses_zvols(vd->vdev_child[c]))
			return (B_TRUE);
	return (B_FALSE);
}

void
vdev_open_children(vdev_t *vd)
{
	taskq_t *tq;
	int children = vd->vdev_children;

	vd->vdev_nonrot = B_TRUE;

	/*
	 * in order to handle pools on top of zvols, do the opens
	 * in a single thread so that the same thread holds the
	 * spa_namespace_lock
	 */
	if (B_TRUE || vdev_uses_zvols(vd)) {
		for (int c = 0; c < children; c++) {
			vd->vdev_child[c]->vdev_open_error =
			    vdev_open(vd->vdev_child[c]);
			vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot;
		}
		return;
	}
	tq = taskq_create("vdev_open", children, minclsyspri,
	    children, children, TASKQ_PREPOPULATE);

	for (int c = 0; c < children; c++)
		VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c],
		    TQ_SLEEP) != 0);

	taskq_destroy(tq);

	for (int c = 0; c < children; c++)
		vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot;
}

/*
 * Compute the raidz-deflation ratio. Note, we hard-code
 * in 128k (1 << 17) because it is the "typical" blocksize.
 * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change,
 * otherwise it would inconsistently account for existing bp's.
 */
static void
vdev_set_deflate_ratio(vdev_t *vd)
{
	if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) {
		vd->vdev_deflate_ratio = (1 << 17) /
		    (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
	}
}

/*
 * Prepare a virtual device for access.
 */
int
vdev_open(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	int error;
	uint64_t osize = 0;
	uint64_t max_osize = 0;
	uint64_t asize, max_asize, psize;
	uint64_t logical_ashift = 0;
	uint64_t physical_ashift = 0;

	ASSERT(vd->vdev_open_thread == curthread ||
	    spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
	ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
	    vd->vdev_state == VDEV_STATE_CANT_OPEN ||
	    vd->vdev_state == VDEV_STATE_OFFLINE);

	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
	vd->vdev_cant_read = B_FALSE;
	vd->vdev_cant_write = B_FALSE;
	vd->vdev_notrim = B_FALSE;
	vd->vdev_min_asize = vdev_get_min_asize(vd);

	/*
	 * If this vdev is not removed, check its fault status. If it's
	 * faulted, bail out of the open.
	 */
	if (!vd->vdev_removed && vd->vdev_faulted) {
		ASSERT(vd->vdev_children == 0);
		ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
		    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
		    vd->vdev_label_aux);
		return (SET_ERROR(ENXIO));
	} else if (vd->vdev_offline) {
		ASSERT(vd->vdev_children == 0);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
		return (SET_ERROR(ENXIO));
	}

	error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize,
	    &logical_ashift, &physical_ashift);

	/*
	 * Reset the vdev_reopening flag so that we actually close
	 * the vdev on error.
	 */
	vd->vdev_reopening = B_FALSE;
	if (zio_injection_enabled && error == 0)
		error = zio_handle_device_injection(vd, NULL, ENXIO);

	if (error) {
		if (vd->vdev_removed &&
		    vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
			vd->vdev_removed = B_FALSE;

		if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE,
			    vd->vdev_stat.vs_aux);
		} else {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    vd->vdev_stat.vs_aux);
		}
		return (error);
	}

	vd->vdev_removed = B_FALSE;

	/*
	 * Recheck the faulted flag now that we have confirmed that
	 * the vdev is accessible. If we're faulted, bail.
	 */
	if (vd->vdev_faulted) {
		ASSERT(vd->vdev_children == 0);
		ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
		    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
		    vd->vdev_label_aux);
		return (SET_ERROR(ENXIO));
	}

	if (vd->vdev_degraded) {
		ASSERT(vd->vdev_children == 0);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
		    VDEV_AUX_ERR_EXCEEDED);
	} else {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
	}

	/*
	 * For hole or missing vdevs we just return success.
	 */
	if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
		return (0);

	if (zfs_trim_enabled && !vd->vdev_notrim && vd->vdev_ops->vdev_op_leaf)
		trim_map_create(vd);

	for (int c = 0; c < vd->vdev_children; c++) {
		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
			    VDEV_AUX_NONE);
			break;
		}
	}

	osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
	max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t));

	if (vd->vdev_children == 0) {
		if (osize < SPA_MINDEVSIZE) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_TOO_SMALL);
			return (SET_ERROR(EOVERFLOW));
		}
		psize = osize;
		asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
		max_asize = max_osize - (VDEV_LABEL_START_SIZE +
		    VDEV_LABEL_END_SIZE);
	} else {
		if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
		    (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_TOO_SMALL);
			return (SET_ERROR(EOVERFLOW));
		}
		psize = 0;
		asize = osize;
		max_asize = max_osize;
	}

	vd->vdev_psize = psize;

	/*
	 * Make sure the allocatable size hasn't shrunk too much.
	 */
	if (asize < vd->vdev_min_asize) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_LABEL);
		return (SET_ERROR(EINVAL));
	}

	vd->vdev_physical_ashift =
	    MAX(physical_ashift, vd->vdev_physical_ashift);
	vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift);
	vd->vdev_ashift = MAX(vd->vdev_logical_ashift, vd->vdev_ashift);

	if (vd->vdev_logical_ashift > SPA_MAXASHIFT) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_ASHIFT_TOO_BIG);
		return (EINVAL);
	}

	if (vd->vdev_asize == 0) {
		/*
		 * This is the first-ever open, so use the computed values.
		 * For testing purposes, a higher ashift can be requested.
		 */
		vd->vdev_asize = asize;
		vd->vdev_max_asize = max_asize;
	} else {
		/*
		 * Make sure the alignment requirement hasn't increased.
		 */
		if (vd->vdev_ashift > vd->vdev_top->vdev_ashift &&
		    vd->vdev_ops->vdev_op_leaf) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_BAD_LABEL);
			return (EINVAL);
		}
		vd->vdev_max_asize = max_asize;
	}

	/*
	 * If all children are healthy we update asize if either:
	 * The asize has increased, due to a device expansion caused by dynamic
	 * LUN growth or vdev replacement, and automatic expansion is enabled;
	 * making the additional space available.
	 *
	 * The asize has decreased, due to a device shrink usually caused by a
	 * vdev replace with a smaller device. This ensures that calculations
	 * based on max_asize and asize e.g. esize are always valid. It's safe
	 * to do this as we've already validated that asize is greater than
	 * vdev_min_asize.
	 */
	if (vd->vdev_state == VDEV_STATE_HEALTHY &&
	    ((asize > vd->vdev_asize &&
	    (vd->vdev_expanding || spa->spa_autoexpand)) ||
	    (asize < vd->vdev_asize)))
		vd->vdev_asize = asize;

	vdev_set_min_asize(vd);

	/*
	 * Ensure we can issue some IO before declaring the
	 * vdev open for business.
	 */
	if (vd->vdev_ops->vdev_op_leaf &&
	    (error = zio_wait(vdev_probe(vd, NULL))) != 0) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
		    VDEV_AUX_ERR_EXCEEDED);
		return (error);
	}

	/*
	 * Track the min and max ashift values for normal data devices.
	 */
	if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
	    !vd->vdev_islog && vd->vdev_aux == NULL) {
		if (vd->vdev_ashift > spa->spa_max_ashift)
			spa->spa_max_ashift = vd->vdev_ashift;
		if (vd->vdev_ashift < spa->spa_min_ashift)
			spa->spa_min_ashift = vd->vdev_ashift;
	}

	/*
	 * If a leaf vdev has a DTL, and seems healthy, then kick off a
	 * resilver. But don't do this if we are doing a reopen for a scrub,
	 * since this would just restart the scrub we are already doing.
	 */
	if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen &&
	    vdev_resilver_needed(vd, NULL, NULL))
		spa_async_request(spa, SPA_ASYNC_RESILVER);

	return (0);
}

/*
 * Called once the vdevs are all opened, this routine validates the label
 * contents. This needs to be done before vdev_load() so that we don't
 * inadvertently do repair I/Os to the wrong device.
 *
 * This function will only return failure if one of the vdevs indicates that it
 * has since been destroyed or exported. This is only possible if
 * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state
 * will be updated but the function will return 0.
 */
int
vdev_validate(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	nvlist_t *label;
	uint64_t guid = 0, aux_guid = 0, top_guid;
	uint64_t state;
	nvlist_t *nvl;
	uint64_t txg;

	if (vdev_validate_skip)
		return (0);

	for (uint64_t c = 0; c < vd->vdev_children; c++)
		if (vdev_validate(vd->vdev_child[c]) != 0)
			return (SET_ERROR(EBADF));

	/*
	 * If the device has already failed, or was marked offline, don't do
	 * any further validation. Otherwise, label I/O will fail and we will
	 * overwrite the previous state.
	 */
	if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd))
		return (0);

	/*
	 * If we are performing an extreme rewind, we allow for a label that
	 * was modified at a point after the current txg.
	 * If config lock is not held do not check for the txg. spa_sync could
	 * be updating the vdev's label before updating spa_last_synced_txg.
	 */
	if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0 ||
	    spa_config_held(spa, SCL_CONFIG, RW_WRITER) != SCL_CONFIG)
		txg = UINT64_MAX;
	else
		txg = spa_last_synced_txg(spa);

	if ((label = vdev_label_read_config(vd, txg)) == NULL) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_LABEL);
		vdev_dbgmsg(vd, "vdev_validate: failed reading config for "
		    "txg %llu", (u_longlong_t)txg);
		return (0);
	}

	/*
	 * Determine if this vdev has been split off into another
	 * pool. If so, then refuse to open it.
	 */
	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
	    &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_SPLIT_POOL);
		nvlist_free(label);
		vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool");
		return (0);
	}

	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) {
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		nvlist_free(label);
		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
		    ZPOOL_CONFIG_POOL_GUID);
		return (0);
	}

	/*
	 * If config is not trusted then ignore the spa guid check. This is
	 * necessary because if the machine crashed during a re-guid the new
	 * guid might have been written to all of the vdev labels, but not the
	 * cached config. The check will be performed again once we have the
	 * trusted config from the MOS.
	 */
	if (spa->spa_trust_config && guid != spa_guid(spa)) {
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		nvlist_free(label);
		vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't "
		    "match config (%llu != %llu)", (u_longlong_t)guid,
		    (u_longlong_t)spa_guid(spa));
		return (0);
	}

	if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
	    != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
	    &aux_guid) != 0)
		aux_guid = 0;

	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) {
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		nvlist_free(label);
		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
		    ZPOOL_CONFIG_GUID);
		return (0);
	}

	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid)
	    != 0) {
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		nvlist_free(label);
		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
		    ZPOOL_CONFIG_TOP_GUID);
		return (0);
	}

	/*
	 * If this vdev just became a top-level vdev because its sibling was
	 * detached, it will have adopted the parent's vdev guid -- but the
	 * label may or may not be on disk yet. Fortunately, either version
	 * of the label will have the same top guid, so if we're a top-level
	 * vdev, we can safely compare to that instead.
	 * However, if the config comes from a cachefile that failed to update
	 * after the detach, a top-level vdev will appear as a non top-level
	 * vdev in the config. Also relax the constraints if we perform an
	 * extreme rewind.
	 *
	 * If we split this vdev off instead, then we also check the
	 * original pool's guid. We don't want to consider the vdev
	 * corrupt if it is partway through a split operation.
1884 */ 1885 if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) { 1886 boolean_t mismatch = B_FALSE; 1887 if (spa->spa_trust_config && !spa->spa_extreme_rewind) { 1888 if (vd != vd->vdev_top || vd->vdev_guid != top_guid) 1889 mismatch = B_TRUE; 1890 } else { 1891 if (vd->vdev_guid != top_guid && 1892 vd->vdev_top->vdev_guid != guid) 1893 mismatch = B_TRUE; 1894 } 1895 1896 if (mismatch) { 1897 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1898 VDEV_AUX_CORRUPT_DATA); 1899 nvlist_free(label); 1900 vdev_dbgmsg(vd, "vdev_validate: config guid " 1901 "doesn't match label guid"); 1902 vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu", 1903 (u_longlong_t)vd->vdev_guid, 1904 (u_longlong_t)vd->vdev_top->vdev_guid); 1905 vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, " 1906 "aux_guid %llu", (u_longlong_t)guid, 1907 (u_longlong_t)top_guid, (u_longlong_t)aux_guid); 1908 return (0); 1909 } 1910 } 1911 1912 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, 1913 &state) != 0) { 1914 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1915 VDEV_AUX_CORRUPT_DATA); 1916 nvlist_free(label); 1917 vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", 1918 ZPOOL_CONFIG_POOL_STATE); 1919 return (0); 1920 } 1921 1922 nvlist_free(label); 1923 1924 /* 1925 * If this is a verbatim import, no need to check the 1926 * state of the pool. 1927 */ 1928 if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) && 1929 spa_load_state(spa) == SPA_LOAD_OPEN && 1930 state != POOL_STATE_ACTIVE) { 1931 vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) " 1932 "for spa %s", (u_longlong_t)state, spa->spa_name); 1933 return (SET_ERROR(EBADF)); 1934 } 1935 1936 /* 1937 * If we were able to open and validate a vdev that was 1938 * previously marked permanently unavailable, clear that state 1939 * now. 1940 */ 1941 if (vd->vdev_not_present) 1942 vd->vdev_not_present = 0; 1943 1944 return (0); 1945} 1946 1947static void 1948vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd) 1949{ 1950 if (svd->vdev_path != NULL && dvd->vdev_path != NULL) { 1951 if (strcmp(svd->vdev_path, dvd->vdev_path) != 0) { 1952 zfs_dbgmsg("vdev_copy_path: vdev %llu: path changed " 1953 "from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid, 1954 dvd->vdev_path, svd->vdev_path); 1955 spa_strfree(dvd->vdev_path); 1956 dvd->vdev_path = spa_strdup(svd->vdev_path); 1957 } 1958 } else if (svd->vdev_path != NULL) { 1959 dvd->vdev_path = spa_strdup(svd->vdev_path); 1960 zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'", 1961 (u_longlong_t)dvd->vdev_guid, dvd->vdev_path); 1962 } 1963} 1964 1965/* 1966 * Recursively copy vdev paths from one vdev to another. Source and destination 1967 * vdev trees must have same geometry otherwise return error. Intended to copy 1968 * paths from userland config into MOS config. 
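 *
 * A mismatch in vdev type, guid, or child count at any level returns
 * EINVAL; missing, hole and indirect vdevs are skipped.  Leaf paths are
 * copied with vdev_copy_path_impl().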
1969 */ 1970int 1971vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd) 1972{ 1973 if ((svd->vdev_ops == &vdev_missing_ops) || 1974 (svd->vdev_ishole && dvd->vdev_ishole) || 1975 (dvd->vdev_ops == &vdev_indirect_ops)) 1976 return (0); 1977 1978 if (svd->vdev_ops != dvd->vdev_ops) { 1979 vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s", 1980 svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type); 1981 return (SET_ERROR(EINVAL)); 1982 } 1983 1984 if (svd->vdev_guid != dvd->vdev_guid) { 1985 vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != " 1986 "%llu)", (u_longlong_t)svd->vdev_guid, 1987 (u_longlong_t)dvd->vdev_guid); 1988 return (SET_ERROR(EINVAL)); 1989 } 1990 1991 if (svd->vdev_children != dvd->vdev_children) { 1992 vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: " 1993 "%llu != %llu", (u_longlong_t)svd->vdev_children, 1994 (u_longlong_t)dvd->vdev_children); 1995 return (SET_ERROR(EINVAL)); 1996 } 1997 1998 for (uint64_t i = 0; i < svd->vdev_children; i++) { 1999 int error = vdev_copy_path_strict(svd->vdev_child[i], 2000 dvd->vdev_child[i]); 2001 if (error != 0) 2002 return (error); 2003 } 2004 2005 if (svd->vdev_ops->vdev_op_leaf) 2006 vdev_copy_path_impl(svd, dvd); 2007 2008 return (0); 2009} 2010 2011static void 2012vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd) 2013{ 2014 ASSERT(stvd->vdev_top == stvd); 2015 ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id); 2016 2017 for (uint64_t i = 0; i < dvd->vdev_children; i++) { 2018 vdev_copy_path_search(stvd, dvd->vdev_child[i]); 2019 } 2020 2021 if (!dvd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(dvd)) 2022 return; 2023 2024 /* 2025 * The idea here is that while a vdev can shift positions within 2026 * a top vdev (when replacing, attaching mirror, etc.) it cannot 2027 * step outside of it. 2028 */ 2029 vdev_t *vd = vdev_lookup_by_guid(stvd, dvd->vdev_guid); 2030 2031 if (vd == NULL || vd->vdev_ops != dvd->vdev_ops) 2032 return; 2033 2034 ASSERT(vd->vdev_ops->vdev_op_leaf); 2035 2036 vdev_copy_path_impl(vd, dvd); 2037} 2038 2039/* 2040 * Recursively copy vdev paths from one root vdev to another. Source and 2041 * destination vdev trees may differ in geometry. For each destination leaf 2042 * vdev, search a vdev with the same guid and top vdev id in the source. 2043 * Intended to copy paths from userland config into MOS config. 2044 */ 2045void 2046vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd) 2047{ 2048 uint64_t children = MIN(srvd->vdev_children, drvd->vdev_children); 2049 ASSERT(srvd->vdev_ops == &vdev_root_ops); 2050 ASSERT(drvd->vdev_ops == &vdev_root_ops); 2051 2052 for (uint64_t i = 0; i < children; i++) { 2053 vdev_copy_path_search(srvd->vdev_child[i], 2054 drvd->vdev_child[i]); 2055 } 2056} 2057 2058/* 2059 * Close a virtual device. 2060 */ 2061void 2062vdev_close(vdev_t *vd) 2063{ 2064 spa_t *spa = vd->vdev_spa; 2065 vdev_t *pvd = vd->vdev_parent; 2066 2067 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 2068 2069 /* 2070 * If our parent is reopening, then we are as well, unless we are 2071 * going offline. 2072 */ 2073 if (pvd != NULL && pvd->vdev_reopening) 2074 vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline); 2075 2076 vd->vdev_ops->vdev_op_close(vd); 2077 2078 vdev_cache_purge(vd); 2079 2080 if (vd->vdev_ops->vdev_op_leaf) 2081 trim_map_destroy(vd); 2082 2083 /* 2084 * We record the previous state before we close it, so that if we are 2085 * doing a reopen(), we don't generate FMA ereports if we notice that 2086 * it's still faulted. 
2087 */ 2088 vd->vdev_prevstate = vd->vdev_state; 2089 2090 if (vd->vdev_offline) 2091 vd->vdev_state = VDEV_STATE_OFFLINE; 2092 else 2093 vd->vdev_state = VDEV_STATE_CLOSED; 2094 vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 2095} 2096 2097void 2098vdev_hold(vdev_t *vd) 2099{ 2100 spa_t *spa = vd->vdev_spa; 2101 2102 ASSERT(spa_is_root(spa)); 2103 if (spa->spa_state == POOL_STATE_UNINITIALIZED) 2104 return; 2105 2106 for (int c = 0; c < vd->vdev_children; c++) 2107 vdev_hold(vd->vdev_child[c]); 2108 2109 if (vd->vdev_ops->vdev_op_leaf) 2110 vd->vdev_ops->vdev_op_hold(vd); 2111} 2112 2113void 2114vdev_rele(vdev_t *vd) 2115{ 2116 spa_t *spa = vd->vdev_spa; 2117 2118 ASSERT(spa_is_root(spa)); 2119 for (int c = 0; c < vd->vdev_children; c++) 2120 vdev_rele(vd->vdev_child[c]); 2121 2122 if (vd->vdev_ops->vdev_op_leaf) 2123 vd->vdev_ops->vdev_op_rele(vd); 2124} 2125 2126/* 2127 * Reopen all interior vdevs and any unopened leaves. We don't actually 2128 * reopen leaf vdevs which had previously been opened as they might deadlock 2129 * on the spa_config_lock. Instead we only obtain the leaf's physical size. 2130 * If the leaf has never been opened then open it, as usual. 2131 */ 2132void 2133vdev_reopen(vdev_t *vd) 2134{ 2135 spa_t *spa = vd->vdev_spa; 2136 2137 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 2138 2139 /* set the reopening flag unless we're taking the vdev offline */ 2140 vd->vdev_reopening = !vd->vdev_offline; 2141 vdev_close(vd); 2142 (void) vdev_open(vd); 2143 2144 /* 2145 * Call vdev_validate() here to make sure we have the same device. 2146 * Otherwise, a device with an invalid label could be successfully 2147 * opened in response to vdev_reopen(). 2148 */ 2149 if (vd->vdev_aux) { 2150 (void) vdev_validate_aux(vd); 2151 if (vdev_readable(vd) && vdev_writeable(vd) && 2152 vd->vdev_aux == &spa->spa_l2cache && 2153 !l2arc_vdev_present(vd)) 2154 l2arc_add_vdev(spa, vd); 2155 } else { 2156 (void) vdev_validate(vd); 2157 } 2158 2159 /* 2160 * Reassess parent vdev's health. 2161 */ 2162 vdev_propagate_state(vd); 2163} 2164 2165int 2166vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) 2167{ 2168 int error; 2169 2170 /* 2171 * Normally, partial opens (e.g. of a mirror) are allowed. 2172 * For a create, however, we want to fail the request if 2173 * there are any components we can't open. 2174 */ 2175 error = vdev_open(vd); 2176 2177 if (error || vd->vdev_state != VDEV_STATE_HEALTHY) { 2178 vdev_close(vd); 2179 return (error ? error : ENXIO); 2180 } 2181 2182 /* 2183 * Recursively load DTLs and initialize all labels. 2184 */ 2185 if ((error = vdev_dtl_load(vd)) != 0 || 2186 (error = vdev_label_init(vd, txg, isreplacing ? 2187 VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { 2188 vdev_close(vd); 2189 return (error); 2190 } 2191 2192 return (0); 2193} 2194 2195void 2196vdev_metaslab_set_size(vdev_t *vd) 2197{ 2198 uint64_t asize = vd->vdev_asize; 2199 uint64_t ms_count = asize >> vdev_default_ms_shift; 2200 uint64_t ms_shift; 2201 2202 /* 2203 * There are two dimensions to the metaslab sizing calculation: 2204 * the size of the metaslab and the count of metaslabs per vdev. 2205 * In general, we aim for vdev_max_ms_count (200) metaslabs. The 2206 * range of the dimensions are as follows: 2207 * 2208 * 2^29 <= ms_size <= 2^38 2209 * 16 <= ms_count <= 131,072 2210 * 2211 * On the lower end of vdev sizes, we aim for metaslabs sizes of 2212 * at least 512MB (2^29) to minimize fragmentation effects when 2213 * testing with smaller devices. 
However, the count constraint 2214 * of at least 16 metaslabs will override this minimum size goal. 2215 * 2216 * On the upper end of vdev sizes, we aim for a maximum metaslab 2217 * size of 256GB. However, we will cap the total count to 2^17 2218 * metaslabs to keep our memory footprint in check. 2219 * 2220 * The net effect of applying above constrains is summarized below. 2221 * 2222 * vdev size metaslab count 2223 * -------------|----------------- 2224 * < 8GB ~16 2225 * 8GB - 100GB one per 512MB 2226 * 100GB - 50TB ~200 2227 * 50TB - 32PB one per 256GB 2228 * > 32PB ~131,072 2229 * ------------------------------- 2230 */ 2231 2232 if (ms_count < vdev_min_ms_count) 2233 ms_shift = highbit64(asize / vdev_min_ms_count); 2234 else if (ms_count > vdev_max_ms_count) 2235 ms_shift = highbit64(asize / vdev_max_ms_count); 2236 else 2237 ms_shift = vdev_default_ms_shift; 2238 2239 if (ms_shift < SPA_MAXBLOCKSHIFT) { 2240 ms_shift = SPA_MAXBLOCKSHIFT; 2241 } else if (ms_shift > vdev_max_ms_shift) { 2242 ms_shift = vdev_max_ms_shift; 2243 /* cap the total count to constrain memory footprint */ 2244 if ((asize >> ms_shift) > vdev_ms_count_limit) 2245 ms_shift = highbit64(asize / vdev_ms_count_limit); 2246 } 2247 2248 vd->vdev_ms_shift = ms_shift; 2249 ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT); 2250} 2251 2252/* 2253 * Maximize performance by inflating the configured ashift for top level 2254 * vdevs to be as close to the physical ashift as possible while maintaining 2255 * administrator defined limits and ensuring it doesn't go below the 2256 * logical ashift. 2257 */ 2258void 2259vdev_ashift_optimize(vdev_t *vd) 2260{ 2261 if (vd == vd->vdev_top) { 2262 if (vd->vdev_ashift < vd->vdev_physical_ashift) { 2263 vd->vdev_ashift = MIN( 2264 MAX(zfs_max_auto_ashift, vd->vdev_ashift), 2265 MAX(zfs_min_auto_ashift, vd->vdev_physical_ashift)); 2266 } else { 2267 /* 2268 * Unusual case where logical ashift > physical ashift 2269 * so we can't cap the calculated ashift based on max 2270 * ashift as that would cause failures. 2271 * We still check if we need to increase it to match 2272 * the min ashift. 2273 */ 2274 vd->vdev_ashift = MAX(zfs_min_auto_ashift, 2275 vd->vdev_ashift); 2276 } 2277 } 2278} 2279 2280void 2281vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) 2282{ 2283 ASSERT(vd == vd->vdev_top); 2284 /* indirect vdevs don't have metaslabs or dtls */ 2285 ASSERT(vdev_is_concrete(vd) || flags == 0); 2286 ASSERT(ISP2(flags)); 2287 ASSERT(spa_writeable(vd->vdev_spa)); 2288 2289 if (flags & VDD_METASLAB) 2290 (void) txg_list_add(&vd->vdev_ms_list, arg, txg); 2291 2292 if (flags & VDD_DTL) 2293 (void) txg_list_add(&vd->vdev_dtl_list, arg, txg); 2294 2295 (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); 2296} 2297 2298void 2299vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg) 2300{ 2301 for (int c = 0; c < vd->vdev_children; c++) 2302 vdev_dirty_leaves(vd->vdev_child[c], flags, txg); 2303 2304 if (vd->vdev_ops->vdev_op_leaf) 2305 vdev_dirty(vd->vdev_top, flags, vd, txg); 2306} 2307 2308/* 2309 * DTLs. 2310 * 2311 * A vdev's DTL (dirty time log) is the set of transaction groups for which 2312 * the vdev has less than perfect replication. 
There are four kinds of DTL: 2313 * 2314 * DTL_MISSING: txgs for which the vdev has no valid copies of the data 2315 * 2316 * DTL_PARTIAL: txgs for which data is available, but not fully replicated 2317 * 2318 * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon 2319 * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of 2320 * txgs that was scrubbed. 2321 * 2322 * DTL_OUTAGE: txgs which cannot currently be read, whether due to 2323 * persistent errors or just some device being offline. 2324 * Unlike the other three, the DTL_OUTAGE map is not generally 2325 * maintained; it's only computed when needed, typically to 2326 * determine whether a device can be detached. 2327 * 2328 * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device 2329 * either has the data or it doesn't. 2330 * 2331 * For interior vdevs such as mirror and RAID-Z the picture is more complex. 2332 * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because 2333 * if any child is less than fully replicated, then so is its parent. 2334 * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs, 2335 * comprising only those txgs which appear in 'maxfaults' or more children; 2336 * those are the txgs we don't have enough replication to read. For example, 2337 * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2); 2338 * thus, its DTL_MISSING consists of the set of txgs that appear in more than 2339 * two child DTL_MISSING maps. 2340 * 2341 * It should be clear from the above that to compute the DTLs and outage maps 2342 * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps. 2343 * Therefore, that is all we keep on disk. When loading the pool, or after 2344 * a configuration change, we generate all other DTLs from first principles. 2345 */ 2346 void 2347 vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) 2348 { 2349 range_tree_t *rt = vd->vdev_dtl[t]; 2350 2351 ASSERT(t < DTL_TYPES); 2352 ASSERT(vd != vd->vdev_spa->spa_root_vdev); 2353 ASSERT(spa_writeable(vd->vdev_spa)); 2354 2355 mutex_enter(&vd->vdev_dtl_lock); 2356 if (!range_tree_contains(rt, txg, size)) 2357 range_tree_add(rt, txg, size); 2358 mutex_exit(&vd->vdev_dtl_lock); 2359 } 2360 2361 boolean_t 2362 vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) 2363 { 2364 range_tree_t *rt = vd->vdev_dtl[t]; 2365 boolean_t dirty = B_FALSE; 2366 2367 ASSERT(t < DTL_TYPES); 2368 ASSERT(vd != vd->vdev_spa->spa_root_vdev); 2369 2370 /* 2371 * While we are loading the pool, the DTLs have not been loaded yet. 2372 * Ignore the DTLs and try all devices. This avoids a recursive 2373 * mutex enter on the vdev_dtl_lock, and also makes us try hard 2374 * when loading the pool (relying on the checksum to ensure that 2375 * we get the right data -- note that while loading, we are 2376 * only reading the MOS, which is always checksummed).
2377 */ 2378 if (vd->vdev_spa->spa_load_state != SPA_LOAD_NONE) 2379 return (B_FALSE); 2380 2381 mutex_enter(&vd->vdev_dtl_lock); 2382 if (!range_tree_is_empty(rt)) 2383 dirty = range_tree_contains(rt, txg, size); 2384 mutex_exit(&vd->vdev_dtl_lock); 2385 2386 return (dirty); 2387} 2388 2389boolean_t 2390vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) 2391{ 2392 range_tree_t *rt = vd->vdev_dtl[t]; 2393 boolean_t empty; 2394 2395 mutex_enter(&vd->vdev_dtl_lock); 2396 empty = range_tree_is_empty(rt); 2397 mutex_exit(&vd->vdev_dtl_lock); 2398 2399 return (empty); 2400} 2401 2402/* 2403 * Returns B_TRUE if vdev determines offset needs to be resilvered. 2404 */ 2405boolean_t 2406vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) 2407{ 2408 ASSERT(vd != vd->vdev_spa->spa_root_vdev); 2409 2410 if (vd->vdev_ops->vdev_op_need_resilver == NULL || 2411 vd->vdev_ops->vdev_op_leaf) 2412 return (B_TRUE); 2413 2414 return (vd->vdev_ops->vdev_op_need_resilver(vd, offset, psize)); 2415} 2416 2417/* 2418 * Returns the lowest txg in the DTL range. 2419 */ 2420static uint64_t 2421vdev_dtl_min(vdev_t *vd) 2422{ 2423 range_seg_t *rs; 2424 2425 ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); 2426 ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); 2427 ASSERT0(vd->vdev_children); 2428 2429 rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root); 2430 return (rs->rs_start - 1); 2431} 2432 2433/* 2434 * Returns the highest txg in the DTL. 2435 */ 2436static uint64_t 2437vdev_dtl_max(vdev_t *vd) 2438{ 2439 range_seg_t *rs; 2440 2441 ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); 2442 ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); 2443 ASSERT0(vd->vdev_children); 2444 2445 rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root); 2446 return (rs->rs_end); 2447} 2448 2449/* 2450 * Determine if a resilvering vdev should remove any DTL entries from 2451 * its range. If the vdev was resilvering for the entire duration of the 2452 * scan then it should excise that range from its DTLs. Otherwise, this 2453 * vdev is considered partially resilvered and should leave its DTL 2454 * entries intact. The comment in vdev_dtl_reassess() describes how we 2455 * excise the DTLs. 2456 */ 2457static boolean_t 2458vdev_dtl_should_excise(vdev_t *vd) 2459{ 2460 spa_t *spa = vd->vdev_spa; 2461 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; 2462 2463 ASSERT0(scn->scn_phys.scn_errors); 2464 ASSERT0(vd->vdev_children); 2465 2466 if (vd->vdev_state < VDEV_STATE_DEGRADED) 2467 return (B_FALSE); 2468 2469 if (vd->vdev_resilver_txg == 0 || 2470 range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) 2471 return (B_TRUE); 2472 2473 /* 2474 * When a resilver is initiated the scan will assign the scn_max_txg 2475 * value to the highest txg value that exists in all DTLs. If this 2476 * device's max DTL is not part of this scan (i.e. it is not in 2477 * the range (scn_min_txg, scn_max_txg] then it is not eligible 2478 * for excision. 2479 */ 2480 if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { 2481 ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd)); 2482 ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg); 2483 ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg); 2484 return (B_TRUE); 2485 } 2486 return (B_FALSE); 2487} 2488 2489/* 2490 * Reassess DTLs after a config change or scrub completion. 
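 *
 * The tree is walked bottom-up: children are reassessed before their
 * parent.  For a leaf vdev we may excise the freshly scrubbed range from
 * DTL_MISSING and then rederive DTL_PARTIAL and DTL_OUTAGE from it.  For
 * an interior vdev each DTL is rebuilt from the children's maps using a
 * space reference tree and a minimum reference count that depends on the
 * vdev type (see the comment above vdev_dtl_dirty()).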
2491 */ 2492void 2493vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) 2494{ 2495 spa_t *spa = vd->vdev_spa; 2496 avl_tree_t reftree; 2497 int minref; 2498 2499 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 2500 2501 for (int c = 0; c < vd->vdev_children; c++) 2502 vdev_dtl_reassess(vd->vdev_child[c], txg, 2503 scrub_txg, scrub_done); 2504 2505 if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux) 2506 return; 2507 2508 if (vd->vdev_ops->vdev_op_leaf) { 2509 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; 2510 2511 mutex_enter(&vd->vdev_dtl_lock); 2512 2513 /* 2514 * If we've completed a scan cleanly then determine 2515 * if this vdev should remove any DTLs. We only want to 2516 * excise regions on vdevs that were available during 2517 * the entire duration of this scan. 2518 */ 2519 if (scrub_txg != 0 && 2520 (spa->spa_scrub_started || 2521 (scn != NULL && scn->scn_phys.scn_errors == 0)) && 2522 vdev_dtl_should_excise(vd)) { 2523 /* 2524 * We completed a scrub up to scrub_txg. If we 2525 * did it without rebooting, then the scrub dtl 2526 * will be valid, so excise the old region and 2527 * fold in the scrub dtl. Otherwise, leave the 2528 * dtl as-is if there was an error. 2529 * 2530 * There's little trick here: to excise the beginning 2531 * of the DTL_MISSING map, we put it into a reference 2532 * tree and then add a segment with refcnt -1 that 2533 * covers the range [0, scrub_txg). This means 2534 * that each txg in that range has refcnt -1 or 0. 2535 * We then add DTL_SCRUB with a refcnt of 2, so that 2536 * entries in the range [0, scrub_txg) will have a 2537 * positive refcnt -- either 1 or 2. We then convert 2538 * the reference tree into the new DTL_MISSING map. 2539 */ 2540 space_reftree_create(&reftree); 2541 space_reftree_add_map(&reftree, 2542 vd->vdev_dtl[DTL_MISSING], 1); 2543 space_reftree_add_seg(&reftree, 0, scrub_txg, -1); 2544 space_reftree_add_map(&reftree, 2545 vd->vdev_dtl[DTL_SCRUB], 2); 2546 space_reftree_generate_map(&reftree, 2547 vd->vdev_dtl[DTL_MISSING], 1); 2548 space_reftree_destroy(&reftree); 2549 } 2550 range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); 2551 range_tree_walk(vd->vdev_dtl[DTL_MISSING], 2552 range_tree_add, vd->vdev_dtl[DTL_PARTIAL]); 2553 if (scrub_done) 2554 range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL); 2555 range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); 2556 if (!vdev_readable(vd)) 2557 range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); 2558 else 2559 range_tree_walk(vd->vdev_dtl[DTL_MISSING], 2560 range_tree_add, vd->vdev_dtl[DTL_OUTAGE]); 2561 2562 /* 2563 * If the vdev was resilvering and no longer has any 2564 * DTLs then reset its resilvering flag and dirty 2565 * the top level so that we persist the change. 2566 */ 2567 if (vd->vdev_resilver_txg != 0 && 2568 range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && 2569 range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) { 2570 vd->vdev_resilver_txg = 0; 2571 vdev_config_dirty(vd->vdev_top); 2572 } 2573 2574 mutex_exit(&vd->vdev_dtl_lock); 2575 2576 if (txg != 0) 2577 vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); 2578 return; 2579 } 2580 2581 mutex_enter(&vd->vdev_dtl_lock); 2582 for (int t = 0; t < DTL_TYPES; t++) { 2583 /* account for child's outage in parent's missing map */ 2584 int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; 2585 if (t == DTL_SCRUB) 2586 continue; /* leaf vdevs only */ 2587 if (t == DTL_PARTIAL) 2588 minref = 1; /* i.e. 
non-zero */ 2589 else if (vd->vdev_nparity != 0) 2590 minref = vd->vdev_nparity + 1; /* RAID-Z */ 2591 else 2592 minref = vd->vdev_children; /* any kind of mirror */ 2593 space_reftree_create(&reftree); 2594 for (int c = 0; c < vd->vdev_children; c++) { 2595 vdev_t *cvd = vd->vdev_child[c]; 2596 mutex_enter(&cvd->vdev_dtl_lock); 2597 space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1); 2598 mutex_exit(&cvd->vdev_dtl_lock); 2599 } 2600 space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); 2601 space_reftree_destroy(&reftree); 2602 } 2603 mutex_exit(&vd->vdev_dtl_lock); 2604} 2605 2606int 2607vdev_dtl_load(vdev_t *vd) 2608{ 2609 spa_t *spa = vd->vdev_spa; 2610 objset_t *mos = spa->spa_meta_objset; 2611 int error = 0; 2612 2613 if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) { 2614 ASSERT(vdev_is_concrete(vd)); 2615 2616 error = space_map_open(&vd->vdev_dtl_sm, mos, 2617 vd->vdev_dtl_object, 0, -1ULL, 0); 2618 if (error) 2619 return (error); 2620 ASSERT(vd->vdev_dtl_sm != NULL); 2621 2622 mutex_enter(&vd->vdev_dtl_lock); 2623 2624 /* 2625 * Now that we've opened the space_map we need to update 2626 * the in-core DTL. 2627 */ 2628 space_map_update(vd->vdev_dtl_sm); 2629 2630 error = space_map_load(vd->vdev_dtl_sm, 2631 vd->vdev_dtl[DTL_MISSING], SM_ALLOC); 2632 mutex_exit(&vd->vdev_dtl_lock); 2633 2634 return (error); 2635 } 2636 2637 for (int c = 0; c < vd->vdev_children; c++) { 2638 error = vdev_dtl_load(vd->vdev_child[c]); 2639 if (error != 0) 2640 break; 2641 } 2642 2643 return (error); 2644} 2645 2646void 2647vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx) 2648{ 2649 spa_t *spa = vd->vdev_spa; 2650 2651 VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx)); 2652 VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, 2653 zapobj, tx)); 2654} 2655 2656uint64_t 2657vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx) 2658{ 2659 spa_t *spa = vd->vdev_spa; 2660 uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, 2661 DMU_OT_NONE, 0, tx); 2662 2663 ASSERT(zap != 0); 2664 VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, 2665 zap, tx)); 2666 2667 return (zap); 2668} 2669 2670void 2671vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx) 2672{ 2673 if (vd->vdev_ops != &vdev_hole_ops && 2674 vd->vdev_ops != &vdev_missing_ops && 2675 vd->vdev_ops != &vdev_root_ops && 2676 !vd->vdev_top->vdev_removing) { 2677 if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) { 2678 vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx); 2679 } 2680 if (vd == vd->vdev_top && vd->vdev_top_zap == 0) { 2681 vd->vdev_top_zap = vdev_create_link_zap(vd, tx); 2682 } 2683 } 2684 for (uint64_t i = 0; i < vd->vdev_children; i++) { 2685 vdev_construct_zaps(vd->vdev_child[i], tx); 2686 } 2687} 2688 2689void 2690vdev_dtl_sync(vdev_t *vd, uint64_t txg) 2691{ 2692 spa_t *spa = vd->vdev_spa; 2693 range_tree_t *rt = vd->vdev_dtl[DTL_MISSING]; 2694 objset_t *mos = spa->spa_meta_objset; 2695 range_tree_t *rtsync; 2696 dmu_tx_t *tx; 2697 uint64_t object = space_map_object(vd->vdev_dtl_sm); 2698 2699 ASSERT(vdev_is_concrete(vd)); 2700 ASSERT(vd->vdev_ops->vdev_op_leaf); 2701 2702 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2703 2704 if (vd->vdev_detached || vd->vdev_top->vdev_removing) { 2705 mutex_enter(&vd->vdev_dtl_lock); 2706 space_map_free(vd->vdev_dtl_sm, tx); 2707 space_map_close(vd->vdev_dtl_sm); 2708 vd->vdev_dtl_sm = NULL; 2709 mutex_exit(&vd->vdev_dtl_lock); 2710 2711 /* 2712 * We only destroy the leaf ZAP for detached leaves or for 2713 * 
removed log devices. Removed data devices handle leaf ZAP 2714 * cleanup later, once cancellation is no longer possible. 2715 */ 2716 if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached || 2717 vd->vdev_top->vdev_islog)) { 2718 vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx); 2719 vd->vdev_leaf_zap = 0; 2720 } 2721 2722 dmu_tx_commit(tx); 2723 return; 2724 } 2725 2726 if (vd->vdev_dtl_sm == NULL) { 2727 uint64_t new_object; 2728 2729 new_object = space_map_alloc(mos, vdev_dtl_sm_blksz, tx); 2730 VERIFY3U(new_object, !=, 0); 2731 2732 VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, 2733 0, -1ULL, 0)); 2734 ASSERT(vd->vdev_dtl_sm != NULL); 2735 } 2736 2737 rtsync = range_tree_create(NULL, NULL); 2738 2739 mutex_enter(&vd->vdev_dtl_lock); 2740 range_tree_walk(rt, range_tree_add, rtsync); 2741 mutex_exit(&vd->vdev_dtl_lock); 2742 2743 space_map_truncate(vd->vdev_dtl_sm, vdev_dtl_sm_blksz, tx); 2744 space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx); 2745 range_tree_vacate(rtsync, NULL, NULL); 2746 2747 range_tree_destroy(rtsync); 2748 2749 /* 2750 * If the object for the space map has changed then dirty 2751 * the top level so that we update the config. 2752 */ 2753 if (object != space_map_object(vd->vdev_dtl_sm)) { 2754 vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, " 2755 "new object %llu", (u_longlong_t)txg, spa_name(spa), 2756 (u_longlong_t)object, 2757 (u_longlong_t)space_map_object(vd->vdev_dtl_sm)); 2758 vdev_config_dirty(vd->vdev_top); 2759 } 2760 2761 dmu_tx_commit(tx); 2762 2763 mutex_enter(&vd->vdev_dtl_lock); 2764 space_map_update(vd->vdev_dtl_sm); 2765 mutex_exit(&vd->vdev_dtl_lock); 2766} 2767 2768/* 2769 * Determine whether the specified vdev can be offlined/detached/removed 2770 * without losing data. 2771 */ 2772boolean_t 2773vdev_dtl_required(vdev_t *vd) 2774{ 2775 spa_t *spa = vd->vdev_spa; 2776 vdev_t *tvd = vd->vdev_top; 2777 uint8_t cant_read = vd->vdev_cant_read; 2778 boolean_t required; 2779 2780 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 2781 2782 if (vd == spa->spa_root_vdev || vd == tvd) 2783 return (B_TRUE); 2784 2785 /* 2786 * Temporarily mark the device as unreadable, and then determine 2787 * whether this results in any DTL outages in the top-level vdev. 2788 * If not, we can safely offline/detach/remove the device. 2789 */ 2790 vd->vdev_cant_read = B_TRUE; 2791 vdev_dtl_reassess(tvd, 0, 0, B_FALSE); 2792 required = !vdev_dtl_empty(tvd, DTL_OUTAGE); 2793 vd->vdev_cant_read = cant_read; 2794 vdev_dtl_reassess(tvd, 0, 0, B_FALSE); 2795 2796 if (!required && zio_injection_enabled) 2797 required = !!zio_handle_device_injection(vd, NULL, ECHILD); 2798 2799 return (required); 2800} 2801 2802/* 2803 * Determine if resilver is needed, and if so the txg range. 
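 *
 * For a leaf vdev the decision is based on its own DTL_MISSING map and
 * whether the device is writeable; for an interior vdev the children are
 * examined recursively and the reported range is the union of the
 * children's ranges.  When a resilver is needed and 'minp' is non-NULL,
 * *minp and *maxp are set to the txg range to resilver.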
2804 */ 2805boolean_t 2806vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) 2807{ 2808 boolean_t needed = B_FALSE; 2809 uint64_t thismin = UINT64_MAX; 2810 uint64_t thismax = 0; 2811 2812 if (vd->vdev_children == 0) { 2813 mutex_enter(&vd->vdev_dtl_lock); 2814 if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && 2815 vdev_writeable(vd)) { 2816 2817 thismin = vdev_dtl_min(vd); 2818 thismax = vdev_dtl_max(vd); 2819 needed = B_TRUE; 2820 } 2821 mutex_exit(&vd->vdev_dtl_lock); 2822 } else { 2823 for (int c = 0; c < vd->vdev_children; c++) { 2824 vdev_t *cvd = vd->vdev_child[c]; 2825 uint64_t cmin, cmax; 2826 2827 if (vdev_resilver_needed(cvd, &cmin, &cmax)) { 2828 thismin = MIN(thismin, cmin); 2829 thismax = MAX(thismax, cmax); 2830 needed = B_TRUE; 2831 } 2832 } 2833 } 2834 2835 if (needed && minp) { 2836 *minp = thismin; 2837 *maxp = thismax; 2838 } 2839 return (needed); 2840} 2841 2842/* 2843 * Gets the checkpoint space map object from the vdev's ZAP. 2844 * Returns the spacemap object, or 0 if it wasn't in the ZAP 2845 * or the ZAP doesn't exist yet. 2846 */ 2847int 2848vdev_checkpoint_sm_object(vdev_t *vd) 2849{ 2850 ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); 2851 if (vd->vdev_top_zap == 0) { 2852 return (0); 2853 } 2854 2855 uint64_t sm_obj = 0; 2856 int err = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, 2857 VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, &sm_obj); 2858 2859 ASSERT(err == 0 || err == ENOENT); 2860 2861 return (sm_obj); 2862} 2863 2864int 2865vdev_load(vdev_t *vd) 2866{ 2867 int error = 0; 2868 /* 2869 * Recursively load all children. 2870 */ 2871 for (int c = 0; c < vd->vdev_children; c++) { 2872 error = vdev_load(vd->vdev_child[c]); 2873 if (error != 0) { 2874 return (error); 2875 } 2876 } 2877 2878 vdev_set_deflate_ratio(vd); 2879 2880 /* 2881 * If this is a top-level vdev, initialize its metaslabs. 2882 */ 2883 if (vd == vd->vdev_top && vdev_is_concrete(vd)) { 2884 if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) { 2885 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2886 VDEV_AUX_CORRUPT_DATA); 2887 vdev_dbgmsg(vd, "vdev_load: invalid size. ashift=%llu, " 2888 "asize=%llu", (u_longlong_t)vd->vdev_ashift, 2889 (u_longlong_t)vd->vdev_asize); 2890 return (SET_ERROR(ENXIO)); 2891 } else if ((error = vdev_metaslab_init(vd, 0)) != 0) { 2892 vdev_dbgmsg(vd, "vdev_load: metaslab_init failed " 2893 "[error=%d]", error); 2894 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2895 VDEV_AUX_CORRUPT_DATA); 2896 return (error); 2897 } 2898 2899 uint64_t checkpoint_sm_obj = vdev_checkpoint_sm_object(vd); 2900 if (checkpoint_sm_obj != 0) { 2901 objset_t *mos = spa_meta_objset(vd->vdev_spa); 2902 ASSERT(vd->vdev_asize != 0); 2903 ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL); 2904 2905 if ((error = space_map_open(&vd->vdev_checkpoint_sm, 2906 mos, checkpoint_sm_obj, 0, vd->vdev_asize, 2907 vd->vdev_ashift))) { 2908 vdev_dbgmsg(vd, "vdev_load: space_map_open " 2909 "failed for checkpoint spacemap (obj %llu) " 2910 "[error=%d]", 2911 (u_longlong_t)checkpoint_sm_obj, error); 2912 return (error); 2913 } 2914 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); 2915 space_map_update(vd->vdev_checkpoint_sm); 2916 2917 /* 2918 * Since the checkpoint_sm contains free entries 2919 * exclusively we can use sm_alloc to indicate the 2920 * culmulative checkpointed space that has been freed. 
2921 */ 2922 vd->vdev_stat.vs_checkpoint_space = 2923 -vd->vdev_checkpoint_sm->sm_alloc; 2924 vd->vdev_spa->spa_checkpoint_info.sci_dspace += 2925 vd->vdev_stat.vs_checkpoint_space; 2926 } 2927 } 2928 2929 /* 2930 * If this is a leaf vdev, load its DTL. 2931 */ 2932 if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) { 2933 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2934 VDEV_AUX_CORRUPT_DATA); 2935 vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed " 2936 "[error=%d]", error); 2937 return (error); 2938 } 2939 2940 uint64_t obsolete_sm_object = vdev_obsolete_sm_object(vd); 2941 if (obsolete_sm_object != 0) { 2942 objset_t *mos = vd->vdev_spa->spa_meta_objset; 2943 ASSERT(vd->vdev_asize != 0); 2944 ASSERT3P(vd->vdev_obsolete_sm, ==, NULL); 2945 2946 if ((error = space_map_open(&vd->vdev_obsolete_sm, mos, 2947 obsolete_sm_object, 0, vd->vdev_asize, 0))) { 2948 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2949 VDEV_AUX_CORRUPT_DATA); 2950 vdev_dbgmsg(vd, "vdev_load: space_map_open failed for " 2951 "obsolete spacemap (obj %llu) [error=%d]", 2952 (u_longlong_t)obsolete_sm_object, error); 2953 return (error); 2954 } 2955 space_map_update(vd->vdev_obsolete_sm); 2956 } 2957 2958 return (0); 2959 } 2960 2961 /* 2962 * The special vdev case is used for hot spares and l2cache devices. Its 2963 * sole purpose is to set the vdev state for the associated vdev. To do this, 2964 * we make sure that we can open the underlying device, then try to read the 2965 * label, and make sure that the label is sane and that it hasn't been 2966 * repurposed to another pool. 2967 */ 2968 int 2969 vdev_validate_aux(vdev_t *vd) 2970 { 2971 nvlist_t *label; 2972 uint64_t guid, version; 2973 uint64_t state; 2974 2975 if (!vdev_readable(vd)) 2976 return (0); 2977 2978 if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) { 2979 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 2980 VDEV_AUX_CORRUPT_DATA); 2981 return (-1); 2982 } 2983 2984 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || 2985 !SPA_VERSION_IS_SUPPORTED(version) || 2986 nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || 2987 guid != vd->vdev_guid || 2988 nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { 2989 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 2990 VDEV_AUX_CORRUPT_DATA); 2991 nvlist_free(label); 2992 return (-1); 2993 } 2994 2995 /* 2996 * We don't actually check the pool state here. If it's in fact in 2997 * use by another pool, we update this fact on the fly when requested. 2998 */ 2999 nvlist_free(label); 3000 return (0); 3001 } 3002 3003 /* 3004 * Free the objects used to store this vdev's spacemaps, and the array 3005 * that points to them.
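 *
 * The metaslab array object is read into memory, every non-zero space
 * map object it references is freed, and finally the array object itself
 * is freed and vdev_ms_array is cleared.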
3006 */ 3007void 3008vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx) 3009{ 3010 if (vd->vdev_ms_array == 0) 3011 return; 3012 3013 objset_t *mos = vd->vdev_spa->spa_meta_objset; 3014 uint64_t array_count = vd->vdev_asize >> vd->vdev_ms_shift; 3015 size_t array_bytes = array_count * sizeof (uint64_t); 3016 uint64_t *smobj_array = kmem_alloc(array_bytes, KM_SLEEP); 3017 VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0, 3018 array_bytes, smobj_array, 0)); 3019 3020 for (uint64_t i = 0; i < array_count; i++) { 3021 uint64_t smobj = smobj_array[i]; 3022 if (smobj == 0) 3023 continue; 3024 3025 space_map_free_obj(mos, smobj, tx); 3026 } 3027 3028 kmem_free(smobj_array, array_bytes); 3029 VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx)); 3030 vd->vdev_ms_array = 0; 3031} 3032 3033static void 3034vdev_remove_empty(vdev_t *vd, uint64_t txg) 3035{ 3036 spa_t *spa = vd->vdev_spa; 3037 dmu_tx_t *tx; 3038 3039 ASSERT(vd == vd->vdev_top); 3040 ASSERT3U(txg, ==, spa_syncing_txg(spa)); 3041 3042 if (vd->vdev_ms != NULL) { 3043 metaslab_group_t *mg = vd->vdev_mg; 3044 3045 metaslab_group_histogram_verify(mg); 3046 metaslab_class_histogram_verify(mg->mg_class); 3047 3048 for (int m = 0; m < vd->vdev_ms_count; m++) { 3049 metaslab_t *msp = vd->vdev_ms[m]; 3050 3051 if (msp == NULL || msp->ms_sm == NULL) 3052 continue; 3053 3054 mutex_enter(&msp->ms_lock); 3055 /* 3056 * If the metaslab was not loaded when the vdev 3057 * was removed then the histogram accounting may 3058 * not be accurate. Update the histogram information 3059 * here so that we ensure that the metaslab group 3060 * and metaslab class are up-to-date. 3061 */ 3062 metaslab_group_histogram_remove(mg, msp); 3063 3064 VERIFY0(space_map_allocated(msp->ms_sm)); 3065 space_map_close(msp->ms_sm); 3066 msp->ms_sm = NULL; 3067 mutex_exit(&msp->ms_lock); 3068 } 3069 3070 if (vd->vdev_checkpoint_sm != NULL) { 3071 ASSERT(spa_has_checkpoint(spa)); 3072 space_map_close(vd->vdev_checkpoint_sm); 3073 vd->vdev_checkpoint_sm = NULL; 3074 } 3075 3076 metaslab_group_histogram_verify(mg); 3077 metaslab_class_histogram_verify(mg->mg_class); 3078 for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 3079 ASSERT0(mg->mg_histogram[i]); 3080 } 3081 3082 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 3083 vdev_destroy_spacemaps(vd, tx); 3084 3085 if (vd->vdev_islog && vd->vdev_top_zap != 0) { 3086 vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx); 3087 vd->vdev_top_zap = 0; 3088 } 3089 dmu_tx_commit(tx); 3090} 3091 3092void 3093vdev_sync_done(vdev_t *vd, uint64_t txg) 3094{ 3095 metaslab_t *msp; 3096 boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); 3097 3098 ASSERT(vdev_is_concrete(vd)); 3099 3100 while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) 3101 != NULL) 3102 metaslab_sync_done(msp, txg); 3103 3104 if (reassess) 3105 metaslab_sync_reassess(vd->vdev_mg); 3106} 3107 3108void 3109vdev_sync(vdev_t *vd, uint64_t txg) 3110{ 3111 spa_t *spa = vd->vdev_spa; 3112 vdev_t *lvd; 3113 metaslab_t *msp; 3114 dmu_tx_t *tx; 3115 3116 if (range_tree_space(vd->vdev_obsolete_segments) > 0) { 3117 dmu_tx_t *tx; 3118 3119 ASSERT(vd->vdev_removing || 3120 vd->vdev_ops == &vdev_indirect_ops); 3121 3122 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 3123 vdev_indirect_sync_obsolete(vd, tx); 3124 dmu_tx_commit(tx); 3125 3126 /* 3127 * If the vdev is indirect, it can't have dirty 3128 * metaslabs or DTLs. 
3129 */ 3130 if (vd->vdev_ops == &vdev_indirect_ops) { 3131 ASSERT(txg_list_empty(&vd->vdev_ms_list, txg)); 3132 ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg)); 3133 return; 3134 } 3135 } 3136 3137 ASSERT(vdev_is_concrete(vd)); 3138 3139 if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 && 3140 !vd->vdev_removing) { 3141 ASSERT(vd == vd->vdev_top); 3142 ASSERT0(vd->vdev_indirect_config.vic_mapping_object); 3143 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 3144 vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, 3145 DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); 3146 ASSERT(vd->vdev_ms_array != 0); 3147 vdev_config_dirty(vd); 3148 dmu_tx_commit(tx); 3149 } 3150 3151 while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { 3152 metaslab_sync(msp, txg); 3153 (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); 3154 } 3155 3156 while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) 3157 vdev_dtl_sync(lvd, txg); 3158 3159 /* 3160 * Remove the metadata associated with this vdev once it's empty. 3161 * Note that this is typically used for log/cache device removal; 3162 * we don't empty toplevel vdevs when removing them. But if 3163 * a toplevel happens to be emptied, this is not harmful. 3164 */ 3165 if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) { 3166 vdev_remove_empty(vd, txg); 3167 } 3168 3169 (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); 3170} 3171 3172uint64_t 3173vdev_psize_to_asize(vdev_t *vd, uint64_t psize) 3174{ 3175 return (vd->vdev_ops->vdev_op_asize(vd, psize)); 3176} 3177 3178/* 3179 * Mark the given vdev faulted. A faulted vdev behaves as if the device could 3180 * not be opened, and no I/O is attempted. 3181 */ 3182int 3183vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) 3184{ 3185 vdev_t *vd, *tvd; 3186 3187 spa_vdev_state_enter(spa, SCL_NONE); 3188 3189 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 3190 return (spa_vdev_state_exit(spa, NULL, ENODEV)); 3191 3192 if (!vd->vdev_ops->vdev_op_leaf) 3193 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 3194 3195 tvd = vd->vdev_top; 3196 3197 /* 3198 * We don't directly use the aux state here, but if we do a 3199 * vdev_reopen(), we need this value to be present to remember why we 3200 * were faulted. 3201 */ 3202 vd->vdev_label_aux = aux; 3203 3204 /* 3205 * Faulted state takes precedence over degraded. 3206 */ 3207 vd->vdev_delayed_close = B_FALSE; 3208 vd->vdev_faulted = 1ULL; 3209 vd->vdev_degraded = 0ULL; 3210 vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); 3211 3212 /* 3213 * If this device has the only valid copy of the data, then 3214 * back off and simply mark the vdev as degraded instead. 3215 */ 3216 if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { 3217 vd->vdev_degraded = 1ULL; 3218 vd->vdev_faulted = 0ULL; 3219 3220 /* 3221 * If we reopen the device and it's not dead, only then do we 3222 * mark it degraded. 3223 */ 3224 vdev_reopen(tvd); 3225 3226 if (vdev_readable(vd)) 3227 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); 3228 } 3229 3230 return (spa_vdev_state_exit(spa, vd, 0)); 3231} 3232 3233/* 3234 * Mark the given vdev degraded. A degraded vdev is purely an indication to the 3235 * user that something is wrong. The vdev continues to operate as normal as far 3236 * as I/O is concerned. 
3237 */ 3238int 3239vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) 3240{ 3241 vdev_t *vd; 3242 3243 spa_vdev_state_enter(spa, SCL_NONE); 3244 3245 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 3246 return (spa_vdev_state_exit(spa, NULL, ENODEV)); 3247 3248 if (!vd->vdev_ops->vdev_op_leaf) 3249 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 3250 3251 /* 3252 * If the vdev is already faulted, then don't do anything. 3253 */ 3254 if (vd->vdev_faulted || vd->vdev_degraded) 3255 return (spa_vdev_state_exit(spa, NULL, 0)); 3256 3257 vd->vdev_degraded = 1ULL; 3258 if (!vdev_is_dead(vd)) 3259 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, 3260 aux); 3261 3262 return (spa_vdev_state_exit(spa, vd, 0)); 3263} 3264 3265/* 3266 * Online the given vdev. 3267 * 3268 * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached 3269 * spare device should be detached when the device finishes resilvering. 3270 * Second, the online should be treated like a 'test' online case, so no FMA 3271 * events are generated if the device fails to open. 3272 */ 3273int 3274vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) 3275{ 3276 vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; 3277 boolean_t wasoffline; 3278 vdev_state_t oldstate; 3279 3280 spa_vdev_state_enter(spa, SCL_NONE); 3281 3282 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 3283 return (spa_vdev_state_exit(spa, NULL, ENODEV)); 3284 3285 if (!vd->vdev_ops->vdev_op_leaf) 3286 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 3287 3288 wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline); 3289 oldstate = vd->vdev_state; 3290 3291 tvd = vd->vdev_top; 3292 vd->vdev_offline = B_FALSE; 3293 vd->vdev_tmpoffline = B_FALSE; 3294 vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); 3295 vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); 3296 3297 /* XXX - L2ARC 1.0 does not support expansion */ 3298 if (!vd->vdev_aux) { 3299 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) 3300 pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND); 3301 } 3302 3303 vdev_reopen(tvd); 3304 vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; 3305 3306 if (!vd->vdev_aux) { 3307 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) 3308 pvd->vdev_expanding = B_FALSE; 3309 } 3310 3311 if (newstate) 3312 *newstate = vd->vdev_state; 3313 if ((flags & ZFS_ONLINE_UNSPARE) && 3314 !vdev_is_dead(vd) && vd->vdev_parent && 3315 vd->vdev_parent->vdev_ops == &vdev_spare_ops && 3316 vd->vdev_parent->vdev_child[0] == vd) 3317 vd->vdev_unspare = B_TRUE; 3318 3319 if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) { 3320 3321 /* XXX - L2ARC 1.0 does not support expansion */ 3322 if (vd->vdev_aux) 3323 return (spa_vdev_state_exit(spa, vd, ENOTSUP)); 3324 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 3325 } 3326 3327 /* Restart initializing if necessary */ 3328 mutex_enter(&vd->vdev_initialize_lock); 3329 if (vdev_writeable(vd) && 3330 vd->vdev_initialize_thread == NULL && 3331 vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) { 3332 (void) vdev_initialize(vd); 3333 } 3334 mutex_exit(&vd->vdev_initialize_lock); 3335 3336 if (wasoffline || 3337 (oldstate < VDEV_STATE_DEGRADED && 3338 vd->vdev_state >= VDEV_STATE_DEGRADED)) 3339 spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE); 3340 3341 return (spa_vdev_state_exit(spa, vd, 0)); 3342} 3343 3344static int 3345vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) 3346{ 3347 vdev_t *vd, *tvd; 3348 int error = 0; 3349 uint64_t generation; 3350 
metaslab_group_t *mg; 3351 3352top: 3353 spa_vdev_state_enter(spa, SCL_ALLOC); 3354 3355 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 3356 return (spa_vdev_state_exit(spa, NULL, ENODEV)); 3357 3358 if (!vd->vdev_ops->vdev_op_leaf) 3359 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 3360 3361 tvd = vd->vdev_top; 3362 mg = tvd->vdev_mg; 3363 generation = spa->spa_config_generation + 1; 3364 3365 /* 3366 * If the device isn't already offline, try to offline it. 3367 */ 3368 if (!vd->vdev_offline) { 3369 /* 3370 * If this device has the only valid copy of some data, 3371 * don't allow it to be offlined. Log devices are always 3372 * expendable. 3373 */ 3374 if (!tvd->vdev_islog && vd->vdev_aux == NULL && 3375 vdev_dtl_required(vd)) 3376 return (spa_vdev_state_exit(spa, NULL, EBUSY)); 3377 3378 /* 3379 * If the top-level is a slog and it has had allocations 3380 * then proceed. We check that the vdev's metaslab group 3381 * is not NULL since it's possible that we may have just 3382 * added this vdev but not yet initialized its metaslabs. 3383 */ 3384 if (tvd->vdev_islog && mg != NULL) { 3385 /* 3386 * Prevent any future allocations. 3387 */ 3388 metaslab_group_passivate(mg); 3389 (void) spa_vdev_state_exit(spa, vd, 0); 3390 3391 error = spa_reset_logs(spa); 3392 3393 /* 3394 * If the log device was successfully reset but has 3395 * checkpointed data, do not offline it. 3396 */ 3397 if (error == 0 && 3398 tvd->vdev_checkpoint_sm != NULL) { 3399 ASSERT3U(tvd->vdev_checkpoint_sm->sm_alloc, 3400 !=, 0); 3401 error = ZFS_ERR_CHECKPOINT_EXISTS; 3402 } 3403 3404 spa_vdev_state_enter(spa, SCL_ALLOC); 3405 3406 /* 3407 * Check to see if the config has changed. 3408 */ 3409 if (error || generation != spa->spa_config_generation) { 3410 metaslab_group_activate(mg); 3411 if (error) 3412 return (spa_vdev_state_exit(spa, 3413 vd, error)); 3414 (void) spa_vdev_state_exit(spa, vd, 0); 3415 goto top; 3416 } 3417 ASSERT0(tvd->vdev_stat.vs_alloc); 3418 } 3419 3420 /* 3421 * Offline this device and reopen its top-level vdev. 3422 * If the top-level vdev is a log device then just offline 3423 * it. Otherwise, if this action results in the top-level 3424 * vdev becoming unusable, undo it and fail the request. 3425 */ 3426 vd->vdev_offline = B_TRUE; 3427 vdev_reopen(tvd); 3428 3429 if (!tvd->vdev_islog && vd->vdev_aux == NULL && 3430 vdev_is_dead(tvd)) { 3431 vd->vdev_offline = B_FALSE; 3432 vdev_reopen(tvd); 3433 return (spa_vdev_state_exit(spa, NULL, EBUSY)); 3434 } 3435 3436 /* 3437 * Add the device back into the metaslab rotor so that 3438 * once we online the device it's open for business. 3439 */ 3440 if (tvd->vdev_islog && mg != NULL) 3441 metaslab_group_activate(mg); 3442 } 3443 3444 vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); 3445 3446 return (spa_vdev_state_exit(spa, vd, 0)); 3447} 3448 3449int 3450vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) 3451{ 3452 int error; 3453 3454 mutex_enter(&spa->spa_vdev_top_lock); 3455 error = vdev_offline_locked(spa, guid, flags); 3456 mutex_exit(&spa->spa_vdev_top_lock); 3457 3458 return (error); 3459} 3460 3461/* 3462 * Clear the error counts associated with this vdev. Unlike vdev_online() and 3463 * vdev_offline(), we assume the spa config is locked. We also clear all 3464 * children. If 'vd' is NULL, then the user wants to clear all vdevs. 
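 *
 * Clearing the root vdev also clears the l2cache and spare vdevs.  If the
 * vdev was faulted, degraded, or had failed I/O, its persistent fault
 * state is cleared and the device is reopened; if it is then usable, a
 * resilver is requested.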
3465 */ 3466void 3467vdev_clear(spa_t *spa, vdev_t *vd) 3468{ 3469 vdev_t *rvd = spa->spa_root_vdev; 3470 3471 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 3472 3473 if (vd == NULL) 3474 vd = rvd; 3475 3476 vd->vdev_stat.vs_read_errors = 0; 3477 vd->vdev_stat.vs_write_errors = 0; 3478 vd->vdev_stat.vs_checksum_errors = 0; 3479 3480 for (int c = 0; c < vd->vdev_children; c++) 3481 vdev_clear(spa, vd->vdev_child[c]); 3482 3483 if (vd == rvd) { 3484 for (int c = 0; c < spa->spa_l2cache.sav_count; c++) 3485 vdev_clear(spa, spa->spa_l2cache.sav_vdevs[c]); 3486 3487 for (int c = 0; c < spa->spa_spares.sav_count; c++) 3488 vdev_clear(spa, spa->spa_spares.sav_vdevs[c]); 3489 } 3490 3491 /* 3492 * It makes no sense to "clear" an indirect vdev. 3493 */ 3494 if (!vdev_is_concrete(vd)) 3495 return; 3496 3497 /* 3498 * If we're in the FAULTED state or have experienced failed I/O, then 3499 * clear the persistent state and attempt to reopen the device. We 3500 * also mark the vdev config dirty, so that the new faulted state is 3501 * written out to disk. 3502 */ 3503 if (vd->vdev_faulted || vd->vdev_degraded || 3504 !vdev_readable(vd) || !vdev_writeable(vd)) { 3505 3506 /* 3507 * When reopening in reponse to a clear event, it may be due to 3508 * a fmadm repair request. In this case, if the device is 3509 * still broken, we want to still post the ereport again. 3510 */ 3511 vd->vdev_forcefault = B_TRUE; 3512 3513 vd->vdev_faulted = vd->vdev_degraded = 0ULL; 3514 vd->vdev_cant_read = B_FALSE; 3515 vd->vdev_cant_write = B_FALSE; 3516 3517 vdev_reopen(vd == rvd ? rvd : vd->vdev_top); 3518 3519 vd->vdev_forcefault = B_FALSE; 3520 3521 if (vd != rvd && vdev_writeable(vd->vdev_top)) 3522 vdev_state_dirty(vd->vdev_top); 3523 3524 if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) 3525 spa_async_request(spa, SPA_ASYNC_RESILVER); 3526 3527 spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR); 3528 } 3529 3530 /* 3531 * When clearing a FMA-diagnosed fault, we always want to 3532 * unspare the device, as we assume that the original spare was 3533 * done in response to the FMA fault. 3534 */ 3535 if (!vdev_is_dead(vd) && vd->vdev_parent != NULL && 3536 vd->vdev_parent->vdev_ops == &vdev_spare_ops && 3537 vd->vdev_parent->vdev_child[0] == vd) 3538 vd->vdev_unspare = B_TRUE; 3539} 3540 3541boolean_t 3542vdev_is_dead(vdev_t *vd) 3543{ 3544 /* 3545 * Holes and missing devices are always considered "dead". 3546 * This simplifies the code since we don't have to check for 3547 * these types of devices in the various code paths. 3548 * Instead we rely on the fact that we skip over dead devices 3549 * before issuing I/O to them. 3550 */ 3551 return (vd->vdev_state < VDEV_STATE_DEGRADED || 3552 vd->vdev_ops == &vdev_hole_ops || 3553 vd->vdev_ops == &vdev_missing_ops); 3554} 3555 3556boolean_t 3557vdev_readable(vdev_t *vd) 3558{ 3559 return (!vdev_is_dead(vd) && !vd->vdev_cant_read); 3560} 3561 3562boolean_t 3563vdev_writeable(vdev_t *vd) 3564{ 3565 return (!vdev_is_dead(vd) && !vd->vdev_cant_write && 3566 vdev_is_concrete(vd)); 3567} 3568 3569boolean_t 3570vdev_allocatable(vdev_t *vd) 3571{ 3572 uint64_t state = vd->vdev_state; 3573 3574 /* 3575 * We currently allow allocations from vdevs which may be in the 3576 * process of reopening (i.e. VDEV_STATE_CLOSED). If the device 3577 * fails to reopen then we'll catch it later when we're holding 3578 * the proper locks. 
Note that we have to get the vdev state 3579 * in a local variable because although it changes atomically, 3580 * we're asking two separate questions about it. 3581 */ 3582 return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && 3583 !vd->vdev_cant_write && vdev_is_concrete(vd) && 3584 vd->vdev_mg->mg_initialized); 3585 } 3586 3587 boolean_t 3588 vdev_accessible(vdev_t *vd, zio_t *zio) 3589 { 3590 ASSERT(zio->io_vd == vd); 3591 3592 if (vdev_is_dead(vd) || vd->vdev_remove_wanted) 3593 return (B_FALSE); 3594 3595 if (zio->io_type == ZIO_TYPE_READ) 3596 return (!vd->vdev_cant_read); 3597 3598 if (zio->io_type == ZIO_TYPE_WRITE) 3599 return (!vd->vdev_cant_write); 3600 3601 return (B_TRUE); 3602 } 3603 3604 boolean_t 3605 vdev_is_spacemap_addressable(vdev_t *vd) 3606 { 3607 /* 3608 * Assuming 47 bits of the space map entry dedicated for the entry's 3609 * offset (see description in space_map.h), we calculate the maximum 3610 * address that can be described by a space map entry for the given 3611 * device. 3612 */ 3613 uint64_t shift = vd->vdev_ashift + 47; 3614 3615 if (shift >= 63) /* detect potential overflow */ 3616 return (B_TRUE); 3617 3618 return (vd->vdev_asize < (1ULL << shift)); 3619 } 3620 3621 /* 3622 * Get statistics for the given vdev. 3623 */ 3624 void 3625 vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) 3626 { 3627 spa_t *spa = vd->vdev_spa; 3628 vdev_t *rvd = spa->spa_root_vdev; 3629 vdev_t *tvd = vd->vdev_top; 3630 3631 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 3632 3633 mutex_enter(&vd->vdev_stat_lock); 3634 bcopy(&vd->vdev_stat, vs, sizeof (*vs)); 3635 vs->vs_timestamp = gethrtime() - vs->vs_timestamp; 3636 vs->vs_state = vd->vdev_state; 3637 vs->vs_rsize = vdev_get_min_asize(vd); 3638 if (vd->vdev_ops->vdev_op_leaf) { 3639 vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; 3640 /* 3641 * Report initializing progress. Since we don't have the 3642 * initializing locks held, this is only an estimate (although a 3643 * fairly accurate one). 3644 */ 3645 vs->vs_initialize_bytes_done = vd->vdev_initialize_bytes_done; 3646 vs->vs_initialize_bytes_est = vd->vdev_initialize_bytes_est; 3647 vs->vs_initialize_state = vd->vdev_initialize_state; 3648 vs->vs_initialize_action_time = vd->vdev_initialize_action_time; 3649 } 3650 /* 3651 * Report expandable space on top-level, non-auxiliary devices only. 3652 * The expandable space is reported in terms of metaslab sized units 3653 * since that determines how much space the pool can expand. 3654 */ 3655 if (vd->vdev_aux == NULL && tvd != NULL && vd->vdev_max_asize != 0) { 3656 vs->vs_esize = P2ALIGN(vd->vdev_max_asize - vd->vdev_asize - 3657 spa->spa_bootsize, 1ULL << tvd->vdev_ms_shift); 3658 } 3659 vs->vs_configured_ashift = vd->vdev_top != NULL 3660 ? vd->vdev_top->vdev_ashift : vd->vdev_ashift; 3661 vs->vs_logical_ashift = vd->vdev_logical_ashift; 3662 vs->vs_physical_ashift = vd->vdev_physical_ashift; 3663 if (vd->vdev_aux == NULL && vd == vd->vdev_top && 3664 vdev_is_concrete(vd)) { 3665 vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation; 3666 } 3667 3668 /* 3669 * If we're getting stats on the root vdev, aggregate the I/O counts 3670 * over all top-level vdevs (i.e. the direct children of the root).
3671 */ 3672 if (vd == rvd) { 3673 for (int c = 0; c < rvd->vdev_children; c++) { 3674 vdev_t *cvd = rvd->vdev_child[c]; 3675 vdev_stat_t *cvs = &cvd->vdev_stat; 3676 3677 for (int t = 0; t < ZIO_TYPES; t++) { 3678 vs->vs_ops[t] += cvs->vs_ops[t]; 3679 vs->vs_bytes[t] += cvs->vs_bytes[t]; 3680 } 3681 cvs->vs_scan_removing = cvd->vdev_removing; 3682 } 3683 } 3684 mutex_exit(&vd->vdev_stat_lock); 3685} 3686 3687void 3688vdev_clear_stats(vdev_t *vd) 3689{ 3690 mutex_enter(&vd->vdev_stat_lock); 3691 vd->vdev_stat.vs_space = 0; 3692 vd->vdev_stat.vs_dspace = 0; 3693 vd->vdev_stat.vs_alloc = 0; 3694 mutex_exit(&vd->vdev_stat_lock); 3695} 3696 3697void 3698vdev_scan_stat_init(vdev_t *vd) 3699{ 3700 vdev_stat_t *vs = &vd->vdev_stat; 3701 3702 for (int c = 0; c < vd->vdev_children; c++) 3703 vdev_scan_stat_init(vd->vdev_child[c]); 3704 3705 mutex_enter(&vd->vdev_stat_lock); 3706 vs->vs_scan_processed = 0; 3707 mutex_exit(&vd->vdev_stat_lock); 3708} 3709 3710void 3711vdev_stat_update(zio_t *zio, uint64_t psize) 3712{ 3713 spa_t *spa = zio->io_spa; 3714 vdev_t *rvd = spa->spa_root_vdev; 3715 vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; 3716 vdev_t *pvd; 3717 uint64_t txg = zio->io_txg; 3718 vdev_stat_t *vs = &vd->vdev_stat; 3719 zio_type_t type = zio->io_type; 3720 int flags = zio->io_flags; 3721 3722 /* 3723 * If this i/o is a gang leader, it didn't do any actual work. 3724 */ 3725 if (zio->io_gang_tree) 3726 return; 3727 3728 if (zio->io_error == 0) { 3729 /* 3730 * If this is a root i/o, don't count it -- we've already 3731 * counted the top-level vdevs, and vdev_get_stats() will 3732 * aggregate them when asked. This reduces contention on 3733 * the root vdev_stat_lock and implicitly handles blocks 3734 * that compress away to holes, for which there is no i/o. 3735 * (Holes never create vdev children, so all the counters 3736 * remain zero, which is what we want.) 3737 * 3738 * Note: this only applies to successful i/o (io_error == 0) 3739 * because unlike i/o counts, errors are not additive. 3740 * When reading a ditto block, for example, failure of 3741 * one top-level vdev does not imply a root-level error. 3742 */ 3743 if (vd == rvd) 3744 return; 3745 3746 ASSERT(vd == zio->io_vd); 3747 3748 if (flags & ZIO_FLAG_IO_BYPASS) 3749 return; 3750 3751 mutex_enter(&vd->vdev_stat_lock); 3752 3753 if (flags & ZIO_FLAG_IO_REPAIR) { 3754 if (flags & ZIO_FLAG_SCAN_THREAD) { 3755 dsl_scan_phys_t *scn_phys = 3756 &spa->spa_dsl_pool->dp_scan->scn_phys; 3757 uint64_t *processed = &scn_phys->scn_processed; 3758 3759 /* XXX cleanup? */ 3760 if (vd->vdev_ops->vdev_op_leaf) 3761 atomic_add_64(processed, psize); 3762 vs->vs_scan_processed += psize; 3763 } 3764 3765 if (flags & ZIO_FLAG_SELF_HEAL) 3766 vs->vs_self_healed += psize; 3767 } 3768 3769 vs->vs_ops[type]++; 3770 vs->vs_bytes[type] += psize; 3771 3772 mutex_exit(&vd->vdev_stat_lock); 3773 return; 3774 } 3775 3776 if (flags & ZIO_FLAG_SPECULATIVE) 3777 return; 3778 3779 /* 3780 * If this is an I/O error that is going to be retried, then ignore the 3781 * error. Otherwise, the user may interpret B_FAILFAST I/O errors as 3782 * hard errors, when in reality they can happen for any number of 3783 * innocuous reasons (bus resets, MPxIO link failure, etc). 3784 */ 3785 if (zio->io_error == EIO && 3786 !(zio->io_flags & ZIO_FLAG_IO_RETRY)) 3787 return; 3788 3789 /* 3790 * Intent logs writes won't propagate their error to the root 3791 * I/O so don't mark these types of failures as pool-level 3792 * errors. 
3793 */ 3794 if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 3795 return; 3796 3797 mutex_enter(&vd->vdev_stat_lock); 3798 if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) { 3799 if (zio->io_error == ECKSUM) 3800 vs->vs_checksum_errors++; 3801 else 3802 vs->vs_read_errors++; 3803 } 3804 if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd)) 3805 vs->vs_write_errors++; 3806 mutex_exit(&vd->vdev_stat_lock); 3807 3808 if (spa->spa_load_state == SPA_LOAD_NONE && 3809 type == ZIO_TYPE_WRITE && txg != 0 && 3810 (!(flags & ZIO_FLAG_IO_REPAIR) || 3811 (flags & ZIO_FLAG_SCAN_THREAD) || 3812 spa->spa_claiming)) { 3813 /* 3814 * This is either a normal write (not a repair), or it's 3815 * a repair induced by the scrub thread, or it's a repair 3816 * made by zil_claim() during spa_load() in the first txg. 3817 * In the normal case, we commit the DTL change in the same 3818 * txg as the block was born. In the scrub-induced repair 3819 * case, we know that scrubs run in first-pass syncing context, 3820 * so we commit the DTL change in spa_syncing_txg(spa). 3821 * In the zil_claim() case, we commit in spa_first_txg(spa). 3822 * 3823 * We currently do not make DTL entries for failed spontaneous 3824 * self-healing writes triggered by normal (non-scrubbing) 3825 * reads, because we have no transactional context in which to 3826 * do so -- and it's not clear that it'd be desirable anyway. 3827 */ 3828 if (vd->vdev_ops->vdev_op_leaf) { 3829 uint64_t commit_txg = txg; 3830 if (flags & ZIO_FLAG_SCAN_THREAD) { 3831 ASSERT(flags & ZIO_FLAG_IO_REPAIR); 3832 ASSERT(spa_sync_pass(spa) == 1); 3833 vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); 3834 commit_txg = spa_syncing_txg(spa); 3835 } else if (spa->spa_claiming) { 3836 ASSERT(flags & ZIO_FLAG_IO_REPAIR); 3837 commit_txg = spa_first_txg(spa); 3838 } 3839 ASSERT(commit_txg >= spa_syncing_txg(spa)); 3840 if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) 3841 return; 3842 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) 3843 vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1); 3844 vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg); 3845 } 3846 if (vd != rvd) 3847 vdev_dtl_dirty(vd, DTL_MISSING, txg, 1); 3848 } 3849} 3850 3851/* 3852 * Update the in-core space usage stats for this vdev, its metaslab class, 3853 * and the root vdev. 3854 */ 3855void 3856vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, 3857 int64_t space_delta) 3858{ 3859 int64_t dspace_delta = space_delta; 3860 spa_t *spa = vd->vdev_spa; 3861 vdev_t *rvd = spa->spa_root_vdev; 3862 metaslab_group_t *mg = vd->vdev_mg; 3863 metaslab_class_t *mc = mg ? mg->mg_class : NULL; 3864 3865 ASSERT(vd == vd->vdev_top); 3866 3867 /* 3868 * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion 3869 * factor. We must calculate this here and not at the root vdev 3870 * because the root vdev's psize-to-asize is simply the max of its 3871 * childrens', thus not accurate enough for us. 
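 *
 * Illustrative, hypothetical numbers: on a raidz top-level vdev whose
 * psize-to-asize expansion works out to roughly 1.5x, the ratio set up
 * by vdev_set_deflate_ratio() makes
 *
 *	dspace_delta = (1536K >> SPA_MINBLOCKSHIFT) * ratio ~= 1024K
 *
 * i.e. a 1.5MB allocated-size delta is charged as roughly 1MB of
 * deflated ("dspace") usage.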
3872 */ 3873 ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0); 3874 ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); 3875 dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) * 3876 vd->vdev_deflate_ratio; 3877 3878 mutex_enter(&vd->vdev_stat_lock); 3879 vd->vdev_stat.vs_alloc += alloc_delta; 3880 vd->vdev_stat.vs_space += space_delta; 3881 vd->vdev_stat.vs_dspace += dspace_delta; 3882 mutex_exit(&vd->vdev_stat_lock); 3883 3884 if (mc == spa_normal_class(spa)) { 3885 mutex_enter(&rvd->vdev_stat_lock); 3886 rvd->vdev_stat.vs_alloc += alloc_delta; 3887 rvd->vdev_stat.vs_space += space_delta; 3888 rvd->vdev_stat.vs_dspace += dspace_delta; 3889 mutex_exit(&rvd->vdev_stat_lock); 3890 } 3891 3892 if (mc != NULL) { 3893 ASSERT(rvd == vd->vdev_parent); 3894 ASSERT(vd->vdev_ms_count != 0); 3895 3896 metaslab_class_space_update(mc, 3897 alloc_delta, defer_delta, space_delta, dspace_delta); 3898 } 3899} 3900 3901/* 3902 * Mark a top-level vdev's config as dirty, placing it on the dirty list 3903 * so that it will be written out next time the vdev configuration is synced. 3904 * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. 3905 */ 3906void 3907vdev_config_dirty(vdev_t *vd) 3908{ 3909 spa_t *spa = vd->vdev_spa; 3910 vdev_t *rvd = spa->spa_root_vdev; 3911 int c; 3912 3913 ASSERT(spa_writeable(spa)); 3914 3915 /* 3916 * If this is an aux vdev (as with l2cache and spare devices), then we 3917 * update the vdev config manually and set the sync flag. 3918 */ 3919 if (vd->vdev_aux != NULL) { 3920 spa_aux_vdev_t *sav = vd->vdev_aux; 3921 nvlist_t **aux; 3922 uint_t naux; 3923 3924 for (c = 0; c < sav->sav_count; c++) { 3925 if (sav->sav_vdevs[c] == vd) 3926 break; 3927 } 3928 3929 if (c == sav->sav_count) { 3930 /* 3931 * We're being removed. There's nothing more to do. 3932 */ 3933 ASSERT(sav->sav_sync == B_TRUE); 3934 return; 3935 } 3936 3937 sav->sav_sync = B_TRUE; 3938 3939 if (nvlist_lookup_nvlist_array(sav->sav_config, 3940 ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { 3941 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, 3942 ZPOOL_CONFIG_SPARES, &aux, &naux) == 0); 3943 } 3944 3945 ASSERT(c < naux); 3946 3947 /* 3948 * Setting the nvlist in the middle if the array is a little 3949 * sketchy, but it will work. 3950 */ 3951 nvlist_free(aux[c]); 3952 aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0); 3953 3954 return; 3955 } 3956 3957 /* 3958 * The dirty list is protected by the SCL_CONFIG lock. The caller 3959 * must either hold SCL_CONFIG as writer, or must be the sync thread 3960 * (which holds SCL_CONFIG as reader). There's only one sync thread, 3961 * so this is sufficient to ensure mutual exclusion. 
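 *
 * A sketch of a non-sync-thread caller (tvd being the top-level vdev
 * in question; real callers often acquire this as part of a wider
 * SCL_ALL hold):
 *
 *	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_WRITER);
 *	vdev_config_dirty(tvd);
 *	spa_config_exit(spa, SCL_CONFIG, FTAG);
 *
 * The sync thread, by contrast, may call in with only the reader hold
 * it already owns.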
3962 */ 3963 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || 3964 (dsl_pool_sync_context(spa_get_dsl(spa)) && 3965 spa_config_held(spa, SCL_CONFIG, RW_READER))); 3966 3967 if (vd == rvd) { 3968 for (c = 0; c < rvd->vdev_children; c++) 3969 vdev_config_dirty(rvd->vdev_child[c]); 3970 } else { 3971 ASSERT(vd == vd->vdev_top); 3972 3973 if (!list_link_active(&vd->vdev_config_dirty_node) && 3974 vdev_is_concrete(vd)) { 3975 list_insert_head(&spa->spa_config_dirty_list, vd); 3976 } 3977 } 3978} 3979 3980void 3981vdev_config_clean(vdev_t *vd) 3982{ 3983 spa_t *spa = vd->vdev_spa; 3984 3985 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || 3986 (dsl_pool_sync_context(spa_get_dsl(spa)) && 3987 spa_config_held(spa, SCL_CONFIG, RW_READER))); 3988 3989 ASSERT(list_link_active(&vd->vdev_config_dirty_node)); 3990 list_remove(&spa->spa_config_dirty_list, vd); 3991} 3992 3993/* 3994 * Mark a top-level vdev's state as dirty, so that the next pass of 3995 * spa_sync() can convert this into vdev_config_dirty(). We distinguish 3996 * the state changes from larger config changes because they require 3997 * much less locking, and are often needed for administrative actions. 3998 */ 3999void 4000vdev_state_dirty(vdev_t *vd) 4001{ 4002 spa_t *spa = vd->vdev_spa; 4003 4004 ASSERT(spa_writeable(spa)); 4005 ASSERT(vd == vd->vdev_top); 4006 4007 /* 4008 * The state list is protected by the SCL_STATE lock. The caller 4009 * must either hold SCL_STATE as writer, or must be the sync thread 4010 * (which holds SCL_STATE as reader). There's only one sync thread, 4011 * so this is sufficient to ensure mutual exclusion. 4012 */ 4013 ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || 4014 (dsl_pool_sync_context(spa_get_dsl(spa)) && 4015 spa_config_held(spa, SCL_STATE, RW_READER))); 4016 4017 if (!list_link_active(&vd->vdev_state_dirty_node) && 4018 vdev_is_concrete(vd)) 4019 list_insert_head(&spa->spa_state_dirty_list, vd); 4020} 4021 4022void 4023vdev_state_clean(vdev_t *vd) 4024{ 4025 spa_t *spa = vd->vdev_spa; 4026 4027 ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || 4028 (dsl_pool_sync_context(spa_get_dsl(spa)) && 4029 spa_config_held(spa, SCL_STATE, RW_READER))); 4030 4031 ASSERT(list_link_active(&vd->vdev_state_dirty_node)); 4032 list_remove(&spa->spa_state_dirty_list, vd); 4033} 4034 4035/* 4036 * Propagate vdev state up from children to parent. 4037 */ 4038void 4039vdev_propagate_state(vdev_t *vd) 4040{ 4041 spa_t *spa = vd->vdev_spa; 4042 vdev_t *rvd = spa->spa_root_vdev; 4043 int degraded = 0, faulted = 0; 4044 int corrupted = 0; 4045 vdev_t *child; 4046 4047 if (vd->vdev_children > 0) { 4048 for (int c = 0; c < vd->vdev_children; c++) { 4049 child = vd->vdev_child[c]; 4050 4051 /* 4052 * Don't factor holes or indirect vdevs into the 4053 * decision. 4054 */ 4055 if (!vdev_is_concrete(child)) 4056 continue; 4057 4058 if (!vdev_readable(child) || 4059 (!vdev_writeable(child) && spa_writeable(spa))) { 4060 /* 4061 * Root special: if there is a top-level log 4062 * device, treat the root vdev as if it were 4063 * degraded. 
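 *
 * Hypothetical layout as an example: a pool built from one healthy
 * data mirror plus a single, now unreadable log device still has all
 * of its data vdevs, so the failed log child is counted as degraded
 * rather than faulted and the root is reported as DEGRADED instead of
 * being faulted for insufficient replicas.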
4064 */ 4065 if (child->vdev_islog && vd == rvd) 4066 degraded++; 4067 else 4068 faulted++; 4069 } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { 4070 degraded++; 4071 } 4072 4073 if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) 4074 corrupted++; 4075 } 4076 4077 vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); 4078 4079 /* 4080 * Root special: if there is a top-level vdev that cannot be 4081 * opened due to corrupted metadata, then propagate the root 4082 * vdev's aux state as 'corrupt' rather than 'insufficient 4083 * replicas'. 4084 */ 4085 if (corrupted && vd == rvd && 4086 rvd->vdev_state == VDEV_STATE_CANT_OPEN) 4087 vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, 4088 VDEV_AUX_CORRUPT_DATA); 4089 } 4090 4091 if (vd->vdev_parent) 4092 vdev_propagate_state(vd->vdev_parent); 4093} 4094 4095/* 4096 * Set a vdev's state. If this is during an open, we don't update the parent 4097 * state, because we're in the process of opening children depth-first. 4098 * Otherwise, we propagate the change to the parent. 4099 * 4100 * If this routine places a device in a faulted state, an appropriate ereport is 4101 * generated. 4102 */ 4103void 4104vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) 4105{ 4106 uint64_t save_state; 4107 spa_t *spa = vd->vdev_spa; 4108 4109 if (state == vd->vdev_state) { 4110 vd->vdev_stat.vs_aux = aux; 4111 return; 4112 } 4113 4114 save_state = vd->vdev_state; 4115 4116 vd->vdev_state = state; 4117 vd->vdev_stat.vs_aux = aux; 4118 4119 /* 4120 * If we are setting the vdev state to anything but an open state, then 4121 * always close the underlying device unless the device has requested 4122 * a delayed close (i.e. we're about to remove or fault the device). 4123 * Otherwise, we keep accessible but invalid devices open forever. 4124 * We don't call vdev_close() itself, because that implies some extra 4125 * checks (offline, etc) that we don't want here. This is limited to 4126 * leaf devices, because otherwise closing the device will affect other 4127 * children. 4128 */ 4129 if (!vd->vdev_delayed_close && vdev_is_dead(vd) && 4130 vd->vdev_ops->vdev_op_leaf) 4131 vd->vdev_ops->vdev_op_close(vd); 4132 4133 if (vd->vdev_removed && 4134 state == VDEV_STATE_CANT_OPEN && 4135 (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { 4136 /* 4137 * If the previous state is set to VDEV_STATE_REMOVED, then this 4138 * device was previously marked removed and someone attempted to 4139 * reopen it. If this failed due to a nonexistent device, then 4140 * keep the device in the REMOVED state. We also let this be if 4141 * it is one of our special test online cases, which is only 4142 * attempting to online the device and shouldn't generate an FMA 4143 * fault. 4144 */ 4145 vd->vdev_state = VDEV_STATE_REMOVED; 4146 vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 4147 } else if (state == VDEV_STATE_REMOVED) { 4148 vd->vdev_removed = B_TRUE; 4149 } else if (state == VDEV_STATE_CANT_OPEN) { 4150 /* 4151 * If we fail to open a vdev during an import or recovery, we 4152 * mark it as "not available", which signifies that it was 4153 * never there to begin with. Failure to open such a device 4154 * is not considered an error. 4155 */ 4156 if ((spa_load_state(spa) == SPA_LOAD_IMPORT || 4157 spa_load_state(spa) == SPA_LOAD_RECOVER) && 4158 vd->vdev_ops->vdev_op_leaf) 4159 vd->vdev_not_present = 1; 4160 4161 /* 4162 * Post the appropriate ereport. 
If the 'prevstate' field is 4163 * set to something other than VDEV_STATE_UNKNOWN, it indicates 4164 * that this is part of a vdev_reopen(). In this case, we don't 4165 * want to post the ereport if the device was already in the 4166 * CANT_OPEN state beforehand. 4167 * 4168 * If the 'checkremove' flag is set, then this is an attempt to 4169 * online the device in response to an insertion event. If we 4170 * hit this case, then we have detected an insertion event for a 4171 * faulted or offline device that wasn't in the removed state. 4172 * In this scenario, we don't post an ereport because we are 4173 * about to replace the device, or attempt an online with 4174 * vdev_forcefault, which will generate the fault for us. 4175 */ 4176 if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && 4177 !vd->vdev_not_present && !vd->vdev_checkremove && 4178 vd != spa->spa_root_vdev) { 4179 const char *class; 4180 4181 switch (aux) { 4182 case VDEV_AUX_OPEN_FAILED: 4183 class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; 4184 break; 4185 case VDEV_AUX_CORRUPT_DATA: 4186 class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; 4187 break; 4188 case VDEV_AUX_NO_REPLICAS: 4189 class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; 4190 break; 4191 case VDEV_AUX_BAD_GUID_SUM: 4192 class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; 4193 break; 4194 case VDEV_AUX_TOO_SMALL: 4195 class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; 4196 break; 4197 case VDEV_AUX_BAD_LABEL: 4198 class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; 4199 break; 4200 default: 4201 class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; 4202 } 4203 4204 zfs_ereport_post(class, spa, vd, NULL, save_state, 0); 4205 } 4206 4207 /* Erase any notion of persistent removed state */ 4208 vd->vdev_removed = B_FALSE; 4209 } else { 4210 vd->vdev_removed = B_FALSE; 4211 } 4212 4213 /* 4214 * Notify the fmd of the state change. Be verbose and post 4215 * notifications even for stuff that's not important; the fmd agent can 4216 * sort it out. Don't emit state change events for non-leaf vdevs since 4217 * they can't change state on their own. The FMD can check their state 4218 * if it wants to when it sees that a leaf vdev had a state change. 4219 */ 4220 if (vd->vdev_ops->vdev_op_leaf) 4221 zfs_post_state_change(spa, vd); 4222 4223 if (!isopen && vd->vdev_parent) 4224 vdev_propagate_state(vd->vdev_parent); 4225} 4226 4227boolean_t 4228vdev_children_are_offline(vdev_t *vd) 4229{ 4230 ASSERT(!vd->vdev_ops->vdev_op_leaf); 4231 4232 for (uint64_t i = 0; i < vd->vdev_children; i++) { 4233 if (vd->vdev_child[i]->vdev_state != VDEV_STATE_OFFLINE) 4234 return (B_FALSE); 4235 } 4236 4237 return (B_TRUE); 4238} 4239 4240/* 4241 * Check the vdev configuration to ensure that it's capable of supporting 4242 * a root pool. We do not support partial configuration. 4243 * In addition, only a single top-level vdev is allowed. 4244 * 4245 * FreeBSD does not have above limitations. 
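 *
 * Illustration of the illumos rules encoded below (hypothetical
 * configurations): a root pool whose only top-level vdev is a single
 * disk or a mirror passes this check, while a pool with two top-level
 * vdevs, or one containing a missing or indirect (removed) vdev, is
 * rejected.  On FreeBSD the function simply returns B_TRUE.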
4246 */ 4247boolean_t 4248vdev_is_bootable(vdev_t *vd) 4249{ 4250#ifdef illumos 4251 if (!vd->vdev_ops->vdev_op_leaf) { 4252 char *vdev_type = vd->vdev_ops->vdev_op_type; 4253 4254 if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 && 4255 vd->vdev_children > 1) { 4256 return (B_FALSE); 4257 } else if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0 || 4258 strcmp(vdev_type, VDEV_TYPE_INDIRECT) == 0) { 4259 return (B_FALSE); 4260 } 4261 } 4262 4263 for (int c = 0; c < vd->vdev_children; c++) { 4264 if (!vdev_is_bootable(vd->vdev_child[c])) 4265 return (B_FALSE); 4266 } 4267#endif /* illumos */ 4268 return (B_TRUE); 4269} 4270 4271boolean_t 4272vdev_is_concrete(vdev_t *vd) 4273{ 4274 vdev_ops_t *ops = vd->vdev_ops; 4275 if (ops == &vdev_indirect_ops || ops == &vdev_hole_ops || 4276 ops == &vdev_missing_ops || ops == &vdev_root_ops) { 4277 return (B_FALSE); 4278 } else { 4279 return (B_TRUE); 4280 } 4281} 4282 4283/* 4284 * Determine if a log device has valid content. If the vdev was 4285 * removed or faulted in the MOS config then we know that 4286 * the content on the log device has already been written to the pool. 4287 */ 4288boolean_t 4289vdev_log_state_valid(vdev_t *vd) 4290{ 4291 if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && 4292 !vd->vdev_removed) 4293 return (B_TRUE); 4294 4295 for (int c = 0; c < vd->vdev_children; c++) 4296 if (vdev_log_state_valid(vd->vdev_child[c])) 4297 return (B_TRUE); 4298 4299 return (B_FALSE); 4300} 4301 4302/* 4303 * Expand a vdev if possible. 4304 */ 4305void 4306vdev_expand(vdev_t *vd, uint64_t txg) 4307{ 4308 ASSERT(vd->vdev_top == vd); 4309 ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 4310 ASSERT(vdev_is_concrete(vd)); 4311 4312 vdev_set_deflate_ratio(vd); 4313 4314 if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) { 4315 VERIFY(vdev_metaslab_init(vd, txg) == 0); 4316 vdev_config_dirty(vd); 4317 } 4318} 4319 4320/* 4321 * Split a vdev. 4322 */ 4323void 4324vdev_split(vdev_t *vd) 4325{ 4326 vdev_t *cvd, *pvd = vd->vdev_parent; 4327 4328 vdev_remove_child(pvd, vd); 4329 vdev_compact_children(pvd); 4330 4331 cvd = pvd->vdev_child[0]; 4332 if (pvd->vdev_children == 1) { 4333 vdev_remove_parent(cvd); 4334 cvd->vdev_splitting = B_TRUE; 4335 } 4336 vdev_propagate_state(cvd); 4337} 4338 4339void 4340vdev_deadman(vdev_t *vd) 4341{ 4342 for (int c = 0; c < vd->vdev_children; c++) { 4343 vdev_t *cvd = vd->vdev_child[c]; 4344 4345 vdev_deadman(cvd); 4346 } 4347 4348 if (vd->vdev_ops->vdev_op_leaf) { 4349 vdev_queue_t *vq = &vd->vdev_queue; 4350 4351 mutex_enter(&vq->vq_lock); 4352 if (avl_numnodes(&vq->vq_active_tree) > 0) { 4353 spa_t *spa = vd->vdev_spa; 4354 zio_t *fio; 4355 uint64_t delta; 4356 4357 /* 4358 * Look at the head of all the pending queues, 4359 * if any I/O has been outstanding for longer than 4360 * the spa_deadman_synctime we panic the system. 4361 */ 4362 fio = avl_first(&vq->vq_active_tree); 4363 delta = gethrtime() - fio->io_timestamp; 4364 if (delta > spa_deadman_synctime(spa)) { 4365 vdev_dbgmsg(vd, "SLOW IO: zio timestamp " 4366 "%lluns, delta %lluns, last io %lluns", 4367 fio->io_timestamp, (u_longlong_t)delta, 4368 vq->vq_io_complete_ts); 4369 fm_panic("I/O to pool '%s' appears to be " 4370 "hung on vdev guid %llu at '%s'.", 4371 spa_name(spa), 4372 (long long unsigned int) vd->vdev_guid, 4373 vd->vdev_path); 4374 } 4375 } 4376 mutex_exit(&vq->vq_lock); 4377 } 4378} 4379