vdev.c revision 297112
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
 * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/vdev_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/space_map.h>
#include <sys/space_reftree.h>
#include <sys/zio.h>
#include <sys/zap.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/zil.h>
#include <sys/dsl_scan.h>
#include <sys/trim_map.h>

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV");

/*
 * Virtual device management.
 */

/*
 * The limit for ZFS to automatically increase a top-level vdev's ashift
 * from logical ashift to physical ashift.
 *
 * Example: one or more 512B emulation child vdevs
 *		child->vdev_ashift = 9 (512 bytes)
 *		child->vdev_physical_ashift = 12 (4096 bytes)
 *		zfs_max_auto_ashift = 11 (2048 bytes)
 *		zfs_min_auto_ashift = 9 (512 bytes)
 *
 * On pool creation or the addition of a new top-level vdev, ZFS will
 * increase the ashift of the top-level vdev to 2048 as limited by
 * zfs_max_auto_ashift.
 *
 * Example: one or more 512B emulation child vdevs
 *		child->vdev_ashift = 9 (512 bytes)
 *		child->vdev_physical_ashift = 12 (4096 bytes)
 *		zfs_max_auto_ashift = 13 (8192 bytes)
 *		zfs_min_auto_ashift = 9 (512 bytes)
 *
 * On pool creation or the addition of a new top-level vdev, ZFS will
 * increase the ashift of the top-level vdev to 4096 to match the
 * max vdev_physical_ashift.
 *
 * Example: one or more 512B emulation child vdevs
 *		child->vdev_ashift = 9 (512 bytes)
 *		child->vdev_physical_ashift = 9 (512 bytes)
 *		zfs_max_auto_ashift = 13 (8192 bytes)
 *		zfs_min_auto_ashift = 12 (4096 bytes)
 *
 * On pool creation or the addition of a new top-level vdev, ZFS will
 * increase the ashift of the top-level vdev to 4096 to match the
 * zfs_min_auto_ashift.
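 */

/*
 * [Editorial sketch, not part of revision 297112.]  The limits described
 * above are applied by vdev_ashift_optimize() later in this file.  A
 * stand-alone version of that clamp, handy for checking the examples by
 * hand (first example: logical 9, physical 12, min 9, max 11 yields 11,
 * i.e. 2048-byte alignment):
 */
static uint64_t
example_auto_ashift(uint64_t min_auto, uint64_t max_auto,
    uint64_t logical, uint64_t physical)
{
	if (logical < physical) {
		/* Raise toward the physical ashift, capped by max_auto. */
		return (MIN(MAX(max_auto, logical), MAX(min_auto, physical)));
	}
	/* Logical already >= physical; only honor the configured minimum. */
	return (MAX(min_auto, logical));
}

/*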
90 */ 91static uint64_t zfs_max_auto_ashift = SPA_MAXASHIFT; 92static uint64_t zfs_min_auto_ashift = SPA_MINASHIFT; 93 94static int 95sysctl_vfs_zfs_max_auto_ashift(SYSCTL_HANDLER_ARGS) 96{ 97 uint64_t val; 98 int err; 99 100 val = zfs_max_auto_ashift; 101 err = sysctl_handle_64(oidp, &val, 0, req); 102 if (err != 0 || req->newptr == NULL) 103 return (err); 104 105 if (val > SPA_MAXASHIFT || val < zfs_min_auto_ashift) 106 return (EINVAL); 107 108 zfs_max_auto_ashift = val; 109 110 return (0); 111} 112SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, 113 CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), 114 sysctl_vfs_zfs_max_auto_ashift, "QU", 115 "Max ashift used when optimising for logical -> physical sectors size on " 116 "new top-level vdevs."); 117 118static int 119sysctl_vfs_zfs_min_auto_ashift(SYSCTL_HANDLER_ARGS) 120{ 121 uint64_t val; 122 int err; 123 124 val = zfs_min_auto_ashift; 125 err = sysctl_handle_64(oidp, &val, 0, req); 126 if (err != 0 || req->newptr == NULL) 127 return (err); 128 129 if (val < SPA_MINASHIFT || val > zfs_max_auto_ashift) 130 return (EINVAL); 131 132 zfs_min_auto_ashift = val; 133 134 return (0); 135} 136SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift, 137 CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), 138 sysctl_vfs_zfs_min_auto_ashift, "QU", 139 "Min ashift used when creating new top-level vdevs."); 140 141static vdev_ops_t *vdev_ops_table[] = { 142 &vdev_root_ops, 143 &vdev_raidz_ops, 144 &vdev_mirror_ops, 145 &vdev_replacing_ops, 146 &vdev_spare_ops, 147#ifdef _KERNEL 148 &vdev_geom_ops, 149#else 150 &vdev_disk_ops, 151#endif 152 &vdev_file_ops, 153 &vdev_missing_ops, 154 &vdev_hole_ops, 155 NULL 156}; 157 158 159/* 160 * When a vdev is added, it will be divided into approximately (but no 161 * more than) this number of metaslabs. 162 */ 163int metaslabs_per_vdev = 200; 164SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, metaslabs_per_vdev, CTLFLAG_RDTUN, 165 &metaslabs_per_vdev, 0, 166 "When a vdev is added, how many metaslabs the vdev should be divided into"); 167 168/* 169 * Given a vdev type, return the appropriate ops vector. 170 */ 171static vdev_ops_t * 172vdev_getops(const char *type) 173{ 174 vdev_ops_t *ops, **opspp; 175 176 for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) 177 if (strcmp(ops->vdev_op_type, type) == 0) 178 break; 179 180 return (ops); 181} 182 183/* 184 * Default asize function: return the MAX of psize with the asize of 185 * all children. This is what's used by anything other than RAID-Z. 186 */ 187uint64_t 188vdev_default_asize(vdev_t *vd, uint64_t psize) 189{ 190 uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); 191 uint64_t csize; 192 193 for (int c = 0; c < vd->vdev_children; c++) { 194 csize = vdev_psize_to_asize(vd->vdev_child[c], psize); 195 asize = MAX(asize, csize); 196 } 197 198 return (asize); 199} 200 201/* 202 * Get the minimum allocatable size. We define the allocatable size as 203 * the vdev's asize rounded to the nearest metaslab. This allows us to 204 * replace or attach devices which don't have the same physical size but 205 * can still satisfy the same number of allocations. 206 */ 207uint64_t 208vdev_get_min_asize(vdev_t *vd) 209{ 210 vdev_t *pvd = vd->vdev_parent; 211 212 /* 213 * If our parent is NULL (inactive spare or cache) or is the root, 214 * just return our own asize. 215 */ 216 if (pvd == NULL) 217 return (vd->vdev_asize); 218 219 /* 220 * The top-level vdev just returns the allocatable size rounded 221 * to the nearest metaslab. 
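 */

/*
 * [Editorial note, not part of revision 297112.]  Worked example of the two
 * roundings used here, assuming vdev_ashift = 12 (4 KB allocation unit) and
 * vdev_ms_shift = 33 (8 GB metaslabs):
 *
 *	vdev_default_asize(): P2ROUNDUP(5000, 1ULL << 12) = 8192
 *	    (allocation sizes are rounded up to the allocation unit)
 *	vdev_get_min_asize(): P2ALIGN(100 GB, 1ULL << 33) = 96 GB
 *	    (the usable size is rounded down to whole metaslabs)
 */

/*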
222 */ 223 if (vd == vd->vdev_top) 224 return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); 225 226 /* 227 * The allocatable space for a raidz vdev is N * sizeof(smallest child), 228 * so each child must provide at least 1/Nth of its asize. 229 */ 230 if (pvd->vdev_ops == &vdev_raidz_ops) 231 return (pvd->vdev_min_asize / pvd->vdev_children); 232 233 return (pvd->vdev_min_asize); 234} 235 236void 237vdev_set_min_asize(vdev_t *vd) 238{ 239 vd->vdev_min_asize = vdev_get_min_asize(vd); 240 241 for (int c = 0; c < vd->vdev_children; c++) 242 vdev_set_min_asize(vd->vdev_child[c]); 243} 244 245vdev_t * 246vdev_lookup_top(spa_t *spa, uint64_t vdev) 247{ 248 vdev_t *rvd = spa->spa_root_vdev; 249 250 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 251 252 if (vdev < rvd->vdev_children) { 253 ASSERT(rvd->vdev_child[vdev] != NULL); 254 return (rvd->vdev_child[vdev]); 255 } 256 257 return (NULL); 258} 259 260vdev_t * 261vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) 262{ 263 vdev_t *mvd; 264 265 if (vd->vdev_guid == guid) 266 return (vd); 267 268 for (int c = 0; c < vd->vdev_children; c++) 269 if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != 270 NULL) 271 return (mvd); 272 273 return (NULL); 274} 275 276static int 277vdev_count_leaves_impl(vdev_t *vd) 278{ 279 int n = 0; 280 281 if (vd->vdev_ops->vdev_op_leaf) 282 return (1); 283 284 for (int c = 0; c < vd->vdev_children; c++) 285 n += vdev_count_leaves_impl(vd->vdev_child[c]); 286 287 return (n); 288} 289 290int 291vdev_count_leaves(spa_t *spa) 292{ 293 return (vdev_count_leaves_impl(spa->spa_root_vdev)); 294} 295 296void 297vdev_add_child(vdev_t *pvd, vdev_t *cvd) 298{ 299 size_t oldsize, newsize; 300 uint64_t id = cvd->vdev_id; 301 vdev_t **newchild; 302 spa_t *spa = cvd->vdev_spa; 303 304 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 305 ASSERT(cvd->vdev_parent == NULL); 306 307 cvd->vdev_parent = pvd; 308 309 if (pvd == NULL) 310 return; 311 312 ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); 313 314 oldsize = pvd->vdev_children * sizeof (vdev_t *); 315 pvd->vdev_children = MAX(pvd->vdev_children, id + 1); 316 newsize = pvd->vdev_children * sizeof (vdev_t *); 317 318 newchild = kmem_zalloc(newsize, KM_SLEEP); 319 if (pvd->vdev_child != NULL) { 320 bcopy(pvd->vdev_child, newchild, oldsize); 321 kmem_free(pvd->vdev_child, oldsize); 322 } 323 324 pvd->vdev_child = newchild; 325 pvd->vdev_child[id] = cvd; 326 327 cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); 328 ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); 329 330 /* 331 * Walk up all ancestors to update guid sum. 332 */ 333 for (; pvd != NULL; pvd = pvd->vdev_parent) 334 pvd->vdev_guid_sum += cvd->vdev_guid_sum; 335} 336 337void 338vdev_remove_child(vdev_t *pvd, vdev_t *cvd) 339{ 340 int c; 341 uint_t id = cvd->vdev_id; 342 343 ASSERT(cvd->vdev_parent == pvd); 344 345 if (pvd == NULL) 346 return; 347 348 ASSERT(id < pvd->vdev_children); 349 ASSERT(pvd->vdev_child[id] == cvd); 350 351 pvd->vdev_child[id] = NULL; 352 cvd->vdev_parent = NULL; 353 354 for (c = 0; c < pvd->vdev_children; c++) 355 if (pvd->vdev_child[c]) 356 break; 357 358 if (c == pvd->vdev_children) { 359 kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); 360 pvd->vdev_child = NULL; 361 pvd->vdev_children = 0; 362 } 363 364 /* 365 * Walk up all ancestors to update guid sum. 366 */ 367 for (; pvd != NULL; pvd = pvd->vdev_parent) 368 pvd->vdev_guid_sum -= cvd->vdev_guid_sum; 369} 370 371/* 372 * Remove any holes in the child array. 
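 */

/*
 * [Editorial sketch, not part of revision 297112.]  vdev_add_child() and
 * vdev_remove_child() above walk every ancestor so that each vdev's
 * vdev_guid_sum stays equal to its own guid plus the guid sums of all of
 * its children (vdev_free() asserts the childless case further down).  A
 * hypothetical recursive checker for that invariant:
 */
static boolean_t
example_guid_sum_ok(vdev_t *vd)
{
	uint64_t sum = vd->vdev_guid;

	for (int c = 0; c < vd->vdev_children; c++) {
		if (!example_guid_sum_ok(vd->vdev_child[c]))
			return (B_FALSE);
		sum += vd->vdev_child[c]->vdev_guid_sum;
	}

	return (sum == vd->vdev_guid_sum);
}

/*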
373 */ 374void 375vdev_compact_children(vdev_t *pvd) 376{ 377 vdev_t **newchild, *cvd; 378 int oldc = pvd->vdev_children; 379 int newc; 380 381 ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 382 383 for (int c = newc = 0; c < oldc; c++) 384 if (pvd->vdev_child[c]) 385 newc++; 386 387 newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP); 388 389 for (int c = newc = 0; c < oldc; c++) { 390 if ((cvd = pvd->vdev_child[c]) != NULL) { 391 newchild[newc] = cvd; 392 cvd->vdev_id = newc++; 393 } 394 } 395 396 kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); 397 pvd->vdev_child = newchild; 398 pvd->vdev_children = newc; 399} 400 401/* 402 * Allocate and minimally initialize a vdev_t. 403 */ 404vdev_t * 405vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) 406{ 407 vdev_t *vd; 408 409 vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); 410 411 if (spa->spa_root_vdev == NULL) { 412 ASSERT(ops == &vdev_root_ops); 413 spa->spa_root_vdev = vd; 414 spa->spa_load_guid = spa_generate_guid(NULL); 415 } 416 417 if (guid == 0 && ops != &vdev_hole_ops) { 418 if (spa->spa_root_vdev == vd) { 419 /* 420 * The root vdev's guid will also be the pool guid, 421 * which must be unique among all pools. 422 */ 423 guid = spa_generate_guid(NULL); 424 } else { 425 /* 426 * Any other vdev's guid must be unique within the pool. 427 */ 428 guid = spa_generate_guid(spa); 429 } 430 ASSERT(!spa_guid_exists(spa_guid(spa), guid)); 431 } 432 433 vd->vdev_spa = spa; 434 vd->vdev_id = id; 435 vd->vdev_guid = guid; 436 vd->vdev_guid_sum = guid; 437 vd->vdev_ops = ops; 438 vd->vdev_state = VDEV_STATE_CLOSED; 439 vd->vdev_ishole = (ops == &vdev_hole_ops); 440 441 mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); 442 mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); 443 mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); 444 for (int t = 0; t < DTL_TYPES; t++) { 445 vd->vdev_dtl[t] = range_tree_create(NULL, NULL, 446 &vd->vdev_dtl_lock); 447 } 448 txg_list_create(&vd->vdev_ms_list, 449 offsetof(struct metaslab, ms_txg_node)); 450 txg_list_create(&vd->vdev_dtl_list, 451 offsetof(struct vdev, vdev_dtl_node)); 452 vd->vdev_stat.vs_timestamp = gethrtime(); 453 vdev_queue_init(vd); 454 vdev_cache_init(vd); 455 456 return (vd); 457} 458 459/* 460 * Allocate a new vdev. The 'alloctype' is used to control whether we are 461 * creating a new vdev or loading an existing one - the behavior is slightly 462 * different for each case. 463 */ 464int 465vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, 466 int alloctype) 467{ 468 vdev_ops_t *ops; 469 char *type; 470 uint64_t guid = 0, islog, nparity; 471 vdev_t *vd; 472 473 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 474 475 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) 476 return (SET_ERROR(EINVAL)); 477 478 if ((ops = vdev_getops(type)) == NULL) 479 return (SET_ERROR(EINVAL)); 480 481 /* 482 * If this is a load, get the vdev guid from the nvlist. 483 * Otherwise, vdev_alloc_common() will generate one for us. 
484 */ 485 if (alloctype == VDEV_ALLOC_LOAD) { 486 uint64_t label_id; 487 488 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || 489 label_id != id) 490 return (SET_ERROR(EINVAL)); 491 492 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 493 return (SET_ERROR(EINVAL)); 494 } else if (alloctype == VDEV_ALLOC_SPARE) { 495 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 496 return (SET_ERROR(EINVAL)); 497 } else if (alloctype == VDEV_ALLOC_L2CACHE) { 498 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 499 return (SET_ERROR(EINVAL)); 500 } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { 501 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 502 return (SET_ERROR(EINVAL)); 503 } 504 505 /* 506 * The first allocated vdev must be of type 'root'. 507 */ 508 if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) 509 return (SET_ERROR(EINVAL)); 510 511 /* 512 * Determine whether we're a log vdev. 513 */ 514 islog = 0; 515 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); 516 if (islog && spa_version(spa) < SPA_VERSION_SLOGS) 517 return (SET_ERROR(ENOTSUP)); 518 519 if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) 520 return (SET_ERROR(ENOTSUP)); 521 522 /* 523 * Set the nparity property for RAID-Z vdevs. 524 */ 525 nparity = -1ULL; 526 if (ops == &vdev_raidz_ops) { 527 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, 528 &nparity) == 0) { 529 if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) 530 return (SET_ERROR(EINVAL)); 531 /* 532 * Previous versions could only support 1 or 2 parity 533 * device. 534 */ 535 if (nparity > 1 && 536 spa_version(spa) < SPA_VERSION_RAIDZ2) 537 return (SET_ERROR(ENOTSUP)); 538 if (nparity > 2 && 539 spa_version(spa) < SPA_VERSION_RAIDZ3) 540 return (SET_ERROR(ENOTSUP)); 541 } else { 542 /* 543 * We require the parity to be specified for SPAs that 544 * support multiple parity levels. 545 */ 546 if (spa_version(spa) >= SPA_VERSION_RAIDZ2) 547 return (SET_ERROR(EINVAL)); 548 /* 549 * Otherwise, we default to 1 parity device for RAID-Z. 550 */ 551 nparity = 1; 552 } 553 } else { 554 nparity = 0; 555 } 556 ASSERT(nparity != -1ULL); 557 558 vd = vdev_alloc_common(spa, id, guid, ops); 559 560 vd->vdev_islog = islog; 561 vd->vdev_nparity = nparity; 562 563 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) 564 vd->vdev_path = spa_strdup(vd->vdev_path); 565 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) 566 vd->vdev_devid = spa_strdup(vd->vdev_devid); 567 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, 568 &vd->vdev_physpath) == 0) 569 vd->vdev_physpath = spa_strdup(vd->vdev_physpath); 570 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0) 571 vd->vdev_fru = spa_strdup(vd->vdev_fru); 572 573 /* 574 * Set the whole_disk property. If it's not specified, leave the value 575 * as -1. 576 */ 577 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 578 &vd->vdev_wholedisk) != 0) 579 vd->vdev_wholedisk = -1ULL; 580 581 /* 582 * Look for the 'not present' flag. This will only be set if the device 583 * was not present at the time of import. 584 */ 585 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 586 &vd->vdev_not_present); 587 588 /* 589 * Get the alignment requirement. 590 */ 591 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); 592 593 /* 594 * Retrieve the vdev creation time. 
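 */

/*
 * [Editorial note, not part of revision 297112.]  Every lookup above is
 * driven by the nvlist config handed to vdev_alloc().  A minimal config
 * for a new leaf disk (alloctype VDEV_ALLOC_ADD) would be built roughly
 * along these lines before being passed in (hypothetical example; the
 * path and ashift values are made up):
 *
 *	VERIFY0(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP));
 *	VERIFY0(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK));
 *	VERIFY0(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, "/dev/da0"));
 *	VERIFY0(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, 12));
 */

/*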
595 */ 596 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, 597 &vd->vdev_crtxg); 598 599 /* 600 * If we're a top-level vdev, try to load the allocation parameters. 601 */ 602 if (parent && !parent->vdev_parent && 603 (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { 604 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, 605 &vd->vdev_ms_array); 606 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, 607 &vd->vdev_ms_shift); 608 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, 609 &vd->vdev_asize); 610 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, 611 &vd->vdev_removing); 612 } 613 614 if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) { 615 ASSERT(alloctype == VDEV_ALLOC_LOAD || 616 alloctype == VDEV_ALLOC_ADD || 617 alloctype == VDEV_ALLOC_SPLIT || 618 alloctype == VDEV_ALLOC_ROOTPOOL); 619 vd->vdev_mg = metaslab_group_create(islog ? 620 spa_log_class(spa) : spa_normal_class(spa), vd); 621 } 622 623 /* 624 * If we're a leaf vdev, try to load the DTL object and other state. 625 */ 626 if (vd->vdev_ops->vdev_op_leaf && 627 (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE || 628 alloctype == VDEV_ALLOC_ROOTPOOL)) { 629 if (alloctype == VDEV_ALLOC_LOAD) { 630 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, 631 &vd->vdev_dtl_object); 632 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, 633 &vd->vdev_unspare); 634 } 635 636 if (alloctype == VDEV_ALLOC_ROOTPOOL) { 637 uint64_t spare = 0; 638 639 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 640 &spare) == 0 && spare) 641 spa_spare_add(vd); 642 } 643 644 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, 645 &vd->vdev_offline); 646 647 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, 648 &vd->vdev_resilver_txg); 649 650 /* 651 * When importing a pool, we want to ignore the persistent fault 652 * state, as the diagnosis made on another system may not be 653 * valid in the current context. Local vdevs will 654 * remain in the faulted state. 655 */ 656 if (spa_load_state(spa) == SPA_LOAD_OPEN) { 657 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, 658 &vd->vdev_faulted); 659 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, 660 &vd->vdev_degraded); 661 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, 662 &vd->vdev_removed); 663 664 if (vd->vdev_faulted || vd->vdev_degraded) { 665 char *aux; 666 667 vd->vdev_label_aux = 668 VDEV_AUX_ERR_EXCEEDED; 669 if (nvlist_lookup_string(nv, 670 ZPOOL_CONFIG_AUX_STATE, &aux) == 0 && 671 strcmp(aux, "external") == 0) 672 vd->vdev_label_aux = VDEV_AUX_EXTERNAL; 673 } 674 } 675 } 676 677 /* 678 * Add ourselves to the parent's list of children. 679 */ 680 vdev_add_child(parent, vd); 681 682 *vdp = vd; 683 684 return (0); 685} 686 687void 688vdev_free(vdev_t *vd) 689{ 690 spa_t *spa = vd->vdev_spa; 691 692 /* 693 * vdev_free() implies closing the vdev first. This is simpler than 694 * trying to ensure complicated semantics for all callers. 695 */ 696 vdev_close(vd); 697 698 ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); 699 ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); 700 701 /* 702 * Free all children. 703 */ 704 for (int c = 0; c < vd->vdev_children; c++) 705 vdev_free(vd->vdev_child[c]); 706 707 ASSERT(vd->vdev_child == NULL); 708 ASSERT(vd->vdev_guid_sum == vd->vdev_guid); 709 710 /* 711 * Discard allocation state. 
712 */ 713 if (vd->vdev_mg != NULL) { 714 vdev_metaslab_fini(vd); 715 metaslab_group_destroy(vd->vdev_mg); 716 } 717 718 ASSERT0(vd->vdev_stat.vs_space); 719 ASSERT0(vd->vdev_stat.vs_dspace); 720 ASSERT0(vd->vdev_stat.vs_alloc); 721 722 /* 723 * Remove this vdev from its parent's child list. 724 */ 725 vdev_remove_child(vd->vdev_parent, vd); 726 727 ASSERT(vd->vdev_parent == NULL); 728 729 /* 730 * Clean up vdev structure. 731 */ 732 vdev_queue_fini(vd); 733 vdev_cache_fini(vd); 734 735 if (vd->vdev_path) 736 spa_strfree(vd->vdev_path); 737 if (vd->vdev_devid) 738 spa_strfree(vd->vdev_devid); 739 if (vd->vdev_physpath) 740 spa_strfree(vd->vdev_physpath); 741 if (vd->vdev_fru) 742 spa_strfree(vd->vdev_fru); 743 744 if (vd->vdev_isspare) 745 spa_spare_remove(vd); 746 if (vd->vdev_isl2cache) 747 spa_l2cache_remove(vd); 748 749 txg_list_destroy(&vd->vdev_ms_list); 750 txg_list_destroy(&vd->vdev_dtl_list); 751 752 mutex_enter(&vd->vdev_dtl_lock); 753 space_map_close(vd->vdev_dtl_sm); 754 for (int t = 0; t < DTL_TYPES; t++) { 755 range_tree_vacate(vd->vdev_dtl[t], NULL, NULL); 756 range_tree_destroy(vd->vdev_dtl[t]); 757 } 758 mutex_exit(&vd->vdev_dtl_lock); 759 760 mutex_destroy(&vd->vdev_dtl_lock); 761 mutex_destroy(&vd->vdev_stat_lock); 762 mutex_destroy(&vd->vdev_probe_lock); 763 764 if (vd == spa->spa_root_vdev) 765 spa->spa_root_vdev = NULL; 766 767 kmem_free(vd, sizeof (vdev_t)); 768} 769 770/* 771 * Transfer top-level vdev state from svd to tvd. 772 */ 773static void 774vdev_top_transfer(vdev_t *svd, vdev_t *tvd) 775{ 776 spa_t *spa = svd->vdev_spa; 777 metaslab_t *msp; 778 vdev_t *vd; 779 int t; 780 781 ASSERT(tvd == tvd->vdev_top); 782 783 tvd->vdev_ms_array = svd->vdev_ms_array; 784 tvd->vdev_ms_shift = svd->vdev_ms_shift; 785 tvd->vdev_ms_count = svd->vdev_ms_count; 786 787 svd->vdev_ms_array = 0; 788 svd->vdev_ms_shift = 0; 789 svd->vdev_ms_count = 0; 790 791 if (tvd->vdev_mg) 792 ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg); 793 tvd->vdev_mg = svd->vdev_mg; 794 tvd->vdev_ms = svd->vdev_ms; 795 796 svd->vdev_mg = NULL; 797 svd->vdev_ms = NULL; 798 799 if (tvd->vdev_mg != NULL) 800 tvd->vdev_mg->mg_vd = tvd; 801 802 tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; 803 tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; 804 tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; 805 806 svd->vdev_stat.vs_alloc = 0; 807 svd->vdev_stat.vs_space = 0; 808 svd->vdev_stat.vs_dspace = 0; 809 810 for (t = 0; t < TXG_SIZE; t++) { 811 while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) 812 (void) txg_list_add(&tvd->vdev_ms_list, msp, t); 813 while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) 814 (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); 815 if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) 816 (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); 817 } 818 819 if (list_link_active(&svd->vdev_config_dirty_node)) { 820 vdev_config_clean(svd); 821 vdev_config_dirty(tvd); 822 } 823 824 if (list_link_active(&svd->vdev_state_dirty_node)) { 825 vdev_state_clean(svd); 826 vdev_state_dirty(tvd); 827 } 828 829 tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; 830 svd->vdev_deflate_ratio = 0; 831 832 tvd->vdev_islog = svd->vdev_islog; 833 svd->vdev_islog = 0; 834} 835 836static void 837vdev_top_update(vdev_t *tvd, vdev_t *vd) 838{ 839 if (vd == NULL) 840 return; 841 842 vd->vdev_top = tvd; 843 844 for (int c = 0; c < vd->vdev_children; c++) 845 vdev_top_update(tvd, vd->vdev_child[c]); 846} 847 848/* 849 * Add a mirror/replacing vdev above an existing vdev. 
850 */ 851vdev_t * 852vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) 853{ 854 spa_t *spa = cvd->vdev_spa; 855 vdev_t *pvd = cvd->vdev_parent; 856 vdev_t *mvd; 857 858 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 859 860 mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); 861 862 mvd->vdev_asize = cvd->vdev_asize; 863 mvd->vdev_min_asize = cvd->vdev_min_asize; 864 mvd->vdev_max_asize = cvd->vdev_max_asize; 865 mvd->vdev_ashift = cvd->vdev_ashift; 866 mvd->vdev_logical_ashift = cvd->vdev_logical_ashift; 867 mvd->vdev_physical_ashift = cvd->vdev_physical_ashift; 868 mvd->vdev_state = cvd->vdev_state; 869 mvd->vdev_crtxg = cvd->vdev_crtxg; 870 871 vdev_remove_child(pvd, cvd); 872 vdev_add_child(pvd, mvd); 873 cvd->vdev_id = mvd->vdev_children; 874 vdev_add_child(mvd, cvd); 875 vdev_top_update(cvd->vdev_top, cvd->vdev_top); 876 877 if (mvd == mvd->vdev_top) 878 vdev_top_transfer(cvd, mvd); 879 880 return (mvd); 881} 882 883/* 884 * Remove a 1-way mirror/replacing vdev from the tree. 885 */ 886void 887vdev_remove_parent(vdev_t *cvd) 888{ 889 vdev_t *mvd = cvd->vdev_parent; 890 vdev_t *pvd = mvd->vdev_parent; 891 892 ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 893 894 ASSERT(mvd->vdev_children == 1); 895 ASSERT(mvd->vdev_ops == &vdev_mirror_ops || 896 mvd->vdev_ops == &vdev_replacing_ops || 897 mvd->vdev_ops == &vdev_spare_ops); 898 cvd->vdev_ashift = mvd->vdev_ashift; 899 cvd->vdev_logical_ashift = mvd->vdev_logical_ashift; 900 cvd->vdev_physical_ashift = mvd->vdev_physical_ashift; 901 902 vdev_remove_child(mvd, cvd); 903 vdev_remove_child(pvd, mvd); 904 905 /* 906 * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. 907 * Otherwise, we could have detached an offline device, and when we 908 * go to import the pool we'll think we have two top-level vdevs, 909 * instead of a different version of the same top-level vdev. 910 */ 911 if (mvd->vdev_top == mvd) { 912 uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; 913 cvd->vdev_orig_guid = cvd->vdev_guid; 914 cvd->vdev_guid += guid_delta; 915 cvd->vdev_guid_sum += guid_delta; 916 } 917 cvd->vdev_id = mvd->vdev_id; 918 vdev_add_child(pvd, cvd); 919 vdev_top_update(cvd->vdev_top, cvd->vdev_top); 920 921 if (cvd == cvd->vdev_top) 922 vdev_top_transfer(mvd, cvd); 923 924 ASSERT(mvd->vdev_children == 0); 925 vdev_free(mvd); 926} 927 928int 929vdev_metaslab_init(vdev_t *vd, uint64_t txg) 930{ 931 spa_t *spa = vd->vdev_spa; 932 objset_t *mos = spa->spa_meta_objset; 933 uint64_t m; 934 uint64_t oldc = vd->vdev_ms_count; 935 uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; 936 metaslab_t **mspp; 937 int error; 938 939 ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 940 941 /* 942 * This vdev is not being allocated from yet or is a hole. 943 */ 944 if (vd->vdev_ms_shift == 0) 945 return (0); 946 947 ASSERT(!vd->vdev_ishole); 948 949 /* 950 * Compute the raidz-deflation ratio. Note, we hard-code 951 * in 128k (1 << 17) because it is the "typical" blocksize. 952 * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change, 953 * otherwise it would inconsistently account for existing bp's. 
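 */

/*
 * [Editorial note, not part of revision 297112.]  Worked example of the
 * expression below: for a plain (non-RAID-Z) vdev where asize == psize,
 * and with SPA_MINBLOCKSHIFT == 9 (512-byte units),
 *
 *	vdev_deflate_ratio = (1 << 17) / ((1 << 17) >> 9)
 *	                   = 131072 / 256 = 512
 *
 * A RAID-Z vdev reports a larger asize for the same 128K psize because of
 * parity overhead, so its ratio comes out smaller; that is what "deflates"
 * the space accounting for existing block pointers.
 */

/*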
954 */ 955 vd->vdev_deflate_ratio = (1 << 17) / 956 (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); 957 958 ASSERT(oldc <= newc); 959 960 mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); 961 962 if (oldc != 0) { 963 bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp)); 964 kmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); 965 } 966 967 vd->vdev_ms = mspp; 968 vd->vdev_ms_count = newc; 969 970 for (m = oldc; m < newc; m++) { 971 uint64_t object = 0; 972 973 if (txg == 0) { 974 error = dmu_read(mos, vd->vdev_ms_array, 975 m * sizeof (uint64_t), sizeof (uint64_t), &object, 976 DMU_READ_PREFETCH); 977 if (error) 978 return (error); 979 } 980 981 error = metaslab_init(vd->vdev_mg, m, object, txg, 982 &(vd->vdev_ms[m])); 983 if (error) 984 return (error); 985 } 986 987 if (txg == 0) 988 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); 989 990 /* 991 * If the vdev is being removed we don't activate 992 * the metaslabs since we want to ensure that no new 993 * allocations are performed on this device. 994 */ 995 if (oldc == 0 && !vd->vdev_removing) 996 metaslab_group_activate(vd->vdev_mg); 997 998 if (txg == 0) 999 spa_config_exit(spa, SCL_ALLOC, FTAG); 1000 1001 return (0); 1002} 1003 1004void 1005vdev_metaslab_fini(vdev_t *vd) 1006{ 1007 uint64_t m; 1008 uint64_t count = vd->vdev_ms_count; 1009 1010 if (vd->vdev_ms != NULL) { 1011 metaslab_group_passivate(vd->vdev_mg); 1012 for (m = 0; m < count; m++) { 1013 metaslab_t *msp = vd->vdev_ms[m]; 1014 1015 if (msp != NULL) 1016 metaslab_fini(msp); 1017 } 1018 kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); 1019 vd->vdev_ms = NULL; 1020 } 1021} 1022 1023typedef struct vdev_probe_stats { 1024 boolean_t vps_readable; 1025 boolean_t vps_writeable; 1026 int vps_flags; 1027} vdev_probe_stats_t; 1028 1029static void 1030vdev_probe_done(zio_t *zio) 1031{ 1032 spa_t *spa = zio->io_spa; 1033 vdev_t *vd = zio->io_vd; 1034 vdev_probe_stats_t *vps = zio->io_private; 1035 1036 ASSERT(vd->vdev_probe_zio != NULL); 1037 1038 if (zio->io_type == ZIO_TYPE_READ) { 1039 if (zio->io_error == 0) 1040 vps->vps_readable = 1; 1041 if (zio->io_error == 0 && spa_writeable(spa)) { 1042 zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, 1043 zio->io_offset, zio->io_size, zio->io_data, 1044 ZIO_CHECKSUM_OFF, vdev_probe_done, vps, 1045 ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); 1046 } else { 1047 zio_buf_free(zio->io_data, zio->io_size); 1048 } 1049 } else if (zio->io_type == ZIO_TYPE_WRITE) { 1050 if (zio->io_error == 0) 1051 vps->vps_writeable = 1; 1052 zio_buf_free(zio->io_data, zio->io_size); 1053 } else if (zio->io_type == ZIO_TYPE_NULL) { 1054 zio_t *pio; 1055 1056 vd->vdev_cant_read |= !vps->vps_readable; 1057 vd->vdev_cant_write |= !vps->vps_writeable; 1058 1059 if (vdev_readable(vd) && 1060 (vdev_writeable(vd) || !spa_writeable(spa))) { 1061 zio->io_error = 0; 1062 } else { 1063 ASSERT(zio->io_error != 0); 1064 zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, 1065 spa, vd, NULL, 0, 0); 1066 zio->io_error = SET_ERROR(ENXIO); 1067 } 1068 1069 mutex_enter(&vd->vdev_probe_lock); 1070 ASSERT(vd->vdev_probe_zio == zio); 1071 vd->vdev_probe_zio = NULL; 1072 mutex_exit(&vd->vdev_probe_lock); 1073 1074 while ((pio = zio_walk_parents(zio)) != NULL) 1075 if (!vdev_accessible(vd, pio)) 1076 pio->io_error = SET_ERROR(ENXIO); 1077 1078 kmem_free(vps, sizeof (*vps)); 1079 } 1080} 1081 1082/* 1083 * Determine whether this device is accessible. 
1084 * 1085 * Read and write to several known locations: the pad regions of each 1086 * vdev label but the first, which we leave alone in case it contains 1087 * a VTOC. 1088 */ 1089zio_t * 1090vdev_probe(vdev_t *vd, zio_t *zio) 1091{ 1092 spa_t *spa = vd->vdev_spa; 1093 vdev_probe_stats_t *vps = NULL; 1094 zio_t *pio; 1095 1096 ASSERT(vd->vdev_ops->vdev_op_leaf); 1097 1098 /* 1099 * Don't probe the probe. 1100 */ 1101 if (zio && (zio->io_flags & ZIO_FLAG_PROBE)) 1102 return (NULL); 1103 1104 /* 1105 * To prevent 'probe storms' when a device fails, we create 1106 * just one probe i/o at a time. All zios that want to probe 1107 * this vdev will become parents of the probe io. 1108 */ 1109 mutex_enter(&vd->vdev_probe_lock); 1110 1111 if ((pio = vd->vdev_probe_zio) == NULL) { 1112 vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); 1113 1114 vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | 1115 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | 1116 ZIO_FLAG_TRYHARD; 1117 1118 if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { 1119 /* 1120 * vdev_cant_read and vdev_cant_write can only 1121 * transition from TRUE to FALSE when we have the 1122 * SCL_ZIO lock as writer; otherwise they can only 1123 * transition from FALSE to TRUE. This ensures that 1124 * any zio looking at these values can assume that 1125 * failures persist for the life of the I/O. That's 1126 * important because when a device has intermittent 1127 * connectivity problems, we want to ensure that 1128 * they're ascribed to the device (ENXIO) and not 1129 * the zio (EIO). 1130 * 1131 * Since we hold SCL_ZIO as writer here, clear both 1132 * values so the probe can reevaluate from first 1133 * principles. 1134 */ 1135 vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; 1136 vd->vdev_cant_read = B_FALSE; 1137 vd->vdev_cant_write = B_FALSE; 1138 } 1139 1140 vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, 1141 vdev_probe_done, vps, 1142 vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); 1143 1144 /* 1145 * We can't change the vdev state in this context, so we 1146 * kick off an async task to do it on our behalf. 
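 */

/*
 * [Editorial note, not part of revision 297112.]  vdev_probe() is used in
 * two ways: asynchronously from the I/O path, where the failing zio is
 * passed in and becomes a parent of the shared probe zio, and
 * synchronously, as vdev_open() does later in this file:
 *
 *	if (vd->vdev_ops->vdev_op_leaf &&
 *	    (error = zio_wait(vdev_probe(vd, NULL))) != 0)
 *		... fault the vdev with VDEV_AUX_ERR_EXCEEDED ...
 */

/*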
1147 */ 1148 if (zio != NULL) { 1149 vd->vdev_probe_wanted = B_TRUE; 1150 spa_async_request(spa, SPA_ASYNC_PROBE); 1151 } 1152 } 1153 1154 if (zio != NULL) 1155 zio_add_child(zio, pio); 1156 1157 mutex_exit(&vd->vdev_probe_lock); 1158 1159 if (vps == NULL) { 1160 ASSERT(zio != NULL); 1161 return (NULL); 1162 } 1163 1164 for (int l = 1; l < VDEV_LABELS; l++) { 1165 zio_nowait(zio_read_phys(pio, vd, 1166 vdev_label_offset(vd->vdev_psize, l, 1167 offsetof(vdev_label_t, vl_pad2)), 1168 VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE), 1169 ZIO_CHECKSUM_OFF, vdev_probe_done, vps, 1170 ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); 1171 } 1172 1173 if (zio == NULL) 1174 return (pio); 1175 1176 zio_nowait(pio); 1177 return (NULL); 1178} 1179 1180static void 1181vdev_open_child(void *arg) 1182{ 1183 vdev_t *vd = arg; 1184 1185 vd->vdev_open_thread = curthread; 1186 vd->vdev_open_error = vdev_open(vd); 1187 vd->vdev_open_thread = NULL; 1188} 1189 1190boolean_t 1191vdev_uses_zvols(vdev_t *vd) 1192{ 1193 if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR, 1194 strlen(ZVOL_DIR)) == 0) 1195 return (B_TRUE); 1196 for (int c = 0; c < vd->vdev_children; c++) 1197 if (vdev_uses_zvols(vd->vdev_child[c])) 1198 return (B_TRUE); 1199 return (B_FALSE); 1200} 1201 1202void 1203vdev_open_children(vdev_t *vd) 1204{ 1205 taskq_t *tq; 1206 int children = vd->vdev_children; 1207 1208 /* 1209 * in order to handle pools on top of zvols, do the opens 1210 * in a single thread so that the same thread holds the 1211 * spa_namespace_lock 1212 */ 1213 if (B_TRUE || vdev_uses_zvols(vd)) { 1214 for (int c = 0; c < children; c++) 1215 vd->vdev_child[c]->vdev_open_error = 1216 vdev_open(vd->vdev_child[c]); 1217 return; 1218 } 1219 tq = taskq_create("vdev_open", children, minclsyspri, 1220 children, children, TASKQ_PREPOPULATE); 1221 1222 for (int c = 0; c < children; c++) 1223 VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c], 1224 TQ_SLEEP) != 0); 1225 1226 taskq_destroy(tq); 1227} 1228 1229/* 1230 * Prepare a virtual device for access. 1231 */ 1232int 1233vdev_open(vdev_t *vd) 1234{ 1235 spa_t *spa = vd->vdev_spa; 1236 int error; 1237 uint64_t osize = 0; 1238 uint64_t max_osize = 0; 1239 uint64_t asize, max_asize, psize; 1240 uint64_t logical_ashift = 0; 1241 uint64_t physical_ashift = 0; 1242 1243 ASSERT(vd->vdev_open_thread == curthread || 1244 spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 1245 ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || 1246 vd->vdev_state == VDEV_STATE_CANT_OPEN || 1247 vd->vdev_state == VDEV_STATE_OFFLINE); 1248 1249 vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 1250 vd->vdev_cant_read = B_FALSE; 1251 vd->vdev_cant_write = B_FALSE; 1252 vd->vdev_notrim = B_FALSE; 1253 vd->vdev_min_asize = vdev_get_min_asize(vd); 1254 1255 /* 1256 * If this vdev is not removed, check its fault status. If it's 1257 * faulted, bail out of the open. 
1258 */ 1259 if (!vd->vdev_removed && vd->vdev_faulted) { 1260 ASSERT(vd->vdev_children == 0); 1261 ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || 1262 vd->vdev_label_aux == VDEV_AUX_EXTERNAL); 1263 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, 1264 vd->vdev_label_aux); 1265 return (SET_ERROR(ENXIO)); 1266 } else if (vd->vdev_offline) { 1267 ASSERT(vd->vdev_children == 0); 1268 vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); 1269 return (SET_ERROR(ENXIO)); 1270 } 1271 1272 error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, 1273 &logical_ashift, &physical_ashift); 1274 1275 /* 1276 * Reset the vdev_reopening flag so that we actually close 1277 * the vdev on error. 1278 */ 1279 vd->vdev_reopening = B_FALSE; 1280 if (zio_injection_enabled && error == 0) 1281 error = zio_handle_device_injection(vd, NULL, ENXIO); 1282 1283 if (error) { 1284 if (vd->vdev_removed && 1285 vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) 1286 vd->vdev_removed = B_FALSE; 1287 1288 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1289 vd->vdev_stat.vs_aux); 1290 return (error); 1291 } 1292 1293 vd->vdev_removed = B_FALSE; 1294 1295 /* 1296 * Recheck the faulted flag now that we have confirmed that 1297 * the vdev is accessible. If we're faulted, bail. 1298 */ 1299 if (vd->vdev_faulted) { 1300 ASSERT(vd->vdev_children == 0); 1301 ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || 1302 vd->vdev_label_aux == VDEV_AUX_EXTERNAL); 1303 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, 1304 vd->vdev_label_aux); 1305 return (SET_ERROR(ENXIO)); 1306 } 1307 1308 if (vd->vdev_degraded) { 1309 ASSERT(vd->vdev_children == 0); 1310 vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, 1311 VDEV_AUX_ERR_EXCEEDED); 1312 } else { 1313 vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0); 1314 } 1315 1316 /* 1317 * For hole or missing vdevs we just return success. 1318 */ 1319 if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) 1320 return (0); 1321 1322 if (zfs_trim_enabled && !vd->vdev_notrim && vd->vdev_ops->vdev_op_leaf) 1323 trim_map_create(vd); 1324 1325 for (int c = 0; c < vd->vdev_children; c++) { 1326 if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { 1327 vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, 1328 VDEV_AUX_NONE); 1329 break; 1330 } 1331 } 1332 1333 osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); 1334 max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t)); 1335 1336 if (vd->vdev_children == 0) { 1337 if (osize < SPA_MINDEVSIZE) { 1338 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1339 VDEV_AUX_TOO_SMALL); 1340 return (SET_ERROR(EOVERFLOW)); 1341 } 1342 psize = osize; 1343 asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); 1344 max_asize = max_osize - (VDEV_LABEL_START_SIZE + 1345 VDEV_LABEL_END_SIZE); 1346 } else { 1347 if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - 1348 (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { 1349 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1350 VDEV_AUX_TOO_SMALL); 1351 return (SET_ERROR(EOVERFLOW)); 1352 } 1353 psize = 0; 1354 asize = osize; 1355 max_asize = max_osize; 1356 } 1357 1358 vd->vdev_psize = psize; 1359 1360 /* 1361 * Make sure the allocatable size hasn't shrunk. 
1362 */ 1363 if (asize < vd->vdev_min_asize) { 1364 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1365 VDEV_AUX_BAD_LABEL); 1366 return (SET_ERROR(EINVAL)); 1367 } 1368 1369 vd->vdev_physical_ashift = 1370 MAX(physical_ashift, vd->vdev_physical_ashift); 1371 vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift); 1372 vd->vdev_ashift = MAX(vd->vdev_logical_ashift, vd->vdev_ashift); 1373 1374 if (vd->vdev_logical_ashift > SPA_MAXASHIFT) { 1375 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1376 VDEV_AUX_ASHIFT_TOO_BIG); 1377 return (EINVAL); 1378 } 1379 1380 if (vd->vdev_asize == 0) { 1381 /* 1382 * This is the first-ever open, so use the computed values. 1383 * For testing purposes, a higher ashift can be requested. 1384 */ 1385 vd->vdev_asize = asize; 1386 vd->vdev_max_asize = max_asize; 1387 } else { 1388 /* 1389 * Make sure the alignment requirement hasn't increased. 1390 */ 1391 if (vd->vdev_ashift > vd->vdev_top->vdev_ashift && 1392 vd->vdev_ops->vdev_op_leaf) { 1393 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1394 VDEV_AUX_BAD_LABEL); 1395 return (EINVAL); 1396 } 1397 vd->vdev_max_asize = max_asize; 1398 } 1399 1400 /* 1401 * If all children are healthy and the asize has increased, 1402 * then we've experienced dynamic LUN growth. If automatic 1403 * expansion is enabled then use the additional space. 1404 */ 1405 if (vd->vdev_state == VDEV_STATE_HEALTHY && asize > vd->vdev_asize && 1406 (vd->vdev_expanding || spa->spa_autoexpand)) 1407 vd->vdev_asize = asize; 1408 1409 vdev_set_min_asize(vd); 1410 1411 /* 1412 * Ensure we can issue some IO before declaring the 1413 * vdev open for business. 1414 */ 1415 if (vd->vdev_ops->vdev_op_leaf && 1416 (error = zio_wait(vdev_probe(vd, NULL))) != 0) { 1417 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, 1418 VDEV_AUX_ERR_EXCEEDED); 1419 return (error); 1420 } 1421 1422 /* 1423 * Track the min and max ashift values for normal data devices. 1424 */ 1425 if (vd->vdev_top == vd && vd->vdev_ashift != 0 && 1426 !vd->vdev_islog && vd->vdev_aux == NULL) { 1427 if (vd->vdev_ashift > spa->spa_max_ashift) 1428 spa->spa_max_ashift = vd->vdev_ashift; 1429 if (vd->vdev_ashift < spa->spa_min_ashift) 1430 spa->spa_min_ashift = vd->vdev_ashift; 1431 } 1432 1433 /* 1434 * If a leaf vdev has a DTL, and seems healthy, then kick off a 1435 * resilver. But don't do this if we are doing a reopen for a scrub, 1436 * since this would just restart the scrub we are already doing. 1437 */ 1438 if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen && 1439 vdev_resilver_needed(vd, NULL, NULL)) 1440 spa_async_request(spa, SPA_ASYNC_RESILVER); 1441 1442 return (0); 1443} 1444 1445/* 1446 * Called once the vdevs are all opened, this routine validates the label 1447 * contents. This needs to be done before vdev_load() so that we don't 1448 * inadvertently do repair I/Os to the wrong device. 1449 * 1450 * If 'strict' is false ignore the spa guid check. This is necessary because 1451 * if the machine crashed during a re-guid the new guid might have been written 1452 * to all of the vdev labels, but not the cached config. The strict check 1453 * will be performed when the pool is opened again using the mos config. 1454 * 1455 * This function will only return failure if one of the vdevs indicates that it 1456 * has since been destroyed or exported. This is only possible if 1457 * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state 1458 * will be updated but the function will return 0. 
1459 */ 1460int 1461vdev_validate(vdev_t *vd, boolean_t strict) 1462{ 1463 spa_t *spa = vd->vdev_spa; 1464 nvlist_t *label; 1465 uint64_t guid = 0, top_guid; 1466 uint64_t state; 1467 1468 for (int c = 0; c < vd->vdev_children; c++) 1469 if (vdev_validate(vd->vdev_child[c], strict) != 0) 1470 return (SET_ERROR(EBADF)); 1471 1472 /* 1473 * If the device has already failed, or was marked offline, don't do 1474 * any further validation. Otherwise, label I/O will fail and we will 1475 * overwrite the previous state. 1476 */ 1477 if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { 1478 uint64_t aux_guid = 0; 1479 nvlist_t *nvl; 1480 uint64_t txg = spa_last_synced_txg(spa) != 0 ? 1481 spa_last_synced_txg(spa) : -1ULL; 1482 1483 if ((label = vdev_label_read_config(vd, txg)) == NULL) { 1484 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1485 VDEV_AUX_BAD_LABEL); 1486 return (0); 1487 } 1488 1489 /* 1490 * Determine if this vdev has been split off into another 1491 * pool. If so, then refuse to open it. 1492 */ 1493 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, 1494 &aux_guid) == 0 && aux_guid == spa_guid(spa)) { 1495 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1496 VDEV_AUX_SPLIT_POOL); 1497 nvlist_free(label); 1498 return (0); 1499 } 1500 1501 if (strict && (nvlist_lookup_uint64(label, 1502 ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || 1503 guid != spa_guid(spa))) { 1504 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1505 VDEV_AUX_CORRUPT_DATA); 1506 nvlist_free(label); 1507 return (0); 1508 } 1509 1510 if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) 1511 != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, 1512 &aux_guid) != 0) 1513 aux_guid = 0; 1514 1515 /* 1516 * If this vdev just became a top-level vdev because its 1517 * sibling was detached, it will have adopted the parent's 1518 * vdev guid -- but the label may or may not be on disk yet. 1519 * Fortunately, either version of the label will have the 1520 * same top guid, so if we're a top-level vdev, we can 1521 * safely compare to that instead. 1522 * 1523 * If we split this vdev off instead, then we also check the 1524 * original pool's guid. We don't want to consider the vdev 1525 * corrupt if it is partway through a split operation. 1526 */ 1527 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, 1528 &guid) != 0 || 1529 nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, 1530 &top_guid) != 0 || 1531 ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) && 1532 (vd->vdev_guid != top_guid || vd != vd->vdev_top))) { 1533 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1534 VDEV_AUX_CORRUPT_DATA); 1535 nvlist_free(label); 1536 return (0); 1537 } 1538 1539 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, 1540 &state) != 0) { 1541 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1542 VDEV_AUX_CORRUPT_DATA); 1543 nvlist_free(label); 1544 return (0); 1545 } 1546 1547 nvlist_free(label); 1548 1549 /* 1550 * If this is a verbatim import, no need to check the 1551 * state of the pool. 1552 */ 1553 if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) && 1554 spa_load_state(spa) == SPA_LOAD_OPEN && 1555 state != POOL_STATE_ACTIVE) 1556 return (SET_ERROR(EBADF)); 1557 1558 /* 1559 * If we were able to open and validate a vdev that was 1560 * previously marked permanently unavailable, clear that state 1561 * now. 1562 */ 1563 if (vd->vdev_not_present) 1564 vd->vdev_not_present = 0; 1565 } 1566 1567 return (0); 1568} 1569 1570/* 1571 * Close a virtual device. 
1572 */ 1573void 1574vdev_close(vdev_t *vd) 1575{ 1576 spa_t *spa = vd->vdev_spa; 1577 vdev_t *pvd = vd->vdev_parent; 1578 1579 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 1580 1581 /* 1582 * If our parent is reopening, then we are as well, unless we are 1583 * going offline. 1584 */ 1585 if (pvd != NULL && pvd->vdev_reopening) 1586 vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline); 1587 1588 vd->vdev_ops->vdev_op_close(vd); 1589 1590 vdev_cache_purge(vd); 1591 1592 if (vd->vdev_ops->vdev_op_leaf) 1593 trim_map_destroy(vd); 1594 1595 /* 1596 * We record the previous state before we close it, so that if we are 1597 * doing a reopen(), we don't generate FMA ereports if we notice that 1598 * it's still faulted. 1599 */ 1600 vd->vdev_prevstate = vd->vdev_state; 1601 1602 if (vd->vdev_offline) 1603 vd->vdev_state = VDEV_STATE_OFFLINE; 1604 else 1605 vd->vdev_state = VDEV_STATE_CLOSED; 1606 vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 1607} 1608 1609void 1610vdev_hold(vdev_t *vd) 1611{ 1612 spa_t *spa = vd->vdev_spa; 1613 1614 ASSERT(spa_is_root(spa)); 1615 if (spa->spa_state == POOL_STATE_UNINITIALIZED) 1616 return; 1617 1618 for (int c = 0; c < vd->vdev_children; c++) 1619 vdev_hold(vd->vdev_child[c]); 1620 1621 if (vd->vdev_ops->vdev_op_leaf) 1622 vd->vdev_ops->vdev_op_hold(vd); 1623} 1624 1625void 1626vdev_rele(vdev_t *vd) 1627{ 1628 spa_t *spa = vd->vdev_spa; 1629 1630 ASSERT(spa_is_root(spa)); 1631 for (int c = 0; c < vd->vdev_children; c++) 1632 vdev_rele(vd->vdev_child[c]); 1633 1634 if (vd->vdev_ops->vdev_op_leaf) 1635 vd->vdev_ops->vdev_op_rele(vd); 1636} 1637 1638/* 1639 * Reopen all interior vdevs and any unopened leaves. We don't actually 1640 * reopen leaf vdevs which had previously been opened as they might deadlock 1641 * on the spa_config_lock. Instead we only obtain the leaf's physical size. 1642 * If the leaf has never been opened then open it, as usual. 1643 */ 1644void 1645vdev_reopen(vdev_t *vd) 1646{ 1647 spa_t *spa = vd->vdev_spa; 1648 1649 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 1650 1651 /* set the reopening flag unless we're taking the vdev offline */ 1652 vd->vdev_reopening = !vd->vdev_offline; 1653 vdev_close(vd); 1654 (void) vdev_open(vd); 1655 1656 /* 1657 * Call vdev_validate() here to make sure we have the same device. 1658 * Otherwise, a device with an invalid label could be successfully 1659 * opened in response to vdev_reopen(). 1660 */ 1661 if (vd->vdev_aux) { 1662 (void) vdev_validate_aux(vd); 1663 if (vdev_readable(vd) && vdev_writeable(vd) && 1664 vd->vdev_aux == &spa->spa_l2cache && 1665 !l2arc_vdev_present(vd)) 1666 l2arc_add_vdev(spa, vd); 1667 } else { 1668 (void) vdev_validate(vd, B_TRUE); 1669 } 1670 1671 /* 1672 * Reassess parent vdev's health. 1673 */ 1674 vdev_propagate_state(vd); 1675} 1676 1677int 1678vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) 1679{ 1680 int error; 1681 1682 /* 1683 * Normally, partial opens (e.g. of a mirror) are allowed. 1684 * For a create, however, we want to fail the request if 1685 * there are any components we can't open. 1686 */ 1687 error = vdev_open(vd); 1688 1689 if (error || vd->vdev_state != VDEV_STATE_HEALTHY) { 1690 vdev_close(vd); 1691 return (error ? error : ENXIO); 1692 } 1693 1694 /* 1695 * Recursively load DTLs and initialize all labels. 1696 */ 1697 if ((error = vdev_dtl_load(vd)) != 0 || 1698 (error = vdev_label_init(vd, txg, isreplacing ? 
1699 VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { 1700 vdev_close(vd); 1701 return (error); 1702 } 1703 1704 return (0); 1705} 1706 1707void 1708vdev_metaslab_set_size(vdev_t *vd) 1709{ 1710 /* 1711 * Aim for roughly metaslabs_per_vdev (default 200) metaslabs per vdev. 1712 */ 1713 vd->vdev_ms_shift = highbit64(vd->vdev_asize / metaslabs_per_vdev); 1714 vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT); 1715} 1716 1717/* 1718 * Maximize performance by inflating the configured ashift for top level 1719 * vdevs to be as close to the physical ashift as possible while maintaining 1720 * administrator defined limits and ensuring it doesn't go below the 1721 * logical ashift. 1722 */ 1723void 1724vdev_ashift_optimize(vdev_t *vd) 1725{ 1726 if (vd == vd->vdev_top) { 1727 if (vd->vdev_ashift < vd->vdev_physical_ashift) { 1728 vd->vdev_ashift = MIN( 1729 MAX(zfs_max_auto_ashift, vd->vdev_ashift), 1730 MAX(zfs_min_auto_ashift, vd->vdev_physical_ashift)); 1731 } else { 1732 /* 1733 * Unusual case where logical ashift > physical ashift 1734 * so we can't cap the calculated ashift based on max 1735 * ashift as that would cause failures. 1736 * We still check if we need to increase it to match 1737 * the min ashift. 1738 */ 1739 vd->vdev_ashift = MAX(zfs_min_auto_ashift, 1740 vd->vdev_ashift); 1741 } 1742 } 1743} 1744 1745void 1746vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) 1747{ 1748 ASSERT(vd == vd->vdev_top); 1749 ASSERT(!vd->vdev_ishole); 1750 ASSERT(ISP2(flags)); 1751 ASSERT(spa_writeable(vd->vdev_spa)); 1752 1753 if (flags & VDD_METASLAB) 1754 (void) txg_list_add(&vd->vdev_ms_list, arg, txg); 1755 1756 if (flags & VDD_DTL) 1757 (void) txg_list_add(&vd->vdev_dtl_list, arg, txg); 1758 1759 (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); 1760} 1761 1762void 1763vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg) 1764{ 1765 for (int c = 0; c < vd->vdev_children; c++) 1766 vdev_dirty_leaves(vd->vdev_child[c], flags, txg); 1767 1768 if (vd->vdev_ops->vdev_op_leaf) 1769 vdev_dirty(vd->vdev_top, flags, vd, txg); 1770} 1771 1772/* 1773 * DTLs. 1774 * 1775 * A vdev's DTL (dirty time log) is the set of transaction groups for which 1776 * the vdev has less than perfect replication. There are four kinds of DTL: 1777 * 1778 * DTL_MISSING: txgs for which the vdev has no valid copies of the data 1779 * 1780 * DTL_PARTIAL: txgs for which data is available, but not fully replicated 1781 * 1782 * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon 1783 * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of 1784 * txgs that was scrubbed. 1785 * 1786 * DTL_OUTAGE: txgs which cannot currently be read, whether due to 1787 * persistent errors or just some device being offline. 1788 * Unlike the other three, the DTL_OUTAGE map is not generally 1789 * maintained; it's only computed when needed, typically to 1790 * determine whether a device can be detached. 1791 * 1792 * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device 1793 * either has the data or it doesn't. 1794 * 1795 * For interior vdevs such as mirror and RAID-Z the picture is more complex. 1796 * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because 1797 * if any child is less than fully replicated, then so is its parent. 1798 * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs, 1799 * comprising only those txgs which appear in 'maxfaults' or more children; 1800 * those are the txgs we don't have enough replication to read. 
For example, 1801 * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2); 1802 * thus, its DTL_MISSING consists of the set of txgs that appear in more than 1803 * two child DTL_MISSING maps. 1804 * 1805 * It should be clear from the above that to compute the DTLs and outage maps 1806 * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps. 1807 * Therefore, that is all we keep on disk. When loading the pool, or after 1808 * a configuration change, we generate all other DTLs from first principles. 1809 */ 1810void 1811vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) 1812{ 1813 range_tree_t *rt = vd->vdev_dtl[t]; 1814 1815 ASSERT(t < DTL_TYPES); 1816 ASSERT(vd != vd->vdev_spa->spa_root_vdev); 1817 ASSERT(spa_writeable(vd->vdev_spa)); 1818 1819 mutex_enter(rt->rt_lock); 1820 if (!range_tree_contains(rt, txg, size)) 1821 range_tree_add(rt, txg, size); 1822 mutex_exit(rt->rt_lock); 1823} 1824 1825boolean_t 1826vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) 1827{ 1828 range_tree_t *rt = vd->vdev_dtl[t]; 1829 boolean_t dirty = B_FALSE; 1830 1831 ASSERT(t < DTL_TYPES); 1832 ASSERT(vd != vd->vdev_spa->spa_root_vdev); 1833 1834 mutex_enter(rt->rt_lock); 1835 if (range_tree_space(rt) != 0) 1836 dirty = range_tree_contains(rt, txg, size); 1837 mutex_exit(rt->rt_lock); 1838 1839 return (dirty); 1840} 1841 1842boolean_t 1843vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) 1844{ 1845 range_tree_t *rt = vd->vdev_dtl[t]; 1846 boolean_t empty; 1847 1848 mutex_enter(rt->rt_lock); 1849 empty = (range_tree_space(rt) == 0); 1850 mutex_exit(rt->rt_lock); 1851 1852 return (empty); 1853} 1854 1855/* 1856 * Returns the lowest txg in the DTL range. 1857 */ 1858static uint64_t 1859vdev_dtl_min(vdev_t *vd) 1860{ 1861 range_seg_t *rs; 1862 1863 ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); 1864 ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); 1865 ASSERT0(vd->vdev_children); 1866 1867 rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root); 1868 return (rs->rs_start - 1); 1869} 1870 1871/* 1872 * Returns the highest txg in the DTL. 1873 */ 1874static uint64_t 1875vdev_dtl_max(vdev_t *vd) 1876{ 1877 range_seg_t *rs; 1878 1879 ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); 1880 ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); 1881 ASSERT0(vd->vdev_children); 1882 1883 rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root); 1884 return (rs->rs_end); 1885} 1886 1887/* 1888 * Determine if a resilvering vdev should remove any DTL entries from 1889 * its range. If the vdev was resilvering for the entire duration of the 1890 * scan then it should excise that range from its DTLs. Otherwise, this 1891 * vdev is considered partially resilvered and should leave its DTL 1892 * entries intact. The comment in vdev_dtl_reassess() describes how we 1893 * excise the DTLs. 1894 */ 1895static boolean_t 1896vdev_dtl_should_excise(vdev_t *vd) 1897{ 1898 spa_t *spa = vd->vdev_spa; 1899 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; 1900 1901 ASSERT0(scn->scn_phys.scn_errors); 1902 ASSERT0(vd->vdev_children); 1903 1904 if (vd->vdev_resilver_txg == 0 || 1905 range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0) 1906 return (B_TRUE); 1907 1908 /* 1909 * When a resilver is initiated the scan will assign the scn_max_txg 1910 * value to the highest txg value that exists in all DTLs. If this 1911 * device's max DTL is not part of this scan (i.e. it is not in 1912 * the range (scn_min_txg, scn_max_txg] then it is not eligible 1913 * for excision. 
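 */

/*
 * [Editorial note, not part of revision 297112.]  The 'maxfaults' rule
 * described in the DTL comment above appears in vdev_dtl_reassess() below
 * as the reference-count threshold 'minref' used when merging child maps:
 *
 *	DTL_PARTIAL:		minref = 1		(any child missing)
 *	RAID-Z (nparity = P):	minref = P + 1		(> P children missing)
 *	mirror (N children):	minref = N		(all children missing)
 *
 * e.g. for raidz2 (nparity == 2) a txg lands in the top-level DTL_MISSING
 * only if it appears in at least three children's DTL_MISSING maps.
 */

/*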
1914 */ 1915 if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { 1916 ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd)); 1917 ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg); 1918 ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg); 1919 return (B_TRUE); 1920 } 1921 return (B_FALSE); 1922} 1923 1924/* 1925 * Reassess DTLs after a config change or scrub completion. 1926 */ 1927void 1928vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) 1929{ 1930 spa_t *spa = vd->vdev_spa; 1931 avl_tree_t reftree; 1932 int minref; 1933 1934 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 1935 1936 for (int c = 0; c < vd->vdev_children; c++) 1937 vdev_dtl_reassess(vd->vdev_child[c], txg, 1938 scrub_txg, scrub_done); 1939 1940 if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux) 1941 return; 1942 1943 if (vd->vdev_ops->vdev_op_leaf) { 1944 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; 1945 1946 mutex_enter(&vd->vdev_dtl_lock); 1947 1948 /* 1949 * If we've completed a scan cleanly then determine 1950 * if this vdev should remove any DTLs. We only want to 1951 * excise regions on vdevs that were available during 1952 * the entire duration of this scan. 1953 */ 1954 if (scrub_txg != 0 && 1955 (spa->spa_scrub_started || 1956 (scn != NULL && scn->scn_phys.scn_errors == 0)) && 1957 vdev_dtl_should_excise(vd)) { 1958 /* 1959 * We completed a scrub up to scrub_txg. If we 1960 * did it without rebooting, then the scrub dtl 1961 * will be valid, so excise the old region and 1962 * fold in the scrub dtl. Otherwise, leave the 1963 * dtl as-is if there was an error. 1964 * 1965 * There's little trick here: to excise the beginning 1966 * of the DTL_MISSING map, we put it into a reference 1967 * tree and then add a segment with refcnt -1 that 1968 * covers the range [0, scrub_txg). This means 1969 * that each txg in that range has refcnt -1 or 0. 1970 * We then add DTL_SCRUB with a refcnt of 2, so that 1971 * entries in the range [0, scrub_txg) will have a 1972 * positive refcnt -- either 1 or 2. We then convert 1973 * the reference tree into the new DTL_MISSING map. 1974 */ 1975 space_reftree_create(&reftree); 1976 space_reftree_add_map(&reftree, 1977 vd->vdev_dtl[DTL_MISSING], 1); 1978 space_reftree_add_seg(&reftree, 0, scrub_txg, -1); 1979 space_reftree_add_map(&reftree, 1980 vd->vdev_dtl[DTL_SCRUB], 2); 1981 space_reftree_generate_map(&reftree, 1982 vd->vdev_dtl[DTL_MISSING], 1); 1983 space_reftree_destroy(&reftree); 1984 } 1985 range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); 1986 range_tree_walk(vd->vdev_dtl[DTL_MISSING], 1987 range_tree_add, vd->vdev_dtl[DTL_PARTIAL]); 1988 if (scrub_done) 1989 range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL); 1990 range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); 1991 if (!vdev_readable(vd)) 1992 range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); 1993 else 1994 range_tree_walk(vd->vdev_dtl[DTL_MISSING], 1995 range_tree_add, vd->vdev_dtl[DTL_OUTAGE]); 1996 1997 /* 1998 * If the vdev was resilvering and no longer has any 1999 * DTLs then reset its resilvering flag and dirty 2000 * the top level so that we persist the change. 
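 */

/*
 * [Editorial note, not part of revision 297112.]  Worked example of the
 * reference-tree excision described above, for a scrub that completed at
 * scrub_txg = 100:
 *
 *	txg 50, in DTL_MISSING, repaired (not in DTL_SCRUB):	1 - 1     = 0
 *	txg 60, in DTL_MISSING, still bad (in DTL_SCRUB):	1 - 1 + 2 = 2
 *	txg 120, in DTL_MISSING, after scrub_txg:		1         = 1
 *
 * Generating the new DTL_MISSING with minref 1 therefore drops txg 50 and
 * keeps txgs 60 and 120.
 */

/*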
2001 */ 2002 if (vd->vdev_resilver_txg != 0 && 2003 range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0 && 2004 range_tree_space(vd->vdev_dtl[DTL_OUTAGE]) == 0) { 2005 vd->vdev_resilver_txg = 0; 2006 vdev_config_dirty(vd->vdev_top); 2007 } 2008 2009 mutex_exit(&vd->vdev_dtl_lock); 2010 2011 if (txg != 0) 2012 vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); 2013 return; 2014 } 2015 2016 mutex_enter(&vd->vdev_dtl_lock); 2017 for (int t = 0; t < DTL_TYPES; t++) { 2018 /* account for child's outage in parent's missing map */ 2019 int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; 2020 if (t == DTL_SCRUB) 2021 continue; /* leaf vdevs only */ 2022 if (t == DTL_PARTIAL) 2023 minref = 1; /* i.e. non-zero */ 2024 else if (vd->vdev_nparity != 0) 2025 minref = vd->vdev_nparity + 1; /* RAID-Z */ 2026 else 2027 minref = vd->vdev_children; /* any kind of mirror */ 2028 space_reftree_create(&reftree); 2029 for (int c = 0; c < vd->vdev_children; c++) { 2030 vdev_t *cvd = vd->vdev_child[c]; 2031 mutex_enter(&cvd->vdev_dtl_lock); 2032 space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1); 2033 mutex_exit(&cvd->vdev_dtl_lock); 2034 } 2035 space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); 2036 space_reftree_destroy(&reftree); 2037 } 2038 mutex_exit(&vd->vdev_dtl_lock); 2039} 2040 2041int 2042vdev_dtl_load(vdev_t *vd) 2043{ 2044 spa_t *spa = vd->vdev_spa; 2045 objset_t *mos = spa->spa_meta_objset; 2046 int error = 0; 2047 2048 if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) { 2049 ASSERT(!vd->vdev_ishole); 2050 2051 error = space_map_open(&vd->vdev_dtl_sm, mos, 2052 vd->vdev_dtl_object, 0, -1ULL, 0, &vd->vdev_dtl_lock); 2053 if (error) 2054 return (error); 2055 ASSERT(vd->vdev_dtl_sm != NULL); 2056 2057 mutex_enter(&vd->vdev_dtl_lock); 2058 2059 /* 2060 * Now that we've opened the space_map we need to update 2061 * the in-core DTL. 
2062 */ 2063 space_map_update(vd->vdev_dtl_sm); 2064 2065 error = space_map_load(vd->vdev_dtl_sm, 2066 vd->vdev_dtl[DTL_MISSING], SM_ALLOC); 2067 mutex_exit(&vd->vdev_dtl_lock); 2068 2069 return (error); 2070 } 2071 2072 for (int c = 0; c < vd->vdev_children; c++) { 2073 error = vdev_dtl_load(vd->vdev_child[c]); 2074 if (error != 0) 2075 break; 2076 } 2077 2078 return (error); 2079} 2080 2081void 2082vdev_dtl_sync(vdev_t *vd, uint64_t txg) 2083{ 2084 spa_t *spa = vd->vdev_spa; 2085 range_tree_t *rt = vd->vdev_dtl[DTL_MISSING]; 2086 objset_t *mos = spa->spa_meta_objset; 2087 range_tree_t *rtsync; 2088 kmutex_t rtlock; 2089 dmu_tx_t *tx; 2090 uint64_t object = space_map_object(vd->vdev_dtl_sm); 2091 2092 ASSERT(!vd->vdev_ishole); 2093 ASSERT(vd->vdev_ops->vdev_op_leaf); 2094 2095 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2096 2097 if (vd->vdev_detached || vd->vdev_top->vdev_removing) { 2098 mutex_enter(&vd->vdev_dtl_lock); 2099 space_map_free(vd->vdev_dtl_sm, tx); 2100 space_map_close(vd->vdev_dtl_sm); 2101 vd->vdev_dtl_sm = NULL; 2102 mutex_exit(&vd->vdev_dtl_lock); 2103 dmu_tx_commit(tx); 2104 return; 2105 } 2106 2107 if (vd->vdev_dtl_sm == NULL) { 2108 uint64_t new_object; 2109 2110 new_object = space_map_alloc(mos, tx); 2111 VERIFY3U(new_object, !=, 0); 2112 2113 VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, 2114 0, -1ULL, 0, &vd->vdev_dtl_lock)); 2115 ASSERT(vd->vdev_dtl_sm != NULL); 2116 } 2117 2118 bzero(&rtlock, sizeof(rtlock)); 2119 mutex_init(&rtlock, NULL, MUTEX_DEFAULT, NULL); 2120 2121 rtsync = range_tree_create(NULL, NULL, &rtlock); 2122 2123 mutex_enter(&rtlock); 2124 2125 mutex_enter(&vd->vdev_dtl_lock); 2126 range_tree_walk(rt, range_tree_add, rtsync); 2127 mutex_exit(&vd->vdev_dtl_lock); 2128 2129 space_map_truncate(vd->vdev_dtl_sm, tx); 2130 space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx); 2131 range_tree_vacate(rtsync, NULL, NULL); 2132 2133 range_tree_destroy(rtsync); 2134 2135 mutex_exit(&rtlock); 2136 mutex_destroy(&rtlock); 2137 2138 /* 2139 * If the object for the space map has changed then dirty 2140 * the top level so that we update the config. 2141 */ 2142 if (object != space_map_object(vd->vdev_dtl_sm)) { 2143 zfs_dbgmsg("txg %llu, spa %s, DTL old object %llu, " 2144 "new object %llu", txg, spa_name(spa), object, 2145 space_map_object(vd->vdev_dtl_sm)); 2146 vdev_config_dirty(vd->vdev_top); 2147 } 2148 2149 dmu_tx_commit(tx); 2150 2151 mutex_enter(&vd->vdev_dtl_lock); 2152 space_map_update(vd->vdev_dtl_sm); 2153 mutex_exit(&vd->vdev_dtl_lock); 2154} 2155 2156/* 2157 * Determine whether the specified vdev can be offlined/detached/removed 2158 * without losing data. 2159 */ 2160boolean_t 2161vdev_dtl_required(vdev_t *vd) 2162{ 2163 spa_t *spa = vd->vdev_spa; 2164 vdev_t *tvd = vd->vdev_top; 2165 uint8_t cant_read = vd->vdev_cant_read; 2166 boolean_t required; 2167 2168 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 2169 2170 if (vd == spa->spa_root_vdev || vd == tvd) 2171 return (B_TRUE); 2172 2173 /* 2174 * Temporarily mark the device as unreadable, and then determine 2175 * whether this results in any DTL outages in the top-level vdev. 2176 * If not, we can safely offline/detach/remove the device. 
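 *
 * Illustrative example (hypothetical): in a two-way mirror where the
 * sibling's DTL_MISSING still covers txgs 100-110, pretending this
 * child is unreadable leaves the top-level DTL_OUTAGE non-empty for
 * those txgs, so the device is still required.  Once the sibling has
 * resilvered, the same probe comes back empty and the
 * offline/detach/remove can proceed.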
2177 */ 2178 vd->vdev_cant_read = B_TRUE; 2179 vdev_dtl_reassess(tvd, 0, 0, B_FALSE); 2180 required = !vdev_dtl_empty(tvd, DTL_OUTAGE); 2181 vd->vdev_cant_read = cant_read; 2182 vdev_dtl_reassess(tvd, 0, 0, B_FALSE); 2183 2184 if (!required && zio_injection_enabled) 2185 required = !!zio_handle_device_injection(vd, NULL, ECHILD); 2186 2187 return (required); 2188} 2189 2190/* 2191 * Determine if resilver is needed, and if so the txg range. 2192 */ 2193boolean_t 2194vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) 2195{ 2196 boolean_t needed = B_FALSE; 2197 uint64_t thismin = UINT64_MAX; 2198 uint64_t thismax = 0; 2199 2200 if (vd->vdev_children == 0) { 2201 mutex_enter(&vd->vdev_dtl_lock); 2202 if (range_tree_space(vd->vdev_dtl[DTL_MISSING]) != 0 && 2203 vdev_writeable(vd)) { 2204 2205 thismin = vdev_dtl_min(vd); 2206 thismax = vdev_dtl_max(vd); 2207 needed = B_TRUE; 2208 } 2209 mutex_exit(&vd->vdev_dtl_lock); 2210 } else { 2211 for (int c = 0; c < vd->vdev_children; c++) { 2212 vdev_t *cvd = vd->vdev_child[c]; 2213 uint64_t cmin, cmax; 2214 2215 if (vdev_resilver_needed(cvd, &cmin, &cmax)) { 2216 thismin = MIN(thismin, cmin); 2217 thismax = MAX(thismax, cmax); 2218 needed = B_TRUE; 2219 } 2220 } 2221 } 2222 2223 if (needed && minp) { 2224 *minp = thismin; 2225 *maxp = thismax; 2226 } 2227 return (needed); 2228} 2229 2230void 2231vdev_load(vdev_t *vd) 2232{ 2233 /* 2234 * Recursively load all children. 2235 */ 2236 for (int c = 0; c < vd->vdev_children; c++) 2237 vdev_load(vd->vdev_child[c]); 2238 2239 /* 2240 * If this is a top-level vdev, initialize its metaslabs. 2241 */ 2242 if (vd == vd->vdev_top && !vd->vdev_ishole && 2243 (vd->vdev_ashift == 0 || vd->vdev_asize == 0 || 2244 vdev_metaslab_init(vd, 0) != 0)) 2245 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2246 VDEV_AUX_CORRUPT_DATA); 2247 2248 /* 2249 * If this is a leaf vdev, load its DTL. 2250 */ 2251 if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0) 2252 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2253 VDEV_AUX_CORRUPT_DATA); 2254} 2255 2256/* 2257 * The special vdev case is used for hot spares and l2cache devices. Its 2258 * sole purpose is to set the vdev state for the associated vdev. To do this, 2259 * we make sure that we can open the underlying device, then try to read the 2260 * label, and make sure that the label is sane and that it hasn't been 2261 * repurposed to another pool. 2262 */ 2263int 2264vdev_validate_aux(vdev_t *vd) 2265{ 2266 nvlist_t *label; 2267 uint64_t guid, version; 2268 uint64_t state; 2269 2270 if (!vdev_readable(vd)) 2271 return (0); 2272 2273 if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) { 2274 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 2275 VDEV_AUX_CORRUPT_DATA); 2276 return (-1); 2277 } 2278 2279 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || 2280 !SPA_VERSION_IS_SUPPORTED(version) || 2281 nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || 2282 guid != vd->vdev_guid || 2283 nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { 2284 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 2285 VDEV_AUX_CORRUPT_DATA); 2286 nvlist_free(label); 2287 return (-1); 2288 } 2289 2290 /* 2291 * We don't actually check the pool state here. If it's in fact in 2292 * use by another pool, we update this fact on the fly when requested.
2293 */ 2294 nvlist_free(label); 2295 return (0); 2296} 2297 2298void 2299vdev_remove(vdev_t *vd, uint64_t txg) 2300{ 2301 spa_t *spa = vd->vdev_spa; 2302 objset_t *mos = spa->spa_meta_objset; 2303 dmu_tx_t *tx; 2304 2305 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 2306 2307 if (vd->vdev_ms != NULL) { 2308 metaslab_group_t *mg = vd->vdev_mg; 2309 2310 metaslab_group_histogram_verify(mg); 2311 metaslab_class_histogram_verify(mg->mg_class); 2312 2313 for (int m = 0; m < vd->vdev_ms_count; m++) { 2314 metaslab_t *msp = vd->vdev_ms[m]; 2315 2316 if (msp == NULL || msp->ms_sm == NULL) 2317 continue; 2318 2319 mutex_enter(&msp->ms_lock); 2320 /* 2321 * If the metaslab was not loaded when the vdev 2322 * was removed then the histogram accounting may 2323 * not be accurate. Update the histogram information 2324 * here so that we ensure that the metaslab group 2325 * and metaslab class are up-to-date. 2326 */ 2327 metaslab_group_histogram_remove(mg, msp); 2328 2329 VERIFY0(space_map_allocated(msp->ms_sm)); 2330 space_map_free(msp->ms_sm, tx); 2331 space_map_close(msp->ms_sm); 2332 msp->ms_sm = NULL; 2333 mutex_exit(&msp->ms_lock); 2334 } 2335 2336 metaslab_group_histogram_verify(mg); 2337 metaslab_class_histogram_verify(mg->mg_class); 2338 for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 2339 ASSERT0(mg->mg_histogram[i]); 2340 2341 } 2342 2343 if (vd->vdev_ms_array) { 2344 (void) dmu_object_free(mos, vd->vdev_ms_array, tx); 2345 vd->vdev_ms_array = 0; 2346 } 2347 dmu_tx_commit(tx); 2348} 2349 2350void 2351vdev_sync_done(vdev_t *vd, uint64_t txg) 2352{ 2353 metaslab_t *msp; 2354 boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); 2355 2356 ASSERT(!vd->vdev_ishole); 2357 2358 while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) 2359 metaslab_sync_done(msp, txg); 2360 2361 if (reassess) 2362 metaslab_sync_reassess(vd->vdev_mg); 2363} 2364 2365void 2366vdev_sync(vdev_t *vd, uint64_t txg) 2367{ 2368 spa_t *spa = vd->vdev_spa; 2369 vdev_t *lvd; 2370 metaslab_t *msp; 2371 dmu_tx_t *tx; 2372 2373 ASSERT(!vd->vdev_ishole); 2374 2375 if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) { 2376 ASSERT(vd == vd->vdev_top); 2377 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2378 vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, 2379 DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); 2380 ASSERT(vd->vdev_ms_array != 0); 2381 vdev_config_dirty(vd); 2382 dmu_tx_commit(tx); 2383 } 2384 2385 /* 2386 * Remove the metadata associated with this vdev once it's empty. 2387 */ 2388 if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) 2389 vdev_remove(vd, txg); 2390 2391 while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { 2392 metaslab_sync(msp, txg); 2393 (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); 2394 } 2395 2396 while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) 2397 vdev_dtl_sync(lvd, txg); 2398 2399 (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); 2400} 2401 2402uint64_t 2403vdev_psize_to_asize(vdev_t *vd, uint64_t psize) 2404{ 2405 return (vd->vdev_ops->vdev_op_asize(vd, psize)); 2406} 2407 2408/* 2409 * Mark the given vdev faulted. A faulted vdev behaves as if the device could 2410 * not be opened, and no I/O is attempted. 
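 *
 * For example (hypothetical scenario): when the FMA retire agent
 * diagnoses a disk as faulty it asks for that leaf, by guid, to be
 * faulted.  If that leaf turns out to hold the only valid copy of
 * some txgs (vdev_dtl_required()), the code below downgrades the
 * request and leaves the device merely DEGRADED so the pool can keep
 * serving data.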
2411 */ 2412int 2413vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) 2414{ 2415 vdev_t *vd, *tvd; 2416 2417 spa_vdev_state_enter(spa, SCL_NONE); 2418 2419 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 2420 return (spa_vdev_state_exit(spa, NULL, ENODEV)); 2421 2422 if (!vd->vdev_ops->vdev_op_leaf) 2423 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 2424 2425 tvd = vd->vdev_top; 2426 2427 /* 2428 * We don't directly use the aux state here, but if we do a 2429 * vdev_reopen(), we need this value to be present to remember why we 2430 * were faulted. 2431 */ 2432 vd->vdev_label_aux = aux; 2433 2434 /* 2435 * Faulted state takes precedence over degraded. 2436 */ 2437 vd->vdev_delayed_close = B_FALSE; 2438 vd->vdev_faulted = 1ULL; 2439 vd->vdev_degraded = 0ULL; 2440 vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); 2441 2442 /* 2443 * If this device has the only valid copy of the data, then 2444 * back off and simply mark the vdev as degraded instead. 2445 */ 2446 if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { 2447 vd->vdev_degraded = 1ULL; 2448 vd->vdev_faulted = 0ULL; 2449 2450 /* 2451 * If we reopen the device and it's not dead, only then do we 2452 * mark it degraded. 2453 */ 2454 vdev_reopen(tvd); 2455 2456 if (vdev_readable(vd)) 2457 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); 2458 } 2459 2460 return (spa_vdev_state_exit(spa, vd, 0)); 2461} 2462 2463/* 2464 * Mark the given vdev degraded. A degraded vdev is purely an indication to the 2465 * user that something is wrong. The vdev continues to operate as normal as far 2466 * as I/O is concerned. 2467 */ 2468int 2469vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) 2470{ 2471 vdev_t *vd; 2472 2473 spa_vdev_state_enter(spa, SCL_NONE); 2474 2475 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 2476 return (spa_vdev_state_exit(spa, NULL, ENODEV)); 2477 2478 if (!vd->vdev_ops->vdev_op_leaf) 2479 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 2480 2481 /* 2482 * If the vdev is already faulted, then don't do anything. 2483 */ 2484 if (vd->vdev_faulted || vd->vdev_degraded) 2485 return (spa_vdev_state_exit(spa, NULL, 0)); 2486 2487 vd->vdev_degraded = 1ULL; 2488 if (!vdev_is_dead(vd)) 2489 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, 2490 aux); 2491 2492 return (spa_vdev_state_exit(spa, vd, 0)); 2493} 2494 2495/* 2496 * Online the given vdev. 2497 * 2498 * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached 2499 * spare device should be detached when the device finishes resilvering. 2500 * Second, the online should be treated like a 'test' online case, so no FMA 2501 * events are generated if the device fails to open. 2502 */ 2503int 2504vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) 2505{ 2506 vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; 2507 boolean_t postevent = B_FALSE; 2508 2509 spa_vdev_state_enter(spa, SCL_NONE); 2510 2511 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 2512 return (spa_vdev_state_exit(spa, NULL, ENODEV)); 2513 2514 if (!vd->vdev_ops->vdev_op_leaf) 2515 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 2516 2517 postevent = 2518 (vd->vdev_offline == B_TRUE || vd->vdev_tmpoffline == B_TRUE) ? 
2519 B_TRUE : B_FALSE; 2520 2521 tvd = vd->vdev_top; 2522 vd->vdev_offline = B_FALSE; 2523 vd->vdev_tmpoffline = B_FALSE; 2524 vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); 2525 vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); 2526 2527 /* XXX - L2ARC 1.0 does not support expansion */ 2528 if (!vd->vdev_aux) { 2529 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) 2530 pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND); 2531 } 2532 2533 vdev_reopen(tvd); 2534 vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; 2535 2536 if (!vd->vdev_aux) { 2537 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) 2538 pvd->vdev_expanding = B_FALSE; 2539 } 2540 2541 if (newstate) 2542 *newstate = vd->vdev_state; 2543 if ((flags & ZFS_ONLINE_UNSPARE) && 2544 !vdev_is_dead(vd) && vd->vdev_parent && 2545 vd->vdev_parent->vdev_ops == &vdev_spare_ops && 2546 vd->vdev_parent->vdev_child[0] == vd) 2547 vd->vdev_unspare = B_TRUE; 2548 2549 if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) { 2550 2551 /* XXX - L2ARC 1.0 does not support expansion */ 2552 if (vd->vdev_aux) 2553 return (spa_vdev_state_exit(spa, vd, ENOTSUP)); 2554 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 2555 } 2556 2557 if (postevent) 2558 spa_event_notify(spa, vd, ESC_ZFS_VDEV_ONLINE); 2559 2560 return (spa_vdev_state_exit(spa, vd, 0)); 2561} 2562 2563static int 2564vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) 2565{ 2566 vdev_t *vd, *tvd; 2567 int error = 0; 2568 uint64_t generation; 2569 metaslab_group_t *mg; 2570 2571top: 2572 spa_vdev_state_enter(spa, SCL_ALLOC); 2573 2574 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 2575 return (spa_vdev_state_exit(spa, NULL, ENODEV)); 2576 2577 if (!vd->vdev_ops->vdev_op_leaf) 2578 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 2579 2580 tvd = vd->vdev_top; 2581 mg = tvd->vdev_mg; 2582 generation = spa->spa_config_generation + 1; 2583 2584 /* 2585 * If the device isn't already offline, try to offline it. 2586 */ 2587 if (!vd->vdev_offline) { 2588 /* 2589 * If this device has the only valid copy of some data, 2590 * don't allow it to be offlined. Log devices are always 2591 * expendable. 2592 */ 2593 if (!tvd->vdev_islog && vd->vdev_aux == NULL && 2594 vdev_dtl_required(vd)) 2595 return (spa_vdev_state_exit(spa, NULL, EBUSY)); 2596 2597 /* 2598 * If the top-level is a slog and it has had allocations 2599 * then proceed. We check that the vdev's metaslab group 2600 * is not NULL since it's possible that we may have just 2601 * added this vdev but not yet initialized its metaslabs. 2602 */ 2603 if (tvd->vdev_islog && mg != NULL) { 2604 /* 2605 * Prevent any future allocations. 2606 */ 2607 metaslab_group_passivate(mg); 2608 (void) spa_vdev_state_exit(spa, vd, 0); 2609 2610 error = spa_offline_log(spa); 2611 2612 spa_vdev_state_enter(spa, SCL_ALLOC); 2613 2614 /* 2615 * Check to see if the config has changed. 2616 */ 2617 if (error || generation != spa->spa_config_generation) { 2618 metaslab_group_activate(mg); 2619 if (error) 2620 return (spa_vdev_state_exit(spa, 2621 vd, error)); 2622 (void) spa_vdev_state_exit(spa, vd, 0); 2623 goto top; 2624 } 2625 ASSERT0(tvd->vdev_stat.vs_alloc); 2626 } 2627 2628 /* 2629 * Offline this device and reopen its top-level vdev. 2630 * If the top-level vdev is a log device then just offline 2631 * it. Otherwise, if this action results in the top-level 2632 * vdev becoming unusable, undo it and fail the request. 
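 *
 * Concretely (hypothetical layout): offlining one side of a healthy
 * two-way mirror succeeds because the reopened top-level remains
 * usable, whereas offlining a child whose siblings are all already
 * dead or offline would leave vdev_is_dead(tvd) true, so the offline
 * flag is rolled back and the request fails with EBUSY.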
2633 */ 2634 vd->vdev_offline = B_TRUE; 2635 vdev_reopen(tvd); 2636 2637 if (!tvd->vdev_islog && vd->vdev_aux == NULL && 2638 vdev_is_dead(tvd)) { 2639 vd->vdev_offline = B_FALSE; 2640 vdev_reopen(tvd); 2641 return (spa_vdev_state_exit(spa, NULL, EBUSY)); 2642 } 2643 2644 /* 2645 * Add the device back into the metaslab rotor so that 2646 * once we online the device it's open for business. 2647 */ 2648 if (tvd->vdev_islog && mg != NULL) 2649 metaslab_group_activate(mg); 2650 } 2651 2652 vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); 2653 2654 return (spa_vdev_state_exit(spa, vd, 0)); 2655} 2656 2657int 2658vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) 2659{ 2660 int error; 2661 2662 mutex_enter(&spa->spa_vdev_top_lock); 2663 error = vdev_offline_locked(spa, guid, flags); 2664 mutex_exit(&spa->spa_vdev_top_lock); 2665 2666 return (error); 2667} 2668 2669/* 2670 * Clear the error counts associated with this vdev. Unlike vdev_online() and 2671 * vdev_offline(), we assume the spa config is locked. We also clear all 2672 * children. If 'vd' is NULL, then the user wants to clear all vdevs. 2673 */ 2674void 2675vdev_clear(spa_t *spa, vdev_t *vd) 2676{ 2677 vdev_t *rvd = spa->spa_root_vdev; 2678 2679 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 2680 2681 if (vd == NULL) 2682 vd = rvd; 2683 2684 vd->vdev_stat.vs_read_errors = 0; 2685 vd->vdev_stat.vs_write_errors = 0; 2686 vd->vdev_stat.vs_checksum_errors = 0; 2687 2688 for (int c = 0; c < vd->vdev_children; c++) 2689 vdev_clear(spa, vd->vdev_child[c]); 2690 2691 if (vd == rvd) { 2692 for (int c = 0; c < spa->spa_l2cache.sav_count; c++) 2693 vdev_clear(spa, spa->spa_l2cache.sav_vdevs[c]); 2694 2695 for (int c = 0; c < spa->spa_spares.sav_count; c++) 2696 vdev_clear(spa, spa->spa_spares.sav_vdevs[c]); 2697 } 2698 2699 /* 2700 * If we're in the FAULTED state or have experienced failed I/O, then 2701 * clear the persistent state and attempt to reopen the device. We 2702 * also mark the vdev config dirty, so that the new faulted state is 2703 * written out to disk. 2704 */ 2705 if (vd->vdev_faulted || vd->vdev_degraded || 2706 !vdev_readable(vd) || !vdev_writeable(vd)) { 2707 2708 /* 2709 * When reopening in response to a clear event, it may be due to 2710 * a fmadm repair request. In this case, if the device is 2711 * still broken, we want to post the ereport again. 2712 */ 2713 vd->vdev_forcefault = B_TRUE; 2714 2715 vd->vdev_faulted = vd->vdev_degraded = 0ULL; 2716 vd->vdev_cant_read = B_FALSE; 2717 vd->vdev_cant_write = B_FALSE; 2718 2719 vdev_reopen(vd == rvd ? rvd : vd->vdev_top); 2720 2721 vd->vdev_forcefault = B_FALSE; 2722 2723 if (vd != rvd && vdev_writeable(vd->vdev_top)) 2724 vdev_state_dirty(vd->vdev_top); 2725 2726 if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) 2727 spa_async_request(spa, SPA_ASYNC_RESILVER); 2728 2729 spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR); 2730 } 2731 2732 /* 2733 * When clearing a FMA-diagnosed fault, we always want to 2734 * unspare the device, as we assume that the original spare was 2735 * done in response to the FMA fault. 2736 */ 2737 if (!vdev_is_dead(vd) && vd->vdev_parent != NULL && 2738 vd->vdev_parent->vdev_ops == &vdev_spare_ops && 2739 vd->vdev_parent->vdev_child[0] == vd) 2740 vd->vdev_unspare = B_TRUE; 2741} 2742 2743boolean_t 2744vdev_is_dead(vdev_t *vd) 2745{ 2746 /* 2747 * Holes and missing devices are always considered "dead".
2748 * This simplifies the code since we don't have to check for 2749 * these types of devices in the various code paths. 2750 * Instead we rely on the fact that we skip over dead devices 2751 * before issuing I/O to them. 2752 */ 2753 return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole || 2754 vd->vdev_ops == &vdev_missing_ops); 2755} 2756 2757boolean_t 2758vdev_readable(vdev_t *vd) 2759{ 2760 return (!vdev_is_dead(vd) && !vd->vdev_cant_read); 2761} 2762 2763boolean_t 2764vdev_writeable(vdev_t *vd) 2765{ 2766 return (!vdev_is_dead(vd) && !vd->vdev_cant_write); 2767} 2768 2769boolean_t 2770vdev_allocatable(vdev_t *vd) 2771{ 2772 uint64_t state = vd->vdev_state; 2773 2774 /* 2775 * We currently allow allocations from vdevs which may be in the 2776 * process of reopening (i.e. VDEV_STATE_CLOSED). If the device 2777 * fails to reopen then we'll catch it later when we're holding 2778 * the proper locks. Note that we have to get the vdev state 2779 * in a local variable because although it changes atomically, 2780 * we're asking two separate questions about it. 2781 */ 2782 return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && 2783 !vd->vdev_cant_write && !vd->vdev_ishole); 2784} 2785 2786boolean_t 2787vdev_accessible(vdev_t *vd, zio_t *zio) 2788{ 2789 ASSERT(zio->io_vd == vd); 2790 2791 if (vdev_is_dead(vd) || vd->vdev_remove_wanted) 2792 return (B_FALSE); 2793 2794 if (zio->io_type == ZIO_TYPE_READ) 2795 return (!vd->vdev_cant_read); 2796 2797 if (zio->io_type == ZIO_TYPE_WRITE) 2798 return (!vd->vdev_cant_write); 2799 2800 return (B_TRUE); 2801} 2802 2803/* 2804 * Get statistics for the given vdev. 2805 */ 2806void 2807vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) 2808{ 2809 spa_t *spa = vd->vdev_spa; 2810 vdev_t *rvd = spa->spa_root_vdev; 2811 2812 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 2813 2814 mutex_enter(&vd->vdev_stat_lock); 2815 bcopy(&vd->vdev_stat, vs, sizeof (*vs)); 2816 vs->vs_timestamp = gethrtime() - vs->vs_timestamp; 2817 vs->vs_state = vd->vdev_state; 2818 vs->vs_rsize = vdev_get_min_asize(vd); 2819 if (vd->vdev_ops->vdev_op_leaf) 2820 vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; 2821 if (vd->vdev_max_asize != 0) 2822 vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize; 2823 vs->vs_configured_ashift = vd->vdev_top != NULL 2824 ? vd->vdev_top->vdev_ashift : vd->vdev_ashift; 2825 vs->vs_logical_ashift = vd->vdev_logical_ashift; 2826 vs->vs_physical_ashift = vd->vdev_physical_ashift; 2827 if (vd->vdev_aux == NULL && vd == vd->vdev_top && !vd->vdev_ishole) { 2828 vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation; 2829 } 2830 2831 /* 2832 * If we're getting stats on the root vdev, aggregate the I/O counts 2833 * over all top-level vdevs (i.e. the direct children of the root). 
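 *
 * For example (hypothetical counts): with two top-level mirrors that
 * have serviced 1000 and 3000 reads respectively, the root vdev
 * reports vs_ops[ZIO_TYPE_READ] == 4000 here, computed on demand;
 * the per-i/o accounting in vdev_stat_update() skips successful
 * root-level i/o precisely so that the root's vdev_stat_lock is not
 * contended on every i/o.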
2834 */ 2835 if (vd == rvd) { 2836 for (int c = 0; c < rvd->vdev_children; c++) { 2837 vdev_t *cvd = rvd->vdev_child[c]; 2838 vdev_stat_t *cvs = &cvd->vdev_stat; 2839 2840 for (int t = 0; t < ZIO_TYPES; t++) { 2841 vs->vs_ops[t] += cvs->vs_ops[t]; 2842 vs->vs_bytes[t] += cvs->vs_bytes[t]; 2843 } 2844 cvs->vs_scan_removing = cvd->vdev_removing; 2845 } 2846 } 2847 mutex_exit(&vd->vdev_stat_lock); 2848} 2849 2850void 2851vdev_clear_stats(vdev_t *vd) 2852{ 2853 mutex_enter(&vd->vdev_stat_lock); 2854 vd->vdev_stat.vs_space = 0; 2855 vd->vdev_stat.vs_dspace = 0; 2856 vd->vdev_stat.vs_alloc = 0; 2857 mutex_exit(&vd->vdev_stat_lock); 2858} 2859 2860void 2861vdev_scan_stat_init(vdev_t *vd) 2862{ 2863 vdev_stat_t *vs = &vd->vdev_stat; 2864 2865 for (int c = 0; c < vd->vdev_children; c++) 2866 vdev_scan_stat_init(vd->vdev_child[c]); 2867 2868 mutex_enter(&vd->vdev_stat_lock); 2869 vs->vs_scan_processed = 0; 2870 mutex_exit(&vd->vdev_stat_lock); 2871} 2872 2873void 2874vdev_stat_update(zio_t *zio, uint64_t psize) 2875{ 2876 spa_t *spa = zio->io_spa; 2877 vdev_t *rvd = spa->spa_root_vdev; 2878 vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; 2879 vdev_t *pvd; 2880 uint64_t txg = zio->io_txg; 2881 vdev_stat_t *vs = &vd->vdev_stat; 2882 zio_type_t type = zio->io_type; 2883 int flags = zio->io_flags; 2884 2885 /* 2886 * If this i/o is a gang leader, it didn't do any actual work. 2887 */ 2888 if (zio->io_gang_tree) 2889 return; 2890 2891 if (zio->io_error == 0) { 2892 /* 2893 * If this is a root i/o, don't count it -- we've already 2894 * counted the top-level vdevs, and vdev_get_stats() will 2895 * aggregate them when asked. This reduces contention on 2896 * the root vdev_stat_lock and implicitly handles blocks 2897 * that compress away to holes, for which there is no i/o. 2898 * (Holes never create vdev children, so all the counters 2899 * remain zero, which is what we want.) 2900 * 2901 * Note: this only applies to successful i/o (io_error == 0) 2902 * because unlike i/o counts, errors are not additive. 2903 * When reading a ditto block, for example, failure of 2904 * one top-level vdev does not imply a root-level error. 2905 */ 2906 if (vd == rvd) 2907 return; 2908 2909 ASSERT(vd == zio->io_vd); 2910 2911 if (flags & ZIO_FLAG_IO_BYPASS) 2912 return; 2913 2914 mutex_enter(&vd->vdev_stat_lock); 2915 2916 if (flags & ZIO_FLAG_IO_REPAIR) { 2917 if (flags & ZIO_FLAG_SCAN_THREAD) { 2918 dsl_scan_phys_t *scn_phys = 2919 &spa->spa_dsl_pool->dp_scan->scn_phys; 2920 uint64_t *processed = &scn_phys->scn_processed; 2921 2922 /* XXX cleanup? */ 2923 if (vd->vdev_ops->vdev_op_leaf) 2924 atomic_add_64(processed, psize); 2925 vs->vs_scan_processed += psize; 2926 } 2927 2928 if (flags & ZIO_FLAG_SELF_HEAL) 2929 vs->vs_self_healed += psize; 2930 } 2931 2932 vs->vs_ops[type]++; 2933 vs->vs_bytes[type] += psize; 2934 2935 mutex_exit(&vd->vdev_stat_lock); 2936 return; 2937 } 2938 2939 if (flags & ZIO_FLAG_SPECULATIVE) 2940 return; 2941 2942 /* 2943 * If this is an I/O error that is going to be retried, then ignore the 2944 * error. Otherwise, the user may interpret B_FAILFAST I/O errors as 2945 * hard errors, when in reality they can happen for any number of 2946 * innocuous reasons (bus resets, MPxIO link failure, etc). 2947 */ 2948 if (zio->io_error == EIO && 2949 !(zio->io_flags & ZIO_FLAG_IO_RETRY)) 2950 return; 2951 2952 /* 2953 * Intent logs writes won't propagate their error to the root 2954 * I/O so don't mark these types of failures as pool-level 2955 * errors. 
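 *
 * For example (hypothetical failure): a log-block write issued by
 * zil_commit() that fails is handled inside the ZIL, which can fall
 * back to waiting for the txg to sync, so counting it here would
 * surface a pool-level error even though no committed data was ever
 * at risk.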
2956 */ 2957 if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 2958 return; 2959 2960 mutex_enter(&vd->vdev_stat_lock); 2961 if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) { 2962 if (zio->io_error == ECKSUM) 2963 vs->vs_checksum_errors++; 2964 else 2965 vs->vs_read_errors++; 2966 } 2967 if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd)) 2968 vs->vs_write_errors++; 2969 mutex_exit(&vd->vdev_stat_lock); 2970 2971 if (type == ZIO_TYPE_WRITE && txg != 0 && 2972 (!(flags & ZIO_FLAG_IO_REPAIR) || 2973 (flags & ZIO_FLAG_SCAN_THREAD) || 2974 spa->spa_claiming)) { 2975 /* 2976 * This is either a normal write (not a repair), or it's 2977 * a repair induced by the scrub thread, or it's a repair 2978 * made by zil_claim() during spa_load() in the first txg. 2979 * In the normal case, we commit the DTL change in the same 2980 * txg as the block was born. In the scrub-induced repair 2981 * case, we know that scrubs run in first-pass syncing context, 2982 * so we commit the DTL change in spa_syncing_txg(spa). 2983 * In the zil_claim() case, we commit in spa_first_txg(spa). 2984 * 2985 * We currently do not make DTL entries for failed spontaneous 2986 * self-healing writes triggered by normal (non-scrubbing) 2987 * reads, because we have no transactional context in which to 2988 * do so -- and it's not clear that it'd be desirable anyway. 2989 */ 2990 if (vd->vdev_ops->vdev_op_leaf) { 2991 uint64_t commit_txg = txg; 2992 if (flags & ZIO_FLAG_SCAN_THREAD) { 2993 ASSERT(flags & ZIO_FLAG_IO_REPAIR); 2994 ASSERT(spa_sync_pass(spa) == 1); 2995 vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); 2996 commit_txg = spa_syncing_txg(spa); 2997 } else if (spa->spa_claiming) { 2998 ASSERT(flags & ZIO_FLAG_IO_REPAIR); 2999 commit_txg = spa_first_txg(spa); 3000 } 3001 ASSERT(commit_txg >= spa_syncing_txg(spa)); 3002 if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) 3003 return; 3004 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) 3005 vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1); 3006 vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg); 3007 } 3008 if (vd != rvd) 3009 vdev_dtl_dirty(vd, DTL_MISSING, txg, 1); 3010 } 3011} 3012 3013/* 3014 * Update the in-core space usage stats for this vdev, its metaslab class, 3015 * and the root vdev. 3016 */ 3017void 3018vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, 3019 int64_t space_delta) 3020{ 3021 int64_t dspace_delta = space_delta; 3022 spa_t *spa = vd->vdev_spa; 3023 vdev_t *rvd = spa->spa_root_vdev; 3024 metaslab_group_t *mg = vd->vdev_mg; 3025 metaslab_class_t *mc = mg ? mg->mg_class : NULL; 3026 3027 ASSERT(vd == vd->vdev_top); 3028 3029 /* 3030 * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion 3031 * factor. We must calculate this here and not at the root vdev 3032 * because the root vdev's psize-to-asize is simply the max of its 3033 * childrens', thus not accurate enough for us. 
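 *
 * Rough worked example (hypothetical 6-disk raidz2, ignoring
 * per-block rounding): only about 4/6 of every allocation is user
 * data, and vdev_deflate_ratio captures that, so an asize delta of
 * +6 MB contributes roughly +4 MB of dspace here.  On a plain mirror
 * the ratio is the identity and dspace simply tracks asize.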
3034 */ 3035 ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0); 3036 ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); 3037 dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) * 3038 vd->vdev_deflate_ratio; 3039 3040 mutex_enter(&vd->vdev_stat_lock); 3041 vd->vdev_stat.vs_alloc += alloc_delta; 3042 vd->vdev_stat.vs_space += space_delta; 3043 vd->vdev_stat.vs_dspace += dspace_delta; 3044 mutex_exit(&vd->vdev_stat_lock); 3045 3046 if (mc == spa_normal_class(spa)) { 3047 mutex_enter(&rvd->vdev_stat_lock); 3048 rvd->vdev_stat.vs_alloc += alloc_delta; 3049 rvd->vdev_stat.vs_space += space_delta; 3050 rvd->vdev_stat.vs_dspace += dspace_delta; 3051 mutex_exit(&rvd->vdev_stat_lock); 3052 } 3053 3054 if (mc != NULL) { 3055 ASSERT(rvd == vd->vdev_parent); 3056 ASSERT(vd->vdev_ms_count != 0); 3057 3058 metaslab_class_space_update(mc, 3059 alloc_delta, defer_delta, space_delta, dspace_delta); 3060 } 3061} 3062 3063/* 3064 * Mark a top-level vdev's config as dirty, placing it on the dirty list 3065 * so that it will be written out next time the vdev configuration is synced. 3066 * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. 3067 */ 3068void 3069vdev_config_dirty(vdev_t *vd) 3070{ 3071 spa_t *spa = vd->vdev_spa; 3072 vdev_t *rvd = spa->spa_root_vdev; 3073 int c; 3074 3075 ASSERT(spa_writeable(spa)); 3076 3077 /* 3078 * If this is an aux vdev (as with l2cache and spare devices), then we 3079 * update the vdev config manually and set the sync flag. 3080 */ 3081 if (vd->vdev_aux != NULL) { 3082 spa_aux_vdev_t *sav = vd->vdev_aux; 3083 nvlist_t **aux; 3084 uint_t naux; 3085 3086 for (c = 0; c < sav->sav_count; c++) { 3087 if (sav->sav_vdevs[c] == vd) 3088 break; 3089 } 3090 3091 if (c == sav->sav_count) { 3092 /* 3093 * We're being removed. There's nothing more to do. 3094 */ 3095 ASSERT(sav->sav_sync == B_TRUE); 3096 return; 3097 } 3098 3099 sav->sav_sync = B_TRUE; 3100 3101 if (nvlist_lookup_nvlist_array(sav->sav_config, 3102 ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { 3103 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, 3104 ZPOOL_CONFIG_SPARES, &aux, &naux) == 0); 3105 } 3106 3107 ASSERT(c < naux); 3108 3109 /* 3110 * Setting the nvlist in the middle of the array is a little 3111 * sketchy, but it will work. 3112 */ 3113 nvlist_free(aux[c]); 3114 aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0); 3115 3116 return; 3117 } 3118 3119 /* 3120 * The dirty list is protected by the SCL_CONFIG lock. The caller 3121 * must either hold SCL_CONFIG as writer, or must be the sync thread 3122 * (which holds SCL_CONFIG as reader). There's only one sync thread, 3123 * so this is sufficient to ensure mutual exclusion.
3124 */ 3125 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || 3126 (dsl_pool_sync_context(spa_get_dsl(spa)) && 3127 spa_config_held(spa, SCL_CONFIG, RW_READER))); 3128 3129 if (vd == rvd) { 3130 for (c = 0; c < rvd->vdev_children; c++) 3131 vdev_config_dirty(rvd->vdev_child[c]); 3132 } else { 3133 ASSERT(vd == vd->vdev_top); 3134 3135 if (!list_link_active(&vd->vdev_config_dirty_node) && 3136 !vd->vdev_ishole) 3137 list_insert_head(&spa->spa_config_dirty_list, vd); 3138 } 3139} 3140 3141void 3142vdev_config_clean(vdev_t *vd) 3143{ 3144 spa_t *spa = vd->vdev_spa; 3145 3146 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || 3147 (dsl_pool_sync_context(spa_get_dsl(spa)) && 3148 spa_config_held(spa, SCL_CONFIG, RW_READER))); 3149 3150 ASSERT(list_link_active(&vd->vdev_config_dirty_node)); 3151 list_remove(&spa->spa_config_dirty_list, vd); 3152} 3153 3154/* 3155 * Mark a top-level vdev's state as dirty, so that the next pass of 3156 * spa_sync() can convert this into vdev_config_dirty(). We distinguish 3157 * the state changes from larger config changes because they require 3158 * much less locking, and are often needed for administrative actions. 3159 */ 3160void 3161vdev_state_dirty(vdev_t *vd) 3162{ 3163 spa_t *spa = vd->vdev_spa; 3164 3165 ASSERT(spa_writeable(spa)); 3166 ASSERT(vd == vd->vdev_top); 3167 3168 /* 3169 * The state list is protected by the SCL_STATE lock. The caller 3170 * must either hold SCL_STATE as writer, or must be the sync thread 3171 * (which holds SCL_STATE as reader). There's only one sync thread, 3172 * so this is sufficient to ensure mutual exclusion. 3173 */ 3174 ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || 3175 (dsl_pool_sync_context(spa_get_dsl(spa)) && 3176 spa_config_held(spa, SCL_STATE, RW_READER))); 3177 3178 if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole) 3179 list_insert_head(&spa->spa_state_dirty_list, vd); 3180} 3181 3182void 3183vdev_state_clean(vdev_t *vd) 3184{ 3185 spa_t *spa = vd->vdev_spa; 3186 3187 ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || 3188 (dsl_pool_sync_context(spa_get_dsl(spa)) && 3189 spa_config_held(spa, SCL_STATE, RW_READER))); 3190 3191 ASSERT(list_link_active(&vd->vdev_state_dirty_node)); 3192 list_remove(&spa->spa_state_dirty_list, vd); 3193} 3194 3195/* 3196 * Propagate vdev state up from children to parent. 3197 */ 3198void 3199vdev_propagate_state(vdev_t *vd) 3200{ 3201 spa_t *spa = vd->vdev_spa; 3202 vdev_t *rvd = spa->spa_root_vdev; 3203 int degraded = 0, faulted = 0; 3204 int corrupted = 0; 3205 vdev_t *child; 3206 3207 if (vd->vdev_children > 0) { 3208 for (int c = 0; c < vd->vdev_children; c++) { 3209 child = vd->vdev_child[c]; 3210 3211 /* 3212 * Don't factor holes into the decision. 3213 */ 3214 if (child->vdev_ishole) 3215 continue; 3216 3217 if (!vdev_readable(child) || 3218 (!vdev_writeable(child) && spa_writeable(spa))) { 3219 /* 3220 * Root special: if there is a top-level log 3221 * device, treat the root vdev as if it were 3222 * degraded. 
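 *
 * For example (hypothetical pool): if a dedicated log top-level
 * becomes unreadable, the pool as a whole can still run, so the
 * root is reported as DEGRADED rather than being faulted for
 * insufficient replicas.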
3223 */ 3224 if (child->vdev_islog && vd == rvd) 3225 degraded++; 3226 else 3227 faulted++; 3228 } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { 3229 degraded++; 3230 } 3231 3232 if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) 3233 corrupted++; 3234 } 3235 3236 vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); 3237 3238 /* 3239 * Root special: if there is a top-level vdev that cannot be 3240 * opened due to corrupted metadata, then propagate the root 3241 * vdev's aux state as 'corrupt' rather than 'insufficient 3242 * replicas'. 3243 */ 3244 if (corrupted && vd == rvd && 3245 rvd->vdev_state == VDEV_STATE_CANT_OPEN) 3246 vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, 3247 VDEV_AUX_CORRUPT_DATA); 3248 } 3249 3250 if (vd->vdev_parent) 3251 vdev_propagate_state(vd->vdev_parent); 3252} 3253 3254/* 3255 * Set a vdev's state. If this is during an open, we don't update the parent 3256 * state, because we're in the process of opening children depth-first. 3257 * Otherwise, we propagate the change to the parent. 3258 * 3259 * If this routine places a device in a faulted state, an appropriate ereport is 3260 * generated. 3261 */ 3262void 3263vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) 3264{ 3265 uint64_t save_state; 3266 spa_t *spa = vd->vdev_spa; 3267 3268 if (state == vd->vdev_state) { 3269 vd->vdev_stat.vs_aux = aux; 3270 return; 3271 } 3272 3273 save_state = vd->vdev_state; 3274 3275 vd->vdev_state = state; 3276 vd->vdev_stat.vs_aux = aux; 3277 3278 /* 3279 * If we are setting the vdev state to anything but an open state, then 3280 * always close the underlying device unless the device has requested 3281 * a delayed close (i.e. we're about to remove or fault the device). 3282 * Otherwise, we keep accessible but invalid devices open forever. 3283 * We don't call vdev_close() itself, because that implies some extra 3284 * checks (offline, etc) that we don't want here. This is limited to 3285 * leaf devices, because otherwise closing the device will affect other 3286 * children. 3287 */ 3288 if (!vd->vdev_delayed_close && vdev_is_dead(vd) && 3289 vd->vdev_ops->vdev_op_leaf) 3290 vd->vdev_ops->vdev_op_close(vd); 3291 3292 /* 3293 * If we have brought this vdev back into service, we need 3294 * to notify fmd so that it can gracefully repair any outstanding 3295 * cases due to a missing device. We do this in all cases, even those 3296 * that probably don't correlate to a repaired fault. This is sure to 3297 * catch all cases, and we let the zfs-retire agent sort it out. If 3298 * this is a transient state it's OK, as the retire agent will 3299 * double-check the state of the vdev before repairing it. 3300 */ 3301 if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf && 3302 vd->vdev_prevstate != state) 3303 zfs_post_state_change(spa, vd); 3304 3305 if (vd->vdev_removed && 3306 state == VDEV_STATE_CANT_OPEN && 3307 (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { 3308 /* 3309 * If the previous state is set to VDEV_STATE_REMOVED, then this 3310 * device was previously marked removed and someone attempted to 3311 * reopen it. If this failed due to a nonexistent device, then 3312 * keep the device in the REMOVED state. We also let this be if 3313 * it is one of our special test online cases, which is only 3314 * attempting to online the device and shouldn't generate an FMA 3315 * fault. 
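 *
 * For example (hypothetical sequence): a hot-pluggable disk is
 * yanked and the leaf goes to VDEV_STATE_REMOVED; a later reopen or
 * test online of the still-absent device fails with
 * VDEV_AUX_OPEN_FAILED, and rather than raising a new fault the leaf
 * is simply kept in the REMOVED state below.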
3316 */ 3317 vd->vdev_state = VDEV_STATE_REMOVED; 3318 vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 3319 } else if (state == VDEV_STATE_REMOVED) { 3320 vd->vdev_removed = B_TRUE; 3321 } else if (state == VDEV_STATE_CANT_OPEN) { 3322 /* 3323 * If we fail to open a vdev during an import or recovery, we 3324 * mark it as "not available", which signifies that it was 3325 * never there to begin with. Failure to open such a device 3326 * is not considered an error. 3327 */ 3328 if ((spa_load_state(spa) == SPA_LOAD_IMPORT || 3329 spa_load_state(spa) == SPA_LOAD_RECOVER) && 3330 vd->vdev_ops->vdev_op_leaf) 3331 vd->vdev_not_present = 1; 3332 3333 /* 3334 * Post the appropriate ereport. If the 'prevstate' field is 3335 * set to something other than VDEV_STATE_UNKNOWN, it indicates 3336 * that this is part of a vdev_reopen(). In this case, we don't 3337 * want to post the ereport if the device was already in the 3338 * CANT_OPEN state beforehand. 3339 * 3340 * If the 'checkremove' flag is set, then this is an attempt to 3341 * online the device in response to an insertion event. If we 3342 * hit this case, then we have detected an insertion event for a 3343 * faulted or offline device that wasn't in the removed state. 3344 * In this scenario, we don't post an ereport because we are 3345 * about to replace the device, or attempt an online with 3346 * vdev_forcefault, which will generate the fault for us. 3347 */ 3348 if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && 3349 !vd->vdev_not_present && !vd->vdev_checkremove && 3350 vd != spa->spa_root_vdev) { 3351 const char *class; 3352 3353 switch (aux) { 3354 case VDEV_AUX_OPEN_FAILED: 3355 class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; 3356 break; 3357 case VDEV_AUX_CORRUPT_DATA: 3358 class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; 3359 break; 3360 case VDEV_AUX_NO_REPLICAS: 3361 class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; 3362 break; 3363 case VDEV_AUX_BAD_GUID_SUM: 3364 class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; 3365 break; 3366 case VDEV_AUX_TOO_SMALL: 3367 class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; 3368 break; 3369 case VDEV_AUX_BAD_LABEL: 3370 class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; 3371 break; 3372 default: 3373 class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; 3374 } 3375 3376 zfs_ereport_post(class, spa, vd, NULL, save_state, 0); 3377 } 3378 3379 /* Erase any notion of persistent removed state */ 3380 vd->vdev_removed = B_FALSE; 3381 } else { 3382 vd->vdev_removed = B_FALSE; 3383 } 3384 3385 if (!isopen && vd->vdev_parent) 3386 vdev_propagate_state(vd->vdev_parent); 3387} 3388 3389/* 3390 * Check the vdev configuration to ensure that it's capable of supporting 3391 * a root pool. 3392 * 3393 * On Solaris, we do not support RAID-Z or partial configuration. In 3394 * addition, only a single top-level vdev is allowed and none of the 3395 * leaves can be wholedisks. 3396 * 3397 * For FreeBSD, we can boot from any configuration. There is a 3398 * limitation that the boot filesystem must be either uncompressed or 3399 * compressed with lzjb compression, but I'm not sure how to enforce 3400 * that here.
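 *
 * For example (hypothetical configurations): under the illumos rules
 * below, a pool whose only top-level vdev is a two-way mirror is
 * bootable, while a pool containing a raidz top-level, or more than
 * one top-level vdev, is rejected; on FreeBSD none of these checks
 * apply.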
3401 */ 3402boolean_t 3403vdev_is_bootable(vdev_t *vd) 3404{ 3405#ifdef illumos 3406 if (!vd->vdev_ops->vdev_op_leaf) { 3407 char *vdev_type = vd->vdev_ops->vdev_op_type; 3408 3409 if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 && 3410 vd->vdev_children > 1) { 3411 return (B_FALSE); 3412 } else if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 || 3413 strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) { 3414 return (B_FALSE); 3415 } 3416 } 3417 3418 for (int c = 0; c < vd->vdev_children; c++) { 3419 if (!vdev_is_bootable(vd->vdev_child[c])) 3420 return (B_FALSE); 3421 } 3422#endif /* illumos */ 3423 return (B_TRUE); 3424} 3425 3426/* 3427 * Load the state from the original vdev tree (ovd) which 3428 * we've retrieved from the MOS config object. If the original 3429 * vdev was offline or faulted then we transfer that state to the 3430 * device in the current vdev tree (nvd). 3431 */ 3432void 3433vdev_load_log_state(vdev_t *nvd, vdev_t *ovd) 3434{ 3435 spa_t *spa = nvd->vdev_spa; 3436 3437 ASSERT(nvd->vdev_top->vdev_islog); 3438 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 3439 ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid); 3440 3441 for (int c = 0; c < nvd->vdev_children; c++) 3442 vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]); 3443 3444 if (nvd->vdev_ops->vdev_op_leaf) { 3445 /* 3446 * Restore the persistent vdev state 3447 */ 3448 nvd->vdev_offline = ovd->vdev_offline; 3449 nvd->vdev_faulted = ovd->vdev_faulted; 3450 nvd->vdev_degraded = ovd->vdev_degraded; 3451 nvd->vdev_removed = ovd->vdev_removed; 3452 } 3453} 3454 3455/* 3456 * Determine if a log device has valid content. If the vdev was 3457 * removed or faulted in the MOS config then we know that 3458 * the content on the log device has already been written to the pool. 3459 */ 3460boolean_t 3461vdev_log_state_valid(vdev_t *vd) 3462{ 3463 if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && 3464 !vd->vdev_removed) 3465 return (B_TRUE); 3466 3467 for (int c = 0; c < vd->vdev_children; c++) 3468 if (vdev_log_state_valid(vd->vdev_child[c])) 3469 return (B_TRUE); 3470 3471 return (B_FALSE); 3472} 3473 3474/* 3475 * Expand a vdev if possible. 3476 */ 3477void 3478vdev_expand(vdev_t *vd, uint64_t txg) 3479{ 3480 ASSERT(vd->vdev_top == vd); 3481 ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3482 3483 if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) { 3484 VERIFY(vdev_metaslab_init(vd, txg) == 0); 3485 vdev_config_dirty(vd); 3486 } 3487} 3488 3489/* 3490 * Split a vdev. 3491 */ 3492void 3493vdev_split(vdev_t *vd) 3494{ 3495 vdev_t *cvd, *pvd = vd->vdev_parent; 3496 3497 vdev_remove_child(pvd, vd); 3498 vdev_compact_children(pvd); 3499 3500 cvd = pvd->vdev_child[0]; 3501 if (pvd->vdev_children == 1) { 3502 vdev_remove_parent(cvd); 3503 cvd->vdev_splitting = B_TRUE; 3504 } 3505 vdev_propagate_state(cvd); 3506} 3507 3508void 3509vdev_deadman(vdev_t *vd) 3510{ 3511 for (int c = 0; c < vd->vdev_children; c++) { 3512 vdev_t *cvd = vd->vdev_child[c]; 3513 3514 vdev_deadman(cvd); 3515 } 3516 3517 if (vd->vdev_ops->vdev_op_leaf) { 3518 vdev_queue_t *vq = &vd->vdev_queue; 3519 3520 mutex_enter(&vq->vq_lock); 3521 if (avl_numnodes(&vq->vq_active_tree) > 0) { 3522 spa_t *spa = vd->vdev_spa; 3523 zio_t *fio; 3524 uint64_t delta; 3525 3526 /* 3527 * Look at the head of all the pending queues, 3528 * if any I/O has been outstanding for longer than 3529 * the spa_deadman_synctime we panic the system. 
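 *
 * For example (hypothetical timing): with a deadman threshold of
 * 1000 seconds, an I/O that has sat at the head of the active queue
 * for longer than that causes the fm_panic() below, which names the
 * pool, the vdev guid and the device path so the hung disk can be
 * identified post mortem.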
3530 */ 3531 fio = avl_first(&vq->vq_active_tree); 3532 delta = gethrtime() - fio->io_timestamp; 3533 if (delta > spa_deadman_synctime(spa)) { 3534 zfs_dbgmsg("SLOW IO: zio timestamp %lluns, " 3535 "delta %lluns, last io %lluns", 3536 fio->io_timestamp, delta, 3537 vq->vq_io_complete_ts); 3538 fm_panic("I/O to pool '%s' appears to be " 3539 "hung on vdev guid %llu at '%s'.", 3540 spa_name(spa), 3541 (long long unsigned int) vd->vdev_guid, 3542 vd->vdev_path); 3543 } 3544 } 3545 mutex_exit(&vq->vq_lock); 3546 } 3547} 3548