spa.c revision 269006
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013 by Delphix. All rights reserved.
 * Copyright (c) 2013, 2014, Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
 */

/*
 * SPA: Storage Pool Allocator
 *
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/ddt.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/spa_boot.h>
#include <sys/zfs_ioctl.h>
#include <sys/dsl_scan.h>
#include <sys/dmu_send.h>
#include <sys/dsl_destroy.h>
#include <sys/dsl_userhold.h>
#include <sys/zfeature.h>
#include <sys/zvol.h>
#include <sys/trim_map.h>

#ifdef _KERNEL
#include <sys/callb.h>
#include <sys/cpupart.h>
#include <sys/zone.h>
#endif /* _KERNEL */

#include "zfs_prop.h"
#include "zfs_comutil.h"

/* Check hostid on import? */
static int check_hostid = 1;

SYSCTL_DECL(_vfs_zfs);
TUNABLE_INT("vfs.zfs.check_hostid", &check_hostid);
SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RW, &check_hostid, 0,
    "Check hostid on import?");

/*
 * The interval, in seconds, at which failed configuration cache file writes
 * should be retried.
 */
static int zfs_ccw_retry_interval = 300;

typedef enum zti_modes {
    ZTI_MODE_FIXED,     /* value is # of threads (min 1) */
    ZTI_MODE_BATCH,     /* cpu-intensive; value is ignored */
    ZTI_MODE_NULL,      /* don't create a taskq */
    ZTI_NMODES
} zti_modes_t;

#define ZTI_P(n, q)     { ZTI_MODE_FIXED, (n), (q) }
#define ZTI_BATCH       { ZTI_MODE_BATCH, 0, 1 }
#define ZTI_NULL        { ZTI_MODE_NULL, 0, 0 }

#define ZTI_N(n)        ZTI_P(n, 1)
#define ZTI_ONE         ZTI_N(1)

typedef struct zio_taskq_info {
    zti_modes_t zti_mode;
    uint_t zti_value;
    uint_t zti_count;
} zio_taskq_info_t;

static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
    "issue", "issue_high", "intr", "intr_high"
};

/*
 * This table defines the taskq settings for each ZFS I/O type. When
 * initializing a pool, we use this table to create an appropriately sized
 * taskq. Some operations are low volume and therefore have a small, static
 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
 * macros. Other operations process a large amount of data; the ZTI_BATCH
 * macro causes us to create a taskq oriented for throughput. Some operations
 * are so high frequency and short-lived that the taskq itself can become a
 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
 * additional degree of parallelism specified by the number of threads per
 * taskq and the number of taskqs; when dispatching an event in this case, the
 * particular taskq is chosen at random.
 *
 * The different taskq priorities are to handle the different contexts (issue
 * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
 * need to be handled with minimum delay.
 */
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
    /* ISSUE            ISSUE_HIGH      INTR            INTR_HIGH */
    { ZTI_ONE,          ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* NULL */
    { ZTI_N(8),         ZTI_NULL,       ZTI_BATCH,      ZTI_NULL }, /* READ */
    { ZTI_BATCH,        ZTI_N(5),       ZTI_N(8),       ZTI_N(5) }, /* WRITE */
    { ZTI_P(12, 8),     ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* FREE */
    { ZTI_ONE,          ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* CLAIM */
    { ZTI_ONE,          ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* IOCTL */
};
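
/*
 * Example (illustrative, derived from the table above): the WRITE row
 * { ZTI_BATCH, ZTI_N(5), ZTI_N(8), ZTI_N(5) } yields one throughput-oriented
 * batch taskq for issue plus fixed 5-, 8-, and 5-thread taskqs for
 * issue_high, intr, and intr_high.  Likewise, ZTI_P(12, 8) in the FREE row
 * expands to { ZTI_MODE_FIXED, 12, 8 }: eight discrete taskqs of twelve
 * threads each, with spa_taskq_dispatch_ent() picking one of the eight at
 * random on every dispatch.
 */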

static void spa_sync_version(void *arg, dmu_tx_t *tx);
static void spa_sync_props(void *arg, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);
static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
    spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
    char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);

uint_t      zio_taskq_batch_pct = 75;       /* 1 thread per cpu in pset */
#ifdef PSRSET_BIND
id_t        zio_taskq_psrset_bind = PS_NONE;
#endif
#ifdef SYSDC
boolean_t   zio_taskq_sysdc = B_TRUE;       /* use SDC scheduling class */
#endif
uint_t      zio_taskq_basedc = 80;          /* base duty cycle */

boolean_t   spa_create_process = B_TRUE;    /* no process ==> no sysdc */
extern int  zfs_sync_pass_deferred_free;

#ifndef illumos
extern void spa_deadman(void *arg);
#endif

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define TRYIMPORT_NAME  "$import"

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
    const char *propname = zpool_prop_to_name(prop);
    nvlist_t *propval;

    VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
    VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);

    if (strval != NULL)
        VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
    else
        VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);

    VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
    nvlist_free(propval);
}
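
/*
 * Example (illustrative): spa_prop_get_config() below uses this helper for
 * both value kinds.  A string property passes strval and a dummy intval:
 *
 *      spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
 *          0, ZPROP_SRC_LOCAL);
 *
 * while a numeric property passes a NULL strval so the uint64 path in the
 * helper is taken instead.
 */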

/*
 * Get property values from the spa configuration.
 */
static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
    vdev_t *rvd = spa->spa_root_vdev;
    dsl_pool_t *pool = spa->spa_dsl_pool;
    uint64_t size;
    uint64_t alloc;
    uint64_t space;
    uint64_t cap, version;
    zprop_source_t src = ZPROP_SRC_NONE;
    spa_config_dirent_t *dp;

    ASSERT(MUTEX_HELD(&spa->spa_props_lock));

    if (rvd != NULL) {
        alloc = metaslab_class_get_alloc(spa_normal_class(spa));
        size = metaslab_class_get_space(spa_normal_class(spa));
        spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
        spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
        spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
        spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
            size - alloc, src);

        space = 0;
        for (int c = 0; c < rvd->vdev_children; c++) {
            vdev_t *tvd = rvd->vdev_child[c];
            space += tvd->vdev_max_asize - tvd->vdev_asize;
        }
        spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, space,
            src);

        spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
            (spa_mode(spa) == FREAD), src);

        cap = (size == 0) ? 0 : (alloc * 100 / size);
        spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);

        spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
            ddt_get_pool_dedup_ratio(spa), src);

        spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
            rvd->vdev_state, src);

        version = spa_version(spa);
        if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
            src = ZPROP_SRC_DEFAULT;
        else
            src = ZPROP_SRC_LOCAL;
        spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
    }

    if (pool != NULL) {
        /*
         * The $FREE directory was introduced in SPA_VERSION_DEADLISTS;
         * when opening pools created before this version, freedir will
         * be NULL.
         */
        if (pool->dp_free_dir != NULL) {
            spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
                pool->dp_free_dir->dd_phys->dd_used_bytes, src);
        } else {
            spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
                NULL, 0, src);
        }

        if (pool->dp_leak_dir != NULL) {
            spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
                pool->dp_leak_dir->dd_phys->dd_used_bytes, src);
        } else {
            spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
                NULL, 0, src);
        }
    }

    spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);

    if (spa->spa_comment != NULL) {
        spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
            0, ZPROP_SRC_LOCAL);
    }

    if (spa->spa_root != NULL)
        spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
            0, ZPROP_SRC_LOCAL);

    if ((dp = list_head(&spa->spa_config_list)) != NULL) {
        if (dp->scd_path == NULL) {
            spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
                "none", 0, ZPROP_SRC_LOCAL);
        } else if (strcmp(dp->scd_path, spa_config_path) != 0) {
            spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
                dp->scd_path, 0, ZPROP_SRC_LOCAL);
        }
    }
}

/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
    objset_t *mos = spa->spa_meta_objset;
    zap_cursor_t zc;
    zap_attribute_t za;
    int err;

    VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

    mutex_enter(&spa->spa_props_lock);

    /*
     * Get properties from the spa config.
     */
    spa_prop_get_config(spa, nvp);

    /* If no pool property object, no more prop to get. */
    if (mos == NULL || spa->spa_pool_props_object == 0) {
        mutex_exit(&spa->spa_props_lock);
        return (0);
    }

    /*
     * Get properties from the MOS pool property object.
     */
    for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
        (err = zap_cursor_retrieve(&zc, &za)) == 0;
        zap_cursor_advance(&zc)) {
        uint64_t intval = 0;
        char *strval = NULL;
        zprop_source_t src = ZPROP_SRC_DEFAULT;
        zpool_prop_t prop;

        if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
            continue;

        switch (za.za_integer_length) {
        case 8:
            /* integer property */
            if (za.za_first_integer !=
                zpool_prop_default_numeric(prop))
                src = ZPROP_SRC_LOCAL;

            if (prop == ZPOOL_PROP_BOOTFS) {
                dsl_pool_t *dp;
                dsl_dataset_t *ds = NULL;

                dp = spa_get_dsl(spa);
                dsl_pool_config_enter(dp, FTAG);
                if (err = dsl_dataset_hold_obj(dp,
                    za.za_first_integer, FTAG, &ds)) {
                    dsl_pool_config_exit(dp, FTAG);
                    break;
                }

                strval = kmem_alloc(
                    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
                    KM_SLEEP);
                dsl_dataset_name(ds, strval);
                dsl_dataset_rele(ds, FTAG);
                dsl_pool_config_exit(dp, FTAG);
            } else {
                strval = NULL;
                intval = za.za_first_integer;
            }

            spa_prop_add_list(*nvp, prop, strval, intval, src);

            if (strval != NULL)
                kmem_free(strval,
                    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);

            break;

        case 1:
            /* string property */
            strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
            err = zap_lookup(mos, spa->spa_pool_props_object,
                za.za_name, 1, za.za_num_integers, strval);
            if (err) {
                kmem_free(strval, za.za_num_integers);
                break;
            }
            spa_prop_add_list(*nvp, prop, strval, 0, src);
            kmem_free(strval, za.za_num_integers);
            break;

        default:
            break;
        }
    }
    zap_cursor_fini(&zc);
    mutex_exit(&spa->spa_props_lock);
out:
    if (err && err != ENOENT) {
        nvlist_free(*nvp);
        *nvp = NULL;
        return (err);
    }

    return (0);
}

/*
 * Validate the given pool properties nvlist and modify the list
 * for the property values to be set.
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
    nvpair_t *elem;
    int error = 0, reset_bootfs = 0;
    uint64_t objnum = 0;
    boolean_t has_feature = B_FALSE;

    elem = NULL;
    while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
        uint64_t intval;
        char *strval, *slash, *check, *fname;
        const char *propname = nvpair_name(elem);
        zpool_prop_t prop = zpool_name_to_prop(propname);

        switch (prop) {
        case ZPROP_INVAL:
            if (!zpool_prop_feature(propname)) {
                error = SET_ERROR(EINVAL);
                break;
            }

            /*
             * Sanitize the input.
             */
            if (nvpair_type(elem) != DATA_TYPE_UINT64) {
                error = SET_ERROR(EINVAL);
                break;
            }

            if (nvpair_value_uint64(elem, &intval) != 0) {
                error = SET_ERROR(EINVAL);
                break;
            }

            if (intval != 0) {
                error = SET_ERROR(EINVAL);
                break;
            }

            fname = strchr(propname, '@') + 1;
            if (zfeature_lookup_name(fname, NULL) != 0) {
                error = SET_ERROR(EINVAL);
                break;
            }

            has_feature = B_TRUE;
            break;

        case ZPOOL_PROP_VERSION:
            error = nvpair_value_uint64(elem, &intval);
            if (!error &&
                (intval < spa_version(spa) ||
                intval > SPA_VERSION_BEFORE_FEATURES ||
                has_feature))
                error = SET_ERROR(EINVAL);
            break;

        case ZPOOL_PROP_DELEGATION:
        case ZPOOL_PROP_AUTOREPLACE:
        case ZPOOL_PROP_LISTSNAPS:
        case ZPOOL_PROP_AUTOEXPAND:
            error = nvpair_value_uint64(elem, &intval);
            if (!error && intval > 1)
                error = SET_ERROR(EINVAL);
            break;

        case ZPOOL_PROP_BOOTFS:
            /*
             * If the pool version is less than SPA_VERSION_BOOTFS,
             * or the pool is still being created (version == 0),
             * the bootfs property cannot be set.
             */
            if (spa_version(spa) < SPA_VERSION_BOOTFS) {
                error = SET_ERROR(ENOTSUP);
                break;
            }

            /*
             * Make sure the vdev config is bootable
             */
            if (!vdev_is_bootable(spa->spa_root_vdev)) {
                error = SET_ERROR(ENOTSUP);
                break;
            }

            reset_bootfs = 1;

            error = nvpair_value_string(elem, &strval);

            if (!error) {
                objset_t *os;
                uint64_t compress;

                if (strval == NULL || strval[0] == '\0') {
                    objnum = zpool_prop_default_numeric(
                        ZPOOL_PROP_BOOTFS);
                    break;
                }

                if (error = dmu_objset_hold(strval, FTAG, &os))
                    break;

                /* Must be ZPL and not gzip compressed. */

                if (dmu_objset_type(os) != DMU_OST_ZFS) {
                    error = SET_ERROR(ENOTSUP);
                } else if ((error =
                    dsl_prop_get_int_ds(dmu_objset_ds(os),
                    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
                    &compress)) == 0 &&
                    !BOOTFS_COMPRESS_VALID(compress)) {
                    error = SET_ERROR(ENOTSUP);
                } else {
                    objnum = dmu_objset_id(os);
                }
                dmu_objset_rele(os, FTAG);
            }
            break;

        case ZPOOL_PROP_FAILUREMODE:
            error = nvpair_value_uint64(elem, &intval);
            if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
                intval > ZIO_FAILURE_MODE_PANIC))
                error = SET_ERROR(EINVAL);

            /*
             * This is a special case which only occurs when
             * the pool has completely failed. This allows
             * the user to change the in-core failmode property
             * without syncing it out to disk (I/Os might
             * currently be blocked). We do this by returning
             * EIO to the caller (spa_prop_set) to trick it
             * into thinking we encountered a property validation
             * error.
             */
            if (!error && spa_suspended(spa)) {
                spa->spa_failmode = intval;
                error = SET_ERROR(EIO);
            }
            break;

        case ZPOOL_PROP_CACHEFILE:
            if ((error = nvpair_value_string(elem, &strval)) != 0)
                break;

            if (strval[0] == '\0')
                break;

            if (strcmp(strval, "none") == 0)
                break;

            if (strval[0] != '/') {
                error = SET_ERROR(EINVAL);
                break;
            }

            slash = strrchr(strval, '/');
            ASSERT(slash != NULL);

            if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
                strcmp(slash, "/..") == 0)
                error = SET_ERROR(EINVAL);
            break;

        case ZPOOL_PROP_COMMENT:
            if ((error = nvpair_value_string(elem, &strval)) != 0)
                break;
            for (check = strval; *check != '\0'; check++) {
                /*
                 * The kernel doesn't have an easy isprint()
                 * check. For this kernel check, we merely
                 * check ASCII apart from DEL. Fix this if
                 * there is an easy-to-use kernel isprint().
                 */
                if (*check >= 0x7f) {
                    error = SET_ERROR(EINVAL);
                    break;
                }
            }
            if (strlen(strval) > ZPROP_MAX_COMMENT)
                error = E2BIG;
            break;

        case ZPOOL_PROP_DEDUPDITTO:
            if (spa_version(spa) < SPA_VERSION_DEDUP)
                error = SET_ERROR(ENOTSUP);
            else
                error = nvpair_value_uint64(elem, &intval);
            if (error == 0 &&
                intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
                error = SET_ERROR(EINVAL);
            break;
        }

        if (error)
            break;
    }

    if (!error && reset_bootfs) {
        error = nvlist_remove(props,
            zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

        if (!error) {
            error = nvlist_add_uint64(props,
                zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
        }
    }

    return (error);
}

void
spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
{
    char *cachefile;
    spa_config_dirent_t *dp;

    if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
        &cachefile) != 0)
        return;

    dp = kmem_alloc(sizeof (spa_config_dirent_t),
        KM_SLEEP);

    if (cachefile[0] == '\0')
        dp->scd_path = spa_strdup(spa_config_path);
    else if (strcmp(cachefile, "none") == 0)
        dp->scd_path = NULL;
    else
        dp->scd_path = spa_strdup(cachefile);

    list_insert_head(&spa->spa_config_list, dp);
    if (need_sync)
        spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}

int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
    int error;
    nvpair_t *elem = NULL;
    boolean_t need_sync = B_FALSE;

    if ((error = spa_prop_validate(spa, nvp)) != 0)
        return (error);

    while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
        zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));

        if (prop == ZPOOL_PROP_CACHEFILE ||
            prop == ZPOOL_PROP_ALTROOT ||
            prop == ZPOOL_PROP_READONLY)
            continue;

        if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) {
            uint64_t ver;

            if (prop == ZPOOL_PROP_VERSION) {
                VERIFY(nvpair_value_uint64(elem, &ver) == 0);
            } else {
                ASSERT(zpool_prop_feature(nvpair_name(elem)));
                ver = SPA_VERSION_FEATURES;
                need_sync = B_TRUE;
            }

            /* Save time if the version is already set. */
            if (ver == spa_version(spa))
                continue;

            /*
             * In addition to the pool directory object, we might
             * create the pool properties object, the features for
             * read object, the features for write object, or the
             * feature descriptions object.
             */
            error = dsl_sync_task(spa->spa_name, NULL,
                spa_sync_version, &ver,
                6, ZFS_SPACE_CHECK_RESERVED);
            if (error)
                return (error);
            continue;
        }

        need_sync = B_TRUE;
        break;
    }

    if (need_sync) {
        return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
            nvp, 6, ZFS_SPACE_CHECK_RESERVED));
    }

    return (0);
}

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
    if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
        VERIFY(zap_remove(spa->spa_meta_objset,
            spa->spa_pool_props_object,
            zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
        spa->spa_bootfs = 0;
    }
}

/*ARGSUSED*/
static int
spa_change_guid_check(void *arg, dmu_tx_t *tx)
{
    uint64_t *newguid = arg;
    spa_t *spa = dmu_tx_pool(tx)->dp_spa;
    vdev_t *rvd = spa->spa_root_vdev;
    uint64_t vdev_state;

    spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
    vdev_state = rvd->vdev_state;
    spa_config_exit(spa, SCL_STATE, FTAG);

    if (vdev_state != VDEV_STATE_HEALTHY)
        return (SET_ERROR(ENXIO));

    ASSERT3U(spa_guid(spa), !=, *newguid);

    return (0);
}

static void
spa_change_guid_sync(void *arg, dmu_tx_t *tx)
{
    uint64_t *newguid = arg;
    spa_t *spa = dmu_tx_pool(tx)->dp_spa;
    uint64_t oldguid;
    vdev_t *rvd = spa->spa_root_vdev;

    oldguid = spa_guid(spa);

    spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
    rvd->vdev_guid = *newguid;
    rvd->vdev_guid_sum += (*newguid - oldguid);
    vdev_config_dirty(rvd);
    spa_config_exit(spa, SCL_STATE, FTAG);

    spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
        oldguid, *newguid);
}
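
/*
 * Example (illustrative): the two callbacks above are meant to be paired
 * through dsl_sync_task(), as spa_change_guid() does below.  The _check
 * function runs first and may veto the change (here, ENXIO unless the root
 * vdev is healthy); only if every check passes does the _sync function
 * apply the change in syncing context.
 */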

/*
 * Change the GUID for the pool.  This is done so that we can later
 * re-import a pool built from a clone of our own vdevs.  We will modify
 * the root vdev's guid, our own pool guid, and then mark all of our
 * vdevs dirty.  Note that we must make sure that all our vdevs are
 * online when we do this, or else any vdevs that weren't present
 * would be orphaned from our pool.  We are also going to issue a
 * sysevent to update any watchers.
 */
int
spa_change_guid(spa_t *spa)
{
    int error;
    uint64_t guid;

    mutex_enter(&spa->spa_vdev_top_lock);
    mutex_enter(&spa_namespace_lock);
    guid = spa_generate_guid(NULL);

    error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
        spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);

    if (error == 0) {
        spa_config_sync(spa, B_FALSE, B_TRUE);
        spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID);
    }

    mutex_exit(&spa_namespace_lock);
    mutex_exit(&spa->spa_vdev_top_lock);

    return (error);
}

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
    spa_error_entry_t *sa = (spa_error_entry_t *)a;
    spa_error_entry_t *sb = (spa_error_entry_t *)b;
    int ret;

    ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
        sizeof (zbookmark_phys_t));

    if (ret < 0)
        return (-1);
    else if (ret > 0)
        return (1);
    else
        return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
    ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

    bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
    bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

    avl_create(&spa->spa_errlist_scrub,
        spa_error_entry_compare, sizeof (spa_error_entry_t),
        offsetof(spa_error_entry_t, se_avl));
    avl_create(&spa->spa_errlist_last,
        spa_error_entry_compare, sizeof (spa_error_entry_t),
        offsetof(spa_error_entry_t, se_avl));
}

static void
spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
    const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
    enum zti_modes mode = ztip->zti_mode;
    uint_t value = ztip->zti_value;
    uint_t count = ztip->zti_count;
    spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
    char name[32];
    uint_t flags = 0;
    boolean_t batch = B_FALSE;

    if (mode == ZTI_MODE_NULL) {
        tqs->stqs_count = 0;
        tqs->stqs_taskq = NULL;
        return;
    }

    ASSERT3U(count, >, 0);

    tqs->stqs_count = count;
    tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);

    switch (mode) {
    case ZTI_MODE_FIXED:
        ASSERT3U(value, >=, 1);
        value = MAX(value, 1);
        break;

    case ZTI_MODE_BATCH:
        batch = B_TRUE;
        flags |= TASKQ_THREADS_CPU_PCT;
        value = zio_taskq_batch_pct;
        break;

    default:
        panic("unrecognized mode for %s_%s taskq (%u:%u) in "
            "spa_activate()",
            zio_type_name[t], zio_taskq_types[q], mode, value);
        break;
    }

    for (uint_t i = 0; i < count; i++) {
        taskq_t *tq;

        if (count > 1) {
            (void) snprintf(name, sizeof (name), "%s_%s_%u",
                zio_type_name[t], zio_taskq_types[q], i);
        } else {
            (void) snprintf(name, sizeof (name), "%s_%s",
                zio_type_name[t], zio_taskq_types[q]);
        }

#ifdef SYSDC
        if (zio_taskq_sysdc && spa->spa_proc != &p0) {
            if (batch)
                flags |= TASKQ_DC_BATCH;

            tq = taskq_create_sysdc(name, value, 50, INT_MAX,
                spa->spa_proc, zio_taskq_basedc, flags);
        } else {
#endif
            pri_t pri = maxclsyspri;
            /*
             * The write issue taskq can be extremely CPU
             * intensive.  Run it at slightly lower priority
             * than the other taskqs.
             */
            if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
                pri--;

            tq = taskq_create_proc(name, value, pri, 50,
                INT_MAX, spa->spa_proc, flags);
#ifdef SYSDC
        }
#endif

        tqs->stqs_taskq[i] = tq;
    }
}

static void
spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
    spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];

    if (tqs->stqs_taskq == NULL) {
        ASSERT0(tqs->stqs_count);
        return;
    }

    for (uint_t i = 0; i < tqs->stqs_count; i++) {
        ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
        taskq_destroy(tqs->stqs_taskq[i]);
    }

    kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
    tqs->stqs_taskq = NULL;
}

/*
 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
 * Note that a type may have multiple discrete taskqs to avoid lock contention
 * on the taskq itself. In that case we choose which taskq at random by using
 * the low bits of gethrtime().
 */
void
spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
{
    spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
    taskq_t *tq;

    ASSERT3P(tqs->stqs_taskq, !=, NULL);
    ASSERT3U(tqs->stqs_count, !=, 0);

    if (tqs->stqs_count == 1) {
        tq = tqs->stqs_taskq[0];
    } else {
#ifdef _KERNEL
        tq = tqs->stqs_taskq[cpu_ticks() % tqs->stqs_count];
#else
        tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count];
#endif
    }

    taskq_dispatch_ent(tq, func, arg, flags, ent);
}

static void
spa_create_zio_taskqs(spa_t *spa)
{
    for (int t = 0; t < ZIO_TYPES; t++) {
        for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
            spa_taskqs_init(spa, t, q);
        }
    }
}

#ifdef _KERNEL
#ifdef SPA_PROCESS
static void
spa_thread(void *arg)
{
    callb_cpr_t cprinfo;

    spa_t *spa = arg;
    user_t *pu = PTOU(curproc);

    CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
        spa->spa_name);

    ASSERT(curproc != &p0);
    (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
        "zpool-%s", spa->spa_name);
    (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));

#ifdef PSRSET_BIND
    /* bind this thread to the requested psrset */
    if (zio_taskq_psrset_bind != PS_NONE) {
        pool_lock();
        mutex_enter(&cpu_lock);
        mutex_enter(&pidlock);
        mutex_enter(&curproc->p_lock);

        if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
            0, NULL, NULL) == 0) {
            curthread->t_bind_pset = zio_taskq_psrset_bind;
        } else {
            cmn_err(CE_WARN,
                "Couldn't bind process for zfs pool \"%s\" to "
                "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
        }

        mutex_exit(&curproc->p_lock);
        mutex_exit(&pidlock);
        mutex_exit(&cpu_lock);
        pool_unlock();
    }
#endif

#ifdef SYSDC
    if (zio_taskq_sysdc) {
        sysdc_thread_enter(curthread, 100, 0);
    }
#endif

    spa->spa_proc = curproc;
    spa->spa_did = curthread->t_did;

    spa_create_zio_taskqs(spa);

    mutex_enter(&spa->spa_proc_lock);
    ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);

    spa->spa_proc_state = SPA_PROC_ACTIVE;
    cv_broadcast(&spa->spa_proc_cv);

    CALLB_CPR_SAFE_BEGIN(&cprinfo);
    while (spa->spa_proc_state == SPA_PROC_ACTIVE)
        cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
    CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);

    ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
    spa->spa_proc_state = SPA_PROC_GONE;
    spa->spa_proc = &p0;
    cv_broadcast(&spa->spa_proc_cv);
    CALLB_CPR_EXIT(&cprinfo);   /* drops spa_proc_lock */

    mutex_enter(&curproc->p_lock);
    lwp_exit();
}
#endif  /* SPA_PROCESS */
#endif

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa, int mode)
{
    ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

    spa->spa_state = POOL_STATE_ACTIVE;
    spa->spa_mode = mode;

    spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
    spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);

    /* Try to create a covering process */
    mutex_enter(&spa->spa_proc_lock);
    ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
    ASSERT(spa->spa_proc == &p0);
    spa->spa_did = 0;

#ifdef SPA_PROCESS
    /* Only create a process if we're going to be around a while. */
    if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
        if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
            NULL, 0) == 0) {
            spa->spa_proc_state = SPA_PROC_CREATED;
            while (spa->spa_proc_state == SPA_PROC_CREATED) {
                cv_wait(&spa->spa_proc_cv,
                    &spa->spa_proc_lock);
            }
            ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
            ASSERT(spa->spa_proc != &p0);
            ASSERT(spa->spa_did != 0);
        } else {
#ifdef _KERNEL
            cmn_err(CE_WARN,
                "Couldn't create process for zfs pool \"%s\"\n",
                spa->spa_name);
#endif
        }
    }
#endif  /* SPA_PROCESS */
    mutex_exit(&spa->spa_proc_lock);

    /* If we didn't create a process, we need to create our taskqs. */
    ASSERT(spa->spa_proc == &p0);
    if (spa->spa_proc == &p0) {
        spa_create_zio_taskqs(spa);
    }

    /*
     * Start TRIM thread.
     */
    trim_thread_create(spa);

    list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
        offsetof(vdev_t, vdev_config_dirty_node));
    list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
        offsetof(vdev_t, vdev_state_dirty_node));

    txg_list_create(&spa->spa_vdev_txg_list,
        offsetof(struct vdev, vdev_txg_node));

    avl_create(&spa->spa_errlist_scrub,
        spa_error_entry_compare, sizeof (spa_error_entry_t),
        offsetof(spa_error_entry_t, se_avl));
    avl_create(&spa->spa_errlist_last,
        spa_error_entry_compare, sizeof (spa_error_entry_t),
        offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
    ASSERT(spa->spa_sync_on == B_FALSE);
    ASSERT(spa->spa_dsl_pool == NULL);
    ASSERT(spa->spa_root_vdev == NULL);
    ASSERT(spa->spa_async_zio_root == NULL);
    ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

    /*
     * Stop TRIM thread in case spa_unload() wasn't called directly
     * before spa_deactivate().
     */
    trim_thread_destroy(spa);

    txg_list_destroy(&spa->spa_vdev_txg_list);

    list_destroy(&spa->spa_config_dirty_list);
    list_destroy(&spa->spa_state_dirty_list);

    for (int t = 0; t < ZIO_TYPES; t++) {
        for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
            spa_taskqs_fini(spa, t, q);
        }
    }

    metaslab_class_destroy(spa->spa_normal_class);
    spa->spa_normal_class = NULL;

    metaslab_class_destroy(spa->spa_log_class);
    spa->spa_log_class = NULL;

    /*
     * If this was part of an import or the open otherwise failed, we may
     * still have errors left in the queues.  Empty them just in case.
     */
    spa_errlog_drain(spa);

    avl_destroy(&spa->spa_errlist_scrub);
    avl_destroy(&spa->spa_errlist_last);

    spa->spa_state = POOL_STATE_UNINITIALIZED;

    mutex_enter(&spa->spa_proc_lock);
    if (spa->spa_proc_state != SPA_PROC_NONE) {
        ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
        spa->spa_proc_state = SPA_PROC_DEACTIVATE;
        cv_broadcast(&spa->spa_proc_cv);
        while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
            ASSERT(spa->spa_proc != &p0);
            cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
        }
        ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
        spa->spa_proc_state = SPA_PROC_NONE;
    }
    ASSERT(spa->spa_proc == &p0);
    mutex_exit(&spa->spa_proc_lock);

#ifdef SPA_PROCESS
    /*
     * We want to make sure spa_thread() has actually exited the ZFS
     * module, so that the module can't be unloaded out from underneath
     * it.
     */
    if (spa->spa_did != 0) {
        thread_join(spa->spa_did);
        spa->spa_did = 0;
    }
#endif  /* SPA_PROCESS */
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately. This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state. This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
    nvlist_t **child;
    uint_t children;
    int error;

    if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
        return (error);

    if ((*vdp)->vdev_ops->vdev_op_leaf)
        return (0);

    error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
        &child, &children);

    if (error == ENOENT)
        return (0);

    if (error) {
        vdev_free(*vdp);
        *vdp = NULL;
        return (SET_ERROR(EINVAL));
    }

    for (int c = 0; c < children; c++) {
        vdev_t *vd;
        if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
            atype)) != 0) {
            vdev_free(*vdp);
            *vdp = NULL;
            return (error);
        }
    }

    ASSERT(*vdp != NULL);

    return (0);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
    int i;

    ASSERT(MUTEX_HELD(&spa_namespace_lock));

    /*
     * Stop TRIM thread.
     */
    trim_thread_destroy(spa);

    /*
     * Stop async tasks.
     */
    spa_async_suspend(spa);

    /*
     * Stop syncing.
     */
    if (spa->spa_sync_on) {
        txg_sync_stop(spa->spa_dsl_pool);
        spa->spa_sync_on = B_FALSE;
    }

    /*
     * Wait for any outstanding async I/O to complete.
     */
    if (spa->spa_async_zio_root != NULL) {
        (void) zio_wait(spa->spa_async_zio_root);
        spa->spa_async_zio_root = NULL;
    }

    bpobj_close(&spa->spa_deferred_bpobj);

    spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

    /*
     * Close all vdevs.
     */
    if (spa->spa_root_vdev)
        vdev_free(spa->spa_root_vdev);
    ASSERT(spa->spa_root_vdev == NULL);

    /*
     * Close the dsl pool.
     */
    if (spa->spa_dsl_pool) {
        dsl_pool_close(spa->spa_dsl_pool);
        spa->spa_dsl_pool = NULL;
        spa->spa_meta_objset = NULL;
    }

    ddt_unload(spa);

    /*
     * Drop and purge level 2 cache
     */
    spa_l2cache_drop(spa);

    for (i = 0; i < spa->spa_spares.sav_count; i++)
        vdev_free(spa->spa_spares.sav_vdevs[i]);
    if (spa->spa_spares.sav_vdevs) {
        kmem_free(spa->spa_spares.sav_vdevs,
            spa->spa_spares.sav_count * sizeof (void *));
        spa->spa_spares.sav_vdevs = NULL;
    }
    if (spa->spa_spares.sav_config) {
        nvlist_free(spa->spa_spares.sav_config);
        spa->spa_spares.sav_config = NULL;
    }
    spa->spa_spares.sav_count = 0;

    for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
        vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
        vdev_free(spa->spa_l2cache.sav_vdevs[i]);
    }
    if (spa->spa_l2cache.sav_vdevs) {
        kmem_free(spa->spa_l2cache.sav_vdevs,
            spa->spa_l2cache.sav_count * sizeof (void *));
        spa->spa_l2cache.sav_vdevs = NULL;
    }
    if (spa->spa_l2cache.sav_config) {
        nvlist_free(spa->spa_l2cache.sav_config);
        spa->spa_l2cache.sav_config = NULL;
    }
    spa->spa_l2cache.sav_count = 0;

    spa->spa_async_suspended = 0;

    if (spa->spa_comment != NULL) {
        spa_strfree(spa->spa_comment);
        spa->spa_comment = NULL;
    }

    spa_config_exit(spa, SCL_ALL, FTAG);
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
    nvlist_t **spares;
    uint_t nspares;
    int i;
    vdev_t *vd, *tvd;

    ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

    /*
     * First, close and free any existing spare vdevs.
     */
    for (i = 0; i < spa->spa_spares.sav_count; i++) {
        vd = spa->spa_spares.sav_vdevs[i];

        /* Undo the call to spa_activate() below */
        if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
            B_FALSE)) != NULL && tvd->vdev_isspare)
            spa_spare_remove(tvd);
        vdev_close(vd);
        vdev_free(vd);
    }

    if (spa->spa_spares.sav_vdevs)
        kmem_free(spa->spa_spares.sav_vdevs,
            spa->spa_spares.sav_count * sizeof (void *));

    if (spa->spa_spares.sav_config == NULL)
        nspares = 0;
    else
        VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
            ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

    spa->spa_spares.sav_count = (int)nspares;
    spa->spa_spares.sav_vdevs = NULL;

    if (nspares == 0)
        return;

    /*
     * Construct the array of vdevs, opening them to get status in the
     * process.  For each spare, there are potentially two different vdev_t
     * structures associated with it: one in the list of spares (used only
     * for basic validation purposes) and one in the active vdev
     * configuration (if it's spared in).  During this phase we open and
     * validate each vdev on the spare list.  If the vdev also exists in the
     * active configuration, then we also mark this vdev as an active spare.
     */
    spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
        KM_SLEEP);
    for (i = 0; i < spa->spa_spares.sav_count; i++) {
        VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
            VDEV_ALLOC_SPARE) == 0);
        ASSERT(vd != NULL);

        spa->spa_spares.sav_vdevs[i] = vd;

        if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
            B_FALSE)) != NULL) {
            if (!tvd->vdev_isspare)
                spa_spare_add(tvd);

            /*
             * We only mark the spare active if we were successfully
             * able to load the vdev.  Otherwise, importing a pool
             * with a bad active spare would result in strange
             * behavior, because multiple pools would think the
             * spare is actively in use.
             *
             * There is a vulnerability here to an equally bizarre
             * circumstance, where a dead active spare is later
             * brought back to life (onlined or otherwise).  Given
             * the rarity of this scenario, and the extra complexity
             * it adds, we ignore the possibility.
             */
            if (!vdev_is_dead(tvd))
                spa_spare_activate(tvd);
        }

        vd->vdev_top = vd;
        vd->vdev_aux = &spa->spa_spares;

        if (vdev_open(vd) != 0)
            continue;

        if (vdev_validate_aux(vd) == 0)
            spa_spare_add(vd);
    }

    /*
     * Recompute the stashed list of spares, with status information
     * this time.
     */
    VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
        DATA_TYPE_NVLIST_ARRAY) == 0);

    spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
        KM_SLEEP);
    for (i = 0; i < spa->spa_spares.sav_count; i++)
        spares[i] = vdev_config_generate(spa,
            spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
    VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
        ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
    for (i = 0; i < spa->spa_spares.sav_count; i++)
        nvlist_free(spares[i]);
    kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}

/*
 * Load (or re-load) the current list of vdevs describing the active l2cache for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 * Devices which are already active have their details maintained, and are
 * not re-opened.
 */
static void
spa_load_l2cache(spa_t *spa)
{
    nvlist_t **l2cache;
    uint_t nl2cache;
    int i, j, oldnvdevs;
    uint64_t guid;
    vdev_t *vd, **oldvdevs, **newvdevs;
    spa_aux_vdev_t *sav = &spa->spa_l2cache;

    ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

    if (sav->sav_config != NULL) {
        VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
            ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
        newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
    } else {
        nl2cache = 0;
        newvdevs = NULL;
    }

    oldvdevs = sav->sav_vdevs;
    oldnvdevs = sav->sav_count;
    sav->sav_vdevs = NULL;
    sav->sav_count = 0;

    /*
     * Process new nvlist of vdevs.
     */
    for (i = 0; i < nl2cache; i++) {
        VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
            &guid) == 0);

        newvdevs[i] = NULL;
        for (j = 0; j < oldnvdevs; j++) {
            vd = oldvdevs[j];
            if (vd != NULL && guid == vd->vdev_guid) {
                /*
                 * Retain previous vdev for add/remove ops.
                 */
                newvdevs[i] = vd;
                oldvdevs[j] = NULL;
                break;
            }
        }

        if (newvdevs[i] == NULL) {
            /*
             * Create new vdev
             */
            VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
                VDEV_ALLOC_L2CACHE) == 0);
            ASSERT(vd != NULL);
            newvdevs[i] = vd;

            /*
             * Commit this vdev as an l2cache device,
             * even if it fails to open.
             */
            spa_l2cache_add(vd);

            vd->vdev_top = vd;
            vd->vdev_aux = sav;

            spa_l2cache_activate(vd);

            if (vdev_open(vd) != 0)
                continue;

            (void) vdev_validate_aux(vd);

            if (!vdev_is_dead(vd))
                l2arc_add_vdev(spa, vd);
        }
    }

    /*
     * Purge vdevs that were dropped
     */
    for (i = 0; i < oldnvdevs; i++) {
        uint64_t pool;

        vd = oldvdevs[i];
        if (vd != NULL) {
            ASSERT(vd->vdev_isl2cache);

            if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
                pool != 0ULL && l2arc_vdev_present(vd))
                l2arc_remove_vdev(vd);
            vdev_clear_stats(vd);
            vdev_free(vd);
        }
    }

    if (oldvdevs)
        kmem_free(oldvdevs, oldnvdevs * sizeof (void *));

    if (sav->sav_config == NULL)
        goto out;

    sav->sav_vdevs = newvdevs;
    sav->sav_count = (int)nl2cache;

    /*
     * Recompute the stashed list of l2cache devices, with status
     * information this time.
     */
    VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
        DATA_TYPE_NVLIST_ARRAY) == 0);

    l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
    for (i = 0; i < sav->sav_count; i++)
        l2cache[i] = vdev_config_generate(spa,
            sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
    VERIFY(nvlist_add_nvlist_array(sav->sav_config,
        ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
out:
    for (i = 0; i < sav->sav_count; i++)
        nvlist_free(l2cache[i]);
    if (sav->sav_count)
        kmem_free(l2cache, sav->sav_count * sizeof (void *));
}

static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
    dmu_buf_t *db;
    char *packed = NULL;
    size_t nvsize = 0;
    int error;
    *value = NULL;

    error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db);
    if (error != 0)
        return (error);
    nvsize = *(uint64_t *)db->db_data;
    dmu_buf_rele(db, FTAG);

    packed = kmem_alloc(nvsize, KM_SLEEP);
    error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
        DMU_READ_PREFETCH);
    if (error == 0)
        error = nvlist_unpack(packed, nvsize, value, 0);
    kmem_free(packed, nvsize);

    return (error);
}

/*
 * Checks to see if the given vdev could not be opened, in which case we post a
 * sysevent to notify the autoreplace code that the device has been removed.
 */
static void
spa_check_removed(vdev_t *vd)
{
    for (int c = 0; c < vd->vdev_children; c++)
        spa_check_removed(vd->vdev_child[c]);

    if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
        !vd->vdev_ishole) {
        zfs_post_autoreplace(vd->vdev_spa, vd);
        spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
    }
}

/*
 * Validate the current config against the MOS config
 */
static boolean_t
spa_config_valid(spa_t *spa, nvlist_t *config)
{
    vdev_t *mrvd, *rvd = spa->spa_root_vdev;
    nvlist_t *nv;

    VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);

    spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
    VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);

    ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);

    /*
     * If we're doing a normal import, then build up any additional
     * diagnostic information about missing devices in this config.
     * We'll pass this up to the user for further processing.
     */
    if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
        nvlist_t **child, *nv;
        uint64_t idx = 0;

        child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
            KM_SLEEP);
        VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);

        for (int c = 0; c < rvd->vdev_children; c++) {
            vdev_t *tvd = rvd->vdev_child[c];
            vdev_t *mtvd = mrvd->vdev_child[c];

            if (tvd->vdev_ops == &vdev_missing_ops &&
                mtvd->vdev_ops != &vdev_missing_ops &&
                mtvd->vdev_islog)
                child[idx++] = vdev_config_generate(spa, mtvd,
                    B_FALSE, 0);
        }

        if (idx) {
            VERIFY(nvlist_add_nvlist_array(nv,
                ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
            VERIFY(nvlist_add_nvlist(spa->spa_load_info,
                ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);

            for (int i = 0; i < idx; i++)
                nvlist_free(child[i]);
        }
        nvlist_free(nv);
        kmem_free(child, rvd->vdev_children * sizeof (char **));
    }

    /*
     * Compare the root vdev tree with the information we have
     * from the MOS config (mrvd). Check each top-level vdev
     * with the corresponding MOS config top-level (mtvd).
     */
    for (int c = 0; c < rvd->vdev_children; c++) {
        vdev_t *tvd = rvd->vdev_child[c];
        vdev_t *mtvd = mrvd->vdev_child[c];

        /*
         * Resolve any "missing" vdevs in the current configuration.
         * If we find that the MOS config has more accurate information
         * about the top-level vdev then use that vdev instead.
         */
        if (tvd->vdev_ops == &vdev_missing_ops &&
            mtvd->vdev_ops != &vdev_missing_ops) {

            if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG))
                continue;

            /*
             * Device specific actions.
             */
            if (mtvd->vdev_islog) {
                spa_set_log_state(spa, SPA_LOG_CLEAR);
            } else {
                /*
                 * XXX - once we have 'readonly' pool
                 * support we should be able to handle
                 * missing data devices by transitioning
                 * the pool to readonly.
                 */
                continue;
            }

            /*
             * Swap the missing vdev with the data we were
             * able to obtain from the MOS config.
             */
            vdev_remove_child(rvd, tvd);
            vdev_remove_child(mrvd, mtvd);

            vdev_add_child(rvd, mtvd);
            vdev_add_child(mrvd, tvd);

            spa_config_exit(spa, SCL_ALL, FTAG);
            vdev_load(mtvd);
            spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

            vdev_reopen(rvd);
        } else if (mtvd->vdev_islog) {
            /*
             * Load the slog device's state from the MOS config
             * since it's possible that the label does not
             * contain the most up-to-date information.
             */
            vdev_load_log_state(tvd, mtvd);
            vdev_reopen(tvd);
        }
    }
    vdev_free(mrvd);
    spa_config_exit(spa, SCL_ALL, FTAG);

    /*
     * Ensure we were able to validate the config.
     */
    return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
}

/*
 * Check for missing log devices
 */
static boolean_t
spa_check_logs(spa_t *spa)
{
    boolean_t rv = B_FALSE;

    switch (spa->spa_log_state) {
    case SPA_LOG_MISSING:
        /* need to recheck in case slog has been restored */
    case SPA_LOG_UNKNOWN:
        rv = (dmu_objset_find(spa->spa_name, zil_check_log_chain,
            NULL, DS_FIND_CHILDREN) != 0);
        if (rv)
            spa_set_log_state(spa, SPA_LOG_MISSING);
        break;
    }
    return (rv);
}

static boolean_t
spa_passivate_log(spa_t *spa)
{
    vdev_t *rvd = spa->spa_root_vdev;
    boolean_t slog_found = B_FALSE;

    ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));

    if (!spa_has_slogs(spa))
        return (B_FALSE);

    for (int c = 0; c < rvd->vdev_children; c++) {
        vdev_t *tvd = rvd->vdev_child[c];
        metaslab_group_t *mg = tvd->vdev_mg;

        if (tvd->vdev_islog) {
            metaslab_group_passivate(mg);
            slog_found = B_TRUE;
        }
    }

    return (slog_found);
}

static void
spa_activate_log(spa_t *spa)
{
    vdev_t *rvd = spa->spa_root_vdev;

    ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));

    for (int c = 0; c < rvd->vdev_children; c++) {
        vdev_t *tvd = rvd->vdev_child[c];
        metaslab_group_t *mg = tvd->vdev_mg;

        if (tvd->vdev_islog)
            metaslab_group_activate(mg);
    }
}

int
spa_offline_log(spa_t *spa)
{
    int error;

    error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
        NULL, DS_FIND_CHILDREN);
    if (error == 0) {
        /*
         * We successfully offlined the log device, sync out the
         * current txg so that the "stubby" block can be removed
         * by zil_sync().
         */
        txg_wait_synced(spa->spa_dsl_pool, 0);
    }
    return (error);
}

static void
spa_aux_check_removed(spa_aux_vdev_t *sav)
{
    int i;

    for (i = 0; i < sav->sav_count; i++)
        spa_check_removed(sav->sav_vdevs[i]);
}

void
spa_claim_notify(zio_t *zio)
{
    spa_t *spa = zio->io_spa;

    if (zio->io_error)
        return;

    mutex_enter(&spa->spa_props_lock);  /* any mutex will do */
    if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
        spa->spa_claim_max_txg = zio->io_bp->blk_birth;
    mutex_exit(&spa->spa_props_lock);
}

typedef struct spa_load_error {
    uint64_t    sle_meta_count;
    uint64_t    sle_data_count;
} spa_load_error_t;

static void
spa_load_verify_done(zio_t *zio)
{
    blkptr_t *bp = zio->io_bp;
    spa_load_error_t *sle = zio->io_private;
    dmu_object_type_t type = BP_GET_TYPE(bp);
    int error = zio->io_error;

    if (error) {
        if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
            type != DMU_OT_INTENT_LOG)
            atomic_add_64(&sle->sle_meta_count, 1);
        else
            atomic_add_64(&sle->sle_data_count, 1);
    }
    zio_data_buf_free(zio->io_data, zio->io_size);
}

/*ARGSUSED*/
static int
spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
    if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
        zio_t *rio = arg;
        size_t size = BP_GET_PSIZE(bp);
        void *data = zio_data_buf_alloc(size);

        zio_nowait(zio_read(rio, spa, bp, data, size,
            spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
            ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
            ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
    }
    return (0);
}

static int
spa_load_verify(spa_t *spa)
{
    zio_t *rio;
    spa_load_error_t sle = { 0 };
    zpool_rewind_policy_t policy;
    boolean_t verify_ok = B_FALSE;
    int error;

    zpool_get_rewind_policy(spa->spa_config, &policy);

    if (policy.zrp_request & ZPOOL_NEVER_REWIND)
        return (0);

    rio = zio_root(spa, NULL, &sle,
        ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);

    error = traverse_pool(spa, spa->spa_verify_min_txg,
        TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio);

    (void) zio_wait(rio);

    spa->spa_load_meta_errors = sle.sle_meta_count;
    spa->spa_load_data_errors = sle.sle_data_count;

    if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
        sle.sle_data_count <= policy.zrp_maxdata) {
        int64_t loss = 0;

        verify_ok = B_TRUE;
        spa->spa_load_txg = spa->spa_uberblock.ub_txg;
        spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;

        loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
        VERIFY(nvlist_add_uint64(spa->spa_load_info,
            ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
        VERIFY(nvlist_add_int64(spa->spa_load_info,
            ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
        VERIFY(nvlist_add_uint64(spa->spa_load_info,
            ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
    } else {
        spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
    }

    if (error) {
        if (error != ENXIO && error != EIO)
            error = SET_ERROR(EIO);
        return (error);
    }

    return (verify_ok ? 0 : EIO);
}
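
/*
 * Example (illustrative; the field values are hypothetical): a caller
 * requesting a bounded best-effort rewind would pack a policy such as
 *
 *      zpool_rewind_policy_t p = { 0 };
 *      p.zrp_request = ZPOOL_TRY_REWIND;
 *      p.zrp_maxmeta = 0;      (no metadata errors tolerated)
 *      p.zrp_maxdata = 100;    (up to 100 damaged data blocks)
 *
 * into the config nvlist; spa_load_verify() above then compares the
 * traversal's error counts against zrp_maxmeta/zrp_maxdata to decide
 * whether the load is acceptable.
 */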

/*
 * Find a value in the pool props object.
 */
static void
spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
{
    (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
        zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
}

/*
 * Find a value in the pool directory object.
 */
static int
spa_dir_prop(spa_t *spa, const char *name, uint64_t *val)
{
    return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
        name, sizeof (uint64_t), 1, val));
}

static int
spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
{
    vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
    return (err);
}
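
/*
 * Example (illustrative): later in this file, spa_load_impl() reads pool
 * directory entries with spa_dir_prop() and converts failures into a
 * faulted root vdev, e.g.
 *
 *      if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
 *              return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 *
 * DMU_POOL_CONFIG names the packed config nvlist in the MOS directory.
 */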
2057 */ 2058 if (!attempt_reopen || gcount == extracted) { 2059 for (i = 0; i < gcount; i++) 2060 if (vd[i] != NULL) 2061 vdev_split(vd[i]); 2062 vdev_reopen(spa->spa_root_vdev); 2063 } 2064 2065 kmem_free(vd, gcount * sizeof (vdev_t *)); 2066} 2067 2068static int 2069spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, 2070 boolean_t mosconfig) 2071{ 2072 nvlist_t *config = spa->spa_config; 2073 char *ereport = FM_EREPORT_ZFS_POOL; 2074 char *comment; 2075 int error; 2076 uint64_t pool_guid; 2077 nvlist_t *nvl; 2078 2079 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) 2080 return (SET_ERROR(EINVAL)); 2081 2082 ASSERT(spa->spa_comment == NULL); 2083 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 2084 spa->spa_comment = spa_strdup(comment); 2085 2086 /* 2087 * Versioning wasn't explicitly added to the label until later, so if 2088 * it's not present treat it as the initial version. 2089 */ 2090 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 2091 &spa->spa_ubsync.ub_version) != 0) 2092 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 2093 2094 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 2095 &spa->spa_config_txg); 2096 2097 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 2098 spa_guid_exists(pool_guid, 0)) { 2099 error = SET_ERROR(EEXIST); 2100 } else { 2101 spa->spa_config_guid = pool_guid; 2102 2103 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, 2104 &nvl) == 0) { 2105 VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, 2106 KM_SLEEP) == 0); 2107 } 2108 2109 nvlist_free(spa->spa_load_info); 2110 spa->spa_load_info = fnvlist_alloc(); 2111 2112 gethrestime(&spa->spa_loaded_ts); 2113 error = spa_load_impl(spa, pool_guid, config, state, type, 2114 mosconfig, &ereport); 2115 } 2116 2117 spa->spa_minref = refcount_count(&spa->spa_refcount); 2118 if (error) { 2119 if (error != EEXIST) { 2120 spa->spa_loaded_ts.tv_sec = 0; 2121 spa->spa_loaded_ts.tv_nsec = 0; 2122 } 2123 if (error != EBADF) { 2124 zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); 2125 } 2126 } 2127 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 2128 spa->spa_ena = 0; 2129 2130 return (error); 2131} 2132 2133/* 2134 * Load an existing storage pool, using the pool's builtin spa_config as a 2135 * source of configuration information. 2136 */ 2137static int 2138spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, 2139 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, 2140 char **ereport) 2141{ 2142 int error = 0; 2143 nvlist_t *nvroot = NULL; 2144 nvlist_t *label; 2145 vdev_t *rvd; 2146 uberblock_t *ub = &spa->spa_uberblock; 2147 uint64_t children, config_cache_txg = spa->spa_config_txg; 2148 int orig_mode = spa->spa_mode; 2149 int parse; 2150 uint64_t obj; 2151 boolean_t missing_feat_write = B_FALSE; 2152 2153 /* 2154 * If this is an untrusted config, access the pool in read-only mode. 2155 * This prevents things like resilvering recently removed devices. 2156 */ 2157 if (!mosconfig) 2158 spa->spa_mode = FREAD; 2159 2160 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 2161 2162 spa->spa_load_state = state; 2163 2164 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) 2165 return (SET_ERROR(EINVAL)); 2166 2167 parse = (type == SPA_IMPORT_EXISTING ? 
2168 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 2169 2170 /* 2171 * Create "The Godfather" zio to hold all async IOs 2172 */ 2173 spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 2174 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 2175 2176 /* 2177 * Parse the configuration into a vdev tree. We explicitly set the 2178 * value that will be returned by spa_version() since parsing the 2179 * configuration requires knowing the version number. 2180 */ 2181 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2182 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); 2183 spa_config_exit(spa, SCL_ALL, FTAG); 2184 2185 if (error != 0) 2186 return (error); 2187 2188 ASSERT(spa->spa_root_vdev == rvd); 2189 2190 if (type != SPA_IMPORT_ASSEMBLE) { 2191 ASSERT(spa_guid(spa) == pool_guid); 2192 } 2193 2194 /* 2195 * Try to open all vdevs, loading each label in the process. 2196 */ 2197 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2198 error = vdev_open(rvd); 2199 spa_config_exit(spa, SCL_ALL, FTAG); 2200 if (error != 0) 2201 return (error); 2202 2203 /* 2204 * We need to validate the vdev labels against the configuration that 2205 * we have in hand, which is dependent on the setting of mosconfig. If 2206 * mosconfig is true then we're validating the vdev labels based on 2207 * that config. Otherwise, we're validating against the cached config 2208 * (zpool.cache) that was read when we loaded the zfs module, and then 2209 * later we will recursively call spa_load() and validate against 2210 * the vdev config. 2211 * 2212 * If we're assembling a new pool that's been split off from an 2213 * existing pool, the labels haven't yet been updated so we skip 2214 * validation for now. 2215 */ 2216 if (type != SPA_IMPORT_ASSEMBLE) { 2217 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2218 error = vdev_validate(rvd, mosconfig); 2219 spa_config_exit(spa, SCL_ALL, FTAG); 2220 2221 if (error != 0) 2222 return (error); 2223 2224 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2225 return (SET_ERROR(ENXIO)); 2226 } 2227 2228 /* 2229 * Find the best uberblock. 2230 */ 2231 vdev_uberblock_load(rvd, ub, &label); 2232 2233 /* 2234 * If we weren't able to find a single valid uberblock, return failure. 2235 */ 2236 if (ub->ub_txg == 0) { 2237 nvlist_free(label); 2238 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 2239 } 2240 2241 /* 2242 * If the pool has an unsupported version we can't open it. 2243 */ 2244 if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 2245 nvlist_free(label); 2246 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 2247 } 2248 2249 if (ub->ub_version >= SPA_VERSION_FEATURES) { 2250 nvlist_t *features; 2251 2252 /* 2253 * If we weren't able to find what's necessary for reading the 2254 * MOS in the label, return failure. 2255 */ 2256 if (label == NULL || nvlist_lookup_nvlist(label, 2257 ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) { 2258 nvlist_free(label); 2259 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2260 ENXIO)); 2261 } 2262 2263 /* 2264 * Update our in-core representation with the definitive values 2265 * from the label. 2266 */ 2267 nvlist_free(spa->spa_label_features); 2268 VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); 2269 } 2270 2271 nvlist_free(label); 2272 2273 /* 2274 * Look through entries in the label nvlist's features_for_read. If 2275 * there is a feature listed there which we don't understand then we 2276 * cannot open a pool. 
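 *
 * For example (hypothetical feature name), a label written by a newer
 * implementation might list "com.example:future_feature" under
 * features_for_read; zfeature_is_supported() would reject the name, it
 * would be reported back to userland under ZPOOL_CONFIG_UNSUP_FEAT, and
 * the open would fail with ENOTSUP, as the code below implements.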
2277 */ 2278 if (ub->ub_version >= SPA_VERSION_FEATURES) { 2279 nvlist_t *unsup_feat; 2280 2281 VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == 2282 0); 2283 2284 for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, 2285 NULL); nvp != NULL; 2286 nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { 2287 if (!zfeature_is_supported(nvpair_name(nvp))) { 2288 VERIFY(nvlist_add_string(unsup_feat, 2289 nvpair_name(nvp), "") == 0); 2290 } 2291 } 2292 2293 if (!nvlist_empty(unsup_feat)) { 2294 VERIFY(nvlist_add_nvlist(spa->spa_load_info, 2295 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); 2296 nvlist_free(unsup_feat); 2297 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2298 ENOTSUP)); 2299 } 2300 2301 nvlist_free(unsup_feat); 2302 } 2303 2304 /* 2305 * If the vdev guid sum doesn't match the uberblock, we have an 2306 * incomplete configuration. We first check to see if the pool 2307 * is aware of the complete config (i.e. ZPOOL_CONFIG_VDEV_CHILDREN). 2308 * If it is, defer the vdev_guid_sum check until later so we 2309 * can handle missing vdevs. 2310 */ 2311 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, 2312 &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE && 2313 rvd->vdev_guid_sum != ub->ub_guid_sum) 2314 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 2315 2316 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 2317 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2318 spa_try_repair(spa, config); 2319 spa_config_exit(spa, SCL_ALL, FTAG); 2320 nvlist_free(spa->spa_config_splitting); 2321 spa->spa_config_splitting = NULL; 2322 } 2323 2324 /* 2325 * Initialize internal SPA structures. 2326 */ 2327 spa->spa_state = POOL_STATE_ACTIVE; 2328 spa->spa_ubsync = spa->spa_uberblock; 2329 spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 2330 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 2331 spa->spa_first_txg = spa->spa_last_ubsync_txg ? 
2332 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 2333 spa->spa_claim_max_txg = spa->spa_first_txg; 2334 spa->spa_prev_software_version = ub->ub_software_version; 2335 2336 error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 2337 if (error) 2338 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2339 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 2340 2341 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) 2342 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2343 2344 if (spa_version(spa) >= SPA_VERSION_FEATURES) { 2345 boolean_t missing_feat_read = B_FALSE; 2346 nvlist_t *unsup_feat, *enabled_feat; 2347 2348 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 2349 &spa->spa_feat_for_read_obj) != 0) { 2350 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2351 } 2352 2353 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 2354 &spa->spa_feat_for_write_obj) != 0) { 2355 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2356 } 2357 2358 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 2359 &spa->spa_feat_desc_obj) != 0) { 2360 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2361 } 2362 2363 enabled_feat = fnvlist_alloc(); 2364 unsup_feat = fnvlist_alloc(); 2365 2366 if (!spa_features_check(spa, B_FALSE, 2367 unsup_feat, enabled_feat)) 2368 missing_feat_read = B_TRUE; 2369 2370 if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) { 2371 if (!spa_features_check(spa, B_TRUE, 2372 unsup_feat, enabled_feat)) { 2373 missing_feat_write = B_TRUE; 2374 } 2375 } 2376 2377 fnvlist_add_nvlist(spa->spa_load_info, 2378 ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); 2379 2380 if (!nvlist_empty(unsup_feat)) { 2381 fnvlist_add_nvlist(spa->spa_load_info, 2382 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 2383 } 2384 2385 fnvlist_free(enabled_feat); 2386 fnvlist_free(unsup_feat); 2387 2388 if (!missing_feat_read) { 2389 fnvlist_add_boolean(spa->spa_load_info, 2390 ZPOOL_CONFIG_CAN_RDONLY); 2391 } 2392 2393 /* 2394 * If the state is SPA_LOAD_TRYIMPORT, our objective is 2395 * twofold: to determine whether the pool is available for 2396 * import in read-write mode and (if it is not) whether the 2397 * pool is available for import in read-only mode. If the pool 2398 * is available for import in read-write mode, it is displayed 2399 * as available in userland; if it is not available for import 2400 * in read-only mode, it is displayed as unavailable in 2401 * userland. If the pool is available for import in read-only 2402 * mode but not read-write mode, it is displayed as unavailable 2403 * in userland with a special note that the pool is actually 2404 * available for open in read-only mode. 2405 * 2406 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 2407 * missing a feature for write, we must first determine whether 2408 * the pool can be opened read-only before returning to 2409 * userland in order to know whether to display the 2410 * abovementioned note. 2411 */ 2412 if (missing_feat_read || (missing_feat_write && 2413 spa_writeable(spa))) { 2414 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2415 ENOTSUP)); 2416 } 2417 2418 /* 2419 * Load refcounts for ZFS features from disk into an in-memory 2420 * cache during SPA initialization. 
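 *
 * Once populated, feature refcount queries are answered from
 * spa_feat_refcount_cache[] without further ZAP lookups; a cached value
 * of SPA_FEATURE_DISABLED records that the feature had no refcount on
 * disk (the ENOTSUP case in the loop below).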
2421 */ 2422 for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { 2423 uint64_t refcount; 2424 2425 error = feature_get_refcount_from_disk(spa, 2426 &spa_feature_table[i], &refcount); 2427 if (error == 0) { 2428 spa->spa_feat_refcount_cache[i] = refcount; 2429 } else if (error == ENOTSUP) { 2430 spa->spa_feat_refcount_cache[i] = 2431 SPA_FEATURE_DISABLED; 2432 } else { 2433 return (spa_vdev_err(rvd, 2434 VDEV_AUX_CORRUPT_DATA, EIO)); 2435 } 2436 } 2437 } 2438 2439 if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { 2440 if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, 2441 &spa->spa_feat_enabled_txg_obj) != 0) 2442 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2443 } 2444 2445 spa->spa_is_initializing = B_TRUE; 2446 error = dsl_pool_open(spa->spa_dsl_pool); 2447 spa->spa_is_initializing = B_FALSE; 2448 if (error != 0) 2449 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2450 2451 if (!mosconfig) { 2452 uint64_t hostid; 2453 nvlist_t *policy = NULL, *nvconfig; 2454 2455 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2456 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2457 2458 if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, 2459 ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 2460 char *hostname; 2461 unsigned long myhostid = 0; 2462 2463 VERIFY(nvlist_lookup_string(nvconfig, 2464 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 2465 2466#ifdef _KERNEL 2467 myhostid = zone_get_hostid(NULL); 2468#else /* _KERNEL */ 2469 /* 2470 * We're emulating the system's hostid in userland, so 2471 * we can't use zone_get_hostid(). 2472 */ 2473 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 2474#endif /* _KERNEL */ 2475 if (check_hostid && hostid != 0 && myhostid != 0 && 2476 hostid != myhostid) { 2477 nvlist_free(nvconfig); 2478 cmn_err(CE_WARN, "pool '%s' could not be " 2479 "loaded as it was last accessed by " 2480 "another system (host: %s hostid: 0x%lx). " 2481 "See: http://illumos.org/msg/ZFS-8000-EY", 2482 spa_name(spa), hostname, 2483 (unsigned long)hostid); 2484 return (SET_ERROR(EBADF)); 2485 } 2486 } 2487 if (nvlist_lookup_nvlist(spa->spa_config, 2488 ZPOOL_REWIND_POLICY, &policy) == 0) 2489 VERIFY(nvlist_add_nvlist(nvconfig, 2490 ZPOOL_REWIND_POLICY, policy) == 0); 2491 2492 spa_config_set(spa, nvconfig); 2493 spa_unload(spa); 2494 spa_deactivate(spa); 2495 spa_activate(spa, orig_mode); 2496 2497 return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); 2498 } 2499 2500 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0) 2501 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2502 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 2503 if (error != 0) 2504 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2505 2506 /* 2507 * Load the bit that tells us to use the new accounting function 2508 * (raid-z deflation). If we have an older pool, this will not 2509 * be present. 2510 */ 2511 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); 2512 if (error != 0 && error != ENOENT) 2513 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2514 2515 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 2516 &spa->spa_creation_version); 2517 if (error != 0 && error != ENOENT) 2518 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2519 2520 /* 2521 * Load the persistent error log. If we have an older pool, this will 2522 * not be present. 
2523 */ 2524 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); 2525 if (error != 0 && error != ENOENT) 2526 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2527 2528 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 2529 &spa->spa_errlog_scrub); 2530 if (error != 0 && error != ENOENT) 2531 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2532 2533 /* 2534 * Load the history object. If we have an older pool, this 2535 * will not be present. 2536 */ 2537 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); 2538 if (error != 0 && error != ENOENT) 2539 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2540 2541 /* 2542 * If we're assembling the pool from the split-off vdevs of 2543 * an existing pool, we don't want to attach the spares & cache 2544 * devices. 2545 */ 2546 2547 /* 2548 * Load any hot spares for this pool. 2549 */ 2550 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); 2551 if (error != 0 && error != ENOENT) 2552 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2553 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2554 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 2555 if (load_nvlist(spa, spa->spa_spares.sav_object, 2556 &spa->spa_spares.sav_config) != 0) 2557 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2558 2559 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2560 spa_load_spares(spa); 2561 spa_config_exit(spa, SCL_ALL, FTAG); 2562 } else if (error == 0) { 2563 spa->spa_spares.sav_sync = B_TRUE; 2564 } 2565 2566 /* 2567 * Load any level 2 ARC devices for this pool. 2568 */ 2569 error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 2570 &spa->spa_l2cache.sav_object); 2571 if (error != 0 && error != ENOENT) 2572 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2573 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2574 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 2575 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 2576 &spa->spa_l2cache.sav_config) != 0) 2577 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2578 2579 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2580 spa_load_l2cache(spa); 2581 spa_config_exit(spa, SCL_ALL, FTAG); 2582 } else if (error == 0) { 2583 spa->spa_l2cache.sav_sync = B_TRUE; 2584 } 2585 2586 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2587 2588 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); 2589 if (error && error != ENOENT) 2590 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2591 2592 if (error == 0) { 2593 uint64_t autoreplace; 2594 2595 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 2596 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 2597 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 2598 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 2599 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 2600 spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, 2601 &spa->spa_dedup_ditto); 2602 2603 spa->spa_autoreplace = (autoreplace != 0); 2604 } 2605 2606 /* 2607 * If the 'autoreplace' property is set, then post a resource notifying 2608 * the ZFS DE that it should not issue any faults for unopenable 2609 * devices. We also iterate over the vdevs, and post a sysevent for any 2610 * unopenable vdevs so that the normal autoreplace handler can take 2611 * over. 
2612 */ 2613 if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { 2614 spa_check_removed(spa->spa_root_vdev); 2615 /* 2616 * For the import case, this is done in spa_import(), because 2617 * at this point we're using the spare definitions from 2618 * the MOS config, not necessarily from the userland config. 2619 */ 2620 if (state != SPA_LOAD_IMPORT) { 2621 spa_aux_check_removed(&spa->spa_spares); 2622 spa_aux_check_removed(&spa->spa_l2cache); 2623 } 2624 } 2625 2626 /* 2627 * Load the vdev state for all toplevel vdevs. 2628 */ 2629 vdev_load(rvd); 2630 2631 /* 2632 * Propagate the leaf DTLs we just loaded all the way up the tree. 2633 */ 2634 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2635 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 2636 spa_config_exit(spa, SCL_ALL, FTAG); 2637 2638 /* 2639 * Load the DDTs (dedup tables). 2640 */ 2641 error = ddt_load(spa); 2642 if (error != 0) 2643 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2644 2645 spa_update_dspace(spa); 2646 2647 /* 2648 * Validate the config, using the MOS config to fill in any 2649 * information which might be missing. If we fail to validate 2650 * the config then declare the pool unfit for use. If we're 2651 * assembling a pool from a split, the log is not transferred 2652 * over. 2653 */ 2654 if (type != SPA_IMPORT_ASSEMBLE) { 2655 nvlist_t *nvconfig; 2656 2657 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2658 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2659 2660 if (!spa_config_valid(spa, nvconfig)) { 2661 nvlist_free(nvconfig); 2662 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 2663 ENXIO)); 2664 } 2665 nvlist_free(nvconfig); 2666 2667 /* 2668 * Now that we've validated the config, check the state of the 2669 * root vdev. If it can't be opened, it indicates one or 2670 * more toplevel vdevs are faulted. 2671 */ 2672 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2673 return (SET_ERROR(ENXIO)); 2674 2675 if (spa_check_logs(spa)) { 2676 *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 2677 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); 2678 } 2679 } 2680 2681 if (missing_feat_write) { 2682 ASSERT(state == SPA_LOAD_TRYIMPORT); 2683 2684 /* 2685 * At this point, we know that we can open the pool in 2686 * read-only mode but not read-write mode. We now have enough 2687 * information and can return to userland. 2688 */ 2689 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); 2690 } 2691 2692 /* 2693 * We've successfully opened the pool, verify that we're ready 2694 * to start pushing transactions. 2695 */ 2696 if (state != SPA_LOAD_TRYIMPORT) { 2697 if (error = spa_load_verify(spa)) 2698 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2699 error)); 2700 } 2701 2702 if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || 2703 spa->spa_load_max_txg == UINT64_MAX)) { 2704 dmu_tx_t *tx; 2705 int need_update = B_FALSE; 2706 2707 ASSERT(state != SPA_LOAD_TRYIMPORT); 2708 2709 /* 2710 * Claim log blocks that haven't been committed yet. 2711 * This must all happen in a single txg. 2712 * Note: spa_claim_max_txg is updated by spa_claim_notify(), 2713 * invoked from zil_claim_log_block()'s i/o done callback. 2714 * Price of rollback is that we abandon the log. 
2715 */ 2716 spa->spa_claiming = B_TRUE; 2717 2718 tx = dmu_tx_create_assigned(spa_get_dsl(spa), 2719 spa_first_txg(spa)); 2720 (void) dmu_objset_find(spa_name(spa), 2721 zil_claim, tx, DS_FIND_CHILDREN); 2722 dmu_tx_commit(tx); 2723 2724 spa->spa_claiming = B_FALSE; 2725 2726 spa_set_log_state(spa, SPA_LOG_GOOD); 2727 spa->spa_sync_on = B_TRUE; 2728 txg_sync_start(spa->spa_dsl_pool); 2729 2730 /* 2731 * Wait for all claims to sync. We sync up to the highest 2732 * claimed log block birth time so that claimed log blocks 2733 * don't appear to be from the future. spa_claim_max_txg 2734 * will have been set for us by either zil_check_log_chain() 2735 * (invoked from spa_check_logs()) or zil_claim() above. 2736 */ 2737 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 2738 2739 /* 2740 * If the config cache is stale, or we have uninitialized 2741 * metaslabs (see spa_vdev_add()), then update the config. 2742 * 2743 * If this is a verbatim import, trust the current 2744 * in-core spa_config and update the disk labels. 2745 */ 2746 if (config_cache_txg != spa->spa_config_txg || 2747 state == SPA_LOAD_IMPORT || 2748 state == SPA_LOAD_RECOVER || 2749 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) 2750 need_update = B_TRUE; 2751 2752 for (int c = 0; c < rvd->vdev_children; c++) 2753 if (rvd->vdev_child[c]->vdev_ms_array == 0) 2754 need_update = B_TRUE; 2755 2756 /* 2757 * Update the config cache asynchronously in case we're the 2758 * root pool, in which case the config cache isn't writable yet. 2759 */ 2760 if (need_update) 2761 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 2762 2763 /* 2764 * Check all DTLs to see if anything needs resilvering. 2765 */ 2766 if (!dsl_scan_resilvering(spa->spa_dsl_pool) && 2767 vdev_resilver_needed(rvd, NULL, NULL)) 2768 spa_async_request(spa, SPA_ASYNC_RESILVER); 2769 2770 /* 2771 * Log the fact that we booted up (so that we can detect if 2772 * we rebooted in the middle of an operation). 2773 */ 2774 spa_history_log_version(spa, "open"); 2775 2776 /* 2777 * Delete any inconsistent datasets. 2778 */ 2779 (void) dmu_objset_find(spa_name(spa), 2780 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 2781 2782 /* 2783 * Clean up any stale temporary dataset userrefs. 2784 */ 2785 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 2786 } 2787 2788 return (0); 2789} 2790 2791static int 2792spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) 2793{ 2794 int mode = spa->spa_mode; 2795 2796 spa_unload(spa); 2797 spa_deactivate(spa); 2798 2799 spa->spa_load_max_txg--; 2800 2801 spa_activate(spa, mode); 2802 spa_async_suspend(spa); 2803 2804 return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); 2805} 2806 2807/* 2808 * If spa_load() fails, this function will try loading prior txgs. If 2809 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool 2810 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this 2811 * function will not rewind the pool and will return the same error as 2812 * spa_load(). 
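 *
 * The rewind proceeds one txg at a time: each spa_load_retry() call
 * (above) decrements spa_load_max_txg and reloads, and the loop below
 * continues while errors persist, an uberblock can still be found, and
 * the candidate txg remains within the acceptable rewind range.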
2813 */ 2814static int 2815spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, 2816 uint64_t max_request, int rewind_flags) 2817{ 2818 nvlist_t *loadinfo = NULL; 2819 nvlist_t *config = NULL; 2820 int load_error, rewind_error; 2821 uint64_t safe_rewind_txg; 2822 uint64_t min_txg; 2823 2824 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 2825 spa->spa_load_max_txg = spa->spa_load_txg; 2826 spa_set_log_state(spa, SPA_LOG_CLEAR); 2827 } else { 2828 spa->spa_load_max_txg = max_request; 2829 } 2830 2831 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, 2832 mosconfig); 2833 if (load_error == 0) 2834 return (0); 2835 2836 if (spa->spa_root_vdev != NULL) 2837 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2838 2839 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 2840 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 2841 2842 if (rewind_flags & ZPOOL_NEVER_REWIND) { 2843 nvlist_free(config); 2844 return (load_error); 2845 } 2846 2847 if (state == SPA_LOAD_RECOVER) { 2848 /* Price of rolling back is discarding txgs, including log */ 2849 spa_set_log_state(spa, SPA_LOG_CLEAR); 2850 } else { 2851 /* 2852 * If we aren't rolling back, save the load info from our first 2853 * import attempt so that we can restore it after attempting 2854 * to rewind. 2855 */ 2856 loadinfo = spa->spa_load_info; 2857 spa->spa_load_info = fnvlist_alloc(); 2858 } 2859 2860 spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 2861 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 2862 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 2863 TXG_INITIAL : safe_rewind_txg; 2864 2865 /* 2866 * Continue as long as we're finding errors, we're still within 2867 * the acceptable rewind range, and we're still finding uberblocks. 2868 */ 2869 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 2870 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 2871 if (spa->spa_load_max_txg < safe_rewind_txg) 2872 spa->spa_extreme_rewind = B_TRUE; 2873 rewind_error = spa_load_retry(spa, state, mosconfig); 2874 } 2875 2876 spa->spa_extreme_rewind = B_FALSE; 2877 spa->spa_load_max_txg = UINT64_MAX; 2878 2879 if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 2880 spa_config_set(spa, config); 2881 2882 if (state == SPA_LOAD_RECOVER) { 2883 ASSERT3P(loadinfo, ==, NULL); 2884 return (rewind_error); 2885 } else { 2886 /* Store the rewind info as part of the initial load info */ 2887 fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, 2888 spa->spa_load_info); 2889 2890 /* Restore the initial load info */ 2891 fnvlist_free(spa->spa_load_info); 2892 spa->spa_load_info = loadinfo; 2893 2894 return (load_error); 2895 } 2896} 2897 2898/* 2899 * Pool Open/Import 2900 * 2901 * The import case is identical to an open except that the configuration is sent 2902 * down from userland, instead of grabbed from the configuration cache. For the 2903 * case of an open, the pool configuration will exist in the 2904 * POOL_STATE_UNINITIALIZED state. 2905 * 2906 * The stats information (gen/count/ustats) is used to gather vdev statistics at 2907 * the same time we open the pool, without having to keep around the spa_t in 2908 * some ambiguous state. 
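 *
 * A sketch of the typical consumer pattern (hypothetical caller, not
 * part of this file; spa_close() is defined elsewhere in ZFS):
 *
 *	spa_t *spa;
 *	int error = spa_open("tank", &spa, FTAG);
 *	if (error == 0) {
 *		... operate on the open pool ...
 *		spa_close(spa, FTAG);
 *	}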
2909 */ 2910static int 2911spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, 2912 nvlist_t **config) 2913{ 2914 spa_t *spa; 2915 spa_load_state_t state = SPA_LOAD_OPEN; 2916 int error; 2917 int locked = B_FALSE; 2918 int firstopen = B_FALSE; 2919 2920 *spapp = NULL; 2921 2922 /* 2923 * As disgusting as this is, we need to support recursive calls to this 2924 * function because dsl_dir_open() is called during spa_load(), and ends 2925 * up calling spa_open() again. The real fix is to figure out how to 2926 * avoid dsl_dir_open() calling this in the first place. 2927 */ 2928 if (mutex_owner(&spa_namespace_lock) != curthread) { 2929 mutex_enter(&spa_namespace_lock); 2930 locked = B_TRUE; 2931 } 2932 2933 if ((spa = spa_lookup(pool)) == NULL) { 2934 if (locked) 2935 mutex_exit(&spa_namespace_lock); 2936 return (SET_ERROR(ENOENT)); 2937 } 2938 2939 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 2940 zpool_rewind_policy_t policy; 2941 2942 firstopen = B_TRUE; 2943 2944 zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config, 2945 &policy); 2946 if (policy.zrp_request & ZPOOL_DO_REWIND) 2947 state = SPA_LOAD_RECOVER; 2948 2949 spa_activate(spa, spa_mode_global); 2950 2951 if (state != SPA_LOAD_RECOVER) 2952 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 2953 2954 error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, 2955 policy.zrp_request); 2956 2957 if (error == EBADF) { 2958 /* 2959 * If vdev_validate() returns failure (indicated by 2960 * EBADF), it means that one of the vdev labels indicates 2961 * that the pool has been exported or destroyed. If 2962 * this is the case, the config cache is out of sync and 2963 * we should remove the pool from the namespace. 2964 */ 2965 spa_unload(spa); 2966 spa_deactivate(spa); 2967 spa_config_sync(spa, B_TRUE, B_TRUE); 2968 spa_remove(spa); 2969 if (locked) 2970 mutex_exit(&spa_namespace_lock); 2971 return (SET_ERROR(ENOENT)); 2972 } 2973 2974 if (error) { 2975 /* 2976 * We can't open the pool, but we still have useful 2977 * information: the state of each vdev after the 2978 * attempted vdev_open(). Return this to the user. 2979 */ 2980 if (config != NULL && spa->spa_config) { 2981 VERIFY(nvlist_dup(spa->spa_config, config, 2982 KM_SLEEP) == 0); 2983 VERIFY(nvlist_add_nvlist(*config, 2984 ZPOOL_CONFIG_LOAD_INFO, 2985 spa->spa_load_info) == 0); 2986 } 2987 spa_unload(spa); 2988 spa_deactivate(spa); 2989 spa->spa_last_open_failed = error; 2990 if (locked) 2991 mutex_exit(&spa_namespace_lock); 2992 *spapp = NULL; 2993 return (error); 2994 } 2995 } 2996 2997 spa_open_ref(spa, tag); 2998 2999 if (config != NULL) 3000 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 3001 3002 /* 3003 * If we've recovered the pool, pass back any information we 3004 * gathered while doing the load. 
3005 */ 3006 if (state == SPA_LOAD_RECOVER) { 3007 VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 3008 spa->spa_load_info) == 0); 3009 } 3010 3011 if (locked) { 3012 spa->spa_last_open_failed = 0; 3013 spa->spa_last_ubsync_txg = 0; 3014 spa->spa_load_txg = 0; 3015 mutex_exit(&spa_namespace_lock); 3016#ifdef __FreeBSD__ 3017#ifdef _KERNEL 3018 if (firstopen) 3019 zvol_create_minors(spa->spa_name); 3020#endif 3021#endif 3022 } 3023 3024 *spapp = spa; 3025 3026 return (0); 3027} 3028 3029int 3030spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, 3031 nvlist_t **config) 3032{ 3033 return (spa_open_common(name, spapp, tag, policy, config)); 3034} 3035 3036int 3037spa_open(const char *name, spa_t **spapp, void *tag) 3038{ 3039 return (spa_open_common(name, spapp, tag, NULL, NULL)); 3040} 3041 3042/* 3043 * Lookup the given spa_t, incrementing the inject count in the process, 3044 * preventing it from being exported or destroyed. 3045 */ 3046spa_t * 3047spa_inject_addref(char *name) 3048{ 3049 spa_t *spa; 3050 3051 mutex_enter(&spa_namespace_lock); 3052 if ((spa = spa_lookup(name)) == NULL) { 3053 mutex_exit(&spa_namespace_lock); 3054 return (NULL); 3055 } 3056 spa->spa_inject_ref++; 3057 mutex_exit(&spa_namespace_lock); 3058 3059 return (spa); 3060} 3061 3062void 3063spa_inject_delref(spa_t *spa) 3064{ 3065 mutex_enter(&spa_namespace_lock); 3066 spa->spa_inject_ref--; 3067 mutex_exit(&spa_namespace_lock); 3068} 3069 3070/* 3071 * Add spares device information to the nvlist. 3072 */ 3073static void 3074spa_add_spares(spa_t *spa, nvlist_t *config) 3075{ 3076 nvlist_t **spares; 3077 uint_t i, nspares; 3078 nvlist_t *nvroot; 3079 uint64_t guid; 3080 vdev_stat_t *vs; 3081 uint_t vsc; 3082 uint64_t pool; 3083 3084 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3085 3086 if (spa->spa_spares.sav_count == 0) 3087 return; 3088 3089 VERIFY(nvlist_lookup_nvlist(config, 3090 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 3091 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 3092 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 3093 if (nspares != 0) { 3094 VERIFY(nvlist_add_nvlist_array(nvroot, 3095 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3096 VERIFY(nvlist_lookup_nvlist_array(nvroot, 3097 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 3098 3099 /* 3100 * Go through and find any spares which have since been 3101 * repurposed as an active spare. If this is the case, update 3102 * their status appropriately. 3103 */ 3104 for (i = 0; i < nspares; i++) { 3105 VERIFY(nvlist_lookup_uint64(spares[i], 3106 ZPOOL_CONFIG_GUID, &guid) == 0); 3107 if (spa_spare_exists(guid, &pool, NULL) && 3108 pool != 0ULL) { 3109 VERIFY(nvlist_lookup_uint64_array( 3110 spares[i], ZPOOL_CONFIG_VDEV_STATS, 3111 (uint64_t **)&vs, &vsc) == 0); 3112 vs->vs_state = VDEV_STATE_CANT_OPEN; 3113 vs->vs_aux = VDEV_AUX_SPARED; 3114 } 3115 } 3116 } 3117} 3118 3119/* 3120 * Add l2cache device information to the nvlist, including vdev stats. 
3121 */ 3122static void 3123spa_add_l2cache(spa_t *spa, nvlist_t *config) 3124{ 3125 nvlist_t **l2cache; 3126 uint_t i, j, nl2cache; 3127 nvlist_t *nvroot; 3128 uint64_t guid; 3129 vdev_t *vd; 3130 vdev_stat_t *vs; 3131 uint_t vsc; 3132 3133 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3134 3135 if (spa->spa_l2cache.sav_count == 0) 3136 return; 3137 3138 VERIFY(nvlist_lookup_nvlist(config, 3139 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 3140 VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 3141 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3142 if (nl2cache != 0) { 3143 VERIFY(nvlist_add_nvlist_array(nvroot, 3144 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3145 VERIFY(nvlist_lookup_nvlist_array(nvroot, 3146 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3147 3148 /* 3149 * Update level 2 cache device stats. 3150 */ 3151 3152 for (i = 0; i < nl2cache; i++) { 3153 VERIFY(nvlist_lookup_uint64(l2cache[i], 3154 ZPOOL_CONFIG_GUID, &guid) == 0); 3155 3156 vd = NULL; 3157 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 3158 if (guid == 3159 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 3160 vd = spa->spa_l2cache.sav_vdevs[j]; 3161 break; 3162 } 3163 } 3164 ASSERT(vd != NULL); 3165 3166 VERIFY(nvlist_lookup_uint64_array(l2cache[i], 3167 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) 3168 == 0); 3169 vdev_get_stats(vd, vs); 3170 } 3171 } 3172} 3173 3174static void 3175spa_add_feature_stats(spa_t *spa, nvlist_t *config) 3176{ 3177 nvlist_t *features; 3178 zap_cursor_t zc; 3179 zap_attribute_t za; 3180 3181 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3182 VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3183 3184 /* We may be unable to read features if pool is suspended. */ 3185 if (spa_suspended(spa)) 3186 goto out; 3187 3188 if (spa->spa_feat_for_read_obj != 0) { 3189 for (zap_cursor_init(&zc, spa->spa_meta_objset, 3190 spa->spa_feat_for_read_obj); 3191 zap_cursor_retrieve(&zc, &za) == 0; 3192 zap_cursor_advance(&zc)) { 3193 ASSERT(za.za_integer_length == sizeof (uint64_t) && 3194 za.za_num_integers == 1); 3195 VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3196 za.za_first_integer)); 3197 } 3198 zap_cursor_fini(&zc); 3199 } 3200 3201 if (spa->spa_feat_for_write_obj != 0) { 3202 for (zap_cursor_init(&zc, spa->spa_meta_objset, 3203 spa->spa_feat_for_write_obj); 3204 zap_cursor_retrieve(&zc, &za) == 0; 3205 zap_cursor_advance(&zc)) { 3206 ASSERT(za.za_integer_length == sizeof (uint64_t) && 3207 za.za_num_integers == 1); 3208 VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3209 za.za_first_integer)); 3210 } 3211 zap_cursor_fini(&zc); 3212 } 3213 3214out: 3215 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, 3216 features) == 0); 3217 nvlist_free(features); 3218} 3219 3220int 3221spa_get_stats(const char *name, nvlist_t **config, 3222 char *altroot, size_t buflen) 3223{ 3224 int error; 3225 spa_t *spa; 3226 3227 *config = NULL; 3228 error = spa_open_common(name, &spa, FTAG, NULL, config); 3229 3230 if (spa != NULL) { 3231 /* 3232 * This still leaves a window of inconsistency where the spares 3233 * or l2cache devices could change and the config would be 3234 * self-inconsistent. 
3235 */ 3236 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3237 3238 if (*config != NULL) { 3239 uint64_t loadtimes[2]; 3240 3241 loadtimes[0] = spa->spa_loaded_ts.tv_sec; 3242 loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 3243 VERIFY(nvlist_add_uint64_array(*config, 3244 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); 3245 3246 VERIFY(nvlist_add_uint64(*config, 3247 ZPOOL_CONFIG_ERRCOUNT, 3248 spa_get_errlog_size(spa)) == 0); 3249 3250 if (spa_suspended(spa)) 3251 VERIFY(nvlist_add_uint64(*config, 3252 ZPOOL_CONFIG_SUSPENDED, 3253 spa->spa_failmode) == 0); 3254 3255 spa_add_spares(spa, *config); 3256 spa_add_l2cache(spa, *config); 3257 spa_add_feature_stats(spa, *config); 3258 } 3259 } 3260 3261 /* 3262 * We want to get the alternate root even for faulted pools, so we cheat 3263 * and call spa_lookup() directly. 3264 */ 3265 if (altroot) { 3266 if (spa == NULL) { 3267 mutex_enter(&spa_namespace_lock); 3268 spa = spa_lookup(name); 3269 if (spa) 3270 spa_altroot(spa, altroot, buflen); 3271 else 3272 altroot[0] = '\0'; 3273 spa = NULL; 3274 mutex_exit(&spa_namespace_lock); 3275 } else { 3276 spa_altroot(spa, altroot, buflen); 3277 } 3278 } 3279 3280 if (spa != NULL) { 3281 spa_config_exit(spa, SCL_CONFIG, FTAG); 3282 spa_close(spa, FTAG); 3283 } 3284 3285 return (error); 3286} 3287 3288/* 3289 * Validate that the auxiliary device array is well formed. We must have an 3290 * array of nvlists, each of which describes a valid leaf vdev. If this is an 3291 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 3292 * specified, as long as they are well-formed. 3293 */ 3294static int 3295spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 3296 spa_aux_vdev_t *sav, const char *config, uint64_t version, 3297 vdev_labeltype_t label) 3298{ 3299 nvlist_t **dev; 3300 uint_t i, ndev; 3301 vdev_t *vd; 3302 int error; 3303 3304 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3305 3306 /* 3307 * It's acceptable to have no devs specified. 3308 */ 3309 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 3310 return (0); 3311 3312 if (ndev == 0) 3313 return (SET_ERROR(EINVAL)); 3314 3315 /* 3316 * Make sure the pool is formatted with a version that supports this 3317 * device type. 3318 */ 3319 if (spa_version(spa) < version) 3320 return (SET_ERROR(ENOTSUP)); 3321 3322 /* 3323 * Set the pending device list so we correctly handle device in-use 3324 * checking. 3325 */ 3326 sav->sav_pending = dev; 3327 sav->sav_npending = ndev; 3328 3329 for (i = 0; i < ndev; i++) { 3330 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 3331 mode)) != 0) 3332 goto out; 3333 3334 if (!vd->vdev_ops->vdev_op_leaf) { 3335 vdev_free(vd); 3336 error = SET_ERROR(EINVAL); 3337 goto out; 3338 } 3339 3340 /* 3341 * The L2ARC currently only supports disk devices in 3342 * kernel context. For user-level testing, we allow it. 
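 *
 * For example, a request to add a file-backed vdev as l2cache from
 * kernel context fails the VDEV_TYPE_DISK check below with ENOTBLK,
 * while the same request from userland (e.g. ztest) is allowed.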
3343 */ 3344#ifdef _KERNEL 3345 if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && 3346 strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { 3347 error = SET_ERROR(ENOTBLK); 3348 vdev_free(vd); 3349 goto out; 3350 } 3351#endif 3352 vd->vdev_top = vd; 3353 3354 if ((error = vdev_open(vd)) == 0 && 3355 (error = vdev_label_init(vd, crtxg, label)) == 0) { 3356 VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 3357 vd->vdev_guid) == 0); 3358 } 3359 3360 vdev_free(vd); 3361 3362 if (error && 3363 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 3364 goto out; 3365 else 3366 error = 0; 3367 } 3368 3369out: 3370 sav->sav_pending = NULL; 3371 sav->sav_npending = 0; 3372 return (error); 3373} 3374 3375static int 3376spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 3377{ 3378 int error; 3379 3380 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3381 3382 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 3383 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 3384 VDEV_LABEL_SPARE)) != 0) { 3385 return (error); 3386 } 3387 3388 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 3389 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 3390 VDEV_LABEL_L2CACHE)); 3391} 3392 3393static void 3394spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 3395 const char *config) 3396{ 3397 int i; 3398 3399 if (sav->sav_config != NULL) { 3400 nvlist_t **olddevs; 3401 uint_t oldndevs; 3402 nvlist_t **newdevs; 3403 3404 /* 3405 * Generate a new dev list by concatenating with the 3406 * current dev list. 3407 */ 3408 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, 3409 &olddevs, &oldndevs) == 0); 3410 3411 newdevs = kmem_alloc(sizeof (void *) * 3412 (ndevs + oldndevs), KM_SLEEP); 3413 for (i = 0; i < oldndevs; i++) 3414 VERIFY(nvlist_dup(olddevs[i], &newdevs[i], 3415 KM_SLEEP) == 0); 3416 for (i = 0; i < ndevs; i++) 3417 VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], 3418 KM_SLEEP) == 0); 3419 3420 VERIFY(nvlist_remove(sav->sav_config, config, 3421 DATA_TYPE_NVLIST_ARRAY) == 0); 3422 3423 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 3424 config, newdevs, ndevs + oldndevs) == 0); 3425 for (i = 0; i < oldndevs + ndevs; i++) 3426 nvlist_free(newdevs[i]); 3427 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 3428 } else { 3429 /* 3430 * Generate a new dev list. 3431 */ 3432 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 3433 KM_SLEEP) == 0); 3434 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 3435 devs, ndevs) == 0); 3436 } 3437} 3438 3439/* 3440 * Stop and drop level 2 ARC devices. 3441 */ 3442void 3443spa_l2cache_drop(spa_t *spa) 3444{ 3445 vdev_t *vd; 3446 int i; 3447 spa_aux_vdev_t *sav = &spa->spa_l2cache; 3448 3449 for (i = 0; i < sav->sav_count; i++) { 3450 uint64_t pool; 3451 3452 vd = sav->sav_vdevs[i]; 3453 ASSERT(vd != NULL); 3454 3455 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 3456 pool != 0ULL && l2arc_vdev_present(vd)) 3457 l2arc_remove_vdev(vd); 3458 } 3459} 3460 3461/* 3462 * Pool Creation 3463 */ 3464int 3465spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 3466 nvlist_t *zplprops) 3467{ 3468 spa_t *spa; 3469 char *altroot = NULL; 3470 vdev_t *rvd; 3471 dsl_pool_t *dp; 3472 dmu_tx_t *tx; 3473 int error = 0; 3474 uint64_t txg = TXG_INITIAL; 3475 nvlist_t **spares, **l2cache; 3476 uint_t nspares, nl2cache; 3477 uint64_t version, obj; 3478 boolean_t has_features; 3479 3480 /* 3481 * If this pool already exists, return failure. 
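 *
 * (Pool names are unique within the namespace, so a second
 * spa_create("tank", ...) while a pool named "tank" is already
 * imported returns EEXIST; hypothetical name, illustrative only.)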
3482 */ 3483 mutex_enter(&spa_namespace_lock); 3484 if (spa_lookup(pool) != NULL) { 3485 mutex_exit(&spa_namespace_lock); 3486 return (SET_ERROR(EEXIST)); 3487 } 3488 3489 /* 3490 * Allocate a new spa_t structure. 3491 */ 3492 (void) nvlist_lookup_string(props, 3493 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 3494 spa = spa_add(pool, NULL, altroot); 3495 spa_activate(spa, spa_mode_global); 3496 3497 if (props && (error = spa_prop_validate(spa, props))) { 3498 spa_deactivate(spa); 3499 spa_remove(spa); 3500 mutex_exit(&spa_namespace_lock); 3501 return (error); 3502 } 3503 3504 has_features = B_FALSE; 3505 for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); 3506 elem != NULL; elem = nvlist_next_nvpair(props, elem)) { 3507 if (zpool_prop_feature(nvpair_name(elem))) 3508 has_features = B_TRUE; 3509 } 3510 3511 if (has_features || nvlist_lookup_uint64(props, 3512 zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { 3513 version = SPA_VERSION; 3514 } 3515 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 3516 3517 spa->spa_first_txg = txg; 3518 spa->spa_uberblock.ub_txg = txg - 1; 3519 spa->spa_uberblock.ub_version = version; 3520 spa->spa_ubsync = spa->spa_uberblock; 3521 3522 /* 3523 * Create "The Godfather" zio to hold all async IOs 3524 */ 3525 spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 3526 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 3527 3528 /* 3529 * Create the root vdev. 3530 */ 3531 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3532 3533 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 3534 3535 ASSERT(error != 0 || rvd != NULL); 3536 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 3537 3538 if (error == 0 && !zfs_allocatable_devs(nvroot)) 3539 error = SET_ERROR(EINVAL); 3540 3541 if (error == 0 && 3542 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 3543 (error = spa_validate_aux(spa, nvroot, txg, 3544 VDEV_ALLOC_ADD)) == 0) { 3545 for (int c = 0; c < rvd->vdev_children; c++) { 3546 vdev_ashift_optimize(rvd->vdev_child[c]); 3547 vdev_metaslab_set_size(rvd->vdev_child[c]); 3548 vdev_expand(rvd->vdev_child[c], txg); 3549 } 3550 } 3551 3552 spa_config_exit(spa, SCL_ALL, FTAG); 3553 3554 if (error != 0) { 3555 spa_unload(spa); 3556 spa_deactivate(spa); 3557 spa_remove(spa); 3558 mutex_exit(&spa_namespace_lock); 3559 return (error); 3560 } 3561 3562 /* 3563 * Get the list of spares, if specified. 3564 */ 3565 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 3566 &spares, &nspares) == 0) { 3567 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 3568 KM_SLEEP) == 0); 3569 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 3570 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3571 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3572 spa_load_spares(spa); 3573 spa_config_exit(spa, SCL_ALL, FTAG); 3574 spa->spa_spares.sav_sync = B_TRUE; 3575 } 3576 3577 /* 3578 * Get the list of level 2 cache devices, if specified. 
3579 */ 3580 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 3581 &l2cache, &nl2cache) == 0) { 3582 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 3583 NV_UNIQUE_NAME, KM_SLEEP) == 0); 3584 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 3585 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3586 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3587 spa_load_l2cache(spa); 3588 spa_config_exit(spa, SCL_ALL, FTAG); 3589 spa->spa_l2cache.sav_sync = B_TRUE; 3590 } 3591 3592 spa->spa_is_initializing = B_TRUE; 3593 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 3594 spa->spa_meta_objset = dp->dp_meta_objset; 3595 spa->spa_is_initializing = B_FALSE; 3596 3597 /* 3598 * Create DDTs (dedup tables). 3599 */ 3600 ddt_create(spa); 3601 3602 spa_update_dspace(spa); 3603 3604 tx = dmu_tx_create_assigned(dp, txg); 3605 3606 /* 3607 * Create the pool config object. 3608 */ 3609 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 3610 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 3611 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 3612 3613 if (zap_add(spa->spa_meta_objset, 3614 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 3615 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 3616 cmn_err(CE_PANIC, "failed to add pool config"); 3617 } 3618 3619 if (spa_version(spa) >= SPA_VERSION_FEATURES) 3620 spa_feature_create_zap_objects(spa, tx); 3621 3622 if (zap_add(spa->spa_meta_objset, 3623 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 3624 sizeof (uint64_t), 1, &version, tx) != 0) { 3625 cmn_err(CE_PANIC, "failed to add pool version"); 3626 } 3627 3628 /* Newly created pools with the right version are always deflated. */ 3629 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 3630 spa->spa_deflate = TRUE; 3631 if (zap_add(spa->spa_meta_objset, 3632 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 3633 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 3634 cmn_err(CE_PANIC, "failed to add deflate"); 3635 } 3636 } 3637 3638 /* 3639 * Create the deferred-free bpobj. Turn off compression 3640 * because sync-to-convergence takes longer if the blocksize 3641 * keeps changing. 3642 */ 3643 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 3644 dmu_object_set_compress(spa->spa_meta_objset, obj, 3645 ZIO_COMPRESS_OFF, tx); 3646 if (zap_add(spa->spa_meta_objset, 3647 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 3648 sizeof (uint64_t), 1, &obj, tx) != 0) { 3649 cmn_err(CE_PANIC, "failed to add bpobj"); 3650 } 3651 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 3652 spa->spa_meta_objset, obj)); 3653 3654 /* 3655 * Create the pool's history object. 3656 */ 3657 if (version >= SPA_VERSION_ZPOOL_HISTORY) 3658 spa_history_create_obj(spa, tx); 3659 3660 /* 3661 * Set pool properties. 3662 */ 3663 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 3664 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 3665 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 3666 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 3667 3668 if (props != NULL) { 3669 spa_configfile_set(spa, props, B_FALSE); 3670 spa_sync_props(props, tx); 3671 } 3672 3673 dmu_tx_commit(tx); 3674 3675 spa->spa_sync_on = B_TRUE; 3676 txg_sync_start(spa->spa_dsl_pool); 3677 3678 /* 3679 * We explicitly wait for the first transaction to complete so that our 3680 * bean counters are appropriately updated. 
3681 */ 3682 txg_wait_synced(spa->spa_dsl_pool, txg); 3683 3684 spa_config_sync(spa, B_FALSE, B_TRUE); 3685 3686 spa_history_log_version(spa, "create"); 3687 3688 spa->spa_minref = refcount_count(&spa->spa_refcount); 3689 3690 mutex_exit(&spa_namespace_lock); 3691 3692 return (0); 3693} 3694 3695#ifdef _KERNEL 3696#if defined(sun) 3697/* 3698 * Get the root pool information from the root disk, then import the root pool 3699 * during system boot. 3700 */ 3701extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 3702 3703static nvlist_t * 3704spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) 3705{ 3706 nvlist_t *config; 3707 nvlist_t *nvtop, *nvroot; 3708 uint64_t pgid; 3709 3710 if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) 3711 return (NULL); 3712 3713 /* 3714 * Add this top-level vdev to the child array. 3715 */ 3716 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3717 &nvtop) == 0); 3718 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 3719 &pgid) == 0); 3720 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); 3721 3722 /* 3723 * Put this pool's top-level vdevs into a root vdev. 3724 */ 3725 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3726 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 3727 VDEV_TYPE_ROOT) == 0); 3728 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 3729 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 3730 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 3731 &nvtop, 1) == 0); 3732 3733 /* 3734 * Replace the existing vdev_tree with the new root vdev in 3735 * this pool's configuration (remove the old, add the new). 3736 */ 3737 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 3738 nvlist_free(nvroot); 3739 return (config); 3740} 3741 3742/* 3743 * Walk the vdev tree and see if we can find a device with "better" 3744 * configuration. A configuration is "better" if the label on that 3745 * device has a more recent txg. 3746 */ 3747static void 3748spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) 3749{ 3750 for (int c = 0; c < vd->vdev_children; c++) 3751 spa_alt_rootvdev(vd->vdev_child[c], avd, txg); 3752 3753 if (vd->vdev_ops->vdev_op_leaf) { 3754 nvlist_t *label; 3755 uint64_t label_txg; 3756 3757 if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, 3758 &label) != 0) 3759 return; 3760 3761 VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, 3762 &label_txg) == 0); 3763 3764 /* 3765 * Do we have a better boot device? 3766 */ 3767 if (label_txg > *txg) { 3768 *txg = label_txg; 3769 *avd = vd; 3770 } 3771 nvlist_free(label); 3772 } 3773} 3774 3775/* 3776 * Import a root pool. 3777 * 3778 * For x86, devpath_list will consist of the devid and/or physpath name of 3779 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). 3780 * The GRUB "findroot" command will return the vdev we should boot. 3781 * 3782 * For Sparc, devpath_list consists of the physpath name of the booting device, 3783 * no matter whether the root pool is a single-device pool or a mirrored pool, 3784 * e.g. 3785 * "/pci@1f,0/ide@d/disk@0,0:a" 3786 */ 3787int 3788spa_import_rootpool(char *devpath, char *devid) 3789{ 3790 spa_t *spa; 3791 vdev_t *rvd, *bvd, *avd = NULL; 3792 nvlist_t *config, *nvtop; 3793 uint64_t guid, txg; 3794 char *pname; 3795 int error; 3796 3797 /* 3798 * Read the label from the boot device and generate a configuration. 
3799 */ 3800 config = spa_generate_rootconf(devpath, devid, &guid); 3801#if defined(_OBP) && defined(_KERNEL) 3802 if (config == NULL) { 3803 if (strstr(devpath, "/iscsi/ssd") != NULL) { 3804 /* iscsi boot */ 3805 get_iscsi_bootpath_phy(devpath); 3806 config = spa_generate_rootconf(devpath, devid, &guid); 3807 } 3808 } 3809#endif 3810 if (config == NULL) { 3811 cmn_err(CE_NOTE, "Cannot read the pool label from '%s'", 3812 devpath); 3813 return (SET_ERROR(EIO)); 3814 } 3815 3816 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 3817 &pname) == 0); 3818 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 3819 3820 mutex_enter(&spa_namespace_lock); 3821 if ((spa = spa_lookup(pname)) != NULL) { 3822 /* 3823 * Remove the existing root pool from the namespace so that we 3824 * can replace it with the correct config we just read in. 3825 */ 3826 spa_remove(spa); 3827 } 3828 3829 spa = spa_add(pname, config, NULL); 3830 spa->spa_is_root = B_TRUE; 3831 spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 3832 3833 /* 3834 * Build up a vdev tree based on the boot device's label config. 3835 */ 3836 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3837 &nvtop) == 0); 3838 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3839 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 3840 VDEV_ALLOC_ROOTPOOL); 3841 spa_config_exit(spa, SCL_ALL, FTAG); 3842 if (error) { 3843 mutex_exit(&spa_namespace_lock); 3844 nvlist_free(config); 3845 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 3846 pname); 3847 return (error); 3848 } 3849 3850 /* 3851 * Get the boot vdev. 3852 */ 3853 if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 3854 cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", 3855 (u_longlong_t)guid); 3856 error = SET_ERROR(ENOENT); 3857 goto out; 3858 } 3859 3860 /* 3861 * Determine if there is a better boot device. 3862 */ 3863 avd = bvd; 3864 spa_alt_rootvdev(rvd, &avd, &txg); 3865 if (avd != bvd) { 3866 cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " 3867 "try booting from '%s'", avd->vdev_path); 3868 error = SET_ERROR(EINVAL); 3869 goto out; 3870 } 3871 3872 /* 3873 * If the boot device is part of a spare vdev then ensure that 3874 * we're booting off the active spare. 3875 */ 3876 if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && 3877 !bvd->vdev_isspare) { 3878 cmn_err(CE_NOTE, "The boot device is currently spared. 
Please " 3879 "try booting from '%s'", 3880 bvd->vdev_parent-> 3881 vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); 3882 error = SET_ERROR(EINVAL); 3883 goto out; 3884 } 3885 3886 error = 0; 3887out: 3888 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3889 vdev_free(rvd); 3890 spa_config_exit(spa, SCL_ALL, FTAG); 3891 mutex_exit(&spa_namespace_lock); 3892 3893 nvlist_free(config); 3894 return (error); 3895} 3896 3897#else 3898 3899extern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs, 3900 uint64_t *count); 3901 3902static nvlist_t * 3903spa_generate_rootconf(const char *name) 3904{ 3905 nvlist_t **configs, **tops; 3906 nvlist_t *config; 3907 nvlist_t *best_cfg, *nvtop, *nvroot; 3908 uint64_t *holes; 3909 uint64_t best_txg; 3910 uint64_t nchildren; 3911 uint64_t pgid; 3912 uint64_t count; 3913 uint64_t i; 3914 uint_t nholes; 3915 3916 if (vdev_geom_read_pool_label(name, &configs, &count) != 0) 3917 return (NULL); 3918 3919 ASSERT3U(count, !=, 0); 3920 best_txg = 0; 3921 for (i = 0; i < count; i++) { 3922 uint64_t txg; 3923 3924 VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG, 3925 &txg) == 0); 3926 if (txg > best_txg) { 3927 best_txg = txg; 3928 best_cfg = configs[i]; 3929 } 3930 } 3931 3932 /* 3933 * Multi-vdev root pool configuration discovery is not supported yet. 3934 */ 3935 nchildren = 1; 3936 nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren); 3937 holes = NULL; 3938 nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY, 3939 &holes, &nholes); 3940 3941 tops = kmem_zalloc(nchildren * sizeof(void *), KM_SLEEP); 3942 for (i = 0; i < nchildren; i++) { 3943 if (i >= count) 3944 break; 3945 if (configs[i] == NULL) 3946 continue; 3947 VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE, 3948 &nvtop) == 0); 3949 nvlist_dup(nvtop, &tops[i], KM_SLEEP); 3950 } 3951 for (i = 0; holes != NULL && i < nholes; i++) { 3952 if (i >= nchildren) 3953 continue; 3954 if (tops[holes[i]] != NULL) 3955 continue; 3956 nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP); 3957 VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE, 3958 VDEV_TYPE_HOLE) == 0); 3959 VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID, 3960 holes[i]) == 0); 3961 VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID, 3962 0) == 0); 3963 } 3964 for (i = 0; i < nchildren; i++) { 3965 if (tops[i] != NULL) 3966 continue; 3967 nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP); 3968 VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE, 3969 VDEV_TYPE_MISSING) == 0); 3970 VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID, 3971 i) == 0); 3972 VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID, 3973 0) == 0); 3974 } 3975 3976 /* 3977 * Create pool config based on the best vdev config. 3978 */ 3979 nvlist_dup(best_cfg, &config, KM_SLEEP); 3980 3981 /* 3982 * Put this pool's top-level vdevs into a root vdev. 3983 */ 3984 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 3985 &pgid) == 0); 3986 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3987 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 3988 VDEV_TYPE_ROOT) == 0); 3989 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 3990 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 3991 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 3992 tops, nchildren) == 0); 3993 3994 /* 3995 * Replace the existing vdev_tree with the new root vdev in 3996 * this pool's configuration (remove the old, add the new). 
3997	 */
3998	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
3999
4000	/*
4001	 * Drop vdev config elements that should not be present at pool level.
4002	 */
4003	nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64);
4004	nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64);
4005
4006	for (i = 0; i < count; i++)
4007		nvlist_free(configs[i]);
4008	kmem_free(configs, count * sizeof(void *));
4009	for (i = 0; i < nchildren; i++)
4010		nvlist_free(tops[i]);
4011	kmem_free(tops, nchildren * sizeof(void *));
4012	nvlist_free(nvroot);
4013	return (config);
4014}
4015
4016int
4017spa_import_rootpool(const char *name)
4018{
4019	spa_t *spa;
4020	vdev_t *rvd, *bvd, *avd = NULL;
4021	nvlist_t *config, *nvtop;
4022	uint64_t txg;
4023	char *pname;
4024	int error;
4025
4026	/*
4027	 * Read the label from the boot device and generate a configuration.
4028	 */
4029	config = spa_generate_rootconf(name);
4030
4031	mutex_enter(&spa_namespace_lock);
4032	if (config != NULL) {
4033		VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
4034		    &pname) == 0 && strcmp(name, pname) == 0);
4035		VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg)
4036		    == 0);
4037
4038		if ((spa = spa_lookup(pname)) != NULL) {
4039			/*
4040			 * Remove the existing root pool from the namespace so
4041			 * that we can replace it with the correct config
4042			 * we just read in.
4043			 */
4044			spa_remove(spa);
4045		}
4046		spa = spa_add(pname, config, NULL);
4047
4048		/*
4049		 * Set spa_ubsync.ub_version as it can be used in vdev_alloc()
4050		 * via spa_version().
4051		 */
4052		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
4053		    &spa->spa_ubsync.ub_version) != 0)
4054			spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
4055	} else if ((spa = spa_lookup(name)) == NULL) {
		mutex_exit(&spa_namespace_lock);
4056		cmn_err(CE_NOTE, "Cannot find the pool label for '%s'",
4057		    name);
4058		return (EIO);
4059	} else {
4060		VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0);
4061	}
4062	spa->spa_is_root = B_TRUE;
4063	spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
4064
4065	/*
4066	 * Build up a vdev tree based on the boot device's label config.
4067	 */
4068	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
4069	    &nvtop) == 0);
4070	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4071	error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
4072	    VDEV_ALLOC_ROOTPOOL);
4073	spa_config_exit(spa, SCL_ALL, FTAG);
4074	if (error) {
4075		mutex_exit(&spa_namespace_lock);
4076		nvlist_free(config);
4077		cmn_err(CE_NOTE, "Cannot parse the config for pool '%s'",
4078		    name);
4079		return (error);
4080	}
4081
4082	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4083	vdev_free(rvd);
4084	spa_config_exit(spa, SCL_ALL, FTAG);
4085	mutex_exit(&spa_namespace_lock);
4086
4087	nvlist_free(config);
4088	return (0);
4089}
4090
4091#endif	/* sun */
4092#endif
4093
4094/*
4095 * Import a non-root pool into the system.
4096 */
4097int
4098spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
4099{
4100	spa_t *spa;
4101	char *altroot = NULL;
4102	spa_load_state_t state = SPA_LOAD_IMPORT;
4103	zpool_rewind_policy_t policy;
4104	uint64_t mode = spa_mode_global;
4105	uint64_t readonly = B_FALSE;
4106	int error;
4107	nvlist_t *nvroot;
4108	nvlist_t **spares, **l2cache;
4109	uint_t nspares, nl2cache;
4110
4111	/*
4112	 * If a pool with this name exists, return failure.
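	 * spa_namespace_lock is held from the lookup below until the new
	 * spa_t has been added, so a concurrent import of the same name
	 * cannot race past this check.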
4113 */ 4114 mutex_enter(&spa_namespace_lock); 4115 if (spa_lookup(pool) != NULL) { 4116 mutex_exit(&spa_namespace_lock); 4117 return (SET_ERROR(EEXIST)); 4118 } 4119 4120 /* 4121 * Create and initialize the spa structure. 4122 */ 4123 (void) nvlist_lookup_string(props, 4124 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 4125 (void) nvlist_lookup_uint64(props, 4126 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 4127 if (readonly) 4128 mode = FREAD; 4129 spa = spa_add(pool, config, altroot); 4130 spa->spa_import_flags = flags; 4131 4132 /* 4133 * Verbatim import - Take a pool and insert it into the namespace 4134 * as if it had been loaded at boot. 4135 */ 4136 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 4137 if (props != NULL) 4138 spa_configfile_set(spa, props, B_FALSE); 4139 4140 spa_config_sync(spa, B_FALSE, B_TRUE); 4141 4142 mutex_exit(&spa_namespace_lock); 4143 return (0); 4144 } 4145 4146 spa_activate(spa, mode); 4147 4148 /* 4149 * Don't start async tasks until we know everything is healthy. 4150 */ 4151 spa_async_suspend(spa); 4152 4153 zpool_get_rewind_policy(config, &policy); 4154 if (policy.zrp_request & ZPOOL_DO_REWIND) 4155 state = SPA_LOAD_RECOVER; 4156 4157 /* 4158 * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig 4159 * because the user-supplied config is actually the one to trust when 4160 * doing an import. 4161 */ 4162 if (state != SPA_LOAD_RECOVER) 4163 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 4164 4165 error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, 4166 policy.zrp_request); 4167 4168 /* 4169 * Propagate anything learned while loading the pool and pass it 4170 * back to caller (i.e. rewind info, missing devices, etc). 4171 */ 4172 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4173 spa->spa_load_info) == 0); 4174 4175 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4176 /* 4177 * Toss any existing sparelist, as it doesn't have any validity 4178 * anymore, and conflicts with spa_has_spare(). 4179 */ 4180 if (spa->spa_spares.sav_config) { 4181 nvlist_free(spa->spa_spares.sav_config); 4182 spa->spa_spares.sav_config = NULL; 4183 spa_load_spares(spa); 4184 } 4185 if (spa->spa_l2cache.sav_config) { 4186 nvlist_free(spa->spa_l2cache.sav_config); 4187 spa->spa_l2cache.sav_config = NULL; 4188 spa_load_l2cache(spa); 4189 } 4190 4191 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 4192 &nvroot) == 0); 4193 if (error == 0) 4194 error = spa_validate_aux(spa, nvroot, -1ULL, 4195 VDEV_ALLOC_SPARE); 4196 if (error == 0) 4197 error = spa_validate_aux(spa, nvroot, -1ULL, 4198 VDEV_ALLOC_L2CACHE); 4199 spa_config_exit(spa, SCL_ALL, FTAG); 4200 4201 if (props != NULL) 4202 spa_configfile_set(spa, props, B_FALSE); 4203 4204 if (error != 0 || (props && spa_writeable(spa) && 4205 (error = spa_prop_set(spa, props)))) { 4206 spa_unload(spa); 4207 spa_deactivate(spa); 4208 spa_remove(spa); 4209 mutex_exit(&spa_namespace_lock); 4210 return (error); 4211 } 4212 4213 spa_async_resume(spa); 4214 4215 /* 4216 * Override any spares and level 2 cache devices as specified by 4217 * the user, as these may have correct device names/devids, etc. 
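	 * The copies of these lists stored in the on-disk labels may be
	 * stale (e.g. if devices were renamed or moved between systems),
	 * so the caller-supplied lists are treated as authoritative.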
4218 */ 4219 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 4220 &spares, &nspares) == 0) { 4221 if (spa->spa_spares.sav_config) 4222 VERIFY(nvlist_remove(spa->spa_spares.sav_config, 4223 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 4224 else 4225 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 4226 NV_UNIQUE_NAME, KM_SLEEP) == 0); 4227 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 4228 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 4229 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4230 spa_load_spares(spa); 4231 spa_config_exit(spa, SCL_ALL, FTAG); 4232 spa->spa_spares.sav_sync = B_TRUE; 4233 } 4234 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 4235 &l2cache, &nl2cache) == 0) { 4236 if (spa->spa_l2cache.sav_config) 4237 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 4238 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 4239 else 4240 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 4241 NV_UNIQUE_NAME, KM_SLEEP) == 0); 4242 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 4243 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 4244 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4245 spa_load_l2cache(spa); 4246 spa_config_exit(spa, SCL_ALL, FTAG); 4247 spa->spa_l2cache.sav_sync = B_TRUE; 4248 } 4249 4250 /* 4251 * Check for any removed devices. 4252 */ 4253 if (spa->spa_autoreplace) { 4254 spa_aux_check_removed(&spa->spa_spares); 4255 spa_aux_check_removed(&spa->spa_l2cache); 4256 } 4257 4258 if (spa_writeable(spa)) { 4259 /* 4260 * Update the config cache to include the newly-imported pool. 4261 */ 4262 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4263 } 4264 4265 /* 4266 * It's possible that the pool was expanded while it was exported. 4267 * We kick off an async task to handle this for us. 4268 */ 4269 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 4270 4271 mutex_exit(&spa_namespace_lock); 4272 spa_history_log_version(spa, "import"); 4273 4274#ifdef __FreeBSD__ 4275#ifdef _KERNEL 4276 zvol_create_minors(pool); 4277#endif 4278#endif 4279 return (0); 4280} 4281 4282nvlist_t * 4283spa_tryimport(nvlist_t *tryconfig) 4284{ 4285 nvlist_t *config = NULL; 4286 char *poolname; 4287 spa_t *spa; 4288 uint64_t state; 4289 int error; 4290 4291 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 4292 return (NULL); 4293 4294 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 4295 return (NULL); 4296 4297 /* 4298 * Create and initialize the spa structure. 4299 */ 4300 mutex_enter(&spa_namespace_lock); 4301 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 4302 spa_activate(spa, FREAD); 4303 4304 /* 4305 * Pass off the heavy lifting to spa_load(). 4306 * Pass TRUE for mosconfig because the user-supplied config 4307 * is actually the one to trust when doing an import. 4308 */ 4309 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); 4310 4311 /* 4312 * If 'tryconfig' was at least parsable, return the current config. 
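	 * Even if spa_load() failed, a non-NULL root vdev means enough of
	 * the config was understood to report the pool's name, state, and
	 * vdev layout back to the caller.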
4313 */ 4314 if (spa->spa_root_vdev != NULL) { 4315 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 4316 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 4317 poolname) == 0); 4318 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 4319 state) == 0); 4320 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 4321 spa->spa_uberblock.ub_timestamp) == 0); 4322 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4323 spa->spa_load_info) == 0); 4324 4325 /* 4326 * If the bootfs property exists on this pool then we 4327 * copy it out so that external consumers can tell which 4328 * pools are bootable. 4329 */ 4330 if ((!error || error == EEXIST) && spa->spa_bootfs) { 4331 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 4332 4333 /* 4334 * We have to play games with the name since the 4335 * pool was opened as TRYIMPORT_NAME. 4336 */ 4337 if (dsl_dsobj_to_dsname(spa_name(spa), 4338 spa->spa_bootfs, tmpname) == 0) { 4339 char *cp; 4340 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 4341 4342 cp = strchr(tmpname, '/'); 4343 if (cp == NULL) { 4344 (void) strlcpy(dsname, tmpname, 4345 MAXPATHLEN); 4346 } else { 4347 (void) snprintf(dsname, MAXPATHLEN, 4348 "%s/%s", poolname, ++cp); 4349 } 4350 VERIFY(nvlist_add_string(config, 4351 ZPOOL_CONFIG_BOOTFS, dsname) == 0); 4352 kmem_free(dsname, MAXPATHLEN); 4353 } 4354 kmem_free(tmpname, MAXPATHLEN); 4355 } 4356 4357 /* 4358 * Add the list of hot spares and level 2 cache devices. 4359 */ 4360 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4361 spa_add_spares(spa, config); 4362 spa_add_l2cache(spa, config); 4363 spa_config_exit(spa, SCL_CONFIG, FTAG); 4364 } 4365 4366 spa_unload(spa); 4367 spa_deactivate(spa); 4368 spa_remove(spa); 4369 mutex_exit(&spa_namespace_lock); 4370 4371 return (config); 4372} 4373 4374/* 4375 * Pool export/destroy 4376 * 4377 * The act of destroying or exporting a pool is very simple. We make sure there 4378 * is no more pending I/O and any references to the pool are gone. Then, we 4379 * update the pool state and sync all the labels to disk, removing the 4380 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 4381 * we don't sync the labels or remove the configuration cache. 4382 */ 4383static int 4384spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 4385 boolean_t force, boolean_t hardforce) 4386{ 4387 spa_t *spa; 4388 4389 if (oldconfig) 4390 *oldconfig = NULL; 4391 4392 if (!(spa_mode_global & FWRITE)) 4393 return (SET_ERROR(EROFS)); 4394 4395 mutex_enter(&spa_namespace_lock); 4396 if ((spa = spa_lookup(pool)) == NULL) { 4397 mutex_exit(&spa_namespace_lock); 4398 return (SET_ERROR(ENOENT)); 4399 } 4400 4401 /* 4402 * Put a hold on the pool, drop the namespace lock, stop async tasks, 4403 * reacquire the namespace lock, and see if we can export. 4404 */ 4405 spa_open_ref(spa, FTAG); 4406 mutex_exit(&spa_namespace_lock); 4407 spa_async_suspend(spa); 4408 mutex_enter(&spa_namespace_lock); 4409 spa_close(spa, FTAG); 4410 4411 /* 4412 * The pool will be in core if it's openable, 4413 * in which case we can modify its state. 4414 */ 4415 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 4416 /* 4417 * Objsets may be open only because they're dirty, so we 4418 * have to force it to sync before checking spa_refcnt. 4419 */ 4420 txg_wait_synced(spa->spa_dsl_pool, 0); 4421 4422 /* 4423 * A pool cannot be exported or destroyed if there are active 4424 * references. 
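		 * (spa_refcount_zero() below is the check that enforces this.)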
If we are resetting a pool, allow references by
4425		 * fault injection handlers.
4426		 */
4427		if (!spa_refcount_zero(spa) ||
4428		    (spa->spa_inject_ref != 0 &&
4429		    new_state != POOL_STATE_UNINITIALIZED)) {
4430			spa_async_resume(spa);
4431			mutex_exit(&spa_namespace_lock);
4432			return (SET_ERROR(EBUSY));
4433		}
4434
4435		/*
4436		 * A pool cannot be exported if it has an active shared spare.
4437		 * This is to prevent other pools stealing the active spare
4438		 * from an exported pool. At the user's request, such a pool
4439		 * can be forcibly exported.
4440		 */
4441		if (!force && new_state == POOL_STATE_EXPORTED &&
4442		    spa_has_active_shared_spare(spa)) {
4443			spa_async_resume(spa);
4444			mutex_exit(&spa_namespace_lock);
4445			return (SET_ERROR(EXDEV));
4446		}
4447
4448		/*
4449		 * We want this to be reflected on every label,
4450		 * so mark them all dirty. spa_unload() will do the
4451		 * final sync that pushes these changes out.
4452		 */
4453		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
4454			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4455			spa->spa_state = new_state;
4456			spa->spa_final_txg = spa_last_synced_txg(spa) +
4457			    TXG_DEFER_SIZE + 1;
4458			vdev_config_dirty(spa->spa_root_vdev);
4459			spa_config_exit(spa, SCL_ALL, FTAG);
4460		}
4461	}
4462
4463	spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);
4464
4465	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
4466		spa_unload(spa);
4467		spa_deactivate(spa);
4468	}
4469
4470	if (oldconfig && spa->spa_config)
4471		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
4472
4473	if (new_state != POOL_STATE_UNINITIALIZED) {
4474		if (!hardforce)
4475			spa_config_sync(spa, B_TRUE, B_TRUE);
4476		spa_remove(spa);
4477	}
4478	mutex_exit(&spa_namespace_lock);
4479
4480	return (0);
4481}
4482
4483/*
4484 * Destroy a storage pool.
4485 */
4486int
4487spa_destroy(char *pool)
4488{
4489	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
4490	    B_FALSE, B_FALSE));
4491}
4492
4493/*
4494 * Export a storage pool.
4495 */
4496int
4497spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
4498    boolean_t hardforce)
4499{
4500	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
4501	    force, hardforce));
4502}
4503
4504/*
4505 * Similar to spa_export(), this unloads the spa_t without actually removing it
4506 * from the namespace in any way.
4507 */
4508int
4509spa_reset(char *pool)
4510{
4511	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
4512	    B_FALSE, B_FALSE));
4513}
4514
4515/*
4516 * ==========================================================================
4517 * Device manipulation
4518 * ==========================================================================
4519 */
4520
4521/*
4522 * Add a device to a storage pool.
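 * The nvroot describes the new top-level vdevs and may also list spares
 * and level 2 cache devices; spa_vdev_enter()/spa_vdev_exit() below
 * provide the locking and the final config sync.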
4523 */ 4524int 4525spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 4526{ 4527 uint64_t txg, id; 4528 int error; 4529 vdev_t *rvd = spa->spa_root_vdev; 4530 vdev_t *vd, *tvd; 4531 nvlist_t **spares, **l2cache; 4532 uint_t nspares, nl2cache; 4533 4534 ASSERT(spa_writeable(spa)); 4535 4536 txg = spa_vdev_enter(spa); 4537 4538 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 4539 VDEV_ALLOC_ADD)) != 0) 4540 return (spa_vdev_exit(spa, NULL, txg, error)); 4541 4542 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 4543 4544 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 4545 &nspares) != 0) 4546 nspares = 0; 4547 4548 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 4549 &nl2cache) != 0) 4550 nl2cache = 0; 4551 4552 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 4553 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 4554 4555 if (vd->vdev_children != 0 && 4556 (error = vdev_create(vd, txg, B_FALSE)) != 0) 4557 return (spa_vdev_exit(spa, vd, txg, error)); 4558 4559 /* 4560 * We must validate the spares and l2cache devices after checking the 4561 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 4562 */ 4563 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 4564 return (spa_vdev_exit(spa, vd, txg, error)); 4565 4566 /* 4567 * Transfer each new top-level vdev from vd to rvd. 4568 */ 4569 for (int c = 0; c < vd->vdev_children; c++) { 4570 4571 /* 4572 * Set the vdev id to the first hole, if one exists. 4573 */ 4574 for (id = 0; id < rvd->vdev_children; id++) { 4575 if (rvd->vdev_child[id]->vdev_ishole) { 4576 vdev_free(rvd->vdev_child[id]); 4577 break; 4578 } 4579 } 4580 tvd = vd->vdev_child[c]; 4581 vdev_remove_child(vd, tvd); 4582 tvd->vdev_id = id; 4583 vdev_add_child(rvd, tvd); 4584 vdev_config_dirty(tvd); 4585 } 4586 4587 if (nspares != 0) { 4588 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 4589 ZPOOL_CONFIG_SPARES); 4590 spa_load_spares(spa); 4591 spa->spa_spares.sav_sync = B_TRUE; 4592 } 4593 4594 if (nl2cache != 0) { 4595 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 4596 ZPOOL_CONFIG_L2CACHE); 4597 spa_load_l2cache(spa); 4598 spa->spa_l2cache.sav_sync = B_TRUE; 4599 } 4600 4601 /* 4602 * We have to be careful when adding new vdevs to an existing pool. 4603 * If other threads start allocating from these vdevs before we 4604 * sync the config cache, and we lose power, then upon reboot we may 4605 * fail to open the pool because there are DVAs that the config cache 4606 * can't translate. Therefore, we first add the vdevs without 4607 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 4608 * and then let spa_config_update() initialize the new metaslabs. 4609 * 4610 * spa_load() checks for added-but-not-initialized vdevs, so that 4611 * if we lose power at any point in this sequence, the remaining 4612 * steps will be completed the next time we load the pool. 4613 */ 4614 (void) spa_vdev_exit(spa, vd, txg, 0); 4615 4616 mutex_enter(&spa_namespace_lock); 4617 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4618 mutex_exit(&spa_namespace_lock); 4619 4620 return (0); 4621} 4622 4623/* 4624 * Attach a device to a mirror. The arguments are the path to any device 4625 * in the mirror, and the nvroot for the new device. If the path specifies 4626 * a device that is not mirrored, we automatically insert the mirror vdev. 
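 * For example, attaching a second disk to a plain single-disk top-level
 * vdev converts that vdev into a two-way mirror.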
4627 * 4628 * If 'replacing' is specified, the new device is intended to replace the 4629 * existing device; in this case the two devices are made into their own 4630 * mirror using the 'replacing' vdev, which is functionally identical to 4631 * the mirror vdev (it actually reuses all the same ops) but has a few 4632 * extra rules: you can't attach to it after it's been created, and upon 4633 * completion of resilvering, the first disk (the one being replaced) 4634 * is automatically detached. 4635 */ 4636int 4637spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 4638{ 4639 uint64_t txg, dtl_max_txg; 4640 vdev_t *rvd = spa->spa_root_vdev; 4641 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 4642 vdev_ops_t *pvops; 4643 char *oldvdpath, *newvdpath; 4644 int newvd_isspare; 4645 int error; 4646 4647 ASSERT(spa_writeable(spa)); 4648 4649 txg = spa_vdev_enter(spa); 4650 4651 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 4652 4653 if (oldvd == NULL) 4654 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4655 4656 if (!oldvd->vdev_ops->vdev_op_leaf) 4657 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4658 4659 pvd = oldvd->vdev_parent; 4660 4661 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 4662 VDEV_ALLOC_ATTACH)) != 0) 4663 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4664 4665 if (newrootvd->vdev_children != 1) 4666 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 4667 4668 newvd = newrootvd->vdev_child[0]; 4669 4670 if (!newvd->vdev_ops->vdev_op_leaf) 4671 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 4672 4673 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 4674 return (spa_vdev_exit(spa, newrootvd, txg, error)); 4675 4676 /* 4677 * Spares can't replace logs 4678 */ 4679 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 4680 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4681 4682 if (!replacing) { 4683 /* 4684 * For attach, the only allowable parent is a mirror or the root 4685 * vdev. 4686 */ 4687 if (pvd->vdev_ops != &vdev_mirror_ops && 4688 pvd->vdev_ops != &vdev_root_ops) 4689 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4690 4691 pvops = &vdev_mirror_ops; 4692 } else { 4693 /* 4694 * Active hot spares can only be replaced by inactive hot 4695 * spares. 4696 */ 4697 if (pvd->vdev_ops == &vdev_spare_ops && 4698 oldvd->vdev_isspare && 4699 !spa_has_spare(spa, newvd->vdev_guid)) 4700 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4701 4702 /* 4703 * If the source is a hot spare, and the parent isn't already a 4704 * spare, then we want to create a new hot spare. Otherwise, we 4705 * want to create a replacing vdev. The user is not allowed to 4706 * attach to a spared vdev child unless the 'isspare' state is 4707 * the same (spare replaces spare, non-spare replaces 4708 * non-spare). 4709 */ 4710 if (pvd->vdev_ops == &vdev_replacing_ops && 4711 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 4712 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4713 } else if (pvd->vdev_ops == &vdev_spare_ops && 4714 newvd->vdev_isspare != oldvd->vdev_isspare) { 4715 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4716 } 4717 4718 if (newvd->vdev_isspare) 4719 pvops = &vdev_spare_ops; 4720 else 4721 pvops = &vdev_replacing_ops; 4722 } 4723 4724 /* 4725 * Make sure the new device is big enough. 
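	 * vdev_get_min_asize() rounds to metaslab granularity, so a
	 * replacement that is marginally smaller than the original device
	 * may still be accepted.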
4726 */ 4727 if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 4728 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 4729 4730 /* 4731 * The new device cannot have a higher alignment requirement 4732 * than the top-level vdev. 4733 */ 4734 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 4735 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 4736 4737 /* 4738 * If this is an in-place replacement, update oldvd's path and devid 4739 * to make it distinguishable from newvd, and unopenable from now on. 4740 */ 4741 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 4742 spa_strfree(oldvd->vdev_path); 4743 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 4744 KM_SLEEP); 4745 (void) sprintf(oldvd->vdev_path, "%s/%s", 4746 newvd->vdev_path, "old"); 4747 if (oldvd->vdev_devid != NULL) { 4748 spa_strfree(oldvd->vdev_devid); 4749 oldvd->vdev_devid = NULL; 4750 } 4751 } 4752 4753 /* mark the device being resilvered */ 4754 newvd->vdev_resilver_txg = txg; 4755 4756 /* 4757 * If the parent is not a mirror, or if we're replacing, insert the new 4758 * mirror/replacing/spare vdev above oldvd. 4759 */ 4760 if (pvd->vdev_ops != pvops) 4761 pvd = vdev_add_parent(oldvd, pvops); 4762 4763 ASSERT(pvd->vdev_top->vdev_parent == rvd); 4764 ASSERT(pvd->vdev_ops == pvops); 4765 ASSERT(oldvd->vdev_parent == pvd); 4766 4767 /* 4768 * Extract the new device from its root and add it to pvd. 4769 */ 4770 vdev_remove_child(newrootvd, newvd); 4771 newvd->vdev_id = pvd->vdev_children; 4772 newvd->vdev_crtxg = oldvd->vdev_crtxg; 4773 vdev_add_child(pvd, newvd); 4774 4775 tvd = newvd->vdev_top; 4776 ASSERT(pvd->vdev_top == tvd); 4777 ASSERT(tvd->vdev_parent == rvd); 4778 4779 vdev_config_dirty(tvd); 4780 4781 /* 4782 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 4783 * for any dmu_sync-ed blocks. It will propagate upward when 4784 * spa_vdev_exit() calls vdev_dtl_reassess(). 4785 */ 4786 dtl_max_txg = txg + TXG_CONCURRENT_STATES; 4787 4788 vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, 4789 dtl_max_txg - TXG_INITIAL); 4790 4791 if (newvd->vdev_isspare) { 4792 spa_spare_activate(newvd); 4793 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); 4794 } 4795 4796 oldvdpath = spa_strdup(oldvd->vdev_path); 4797 newvdpath = spa_strdup(newvd->vdev_path); 4798 newvd_isspare = newvd->vdev_isspare; 4799 4800 /* 4801 * Mark newvd's DTL dirty in this txg. 4802 */ 4803 vdev_dirty(tvd, VDD_DTL, newvd, txg); 4804 4805 /* 4806 * Schedule the resilver to restart in the future. We do this to 4807 * ensure that dmu_sync-ed blocks have been stitched into the 4808 * respective datasets. 4809 */ 4810 dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); 4811 4812 /* 4813 * Commit the config 4814 */ 4815 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); 4816 4817 spa_history_log_internal(spa, "vdev attach", NULL, 4818 "%s vdev=%s %s vdev=%s", 4819 replacing && newvd_isspare ? "spare in" : 4820 replacing ? "replace" : "attach", newvdpath, 4821 replacing ? "for" : "to", oldvdpath); 4822 4823 spa_strfree(oldvdpath); 4824 spa_strfree(newvdpath); 4825 4826 if (spa->spa_bootfs) 4827 spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH); 4828 4829 return (0); 4830} 4831 4832/* 4833 * Detach a device from a mirror or replacing vdev. 4834 * 4835 * If 'replace_done' is specified, only detach if the parent 4836 * is a replacing vdev. 
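 * (A spare parent is accepted as well; see the vdev_spare_ops check
 * below.)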
4837 */ 4838int 4839spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 4840{ 4841 uint64_t txg; 4842 int error; 4843 vdev_t *rvd = spa->spa_root_vdev; 4844 vdev_t *vd, *pvd, *cvd, *tvd; 4845 boolean_t unspare = B_FALSE; 4846 uint64_t unspare_guid = 0; 4847 char *vdpath; 4848 4849 ASSERT(spa_writeable(spa)); 4850 4851 txg = spa_vdev_enter(spa); 4852 4853 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 4854 4855 if (vd == NULL) 4856 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4857 4858 if (!vd->vdev_ops->vdev_op_leaf) 4859 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4860 4861 pvd = vd->vdev_parent; 4862 4863 /* 4864 * If the parent/child relationship is not as expected, don't do it. 4865 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 4866 * vdev that's replacing B with C. The user's intent in replacing 4867 * is to go from M(A,B) to M(A,C). If the user decides to cancel 4868 * the replace by detaching C, the expected behavior is to end up 4869 * M(A,B). But suppose that right after deciding to detach C, 4870 * the replacement of B completes. We would have M(A,C), and then 4871 * ask to detach C, which would leave us with just A -- not what 4872 * the user wanted. To prevent this, we make sure that the 4873 * parent/child relationship hasn't changed -- in this example, 4874 * that C's parent is still the replacing vdev R. 4875 */ 4876 if (pvd->vdev_guid != pguid && pguid != 0) 4877 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4878 4879 /* 4880 * Only 'replacing' or 'spare' vdevs can be replaced. 4881 */ 4882 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 4883 pvd->vdev_ops != &vdev_spare_ops) 4884 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4885 4886 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 4887 spa_version(spa) >= SPA_VERSION_SPARES); 4888 4889 /* 4890 * Only mirror, replacing, and spare vdevs support detach. 4891 */ 4892 if (pvd->vdev_ops != &vdev_replacing_ops && 4893 pvd->vdev_ops != &vdev_mirror_ops && 4894 pvd->vdev_ops != &vdev_spare_ops) 4895 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4896 4897 /* 4898 * If this device has the only valid copy of some data, 4899 * we cannot safely detach it. 4900 */ 4901 if (vdev_dtl_required(vd)) 4902 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4903 4904 ASSERT(pvd->vdev_children >= 2); 4905 4906 /* 4907 * If we are detaching the second disk from a replacing vdev, then 4908 * check to see if we changed the original vdev's path to have "/old" 4909 * at the end in spa_vdev_attach(). If so, undo that change now. 4910 */ 4911 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 4912 vd->vdev_path != NULL) { 4913 size_t len = strlen(vd->vdev_path); 4914 4915 for (int c = 0; c < pvd->vdev_children; c++) { 4916 cvd = pvd->vdev_child[c]; 4917 4918 if (cvd == vd || cvd->vdev_path == NULL) 4919 continue; 4920 4921 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 4922 strcmp(cvd->vdev_path + len, "/old") == 0) { 4923 spa_strfree(cvd->vdev_path); 4924 cvd->vdev_path = spa_strdup(vd->vdev_path); 4925 break; 4926 } 4927 } 4928 } 4929 4930 /* 4931 * If we are detaching the original disk from a spare, then it implies 4932 * that the spare should become a real disk, and be removed from the 4933 * active spare list for the pool. 4934 */ 4935 if (pvd->vdev_ops == &vdev_spare_ops && 4936 vd->vdev_id == 0 && 4937 pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) 4938 unspare = B_TRUE; 4939 4940 /* 4941 * Erase the disk labels so the disk can be used for other things. 
4942 * This must be done after all other error cases are handled, 4943 * but before we disembowel vd (so we can still do I/O to it). 4944 * But if we can't do it, don't treat the error as fatal -- 4945 * it may be that the unwritability of the disk is the reason 4946 * it's being detached! 4947 */ 4948 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 4949 4950 /* 4951 * Remove vd from its parent and compact the parent's children. 4952 */ 4953 vdev_remove_child(pvd, vd); 4954 vdev_compact_children(pvd); 4955 4956 /* 4957 * Remember one of the remaining children so we can get tvd below. 4958 */ 4959 cvd = pvd->vdev_child[pvd->vdev_children - 1]; 4960 4961 /* 4962 * If we need to remove the remaining child from the list of hot spares, 4963 * do it now, marking the vdev as no longer a spare in the process. 4964 * We must do this before vdev_remove_parent(), because that can 4965 * change the GUID if it creates a new toplevel GUID. For a similar 4966 * reason, we must remove the spare now, in the same txg as the detach; 4967 * otherwise someone could attach a new sibling, change the GUID, and 4968 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 4969 */ 4970 if (unspare) { 4971 ASSERT(cvd->vdev_isspare); 4972 spa_spare_remove(cvd); 4973 unspare_guid = cvd->vdev_guid; 4974 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 4975 cvd->vdev_unspare = B_TRUE; 4976 } 4977 4978 /* 4979 * If the parent mirror/replacing vdev only has one child, 4980 * the parent is no longer needed. Remove it from the tree. 4981 */ 4982 if (pvd->vdev_children == 1) { 4983 if (pvd->vdev_ops == &vdev_spare_ops) 4984 cvd->vdev_unspare = B_FALSE; 4985 vdev_remove_parent(cvd); 4986 } 4987 4988 4989 /* 4990 * We don't set tvd until now because the parent we just removed 4991 * may have been the previous top-level vdev. 4992 */ 4993 tvd = cvd->vdev_top; 4994 ASSERT(tvd->vdev_parent == rvd); 4995 4996 /* 4997 * Reevaluate the parent vdev state. 4998 */ 4999 vdev_propagate_state(cvd); 5000 5001 /* 5002 * If the 'autoexpand' property is set on the pool then automatically 5003 * try to expand the size of the pool. For example if the device we 5004 * just detached was smaller than the others, it may be possible to 5005 * add metaslabs (i.e. grow the pool). We need to reopen the vdev 5006 * first so that we can obtain the updated sizes of the leaf vdevs. 5007 */ 5008 if (spa->spa_autoexpand) { 5009 vdev_reopen(tvd); 5010 vdev_expand(tvd, txg); 5011 } 5012 5013 vdev_config_dirty(tvd); 5014 5015 /* 5016 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 5017 * vd->vdev_detached is set and free vd's DTL object in syncing context. 5018 * But first make sure we're not on any *other* txg's DTL list, to 5019 * prevent vd from being accessed after it's freed. 5020 */ 5021 vdpath = spa_strdup(vd->vdev_path); 5022 for (int t = 0; t < TXG_SIZE; t++) 5023 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 5024 vd->vdev_detached = B_TRUE; 5025 vdev_dirty(tvd, VDD_DTL, vd, txg); 5026 5027 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 5028 5029 /* hang on to the spa before we release the lock */ 5030 spa_open_ref(spa, FTAG); 5031 5032 error = spa_vdev_exit(spa, vd, txg, 0); 5033 5034 spa_history_log_internal(spa, "detach", NULL, 5035 "vdev=%s", vdpath); 5036 spa_strfree(vdpath); 5037 5038 /* 5039 * If this was the removal of the original device in a hot spare vdev, 5040 * then we want to go through and remove the device from the hot spare 5041 * list of every other pool. 
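	 * A shared spare is referenced by the config of every pool that
	 * lists it, which is why the loop below walks all active pools.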
5042 */ 5043 if (unspare) { 5044 spa_t *altspa = NULL; 5045 5046 mutex_enter(&spa_namespace_lock); 5047 while ((altspa = spa_next(altspa)) != NULL) { 5048 if (altspa->spa_state != POOL_STATE_ACTIVE || 5049 altspa == spa) 5050 continue; 5051 5052 spa_open_ref(altspa, FTAG); 5053 mutex_exit(&spa_namespace_lock); 5054 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 5055 mutex_enter(&spa_namespace_lock); 5056 spa_close(altspa, FTAG); 5057 } 5058 mutex_exit(&spa_namespace_lock); 5059 5060 /* search the rest of the vdevs for spares to remove */ 5061 spa_vdev_resilver_done(spa); 5062 } 5063 5064 /* all done with the spa; OK to release */ 5065 mutex_enter(&spa_namespace_lock); 5066 spa_close(spa, FTAG); 5067 mutex_exit(&spa_namespace_lock); 5068 5069 return (error); 5070} 5071 5072/* 5073 * Split a set of devices from their mirrors, and create a new pool from them. 5074 */ 5075int 5076spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, 5077 nvlist_t *props, boolean_t exp) 5078{ 5079 int error = 0; 5080 uint64_t txg, *glist; 5081 spa_t *newspa; 5082 uint_t c, children, lastlog; 5083 nvlist_t **child, *nvl, *tmp; 5084 dmu_tx_t *tx; 5085 char *altroot = NULL; 5086 vdev_t *rvd, **vml = NULL; /* vdev modify list */ 5087 boolean_t activate_slog; 5088 5089 ASSERT(spa_writeable(spa)); 5090 5091 txg = spa_vdev_enter(spa); 5092 5093 /* clear the log and flush everything up to now */ 5094 activate_slog = spa_passivate_log(spa); 5095 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5096 error = spa_offline_log(spa); 5097 txg = spa_vdev_config_enter(spa); 5098 5099 if (activate_slog) 5100 spa_activate_log(spa); 5101 5102 if (error != 0) 5103 return (spa_vdev_exit(spa, NULL, txg, error)); 5104 5105 /* check new spa name before going any further */ 5106 if (spa_lookup(newname) != NULL) 5107 return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 5108 5109 /* 5110 * scan through all the children to ensure they're all mirrors 5111 */ 5112 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 5113 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 5114 &children) != 0) 5115 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5116 5117 /* first, check to ensure we've got the right child count */ 5118 rvd = spa->spa_root_vdev; 5119 lastlog = 0; 5120 for (c = 0; c < rvd->vdev_children; c++) { 5121 vdev_t *vd = rvd->vdev_child[c]; 5122 5123 /* don't count the holes & logs as children */ 5124 if (vd->vdev_islog || vd->vdev_ishole) { 5125 if (lastlog == 0) 5126 lastlog = c; 5127 continue; 5128 } 5129 5130 lastlog = 0; 5131 } 5132 if (children != (lastlog != 0 ? 
lastlog : rvd->vdev_children)) 5133 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5134 5135 /* next, ensure no spare or cache devices are part of the split */ 5136 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 5137 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 5138 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5139 5140 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 5141 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 5142 5143 /* then, loop over each vdev and validate it */ 5144 for (c = 0; c < children; c++) { 5145 uint64_t is_hole = 0; 5146 5147 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 5148 &is_hole); 5149 5150 if (is_hole != 0) { 5151 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 5152 spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 5153 continue; 5154 } else { 5155 error = SET_ERROR(EINVAL); 5156 break; 5157 } 5158 } 5159 5160 /* which disk is going to be split? */ 5161 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 5162 &glist[c]) != 0) { 5163 error = SET_ERROR(EINVAL); 5164 break; 5165 } 5166 5167 /* look it up in the spa */ 5168 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 5169 if (vml[c] == NULL) { 5170 error = SET_ERROR(ENODEV); 5171 break; 5172 } 5173 5174 /* make sure there's nothing stopping the split */ 5175 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 5176 vml[c]->vdev_islog || 5177 vml[c]->vdev_ishole || 5178 vml[c]->vdev_isspare || 5179 vml[c]->vdev_isl2cache || 5180 !vdev_writeable(vml[c]) || 5181 vml[c]->vdev_children != 0 || 5182 vml[c]->vdev_state != VDEV_STATE_HEALTHY || 5183 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 5184 error = SET_ERROR(EINVAL); 5185 break; 5186 } 5187 5188 if (vdev_dtl_required(vml[c])) { 5189 error = SET_ERROR(EBUSY); 5190 break; 5191 } 5192 5193 /* we need certain info from the top level */ 5194 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 5195 vml[c]->vdev_top->vdev_ms_array) == 0); 5196 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 5197 vml[c]->vdev_top->vdev_ms_shift) == 0); 5198 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 5199 vml[c]->vdev_top->vdev_asize) == 0); 5200 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 5201 vml[c]->vdev_top->vdev_ashift) == 0); 5202 } 5203 5204 if (error != 0) { 5205 kmem_free(vml, children * sizeof (vdev_t *)); 5206 kmem_free(glist, children * sizeof (uint64_t)); 5207 return (spa_vdev_exit(spa, NULL, txg, error)); 5208 } 5209 5210 /* stop writers from using the disks */ 5211 for (c = 0; c < children; c++) { 5212 if (vml[c] != NULL) 5213 vml[c]->vdev_offline = B_TRUE; 5214 } 5215 vdev_reopen(spa->spa_root_vdev); 5216 5217 /* 5218 * Temporarily record the splitting vdevs in the spa config. This 5219 * will disappear once the config is regenerated. 5220 */ 5221 VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5222 VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 5223 glist, children) == 0); 5224 kmem_free(glist, children * sizeof (uint64_t)); 5225 5226 mutex_enter(&spa->spa_props_lock); 5227 VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, 5228 nvl) == 0); 5229 mutex_exit(&spa->spa_props_lock); 5230 spa->spa_config_splitting = nvl; 5231 vdev_config_dirty(spa->spa_root_vdev); 5232 5233 /* configure and create the new pool */ 5234 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); 5235 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 5236 exp ? 
POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); 5237 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 5238 spa_version(spa)) == 0); 5239 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, 5240 spa->spa_config_txg) == 0); 5241 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 5242 spa_generate_guid(NULL)) == 0); 5243 (void) nvlist_lookup_string(props, 5244 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 5245 5246 /* add the new pool to the namespace */ 5247 newspa = spa_add(newname, config, altroot); 5248 newspa->spa_config_txg = spa->spa_config_txg; 5249 spa_set_log_state(newspa, SPA_LOG_CLEAR); 5250 5251 /* release the spa config lock, retaining the namespace lock */ 5252 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5253 5254 if (zio_injection_enabled) 5255 zio_handle_panic_injection(spa, FTAG, 1); 5256 5257 spa_activate(newspa, spa_mode_global); 5258 spa_async_suspend(newspa); 5259 5260#ifndef sun 5261 /* mark that we are creating new spa by splitting */ 5262 newspa->spa_splitting_newspa = B_TRUE; 5263#endif 5264 /* create the new pool from the disks of the original pool */ 5265 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); 5266#ifndef sun 5267 newspa->spa_splitting_newspa = B_FALSE; 5268#endif 5269 if (error) 5270 goto out; 5271 5272 /* if that worked, generate a real config for the new pool */ 5273 if (newspa->spa_root_vdev != NULL) { 5274 VERIFY(nvlist_alloc(&newspa->spa_config_splitting, 5275 NV_UNIQUE_NAME, KM_SLEEP) == 0); 5276 VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, 5277 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); 5278 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 5279 B_TRUE)); 5280 } 5281 5282 /* set the props */ 5283 if (props != NULL) { 5284 spa_configfile_set(newspa, props, B_FALSE); 5285 error = spa_prop_set(newspa, props); 5286 if (error) 5287 goto out; 5288 } 5289 5290 /* flush everything */ 5291 txg = spa_vdev_config_enter(newspa); 5292 vdev_config_dirty(newspa->spa_root_vdev); 5293 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 5294 5295 if (zio_injection_enabled) 5296 zio_handle_panic_injection(spa, FTAG, 2); 5297 5298 spa_async_resume(newspa); 5299 5300 /* finally, update the original pool's config */ 5301 txg = spa_vdev_config_enter(spa); 5302 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 5303 error = dmu_tx_assign(tx, TXG_WAIT); 5304 if (error != 0) 5305 dmu_tx_abort(tx); 5306 for (c = 0; c < children; c++) { 5307 if (vml[c] != NULL) { 5308 vdev_split(vml[c]); 5309 if (error == 0) 5310 spa_history_log_internal(spa, "detach", tx, 5311 "vdev=%s", vml[c]->vdev_path); 5312 vdev_free(vml[c]); 5313 } 5314 } 5315 vdev_config_dirty(spa->spa_root_vdev); 5316 spa->spa_config_splitting = NULL; 5317 nvlist_free(nvl); 5318 if (error == 0) 5319 dmu_tx_commit(tx); 5320 (void) spa_vdev_exit(spa, NULL, txg, 0); 5321 5322 if (zio_injection_enabled) 5323 zio_handle_panic_injection(spa, FTAG, 3); 5324 5325 /* split is complete; log a history record */ 5326 spa_history_log_internal(newspa, "split", NULL, 5327 "from pool %s", spa_name(spa)); 5328 5329 kmem_free(vml, children * sizeof (vdev_t *)); 5330 5331 /* if we're not going to mount the filesystems in userland, export */ 5332 if (exp) 5333 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 5334 B_FALSE, B_FALSE); 5335 5336 return (error); 5337 5338out: 5339 spa_unload(newspa); 5340 spa_deactivate(newspa); 5341 spa_remove(newspa); 5342 5343 txg = spa_vdev_config_enter(spa); 5344 5345 /* re-online all offlined disks */ 5346 
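	/* (the split failed, so the original pool keeps all its children) */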
for (c = 0; c < children; c++) { 5347 if (vml[c] != NULL) 5348 vml[c]->vdev_offline = B_FALSE; 5349 } 5350 vdev_reopen(spa->spa_root_vdev); 5351 5352 nvlist_free(spa->spa_config_splitting); 5353 spa->spa_config_splitting = NULL; 5354 (void) spa_vdev_exit(spa, NULL, txg, error); 5355 5356 kmem_free(vml, children * sizeof (vdev_t *)); 5357 return (error); 5358} 5359 5360static nvlist_t * 5361spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 5362{ 5363 for (int i = 0; i < count; i++) { 5364 uint64_t guid; 5365 5366 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 5367 &guid) == 0); 5368 5369 if (guid == target_guid) 5370 return (nvpp[i]); 5371 } 5372 5373 return (NULL); 5374} 5375 5376static void 5377spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 5378 nvlist_t *dev_to_remove) 5379{ 5380 nvlist_t **newdev = NULL; 5381 5382 if (count > 1) 5383 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 5384 5385 for (int i = 0, j = 0; i < count; i++) { 5386 if (dev[i] == dev_to_remove) 5387 continue; 5388 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 5389 } 5390 5391 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 5392 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 5393 5394 for (int i = 0; i < count - 1; i++) 5395 nvlist_free(newdev[i]); 5396 5397 if (count > 1) 5398 kmem_free(newdev, (count - 1) * sizeof (void *)); 5399} 5400 5401/* 5402 * Evacuate the device. 5403 */ 5404static int 5405spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) 5406{ 5407 uint64_t txg; 5408 int error = 0; 5409 5410 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5411 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5412 ASSERT(vd == vd->vdev_top); 5413 5414 /* 5415 * Evacuate the device. We don't hold the config lock as writer 5416 * since we need to do I/O but we do keep the 5417 * spa_namespace_lock held. Once this completes the device 5418 * should no longer have any blocks allocated on it. 5419 */ 5420 if (vd->vdev_islog) { 5421 if (vd->vdev_stat.vs_alloc != 0) 5422 error = spa_offline_log(spa); 5423 } else { 5424 error = SET_ERROR(ENOTSUP); 5425 } 5426 5427 if (error) 5428 return (error); 5429 5430 /* 5431 * The evacuation succeeded. Remove any remaining MOS metadata 5432 * associated with this vdev, and wait for these changes to sync. 5433 */ 5434 ASSERT0(vd->vdev_stat.vs_alloc); 5435 txg = spa_vdev_config_enter(spa); 5436 vd->vdev_removing = B_TRUE; 5437 vdev_dirty_leaves(vd, VDD_DTL, txg); 5438 vdev_config_dirty(vd); 5439 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5440 5441 return (0); 5442} 5443 5444/* 5445 * Complete the removal by cleaning up the namespace. 5446 */ 5447static void 5448spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) 5449{ 5450 vdev_t *rvd = spa->spa_root_vdev; 5451 uint64_t id = vd->vdev_id; 5452 boolean_t last_vdev = (id == (rvd->vdev_children - 1)); 5453 5454 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5455 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 5456 ASSERT(vd == vd->vdev_top); 5457 5458 /* 5459 * Only remove any devices which are empty. 
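	 * A nonzero vs_alloc means evacuation did not complete, so the
	 * vdev must stay in the tree.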
5460 */ 5461 if (vd->vdev_stat.vs_alloc != 0) 5462 return; 5463 5464 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 5465 5466 if (list_link_active(&vd->vdev_state_dirty_node)) 5467 vdev_state_clean(vd); 5468 if (list_link_active(&vd->vdev_config_dirty_node)) 5469 vdev_config_clean(vd); 5470 5471 vdev_free(vd); 5472 5473 if (last_vdev) { 5474 vdev_compact_children(rvd); 5475 } else { 5476 vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); 5477 vdev_add_child(rvd, vd); 5478 } 5479 vdev_config_dirty(rvd); 5480 5481 /* 5482 * Reassess the health of our root vdev. 5483 */ 5484 vdev_reopen(rvd); 5485} 5486 5487/* 5488 * Remove a device from the pool - 5489 * 5490 * Removing a device from the vdev namespace requires several steps 5491 * and can take a significant amount of time. As a result we use 5492 * the spa_vdev_config_[enter/exit] functions which allow us to 5493 * grab and release the spa_config_lock while still holding the namespace 5494 * lock. During each step the configuration is synced out. 5495 * 5496 * Currently, this supports removing only hot spares, slogs, and level 2 ARC 5497 * devices. 5498 */ 5499int 5500spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 5501{ 5502 vdev_t *vd; 5503 metaslab_group_t *mg; 5504 nvlist_t **spares, **l2cache, *nv; 5505 uint64_t txg = 0; 5506 uint_t nspares, nl2cache; 5507 int error = 0; 5508 boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 5509 5510 ASSERT(spa_writeable(spa)); 5511 5512 if (!locked) 5513 txg = spa_vdev_enter(spa); 5514 5515 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 5516 5517 if (spa->spa_spares.sav_vdevs != NULL && 5518 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 5519 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 5520 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 5521 /* 5522 * Only remove the hot spare if it's not currently in use 5523 * in this pool. 5524 */ 5525 if (vd == NULL || unspare) { 5526 spa_vdev_remove_aux(spa->spa_spares.sav_config, 5527 ZPOOL_CONFIG_SPARES, spares, nspares, nv); 5528 spa_load_spares(spa); 5529 spa->spa_spares.sav_sync = B_TRUE; 5530 } else { 5531 error = SET_ERROR(EBUSY); 5532 } 5533 } else if (spa->spa_l2cache.sav_vdevs != NULL && 5534 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 5535 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 5536 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 5537 /* 5538 * Cache devices can always be removed. 5539 */ 5540 spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 5541 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 5542 spa_load_l2cache(spa); 5543 spa->spa_l2cache.sav_sync = B_TRUE; 5544 } else if (vd != NULL && vd->vdev_islog) { 5545 ASSERT(!locked); 5546 ASSERT(vd == vd->vdev_top); 5547 5548 mg = vd->vdev_mg; 5549 5550 /* 5551 * Stop allocating from this vdev. 5552 */ 5553 metaslab_group_passivate(mg); 5554 5555 /* 5556 * Wait for the youngest allocations and frees to sync, 5557 * and then wait for the deferral of those frees to finish. 5558 */ 5559 spa_vdev_config_exit(spa, NULL, 5560 txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 5561 5562 /* 5563 * Attempt to evacuate the vdev. 5564 */ 5565 error = spa_vdev_remove_evacuate(spa, vd); 5566 5567 txg = spa_vdev_config_enter(spa); 5568 5569 /* 5570 * If we couldn't evacuate the vdev, unwind. 5571 */ 5572 if (error) { 5573 metaslab_group_activate(mg); 5574 return (spa_vdev_exit(spa, NULL, txg, error)); 5575 } 5576 5577 /* 5578 * Clean up the vdev namespace. 
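		 * This replaces the removed slog with a hole vdev (or
		 * compacts the root vdev's children if it was the last one),
		 * keeping the remaining top-level vdev ids stable.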
5579 */ 5580 spa_vdev_remove_from_namespace(spa, vd); 5581 5582 } else if (vd != NULL) { 5583 /* 5584 * Normal vdevs cannot be removed (yet). 5585 */ 5586 error = SET_ERROR(ENOTSUP); 5587 } else { 5588 /* 5589 * There is no vdev of any kind with the specified guid. 5590 */ 5591 error = SET_ERROR(ENOENT); 5592 } 5593 5594 if (!locked) 5595 return (spa_vdev_exit(spa, NULL, txg, error)); 5596 5597 return (error); 5598} 5599 5600/* 5601 * Find any device that's done replacing, or a vdev marked 'unspare' that's 5602 * currently spared, so we can detach it. 5603 */ 5604static vdev_t * 5605spa_vdev_resilver_done_hunt(vdev_t *vd) 5606{ 5607 vdev_t *newvd, *oldvd; 5608 5609 for (int c = 0; c < vd->vdev_children; c++) { 5610 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 5611 if (oldvd != NULL) 5612 return (oldvd); 5613 } 5614 5615 /* 5616 * Check for a completed replacement. We always consider the first 5617 * vdev in the list to be the oldest vdev, and the last one to be 5618 * the newest (see spa_vdev_attach() for how that works). In 5619 * the case where the newest vdev is faulted, we will not automatically 5620 * remove it after a resilver completes. This is OK as it will require 5621 * user intervention to determine which disk the admin wishes to keep. 5622 */ 5623 if (vd->vdev_ops == &vdev_replacing_ops) { 5624 ASSERT(vd->vdev_children > 1); 5625 5626 newvd = vd->vdev_child[vd->vdev_children - 1]; 5627 oldvd = vd->vdev_child[0]; 5628 5629 if (vdev_dtl_empty(newvd, DTL_MISSING) && 5630 vdev_dtl_empty(newvd, DTL_OUTAGE) && 5631 !vdev_dtl_required(oldvd)) 5632 return (oldvd); 5633 } 5634 5635 /* 5636 * Check for a completed resilver with the 'unspare' flag set. 5637 */ 5638 if (vd->vdev_ops == &vdev_spare_ops) { 5639 vdev_t *first = vd->vdev_child[0]; 5640 vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 5641 5642 if (last->vdev_unspare) { 5643 oldvd = first; 5644 newvd = last; 5645 } else if (first->vdev_unspare) { 5646 oldvd = last; 5647 newvd = first; 5648 } else { 5649 oldvd = NULL; 5650 } 5651 5652 if (oldvd != NULL && 5653 vdev_dtl_empty(newvd, DTL_MISSING) && 5654 vdev_dtl_empty(newvd, DTL_OUTAGE) && 5655 !vdev_dtl_required(oldvd)) 5656 return (oldvd); 5657 5658 /* 5659 * If there are more than two spares attached to a disk, 5660 * and those spares are not required, then we want to 5661 * attempt to free them up now so that they can be used 5662 * by other pools. Once we're back down to a single 5663 * disk+spare, we stop removing them. 5664 */ 5665 if (vd->vdev_children > 2) { 5666 newvd = vd->vdev_child[1]; 5667 5668 if (newvd->vdev_isspare && last->vdev_isspare && 5669 vdev_dtl_empty(last, DTL_MISSING) && 5670 vdev_dtl_empty(last, DTL_OUTAGE) && 5671 !vdev_dtl_required(newvd)) 5672 return (newvd); 5673 } 5674 } 5675 5676 return (NULL); 5677} 5678 5679static void 5680spa_vdev_resilver_done(spa_t *spa) 5681{ 5682 vdev_t *vd, *pvd, *ppvd; 5683 uint64_t guid, sguid, pguid, ppguid; 5684 5685 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5686 5687 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 5688 pvd = vd->vdev_parent; 5689 ppvd = pvd->vdev_parent; 5690 guid = vd->vdev_guid; 5691 pguid = pvd->vdev_guid; 5692 ppguid = ppvd->vdev_guid; 5693 sguid = 0; 5694 /* 5695 * If we have just finished replacing a hot spared device, then 5696 * we need to detach the parent's first child (the original hot 5697 * spare) as well. 
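		 * In that case the tree is spare(replacing(old, new), spare
		 * device): the spare is ppvd->vdev_child[1], and it is
		 * detached by the second spa_vdev_detach() call below.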
5698 */ 5699 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 5700 ppvd->vdev_children == 2) { 5701 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 5702 sguid = ppvd->vdev_child[1]->vdev_guid; 5703 } 5704 ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); 5705 5706 spa_config_exit(spa, SCL_ALL, FTAG); 5707 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 5708 return; 5709 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 5710 return; 5711 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5712 } 5713 5714 spa_config_exit(spa, SCL_ALL, FTAG); 5715} 5716 5717/* 5718 * Update the stored path or FRU for this vdev. 5719 */ 5720int 5721spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 5722 boolean_t ispath) 5723{ 5724 vdev_t *vd; 5725 boolean_t sync = B_FALSE; 5726 5727 ASSERT(spa_writeable(spa)); 5728 5729 spa_vdev_state_enter(spa, SCL_ALL); 5730 5731 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 5732 return (spa_vdev_state_exit(spa, NULL, ENOENT)); 5733 5734 if (!vd->vdev_ops->vdev_op_leaf) 5735 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 5736 5737 if (ispath) { 5738 if (strcmp(value, vd->vdev_path) != 0) { 5739 spa_strfree(vd->vdev_path); 5740 vd->vdev_path = spa_strdup(value); 5741 sync = B_TRUE; 5742 } 5743 } else { 5744 if (vd->vdev_fru == NULL) { 5745 vd->vdev_fru = spa_strdup(value); 5746 sync = B_TRUE; 5747 } else if (strcmp(value, vd->vdev_fru) != 0) { 5748 spa_strfree(vd->vdev_fru); 5749 vd->vdev_fru = spa_strdup(value); 5750 sync = B_TRUE; 5751 } 5752 } 5753 5754 return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); 5755} 5756 5757int 5758spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 5759{ 5760 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 5761} 5762 5763int 5764spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 5765{ 5766 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 5767} 5768 5769/* 5770 * ========================================================================== 5771 * SPA Scanning 5772 * ========================================================================== 5773 */ 5774 5775int 5776spa_scan_stop(spa_t *spa) 5777{ 5778 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5779 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 5780 return (SET_ERROR(EBUSY)); 5781 return (dsl_scan_cancel(spa->spa_dsl_pool)); 5782} 5783 5784int 5785spa_scan(spa_t *spa, pool_scan_func_t func) 5786{ 5787 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5788 5789 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 5790 return (SET_ERROR(ENOTSUP)); 5791 5792 /* 5793 * If a resilver was requested, but there is no DTL on a 5794 * writeable leaf device, we have nothing to do. 
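	 * We still post SPA_ASYNC_RESILVER_DONE so that any replacing or
	 * unspared vdevs left over from an earlier resilver get detached.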
5795 */ 5796 if (func == POOL_SCAN_RESILVER && 5797 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 5798 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 5799 return (0); 5800 } 5801 5802 return (dsl_scan(spa->spa_dsl_pool, func)); 5803} 5804 5805/* 5806 * ========================================================================== 5807 * SPA async task processing 5808 * ========================================================================== 5809 */ 5810 5811static void 5812spa_async_remove(spa_t *spa, vdev_t *vd) 5813{ 5814 if (vd->vdev_remove_wanted) { 5815 vd->vdev_remove_wanted = B_FALSE; 5816 vd->vdev_delayed_close = B_FALSE; 5817 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 5818 5819 /* 5820 * We want to clear the stats, but we don't want to do a full 5821 * vdev_clear() as that will cause us to throw away 5822 * degraded/faulted state as well as attempt to reopen the 5823 * device, all of which is a waste. 5824 */ 5825 vd->vdev_stat.vs_read_errors = 0; 5826 vd->vdev_stat.vs_write_errors = 0; 5827 vd->vdev_stat.vs_checksum_errors = 0; 5828 5829 vdev_state_dirty(vd->vdev_top); 5830 } 5831 5832 for (int c = 0; c < vd->vdev_children; c++) 5833 spa_async_remove(spa, vd->vdev_child[c]); 5834} 5835 5836static void 5837spa_async_probe(spa_t *spa, vdev_t *vd) 5838{ 5839 if (vd->vdev_probe_wanted) { 5840 vd->vdev_probe_wanted = B_FALSE; 5841 vdev_reopen(vd); /* vdev_open() does the actual probe */ 5842 } 5843 5844 for (int c = 0; c < vd->vdev_children; c++) 5845 spa_async_probe(spa, vd->vdev_child[c]); 5846} 5847 5848static void 5849spa_async_autoexpand(spa_t *spa, vdev_t *vd) 5850{ 5851 sysevent_id_t eid; 5852 nvlist_t *attr; 5853 char *physpath; 5854 5855 if (!spa->spa_autoexpand) 5856 return; 5857 5858 for (int c = 0; c < vd->vdev_children; c++) { 5859 vdev_t *cvd = vd->vdev_child[c]; 5860 spa_async_autoexpand(spa, cvd); 5861 } 5862 5863 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 5864 return; 5865 5866 physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 5867 (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 5868 5869 VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5870 VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 5871 5872 (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 5873 ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP); 5874 5875 nvlist_free(attr); 5876 kmem_free(physpath, MAXPATHLEN); 5877} 5878 5879static void 5880spa_async_thread(void *arg) 5881{ 5882 spa_t *spa = arg; 5883 int tasks; 5884 5885 ASSERT(spa->spa_sync_on); 5886 5887 mutex_enter(&spa->spa_async_lock); 5888 tasks = spa->spa_async_tasks; 5889 spa->spa_async_tasks &= SPA_ASYNC_REMOVE; 5890 mutex_exit(&spa->spa_async_lock); 5891 5892 /* 5893 * See if the config needs to be updated. 5894 */ 5895 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 5896 uint64_t old_space, new_space; 5897 5898 mutex_enter(&spa_namespace_lock); 5899 old_space = metaslab_class_get_space(spa_normal_class(spa)); 5900 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 5901 new_space = metaslab_class_get_space(spa_normal_class(spa)); 5902 mutex_exit(&spa_namespace_lock); 5903 5904 /* 5905 * If the pool grew as a result of the config update, 5906 * then log an internal history event. 
5907 */ 5908 if (new_space != old_space) { 5909 spa_history_log_internal(spa, "vdev online", NULL, 5910 "pool '%s' size: %llu(+%llu)", 5911 spa_name(spa), new_space, new_space - old_space); 5912 } 5913 } 5914 5915 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 5916 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5917 spa_async_autoexpand(spa, spa->spa_root_vdev); 5918 spa_config_exit(spa, SCL_CONFIG, FTAG); 5919 } 5920 5921 /* 5922 * See if any devices need to be probed. 5923 */ 5924 if (tasks & SPA_ASYNC_PROBE) { 5925 spa_vdev_state_enter(spa, SCL_NONE); 5926 spa_async_probe(spa, spa->spa_root_vdev); 5927 (void) spa_vdev_state_exit(spa, NULL, 0); 5928 } 5929 5930 /* 5931 * If any devices are done replacing, detach them. 5932 */ 5933 if (tasks & SPA_ASYNC_RESILVER_DONE) 5934 spa_vdev_resilver_done(spa); 5935 5936 /* 5937 * Kick off a resilver. 5938 */ 5939 if (tasks & SPA_ASYNC_RESILVER) 5940 dsl_resilver_restart(spa->spa_dsl_pool, 0); 5941 5942 /* 5943 * Let the world know that we're done. 5944 */ 5945 mutex_enter(&spa->spa_async_lock); 5946 spa->spa_async_thread = NULL; 5947 cv_broadcast(&spa->spa_async_cv); 5948 mutex_exit(&spa->spa_async_lock); 5949 thread_exit(); 5950} 5951 5952static void 5953spa_async_thread_vd(void *arg) 5954{ 5955 spa_t *spa = arg; 5956 int tasks; 5957 5958 ASSERT(spa->spa_sync_on); 5959 5960 mutex_enter(&spa->spa_async_lock); 5961 tasks = spa->spa_async_tasks; 5962retry: 5963 spa->spa_async_tasks &= ~SPA_ASYNC_REMOVE; 5964 mutex_exit(&spa->spa_async_lock); 5965 5966 /* 5967 * See if any devices need to be marked REMOVED. 5968 */ 5969 if (tasks & SPA_ASYNC_REMOVE) { 5970 spa_vdev_state_enter(spa, SCL_NONE); 5971 spa_async_remove(spa, spa->spa_root_vdev); 5972 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 5973 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 5974 for (int i = 0; i < spa->spa_spares.sav_count; i++) 5975 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 5976 (void) spa_vdev_state_exit(spa, NULL, 0); 5977 } 5978 5979 /* 5980 * Let the world know that we're done. 
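	 * If new SPA_ASYNC_REMOVE requests arrived while we were
	 * running, loop back and service them before exiting.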
5981	 */
5982	mutex_enter(&spa->spa_async_lock);
5983	tasks = spa->spa_async_tasks;
5984	if ((tasks & SPA_ASYNC_REMOVE) != 0)
5985		goto retry;
5986	spa->spa_async_thread_vd = NULL;
5987	cv_broadcast(&spa->spa_async_cv);
5988	mutex_exit(&spa->spa_async_lock);
5989	thread_exit();
5990}
5991
5992void
5993spa_async_suspend(spa_t *spa)
5994{
5995	mutex_enter(&spa->spa_async_lock);
5996	spa->spa_async_suspended++;
5997	while (spa->spa_async_thread != NULL ||
5998	    spa->spa_async_thread_vd != NULL)
5999		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
6000	mutex_exit(&spa->spa_async_lock);
6001}
6002
6003void
6004spa_async_resume(spa_t *spa)
6005{
6006	mutex_enter(&spa->spa_async_lock);
6007	ASSERT(spa->spa_async_suspended != 0);
6008	spa->spa_async_suspended--;
6009	mutex_exit(&spa->spa_async_lock);
6010}
6011
6012static boolean_t
6013spa_async_tasks_pending(spa_t *spa)
6014{
6015	uint_t non_config_tasks;
6016	uint_t config_task;
6017	boolean_t config_task_suspended;
6018
6019	non_config_tasks = spa->spa_async_tasks & ~(SPA_ASYNC_CONFIG_UPDATE |
6020	    SPA_ASYNC_REMOVE);
6021	config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE;
6022	if (spa->spa_ccw_fail_time == 0) {
6023		config_task_suspended = B_FALSE;
6024	} else {
6025		config_task_suspended =
6026		    (gethrtime() - spa->spa_ccw_fail_time) <
6027		    (zfs_ccw_retry_interval * NANOSEC);
6028	}
6029
6030	return (non_config_tasks || (config_task && !config_task_suspended));
6031}
6032
6033static void
6034spa_async_dispatch(spa_t *spa)
6035{
6036	mutex_enter(&spa->spa_async_lock);
6037	if (spa_async_tasks_pending(spa) &&
6038	    !spa->spa_async_suspended &&
6039	    spa->spa_async_thread == NULL &&
6040	    rootdir != NULL)
6041		spa->spa_async_thread = thread_create(NULL, 0,
6042		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
6043	mutex_exit(&spa->spa_async_lock);
6044}
6045
6046static void
6047spa_async_dispatch_vd(spa_t *spa)
6048{
6049	mutex_enter(&spa->spa_async_lock);
6050	if ((spa->spa_async_tasks & SPA_ASYNC_REMOVE) != 0 &&
6051	    !spa->spa_async_suspended &&
6052	    spa->spa_async_thread_vd == NULL &&
6053	    rootdir != NULL)
6054		spa->spa_async_thread_vd = thread_create(NULL, 0,
6055		    spa_async_thread_vd, spa, 0, &p0, TS_RUN, maxclsyspri);
6056	mutex_exit(&spa->spa_async_lock);
6057}
6058
6059void
6060spa_async_request(spa_t *spa, int task)
6061{
6062	zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
6063	mutex_enter(&spa->spa_async_lock);
6064	spa->spa_async_tasks |= task;
6065	mutex_exit(&spa->spa_async_lock);
6066	spa_async_dispatch_vd(spa);
6067}
6068
6069/*
6070 * ==========================================================================
6071 * SPA syncing routines
6072 * ==========================================================================
6073 */
6074
6075static int
6076bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
6077{
6078	bpobj_t *bpo = arg;
6079	bpobj_enqueue(bpo, bp, tx);
6080	return (0);
6081}
6082
6083static int
6084spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
6085{
6086	zio_t *zio = arg;
6087
6088	zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
6089	    BP_GET_PSIZE(bp), zio->io_flags));
6090	return (0);
6091}
6092
6093/*
6094 * Note: this simple function is not inlined to make it easier to dtrace the
6095 * amount of time spent syncing frees.
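 *
 * For example, a minimal sketch (assuming the fbt provider is available):
 *
 *	dtrace -n 'fbt::spa_sync_frees:entry { self->ts = timestamp }
 *	    fbt::spa_sync_frees:return /self->ts/ {
 *	        @["ns"] = quantize(timestamp - self->ts); self->ts = 0; }'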
6096 */ 6097static void 6098spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) 6099{ 6100 zio_t *zio = zio_root(spa, NULL, NULL, 0); 6101 bplist_iterate(bpl, spa_free_sync_cb, zio, tx); 6102 VERIFY(zio_wait(zio) == 0); 6103} 6104 6105/* 6106 * Note: this simple function is not inlined to make it easier to dtrace the 6107 * amount of time spent syncing deferred frees. 6108 */ 6109static void 6110spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) 6111{ 6112 zio_t *zio = zio_root(spa, NULL, NULL, 0); 6113 VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, 6114 spa_free_sync_cb, zio, tx), ==, 0); 6115 VERIFY0(zio_wait(zio)); 6116} 6117 6118 6119static void 6120spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 6121{ 6122 char *packed = NULL; 6123 size_t bufsize; 6124 size_t nvsize = 0; 6125 dmu_buf_t *db; 6126 6127 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 6128 6129 /* 6130 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 6131 * information. This avoids the dmu_buf_will_dirty() path and 6132 * saves us a pre-read to get data we don't actually care about. 6133 */ 6134 bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); 6135 packed = kmem_alloc(bufsize, KM_SLEEP); 6136 6137 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 6138 KM_SLEEP) == 0); 6139 bzero(packed + nvsize, bufsize - nvsize); 6140 6141 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 6142 6143 kmem_free(packed, bufsize); 6144 6145 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 6146 dmu_buf_will_dirty(db, tx); 6147 *(uint64_t *)db->db_data = nvsize; 6148 dmu_buf_rele(db, FTAG); 6149} 6150 6151static void 6152spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 6153 const char *config, const char *entry) 6154{ 6155 nvlist_t *nvroot; 6156 nvlist_t **list; 6157 int i; 6158 6159 if (!sav->sav_sync) 6160 return; 6161 6162 /* 6163 * Update the MOS nvlist describing the list of available devices. 6164 * spa_validate_aux() will have already made sure this nvlist is 6165 * valid and the vdevs are labeled appropriately. 
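	 * The packed nvlist lands in a DMU_OT_PACKED_NVLIST object that is
	 * linked from the MOS directory under 'entry' (DMU_POOL_SPARES or
	 * DMU_POOL_L2CACHE).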
6166 */ 6167 if (sav->sav_object == 0) { 6168 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 6169 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 6170 sizeof (uint64_t), tx); 6171 VERIFY(zap_update(spa->spa_meta_objset, 6172 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 6173 &sav->sav_object, tx) == 0); 6174 } 6175 6176 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 6177 if (sav->sav_count == 0) { 6178 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 6179 } else { 6180 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 6181 for (i = 0; i < sav->sav_count; i++) 6182 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 6183 B_FALSE, VDEV_CONFIG_L2CACHE); 6184 VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 6185 sav->sav_count) == 0); 6186 for (i = 0; i < sav->sav_count; i++) 6187 nvlist_free(list[i]); 6188 kmem_free(list, sav->sav_count * sizeof (void *)); 6189 } 6190 6191 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 6192 nvlist_free(nvroot); 6193 6194 sav->sav_sync = B_FALSE; 6195} 6196 6197static void 6198spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 6199{ 6200 nvlist_t *config; 6201 6202 if (list_is_empty(&spa->spa_config_dirty_list)) 6203 return; 6204 6205 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6206 6207 config = spa_config_generate(spa, spa->spa_root_vdev, 6208 dmu_tx_get_txg(tx), B_FALSE); 6209 6210 /* 6211 * If we're upgrading the spa version then make sure that 6212 * the config object gets updated with the correct version. 6213 */ 6214 if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) 6215 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 6216 spa->spa_uberblock.ub_version); 6217 6218 spa_config_exit(spa, SCL_STATE, FTAG); 6219 6220 if (spa->spa_config_syncing) 6221 nvlist_free(spa->spa_config_syncing); 6222 spa->spa_config_syncing = config; 6223 6224 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 6225} 6226 6227static void 6228spa_sync_version(void *arg, dmu_tx_t *tx) 6229{ 6230 uint64_t *versionp = arg; 6231 uint64_t version = *versionp; 6232 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6233 6234 /* 6235 * Setting the version is special cased when first creating the pool. 6236 */ 6237 ASSERT(tx->tx_txg != TXG_INITIAL); 6238 6239 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 6240 ASSERT(version >= spa_version(spa)); 6241 6242 spa->spa_uberblock.ub_version = version; 6243 vdev_config_dirty(spa->spa_root_vdev); 6244 spa_history_log_internal(spa, "set", tx, "version=%lld", version); 6245} 6246 6247/* 6248 * Set zpool properties. 6249 */ 6250static void 6251spa_sync_props(void *arg, dmu_tx_t *tx) 6252{ 6253 nvlist_t *nvp = arg; 6254 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6255 objset_t *mos = spa->spa_meta_objset; 6256 nvpair_t *elem = NULL; 6257 6258 mutex_enter(&spa->spa_props_lock); 6259 6260 while ((elem = nvlist_next_nvpair(nvp, elem))) { 6261 uint64_t intval; 6262 char *strval, *fname; 6263 zpool_prop_t prop; 6264 const char *propname; 6265 zprop_type_t proptype; 6266 spa_feature_t fid; 6267 6268 switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 6269 case ZPROP_INVAL: 6270 /* 6271 * We checked this earlier in spa_prop_validate(). 
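			 * Only feature properties, named "feature@<name>"
			 * (e.g. "feature@async_destroy"), reach this case.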
6272			 */
6273			ASSERT(zpool_prop_feature(nvpair_name(elem)));
6274
6275			fname = strchr(nvpair_name(elem), '@') + 1;
6276			VERIFY0(zfeature_lookup_name(fname, &fid));
6277
6278			spa_feature_enable(spa, fid, tx);
6279			spa_history_log_internal(spa, "set", tx,
6280			    "%s=enabled", nvpair_name(elem));
6281			break;
6282
6283		case ZPOOL_PROP_VERSION:
6284			intval = fnvpair_value_uint64(elem);
6285			/*
6286			 * The version is synced separately before other
6287			 * properties and should be correct by now.
6288			 */
6289			ASSERT3U(spa_version(spa), >=, intval);
6290			break;
6291
6292		case ZPOOL_PROP_ALTROOT:
6293			/*
6294			 * 'altroot' is a non-persistent property. It should
6295			 * have been set temporarily at creation or import time.
6296			 */
6297			ASSERT(spa->spa_root != NULL);
6298			break;
6299
6300		case ZPOOL_PROP_READONLY:
6301		case ZPOOL_PROP_CACHEFILE:
6302			/*
6303			 * 'readonly' and 'cachefile' are also non-persistent
6304			 * properties.
6305			 */
6306			break;
6307		case ZPOOL_PROP_COMMENT:
6308			strval = fnvpair_value_string(elem);
6309			if (spa->spa_comment != NULL)
6310				spa_strfree(spa->spa_comment);
6311			spa->spa_comment = spa_strdup(strval);
6312			/*
6313			 * We need to dirty the configuration on all the vdevs
6314			 * so that their labels get updated. It's unnecessary
6315			 * to do this for pool creation since the vdev's
6316			 * configuration has already been dirtied.
6317			 */
6318			if (tx->tx_txg != TXG_INITIAL)
6319				vdev_config_dirty(spa->spa_root_vdev);
6320			spa_history_log_internal(spa, "set", tx,
6321			    "%s=%s", nvpair_name(elem), strval);
6322			break;
6323		default:
6324			/*
6325			 * Set pool property values in the poolprops mos object.
6326			 */
6327			if (spa->spa_pool_props_object == 0) {
6328				spa->spa_pool_props_object =
6329				    zap_create_link(mos, DMU_OT_POOL_PROPS,
6330				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
6331				    tx);
6332			}
6333
6334			/* normalize the property name */
6335			propname = zpool_prop_to_name(prop);
6336			proptype = zpool_prop_get_type(prop);
6337
6338			if (nvpair_type(elem) == DATA_TYPE_STRING) {
6339				ASSERT(proptype == PROP_TYPE_STRING);
6340				strval = fnvpair_value_string(elem);
6341				VERIFY0(zap_update(mos,
6342				    spa->spa_pool_props_object, propname,
6343				    1, strlen(strval) + 1, strval, tx));
6344				spa_history_log_internal(spa, "set", tx,
6345				    "%s=%s", nvpair_name(elem), strval);
6346			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
6347				intval = fnvpair_value_uint64(elem);
6348
6349				if (proptype == PROP_TYPE_INDEX) {
6350					const char *unused;
6351					VERIFY0(zpool_prop_index_to_string(
6352					    prop, intval, &unused));
6353				}
6354				VERIFY0(zap_update(mos,
6355				    spa->spa_pool_props_object, propname,
6356				    8, 1, &intval, tx));
6357				spa_history_log_internal(spa, "set", tx,
6358				    "%s=%lld", nvpair_name(elem), intval);
6359			} else {
6360				ASSERT(0); /* not allowed */
6361			}
6362
6363			switch (prop) {
6364			case ZPOOL_PROP_DELEGATION:
6365				spa->spa_delegation = intval;
6366				break;
6367			case ZPOOL_PROP_BOOTFS:
6368				spa->spa_bootfs = intval;
6369				break;
6370			case ZPOOL_PROP_FAILUREMODE:
6371				spa->spa_failmode = intval;
6372				break;
6373			case ZPOOL_PROP_AUTOEXPAND:
6374				spa->spa_autoexpand = intval;
6375				if (tx->tx_txg != TXG_INITIAL)
6376					spa_async_request(spa,
6377					    SPA_ASYNC_AUTOEXPAND);
6378				break;
6379			case ZPOOL_PROP_DEDUPDITTO:
6380				spa->spa_dedup_ditto = intval;
6381				break;
6382			default:
6383				break;
6384			}
6385		}
6386
6387	}
6388
6389	mutex_exit(&spa->spa_props_lock);
6390}
6391
6392/*
6393 * Perform one-time upgrade on-disk changes.  spa_version() does not
6394 * reflect the new version this txg, so there must be no changes this
6395 * txg to anything that the upgrade code depends on after it executes.
6396 * Therefore this must be called after dsl_pool_sync() does the sync
6397 * tasks.
6398 */
6399static void
6400spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
6401{
6402	dsl_pool_t *dp = spa->spa_dsl_pool;
6403
6404	ASSERT(spa->spa_sync_pass == 1);
6405
6406	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
6407
6408	if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
6409	    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
6410		dsl_pool_create_origin(dp, tx);
6411
6412		/* Keeping the origin open increases spa_minref */
6413		spa->spa_minref += 3;
6414	}
6415
6416	if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
6417	    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
6418		dsl_pool_upgrade_clones(dp, tx);
6419	}
6420
6421	if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
6422	    spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
6423		dsl_pool_upgrade_dir_clones(dp, tx);
6424
6425		/* Keeping the freedir open increases spa_minref */
6426		spa->spa_minref += 3;
6427	}
6428
6429	if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
6430	    spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
6431		spa_feature_create_zap_objects(spa, tx);
6432	}
6433
6434	/*
6435	 * The LZ4_COMPRESS feature's behaviour was changed to
6436	 * activate_on_enable when the ability to use lz4 compression for
6437	 * metadata was added.  Old pools that have this feature enabled
6438	 * must be upgraded to have this feature active.
6439	 */
6440	if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
6441		boolean_t lz4_en = spa_feature_is_enabled(spa,
6442		    SPA_FEATURE_LZ4_COMPRESS);
6443		boolean_t lz4_ac = spa_feature_is_active(spa,
6444		    SPA_FEATURE_LZ4_COMPRESS);
6445
6446		if (lz4_en && !lz4_ac)
6447			spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
6448	}
6449	rrw_exit(&dp->dp_config_rwlock, FTAG);
6450}
6451
6452/*
6453 * Sync the specified transaction group.  New blocks may be dirtied as
6454 * part of the process, so we iterate until it converges.
6455 */
6456void
6457spa_sync(spa_t *spa, uint64_t txg)
6458{
6459	dsl_pool_t *dp = spa->spa_dsl_pool;
6460	objset_t *mos = spa->spa_meta_objset;
6461	bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
6462	vdev_t *rvd = spa->spa_root_vdev;
6463	vdev_t *vd;
6464	dmu_tx_t *tx;
6465	int error;
6466
6467	VERIFY(spa_writeable(spa));
6468
6469	/*
6470	 * Lock out configuration changes.
6471	 */
6472	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
6473
6474	spa->spa_syncing_txg = txg;
6475	spa->spa_sync_pass = 0;
6476
6477	/*
6478	 * If there are any pending vdev state changes, convert them
6479	 * into config changes that go out with this transaction group.
6480	 */
6481	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
6482	while (list_head(&spa->spa_state_dirty_list) != NULL) {
6483		/*
6484		 * We need the write lock here because, for aux vdevs,
6485		 * calling vdev_config_dirty() modifies sav_config.
6486		 * This is ugly and will become unnecessary when we
6487		 * eliminate the aux vdev wart by integrating all vdevs
6488		 * into the root vdev tree.
6489		 */
6490		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
6491		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
6492		while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
6493			vdev_state_clean(vd);
6494			vdev_config_dirty(vd);
6495		}
6496		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
6497		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
6498	}
6499	spa_config_exit(spa, SCL_STATE, FTAG);
6500
6501	tx = dmu_tx_create_assigned(dp, txg);
6502
6503	spa->spa_sync_starttime = gethrtime();
6504#ifdef illumos
6505	VERIFY(cyclic_reprogram(spa->spa_deadman_cycid,
6506	    spa->spa_sync_starttime + spa->spa_deadman_synctime));
6507#else	/* FreeBSD */
6508#ifdef _KERNEL
6509	callout_reset(&spa->spa_deadman_cycid,
6510	    hz * spa->spa_deadman_synctime / NANOSEC, spa_deadman, spa);
6511#endif
6512#endif
6513
6514	/*
6515	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
6516	 * set spa_deflate if we have no raid-z vdevs.
6517	 */
6518	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
6519	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
6520		int i;
6521
6522		for (i = 0; i < rvd->vdev_children; i++) {
6523			vd = rvd->vdev_child[i];
6524			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
6525				break;
6526		}
6527		if (i == rvd->vdev_children) {
6528			spa->spa_deflate = TRUE;
6529			VERIFY(0 == zap_add(spa->spa_meta_objset,
6530			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
6531			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
6532		}
6533	}
6534
6535	/*
6536	 * If anything has changed in this txg, or if someone is waiting
6537	 * for this txg to sync (e.g., spa_vdev_remove()), push the
6538	 * deferred frees from the previous txg.  If not, leave them
6539	 * alone so that we don't generate work on an otherwise idle
6540	 * system.
6541	 */
6542	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
6543	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
6544	    !txg_list_empty(&dp->dp_sync_tasks, txg) ||
6545	    ((dsl_scan_active(dp->dp_scan) ||
6546	    txg_sync_waiting(dp)) && !spa_shutting_down(spa))) {
6547		spa_sync_deferred_frees(spa, tx);
6548	}
6549
6550	/*
6551	 * Iterate to convergence.
6552	 */
6553	do {
6554		int pass = ++spa->spa_sync_pass;
6555
6556		spa_sync_config_object(spa, tx);
6557		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
6558		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
6559		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
6560		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
6561		spa_errlog_sync(spa, txg);
6562		dsl_pool_sync(dp, txg);
6563
6564		if (pass < zfs_sync_pass_deferred_free) {
6565			spa_sync_frees(spa, free_bpl, tx);
6566		} else {
6567			bplist_iterate(free_bpl, bpobj_enqueue_cb,
6568			    &spa->spa_deferred_bpobj, tx);
6569		}
6570
6571		ddt_sync(spa, txg);
6572		dsl_scan_sync(dp, tx);
6573
6574		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
6575			vdev_sync(vd, txg);
6576
6577		if (pass == 1)
6578			spa_sync_upgrades(spa, tx);
6579
6580	} while (dmu_objset_is_dirty(mos, txg));
6581
6582	/*
6583	 * Rewrite the vdev configuration (which includes the uberblock)
6584	 * to commit the transaction group.
6585	 *
6586	 * If there are no dirty vdevs, we sync the uberblock to a few
6587	 * random top-level vdevs that are known to be visible in the
6588	 * config cache (see spa_vdev_add() for a complete description).
6589	 * If there *are* dirty vdevs, sync the uberblock to all vdevs.
6590	 */
6591	for (;;) {
6592		/*
6593		 * We hold SCL_STATE to prevent vdev open/close/etc.
6594		 * while we're attempting to write the vdev labels.
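		 * If the first vdev_config_sync() attempt fails, we retry
		 * once with its last argument set to B_TRUE; if that also
		 * fails, we suspend the pool, wait for it to resume, and
		 * start the loop over.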
6595 */ 6596 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6597 6598 if (list_is_empty(&spa->spa_config_dirty_list)) { 6599 vdev_t *svd[SPA_DVAS_PER_BP]; 6600 int svdcount = 0; 6601 int children = rvd->vdev_children; 6602 int c0 = spa_get_random(children); 6603 6604 for (int c = 0; c < children; c++) { 6605 vd = rvd->vdev_child[(c0 + c) % children]; 6606 if (vd->vdev_ms_array == 0 || vd->vdev_islog) 6607 continue; 6608 svd[svdcount++] = vd; 6609 if (svdcount == SPA_DVAS_PER_BP) 6610 break; 6611 } 6612 error = vdev_config_sync(svd, svdcount, txg, B_FALSE); 6613 if (error != 0) 6614 error = vdev_config_sync(svd, svdcount, txg, 6615 B_TRUE); 6616 } else { 6617 error = vdev_config_sync(rvd->vdev_child, 6618 rvd->vdev_children, txg, B_FALSE); 6619 if (error != 0) 6620 error = vdev_config_sync(rvd->vdev_child, 6621 rvd->vdev_children, txg, B_TRUE); 6622 } 6623 6624 if (error == 0) 6625 spa->spa_last_synced_guid = rvd->vdev_guid; 6626 6627 spa_config_exit(spa, SCL_STATE, FTAG); 6628 6629 if (error == 0) 6630 break; 6631 zio_suspend(spa, NULL); 6632 zio_resume_wait(spa); 6633 } 6634 dmu_tx_commit(tx); 6635 6636#ifdef illumos 6637 VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY)); 6638#else /* FreeBSD */ 6639#ifdef _KERNEL 6640 callout_drain(&spa->spa_deadman_cycid); 6641#endif 6642#endif 6643 6644 /* 6645 * Clear the dirty config list. 6646 */ 6647 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 6648 vdev_config_clean(vd); 6649 6650 /* 6651 * Now that the new config has synced transactionally, 6652 * let it become visible to the config cache. 6653 */ 6654 if (spa->spa_config_syncing != NULL) { 6655 spa_config_set(spa, spa->spa_config_syncing); 6656 spa->spa_config_txg = txg; 6657 spa->spa_config_syncing = NULL; 6658 } 6659 6660 spa->spa_ubsync = spa->spa_uberblock; 6661 6662 dsl_pool_sync_done(dp, txg); 6663 6664 /* 6665 * Update usable space statistics. 6666 */ 6667 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 6668 vdev_sync_done(vd, txg); 6669 6670 spa_update_dspace(spa); 6671 6672 /* 6673 * It had better be the case that we didn't dirty anything 6674 * since vdev_config_sync(). 6675 */ 6676 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 6677 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 6678 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 6679 6680 spa->spa_sync_pass = 0; 6681 6682 spa_config_exit(spa, SCL_CONFIG, FTAG); 6683 6684 spa_handle_ignored_writes(spa); 6685 6686 /* 6687 * If any async tasks have been requested, kick them off. 6688 */ 6689 spa_async_dispatch(spa); 6690 spa_async_dispatch_vd(spa); 6691} 6692 6693/* 6694 * Sync all pools. We don't want to hold the namespace lock across these 6695 * operations, so we take a reference on the spa_t and drop the lock during the 6696 * sync. 
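 * The reference taken here keeps the spa_t from being removed out from
 * under us while spa_namespace_lock is dropped.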
6697 */ 6698void 6699spa_sync_allpools(void) 6700{ 6701 spa_t *spa = NULL; 6702 mutex_enter(&spa_namespace_lock); 6703 while ((spa = spa_next(spa)) != NULL) { 6704 if (spa_state(spa) != POOL_STATE_ACTIVE || 6705 !spa_writeable(spa) || spa_suspended(spa)) 6706 continue; 6707 spa_open_ref(spa, FTAG); 6708 mutex_exit(&spa_namespace_lock); 6709 txg_wait_synced(spa_get_dsl(spa), 0); 6710 mutex_enter(&spa_namespace_lock); 6711 spa_close(spa, FTAG); 6712 } 6713 mutex_exit(&spa_namespace_lock); 6714} 6715 6716/* 6717 * ========================================================================== 6718 * Miscellaneous routines 6719 * ========================================================================== 6720 */ 6721 6722/* 6723 * Remove all pools in the system. 6724 */ 6725void 6726spa_evict_all(void) 6727{ 6728 spa_t *spa; 6729 6730 /* 6731 * Remove all cached state. All pools should be closed now, 6732 * so every spa in the AVL tree should be unreferenced. 6733 */ 6734 mutex_enter(&spa_namespace_lock); 6735 while ((spa = spa_next(NULL)) != NULL) { 6736 /* 6737 * Stop async tasks. The async thread may need to detach 6738 * a device that's been replaced, which requires grabbing 6739 * spa_namespace_lock, so we must drop it here. 6740 */ 6741 spa_open_ref(spa, FTAG); 6742 mutex_exit(&spa_namespace_lock); 6743 spa_async_suspend(spa); 6744 mutex_enter(&spa_namespace_lock); 6745 spa_close(spa, FTAG); 6746 6747 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 6748 spa_unload(spa); 6749 spa_deactivate(spa); 6750 } 6751 spa_remove(spa); 6752 } 6753 mutex_exit(&spa_namespace_lock); 6754} 6755 6756vdev_t * 6757spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 6758{ 6759 vdev_t *vd; 6760 int i; 6761 6762 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 6763 return (vd); 6764 6765 if (aux) { 6766 for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 6767 vd = spa->spa_l2cache.sav_vdevs[i]; 6768 if (vd->vdev_guid == guid) 6769 return (vd); 6770 } 6771 6772 for (i = 0; i < spa->spa_spares.sav_count; i++) { 6773 vd = spa->spa_spares.sav_vdevs[i]; 6774 if (vd->vdev_guid == guid) 6775 return (vd); 6776 } 6777 } 6778 6779 return (NULL); 6780} 6781 6782void 6783spa_upgrade(spa_t *spa, uint64_t version) 6784{ 6785 ASSERT(spa_writeable(spa)); 6786 6787 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6788 6789 /* 6790 * This should only be called for a non-faulted pool, and since a 6791 * future version would result in an unopenable pool, this shouldn't be 6792 * possible. 6793 */ 6794 ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); 6795 ASSERT3U(version, >=, spa->spa_uberblock.ub_version); 6796 6797 spa->spa_uberblock.ub_version = version; 6798 vdev_config_dirty(spa->spa_root_vdev); 6799 6800 spa_config_exit(spa, SCL_ALL, FTAG); 6801 6802 txg_wait_synced(spa_get_dsl(spa), 0); 6803} 6804 6805boolean_t 6806spa_has_spare(spa_t *spa, uint64_t guid) 6807{ 6808 int i; 6809 uint64_t spareguid; 6810 spa_aux_vdev_t *sav = &spa->spa_spares; 6811 6812 for (i = 0; i < sav->sav_count; i++) 6813 if (sav->sav_vdevs[i]->vdev_guid == guid) 6814 return (B_TRUE); 6815 6816 for (i = 0; i < sav->sav_npending; i++) { 6817 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 6818 &spareguid) == 0 && spareguid == guid) 6819 return (B_TRUE); 6820 } 6821 6822 return (B_FALSE); 6823} 6824 6825/* 6826 * Check if a pool has an active shared spare device. 
6827 * Note: an active spare's reference count is 2: as a spare and as a replacing vdev.
6828 */
6829static boolean_t
6830spa_has_active_shared_spare(spa_t *spa)
6831{
6832	int i, refcnt;
6833	uint64_t pool;
6834	spa_aux_vdev_t *sav = &spa->spa_spares;
6835
6836	for (i = 0; i < sav->sav_count; i++) {
6837		if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
6838		    &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
6839		    refcnt > 2)
6840			return (B_TRUE);
6841	}
6842
6843	return (B_FALSE);
6844}
6845
6846/*
6847 * Post a sysevent corresponding to the given event.  The 'name' must be one of
6848 * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
6849 * filled in from the spa and (optionally) the vdev.  This doesn't do anything
6850 * in the userland libzpool, as we don't want consumers to misinterpret ztest
6851 * or zdb as real changes.
6852 */
6853void
6854spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
6855{
6856#ifdef _KERNEL
6857	sysevent_t		*ev;
6858	sysevent_attr_list_t	*attr = NULL;
6859	sysevent_value_t	value;
6860	sysevent_id_t		eid;
6861
6862	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
6863	    SE_SLEEP);
6864
6865	value.value_type = SE_DATA_TYPE_STRING;
6866	value.value.sv_string = spa_name(spa);
6867	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
6868		goto done;
6869
6870	value.value_type = SE_DATA_TYPE_UINT64;
6871	value.value.sv_uint64 = spa_guid(spa);
6872	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
6873		goto done;
6874
6875	if (vd) {
6876		value.value_type = SE_DATA_TYPE_UINT64;
6877		value.value.sv_uint64 = vd->vdev_guid;
6878		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
6879		    SE_SLEEP) != 0)
6880			goto done;
6881
6882		if (vd->vdev_path) {
6883			value.value_type = SE_DATA_TYPE_STRING;
6884			value.value.sv_string = vd->vdev_path;
6885			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
6886			    &value, SE_SLEEP) != 0)
6887				goto done;
6888		}
6889	}
6890
6891	if (sysevent_attach_attributes(ev, attr) != 0)
6892		goto done;
6893	attr = NULL;
6894
6895	(void) log_sysevent(ev, SE_SLEEP, &eid);
6896
6897done:
6898	if (attr)
6899		sysevent_free_attr(attr);
6900	sysevent_free(ev);
6901#endif
6902}
6903
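/*
 * For example, a minimal sketch of posting an event (the caller and context
 * are hypothetical; ESC_ZFS_VDEV_REMOVE is one of the event definitions in
 * sys/sysevent/eventdefs.h):
 *
 *	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
 */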