spa.c revision 294334
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013 by Delphix. All rights reserved.
 * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 * Copyright 2013 Saso Kiselkov. All rights reserved.
 */

/*
 * SPA: Storage Pool Allocator
 *
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/ddt.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/spa_boot.h>
#include <sys/zfs_ioctl.h>
#include <sys/dsl_scan.h>
#include <sys/dmu_send.h>
#include <sys/dsl_destroy.h>
#include <sys/dsl_userhold.h>
#include <sys/zfeature.h>
#include <sys/zvol.h>
#include <sys/trim_map.h>

#ifdef _KERNEL
#include <sys/callb.h>
#include <sys/cpupart.h>
#include <sys/zone.h>
#endif /* _KERNEL */

#include "zfs_prop.h"
#include "zfs_comutil.h"

/* Check hostid on import? */
static int check_hostid = 1;

SYSCTL_DECL(_vfs_zfs);
TUNABLE_INT("vfs.zfs.check_hostid", &check_hostid);
SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RW, &check_hostid, 0,
    "Check hostid on import?");

/*
 * The interval, in seconds, at which failed configuration cache file writes
 * should be retried.
 */
static int zfs_ccw_retry_interval = 300;

typedef enum zti_modes {
	ZTI_MODE_FIXED,		/* value is # of threads (min 1) */
	ZTI_MODE_BATCH,		/* cpu-intensive; value is ignored */
	ZTI_MODE_NULL,		/* don't create a taskq */
	ZTI_NMODES
} zti_modes_t;

#define	ZTI_P(n, q)	{ ZTI_MODE_FIXED, (n), (q) }
#define	ZTI_BATCH	{ ZTI_MODE_BATCH, 0, 1 }
#define	ZTI_NULL	{ ZTI_MODE_NULL, 0, 0 }

#define	ZTI_N(n)	ZTI_P(n, 1)
#define	ZTI_ONE		ZTI_N(1)

typedef struct zio_taskq_info {
	zti_modes_t zti_mode;
	uint_t zti_value;
	uint_t zti_count;
} zio_taskq_info_t;

static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
	"issue", "issue_high", "intr", "intr_high"
};

/*
 * This table defines the taskq settings for each ZFS I/O type. When
 * initializing a pool, we use this table to create an appropriately sized
 * taskq. Some operations are low volume and therefore have a small, static
 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
 * macros. Other operations process a large amount of data; the ZTI_BATCH
 * macro causes us to create a taskq oriented for throughput. Some operations
 * are so high frequency and short-lived that the taskq itself can become a
 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
 * additional degree of parallelism specified by the number of threads per
 * taskq and the number of taskqs; when dispatching an event in this case, the
 * particular taskq is chosen at random.
 *
 * The different taskq priorities are to handle the different contexts (issue
 * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
 * need to be handled with minimum delay.
 */
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* NULL */
	{ ZTI_N(8),	ZTI_NULL,	ZTI_BATCH,	ZTI_NULL }, /* READ */
	{ ZTI_BATCH,	ZTI_N(5),	ZTI_N(8),	ZTI_N(5) }, /* WRITE */
	{ ZTI_P(12, 8),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FREE */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* CLAIM */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* IOCTL */
};
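
/*
 * Editorial example (not part of the original source): with the macro
 * definitions above, the FREE/ISSUE entry ZTI_P(12, 8) expands to
 *
 *	{ ZTI_MODE_FIXED, 12, 8 }
 *
 * i.e. eight discrete taskqs of twelve threads each, with dispatches
 * spread across the taskqs.  Likewise ZTI_N(8) is ZTI_P(8, 1) (one
 * taskq, eight threads), ZTI_BATCH creates a single taskq sized as a
 * percentage of the CPUs (see zio_taskq_batch_pct below), and ZTI_NULL
 * creates no taskq for that type/context at all.
 */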

static void spa_sync_version(void *arg, dmu_tx_t *tx);
static void spa_sync_props(void *arg, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);
static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
    spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
    char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);

uint_t		zio_taskq_batch_pct = 75;	/* 1 thread per cpu in pset */
#ifdef PSRSET_BIND
id_t		zio_taskq_psrset_bind = PS_NONE;
#endif
#ifdef SYSDC
boolean_t	zio_taskq_sysdc = B_TRUE;	/* use SDC scheduling class */
#endif
uint_t		zio_taskq_basedc = 80;		/* base duty cycle */

boolean_t	spa_create_process = B_TRUE;	/* no process ==> no sysdc */
extern int	zfs_sync_pass_deferred_free;

#ifndef illumos
extern void spa_deadman(void *arg);
#endif

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
	const char *propname = zpool_prop_to_name(prop);
	nvlist_t *propval;

	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);

	if (strval != NULL)
		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
	else
		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);

	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
	nvlist_free(propval);
}
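
/*
 * Usage sketch (editorial): callers supply either a string value or an
 * integer value, never both.  spa_prop_get_config() below, for example,
 * adds the pool name as a string and the pool size as an integer:
 *
 *	spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
 *	spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
 *
 * Each call adds a nested nvlist keyed by the property name, holding a
 * ZPROP_SOURCE and a ZPROP_VALUE pair.
 */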

/*
 * Get property values from the spa configuration.
 */
static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
	vdev_t *rvd = spa->spa_root_vdev;
	dsl_pool_t *pool = spa->spa_dsl_pool;
	uint64_t size, alloc, cap, version;
	zprop_source_t src = ZPROP_SRC_NONE;
	spa_config_dirent_t *dp;
	metaslab_class_t *mc = spa_normal_class(spa);

	ASSERT(MUTEX_HELD(&spa->spa_props_lock));

	if (rvd != NULL) {
		alloc = metaslab_class_get_alloc(spa_normal_class(spa));
		size = metaslab_class_get_space(spa_normal_class(spa));
		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
		    size - alloc, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
		    metaslab_class_fragmentation(mc), src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
		    metaslab_class_expandable_space(mc), src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
		    (spa_mode(spa) == FREAD), src);

		cap = (size == 0) ? 0 : (alloc * 100 / size);
		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
		    ddt_get_pool_dedup_ratio(spa), src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
		    rvd->vdev_state, src);

		version = spa_version(spa);
		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
			src = ZPROP_SRC_DEFAULT;
		else
			src = ZPROP_SRC_LOCAL;
		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
	}

	if (pool != NULL) {
		/*
		 * The $FREE directory was introduced in
		 * SPA_VERSION_DEADLISTS, so when opening pools created
		 * before this version, dp_free_dir will be NULL.
		 */
		if (pool->dp_free_dir != NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
			    dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
			    src);
		} else {
			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
			    NULL, 0, src);
		}

		if (pool->dp_leak_dir != NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
			    dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
			    src);
		} else {
			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
			    NULL, 0, src);
		}
	}

	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);

	if (spa->spa_comment != NULL) {
		spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
		    0, ZPROP_SRC_LOCAL);
	}

	if (spa->spa_root != NULL)
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
		    0, ZPROP_SRC_LOCAL);

	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
		    MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
	} else {
		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
		    SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
	}

	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
		if (dp->scd_path == NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    "none", 0, ZPROP_SRC_LOCAL);
		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
		}
	}
}
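
/*
 * Editorial sketch of the nvlist shape assembled above; every property
 * maps to a sub-nvlist built by spa_prop_add_list():
 *
 *	"size"    -> { ZPROP_SOURCE = ZPROP_SRC_NONE, ZPROP_VALUE = <u64> }
 *	"health"  -> { ZPROP_SOURCE = ZPROP_SRC_NONE, ZPROP_VALUE = <u64> }
 *	"version" -> { ZPROP_SOURCE = DEFAULT or LOCAL, ZPROP_VALUE = <u64> }
 */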

/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
	objset_t *mos = spa->spa_meta_objset;
	zap_cursor_t zc;
	zap_attribute_t za;
	int err;

	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	mutex_enter(&spa->spa_props_lock);

	/*
	 * Get properties from the spa config.
	 */
	spa_prop_get_config(spa, nvp);

	/* If no pool property object, no more prop to get. */
	if (mos == NULL || spa->spa_pool_props_object == 0) {
		mutex_exit(&spa->spa_props_lock);
		return (0);
	}

	/*
	 * Get properties from the MOS pool property object.
	 */
	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t intval = 0;
		char *strval = NULL;
		zprop_source_t src = ZPROP_SRC_DEFAULT;
		zpool_prop_t prop;

		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
			continue;

		switch (za.za_integer_length) {
		case 8:
			/* integer property */
			if (za.za_first_integer !=
			    zpool_prop_default_numeric(prop))
				src = ZPROP_SRC_LOCAL;

			if (prop == ZPOOL_PROP_BOOTFS) {
				dsl_pool_t *dp;
				dsl_dataset_t *ds = NULL;

				dp = spa_get_dsl(spa);
				dsl_pool_config_enter(dp, FTAG);
				if (err = dsl_dataset_hold_obj(dp,
				    za.za_first_integer, FTAG, &ds)) {
					dsl_pool_config_exit(dp, FTAG);
					break;
				}

				strval = kmem_alloc(
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
				    KM_SLEEP);
				dsl_dataset_name(ds, strval);
				dsl_dataset_rele(ds, FTAG);
				dsl_pool_config_exit(dp, FTAG);
			} else {
				strval = NULL;
				intval = za.za_first_integer;
			}

			spa_prop_add_list(*nvp, prop, strval, intval, src);

			if (strval != NULL)
				kmem_free(strval,
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);

			break;

		case 1:
			/* string property */
			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
			err = zap_lookup(mos, spa->spa_pool_props_object,
			    za.za_name, 1, za.za_num_integers, strval);
			if (err) {
				kmem_free(strval, za.za_num_integers);
				break;
			}
			spa_prop_add_list(*nvp, prop, strval, 0, src);
			kmem_free(strval, za.za_num_integers);
			break;

		default:
			break;
		}
	}
	zap_cursor_fini(&zc);
	mutex_exit(&spa->spa_props_lock);
out:
	if (err && err != ENOENT) {
		nvlist_free(*nvp);
		*nvp = NULL;
		return (err);
	}

	return (0);
}
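
/*
 * The loop above is the standard ZAP cursor idiom.  A minimal,
 * self-contained sketch of the pattern (consume_attribute() is a
 * hypothetical consumer, not a real function):
 *
 *	zap_cursor_t zc;
 *	zap_attribute_t za;
 *
 *	for (zap_cursor_init(&zc, mos, object);
 *	    zap_cursor_retrieve(&zc, &za) == 0;
 *	    zap_cursor_advance(&zc))
 *		consume_attribute(&za);
 *	zap_cursor_fini(&zc);
 */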

/*
 * Validate the given pool properties nvlist and modify the list
 * for the property values to be set.
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
	nvpair_t *elem;
	int error = 0, reset_bootfs = 0;
	uint64_t objnum = 0;
	boolean_t has_feature = B_FALSE;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
		uint64_t intval;
		char *strval, *slash, *check, *fname;
		const char *propname = nvpair_name(elem);
		zpool_prop_t prop = zpool_name_to_prop(propname);

		switch (prop) {
		case ZPROP_INVAL:
			if (!zpool_prop_feature(propname)) {
				error = SET_ERROR(EINVAL);
				break;
			}

			/*
			 * Sanitize the input.
			 */
			if (nvpair_type(elem) != DATA_TYPE_UINT64) {
				error = SET_ERROR(EINVAL);
				break;
			}

			if (nvpair_value_uint64(elem, &intval) != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			if (intval != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			fname = strchr(propname, '@') + 1;
			if (zfeature_lookup_name(fname, NULL) != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			has_feature = B_TRUE;
			break;

		case ZPOOL_PROP_VERSION:
			error = nvpair_value_uint64(elem, &intval);
			if (!error &&
			    (intval < spa_version(spa) ||
			    intval > SPA_VERSION_BEFORE_FEATURES ||
			    has_feature))
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_DELEGATION:
		case ZPOOL_PROP_AUTOREPLACE:
		case ZPOOL_PROP_LISTSNAPS:
		case ZPOOL_PROP_AUTOEXPAND:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > 1)
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_BOOTFS:
			/*
			 * If the pool version is less than SPA_VERSION_BOOTFS,
			 * or the pool is still being created (version == 0),
			 * the bootfs property cannot be set.
			 */
			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
				error = SET_ERROR(ENOTSUP);
				break;
			}

			/*
			 * Make sure the vdev config is bootable
			 */
			if (!vdev_is_bootable(spa->spa_root_vdev)) {
				error = SET_ERROR(ENOTSUP);
				break;
			}

			reset_bootfs = 1;

			error = nvpair_value_string(elem, &strval);

			if (!error) {
				objset_t *os;
				uint64_t propval;

				if (strval == NULL || strval[0] == '\0') {
					objnum = zpool_prop_default_numeric(
					    ZPOOL_PROP_BOOTFS);
					break;
				}

				if (error = dmu_objset_hold(strval, FTAG, &os))
					break;

				/*
				 * Must be ZPL, and its property settings
				 * must be supported by GRUB (compression
				 * is not gzip, and large blocks are not used).
				 */

				if (dmu_objset_type(os) != DMU_OST_ZFS) {
					error = SET_ERROR(ENOTSUP);
				} else if ((error =
				    dsl_prop_get_int_ds(dmu_objset_ds(os),
				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
				    &propval)) == 0 &&
				    !BOOTFS_COMPRESS_VALID(propval)) {
					error = SET_ERROR(ENOTSUP);
				} else if ((error =
				    dsl_prop_get_int_ds(dmu_objset_ds(os),
				    zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
				    &propval)) == 0 &&
				    propval > SPA_OLD_MAXBLOCKSIZE) {
					error = SET_ERROR(ENOTSUP);
				} else {
					objnum = dmu_objset_id(os);
				}
				dmu_objset_rele(os, FTAG);
			}
			break;

		case ZPOOL_PROP_FAILUREMODE:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
			    intval > ZIO_FAILURE_MODE_PANIC))
				error = SET_ERROR(EINVAL);

			/*
			 * This is a special case which only occurs when
			 * the pool has completely failed. This allows
			 * the user to change the in-core failmode property
			 * without syncing it out to disk (I/Os might
			 * currently be blocked). We do this by returning
			 * EIO to the caller (spa_prop_set) to trick it
			 * into thinking we encountered a property validation
			 * error.
			 */
			if (!error && spa_suspended(spa)) {
				spa->spa_failmode = intval;
				error = SET_ERROR(EIO);
			}
			break;

		case ZPOOL_PROP_CACHEFILE:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;

			if (strval[0] == '\0')
				break;

			if (strcmp(strval, "none") == 0)
				break;

			if (strval[0] != '/') {
				error = SET_ERROR(EINVAL);
				break;
			}

			slash = strrchr(strval, '/');
			ASSERT(slash != NULL);

			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
			    strcmp(slash, "/..") == 0)
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_COMMENT:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;
			for (check = strval; *check != '\0'; check++) {
				/*
				 * The kernel doesn't have an easy isprint()
				 * check. For this kernel check, we merely
				 * check ASCII apart from DEL. Fix this if
				 * there is an easy-to-use kernel isprint().
				 */
				if (*check >= 0x7f) {
					error = SET_ERROR(EINVAL);
					break;
				}
			}
			if (strlen(strval) > ZPROP_MAX_COMMENT)
				error = E2BIG;
			break;

		case ZPOOL_PROP_DEDUPDITTO:
			if (spa_version(spa) < SPA_VERSION_DEDUP)
				error = SET_ERROR(ENOTSUP);
			else
				error = nvpair_value_uint64(elem, &intval);
			if (error == 0 &&
			    intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
				error = SET_ERROR(EINVAL);
			break;
		}

		if (error)
			break;
	}

	if (!error && reset_bootfs) {
		error = nvlist_remove(props,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

		if (!error) {
			error = nvlist_add_uint64(props,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
		}
	}

	return (error);
}

void
spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
{
	char *cachefile;
	spa_config_dirent_t *dp;

	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
	    &cachefile) != 0)
		return;

	dp = kmem_alloc(sizeof (spa_config_dirent_t),
	    KM_SLEEP);

	if (cachefile[0] == '\0')
		dp->scd_path = spa_strdup(spa_config_path);
	else if (strcmp(cachefile, "none") == 0)
		dp->scd_path = NULL;
	else
		dp->scd_path = spa_strdup(cachefile);

	list_insert_head(&spa->spa_config_list, dp);
	if (need_sync)
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}

int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
	int error;
	nvpair_t *elem = NULL;
	boolean_t need_sync = B_FALSE;

	if ((error = spa_prop_validate(spa, nvp)) != 0)
		return (error);

	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
		zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));

		if (prop == ZPOOL_PROP_CACHEFILE ||
		    prop == ZPOOL_PROP_ALTROOT ||
		    prop == ZPOOL_PROP_READONLY)
			continue;

		if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) {
			uint64_t ver;

			if (prop == ZPOOL_PROP_VERSION) {
				VERIFY(nvpair_value_uint64(elem, &ver) == 0);
			} else {
				ASSERT(zpool_prop_feature(nvpair_name(elem)));
				ver = SPA_VERSION_FEATURES;
				need_sync = B_TRUE;
			}

			/* Save time if the version is already set. */
			if (ver == spa_version(spa))
				continue;

			/*
			 * In addition to the pool directory object, we might
			 * create the pool properties object, the features for
			 * read object, the features for write object, or the
			 * feature descriptions object.
			 */
			error = dsl_sync_task(spa->spa_name, NULL,
			    spa_sync_version, &ver,
			    6, ZFS_SPACE_CHECK_RESERVED);
			if (error)
				return (error);
			continue;
		}

		need_sync = B_TRUE;
		break;
	}

	if (need_sync) {
		return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
		    nvp, 6, ZFS_SPACE_CHECK_RESERVED));
	}

	return (0);
}

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}

/*ARGSUSED*/
static int
spa_change_guid_check(void *arg, dmu_tx_t *tx)
{
	uint64_t *newguid = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t vdev_state;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	vdev_state = rvd->vdev_state;
	spa_config_exit(spa, SCL_STATE, FTAG);

	if (vdev_state != VDEV_STATE_HEALTHY)
		return (SET_ERROR(ENXIO));

	ASSERT3U(spa_guid(spa), !=, *newguid);

	return (0);
}

static void
spa_change_guid_sync(void *arg, dmu_tx_t *tx)
{
	uint64_t *newguid = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	uint64_t oldguid;
	vdev_t *rvd = spa->spa_root_vdev;

	oldguid = spa_guid(spa);

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	rvd->vdev_guid = *newguid;
	rvd->vdev_guid_sum += (*newguid - oldguid);
	vdev_config_dirty(rvd);
	spa_config_exit(spa, SCL_STATE, FTAG);

	spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
	    oldguid, *newguid);
}
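
/*
 * Editorial note: the check/sync pair above is consumed by
 * dsl_sync_task(), which re-runs the check function in syncing context
 * and only then applies the sync function in the same txg.
 * spa_change_guid() below wires them up like this:
 *
 *	error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
 *	    spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);
 */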

/*
 * Change the GUID for the pool.  This is done so that we can later
 * re-import a pool built from a clone of our own vdevs.  We will modify
 * the root vdev's guid, our own pool guid, and then mark all of our
 * vdevs dirty.  Note that we must make sure that all our vdevs are
 * online when we do this, or else any vdevs that weren't present
 * would be orphaned from our pool.  We are also going to issue a
 * sysevent to update any watchers.
 */
int
spa_change_guid(spa_t *spa)
{
	int error;
	uint64_t guid;

	mutex_enter(&spa->spa_vdev_top_lock);
	mutex_enter(&spa_namespace_lock);
	guid = spa_generate_guid(NULL);

	error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
	    spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);

	if (error == 0) {
		spa_config_sync(spa, B_FALSE, B_TRUE);
		spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID);
	}

	mutex_exit(&spa_namespace_lock);
	mutex_exit(&spa->spa_vdev_top_lock);

	return (error);
}

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_phys_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

static void
spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
	const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
	enum zti_modes mode = ztip->zti_mode;
	uint_t value = ztip->zti_value;
	uint_t count = ztip->zti_count;
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
	char name[32];
	uint_t flags = 0;
	boolean_t batch = B_FALSE;

	if (mode == ZTI_MODE_NULL) {
		tqs->stqs_count = 0;
		tqs->stqs_taskq = NULL;
		return;
	}

	ASSERT3U(count, >, 0);

	tqs->stqs_count = count;
	tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);

	switch (mode) {
	case ZTI_MODE_FIXED:
		ASSERT3U(value, >=, 1);
		value = MAX(value, 1);
		break;

	case ZTI_MODE_BATCH:
		batch = B_TRUE;
		flags |= TASKQ_THREADS_CPU_PCT;
		value = zio_taskq_batch_pct;
		break;

	default:
		panic("unrecognized mode for %s_%s taskq (%u:%u) in "
		    "spa_activate()",
		    zio_type_name[t], zio_taskq_types[q], mode, value);
		break;
	}

	for (uint_t i = 0; i < count; i++) {
		taskq_t *tq;

		if (count > 1) {
			(void) snprintf(name, sizeof (name), "%s_%s_%u",
			    zio_type_name[t], zio_taskq_types[q], i);
		} else {
			(void) snprintf(name, sizeof (name), "%s_%s",
			    zio_type_name[t], zio_taskq_types[q]);
		}

#ifdef SYSDC
		if (zio_taskq_sysdc && spa->spa_proc != &p0) {
			if (batch)
				flags |= TASKQ_DC_BATCH;

			tq = taskq_create_sysdc(name, value, 50, INT_MAX,
			    spa->spa_proc, zio_taskq_basedc, flags);
		} else {
#endif
			pri_t pri = maxclsyspri;
			/*
			 * The write issue taskq can be extremely CPU
			 * intensive.  Run it at slightly lower priority
			 * than the other taskqs.
			 */
			if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
				pri--;

			tq = taskq_create_proc(name, value, pri, 50,
			    INT_MAX, spa->spa_proc, flags);
#ifdef SYSDC
		}
#endif

		tqs->stqs_taskq[i] = tq;
	}
}

static void
spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];

	if (tqs->stqs_taskq == NULL) {
		ASSERT0(tqs->stqs_count);
		return;
	}

	for (uint_t i = 0; i < tqs->stqs_count; i++) {
		ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
		taskq_destroy(tqs->stqs_taskq[i]);
	}

	kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
	tqs->stqs_taskq = NULL;
}

/*
 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
 * Note that a type may have multiple discrete taskqs to avoid lock contention
 * on the taskq itself. In that case we choose which taskq at random by using
 * the low bits of gethrtime().
 */
void
spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
{
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
	taskq_t *tq;

	ASSERT3P(tqs->stqs_taskq, !=, NULL);
	ASSERT3U(tqs->stqs_count, !=, 0);

	if (tqs->stqs_count == 1) {
		tq = tqs->stqs_taskq[0];
	} else {
#ifdef _KERNEL
		tq = tqs->stqs_taskq[cpu_ticks() % tqs->stqs_count];
#else
		tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count];
#endif
	}

	taskq_dispatch_ent(tq, func, arg, flags, ent);
}

static void
spa_create_zio_taskqs(spa_t *spa)
{
	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			spa_taskqs_init(spa, t, q);
		}
	}
}
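
/*
 * Editorial example: a caller dispatching a read-interrupt handler
 * indexes the taskq table as (ZIO_TYPE_READ, ZIO_TASKQ_INTR):
 *
 *	spa_taskq_dispatch_ent(spa, ZIO_TYPE_READ, ZIO_TASKQ_INTR,
 *	    my_func, my_arg, 0, &ent);
 *
 * (my_func, my_arg and ent are hypothetical.)  Per the zio_taskqs table
 * above, READ/INTR is ZTI_BATCH, so stqs_count is 1 and no random
 * taskq selection takes place for that combination.
 */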

#ifdef _KERNEL
#ifdef SPA_PROCESS
static void
spa_thread(void *arg)
{
	callb_cpr_t cprinfo;

	spa_t *spa = arg;
	user_t *pu = PTOU(curproc);

	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
	    spa->spa_name);

	ASSERT(curproc != &p0);
	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
	    "zpool-%s", spa->spa_name);
	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));

#ifdef PSRSET_BIND
	/* bind this thread to the requested psrset */
	if (zio_taskq_psrset_bind != PS_NONE) {
		pool_lock();
		mutex_enter(&cpu_lock);
		mutex_enter(&pidlock);
		mutex_enter(&curproc->p_lock);

		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
		    0, NULL, NULL) == 0) {
			curthread->t_bind_pset = zio_taskq_psrset_bind;
		} else {
			cmn_err(CE_WARN,
			    "Couldn't bind process for zfs pool \"%s\" to "
			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
		}

		mutex_exit(&curproc->p_lock);
		mutex_exit(&pidlock);
		mutex_exit(&cpu_lock);
		pool_unlock();
	}
#endif

#ifdef SYSDC
	if (zio_taskq_sysdc) {
		sysdc_thread_enter(curthread, 100, 0);
	}
#endif

	spa->spa_proc = curproc;
	spa->spa_did = curthread->t_did;

	spa_create_zio_taskqs(spa);

	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);

	spa->spa_proc_state = SPA_PROC_ACTIVE;
	cv_broadcast(&spa->spa_proc_cv);

	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);

	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
	spa->spa_proc_state = SPA_PROC_GONE;
	spa->spa_proc = &p0;
	cv_broadcast(&spa->spa_proc_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */

	mutex_enter(&curproc->p_lock);
	lwp_exit();
}
#endif	/* SPA_PROCESS */
#endif

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa, int mode)
{
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_mode = mode;

	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);

	/* Try to create a covering process */
	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
	ASSERT(spa->spa_proc == &p0);
	spa->spa_did = 0;

#ifdef SPA_PROCESS
	/* Only create a process if we're going to be around a while. */
	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
		    NULL, 0) == 0) {
			spa->spa_proc_state = SPA_PROC_CREATED;
			while (spa->spa_proc_state == SPA_PROC_CREATED) {
				cv_wait(&spa->spa_proc_cv,
				    &spa->spa_proc_lock);
			}
			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
			ASSERT(spa->spa_proc != &p0);
			ASSERT(spa->spa_did != 0);
		} else {
#ifdef _KERNEL
			cmn_err(CE_WARN,
			    "Couldn't create process for zfs pool \"%s\"\n",
			    spa->spa_name);
#endif
		}
	}
#endif	/* SPA_PROCESS */
	mutex_exit(&spa->spa_proc_lock);

	/* If we didn't create a process, we need to create our taskqs. */
	ASSERT(spa->spa_proc == &p0);
	if (spa->spa_proc == &p0) {
		spa_create_zio_taskqs(spa);
	}

	/*
	 * Start TRIM thread.
	 */
	trim_thread_create(spa);

	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_config_dirty_node));
	list_create(&spa->spa_evicting_os_list, sizeof (objset_t),
	    offsetof(objset_t, os_evicting_node));
	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_state_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);
	ASSERT(spa->spa_async_zio_root == NULL);
	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	/*
	 * Stop TRIM thread in case spa_unload() wasn't called directly
	 * before spa_deactivate().
	 */
	trim_thread_destroy(spa);

	spa_evicting_os_wait(spa);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_config_dirty_list);
	list_destroy(&spa->spa_evicting_os_list);
	list_destroy(&spa->spa_state_dirty_list);

	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			spa_taskqs_fini(spa, t, q);
		}
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues. Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;

	mutex_enter(&spa->spa_proc_lock);
	if (spa->spa_proc_state != SPA_PROC_NONE) {
		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
		cv_broadcast(&spa->spa_proc_cv);
		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
			ASSERT(spa->spa_proc != &p0);
			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
		}
		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
		spa->spa_proc_state = SPA_PROC_NONE;
	}
	ASSERT(spa->spa_proc == &p0);
	mutex_exit(&spa->spa_proc_lock);

#ifdef SPA_PROCESS
	/*
	 * We want to make sure spa_thread() has actually exited the ZFS
	 * module, so that the module can't be unloaded out from underneath
	 * it.
	 */
	if (spa->spa_did != 0) {
		thread_join(spa->spa_did);
		spa->spa_did = 0;
	}
#endif	/* SPA_PROCESS */
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately. This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state. This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children);

	if (error == ENOENT)
		return (0);

	if (error) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (SET_ERROR(EINVAL));
	}

	for (int c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}
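
/*
 * Editorial sketch of the nvlist layout spa_config_parse() walks: each
 * interior vdev carries a ZPOOL_CONFIG_CHILDREN array, and the
 * recursion bottoms out at leaf vdevs.  For a two-way mirror:
 *
 *	root
 *	  `- children[0]: type = "mirror"
 *	       |- children[0]: type = "disk", path = ...
 *	       `- children[1]: type = "disk", path = ...
 */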

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	/*
	 * Stop TRIM thread.
	 */
	trim_thread_destroy(spa);

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding async I/O to complete.
	 */
	if (spa->spa_async_zio_root != NULL) {
		for (int i = 0; i < max_ncpus; i++)
			(void) zio_wait(spa->spa_async_zio_root[i]);
		kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
		spa->spa_async_zio_root = NULL;
	}

	bpobj_close(&spa->spa_deferred_bpobj);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
		spa->spa_meta_objset = NULL;
	}

	ddt_unload(spa);

	/*
	 * Drop and purge level 2 cache
	 */
	spa_l2cache_drop(spa);

	for (i = 0; i < spa->spa_spares.sav_count; i++)
		vdev_free(spa->spa_spares.sav_vdevs[i]);
	if (spa->spa_spares.sav_vdevs) {
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));
		spa->spa_spares.sav_vdevs = NULL;
	}
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
	}
	spa->spa_spares.sav_count = 0;

	for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
		vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
	}
	if (spa->spa_l2cache.sav_vdevs) {
		kmem_free(spa->spa_l2cache.sav_vdevs,
		    spa->spa_l2cache.sav_count * sizeof (void *));
		spa->spa_l2cache.sav_vdevs = NULL;
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
	}
	spa->spa_l2cache.sav_count = 0;

	spa->spa_async_suspended = 0;

	if (spa->spa_comment != NULL) {
		spa_strfree(spa->spa_comment);
		spa->spa_comment = NULL;
	}

	spa_config_exit(spa, SCL_ALL, FTAG);
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool. When this is called, we have some form of basic information in
 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		vd = spa->spa_spares.sav_vdevs[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL && tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares.sav_vdevs)
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));

	if (spa->spa_spares.sav_config == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_spares.sav_count = (int)nspares;
	spa->spa_spares.sav_vdevs = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process. For each spare, there are potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in). During this phase we open and
	 * validate each vdev on the spare list. If the vdev also exists in the
	 * active configuration, then we also mark this vdev as an active spare.
	 */
	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares.sav_vdevs[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev. Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise). Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		vd->vdev_top = vd;
		vd->vdev_aux = &spa->spa_spares;

		if (vdev_open(vd) != 0)
			continue;

		if (vdev_validate_aux(vd) == 0)
			spa_spare_add(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		spares[i] = vdev_config_generate(spa,
		    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}
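
/*
 * Editorial note: a spare that is also spared-in is therefore
 * represented by two vdev_t's -- 'vd' on the sav_vdevs list built here
 * (validation and config generation only) and 'tvd' in the live vdev
 * tree, found via spa_lookup_by_guid().  Only 'tvd' is ever marked
 * active via spa_spare_activate().
 */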

/*
 * Load (or re-load) the current list of vdevs describing the active l2cache
 * for this pool. When this is called, we have some form of basic information
 * in 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 * Devices which are already active have their details maintained, and are
 * not re-opened.
 */
static void
spa_load_l2cache(spa_t *spa)
{
	nvlist_t **l2cache;
	uint_t nl2cache;
	int i, j, oldnvdevs;
	uint64_t guid;
	vdev_t *vd, **oldvdevs, **newvdevs;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (sav->sav_config != NULL) {
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
	} else {
		nl2cache = 0;
		newvdevs = NULL;
	}

	oldvdevs = sav->sav_vdevs;
	oldnvdevs = sav->sav_count;
	sav->sav_vdevs = NULL;
	sav->sav_count = 0;

	/*
	 * Process new nvlist of vdevs.
	 */
	for (i = 0; i < nl2cache; i++) {
		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
		    &guid) == 0);

		newvdevs[i] = NULL;
		for (j = 0; j < oldnvdevs; j++) {
			vd = oldvdevs[j];
			if (vd != NULL && guid == vd->vdev_guid) {
				/*
				 * Retain previous vdev for add/remove ops.
				 */
				newvdevs[i] = vd;
				oldvdevs[j] = NULL;
				break;
			}
		}

		if (newvdevs[i] == NULL) {
			/*
			 * Create new vdev
			 */
			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
			    VDEV_ALLOC_L2CACHE) == 0);
			ASSERT(vd != NULL);
			newvdevs[i] = vd;

			/*
			 * Commit this vdev as an l2cache device,
			 * even if it fails to open.
			 */
			spa_l2cache_add(vd);

			vd->vdev_top = vd;
			vd->vdev_aux = sav;

			spa_l2cache_activate(vd);

			if (vdev_open(vd) != 0)
				continue;

			(void) vdev_validate_aux(vd);

			if (!vdev_is_dead(vd))
				l2arc_add_vdev(spa, vd);
		}
	}

	/*
	 * Purge vdevs that were dropped
	 */
	for (i = 0; i < oldnvdevs; i++) {
		uint64_t pool;

		vd = oldvdevs[i];
		if (vd != NULL) {
			ASSERT(vd->vdev_isl2cache);

			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
			    pool != 0ULL && l2arc_vdev_present(vd))
				l2arc_remove_vdev(vd);
			vdev_clear_stats(vd);
			vdev_free(vd);
		}
	}

	if (oldvdevs)
		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));

	if (sav->sav_config == NULL)
		goto out;

	sav->sav_vdevs = newvdevs;
	sav->sav_count = (int)nl2cache;

	/*
	 * Recompute the stashed list of l2cache devices, with status
	 * information this time.
	 */
	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
	for (i = 0; i < sav->sav_count; i++)
		l2cache[i] = vdev_config_generate(spa,
		    sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
out:
	for (i = 0; i < sav->sav_count; i++)
		nvlist_free(l2cache[i]);
	if (sav->sav_count)
		kmem_free(l2cache, sav->sav_count * sizeof (void *));
}

static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;
	*value = NULL;

	error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db);
	if (error != 0)
		return (error);

	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
	    DMU_READ_PREFETCH);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	kmem_free(packed, nvsize);

	return (error);
}
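
/*
 * Editorial sketch of the on-disk layout load_nvlist() assumes: the
 * object's bonus buffer holds the packed length as a uint64_t and the
 * object body holds the packed nvlist itself:
 *
 *	bonus:	[ nvsize (uint64_t) ]
 *	data:	[ packed nvlist, nvsize bytes ]
 *
 * A caller would use it as, e.g. (obj is a hypothetical object number):
 *
 *	error = load_nvlist(spa, obj, &nv);
 */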

/*
 * Checks to see if the given vdev could not be opened, in which case we post a
 * sysevent to notify the autoreplace code that the device has been removed.
 */
static void
spa_check_removed(vdev_t *vd)
{
	for (int c = 0; c < vd->vdev_children; c++)
		spa_check_removed(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
	    !vd->vdev_ishole) {
		zfs_post_autoreplace(vd->vdev_spa, vd);
		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
	}
}

/*
 * Validate the current config against the MOS config
 */
static boolean_t
spa_config_valid(spa_t *spa, nvlist_t *config)
{
	vdev_t *mrvd, *rvd = spa->spa_root_vdev;
	nvlist_t *nv;

	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);

	ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);

	/*
	 * If we're doing a normal import, then build up any additional
	 * diagnostic information about missing devices in this config.
	 * We'll pass this up to the user for further processing.
	 */
	if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
		nvlist_t **child, *nv;
		uint64_t idx = 0;

		child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
		    KM_SLEEP);
		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);

		for (int c = 0; c < rvd->vdev_children; c++) {
			vdev_t *tvd = rvd->vdev_child[c];
			vdev_t *mtvd = mrvd->vdev_child[c];

			if (tvd->vdev_ops == &vdev_missing_ops &&
			    mtvd->vdev_ops != &vdev_missing_ops &&
			    mtvd->vdev_islog)
				child[idx++] = vdev_config_generate(spa, mtvd,
				    B_FALSE, 0);
		}

		if (idx) {
			VERIFY(nvlist_add_nvlist_array(nv,
			    ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
			VERIFY(nvlist_add_nvlist(spa->spa_load_info,
			    ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);

			for (int i = 0; i < idx; i++)
				nvlist_free(child[i]);
		}
		nvlist_free(nv);
		kmem_free(child, rvd->vdev_children * sizeof (char **));
	}

	/*
	 * Compare the root vdev tree with the information we have
	 * from the MOS config (mrvd). Check each top-level vdev
	 * with the corresponding MOS config top-level (mtvd).
	 */
	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		vdev_t *mtvd = mrvd->vdev_child[c];

		/*
		 * Resolve any "missing" vdevs in the current configuration.
		 * If we find that the MOS config has more accurate information
		 * about the top-level vdev then use that vdev instead.
		 */
		if (tvd->vdev_ops == &vdev_missing_ops &&
		    mtvd->vdev_ops != &vdev_missing_ops) {

			if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG))
				continue;

			/*
			 * Device specific actions.
			 */
			if (mtvd->vdev_islog) {
				spa_set_log_state(spa, SPA_LOG_CLEAR);
			} else {
				/*
				 * XXX - once we have 'readonly' pool
				 * support we should be able to handle
				 * missing data devices by transitioning
				 * the pool to readonly.
				 */
				continue;
			}

			/*
			 * Swap the missing vdev with the data we were
			 * able to obtain from the MOS config.
			 */
			vdev_remove_child(rvd, tvd);
			vdev_remove_child(mrvd, mtvd);

			vdev_add_child(rvd, mtvd);
			vdev_add_child(mrvd, tvd);

			spa_config_exit(spa, SCL_ALL, FTAG);
			vdev_load(mtvd);
			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

			vdev_reopen(rvd);
		} else if (mtvd->vdev_islog) {
			/*
			 * Load the slog device's state from the MOS config
			 * since it's possible that the label does not
			 * contain the most up-to-date information.
			 */
			vdev_load_log_state(tvd, mtvd);
			vdev_reopen(tvd);
		}
	}
	vdev_free(mrvd);
	spa_config_exit(spa, SCL_ALL, FTAG);

	/*
	 * Ensure we were able to validate the config.
	 */
	return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
}

/*
 * Check for missing log devices
 */
static boolean_t
spa_check_logs(spa_t *spa)
{
	boolean_t rv = B_FALSE;
	dsl_pool_t *dp = spa_get_dsl(spa);

	switch (spa->spa_log_state) {
	case SPA_LOG_MISSING:
		/* need to recheck in case slog has been restored */
	case SPA_LOG_UNKNOWN:
		rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
		    zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0);
		if (rv)
			spa_set_log_state(spa, SPA_LOG_MISSING);
		break;
	}
	return (rv);
}

static boolean_t
spa_passivate_log(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;
	boolean_t slog_found = B_FALSE;

	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));

	if (!spa_has_slogs(spa))
		return (B_FALSE);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (tvd->vdev_islog) {
			metaslab_group_passivate(mg);
			slog_found = B_TRUE;
		}
	}

	return (slog_found);
}

static void
spa_activate_log(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (tvd->vdev_islog)
			metaslab_group_activate(mg);
	}
}

int
spa_offline_log(spa_t *spa)
{
	int error;

	error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
	    NULL, DS_FIND_CHILDREN);
	if (error == 0) {
		/*
		 * We successfully offlined the log device, sync out the
		 * current txg so that the "stubby" block can be removed
		 * by zil_sync().
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);
	}
	return (error);
}

static void
spa_aux_check_removed(spa_aux_vdev_t *sav)
{
	int i;

	for (i = 0; i < sav->sav_count; i++)
		spa_check_removed(sav->sav_vdevs[i]);
}

void
spa_claim_notify(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	if (zio->io_error)
		return;

	mutex_enter(&spa->spa_props_lock);	/* any mutex will do */
	if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
		spa->spa_claim_max_txg = zio->io_bp->blk_birth;
	mutex_exit(&spa->spa_props_lock);
}

typedef struct spa_load_error {
	uint64_t	sle_meta_count;
	uint64_t	sle_data_count;
} spa_load_error_t;

static void
spa_load_verify_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	spa_load_error_t *sle = zio->io_private;
	dmu_object_type_t type = BP_GET_TYPE(bp);
	int error = zio->io_error;
	spa_t *spa = zio->io_spa;

	if (error) {
		if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
		    type != DMU_OT_INTENT_LOG)
			atomic_inc_64(&sle->sle_meta_count);
		else
			atomic_inc_64(&sle->sle_data_count);
	}
	zio_data_buf_free(zio->io_data, zio->io_size);

	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_inflight--;
	cv_broadcast(&spa->spa_scrub_io_cv);
	mutex_exit(&spa->spa_scrub_lock);
}
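
/*
 * Editorial note on the throttle used by the verify path: the issuing
 * side (spa_load_verify_cb() below) blocks while spa_scrub_inflight is
 * at the cap, and the completion handler above decrements the count
 * and broadcasts:
 *
 *	issue:		while (inflight >= max) cv_wait(); inflight++;
 *	completion:	inflight--; cv_broadcast();
 */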

/*
 * Maximum number of concurrent scrub i/os to create while verifying
 * a pool during import.
 */
int spa_load_verify_maxinflight = 10000;
boolean_t spa_load_verify_metadata = B_TRUE;
boolean_t spa_load_verify_data = B_TRUE;

SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_maxinflight, CTLFLAG_RWTUN,
    &spa_load_verify_maxinflight, 0,
    "Maximum number of concurrent scrub I/Os to create while verifying a "
    "pool while importing it");

SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_metadata, CTLFLAG_RWTUN,
    &spa_load_verify_metadata, 0,
    "Check metadata on import?");

SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_data, CTLFLAG_RWTUN,
    &spa_load_verify_data, 0,
    "Check user data on import?");

/*ARGSUSED*/
static int
spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
	if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
		return (0);
	/*
	 * Note: normally this routine will not be called if
	 * spa_load_verify_metadata is not set. However, it may be useful
	 * to manually set the flag after the traversal has begun.
	 */
	if (!spa_load_verify_metadata)
		return (0);
	if (BP_GET_BUFC_TYPE(bp) == ARC_BUFC_DATA && !spa_load_verify_data)
		return (0);

	zio_t *rio = arg;
	size_t size = BP_GET_PSIZE(bp);
	void *data = zio_data_buf_alloc(size);

	mutex_enter(&spa->spa_scrub_lock);
	while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
	spa->spa_scrub_inflight++;
	mutex_exit(&spa->spa_scrub_lock);

	zio_nowait(zio_read(rio, spa, bp, data, size,
	    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
	    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
	return (0);
}

static int
spa_load_verify(spa_t *spa)
{
	zio_t *rio;
	spa_load_error_t sle = { 0 };
	zpool_rewind_policy_t policy;
	boolean_t verify_ok = B_FALSE;
	int error = 0;

	zpool_get_rewind_policy(spa->spa_config, &policy);

	if (policy.zrp_request & ZPOOL_NEVER_REWIND)
		return (0);

	rio = zio_root(spa, NULL, &sle,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);

	if (spa_load_verify_metadata) {
		error = traverse_pool(spa, spa->spa_verify_min_txg,
		    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
		    spa_load_verify_cb, rio);
	}

	(void) zio_wait(rio);

	spa->spa_load_meta_errors = sle.sle_meta_count;
	spa->spa_load_data_errors = sle.sle_data_count;

	if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
	    sle.sle_data_count <= policy.zrp_maxdata) {
		int64_t loss = 0;

		verify_ok = B_TRUE;
		spa->spa_load_txg = spa->spa_uberblock.ub_txg;
		spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;

		loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
		VERIFY(nvlist_add_uint64(spa->spa_load_info,
		    ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
		VERIFY(nvlist_add_int64(spa->spa_load_info,
		    ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
		VERIFY(nvlist_add_uint64(spa->spa_load_info,
		    ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
	} else {
		spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
	}

	if (error) {
		if (error != ENXIO && error != EIO)
			error = SET_ERROR(EIO);
		return (error);
	}

	return (verify_ok ? 0 : EIO);
}
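
/*
 * Editorial note: spa_load_verify() uses the usual root-zio fan-out --
 * child reads are issued with zio_nowait() against the zio_root()
 * parent, and a single zio_wait() on the root collects completion.
 * Because the children carry ZIO_FLAG_CANFAIL, error totals are
 * accumulated out of band in the spa_load_error_t (via rio->io_private)
 * rather than through the root zio's error code.
 */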
0 : EIO); 2027} 2028 2029/* 2030 * Find a value in the pool props object. 2031 */ 2032static void 2033spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) 2034{ 2035 (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, 2036 zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); 2037} 2038 2039/* 2040 * Find a value in the pool directory object. 2041 */ 2042static int 2043spa_dir_prop(spa_t *spa, const char *name, uint64_t *val) 2044{ 2045 return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2046 name, sizeof (uint64_t), 1, val)); 2047} 2048 2049static int 2050spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 2051{ 2052 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 2053 return (err); 2054} 2055 2056/* 2057 * Fix up config after a partly-completed split. This is done with the 2058 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 2059 * pool have that entry in their config, but only the splitting one contains 2060 * a list of all the guids of the vdevs that are being split off. 2061 * 2062 * This function determines what to do with that list: either rejoin 2063 * all the disks to the pool, or complete the splitting process. To attempt 2064 * the rejoin, each disk that is offlined is marked online again, and 2065 * we do a reopen() call. If the vdev label for every disk that was 2066 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 2067 * then we call vdev_split() on each disk, and complete the split. 2068 * 2069 * Otherwise we leave the config alone, with all the vdevs in place in 2070 * the original pool. 2071 */ 2072static void 2073spa_try_repair(spa_t *spa, nvlist_t *config) 2074{ 2075 uint_t extracted; 2076 uint64_t *glist; 2077 uint_t i, gcount; 2078 nvlist_t *nvl; 2079 vdev_t **vd; 2080 boolean_t attempt_reopen; 2081 2082 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 2083 return; 2084 2085 /* check that the config is complete */ 2086 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 2087 &glist, &gcount) != 0) 2088 return; 2089 2090 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 2091 2092 /* attempt to online all the vdevs & validate */ 2093 attempt_reopen = B_TRUE; 2094 for (i = 0; i < gcount; i++) { 2095 if (glist[i] == 0) /* vdev is hole */ 2096 continue; 2097 2098 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 2099 if (vd[i] == NULL) { 2100 /* 2101 * Don't bother attempting to reopen the disks; 2102 * just do the split. 2103 */ 2104 attempt_reopen = B_FALSE; 2105 } else { 2106 /* attempt to re-online it */ 2107 vd[i]->vdev_offline = B_FALSE; 2108 } 2109 } 2110 2111 if (attempt_reopen) { 2112 vdev_reopen(spa->spa_root_vdev); 2113 2114 /* check each device to see what state it's in */ 2115 for (extracted = 0, i = 0; i < gcount; i++) { 2116 if (vd[i] != NULL && 2117 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 2118 break; 2119 ++extracted; 2120 } 2121 } 2122 2123 /* 2124 * If every disk has been moved to the new pool, or if we never 2125 * even attempted to look at them, then we split them off for 2126 * good. 
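 *
 * For reference, the piece of the config consulted here has roughly
 * this shape (a sketch of the only two keys this function reads):
 *
 *	ZPOOL_CONFIG_SPLIT (nvlist)
 *		ZPOOL_CONFIG_SPLIT_LIST (uint64 array of split-off vdev guids)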
2127 */ 2128 if (!attempt_reopen || gcount == extracted) { 2129 for (i = 0; i < gcount; i++) 2130 if (vd[i] != NULL) 2131 vdev_split(vd[i]); 2132 vdev_reopen(spa->spa_root_vdev); 2133 } 2134 2135 kmem_free(vd, gcount * sizeof (vdev_t *)); 2136} 2137 2138static int 2139spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, 2140 boolean_t mosconfig) 2141{ 2142 nvlist_t *config = spa->spa_config; 2143 char *ereport = FM_EREPORT_ZFS_POOL; 2144 char *comment; 2145 int error; 2146 uint64_t pool_guid; 2147 nvlist_t *nvl; 2148 2149 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) 2150 return (SET_ERROR(EINVAL)); 2151 2152 ASSERT(spa->spa_comment == NULL); 2153 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 2154 spa->spa_comment = spa_strdup(comment); 2155 2156 /* 2157 * Versioning wasn't explicitly added to the label until later, so if 2158 * it's not present treat it as the initial version. 2159 */ 2160 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 2161 &spa->spa_ubsync.ub_version) != 0) 2162 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 2163 2164 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 2165 &spa->spa_config_txg); 2166 2167 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 2168 spa_guid_exists(pool_guid, 0)) { 2169 error = SET_ERROR(EEXIST); 2170 } else { 2171 spa->spa_config_guid = pool_guid; 2172 2173 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, 2174 &nvl) == 0) { 2175 VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, 2176 KM_SLEEP) == 0); 2177 } 2178 2179 nvlist_free(spa->spa_load_info); 2180 spa->spa_load_info = fnvlist_alloc(); 2181 2182 gethrestime(&spa->spa_loaded_ts); 2183 error = spa_load_impl(spa, pool_guid, config, state, type, 2184 mosconfig, &ereport); 2185 } 2186 2187 /* 2188 * Don't count references from objsets that are already closed 2189 * and are making their way through the eviction process. 2190 */ 2191 spa_evicting_os_wait(spa); 2192 spa->spa_minref = refcount_count(&spa->spa_refcount); 2193 if (error) { 2194 if (error != EEXIST) { 2195 spa->spa_loaded_ts.tv_sec = 0; 2196 spa->spa_loaded_ts.tv_nsec = 0; 2197 } 2198 if (error != EBADF) { 2199 zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); 2200 } 2201 } 2202 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 2203 spa->spa_ena = 0; 2204 2205 return (error); 2206} 2207 2208/* 2209 * Load an existing storage pool, using the pool's builtin spa_config as a 2210 * source of configuration information. 2211 */ 2212static int 2213spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, 2214 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, 2215 char **ereport) 2216{ 2217 int error = 0; 2218 nvlist_t *nvroot = NULL; 2219 nvlist_t *label; 2220 vdev_t *rvd; 2221 uberblock_t *ub = &spa->spa_uberblock; 2222 uint64_t children, config_cache_txg = spa->spa_config_txg; 2223 int orig_mode = spa->spa_mode; 2224 int parse; 2225 uint64_t obj; 2226 boolean_t missing_feat_write = B_FALSE; 2227 2228 /* 2229 * If this is an untrusted config, access the pool in read-only mode. 2230 * This prevents things like resilvering recently removed devices. 2231 */ 2232 if (!mosconfig) 2233 spa->spa_mode = FREAD; 2234 2235 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 2236 2237 spa->spa_load_state = state; 2238 2239 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) 2240 return (SET_ERROR(EINVAL)); 2241 2242 parse = (type == SPA_IMPORT_EXISTING ? 
2243 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 2244 2245 /* 2246 * Create "The Godfather" zio to hold all async IOs 2247 */ 2248 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 2249 KM_SLEEP); 2250 for (int i = 0; i < max_ncpus; i++) { 2251 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 2252 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 2253 ZIO_FLAG_GODFATHER); 2254 } 2255 2256 /* 2257 * Parse the configuration into a vdev tree. We explicitly set the 2258 * value that will be returned by spa_version() since parsing the 2259 * configuration requires knowing the version number. 2260 */ 2261 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2262 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); 2263 spa_config_exit(spa, SCL_ALL, FTAG); 2264 2265 if (error != 0) 2266 return (error); 2267 2268 ASSERT(spa->spa_root_vdev == rvd); 2269 ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); 2270 ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); 2271 2272 if (type != SPA_IMPORT_ASSEMBLE) { 2273 ASSERT(spa_guid(spa) == pool_guid); 2274 } 2275 2276 /* 2277 * Try to open all vdevs, loading each label in the process. 2278 */ 2279 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2280 error = vdev_open(rvd); 2281 spa_config_exit(spa, SCL_ALL, FTAG); 2282 if (error != 0) 2283 return (error); 2284 2285 /* 2286 * We need to validate the vdev labels against the configuration that 2287 * we have in hand, which is dependent on the setting of mosconfig. If 2288 * mosconfig is true then we're validating the vdev labels based on 2289 * that config. Otherwise, we're validating against the cached config 2290 * (zpool.cache) that was read when we loaded the zfs module, and then 2291 * later we will recursively call spa_load() and validate against 2292 * the vdev config. 2293 * 2294 * If we're assembling a new pool that's been split off from an 2295 * existing pool, the labels haven't yet been updated so we skip 2296 * validation for now. 2297 */ 2298 if (type != SPA_IMPORT_ASSEMBLE) { 2299 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2300 error = vdev_validate(rvd, mosconfig); 2301 spa_config_exit(spa, SCL_ALL, FTAG); 2302 2303 if (error != 0) 2304 return (error); 2305 2306 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2307 return (SET_ERROR(ENXIO)); 2308 } 2309 2310 /* 2311 * Find the best uberblock. 2312 */ 2313 vdev_uberblock_load(rvd, ub, &label); 2314 2315 /* 2316 * If we weren't able to find a single valid uberblock, return failure. 2317 */ 2318 if (ub->ub_txg == 0) { 2319 nvlist_free(label); 2320 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 2321 } 2322 2323 /* 2324 * If the pool has an unsupported version we can't open it. 2325 */ 2326 if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 2327 nvlist_free(label); 2328 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 2329 } 2330 2331 if (ub->ub_version >= SPA_VERSION_FEATURES) { 2332 nvlist_t *features; 2333 2334 /* 2335 * If we weren't able to find what's necessary for reading the 2336 * MOS in the label, return failure. 2337 */ 2338 if (label == NULL || nvlist_lookup_nvlist(label, 2339 ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) { 2340 nvlist_free(label); 2341 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2342 ENXIO)); 2343 } 2344 2345 /* 2346 * Update our in-core representation with the definitive values 2347 * from the label. 
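 *
 * Only the names matter: the loop below runs each entry of
 * spa_label_features through zfeature_is_supported(), so a label
 * entry such as "com.delphix:embedded_data" (illustrative) is enough
 * to keep code that predates that feature from opening the pool.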
2348	 */
2349		nvlist_free(spa->spa_label_features);
2350		VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0);
2351	}
2352
2353	nvlist_free(label);
2354
2355	/*
2356	 * Look through entries in the label nvlist's features_for_read. If
2357	 * there is a feature listed there which we don't understand then we
2358	 * cannot open a pool.
2359	 */
2360	if (ub->ub_version >= SPA_VERSION_FEATURES) {
2361		nvlist_t *unsup_feat;
2362
2363		VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
2364		    0);
2365
2366		for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
2367		    NULL); nvp != NULL;
2368		    nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
2369			if (!zfeature_is_supported(nvpair_name(nvp))) {
2370				VERIFY(nvlist_add_string(unsup_feat,
2371				    nvpair_name(nvp), "") == 0);
2372			}
2373		}
2374
2375		if (!nvlist_empty(unsup_feat)) {
2376			VERIFY(nvlist_add_nvlist(spa->spa_load_info,
2377			    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
2378			nvlist_free(unsup_feat);
2379			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
2380			    ENOTSUP));
2381		}
2382
2383		nvlist_free(unsup_feat);
2384	}
2385
2386	/*
2387	 * If the vdev guid sum doesn't match the uberblock, we have an
2388	 * incomplete configuration.  We first check to see if the pool
2389	 * is aware of the complete config (i.e. ZPOOL_CONFIG_VDEV_CHILDREN).
2390	 * If it is, defer the vdev_guid_sum check till later so we
2391	 * can handle missing vdevs.
2392	 */
2393	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
2394	    &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE &&
2395	    rvd->vdev_guid_sum != ub->ub_guid_sum)
2396		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
2397
2398	if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
2399		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2400		spa_try_repair(spa, config);
2401		spa_config_exit(spa, SCL_ALL, FTAG);
2402		nvlist_free(spa->spa_config_splitting);
2403		spa->spa_config_splitting = NULL;
2404	}
2405
2406	/*
2407	 * Initialize internal SPA structures.
2408	 */
2409	spa->spa_state = POOL_STATE_ACTIVE;
2410	spa->spa_ubsync = spa->spa_uberblock;
2411	spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
2412	    TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
2413	spa->spa_first_txg = spa->spa_last_ubsync_txg ?
2414 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 2415 spa->spa_claim_max_txg = spa->spa_first_txg; 2416 spa->spa_prev_software_version = ub->ub_software_version; 2417 2418 error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 2419 if (error) 2420 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2421 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 2422 2423 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) 2424 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2425 2426 if (spa_version(spa) >= SPA_VERSION_FEATURES) { 2427 boolean_t missing_feat_read = B_FALSE; 2428 nvlist_t *unsup_feat, *enabled_feat; 2429 2430 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 2431 &spa->spa_feat_for_read_obj) != 0) { 2432 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2433 } 2434 2435 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 2436 &spa->spa_feat_for_write_obj) != 0) { 2437 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2438 } 2439 2440 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 2441 &spa->spa_feat_desc_obj) != 0) { 2442 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2443 } 2444 2445 enabled_feat = fnvlist_alloc(); 2446 unsup_feat = fnvlist_alloc(); 2447 2448 if (!spa_features_check(spa, B_FALSE, 2449 unsup_feat, enabled_feat)) 2450 missing_feat_read = B_TRUE; 2451 2452 if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) { 2453 if (!spa_features_check(spa, B_TRUE, 2454 unsup_feat, enabled_feat)) { 2455 missing_feat_write = B_TRUE; 2456 } 2457 } 2458 2459 fnvlist_add_nvlist(spa->spa_load_info, 2460 ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); 2461 2462 if (!nvlist_empty(unsup_feat)) { 2463 fnvlist_add_nvlist(spa->spa_load_info, 2464 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 2465 } 2466 2467 fnvlist_free(enabled_feat); 2468 fnvlist_free(unsup_feat); 2469 2470 if (!missing_feat_read) { 2471 fnvlist_add_boolean(spa->spa_load_info, 2472 ZPOOL_CONFIG_CAN_RDONLY); 2473 } 2474 2475 /* 2476 * If the state is SPA_LOAD_TRYIMPORT, our objective is 2477 * twofold: to determine whether the pool is available for 2478 * import in read-write mode and (if it is not) whether the 2479 * pool is available for import in read-only mode. If the pool 2480 * is available for import in read-write mode, it is displayed 2481 * as available in userland; if it is not available for import 2482 * in read-only mode, it is displayed as unavailable in 2483 * userland. If the pool is available for import in read-only 2484 * mode but not read-write mode, it is displayed as unavailable 2485 * in userland with a special note that the pool is actually 2486 * available for open in read-only mode. 2487 * 2488 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 2489 * missing a feature for write, we must first determine whether 2490 * the pool can be opened read-only before returning to 2491 * userland in order to know whether to display the 2492 * abovementioned note. 2493 */ 2494 if (missing_feat_read || (missing_feat_write && 2495 spa_writeable(spa))) { 2496 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2497 ENOTSUP)); 2498 } 2499 2500 /* 2501 * Load refcounts for ZFS features from disk into an in-memory 2502 * cache during SPA initialization. 
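 *
 * The cache makes later feature tests cheap; the check a few lines
 * below is a typical consumer:
 *
 *	if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG))
 *		...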
2503 */ 2504 for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { 2505 uint64_t refcount; 2506 2507 error = feature_get_refcount_from_disk(spa, 2508 &spa_feature_table[i], &refcount); 2509 if (error == 0) { 2510 spa->spa_feat_refcount_cache[i] = refcount; 2511 } else if (error == ENOTSUP) { 2512 spa->spa_feat_refcount_cache[i] = 2513 SPA_FEATURE_DISABLED; 2514 } else { 2515 return (spa_vdev_err(rvd, 2516 VDEV_AUX_CORRUPT_DATA, EIO)); 2517 } 2518 } 2519 } 2520 2521 if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { 2522 if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, 2523 &spa->spa_feat_enabled_txg_obj) != 0) 2524 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2525 } 2526 2527 spa->spa_is_initializing = B_TRUE; 2528 error = dsl_pool_open(spa->spa_dsl_pool); 2529 spa->spa_is_initializing = B_FALSE; 2530 if (error != 0) 2531 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2532 2533 if (!mosconfig) { 2534 uint64_t hostid; 2535 nvlist_t *policy = NULL, *nvconfig; 2536 2537 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2538 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2539 2540 if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, 2541 ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 2542 char *hostname; 2543 unsigned long myhostid = 0; 2544 2545 VERIFY(nvlist_lookup_string(nvconfig, 2546 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 2547 2548#ifdef _KERNEL 2549 myhostid = zone_get_hostid(NULL); 2550#else /* _KERNEL */ 2551 /* 2552 * We're emulating the system's hostid in userland, so 2553 * we can't use zone_get_hostid(). 2554 */ 2555 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 2556#endif /* _KERNEL */ 2557 if (check_hostid && hostid != 0 && myhostid != 0 && 2558 hostid != myhostid) { 2559 nvlist_free(nvconfig); 2560 cmn_err(CE_WARN, "pool '%s' could not be " 2561 "loaded as it was last accessed by " 2562 "another system (host: %s hostid: 0x%lx). " 2563 "See: http://illumos.org/msg/ZFS-8000-EY", 2564 spa_name(spa), hostname, 2565 (unsigned long)hostid); 2566 return (SET_ERROR(EBADF)); 2567 } 2568 } 2569 if (nvlist_lookup_nvlist(spa->spa_config, 2570 ZPOOL_REWIND_POLICY, &policy) == 0) 2571 VERIFY(nvlist_add_nvlist(nvconfig, 2572 ZPOOL_REWIND_POLICY, policy) == 0); 2573 2574 spa_config_set(spa, nvconfig); 2575 spa_unload(spa); 2576 spa_deactivate(spa); 2577 spa_activate(spa, orig_mode); 2578 2579 return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); 2580 } 2581 2582 /* Grab the secret checksum salt from the MOS. */ 2583 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2584 DMU_POOL_CHECKSUM_SALT, 1, 2585 sizeof (spa->spa_cksum_salt.zcs_bytes), 2586 spa->spa_cksum_salt.zcs_bytes); 2587 if (error == ENOENT) { 2588 /* Generate a new salt for subsequent use */ 2589 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 2590 sizeof (spa->spa_cksum_salt.zcs_bytes)); 2591 } else if (error != 0) { 2592 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2593 } 2594 2595 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0) 2596 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2597 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 2598 if (error != 0) 2599 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2600 2601 /* 2602 * Load the bit that tells us to use the new accounting function 2603 * (raid-z deflation). If we have an older pool, this will not 2604 * be present. 
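 *
 * A missing entry is therefore not an error, which is why this lookup,
 * like the other optional directory properties loaded below, treats
 * ENOENT (and only ENOENT) as success.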
2605 */ 2606 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); 2607 if (error != 0 && error != ENOENT) 2608 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2609 2610 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 2611 &spa->spa_creation_version); 2612 if (error != 0 && error != ENOENT) 2613 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2614 2615 /* 2616 * Load the persistent error log. If we have an older pool, this will 2617 * not be present. 2618 */ 2619 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); 2620 if (error != 0 && error != ENOENT) 2621 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2622 2623 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 2624 &spa->spa_errlog_scrub); 2625 if (error != 0 && error != ENOENT) 2626 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2627 2628 /* 2629 * Load the history object. If we have an older pool, this 2630 * will not be present. 2631 */ 2632 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); 2633 if (error != 0 && error != ENOENT) 2634 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2635 2636 /* 2637 * If we're assembling the pool from the split-off vdevs of 2638 * an existing pool, we don't want to attach the spares & cache 2639 * devices. 2640 */ 2641 2642 /* 2643 * Load any hot spares for this pool. 2644 */ 2645 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); 2646 if (error != 0 && error != ENOENT) 2647 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2648 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2649 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 2650 if (load_nvlist(spa, spa->spa_spares.sav_object, 2651 &spa->spa_spares.sav_config) != 0) 2652 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2653 2654 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2655 spa_load_spares(spa); 2656 spa_config_exit(spa, SCL_ALL, FTAG); 2657 } else if (error == 0) { 2658 spa->spa_spares.sav_sync = B_TRUE; 2659 } 2660 2661 /* 2662 * Load any level 2 ARC devices for this pool. 
2663 */ 2664 error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 2665 &spa->spa_l2cache.sav_object); 2666 if (error != 0 && error != ENOENT) 2667 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2668 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2669 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 2670 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 2671 &spa->spa_l2cache.sav_config) != 0) 2672 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2673 2674 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2675 spa_load_l2cache(spa); 2676 spa_config_exit(spa, SCL_ALL, FTAG); 2677 } else if (error == 0) { 2678 spa->spa_l2cache.sav_sync = B_TRUE; 2679 } 2680 2681 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2682 2683 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); 2684 if (error && error != ENOENT) 2685 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2686 2687 if (error == 0) { 2688 uint64_t autoreplace; 2689 2690 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 2691 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 2692 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 2693 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 2694 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 2695 spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, 2696 &spa->spa_dedup_ditto); 2697 2698 spa->spa_autoreplace = (autoreplace != 0); 2699 } 2700 2701 /* 2702 * If the 'autoreplace' property is set, then post a resource notifying 2703 * the ZFS DE that it should not issue any faults for unopenable 2704 * devices. We also iterate over the vdevs, and post a sysevent for any 2705 * unopenable vdevs so that the normal autoreplace handler can take 2706 * over. 2707 */ 2708 if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { 2709 spa_check_removed(spa->spa_root_vdev); 2710 /* 2711 * For the import case, this is done in spa_import(), because 2712 * at this point we're using the spare definitions from 2713 * the MOS config, not necessarily from the userland config. 2714 */ 2715 if (state != SPA_LOAD_IMPORT) { 2716 spa_aux_check_removed(&spa->spa_spares); 2717 spa_aux_check_removed(&spa->spa_l2cache); 2718 } 2719 } 2720 2721 /* 2722 * Load the vdev state for all toplevel vdevs. 2723 */ 2724 vdev_load(rvd); 2725 2726 /* 2727 * Propagate the leaf DTLs we just loaded all the way up the tree. 2728 */ 2729 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2730 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 2731 spa_config_exit(spa, SCL_ALL, FTAG); 2732 2733 /* 2734 * Load the DDTs (dedup tables). 2735 */ 2736 error = ddt_load(spa); 2737 if (error != 0) 2738 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2739 2740 spa_update_dspace(spa); 2741 2742 /* 2743 * Validate the config, using the MOS config to fill in any 2744 * information which might be missing. If we fail to validate 2745 * the config then declare the pool unfit for use. If we're 2746 * assembling a pool from a split, the log is not transferred 2747 * over. 2748 */ 2749 if (type != SPA_IMPORT_ASSEMBLE) { 2750 nvlist_t *nvconfig; 2751 2752 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2753 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2754 2755 if (!spa_config_valid(spa, nvconfig)) { 2756 nvlist_free(nvconfig); 2757 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 2758 ENXIO)); 2759 } 2760 nvlist_free(nvconfig); 2761 2762 /* 2763 * Now that we've validated the config, check the state of the 2764 * root vdev. 
If it can't be opened, it indicates one or
2765		 * more toplevel vdevs are faulted.
2766		 */
2767		if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
2768			return (SET_ERROR(ENXIO));
2769
2770		if (spa_writeable(spa) && spa_check_logs(spa)) {
2771			*ereport = FM_EREPORT_ZFS_LOG_REPLAY;
2772			return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
2773		}
2774	}
2775
2776	if (missing_feat_write) {
2777		ASSERT(state == SPA_LOAD_TRYIMPORT);
2778
2779		/*
2780		 * At this point, we know that we can open the pool in
2781		 * read-only mode but not read-write mode. We now have enough
2782		 * information and can return to userland.
2783		 */
2784		return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP));
2785	}
2786
2787	/*
2788	 * We've successfully opened the pool, verify that we're ready
2789	 * to start pushing transactions.
2790	 */
2791	if (state != SPA_LOAD_TRYIMPORT) {
2792		if (error = spa_load_verify(spa))
2793			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2794			    error));
2795	}
2796
2797	if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
2798	    spa->spa_load_max_txg == UINT64_MAX)) {
2799		dmu_tx_t *tx;
2800		int need_update = B_FALSE;
2801		dsl_pool_t *dp = spa_get_dsl(spa);
2802
2803		ASSERT(state != SPA_LOAD_TRYIMPORT);
2804
2805		/*
2806		 * Claim log blocks that haven't been committed yet.
2807		 * This must all happen in a single txg.
2808		 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
2809		 * invoked from zil_claim_log_block()'s i/o done callback.
2810		 * Price of rollback is that we abandon the log.
2811		 */
2812		spa->spa_claiming = B_TRUE;
2813
2814		tx = dmu_tx_create_assigned(dp, spa_first_txg(spa));
2815		(void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
2816		    zil_claim, tx, DS_FIND_CHILDREN);
2817		dmu_tx_commit(tx);
2818
2819		spa->spa_claiming = B_FALSE;
2820
2821		spa_set_log_state(spa, SPA_LOG_GOOD);
2822		spa->spa_sync_on = B_TRUE;
2823		txg_sync_start(spa->spa_dsl_pool);
2824
2825		/*
2826		 * Wait for all claims to sync.  We sync up to the highest
2827		 * claimed log block birth time so that claimed log blocks
2828		 * don't appear to be from the future.  spa_claim_max_txg
2829		 * will have been set for us by either zil_check_log_chain()
2830		 * (invoked from spa_check_logs()) or zil_claim() above.
2831		 */
2832		txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
2833
2834		/*
2835		 * If the config cache is stale, or we have uninitialized
2836		 * metaslabs (see spa_vdev_add()), then update the config.
2837		 *
2838		 * If this is a verbatim import, trust the current
2839		 * in-core spa_config and update the disk labels.
2840		 */
2841		if (config_cache_txg != spa->spa_config_txg ||
2842		    state == SPA_LOAD_IMPORT ||
2843		    state == SPA_LOAD_RECOVER ||
2844		    (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
2845			need_update = B_TRUE;
2846
2847		for (int c = 0; c < rvd->vdev_children; c++)
2848			if (rvd->vdev_child[c]->vdev_ms_array == 0)
2849				need_update = B_TRUE;
2850
2851		/*
2852		 * Update the config cache asynchronously in case we're the
2853		 * root pool, in which case the config cache isn't writable yet.
2854		 */
2855		if (need_update)
2856			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
2857
2858		/*
2859		 * Check all DTLs to see if anything needs resilvering.
2860		 */
2861		if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
2862		    vdev_resilver_needed(rvd, NULL, NULL))
2863			spa_async_request(spa, SPA_ASYNC_RESILVER);
2864
2865		/*
2866		 * Log the fact that we booted up (so that we can detect if
2867		 * we rebooted in the middle of an operation).
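		 *
		 * The record lands in the pool's history object and can later
		 * be inspected from userland, e.g. (illustrative):
		 *
		 *	# zpool history tank | head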
2868 */ 2869 spa_history_log_version(spa, "open"); 2870 2871 /* 2872 * Delete any inconsistent datasets. 2873 */ 2874 (void) dmu_objset_find(spa_name(spa), 2875 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 2876 2877 /* 2878 * Clean up any stale temporary dataset userrefs. 2879 */ 2880 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 2881 } 2882 2883 return (0); 2884} 2885 2886static int 2887spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) 2888{ 2889 int mode = spa->spa_mode; 2890 2891 spa_unload(spa); 2892 spa_deactivate(spa); 2893 2894 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; 2895 2896 spa_activate(spa, mode); 2897 spa_async_suspend(spa); 2898 2899 return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); 2900} 2901 2902/* 2903 * If spa_load() fails this function will try loading prior txg's. If 2904 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool 2905 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this 2906 * function will not rewind the pool and will return the same error as 2907 * spa_load(). 2908 */ 2909static int 2910spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, 2911 uint64_t max_request, int rewind_flags) 2912{ 2913 nvlist_t *loadinfo = NULL; 2914 nvlist_t *config = NULL; 2915 int load_error, rewind_error; 2916 uint64_t safe_rewind_txg; 2917 uint64_t min_txg; 2918 2919 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 2920 spa->spa_load_max_txg = spa->spa_load_txg; 2921 spa_set_log_state(spa, SPA_LOG_CLEAR); 2922 } else { 2923 spa->spa_load_max_txg = max_request; 2924 if (max_request != UINT64_MAX) 2925 spa->spa_extreme_rewind = B_TRUE; 2926 } 2927 2928 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, 2929 mosconfig); 2930 if (load_error == 0) 2931 return (0); 2932 2933 if (spa->spa_root_vdev != NULL) 2934 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2935 2936 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 2937 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 2938 2939 if (rewind_flags & ZPOOL_NEVER_REWIND) { 2940 nvlist_free(config); 2941 return (load_error); 2942 } 2943 2944 if (state == SPA_LOAD_RECOVER) { 2945 /* Price of rolling back is discarding txgs, including log */ 2946 spa_set_log_state(spa, SPA_LOG_CLEAR); 2947 } else { 2948 /* 2949 * If we aren't rolling back save the load info from our first 2950 * import attempt so that we can restore it after attempting 2951 * to rewind. 2952 */ 2953 loadinfo = spa->spa_load_info; 2954 spa->spa_load_info = fnvlist_alloc(); 2955 } 2956 2957 spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 2958 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 2959 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 
TXG_INITIAL : safe_rewind_txg;
2961
2962	/*
2963	 * Continue as long as we're finding errors, we're still within
2964	 * the acceptable rewind range, and we're still finding uberblocks
2965	 */
2966	while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
2967	    spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
2968		if (spa->spa_load_max_txg < safe_rewind_txg)
2969			spa->spa_extreme_rewind = B_TRUE;
2970		rewind_error = spa_load_retry(spa, state, mosconfig);
2971	}
2972
2973	spa->spa_extreme_rewind = B_FALSE;
2974	spa->spa_load_max_txg = UINT64_MAX;
2975
2976	if (config && (rewind_error || state != SPA_LOAD_RECOVER))
2977		spa_config_set(spa, config);
2978
2979	if (state == SPA_LOAD_RECOVER) {
2980		ASSERT3P(loadinfo, ==, NULL);
2981		return (rewind_error);
2982	} else {
2983		/* Store the rewind info as part of the initial load info */
2984		fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
2985		    spa->spa_load_info);
2986
2987		/* Restore the initial load info */
2988		fnvlist_free(spa->spa_load_info);
2989		spa->spa_load_info = loadinfo;
2990
2991		return (load_error);
2992	}
2993}
2994
2995/*
2996 * Pool Open/Import
2997 *
2998 * The import case is identical to an open except that the configuration is sent
2999 * down from userland, instead of grabbed from the configuration cache. For the
3000 * case of an open, the pool configuration will exist in the
3001 * POOL_STATE_UNINITIALIZED state.
3002 *
3003 * The stats information (gen/count/ustats) is used to gather vdev statistics at
3004 * the same time we open the pool, without having to keep around the spa_t in
3005 * some ambiguous state.
3006 */
3007static int
3008spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
3009    nvlist_t **config)
3010{
3011	spa_t *spa;
3012	spa_load_state_t state = SPA_LOAD_OPEN;
3013	int error;
3014	int locked = B_FALSE;
3015	int firstopen = B_FALSE;
3016
3017	*spapp = NULL;
3018
3019	/*
3020	 * As disgusting as this is, we need to support recursive calls to this
3021	 * function because dsl_dir_open() is called during spa_load(), and ends
3022	 * up calling spa_open() again.  The real fix is to figure out how to
3023	 * avoid dsl_dir_open() calling this in the first place.
3024	 */
3025	if (mutex_owner(&spa_namespace_lock) != curthread) {
3026		mutex_enter(&spa_namespace_lock);
3027		locked = B_TRUE;
3028	}
3029
3030	if ((spa = spa_lookup(pool)) == NULL) {
3031		if (locked)
3032			mutex_exit(&spa_namespace_lock);
3033		return (SET_ERROR(ENOENT));
3034	}
3035
3036	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
3037		zpool_rewind_policy_t policy;
3038
3039		firstopen = B_TRUE;
3040
3041		zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config,
3042		    &policy);
3043		if (policy.zrp_request & ZPOOL_DO_REWIND)
3044			state = SPA_LOAD_RECOVER;
3045
3046		spa_activate(spa, spa_mode_global);
3047
3048		if (state != SPA_LOAD_RECOVER)
3049			spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
3050
3051		error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
3052		    policy.zrp_request);
3053
3054		if (error == EBADF) {
3055			/*
3056			 * If vdev_validate() returns failure (indicated by
3057			 * EBADF), it means that one of the vdevs indicates
3058			 * that the pool has been exported or destroyed.  If
3059			 * this is the case, the config cache is out of sync and
3060			 * we should remove the pool from the namespace.
3061 */ 3062 spa_unload(spa); 3063 spa_deactivate(spa); 3064 spa_config_sync(spa, B_TRUE, B_TRUE); 3065 spa_remove(spa); 3066 if (locked) 3067 mutex_exit(&spa_namespace_lock); 3068 return (SET_ERROR(ENOENT)); 3069 } 3070 3071 if (error) { 3072 /* 3073 * We can't open the pool, but we still have useful 3074 * information: the state of each vdev after the 3075 * attempted vdev_open(). Return this to the user. 3076 */ 3077 if (config != NULL && spa->spa_config) { 3078 VERIFY(nvlist_dup(spa->spa_config, config, 3079 KM_SLEEP) == 0); 3080 VERIFY(nvlist_add_nvlist(*config, 3081 ZPOOL_CONFIG_LOAD_INFO, 3082 spa->spa_load_info) == 0); 3083 } 3084 spa_unload(spa); 3085 spa_deactivate(spa); 3086 spa->spa_last_open_failed = error; 3087 if (locked) 3088 mutex_exit(&spa_namespace_lock); 3089 *spapp = NULL; 3090 return (error); 3091 } 3092 } 3093 3094 spa_open_ref(spa, tag); 3095 3096 if (config != NULL) 3097 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 3098 3099 /* 3100 * If we've recovered the pool, pass back any information we 3101 * gathered while doing the load. 3102 */ 3103 if (state == SPA_LOAD_RECOVER) { 3104 VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 3105 spa->spa_load_info) == 0); 3106 } 3107 3108 if (locked) { 3109 spa->spa_last_open_failed = 0; 3110 spa->spa_last_ubsync_txg = 0; 3111 spa->spa_load_txg = 0; 3112 mutex_exit(&spa_namespace_lock); 3113#ifdef __FreeBSD__ 3114#ifdef _KERNEL 3115 if (firstopen) 3116 zvol_create_minors(spa->spa_name); 3117#endif 3118#endif 3119 } 3120 3121 *spapp = spa; 3122 3123 return (0); 3124} 3125 3126int 3127spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, 3128 nvlist_t **config) 3129{ 3130 return (spa_open_common(name, spapp, tag, policy, config)); 3131} 3132 3133int 3134spa_open(const char *name, spa_t **spapp, void *tag) 3135{ 3136 return (spa_open_common(name, spapp, tag, NULL, NULL)); 3137} 3138 3139/* 3140 * Lookup the given spa_t, incrementing the inject count in the process, 3141 * preventing it from being exported or destroyed. 3142 */ 3143spa_t * 3144spa_inject_addref(char *name) 3145{ 3146 spa_t *spa; 3147 3148 mutex_enter(&spa_namespace_lock); 3149 if ((spa = spa_lookup(name)) == NULL) { 3150 mutex_exit(&spa_namespace_lock); 3151 return (NULL); 3152 } 3153 spa->spa_inject_ref++; 3154 mutex_exit(&spa_namespace_lock); 3155 3156 return (spa); 3157} 3158 3159void 3160spa_inject_delref(spa_t *spa) 3161{ 3162 mutex_enter(&spa_namespace_lock); 3163 spa->spa_inject_ref--; 3164 mutex_exit(&spa_namespace_lock); 3165} 3166 3167/* 3168 * Add spares device information to the nvlist. 3169 */ 3170static void 3171spa_add_spares(spa_t *spa, nvlist_t *config) 3172{ 3173 nvlist_t **spares; 3174 uint_t i, nspares; 3175 nvlist_t *nvroot; 3176 uint64_t guid; 3177 vdev_stat_t *vs; 3178 uint_t vsc; 3179 uint64_t pool; 3180 3181 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3182 3183 if (spa->spa_spares.sav_count == 0) 3184 return; 3185 3186 VERIFY(nvlist_lookup_nvlist(config, 3187 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 3188 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 3189 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 3190 if (nspares != 0) { 3191 VERIFY(nvlist_add_nvlist_array(nvroot, 3192 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3193 VERIFY(nvlist_lookup_nvlist_array(nvroot, 3194 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 3195 3196 /* 3197 * Go through and find any spares which have since been 3198 * repurposed as an active spare. 
If this is the case, update 3199 * their status appropriately. 3200 */ 3201 for (i = 0; i < nspares; i++) { 3202 VERIFY(nvlist_lookup_uint64(spares[i], 3203 ZPOOL_CONFIG_GUID, &guid) == 0); 3204 if (spa_spare_exists(guid, &pool, NULL) && 3205 pool != 0ULL) { 3206 VERIFY(nvlist_lookup_uint64_array( 3207 spares[i], ZPOOL_CONFIG_VDEV_STATS, 3208 (uint64_t **)&vs, &vsc) == 0); 3209 vs->vs_state = VDEV_STATE_CANT_OPEN; 3210 vs->vs_aux = VDEV_AUX_SPARED; 3211 } 3212 } 3213 } 3214} 3215 3216/* 3217 * Add l2cache device information to the nvlist, including vdev stats. 3218 */ 3219static void 3220spa_add_l2cache(spa_t *spa, nvlist_t *config) 3221{ 3222 nvlist_t **l2cache; 3223 uint_t i, j, nl2cache; 3224 nvlist_t *nvroot; 3225 uint64_t guid; 3226 vdev_t *vd; 3227 vdev_stat_t *vs; 3228 uint_t vsc; 3229 3230 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3231 3232 if (spa->spa_l2cache.sav_count == 0) 3233 return; 3234 3235 VERIFY(nvlist_lookup_nvlist(config, 3236 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 3237 VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 3238 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3239 if (nl2cache != 0) { 3240 VERIFY(nvlist_add_nvlist_array(nvroot, 3241 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3242 VERIFY(nvlist_lookup_nvlist_array(nvroot, 3243 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3244 3245 /* 3246 * Update level 2 cache device stats. 3247 */ 3248 3249 for (i = 0; i < nl2cache; i++) { 3250 VERIFY(nvlist_lookup_uint64(l2cache[i], 3251 ZPOOL_CONFIG_GUID, &guid) == 0); 3252 3253 vd = NULL; 3254 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 3255 if (guid == 3256 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 3257 vd = spa->spa_l2cache.sav_vdevs[j]; 3258 break; 3259 } 3260 } 3261 ASSERT(vd != NULL); 3262 3263 VERIFY(nvlist_lookup_uint64_array(l2cache[i], 3264 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) 3265 == 0); 3266 vdev_get_stats(vd, vs); 3267 } 3268 } 3269} 3270 3271static void 3272spa_add_feature_stats(spa_t *spa, nvlist_t *config) 3273{ 3274 nvlist_t *features; 3275 zap_cursor_t zc; 3276 zap_attribute_t za; 3277 3278 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3279 VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3280 3281 /* We may be unable to read features if pool is suspended. 
 */
3282	if (spa_suspended(spa))
3283		goto out;
3284
3285	if (spa->spa_feat_for_read_obj != 0) {
3286		for (zap_cursor_init(&zc, spa->spa_meta_objset,
3287		    spa->spa_feat_for_read_obj);
3288		    zap_cursor_retrieve(&zc, &za) == 0;
3289		    zap_cursor_advance(&zc)) {
3290			ASSERT(za.za_integer_length == sizeof (uint64_t) &&
3291			    za.za_num_integers == 1);
3292			VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
3293			    za.za_first_integer));
3294		}
3295		zap_cursor_fini(&zc);
3296	}
3297
3298	if (spa->spa_feat_for_write_obj != 0) {
3299		for (zap_cursor_init(&zc, spa->spa_meta_objset,
3300		    spa->spa_feat_for_write_obj);
3301		    zap_cursor_retrieve(&zc, &za) == 0;
3302		    zap_cursor_advance(&zc)) {
3303			ASSERT(za.za_integer_length == sizeof (uint64_t) &&
3304			    za.za_num_integers == 1);
3305			VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
3306			    za.za_first_integer));
3307		}
3308		zap_cursor_fini(&zc);
3309	}
3310
3311out:
3312	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
3313	    features) == 0);
3314	nvlist_free(features);
3315}
3316
3317int
3318spa_get_stats(const char *name, nvlist_t **config,
3319    char *altroot, size_t buflen)
3320{
3321	int error;
3322	spa_t *spa;
3323
3324	*config = NULL;
3325	error = spa_open_common(name, &spa, FTAG, NULL, config);
3326
3327	if (spa != NULL) {
3328		/*
3329		 * This still leaves a window of inconsistency where the spares
3330		 * or l2cache devices could change and the config would be
3331		 * self-inconsistent.
3332		 */
3333		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
3334
3335		if (*config != NULL) {
3336			uint64_t loadtimes[2];
3337
3338			loadtimes[0] = spa->spa_loaded_ts.tv_sec;
3339			loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
3340			VERIFY(nvlist_add_uint64_array(*config,
3341			    ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
3342
3343			VERIFY(nvlist_add_uint64(*config,
3344			    ZPOOL_CONFIG_ERRCOUNT,
3345			    spa_get_errlog_size(spa)) == 0);
3346
3347			if (spa_suspended(spa))
3348				VERIFY(nvlist_add_uint64(*config,
3349				    ZPOOL_CONFIG_SUSPENDED,
3350				    spa->spa_failmode) == 0);
3351
3352			spa_add_spares(spa, *config);
3353			spa_add_l2cache(spa, *config);
3354			spa_add_feature_stats(spa, *config);
3355		}
3356	}
3357
3358	/*
3359	 * We want to get the alternate root even for faulted pools, so we cheat
3360	 * and call spa_lookup() directly.
3361	 */
3362	if (altroot) {
3363		if (spa == NULL) {
3364			mutex_enter(&spa_namespace_lock);
3365			spa = spa_lookup(name);
3366			if (spa)
3367				spa_altroot(spa, altroot, buflen);
3368			else
3369				altroot[0] = '\0';
3370			spa = NULL;
3371			mutex_exit(&spa_namespace_lock);
3372		} else {
3373			spa_altroot(spa, altroot, buflen);
3374		}
3375	}
3376
3377	if (spa != NULL) {
3378		spa_config_exit(spa, SCL_CONFIG, FTAG);
3379		spa_close(spa, FTAG);
3380	}
3381
3382	return (error);
3383}
3384
3385/*
3386 * Validate that the auxiliary device array is well formed.  We must have an
3387 * array of nvlists, each of which describes a valid leaf vdev.  If this is an
3388 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
3389 * specified, as long as they are well-formed.
3390 */
3391static int
3392spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
3393    spa_aux_vdev_t *sav, const char *config, uint64_t version,
3394    vdev_labeltype_t label)
3395{
3396	nvlist_t **dev;
3397	uint_t i, ndev;
3398	vdev_t *vd;
3399	int error;
3400
3401	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
3402
3403	/*
3404	 * It's acceptable to have no devs specified.
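	 *
	 * When devs are specified, each entry is an ordinary leaf vdev
	 * nvlist; for spares, roughly (a sketch):
	 *
	 *	ZPOOL_CONFIG_SPARES = [ { type=disk, path=/dev/da0, ... }, ... ]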
3405	 */
3406	if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
3407		return (0);
3408
3409	if (ndev == 0)
3410		return (SET_ERROR(EINVAL));
3411
3412	/*
3413	 * Make sure the pool is formatted with a version that supports this
3414	 * device type.
3415	 */
3416	if (spa_version(spa) < version)
3417		return (SET_ERROR(ENOTSUP));
3418
3419	/*
3420	 * Set the pending device list so we correctly handle device in-use
3421	 * checking.
3422	 */
3423	sav->sav_pending = dev;
3424	sav->sav_npending = ndev;
3425
3426	for (i = 0; i < ndev; i++) {
3427		if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
3428		    mode)) != 0)
3429			goto out;
3430
3431		if (!vd->vdev_ops->vdev_op_leaf) {
3432			vdev_free(vd);
3433			error = SET_ERROR(EINVAL);
3434			goto out;
3435		}
3436
3437		/*
3438		 * The L2ARC currently only supports disk devices in
3439		 * kernel context.  For user-level testing, we allow it.
3440		 */
3441#ifdef _KERNEL
3442		if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
3443		    strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
3444			error = SET_ERROR(ENOTBLK);
3445			vdev_free(vd);
3446			goto out;
3447		}
3448#endif
3449		vd->vdev_top = vd;
3450
3451		if ((error = vdev_open(vd)) == 0 &&
3452		    (error = vdev_label_init(vd, crtxg, label)) == 0) {
3453			VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
3454			    vd->vdev_guid) == 0);
3455		}
3456
3457		vdev_free(vd);
3458
3459		if (error &&
3460		    (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
3461			goto out;
3462		else
3463			error = 0;
3464	}
3465
3466out:
3467	sav->sav_pending = NULL;
3468	sav->sav_npending = 0;
3469	return (error);
3470}
3471
3472static int
3473spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
3474{
3475	int error;
3476
3477	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
3478
3479	if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
3480	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
3481	    VDEV_LABEL_SPARE)) != 0) {
3482		return (error);
3483	}
3484
3485	return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
3486	    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
3487	    VDEV_LABEL_L2CACHE));
3488}
3489
3490static void
3491spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
3492    const char *config)
3493{
3494	int i;
3495
3496	if (sav->sav_config != NULL) {
3497		nvlist_t **olddevs;
3498		uint_t oldndevs;
3499		nvlist_t **newdevs;
3500
3501		/*
3502		 * Generate new dev list by concatenating with the
3503		 * current dev list.
3504		 */
3505		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
3506		    &olddevs, &oldndevs) == 0);
3507
3508		newdevs = kmem_alloc(sizeof (void *) *
3509		    (ndevs + oldndevs), KM_SLEEP);
3510		for (i = 0; i < oldndevs; i++)
3511			VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
3512			    KM_SLEEP) == 0);
3513		for (i = 0; i < ndevs; i++)
3514			VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
3515			    KM_SLEEP) == 0);
3516
3517		VERIFY(nvlist_remove(sav->sav_config, config,
3518		    DATA_TYPE_NVLIST_ARRAY) == 0);
3519
3520		VERIFY(nvlist_add_nvlist_array(sav->sav_config,
3521		    config, newdevs, ndevs + oldndevs) == 0);
3522		for (i = 0; i < oldndevs + ndevs; i++)
3523			nvlist_free(newdevs[i]);
3524		kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
3525	} else {
3526		/*
3527		 * Generate a new dev list.
3528 */ 3529 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 3530 KM_SLEEP) == 0); 3531 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 3532 devs, ndevs) == 0); 3533 } 3534} 3535 3536/* 3537 * Stop and drop level 2 ARC devices 3538 */ 3539void 3540spa_l2cache_drop(spa_t *spa) 3541{ 3542 vdev_t *vd; 3543 int i; 3544 spa_aux_vdev_t *sav = &spa->spa_l2cache; 3545 3546 for (i = 0; i < sav->sav_count; i++) { 3547 uint64_t pool; 3548 3549 vd = sav->sav_vdevs[i]; 3550 ASSERT(vd != NULL); 3551 3552 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 3553 pool != 0ULL && l2arc_vdev_present(vd)) 3554 l2arc_remove_vdev(vd); 3555 } 3556} 3557 3558/* 3559 * Pool Creation 3560 */ 3561int 3562spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 3563 nvlist_t *zplprops) 3564{ 3565 spa_t *spa; 3566 char *altroot = NULL; 3567 vdev_t *rvd; 3568 dsl_pool_t *dp; 3569 dmu_tx_t *tx; 3570 int error = 0; 3571 uint64_t txg = TXG_INITIAL; 3572 nvlist_t **spares, **l2cache; 3573 uint_t nspares, nl2cache; 3574 uint64_t version, obj; 3575 boolean_t has_features; 3576 3577 /* 3578 * If this pool already exists, return failure. 3579 */ 3580 mutex_enter(&spa_namespace_lock); 3581 if (spa_lookup(pool) != NULL) { 3582 mutex_exit(&spa_namespace_lock); 3583 return (SET_ERROR(EEXIST)); 3584 } 3585 3586 /* 3587 * Allocate a new spa_t structure. 3588 */ 3589 (void) nvlist_lookup_string(props, 3590 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 3591 spa = spa_add(pool, NULL, altroot); 3592 spa_activate(spa, spa_mode_global); 3593 3594 if (props && (error = spa_prop_validate(spa, props))) { 3595 spa_deactivate(spa); 3596 spa_remove(spa); 3597 mutex_exit(&spa_namespace_lock); 3598 return (error); 3599 } 3600 3601 has_features = B_FALSE; 3602 for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); 3603 elem != NULL; elem = nvlist_next_nvpair(props, elem)) { 3604 if (zpool_prop_feature(nvpair_name(elem))) 3605 has_features = B_TRUE; 3606 } 3607 3608 if (has_features || nvlist_lookup_uint64(props, 3609 zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { 3610 version = SPA_VERSION; 3611 } 3612 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 3613 3614 spa->spa_first_txg = txg; 3615 spa->spa_uberblock.ub_txg = txg - 1; 3616 spa->spa_uberblock.ub_version = version; 3617 spa->spa_ubsync = spa->spa_uberblock; 3618 3619 /* 3620 * Create "The Godfather" zio to hold all async IOs 3621 */ 3622 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 3623 KM_SLEEP); 3624 for (int i = 0; i < max_ncpus; i++) { 3625 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 3626 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 3627 ZIO_FLAG_GODFATHER); 3628 } 3629 3630 /* 3631 * Create the root vdev. 
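 *
 * nvroot arrives from the caller in the same nvlist form built
 * elsewhere in this file, roughly (a sketch):
 *
 *	ZPOOL_CONFIG_TYPE = VDEV_TYPE_ROOT
 *	ZPOOL_CONFIG_CHILDREN = [ top-level vdev nvlists ]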
3632 */ 3633 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3634 3635 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 3636 3637 ASSERT(error != 0 || rvd != NULL); 3638 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 3639 3640 if (error == 0 && !zfs_allocatable_devs(nvroot)) 3641 error = SET_ERROR(EINVAL); 3642 3643 if (error == 0 && 3644 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 3645 (error = spa_validate_aux(spa, nvroot, txg, 3646 VDEV_ALLOC_ADD)) == 0) { 3647 for (int c = 0; c < rvd->vdev_children; c++) { 3648 vdev_ashift_optimize(rvd->vdev_child[c]); 3649 vdev_metaslab_set_size(rvd->vdev_child[c]); 3650 vdev_expand(rvd->vdev_child[c], txg); 3651 } 3652 } 3653 3654 spa_config_exit(spa, SCL_ALL, FTAG); 3655 3656 if (error != 0) { 3657 spa_unload(spa); 3658 spa_deactivate(spa); 3659 spa_remove(spa); 3660 mutex_exit(&spa_namespace_lock); 3661 return (error); 3662 } 3663 3664 /* 3665 * Get the list of spares, if specified. 3666 */ 3667 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 3668 &spares, &nspares) == 0) { 3669 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 3670 KM_SLEEP) == 0); 3671 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 3672 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3673 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3674 spa_load_spares(spa); 3675 spa_config_exit(spa, SCL_ALL, FTAG); 3676 spa->spa_spares.sav_sync = B_TRUE; 3677 } 3678 3679 /* 3680 * Get the list of level 2 cache devices, if specified. 3681 */ 3682 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 3683 &l2cache, &nl2cache) == 0) { 3684 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 3685 NV_UNIQUE_NAME, KM_SLEEP) == 0); 3686 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 3687 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3688 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3689 spa_load_l2cache(spa); 3690 spa_config_exit(spa, SCL_ALL, FTAG); 3691 spa->spa_l2cache.sav_sync = B_TRUE; 3692 } 3693 3694 spa->spa_is_initializing = B_TRUE; 3695 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 3696 spa->spa_meta_objset = dp->dp_meta_objset; 3697 spa->spa_is_initializing = B_FALSE; 3698 3699 /* 3700 * Create DDTs (dedup tables). 3701 */ 3702 ddt_create(spa); 3703 3704 spa_update_dspace(spa); 3705 3706 tx = dmu_tx_create_assigned(dp, txg); 3707 3708 /* 3709 * Create the pool config object. 3710 */ 3711 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 3712 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 3713 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 3714 3715 if (zap_add(spa->spa_meta_objset, 3716 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 3717 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 3718 cmn_err(CE_PANIC, "failed to add pool config"); 3719 } 3720 3721 if (spa_version(spa) >= SPA_VERSION_FEATURES) 3722 spa_feature_create_zap_objects(spa, tx); 3723 3724 if (zap_add(spa->spa_meta_objset, 3725 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 3726 sizeof (uint64_t), 1, &version, tx) != 0) { 3727 cmn_err(CE_PANIC, "failed to add pool version"); 3728 } 3729 3730 /* Newly created pools with the right version are always deflated. 
*/ 3731 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 3732 spa->spa_deflate = TRUE; 3733 if (zap_add(spa->spa_meta_objset, 3734 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 3735 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 3736 cmn_err(CE_PANIC, "failed to add deflate"); 3737 } 3738 } 3739 3740 /* 3741 * Create the deferred-free bpobj. Turn off compression 3742 * because sync-to-convergence takes longer if the blocksize 3743 * keeps changing. 3744 */ 3745 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 3746 dmu_object_set_compress(spa->spa_meta_objset, obj, 3747 ZIO_COMPRESS_OFF, tx); 3748 if (zap_add(spa->spa_meta_objset, 3749 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 3750 sizeof (uint64_t), 1, &obj, tx) != 0) { 3751 cmn_err(CE_PANIC, "failed to add bpobj"); 3752 } 3753 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 3754 spa->spa_meta_objset, obj)); 3755 3756 /* 3757 * Create the pool's history object. 3758 */ 3759 if (version >= SPA_VERSION_ZPOOL_HISTORY) 3760 spa_history_create_obj(spa, tx); 3761 3762 /* 3763 * Generate some random noise for salted checksums to operate on. 3764 */ 3765 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 3766 sizeof (spa->spa_cksum_salt.zcs_bytes)); 3767 3768 /* 3769 * Set pool properties. 3770 */ 3771 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 3772 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 3773 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 3774 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 3775 3776 if (props != NULL) { 3777 spa_configfile_set(spa, props, B_FALSE); 3778 spa_sync_props(props, tx); 3779 } 3780 3781 dmu_tx_commit(tx); 3782 3783 spa->spa_sync_on = B_TRUE; 3784 txg_sync_start(spa->spa_dsl_pool); 3785 3786 /* 3787 * We explicitly wait for the first transaction to complete so that our 3788 * bean counters are appropriately updated. 3789 */ 3790 txg_wait_synced(spa->spa_dsl_pool, txg); 3791 3792 spa_config_sync(spa, B_FALSE, B_TRUE); 3793 spa_event_notify(spa, NULL, ESC_ZFS_POOL_CREATE); 3794 3795 spa_history_log_version(spa, "create"); 3796 3797 /* 3798 * Don't count references from objsets that are already closed 3799 * and are making their way through the eviction process. 3800 */ 3801 spa_evicting_os_wait(spa); 3802 spa->spa_minref = refcount_count(&spa->spa_refcount); 3803 3804 mutex_exit(&spa_namespace_lock); 3805 3806 return (0); 3807} 3808 3809#ifdef _KERNEL 3810#if defined(sun) 3811/* 3812 * Get the root pool information from the root disk, then import the root pool 3813 * during the system boot up time. 3814 */ 3815extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 3816 3817static nvlist_t * 3818spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) 3819{ 3820 nvlist_t *config; 3821 nvlist_t *nvtop, *nvroot; 3822 uint64_t pgid; 3823 3824 if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) 3825 return (NULL); 3826 3827 /* 3828 * Add this top-level vdev to the child array. 3829 */ 3830 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3831 &nvtop) == 0); 3832 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 3833 &pgid) == 0); 3834 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); 3835 3836 /* 3837 * Put this pool's top-level vdevs into a root vdev. 
3838	 */
3839	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
3840	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
3841	    VDEV_TYPE_ROOT) == 0);
3842	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
3843	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
3844	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
3845	    &nvtop, 1) == 0);
3846
3847	/*
3848	 * Replace the existing vdev_tree with the new root vdev in
3849	 * this pool's configuration (remove the old, add the new).
3850	 */
3851	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
3852	nvlist_free(nvroot);
3853	return (config);
3854}
3855
3856/*
3857 * Walk the vdev tree and see if we can find a device with "better"
3858 * configuration. A configuration is "better" if the label on that
3859 * device has a more recent txg.
3860 */
3861static void
3862spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
3863{
3864	for (int c = 0; c < vd->vdev_children; c++)
3865		spa_alt_rootvdev(vd->vdev_child[c], avd, txg);
3866
3867	if (vd->vdev_ops->vdev_op_leaf) {
3868		nvlist_t *label;
3869		uint64_t label_txg;
3870
3871		if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
3872		    &label) != 0)
3873			return;
3874
3875		VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
3876		    &label_txg) == 0);
3877
3878		/*
3879		 * Do we have a better boot device?
3880		 */
3881		if (label_txg > *txg) {
3882			*txg = label_txg;
3883			*avd = vd;
3884		}
3885		nvlist_free(label);
3886	}
3887}
3888
3889/*
3890 * Import a root pool.
3891 *
3892 * For x86, devpath_list will consist of devid and/or physpath name of
3893 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
3894 * The GRUB "findroot" command will return the vdev we should boot.
3895 *
3896 * For Sparc, devpath_list consists of the physpath name of the booting device,
3897 * no matter whether the rootpool is a single-device pool or a mirrored pool,
3898 * e.g.
3899 *	"/pci@1f,0/ide@d/disk@0,0:a"
3900 */
3901int
3902spa_import_rootpool(char *devpath, char *devid)
3903{
3904	spa_t *spa;
3905	vdev_t *rvd, *bvd, *avd = NULL;
3906	nvlist_t *config, *nvtop;
3907	uint64_t guid, txg;
3908	char *pname;
3909	int error;
3910
3911	/*
3912	 * Read the label from the boot device and generate a configuration.
3913	 */
3914	config = spa_generate_rootconf(devpath, devid, &guid);
3915#if defined(_OBP) && defined(_KERNEL)
3916	if (config == NULL) {
3917		if (strstr(devpath, "/iscsi/ssd") != NULL) {
3918			/* iscsi boot */
3919			get_iscsi_bootpath_phy(devpath);
3920			config = spa_generate_rootconf(devpath, devid, &guid);
3921		}
3922	}
3923#endif
3924	if (config == NULL) {
3925		cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
3926		    devpath);
3927		return (SET_ERROR(EIO));
3928	}
3929
3930	VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
3931	    &pname) == 0);
3932	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
3933
3934	mutex_enter(&spa_namespace_lock);
3935	if ((spa = spa_lookup(pname)) != NULL) {
3936		/*
3937		 * Remove the existing root pool from the namespace so that we
3938		 * can replace it with the correct config we just read in.
3939		 */
3940		spa_remove(spa);
3941	}
3942
3943	spa = spa_add(pname, config, NULL);
3944	spa->spa_is_root = B_TRUE;
3945	spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
3946
3947	/*
3948	 * Build up a vdev tree based on the boot device's label config.
3949 */ 3950 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3951 &nvtop) == 0); 3952 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3953 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 3954 VDEV_ALLOC_ROOTPOOL); 3955 spa_config_exit(spa, SCL_ALL, FTAG); 3956 if (error) { 3957 mutex_exit(&spa_namespace_lock); 3958 nvlist_free(config); 3959 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 3960 pname); 3961 return (error); 3962 } 3963 3964 /* 3965 * Get the boot vdev. 3966 */ 3967 if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 3968 cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", 3969 (u_longlong_t)guid); 3970 error = SET_ERROR(ENOENT); 3971 goto out; 3972 } 3973 3974 /* 3975 * Determine if there is a better boot device. 3976 */ 3977 avd = bvd; 3978 spa_alt_rootvdev(rvd, &avd, &txg); 3979 if (avd != bvd) { 3980 cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " 3981 "try booting from '%s'", avd->vdev_path); 3982 error = SET_ERROR(EINVAL); 3983 goto out; 3984 } 3985 3986 /* 3987 * If the boot device is part of a spare vdev then ensure that 3988 * we're booting off the active spare. 3989 */ 3990 if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && 3991 !bvd->vdev_isspare) { 3992 cmn_err(CE_NOTE, "The boot device is currently spared. Please " 3993 "try booting from '%s'", 3994 bvd->vdev_parent-> 3995 vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); 3996 error = SET_ERROR(EINVAL); 3997 goto out; 3998 } 3999 4000 error = 0; 4001out: 4002 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4003 vdev_free(rvd); 4004 spa_config_exit(spa, SCL_ALL, FTAG); 4005 mutex_exit(&spa_namespace_lock); 4006 4007 nvlist_free(config); 4008 return (error); 4009} 4010 4011#else 4012 4013extern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs, 4014 uint64_t *count); 4015 4016static nvlist_t * 4017spa_generate_rootconf(const char *name) 4018{ 4019 nvlist_t **configs, **tops; 4020 nvlist_t *config; 4021 nvlist_t *best_cfg, *nvtop, *nvroot; 4022 uint64_t *holes; 4023 uint64_t best_txg; 4024 uint64_t nchildren; 4025 uint64_t pgid; 4026 uint64_t count; 4027 uint64_t i; 4028 uint_t nholes; 4029 4030 if (vdev_geom_read_pool_label(name, &configs, &count) != 0) 4031 return (NULL); 4032 4033 ASSERT3U(count, !=, 0); 4034 best_txg = 0; 4035 for (i = 0; i < count; i++) { 4036 uint64_t txg; 4037 4038 VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG, 4039 &txg) == 0); 4040 if (txg > best_txg) { 4041 best_txg = txg; 4042 best_cfg = configs[i]; 4043 } 4044 } 4045 4046 /* 4047 * Multi-vdev root pool configuration discovery is not supported yet. 
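 * (What we can reconstruct is the geometry the best label records:
 * ZPOOL_CONFIG_VDEV_CHILDREN tells us how many top-level vdevs the
 * pool has, so the code below sizes the child array accordingly and
 * plugs every slot it cannot discover with a "hole" or "missing"
 * placeholder vdev, keeping the surviving top-level vdev ids stable.)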
4048 */ 4049 nchildren = 1; 4050 nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren); 4051 holes = NULL; 4052 nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY, 4053 &holes, &nholes); 4054 4055 tops = kmem_zalloc(nchildren * sizeof(void *), KM_SLEEP); 4056 for (i = 0; i < nchildren; i++) { 4057 if (i >= count) 4058 break; 4059 if (configs[i] == NULL) 4060 continue; 4061 VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE, 4062 &nvtop) == 0); 4063 nvlist_dup(nvtop, &tops[i], KM_SLEEP); 4064 } 4065 for (i = 0; holes != NULL && i < nholes; i++) { 4066 if (i >= nchildren) 4067 continue; 4068 if (tops[holes[i]] != NULL) 4069 continue; 4070 nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP); 4071 VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE, 4072 VDEV_TYPE_HOLE) == 0); 4073 VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID, 4074 holes[i]) == 0); 4075 VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID, 4076 0) == 0); 4077 } 4078 for (i = 0; i < nchildren; i++) { 4079 if (tops[i] != NULL) 4080 continue; 4081 nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP); 4082 VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE, 4083 VDEV_TYPE_MISSING) == 0); 4084 VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID, 4085 i) == 0); 4086 VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID, 4087 0) == 0); 4088 } 4089 4090 /* 4091 * Create pool config based on the best vdev config. 4092 */ 4093 nvlist_dup(best_cfg, &config, KM_SLEEP); 4094 4095 /* 4096 * Put this pool's top-level vdevs into a root vdev. 4097 */ 4098 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 4099 &pgid) == 0); 4100 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 4101 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 4102 VDEV_TYPE_ROOT) == 0); 4103 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 4104 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 4105 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 4106 tops, nchildren) == 0); 4107 4108 /* 4109 * Replace the existing vdev_tree with the new root vdev in 4110 * this pool's configuration (remove the old, add the new). 4111 */ 4112 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 4113 4114 /* 4115 * Drop vdev config elements that should not be present at pool level. 4116 */ 4117 nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64); 4118 nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64); 4119 4120 for (i = 0; i < count; i++) 4121 nvlist_free(configs[i]); 4122 kmem_free(configs, count * sizeof(void *)); 4123 for (i = 0; i < nchildren; i++) 4124 nvlist_free(tops[i]); 4125 kmem_free(tops, nchildren * sizeof(void *)); 4126 nvlist_free(nvroot); 4127 return (config); 4128} 4129 4130int 4131spa_import_rootpool(const char *name) 4132{ 4133 spa_t *spa; 4134 vdev_t *rvd, *bvd, *avd = NULL; 4135 nvlist_t *config, *nvtop; 4136 uint64_t txg; 4137 char *pname; 4138 int error; 4139 4140 /* 4141 * Read the label from the boot device and generate a configuration. 
4142 */ 4143 config = spa_generate_rootconf(name); 4144 4145 mutex_enter(&spa_namespace_lock); 4146 if (config != NULL) { 4147 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 4148 &pname) == 0 && strcmp(name, pname) == 0); 4149 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) 4150 == 0); 4151 4152 if ((spa = spa_lookup(pname)) != NULL) { 4153 /* 4154 * Remove the existing root pool from the namespace so 4155 * that we can replace it with the correct config 4156 * we just read in. 4157 */ 4158 spa_remove(spa); 4159 } 4160 spa = spa_add(pname, config, NULL); 4161 4162 /* 4163 * Set spa_ubsync.ub_version as it can be used in vdev_alloc() 4164 * via spa_version(). 4165 */ 4166 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 4167 &spa->spa_ubsync.ub_version) != 0) 4168 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 4169 } else if ((spa = spa_lookup(name)) == NULL) { 4170 mutex_exit(&spa_namespace_lock); 4171 nvlist_free(config); 4172 cmn_err(CE_NOTE, "Cannot find the pool label for '%s'", 4173 name); 4174 return (EIO); 4175 } else { 4176 VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0); 4177 } 4178 spa->spa_is_root = B_TRUE; 4179 spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 4180 4181 /* 4182 * Build up a vdev tree based on the boot device's label config. 4183 */ 4184 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 4185 &nvtop) == 0); 4186 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4187 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 4188 VDEV_ALLOC_ROOTPOOL); 4189 spa_config_exit(spa, SCL_ALL, FTAG); 4190 if (error) { 4191 mutex_exit(&spa_namespace_lock); 4192 nvlist_free(config); 4193 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 4194 pname); 4195 return (error); 4196 } 4197 4198 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4199 vdev_free(rvd); 4200 spa_config_exit(spa, SCL_ALL, FTAG); 4201 mutex_exit(&spa_namespace_lock); 4202 4203 nvlist_free(config); 4204 return (0); 4205} 4206 4207#endif /* sun */ 4208#endif 4209 4210/* 4211 * Import a non-root pool into the system. 4212 */ 4213int 4214spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 4215{ 4216 spa_t *spa; 4217 char *altroot = NULL; 4218 spa_load_state_t state = SPA_LOAD_IMPORT; 4219 zpool_rewind_policy_t policy; 4220 uint64_t mode = spa_mode_global; 4221 uint64_t readonly = B_FALSE; 4222 int error; 4223 nvlist_t *nvroot; 4224 nvlist_t **spares, **l2cache; 4225 uint_t nspares, nl2cache; 4226 4227 /* 4228 * If a pool with this name exists, return failure. 4229 */ 4230 mutex_enter(&spa_namespace_lock); 4231 if (spa_lookup(pool) != NULL) { 4232 mutex_exit(&spa_namespace_lock); 4233 return (SET_ERROR(EEXIST)); 4234 } 4235 4236 /* 4237 * Create and initialize the spa structure. 4238 */ 4239 (void) nvlist_lookup_string(props, 4240 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 4241 (void) nvlist_lookup_uint64(props, 4242 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 4243 if (readonly) 4244 mode = FREAD; 4245 spa = spa_add(pool, config, altroot); 4246 spa->spa_import_flags = flags; 4247 4248 /* 4249 * Verbatim import - Take a pool and insert it into the namespace 4250 * as if it had been loaded at boot. 
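 * (A verbatim import does no vdev validation, no rewind and no label
 * reconciliation -- the caller vouches for the config.  The root-pool
 * import paths above rely on exactly this behavior.)  A hedged
 * caller-side sketch; the pool name and the "config" nvlist are
 * assumed to come from a source the caller already trusts:
 */
#if 0
	/* Illustrative only; not part of spa.c. */
	error = spa_import("tank", config, NULL, ZFS_IMPORT_VERBATIM);
#endif
/*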
4251 */ 4252 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 4253 if (props != NULL) 4254 spa_configfile_set(spa, props, B_FALSE); 4255 4256 spa_config_sync(spa, B_FALSE, B_TRUE); 4257 spa_event_notify(spa, NULL, ESC_ZFS_POOL_IMPORT); 4258 4259 mutex_exit(&spa_namespace_lock); 4260 return (0); 4261 } 4262 4263 spa_activate(spa, mode); 4264 4265 /* 4266 * Don't start async tasks until we know everything is healthy. 4267 */ 4268 spa_async_suspend(spa); 4269 4270 zpool_get_rewind_policy(config, &policy); 4271 if (policy.zrp_request & ZPOOL_DO_REWIND) 4272 state = SPA_LOAD_RECOVER; 4273 4274 /* 4275 * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig 4276 * because the user-supplied config is actually the one to trust when 4277 * doing an import. 4278 */ 4279 if (state != SPA_LOAD_RECOVER) 4280 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 4281 4282 error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, 4283 policy.zrp_request); 4284 4285 /* 4286 * Propagate anything learned while loading the pool and pass it 4287 * back to caller (i.e. rewind info, missing devices, etc). 4288 */ 4289 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4290 spa->spa_load_info) == 0); 4291 4292 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4293 /* 4294 * Toss any existing sparelist, as it doesn't have any validity 4295 * anymore, and conflicts with spa_has_spare(). 4296 */ 4297 if (spa->spa_spares.sav_config) { 4298 nvlist_free(spa->spa_spares.sav_config); 4299 spa->spa_spares.sav_config = NULL; 4300 spa_load_spares(spa); 4301 } 4302 if (spa->spa_l2cache.sav_config) { 4303 nvlist_free(spa->spa_l2cache.sav_config); 4304 spa->spa_l2cache.sav_config = NULL; 4305 spa_load_l2cache(spa); 4306 } 4307 4308 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 4309 &nvroot) == 0); 4310 if (error == 0) 4311 error = spa_validate_aux(spa, nvroot, -1ULL, 4312 VDEV_ALLOC_SPARE); 4313 if (error == 0) 4314 error = spa_validate_aux(spa, nvroot, -1ULL, 4315 VDEV_ALLOC_L2CACHE); 4316 spa_config_exit(spa, SCL_ALL, FTAG); 4317 4318 if (props != NULL) 4319 spa_configfile_set(spa, props, B_FALSE); 4320 4321 if (error != 0 || (props && spa_writeable(spa) && 4322 (error = spa_prop_set(spa, props)))) { 4323 spa_unload(spa); 4324 spa_deactivate(spa); 4325 spa_remove(spa); 4326 mutex_exit(&spa_namespace_lock); 4327 return (error); 4328 } 4329 4330 spa_async_resume(spa); 4331 4332 /* 4333 * Override any spares and level 2 cache devices as specified by 4334 * the user, as these may have correct device names/devids, etc. 
4335 */ 4336 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 4337 &spares, &nspares) == 0) { 4338 if (spa->spa_spares.sav_config) 4339 VERIFY(nvlist_remove(spa->spa_spares.sav_config, 4340 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 4341 else 4342 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 4343 NV_UNIQUE_NAME, KM_SLEEP) == 0); 4344 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 4345 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 4346 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4347 spa_load_spares(spa); 4348 spa_config_exit(spa, SCL_ALL, FTAG); 4349 spa->spa_spares.sav_sync = B_TRUE; 4350 } 4351 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 4352 &l2cache, &nl2cache) == 0) { 4353 if (spa->spa_l2cache.sav_config) 4354 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 4355 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 4356 else 4357 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 4358 NV_UNIQUE_NAME, KM_SLEEP) == 0); 4359 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 4360 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 4361 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4362 spa_load_l2cache(spa); 4363 spa_config_exit(spa, SCL_ALL, FTAG); 4364 spa->spa_l2cache.sav_sync = B_TRUE; 4365 } 4366 4367 /* 4368 * Check for any removed devices. 4369 */ 4370 if (spa->spa_autoreplace) { 4371 spa_aux_check_removed(&spa->spa_spares); 4372 spa_aux_check_removed(&spa->spa_l2cache); 4373 } 4374 4375 if (spa_writeable(spa)) { 4376 /* 4377 * Update the config cache to include the newly-imported pool. 4378 */ 4379 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4380 } 4381 4382 /* 4383 * It's possible that the pool was expanded while it was exported. 4384 * We kick off an async task to handle this for us. 4385 */ 4386 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 4387 4388 spa_history_log_version(spa, "import"); 4389 4390 spa_event_notify(spa, NULL, ESC_ZFS_POOL_IMPORT); 4391 4392 mutex_exit(&spa_namespace_lock); 4393 4394#ifdef __FreeBSD__ 4395#ifdef _KERNEL 4396 zvol_create_minors(pool); 4397#endif 4398#endif 4399 return (0); 4400} 4401 4402nvlist_t * 4403spa_tryimport(nvlist_t *tryconfig) 4404{ 4405 nvlist_t *config = NULL; 4406 char *poolname; 4407 spa_t *spa; 4408 uint64_t state; 4409 int error; 4410 4411 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 4412 return (NULL); 4413 4414 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 4415 return (NULL); 4416 4417 /* 4418 * Create and initialize the spa structure. 4419 */ 4420 mutex_enter(&spa_namespace_lock); 4421 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 4422 spa_activate(spa, FREAD); 4423 4424 /* 4425 * Pass off the heavy lifting to spa_load(). 4426 * Pass TRUE for mosconfig because the user-supplied config 4427 * is actually the one to trust when doing an import. 4428 */ 4429 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); 4430 4431 /* 4432 * If 'tryconfig' was at least parsable, return the current config. 
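 * (spa_tryimport() never commits anything: the pool is loaded under
 * the reserved tryimport name, inspected, and torn down again before
 * we return.)  A hedged caller-side sketch of the contract, where
 * "tryconfig" is a label-derived config nvlist:
 */
#if 0
	/* Illustrative only: probe a config without importing it. */
	nvlist_t *out = spa_tryimport(tryconfig);
	if (out != NULL) {
		/* examine ZPOOL_CONFIG_LOAD_INFO, bootfs, spares, etc. */
		nvlist_free(out);
	}
#endif
/*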
4433 */ 4434 if (spa->spa_root_vdev != NULL) { 4435 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 4436 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 4437 poolname) == 0); 4438 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 4439 state) == 0); 4440 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 4441 spa->spa_uberblock.ub_timestamp) == 0); 4442 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4443 spa->spa_load_info) == 0); 4444 4445 /* 4446 * If the bootfs property exists on this pool then we 4447 * copy it out so that external consumers can tell which 4448 * pools are bootable. 4449 */ 4450 if ((!error || error == EEXIST) && spa->spa_bootfs) { 4451 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 4452 4453 /* 4454 * We have to play games with the name since the 4455 * pool was opened as TRYIMPORT_NAME. 4456 */ 4457 if (dsl_dsobj_to_dsname(spa_name(spa), 4458 spa->spa_bootfs, tmpname) == 0) { 4459 char *cp; 4460 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 4461 4462 cp = strchr(tmpname, '/'); 4463 if (cp == NULL) { 4464 (void) strlcpy(dsname, tmpname, 4465 MAXPATHLEN); 4466 } else { 4467 (void) snprintf(dsname, MAXPATHLEN, 4468 "%s/%s", poolname, ++cp); 4469 } 4470 VERIFY(nvlist_add_string(config, 4471 ZPOOL_CONFIG_BOOTFS, dsname) == 0); 4472 kmem_free(dsname, MAXPATHLEN); 4473 } 4474 kmem_free(tmpname, MAXPATHLEN); 4475 } 4476 4477 /* 4478 * Add the list of hot spares and level 2 cache devices. 4479 */ 4480 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4481 spa_add_spares(spa, config); 4482 spa_add_l2cache(spa, config); 4483 spa_config_exit(spa, SCL_CONFIG, FTAG); 4484 } 4485 4486 spa_unload(spa); 4487 spa_deactivate(spa); 4488 spa_remove(spa); 4489 mutex_exit(&spa_namespace_lock); 4490 4491 return (config); 4492} 4493 4494/* 4495 * Pool export/destroy 4496 * 4497 * The act of destroying or exporting a pool is very simple. We make sure there 4498 * is no more pending I/O and any references to the pool are gone. Then, we 4499 * update the pool state and sync all the labels to disk, removing the 4500 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 4501 * we don't sync the labels or remove the configuration cache. 4502 */ 4503static int 4504spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 4505 boolean_t force, boolean_t hardforce) 4506{ 4507 spa_t *spa; 4508 4509 if (oldconfig) 4510 *oldconfig = NULL; 4511 4512 if (!(spa_mode_global & FWRITE)) 4513 return (SET_ERROR(EROFS)); 4514 4515 mutex_enter(&spa_namespace_lock); 4516 if ((spa = spa_lookup(pool)) == NULL) { 4517 mutex_exit(&spa_namespace_lock); 4518 return (SET_ERROR(ENOENT)); 4519 } 4520 4521 /* 4522 * Put a hold on the pool, drop the namespace lock, stop async tasks, 4523 * reacquire the namespace lock, and see if we can export. 4524 */ 4525 spa_open_ref(spa, FTAG); 4526 mutex_exit(&spa_namespace_lock); 4527 spa_async_suspend(spa); 4528 mutex_enter(&spa_namespace_lock); 4529 spa_close(spa, FTAG); 4530 4531 /* 4532 * The pool will be in core if it's openable, 4533 * in which case we can modify its state. 4534 */ 4535 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 4536 /* 4537 * Objsets may be open only because they're dirty, so we 4538 * have to force it to sync before checking spa_refcnt. 4539 */ 4540 txg_wait_synced(spa->spa_dsl_pool, 0); 4541 spa_evicting_os_wait(spa); 4542 4543 /* 4544 * A pool cannot be exported or destroyed if there are active 4545 * references. 
If we are resetting a pool, allow references by 4546 * fault injection handlers. 4547 */ 4548 if (!spa_refcount_zero(spa) || 4549 (spa->spa_inject_ref != 0 && 4550 new_state != POOL_STATE_UNINITIALIZED)) { 4551 spa_async_resume(spa); 4552 mutex_exit(&spa_namespace_lock); 4553 return (SET_ERROR(EBUSY)); 4554 } 4555 4556 /* 4557 * A pool cannot be exported if it has an active shared spare. 4558 * This prevents other pools from stealing the active spare 4559 * from an exported pool. If the user insists, such a pool can 4560 * still be forcibly exported. 4561 */ 4562 if (!force && new_state == POOL_STATE_EXPORTED && 4563 spa_has_active_shared_spare(spa)) { 4564 spa_async_resume(spa); 4565 mutex_exit(&spa_namespace_lock); 4566 return (SET_ERROR(EXDEV)); 4567 } 4568 4569 /* 4570 * We want this to be reflected on every label, 4571 * so mark them all dirty. spa_unload() will do the 4572 * final sync that pushes these changes out. 4573 */ 4574 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 4575 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4576 spa->spa_state = new_state; 4577 spa->spa_final_txg = spa_last_synced_txg(spa) + 4578 TXG_DEFER_SIZE + 1; 4579 vdev_config_dirty(spa->spa_root_vdev); 4580 spa_config_exit(spa, SCL_ALL, FTAG); 4581 } 4582 } 4583 4584 spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 4585 4586 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 4587 spa_unload(spa); 4588 spa_deactivate(spa); 4589 } 4590 4591 if (oldconfig && spa->spa_config) 4592 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 4593 4594 if (new_state != POOL_STATE_UNINITIALIZED) { 4595 if (!hardforce) 4596 spa_config_sync(spa, B_TRUE, B_TRUE); 4597 spa_remove(spa); 4598 } 4599 mutex_exit(&spa_namespace_lock); 4600 4601 return (0); 4602} 4603 4604/* 4605 * Destroy a storage pool. 4606 */ 4607int 4608spa_destroy(char *pool) 4609{ 4610 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 4611 B_FALSE, B_FALSE)); 4612} 4613 4614/* 4615 * Export a storage pool. 4616 */ 4617int 4618spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, 4619 boolean_t hardforce) 4620{ 4621 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 4622 force, hardforce)); 4623} 4624 4625/* 4626 * Similar to spa_export(), this unloads the spa_t without actually removing it 4627 * from the namespace in any way. 4628 */ 4629int 4630spa_reset(char *pool) 4631{ 4632 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 4633 B_FALSE, B_FALSE)); 4634} 4635 4636/* 4637 * ========================================================================== 4638 * Device manipulation 4639 * ========================================================================== 4640 */ 4641 4642/* 4643 * Add a device to a storage pool. 
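 * (The caller describes only the vdevs being added, using the same
 * shape as a create-time config: a root vdev whose children are the
 * new top-levels, plus optional "spares"/"l2cache" arrays.)  A hedged
 * sketch of a minimal nvroot for one new disk -- the nvlist keys are
 * real, but the device path and variable names are only examples:
 */
#if 0
	/* Illustrative only: nvroot describing a single new disk vdev. */
	nvlist_t *disk, *nvroot;

	VERIFY(nvlist_alloc(&disk, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_string(disk, ZPOOL_CONFIG_TYPE,
	    VDEV_TYPE_DISK) == 0);
	VERIFY(nvlist_add_string(disk, ZPOOL_CONFIG_PATH, "/dev/da1") == 0);
	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
	    VDEV_TYPE_ROOT) == 0);
	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
	    &disk, 1) == 0);
	error = spa_vdev_add(spa, nvroot);
#endif
/*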
4644 */ 4645int 4646spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 4647{ 4648 uint64_t txg, id; 4649 int error; 4650 vdev_t *rvd = spa->spa_root_vdev; 4651 vdev_t *vd, *tvd; 4652 nvlist_t **spares, **l2cache; 4653 uint_t nspares, nl2cache; 4654 4655 ASSERT(spa_writeable(spa)); 4656 4657 txg = spa_vdev_enter(spa); 4658 4659 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 4660 VDEV_ALLOC_ADD)) != 0) 4661 return (spa_vdev_exit(spa, NULL, txg, error)); 4662 4663 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 4664 4665 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 4666 &nspares) != 0) 4667 nspares = 0; 4668 4669 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 4670 &nl2cache) != 0) 4671 nl2cache = 0; 4672 4673 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 4674 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 4675 4676 if (vd->vdev_children != 0 && 4677 (error = vdev_create(vd, txg, B_FALSE)) != 0) 4678 return (spa_vdev_exit(spa, vd, txg, error)); 4679 4680 /* 4681 * We must validate the spares and l2cache devices after checking the 4682 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 4683 */ 4684 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 4685 return (spa_vdev_exit(spa, vd, txg, error)); 4686 4687 /* 4688 * Transfer each new top-level vdev from vd to rvd. 4689 */ 4690 for (int c = 0; c < vd->vdev_children; c++) { 4691 4692 /* 4693 * Set the vdev id to the first hole, if one exists. 4694 */ 4695 for (id = 0; id < rvd->vdev_children; id++) { 4696 if (rvd->vdev_child[id]->vdev_ishole) { 4697 vdev_free(rvd->vdev_child[id]); 4698 break; 4699 } 4700 } 4701 tvd = vd->vdev_child[c]; 4702 vdev_remove_child(vd, tvd); 4703 tvd->vdev_id = id; 4704 vdev_add_child(rvd, tvd); 4705 vdev_config_dirty(tvd); 4706 } 4707 4708 if (nspares != 0) { 4709 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 4710 ZPOOL_CONFIG_SPARES); 4711 spa_load_spares(spa); 4712 spa->spa_spares.sav_sync = B_TRUE; 4713 } 4714 4715 if (nl2cache != 0) { 4716 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 4717 ZPOOL_CONFIG_L2CACHE); 4718 spa_load_l2cache(spa); 4719 spa->spa_l2cache.sav_sync = B_TRUE; 4720 } 4721 4722 /* 4723 * We have to be careful when adding new vdevs to an existing pool. 4724 * If other threads start allocating from these vdevs before we 4725 * sync the config cache, and we lose power, then upon reboot we may 4726 * fail to open the pool because there are DVAs that the config cache 4727 * can't translate. Therefore, we first add the vdevs without 4728 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 4729 * and then let spa_config_update() initialize the new metaslabs. 4730 * 4731 * spa_load() checks for added-but-not-initialized vdevs, so that 4732 * if we lose power at any point in this sequence, the remaining 4733 * steps will be completed the next time we load the pool. 4734 */ 4735 (void) spa_vdev_exit(spa, vd, txg, 0); 4736 4737 mutex_enter(&spa_namespace_lock); 4738 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4739 spa_event_notify(spa, NULL, ESC_ZFS_VDEV_ADD); 4740 mutex_exit(&spa_namespace_lock); 4741 4742 return (0); 4743} 4744 4745/* 4746 * Attach a device to a mirror. The arguments are the path to any device 4747 * in the mirror, and the nvroot for the new device. If the path specifies 4748 * a device that is not mirrored, we automatically insert the mirror vdev. 
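 * (So attaching to a plain disk A yields mirror(A,B), and attaching to
 * an existing mirror simply widens it by one child.)  A hedged
 * caller-side sketch; "oldvd_guid" and "nvroot" are assumed caller
 * locals, with nvroot shaped as in spa_vdev_add():
 */
#if 0
	/* Illustrative only: replacing = B_FALSE grows a mirror. */
	error = spa_vdev_attach(spa, oldvd_guid, nvroot, B_FALSE);
#endif
/*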
4749 * 4750 * If 'replacing' is specified, the new device is intended to replace the 4751 * existing device; in this case the two devices are made into their own 4752 * mirror using the 'replacing' vdev, which is functionally identical to 4753 * the mirror vdev (it actually reuses all the same ops) but has a few 4754 * extra rules: you can't attach to it after it's been created, and upon 4755 * completion of resilvering, the first disk (the one being replaced) 4756 * is automatically detached. 4757 */ 4758int 4759spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 4760{ 4761 uint64_t txg, dtl_max_txg; 4762 vdev_t *rvd = spa->spa_root_vdev; 4763 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 4764 vdev_ops_t *pvops; 4765 char *oldvdpath, *newvdpath; 4766 int newvd_isspare; 4767 int error; 4768 4769 ASSERT(spa_writeable(spa)); 4770 4771 txg = spa_vdev_enter(spa); 4772 4773 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 4774 4775 if (oldvd == NULL) 4776 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4777 4778 if (!oldvd->vdev_ops->vdev_op_leaf) 4779 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4780 4781 pvd = oldvd->vdev_parent; 4782 4783 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 4784 VDEV_ALLOC_ATTACH)) != 0) 4785 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4786 4787 if (newrootvd->vdev_children != 1) 4788 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 4789 4790 newvd = newrootvd->vdev_child[0]; 4791 4792 if (!newvd->vdev_ops->vdev_op_leaf) 4793 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 4794 4795 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 4796 return (spa_vdev_exit(spa, newrootvd, txg, error)); 4797 4798 /* 4799 * Spares can't replace logs 4800 */ 4801 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 4802 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4803 4804 if (!replacing) { 4805 /* 4806 * For attach, the only allowable parent is a mirror or the root 4807 * vdev. 4808 */ 4809 if (pvd->vdev_ops != &vdev_mirror_ops && 4810 pvd->vdev_ops != &vdev_root_ops) 4811 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4812 4813 pvops = &vdev_mirror_ops; 4814 } else { 4815 /* 4816 * Active hot spares can only be replaced by inactive hot 4817 * spares. 4818 */ 4819 if (pvd->vdev_ops == &vdev_spare_ops && 4820 oldvd->vdev_isspare && 4821 !spa_has_spare(spa, newvd->vdev_guid)) 4822 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4823 4824 /* 4825 * If the source is a hot spare, and the parent isn't already a 4826 * spare, then we want to create a new hot spare. Otherwise, we 4827 * want to create a replacing vdev. The user is not allowed to 4828 * attach to a spared vdev child unless the 'isspare' state is 4829 * the same (spare replaces spare, non-spare replaces 4830 * non-spare). 4831 */ 4832 if (pvd->vdev_ops == &vdev_replacing_ops && 4833 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 4834 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4835 } else if (pvd->vdev_ops == &vdev_spare_ops && 4836 newvd->vdev_isspare != oldvd->vdev_isspare) { 4837 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4838 } 4839 4840 if (newvd->vdev_isspare) 4841 pvops = &vdev_spare_ops; 4842 else 4843 pvops = &vdev_replacing_ops; 4844 } 4845 4846 /* 4847 * Make sure the new device is big enough. 
4848 */ 4849 if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 4850 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 4851 4852 /* 4853 * The new device cannot have a higher alignment requirement 4854 * than the top-level vdev. 4855 */ 4856 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 4857 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 4858 4859 /* 4860 * If this is an in-place replacement, update oldvd's path and devid 4861 * to make it distinguishable from newvd, and unopenable from now on. 4862 */ 4863 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 4864 spa_strfree(oldvd->vdev_path); 4865 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 4866 KM_SLEEP); 4867 (void) sprintf(oldvd->vdev_path, "%s/%s", 4868 newvd->vdev_path, "old"); 4869 if (oldvd->vdev_devid != NULL) { 4870 spa_strfree(oldvd->vdev_devid); 4871 oldvd->vdev_devid = NULL; 4872 } 4873 } 4874 4875 /* mark the device being resilvered */ 4876 newvd->vdev_resilver_txg = txg; 4877 4878 /* 4879 * If the parent is not a mirror, or if we're replacing, insert the new 4880 * mirror/replacing/spare vdev above oldvd. 4881 */ 4882 if (pvd->vdev_ops != pvops) 4883 pvd = vdev_add_parent(oldvd, pvops); 4884 4885 ASSERT(pvd->vdev_top->vdev_parent == rvd); 4886 ASSERT(pvd->vdev_ops == pvops); 4887 ASSERT(oldvd->vdev_parent == pvd); 4888 4889 /* 4890 * Extract the new device from its root and add it to pvd. 4891 */ 4892 vdev_remove_child(newrootvd, newvd); 4893 newvd->vdev_id = pvd->vdev_children; 4894 newvd->vdev_crtxg = oldvd->vdev_crtxg; 4895 vdev_add_child(pvd, newvd); 4896 4897 tvd = newvd->vdev_top; 4898 ASSERT(pvd->vdev_top == tvd); 4899 ASSERT(tvd->vdev_parent == rvd); 4900 4901 vdev_config_dirty(tvd); 4902 4903 /* 4904 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 4905 * for any dmu_sync-ed blocks. It will propagate upward when 4906 * spa_vdev_exit() calls vdev_dtl_reassess(). 4907 */ 4908 dtl_max_txg = txg + TXG_CONCURRENT_STATES; 4909 4910 vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, 4911 dtl_max_txg - TXG_INITIAL); 4912 4913 if (newvd->vdev_isspare) { 4914 spa_spare_activate(newvd); 4915 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); 4916 } 4917 4918 oldvdpath = spa_strdup(oldvd->vdev_path); 4919 newvdpath = spa_strdup(newvd->vdev_path); 4920 newvd_isspare = newvd->vdev_isspare; 4921 4922 /* 4923 * Mark newvd's DTL dirty in this txg. 4924 */ 4925 vdev_dirty(tvd, VDD_DTL, newvd, txg); 4926 4927 /* 4928 * Schedule the resilver to restart in the future. We do this to 4929 * ensure that dmu_sync-ed blocks have been stitched into the 4930 * respective datasets. 4931 */ 4932 dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); 4933 4934 if (spa->spa_bootfs) 4935 spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH); 4936 4937 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_ATTACH); 4938 4939 /* 4940 * Commit the config 4941 */ 4942 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); 4943 4944 spa_history_log_internal(spa, "vdev attach", NULL, 4945 "%s vdev=%s %s vdev=%s", 4946 replacing && newvd_isspare ? "spare in" : 4947 replacing ? "replace" : "attach", newvdpath, 4948 replacing ? "for" : "to", oldvdpath); 4949 4950 spa_strfree(oldvdpath); 4951 spa_strfree(newvdpath); 4952 4953 return (0); 4954} 4955 4956/* 4957 * Detach a device from a mirror or replacing vdev. 4958 * 4959 * If 'replace_done' is specified, only detach if the parent 4960 * is a replacing vdev. 
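 * (replace_done is how spa_vdev_resilver_done() asks us to detach a
 * child only if it is still sitting under a replacing or spare vdev.)
 * A hedged caller-side sketch; "vd_guid" and "parent_guid" are assumed
 * caller locals, and passing the expected parent guid closes the race
 * discussed in the comment below:
 */
#if 0
	/* Illustrative only. */
	error = spa_vdev_detach(spa, vd_guid, parent_guid, B_FALSE);
#endif
/*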
4961 */ 4962int 4963spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 4964{ 4965 uint64_t txg; 4966 int error; 4967 vdev_t *rvd = spa->spa_root_vdev; 4968 vdev_t *vd, *pvd, *cvd, *tvd; 4969 boolean_t unspare = B_FALSE; 4970 uint64_t unspare_guid = 0; 4971 char *vdpath; 4972 4973 ASSERT(spa_writeable(spa)); 4974 4975 txg = spa_vdev_enter(spa); 4976 4977 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 4978 4979 if (vd == NULL) 4980 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4981 4982 if (!vd->vdev_ops->vdev_op_leaf) 4983 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4984 4985 pvd = vd->vdev_parent; 4986 4987 /* 4988 * If the parent/child relationship is not as expected, don't do it. 4989 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 4990 * vdev that's replacing B with C. The user's intent in replacing 4991 * is to go from M(A,B) to M(A,C). If the user decides to cancel 4992 * the replace by detaching C, the expected behavior is to end up 4993 * M(A,B). But suppose that right after deciding to detach C, 4994 * the replacement of B completes. We would have M(A,C), and then 4995 * ask to detach C, which would leave us with just A -- not what 4996 * the user wanted. To prevent this, we make sure that the 4997 * parent/child relationship hasn't changed -- in this example, 4998 * that C's parent is still the replacing vdev R. 4999 */ 5000 if (pvd->vdev_guid != pguid && pguid != 0) 5001 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 5002 5003 /* 5004 * Only 'replacing' or 'spare' vdevs can be replaced. 5005 */ 5006 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 5007 pvd->vdev_ops != &vdev_spare_ops) 5008 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 5009 5010 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 5011 spa_version(spa) >= SPA_VERSION_SPARES); 5012 5013 /* 5014 * Only mirror, replacing, and spare vdevs support detach. 5015 */ 5016 if (pvd->vdev_ops != &vdev_replacing_ops && 5017 pvd->vdev_ops != &vdev_mirror_ops && 5018 pvd->vdev_ops != &vdev_spare_ops) 5019 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 5020 5021 /* 5022 * If this device has the only valid copy of some data, 5023 * we cannot safely detach it. 5024 */ 5025 if (vdev_dtl_required(vd)) 5026 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 5027 5028 ASSERT(pvd->vdev_children >= 2); 5029 5030 /* 5031 * If we are detaching the second disk from a replacing vdev, then 5032 * check to see if we changed the original vdev's path to have "/old" 5033 * at the end in spa_vdev_attach(). If so, undo that change now. 5034 */ 5035 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 5036 vd->vdev_path != NULL) { 5037 size_t len = strlen(vd->vdev_path); 5038 5039 for (int c = 0; c < pvd->vdev_children; c++) { 5040 cvd = pvd->vdev_child[c]; 5041 5042 if (cvd == vd || cvd->vdev_path == NULL) 5043 continue; 5044 5045 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 5046 strcmp(cvd->vdev_path + len, "/old") == 0) { 5047 spa_strfree(cvd->vdev_path); 5048 cvd->vdev_path = spa_strdup(vd->vdev_path); 5049 break; 5050 } 5051 } 5052 } 5053 5054 /* 5055 * If we are detaching the original disk from a spare, then it implies 5056 * that the spare should become a real disk, and be removed from the 5057 * active spare list for the pool. 5058 */ 5059 if (pvd->vdev_ops == &vdev_spare_ops && 5060 vd->vdev_id == 0 && 5061 pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) 5062 unspare = B_TRUE; 5063 5064 /* 5065 * Erase the disk labels so the disk can be used for other things. 
5066 * This must be done after all other error cases are handled, 5067 * but before we disembowel vd (so we can still do I/O to it). 5068 * But if we can't do it, don't treat the error as fatal -- 5069 * it may be that the unwritability of the disk is the reason 5070 * it's being detached! 5071 */ 5072 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 5073 5074 /* 5075 * Remove vd from its parent and compact the parent's children. 5076 */ 5077 vdev_remove_child(pvd, vd); 5078 vdev_compact_children(pvd); 5079 5080 /* 5081 * Remember one of the remaining children so we can get tvd below. 5082 */ 5083 cvd = pvd->vdev_child[pvd->vdev_children - 1]; 5084 5085 /* 5086 * If we need to remove the remaining child from the list of hot spares, 5087 * do it now, marking the vdev as no longer a spare in the process. 5088 * We must do this before vdev_remove_parent(), because that can 5089 * change the GUID if it creates a new toplevel GUID. For a similar 5090 * reason, we must remove the spare now, in the same txg as the detach; 5091 * otherwise someone could attach a new sibling, change the GUID, and 5092 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 5093 */ 5094 if (unspare) { 5095 ASSERT(cvd->vdev_isspare); 5096 spa_spare_remove(cvd); 5097 unspare_guid = cvd->vdev_guid; 5098 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 5099 cvd->vdev_unspare = B_TRUE; 5100 } 5101 5102 /* 5103 * If the parent mirror/replacing vdev only has one child, 5104 * the parent is no longer needed. Remove it from the tree. 5105 */ 5106 if (pvd->vdev_children == 1) { 5107 if (pvd->vdev_ops == &vdev_spare_ops) 5108 cvd->vdev_unspare = B_FALSE; 5109 vdev_remove_parent(cvd); 5110 } 5111 5112 5113 /* 5114 * We don't set tvd until now because the parent we just removed 5115 * may have been the previous top-level vdev. 5116 */ 5117 tvd = cvd->vdev_top; 5118 ASSERT(tvd->vdev_parent == rvd); 5119 5120 /* 5121 * Reevaluate the parent vdev state. 5122 */ 5123 vdev_propagate_state(cvd); 5124 5125 /* 5126 * If the 'autoexpand' property is set on the pool then automatically 5127 * try to expand the size of the pool. For example if the device we 5128 * just detached was smaller than the others, it may be possible to 5129 * add metaslabs (i.e. grow the pool). We need to reopen the vdev 5130 * first so that we can obtain the updated sizes of the leaf vdevs. 5131 */ 5132 if (spa->spa_autoexpand) { 5133 vdev_reopen(tvd); 5134 vdev_expand(tvd, txg); 5135 } 5136 5137 vdev_config_dirty(tvd); 5138 5139 /* 5140 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 5141 * vd->vdev_detached is set and free vd's DTL object in syncing context. 5142 * But first make sure we're not on any *other* txg's DTL list, to 5143 * prevent vd from being accessed after it's freed. 5144 */ 5145 vdpath = spa_strdup(vd->vdev_path); 5146 for (int t = 0; t < TXG_SIZE; t++) 5147 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 5148 vd->vdev_detached = B_TRUE; 5149 vdev_dirty(tvd, VDD_DTL, vd, txg); 5150 5151 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 5152 5153 /* hang on to the spa before we release the lock */ 5154 spa_open_ref(spa, FTAG); 5155 5156 error = spa_vdev_exit(spa, vd, txg, 0); 5157 5158 spa_history_log_internal(spa, "detach", NULL, 5159 "vdev=%s", vdpath); 5160 spa_strfree(vdpath); 5161 5162 /* 5163 * If this was the removal of the original device in a hot spare vdev, 5164 * then we want to go through and remove the device from the hot spare 5165 * list of every other pool. 
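 * (Hot spares can be shared: every pool that lists the spare refers to
 * the same device guid.  Once it reverts to a plain disk here, the
 * other pools' spare lists still name it, so each of them must drop it
 * as well -- that is the spa_next() walk below.)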
5166 */ 5167 if (unspare) { 5168 spa_t *altspa = NULL; 5169 5170 mutex_enter(&spa_namespace_lock); 5171 while ((altspa = spa_next(altspa)) != NULL) { 5172 if (altspa->spa_state != POOL_STATE_ACTIVE || 5173 altspa == spa) 5174 continue; 5175 5176 spa_open_ref(altspa, FTAG); 5177 mutex_exit(&spa_namespace_lock); 5178 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 5179 mutex_enter(&spa_namespace_lock); 5180 spa_close(altspa, FTAG); 5181 } 5182 mutex_exit(&spa_namespace_lock); 5183 5184 /* search the rest of the vdevs for spares to remove */ 5185 spa_vdev_resilver_done(spa); 5186 } 5187 5188 /* all done with the spa; OK to release */ 5189 mutex_enter(&spa_namespace_lock); 5190 spa_close(spa, FTAG); 5191 mutex_exit(&spa_namespace_lock); 5192 5193 return (error); 5194} 5195 5196/* 5197 * Split a set of devices from their mirrors, and create a new pool from them. 5198 */ 5199int 5200spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, 5201 nvlist_t *props, boolean_t exp) 5202{ 5203 int error = 0; 5204 uint64_t txg, *glist; 5205 spa_t *newspa; 5206 uint_t c, children, lastlog; 5207 nvlist_t **child, *nvl, *tmp; 5208 dmu_tx_t *tx; 5209 char *altroot = NULL; 5210 vdev_t *rvd, **vml = NULL; /* vdev modify list */ 5211 boolean_t activate_slog; 5212 5213 ASSERT(spa_writeable(spa)); 5214 5215 txg = spa_vdev_enter(spa); 5216 5217 /* clear the log and flush everything up to now */ 5218 activate_slog = spa_passivate_log(spa); 5219 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5220 error = spa_offline_log(spa); 5221 txg = spa_vdev_config_enter(spa); 5222 5223 if (activate_slog) 5224 spa_activate_log(spa); 5225 5226 if (error != 0) 5227 return (spa_vdev_exit(spa, NULL, txg, error)); 5228 5229 /* check new spa name before going any further */ 5230 if (spa_lookup(newname) != NULL) 5231 return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 5232 5233 /* 5234 * scan through all the children to ensure they're all mirrors 5235 */ 5236 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 5237 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 5238 &children) != 0) 5239 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5240 5241 /* first, check to ensure we've got the right child count */ 5242 rvd = spa->spa_root_vdev; 5243 lastlog = 0; 5244 for (c = 0; c < rvd->vdev_children; c++) { 5245 vdev_t *vd = rvd->vdev_child[c]; 5246 5247 /* don't count the holes & logs as children */ 5248 if (vd->vdev_islog || vd->vdev_ishole) { 5249 if (lastlog == 0) 5250 lastlog = c; 5251 continue; 5252 } 5253 5254 lastlog = 0; 5255 } 5256 if (children != (lastlog != 0 ? 
lastlog : rvd->vdev_children)) 5257 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5258 5259 /* next, ensure no spare or cache devices are part of the split */ 5260 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 5261 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 5262 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5263 5264 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 5265 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 5266 5267 /* then, loop over each vdev and validate it */ 5268 for (c = 0; c < children; c++) { 5269 uint64_t is_hole = 0; 5270 5271 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 5272 &is_hole); 5273 5274 if (is_hole != 0) { 5275 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 5276 spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 5277 continue; 5278 } else { 5279 error = SET_ERROR(EINVAL); 5280 break; 5281 } 5282 } 5283 5284 /* which disk is going to be split? */ 5285 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 5286 &glist[c]) != 0) { 5287 error = SET_ERROR(EINVAL); 5288 break; 5289 } 5290 5291 /* look it up in the spa */ 5292 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 5293 if (vml[c] == NULL) { 5294 error = SET_ERROR(ENODEV); 5295 break; 5296 } 5297 5298 /* make sure there's nothing stopping the split */ 5299 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 5300 vml[c]->vdev_islog || 5301 vml[c]->vdev_ishole || 5302 vml[c]->vdev_isspare || 5303 vml[c]->vdev_isl2cache || 5304 !vdev_writeable(vml[c]) || 5305 vml[c]->vdev_children != 0 || 5306 vml[c]->vdev_state != VDEV_STATE_HEALTHY || 5307 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 5308 error = SET_ERROR(EINVAL); 5309 break; 5310 } 5311 5312 if (vdev_dtl_required(vml[c])) { 5313 error = SET_ERROR(EBUSY); 5314 break; 5315 } 5316 5317 /* we need certain info from the top level */ 5318 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 5319 vml[c]->vdev_top->vdev_ms_array) == 0); 5320 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 5321 vml[c]->vdev_top->vdev_ms_shift) == 0); 5322 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 5323 vml[c]->vdev_top->vdev_asize) == 0); 5324 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 5325 vml[c]->vdev_top->vdev_ashift) == 0); 5326 } 5327 5328 if (error != 0) { 5329 kmem_free(vml, children * sizeof (vdev_t *)); 5330 kmem_free(glist, children * sizeof (uint64_t)); 5331 return (spa_vdev_exit(spa, NULL, txg, error)); 5332 } 5333 5334 /* stop writers from using the disks */ 5335 for (c = 0; c < children; c++) { 5336 if (vml[c] != NULL) 5337 vml[c]->vdev_offline = B_TRUE; 5338 } 5339 vdev_reopen(spa->spa_root_vdev); 5340 5341 /* 5342 * Temporarily record the splitting vdevs in the spa config. This 5343 * will disappear once the config is regenerated. 5344 */ 5345 VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5346 VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 5347 glist, children) == 0); 5348 kmem_free(glist, children * sizeof (uint64_t)); 5349 5350 mutex_enter(&spa->spa_props_lock); 5351 VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, 5352 nvl) == 0); 5353 mutex_exit(&spa->spa_props_lock); 5354 spa->spa_config_splitting = nvl; 5355 vdev_config_dirty(spa->spa_root_vdev); 5356 5357 /* configure and create the new pool */ 5358 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); 5359 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 5360 exp ? 
POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); 5361 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 5362 spa_version(spa)) == 0); 5363 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, 5364 spa->spa_config_txg) == 0); 5365 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 5366 spa_generate_guid(NULL)) == 0); 5367 (void) nvlist_lookup_string(props, 5368 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 5369 5370 /* add the new pool to the namespace */ 5371 newspa = spa_add(newname, config, altroot); 5372 newspa->spa_config_txg = spa->spa_config_txg; 5373 spa_set_log_state(newspa, SPA_LOG_CLEAR); 5374 5375 /* release the spa config lock, retaining the namespace lock */ 5376 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5377 5378 if (zio_injection_enabled) 5379 zio_handle_panic_injection(spa, FTAG, 1); 5380 5381 spa_activate(newspa, spa_mode_global); 5382 spa_async_suspend(newspa); 5383 5384#ifndef sun 5385 /* mark that we are creating new spa by splitting */ 5386 newspa->spa_splitting_newspa = B_TRUE; 5387#endif 5388 /* create the new pool from the disks of the original pool */ 5389 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); 5390#ifndef sun 5391 newspa->spa_splitting_newspa = B_FALSE; 5392#endif 5393 if (error) 5394 goto out; 5395 5396 /* if that worked, generate a real config for the new pool */ 5397 if (newspa->spa_root_vdev != NULL) { 5398 VERIFY(nvlist_alloc(&newspa->spa_config_splitting, 5399 NV_UNIQUE_NAME, KM_SLEEP) == 0); 5400 VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, 5401 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); 5402 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 5403 B_TRUE)); 5404 } 5405 5406 /* set the props */ 5407 if (props != NULL) { 5408 spa_configfile_set(newspa, props, B_FALSE); 5409 error = spa_prop_set(newspa, props); 5410 if (error) 5411 goto out; 5412 } 5413 5414 /* flush everything */ 5415 txg = spa_vdev_config_enter(newspa); 5416 vdev_config_dirty(newspa->spa_root_vdev); 5417 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 5418 5419 if (zio_injection_enabled) 5420 zio_handle_panic_injection(spa, FTAG, 2); 5421 5422 spa_async_resume(newspa); 5423 5424 /* finally, update the original pool's config */ 5425 txg = spa_vdev_config_enter(spa); 5426 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 5427 error = dmu_tx_assign(tx, TXG_WAIT); 5428 if (error != 0) 5429 dmu_tx_abort(tx); 5430 for (c = 0; c < children; c++) { 5431 if (vml[c] != NULL) { 5432 vdev_split(vml[c]); 5433 if (error == 0) 5434 spa_history_log_internal(spa, "detach", tx, 5435 "vdev=%s", vml[c]->vdev_path); 5436 vdev_free(vml[c]); 5437 } 5438 } 5439 vdev_config_dirty(spa->spa_root_vdev); 5440 spa->spa_config_splitting = NULL; 5441 nvlist_free(nvl); 5442 if (error == 0) 5443 dmu_tx_commit(tx); 5444 (void) spa_vdev_exit(spa, NULL, txg, 0); 5445 5446 if (zio_injection_enabled) 5447 zio_handle_panic_injection(spa, FTAG, 3); 5448 5449 /* split is complete; log a history record */ 5450 spa_history_log_internal(newspa, "split", NULL, 5451 "from pool %s", spa_name(spa)); 5452 5453 kmem_free(vml, children * sizeof (vdev_t *)); 5454 5455 /* if we're not going to mount the filesystems in userland, export */ 5456 if (exp) 5457 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 5458 B_FALSE, B_FALSE); 5459 5460 return (error); 5461 5462out: 5463 spa_unload(newspa); 5464 spa_deactivate(newspa); 5465 spa_remove(newspa); 5466 5467 txg = spa_vdev_config_enter(spa); 5468 5469 /* re-online all offlined disks */ 5470 
for (c = 0; c < children; c++) { 5471 if (vml[c] != NULL) 5472 vml[c]->vdev_offline = B_FALSE; 5473 } 5474 vdev_reopen(spa->spa_root_vdev); 5475 5476 nvlist_free(spa->spa_config_splitting); 5477 spa->spa_config_splitting = NULL; 5478 (void) spa_vdev_exit(spa, NULL, txg, error); 5479 5480 kmem_free(vml, children * sizeof (vdev_t *)); 5481 return (error); 5482} 5483 5484static nvlist_t * 5485spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 5486{ 5487 for (int i = 0; i < count; i++) { 5488 uint64_t guid; 5489 5490 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 5491 &guid) == 0); 5492 5493 if (guid == target_guid) 5494 return (nvpp[i]); 5495 } 5496 5497 return (NULL); 5498} 5499 5500static void 5501spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 5502 nvlist_t *dev_to_remove) 5503{ 5504 nvlist_t **newdev = NULL; 5505 5506 if (count > 1) 5507 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 5508 5509 for (int i = 0, j = 0; i < count; i++) { 5510 if (dev[i] == dev_to_remove) 5511 continue; 5512 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 5513 } 5514 5515 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 5516 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 5517 5518 for (int i = 0; i < count - 1; i++) 5519 nvlist_free(newdev[i]); 5520 5521 if (count > 1) 5522 kmem_free(newdev, (count - 1) * sizeof (void *)); 5523} 5524 5525/* 5526 * Evacuate the device. 5527 */ 5528static int 5529spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) 5530{ 5531 uint64_t txg; 5532 int error = 0; 5533 5534 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5535 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5536 ASSERT(vd == vd->vdev_top); 5537 5538 /* 5539 * Evacuate the device. We don't hold the config lock as writer 5540 * since we need to do I/O but we do keep the 5541 * spa_namespace_lock held. Once this completes the device 5542 * should no longer have any blocks allocated on it. 5543 */ 5544 if (vd->vdev_islog) { 5545 if (vd->vdev_stat.vs_alloc != 0) 5546 error = spa_offline_log(spa); 5547 } else { 5548 error = SET_ERROR(ENOTSUP); 5549 } 5550 5551 if (error) 5552 return (error); 5553 5554 /* 5555 * The evacuation succeeded. Remove any remaining MOS metadata 5556 * associated with this vdev, and wait for these changes to sync. 5557 */ 5558 ASSERT0(vd->vdev_stat.vs_alloc); 5559 txg = spa_vdev_config_enter(spa); 5560 vd->vdev_removing = B_TRUE; 5561 vdev_dirty_leaves(vd, VDD_DTL, txg); 5562 vdev_config_dirty(vd); 5563 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5564 5565 return (0); 5566} 5567 5568/* 5569 * Complete the removal by cleaning up the namespace. 5570 */ 5571static void 5572spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) 5573{ 5574 vdev_t *rvd = spa->spa_root_vdev; 5575 uint64_t id = vd->vdev_id; 5576 boolean_t last_vdev = (id == (rvd->vdev_children - 1)); 5577 5578 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5579 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 5580 ASSERT(vd == vd->vdev_top); 5581 5582 /* 5583 * Only remove any devices which are empty. 
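 * (We also must not renumber the survivors: every DVA embeds its
 * top-level vdev id, so unless the removed vdev was the last child we
 * substitute a "hole" vdev at the same id instead of compacting the
 * array.)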
5584 */ 5585 if (vd->vdev_stat.vs_alloc != 0) 5586 return; 5587 5588 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 5589 5590 if (list_link_active(&vd->vdev_state_dirty_node)) 5591 vdev_state_clean(vd); 5592 if (list_link_active(&vd->vdev_config_dirty_node)) 5593 vdev_config_clean(vd); 5594 5595 vdev_free(vd); 5596 5597 if (last_vdev) { 5598 vdev_compact_children(rvd); 5599 } else { 5600 vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); 5601 vdev_add_child(rvd, vd); 5602 } 5603 vdev_config_dirty(rvd); 5604 5605 /* 5606 * Reassess the health of our root vdev. 5607 */ 5608 vdev_reopen(rvd); 5609} 5610 5611/* 5612 * Remove a device from the pool - 5613 * 5614 * Removing a device from the vdev namespace requires several steps 5615 * and can take a significant amount of time. As a result we use 5616 * the spa_vdev_config_[enter/exit] functions which allow us to 5617 * grab and release the spa_config_lock while still holding the namespace 5618 * lock. During each step the configuration is synced out. 5619 * 5620 * Currently, this supports removing only hot spares, slogs, and level 2 ARC 5621 * devices. 5622 */ 5623int 5624spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 5625{ 5626 vdev_t *vd; 5627 metaslab_group_t *mg; 5628 nvlist_t **spares, **l2cache, *nv; 5629 uint64_t txg = 0; 5630 uint_t nspares, nl2cache; 5631 int error = 0; 5632 boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 5633 5634 ASSERT(spa_writeable(spa)); 5635 5636 if (!locked) 5637 txg = spa_vdev_enter(spa); 5638 5639 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 5640 5641 if (spa->spa_spares.sav_vdevs != NULL && 5642 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 5643 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 5644 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 5645 /* 5646 * Only remove the hot spare if it's not currently in use 5647 * in this pool. 5648 */ 5649 if (vd == NULL || unspare) { 5650 spa_vdev_remove_aux(spa->spa_spares.sav_config, 5651 ZPOOL_CONFIG_SPARES, spares, nspares, nv); 5652 spa_load_spares(spa); 5653 spa->spa_spares.sav_sync = B_TRUE; 5654 } else { 5655 error = SET_ERROR(EBUSY); 5656 } 5657 } else if (spa->spa_l2cache.sav_vdevs != NULL && 5658 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 5659 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 5660 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 5661 /* 5662 * Cache devices can always be removed. 5663 */ 5664 spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 5665 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 5666 spa_load_l2cache(spa); 5667 spa->spa_l2cache.sav_sync = B_TRUE; 5668 } else if (vd != NULL && vd->vdev_islog) { 5669 ASSERT(!locked); 5670 ASSERT(vd == vd->vdev_top); 5671 5672 mg = vd->vdev_mg; 5673 5674 /* 5675 * Stop allocating from this vdev. 5676 */ 5677 metaslab_group_passivate(mg); 5678 5679 /* 5680 * Wait for the youngest allocations and frees to sync, 5681 * and then wait for the deferral of those frees to finish. 5682 */ 5683 spa_vdev_config_exit(spa, NULL, 5684 txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 5685 5686 /* 5687 * Attempt to evacuate the vdev. 5688 */ 5689 error = spa_vdev_remove_evacuate(spa, vd); 5690 5691 txg = spa_vdev_config_enter(spa); 5692 5693 /* 5694 * If we couldn't evacuate the vdev, unwind. 5695 */ 5696 if (error) { 5697 metaslab_group_activate(mg); 5698 return (spa_vdev_exit(spa, NULL, txg, error)); 5699 } 5700 5701 /* 5702 * Clean up the vdev namespace. 
5703 */ 5704 spa_vdev_remove_from_namespace(spa, vd); 5705 5706 } else if (vd != NULL) { 5707 /* 5708 * Normal vdevs cannot be removed (yet). 5709 */ 5710 error = SET_ERROR(ENOTSUP); 5711 } else { 5712 /* 5713 * There is no vdev of any kind with the specified guid. 5714 */ 5715 error = SET_ERROR(ENOENT); 5716 } 5717 5718 if (!locked) 5719 return (spa_vdev_exit(spa, NULL, txg, error)); 5720 5721 return (error); 5722} 5723 5724/* 5725 * Find any device that's done replacing, or a vdev marked 'unspare' that's 5726 * currently spared, so we can detach it. 5727 */ 5728static vdev_t * 5729spa_vdev_resilver_done_hunt(vdev_t *vd) 5730{ 5731 vdev_t *newvd, *oldvd; 5732 5733 for (int c = 0; c < vd->vdev_children; c++) { 5734 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 5735 if (oldvd != NULL) 5736 return (oldvd); 5737 } 5738 5739 /* 5740 * Check for a completed replacement. We always consider the first 5741 * vdev in the list to be the oldest vdev, and the last one to be 5742 * the newest (see spa_vdev_attach() for how that works). In 5743 * the case where the newest vdev is faulted, we will not automatically 5744 * remove it after a resilver completes. This is OK as it will require 5745 * user intervention to determine which disk the admin wishes to keep. 5746 */ 5747 if (vd->vdev_ops == &vdev_replacing_ops) { 5748 ASSERT(vd->vdev_children > 1); 5749 5750 newvd = vd->vdev_child[vd->vdev_children - 1]; 5751 oldvd = vd->vdev_child[0]; 5752 5753 if (vdev_dtl_empty(newvd, DTL_MISSING) && 5754 vdev_dtl_empty(newvd, DTL_OUTAGE) && 5755 !vdev_dtl_required(oldvd)) 5756 return (oldvd); 5757 } 5758 5759 /* 5760 * Check for a completed resilver with the 'unspare' flag set. 5761 */ 5762 if (vd->vdev_ops == &vdev_spare_ops) { 5763 vdev_t *first = vd->vdev_child[0]; 5764 vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 5765 5766 if (last->vdev_unspare) { 5767 oldvd = first; 5768 newvd = last; 5769 } else if (first->vdev_unspare) { 5770 oldvd = last; 5771 newvd = first; 5772 } else { 5773 oldvd = NULL; 5774 } 5775 5776 if (oldvd != NULL && 5777 vdev_dtl_empty(newvd, DTL_MISSING) && 5778 vdev_dtl_empty(newvd, DTL_OUTAGE) && 5779 !vdev_dtl_required(oldvd)) 5780 return (oldvd); 5781 5782 /* 5783 * If there are more than two spares attached to a disk, 5784 * and those spares are not required, then we want to 5785 * attempt to free them up now so that they can be used 5786 * by other pools. Once we're back down to a single 5787 * disk+spare, we stop removing them. 5788 */ 5789 if (vd->vdev_children > 2) { 5790 newvd = vd->vdev_child[1]; 5791 5792 if (newvd->vdev_isspare && last->vdev_isspare && 5793 vdev_dtl_empty(last, DTL_MISSING) && 5794 vdev_dtl_empty(last, DTL_OUTAGE) && 5795 !vdev_dtl_required(newvd)) 5796 return (newvd); 5797 } 5798 } 5799 5800 return (NULL); 5801} 5802 5803static void 5804spa_vdev_resilver_done(spa_t *spa) 5805{ 5806 vdev_t *vd, *pvd, *ppvd; 5807 uint64_t guid, sguid, pguid, ppguid; 5808 5809 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5810 5811 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 5812 pvd = vd->vdev_parent; 5813 ppvd = pvd->vdev_parent; 5814 guid = vd->vdev_guid; 5815 pguid = pvd->vdev_guid; 5816 ppguid = ppvd->vdev_guid; 5817 sguid = 0; 5818 /* 5819 * If we have just finished replacing a hot spared device, then 5820 * we need to detach the parent's first child (the original hot 5821 * spare) as well. 
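 * (Concretely, a disk that was hot-spared and then replaced looks like
 * spare(replacing(olddisk, newdisk), sparedisk).  When the replacing
 * vdev finishes resilvering we detach olddisk, and this branch also
 * detaches sparedisk so it returns to the pool's available spares.)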
5822 */ 5823 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 5824 ppvd->vdev_children == 2) { 5825 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 5826 sguid = ppvd->vdev_child[1]->vdev_guid; 5827 } 5828 ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); 5829 5830 spa_config_exit(spa, SCL_ALL, FTAG); 5831 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 5832 return; 5833 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 5834 return; 5835 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5836 } 5837 5838 spa_config_exit(spa, SCL_ALL, FTAG); 5839} 5840 5841/* 5842 * Update the stored path or FRU for this vdev. 5843 */ 5844int 5845spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 5846 boolean_t ispath) 5847{ 5848 vdev_t *vd; 5849 boolean_t sync = B_FALSE; 5850 5851 ASSERT(spa_writeable(spa)); 5852 5853 spa_vdev_state_enter(spa, SCL_ALL); 5854 5855 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 5856 return (spa_vdev_state_exit(spa, NULL, ENOENT)); 5857 5858 if (!vd->vdev_ops->vdev_op_leaf) 5859 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 5860 5861 if (ispath) { 5862 if (strcmp(value, vd->vdev_path) != 0) { 5863 spa_strfree(vd->vdev_path); 5864 vd->vdev_path = spa_strdup(value); 5865 sync = B_TRUE; 5866 } 5867 } else { 5868 if (vd->vdev_fru == NULL) { 5869 vd->vdev_fru = spa_strdup(value); 5870 sync = B_TRUE; 5871 } else if (strcmp(value, vd->vdev_fru) != 0) { 5872 spa_strfree(vd->vdev_fru); 5873 vd->vdev_fru = spa_strdup(value); 5874 sync = B_TRUE; 5875 } 5876 } 5877 5878 return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); 5879} 5880 5881int 5882spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 5883{ 5884 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 5885} 5886 5887int 5888spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 5889{ 5890 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 5891} 5892 5893/* 5894 * ========================================================================== 5895 * SPA Scanning 5896 * ========================================================================== 5897 */ 5898 5899int 5900spa_scan_stop(spa_t *spa) 5901{ 5902 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5903 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 5904 return (SET_ERROR(EBUSY)); 5905 return (dsl_scan_cancel(spa->spa_dsl_pool)); 5906} 5907 5908int 5909spa_scan(spa_t *spa, pool_scan_func_t func) 5910{ 5911 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5912 5913 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 5914 return (SET_ERROR(ENOTSUP)); 5915 5916 /* 5917 * If a resilver was requested, but there is no DTL on a 5918 * writeable leaf device, we have nothing to do. 
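 *
 * (For illustration: a user-initiated scrub, by contrast, reaches
 * this function from the ioctl path as, e.g.,
 *
 *	error = spa_scan(spa, POOL_SCAN_SCRUB);
 *
 * and bypasses this resilver short-circuit entirely.)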
5919 */ 5920 if (func == POOL_SCAN_RESILVER && 5921 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 5922 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 5923 return (0); 5924 } 5925 5926 return (dsl_scan(spa->spa_dsl_pool, func)); 5927} 5928 5929/* 5930 * ========================================================================== 5931 * SPA async task processing 5932 * ========================================================================== 5933 */ 5934 5935static void 5936spa_async_remove(spa_t *spa, vdev_t *vd) 5937{ 5938 if (vd->vdev_remove_wanted) { 5939 vd->vdev_remove_wanted = B_FALSE; 5940 vd->vdev_delayed_close = B_FALSE; 5941 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 5942 5943 /* 5944 * We want to clear the stats, but we don't want to do a full 5945 * vdev_clear() as that will cause us to throw away 5946 * degraded/faulted state as well as attempt to reopen the 5947 * device, all of which is a waste. 5948 */ 5949 vd->vdev_stat.vs_read_errors = 0; 5950 vd->vdev_stat.vs_write_errors = 0; 5951 vd->vdev_stat.vs_checksum_errors = 0; 5952 5953 vdev_state_dirty(vd->vdev_top); 5954 } 5955 5956 for (int c = 0; c < vd->vdev_children; c++) 5957 spa_async_remove(spa, vd->vdev_child[c]); 5958} 5959 5960static void 5961spa_async_probe(spa_t *spa, vdev_t *vd) 5962{ 5963 if (vd->vdev_probe_wanted) { 5964 vd->vdev_probe_wanted = B_FALSE; 5965 vdev_reopen(vd); /* vdev_open() does the actual probe */ 5966 } 5967 5968 for (int c = 0; c < vd->vdev_children; c++) 5969 spa_async_probe(spa, vd->vdev_child[c]); 5970} 5971 5972static void 5973spa_async_autoexpand(spa_t *spa, vdev_t *vd) 5974{ 5975 sysevent_id_t eid; 5976 nvlist_t *attr; 5977 char *physpath; 5978 5979 if (!spa->spa_autoexpand) 5980 return; 5981 5982 for (int c = 0; c < vd->vdev_children; c++) { 5983 vdev_t *cvd = vd->vdev_child[c]; 5984 spa_async_autoexpand(spa, cvd); 5985 } 5986 5987 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 5988 return; 5989 5990 physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 5991 (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 5992 5993 VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5994 VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 5995 5996 (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 5997 ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP); 5998 5999 nvlist_free(attr); 6000 kmem_free(physpath, MAXPATHLEN); 6001} 6002 6003static void 6004spa_async_thread(void *arg) 6005{ 6006 spa_t *spa = arg; 6007 int tasks; 6008 6009 ASSERT(spa->spa_sync_on); 6010 6011 mutex_enter(&spa->spa_async_lock); 6012 tasks = spa->spa_async_tasks; 6013 spa->spa_async_tasks &= SPA_ASYNC_REMOVE; 6014 mutex_exit(&spa->spa_async_lock); 6015 6016 /* 6017 * See if the config needs to be updated. 6018 */ 6019 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 6020 uint64_t old_space, new_space; 6021 6022 mutex_enter(&spa_namespace_lock); 6023 old_space = metaslab_class_get_space(spa_normal_class(spa)); 6024 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 6025 new_space = metaslab_class_get_space(spa_normal_class(spa)); 6026 mutex_exit(&spa_namespace_lock); 6027 6028 /* 6029 * If the pool grew as a result of the config update, 6030 * then log an internal history event. 
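 *
 * (For example, growing a pool from 20 GB to 30 GB would log a
 * record reading "pool 'tank' size: 32212254720(+10737418240)",
 * per the format string below; the pool name is hypothetical.)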
6031 */ 6032 if (new_space != old_space) { 6033 spa_history_log_internal(spa, "vdev online", NULL, 6034 "pool '%s' size: %llu(+%llu)", 6035 spa_name(spa), new_space, new_space - old_space); 6036 } 6037 } 6038 6039 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 6040 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6041 spa_async_autoexpand(spa, spa->spa_root_vdev); 6042 spa_config_exit(spa, SCL_CONFIG, FTAG); 6043 } 6044 6045 /* 6046 * See if any devices need to be probed. 6047 */ 6048 if (tasks & SPA_ASYNC_PROBE) { 6049 spa_vdev_state_enter(spa, SCL_NONE); 6050 spa_async_probe(spa, spa->spa_root_vdev); 6051 (void) spa_vdev_state_exit(spa, NULL, 0); 6052 } 6053 6054 /* 6055 * If any devices are done replacing, detach them. 6056 */ 6057 if (tasks & SPA_ASYNC_RESILVER_DONE) 6058 spa_vdev_resilver_done(spa); 6059 6060 /* 6061 * Kick off a resilver. 6062 */ 6063 if (tasks & SPA_ASYNC_RESILVER) 6064 dsl_resilver_restart(spa->spa_dsl_pool, 0); 6065 6066 /* 6067 * Let the world know that we're done. 6068 */ 6069 mutex_enter(&spa->spa_async_lock); 6070 spa->spa_async_thread = NULL; 6071 cv_broadcast(&spa->spa_async_cv); 6072 mutex_exit(&spa->spa_async_lock); 6073 thread_exit(); 6074} 6075 6076static void 6077spa_async_thread_vd(void *arg) 6078{ 6079 spa_t *spa = arg; 6080 int tasks; 6081 6082 ASSERT(spa->spa_sync_on); 6083 6084 mutex_enter(&spa->spa_async_lock); 6085 tasks = spa->spa_async_tasks; 6086retry: 6087 spa->spa_async_tasks &= ~SPA_ASYNC_REMOVE; 6088 mutex_exit(&spa->spa_async_lock); 6089 6090 /* 6091 * See if any devices need to be marked REMOVED. 6092 */ 6093 if (tasks & SPA_ASYNC_REMOVE) { 6094 spa_vdev_state_enter(spa, SCL_NONE); 6095 spa_async_remove(spa, spa->spa_root_vdev); 6096 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 6097 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 6098 for (int i = 0; i < spa->spa_spares.sav_count; i++) 6099 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 6100 (void) spa_vdev_state_exit(spa, NULL, 0); 6101 } 6102 6103 /* 6104 * Let the world know that we're done. 
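 *
 * (Note that spa_async_tasks is re-read under the lock below; if
 * another SPA_ASYNC_REMOVE request arrived while we were scanning,
 * we jump back to the retry label instead of exiting the thread.)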
6105 */ 6106	mutex_enter(&spa->spa_async_lock); 6107	tasks = spa->spa_async_tasks; 6108	if ((tasks & SPA_ASYNC_REMOVE) != 0) 6109		goto retry; 6110	spa->spa_async_thread_vd = NULL; 6111	cv_broadcast(&spa->spa_async_cv); 6112	mutex_exit(&spa->spa_async_lock); 6113	thread_exit(); 6114} 6115 6116void 6117spa_async_suspend(spa_t *spa) 6118{ 6119	mutex_enter(&spa->spa_async_lock); 6120	spa->spa_async_suspended++; 6121	while (spa->spa_async_thread != NULL || 6122	    spa->spa_async_thread_vd != NULL) 6123		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 6124	mutex_exit(&spa->spa_async_lock); 6125} 6126 6127void 6128spa_async_resume(spa_t *spa) 6129{ 6130	mutex_enter(&spa->spa_async_lock); 6131	ASSERT(spa->spa_async_suspended != 0); 6132	spa->spa_async_suspended--; 6133	mutex_exit(&spa->spa_async_lock); 6134} 6135 6136static boolean_t 6137spa_async_tasks_pending(spa_t *spa) 6138{ 6139	uint_t non_config_tasks; 6140	uint_t config_task; 6141	boolean_t config_task_suspended; 6142 6143	non_config_tasks = spa->spa_async_tasks & ~(SPA_ASYNC_CONFIG_UPDATE | 6144	    SPA_ASYNC_REMOVE); 6145	config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; 6146	if (spa->spa_ccw_fail_time == 0) { 6147		config_task_suspended = B_FALSE; 6148	} else { 6149		config_task_suspended = 6150		    (gethrtime() - spa->spa_ccw_fail_time) < 6151		    (zfs_ccw_retry_interval * NANOSEC); 6152	} 6153 6154	return (non_config_tasks || (config_task && !config_task_suspended)); 6155} 6156 6157static void 6158spa_async_dispatch(spa_t *spa) 6159{ 6160	mutex_enter(&spa->spa_async_lock); 6161	if (spa_async_tasks_pending(spa) && 6162	    !spa->spa_async_suspended && 6163	    spa->spa_async_thread == NULL && 6164	    rootdir != NULL) 6165		spa->spa_async_thread = thread_create(NULL, 0, 6166		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 6167	mutex_exit(&spa->spa_async_lock); 6168} 6169 6170static void 6171spa_async_dispatch_vd(spa_t *spa) 6172{ 6173	mutex_enter(&spa->spa_async_lock); 6174	if ((spa->spa_async_tasks & SPA_ASYNC_REMOVE) != 0 && 6175	    !spa->spa_async_suspended && 6176	    spa->spa_async_thread_vd == NULL && 6177	    rootdir != NULL) 6178		spa->spa_async_thread_vd = thread_create(NULL, 0, 6179		    spa_async_thread_vd, spa, 0, &p0, TS_RUN, maxclsyspri); 6180	mutex_exit(&spa->spa_async_lock); 6181} 6182 6183void 6184spa_async_request(spa_t *spa, int task) 6185{ 6186	zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 6187	mutex_enter(&spa->spa_async_lock); 6188	spa->spa_async_tasks |= task; 6189	mutex_exit(&spa->spa_async_lock); 6190	spa_async_dispatch_vd(spa); 6191} 6192 6193/* 6194 * ========================================================================== 6195 * SPA syncing routines 6196 * ========================================================================== 6197 */ 6198 6199static int 6200bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 6201{ 6202	bpobj_t *bpo = arg; 6203	bpobj_enqueue(bpo, bp, tx); 6204	return (0); 6205} 6206 6207static int 6208spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 6209{ 6210	zio_t *zio = arg; 6211 6212	zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, 6213	    BP_GET_PSIZE(bp), zio->io_flags)); 6214	return (0); 6215} 6216 6217/* 6218 * Note: this simple function is not inlined to make it easier to dtrace the 6219 * amount of time spent syncing frees.
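 *
 * For illustration, that time can be measured from userland with a
 * one-liner along the lines of:
 *
 *	dtrace -n 'fbt::spa_sync_frees:entry { self->t = timestamp; }
 *	    fbt::spa_sync_frees:return /self->t/ {
 *		@ = quantize(timestamp - self->t); self->t = 0; }'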
6220 */ 6221static void 6222spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) 6223{ 6224 zio_t *zio = zio_root(spa, NULL, NULL, 0); 6225 bplist_iterate(bpl, spa_free_sync_cb, zio, tx); 6226 VERIFY(zio_wait(zio) == 0); 6227} 6228 6229/* 6230 * Note: this simple function is not inlined to make it easier to dtrace the 6231 * amount of time spent syncing deferred frees. 6232 */ 6233static void 6234spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) 6235{ 6236 zio_t *zio = zio_root(spa, NULL, NULL, 0); 6237 VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, 6238 spa_free_sync_cb, zio, tx), ==, 0); 6239 VERIFY0(zio_wait(zio)); 6240} 6241 6242 6243static void 6244spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 6245{ 6246 char *packed = NULL; 6247 size_t bufsize; 6248 size_t nvsize = 0; 6249 dmu_buf_t *db; 6250 6251 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 6252 6253 /* 6254 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 6255 * information. This avoids the dmu_buf_will_dirty() path and 6256 * saves us a pre-read to get data we don't actually care about. 6257 */ 6258 bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); 6259 packed = kmem_alloc(bufsize, KM_SLEEP); 6260 6261 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 6262 KM_SLEEP) == 0); 6263 bzero(packed + nvsize, bufsize - nvsize); 6264 6265 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 6266 6267 kmem_free(packed, bufsize); 6268 6269 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 6270 dmu_buf_will_dirty(db, tx); 6271 *(uint64_t *)db->db_data = nvsize; 6272 dmu_buf_rele(db, FTAG); 6273} 6274 6275static void 6276spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 6277 const char *config, const char *entry) 6278{ 6279 nvlist_t *nvroot; 6280 nvlist_t **list; 6281 int i; 6282 6283 if (!sav->sav_sync) 6284 return; 6285 6286 /* 6287 * Update the MOS nvlist describing the list of available devices. 6288 * spa_validate_aux() will have already made sure this nvlist is 6289 * valid and the vdevs are labeled appropriately. 
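 *
 * (Sketch, with hypothetical device names: for a pool with two cache
 * devices the nvlist written below is shaped roughly like
 *
 *	l2cache[0] = { type="disk", path="/dev/ada2", guid=... }
 *	l2cache[1] = { type="disk", path="/dev/ada3", guid=... }
 *
 * stored under the given config key, e.g. ZPOOL_CONFIG_L2CACHE.)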
6290 */ 6291 if (sav->sav_object == 0) { 6292 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 6293 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 6294 sizeof (uint64_t), tx); 6295 VERIFY(zap_update(spa->spa_meta_objset, 6296 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 6297 &sav->sav_object, tx) == 0); 6298 } 6299 6300 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 6301 if (sav->sav_count == 0) { 6302 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 6303 } else { 6304 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 6305 for (i = 0; i < sav->sav_count; i++) 6306 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 6307 B_FALSE, VDEV_CONFIG_L2CACHE); 6308 VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 6309 sav->sav_count) == 0); 6310 for (i = 0; i < sav->sav_count; i++) 6311 nvlist_free(list[i]); 6312 kmem_free(list, sav->sav_count * sizeof (void *)); 6313 } 6314 6315 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 6316 nvlist_free(nvroot); 6317 6318 sav->sav_sync = B_FALSE; 6319} 6320 6321static void 6322spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 6323{ 6324 nvlist_t *config; 6325 6326 if (list_is_empty(&spa->spa_config_dirty_list)) 6327 return; 6328 6329 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6330 6331 config = spa_config_generate(spa, spa->spa_root_vdev, 6332 dmu_tx_get_txg(tx), B_FALSE); 6333 6334 /* 6335 * If we're upgrading the spa version then make sure that 6336 * the config object gets updated with the correct version. 6337 */ 6338 if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) 6339 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 6340 spa->spa_uberblock.ub_version); 6341 6342 spa_config_exit(spa, SCL_STATE, FTAG); 6343 6344 if (spa->spa_config_syncing) 6345 nvlist_free(spa->spa_config_syncing); 6346 spa->spa_config_syncing = config; 6347 6348 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 6349} 6350 6351static void 6352spa_sync_version(void *arg, dmu_tx_t *tx) 6353{ 6354 uint64_t *versionp = arg; 6355 uint64_t version = *versionp; 6356 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6357 6358 /* 6359 * Setting the version is special cased when first creating the pool. 6360 */ 6361 ASSERT(tx->tx_txg != TXG_INITIAL); 6362 6363 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 6364 ASSERT(version >= spa_version(spa)); 6365 6366 spa->spa_uberblock.ub_version = version; 6367 vdev_config_dirty(spa->spa_root_vdev); 6368 spa_history_log_internal(spa, "set", tx, "version=%lld", version); 6369} 6370 6371/* 6372 * Set zpool properties. 6373 */ 6374static void 6375spa_sync_props(void *arg, dmu_tx_t *tx) 6376{ 6377 nvlist_t *nvp = arg; 6378 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6379 objset_t *mos = spa->spa_meta_objset; 6380 nvpair_t *elem = NULL; 6381 6382 mutex_enter(&spa->spa_props_lock); 6383 6384 while ((elem = nvlist_next_nvpair(nvp, elem))) { 6385 uint64_t intval; 6386 char *strval, *fname; 6387 zpool_prop_t prop; 6388 const char *propname; 6389 zprop_type_t proptype; 6390 spa_feature_t fid; 6391 6392 switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 6393 case ZPROP_INVAL: 6394 /* 6395 * We checked this earlier in spa_prop_validate(). 
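 *
 * (For example: an nvpair named "feature@async_destroy" resolves
 * below to fname = "async_destroy", which is then looked up and
 * enabled; the feature name is purely illustrative.)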
6396 */ 6397			ASSERT(zpool_prop_feature(nvpair_name(elem))); 6398 6399			fname = strchr(nvpair_name(elem), '@') + 1; 6400			VERIFY0(zfeature_lookup_name(fname, &fid)); 6401 6402			spa_feature_enable(spa, fid, tx); 6403			spa_history_log_internal(spa, "set", tx, 6404			    "%s=enabled", nvpair_name(elem)); 6405			break; 6406 6407		case ZPOOL_PROP_VERSION: 6408			intval = fnvpair_value_uint64(elem); 6409			/* 6410			 * The version is synced separately before other 6411			 * properties and should be correct by now. 6412			 */ 6413			ASSERT3U(spa_version(spa), >=, intval); 6414			break; 6415 6416		case ZPOOL_PROP_ALTROOT: 6417			/* 6418			 * 'altroot' is a non-persistent property. It should 6419			 * have been set temporarily at creation or import time. 6420			 */ 6421			ASSERT(spa->spa_root != NULL); 6422			break; 6423 6424		case ZPOOL_PROP_READONLY: 6425		case ZPOOL_PROP_CACHEFILE: 6426			/* 6427			 * 'readonly' and 'cachefile' are also non-persistent 6428			 * properties. 6429			 */ 6430			break; 6431		case ZPOOL_PROP_COMMENT: 6432			strval = fnvpair_value_string(elem); 6433			if (spa->spa_comment != NULL) 6434				spa_strfree(spa->spa_comment); 6435			spa->spa_comment = spa_strdup(strval); 6436			/* 6437			 * We need to dirty the configuration on all the vdevs 6438			 * so that their labels get updated. It's unnecessary 6439			 * to do this for pool creation since the vdev's 6440			 * configuration has already been dirtied. 6441			 */ 6442			if (tx->tx_txg != TXG_INITIAL) 6443				vdev_config_dirty(spa->spa_root_vdev); 6444			spa_history_log_internal(spa, "set", tx, 6445			    "%s=%s", nvpair_name(elem), strval); 6446			break; 6447		default: 6448			/* 6449			 * Set pool property values in the poolprops mos object. 6450			 */ 6451			if (spa->spa_pool_props_object == 0) { 6452				spa->spa_pool_props_object = 6453				    zap_create_link(mos, DMU_OT_POOL_PROPS, 6454				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 6455				    tx); 6456			} 6457 6458			/* normalize the property name */ 6459			propname = zpool_prop_to_name(prop); 6460			proptype = zpool_prop_get_type(prop); 6461 6462			if (nvpair_type(elem) == DATA_TYPE_STRING) { 6463				ASSERT(proptype == PROP_TYPE_STRING); 6464				strval = fnvpair_value_string(elem); 6465				VERIFY0(zap_update(mos, 6466				    spa->spa_pool_props_object, propname, 6467				    1, strlen(strval) + 1, strval, tx)); 6468				spa_history_log_internal(spa, "set", tx, 6469				    "%s=%s", nvpair_name(elem), strval); 6470			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 6471				intval = fnvpair_value_uint64(elem); 6472 6473				if (proptype == PROP_TYPE_INDEX) { 6474					const char *unused; 6475					VERIFY0(zpool_prop_index_to_string( 6476					    prop, intval, &unused)); 6477				} 6478				VERIFY0(zap_update(mos, 6479				    spa->spa_pool_props_object, propname, 6480				    8, 1, &intval, tx)); 6481				spa_history_log_internal(spa, "set", tx, 6482				    "%s=%lld", nvpair_name(elem), intval); 6483			} else { 6484				ASSERT(0); /* not allowed */ 6485			} 6486 6487			switch (prop) { 6488			case ZPOOL_PROP_DELEGATION: 6489				spa->spa_delegation = intval; 6490				break; 6491			case ZPOOL_PROP_BOOTFS: 6492				spa->spa_bootfs = intval; 6493				break; 6494			case ZPOOL_PROP_FAILUREMODE: 6495				spa->spa_failmode = intval; 6496				break; 6497			case ZPOOL_PROP_AUTOEXPAND: 6498				spa->spa_autoexpand = intval; 6499				if (tx->tx_txg != TXG_INITIAL) 6500					spa_async_request(spa, 6501					    SPA_ASYNC_AUTOEXPAND); 6502				break; 6503			case ZPOOL_PROP_DEDUPDITTO: 6504				spa->spa_dedup_ditto = intval; 6505				break; 6506			default: 6507				break; 6508			} 6509		} 6510 6511	} 6512 6513	mutex_exit(&spa->spa_props_lock); 6514} 6515 6516/* 6517 * Perform one-time upgrade on-disk changes.
spa_version() does not 6518 * reflect the new version this txg, so there must be no changes this 6519 * txg to anything that the upgrade code depends on after it executes. 6520 * Therefore this must be called after dsl_pool_sync() does the sync 6521 * tasks. 6522 */ 6523static void 6524spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) 6525{ 6526	dsl_pool_t *dp = spa->spa_dsl_pool; 6527 6528	ASSERT(spa->spa_sync_pass == 1); 6529 6530	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); 6531 6532	if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 6533	    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 6534		dsl_pool_create_origin(dp, tx); 6535 6536		/* Keeping the origin open increases spa_minref */ 6537		spa->spa_minref += 3; 6538	} 6539 6540	if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 6541	    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 6542		dsl_pool_upgrade_clones(dp, tx); 6543	} 6544 6545	if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && 6546	    spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { 6547		dsl_pool_upgrade_dir_clones(dp, tx); 6548 6549		/* Keeping the freedir open increases spa_minref */ 6550		spa->spa_minref += 3; 6551	} 6552 6553	if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && 6554	    spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 6555		spa_feature_create_zap_objects(spa, tx); 6556	} 6557 6558	/* 6559	 * The LZ4_COMPRESS feature's behavior was changed to activate_on_enable 6560	 * when the ability to use lz4 compression for metadata was added. 6561	 * Old pools that have this feature enabled must be upgraded to have 6562	 * this feature active. 6563	 */ 6564	if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 6565		boolean_t lz4_en = spa_feature_is_enabled(spa, 6566		    SPA_FEATURE_LZ4_COMPRESS); 6567		boolean_t lz4_ac = spa_feature_is_active(spa, 6568		    SPA_FEATURE_LZ4_COMPRESS); 6569 6570		if (lz4_en && !lz4_ac) 6571			spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx); 6572	} 6573 6574	/* 6575	 * If we haven't written the salt, do so now. Note that the 6576	 * feature may not be activated yet, but that's fine since 6577	 * the presence of this ZAP entry is backwards compatible. 6578	 */ 6579	if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 6580	    DMU_POOL_CHECKSUM_SALT) == ENOENT) { 6581		VERIFY0(zap_add(spa->spa_meta_objset, 6582		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, 6583		    sizeof (spa->spa_cksum_salt.zcs_bytes), 6584		    spa->spa_cksum_salt.zcs_bytes, tx)); 6585	} 6586 6587	rrw_exit(&dp->dp_config_rwlock, FTAG); 6588} 6589 6590/* 6591 * Sync the specified transaction group. New blocks may be dirtied as 6592 * part of the process, so we iterate until it converges. 6593 */ 6594void 6595spa_sync(spa_t *spa, uint64_t txg) 6596{ 6597	dsl_pool_t *dp = spa->spa_dsl_pool; 6598	objset_t *mos = spa->spa_meta_objset; 6599	bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; 6600	vdev_t *rvd = spa->spa_root_vdev; 6601	vdev_t *vd; 6602	dmu_tx_t *tx; 6603	int error; 6604 6605	VERIFY(spa_writeable(spa)); 6606 6607	/* 6608	 * Lock out configuration changes. 6609	 */ 6610	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6611 6612	spa->spa_syncing_txg = txg; 6613	spa->spa_sync_pass = 0; 6614 6615	/* 6616	 * If there are any pending vdev state changes, convert them 6617	 * into config changes that go out with this transaction group.
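 *
 * (For example: a top-level vdev whose state changed earlier in this
 * txg sits on spa_state_dirty_list; the loop below moves it to the
 * config-dirty list so the new state goes out with this txg's labels.)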
6618 */ 6619 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6620 while (list_head(&spa->spa_state_dirty_list) != NULL) { 6621 /* 6622 * We need the write lock here because, for aux vdevs, 6623 * calling vdev_config_dirty() modifies sav_config. 6624 * This is ugly and will become unnecessary when we 6625 * eliminate the aux vdev wart by integrating all vdevs 6626 * into the root vdev tree. 6627 */ 6628 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 6629 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 6630 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 6631 vdev_state_clean(vd); 6632 vdev_config_dirty(vd); 6633 } 6634 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 6635 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 6636 } 6637 spa_config_exit(spa, SCL_STATE, FTAG); 6638 6639 tx = dmu_tx_create_assigned(dp, txg); 6640 6641 spa->spa_sync_starttime = gethrtime(); 6642#ifdef illumos 6643 VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, 6644 spa->spa_sync_starttime + spa->spa_deadman_synctime)); 6645#else /* FreeBSD */ 6646#ifdef _KERNEL 6647 callout_reset(&spa->spa_deadman_cycid, 6648 hz * spa->spa_deadman_synctime / NANOSEC, spa_deadman, spa); 6649#endif 6650#endif 6651 6652 /* 6653 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 6654 * set spa_deflate if we have no raid-z vdevs. 6655 */ 6656 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 6657 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 6658 int i; 6659 6660 for (i = 0; i < rvd->vdev_children; i++) { 6661 vd = rvd->vdev_child[i]; 6662 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 6663 break; 6664 } 6665 if (i == rvd->vdev_children) { 6666 spa->spa_deflate = TRUE; 6667 VERIFY(0 == zap_add(spa->spa_meta_objset, 6668 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 6669 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 6670 } 6671 } 6672 6673 /* 6674 * Iterate to convergence. 6675 */ 6676 do { 6677 int pass = ++spa->spa_sync_pass; 6678 6679 spa_sync_config_object(spa, tx); 6680 spa_sync_aux_dev(spa, &spa->spa_spares, tx, 6681 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 6682 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 6683 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 6684 spa_errlog_sync(spa, txg); 6685 dsl_pool_sync(dp, txg); 6686 6687 if (pass < zfs_sync_pass_deferred_free) { 6688 spa_sync_frees(spa, free_bpl, tx); 6689 } else { 6690 /* 6691 * We can not defer frees in pass 1, because 6692 * we sync the deferred frees later in pass 1. 6693 */ 6694 ASSERT3U(pass, >, 1); 6695 bplist_iterate(free_bpl, bpobj_enqueue_cb, 6696 &spa->spa_deferred_bpobj, tx); 6697 } 6698 6699 ddt_sync(spa, txg); 6700 dsl_scan_sync(dp, tx); 6701 6702 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) 6703 vdev_sync(vd, txg); 6704 6705 if (pass == 1) { 6706 spa_sync_upgrades(spa, tx); 6707 ASSERT3U(txg, >=, 6708 spa->spa_uberblock.ub_rootbp.blk_birth); 6709 /* 6710 * Note: We need to check if the MOS is dirty 6711 * because we could have marked the MOS dirty 6712 * without updating the uberblock (e.g. if we 6713 * have sync tasks but no dirty user data). We 6714 * need to check the uberblock's rootbp because 6715 * it is updated if we have synced out dirty 6716 * data (though in this case the MOS will most 6717 * likely also be dirty due to second order 6718 * effects, we don't want to rely on that here). 
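 *
 * (For example: on a completely idle pool, blk_birth stays below txg
 * and the MOS stays clean, so the check below breaks out after pass 1
 * and the txg remains a no-op, with deferred frees deliberately left
 * unsynced.)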
6719 */ 6720 if (spa->spa_uberblock.ub_rootbp.blk_birth < txg && 6721 !dmu_objset_is_dirty(mos, txg)) { 6722 /* 6723 * Nothing changed on the first pass, 6724 * therefore this TXG is a no-op. Avoid 6725 * syncing deferred frees, so that we 6726 * can keep this TXG as a no-op. 6727 */ 6728 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, 6729 txg)); 6730 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 6731 ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); 6732 break; 6733 } 6734 spa_sync_deferred_frees(spa, tx); 6735 } 6736 6737 } while (dmu_objset_is_dirty(mos, txg)); 6738 6739 /* 6740 * Rewrite the vdev configuration (which includes the uberblock) 6741 * to commit the transaction group. 6742 * 6743 * If there are no dirty vdevs, we sync the uberblock to a few 6744 * random top-level vdevs that are known to be visible in the 6745 * config cache (see spa_vdev_add() for a complete description). 6746 * If there *are* dirty vdevs, sync the uberblock to all vdevs. 6747 */ 6748 for (;;) { 6749 /* 6750 * We hold SCL_STATE to prevent vdev open/close/etc. 6751 * while we're attempting to write the vdev labels. 6752 */ 6753 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6754 6755 if (list_is_empty(&spa->spa_config_dirty_list)) { 6756 vdev_t *svd[SPA_DVAS_PER_BP]; 6757 int svdcount = 0; 6758 int children = rvd->vdev_children; 6759 int c0 = spa_get_random(children); 6760 6761 for (int c = 0; c < children; c++) { 6762 vd = rvd->vdev_child[(c0 + c) % children]; 6763 if (vd->vdev_ms_array == 0 || vd->vdev_islog) 6764 continue; 6765 svd[svdcount++] = vd; 6766 if (svdcount == SPA_DVAS_PER_BP) 6767 break; 6768 } 6769 error = vdev_config_sync(svd, svdcount, txg, B_FALSE); 6770 if (error != 0) 6771 error = vdev_config_sync(svd, svdcount, txg, 6772 B_TRUE); 6773 } else { 6774 error = vdev_config_sync(rvd->vdev_child, 6775 rvd->vdev_children, txg, B_FALSE); 6776 if (error != 0) 6777 error = vdev_config_sync(rvd->vdev_child, 6778 rvd->vdev_children, txg, B_TRUE); 6779 } 6780 6781 if (error == 0) 6782 spa->spa_last_synced_guid = rvd->vdev_guid; 6783 6784 spa_config_exit(spa, SCL_STATE, FTAG); 6785 6786 if (error == 0) 6787 break; 6788 zio_suspend(spa, NULL); 6789 zio_resume_wait(spa); 6790 } 6791 dmu_tx_commit(tx); 6792 6793#ifdef illumos 6794 VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY)); 6795#else /* FreeBSD */ 6796#ifdef _KERNEL 6797 callout_drain(&spa->spa_deadman_cycid); 6798#endif 6799#endif 6800 6801 /* 6802 * Clear the dirty config list. 6803 */ 6804 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 6805 vdev_config_clean(vd); 6806 6807 /* 6808 * Now that the new config has synced transactionally, 6809 * let it become visible to the config cache. 6810 */ 6811 if (spa->spa_config_syncing != NULL) { 6812 spa_config_set(spa, spa->spa_config_syncing); 6813 spa->spa_config_txg = txg; 6814 spa->spa_config_syncing = NULL; 6815 } 6816 6817 spa->spa_ubsync = spa->spa_uberblock; 6818 6819 dsl_pool_sync_done(dp, txg); 6820 6821 /* 6822 * Update usable space statistics. 6823 */ 6824 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 6825 vdev_sync_done(vd, txg); 6826 6827 spa_update_dspace(spa); 6828 6829 /* 6830 * It had better be the case that we didn't dirty anything 6831 * since vdev_config_sync(). 
6832 */ 6833 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 6834 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 6835 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 6836 6837 spa->spa_sync_pass = 0; 6838 6839 spa_config_exit(spa, SCL_CONFIG, FTAG); 6840 6841 spa_handle_ignored_writes(spa); 6842 6843 /* 6844 * If any async tasks have been requested, kick them off. 6845 */ 6846 spa_async_dispatch(spa); 6847 spa_async_dispatch_vd(spa); 6848} 6849 6850/* 6851 * Sync all pools. We don't want to hold the namespace lock across these 6852 * operations, so we take a reference on the spa_t and drop the lock during the 6853 * sync. 6854 */ 6855void 6856spa_sync_allpools(void) 6857{ 6858 spa_t *spa = NULL; 6859 mutex_enter(&spa_namespace_lock); 6860 while ((spa = spa_next(spa)) != NULL) { 6861 if (spa_state(spa) != POOL_STATE_ACTIVE || 6862 !spa_writeable(spa) || spa_suspended(spa)) 6863 continue; 6864 spa_open_ref(spa, FTAG); 6865 mutex_exit(&spa_namespace_lock); 6866 txg_wait_synced(spa_get_dsl(spa), 0); 6867 mutex_enter(&spa_namespace_lock); 6868 spa_close(spa, FTAG); 6869 } 6870 mutex_exit(&spa_namespace_lock); 6871} 6872 6873/* 6874 * ========================================================================== 6875 * Miscellaneous routines 6876 * ========================================================================== 6877 */ 6878 6879/* 6880 * Remove all pools in the system. 6881 */ 6882void 6883spa_evict_all(void) 6884{ 6885 spa_t *spa; 6886 6887 /* 6888 * Remove all cached state. All pools should be closed now, 6889 * so every spa in the AVL tree should be unreferenced. 6890 */ 6891 mutex_enter(&spa_namespace_lock); 6892 while ((spa = spa_next(NULL)) != NULL) { 6893 /* 6894 * Stop async tasks. The async thread may need to detach 6895 * a device that's been replaced, which requires grabbing 6896 * spa_namespace_lock, so we must drop it here. 6897 */ 6898 spa_open_ref(spa, FTAG); 6899 mutex_exit(&spa_namespace_lock); 6900 spa_async_suspend(spa); 6901 mutex_enter(&spa_namespace_lock); 6902 spa_close(spa, FTAG); 6903 6904 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 6905 spa_unload(spa); 6906 spa_deactivate(spa); 6907 } 6908 spa_remove(spa); 6909 } 6910 mutex_exit(&spa_namespace_lock); 6911} 6912 6913vdev_t * 6914spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 6915{ 6916 vdev_t *vd; 6917 int i; 6918 6919 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 6920 return (vd); 6921 6922 if (aux) { 6923 for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 6924 vd = spa->spa_l2cache.sav_vdevs[i]; 6925 if (vd->vdev_guid == guid) 6926 return (vd); 6927 } 6928 6929 for (i = 0; i < spa->spa_spares.sav_count; i++) { 6930 vd = spa->spa_spares.sav_vdevs[i]; 6931 if (vd->vdev_guid == guid) 6932 return (vd); 6933 } 6934 } 6935 6936 return (NULL); 6937} 6938 6939void 6940spa_upgrade(spa_t *spa, uint64_t version) 6941{ 6942 ASSERT(spa_writeable(spa)); 6943 6944 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6945 6946 /* 6947 * This should only be called for a non-faulted pool, and since a 6948 * future version would result in an unopenable pool, this shouldn't be 6949 * possible. 
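 *
 * (For illustration: "zpool upgrade <pool>" ultimately arrives here
 * through the ioctl path as, e.g.,
 *
 *	spa_upgrade(spa, SPA_VERSION);
 *
 * to move the pool to the newest supported on-disk version.)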
6950 */ 6951	ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); 6952	ASSERT3U(version, >=, spa->spa_uberblock.ub_version); 6953 6954	spa->spa_uberblock.ub_version = version; 6955	vdev_config_dirty(spa->spa_root_vdev); 6956 6957	spa_config_exit(spa, SCL_ALL, FTAG); 6958 6959	txg_wait_synced(spa_get_dsl(spa), 0); 6960} 6961 6962boolean_t 6963spa_has_spare(spa_t *spa, uint64_t guid) 6964{ 6965	int i; 6966	uint64_t spareguid; 6967	spa_aux_vdev_t *sav = &spa->spa_spares; 6968 6969	for (i = 0; i < sav->sav_count; i++) 6970		if (sav->sav_vdevs[i]->vdev_guid == guid) 6971			return (B_TRUE); 6972 6973	for (i = 0; i < sav->sav_npending; i++) { 6974		if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 6975		    &spareguid) == 0 && spareguid == guid) 6976			return (B_TRUE); 6977	} 6978 6979	return (B_FALSE); 6980} 6981 6982/* 6983 * Check if a pool has an active shared spare device. 6984 * Note: the reference count of an active spare is 2: once as a spare and once as a replacing vdev. 6985 */ 6986static boolean_t 6987spa_has_active_shared_spare(spa_t *spa) 6988{ 6989	int i, refcnt; 6990	uint64_t pool; 6991	spa_aux_vdev_t *sav = &spa->spa_spares; 6992 6993	for (i = 0; i < sav->sav_count; i++) { 6994		if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, 6995		    &refcnt) && pool != 0ULL && pool == spa_guid(spa) && 6996		    refcnt > 2) 6997			return (B_TRUE); 6998	} 6999 7000	return (B_FALSE); 7001} 7002 7003/* 7004 * Post a sysevent corresponding to the given event. The 'name' must be one of 7005 * the event definitions in sys/sysevent/eventdefs.h. The payload will be 7006 * filled in from the spa and (optionally) the vdev. This doesn't do anything 7007 * in the userland libzpool, as we don't want consumers to misinterpret ztest 7008 * or zdb as real changes. 7009 */ 7010void 7011spa_event_notify(spa_t *spa, vdev_t *vd, const char *name) 7012{ 7013#ifdef _KERNEL 7014	sysevent_t *ev; 7015	sysevent_attr_list_t *attr = NULL; 7016	sysevent_value_t value; 7017	sysevent_id_t eid; 7018 7019	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", 7020	    SE_SLEEP); 7021 7022	value.value_type = SE_DATA_TYPE_STRING; 7023	value.value.sv_string = spa_name(spa); 7024	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) 7025		goto done; 7026 7027	value.value_type = SE_DATA_TYPE_UINT64; 7028	value.value.sv_uint64 = spa_guid(spa); 7029	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) 7030		goto done; 7031 7032	if (vd) { 7033		value.value_type = SE_DATA_TYPE_UINT64; 7034		value.value.sv_uint64 = vd->vdev_guid; 7035		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, 7036		    SE_SLEEP) != 0) 7037			goto done; 7038 7039		if (vd->vdev_path) { 7040			value.value_type = SE_DATA_TYPE_STRING; 7041			value.value.sv_string = vd->vdev_path; 7042			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, 7043			    &value, SE_SLEEP) != 0) 7044				goto done; 7045		} 7046	} 7047 7048	if (sysevent_attach_attributes(ev, attr) != 0) 7049		goto done; 7050	attr = NULL; 7051 7052	(void) log_sysevent(ev, SE_SLEEP, &eid); 7053 7054done: 7055	if (attr) 7056		sysevent_free_attr(attr); 7057	sysevent_free(ev); 7058#endif 7059} 7060
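/*
 * For illustration (hypothetical caller): posting a device-removal
 * event amounts to, e.g.,
 *
 *	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
 *
 * with the payload assembled from the spa and vdev as above.
 */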