spa.c revision 314668
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
 * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 * Copyright 2013 Saso Kiselkov. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

/*
 * SPA: Storage Pool Allocator
 *
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/ddt.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/spa_boot.h>
#include <sys/zfs_ioctl.h>
#include <sys/dsl_scan.h>
#include <sys/dmu_send.h>
#include <sys/dsl_destroy.h>
#include <sys/dsl_userhold.h>
#include <sys/zfeature.h>
#include <sys/zvol.h>
#include <sys/trim_map.h>

#ifdef _KERNEL
#include <sys/callb.h>
#include <sys/cpupart.h>
#include <sys/zone.h>
#endif	/* _KERNEL */

#include "zfs_prop.h"
#include "zfs_comutil.h"

/* Check hostid on import? */
static int check_hostid = 1;

/*
 * The interval, in seconds, at which failed configuration cache file writes
 * should be retried.
 */
static int zfs_ccw_retry_interval = 300;

SYSCTL_DECL(_vfs_zfs);
TUNABLE_INT("vfs.zfs.check_hostid", &check_hostid);
SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RWTUN, &check_hostid, 0,
    "Check hostid on import?");
TUNABLE_INT("vfs.zfs.ccw_retry_interval", &zfs_ccw_retry_interval);
SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RW,
    &zfs_ccw_retry_interval, 0,
    "Configuration cache file write, retry after failure, interval (seconds)");

typedef enum zti_modes {
	ZTI_MODE_FIXED,		/* value is # of threads (min 1) */
	ZTI_MODE_BATCH,		/* cpu-intensive; value is ignored */
	ZTI_MODE_NULL,		/* don't create a taskq */
	ZTI_NMODES
} zti_modes_t;

#define	ZTI_P(n, q)	{ ZTI_MODE_FIXED, (n), (q) }
#define	ZTI_BATCH	{ ZTI_MODE_BATCH, 0, 1 }
#define	ZTI_NULL	{ ZTI_MODE_NULL, 0, 0 }

#define	ZTI_N(n)	ZTI_P(n, 1)
#define	ZTI_ONE		ZTI_N(1)

typedef struct zio_taskq_info {
	zti_modes_t zti_mode;
	uint_t zti_value;
	uint_t zti_count;
} zio_taskq_info_t;

static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
	"issue", "issue_high", "intr", "intr_high"
};

/*
 * This table defines the taskq settings for each ZFS I/O type. When
 * initializing a pool, we use this table to create an appropriately sized
 * taskq. Some operations are low volume and therefore have a small, static
 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
 * macros. Other operations process a large amount of data; the ZTI_BATCH
 * macro causes us to create a taskq oriented for throughput. Some operations
 * are so high frequency and short-lived that the taskq itself can become a
 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
 * additional degree of parallelism specified by the number of threads
 * per-taskq and the number of taskqs; when dispatching an event in this
 * case, the particular taskq is chosen at random.
 *
 * The different taskq priorities are to handle the different contexts (issue
 * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
 * need to be handled with minimum delay.
 */
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* NULL */
	{ ZTI_N(8),	ZTI_NULL,	ZTI_P(12, 8),	ZTI_NULL }, /* READ */
	{ ZTI_BATCH,	ZTI_N(5),	ZTI_N(8),	ZTI_N(5) }, /* WRITE */
	{ ZTI_P(12, 8),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FREE */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* CLAIM */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* IOCTL */
};

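/*
 * Illustrative reading of the table above (an editor's sketch, not extra
 * configuration): the READ row uses ZTI_N(8) for ISSUE and ZTI_P(12, 8)
 * for INTR, so pool activation creates one read-issue taskq with 8 threads
 * and eight read-interrupt taskqs with 12 threads each; each read
 * completion is then dispatched to one of those eight taskqs chosen
 * pseudo-randomly (see spa_taskq_dispatch_ent() below).
 */
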
static sysevent_t *spa_event_create(spa_t *spa, vdev_t *vd, const char *name);
static void spa_event_post(sysevent_t *ev);
static void spa_sync_version(void *arg, dmu_tx_t *tx);
static void spa_sync_props(void *arg, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);
static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
    spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
    char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);

uint_t		zio_taskq_batch_pct = 75;	/* 1 thread per cpu in pset */
#ifdef PSRSET_BIND
id_t		zio_taskq_psrset_bind = PS_NONE;
#endif
#ifdef SYSDC
boolean_t	zio_taskq_sysdc = B_TRUE;	/* use SDC scheduling class */
uint_t		zio_taskq_basedc = 80;		/* base duty cycle */
#endif

boolean_t	spa_create_process = B_TRUE;	/* no process ==> no sysdc */
extern int	zfs_sync_pass_deferred_free;

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
	const char *propname = zpool_prop_to_name(prop);
	nvlist_t *propval;

	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);

	if (strval != NULL)
		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
	else
		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);

	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
	nvlist_free(propval);
}

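/*
 * For illustration (assumed layout, matching the adds above): each pool
 * property ends up as a nested nvlist keyed by its name, e.g.
 *
 *	"size"      -> { ZPROP_SOURCE: src, ZPROP_VALUE: <uint64> }
 *	"cachefile" -> { ZPROP_SOURCE: src, ZPROP_VALUE: <string> }
 *
 * with exactly one of the string or integer forms of ZPROP_VALUE present.
 */
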
/*
 * Get property values from the spa configuration.
 */
static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
	vdev_t *rvd = spa->spa_root_vdev;
	dsl_pool_t *pool = spa->spa_dsl_pool;
	uint64_t size, alloc, cap, version;
	zprop_source_t src = ZPROP_SRC_NONE;
	spa_config_dirent_t *dp;
	metaslab_class_t *mc = spa_normal_class(spa);

	ASSERT(MUTEX_HELD(&spa->spa_props_lock));

	if (rvd != NULL) {
		alloc = metaslab_class_get_alloc(spa_normal_class(spa));
		size = metaslab_class_get_space(spa_normal_class(spa));
		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
		    size - alloc, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
		    metaslab_class_fragmentation(mc), src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
		    metaslab_class_expandable_space(mc), src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
		    (spa_mode(spa) == FREAD), src);

		cap = (size == 0) ? 0 : (alloc * 100 / size);
		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
		    ddt_get_pool_dedup_ratio(spa), src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
		    rvd->vdev_state, src);

		version = spa_version(spa);
		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
			src = ZPROP_SRC_DEFAULT;
		else
			src = ZPROP_SRC_LOCAL;
		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
	}

	if (pool != NULL) {
		/*
		 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS;
		 * when opening pools created before this version, freedir
		 * will be NULL.
		 */
		if (pool->dp_free_dir != NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
			    dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
			    src);
		} else {
			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
			    NULL, 0, src);
		}

		if (pool->dp_leak_dir != NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
			    dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
			    src);
		} else {
			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
			    NULL, 0, src);
		}
	}

	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);

	if (spa->spa_comment != NULL) {
		spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
		    0, ZPROP_SRC_LOCAL);
	}

	if (spa->spa_root != NULL)
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
		    0, ZPROP_SRC_LOCAL);

	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
		    MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
	} else {
		spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
		    SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
	}

	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
		if (dp->scd_path == NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    "none", 0, ZPROP_SRC_LOCAL);
		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
		}
	}
}

/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
	objset_t *mos = spa->spa_meta_objset;
	zap_cursor_t zc;
	zap_attribute_t za;
	int err;

	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	mutex_enter(&spa->spa_props_lock);

	/*
	 * Get properties from the spa config.
	 */
	spa_prop_get_config(spa, nvp);

	/* If no pool property object, no more props to get. */
	if (mos == NULL || spa->spa_pool_props_object == 0) {
		mutex_exit(&spa->spa_props_lock);
		return (0);
	}

	/*
	 * Get properties from the MOS pool property object.
	 */
	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t intval = 0;
		char *strval = NULL;
		zprop_source_t src = ZPROP_SRC_DEFAULT;
		zpool_prop_t prop;

		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
			continue;

		switch (za.za_integer_length) {
		case 8:
			/* integer property */
			if (za.za_first_integer !=
			    zpool_prop_default_numeric(prop))
				src = ZPROP_SRC_LOCAL;

			if (prop == ZPOOL_PROP_BOOTFS) {
				dsl_pool_t *dp;
				dsl_dataset_t *ds = NULL;

				dp = spa_get_dsl(spa);
				dsl_pool_config_enter(dp, FTAG);
				if (err = dsl_dataset_hold_obj(dp,
				    za.za_first_integer, FTAG, &ds)) {
					dsl_pool_config_exit(dp, FTAG);
					break;
				}

				strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN,
				    KM_SLEEP);
				dsl_dataset_name(ds, strval);
				dsl_dataset_rele(ds, FTAG);
				dsl_pool_config_exit(dp, FTAG);
			} else {
				strval = NULL;
				intval = za.za_first_integer;
			}

			spa_prop_add_list(*nvp, prop, strval, intval, src);

			if (strval != NULL)
				kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN);

			break;

		case 1:
			/* string property */
			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
			err = zap_lookup(mos, spa->spa_pool_props_object,
			    za.za_name, 1, za.za_num_integers, strval);
			if (err) {
				kmem_free(strval, za.za_num_integers);
				break;
			}
			spa_prop_add_list(*nvp, prop, strval, 0, src);
			kmem_free(strval, za.za_num_integers);
			break;

		default:
			break;
		}
	}
	zap_cursor_fini(&zc);
	mutex_exit(&spa->spa_props_lock);
out:
	if (err && err != ENOENT) {
		nvlist_free(*nvp);
		*nvp = NULL;
		return (err);
	}

	return (0);
}

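/*
 * Hypothetical caller sketch (not from this file): spa_prop_get() allocates
 * the nvlist and returns 0 on success, so the caller owns and must free it.
 *
 *	nvlist_t *nvp;
 *	if (spa_prop_get(spa, &nvp) == 0) {
 *		... inspect properties ...
 *		nvlist_free(nvp);
 *	}
 *
 * On failure the nvlist has already been freed and *nvp set to NULL above.
 */
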
/*
 * Validate the given pool properties nvlist and modify the list
 * for the property values to be set.
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
	nvpair_t *elem;
	int error = 0, reset_bootfs = 0;
	uint64_t objnum = 0;
	boolean_t has_feature = B_FALSE;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
		uint64_t intval;
		char *strval, *slash, *check, *fname;
		const char *propname = nvpair_name(elem);
		zpool_prop_t prop = zpool_name_to_prop(propname);

		switch (prop) {
		case ZPROP_INVAL:
			if (!zpool_prop_feature(propname)) {
				error = SET_ERROR(EINVAL);
				break;
			}

			/*
			 * Sanitize the input.
			 */
			if (nvpair_type(elem) != DATA_TYPE_UINT64) {
				error = SET_ERROR(EINVAL);
				break;
			}

			if (nvpair_value_uint64(elem, &intval) != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			if (intval != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			fname = strchr(propname, '@') + 1;
			if (zfeature_lookup_name(fname, NULL) != 0) {
				error = SET_ERROR(EINVAL);
				break;
			}

			has_feature = B_TRUE;
			break;

		case ZPOOL_PROP_VERSION:
			error = nvpair_value_uint64(elem, &intval);
			if (!error &&
			    (intval < spa_version(spa) ||
			    intval > SPA_VERSION_BEFORE_FEATURES ||
			    has_feature))
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_DELEGATION:
		case ZPOOL_PROP_AUTOREPLACE:
		case ZPOOL_PROP_LISTSNAPS:
		case ZPOOL_PROP_AUTOEXPAND:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > 1)
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_BOOTFS:
			/*
			 * If the pool version is less than SPA_VERSION_BOOTFS,
			 * or the pool is still being created (version == 0),
			 * the bootfs property cannot be set.
			 */
			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
				error = SET_ERROR(ENOTSUP);
				break;
			}

			/*
			 * Make sure the vdev config is bootable
			 */
			if (!vdev_is_bootable(spa->spa_root_vdev)) {
				error = SET_ERROR(ENOTSUP);
				break;
			}

			reset_bootfs = 1;

			error = nvpair_value_string(elem, &strval);

			if (!error) {
				objset_t *os;
				uint64_t propval;

				if (strval == NULL || strval[0] == '\0') {
					objnum = zpool_prop_default_numeric(
					    ZPOOL_PROP_BOOTFS);
					break;
				}

				if (error = dmu_objset_hold(strval, FTAG, &os))
					break;

				/*
				 * Must be ZPL, and its property settings
				 * must be supported by GRUB (compression
				 * is not gzip, and large blocks are not used).
				 */

				if (dmu_objset_type(os) != DMU_OST_ZFS) {
					error = SET_ERROR(ENOTSUP);
				} else if ((error =
				    dsl_prop_get_int_ds(dmu_objset_ds(os),
				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
				    &propval)) == 0 &&
				    !BOOTFS_COMPRESS_VALID(propval)) {
					error = SET_ERROR(ENOTSUP);
				} else if ((error =
				    dsl_prop_get_int_ds(dmu_objset_ds(os),
				    zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
				    &propval)) == 0 &&
				    propval > SPA_OLD_MAXBLOCKSIZE) {
					error = SET_ERROR(ENOTSUP);
				} else {
					objnum = dmu_objset_id(os);
				}
				dmu_objset_rele(os, FTAG);
			}
			break;

		case ZPOOL_PROP_FAILUREMODE:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
			    intval > ZIO_FAILURE_MODE_PANIC))
				error = SET_ERROR(EINVAL);

			/*
			 * This is a special case which only occurs when
			 * the pool has completely failed. This allows
			 * the user to change the in-core failmode property
			 * without syncing it out to disk (I/Os might
			 * currently be blocked). We do this by returning
			 * EIO to the caller (spa_prop_set) to trick it
			 * into thinking we encountered a property validation
			 * error.
			 */
			if (!error && spa_suspended(spa)) {
				spa->spa_failmode = intval;
				error = SET_ERROR(EIO);
			}
			break;

		case ZPOOL_PROP_CACHEFILE:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;

			if (strval[0] == '\0')
				break;

			if (strcmp(strval, "none") == 0)
				break;

			if (strval[0] != '/') {
				error = SET_ERROR(EINVAL);
				break;
			}

			slash = strrchr(strval, '/');
			ASSERT(slash != NULL);

			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
			    strcmp(slash, "/..") == 0)
				error = SET_ERROR(EINVAL);
			break;

		case ZPOOL_PROP_COMMENT:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;
			for (check = strval; *check != '\0'; check++) {
				/*
				 * The kernel doesn't have an easy isprint()
				 * check. For this kernel check, we merely
				 * check ASCII apart from DEL. Fix this if
				 * there is an easy-to-use kernel isprint().
				 */
				if (*check >= 0x7f) {
					error = SET_ERROR(EINVAL);
					break;
				}
			}
			if (strlen(strval) > ZPROP_MAX_COMMENT)
				error = E2BIG;
			break;

		case ZPOOL_PROP_DEDUPDITTO:
			if (spa_version(spa) < SPA_VERSION_DEDUP)
				error = SET_ERROR(ENOTSUP);
			else
				error = nvpair_value_uint64(elem, &intval);
			if (error == 0 &&
			    intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
				error = SET_ERROR(EINVAL);
			break;
		}

		if (error)
			break;
	}

	if (!error && reset_bootfs) {
		error = nvlist_remove(props,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

		if (!error) {
			error = nvlist_add_uint64(props,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
		}
	}

	return (error);
}

void
spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
{
	char *cachefile;
	spa_config_dirent_t *dp;

	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
	    &cachefile) != 0)
		return;

	dp = kmem_alloc(sizeof (spa_config_dirent_t),
	    KM_SLEEP);

	if (cachefile[0] == '\0')
		dp->scd_path = spa_strdup(spa_config_path);
	else if (strcmp(cachefile, "none") == 0)
		dp->scd_path = NULL;
	else
		dp->scd_path = spa_strdup(cachefile);

	list_insert_head(&spa->spa_config_list, dp);
	if (need_sync)
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}

int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
	int error;
	nvpair_t *elem = NULL;
	boolean_t need_sync = B_FALSE;

	if ((error = spa_prop_validate(spa, nvp)) != 0)
		return (error);

	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
		zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));

		if (prop == ZPOOL_PROP_CACHEFILE ||
		    prop == ZPOOL_PROP_ALTROOT ||
		    prop == ZPOOL_PROP_READONLY)
			continue;

		if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) {
			uint64_t ver;

			if (prop == ZPOOL_PROP_VERSION) {
				VERIFY(nvpair_value_uint64(elem, &ver) == 0);
			} else {
				ASSERT(zpool_prop_feature(nvpair_name(elem)));
				ver = SPA_VERSION_FEATURES;
				need_sync = B_TRUE;
			}

			/* Save time if the version is already set. */
			if (ver == spa_version(spa))
				continue;

			/*
			 * In addition to the pool directory object, we might
			 * create the pool properties object, the features for
			 * read object, the features for write object, or the
			 * feature descriptions object.
			 */
			error = dsl_sync_task(spa->spa_name, NULL,
			    spa_sync_version, &ver,
			    6, ZFS_SPACE_CHECK_RESERVED);
			if (error)
				return (error);
			continue;
		}

		need_sync = B_TRUE;
		break;
	}

	if (need_sync) {
		return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
		    nvp, 6, ZFS_SPACE_CHECK_RESERVED));
	}

	return (0);
}

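/*
 * Hypothetical example (property names taken from the validation cases
 * above): a caller would hand spa_prop_set() an nvlist built roughly as
 *
 *	nvlist_add_uint64(nvp, zpool_prop_to_name(ZPOOL_PROP_VERSION), ver);
 *	nvlist_add_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), path);
 *
 * Version and feature upgrades are synced immediately through
 * spa_sync_version(); most other properties are batched into a single
 * spa_sync_props() sync task.
 */
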
/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}

/*ARGSUSED*/
static int
spa_change_guid_check(void *arg, dmu_tx_t *tx)
{
	uint64_t *newguid = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t vdev_state;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	vdev_state = rvd->vdev_state;
	spa_config_exit(spa, SCL_STATE, FTAG);

	if (vdev_state != VDEV_STATE_HEALTHY)
		return (SET_ERROR(ENXIO));

	ASSERT3U(spa_guid(spa), !=, *newguid);

	return (0);
}

static void
spa_change_guid_sync(void *arg, dmu_tx_t *tx)
{
	uint64_t *newguid = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	uint64_t oldguid;
	vdev_t *rvd = spa->spa_root_vdev;

	oldguid = spa_guid(spa);

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	rvd->vdev_guid = *newguid;
	rvd->vdev_guid_sum += (*newguid - oldguid);
	vdev_config_dirty(rvd);
	spa_config_exit(spa, SCL_STATE, FTAG);

	spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
	    oldguid, *newguid);
}

/*
 * Change the GUID for the pool.  This is done so that we can later
 * re-import a pool built from a clone of our own vdevs.  We will modify
 * the root vdev's guid, our own pool guid, and then mark all of our
 * vdevs dirty.  Note that we must make sure that all our vdevs are
 * online when we do this, or else any vdevs that weren't present
 * would be orphaned from our pool.  We are also going to issue a
 * sysevent to update any watchers.
 */
int
spa_change_guid(spa_t *spa)
{
	int error;
	uint64_t guid;

	mutex_enter(&spa->spa_vdev_top_lock);
	mutex_enter(&spa_namespace_lock);
	guid = spa_generate_guid(NULL);

	error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
	    spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);

	if (error == 0) {
		spa_config_sync(spa, B_FALSE, B_TRUE);
		spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID);
	}

	mutex_exit(&spa_namespace_lock);
	mutex_exit(&spa->spa_vdev_top_lock);

	return (error);
}

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_phys_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

static void
spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
	const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
	enum zti_modes mode = ztip->zti_mode;
	uint_t value = ztip->zti_value;
	uint_t count = ztip->zti_count;
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
	char name[32];
	uint_t flags = 0;
	boolean_t batch = B_FALSE;

	if (mode == ZTI_MODE_NULL) {
		tqs->stqs_count = 0;
		tqs->stqs_taskq = NULL;
		return;
	}

	ASSERT3U(count, >, 0);

	tqs->stqs_count = count;
	tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);

	switch (mode) {
	case ZTI_MODE_FIXED:
		ASSERT3U(value, >=, 1);
		value = MAX(value, 1);
		break;

	case ZTI_MODE_BATCH:
		batch = B_TRUE;
		flags |= TASKQ_THREADS_CPU_PCT;
		value = zio_taskq_batch_pct;
		break;

	default:
		panic("unrecognized mode for %s_%s taskq (%u:%u) in "
		    "spa_activate()",
		    zio_type_name[t], zio_taskq_types[q], mode, value);
		break;
	}

	for (uint_t i = 0; i < count; i++) {
		taskq_t *tq;

		if (count > 1) {
			(void) snprintf(name, sizeof (name), "%s_%s_%u",
			    zio_type_name[t], zio_taskq_types[q], i);
		} else {
			(void) snprintf(name, sizeof (name), "%s_%s",
			    zio_type_name[t], zio_taskq_types[q]);
		}

#ifdef SYSDC
		if (zio_taskq_sysdc && spa->spa_proc != &p0) {
			if (batch)
				flags |= TASKQ_DC_BATCH;

			tq = taskq_create_sysdc(name, value, 50, INT_MAX,
			    spa->spa_proc, zio_taskq_basedc, flags);
		} else {
#endif
			pri_t pri = maxclsyspri;
			/*
			 * The write issue taskq can be extremely CPU
			 * intensive.  Run it at slightly lower priority
			 * than the other taskqs.
			 */
			if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
				pri++;

			tq = taskq_create_proc(name, value, pri, 50,
			    INT_MAX, spa->spa_proc, flags);
#ifdef SYSDC
		}
#endif

		tqs->stqs_taskq[i] = tq;
	}
}

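/*
 * Example of the names generated above (assuming zio_type_name[] entries
 * of the form "zio_read", "zio_write", ...): the READ/INTR entry
 * ZTI_P(12, 8) yields eight taskqs named zio_read_intr_0 through
 * zio_read_intr_7, each with 12 threads, while single-taskq entries drop
 * the numeric suffix, e.g. zio_write_issue for the batch write taskq.
 */
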
static void
spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];

	if (tqs->stqs_taskq == NULL) {
		ASSERT0(tqs->stqs_count);
		return;
	}

	for (uint_t i = 0; i < tqs->stqs_count; i++) {
		ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
		taskq_destroy(tqs->stqs_taskq[i]);
	}

	kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
	tqs->stqs_taskq = NULL;
}

/*
 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
 * Note that a type may have multiple discrete taskqs to avoid lock contention
 * on the taskq itself. In that case we choose which taskq at random by using
 * the low bits of gethrtime().
 */
void
spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
{
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
	taskq_t *tq;

	ASSERT3P(tqs->stqs_taskq, !=, NULL);
	ASSERT3U(tqs->stqs_count, !=, 0);

	if (tqs->stqs_count == 1) {
		tq = tqs->stqs_taskq[0];
	} else {
#ifdef _KERNEL
		tq = tqs->stqs_taskq[cpu_ticks() % tqs->stqs_count];
#else
		tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count];
#endif
	}

	taskq_dispatch_ent(tq, func, arg, flags, ent);
}

static void
spa_create_zio_taskqs(spa_t *spa)
{
	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			spa_taskqs_init(spa, t, q);
		}
	}
}

#ifdef _KERNEL
#ifdef SPA_PROCESS
static void
spa_thread(void *arg)
{
	callb_cpr_t cprinfo;

	spa_t *spa = arg;
	user_t *pu = PTOU(curproc);

	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
	    spa->spa_name);

	ASSERT(curproc != &p0);
	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
	    "zpool-%s", spa->spa_name);
	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));

#ifdef PSRSET_BIND
	/* bind this thread to the requested psrset */
	if (zio_taskq_psrset_bind != PS_NONE) {
		pool_lock();
		mutex_enter(&cpu_lock);
		mutex_enter(&pidlock);
		mutex_enter(&curproc->p_lock);

		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
		    0, NULL, NULL) == 0) {
			curthread->t_bind_pset = zio_taskq_psrset_bind;
		} else {
			cmn_err(CE_WARN,
			    "Couldn't bind process for zfs pool \"%s\" to "
			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
		}

		mutex_exit(&curproc->p_lock);
		mutex_exit(&pidlock);
		mutex_exit(&cpu_lock);
		pool_unlock();
	}
#endif

#ifdef SYSDC
	if (zio_taskq_sysdc) {
		sysdc_thread_enter(curthread, 100, 0);
	}
#endif

	spa->spa_proc = curproc;
	spa->spa_did = curthread->t_did;

	spa_create_zio_taskqs(spa);

	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);

	spa->spa_proc_state = SPA_PROC_ACTIVE;
	cv_broadcast(&spa->spa_proc_cv);

	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);

	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
	spa->spa_proc_state = SPA_PROC_GONE;
	spa->spa_proc = &p0;
	cv_broadcast(&spa->spa_proc_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */

	mutex_enter(&curproc->p_lock);
	lwp_exit();
}
#endif	/* SPA_PROCESS */
#endif

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa, int mode)
{
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_mode = mode;

	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);

	/* Try to create a covering process */
	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
	ASSERT(spa->spa_proc == &p0);
	spa->spa_did = 0;

#ifdef SPA_PROCESS
	/* Only create a process if we're going to be around a while. */
	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
		    NULL, 0) == 0) {
			spa->spa_proc_state = SPA_PROC_CREATED;
			while (spa->spa_proc_state == SPA_PROC_CREATED) {
				cv_wait(&spa->spa_proc_cv,
				    &spa->spa_proc_lock);
			}
			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
			ASSERT(spa->spa_proc != &p0);
			ASSERT(spa->spa_did != 0);
		} else {
#ifdef _KERNEL
			cmn_err(CE_WARN,
			    "Couldn't create process for zfs pool \"%s\"\n",
			    spa->spa_name);
#endif
		}
	}
#endif	/* SPA_PROCESS */
	mutex_exit(&spa->spa_proc_lock);

	/* If we didn't create a process, we need to create our taskqs. */
	ASSERT(spa->spa_proc == &p0);
	if (spa->spa_proc == &p0) {
		spa_create_zio_taskqs(spa);
	}

	/*
	 * Start TRIM thread.
	 */
	trim_thread_create(spa);

	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_config_dirty_node));
	list_create(&spa->spa_evicting_os_list, sizeof (objset_t),
	    offsetof(objset_t, os_evicting_node));
	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_state_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);
	ASSERT(spa->spa_async_zio_root == NULL);
	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	/*
	 * Stop TRIM thread in case spa_unload() wasn't called directly
	 * before spa_deactivate().
	 */
	trim_thread_destroy(spa);

	spa_evicting_os_wait(spa);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_config_dirty_list);
	list_destroy(&spa->spa_evicting_os_list);
	list_destroy(&spa->spa_state_dirty_list);

	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			spa_taskqs_fini(spa, t, q);
		}
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;

	mutex_enter(&spa->spa_proc_lock);
	if (spa->spa_proc_state != SPA_PROC_NONE) {
		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
		cv_broadcast(&spa->spa_proc_cv);
		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
			ASSERT(spa->spa_proc != &p0);
			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
		}
		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
		spa->spa_proc_state = SPA_PROC_NONE;
	}
	ASSERT(spa->spa_proc == &p0);
	mutex_exit(&spa->spa_proc_lock);

#ifdef SPA_PROCESS
	/*
	 * We want to make sure spa_thread() has actually exited the ZFS
	 * module, so that the module can't be unloaded out from underneath
	 * it.
	 */
	if (spa->spa_did != 0) {
		thread_join(spa->spa_did);
		spa->spa_did = 0;
	}
#endif	/* SPA_PROCESS */
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.  This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state.  This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children);

	if (error == ENOENT)
		return (0);

	if (error) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (SET_ERROR(EINVAL));
	}

	for (int c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}

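/*
 * A minimal sketch of the nvlist shape spa_config_parse() consumes (keys
 * correspond to ZPOOL_CONFIG_* names; the values are illustrative only):
 *
 *	{ type: "root", children: [
 *		{ type: "mirror", children: [
 *			{ type: "disk", path: "/dev/..." },
 *			{ type: "disk", path: "/dev/..." } ] } ] }
 *
 * Interior vdevs recurse through their ZPOOL_CONFIG_CHILDREN array; leaf
 * vdevs (vdev_op_leaf) terminate the recursion.
 */
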
/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	/*
	 * Stop TRIM thread.
	 */
	trim_thread_destroy(spa);

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding async I/O to complete.
	 */
	if (spa->spa_async_zio_root != NULL) {
		for (int i = 0; i < max_ncpus; i++)
			(void) zio_wait(spa->spa_async_zio_root[i]);
		kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
		spa->spa_async_zio_root = NULL;
	}

	bpobj_close(&spa->spa_deferred_bpobj);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
		spa->spa_meta_objset = NULL;
	}

	ddt_unload(spa);

	/*
	 * Drop and purge level 2 cache
	 */
	spa_l2cache_drop(spa);

	for (i = 0; i < spa->spa_spares.sav_count; i++)
		vdev_free(spa->spa_spares.sav_vdevs[i]);
	if (spa->spa_spares.sav_vdevs) {
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));
		spa->spa_spares.sav_vdevs = NULL;
	}
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
	}
	spa->spa_spares.sav_count = 0;

	for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
		vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
	}
	if (spa->spa_l2cache.sav_vdevs) {
		kmem_free(spa->spa_l2cache.sav_vdevs,
		    spa->spa_l2cache.sav_count * sizeof (void *));
		spa->spa_l2cache.sav_vdevs = NULL;
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
	}
	spa->spa_l2cache.sav_count = 0;

	spa->spa_async_suspended = 0;

	if (spa->spa_comment != NULL) {
		spa_strfree(spa->spa_comment);
		spa->spa_comment = NULL;
	}

	spa_config_exit(spa, SCL_ALL, FTAG);
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		vd = spa->spa_spares.sav_vdevs[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL && tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares.sav_vdevs)
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));

	if (spa->spa_spares.sav_config == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_spares.sav_count = (int)nspares;
	spa->spa_spares.sav_vdevs = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process.  For each spare, there are potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in).  During this phase we open and
	 * validate each vdev on the spare list.  If the vdev also exists in
	 * the active configuration, then we also mark this vdev as an active
	 * spare.
	 */
	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares.sav_vdevs[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev.  Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise).  Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		vd->vdev_top = vd;
		vd->vdev_aux = &spa->spa_spares;

		if (vdev_open(vd) != 0)
			continue;

		if (vdev_validate_aux(vd) == 0)
			spa_spare_add(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		spares[i] = vdev_config_generate(spa,
		    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}

/*
 * Load (or re-load) the current list of vdevs describing the active l2cache for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 * Devices which are already active have their details maintained, and are
 * not re-opened.
 */
static void
spa_load_l2cache(spa_t *spa)
{
	nvlist_t **l2cache;
	uint_t nl2cache;
	int i, j, oldnvdevs;
	uint64_t guid;
	vdev_t *vd, **oldvdevs, **newvdevs;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (sav->sav_config != NULL) {
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
	} else {
		nl2cache = 0;
		newvdevs = NULL;
	}

	oldvdevs = sav->sav_vdevs;
	oldnvdevs = sav->sav_count;
	sav->sav_vdevs = NULL;
	sav->sav_count = 0;

	/*
	 * Process new nvlist of vdevs.
	 */
	for (i = 0; i < nl2cache; i++) {
		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
		    &guid) == 0);

		newvdevs[i] = NULL;
		for (j = 0; j < oldnvdevs; j++) {
			vd = oldvdevs[j];
			if (vd != NULL && guid == vd->vdev_guid) {
				/*
				 * Retain previous vdev for add/remove ops.
				 */
				newvdevs[i] = vd;
				oldvdevs[j] = NULL;
				break;
			}
		}

		if (newvdevs[i] == NULL) {
			/*
			 * Create new vdev
			 */
			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
			    VDEV_ALLOC_L2CACHE) == 0);
			ASSERT(vd != NULL);
			newvdevs[i] = vd;

			/*
			 * Commit this vdev as an l2cache device,
			 * even if it fails to open.
			 */
			spa_l2cache_add(vd);

			vd->vdev_top = vd;
			vd->vdev_aux = sav;

			spa_l2cache_activate(vd);

			if (vdev_open(vd) != 0)
				continue;

			(void) vdev_validate_aux(vd);

			if (!vdev_is_dead(vd))
				l2arc_add_vdev(spa, vd);
		}
	}

	/*
	 * Purge vdevs that were dropped
	 */
	for (i = 0; i < oldnvdevs; i++) {
		uint64_t pool;

		vd = oldvdevs[i];
		if (vd != NULL) {
			ASSERT(vd->vdev_isl2cache);

			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
			    pool != 0ULL && l2arc_vdev_present(vd))
				l2arc_remove_vdev(vd);
			vdev_clear_stats(vd);
			vdev_free(vd);
		}
	}

	if (oldvdevs)
		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));

	if (sav->sav_config == NULL)
		goto out;

	sav->sav_vdevs = newvdevs;
	sav->sav_count = (int)nl2cache;

	/*
	 * Recompute the stashed list of l2cache devices, with status
	 * information this time.
	 */
	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
	for (i = 0; i < sav->sav_count; i++)
		l2cache[i] = vdev_config_generate(spa,
		    sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
out:
	for (i = 0; i < sav->sav_count; i++)
		nvlist_free(l2cache[i]);
	if (sav->sav_count)
		kmem_free(l2cache, sav->sav_count * sizeof (void *));
}

static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;
	*value = NULL;

	error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db);
	if (error != 0)
		return (error);

	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
	    DMU_READ_PREFETCH);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	kmem_free(packed, nvsize);

	return (error);
}

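/*
 * On-disk layout assumed by load_nvlist() (a restatement of the reads
 * above, not an extra format definition): the object's bonus buffer holds
 * a single uint64 giving the packed size, and the object data holds that
 * many bytes of a packed nvlist, which nvlist_unpack() converts back into
 * an in-core nvlist_t.
 */
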
/*
 * Checks to see if the given vdev could not be opened, in which case we post a
 * sysevent to notify the autoreplace code that the device has been removed.
 */
static void
spa_check_removed(vdev_t *vd)
{
	for (int c = 0; c < vd->vdev_children; c++)
		spa_check_removed(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
	    !vd->vdev_ishole) {
		zfs_post_autoreplace(vd->vdev_spa, vd);
		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
	}
}

/*
 * Validate the current config against the MOS config
 */
static boolean_t
spa_config_valid(spa_t *spa, nvlist_t *config)
{
	vdev_t *mrvd, *rvd = spa->spa_root_vdev;
	nvlist_t *nv;

	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);

	ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);

	/*
	 * If we're doing a normal import, then build up any additional
	 * diagnostic information about missing devices in this config.
	 * We'll pass this up to the user for further processing.
	 */
	if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
		nvlist_t **child, *nv;
		uint64_t idx = 0;

		child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
		    KM_SLEEP);
		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);

		for (int c = 0; c < rvd->vdev_children; c++) {
			vdev_t *tvd = rvd->vdev_child[c];
			vdev_t *mtvd = mrvd->vdev_child[c];

			if (tvd->vdev_ops == &vdev_missing_ops &&
			    mtvd->vdev_ops != &vdev_missing_ops &&
			    mtvd->vdev_islog)
				child[idx++] = vdev_config_generate(spa, mtvd,
				    B_FALSE, 0);
		}

		if (idx) {
			VERIFY(nvlist_add_nvlist_array(nv,
			    ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
			VERIFY(nvlist_add_nvlist(spa->spa_load_info,
			    ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);

			for (int i = 0; i < idx; i++)
				nvlist_free(child[i]);
		}
		nvlist_free(nv);
		kmem_free(child, rvd->vdev_children * sizeof (char **));
	}

	/*
	 * Compare the root vdev tree with the information we have
	 * from the MOS config (mrvd). Check each top-level vdev
	 * with the corresponding MOS config top-level (mtvd).
	 */
	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		vdev_t *mtvd = mrvd->vdev_child[c];

		/*
		 * Resolve any "missing" vdevs in the current configuration.
		 * If we find that the MOS config has more accurate information
		 * about the top-level vdev then use that vdev instead.
		 */
		if (tvd->vdev_ops == &vdev_missing_ops &&
		    mtvd->vdev_ops != &vdev_missing_ops) {

			if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG))
				continue;

			/*
			 * Device specific actions.
			 */
			if (mtvd->vdev_islog) {
				spa_set_log_state(spa, SPA_LOG_CLEAR);
			} else {
				/*
				 * XXX - once we have 'readonly' pool
				 * support we should be able to handle
				 * missing data devices by transitioning
				 * the pool to readonly.
				 */
				continue;
			}

			/*
			 * Swap the missing vdev with the data we were
			 * able to obtain from the MOS config.
			 */
			vdev_remove_child(rvd, tvd);
			vdev_remove_child(mrvd, mtvd);

			vdev_add_child(rvd, mtvd);
			vdev_add_child(mrvd, tvd);

			spa_config_exit(spa, SCL_ALL, FTAG);
			vdev_load(mtvd);
			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

			vdev_reopen(rvd);
		} else if (mtvd->vdev_islog) {
			/*
			 * Load the slog device's state from the MOS config
			 * since it's possible that the label does not
			 * contain the most up-to-date information.
			 */
			vdev_load_log_state(tvd, mtvd);
			vdev_reopen(tvd);
		}
	}
	vdev_free(mrvd);
	spa_config_exit(spa, SCL_ALL, FTAG);

	/*
	 * Ensure we were able to validate the config.
	 */
	return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
}

/*
 * Check for missing log devices
 */
static boolean_t
spa_check_logs(spa_t *spa)
{
	boolean_t rv = B_FALSE;
	dsl_pool_t *dp = spa_get_dsl(spa);

	switch (spa->spa_log_state) {
	case SPA_LOG_MISSING:
		/* need to recheck in case slog has been restored */
	case SPA_LOG_UNKNOWN:
		rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
		    zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0);
		if (rv)
			spa_set_log_state(spa, SPA_LOG_MISSING);
		break;
	}
	return (rv);
}

static boolean_t
spa_passivate_log(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;
	boolean_t slog_found = B_FALSE;

	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));

	if (!spa_has_slogs(spa))
		return (B_FALSE);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (tvd->vdev_islog) {
			metaslab_group_passivate(mg);
			slog_found = B_TRUE;
		}
	}

	return (slog_found);
}

static void
spa_activate_log(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (tvd->vdev_islog)
			metaslab_group_activate(mg);
	}
}

int
spa_offline_log(spa_t *spa)
{
	int error;

	error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
	    NULL, DS_FIND_CHILDREN);
	if (error == 0) {
		/*
		 * We successfully offlined the log device, sync out the
		 * current txg so that the "stubby" block can be removed
		 * by zil_sync().
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);
	}
	return (error);
}

static void
spa_aux_check_removed(spa_aux_vdev_t *sav)
{
	int i;

	for (i = 0; i < sav->sav_count; i++)
		spa_check_removed(sav->sav_vdevs[i]);
}

void
spa_claim_notify(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	if (zio->io_error)
		return;

	mutex_enter(&spa->spa_props_lock);	/* any mutex will do */
	if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
		spa->spa_claim_max_txg = zio->io_bp->blk_birth;
	mutex_exit(&spa->spa_props_lock);
}

typedef struct spa_load_error {
	uint64_t	sle_meta_count;
	uint64_t	sle_data_count;
} spa_load_error_t;

static void
spa_load_verify_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	spa_load_error_t *sle = zio->io_private;
	dmu_object_type_t type = BP_GET_TYPE(bp);
	int error = zio->io_error;
	spa_t *spa = zio->io_spa;

	if (error) {
		if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
		    type != DMU_OT_INTENT_LOG)
			atomic_inc_64(&sle->sle_meta_count);
		else
			atomic_inc_64(&sle->sle_data_count);
	}
	zio_data_buf_free(zio->io_data, zio->io_size);

	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_inflight--;
	cv_broadcast(&spa->spa_scrub_io_cv);
	mutex_exit(&spa->spa_scrub_lock);
}

/*
 * Maximum number of concurrent scrub i/os to create while verifying
 * a pool while importing it.
 */
int spa_load_verify_maxinflight = 10000;
boolean_t spa_load_verify_metadata = B_TRUE;
boolean_t spa_load_verify_data = B_TRUE;

SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_maxinflight, CTLFLAG_RWTUN,
    &spa_load_verify_maxinflight, 0,
    "Maximum number of concurrent scrub I/Os to create while verifying a "
    "pool while importing it");

SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_metadata, CTLFLAG_RWTUN,
    &spa_load_verify_metadata, 0,
    "Check metadata on import?");

SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_data, CTLFLAG_RWTUN,
    &spa_load_verify_data, 0,
    "Check user data on import?");

/*ARGSUSED*/
static int
spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
	if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
		return (0);
	/*
	 * Note: normally this routine will not be called if
	 * spa_load_verify_metadata is not set.  However, it may be useful
	 * to manually set the flag after the traversal has begun.
	 */
	if (!spa_load_verify_metadata)
		return (0);
	if (BP_GET_BUFC_TYPE(bp) == ARC_BUFC_DATA && !spa_load_verify_data)
		return (0);

	zio_t *rio = arg;
	size_t size = BP_GET_PSIZE(bp);
	void *data = zio_data_buf_alloc(size);

	mutex_enter(&spa->spa_scrub_lock);
	while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
	spa->spa_scrub_inflight++;
	mutex_exit(&spa->spa_scrub_lock);

	zio_nowait(zio_read(rio, spa, bp, data, size,
	    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
	    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
	return (0);
}

/* ARGSUSED */
int
verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
{
	if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN)
		return (SET_ERROR(ENAMETOOLONG));

	return (0);
}

static int
spa_load_verify(spa_t *spa)
{
	zio_t *rio;
	spa_load_error_t sle = { 0 };
	zpool_rewind_policy_t policy;
	boolean_t verify_ok = B_FALSE;
	int error = 0;

	zpool_get_rewind_policy(spa->spa_config, &policy);

	if (policy.zrp_request & ZPOOL_NEVER_REWIND)
		return (0);

	dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
	error = dmu_objset_find_dp(spa->spa_dsl_pool,
	    spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL,
	    DS_FIND_CHILDREN);
	dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
	if (error != 0)
		return (error);

	rio = zio_root(spa, NULL, &sle,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);

	if (spa_load_verify_metadata) {
		error = traverse_pool(spa, spa->spa_verify_min_txg,
		    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
		    spa_load_verify_cb, rio);
	}

	(void) zio_wait(rio);

	spa->spa_load_meta_errors = sle.sle_meta_count;
	spa->spa_load_data_errors = sle.sle_data_count;

	if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
	    sle.sle_data_count <= policy.zrp_maxdata) {
		int64_t loss = 0;

		verify_ok = B_TRUE;
		spa->spa_load_txg = spa->spa_uberblock.ub_txg;
		spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;

		loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
VERIFY(nvlist_add_uint64(spa->spa_load_info, 2029 ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); 2030 VERIFY(nvlist_add_int64(spa->spa_load_info, 2031 ZPOOL_CONFIG_REWIND_TIME, loss) == 0); 2032 VERIFY(nvlist_add_uint64(spa->spa_load_info, 2033 ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); 2034 } else { 2035 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; 2036 } 2037 2038 if (error) { 2039 if (error != ENXIO && error != EIO) 2040 error = SET_ERROR(EIO); 2041 return (error); 2042 } 2043 2044 return (verify_ok ? 0 : EIO); 2045} 2046 2047/* 2048 * Find a value in the pool props object. 2049 */ 2050static void 2051spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) 2052{ 2053 (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, 2054 zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); 2055} 2056 2057/* 2058 * Find a value in the pool directory object. 2059 */ 2060static int 2061spa_dir_prop(spa_t *spa, const char *name, uint64_t *val) 2062{ 2063 return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2064 name, sizeof (uint64_t), 1, val)); 2065} 2066 2067static int 2068spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 2069{ 2070 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 2071 return (err); 2072} 2073 2074/* 2075 * Fix up config after a partly-completed split. This is done with the 2076 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 2077 * pool have that entry in their config, but only the splitting one contains 2078 * a list of all the guids of the vdevs that are being split off. 2079 * 2080 * This function determines what to do with that list: either rejoin 2081 * all the disks to the pool, or complete the splitting process. To attempt 2082 * the rejoin, each disk that is offlined is marked online again, and 2083 * we do a reopen() call. If the vdev label for every disk that was 2084 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 2085 * then we call vdev_split() on each disk, and complete the split. 2086 * 2087 * Otherwise we leave the config alone, with all the vdevs in place in 2088 * the original pool. 2089 */ 2090static void 2091spa_try_repair(spa_t *spa, nvlist_t *config) 2092{ 2093 uint_t extracted; 2094 uint64_t *glist; 2095 uint_t i, gcount; 2096 nvlist_t *nvl; 2097 vdev_t **vd; 2098 boolean_t attempt_reopen; 2099 2100 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 2101 return; 2102 2103 /* check that the config is complete */ 2104 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 2105 &glist, &gcount) != 0) 2106 return; 2107 2108 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 2109 2110 /* attempt to online all the vdevs & validate */ 2111 attempt_reopen = B_TRUE; 2112 for (i = 0; i < gcount; i++) { 2113 if (glist[i] == 0) /* vdev is hole */ 2114 continue; 2115 2116 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 2117 if (vd[i] == NULL) { 2118 /* 2119 * Don't bother attempting to reopen the disks; 2120 * just do the split. 
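 * (spa_lookup_by_guid() returning NULL means the device is no
 * longer known to this pool, so a rejoin could not succeed anyway.)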
2121 */ 2122 attempt_reopen = B_FALSE; 2123 } else { 2124 /* attempt to re-online it */ 2125 vd[i]->vdev_offline = B_FALSE; 2126 } 2127 } 2128 2129 if (attempt_reopen) { 2130 vdev_reopen(spa->spa_root_vdev); 2131 2132 /* check each device to see what state it's in */ 2133 for (extracted = 0, i = 0; i < gcount; i++) { 2134 if (vd[i] != NULL && 2135 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 2136 break; 2137 ++extracted; 2138 } 2139 } 2140 2141 /* 2142 * If every disk has been moved to the new pool, or if we never 2143 * even attempted to look at them, then we split them off for 2144 * good. 2145 */ 2146 if (!attempt_reopen || gcount == extracted) { 2147 for (i = 0; i < gcount; i++) 2148 if (vd[i] != NULL) 2149 vdev_split(vd[i]); 2150 vdev_reopen(spa->spa_root_vdev); 2151 } 2152 2153 kmem_free(vd, gcount * sizeof (vdev_t *)); 2154} 2155 2156static int 2157spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, 2158 boolean_t mosconfig) 2159{ 2160 nvlist_t *config = spa->spa_config; 2161 char *ereport = FM_EREPORT_ZFS_POOL; 2162 char *comment; 2163 int error; 2164 uint64_t pool_guid; 2165 nvlist_t *nvl; 2166 2167 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) 2168 return (SET_ERROR(EINVAL)); 2169 2170 ASSERT(spa->spa_comment == NULL); 2171 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 2172 spa->spa_comment = spa_strdup(comment); 2173 2174 /* 2175 * Versioning wasn't explicitly added to the label until later, so if 2176 * it's not present treat it as the initial version. 2177 */ 2178 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 2179 &spa->spa_ubsync.ub_version) != 0) 2180 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 2181 2182 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 2183 &spa->spa_config_txg); 2184 2185 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 2186 spa_guid_exists(pool_guid, 0)) { 2187 error = SET_ERROR(EEXIST); 2188 } else { 2189 spa->spa_config_guid = pool_guid; 2190 2191 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, 2192 &nvl) == 0) { 2193 VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, 2194 KM_SLEEP) == 0); 2195 } 2196 2197 nvlist_free(spa->spa_load_info); 2198 spa->spa_load_info = fnvlist_alloc(); 2199 2200 gethrestime(&spa->spa_loaded_ts); 2201 error = spa_load_impl(spa, pool_guid, config, state, type, 2202 mosconfig, &ereport); 2203 } 2204 2205 /* 2206 * Don't count references from objsets that are already closed 2207 * and are making their way through the eviction process. 2208 */ 2209 spa_evicting_os_wait(spa); 2210 spa->spa_minref = refcount_count(&spa->spa_refcount); 2211 if (error) { 2212 if (error != EEXIST) { 2213 spa->spa_loaded_ts.tv_sec = 0; 2214 spa->spa_loaded_ts.tv_nsec = 0; 2215 } 2216 if (error != EBADF) { 2217 zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); 2218 } 2219 } 2220 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 2221 spa->spa_ena = 0; 2222 2223 return (error); 2224} 2225 2226/* 2227 * Load an existing storage pool, using the pool's builtin spa_config as a 2228 * source of configuration information. 
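 *
 * The load proceeds in the stages visible below: parse the config into
 * a vdev tree, open the vdevs and validate their labels, select the
 * best uberblock, check feature flags, and finally open the DSL pool
 * and the supporting MOS objects (error logs, history, spares, etc.).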
2229 */ 2230static int 2231spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, 2232 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, 2233 char **ereport) 2234{ 2235 int error = 0; 2236 nvlist_t *nvroot = NULL; 2237 nvlist_t *label; 2238 vdev_t *rvd; 2239 uberblock_t *ub = &spa->spa_uberblock; 2240 uint64_t children, config_cache_txg = spa->spa_config_txg; 2241 int orig_mode = spa->spa_mode; 2242 int parse; 2243 uint64_t obj; 2244 boolean_t missing_feat_write = B_FALSE; 2245 2246 /* 2247 * If this is an untrusted config, access the pool in read-only mode. 2248 * This prevents things like resilvering recently removed devices. 2249 */ 2250 if (!mosconfig) 2251 spa->spa_mode = FREAD; 2252 2253 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 2254 2255 spa->spa_load_state = state; 2256 2257 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) 2258 return (SET_ERROR(EINVAL)); 2259 2260 parse = (type == SPA_IMPORT_EXISTING ? 2261 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 2262 2263 /* 2264 * Create "The Godfather" zio to hold all async IOs 2265 */ 2266 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 2267 KM_SLEEP); 2268 for (int i = 0; i < max_ncpus; i++) { 2269 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 2270 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 2271 ZIO_FLAG_GODFATHER); 2272 } 2273 2274 /* 2275 * Parse the configuration into a vdev tree. We explicitly set the 2276 * value that will be returned by spa_version() since parsing the 2277 * configuration requires knowing the version number. 2278 */ 2279 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2280 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); 2281 spa_config_exit(spa, SCL_ALL, FTAG); 2282 2283 if (error != 0) 2284 return (error); 2285 2286 ASSERT(spa->spa_root_vdev == rvd); 2287 ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); 2288 ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); 2289 2290 if (type != SPA_IMPORT_ASSEMBLE) { 2291 ASSERT(spa_guid(spa) == pool_guid); 2292 } 2293 2294 /* 2295 * Try to open all vdevs, loading each label in the process. 2296 */ 2297 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2298 error = vdev_open(rvd); 2299 spa_config_exit(spa, SCL_ALL, FTAG); 2300 if (error != 0) 2301 return (error); 2302 2303 /* 2304 * We need to validate the vdev labels against the configuration that 2305 * we have in hand, which is dependent on the setting of mosconfig. If 2306 * mosconfig is true then we're validating the vdev labels based on 2307 * that config. Otherwise, we're validating against the cached config 2308 * (zpool.cache) that was read when we loaded the zfs module, and then 2309 * later we will recursively call spa_load() and validate against 2310 * the vdev config. 2311 * 2312 * If we're assembling a new pool that's been split off from an 2313 * existing pool, the labels haven't yet been updated so we skip 2314 * validation for now. 2315 */ 2316 if (type != SPA_IMPORT_ASSEMBLE) { 2317 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2318 error = vdev_validate(rvd, mosconfig); 2319 spa_config_exit(spa, SCL_ALL, FTAG); 2320 2321 if (error != 0) 2322 return (error); 2323 2324 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2325 return (SET_ERROR(ENXIO)); 2326 } 2327 2328 /* 2329 * Find the best uberblock. 2330 */ 2331 vdev_uberblock_load(rvd, ub, &label); 2332 2333 /* 2334 * If we weren't able to find a single valid uberblock, return failure. 
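 * (ub_txg == 0 here means vdev_uberblock_load() found no valid
 * uberblock on any vdev.)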
2335 */ 2336 if (ub->ub_txg == 0) { 2337 nvlist_free(label); 2338 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 2339 } 2340 2341 /* 2342 * If the pool has an unsupported version we can't open it. 2343 */ 2344 if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 2345 nvlist_free(label); 2346 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 2347 } 2348 2349 if (ub->ub_version >= SPA_VERSION_FEATURES) { 2350 nvlist_t *features; 2351 2352 /* 2353 * If we weren't able to find what's necessary for reading the 2354 * MOS in the label, return failure. 2355 */ 2356 if (label == NULL || nvlist_lookup_nvlist(label, 2357 ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) { 2358 nvlist_free(label); 2359 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2360 ENXIO)); 2361 } 2362 2363 /* 2364 * Update our in-core representation with the definitive values 2365 * from the label. 2366 */ 2367 nvlist_free(spa->spa_label_features); 2368 VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); 2369 } 2370 2371 nvlist_free(label); 2372 2373 /* 2374 * Look through entries in the label nvlist's features_for_read. If 2375 * there is a feature listed there which we don't understand then we 2376 * cannot open a pool. 2377 */ 2378 if (ub->ub_version >= SPA_VERSION_FEATURES) { 2379 nvlist_t *unsup_feat; 2380 2381 VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == 2382 0); 2383 2384 for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, 2385 NULL); nvp != NULL; 2386 nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { 2387 if (!zfeature_is_supported(nvpair_name(nvp))) { 2388 VERIFY(nvlist_add_string(unsup_feat, 2389 nvpair_name(nvp), "") == 0); 2390 } 2391 } 2392 2393 if (!nvlist_empty(unsup_feat)) { 2394 VERIFY(nvlist_add_nvlist(spa->spa_load_info, 2395 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); 2396 nvlist_free(unsup_feat); 2397 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2398 ENOTSUP)); 2399 } 2400 2401 nvlist_free(unsup_feat); 2402 } 2403 2404 /* 2405 * If the vdev guid sum doesn't match the uberblock, we have an 2406 * incomplete configuration. We first check to see if the pool 2407 * is aware of the complete config (i.e. ZPOOL_CONFIG_VDEV_CHILDREN). 2408 * If it is, defer the vdev_guid_sum check till later so we 2409 * can handle missing vdevs. 2410 */ 2411 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, 2412 &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE && 2413 rvd->vdev_guid_sum != ub->ub_guid_sum) 2414 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 2415 2416 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 2417 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2418 spa_try_repair(spa, config); 2419 spa_config_exit(spa, SCL_ALL, FTAG); 2420 nvlist_free(spa->spa_config_splitting); 2421 spa->spa_config_splitting = NULL; 2422 } 2423 2424 /* 2425 * Initialize internal SPA structures. 2426 */ 2427 spa->spa_state = POOL_STATE_ACTIVE; 2428 spa->spa_ubsync = spa->spa_uberblock; 2429 spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 2430 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 2431 spa->spa_first_txg = spa->spa_last_ubsync_txg ?
2432 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 2433 spa->spa_claim_max_txg = spa->spa_first_txg; 2434 spa->spa_prev_software_version = ub->ub_software_version; 2435 2436 error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 2437 if (error) 2438 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2439 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 2440 2441 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) 2442 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2443 2444 if (spa_version(spa) >= SPA_VERSION_FEATURES) { 2445 boolean_t missing_feat_read = B_FALSE; 2446 nvlist_t *unsup_feat, *enabled_feat; 2447 2448 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 2449 &spa->spa_feat_for_read_obj) != 0) { 2450 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2451 } 2452 2453 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 2454 &spa->spa_feat_for_write_obj) != 0) { 2455 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2456 } 2457 2458 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 2459 &spa->spa_feat_desc_obj) != 0) { 2460 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2461 } 2462 2463 enabled_feat = fnvlist_alloc(); 2464 unsup_feat = fnvlist_alloc(); 2465 2466 if (!spa_features_check(spa, B_FALSE, 2467 unsup_feat, enabled_feat)) 2468 missing_feat_read = B_TRUE; 2469 2470 if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) { 2471 if (!spa_features_check(spa, B_TRUE, 2472 unsup_feat, enabled_feat)) { 2473 missing_feat_write = B_TRUE; 2474 } 2475 } 2476 2477 fnvlist_add_nvlist(spa->spa_load_info, 2478 ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); 2479 2480 if (!nvlist_empty(unsup_feat)) { 2481 fnvlist_add_nvlist(spa->spa_load_info, 2482 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 2483 } 2484 2485 fnvlist_free(enabled_feat); 2486 fnvlist_free(unsup_feat); 2487 2488 if (!missing_feat_read) { 2489 fnvlist_add_boolean(spa->spa_load_info, 2490 ZPOOL_CONFIG_CAN_RDONLY); 2491 } 2492 2493 /* 2494 * If the state is SPA_LOAD_TRYIMPORT, our objective is 2495 * twofold: to determine whether the pool is available for 2496 * import in read-write mode and (if it is not) whether the 2497 * pool is available for import in read-only mode. If the pool 2498 * is available for import in read-write mode, it is displayed 2499 * as available in userland; if it is not available for import 2500 * in read-only mode, it is displayed as unavailable in 2501 * userland. If the pool is available for import in read-only 2502 * mode but not read-write mode, it is displayed as unavailable 2503 * in userland with a special note that the pool is actually 2504 * available for open in read-only mode. 2505 * 2506 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 2507 * missing a feature for write, we must first determine whether 2508 * the pool can be opened read-only before returning to 2509 * userland in order to know whether to display the 2510 * abovementioned note. 2511 */ 2512 if (missing_feat_read || (missing_feat_write && 2513 spa_writeable(spa))) { 2514 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2515 ENOTSUP)); 2516 } 2517 2518 /* 2519 * Load refcounts for ZFS features from disk into an in-memory 2520 * cache during SPA initialization. 
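 * A feature that is not present on disk reports ENOTSUP and is
 * cached as SPA_FEATURE_DISABLED; any other lookup error is fatal.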
2521 */ 2522 for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { 2523 uint64_t refcount; 2524 2525 error = feature_get_refcount_from_disk(spa, 2526 &spa_feature_table[i], &refcount); 2527 if (error == 0) { 2528 spa->spa_feat_refcount_cache[i] = refcount; 2529 } else if (error == ENOTSUP) { 2530 spa->spa_feat_refcount_cache[i] = 2531 SPA_FEATURE_DISABLED; 2532 } else { 2533 return (spa_vdev_err(rvd, 2534 VDEV_AUX_CORRUPT_DATA, EIO)); 2535 } 2536 } 2537 } 2538 2539 if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { 2540 if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, 2541 &spa->spa_feat_enabled_txg_obj) != 0) 2542 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2543 } 2544 2545 spa->spa_is_initializing = B_TRUE; 2546 error = dsl_pool_open(spa->spa_dsl_pool); 2547 spa->spa_is_initializing = B_FALSE; 2548 if (error != 0) 2549 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2550 2551 if (!mosconfig) { 2552 uint64_t hostid; 2553 nvlist_t *policy = NULL, *nvconfig; 2554 2555 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2556 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2557 2558 if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, 2559 ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 2560 char *hostname; 2561 unsigned long myhostid = 0; 2562 2563 VERIFY(nvlist_lookup_string(nvconfig, 2564 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 2565 2566#ifdef _KERNEL 2567 myhostid = zone_get_hostid(NULL); 2568#else /* _KERNEL */ 2569 /* 2570 * We're emulating the system's hostid in userland, so 2571 * we can't use zone_get_hostid(). 2572 */ 2573 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 2574#endif /* _KERNEL */ 2575 if (check_hostid && hostid != 0 && myhostid != 0 && 2576 hostid != myhostid) { 2577 nvlist_free(nvconfig); 2578 cmn_err(CE_WARN, "pool '%s' could not be " 2579 "loaded as it was last accessed by " 2580 "another system (host: %s hostid: 0x%lx). " 2581 "See: http://illumos.org/msg/ZFS-8000-EY", 2582 spa_name(spa), hostname, 2583 (unsigned long)hostid); 2584 return (SET_ERROR(EBADF)); 2585 } 2586 } 2587 if (nvlist_lookup_nvlist(spa->spa_config, 2588 ZPOOL_REWIND_POLICY, &policy) == 0) 2589 VERIFY(nvlist_add_nvlist(nvconfig, 2590 ZPOOL_REWIND_POLICY, policy) == 0); 2591 2592 spa_config_set(spa, nvconfig); 2593 spa_unload(spa); 2594 spa_deactivate(spa); 2595 spa_activate(spa, orig_mode); 2596 2597 return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); 2598 } 2599 2600 /* Grab the secret checksum salt from the MOS. */ 2601 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2602 DMU_POOL_CHECKSUM_SALT, 1, 2603 sizeof (spa->spa_cksum_salt.zcs_bytes), 2604 spa->spa_cksum_salt.zcs_bytes); 2605 if (error == ENOENT) { 2606 /* Generate a new salt for subsequent use */ 2607 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 2608 sizeof (spa->spa_cksum_salt.zcs_bytes)); 2609 } else if (error != 0) { 2610 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2611 } 2612 2613 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0) 2614 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2615 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 2616 if (error != 0) 2617 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2618 2619 /* 2620 * Load the bit that tells us to use the new accounting function 2621 * (raid-z deflation). If we have an older pool, this will not 2622 * be present. 
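 * ENOENT is therefore tolerated here, as it is for the other optional
 * directory entries loaded below; any other error is treated as
 * corruption.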
2623 */ 2624 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); 2625 if (error != 0 && error != ENOENT) 2626 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2627 2628 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 2629 &spa->spa_creation_version); 2630 if (error != 0 && error != ENOENT) 2631 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2632 2633 /* 2634 * Load the persistent error log. If we have an older pool, this will 2635 * not be present. 2636 */ 2637 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); 2638 if (error != 0 && error != ENOENT) 2639 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2640 2641 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 2642 &spa->spa_errlog_scrub); 2643 if (error != 0 && error != ENOENT) 2644 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2645 2646 /* 2647 * Load the history object. If we have an older pool, this 2648 * will not be present. 2649 */ 2650 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); 2651 if (error != 0 && error != ENOENT) 2652 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2653 2654 /* 2655 * If we're assembling the pool from the split-off vdevs of 2656 * an existing pool, we don't want to attach the spares & cache 2657 * devices. 2658 */ 2659 2660 /* 2661 * Load any hot spares for this pool. 2662 */ 2663 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); 2664 if (error != 0 && error != ENOENT) 2665 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2666 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2667 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 2668 if (load_nvlist(spa, spa->spa_spares.sav_object, 2669 &spa->spa_spares.sav_config) != 0) 2670 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2671 2672 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2673 spa_load_spares(spa); 2674 spa_config_exit(spa, SCL_ALL, FTAG); 2675 } else if (error == 0) { 2676 spa->spa_spares.sav_sync = B_TRUE; 2677 } 2678 2679 /* 2680 * Load any level 2 ARC devices for this pool. 
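 * As with the hot spares above, the cached nvlist is read from the
 * MOS and the aux vdevs are instantiated under the config lock.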
2681 */ 2682 error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 2683 &spa->spa_l2cache.sav_object); 2684 if (error != 0 && error != ENOENT) 2685 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2686 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2687 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 2688 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 2689 &spa->spa_l2cache.sav_config) != 0) 2690 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2691 2692 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2693 spa_load_l2cache(spa); 2694 spa_config_exit(spa, SCL_ALL, FTAG); 2695 } else if (error == 0) { 2696 spa->spa_l2cache.sav_sync = B_TRUE; 2697 } 2698 2699 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2700 2701 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); 2702 if (error && error != ENOENT) 2703 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2704 2705 if (error == 0) { 2706 uint64_t autoreplace; 2707 2708 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 2709 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 2710 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 2711 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 2712 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 2713 spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, 2714 &spa->spa_dedup_ditto); 2715 2716 spa->spa_autoreplace = (autoreplace != 0); 2717 } 2718 2719 /* 2720 * If the 'autoreplace' property is set, then post a resource notifying 2721 * the ZFS DE that it should not issue any faults for unopenable 2722 * devices. We also iterate over the vdevs, and post a sysevent for any 2723 * unopenable vdevs so that the normal autoreplace handler can take 2724 * over. 2725 */ 2726 if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { 2727 spa_check_removed(spa->spa_root_vdev); 2728 /* 2729 * For the import case, this is done in spa_import(), because 2730 * at this point we're using the spare definitions from 2731 * the MOS config, not necessarily from the userland config. 2732 */ 2733 if (state != SPA_LOAD_IMPORT) { 2734 spa_aux_check_removed(&spa->spa_spares); 2735 spa_aux_check_removed(&spa->spa_l2cache); 2736 } 2737 } 2738 2739 /* 2740 * Load the vdev state for all toplevel vdevs. 2741 */ 2742 vdev_load(rvd); 2743 2744 /* 2745 * Propagate the leaf DTLs we just loaded all the way up the tree. 2746 */ 2747 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2748 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 2749 spa_config_exit(spa, SCL_ALL, FTAG); 2750 2751 /* 2752 * Load the DDTs (dedup tables). 2753 */ 2754 error = ddt_load(spa); 2755 if (error != 0) 2756 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2757 2758 spa_update_dspace(spa); 2759 2760 /* 2761 * Validate the config, using the MOS config to fill in any 2762 * information which might be missing. If we fail to validate 2763 * the config then declare the pool unfit for use. If we're 2764 * assembling a pool from a split, the log is not transferred 2765 * over. 2766 */ 2767 if (type != SPA_IMPORT_ASSEMBLE) { 2768 nvlist_t *nvconfig; 2769 2770 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2771 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2772 2773 if (!spa_config_valid(spa, nvconfig)) { 2774 nvlist_free(nvconfig); 2775 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 2776 ENXIO)); 2777 } 2778 nvlist_free(nvconfig); 2779 2780 /* 2781 * Now that we've validated the config, check the state of the 2782 * root vdev. 
If it can't be opened, it indicates one or 2783 * more toplevel vdevs are faulted. 2784 */ 2785 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2786 return (SET_ERROR(ENXIO)); 2787 2788 if (spa_writeable(spa) && spa_check_logs(spa)) { 2789 *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 2790 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); 2791 } 2792 } 2793 2794 if (missing_feat_write) { 2795 ASSERT(state == SPA_LOAD_TRYIMPORT); 2796 2797 /* 2798 * At this point, we know that we can open the pool in 2799 * read-only mode but not read-write mode. We now have enough 2800 * information and can return to userland. 2801 */ 2802 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); 2803 } 2804 2805 /* 2806 * We've successfully opened the pool, verify that we're ready 2807 * to start pushing transactions. 2808 */ 2809 if (state != SPA_LOAD_TRYIMPORT) { 2810 if (error = spa_load_verify(spa)) 2811 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2812 error)); 2813 } 2814 2815 if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || 2816 spa->spa_load_max_txg == UINT64_MAX)) { 2817 dmu_tx_t *tx; 2818 int need_update = B_FALSE; 2819 dsl_pool_t *dp = spa_get_dsl(spa); 2820 2821 ASSERT(state != SPA_LOAD_TRYIMPORT); 2822 2823 /* 2824 * Claim log blocks that haven't been committed yet. 2825 * This must all happen in a single txg. 2826 * Note: spa_claim_max_txg is updated by spa_claim_notify(), 2827 * invoked from zil_claim_log_block()'s i/o done callback. 2828 * Price of rollback is that we abandon the log. 2829 */ 2830 spa->spa_claiming = B_TRUE; 2831 2832 tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); 2833 (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 2834 zil_claim, tx, DS_FIND_CHILDREN); 2835 dmu_tx_commit(tx); 2836 2837 spa->spa_claiming = B_FALSE; 2838 2839 spa_set_log_state(spa, SPA_LOG_GOOD); 2840 spa->spa_sync_on = B_TRUE; 2841 txg_sync_start(spa->spa_dsl_pool); 2842 2843 /* 2844 * Wait for all claims to sync. We sync up to the highest 2845 * claimed log block birth time so that claimed log blocks 2846 * don't appear to be from the future. spa_claim_max_txg 2847 * will have been set for us by either zil_check_log_chain() 2848 * (invoked from spa_check_logs()) or zil_claim() above. 2849 */ 2850 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 2851 2852 /* 2853 * If the config cache is stale, or we have uninitialized 2854 * metaslabs (see spa_vdev_add()), then update the config. 2855 * 2856 * If this is a verbatim import, trust the current 2857 * in-core spa_config and update the disk labels. 2858 */ 2859 if (config_cache_txg != spa->spa_config_txg || 2860 state == SPA_LOAD_IMPORT || 2861 state == SPA_LOAD_RECOVER || 2862 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) 2863 need_update = B_TRUE; 2864 2865 for (int c = 0; c < rvd->vdev_children; c++) 2866 if (rvd->vdev_child[c]->vdev_ms_array == 0) 2867 need_update = B_TRUE; 2868 2869 /* 2870 * Update the config cache asynchronously in case we're the 2871 * root pool, in which case the config cache isn't writable yet. 2872 */ 2873 if (need_update) 2874 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 2875 2876 /* 2877 * Check all DTLs to see if anything needs resilvering. 2878 */ 2879 if (!dsl_scan_resilvering(spa->spa_dsl_pool) && 2880 vdev_resilver_needed(rvd, NULL, NULL)) 2881 spa_async_request(spa, SPA_ASYNC_RESILVER); 2882 2883 /* 2884 * Log the fact that we booted up (so that we can detect if 2885 * we rebooted in the middle of an operation).
2886 */ 2887 spa_history_log_version(spa, "open"); 2888 2889 /* 2890 * Delete any inconsistent datasets. 2891 */ 2892 (void) dmu_objset_find(spa_name(spa), 2893 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 2894 2895 /* 2896 * Clean up any stale temporary dataset userrefs. 2897 */ 2898 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 2899 } 2900 2901 return (0); 2902} 2903 2904static int 2905spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) 2906{ 2907 int mode = spa->spa_mode; 2908 2909 spa_unload(spa); 2910 spa_deactivate(spa); 2911 2912 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; 2913 2914 spa_activate(spa, mode); 2915 spa_async_suspend(spa); 2916 2917 return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); 2918} 2919 2920/* 2921 * If spa_load() fails this function will try loading prior txg's. If 2922 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool 2923 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this 2924 * function will not rewind the pool and will return the same error as 2925 * spa_load(). 2926 */ 2927static int 2928spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, 2929 uint64_t max_request, int rewind_flags) 2930{ 2931 nvlist_t *loadinfo = NULL; 2932 nvlist_t *config = NULL; 2933 int load_error, rewind_error; 2934 uint64_t safe_rewind_txg; 2935 uint64_t min_txg; 2936 2937 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 2938 spa->spa_load_max_txg = spa->spa_load_txg; 2939 spa_set_log_state(spa, SPA_LOG_CLEAR); 2940 } else { 2941 spa->spa_load_max_txg = max_request; 2942 if (max_request != UINT64_MAX) 2943 spa->spa_extreme_rewind = B_TRUE; 2944 } 2945 2946 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, 2947 mosconfig); 2948 if (load_error == 0) 2949 return (0); 2950 2951 if (spa->spa_root_vdev != NULL) 2952 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2953 2954 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 2955 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 2956 2957 if (rewind_flags & ZPOOL_NEVER_REWIND) { 2958 nvlist_free(config); 2959 return (load_error); 2960 } 2961 2962 if (state == SPA_LOAD_RECOVER) { 2963 /* Price of rolling back is discarding txgs, including log */ 2964 spa_set_log_state(spa, SPA_LOG_CLEAR); 2965 } else { 2966 /* 2967 * If we aren't rolling back save the load info from our first 2968 * import attempt so that we can restore it after attempting 2969 * to rewind. 2970 */ 2971 loadinfo = spa->spa_load_info; 2972 spa->spa_load_info = fnvlist_alloc(); 2973 } 2974 2975 spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 2976 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 2977 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 
2978 TXG_INITIAL : safe_rewind_txg; 2979 2980 /* 2981 * Continue as long as we're finding errors, we're still within 2982 * the acceptable rewind range, and we're still finding uberblocks 2983 */ 2984 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 2985 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 2986 if (spa->spa_load_max_txg < safe_rewind_txg) 2987 spa->spa_extreme_rewind = B_TRUE; 2988 rewind_error = spa_load_retry(spa, state, mosconfig); 2989 } 2990 2991 spa->spa_extreme_rewind = B_FALSE; 2992 spa->spa_load_max_txg = UINT64_MAX; 2993 2994 if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 2995 spa_config_set(spa, config); 2996 2997 if (state == SPA_LOAD_RECOVER) { 2998 ASSERT3P(loadinfo, ==, NULL); 2999 return (rewind_error); 3000 } else { 3001 /* Store the rewind info as part of the initial load info */ 3002 fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, 3003 spa->spa_load_info); 3004 3005 /* Restore the initial load info */ 3006 fnvlist_free(spa->spa_load_info); 3007 spa->spa_load_info = loadinfo; 3008 3009 return (load_error); 3010 } 3011} 3012 3013/* 3014 * Pool Open/Import 3015 * 3016 * The import case is identical to an open except that the configuration is sent 3017 * down from userland, instead of being grabbed from the configuration cache. For 3018 * the case of an open, the pool configuration will exist in the 3019 * POOL_STATE_UNINITIALIZED state. 3020 * 3021 * The stats information (gen/count/ustats) is used to gather vdev statistics at 3022 * the same time we open the pool, without having to keep around the spa_t in 3023 * some ambiguous state. 3024 */ 3025static int 3026spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, 3027 nvlist_t **config) 3028{ 3029 spa_t *spa; 3030 spa_load_state_t state = SPA_LOAD_OPEN; 3031 int error; 3032 int locked = B_FALSE; 3033 int firstopen = B_FALSE; 3034 3035 *spapp = NULL; 3036 3037 /* 3038 * As disgusting as this is, we need to support recursive calls to this 3039 * function because dsl_dir_open() is called during spa_load(), and ends 3040 * up calling spa_open() again. The real fix is to figure out how to 3041 * avoid dsl_dir_open() calling this in the first place. 3042 */ 3043 if (mutex_owner(&spa_namespace_lock) != curthread) { 3044 mutex_enter(&spa_namespace_lock); 3045 locked = B_TRUE; 3046 } 3047 3048 if ((spa = spa_lookup(pool)) == NULL) { 3049 if (locked) 3050 mutex_exit(&spa_namespace_lock); 3051 return (SET_ERROR(ENOENT)); 3052 } 3053 3054 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 3055 zpool_rewind_policy_t policy; 3056 3057 firstopen = B_TRUE; 3058 3059 zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config, 3060 &policy); 3061 if (policy.zrp_request & ZPOOL_DO_REWIND) 3062 state = SPA_LOAD_RECOVER; 3063 3064 spa_activate(spa, spa_mode_global); 3065 3066 if (state != SPA_LOAD_RECOVER) 3067 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 3068 3069 error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, 3070 policy.zrp_request); 3071 3072 if (error == EBADF) { 3073 /* 3074 * If vdev_validate() returns failure (indicated by 3075 * EBADF), then one of the vdev labels indicates that 3076 * the pool has been exported or destroyed. If this 3077 * is the case, the config cache is out of sync and 3078 * we should remove the pool from the namespace.
3079 */ 3080 spa_unload(spa); 3081 spa_deactivate(spa); 3082 spa_config_sync(spa, B_TRUE, B_TRUE); 3083 spa_remove(spa); 3084 if (locked) 3085 mutex_exit(&spa_namespace_lock); 3086 return (SET_ERROR(ENOENT)); 3087 } 3088 3089 if (error) { 3090 /* 3091 * We can't open the pool, but we still have useful 3092 * information: the state of each vdev after the 3093 * attempted vdev_open(). Return this to the user. 3094 */ 3095 if (config != NULL && spa->spa_config) { 3096 VERIFY(nvlist_dup(spa->spa_config, config, 3097 KM_SLEEP) == 0); 3098 VERIFY(nvlist_add_nvlist(*config, 3099 ZPOOL_CONFIG_LOAD_INFO, 3100 spa->spa_load_info) == 0); 3101 } 3102 spa_unload(spa); 3103 spa_deactivate(spa); 3104 spa->spa_last_open_failed = error; 3105 if (locked) 3106 mutex_exit(&spa_namespace_lock); 3107 *spapp = NULL; 3108 return (error); 3109 } 3110 } 3111 3112 spa_open_ref(spa, tag); 3113 3114 if (config != NULL) 3115 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 3116 3117 /* 3118 * If we've recovered the pool, pass back any information we 3119 * gathered while doing the load. 3120 */ 3121 if (state == SPA_LOAD_RECOVER) { 3122 VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 3123 spa->spa_load_info) == 0); 3124 } 3125 3126 if (locked) { 3127 spa->spa_last_open_failed = 0; 3128 spa->spa_last_ubsync_txg = 0; 3129 spa->spa_load_txg = 0; 3130 mutex_exit(&spa_namespace_lock); 3131#ifdef __FreeBSD__ 3132#ifdef _KERNEL 3133 if (firstopen) 3134 zvol_create_minors(spa->spa_name); 3135#endif 3136#endif 3137 } 3138 3139 *spapp = spa; 3140 3141 return (0); 3142} 3143 3144int 3145spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, 3146 nvlist_t **config) 3147{ 3148 return (spa_open_common(name, spapp, tag, policy, config)); 3149} 3150 3151int 3152spa_open(const char *name, spa_t **spapp, void *tag) 3153{ 3154 return (spa_open_common(name, spapp, tag, NULL, NULL)); 3155} 3156 3157/* 3158 * Lookup the given spa_t, incrementing the inject count in the process, 3159 * preventing it from being exported or destroyed. 3160 */ 3161spa_t * 3162spa_inject_addref(char *name) 3163{ 3164 spa_t *spa; 3165 3166 mutex_enter(&spa_namespace_lock); 3167 if ((spa = spa_lookup(name)) == NULL) { 3168 mutex_exit(&spa_namespace_lock); 3169 return (NULL); 3170 } 3171 spa->spa_inject_ref++; 3172 mutex_exit(&spa_namespace_lock); 3173 3174 return (spa); 3175} 3176 3177void 3178spa_inject_delref(spa_t *spa) 3179{ 3180 mutex_enter(&spa_namespace_lock); 3181 spa->spa_inject_ref--; 3182 mutex_exit(&spa_namespace_lock); 3183} 3184 3185/* 3186 * Add spares device information to the nvlist. 3187 */ 3188static void 3189spa_add_spares(spa_t *spa, nvlist_t *config) 3190{ 3191 nvlist_t **spares; 3192 uint_t i, nspares; 3193 nvlist_t *nvroot; 3194 uint64_t guid; 3195 vdev_stat_t *vs; 3196 uint_t vsc; 3197 uint64_t pool; 3198 3199 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3200 3201 if (spa->spa_spares.sav_count == 0) 3202 return; 3203 3204 VERIFY(nvlist_lookup_nvlist(config, 3205 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 3206 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 3207 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 3208 if (nspares != 0) { 3209 VERIFY(nvlist_add_nvlist_array(nvroot, 3210 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3211 VERIFY(nvlist_lookup_nvlist_array(nvroot, 3212 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 3213 3214 /* 3215 * Go through and find any spares which have since been 3216 * repurposed as an active spare. 
If this is the case, update 3217 * their status appropriately. 3218 */ 3219 for (i = 0; i < nspares; i++) { 3220 VERIFY(nvlist_lookup_uint64(spares[i], 3221 ZPOOL_CONFIG_GUID, &guid) == 0); 3222 if (spa_spare_exists(guid, &pool, NULL) && 3223 pool != 0ULL) { 3224 VERIFY(nvlist_lookup_uint64_array( 3225 spares[i], ZPOOL_CONFIG_VDEV_STATS, 3226 (uint64_t **)&vs, &vsc) == 0); 3227 vs->vs_state = VDEV_STATE_CANT_OPEN; 3228 vs->vs_aux = VDEV_AUX_SPARED; 3229 } 3230 } 3231 } 3232} 3233 3234/* 3235 * Add l2cache device information to the nvlist, including vdev stats. 3236 */ 3237static void 3238spa_add_l2cache(spa_t *spa, nvlist_t *config) 3239{ 3240 nvlist_t **l2cache; 3241 uint_t i, j, nl2cache; 3242 nvlist_t *nvroot; 3243 uint64_t guid; 3244 vdev_t *vd; 3245 vdev_stat_t *vs; 3246 uint_t vsc; 3247 3248 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3249 3250 if (spa->spa_l2cache.sav_count == 0) 3251 return; 3252 3253 VERIFY(nvlist_lookup_nvlist(config, 3254 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 3255 VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 3256 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3257 if (nl2cache != 0) { 3258 VERIFY(nvlist_add_nvlist_array(nvroot, 3259 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3260 VERIFY(nvlist_lookup_nvlist_array(nvroot, 3261 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3262 3263 /* 3264 * Update level 2 cache device stats. 3265 */ 3266 3267 for (i = 0; i < nl2cache; i++) { 3268 VERIFY(nvlist_lookup_uint64(l2cache[i], 3269 ZPOOL_CONFIG_GUID, &guid) == 0); 3270 3271 vd = NULL; 3272 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 3273 if (guid == 3274 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 3275 vd = spa->spa_l2cache.sav_vdevs[j]; 3276 break; 3277 } 3278 } 3279 ASSERT(vd != NULL); 3280 3281 VERIFY(nvlist_lookup_uint64_array(l2cache[i], 3282 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) 3283 == 0); 3284 vdev_get_stats(vd, vs); 3285 } 3286 } 3287} 3288 3289static void 3290spa_add_feature_stats(spa_t *spa, nvlist_t *config) 3291{ 3292 nvlist_t *features; 3293 zap_cursor_t zc; 3294 zap_attribute_t za; 3295 3296 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3297 VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3298 3299 /* We may be unable to read features if pool is suspended. 
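 * In that case skip the ZAP walks below and report an empty
 * (but present) ZPOOL_CONFIG_FEATURE_STATS nvlist.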
*/ 3300 if (spa_suspended(spa)) 3301 goto out; 3302 3303 if (spa->spa_feat_for_read_obj != 0) { 3304 for (zap_cursor_init(&zc, spa->spa_meta_objset, 3305 spa->spa_feat_for_read_obj); 3306 zap_cursor_retrieve(&zc, &za) == 0; 3307 zap_cursor_advance(&zc)) { 3308 ASSERT(za.za_integer_length == sizeof (uint64_t) && 3309 za.za_num_integers == 1); 3310 VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3311 za.za_first_integer)); 3312 } 3313 zap_cursor_fini(&zc); 3314 } 3315 3316 if (spa->spa_feat_for_write_obj != 0) { 3317 for (zap_cursor_init(&zc, spa->spa_meta_objset, 3318 spa->spa_feat_for_write_obj); 3319 zap_cursor_retrieve(&zc, &za) == 0; 3320 zap_cursor_advance(&zc)) { 3321 ASSERT(za.za_integer_length == sizeof (uint64_t) && 3322 za.za_num_integers == 1); 3323 VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3324 za.za_first_integer)); 3325 } 3326 zap_cursor_fini(&zc); 3327 } 3328 3329out: 3330 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, 3331 features) == 0); 3332 nvlist_free(features); 3333} 3334 3335int 3336spa_get_stats(const char *name, nvlist_t **config, 3337 char *altroot, size_t buflen) 3338{ 3339 int error; 3340 spa_t *spa; 3341 3342 *config = NULL; 3343 error = spa_open_common(name, &spa, FTAG, NULL, config); 3344 3345 if (spa != NULL) { 3346 /* 3347 * This still leaves a window of inconsistency where the spares 3348 * or l2cache devices could change and the config would be 3349 * self-inconsistent. 3350 */ 3351 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3352 3353 if (*config != NULL) { 3354 uint64_t loadtimes[2]; 3355 3356 loadtimes[0] = spa->spa_loaded_ts.tv_sec; 3357 loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 3358 VERIFY(nvlist_add_uint64_array(*config, 3359 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); 3360 3361 VERIFY(nvlist_add_uint64(*config, 3362 ZPOOL_CONFIG_ERRCOUNT, 3363 spa_get_errlog_size(spa)) == 0); 3364 3365 if (spa_suspended(spa)) 3366 VERIFY(nvlist_add_uint64(*config, 3367 ZPOOL_CONFIG_SUSPENDED, 3368 spa->spa_failmode) == 0); 3369 3370 spa_add_spares(spa, *config); 3371 spa_add_l2cache(spa, *config); 3372 spa_add_feature_stats(spa, *config); 3373 } 3374 } 3375 3376 /* 3377 * We want to get the alternate root even for faulted pools, so we cheat 3378 * and call spa_lookup() directly. 3379 */ 3380 if (altroot) { 3381 if (spa == NULL) { 3382 mutex_enter(&spa_namespace_lock); 3383 spa = spa_lookup(name); 3384 if (spa) 3385 spa_altroot(spa, altroot, buflen); 3386 else 3387 altroot[0] = '\0'; 3388 spa = NULL; 3389 mutex_exit(&spa_namespace_lock); 3390 } else { 3391 spa_altroot(spa, altroot, buflen); 3392 } 3393 } 3394 3395 if (spa != NULL) { 3396 spa_config_exit(spa, SCL_CONFIG, FTAG); 3397 spa_close(spa, FTAG); 3398 } 3399 3400 return (error); 3401} 3402 3403/* 3404 * Validate that the auxiliary device array is well formed. We must have an 3405 * array of nvlists, each of which describes a valid leaf vdev. If this is an 3406 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 3407 * specified, as long as they are well-formed. 3408 */ 3409static int 3410spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 3411 spa_aux_vdev_t *sav, const char *config, uint64_t version, 3412 vdev_labeltype_t label) 3413{ 3414 nvlist_t **dev; 3415 uint_t i, ndev; 3416 vdev_t *vd; 3417 int error; 3418 3419 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3420 3421 /* 3422 * It's acceptable to have no devs specified.
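 * (The lookup simply fails; an array that is present but empty is
 * rejected below with EINVAL.)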
3423 */ 3424 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 3425 return (0); 3426 3427 if (ndev == 0) 3428 return (SET_ERROR(EINVAL)); 3429 3430 /* 3431 * Make sure the pool is formatted with a version that supports this 3432 * device type. 3433 */ 3434 if (spa_version(spa) < version) 3435 return (SET_ERROR(ENOTSUP)); 3436 3437 /* 3438 * Set the pending device list so we correctly handle device in-use 3439 * checking. 3440 */ 3441 sav->sav_pending = dev; 3442 sav->sav_npending = ndev; 3443 3444 for (i = 0; i < ndev; i++) { 3445 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 3446 mode)) != 0) 3447 goto out; 3448 3449 if (!vd->vdev_ops->vdev_op_leaf) { 3450 vdev_free(vd); 3451 error = SET_ERROR(EINVAL); 3452 goto out; 3453 } 3454 3455 /* 3456 * The L2ARC currently only supports disk devices in 3457 * kernel context. For user-level testing, we allow it. 3458 */ 3459#ifdef _KERNEL 3460 if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && 3461 strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { 3462 error = SET_ERROR(ENOTBLK); 3463 vdev_free(vd); 3464 goto out; 3465 } 3466#endif 3467 vd->vdev_top = vd; 3468 3469 if ((error = vdev_open(vd)) == 0 && 3470 (error = vdev_label_init(vd, crtxg, label)) == 0) { 3471 VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 3472 vd->vdev_guid) == 0); 3473 } 3474 3475 vdev_free(vd); 3476 3477 if (error && 3478 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 3479 goto out; 3480 else 3481 error = 0; 3482 } 3483 3484out: 3485 sav->sav_pending = NULL; 3486 sav->sav_npending = 0; 3487 return (error); 3488} 3489 3490static int 3491spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 3492{ 3493 int error; 3494 3495 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3496 3497 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 3498 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 3499 VDEV_LABEL_SPARE)) != 0) { 3500 return (error); 3501 } 3502 3503 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 3504 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 3505 VDEV_LABEL_L2CACHE)); 3506} 3507 3508static void 3509spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 3510 const char *config) 3511{ 3512 int i; 3513 3514 if (sav->sav_config != NULL) { 3515 nvlist_t **olddevs; 3516 uint_t oldndevs; 3517 nvlist_t **newdevs; 3518 3519 /* 3520 * Generate new dev list by concatenating with the 3521 * current dev list. 3522 */ 3523 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, 3524 &olddevs, &oldndevs) == 0); 3525 3526 newdevs = kmem_alloc(sizeof (void *) * 3527 (ndevs + oldndevs), KM_SLEEP); 3528 for (i = 0; i < oldndevs; i++) 3529 VERIFY(nvlist_dup(olddevs[i], &newdevs[i], 3530 KM_SLEEP) == 0); 3531 for (i = 0; i < ndevs; i++) 3532 VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], 3533 KM_SLEEP) == 0); 3534 3535 VERIFY(nvlist_remove(sav->sav_config, config, 3536 DATA_TYPE_NVLIST_ARRAY) == 0); 3537 3538 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 3539 config, newdevs, ndevs + oldndevs) == 0); 3540 for (i = 0; i < oldndevs + ndevs; i++) 3541 nvlist_free(newdevs[i]); 3542 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 3543 } else { 3544 /* 3545 * Generate a new dev list.
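 * (No aux config exists yet, so the incoming devs become the
 * list as-is.)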
3546 */ 3547 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 3548 KM_SLEEP) == 0); 3549 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 3550 devs, ndevs) == 0); 3551 } 3552} 3553 3554/* 3555 * Stop and drop level 2 ARC devices 3556 */ 3557void 3558spa_l2cache_drop(spa_t *spa) 3559{ 3560 vdev_t *vd; 3561 int i; 3562 spa_aux_vdev_t *sav = &spa->spa_l2cache; 3563 3564 for (i = 0; i < sav->sav_count; i++) { 3565 uint64_t pool; 3566 3567 vd = sav->sav_vdevs[i]; 3568 ASSERT(vd != NULL); 3569 3570 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 3571 pool != 0ULL && l2arc_vdev_present(vd)) 3572 l2arc_remove_vdev(vd); 3573 } 3574} 3575 3576/* 3577 * Pool Creation 3578 */ 3579int 3580spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 3581 nvlist_t *zplprops) 3582{ 3583 spa_t *spa; 3584 char *altroot = NULL; 3585 vdev_t *rvd; 3586 dsl_pool_t *dp; 3587 dmu_tx_t *tx; 3588 int error = 0; 3589 uint64_t txg = TXG_INITIAL; 3590 nvlist_t **spares, **l2cache; 3591 uint_t nspares, nl2cache; 3592 uint64_t version, obj; 3593 boolean_t has_features; 3594 3595 /* 3596 * If this pool already exists, return failure. 3597 */ 3598 mutex_enter(&spa_namespace_lock); 3599 if (spa_lookup(pool) != NULL) { 3600 mutex_exit(&spa_namespace_lock); 3601 return (SET_ERROR(EEXIST)); 3602 } 3603 3604 /* 3605 * Allocate a new spa_t structure. 3606 */ 3607 (void) nvlist_lookup_string(props, 3608 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 3609 spa = spa_add(pool, NULL, altroot); 3610 spa_activate(spa, spa_mode_global); 3611 3612 if (props && (error = spa_prop_validate(spa, props))) { 3613 spa_deactivate(spa); 3614 spa_remove(spa); 3615 mutex_exit(&spa_namespace_lock); 3616 return (error); 3617 } 3618 3619 has_features = B_FALSE; 3620 for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); 3621 elem != NULL; elem = nvlist_next_nvpair(props, elem)) { 3622 if (zpool_prop_feature(nvpair_name(elem))) 3623 has_features = B_TRUE; 3624 } 3625 3626 if (has_features || nvlist_lookup_uint64(props, 3627 zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { 3628 version = SPA_VERSION; 3629 } 3630 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 3631 3632 spa->spa_first_txg = txg; 3633 spa->spa_uberblock.ub_txg = txg - 1; 3634 spa->spa_uberblock.ub_version = version; 3635 spa->spa_ubsync = spa->spa_uberblock; 3636 spa->spa_load_state = SPA_LOAD_CREATE; 3637 3638 /* 3639 * Create "The Godfather" zio to hold all async IOs 3640 */ 3641 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 3642 KM_SLEEP); 3643 for (int i = 0; i < max_ncpus; i++) { 3644 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 3645 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 3646 ZIO_FLAG_GODFATHER); 3647 } 3648 3649 /* 3650 * Create the root vdev. 
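 * Its children are the pool's top-level vdevs; on success each child
 * gets an optimized ashift and a metaslab size, and is expanded to
 * its full size, before the first txg syncs.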
3651 */ 3652 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3653 3654 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 3655 3656 ASSERT(error != 0 || rvd != NULL); 3657 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 3658 3659 if (error == 0 && !zfs_allocatable_devs(nvroot)) 3660 error = SET_ERROR(EINVAL); 3661 3662 if (error == 0 && 3663 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 3664 (error = spa_validate_aux(spa, nvroot, txg, 3665 VDEV_ALLOC_ADD)) == 0) { 3666 for (int c = 0; c < rvd->vdev_children; c++) { 3667 vdev_ashift_optimize(rvd->vdev_child[c]); 3668 vdev_metaslab_set_size(rvd->vdev_child[c]); 3669 vdev_expand(rvd->vdev_child[c], txg); 3670 } 3671 } 3672 3673 spa_config_exit(spa, SCL_ALL, FTAG); 3674 3675 if (error != 0) { 3676 spa_unload(spa); 3677 spa_deactivate(spa); 3678 spa_remove(spa); 3679 mutex_exit(&spa_namespace_lock); 3680 return (error); 3681 } 3682 3683 /* 3684 * Get the list of spares, if specified. 3685 */ 3686 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 3687 &spares, &nspares) == 0) { 3688 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 3689 KM_SLEEP) == 0); 3690 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 3691 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3692 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3693 spa_load_spares(spa); 3694 spa_config_exit(spa, SCL_ALL, FTAG); 3695 spa->spa_spares.sav_sync = B_TRUE; 3696 } 3697 3698 /* 3699 * Get the list of level 2 cache devices, if specified. 3700 */ 3701 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 3702 &l2cache, &nl2cache) == 0) { 3703 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 3704 NV_UNIQUE_NAME, KM_SLEEP) == 0); 3705 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 3706 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3707 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3708 spa_load_l2cache(spa); 3709 spa_config_exit(spa, SCL_ALL, FTAG); 3710 spa->spa_l2cache.sav_sync = B_TRUE; 3711 } 3712 3713 spa->spa_is_initializing = B_TRUE; 3714 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 3715 spa->spa_meta_objset = dp->dp_meta_objset; 3716 spa->spa_is_initializing = B_FALSE; 3717 3718 /* 3719 * Create DDTs (dedup tables). 3720 */ 3721 ddt_create(spa); 3722 3723 spa_update_dspace(spa); 3724 3725 tx = dmu_tx_create_assigned(dp, txg); 3726 3727 /* 3728 * Create the pool config object. 3729 */ 3730 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 3731 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 3732 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 3733 3734 if (zap_add(spa->spa_meta_objset, 3735 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 3736 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 3737 cmn_err(CE_PANIC, "failed to add pool config"); 3738 } 3739 3740 if (spa_version(spa) >= SPA_VERSION_FEATURES) 3741 spa_feature_create_zap_objects(spa, tx); 3742 3743 if (zap_add(spa->spa_meta_objset, 3744 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 3745 sizeof (uint64_t), 1, &version, tx) != 0) { 3746 cmn_err(CE_PANIC, "failed to add pool version"); 3747 } 3748 3749 /* Newly created pools with the right version are always deflated. 
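 * (SPA_VERSION_RAIDZ_DEFLATE or newer; the flag is persisted in the
 * pool directory as DMU_POOL_DEFLATE.)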
*/ 3750 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 3751 spa->spa_deflate = TRUE; 3752 if (zap_add(spa->spa_meta_objset, 3753 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 3754 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 3755 cmn_err(CE_PANIC, "failed to add deflate"); 3756 } 3757 } 3758 3759 /* 3760 * Create the deferred-free bpobj. Turn off compression 3761 * because sync-to-convergence takes longer if the blocksize 3762 * keeps changing. 3763 */ 3764 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 3765 dmu_object_set_compress(spa->spa_meta_objset, obj, 3766 ZIO_COMPRESS_OFF, tx); 3767 if (zap_add(spa->spa_meta_objset, 3768 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 3769 sizeof (uint64_t), 1, &obj, tx) != 0) { 3770 cmn_err(CE_PANIC, "failed to add bpobj"); 3771 } 3772 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 3773 spa->spa_meta_objset, obj)); 3774 3775 /* 3776 * Create the pool's history object. 3777 */ 3778 if (version >= SPA_VERSION_ZPOOL_HISTORY) 3779 spa_history_create_obj(spa, tx); 3780 3781 /* 3782 * Generate some random noise for salted checksums to operate on. 3783 */ 3784 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 3785 sizeof (spa->spa_cksum_salt.zcs_bytes)); 3786 3787 /* 3788 * Set pool properties. 3789 */ 3790 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 3791 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 3792 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 3793 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 3794 3795 if (props != NULL) { 3796 spa_configfile_set(spa, props, B_FALSE); 3797 spa_sync_props(props, tx); 3798 } 3799 3800 dmu_tx_commit(tx); 3801 3802 spa->spa_sync_on = B_TRUE; 3803 txg_sync_start(spa->spa_dsl_pool); 3804 3805 /* 3806 * We explicitly wait for the first transaction to complete so that our 3807 * bean counters are appropriately updated. 3808 */ 3809 txg_wait_synced(spa->spa_dsl_pool, txg); 3810 3811 spa_config_sync(spa, B_FALSE, B_TRUE); 3812 spa_event_notify(spa, NULL, ESC_ZFS_POOL_CREATE); 3813 3814 spa_history_log_version(spa, "create"); 3815 3816 /* 3817 * Don't count references from objsets that are already closed 3818 * and are making their way through the eviction process. 3819 */ 3820 spa_evicting_os_wait(spa); 3821 spa->spa_minref = refcount_count(&spa->spa_refcount); 3822 spa->spa_load_state = SPA_LOAD_NONE; 3823 3824 mutex_exit(&spa_namespace_lock); 3825 3826 return (0); 3827} 3828 3829#ifdef _KERNEL 3830#ifdef illumos 3831/* 3832 * Get the root pool information from the root disk, then import the root pool 3833 * during the system boot up time. 3834 */ 3835extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 3836 3837static nvlist_t * 3838spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) 3839{ 3840 nvlist_t *config; 3841 nvlist_t *nvtop, *nvroot; 3842 uint64_t pgid; 3843 3844 if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) 3845 return (NULL); 3846 3847 /* 3848 * Add this top-level vdev to the child array. 3849 */ 3850 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3851 &nvtop) == 0); 3852 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 3853 &pgid) == 0); 3854 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); 3855 3856 /* 3857 * Put this pool's top-level vdevs into a root vdev. 
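 * The synthesized root has type VDEV_TYPE_ROOT, id 0, and the pool
 * guid, with the label's top-level vdev as its only child.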
3858 */ 3859 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3860 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 3861 VDEV_TYPE_ROOT) == 0); 3862 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 3863 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 3864 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 3865 &nvtop, 1) == 0); 3866 3867 /* 3868 * Replace the existing vdev_tree with the new root vdev in 3869 * this pool's configuration (remove the old, add the new). 3870 */ 3871 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 3872 nvlist_free(nvroot); 3873 return (config); 3874} 3875 3876/* 3877 * Walk the vdev tree and see if we can find a device with "better" 3878 * configuration. A configuration is "better" if the label on that 3879 * device has a more recent txg. 3880 */ 3881static void 3882spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) 3883{ 3884 for (int c = 0; c < vd->vdev_children; c++) 3885 spa_alt_rootvdev(vd->vdev_child[c], avd, txg); 3886 3887 if (vd->vdev_ops->vdev_op_leaf) { 3888 nvlist_t *label; 3889 uint64_t label_txg; 3890 3891 if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, 3892 &label) != 0) 3893 return; 3894 3895 VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, 3896 &label_txg) == 0); 3897 3898 /* 3899 * Do we have a better boot device? 3900 */ 3901 if (label_txg > *txg) { 3902 *txg = label_txg; 3903 *avd = vd; 3904 } 3905 nvlist_free(label); 3906 } 3907} 3908 3909/* 3910 * Import a root pool. 3911 * 3912 * For x86, devpath_list will consist of devid and/or physpath name of 3913 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). 3914 * The GRUB "findroot" command will return the vdev we should boot. 3915 * 3916 * For Sparc, devpath_list consists of the physpath name of the booting device, 3917 * no matter whether the root pool is a single-device pool or a mirrored pool. 3918 * e.g. 3919 * "/pci@1f,0/ide@d/disk@0,0:a" 3920 */ 3921int 3922spa_import_rootpool(char *devpath, char *devid) 3923{ 3924 spa_t *spa; 3925 vdev_t *rvd, *bvd, *avd = NULL; 3926 nvlist_t *config, *nvtop; 3927 uint64_t guid, txg; 3928 char *pname; 3929 int error; 3930 3931 /* 3932 * Read the label from the boot device and generate a configuration. 3933 */ 3934 config = spa_generate_rootconf(devpath, devid, &guid); 3935#if defined(_OBP) && defined(_KERNEL) 3936 if (config == NULL) { 3937 if (strstr(devpath, "/iscsi/ssd") != NULL) { 3938 /* iscsi boot */ 3939 get_iscsi_bootpath_phy(devpath); 3940 config = spa_generate_rootconf(devpath, devid, &guid); 3941 } 3942 } 3943#endif 3944 if (config == NULL) { 3945 cmn_err(CE_NOTE, "Cannot read the pool label from '%s'", 3946 devpath); 3947 return (SET_ERROR(EIO)); 3948 } 3949 3950 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 3951 &pname) == 0); 3952 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 3953 3954 mutex_enter(&spa_namespace_lock); 3955 if ((spa = spa_lookup(pname)) != NULL) { 3956 /* 3957 * Remove the existing root pool from the namespace so that we 3958 * can replace it with the correct config we just read in. 3959 */ 3960 spa_remove(spa); 3961 } 3962 3963 spa = spa_add(pname, config, NULL); 3964 spa->spa_is_root = B_TRUE; 3965 spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 3966 3967 /* 3968 * Build up a vdev tree based on the boot device's label config.
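 * If the label config cannot be parsed, the import is aborted and
 * the failure is reported on the console.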
3969 */ 3970 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3971 &nvtop) == 0); 3972 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3973 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 3974 VDEV_ALLOC_ROOTPOOL); 3975 spa_config_exit(spa, SCL_ALL, FTAG); 3976 if (error) { 3977 mutex_exit(&spa_namespace_lock); 3978 nvlist_free(config); 3979 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 3980 pname); 3981 return (error); 3982 } 3983 3984 /* 3985 * Get the boot vdev. 3986 */ 3987 if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 3988 cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", 3989 (u_longlong_t)guid); 3990 error = SET_ERROR(ENOENT); 3991 goto out; 3992 } 3993 3994 /* 3995 * Determine if there is a better boot device. 3996 */ 3997 avd = bvd; 3998 spa_alt_rootvdev(rvd, &avd, &txg); 3999 if (avd != bvd) { 4000 cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " 4001 "try booting from '%s'", avd->vdev_path); 4002 error = SET_ERROR(EINVAL); 4003 goto out; 4004 } 4005 4006 /* 4007 * If the boot device is part of a spare vdev then ensure that 4008 * we're booting off the active spare. 4009 */ 4010 if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && 4011 !bvd->vdev_isspare) { 4012 cmn_err(CE_NOTE, "The boot device is currently spared. Please " 4013 "try booting from '%s'", 4014 bvd->vdev_parent-> 4015 vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); 4016 error = SET_ERROR(EINVAL); 4017 goto out; 4018 } 4019 4020 error = 0; 4021out: 4022 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4023 vdev_free(rvd); 4024 spa_config_exit(spa, SCL_ALL, FTAG); 4025 mutex_exit(&spa_namespace_lock); 4026 4027 nvlist_free(config); 4028 return (error); 4029} 4030 4031#else /* !illumos */ 4032 4033extern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs, 4034 uint64_t *count); 4035 4036static nvlist_t * 4037spa_generate_rootconf(const char *name) 4038{ 4039 nvlist_t **configs, **tops; 4040 nvlist_t *config; 4041 nvlist_t *best_cfg, *nvtop, *nvroot; 4042 uint64_t *holes; 4043 uint64_t best_txg; 4044 uint64_t nchildren; 4045 uint64_t pgid; 4046 uint64_t count; 4047 uint64_t i; 4048 uint_t nholes; 4049 4050 if (vdev_geom_read_pool_label(name, &configs, &count) != 0) 4051 return (NULL); 4052 4053 ASSERT3U(count, !=, 0); 4054 best_txg = 0; 4055 for (i = 0; i < count; i++) { 4056 uint64_t txg; 4057 4058 VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG, 4059 &txg) == 0); 4060 if (txg > best_txg) { 4061 best_txg = txg; 4062 best_cfg = configs[i]; 4063 } 4064 } 4065 4066 /* 4067 * Multi-vdev root pool configuration discovery is not supported yet. 
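 * Until it is, start from a single child and let the label's
 * ZPOOL_CONFIG_VDEV_CHILDREN (if present) widen the array; top-level
 * slots we have no label for are padded below with "hole" or "missing"
 * vdevs so that vdev ids keep lining up.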
4068 */ 4069 nchildren = 1; 4070 nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren); 4071 holes = NULL; 4072 nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY, 4073 &holes, &nholes); 4074 4075 tops = kmem_zalloc(nchildren * sizeof(void *), KM_SLEEP); 4076 for (i = 0; i < nchildren; i++) { 4077 if (i >= count) 4078 break; 4079 if (configs[i] == NULL) 4080 continue; 4081 VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE, 4082 &nvtop) == 0); 4083 nvlist_dup(nvtop, &tops[i], KM_SLEEP); 4084 } 4085 for (i = 0; holes != NULL && i < nholes; i++) { 4086 if (i >= nchildren) 4087 continue; 4088 if (tops[holes[i]] != NULL) 4089 continue; 4090 nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP); 4091 VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE, 4092 VDEV_TYPE_HOLE) == 0); 4093 VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID, 4094 holes[i]) == 0); 4095 VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID, 4096 0) == 0); 4097 } 4098 for (i = 0; i < nchildren; i++) { 4099 if (tops[i] != NULL) 4100 continue; 4101 nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP); 4102 VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE, 4103 VDEV_TYPE_MISSING) == 0); 4104 VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID, 4105 i) == 0); 4106 VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID, 4107 0) == 0); 4108 } 4109 4110 /* 4111 * Create pool config based on the best vdev config. 4112 */ 4113 nvlist_dup(best_cfg, &config, KM_SLEEP); 4114 4115 /* 4116 * Put this pool's top-level vdevs into a root vdev. 4117 */ 4118 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 4119 &pgid) == 0); 4120 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 4121 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 4122 VDEV_TYPE_ROOT) == 0); 4123 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 4124 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 4125 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 4126 tops, nchildren) == 0); 4127 4128 /* 4129 * Replace the existing vdev_tree with the new root vdev in 4130 * this pool's configuration (remove the old, add the new). 4131 */ 4132 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 4133 4134 /* 4135 * Drop vdev config elements that should not be present at pool level. 4136 */ 4137 nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64); 4138 nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64); 4139 4140 for (i = 0; i < count; i++) 4141 nvlist_free(configs[i]); 4142 kmem_free(configs, count * sizeof(void *)); 4143 for (i = 0; i < nchildren; i++) 4144 nvlist_free(tops[i]); 4145 kmem_free(tops, nchildren * sizeof(void *)); 4146 nvlist_free(nvroot); 4147 return (config); 4148} 4149 4150int 4151spa_import_rootpool(const char *name) 4152{ 4153 spa_t *spa; 4154 vdev_t *rvd, *bvd, *avd = NULL; 4155 nvlist_t *config, *nvtop; 4156 uint64_t txg; 4157 char *pname; 4158 int error; 4159 4160 /* 4161 * Read the label from the boot device and generate a configuration. 
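 * (On FreeBSD the labels are read through GEOM via
 * vdev_geom_read_pool_label(), which may return one config per
 * top-level vdev; spa_generate_rootconf() keeps the one with the
 * newest txg.)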
4162 */ 4163 config = spa_generate_rootconf(name); 4164 4165 mutex_enter(&spa_namespace_lock); 4166 if (config != NULL) { 4167 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 4168 &pname) == 0 && strcmp(name, pname) == 0); 4169 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) 4170 == 0); 4171 4172 if ((spa = spa_lookup(pname)) != NULL) { 4173 /* 4174 * Remove the existing root pool from the namespace so 4175 * that we can replace it with the correct config 4176 * we just read in. 4177 */ 4178 spa_remove(spa); 4179 } 4180 spa = spa_add(pname, config, NULL); 4181 4182 /* 4183 * Set spa_ubsync.ub_version as it can be used in vdev_alloc() 4184 * via spa_version(). 4185 */ 4186 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 4187 &spa->spa_ubsync.ub_version) != 0) 4188 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 4189 } else if ((spa = spa_lookup(name)) == NULL) { 4190 mutex_exit(&spa_namespace_lock); 4191 nvlist_free(config); 4192 cmn_err(CE_NOTE, "Cannot find the pool label for '%s'", 4193 name); 4194 return (EIO); 4195 } else { 4196 VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0); 4197 } 4198 spa->spa_is_root = B_TRUE; 4199 spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 4200 4201 /* 4202 * Build up a vdev tree based on the boot device's label config. 4203 */ 4204 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 4205 &nvtop) == 0); 4206 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4207 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 4208 VDEV_ALLOC_ROOTPOOL); 4209 spa_config_exit(spa, SCL_ALL, FTAG); 4210 if (error) { 4211 mutex_exit(&spa_namespace_lock); 4212 nvlist_free(config); 4213 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 4214 pname); 4215 return (error); 4216 } 4217 4218 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4219 vdev_free(rvd); 4220 spa_config_exit(spa, SCL_ALL, FTAG); 4221 mutex_exit(&spa_namespace_lock); 4222 4223 nvlist_free(config); 4224 return (0); 4225} 4226 4227#endif /* illumos */ 4228#endif /* _KERNEL */ 4229 4230/* 4231 * Import a non-root pool into the system. 4232 */ 4233int 4234spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 4235{ 4236 spa_t *spa; 4237 char *altroot = NULL; 4238 spa_load_state_t state = SPA_LOAD_IMPORT; 4239 zpool_rewind_policy_t policy; 4240 uint64_t mode = spa_mode_global; 4241 uint64_t readonly = B_FALSE; 4242 int error; 4243 nvlist_t *nvroot; 4244 nvlist_t **spares, **l2cache; 4245 uint_t nspares, nl2cache; 4246 4247 /* 4248 * If a pool with this name exists, return failure. 4249 */ 4250 mutex_enter(&spa_namespace_lock); 4251 if (spa_lookup(pool) != NULL) { 4252 mutex_exit(&spa_namespace_lock); 4253 return (SET_ERROR(EEXIST)); 4254 } 4255 4256 /* 4257 * Create and initialize the spa structure. 4258 */ 4259 (void) nvlist_lookup_string(props, 4260 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 4261 (void) nvlist_lookup_uint64(props, 4262 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 4263 if (readonly) 4264 mode = FREAD; 4265 spa = spa_add(pool, config, altroot); 4266 spa->spa_import_flags = flags; 4267 4268 /* 4269 * Verbatim import - Take a pool and insert it into the namespace 4270 * as if it had been loaded at boot. 
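 * In the verbatim case spa_load() is skipped entirely: the supplied
 * config is trusted as-is, synced to the cache file, and the import
 * event is posted without opening a single vdev.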
4271 */ 4272 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 4273 if (props != NULL) 4274 spa_configfile_set(spa, props, B_FALSE); 4275 4276 spa_config_sync(spa, B_FALSE, B_TRUE); 4277 spa_event_notify(spa, NULL, ESC_ZFS_POOL_IMPORT); 4278 4279 mutex_exit(&spa_namespace_lock); 4280 return (0); 4281 } 4282 4283 spa_activate(spa, mode); 4284 4285 /* 4286 * Don't start async tasks until we know everything is healthy. 4287 */ 4288 spa_async_suspend(spa); 4289 4290 zpool_get_rewind_policy(config, &policy); 4291 if (policy.zrp_request & ZPOOL_DO_REWIND) 4292 state = SPA_LOAD_RECOVER; 4293 4294 /* 4295 * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig 4296 * because the user-supplied config is actually the one to trust when 4297 * doing an import. 4298 */ 4299 if (state != SPA_LOAD_RECOVER) 4300 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 4301 4302 error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, 4303 policy.zrp_request); 4304 4305 /* 4306 * Propagate anything learned while loading the pool and pass it 4307 * back to caller (i.e. rewind info, missing devices, etc). 4308 */ 4309 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4310 spa->spa_load_info) == 0); 4311 4312 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4313 /* 4314 * Toss any existing sparelist, as it doesn't have any validity 4315 * anymore, and conflicts with spa_has_spare(). 4316 */ 4317 if (spa->spa_spares.sav_config) { 4318 nvlist_free(spa->spa_spares.sav_config); 4319 spa->spa_spares.sav_config = NULL; 4320 spa_load_spares(spa); 4321 } 4322 if (spa->spa_l2cache.sav_config) { 4323 nvlist_free(spa->spa_l2cache.sav_config); 4324 spa->spa_l2cache.sav_config = NULL; 4325 spa_load_l2cache(spa); 4326 } 4327 4328 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 4329 &nvroot) == 0); 4330 if (error == 0) 4331 error = spa_validate_aux(spa, nvroot, -1ULL, 4332 VDEV_ALLOC_SPARE); 4333 if (error == 0) 4334 error = spa_validate_aux(spa, nvroot, -1ULL, 4335 VDEV_ALLOC_L2CACHE); 4336 spa_config_exit(spa, SCL_ALL, FTAG); 4337 4338 if (props != NULL) 4339 spa_configfile_set(spa, props, B_FALSE); 4340 4341 if (error != 0 || (props && spa_writeable(spa) && 4342 (error = spa_prop_set(spa, props)))) { 4343 spa_unload(spa); 4344 spa_deactivate(spa); 4345 spa_remove(spa); 4346 mutex_exit(&spa_namespace_lock); 4347 return (error); 4348 } 4349 4350 spa_async_resume(spa); 4351 4352 /* 4353 * Override any spares and level 2 cache devices as specified by 4354 * the user, as these may have correct device names/devids, etc. 
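 * The arrays from the caller's nvroot simply replace whatever the load
 * found; setting sav_sync ensures the result is pushed out with the
 * next config sync.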
4355 */ 4356 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 4357 &spares, &nspares) == 0) { 4358 if (spa->spa_spares.sav_config) 4359 VERIFY(nvlist_remove(spa->spa_spares.sav_config, 4360 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 4361 else 4362 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 4363 NV_UNIQUE_NAME, KM_SLEEP) == 0); 4364 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 4365 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 4366 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4367 spa_load_spares(spa); 4368 spa_config_exit(spa, SCL_ALL, FTAG); 4369 spa->spa_spares.sav_sync = B_TRUE; 4370 } 4371 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 4372 &l2cache, &nl2cache) == 0) { 4373 if (spa->spa_l2cache.sav_config) 4374 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 4375 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 4376 else 4377 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 4378 NV_UNIQUE_NAME, KM_SLEEP) == 0); 4379 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 4380 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 4381 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4382 spa_load_l2cache(spa); 4383 spa_config_exit(spa, SCL_ALL, FTAG); 4384 spa->spa_l2cache.sav_sync = B_TRUE; 4385 } 4386 4387 /* 4388 * Check for any removed devices. 4389 */ 4390 if (spa->spa_autoreplace) { 4391 spa_aux_check_removed(&spa->spa_spares); 4392 spa_aux_check_removed(&spa->spa_l2cache); 4393 } 4394 4395 if (spa_writeable(spa)) { 4396 /* 4397 * Update the config cache to include the newly-imported pool. 4398 */ 4399 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4400 } 4401 4402 /* 4403 * It's possible that the pool was expanded while it was exported. 4404 * We kick off an async task to handle this for us. 4405 */ 4406 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 4407 4408 spa_history_log_version(spa, "import"); 4409 4410 spa_event_notify(spa, NULL, ESC_ZFS_POOL_IMPORT); 4411 4412 mutex_exit(&spa_namespace_lock); 4413 4414#ifdef __FreeBSD__ 4415#ifdef _KERNEL 4416 zvol_create_minors(pool); 4417#endif 4418#endif 4419 return (0); 4420} 4421 4422nvlist_t * 4423spa_tryimport(nvlist_t *tryconfig) 4424{ 4425 nvlist_t *config = NULL; 4426 char *poolname; 4427 spa_t *spa; 4428 uint64_t state; 4429 int error; 4430 4431 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 4432 return (NULL); 4433 4434 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 4435 return (NULL); 4436 4437 /* 4438 * Create and initialize the spa structure. 4439 */ 4440 mutex_enter(&spa_namespace_lock); 4441 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 4442 spa_activate(spa, FREAD); 4443 4444 /* 4445 * Pass off the heavy lifting to spa_load(). 4446 * Pass TRUE for mosconfig because the user-supplied config 4447 * is actually the one to trust when doing an import. 4448 */ 4449 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); 4450 4451 /* 4452 * If 'tryconfig' was at least parsable, return the current config. 
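 * The pool was opened under TRYIMPORT_NAME, so the real pool name and
 * state are spliced back in, along with rewind info and (if set) a
 * bootfs name rewritten against 'poolname'.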
4453 */ 4454 if (spa->spa_root_vdev != NULL) { 4455 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 4456 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 4457 poolname) == 0); 4458 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 4459 state) == 0); 4460 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 4461 spa->spa_uberblock.ub_timestamp) == 0); 4462 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4463 spa->spa_load_info) == 0); 4464 4465 /* 4466 * If the bootfs property exists on this pool then we 4467 * copy it out so that external consumers can tell which 4468 * pools are bootable. 4469 */ 4470 if ((!error || error == EEXIST) && spa->spa_bootfs) { 4471 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 4472 4473 /* 4474 * We have to play games with the name since the 4475 * pool was opened as TRYIMPORT_NAME. 4476 */ 4477 if (dsl_dsobj_to_dsname(spa_name(spa), 4478 spa->spa_bootfs, tmpname) == 0) { 4479 char *cp; 4480 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 4481 4482 cp = strchr(tmpname, '/'); 4483 if (cp == NULL) { 4484 (void) strlcpy(dsname, tmpname, 4485 MAXPATHLEN); 4486 } else { 4487 (void) snprintf(dsname, MAXPATHLEN, 4488 "%s/%s", poolname, ++cp); 4489 } 4490 VERIFY(nvlist_add_string(config, 4491 ZPOOL_CONFIG_BOOTFS, dsname) == 0); 4492 kmem_free(dsname, MAXPATHLEN); 4493 } 4494 kmem_free(tmpname, MAXPATHLEN); 4495 } 4496 4497 /* 4498 * Add the list of hot spares and level 2 cache devices. 4499 */ 4500 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4501 spa_add_spares(spa, config); 4502 spa_add_l2cache(spa, config); 4503 spa_config_exit(spa, SCL_CONFIG, FTAG); 4504 } 4505 4506 spa_unload(spa); 4507 spa_deactivate(spa); 4508 spa_remove(spa); 4509 mutex_exit(&spa_namespace_lock); 4510 4511 return (config); 4512} 4513 4514/* 4515 * Pool export/destroy 4516 * 4517 * The act of destroying or exporting a pool is very simple. We make sure there 4518 * is no more pending I/O and any references to the pool are gone. Then, we 4519 * update the pool state and sync all the labels to disk, removing the 4520 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 4521 * we don't sync the labels or remove the configuration cache. 4522 */ 4523static int 4524spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 4525 boolean_t force, boolean_t hardforce) 4526{ 4527 spa_t *spa; 4528 4529 if (oldconfig) 4530 *oldconfig = NULL; 4531 4532 if (!(spa_mode_global & FWRITE)) 4533 return (SET_ERROR(EROFS)); 4534 4535 mutex_enter(&spa_namespace_lock); 4536 if ((spa = spa_lookup(pool)) == NULL) { 4537 mutex_exit(&spa_namespace_lock); 4538 return (SET_ERROR(ENOENT)); 4539 } 4540 4541 /* 4542 * Put a hold on the pool, drop the namespace lock, stop async tasks, 4543 * reacquire the namespace lock, and see if we can export. 4544 */ 4545 spa_open_ref(spa, FTAG); 4546 mutex_exit(&spa_namespace_lock); 4547 spa_async_suspend(spa); 4548 mutex_enter(&spa_namespace_lock); 4549 spa_close(spa, FTAG); 4550 4551 /* 4552 * The pool will be in core if it's openable, 4553 * in which case we can modify its state. 4554 */ 4555 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 4556 /* 4557 * Objsets may be open only because they're dirty, so we 4558 * have to force it to sync before checking spa_refcnt. 4559 */ 4560 txg_wait_synced(spa->spa_dsl_pool, 0); 4561 spa_evicting_os_wait(spa); 4562 4563 /* 4564 * A pool cannot be exported or destroyed if there are active 4565 * references. 
If we are resetting a pool, allow references by 4566	 * fault injection handlers. 4567	 */ 4568		if (!spa_refcount_zero(spa) || 4569		    (spa->spa_inject_ref != 0 && 4570		    new_state != POOL_STATE_UNINITIALIZED)) { 4571			spa_async_resume(spa); 4572			mutex_exit(&spa_namespace_lock); 4573			return (SET_ERROR(EBUSY)); 4574		} 4575 4576		/* 4577		 * A pool cannot be exported if it has an active shared spare. 4578		 * This is to prevent other pools from stealing the active spare 4579		 * from an exported pool. The user can override this with 4580		 * a forced export. 4581		 */ 4582		if (!force && new_state == POOL_STATE_EXPORTED && 4583		    spa_has_active_shared_spare(spa)) { 4584			spa_async_resume(spa); 4585			mutex_exit(&spa_namespace_lock); 4586			return (SET_ERROR(EXDEV)); 4587		} 4588 4589		/* 4590		 * We want this to be reflected on every label, 4591		 * so mark them all dirty.  spa_unload() will do the 4592		 * final sync that pushes these changes out. 4593		 */ 4594		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 4595			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4596			spa->spa_state = new_state; 4597			spa->spa_final_txg = spa_last_synced_txg(spa) + 4598			    TXG_DEFER_SIZE + 1; 4599			vdev_config_dirty(spa->spa_root_vdev); 4600			spa_config_exit(spa, SCL_ALL, FTAG); 4601		} 4602	} 4603 4604	spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 4605 4606	if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 4607		spa_unload(spa); 4608		spa_deactivate(spa); 4609	} 4610 4611	if (oldconfig && spa->spa_config) 4612		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 4613 4614	if (new_state != POOL_STATE_UNINITIALIZED) { 4615		if (!hardforce) 4616			spa_config_sync(spa, B_TRUE, B_TRUE); 4617		spa_remove(spa); 4618	} 4619	mutex_exit(&spa_namespace_lock); 4620 4621	return (0); 4622} 4623 4624/* 4625 * Destroy a storage pool. 4626 */ 4627int 4628spa_destroy(char *pool) 4629{ 4630	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 4631	    B_FALSE, B_FALSE)); 4632} 4633 4634/* 4635 * Export a storage pool. 4636 */ 4637int 4638spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, 4639    boolean_t hardforce) 4640{ 4641	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 4642	    force, hardforce)); 4643} 4644 4645/* 4646 * Similar to spa_export(), this unloads the spa_t without actually removing it 4647 * from the namespace in any way. 4648 */ 4649int 4650spa_reset(char *pool) 4651{ 4652	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 4653	    B_FALSE, B_FALSE)); 4654} 4655 4656/* 4657 * ========================================================================== 4658 * Device manipulation 4659 * ========================================================================== 4660 */ 4661 4662/* 4663 * Add a device to a storage pool.
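 * 'nvroot' is a root nvlist whose children describe the new top-level
 * vdevs; it may also carry ZPOOL_CONFIG_SPARES and ZPOOL_CONFIG_L2CACHE
 * arrays, which are validated and attached separately below.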
4664 */ 4665int 4666spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 4667{ 4668 uint64_t txg, id; 4669 int error; 4670 vdev_t *rvd = spa->spa_root_vdev; 4671 vdev_t *vd, *tvd; 4672 nvlist_t **spares, **l2cache; 4673 uint_t nspares, nl2cache; 4674 4675 ASSERT(spa_writeable(spa)); 4676 4677 txg = spa_vdev_enter(spa); 4678 4679 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 4680 VDEV_ALLOC_ADD)) != 0) 4681 return (spa_vdev_exit(spa, NULL, txg, error)); 4682 4683 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 4684 4685 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 4686 &nspares) != 0) 4687 nspares = 0; 4688 4689 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 4690 &nl2cache) != 0) 4691 nl2cache = 0; 4692 4693 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 4694 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 4695 4696 if (vd->vdev_children != 0 && 4697 (error = vdev_create(vd, txg, B_FALSE)) != 0) 4698 return (spa_vdev_exit(spa, vd, txg, error)); 4699 4700 /* 4701 * We must validate the spares and l2cache devices after checking the 4702 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 4703 */ 4704 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 4705 return (spa_vdev_exit(spa, vd, txg, error)); 4706 4707 /* 4708 * Transfer each new top-level vdev from vd to rvd. 4709 */ 4710 for (int c = 0; c < vd->vdev_children; c++) { 4711 4712 /* 4713 * Set the vdev id to the first hole, if one exists. 4714 */ 4715 for (id = 0; id < rvd->vdev_children; id++) { 4716 if (rvd->vdev_child[id]->vdev_ishole) { 4717 vdev_free(rvd->vdev_child[id]); 4718 break; 4719 } 4720 } 4721 tvd = vd->vdev_child[c]; 4722 vdev_remove_child(vd, tvd); 4723 tvd->vdev_id = id; 4724 vdev_add_child(rvd, tvd); 4725 vdev_config_dirty(tvd); 4726 } 4727 4728 if (nspares != 0) { 4729 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 4730 ZPOOL_CONFIG_SPARES); 4731 spa_load_spares(spa); 4732 spa->spa_spares.sav_sync = B_TRUE; 4733 } 4734 4735 if (nl2cache != 0) { 4736 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 4737 ZPOOL_CONFIG_L2CACHE); 4738 spa_load_l2cache(spa); 4739 spa->spa_l2cache.sav_sync = B_TRUE; 4740 } 4741 4742 /* 4743 * We have to be careful when adding new vdevs to an existing pool. 4744 * If other threads start allocating from these vdevs before we 4745 * sync the config cache, and we lose power, then upon reboot we may 4746 * fail to open the pool because there are DVAs that the config cache 4747 * can't translate. Therefore, we first add the vdevs without 4748 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 4749 * and then let spa_config_update() initialize the new metaslabs. 4750 * 4751 * spa_load() checks for added-but-not-initialized vdevs, so that 4752 * if we lose power at any point in this sequence, the remaining 4753 * steps will be completed the next time we load the pool. 4754 */ 4755 (void) spa_vdev_exit(spa, vd, txg, 0); 4756 4757 mutex_enter(&spa_namespace_lock); 4758 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 4759 spa_event_notify(spa, NULL, ESC_ZFS_VDEV_ADD); 4760 mutex_exit(&spa_namespace_lock); 4761 4762 return (0); 4763} 4764 4765/* 4766 * Attach a device to a mirror. The arguments are the path to any device 4767 * in the mirror, and the nvroot for the new device. If the path specifies 4768 * a device that is not mirrored, we automatically insert the mirror vdev. 
4769 * 4770 * If 'replacing' is specified, the new device is intended to replace the 4771 * existing device; in this case the two devices are made into their own 4772 * mirror using the 'replacing' vdev, which is functionally identical to 4773 * the mirror vdev (it actually reuses all the same ops) but has a few 4774 * extra rules: you can't attach to it after it's been created, and upon 4775 * completion of resilvering, the first disk (the one being replaced) 4776 * is automatically detached. 4777 */ 4778int 4779spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 4780{ 4781 uint64_t txg, dtl_max_txg; 4782 vdev_t *rvd = spa->spa_root_vdev; 4783 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 4784 vdev_ops_t *pvops; 4785 char *oldvdpath, *newvdpath; 4786 int newvd_isspare; 4787 int error; 4788 4789 ASSERT(spa_writeable(spa)); 4790 4791 txg = spa_vdev_enter(spa); 4792 4793 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 4794 4795 if (oldvd == NULL) 4796 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4797 4798 if (!oldvd->vdev_ops->vdev_op_leaf) 4799 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4800 4801 pvd = oldvd->vdev_parent; 4802 4803 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 4804 VDEV_ALLOC_ATTACH)) != 0) 4805 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4806 4807 if (newrootvd->vdev_children != 1) 4808 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 4809 4810 newvd = newrootvd->vdev_child[0]; 4811 4812 if (!newvd->vdev_ops->vdev_op_leaf) 4813 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 4814 4815 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 4816 return (spa_vdev_exit(spa, newrootvd, txg, error)); 4817 4818 /* 4819 * Spares can't replace logs 4820 */ 4821 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 4822 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4823 4824 if (!replacing) { 4825 /* 4826 * For attach, the only allowable parent is a mirror or the root 4827 * vdev. 4828 */ 4829 if (pvd->vdev_ops != &vdev_mirror_ops && 4830 pvd->vdev_ops != &vdev_root_ops) 4831 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4832 4833 pvops = &vdev_mirror_ops; 4834 } else { 4835 /* 4836 * Active hot spares can only be replaced by inactive hot 4837 * spares. 4838 */ 4839 if (pvd->vdev_ops == &vdev_spare_ops && 4840 oldvd->vdev_isspare && 4841 !spa_has_spare(spa, newvd->vdev_guid)) 4842 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4843 4844 /* 4845 * If the source is a hot spare, and the parent isn't already a 4846 * spare, then we want to create a new hot spare. Otherwise, we 4847 * want to create a replacing vdev. The user is not allowed to 4848 * attach to a spared vdev child unless the 'isspare' state is 4849 * the same (spare replaces spare, non-spare replaces 4850 * non-spare). 4851 */ 4852 if (pvd->vdev_ops == &vdev_replacing_ops && 4853 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 4854 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4855 } else if (pvd->vdev_ops == &vdev_spare_ops && 4856 newvd->vdev_isspare != oldvd->vdev_isspare) { 4857 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 4858 } 4859 4860 if (newvd->vdev_isspare) 4861 pvops = &vdev_spare_ops; 4862 else 4863 pvops = &vdev_replacing_ops; 4864 } 4865 4866 /* 4867 * Make sure the new device is big enough. 
4868 */ 4869 if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 4870 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 4871 4872 /* 4873 * The new device cannot have a higher alignment requirement 4874 * than the top-level vdev. 4875 */ 4876 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 4877 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 4878 4879 /* 4880 * If this is an in-place replacement, update oldvd's path and devid 4881 * to make it distinguishable from newvd, and unopenable from now on. 4882 */ 4883 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 4884 spa_strfree(oldvd->vdev_path); 4885 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 4886 KM_SLEEP); 4887 (void) sprintf(oldvd->vdev_path, "%s/%s", 4888 newvd->vdev_path, "old"); 4889 if (oldvd->vdev_devid != NULL) { 4890 spa_strfree(oldvd->vdev_devid); 4891 oldvd->vdev_devid = NULL; 4892 } 4893 } 4894 4895 /* mark the device being resilvered */ 4896 newvd->vdev_resilver_txg = txg; 4897 4898 /* 4899 * If the parent is not a mirror, or if we're replacing, insert the new 4900 * mirror/replacing/spare vdev above oldvd. 4901 */ 4902 if (pvd->vdev_ops != pvops) 4903 pvd = vdev_add_parent(oldvd, pvops); 4904 4905 ASSERT(pvd->vdev_top->vdev_parent == rvd); 4906 ASSERT(pvd->vdev_ops == pvops); 4907 ASSERT(oldvd->vdev_parent == pvd); 4908 4909 /* 4910 * Extract the new device from its root and add it to pvd. 4911 */ 4912 vdev_remove_child(newrootvd, newvd); 4913 newvd->vdev_id = pvd->vdev_children; 4914 newvd->vdev_crtxg = oldvd->vdev_crtxg; 4915 vdev_add_child(pvd, newvd); 4916 4917 tvd = newvd->vdev_top; 4918 ASSERT(pvd->vdev_top == tvd); 4919 ASSERT(tvd->vdev_parent == rvd); 4920 4921 vdev_config_dirty(tvd); 4922 4923 /* 4924 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 4925 * for any dmu_sync-ed blocks. It will propagate upward when 4926 * spa_vdev_exit() calls vdev_dtl_reassess(). 4927 */ 4928 dtl_max_txg = txg + TXG_CONCURRENT_STATES; 4929 4930 vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, 4931 dtl_max_txg - TXG_INITIAL); 4932 4933 if (newvd->vdev_isspare) { 4934 spa_spare_activate(newvd); 4935 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); 4936 } 4937 4938 oldvdpath = spa_strdup(oldvd->vdev_path); 4939 newvdpath = spa_strdup(newvd->vdev_path); 4940 newvd_isspare = newvd->vdev_isspare; 4941 4942 /* 4943 * Mark newvd's DTL dirty in this txg. 4944 */ 4945 vdev_dirty(tvd, VDD_DTL, newvd, txg); 4946 4947 /* 4948 * Schedule the resilver to restart in the future. We do this to 4949 * ensure that dmu_sync-ed blocks have been stitched into the 4950 * respective datasets. 4951 */ 4952 dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); 4953 4954 if (spa->spa_bootfs) 4955 spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH); 4956 4957 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_ATTACH); 4958 4959 /* 4960 * Commit the config 4961 */ 4962 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); 4963 4964 spa_history_log_internal(spa, "vdev attach", NULL, 4965 "%s vdev=%s %s vdev=%s", 4966 replacing && newvd_isspare ? "spare in" : 4967 replacing ? "replace" : "attach", newvdpath, 4968 replacing ? "for" : "to", oldvdpath); 4969 4970 spa_strfree(oldvdpath); 4971 spa_strfree(newvdpath); 4972 4973 return (0); 4974} 4975 4976/* 4977 * Detach a device from a mirror or replacing vdev. 4978 * 4979 * If 'replace_done' is specified, only detach if the parent 4980 * is a replacing vdev. 
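 * (A 'spare' parent is accepted as well; see the vdev_spare_ops checks
 * below.)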
4981 */ 4982int 4983spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 4984{ 4985 uint64_t txg; 4986 int error; 4987 vdev_t *rvd = spa->spa_root_vdev; 4988 vdev_t *vd, *pvd, *cvd, *tvd; 4989 boolean_t unspare = B_FALSE; 4990 uint64_t unspare_guid = 0; 4991 char *vdpath; 4992 4993 ASSERT(spa_writeable(spa)); 4994 4995 txg = spa_vdev_enter(spa); 4996 4997 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 4998 4999 if (vd == NULL) 5000 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 5001 5002 if (!vd->vdev_ops->vdev_op_leaf) 5003 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 5004 5005 pvd = vd->vdev_parent; 5006 5007 /* 5008 * If the parent/child relationship is not as expected, don't do it. 5009 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 5010 * vdev that's replacing B with C. The user's intent in replacing 5011 * is to go from M(A,B) to M(A,C). If the user decides to cancel 5012 * the replace by detaching C, the expected behavior is to end up 5013 * M(A,B). But suppose that right after deciding to detach C, 5014 * the replacement of B completes. We would have M(A,C), and then 5015 * ask to detach C, which would leave us with just A -- not what 5016 * the user wanted. To prevent this, we make sure that the 5017 * parent/child relationship hasn't changed -- in this example, 5018 * that C's parent is still the replacing vdev R. 5019 */ 5020 if (pvd->vdev_guid != pguid && pguid != 0) 5021 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 5022 5023 /* 5024 * Only 'replacing' or 'spare' vdevs can be replaced. 5025 */ 5026 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 5027 pvd->vdev_ops != &vdev_spare_ops) 5028 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 5029 5030 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 5031 spa_version(spa) >= SPA_VERSION_SPARES); 5032 5033 /* 5034 * Only mirror, replacing, and spare vdevs support detach. 5035 */ 5036 if (pvd->vdev_ops != &vdev_replacing_ops && 5037 pvd->vdev_ops != &vdev_mirror_ops && 5038 pvd->vdev_ops != &vdev_spare_ops) 5039 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 5040 5041 /* 5042 * If this device has the only valid copy of some data, 5043 * we cannot safely detach it. 5044 */ 5045 if (vdev_dtl_required(vd)) 5046 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 5047 5048 ASSERT(pvd->vdev_children >= 2); 5049 5050 /* 5051 * If we are detaching the second disk from a replacing vdev, then 5052 * check to see if we changed the original vdev's path to have "/old" 5053 * at the end in spa_vdev_attach(). If so, undo that change now. 5054 */ 5055 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 5056 vd->vdev_path != NULL) { 5057 size_t len = strlen(vd->vdev_path); 5058 5059 for (int c = 0; c < pvd->vdev_children; c++) { 5060 cvd = pvd->vdev_child[c]; 5061 5062 if (cvd == vd || cvd->vdev_path == NULL) 5063 continue; 5064 5065 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 5066 strcmp(cvd->vdev_path + len, "/old") == 0) { 5067 spa_strfree(cvd->vdev_path); 5068 cvd->vdev_path = spa_strdup(vd->vdev_path); 5069 break; 5070 } 5071 } 5072 } 5073 5074 /* 5075 * If we are detaching the original disk from a spare, then it implies 5076 * that the spare should become a real disk, and be removed from the 5077 * active spare list for the pool. 5078 */ 5079 if (pvd->vdev_ops == &vdev_spare_ops && 5080 vd->vdev_id == 0 && 5081 pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) 5082 unspare = B_TRUE; 5083 5084 /* 5085 * Erase the disk labels so the disk can be used for other things. 
5086 * This must be done after all other error cases are handled, 5087 * but before we disembowel vd (so we can still do I/O to it). 5088 * But if we can't do it, don't treat the error as fatal -- 5089 * it may be that the unwritability of the disk is the reason 5090 * it's being detached! 5091 */ 5092 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 5093 5094 /* 5095 * Remove vd from its parent and compact the parent's children. 5096 */ 5097 vdev_remove_child(pvd, vd); 5098 vdev_compact_children(pvd); 5099 5100 /* 5101 * Remember one of the remaining children so we can get tvd below. 5102 */ 5103 cvd = pvd->vdev_child[pvd->vdev_children - 1]; 5104 5105 /* 5106 * If we need to remove the remaining child from the list of hot spares, 5107 * do it now, marking the vdev as no longer a spare in the process. 5108 * We must do this before vdev_remove_parent(), because that can 5109 * change the GUID if it creates a new toplevel GUID. For a similar 5110 * reason, we must remove the spare now, in the same txg as the detach; 5111 * otherwise someone could attach a new sibling, change the GUID, and 5112 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 5113 */ 5114 if (unspare) { 5115 ASSERT(cvd->vdev_isspare); 5116 spa_spare_remove(cvd); 5117 unspare_guid = cvd->vdev_guid; 5118 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 5119 cvd->vdev_unspare = B_TRUE; 5120 } 5121 5122 /* 5123 * If the parent mirror/replacing vdev only has one child, 5124 * the parent is no longer needed. Remove it from the tree. 5125 */ 5126 if (pvd->vdev_children == 1) { 5127 if (pvd->vdev_ops == &vdev_spare_ops) 5128 cvd->vdev_unspare = B_FALSE; 5129 vdev_remove_parent(cvd); 5130 } 5131 5132 5133 /* 5134 * We don't set tvd until now because the parent we just removed 5135 * may have been the previous top-level vdev. 5136 */ 5137 tvd = cvd->vdev_top; 5138 ASSERT(tvd->vdev_parent == rvd); 5139 5140 /* 5141 * Reevaluate the parent vdev state. 5142 */ 5143 vdev_propagate_state(cvd); 5144 5145 /* 5146 * If the 'autoexpand' property is set on the pool then automatically 5147 * try to expand the size of the pool. For example if the device we 5148 * just detached was smaller than the others, it may be possible to 5149 * add metaslabs (i.e. grow the pool). We need to reopen the vdev 5150 * first so that we can obtain the updated sizes of the leaf vdevs. 5151 */ 5152 if (spa->spa_autoexpand) { 5153 vdev_reopen(tvd); 5154 vdev_expand(tvd, txg); 5155 } 5156 5157 vdev_config_dirty(tvd); 5158 5159 /* 5160 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 5161 * vd->vdev_detached is set and free vd's DTL object in syncing context. 5162 * But first make sure we're not on any *other* txg's DTL list, to 5163 * prevent vd from being accessed after it's freed. 5164 */ 5165 vdpath = spa_strdup(vd->vdev_path); 5166 for (int t = 0; t < TXG_SIZE; t++) 5167 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 5168 vd->vdev_detached = B_TRUE; 5169 vdev_dirty(tvd, VDD_DTL, vd, txg); 5170 5171 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 5172 5173 /* hang on to the spa before we release the lock */ 5174 spa_open_ref(spa, FTAG); 5175 5176 error = spa_vdev_exit(spa, vd, txg, 0); 5177 5178 spa_history_log_internal(spa, "detach", NULL, 5179 "vdev=%s", vdpath); 5180 spa_strfree(vdpath); 5181 5182 /* 5183 * If this was the removal of the original device in a hot spare vdev, 5184 * then we want to go through and remove the device from the hot spare 5185 * list of every other pool. 
5186 */ 5187 if (unspare) { 5188 spa_t *altspa = NULL; 5189 5190 mutex_enter(&spa_namespace_lock); 5191 while ((altspa = spa_next(altspa)) != NULL) { 5192 if (altspa->spa_state != POOL_STATE_ACTIVE || 5193 altspa == spa) 5194 continue; 5195 5196 spa_open_ref(altspa, FTAG); 5197 mutex_exit(&spa_namespace_lock); 5198 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 5199 mutex_enter(&spa_namespace_lock); 5200 spa_close(altspa, FTAG); 5201 } 5202 mutex_exit(&spa_namespace_lock); 5203 5204 /* search the rest of the vdevs for spares to remove */ 5205 spa_vdev_resilver_done(spa); 5206 } 5207 5208 /* all done with the spa; OK to release */ 5209 mutex_enter(&spa_namespace_lock); 5210 spa_close(spa, FTAG); 5211 mutex_exit(&spa_namespace_lock); 5212 5213 return (error); 5214} 5215 5216/* 5217 * Split a set of devices from their mirrors, and create a new pool from them. 5218 */ 5219int 5220spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, 5221 nvlist_t *props, boolean_t exp) 5222{ 5223 int error = 0; 5224 uint64_t txg, *glist; 5225 spa_t *newspa; 5226 uint_t c, children, lastlog; 5227 nvlist_t **child, *nvl, *tmp; 5228 dmu_tx_t *tx; 5229 char *altroot = NULL; 5230 vdev_t *rvd, **vml = NULL; /* vdev modify list */ 5231 boolean_t activate_slog; 5232 5233 ASSERT(spa_writeable(spa)); 5234 5235 txg = spa_vdev_enter(spa); 5236 5237 /* clear the log and flush everything up to now */ 5238 activate_slog = spa_passivate_log(spa); 5239 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5240 error = spa_offline_log(spa); 5241 txg = spa_vdev_config_enter(spa); 5242 5243 if (activate_slog) 5244 spa_activate_log(spa); 5245 5246 if (error != 0) 5247 return (spa_vdev_exit(spa, NULL, txg, error)); 5248 5249 /* check new spa name before going any further */ 5250 if (spa_lookup(newname) != NULL) 5251 return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 5252 5253 /* 5254 * scan through all the children to ensure they're all mirrors 5255 */ 5256 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 5257 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 5258 &children) != 0) 5259 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5260 5261 /* first, check to ensure we've got the right child count */ 5262 rvd = spa->spa_root_vdev; 5263 lastlog = 0; 5264 for (c = 0; c < rvd->vdev_children; c++) { 5265 vdev_t *vd = rvd->vdev_child[c]; 5266 5267 /* don't count the holes & logs as children */ 5268 if (vd->vdev_islog || vd->vdev_ishole) { 5269 if (lastlog == 0) 5270 lastlog = c; 5271 continue; 5272 } 5273 5274 lastlog = 0; 5275 } 5276 if (children != (lastlog != 0 ? 
lastlog : rvd->vdev_children)) 5277 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5278 5279 /* next, ensure no spare or cache devices are part of the split */ 5280 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 5281 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 5282 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 5283 5284 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 5285 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 5286 5287 /* then, loop over each vdev and validate it */ 5288 for (c = 0; c < children; c++) { 5289 uint64_t is_hole = 0; 5290 5291 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 5292 &is_hole); 5293 5294 if (is_hole != 0) { 5295 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 5296 spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 5297 continue; 5298 } else { 5299 error = SET_ERROR(EINVAL); 5300 break; 5301 } 5302 } 5303 5304 /* which disk is going to be split? */ 5305 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 5306 &glist[c]) != 0) { 5307 error = SET_ERROR(EINVAL); 5308 break; 5309 } 5310 5311 /* look it up in the spa */ 5312 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 5313 if (vml[c] == NULL) { 5314 error = SET_ERROR(ENODEV); 5315 break; 5316 } 5317 5318 /* make sure there's nothing stopping the split */ 5319 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 5320 vml[c]->vdev_islog || 5321 vml[c]->vdev_ishole || 5322 vml[c]->vdev_isspare || 5323 vml[c]->vdev_isl2cache || 5324 !vdev_writeable(vml[c]) || 5325 vml[c]->vdev_children != 0 || 5326 vml[c]->vdev_state != VDEV_STATE_HEALTHY || 5327 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 5328 error = SET_ERROR(EINVAL); 5329 break; 5330 } 5331 5332 if (vdev_dtl_required(vml[c])) { 5333 error = SET_ERROR(EBUSY); 5334 break; 5335 } 5336 5337 /* we need certain info from the top level */ 5338 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 5339 vml[c]->vdev_top->vdev_ms_array) == 0); 5340 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 5341 vml[c]->vdev_top->vdev_ms_shift) == 0); 5342 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 5343 vml[c]->vdev_top->vdev_asize) == 0); 5344 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 5345 vml[c]->vdev_top->vdev_ashift) == 0); 5346 } 5347 5348 if (error != 0) { 5349 kmem_free(vml, children * sizeof (vdev_t *)); 5350 kmem_free(glist, children * sizeof (uint64_t)); 5351 return (spa_vdev_exit(spa, NULL, txg, error)); 5352 } 5353 5354 /* stop writers from using the disks */ 5355 for (c = 0; c < children; c++) { 5356 if (vml[c] != NULL) 5357 vml[c]->vdev_offline = B_TRUE; 5358 } 5359 vdev_reopen(spa->spa_root_vdev); 5360 5361 /* 5362 * Temporarily record the splitting vdevs in the spa config. This 5363 * will disappear once the config is regenerated. 5364 */ 5365 VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5366 VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 5367 glist, children) == 0); 5368 kmem_free(glist, children * sizeof (uint64_t)); 5369 5370 mutex_enter(&spa->spa_props_lock); 5371 VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, 5372 nvl) == 0); 5373 mutex_exit(&spa->spa_props_lock); 5374 spa->spa_config_splitting = nvl; 5375 vdev_config_dirty(spa->spa_root_vdev); 5376 5377 /* configure and create the new pool */ 5378 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); 5379 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 5380 exp ? 
POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); 5381 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 5382 spa_version(spa)) == 0); 5383 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, 5384 spa->spa_config_txg) == 0); 5385 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 5386 spa_generate_guid(NULL)) == 0); 5387 (void) nvlist_lookup_string(props, 5388 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 5389 5390 /* add the new pool to the namespace */ 5391 newspa = spa_add(newname, config, altroot); 5392 newspa->spa_config_txg = spa->spa_config_txg; 5393 spa_set_log_state(newspa, SPA_LOG_CLEAR); 5394 5395 /* release the spa config lock, retaining the namespace lock */ 5396 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5397 5398 if (zio_injection_enabled) 5399 zio_handle_panic_injection(spa, FTAG, 1); 5400 5401 spa_activate(newspa, spa_mode_global); 5402 spa_async_suspend(newspa); 5403 5404#ifndef illumos 5405 /* mark that we are creating new spa by splitting */ 5406 newspa->spa_splitting_newspa = B_TRUE; 5407#endif 5408 /* create the new pool from the disks of the original pool */ 5409 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); 5410#ifndef illumos 5411 newspa->spa_splitting_newspa = B_FALSE; 5412#endif 5413 if (error) 5414 goto out; 5415 5416 /* if that worked, generate a real config for the new pool */ 5417 if (newspa->spa_root_vdev != NULL) { 5418 VERIFY(nvlist_alloc(&newspa->spa_config_splitting, 5419 NV_UNIQUE_NAME, KM_SLEEP) == 0); 5420 VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, 5421 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); 5422 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 5423 B_TRUE)); 5424 } 5425 5426 /* set the props */ 5427 if (props != NULL) { 5428 spa_configfile_set(newspa, props, B_FALSE); 5429 error = spa_prop_set(newspa, props); 5430 if (error) 5431 goto out; 5432 } 5433 5434 /* flush everything */ 5435 txg = spa_vdev_config_enter(newspa); 5436 vdev_config_dirty(newspa->spa_root_vdev); 5437 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 5438 5439 if (zio_injection_enabled) 5440 zio_handle_panic_injection(spa, FTAG, 2); 5441 5442 spa_async_resume(newspa); 5443 5444 /* finally, update the original pool's config */ 5445 txg = spa_vdev_config_enter(spa); 5446 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 5447 error = dmu_tx_assign(tx, TXG_WAIT); 5448 if (error != 0) 5449 dmu_tx_abort(tx); 5450 for (c = 0; c < children; c++) { 5451 if (vml[c] != NULL) { 5452 vdev_split(vml[c]); 5453 if (error == 0) 5454 spa_history_log_internal(spa, "detach", tx, 5455 "vdev=%s", vml[c]->vdev_path); 5456 vdev_free(vml[c]); 5457 } 5458 } 5459 vdev_config_dirty(spa->spa_root_vdev); 5460 spa->spa_config_splitting = NULL; 5461 nvlist_free(nvl); 5462 if (error == 0) 5463 dmu_tx_commit(tx); 5464 (void) spa_vdev_exit(spa, NULL, txg, 0); 5465 5466 if (zio_injection_enabled) 5467 zio_handle_panic_injection(spa, FTAG, 3); 5468 5469 /* split is complete; log a history record */ 5470 spa_history_log_internal(newspa, "split", NULL, 5471 "from pool %s", spa_name(spa)); 5472 5473 kmem_free(vml, children * sizeof (vdev_t *)); 5474 5475 /* if we're not going to mount the filesystems in userland, export */ 5476 if (exp) 5477 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 5478 B_FALSE, B_FALSE); 5479 5480 return (error); 5481 5482out: 5483 spa_unload(newspa); 5484 spa_deactivate(newspa); 5485 spa_remove(newspa); 5486 5487 txg = spa_vdev_config_enter(spa); 5488 5489 /* re-online all offlined disks 
*/ 5490 for (c = 0; c < children; c++) { 5491 if (vml[c] != NULL) 5492 vml[c]->vdev_offline = B_FALSE; 5493 } 5494 vdev_reopen(spa->spa_root_vdev); 5495 5496 nvlist_free(spa->spa_config_splitting); 5497 spa->spa_config_splitting = NULL; 5498 (void) spa_vdev_exit(spa, NULL, txg, error); 5499 5500 kmem_free(vml, children * sizeof (vdev_t *)); 5501 return (error); 5502} 5503 5504static nvlist_t * 5505spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 5506{ 5507 for (int i = 0; i < count; i++) { 5508 uint64_t guid; 5509 5510 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 5511 &guid) == 0); 5512 5513 if (guid == target_guid) 5514 return (nvpp[i]); 5515 } 5516 5517 return (NULL); 5518} 5519 5520static void 5521spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 5522 nvlist_t *dev_to_remove) 5523{ 5524 nvlist_t **newdev = NULL; 5525 5526 if (count > 1) 5527 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 5528 5529 for (int i = 0, j = 0; i < count; i++) { 5530 if (dev[i] == dev_to_remove) 5531 continue; 5532 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 5533 } 5534 5535 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 5536 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 5537 5538 for (int i = 0; i < count - 1; i++) 5539 nvlist_free(newdev[i]); 5540 5541 if (count > 1) 5542 kmem_free(newdev, (count - 1) * sizeof (void *)); 5543} 5544 5545/* 5546 * Evacuate the device. 5547 */ 5548static int 5549spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) 5550{ 5551 uint64_t txg; 5552 int error = 0; 5553 5554 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5555 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5556 ASSERT(vd == vd->vdev_top); 5557 5558 /* 5559 * Evacuate the device. We don't hold the config lock as writer 5560 * since we need to do I/O but we do keep the 5561 * spa_namespace_lock held. Once this completes the device 5562 * should no longer have any blocks allocated on it. 5563 */ 5564 if (vd->vdev_islog) { 5565 if (vd->vdev_stat.vs_alloc != 0) 5566 error = spa_offline_log(spa); 5567 } else { 5568 error = SET_ERROR(ENOTSUP); 5569 } 5570 5571 if (error) 5572 return (error); 5573 5574 /* 5575 * The evacuation succeeded. Remove any remaining MOS metadata 5576 * associated with this vdev, and wait for these changes to sync. 5577 */ 5578 ASSERT0(vd->vdev_stat.vs_alloc); 5579 txg = spa_vdev_config_enter(spa); 5580 vd->vdev_removing = B_TRUE; 5581 vdev_dirty_leaves(vd, VDD_DTL, txg); 5582 vdev_config_dirty(vd); 5583 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 5584 5585 return (0); 5586} 5587 5588/* 5589 * Complete the removal by cleaning up the namespace. 5590 */ 5591static void 5592spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) 5593{ 5594 vdev_t *rvd = spa->spa_root_vdev; 5595 uint64_t id = vd->vdev_id; 5596 boolean_t last_vdev = (id == (rvd->vdev_children - 1)); 5597 5598 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5599 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 5600 ASSERT(vd == vd->vdev_top); 5601 5602 /* 5603 * Only remove any devices which are empty. 
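 * The vacated slot is either compacted away (if this was the last
 * top-level vdev) or backfilled with an explicit hole vdev so that the
 * remaining vdev ids are preserved.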
5604 */ 5605 if (vd->vdev_stat.vs_alloc != 0) 5606 return; 5607 5608 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 5609 5610 if (list_link_active(&vd->vdev_state_dirty_node)) 5611 vdev_state_clean(vd); 5612 if (list_link_active(&vd->vdev_config_dirty_node)) 5613 vdev_config_clean(vd); 5614 5615 vdev_free(vd); 5616 5617 if (last_vdev) { 5618 vdev_compact_children(rvd); 5619 } else { 5620 vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); 5621 vdev_add_child(rvd, vd); 5622 } 5623 vdev_config_dirty(rvd); 5624 5625 /* 5626 * Reassess the health of our root vdev. 5627 */ 5628 vdev_reopen(rvd); 5629} 5630 5631/* 5632 * Remove a device from the pool - 5633 * 5634 * Removing a device from the vdev namespace requires several steps 5635 * and can take a significant amount of time. As a result we use 5636 * the spa_vdev_config_[enter/exit] functions which allow us to 5637 * grab and release the spa_config_lock while still holding the namespace 5638 * lock. During each step the configuration is synced out. 5639 * 5640 * Currently, this supports removing only hot spares, slogs, and level 2 ARC 5641 * devices. 5642 */ 5643int 5644spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 5645{ 5646 vdev_t *vd; 5647 sysevent_t *ev = NULL; 5648 metaslab_group_t *mg; 5649 nvlist_t **spares, **l2cache, *nv; 5650 uint64_t txg = 0; 5651 uint_t nspares, nl2cache; 5652 int error = 0; 5653 boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 5654 5655 ASSERT(spa_writeable(spa)); 5656 5657 if (!locked) 5658 txg = spa_vdev_enter(spa); 5659 5660 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 5661 5662 if (spa->spa_spares.sav_vdevs != NULL && 5663 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 5664 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 5665 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 5666 /* 5667 * Only remove the hot spare if it's not currently in use 5668 * in this pool. 5669 */ 5670 if (vd == NULL || unspare) { 5671 if (vd == NULL) 5672 vd = spa_lookup_by_guid(spa, guid, B_TRUE); 5673 ev = spa_event_create(spa, vd, ESC_ZFS_VDEV_REMOVE_AUX); 5674 spa_vdev_remove_aux(spa->spa_spares.sav_config, 5675 ZPOOL_CONFIG_SPARES, spares, nspares, nv); 5676 spa_load_spares(spa); 5677 spa->spa_spares.sav_sync = B_TRUE; 5678 } else { 5679 error = SET_ERROR(EBUSY); 5680 } 5681 } else if (spa->spa_l2cache.sav_vdevs != NULL && 5682 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 5683 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 5684 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 5685 /* 5686 * Cache devices can always be removed. 5687 */ 5688 vd = spa_lookup_by_guid(spa, guid, B_TRUE); 5689 ev = spa_event_create(spa, vd, ESC_ZFS_VDEV_REMOVE_AUX); 5690 spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 5691 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 5692 spa_load_l2cache(spa); 5693 spa->spa_l2cache.sav_sync = B_TRUE; 5694 } else if (vd != NULL && vd->vdev_islog) { 5695 ASSERT(!locked); 5696 ASSERT(vd == vd->vdev_top); 5697 5698 mg = vd->vdev_mg; 5699 5700 /* 5701 * Stop allocating from this vdev. 5702 */ 5703 metaslab_group_passivate(mg); 5704 5705 /* 5706 * Wait for the youngest allocations and frees to sync, 5707 * and then wait for the deferral of those frees to finish. 5708 */ 5709 spa_vdev_config_exit(spa, NULL, 5710 txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 5711 5712 /* 5713 * Attempt to evacuate the vdev. 
5714 */ 5715 error = spa_vdev_remove_evacuate(spa, vd); 5716 5717 txg = spa_vdev_config_enter(spa); 5718 5719 /* 5720 * If we couldn't evacuate the vdev, unwind. 5721 */ 5722 if (error) { 5723 metaslab_group_activate(mg); 5724 return (spa_vdev_exit(spa, NULL, txg, error)); 5725 } 5726 5727 /* 5728 * Clean up the vdev namespace. 5729 */ 5730 ev = spa_event_create(spa, vd, ESC_ZFS_VDEV_REMOVE_DEV); 5731 spa_vdev_remove_from_namespace(spa, vd); 5732 5733 } else if (vd != NULL) { 5734 /* 5735 * Normal vdevs cannot be removed (yet). 5736 */ 5737 error = SET_ERROR(ENOTSUP); 5738 } else { 5739 /* 5740 * There is no vdev of any kind with the specified guid. 5741 */ 5742 error = SET_ERROR(ENOENT); 5743 } 5744 5745 if (!locked) 5746 error = spa_vdev_exit(spa, NULL, txg, error); 5747 5748 if (ev) 5749 spa_event_post(ev); 5750 5751 return (error); 5752} 5753 5754/* 5755 * Find any device that's done replacing, or a vdev marked 'unspare' that's 5756 * currently spared, so we can detach it. 5757 */ 5758static vdev_t * 5759spa_vdev_resilver_done_hunt(vdev_t *vd) 5760{ 5761 vdev_t *newvd, *oldvd; 5762 5763 for (int c = 0; c < vd->vdev_children; c++) { 5764 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 5765 if (oldvd != NULL) 5766 return (oldvd); 5767 } 5768 5769 /* 5770 * Check for a completed replacement. We always consider the first 5771 * vdev in the list to be the oldest vdev, and the last one to be 5772 * the newest (see spa_vdev_attach() for how that works). In 5773 * the case where the newest vdev is faulted, we will not automatically 5774 * remove it after a resilver completes. This is OK as it will require 5775 * user intervention to determine which disk the admin wishes to keep. 5776 */ 5777 if (vd->vdev_ops == &vdev_replacing_ops) { 5778 ASSERT(vd->vdev_children > 1); 5779 5780 newvd = vd->vdev_child[vd->vdev_children - 1]; 5781 oldvd = vd->vdev_child[0]; 5782 5783 if (vdev_dtl_empty(newvd, DTL_MISSING) && 5784 vdev_dtl_empty(newvd, DTL_OUTAGE) && 5785 !vdev_dtl_required(oldvd)) 5786 return (oldvd); 5787 } 5788 5789 /* 5790 * Check for a completed resilver with the 'unspare' flag set. 5791 */ 5792 if (vd->vdev_ops == &vdev_spare_ops) { 5793 vdev_t *first = vd->vdev_child[0]; 5794 vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 5795 5796 if (last->vdev_unspare) { 5797 oldvd = first; 5798 newvd = last; 5799 } else if (first->vdev_unspare) { 5800 oldvd = last; 5801 newvd = first; 5802 } else { 5803 oldvd = NULL; 5804 } 5805 5806 if (oldvd != NULL && 5807 vdev_dtl_empty(newvd, DTL_MISSING) && 5808 vdev_dtl_empty(newvd, DTL_OUTAGE) && 5809 !vdev_dtl_required(oldvd)) 5810 return (oldvd); 5811 5812 /* 5813 * If there are more than two spares attached to a disk, 5814 * and those spares are not required, then we want to 5815 * attempt to free them up now so that they can be used 5816 * by other pools. Once we're back down to a single 5817 * disk+spare, we stop removing them. 
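 * E.g. for spare(disk, spare1, spare2), spare1 (child[1]) is detached
 * first, once the last child's DTLs are empty and spare1 itself is no
 * longer required.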
		 */
		if (vd->vdev_children > 2) {
			newvd = vd->vdev_child[1];

			if (newvd->vdev_isspare && last->vdev_isspare &&
			    vdev_dtl_empty(last, DTL_MISSING) &&
			    vdev_dtl_empty(last, DTL_OUTAGE) &&
			    !vdev_dtl_required(newvd))
				return (newvd);
		}
	}

	return (NULL);
}

static void
spa_vdev_resilver_done(spa_t *spa)
{
	vdev_t *vd, *pvd, *ppvd;
	uint64_t guid, sguid, pguid, ppguid;

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
		pvd = vd->vdev_parent;
		ppvd = pvd->vdev_parent;
		guid = vd->vdev_guid;
		pguid = pvd->vdev_guid;
		ppguid = ppvd->vdev_guid;
		sguid = 0;
		/*
		 * If we have just finished replacing a hot spared device, then
		 * we need to detach the parent's first child (the original hot
		 * spare) as well.
		 */
		if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
		    ppvd->vdev_children == 2) {
			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
			sguid = ppvd->vdev_child[1]->vdev_guid;
		}
		ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));

		spa_config_exit(spa, SCL_ALL, FTAG);
		if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
			return;
		if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
			return;
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	}

	spa_config_exit(spa, SCL_ALL, FTAG);
}

/*
 * Update the stored path or FRU for this vdev.
 */
int
spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
    boolean_t ispath)
{
	vdev_t *vd;
	boolean_t sync = B_FALSE;

	ASSERT(spa_writeable(spa));

	spa_vdev_state_enter(spa, SCL_ALL);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_state_exit(spa, NULL, ENOENT));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

	if (ispath) {
		if (strcmp(value, vd->vdev_path) != 0) {
			spa_strfree(vd->vdev_path);
			vd->vdev_path = spa_strdup(value);
			sync = B_TRUE;
		}
	} else {
		if (vd->vdev_fru == NULL) {
			vd->vdev_fru = spa_strdup(value);
			sync = B_TRUE;
		} else if (strcmp(value, vd->vdev_fru) != 0) {
			spa_strfree(vd->vdev_fru);
			vd->vdev_fru = spa_strdup(value);
			sync = B_TRUE;
		}
	}

	return (spa_vdev_state_exit(spa, sync ?
	    vd : NULL, 0));
}

int
spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
{
	return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
}

int
spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
{
	return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
}

/*
 * ==========================================================================
 * SPA Scanning
 * ==========================================================================
 */

int
spa_scan_stop(spa_t *spa)
{
	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
	if (dsl_scan_resilvering(spa->spa_dsl_pool))
		return (SET_ERROR(EBUSY));
	return (dsl_scan_cancel(spa->spa_dsl_pool));
}

int
spa_scan(spa_t *spa, pool_scan_func_t func)
{
	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);

	if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
		return (SET_ERROR(ENOTSUP));

	/*
	 * If a resilver was requested, but there is no DTL on a
	 * writeable leaf device, we have nothing to do.
	 */
	if (func == POOL_SCAN_RESILVER &&
	    !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
		spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
		return (0);
	}

	return (dsl_scan(spa->spa_dsl_pool, func));
}
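
/*
 * Illustrative sketch (hypothetical helper, not in the original source):
 * the two entry points above are all that "zpool scrub" needs -- one call
 * to start a scrub and one to cancel it.
 */
static int
example_scrub_cycle(spa_t *spa)
{
	int error;

	/* Equivalent of "zpool scrub <pool>". */
	if ((error = spa_scan(spa, POOL_SCAN_SCRUB)) != 0)
		return (error);

	/*
	 * Equivalent of "zpool scrub -s <pool>"; fails with EBUSY if the
	 * pool is resilvering rather than scrubbing.
	 */
	return (spa_scan_stop(spa));
}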

/*
 * ==========================================================================
 * SPA async task processing
 * ==========================================================================
 */

static void
spa_async_remove(spa_t *spa, vdev_t *vd)
{
	if (vd->vdev_remove_wanted) {
		vd->vdev_remove_wanted = B_FALSE;
		vd->vdev_delayed_close = B_FALSE;
		vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);

		/*
		 * We want to clear the stats, but we don't want to do a full
		 * vdev_clear() as that will cause us to throw away
		 * degraded/faulted state as well as attempt to reopen the
		 * device, all of which is a waste.
		 */
		vd->vdev_stat.vs_read_errors = 0;
		vd->vdev_stat.vs_write_errors = 0;
		vd->vdev_stat.vs_checksum_errors = 0;

		vdev_state_dirty(vd->vdev_top);
		/* Tell userspace that the vdev is gone. */
		zfs_post_remove(spa, vd);
	}

	for (int c = 0; c < vd->vdev_children; c++)
		spa_async_remove(spa, vd->vdev_child[c]);
}

static void
spa_async_probe(spa_t *spa, vdev_t *vd)
{
	if (vd->vdev_probe_wanted) {
		vd->vdev_probe_wanted = B_FALSE;
		vdev_reopen(vd);	/* vdev_open() does the actual probe */
	}

	for (int c = 0; c < vd->vdev_children; c++)
		spa_async_probe(spa, vd->vdev_child[c]);
}

static void
spa_async_autoexpand(spa_t *spa, vdev_t *vd)
{
	sysevent_id_t eid;
	nvlist_t *attr;
	char *physpath;

	if (!spa->spa_autoexpand)
		return;

	for (int c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];
		spa_async_autoexpand(spa, cvd);
	}

	if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
		return;

	physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
	(void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath);

	VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);

	(void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
	    ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP);

	nvlist_free(attr);
	kmem_free(physpath, MAXPATHLEN);
}

static void
spa_async_thread(void *arg)
{
	spa_t *spa = arg;
	int tasks;

	ASSERT(spa->spa_sync_on);

	mutex_enter(&spa->spa_async_lock);
	tasks = spa->spa_async_tasks;
	spa->spa_async_tasks &= SPA_ASYNC_REMOVE;
	mutex_exit(&spa->spa_async_lock);

	/*
	 * See if the config needs to be updated.
	 */
	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
		uint64_t old_space, new_space;

		mutex_enter(&spa_namespace_lock);
		old_space = metaslab_class_get_space(spa_normal_class(spa));
		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
		new_space = metaslab_class_get_space(spa_normal_class(spa));
		mutex_exit(&spa_namespace_lock);

		/*
		 * If the pool grew as a result of the config update,
		 * then log an internal history event.
		 */
		if (new_space != old_space) {
			spa_history_log_internal(spa, "vdev online", NULL,
			    "pool '%s' size: %llu(+%llu)",
			    spa_name(spa), new_space, new_space - old_space);
		}
	}

	if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
		spa_async_autoexpand(spa, spa->spa_root_vdev);
		spa_config_exit(spa, SCL_CONFIG, FTAG);
	}

	/*
	 * See if any devices need to be probed.
	 */
	if (tasks & SPA_ASYNC_PROBE) {
		spa_vdev_state_enter(spa, SCL_NONE);
		spa_async_probe(spa, spa->spa_root_vdev);
		(void) spa_vdev_state_exit(spa, NULL, 0);
	}

	/*
	 * If any devices are done replacing, detach them.
	 */
	if (tasks & SPA_ASYNC_RESILVER_DONE)
		spa_vdev_resilver_done(spa);

	/*
	 * Kick off a resilver.
	 */
	if (tasks & SPA_ASYNC_RESILVER)
		dsl_resilver_restart(spa->spa_dsl_pool, 0);

	/*
	 * Let the world know that we're done.
	 */
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_thread = NULL;
	cv_broadcast(&spa->spa_async_cv);
	mutex_exit(&spa->spa_async_lock);
	thread_exit();
}

static void
spa_async_thread_vd(void *arg)
{
	spa_t *spa = arg;
	int tasks;

	ASSERT(spa->spa_sync_on);

	mutex_enter(&spa->spa_async_lock);
	tasks = spa->spa_async_tasks;
retry:
	spa->spa_async_tasks &= ~SPA_ASYNC_REMOVE;
	mutex_exit(&spa->spa_async_lock);

	/*
	 * See if any devices need to be marked REMOVED.
	 */
	if (tasks & SPA_ASYNC_REMOVE) {
		spa_vdev_state_enter(spa, SCL_NONE);
		spa_async_remove(spa, spa->spa_root_vdev);
		for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
			spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
		for (int i = 0; i < spa->spa_spares.sav_count; i++)
			spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
		(void) spa_vdev_state_exit(spa, NULL, 0);
	}

	/*
	 * Let the world know that we're done.
	 */
	mutex_enter(&spa->spa_async_lock);
	tasks = spa->spa_async_tasks;
	if ((tasks & SPA_ASYNC_REMOVE) != 0)
		goto retry;
	spa->spa_async_thread_vd = NULL;
	cv_broadcast(&spa->spa_async_cv);
	mutex_exit(&spa->spa_async_lock);
	thread_exit();
}

void
spa_async_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_suspended++;
	while (spa->spa_async_thread != NULL &&
	    spa->spa_async_thread_vd != NULL)
		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
	mutex_exit(&spa->spa_async_lock);
}

void
spa_async_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	ASSERT(spa->spa_async_suspended != 0);
	spa->spa_async_suspended--;
	mutex_exit(&spa->spa_async_lock);
}

static boolean_t
spa_async_tasks_pending(spa_t *spa)
{
	uint_t non_config_tasks;
	uint_t config_task;
	boolean_t config_task_suspended;

	non_config_tasks = spa->spa_async_tasks & ~(SPA_ASYNC_CONFIG_UPDATE |
	    SPA_ASYNC_REMOVE);
	config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE;
	if (spa->spa_ccw_fail_time == 0) {
		config_task_suspended = B_FALSE;
	} else {
		config_task_suspended =
		    (gethrtime() - spa->spa_ccw_fail_time) <
		    (zfs_ccw_retry_interval * NANOSEC);
	}

	return (non_config_tasks || (config_task && !config_task_suspended));
}

static void
spa_async_dispatch(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	if (spa_async_tasks_pending(spa) &&
	    !spa->spa_async_suspended &&
	    spa->spa_async_thread == NULL &&
	    rootdir != NULL)
		spa->spa_async_thread = thread_create(NULL, 0,
		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
	mutex_exit(&spa->spa_async_lock);
}

static void
spa_async_dispatch_vd(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	if ((spa->spa_async_tasks & SPA_ASYNC_REMOVE) != 0 &&
	    !spa->spa_async_suspended &&
	    spa->spa_async_thread_vd == NULL &&
	    rootdir != NULL)
		spa->spa_async_thread_vd = thread_create(NULL, 0,
		    spa_async_thread_vd, spa, 0, &p0, TS_RUN, maxclsyspri);
	mutex_exit(&spa->spa_async_lock);
}

void
spa_async_request(spa_t *spa, int task)
{
	zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_tasks |= task;
	mutex_exit(&spa->spa_async_lock);
	spa_async_dispatch_vd(spa);
}
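
/*
 * Illustrative sketch (hypothetical caller, not in the original source):
 * the async mechanism is fire-and-forget.  A subsystem that notices a disk
 * has gone away only has to set the per-vdev flag and request the task;
 * the dedicated vdev thread walks the tree and does the rest.
 */
static void
example_flag_removal(spa_t *spa, vdev_t *vd)
{
	vd->vdev_remove_wanted = B_TRUE;
	spa_async_request(spa, SPA_ASYNC_REMOVE);
}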

/*
 * ==========================================================================
 * SPA syncing routines
 * ==========================================================================
 */

static int
bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	bpobj_t *bpo = arg;
	bpobj_enqueue(bpo, bp, tx);
	return (0);
}

static int
spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	zio_t *zio = arg;

	zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
	    BP_GET_PSIZE(bp), zio->io_flags));
	return (0);
}

/*
 * Note: this simple function is not inlined to make it easier to dtrace the
 * amount of time spent syncing frees.
 */
static void
spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
{
	zio_t *zio = zio_root(spa, NULL, NULL, 0);
	bplist_iterate(bpl, spa_free_sync_cb, zio, tx);
	VERIFY(zio_wait(zio) == 0);
}

/*
 * Note: this simple function is not inlined to make it easier to dtrace the
 * amount of time spent syncing deferred frees.
 */
static void
spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
{
	zio_t *zio = zio_root(spa, NULL, NULL, 0);
	VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
	    spa_free_sync_cb, zio, tx), ==, 0);
	VERIFY0(zio_wait(zio));
}

static void
spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
{
	char *packed = NULL;
	size_t bufsize;
	size_t nvsize = 0;
	dmu_buf_t *db;

	VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);

	/*
	 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
	 * information.  This avoids the dmu_buf_will_dirty() path and
	 * saves us a pre-read to get data we don't actually care about.
	 */
	bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
	packed = kmem_alloc(bufsize, KM_SLEEP);

	VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
	    KM_SLEEP) == 0);
	bzero(packed + nvsize, bufsize - nvsize);

	dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);

	kmem_free(packed, bufsize);

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	*(uint64_t *)db->db_data = nvsize;
	dmu_buf_rele(db, FTAG);
}
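
/*
 * Worked example (values assumed for illustration): with
 * SPA_CONFIG_BLOCKSIZE equal to 1 << 14 (16K), spa_sync_nvlist() pads a
 * 9,000-byte packed nvlist to one full block and a 20,000-byte one to two:
 *
 *	P2ROUNDUP(9000, 16384)  == 16384
 *	P2ROUNDUP(20000, 16384) == 32768
 *
 * so dmu_write() always covers whole blocks and never takes the
 * read-modify-write path for a partial block.
 */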
6322 */ 6323 if (sav->sav_object == 0) { 6324 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 6325 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 6326 sizeof (uint64_t), tx); 6327 VERIFY(zap_update(spa->spa_meta_objset, 6328 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 6329 &sav->sav_object, tx) == 0); 6330 } 6331 6332 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 6333 if (sav->sav_count == 0) { 6334 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 6335 } else { 6336 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 6337 for (i = 0; i < sav->sav_count; i++) 6338 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 6339 B_FALSE, VDEV_CONFIG_L2CACHE); 6340 VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 6341 sav->sav_count) == 0); 6342 for (i = 0; i < sav->sav_count; i++) 6343 nvlist_free(list[i]); 6344 kmem_free(list, sav->sav_count * sizeof (void *)); 6345 } 6346 6347 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 6348 nvlist_free(nvroot); 6349 6350 sav->sav_sync = B_FALSE; 6351} 6352 6353static void 6354spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 6355{ 6356 nvlist_t *config; 6357 6358 if (list_is_empty(&spa->spa_config_dirty_list)) 6359 return; 6360 6361 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6362 6363 config = spa_config_generate(spa, spa->spa_root_vdev, 6364 dmu_tx_get_txg(tx), B_FALSE); 6365 6366 /* 6367 * If we're upgrading the spa version then make sure that 6368 * the config object gets updated with the correct version. 6369 */ 6370 if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) 6371 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 6372 spa->spa_uberblock.ub_version); 6373 6374 spa_config_exit(spa, SCL_STATE, FTAG); 6375 6376 nvlist_free(spa->spa_config_syncing); 6377 spa->spa_config_syncing = config; 6378 6379 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 6380} 6381 6382static void 6383spa_sync_version(void *arg, dmu_tx_t *tx) 6384{ 6385 uint64_t *versionp = arg; 6386 uint64_t version = *versionp; 6387 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6388 6389 /* 6390 * Setting the version is special cased when first creating the pool. 6391 */ 6392 ASSERT(tx->tx_txg != TXG_INITIAL); 6393 6394 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 6395 ASSERT(version >= spa_version(spa)); 6396 6397 spa->spa_uberblock.ub_version = version; 6398 vdev_config_dirty(spa->spa_root_vdev); 6399 spa_history_log_internal(spa, "set", tx, "version=%lld", version); 6400} 6401 6402/* 6403 * Set zpool properties. 6404 */ 6405static void 6406spa_sync_props(void *arg, dmu_tx_t *tx) 6407{ 6408 nvlist_t *nvp = arg; 6409 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6410 objset_t *mos = spa->spa_meta_objset; 6411 nvpair_t *elem = NULL; 6412 6413 mutex_enter(&spa->spa_props_lock); 6414 6415 while ((elem = nvlist_next_nvpair(nvp, elem))) { 6416 uint64_t intval; 6417 char *strval, *fname; 6418 zpool_prop_t prop; 6419 const char *propname; 6420 zprop_type_t proptype; 6421 spa_feature_t fid; 6422 6423 switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 6424 case ZPROP_INVAL: 6425 /* 6426 * We checked this earlier in spa_prop_validate(). 
6427 */ 6428 ASSERT(zpool_prop_feature(nvpair_name(elem))); 6429 6430 fname = strchr(nvpair_name(elem), '@') + 1; 6431 VERIFY0(zfeature_lookup_name(fname, &fid)); 6432 6433 spa_feature_enable(spa, fid, tx); 6434 spa_history_log_internal(spa, "set", tx, 6435 "%s=enabled", nvpair_name(elem)); 6436 break; 6437 6438 case ZPOOL_PROP_VERSION: 6439 intval = fnvpair_value_uint64(elem); 6440 /* 6441 * The version is synced seperatly before other 6442 * properties and should be correct by now. 6443 */ 6444 ASSERT3U(spa_version(spa), >=, intval); 6445 break; 6446 6447 case ZPOOL_PROP_ALTROOT: 6448 /* 6449 * 'altroot' is a non-persistent property. It should 6450 * have been set temporarily at creation or import time. 6451 */ 6452 ASSERT(spa->spa_root != NULL); 6453 break; 6454 6455 case ZPOOL_PROP_READONLY: 6456 case ZPOOL_PROP_CACHEFILE: 6457 /* 6458 * 'readonly' and 'cachefile' are also non-persisitent 6459 * properties. 6460 */ 6461 break; 6462 case ZPOOL_PROP_COMMENT: 6463 strval = fnvpair_value_string(elem); 6464 if (spa->spa_comment != NULL) 6465 spa_strfree(spa->spa_comment); 6466 spa->spa_comment = spa_strdup(strval); 6467 /* 6468 * We need to dirty the configuration on all the vdevs 6469 * so that their labels get updated. It's unnecessary 6470 * to do this for pool creation since the vdev's 6471 * configuratoin has already been dirtied. 6472 */ 6473 if (tx->tx_txg != TXG_INITIAL) 6474 vdev_config_dirty(spa->spa_root_vdev); 6475 spa_history_log_internal(spa, "set", tx, 6476 "%s=%s", nvpair_name(elem), strval); 6477 break; 6478 default: 6479 /* 6480 * Set pool property values in the poolprops mos object. 6481 */ 6482 if (spa->spa_pool_props_object == 0) { 6483 spa->spa_pool_props_object = 6484 zap_create_link(mos, DMU_OT_POOL_PROPS, 6485 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 6486 tx); 6487 } 6488 6489 /* normalize the property name */ 6490 propname = zpool_prop_to_name(prop); 6491 proptype = zpool_prop_get_type(prop); 6492 6493 if (nvpair_type(elem) == DATA_TYPE_STRING) { 6494 ASSERT(proptype == PROP_TYPE_STRING); 6495 strval = fnvpair_value_string(elem); 6496 VERIFY0(zap_update(mos, 6497 spa->spa_pool_props_object, propname, 6498 1, strlen(strval) + 1, strval, tx)); 6499 spa_history_log_internal(spa, "set", tx, 6500 "%s=%s", nvpair_name(elem), strval); 6501 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 6502 intval = fnvpair_value_uint64(elem); 6503 6504 if (proptype == PROP_TYPE_INDEX) { 6505 const char *unused; 6506 VERIFY0(zpool_prop_index_to_string( 6507 prop, intval, &unused)); 6508 } 6509 VERIFY0(zap_update(mos, 6510 spa->spa_pool_props_object, propname, 6511 8, 1, &intval, tx)); 6512 spa_history_log_internal(spa, "set", tx, 6513 "%s=%lld", nvpair_name(elem), intval); 6514 } else { 6515 ASSERT(0); /* not allowed */ 6516 } 6517 6518 switch (prop) { 6519 case ZPOOL_PROP_DELEGATION: 6520 spa->spa_delegation = intval; 6521 break; 6522 case ZPOOL_PROP_BOOTFS: 6523 spa->spa_bootfs = intval; 6524 break; 6525 case ZPOOL_PROP_FAILUREMODE: 6526 spa->spa_failmode = intval; 6527 break; 6528 case ZPOOL_PROP_AUTOEXPAND: 6529 spa->spa_autoexpand = intval; 6530 if (tx->tx_txg != TXG_INITIAL) 6531 spa_async_request(spa, 6532 SPA_ASYNC_AUTOEXPAND); 6533 break; 6534 case ZPOOL_PROP_DEDUPDITTO: 6535 spa->spa_dedup_ditto = intval; 6536 break; 6537 default: 6538 break; 6539 } 6540 } 6541 6542 } 6543 6544 mutex_exit(&spa->spa_props_lock); 6545} 6546 6547/* 6548 * Perform one-time upgrade on-disk changes. 

/*
 * Perform one-time upgrade on-disk changes.  spa_version() does not
 * reflect the new version this txg, so there must be no changes this
 * txg to anything that the upgrade code depends on after it executes.
 * Therefore this must be called after dsl_pool_sync() does the sync
 * tasks.
 */
static void
spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;

	ASSERT(spa->spa_sync_pass == 1);

	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);

	if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
		dsl_pool_create_origin(dp, tx);

		/* Keeping the origin open increases spa_minref */
		spa->spa_minref += 3;
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
		dsl_pool_upgrade_clones(dp, tx);
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
		dsl_pool_upgrade_dir_clones(dp, tx);

		/* Keeping the freedir open increases spa_minref */
		spa->spa_minref += 3;
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
		spa_feature_create_zap_objects(spa, tx);
	}

	/*
	 * The LZ4_COMPRESS feature's behavior was changed to
	 * activate_on_enable when the ability to use lz4 compression for
	 * metadata was added.  Old pools that have this feature enabled
	 * must be upgraded to have this feature active.
	 */
	if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
		boolean_t lz4_en = spa_feature_is_enabled(spa,
		    SPA_FEATURE_LZ4_COMPRESS);
		boolean_t lz4_ac = spa_feature_is_active(spa,
		    SPA_FEATURE_LZ4_COMPRESS);

		if (lz4_en && !lz4_ac)
			spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
	}

	/*
	 * If we haven't written the salt, do so now.  Note that the
	 * feature may not be activated yet, but that's fine since
	 * the presence of this ZAP entry is backwards compatible.
	 */
	if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_CHECKSUM_SALT) == ENOENT) {
		VERIFY0(zap_add(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1,
		    sizeof (spa->spa_cksum_salt.zcs_bytes),
		    spa->spa_cksum_salt.zcs_bytes, tx));
	}

	rrw_exit(&dp->dp_config_rwlock, FTAG);
}
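
/*
 * Illustrative sketch (hypothetical helper): the enabled-but-not-active
 * check used for LZ4_COMPRESS above generalizes to any activate-on-enable
 * feature.
 */
static void
example_activate_feature(spa_t *spa, spa_feature_t feat, dmu_tx_t *tx)
{
	if (spa_feature_is_enabled(spa, feat) &&
	    !spa_feature_is_active(spa, feat))
		spa_feature_incr(spa, feat, tx);	/* refcount > 0 == active */
}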

/*
 * Sync the specified transaction group.  New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	objset_t *mos = spa->spa_meta_objset;
	bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;
	dmu_tx_t *tx;
	int error;
	uint32_t max_queue_depth = zfs_vdev_async_write_max_active *
	    zfs_vdev_queue_depth_pct / 100;

	VERIFY(spa_writeable(spa));

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	mutex_enter(&spa->spa_alloc_lock);
	VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
	mutex_exit(&spa->spa_alloc_lock);

	/*
	 * If there are any pending vdev state changes, convert them
	 * into config changes that go out with this transaction group.
	 */
	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	while (list_head(&spa->spa_state_dirty_list) != NULL) {
		/*
		 * We need the write lock here because, for aux vdevs,
		 * calling vdev_config_dirty() modifies sav_config.
		 * This is ugly and will become unnecessary when we
		 * eliminate the aux vdev wart by integrating all vdevs
		 * into the root vdev tree.
		 */
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
		while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
			vdev_state_clean(vd);
			vdev_config_dirty(vd);
		}
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
	}
	spa_config_exit(spa, SCL_STATE, FTAG);

	tx = dmu_tx_create_assigned(dp, txg);

	spa->spa_sync_starttime = gethrtime();
#ifdef illumos
	VERIFY(cyclic_reprogram(spa->spa_deadman_cycid,
	    spa->spa_sync_starttime + spa->spa_deadman_synctime));
#else	/* !illumos */
#ifdef _KERNEL
	callout_schedule(&spa->spa_deadman_cycid,
	    hz * spa->spa_deadman_synctime / NANOSEC);
#endif
#endif	/* illumos */

	/*
	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
	 * set spa_deflate if we have no raid-z vdevs.
	 */
	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
		int i;

		for (i = 0; i < rvd->vdev_children; i++) {
			vd = rvd->vdev_child[i];
			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
				break;
		}
		if (i == rvd->vdev_children) {
			spa->spa_deflate = TRUE;
			VERIFY(0 == zap_add(spa->spa_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
		}
	}

	/*
	 * Set the top-level vdev's max queue depth.  Evaluate each
	 * top-level's async write queue depth in case it changed.
	 * The max queue depth will not change in the middle of syncing
	 * out this txg.
	 */
	uint64_t queue_depth_total = 0;
	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (mg == NULL || mg->mg_class != spa_normal_class(spa) ||
		    !metaslab_group_initialized(mg))
			continue;

		/*
		 * It is safe to do a lock-free check here because only async
		 * allocations look at mg_max_alloc_queue_depth, and async
		 * allocations all happen from spa_sync().
		 */
		ASSERT0(refcount_count(&mg->mg_alloc_queue_depth));
		mg->mg_max_alloc_queue_depth = max_queue_depth;
		queue_depth_total += mg->mg_max_alloc_queue_depth;
	}
	metaslab_class_t *mc = spa_normal_class(spa);
	ASSERT0(refcount_count(&mc->mc_alloc_slots));
	mc->mc_alloc_max_slots = queue_depth_total;
	mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;

	ASSERT3U(mc->mc_alloc_max_slots, <=,
	    max_queue_depth * rvd->vdev_children);
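
	/*
	 * Worked example (assuming the default tunables of
	 * zfs_vdev_async_write_max_active == 10 and
	 * zfs_vdev_queue_depth_pct == 1000): max_queue_depth is
	 * 10 * 1000 / 100 == 100 queued allocations per top-level vdev,
	 * and mc_alloc_max_slots is 100 times the number of initialized,
	 * normal-class top-level vdevs -- 400 slots for a 4-vdev pool.
	 */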

	/*
	 * Iterate to convergence.
	 */
	do {
		int pass = ++spa->spa_sync_pass;

		spa_sync_config_object(spa, tx);
		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
		spa_errlog_sync(spa, txg);
		dsl_pool_sync(dp, txg);

		if (pass < zfs_sync_pass_deferred_free) {
			spa_sync_frees(spa, free_bpl, tx);
		} else {
			/*
			 * We can not defer frees in pass 1, because
			 * we sync the deferred frees later in pass 1.
			 */
			ASSERT3U(pass, >, 1);
			bplist_iterate(free_bpl, bpobj_enqueue_cb,
			    &spa->spa_deferred_bpobj, tx);
		}

		ddt_sync(spa, txg);
		dsl_scan_sync(dp, tx);

		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
			vdev_sync(vd, txg);

		if (pass == 1) {
			spa_sync_upgrades(spa, tx);
			ASSERT3U(txg, >=,
			    spa->spa_uberblock.ub_rootbp.blk_birth);
			/*
			 * Note: We need to check if the MOS is dirty
			 * because we could have marked the MOS dirty
			 * without updating the uberblock (e.g. if we
			 * have sync tasks but no dirty user data).  We
			 * need to check the uberblock's rootbp because
			 * it is updated if we have synced out dirty
			 * data (though in this case the MOS will most
			 * likely also be dirty due to second order
			 * effects, we don't want to rely on that here).
			 */
			if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
			    !dmu_objset_is_dirty(mos, txg)) {
				/*
				 * Nothing changed on the first pass,
				 * therefore this TXG is a no-op.  Avoid
				 * syncing deferred frees, so that we
				 * can keep this TXG as a no-op.
				 */
				ASSERT(txg_list_empty(&dp->dp_dirty_datasets,
				    txg));
				ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
				ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
				break;
			}
			spa_sync_deferred_frees(spa, tx);
		}

	} while (dmu_objset_is_dirty(mos, txg));
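
	/*
	 * Worked example (assuming the default zfs_sync_pass_deferred_free
	 * of 2): frees discovered in pass 1 are issued immediately through
	 * spa_sync_frees(); from pass 2 onward they are only appended to
	 * the deferred bpobj, so the late passes dirty little data and the
	 * loop above converges:
	 *
	 *	pass 1:  1 < 2  -> spa_sync_frees()
	 *	pass 2+: n >= 2 -> bplist_iterate(..., bpobj_enqueue_cb, ...)
	 */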

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 *
	 * If there are no dirty vdevs, we sync the uberblock to a few
	 * random top-level vdevs that are known to be visible in the
	 * config cache (see spa_vdev_add() for a complete description).
	 * If there *are* dirty vdevs, sync the uberblock to all vdevs.
	 */
	for (;;) {
		/*
		 * We hold SCL_STATE to prevent vdev open/close/etc.
		 * while we're attempting to write the vdev labels.
		 */
		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

		if (list_is_empty(&spa->spa_config_dirty_list)) {
			vdev_t *svd[SPA_DVAS_PER_BP];
			int svdcount = 0;
			int children = rvd->vdev_children;
			int c0 = spa_get_random(children);

			for (int c = 0; c < children; c++) {
				vd = rvd->vdev_child[(c0 + c) % children];
				if (vd->vdev_ms_array == 0 || vd->vdev_islog)
					continue;
				svd[svdcount++] = vd;
				if (svdcount == SPA_DVAS_PER_BP)
					break;
			}
			error = vdev_config_sync(svd, svdcount, txg);
		} else {
			error = vdev_config_sync(rvd->vdev_child,
			    rvd->vdev_children, txg);
		}

		if (error == 0)
			spa->spa_last_synced_guid = rvd->vdev_guid;

		spa_config_exit(spa, SCL_STATE, FTAG);

		if (error == 0)
			break;
		zio_suspend(spa, NULL);
		zio_resume_wait(spa);
	}
	dmu_tx_commit(tx);

#ifdef illumos
	VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
#else	/* !illumos */
#ifdef _KERNEL
	callout_drain(&spa->spa_deadman_cycid);
#endif
#endif	/* illumos */

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	dsl_pool_sync_done(dp, txg);

	mutex_enter(&spa->spa_alloc_lock);
	VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
	mutex_exit(&spa->spa_alloc_lock);

	/*
	 * Update usable space statistics.
	 */
	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
		vdev_sync_done(vd, txg);

	spa_update_dspace(spa);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since vdev_config_sync().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));

	spa->spa_sync_pass = 0;

	/*
	 * Update the last synced uberblock here.  We want to do this at
	 * the end of spa_sync() so that consumers of spa_last_synced_txg()
	 * will be guaranteed that all the processing associated with
	 * that txg has been completed.
	 */
	spa->spa_ubsync = spa->spa_uberblock;
	spa_config_exit(spa, SCL_CONFIG, FTAG);

	spa_handle_ignored_writes(spa);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
	spa_async_dispatch_vd(spa);
}
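
/*
 * Worked example for spa_sync()'s no-dirty-vdevs fallback above (values
 * assumed for illustration): with 5 top-level vdevs, SPA_DVAS_PER_BP == 3,
 * and a random start of c0 == 3, the children are visited in the order
 * 3, 4, 0, 1, 2; the first three that are neither holes
 * (vdev_ms_array == 0) nor slogs receive the uberblock, spreading label
 * writes across the pool over time.
 */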

/*
 * Sync all pools.  We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during the
 * sync.
 */
void
spa_sync_allpools(void)
{
	spa_t *spa = NULL;
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(spa)) != NULL) {
		if (spa_state(spa) != POOL_STATE_ACTIVE ||
		    !spa_writeable(spa) || spa_suspended(spa))
			continue;
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		txg_wait_synced(spa_get_dsl(spa), 0);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);
	}
	mutex_exit(&spa_namespace_lock);
}

/*
 * ==========================================================================
 * Miscellaneous routines
 * ==========================================================================
 */

/*
 * Remove all pools in the system.
 */
void
spa_evict_all(void)
{
	spa_t *spa;

	/*
	 * Remove all cached state.  All pools should be closed now,
	 * so every spa in the AVL tree should be unreferenced.
	 */
	mutex_enter(&spa_namespace_lock);
	while ((spa = spa_next(NULL)) != NULL) {
		/*
		 * Stop async tasks.  The async thread may need to detach
		 * a device that's been replaced, which requires grabbing
		 * spa_namespace_lock, so we must drop it here.
		 */
		spa_open_ref(spa, FTAG);
		mutex_exit(&spa_namespace_lock);
		spa_async_suspend(spa);
		mutex_enter(&spa_namespace_lock);
		spa_close(spa, FTAG);

		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
			spa_unload(spa);
			spa_deactivate(spa);
		}
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);
}

vdev_t *
spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
{
	vdev_t *vd;
	int i;

	if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
		return (vd);

	if (aux) {
		for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
			vd = spa->spa_l2cache.sav_vdevs[i];
			if (vd->vdev_guid == guid)
				return (vd);
		}

		for (i = 0; i < spa->spa_spares.sav_count; i++) {
			vd = spa->spa_spares.sav_vdevs[i];
			if (vd->vdev_guid == guid)
				return (vd);
		}
	}

	return (NULL);
}

void
spa_upgrade(spa_t *spa, uint64_t version)
{
	ASSERT(spa_writeable(spa));

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * This should only be called for a non-faulted pool, and since a
	 * future version would result in an unopenable pool, this shouldn't be
	 * possible.
	 */
	ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
	ASSERT3U(version, >=, spa->spa_uberblock.ub_version);

	spa->spa_uberblock.ub_version = version;
	vdev_config_dirty(spa->spa_root_vdev);

	spa_config_exit(spa, SCL_ALL, FTAG);

	txg_wait_synced(spa_get_dsl(spa), 0);
}

boolean_t
spa_has_spare(spa_t *spa, uint64_t guid)
{
	int i;
	uint64_t spareguid;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++)
		if (sav->sav_vdevs[i]->vdev_guid == guid)
			return (B_TRUE);

	for (i = 0; i < sav->sav_npending; i++) {
		if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
		    &spareguid) == 0 && spareguid == guid)
			return (B_TRUE);
	}

	return (B_FALSE);
}
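
/*
 * Illustrative sketch (hypothetical helper): passing aux == B_TRUE makes
 * spa_lookup_by_guid() search the L2ARC and spare lists as well, so a
 * caller can classify a guid that did not match any pool vdev.
 */
static boolean_t
example_guid_is_aux(spa_t *spa, uint64_t guid)
{
	return (spa_lookup_by_guid(spa, guid, B_FALSE) == NULL &&
	    spa_lookup_by_guid(spa, guid, B_TRUE) != NULL);
}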

/*
 * Check if a pool has an active shared spare device.
 * Note: the reference count of an active spare is 2, once as a spare and
 * once as a replacing vdev.
 */
static boolean_t
spa_has_active_shared_spare(spa_t *spa)
{
	int i, refcnt;
	uint64_t pool;
	spa_aux_vdev_t *sav = &spa->spa_spares;

	for (i = 0; i < sav->sav_count; i++) {
		if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
		    &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
		    refcnt > 2)
			return (B_TRUE);
	}

	return (B_FALSE);
}

static sysevent_t *
spa_event_create(spa_t *spa, vdev_t *vd, const char *name)
{
	sysevent_t *ev = NULL;
#ifdef _KERNEL
	sysevent_attr_list_t *attr = NULL;
	sysevent_value_t value;

	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
	    SE_SLEEP);
	ASSERT(ev != NULL);

	value.value_type = SE_DATA_TYPE_STRING;
	value.value.sv_string = spa_name(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
		goto done;

	value.value_type = SE_DATA_TYPE_UINT64;
	value.value.sv_uint64 = spa_guid(spa);
	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
		goto done;

	if (vd) {
		value.value_type = SE_DATA_TYPE_UINT64;
		value.value.sv_uint64 = vd->vdev_guid;
		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
		    SE_SLEEP) != 0)
			goto done;

		if (vd->vdev_path) {
			value.value_type = SE_DATA_TYPE_STRING;
			value.value.sv_string = vd->vdev_path;
			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
			    &value, SE_SLEEP) != 0)
				goto done;
		}
	}

	if (sysevent_attach_attributes(ev, attr) != 0)
		goto done;
	attr = NULL;

done:
	if (attr)
		sysevent_free_attr(attr);

#endif
	return (ev);
}

static void
spa_event_post(sysevent_t *ev)
{
#ifdef _KERNEL
	sysevent_id_t eid;

	(void) log_sysevent(ev, SE_SLEEP, &eid);
	sysevent_free(ev);
#endif
}

/*
 * Post a sysevent corresponding to the given event.  The 'name' must be one of
 * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
 * filled in from the spa and (optionally) the vdev.  This doesn't do anything
 * in the userland libzpool, as we don't want consumers to misinterpret ztest
 * or zdb as real changes.
 */
void
spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
{
	spa_event_post(spa_event_create(spa, vd, name));
}
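
/*
 * Illustrative usage (event name chosen as an example): elsewhere in ZFS a
 * notification is a single call, e.g.
 *
 *	spa_event_notify(spa, vd, ESC_ZFS_VDEV_CHECK);
 *
 * which, per the comment above, is a no-op in userland builds.
 */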