1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd 22168404Spjd/* 23219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24249195Smm * Copyright (c) 2013 by Delphix. All rights reserved. 25249188Smm * Copyright 2013 Nexenta Systems, Inc. All rights reserved. 26247265Smm * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved. 27168404Spjd */ 28168404Spjd 29168404Spjd/* 30251629Sdelphij * SPA: Storage Pool Allocator 31251629Sdelphij * 32168404Spjd * This file contains all the routines used when modifying on-disk SPA state. 33168404Spjd * This includes opening, importing, destroying, exporting a pool, and syncing a 34168404Spjd * pool. 
35168404Spjd */ 36168404Spjd 37168404Spjd#include <sys/zfs_context.h> 38168404Spjd#include <sys/fm/fs/zfs.h> 39168404Spjd#include <sys/spa_impl.h> 40168404Spjd#include <sys/zio.h> 41168404Spjd#include <sys/zio_checksum.h> 42168404Spjd#include <sys/dmu.h> 43168404Spjd#include <sys/dmu_tx.h> 44168404Spjd#include <sys/zap.h> 45168404Spjd#include <sys/zil.h> 46219089Spjd#include <sys/ddt.h> 47168404Spjd#include <sys/vdev_impl.h> 48168404Spjd#include <sys/metaslab.h> 49219089Spjd#include <sys/metaslab_impl.h> 50168404Spjd#include <sys/uberblock_impl.h> 51168404Spjd#include <sys/txg.h> 52168404Spjd#include <sys/avl.h> 53168404Spjd#include <sys/dmu_traverse.h> 54168404Spjd#include <sys/dmu_objset.h> 55168404Spjd#include <sys/unique.h> 56168404Spjd#include <sys/dsl_pool.h> 57168404Spjd#include <sys/dsl_dataset.h> 58168404Spjd#include <sys/dsl_dir.h> 59168404Spjd#include <sys/dsl_prop.h> 60168404Spjd#include <sys/dsl_synctask.h> 61168404Spjd#include <sys/fs/zfs.h> 62185029Spjd#include <sys/arc.h> 63168404Spjd#include <sys/callb.h> 64185029Spjd#include <sys/spa_boot.h> 65219089Spjd#include <sys/zfs_ioctl.h> 66219089Spjd#include <sys/dsl_scan.h> 67248571Smm#include <sys/dmu_send.h> 68248571Smm#include <sys/dsl_destroy.h> 69248571Smm#include <sys/dsl_userhold.h> 70236884Smm#include <sys/zfeature.h> 71219089Spjd#include <sys/zvol.h> 72240868Spjd#include <sys/trim_map.h> 73168404Spjd 74219089Spjd#ifdef _KERNEL 75219089Spjd#include <sys/callb.h> 76219089Spjd#include <sys/cpupart.h> 77219089Spjd#include <sys/zone.h> 78219089Spjd#endif /* _KERNEL */ 79219089Spjd 80185029Spjd#include "zfs_prop.h" 81185029Spjd#include "zfs_comutil.h" 82168404Spjd 83204073Spjd/* Check hostid on import? 
*/ 84204073Spjdstatic int check_hostid = 1; 85204073Spjd 86204073SpjdSYSCTL_DECL(_vfs_zfs); 87204073SpjdTUNABLE_INT("vfs.zfs.check_hostid", &check_hostid); 88204073SpjdSYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RW, &check_hostid, 0, 89204073Spjd "Check hostid on import?"); 90204073Spjd 91251636Sdelphij/* 92251636Sdelphij * The interval, in seconds, at which failed configuration cache file writes 93251636Sdelphij * should be retried. 94251636Sdelphij */ 95251636Sdelphijstatic int zfs_ccw_retry_interval = 300; 96251636Sdelphij 97219089Spjdtypedef enum zti_modes { 98209962Smm zti_mode_fixed, /* value is # of threads (min 1) */ 99209962Smm zti_mode_online_percent, /* value is % of online CPUs */ 100219089Spjd zti_mode_batch, /* cpu-intensive; value is ignored */ 101211931Smm zti_mode_null, /* don't create a taskq */ 102209962Smm zti_nmodes 103219089Spjd} zti_modes_t; 104168712Spjd 105211931Smm#define ZTI_FIX(n) { zti_mode_fixed, (n) } 106211931Smm#define ZTI_PCT(n) { zti_mode_online_percent, (n) } 107219089Spjd#define ZTI_BATCH { zti_mode_batch, 0 } 108211931Smm#define ZTI_NULL { zti_mode_null, 0 } 109209962Smm 110211931Smm#define ZTI_ONE ZTI_FIX(1) 111209962Smm 112209962Smmtypedef struct zio_taskq_info { 113211931Smm enum zti_modes zti_mode; 114211931Smm uint_t zti_value; 115209962Smm} zio_taskq_info_t; 116209962Smm 117209962Smmstatic const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { 118219089Spjd "issue", "issue_high", "intr", "intr_high" 119209962Smm}; 120209962Smm 121211931Smm/* 122211931Smm * Define the taskq threads for the following I/O types: 123211931Smm * NULL, READ, WRITE, FREE, CLAIM, and IOCTL 124211931Smm */ 125211931Smmconst zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { 126211931Smm /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ 127211931Smm { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, 128219089Spjd { ZTI_FIX(8), ZTI_NULL, ZTI_BATCH, ZTI_NULL }, 129219089Spjd { ZTI_BATCH, ZTI_FIX(5), ZTI_FIX(8), ZTI_FIX(5) }, 130219089Spjd { 
ZTI_FIX(100), ZTI_NULL, ZTI_ONE, ZTI_NULL }, 131211931Smm { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, 132211931Smm { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, 133209962Smm}; 134209962Smm 135248571Smmstatic void spa_sync_version(void *arg, dmu_tx_t *tx); 136248571Smmstatic void spa_sync_props(void *arg, dmu_tx_t *tx); 137185029Spjdstatic boolean_t spa_has_active_shared_spare(spa_t *spa); 138219089Spjdstatic int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config, 139219089Spjd spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, 140219089Spjd char **ereport); 141219089Spjdstatic void spa_vdev_resilver_done(spa_t *spa); 142185029Spjd 143219089Spjduint_t zio_taskq_batch_pct = 100; /* 1 thread per cpu in pset */ 144219089Spjd#ifdef PSRSET_BIND 145219089Spjdid_t zio_taskq_psrset_bind = PS_NONE; 146219089Spjd#endif 147219089Spjd#ifdef SYSDC 148219089Spjdboolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ 149219089Spjd#endif 150219089Spjduint_t zio_taskq_basedc = 80; /* base duty cycle */ 151219089Spjd 152219089Spjdboolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ 153243503Smmextern int zfs_sync_pass_deferred_free; 154219089Spjd 155247265Smm#ifndef illumos 156247265Smmextern void spa_deadman(void *arg); 157247265Smm#endif 158247265Smm 159168404Spjd/* 160219089Spjd * This (illegal) pool name is used when temporarily importing a spa_t in order 161219089Spjd * to get the vdev stats associated with the imported devices. 162219089Spjd */ 163219089Spjd#define TRYIMPORT_NAME "$import" 164219089Spjd 165219089Spjd/* 166168404Spjd * ========================================================================== 167185029Spjd * SPA properties routines 168185029Spjd * ========================================================================== 169185029Spjd */ 170185029Spjd 171185029Spjd/* 172185029Spjd * Add a (source=src, propname=propval) list to an nvlist. 
173185029Spjd */ 174185029Spjdstatic void 175185029Spjdspa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, 176185029Spjd uint64_t intval, zprop_source_t src) 177185029Spjd{ 178185029Spjd const char *propname = zpool_prop_to_name(prop); 179185029Spjd nvlist_t *propval; 180185029Spjd 181185029Spjd VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 182185029Spjd VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); 183185029Spjd 184185029Spjd if (strval != NULL) 185185029Spjd VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); 186185029Spjd else 187185029Spjd VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); 188185029Spjd 189185029Spjd VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); 190185029Spjd nvlist_free(propval); 191185029Spjd} 192185029Spjd 193185029Spjd/* 194185029Spjd * Get property values from the spa configuration. 195185029Spjd */ 196185029Spjdstatic void 197185029Spjdspa_prop_get_config(spa_t *spa, nvlist_t **nvp) 198185029Spjd{ 199236155Smm vdev_t *rvd = spa->spa_root_vdev; 200236884Smm dsl_pool_t *pool = spa->spa_dsl_pool; 201209962Smm uint64_t size; 202219089Spjd uint64_t alloc; 203236155Smm uint64_t space; 204185029Spjd uint64_t cap, version; 205185029Spjd zprop_source_t src = ZPROP_SRC_NONE; 206185029Spjd spa_config_dirent_t *dp; 207185029Spjd 208185029Spjd ASSERT(MUTEX_HELD(&spa->spa_props_lock)); 209185029Spjd 210236155Smm if (rvd != NULL) { 211219089Spjd alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 212219089Spjd size = metaslab_class_get_space(spa_normal_class(spa)); 213209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); 214209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); 215219089Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); 216219089Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, 217219089Spjd size - alloc, src); 218236155Smm 219236155Smm space = 0; 220236155Smm for (int c = 0; c < rvd->vdev_children; 
c++) { 221236155Smm vdev_t *tvd = rvd->vdev_child[c]; 222236155Smm space += tvd->vdev_max_asize - tvd->vdev_asize; 223236155Smm } 224236155Smm spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, space, 225236155Smm src); 226236155Smm 227219089Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, 228219089Spjd (spa_mode(spa) == FREAD), src); 229185029Spjd 230219089Spjd cap = (size == 0) ? 0 : (alloc * 100 / size); 231209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); 232185029Spjd 233219089Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, 234219089Spjd ddt_get_pool_dedup_ratio(spa), src); 235219089Spjd 236209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, 237236155Smm rvd->vdev_state, src); 238209962Smm 239209962Smm version = spa_version(spa); 240209962Smm if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) 241209962Smm src = ZPROP_SRC_DEFAULT; 242209962Smm else 243209962Smm src = ZPROP_SRC_LOCAL; 244209962Smm spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); 245209962Smm } 246209962Smm 247236884Smm if (pool != NULL) { 248236884Smm dsl_dir_t *freedir = pool->dp_free_dir; 249236884Smm 250236884Smm /* 251236884Smm * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, 252236884Smm * when opening pools before this version freedir will be NULL. 
253236884Smm */ 254236884Smm if (freedir != NULL) { 255236884Smm spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, 256236884Smm freedir->dd_phys->dd_used_bytes, src); 257236884Smm } else { 258236884Smm spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, 259236884Smm NULL, 0, src); 260236884Smm } 261236884Smm } 262236884Smm 263185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); 264185029Spjd 265228103Smm if (spa->spa_comment != NULL) { 266228103Smm spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, 267228103Smm 0, ZPROP_SRC_LOCAL); 268228103Smm } 269228103Smm 270185029Spjd if (spa->spa_root != NULL) 271185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 272185029Spjd 0, ZPROP_SRC_LOCAL); 273185029Spjd 274185029Spjd if ((dp = list_head(&spa->spa_config_list)) != NULL) { 275185029Spjd if (dp->scd_path == NULL) { 276185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 277185029Spjd "none", 0, ZPROP_SRC_LOCAL); 278185029Spjd } else if (strcmp(dp->scd_path, spa_config_path) != 0) { 279185029Spjd spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 280185029Spjd dp->scd_path, 0, ZPROP_SRC_LOCAL); 281185029Spjd } 282185029Spjd } 283185029Spjd} 284185029Spjd 285185029Spjd/* 286185029Spjd * Get zpool property values. 287185029Spjd */ 288185029Spjdint 289185029Spjdspa_prop_get(spa_t *spa, nvlist_t **nvp) 290185029Spjd{ 291219089Spjd objset_t *mos = spa->spa_meta_objset; 292185029Spjd zap_cursor_t zc; 293185029Spjd zap_attribute_t za; 294185029Spjd int err; 295185029Spjd 296185029Spjd VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); 297185029Spjd 298185029Spjd mutex_enter(&spa->spa_props_lock); 299185029Spjd 300185029Spjd /* 301185029Spjd * Get properties from the spa config. 302185029Spjd */ 303185029Spjd spa_prop_get_config(spa, nvp); 304185029Spjd 305185029Spjd /* If no pool property object, no more prop to get. 
*/ 306219089Spjd if (mos == NULL || spa->spa_pool_props_object == 0) { 307185029Spjd mutex_exit(&spa->spa_props_lock); 308185029Spjd return (0); 309185029Spjd } 310185029Spjd 311185029Spjd /* 312185029Spjd * Get properties from the MOS pool property object. 313185029Spjd */ 314185029Spjd for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 315185029Spjd (err = zap_cursor_retrieve(&zc, &za)) == 0; 316185029Spjd zap_cursor_advance(&zc)) { 317185029Spjd uint64_t intval = 0; 318185029Spjd char *strval = NULL; 319185029Spjd zprop_source_t src = ZPROP_SRC_DEFAULT; 320185029Spjd zpool_prop_t prop; 321185029Spjd 322185029Spjd if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL) 323185029Spjd continue; 324185029Spjd 325185029Spjd switch (za.za_integer_length) { 326185029Spjd case 8: 327185029Spjd /* integer property */ 328185029Spjd if (za.za_first_integer != 329185029Spjd zpool_prop_default_numeric(prop)) 330185029Spjd src = ZPROP_SRC_LOCAL; 331185029Spjd 332185029Spjd if (prop == ZPOOL_PROP_BOOTFS) { 333185029Spjd dsl_pool_t *dp; 334185029Spjd dsl_dataset_t *ds = NULL; 335185029Spjd 336185029Spjd dp = spa_get_dsl(spa); 337248571Smm dsl_pool_config_enter(dp, FTAG); 338185029Spjd if (err = dsl_dataset_hold_obj(dp, 339185029Spjd za.za_first_integer, FTAG, &ds)) { 340248571Smm dsl_pool_config_exit(dp, FTAG); 341185029Spjd break; 342185029Spjd } 343185029Spjd 344185029Spjd strval = kmem_alloc( 345185029Spjd MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, 346185029Spjd KM_SLEEP); 347185029Spjd dsl_dataset_name(ds, strval); 348185029Spjd dsl_dataset_rele(ds, FTAG); 349248571Smm dsl_pool_config_exit(dp, FTAG); 350185029Spjd } else { 351185029Spjd strval = NULL; 352185029Spjd intval = za.za_first_integer; 353185029Spjd } 354185029Spjd 355185029Spjd spa_prop_add_list(*nvp, prop, strval, intval, src); 356185029Spjd 357185029Spjd if (strval != NULL) 358185029Spjd kmem_free(strval, 359185029Spjd MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); 360185029Spjd 361185029Spjd break; 
362185029Spjd 363185029Spjd case 1: 364185029Spjd /* string property */ 365185029Spjd strval = kmem_alloc(za.za_num_integers, KM_SLEEP); 366185029Spjd err = zap_lookup(mos, spa->spa_pool_props_object, 367185029Spjd za.za_name, 1, za.za_num_integers, strval); 368185029Spjd if (err) { 369185029Spjd kmem_free(strval, za.za_num_integers); 370185029Spjd break; 371185029Spjd } 372185029Spjd spa_prop_add_list(*nvp, prop, strval, 0, src); 373185029Spjd kmem_free(strval, za.za_num_integers); 374185029Spjd break; 375185029Spjd 376185029Spjd default: 377185029Spjd break; 378185029Spjd } 379185029Spjd } 380185029Spjd zap_cursor_fini(&zc); 381185029Spjd mutex_exit(&spa->spa_props_lock); 382185029Spjdout: 383185029Spjd if (err && err != ENOENT) { 384185029Spjd nvlist_free(*nvp); 385185029Spjd *nvp = NULL; 386185029Spjd return (err); 387185029Spjd } 388185029Spjd 389185029Spjd return (0); 390185029Spjd} 391185029Spjd 392185029Spjd/* 393185029Spjd * Validate the given pool properties nvlist and modify the list 394185029Spjd * for the property values to be set. 395185029Spjd */ 396185029Spjdstatic int 397185029Spjdspa_prop_validate(spa_t *spa, nvlist_t *props) 398185029Spjd{ 399185029Spjd nvpair_t *elem; 400185029Spjd int error = 0, reset_bootfs = 0; 401247187Smm uint64_t objnum = 0; 402236884Smm boolean_t has_feature = B_FALSE; 403185029Spjd 404185029Spjd elem = NULL; 405185029Spjd while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { 406185029Spjd uint64_t intval; 407236884Smm char *strval, *slash, *check, *fname; 408236884Smm const char *propname = nvpair_name(elem); 409236884Smm zpool_prop_t prop = zpool_name_to_prop(propname); 410185029Spjd 411236884Smm switch (prop) { 412236884Smm case ZPROP_INVAL: 413236884Smm if (!zpool_prop_feature(propname)) { 414249195Smm error = SET_ERROR(EINVAL); 415236884Smm break; 416236884Smm } 417185029Spjd 418236884Smm /* 419236884Smm * Sanitize the input. 
420236884Smm */ 421236884Smm if (nvpair_type(elem) != DATA_TYPE_UINT64) { 422249195Smm error = SET_ERROR(EINVAL); 423236884Smm break; 424236884Smm } 425185029Spjd 426236884Smm if (nvpair_value_uint64(elem, &intval) != 0) { 427249195Smm error = SET_ERROR(EINVAL); 428236884Smm break; 429236884Smm } 430236884Smm 431236884Smm if (intval != 0) { 432249195Smm error = SET_ERROR(EINVAL); 433236884Smm break; 434236884Smm } 435236884Smm 436236884Smm fname = strchr(propname, '@') + 1; 437236884Smm if (zfeature_lookup_name(fname, NULL) != 0) { 438249195Smm error = SET_ERROR(EINVAL); 439236884Smm break; 440236884Smm } 441236884Smm 442236884Smm has_feature = B_TRUE; 443236884Smm break; 444236884Smm 445185029Spjd case ZPOOL_PROP_VERSION: 446185029Spjd error = nvpair_value_uint64(elem, &intval); 447185029Spjd if (!error && 448236884Smm (intval < spa_version(spa) || 449236884Smm intval > SPA_VERSION_BEFORE_FEATURES || 450236884Smm has_feature)) 451249195Smm error = SET_ERROR(EINVAL); 452185029Spjd break; 453185029Spjd 454185029Spjd case ZPOOL_PROP_DELEGATION: 455185029Spjd case ZPOOL_PROP_AUTOREPLACE: 456185029Spjd case ZPOOL_PROP_LISTSNAPS: 457219089Spjd case ZPOOL_PROP_AUTOEXPAND: 458185029Spjd error = nvpair_value_uint64(elem, &intval); 459185029Spjd if (!error && intval > 1) 460249195Smm error = SET_ERROR(EINVAL); 461185029Spjd break; 462185029Spjd 463185029Spjd case ZPOOL_PROP_BOOTFS: 464209962Smm /* 465209962Smm * If the pool version is less than SPA_VERSION_BOOTFS, 466209962Smm * or the pool is still being created (version == 0), 467209962Smm * the bootfs property cannot be set. 
468209962Smm */ 469185029Spjd if (spa_version(spa) < SPA_VERSION_BOOTFS) { 470249195Smm error = SET_ERROR(ENOTSUP); 471185029Spjd break; 472185029Spjd } 473185029Spjd 474185029Spjd /* 475185029Spjd * Make sure the vdev config is bootable 476185029Spjd */ 477185029Spjd if (!vdev_is_bootable(spa->spa_root_vdev)) { 478249195Smm error = SET_ERROR(ENOTSUP); 479185029Spjd break; 480185029Spjd } 481185029Spjd 482185029Spjd reset_bootfs = 1; 483185029Spjd 484185029Spjd error = nvpair_value_string(elem, &strval); 485185029Spjd 486185029Spjd if (!error) { 487236884Smm objset_t *os; 488185029Spjd uint64_t compress; 489185029Spjd 490185029Spjd if (strval == NULL || strval[0] == '\0') { 491185029Spjd objnum = zpool_prop_default_numeric( 492185029Spjd ZPOOL_PROP_BOOTFS); 493185029Spjd break; 494185029Spjd } 495185029Spjd 496219089Spjd if (error = dmu_objset_hold(strval, FTAG, &os)) 497185029Spjd break; 498185029Spjd 499219089Spjd /* Must be ZPL and not gzip compressed. */ 500219089Spjd 501219089Spjd if (dmu_objset_type(os) != DMU_OST_ZFS) { 502249195Smm error = SET_ERROR(ENOTSUP); 503248571Smm } else if ((error = 504248571Smm dsl_prop_get_int_ds(dmu_objset_ds(os), 505185029Spjd zfs_prop_to_name(ZFS_PROP_COMPRESSION), 506248571Smm &compress)) == 0 && 507185029Spjd !BOOTFS_COMPRESS_VALID(compress)) { 508249195Smm error = SET_ERROR(ENOTSUP); 509185029Spjd } else { 510185029Spjd objnum = dmu_objset_id(os); 511185029Spjd } 512219089Spjd dmu_objset_rele(os, FTAG); 513185029Spjd } 514185029Spjd break; 515185029Spjd 516185029Spjd case ZPOOL_PROP_FAILUREMODE: 517185029Spjd error = nvpair_value_uint64(elem, &intval); 518185029Spjd if (!error && (intval < ZIO_FAILURE_MODE_WAIT || 519185029Spjd intval > ZIO_FAILURE_MODE_PANIC)) 520249195Smm error = SET_ERROR(EINVAL); 521185029Spjd 522185029Spjd /* 523185029Spjd * This is a special case which only occurs when 524185029Spjd * the pool has completely failed. 
This allows 525185029Spjd * the user to change the in-core failmode property 526185029Spjd * without syncing it out to disk (I/Os might 527185029Spjd * currently be blocked). We do this by returning 528185029Spjd * EIO to the caller (spa_prop_set) to trick it 529185029Spjd * into thinking we encountered a property validation 530185029Spjd * error. 531185029Spjd */ 532185029Spjd if (!error && spa_suspended(spa)) { 533185029Spjd spa->spa_failmode = intval; 534249195Smm error = SET_ERROR(EIO); 535185029Spjd } 536185029Spjd break; 537185029Spjd 538185029Spjd case ZPOOL_PROP_CACHEFILE: 539185029Spjd if ((error = nvpair_value_string(elem, &strval)) != 0) 540185029Spjd break; 541185029Spjd 542185029Spjd if (strval[0] == '\0') 543185029Spjd break; 544185029Spjd 545185029Spjd if (strcmp(strval, "none") == 0) 546185029Spjd break; 547185029Spjd 548185029Spjd if (strval[0] != '/') { 549249195Smm error = SET_ERROR(EINVAL); 550185029Spjd break; 551185029Spjd } 552185029Spjd 553185029Spjd slash = strrchr(strval, '/'); 554185029Spjd ASSERT(slash != NULL); 555185029Spjd 556185029Spjd if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 557185029Spjd strcmp(slash, "/..") == 0) 558249195Smm error = SET_ERROR(EINVAL); 559185029Spjd break; 560219089Spjd 561228103Smm case ZPOOL_PROP_COMMENT: 562228103Smm if ((error = nvpair_value_string(elem, &strval)) != 0) 563228103Smm break; 564228103Smm for (check = strval; *check != '\0'; check++) { 565228103Smm /* 566228103Smm * The kernel doesn't have an easy isprint() 567228103Smm * check. For this kernel check, we merely 568228103Smm * check ASCII apart from DEL. Fix this if 569228103Smm * there is an easy-to-use kernel isprint(). 
570228103Smm */ 571228103Smm if (*check >= 0x7f) { 572249195Smm error = SET_ERROR(EINVAL); 573228103Smm break; 574228103Smm } 575228103Smm check++; 576228103Smm } 577228103Smm if (strlen(strval) > ZPROP_MAX_COMMENT) 578228103Smm error = E2BIG; 579228103Smm break; 580228103Smm 581219089Spjd case ZPOOL_PROP_DEDUPDITTO: 582219089Spjd if (spa_version(spa) < SPA_VERSION_DEDUP) 583249195Smm error = SET_ERROR(ENOTSUP); 584219089Spjd else 585219089Spjd error = nvpair_value_uint64(elem, &intval); 586219089Spjd if (error == 0 && 587219089Spjd intval != 0 && intval < ZIO_DEDUPDITTO_MIN) 588249195Smm error = SET_ERROR(EINVAL); 589219089Spjd break; 590185029Spjd } 591185029Spjd 592185029Spjd if (error) 593185029Spjd break; 594185029Spjd } 595185029Spjd 596185029Spjd if (!error && reset_bootfs) { 597185029Spjd error = nvlist_remove(props, 598185029Spjd zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 599185029Spjd 600185029Spjd if (!error) { 601185029Spjd error = nvlist_add_uint64(props, 602185029Spjd zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 603185029Spjd } 604185029Spjd } 605185029Spjd 606185029Spjd return (error); 607185029Spjd} 608185029Spjd 609209962Smmvoid 610209962Smmspa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) 611209962Smm{ 612209962Smm char *cachefile; 613209962Smm spa_config_dirent_t *dp; 614209962Smm 615209962Smm if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), 616209962Smm &cachefile) != 0) 617209962Smm return; 618209962Smm 619209962Smm dp = kmem_alloc(sizeof (spa_config_dirent_t), 620209962Smm KM_SLEEP); 621209962Smm 622209962Smm if (cachefile[0] == '\0') 623209962Smm dp->scd_path = spa_strdup(spa_config_path); 624209962Smm else if (strcmp(cachefile, "none") == 0) 625209962Smm dp->scd_path = NULL; 626209962Smm else 627209962Smm dp->scd_path = spa_strdup(cachefile); 628209962Smm 629209962Smm list_insert_head(&spa->spa_config_list, dp); 630209962Smm if (need_sync) 631209962Smm spa_async_request(spa, 
SPA_ASYNC_CONFIG_UPDATE); 632209962Smm} 633209962Smm 634185029Spjdint 635185029Spjdspa_prop_set(spa_t *spa, nvlist_t *nvp) 636185029Spjd{ 637185029Spjd int error; 638236884Smm nvpair_t *elem = NULL; 639209962Smm boolean_t need_sync = B_FALSE; 640185029Spjd 641185029Spjd if ((error = spa_prop_validate(spa, nvp)) != 0) 642185029Spjd return (error); 643185029Spjd 644209962Smm while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { 645236884Smm zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); 646209962Smm 647219089Spjd if (prop == ZPOOL_PROP_CACHEFILE || 648219089Spjd prop == ZPOOL_PROP_ALTROOT || 649219089Spjd prop == ZPOOL_PROP_READONLY) 650209962Smm continue; 651209962Smm 652236884Smm if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) { 653236884Smm uint64_t ver; 654236884Smm 655236884Smm if (prop == ZPOOL_PROP_VERSION) { 656236884Smm VERIFY(nvpair_value_uint64(elem, &ver) == 0); 657236884Smm } else { 658236884Smm ASSERT(zpool_prop_feature(nvpair_name(elem))); 659236884Smm ver = SPA_VERSION_FEATURES; 660236884Smm need_sync = B_TRUE; 661236884Smm } 662236884Smm 663236884Smm /* Save time if the version is already set. */ 664236884Smm if (ver == spa_version(spa)) 665236884Smm continue; 666236884Smm 667236884Smm /* 668236884Smm * In addition to the pool directory object, we might 669236884Smm * create the pool properties object, the features for 670236884Smm * read object, the features for write object, or the 671236884Smm * feature descriptions object. 
672236884Smm */ 673248571Smm error = dsl_sync_task(spa->spa_name, NULL, 674248571Smm spa_sync_version, &ver, 6); 675236884Smm if (error) 676236884Smm return (error); 677236884Smm continue; 678236884Smm } 679236884Smm 680209962Smm need_sync = B_TRUE; 681209962Smm break; 682209962Smm } 683209962Smm 684236884Smm if (need_sync) { 685248571Smm return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props, 686248571Smm nvp, 6)); 687236884Smm } 688236884Smm 689236884Smm return (0); 690185029Spjd} 691185029Spjd 692185029Spjd/* 693185029Spjd * If the bootfs property value is dsobj, clear it. 694185029Spjd */ 695185029Spjdvoid 696185029Spjdspa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 697185029Spjd{ 698185029Spjd if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 699185029Spjd VERIFY(zap_remove(spa->spa_meta_objset, 700185029Spjd spa->spa_pool_props_object, 701185029Spjd zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 702185029Spjd spa->spa_bootfs = 0; 703185029Spjd } 704185029Spjd} 705185029Spjd 706239620Smm/*ARGSUSED*/ 707239620Smmstatic int 708248571Smmspa_change_guid_check(void *arg, dmu_tx_t *tx) 709239620Smm{ 710248571Smm uint64_t *newguid = arg; 711248571Smm spa_t *spa = dmu_tx_pool(tx)->dp_spa; 712239620Smm vdev_t *rvd = spa->spa_root_vdev; 713239620Smm uint64_t vdev_state; 714239620Smm 715239620Smm spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 716239620Smm vdev_state = rvd->vdev_state; 717239620Smm spa_config_exit(spa, SCL_STATE, FTAG); 718239620Smm 719239620Smm if (vdev_state != VDEV_STATE_HEALTHY) 720249195Smm return (SET_ERROR(ENXIO)); 721239620Smm 722239620Smm ASSERT3U(spa_guid(spa), !=, *newguid); 723239620Smm 724239620Smm return (0); 725239620Smm} 726239620Smm 727239620Smmstatic void 728248571Smmspa_change_guid_sync(void *arg, dmu_tx_t *tx) 729239620Smm{ 730248571Smm uint64_t *newguid = arg; 731248571Smm spa_t *spa = dmu_tx_pool(tx)->dp_spa; 732239620Smm uint64_t oldguid; 733239620Smm vdev_t *rvd = spa->spa_root_vdev; 
734239620Smm 735239620Smm oldguid = spa_guid(spa); 736239620Smm 737239620Smm spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 738239620Smm rvd->vdev_guid = *newguid; 739239620Smm rvd->vdev_guid_sum += (*newguid - oldguid); 740239620Smm vdev_config_dirty(rvd); 741239620Smm spa_config_exit(spa, SCL_STATE, FTAG); 742239620Smm 743248571Smm spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu", 744239620Smm oldguid, *newguid); 745239620Smm} 746239620Smm 747185029Spjd/* 748228103Smm * Change the GUID for the pool. This is done so that we can later 749228103Smm * re-import a pool built from a clone of our own vdevs. We will modify 750228103Smm * the root vdev's guid, our own pool guid, and then mark all of our 751228103Smm * vdevs dirty. Note that we must make sure that all our vdevs are 752228103Smm * online when we do this, or else any vdevs that weren't present 753228103Smm * would be orphaned from our pool. We are also going to issue a 754228103Smm * sysevent to update any watchers. 
755228103Smm */ 756228103Smmint 757228103Smmspa_change_guid(spa_t *spa) 758228103Smm{ 759239620Smm int error; 760239620Smm uint64_t guid; 761228103Smm 762254074Sdelphij mutex_enter(&spa->spa_vdev_top_lock); 763239620Smm mutex_enter(&spa_namespace_lock); 764239620Smm guid = spa_generate_guid(NULL); 765228103Smm 766248571Smm error = dsl_sync_task(spa->spa_name, spa_change_guid_check, 767248571Smm spa_change_guid_sync, &guid, 5); 768228103Smm 769239620Smm if (error == 0) { 770239620Smm spa_config_sync(spa, B_FALSE, B_TRUE); 771239620Smm spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID); 772239620Smm } 773228103Smm 774239620Smm mutex_exit(&spa_namespace_lock); 775254074Sdelphij mutex_exit(&spa->spa_vdev_top_lock); 776228103Smm 777239620Smm return (error); 778228103Smm} 779228103Smm 780228103Smm/* 781185029Spjd * ========================================================================== 782168404Spjd * SPA state manipulation (open/create/destroy/import/export) 783168404Spjd * ========================================================================== 784168404Spjd */ 785168404Spjd 786168404Spjdstatic int 787168404Spjdspa_error_entry_compare(const void *a, const void *b) 788168404Spjd{ 789168404Spjd spa_error_entry_t *sa = (spa_error_entry_t *)a; 790168404Spjd spa_error_entry_t *sb = (spa_error_entry_t *)b; 791168404Spjd int ret; 792168404Spjd 793168404Spjd ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 794168404Spjd sizeof (zbookmark_t)); 795168404Spjd 796168404Spjd if (ret < 0) 797168404Spjd return (-1); 798168404Spjd else if (ret > 0) 799168404Spjd return (1); 800168404Spjd else 801168404Spjd return (0); 802168404Spjd} 803168404Spjd 804168404Spjd/* 805168404Spjd * Utility function which retrieves copies of the current logs and 806168404Spjd * re-initializes them in the process. 
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	/*
	 * Hand the current error trees to the caller by structure copy,
	 * then re-initialize the in-core trees so new errors accumulate
	 * into fresh (empty) lists.
	 */
	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Create one zio taskq for the pool, sized/flagged according to the
 * zti_modes entry from the zio_taskqs table.  Returns NULL for
 * zti_mode_null (no taskq wanted for this type/queue combination).
 */
static taskq_t *
spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode,
    uint_t value)
{
	uint_t flags = TASKQ_PREPOPULATE;
	boolean_t batch = B_FALSE;

	switch (mode) {
	case zti_mode_null:
		return (NULL);		/* no taskq needed */

	case zti_mode_fixed:
		/* 'value' is an absolute thread count; clamp to >= 1 */
		ASSERT3U(value, >=, 1);
		value = MAX(value, 1);
		break;

	case zti_mode_batch:
		/* 'value' becomes a percentage of online CPUs */
		batch = B_TRUE;
		flags |= TASKQ_THREADS_CPU_PCT;
		value = zio_taskq_batch_pct;
		break;

	case zti_mode_online_percent:
		flags |= TASKQ_THREADS_CPU_PCT;
		break;

	default:
		panic("unrecognized mode for %s taskq (%u:%u) in "
		    "spa_activate()",
		    name, mode, value);
		break;
	}

#ifdef SYSDC
	/*
	 * If system-duty-cycle scheduling is enabled and the pool has its
	 * own process, create a sysdc taskq bound to that process.
	 */
	if (zio_taskq_sysdc && spa->spa_proc != &p0) {
		if (batch)
			flags |= TASKQ_DC_BATCH;

		return (taskq_create_sysdc(name, value, 50, INT_MAX,
		    spa->spa_proc, zio_taskq_basedc, flags));
	}
#endif
	return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX,
	    spa->spa_proc, flags));
}

/*
 * Create the full ZIO_TYPES x ZIO_TASKQ_TYPES grid of I/O taskqs for
 * this pool, naming each one "<zio type>_<queue type>".
 */
static void
spa_create_zio_taskqs(spa_t *spa)
{
	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
			enum zti_modes mode = ztip->zti_mode;
			uint_t value = ztip->zti_value;
			char name[32];

			(void) snprintf(name, sizeof (name),
			    "%s_%s", zio_type_name[t], zio_taskq_types[q]);

			spa->spa_zio_taskq[t][q] =
			    spa_taskq_create(spa, name, mode, value);
		}
	}
}

#ifdef _KERNEL
#ifdef SPA_PROCESS
/*
 * Body of the dedicated "zpool-<name>" process created by spa_activate().
 * Sets up scheduling bindings, creates the pool's taskqs in the context of
 * this process, then parks until spa_deactivate() requests teardown via the
 * SPA_PROC_* state machine (spa_proc_state, protected by spa_proc_lock).
 */
static void
spa_thread(void *arg)
{
	callb_cpr_t cprinfo;

	spa_t *spa = arg;
	user_t *pu = PTOU(curproc);

	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
	    spa->spa_name);

	ASSERT(curproc != &p0);
	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
	    "zpool-%s", spa->spa_name);
	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));

#ifdef PSRSET_BIND
	/* bind this thread to the requested psrset */
	if (zio_taskq_psrset_bind != PS_NONE) {
		pool_lock();
		mutex_enter(&cpu_lock);
		mutex_enter(&pidlock);
		mutex_enter(&curproc->p_lock);

		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
		    0, NULL, NULL) == 0)  {
			curthread->t_bind_pset = zio_taskq_psrset_bind;
		} else {
			cmn_err(CE_WARN,
			    "Couldn't bind process for zfs pool \"%s\" to "
			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
		}

		mutex_exit(&curproc->p_lock);
		mutex_exit(&pidlock);
		mutex_exit(&cpu_lock);
		pool_unlock();
	}
#endif

#ifdef SYSDC
	if (zio_taskq_sysdc) {
		sysdc_thread_enter(curthread, 100, 0);
	}
#endif

	/*
	 * Publish our identity so spa_taskq_create() binds the taskqs to
	 * this process, and so spa_deactivate() can thread_join() us.
	 */
	spa->spa_proc = curproc;
	spa->spa_did = curthread->t_did;

	spa_create_zio_taskqs(spa);

	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);

	/* Wake spa_activate(), which waits for CREATED -> ACTIVE */
	spa->spa_proc_state = SPA_PROC_ACTIVE;
	cv_broadcast(&spa->spa_proc_cv);

	/* Park here until spa_deactivate() moves us to DEACTIVATE */
	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);

	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
	spa->spa_proc_state = SPA_PROC_GONE;
	spa->spa_proc = &p0;
	cv_broadcast(&spa->spa_proc_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */

	mutex_enter(&curproc->p_lock);
	lwp_exit();
}
#endif	/* SPA_PROCESS */
#endif

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa, int mode)
{
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_mode = mode;

	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);

	/* Try to create a covering process */
	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
	ASSERT(spa->spa_proc == &p0);
	spa->spa_did = 0;

#ifdef SPA_PROCESS
	/* Only create a process if we're going to be around a while. */
	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
		    NULL, 0) == 0) {
			/* Wait for spa_thread() to signal CREATED -> ACTIVE */
			spa->spa_proc_state = SPA_PROC_CREATED;
			while (spa->spa_proc_state == SPA_PROC_CREATED) {
				cv_wait(&spa->spa_proc_cv,
				    &spa->spa_proc_lock);
			}
			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
			ASSERT(spa->spa_proc != &p0);
			ASSERT(spa->spa_did != 0);
		} else {
#ifdef _KERNEL
			cmn_err(CE_WARN,
			    "Couldn't create process for zfs pool \"%s\"\n",
			    spa->spa_name);
#endif
		}
	}
#endif	/* SPA_PROCESS */
	mutex_exit(&spa->spa_proc_lock);

	/* If we didn't create a process, we need to create our taskqs. */
	/*
	 * NOTE: the ASSERT and the if() are intentionally redundant: on this
	 * platform no process is created, so spa_proc is expected to still be
	 * &p0 (ASSERT in debug builds), and the if() keeps production builds
	 * correct even if that expectation is ever violated.
	 */
	ASSERT(spa->spa_proc == &p0);
	if (spa->spa_proc == &p0) {
		spa_create_zio_taskqs(spa);
	}

	/*
	 * Start TRIM thread.
	 */
	trim_thread_create(spa);

	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_config_dirty_node));
	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_state_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);
	ASSERT(spa->spa_async_zio_root == NULL);
	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	/*
	 * Stop TRIM thread in case spa_unload() wasn't called directly
	 * before spa_deactivate().
	 */
	trim_thread_destroy(spa);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_config_dirty_list);
	list_destroy(&spa->spa_state_dirty_list);

	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			/* May be NULL: zti_mode_null entries create no taskq */
			if (spa->spa_zio_taskq[t][q] != NULL)
				taskq_destroy(spa->spa_zio_taskq[t][q]);
			spa->spa_zio_taskq[t][q] = NULL;
		}
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;

	/*
	 * If spa_activate() created a covering process, drive the
	 * ACTIVE -> DEACTIVATE -> GONE handshake with spa_thread().
	 */
	mutex_enter(&spa->spa_proc_lock);
	if (spa->spa_proc_state != SPA_PROC_NONE) {
		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
		cv_broadcast(&spa->spa_proc_cv);
		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
			ASSERT(spa->spa_proc != &p0);
			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
		}
		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
		spa->spa_proc_state = SPA_PROC_NONE;
	}
	ASSERT(spa->spa_proc == &p0);
	mutex_exit(&spa->spa_proc_lock);

#ifdef SPA_PROCESS
	/*
	 * We want to make sure spa_thread() has actually exited the ZFS
	 * module, so that the module can't be unloaded out from underneath
	 * it.
	 */
	if (spa->spa_did != 0) {
		thread_join(spa->spa_did);
		spa->spa_did = 0;
	}
#endif	/* SPA_PROCESS */
}

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.  This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state.  This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children);

	/* An interior vdev with no children array is treated as valid */
	if (error == ENOENT)
		return (0);

	if (error) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (SET_ERROR(EINVAL));
	}

	/* Recurse; on any child failure, free the whole subtree built so far */
	for (int c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	/*
	 * Stop TRIM thread.
	 */
	trim_thread_destroy(spa);

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding async I/O to complete.
	 */
	if (spa->spa_async_zio_root != NULL) {
		(void) zio_wait(spa->spa_async_zio_root);
		spa->spa_async_zio_root = NULL;
	}

	bpobj_close(&spa->spa_deferred_bpobj);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
		spa->spa_meta_objset = NULL;
	}

	ddt_unload(spa);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * Drop and purge level 2 cache
	 */
	spa_l2cache_drop(spa);

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	/* Free the auxiliary (spare) vdevs and their bookkeeping */
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		vdev_free(spa->spa_spares.sav_vdevs[i]);
	if (spa->spa_spares.sav_vdevs) {
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));
		spa->spa_spares.sav_vdevs = NULL;
	}
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
	}
	spa->spa_spares.sav_count = 0;

	/* Likewise for the l2cache vdevs (stats cleared before freeing) */
	for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
		vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
	}
	if (spa->spa_l2cache.sav_vdevs) {
		kmem_free(spa->spa_l2cache.sav_vdevs,
		    spa->spa_l2cache.sav_count * sizeof (void *));
		spa->spa_l2cache.sav_vdevs = NULL;
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
	}
	spa->spa_l2cache.sav_count = 0;

	spa->spa_async_suspended = 0;

	if (spa->spa_comment != NULL) {
		spa_strfree(spa->spa_comment);
		spa->spa_comment = NULL;
	}

	spa_config_exit(spa, SCL_ALL, FTAG);
}

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		vd = spa->spa_spares.sav_vdevs[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL && tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares.sav_vdevs)
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));

	if (spa->spa_spares.sav_config == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_spares.sav_count = (int)nspares;
	spa->spa_spares.sav_vdevs = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process.  For each spare, there is potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in).  During this phase we open and
	 * validate each vdev on the spare list.  If the vdev also exists in the
	 * active configuration, then we also mark this vdev as an active spare.
	 */
	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares.sav_vdevs[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev.  Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pool would think the spare
			 * is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise).  Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		vd->vdev_top = vd;
		vd->vdev_aux = &spa->spa_spares;

		if (vdev_open(vd) != 0)
			continue;

		if (vdev_validate_aux(vd) == 0)
			spa_spare_add(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		spares[i] = vdev_config_generate(spa,
		    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}

/*
 * Load (or re-load) the current list of vdevs describing the active l2cache for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 * Devices which are already active have their details maintained, and are
 * not re-opened.
 */
static void
spa_load_l2cache(spa_t *spa)
{
	nvlist_t **l2cache;
	uint_t nl2cache;
	int i, j, oldnvdevs;
	uint64_t guid;
	vdev_t *vd, **oldvdevs, **newvdevs;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (sav->sav_config != NULL) {
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
	} else {
		nl2cache = 0;
		newvdevs = NULL;
	}

	oldvdevs = sav->sav_vdevs;
	oldnvdevs = sav->sav_count;
	sav->sav_vdevs = NULL;
	sav->sav_count = 0;

	/*
	 * Process new nvlist of vdevs.
	 */
	for (i = 0; i < nl2cache; i++) {
		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
		    &guid) == 0);

		newvdevs[i] = NULL;
		for (j = 0; j < oldnvdevs; j++) {
			vd = oldvdevs[j];
			if (vd != NULL && guid == vd->vdev_guid) {
				/*
				 * Retain previous vdev for add/remove ops.
				 */
				newvdevs[i] = vd;
				oldvdevs[j] = NULL;
				break;
			}
		}

		if (newvdevs[i] == NULL) {
			/*
			 * Create new vdev
			 */
			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
			    VDEV_ALLOC_L2CACHE) == 0);
			ASSERT(vd != NULL);
			newvdevs[i] = vd;

			/*
			 * Commit this vdev as an l2cache device,
			 * even if it fails to open.
			 */
			spa_l2cache_add(vd);

			vd->vdev_top = vd;
			vd->vdev_aux = sav;

			spa_l2cache_activate(vd);

			if (vdev_open(vd) != 0)
				continue;

			(void) vdev_validate_aux(vd);

			if (!vdev_is_dead(vd))
				l2arc_add_vdev(spa, vd);
		}
	}

	/*
	 * Purge vdevs that were dropped
	 */
	for (i = 0; i < oldnvdevs; i++) {
		uint64_t pool;

		vd = oldvdevs[i];
		if (vd != NULL) {
			ASSERT(vd->vdev_isl2cache);

			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
			    pool != 0ULL && l2arc_vdev_present(vd))
				l2arc_remove_vdev(vd);
			vdev_clear_stats(vd);
			vdev_free(vd);
		}
	}

	if (oldvdevs)
		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));

	/*
	 * If there's no config, sav_count is still 0 here, so the cleanup
	 * loops after the 'out' label do nothing.
	 */
	if (sav->sav_config == NULL)
		goto out;

	sav->sav_vdevs = newvdevs;
	sav->sav_count = (int)nl2cache;

	/*
	 * Recompute the stashed list of l2cache devices, with status
	 * information this time.
	 */
	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
	for (i = 0; i < sav->sav_count; i++)
		l2cache[i] = vdev_config_generate(spa,
		    sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
out:
	for (i = 0; i < sav->sav_count; i++)
		nvlist_free(l2cache[i]);
	if (sav->sav_count)
		kmem_free(l2cache, sav->sav_count * sizeof (void *));
}

/*
 * Read a packed nvlist from the MOS object 'obj' (whose bonus buffer holds
 * the packed size) and unpack it into '*value'.  On success the caller owns
 * the returned nvlist.
 */
static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;
	*value = NULL;

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
	    DMU_READ_PREFETCH);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	kmem_free(packed, nvsize);

	return (error);
}

/*
 * Checks to see if the given vdev could not be opened, in which case we post a
 * sysevent to notify the autoreplace code that the device has been removed.
 */
static void
spa_check_removed(vdev_t *vd)
{
	for (int c = 0; c < vd->vdev_children; c++)
		spa_check_removed(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
	    !vd->vdev_ishole) {
		zfs_post_autoreplace(vd->vdev_spa, vd);
		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
	}
}

/*
 * Validate the current config against the MOS config
 */
static boolean_t
spa_config_valid(spa_t *spa, nvlist_t *config)
{
	vdev_t *mrvd, *rvd = spa->spa_root_vdev;
	nvlist_t *nv;

	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);

	ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);

	/*
	 * If we're doing a normal import, then build up any additional
	 * diagnostic information about missing devices in this config.
	 * We'll pass this up to the user for further processing.
	 */
	if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
		nvlist_t **child, *nv;
		uint64_t idx = 0;

		child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
		    KM_SLEEP);
		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);

		for (int c = 0; c < rvd->vdev_children; c++) {
			vdev_t *tvd = rvd->vdev_child[c];
			vdev_t *mtvd = mrvd->vdev_child[c];

			/* Report log devices the MOS knows but we don't have */
			if (tvd->vdev_ops == &vdev_missing_ops &&
			    mtvd->vdev_ops != &vdev_missing_ops &&
			    mtvd->vdev_islog)
				child[idx++] = vdev_config_generate(spa, mtvd,
				    B_FALSE, 0);
		}

		if (idx) {
			VERIFY(nvlist_add_nvlist_array(nv,
			    ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
			VERIFY(nvlist_add_nvlist(spa->spa_load_info,
			    ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);

			for (int i = 0; i < idx; i++)
				nvlist_free(child[i]);
		}
		nvlist_free(nv);
		kmem_free(child, rvd->vdev_children * sizeof (char **));
	}

	/*
	 * Compare the root vdev tree with the information we have
	 * from the MOS config (mrvd). Check each top-level vdev
	 * with the corresponding MOS config top-level (mtvd).
	 */
	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		vdev_t *mtvd = mrvd->vdev_child[c];

		/*
		 * Resolve any "missing" vdevs in the current configuration.
		 * If we find that the MOS config has more accurate information
		 * about the top-level vdev then use that vdev instead.
		 */
		if (tvd->vdev_ops == &vdev_missing_ops &&
		    mtvd->vdev_ops != &vdev_missing_ops) {

			if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG))
				continue;

			/*
			 * Device specific actions.
			 */
			if (mtvd->vdev_islog) {
				spa_set_log_state(spa, SPA_LOG_CLEAR);
			} else {
				/*
				 * XXX - once we have 'readonly' pool
				 * support we should be able to handle
				 * missing data devices by transitioning
				 * the pool to readonly.
				 */
				continue;
			}

			/*
			 * Swap the missing vdev with the data we were
			 * able to obtain from the MOS config.
			 */
			vdev_remove_child(rvd, tvd);
			vdev_remove_child(mrvd, mtvd);

			vdev_add_child(rvd, mtvd);
			vdev_add_child(mrvd, tvd);

			/* vdev_load() may sleep; drop the config locks around it */
			spa_config_exit(spa, SCL_ALL, FTAG);
			vdev_load(mtvd);
			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

			vdev_reopen(rvd);
		} else if (mtvd->vdev_islog) {
			/*
			 * Load the slog device's state from the MOS config
			 * since it's possible that the label does not
			 * contain the most up-to-date information.
			 */
			vdev_load_log_state(tvd, mtvd);
			vdev_reopen(tvd);
		}
	}
	vdev_free(mrvd);
	spa_config_exit(spa, SCL_ALL, FTAG);

	/*
	 * Ensure we were able to validate the config.
	 */
	return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
}

/*
 * Check for missing log devices
 */
static boolean_t
spa_check_logs(spa_t *spa)
{
	boolean_t rv = B_FALSE;

	/*
	 * Other log states (e.g. healthy) intentionally fall through and
	 * return B_FALSE; no default case is needed.
	 */
	switch (spa->spa_log_state) {
	case SPA_LOG_MISSING:
		/* need to recheck in case slog has been restored */
	case SPA_LOG_UNKNOWN:
		rv = (dmu_objset_find(spa->spa_name, zil_check_log_chain,
		    NULL, DS_FIND_CHILDREN) != 0);
		if (rv)
			spa_set_log_state(spa, SPA_LOG_MISSING);
		break;
	}
	return (rv);
}

/*
 * Passivate the metaslab groups of all top-level log vdevs so no new
 * allocations land on them.  Returns B_TRUE if any slog was found.
 */
static boolean_t
spa_passivate_log(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;
	boolean_t slog_found = B_FALSE;

	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));

	if (!spa_has_slogs(spa))
		return (B_FALSE);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (tvd->vdev_islog) {
			metaslab_group_passivate(mg);
			slog_found = B_TRUE;
		}
	}

	return (slog_found);
}

/*
 * Opposite of spa_passivate_log(): make all top-level log vdevs eligible
 * for allocation again.
 */
static void
spa_activate_log(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (tvd->vdev_islog)
			metaslab_group_activate(mg);
	}
}

/*
 * Offline the pool's intent logs: walk all datasets offlining their ZILs,
 * then sync a txg so zil_sync() can remove the "stubby" blocks.
 * Returns 0 on success or the dmu_objset_find() error.
 */
int
spa_offline_log(spa_t *spa)
{
	int error;

	error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
	    NULL, DS_FIND_CHILDREN);
	if (error == 0) {
		/*
		 * We successfully offlined the log device, sync out the
		 * current txg so that the "stubby" block can be removed
		 * by zil_sync().
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);
	}
	return (error);
}

/*
 * Run spa_check_removed() over every vdev in an auxiliary (spare/l2cache)
 * vdev list.
 */
static void
spa_aux_check_removed(spa_aux_vdev_t *sav)
{
	int i;

	for (i = 0; i < sav->sav_count; i++)
		spa_check_removed(sav->sav_vdevs[i]);
}

/*
 * Record the highest block birth txg seen among claimed blocks; used to
 * track how far the intent-log claim has progressed.
 */
void
spa_claim_notify(zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	if (zio->io_error)
		return;

	mutex_enter(&spa->spa_props_lock);	/* any mutex will do */
	if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
		spa->spa_claim_max_txg = zio->io_bp->blk_birth;
	mutex_exit(&spa->spa_props_lock);
}

/* Per-verification-pass error tallies, shared by all verify zios */
typedef struct spa_load_error {
	uint64_t	sle_meta_count;
	uint64_t	sle_data_count;
} spa_load_error_t;

/*
 * Completion callback for the speculative reads issued by
 * spa_load_verify_cb(): classify any error as metadata vs. data
 * (atomically, since many zios complete concurrently) and free the buffer.
 */
static void
spa_load_verify_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	spa_load_error_t *sle = zio->io_private;
	dmu_object_type_t type = BP_GET_TYPE(bp);
	int error = zio->io_error;

	if (error) {
		if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
		    type != DMU_OT_INTENT_LOG)
			atomic_add_64(&sle->sle_meta_count, 1);
		else
			atomic_add_64(&sle->sle_data_count, 1);
	}
	zio_data_buf_free(zio->io_data, zio->io_size);
}

/*
 * traverse_pool() callback: issue an async speculative read of each block
 * under the root zio 'arg' so errors are tallied by spa_load_verify_done().
 */
/*ARGSUSED*/
static int
spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
	if (bp != NULL) {
		zio_t *rio = arg;
		size_t size = BP_GET_PSIZE(bp);
		void *data = zio_data_buf_alloc(size);

		zio_nowait(zio_read(rio, spa, bp, data, size,
		    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
		    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
		    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
	}
	return (0);
}

/*
 * Verify the pool by reading every block reachable from the verify txg,
 * honoring the rewind policy's error limits.  On an acceptable result,
 * record load-time/rewind info in spa_load_info; otherwise remember the
 * max txg for a possible rewind.  Returns 0 if verification passed.
 */
static int
spa_load_verify(spa_t *spa)
{
	zio_t *rio;
	spa_load_error_t sle = { 0 };
	zpool_rewind_policy_t policy;
	boolean_t verify_ok = B_FALSE;
	int error;

	zpool_get_rewind_policy(spa->spa_config, &policy);

	if (policy.zrp_request & ZPOOL_NEVER_REWIND)
		return (0);

	rio = zio_root(spa, NULL, &sle,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);

	error = traverse_pool(spa, spa->spa_verify_min_txg,
	    TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio);

	(void) zio_wait(rio);

	spa->spa_load_meta_errors = sle.sle_meta_count;
	spa->spa_load_data_errors = sle.sle_data_count;

	if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
	    sle.sle_data_count <= policy.zrp_maxdata) {
		int64_t loss = 0;

		verify_ok = B_TRUE;
		spa->spa_load_txg = spa->spa_uberblock.ub_txg;
		spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;

		loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
		VERIFY(nvlist_add_uint64(spa->spa_load_info,
		    ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
		VERIFY(nvlist_add_int64(spa->spa_load_info,
		    ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
		VERIFY(nvlist_add_uint64(spa->spa_load_info,
		    ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
	} else {
		spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
	}

	if (error) {
		/* Normalize unexpected traversal errors to EIO */
		if (error != ENXIO && error != EIO)
			error = SET_ERROR(EIO);
		return (error);
	}

	return (verify_ok ? 0 : EIO);
}

/*
 * Find a value in the pool props object.
 */
static void
spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
{
	(void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
	    zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
}

/*
 * Find a value in the pool directory object.
 */
static int
spa_dir_prop(spa_t *spa, const char *name, uint64_t *val)
{
	return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    name, sizeof (uint64_t), 1, val));
}

/*
 * Mark 'vdev' as unopenable for reason 'aux' and pass 'err' back to the
 * caller; used by spa_load_impl() to fail the load with a vdev fault.
 */
static int
spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
{
	vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
	return (err);
}

/*
 * Fix up config after a partly-completed split.  This is done with the
 * ZPOOL_CONFIG_SPLIT nvlist.  Both the splitting pool and the split-off
 * pool have that entry in their config, but only the splitting one contains
 * a list of all the guids of the vdevs that are being split off.
 *
 * This function determines what to do with that list: either rejoin
 * all the disks to the pool, or complete the splitting process.  To attempt
 * the rejoin, each disk that is offlined is marked online again, and
 * we do a reopen() call.  If the vdev label for every disk that was
 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
 * then we call vdev_split() on each disk, and complete the split.
 *
 * Otherwise we leave the config alone, with all the vdevs in place in
 * the original pool.
 */
static void
spa_try_repair(spa_t *spa, nvlist_t *config)
{
	uint_t extracted;
	uint64_t *glist;
	uint_t i, gcount;
	nvlist_t *nvl;
	vdev_t **vd;
	boolean_t attempt_reopen;

	/* no split in progress; nothing to repair */
	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
		return;

	/* check that the config is complete */
	if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
	    &glist, &gcount) != 0)
		return;

	vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);

	/* attempt to online all the vdevs & validate */
	attempt_reopen = B_TRUE;
	for (i = 0; i < gcount; i++) {
		if (glist[i] == 0)	/* vdev is hole */
			continue;

		vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
		if (vd[i] == NULL) {
			/*
			 * Don't bother attempting to reopen the disks;
			 * just do the split.
			 */
			attempt_reopen = B_FALSE;
		} else {
			/* attempt to re-online it */
			vd[i]->vdev_offline = B_FALSE;
		}
	}

	if (attempt_reopen) {
		vdev_reopen(spa->spa_root_vdev);

		/* check each device to see what state it's in */
		for (extracted = 0, i = 0; i < gcount; i++) {
			if (vd[i] != NULL &&
			    vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
				break;
			++extracted;
		}
	}

	/*
	 * If every disk has been moved to the new pool, or if we never
	 * even attempted to look at them, then we split them off for
	 * good.
	 */
	if (!attempt_reopen || gcount == extracted) {
		for (i = 0; i < gcount; i++)
			if (vd[i] != NULL)
				vdev_split(vd[i]);
		vdev_reopen(spa->spa_root_vdev);
	}

	kmem_free(vd, gcount * sizeof (vdev_t *));
}

/*
 * Top-level pool-load entry point: validate the pool guid from 'config',
 * set up split/rewind state, then delegate the real work to
 * spa_load_impl().  On failure (other than EEXIST), clears the load
 * timestamp and posts an FM ereport (except for EBADF, e.g. a hostid
 * mismatch).
 */
static int
spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
    boolean_t mosconfig)
{
	nvlist_t *config = spa->spa_config;
	char *ereport = FM_EREPORT_ZFS_POOL;
	char *comment;
	int error;
	uint64_t pool_guid;
	nvlist_t *nvl;

	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
		return (SET_ERROR(EINVAL));

	ASSERT(spa->spa_comment == NULL);
	if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
		spa->spa_comment = spa_strdup(comment);

	/*
	 * Versioning wasn't explicitly added to the label until later, so if
	 * it's not present treat it as the initial version.
	 */
	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
	    &spa->spa_ubsync.ub_version) != 0)
		spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;

	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    &spa->spa_config_txg);

	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
	    spa_guid_exists(pool_guid, 0)) {
		/* a pool with this guid is already loaded */
		error = SET_ERROR(EEXIST);
	} else {
		spa->spa_config_guid = pool_guid;

		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
		    &nvl) == 0) {
			VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
			    KM_SLEEP) == 0);
		}

		/* start with a fresh load-info nvlist for this attempt */
		nvlist_free(spa->spa_load_info);
		spa->spa_load_info = fnvlist_alloc();

		gethrestime(&spa->spa_loaded_ts);
		error = spa_load_impl(spa, pool_guid, config, state, type,
		    mosconfig, &ereport);
	}

	spa->spa_minref = refcount_count(&spa->spa_refcount);
	if (error) {
		if (error != EEXIST) {
			spa->spa_loaded_ts.tv_sec = 0;
			spa->spa_loaded_ts.tv_nsec = 0;
		}
		if (error != EBADF) {
			zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
		}
	}
	spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
	spa->spa_ena = 0;

	return (error);
}

/*
 * Load an existing storage pool, using the pool's builtin spa_config as a
 * source of configuration information.
2041219089Spjd */ 2042219089Spjdstatic int 2043219089Spjdspa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, 2044219089Spjd spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, 2045219089Spjd char **ereport) 2046219089Spjd{ 2047219089Spjd int error = 0; 2048219089Spjd nvlist_t *nvroot = NULL; 2049236884Smm nvlist_t *label; 2050219089Spjd vdev_t *rvd; 2051219089Spjd uberblock_t *ub = &spa->spa_uberblock; 2052219089Spjd uint64_t children, config_cache_txg = spa->spa_config_txg; 2053219089Spjd int orig_mode = spa->spa_mode; 2054219089Spjd int parse; 2055219089Spjd uint64_t obj; 2056236884Smm boolean_t missing_feat_write = B_FALSE; 2057219089Spjd 2058168404Spjd /* 2059219089Spjd * If this is an untrusted config, access the pool in read-only mode. 2060219089Spjd * This prevents things like resilvering recently removed devices. 2061219089Spjd */ 2062219089Spjd if (!mosconfig) 2063219089Spjd spa->spa_mode = FREAD; 2064219089Spjd 2065219089Spjd ASSERT(MUTEX_HELD(&spa_namespace_lock)); 2066219089Spjd 2067219089Spjd spa->spa_load_state = state; 2068219089Spjd 2069219089Spjd if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) 2070249195Smm return (SET_ERROR(EINVAL)); 2071219089Spjd 2072219089Spjd parse = (type == SPA_IMPORT_EXISTING ? 2073219089Spjd VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 2074219089Spjd 2075219089Spjd /* 2076209962Smm * Create "The Godfather" zio to hold all async IOs 2077209962Smm */ 2078209962Smm spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 2079209962Smm ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 2080209962Smm 2081209962Smm /* 2082168404Spjd * Parse the configuration into a vdev tree. We explicitly set the 2083168404Spjd * value that will be returned by spa_version() since parsing the 2084168404Spjd * configuration requires knowing the version number. 
2085168404Spjd */ 2086185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2087219089Spjd error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); 2088185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2089168404Spjd 2090168404Spjd if (error != 0) 2091219089Spjd return (error); 2092168404Spjd 2093168404Spjd ASSERT(spa->spa_root_vdev == rvd); 2094168404Spjd 2095219089Spjd if (type != SPA_IMPORT_ASSEMBLE) { 2096219089Spjd ASSERT(spa_guid(spa) == pool_guid); 2097219089Spjd } 2098219089Spjd 2099168404Spjd /* 2100168404Spjd * Try to open all vdevs, loading each label in the process. 2101168404Spjd */ 2102185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2103168926Spjd error = vdev_open(rvd); 2104185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2105168926Spjd if (error != 0) 2106219089Spjd return (error); 2107168404Spjd 2108168404Spjd /* 2109209962Smm * We need to validate the vdev labels against the configuration that 2110209962Smm * we have in hand, which is dependent on the setting of mosconfig. If 2111209962Smm * mosconfig is true then we're validating the vdev labels based on 2112219089Spjd * that config. Otherwise, we're validating against the cached config 2113209962Smm * (zpool.cache) that was read when we loaded the zfs module, and then 2114209962Smm * later we will recursively call spa_load() and validate against 2115209962Smm * the vdev config. 2116219089Spjd * 2117219089Spjd * If we're assembling a new pool that's been split off from an 2118219089Spjd * existing pool, the labels haven't yet been updated so we skip 2119219089Spjd * validation for now. 
2120168404Spjd */ 2121219089Spjd if (type != SPA_IMPORT_ASSEMBLE) { 2122219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2123230514Smm error = vdev_validate(rvd, mosconfig); 2124219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2125168404Spjd 2126219089Spjd if (error != 0) 2127219089Spjd return (error); 2128219089Spjd 2129219089Spjd if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2130249195Smm return (SET_ERROR(ENXIO)); 2131168404Spjd } 2132168404Spjd 2133168404Spjd /* 2134168404Spjd * Find the best uberblock. 2135168404Spjd */ 2136236884Smm vdev_uberblock_load(rvd, ub, &label); 2137168404Spjd 2138168404Spjd /* 2139168404Spjd * If we weren't able to find a single valid uberblock, return failure. 2140168404Spjd */ 2141236884Smm if (ub->ub_txg == 0) { 2142236884Smm nvlist_free(label); 2143219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 2144236884Smm } 2145168404Spjd 2146168404Spjd /* 2147236884Smm * If the pool has an unsupported version we can't open it. 2148168404Spjd */ 2149236884Smm if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 2150236884Smm nvlist_free(label); 2151219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 2152236884Smm } 2153168404Spjd 2154236884Smm if (ub->ub_version >= SPA_VERSION_FEATURES) { 2155236884Smm nvlist_t *features; 2156236884Smm 2157236884Smm /* 2158236884Smm * If we weren't able to find what's necessary for reading the 2159236884Smm * MOS in the label, return failure. 2160236884Smm */ 2161236884Smm if (label == NULL || nvlist_lookup_nvlist(label, 2162236884Smm ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) { 2163236884Smm nvlist_free(label); 2164236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2165236884Smm ENXIO)); 2166236884Smm } 2167236884Smm 2168236884Smm /* 2169236884Smm * Update our in-core representation with the definitive values 2170236884Smm * from the label. 
2171236884Smm */ 2172236884Smm nvlist_free(spa->spa_label_features); 2173236884Smm VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); 2174236884Smm } 2175236884Smm 2176236884Smm nvlist_free(label); 2177236884Smm 2178168404Spjd /* 2179236884Smm * Look through entries in the label nvlist's features_for_read. If 2180236884Smm * there is a feature listed there which we don't understand then we 2181236884Smm * cannot open a pool. 2182236884Smm */ 2183236884Smm if (ub->ub_version >= SPA_VERSION_FEATURES) { 2184236884Smm nvlist_t *unsup_feat; 2185236884Smm 2186236884Smm VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == 2187236884Smm 0); 2188236884Smm 2189236884Smm for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, 2190236884Smm NULL); nvp != NULL; 2191236884Smm nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { 2192236884Smm if (!zfeature_is_supported(nvpair_name(nvp))) { 2193236884Smm VERIFY(nvlist_add_string(unsup_feat, 2194236884Smm nvpair_name(nvp), "") == 0); 2195236884Smm } 2196236884Smm } 2197236884Smm 2198236884Smm if (!nvlist_empty(unsup_feat)) { 2199236884Smm VERIFY(nvlist_add_nvlist(spa->spa_load_info, 2200236884Smm ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); 2201236884Smm nvlist_free(unsup_feat); 2202236884Smm return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2203236884Smm ENOTSUP)); 2204236884Smm } 2205236884Smm 2206236884Smm nvlist_free(unsup_feat); 2207236884Smm } 2208236884Smm 2209236884Smm /* 2210168404Spjd * If the vdev guid sum doesn't match the uberblock, we have an 2211219089Spjd * incomplete configuration. We first check to see if the pool 2212219089Spjd * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN). 2213219089Spjd * If it is, defer the vdev_guid_sum check till later so we 2214219089Spjd * can handle missing vdevs. 
2215168404Spjd */ 2216219089Spjd if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, 2217219089Spjd &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE && 2218219089Spjd rvd->vdev_guid_sum != ub->ub_guid_sum) 2219219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 2220219089Spjd 2221219089Spjd if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 2222219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2223219089Spjd spa_try_repair(spa, config); 2224219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2225219089Spjd nvlist_free(spa->spa_config_splitting); 2226219089Spjd spa->spa_config_splitting = NULL; 2227168404Spjd } 2228168404Spjd 2229168404Spjd /* 2230168404Spjd * Initialize internal SPA structures. 2231168404Spjd */ 2232168404Spjd spa->spa_state = POOL_STATE_ACTIVE; 2233168404Spjd spa->spa_ubsync = spa->spa_uberblock; 2234219089Spjd spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 2235219089Spjd TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 2236219089Spjd spa->spa_first_txg = spa->spa_last_ubsync_txg ? 
2237219089Spjd spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 2238219089Spjd spa->spa_claim_max_txg = spa->spa_first_txg; 2239219089Spjd spa->spa_prev_software_version = ub->ub_software_version; 2240219089Spjd 2241236884Smm error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 2242219089Spjd if (error) 2243219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2244168404Spjd spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 2245168404Spjd 2246219089Spjd if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) 2247219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2248168404Spjd 2249236884Smm if (spa_version(spa) >= SPA_VERSION_FEATURES) { 2250236884Smm boolean_t missing_feat_read = B_FALSE; 2251238926Smm nvlist_t *unsup_feat, *enabled_feat; 2252236884Smm 2253236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 2254236884Smm &spa->spa_feat_for_read_obj) != 0) { 2255236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2256236884Smm } 2257236884Smm 2258236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 2259236884Smm &spa->spa_feat_for_write_obj) != 0) { 2260236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2261236884Smm } 2262236884Smm 2263236884Smm if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 2264236884Smm &spa->spa_feat_desc_obj) != 0) { 2265236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2266236884Smm } 2267236884Smm 2268238926Smm enabled_feat = fnvlist_alloc(); 2269238926Smm unsup_feat = fnvlist_alloc(); 2270236884Smm 2271236884Smm if (!feature_is_supported(spa->spa_meta_objset, 2272236884Smm spa->spa_feat_for_read_obj, spa->spa_feat_desc_obj, 2273238926Smm unsup_feat, enabled_feat)) 2274236884Smm missing_feat_read = B_TRUE; 2275236884Smm 2276236884Smm if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) { 2277236884Smm if (!feature_is_supported(spa->spa_meta_objset, 2278236884Smm spa->spa_feat_for_write_obj, 
spa->spa_feat_desc_obj, 2279238926Smm unsup_feat, enabled_feat)) { 2280236884Smm missing_feat_write = B_TRUE; 2281238926Smm } 2282236884Smm } 2283236884Smm 2284238926Smm fnvlist_add_nvlist(spa->spa_load_info, 2285238926Smm ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); 2286238926Smm 2287236884Smm if (!nvlist_empty(unsup_feat)) { 2288238926Smm fnvlist_add_nvlist(spa->spa_load_info, 2289238926Smm ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 2290236884Smm } 2291236884Smm 2292238926Smm fnvlist_free(enabled_feat); 2293238926Smm fnvlist_free(unsup_feat); 2294236884Smm 2295236884Smm if (!missing_feat_read) { 2296236884Smm fnvlist_add_boolean(spa->spa_load_info, 2297236884Smm ZPOOL_CONFIG_CAN_RDONLY); 2298236884Smm } 2299236884Smm 2300236884Smm /* 2301236884Smm * If the state is SPA_LOAD_TRYIMPORT, our objective is 2302236884Smm * twofold: to determine whether the pool is available for 2303236884Smm * import in read-write mode and (if it is not) whether the 2304236884Smm * pool is available for import in read-only mode. If the pool 2305236884Smm * is available for import in read-write mode, it is displayed 2306236884Smm * as available in userland; if it is not available for import 2307236884Smm * in read-only mode, it is displayed as unavailable in 2308236884Smm * userland. If the pool is available for import in read-only 2309236884Smm * mode but not read-write mode, it is displayed as unavailable 2310236884Smm * in userland with a special note that the pool is actually 2311236884Smm * available for open in read-only mode. 2312236884Smm * 2313236884Smm * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 2314236884Smm * missing a feature for write, we must first determine whether 2315236884Smm * the pool can be opened read-only before returning to 2316236884Smm * userland in order to know whether to display the 2317236884Smm * abovementioned note. 
2318236884Smm */ 2319236884Smm if (missing_feat_read || (missing_feat_write && 2320236884Smm spa_writeable(spa))) { 2321236884Smm return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 2322236884Smm ENOTSUP)); 2323236884Smm } 2324236884Smm } 2325236884Smm 2326236884Smm spa->spa_is_initializing = B_TRUE; 2327236884Smm error = dsl_pool_open(spa->spa_dsl_pool); 2328236884Smm spa->spa_is_initializing = B_FALSE; 2329236884Smm if (error != 0) 2330236884Smm return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2331236884Smm 2332168404Spjd if (!mosconfig) { 2333168498Spjd uint64_t hostid; 2334219089Spjd nvlist_t *policy = NULL, *nvconfig; 2335168404Spjd 2336219089Spjd if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2337219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2338168404Spjd 2339219089Spjd if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, 2340185029Spjd ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 2341168498Spjd char *hostname; 2342168498Spjd unsigned long myhostid = 0; 2343168498Spjd 2344219089Spjd VERIFY(nvlist_lookup_string(nvconfig, 2345168498Spjd ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 2346168498Spjd 2347219089Spjd#ifdef _KERNEL 2348219089Spjd myhostid = zone_get_hostid(NULL); 2349219089Spjd#else /* _KERNEL */ 2350219089Spjd /* 2351219089Spjd * We're emulating the system's hostid in userland, so 2352219089Spjd * we can't use zone_get_hostid(). 2353219089Spjd */ 2354168498Spjd (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 2355219089Spjd#endif /* _KERNEL */ 2356204073Spjd if (check_hostid && hostid != 0 && myhostid != 0 && 2357219089Spjd hostid != myhostid) { 2358219089Spjd nvlist_free(nvconfig); 2359168498Spjd cmn_err(CE_WARN, "pool '%s' could not be " 2360168498Spjd "loaded as it was last accessed by " 2361185029Spjd "another system (host: %s hostid: 0x%lx). 
" 2362236146Smm "See: http://illumos.org/msg/ZFS-8000-EY", 2363185029Spjd spa_name(spa), hostname, 2364168498Spjd (unsigned long)hostid); 2365249195Smm return (SET_ERROR(EBADF)); 2366168498Spjd } 2367168498Spjd } 2368219089Spjd if (nvlist_lookup_nvlist(spa->spa_config, 2369219089Spjd ZPOOL_REWIND_POLICY, &policy) == 0) 2370219089Spjd VERIFY(nvlist_add_nvlist(nvconfig, 2371219089Spjd ZPOOL_REWIND_POLICY, policy) == 0); 2372168498Spjd 2373219089Spjd spa_config_set(spa, nvconfig); 2374168404Spjd spa_unload(spa); 2375168404Spjd spa_deactivate(spa); 2376209962Smm spa_activate(spa, orig_mode); 2377168404Spjd 2378219089Spjd return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); 2379168404Spjd } 2380168404Spjd 2381219089Spjd if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0) 2382219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2383219089Spjd error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 2384219089Spjd if (error != 0) 2385219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2386168404Spjd 2387168404Spjd /* 2388168404Spjd * Load the bit that tells us to use the new accounting function 2389168404Spjd * (raid-z deflation). If we have an older pool, this will not 2390168404Spjd * be present. 2391168404Spjd */ 2392219089Spjd error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); 2393219089Spjd if (error != 0 && error != ENOENT) 2394219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2395168404Spjd 2396219089Spjd error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 2397219089Spjd &spa->spa_creation_version); 2398219089Spjd if (error != 0 && error != ENOENT) 2399219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2400219089Spjd 2401168404Spjd /* 2402168404Spjd * Load the persistent error log. If we have an older pool, this will 2403168404Spjd * not be present. 
2404168404Spjd */ 2405219089Spjd error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); 2406219089Spjd if (error != 0 && error != ENOENT) 2407219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2408168404Spjd 2409219089Spjd error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 2410219089Spjd &spa->spa_errlog_scrub); 2411219089Spjd if (error != 0 && error != ENOENT) 2412219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2413168404Spjd 2414168404Spjd /* 2415168404Spjd * Load the history object. If we have an older pool, this 2416168404Spjd * will not be present. 2417168404Spjd */ 2418219089Spjd error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); 2419219089Spjd if (error != 0 && error != ENOENT) 2420219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2421168404Spjd 2422168404Spjd /* 2423219089Spjd * If we're assembling the pool from the split-off vdevs of 2424219089Spjd * an existing pool, we don't want to attach the spares & cache 2425219089Spjd * devices. 2426219089Spjd */ 2427219089Spjd 2428219089Spjd /* 2429168404Spjd * Load any hot spares for this pool. 
2430168404Spjd */ 2431219089Spjd error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); 2432219089Spjd if (error != 0 && error != ENOENT) 2433219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2434219089Spjd if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2435185029Spjd ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 2436185029Spjd if (load_nvlist(spa, spa->spa_spares.sav_object, 2437219089Spjd &spa->spa_spares.sav_config) != 0) 2438219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2439168404Spjd 2440185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2441168404Spjd spa_load_spares(spa); 2442185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2443219089Spjd } else if (error == 0) { 2444219089Spjd spa->spa_spares.sav_sync = B_TRUE; 2445168404Spjd } 2446168404Spjd 2447185029Spjd /* 2448185029Spjd * Load any level 2 ARC devices for this pool. 2449185029Spjd */ 2450219089Spjd error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 2451185029Spjd &spa->spa_l2cache.sav_object); 2452219089Spjd if (error != 0 && error != ENOENT) 2453219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2454219089Spjd if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2455185029Spjd ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 2456185029Spjd if (load_nvlist(spa, spa->spa_l2cache.sav_object, 2457219089Spjd &spa->spa_l2cache.sav_config) != 0) 2458219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2459185029Spjd 2460185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2461185029Spjd spa_load_l2cache(spa); 2462185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2463219089Spjd } else if (error == 0) { 2464219089Spjd spa->spa_l2cache.sav_sync = B_TRUE; 2465185029Spjd } 2466185029Spjd 2467219089Spjd spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2468213197Smm 2469219089Spjd error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); 2470219089Spjd if (error && error != ENOENT) 
2471219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2472185029Spjd 2473219089Spjd if (error == 0) { 2474219089Spjd uint64_t autoreplace; 2475185029Spjd 2476219089Spjd spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 2477219089Spjd spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 2478219089Spjd spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 2479219089Spjd spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 2480219089Spjd spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 2481219089Spjd spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, 2482219089Spjd &spa->spa_dedup_ditto); 2483185029Spjd 2484219089Spjd spa->spa_autoreplace = (autoreplace != 0); 2485168404Spjd } 2486168404Spjd 2487168404Spjd /* 2488185029Spjd * If the 'autoreplace' property is set, then post a resource notifying 2489185029Spjd * the ZFS DE that it should not issue any faults for unopenable 2490185029Spjd * devices. We also iterate over the vdevs, and post a sysevent for any 2491185029Spjd * unopenable vdevs so that the normal autoreplace handler can take 2492185029Spjd * over. 2493185029Spjd */ 2494219089Spjd if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { 2495185029Spjd spa_check_removed(spa->spa_root_vdev); 2496219089Spjd /* 2497219089Spjd * For the import case, this is done in spa_import(), because 2498219089Spjd * at this point we're using the spare definitions from 2499219089Spjd * the MOS config, not necessarily from the userland config. 2500219089Spjd */ 2501219089Spjd if (state != SPA_LOAD_IMPORT) { 2502219089Spjd spa_aux_check_removed(&spa->spa_spares); 2503219089Spjd spa_aux_check_removed(&spa->spa_l2cache); 2504219089Spjd } 2505219089Spjd } 2506185029Spjd 2507185029Spjd /* 2508168404Spjd * Load the vdev state for all toplevel vdevs. 2509168404Spjd */ 2510168404Spjd vdev_load(rvd); 2511168404Spjd 2512168404Spjd /* 2513168404Spjd * Propagate the leaf DTLs we just loaded all the way up the tree. 
2514168404Spjd */ 2515185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2516168404Spjd vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 2517185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 2518168404Spjd 2519168404Spjd /* 2520219089Spjd * Load the DDTs (dedup tables). 2521168404Spjd */ 2522219089Spjd error = ddt_load(spa); 2523219089Spjd if (error != 0) 2524219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2525219089Spjd 2526219089Spjd spa_update_dspace(spa); 2527219089Spjd 2528219089Spjd /* 2529219089Spjd * Validate the config, using the MOS config to fill in any 2530219089Spjd * information which might be missing. If we fail to validate 2531219089Spjd * the config then declare the pool unfit for use. If we're 2532219089Spjd * assembling a pool from a split, the log is not transferred 2533219089Spjd * over. 2534219089Spjd */ 2535219089Spjd if (type != SPA_IMPORT_ASSEMBLE) { 2536219089Spjd nvlist_t *nvconfig; 2537219089Spjd 2538219089Spjd if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2539219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2540219089Spjd 2541219089Spjd if (!spa_config_valid(spa, nvconfig)) { 2542219089Spjd nvlist_free(nvconfig); 2543219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 2544219089Spjd ENXIO)); 2545219089Spjd } 2546219089Spjd nvlist_free(nvconfig); 2547219089Spjd 2548219089Spjd /* 2549236884Smm * Now that we've validated the config, check the state of the 2550219089Spjd * root vdev. If it can't be opened, it indicates one or 2551219089Spjd * more toplevel vdevs are faulted. 
2552219089Spjd */ 2553219089Spjd if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2554249195Smm return (SET_ERROR(ENXIO)); 2555219089Spjd 2556219089Spjd if (spa_check_logs(spa)) { 2557219089Spjd *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 2558219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); 2559219089Spjd } 2560168404Spjd } 2561168404Spjd 2562236884Smm if (missing_feat_write) { 2563236884Smm ASSERT(state == SPA_LOAD_TRYIMPORT); 2564236884Smm 2565236884Smm /* 2566236884Smm * At this point, we know that we can open the pool in 2567236884Smm * read-only mode but not read-write mode. We now have enough 2568236884Smm * information and can return to userland. 2569236884Smm */ 2570236884Smm return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); 2571236884Smm } 2572236884Smm 2573219089Spjd /* 2574219089Spjd * We've successfully opened the pool, verify that we're ready 2575219089Spjd * to start pushing transactions. 2576219089Spjd */ 2577219089Spjd if (state != SPA_LOAD_TRYIMPORT) { 2578219089Spjd if (error = spa_load_verify(spa)) 2579219089Spjd return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2580219089Spjd error)); 2581219089Spjd } 2582219089Spjd 2583219089Spjd if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || 2584219089Spjd spa->spa_load_max_txg == UINT64_MAX)) { 2585168404Spjd dmu_tx_t *tx; 2586168404Spjd int need_update = B_FALSE; 2587168404Spjd 2588209962Smm ASSERT(state != SPA_LOAD_TRYIMPORT); 2589209962Smm 2590168404Spjd /* 2591168404Spjd * Claim log blocks that haven't been committed yet. 2592168404Spjd * This must all happen in a single txg. 2593219089Spjd * Note: spa_claim_max_txg is updated by spa_claim_notify(), 2594219089Spjd * invoked from zil_claim_log_block()'s i/o done callback. 2595219089Spjd * Price of rollback is that we abandon the log. 
2596168404Spjd */ 2597219089Spjd spa->spa_claiming = B_TRUE; 2598219089Spjd 2599168404Spjd tx = dmu_tx_create_assigned(spa_get_dsl(spa), 2600168404Spjd spa_first_txg(spa)); 2601185029Spjd (void) dmu_objset_find(spa_name(spa), 2602168404Spjd zil_claim, tx, DS_FIND_CHILDREN); 2603168404Spjd dmu_tx_commit(tx); 2604168404Spjd 2605219089Spjd spa->spa_claiming = B_FALSE; 2606219089Spjd 2607219089Spjd spa_set_log_state(spa, SPA_LOG_GOOD); 2608168404Spjd spa->spa_sync_on = B_TRUE; 2609168404Spjd txg_sync_start(spa->spa_dsl_pool); 2610168404Spjd 2611168404Spjd /* 2612219089Spjd * Wait for all claims to sync. We sync up to the highest 2613219089Spjd * claimed log block birth time so that claimed log blocks 2614219089Spjd * don't appear to be from the future. spa_claim_max_txg 2615219089Spjd * will have been set for us by either zil_check_log_chain() 2616219089Spjd * (invoked from spa_check_logs()) or zil_claim() above. 2617168404Spjd */ 2618219089Spjd txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 2619168404Spjd 2620168404Spjd /* 2621168404Spjd * If the config cache is stale, or we have uninitialized 2622168404Spjd * metaslabs (see spa_vdev_add()), then update the config. 2623209962Smm * 2624219089Spjd * If this is a verbatim import, trust the current 2625209962Smm * in-core spa_config and update the disk labels. 2626168404Spjd */ 2627168404Spjd if (config_cache_txg != spa->spa_config_txg || 2628219089Spjd state == SPA_LOAD_IMPORT || 2629219089Spjd state == SPA_LOAD_RECOVER || 2630219089Spjd (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) 2631168404Spjd need_update = B_TRUE; 2632168404Spjd 2633209962Smm for (int c = 0; c < rvd->vdev_children; c++) 2634168404Spjd if (rvd->vdev_child[c]->vdev_ms_array == 0) 2635168404Spjd need_update = B_TRUE; 2636168404Spjd 2637168404Spjd /* 2638168404Spjd * Update the config cache asychronously in case we're the 2639168404Spjd * root pool, in which case the config cache isn't writable yet. 
2640168404Spjd */ 2641168404Spjd if (need_update) 2642168404Spjd spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 2643208683Spjd 2644208683Spjd /* 2645208683Spjd * Check all DTLs to see if anything needs resilvering. 2646208683Spjd */ 2647219089Spjd if (!dsl_scan_resilvering(spa->spa_dsl_pool) && 2648219089Spjd vdev_resilver_needed(rvd, NULL, NULL)) 2649208683Spjd spa_async_request(spa, SPA_ASYNC_RESILVER); 2650219089Spjd 2651219089Spjd /* 2652248571Smm * Log the fact that we booted up (so that we can detect if 2653248571Smm * we rebooted in the middle of an operation). 2654248571Smm */ 2655248571Smm spa_history_log_version(spa, "open"); 2656248571Smm 2657248571Smm /* 2658219089Spjd * Delete any inconsistent datasets. 2659219089Spjd */ 2660219089Spjd (void) dmu_objset_find(spa_name(spa), 2661219089Spjd dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 2662219089Spjd 2663219089Spjd /* 2664219089Spjd * Clean up any stale temporary dataset userrefs. 2665219089Spjd */ 2666219089Spjd dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 2667168404Spjd } 2668168404Spjd 2669219089Spjd return (0); 2670219089Spjd} 2671168404Spjd 2672219089Spjdstatic int 2673219089Spjdspa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) 2674219089Spjd{ 2675219089Spjd int mode = spa->spa_mode; 2676219089Spjd 2677219089Spjd spa_unload(spa); 2678219089Spjd spa_deactivate(spa); 2679219089Spjd 2680219089Spjd spa->spa_load_max_txg--; 2681219089Spjd 2682219089Spjd spa_activate(spa, mode); 2683219089Spjd spa_async_suspend(spa); 2684219089Spjd 2685219089Spjd return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); 2686168404Spjd} 2687168404Spjd 2688236884Smm/* 2689236884Smm * If spa_load() fails this function will try loading prior txg's. If 2690236884Smm * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool 2691236884Smm * will be rewound to that txg. 
If 'state' is not SPA_LOAD_RECOVER this 2692236884Smm * function will not rewind the pool and will return the same error as 2693236884Smm * spa_load(). 2694236884Smm */ 2695219089Spjdstatic int 2696219089Spjdspa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, 2697219089Spjd uint64_t max_request, int rewind_flags) 2698219089Spjd{ 2699236884Smm nvlist_t *loadinfo = NULL; 2700219089Spjd nvlist_t *config = NULL; 2701219089Spjd int load_error, rewind_error; 2702219089Spjd uint64_t safe_rewind_txg; 2703219089Spjd uint64_t min_txg; 2704219089Spjd 2705219089Spjd if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 2706219089Spjd spa->spa_load_max_txg = spa->spa_load_txg; 2707219089Spjd spa_set_log_state(spa, SPA_LOG_CLEAR); 2708219089Spjd } else { 2709219089Spjd spa->spa_load_max_txg = max_request; 2710219089Spjd } 2711219089Spjd 2712219089Spjd load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, 2713219089Spjd mosconfig); 2714219089Spjd if (load_error == 0) 2715219089Spjd return (0); 2716219089Spjd 2717219089Spjd if (spa->spa_root_vdev != NULL) 2718219089Spjd config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2719219089Spjd 2720219089Spjd spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 2721219089Spjd spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 2722219089Spjd 2723219089Spjd if (rewind_flags & ZPOOL_NEVER_REWIND) { 2724219089Spjd nvlist_free(config); 2725219089Spjd return (load_error); 2726219089Spjd } 2727219089Spjd 2728236884Smm if (state == SPA_LOAD_RECOVER) { 2729236884Smm /* Price of rolling back is discarding txgs, including log */ 2730219089Spjd spa_set_log_state(spa, SPA_LOG_CLEAR); 2731236884Smm } else { 2732236884Smm /* 2733236884Smm * If we aren't rolling back save the load info from our first 2734236884Smm * import attempt so that we can restore it after attempting 2735236884Smm * to rewind. 
2736236884Smm */ 2737236884Smm loadinfo = spa->spa_load_info; 2738236884Smm spa->spa_load_info = fnvlist_alloc(); 2739236884Smm } 2740219089Spjd 2741219089Spjd spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 2742219089Spjd safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 2743219089Spjd min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 2744219089Spjd TXG_INITIAL : safe_rewind_txg; 2745219089Spjd 2746219089Spjd /* 2747219089Spjd * Continue as long as we're finding errors, we're still within 2748219089Spjd * the acceptable rewind range, and we're still finding uberblocks 2749219089Spjd */ 2750219089Spjd while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 2751219089Spjd spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 2752219089Spjd if (spa->spa_load_max_txg < safe_rewind_txg) 2753219089Spjd spa->spa_extreme_rewind = B_TRUE; 2754219089Spjd rewind_error = spa_load_retry(spa, state, mosconfig); 2755219089Spjd } 2756219089Spjd 2757219089Spjd spa->spa_extreme_rewind = B_FALSE; 2758219089Spjd spa->spa_load_max_txg = UINT64_MAX; 2759219089Spjd 2760219089Spjd if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 2761219089Spjd spa_config_set(spa, config); 2762219089Spjd 2763236884Smm if (state == SPA_LOAD_RECOVER) { 2764236884Smm ASSERT3P(loadinfo, ==, NULL); 2765236884Smm return (rewind_error); 2766236884Smm } else { 2767236884Smm /* Store the rewind info as part of the initial load info */ 2768236884Smm fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, 2769236884Smm spa->spa_load_info); 2770236884Smm 2771236884Smm /* Restore the initial load info */ 2772236884Smm fnvlist_free(spa->spa_load_info); 2773236884Smm spa->spa_load_info = loadinfo; 2774236884Smm 2775236884Smm return (load_error); 2776236884Smm } 2777219089Spjd} 2778219089Spjd 2779168404Spjd/* 2780168404Spjd * Pool Open/Import 2781168404Spjd * 2782168404Spjd * The import case is identical to an open except that the configuration is sent 2783168404Spjd * down from userland, 
instead of grabbed from the configuration cache. For the 2784168404Spjd * case of an open, the pool configuration will exist in the 2785185029Spjd * POOL_STATE_UNINITIALIZED state. 2786168404Spjd * 2787168404Spjd * The stats information (gen/count/ustats) is used to gather vdev statistics at 2788168404Spjd * the same time open the pool, without having to keep around the spa_t in some 2789168404Spjd * ambiguous state. 2790168404Spjd */ 2791168404Spjdstatic int 2792219089Spjdspa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, 2793219089Spjd nvlist_t **config) 2794168404Spjd{ 2795168404Spjd spa_t *spa; 2796219089Spjd spa_load_state_t state = SPA_LOAD_OPEN; 2797168404Spjd int error; 2798168404Spjd int locked = B_FALSE; 2799219089Spjd int firstopen = B_FALSE; 2800168404Spjd 2801168404Spjd *spapp = NULL; 2802168404Spjd 2803168404Spjd /* 2804168404Spjd * As disgusting as this is, we need to support recursive calls to this 2805168404Spjd * function because dsl_dir_open() is called during spa_load(), and ends 2806168404Spjd * up calling spa_open() again. The real fix is to figure out how to 2807168404Spjd * avoid dsl_dir_open() calling this in the first place. 2808168404Spjd */ 2809168404Spjd if (mutex_owner(&spa_namespace_lock) != curthread) { 2810168404Spjd mutex_enter(&spa_namespace_lock); 2811168404Spjd locked = B_TRUE; 2812168404Spjd } 2813168404Spjd 2814168404Spjd if ((spa = spa_lookup(pool)) == NULL) { 2815168404Spjd if (locked) 2816168404Spjd mutex_exit(&spa_namespace_lock); 2817249195Smm return (SET_ERROR(ENOENT)); 2818168404Spjd } 2819219089Spjd 2820168404Spjd if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 2821219089Spjd zpool_rewind_policy_t policy; 2822168404Spjd 2823219089Spjd firstopen = B_TRUE; 2824219089Spjd 2825219089Spjd zpool_get_rewind_policy(nvpolicy ? 
nvpolicy : spa->spa_config, 2826219089Spjd &policy); 2827219089Spjd if (policy.zrp_request & ZPOOL_DO_REWIND) 2828219089Spjd state = SPA_LOAD_RECOVER; 2829219089Spjd 2830209962Smm spa_activate(spa, spa_mode_global); 2831168404Spjd 2832219089Spjd if (state != SPA_LOAD_RECOVER) 2833219089Spjd spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 2834168404Spjd 2835219089Spjd error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, 2836219089Spjd policy.zrp_request); 2837219089Spjd 2838168404Spjd if (error == EBADF) { 2839168404Spjd /* 2840168404Spjd * If vdev_validate() returns failure (indicated by 2841168404Spjd * EBADF), it indicates that one of the vdevs indicates 2842168404Spjd * that the pool has been exported or destroyed. If 2843168404Spjd * this is the case, the config cache is out of sync and 2844168404Spjd * we should remove the pool from the namespace. 2845168404Spjd */ 2846168404Spjd spa_unload(spa); 2847168404Spjd spa_deactivate(spa); 2848185029Spjd spa_config_sync(spa, B_TRUE, B_TRUE); 2849168404Spjd spa_remove(spa); 2850168404Spjd if (locked) 2851168404Spjd mutex_exit(&spa_namespace_lock); 2852249195Smm return (SET_ERROR(ENOENT)); 2853168404Spjd } 2854168404Spjd 2855168404Spjd if (error) { 2856168404Spjd /* 2857168404Spjd * We can't open the pool, but we still have useful 2858168404Spjd * information: the state of each vdev after the 2859168404Spjd * attempted vdev_open(). Return this to the user. 
2860168404Spjd */ 2861219089Spjd if (config != NULL && spa->spa_config) { 2862219089Spjd VERIFY(nvlist_dup(spa->spa_config, config, 2863219089Spjd KM_SLEEP) == 0); 2864219089Spjd VERIFY(nvlist_add_nvlist(*config, 2865219089Spjd ZPOOL_CONFIG_LOAD_INFO, 2866219089Spjd spa->spa_load_info) == 0); 2867219089Spjd } 2868168404Spjd spa_unload(spa); 2869168404Spjd spa_deactivate(spa); 2870219089Spjd spa->spa_last_open_failed = error; 2871168404Spjd if (locked) 2872168404Spjd mutex_exit(&spa_namespace_lock); 2873168404Spjd *spapp = NULL; 2874168404Spjd return (error); 2875168404Spjd } 2876168404Spjd } 2877168404Spjd 2878168404Spjd spa_open_ref(spa, tag); 2879185029Spjd 2880219089Spjd if (config != NULL) 2881219089Spjd *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2882219089Spjd 2883219089Spjd /* 2884219089Spjd * If we've recovered the pool, pass back any information we 2885219089Spjd * gathered while doing the load. 2886219089Spjd */ 2887219089Spjd if (state == SPA_LOAD_RECOVER) { 2888219089Spjd VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 2889219089Spjd spa->spa_load_info) == 0); 2890219089Spjd } 2891219089Spjd 2892219089Spjd if (locked) { 2893219089Spjd spa->spa_last_open_failed = 0; 2894219089Spjd spa->spa_last_ubsync_txg = 0; 2895219089Spjd spa->spa_load_txg = 0; 2896168404Spjd mutex_exit(&spa_namespace_lock); 2897219089Spjd#ifdef __FreeBSD__ 2898219089Spjd#ifdef _KERNEL 2899219089Spjd if (firstopen) 2900249047Savg zvol_create_minors(spa->spa_name); 2901219089Spjd#endif 2902219089Spjd#endif 2903219089Spjd } 2904168404Spjd 2905168404Spjd *spapp = spa; 2906168404Spjd 2907168404Spjd return (0); 2908168404Spjd} 2909168404Spjd 2910168404Spjdint 2911219089Spjdspa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, 2912219089Spjd nvlist_t **config) 2913219089Spjd{ 2914219089Spjd return (spa_open_common(name, spapp, tag, policy, config)); 2915219089Spjd} 2916219089Spjd 2917219089Spjdint 2918168404Spjdspa_open(const char *name, spa_t 
**spapp, void *tag) 2919168404Spjd{ 2920219089Spjd return (spa_open_common(name, spapp, tag, NULL, NULL)); 2921168404Spjd} 2922168404Spjd 2923168404Spjd/* 2924168404Spjd * Lookup the given spa_t, incrementing the inject count in the process, 2925168404Spjd * preventing it from being exported or destroyed. 2926168404Spjd */ 2927168404Spjdspa_t * 2928168404Spjdspa_inject_addref(char *name) 2929168404Spjd{ 2930168404Spjd spa_t *spa; 2931168404Spjd 2932168404Spjd mutex_enter(&spa_namespace_lock); 2933168404Spjd if ((spa = spa_lookup(name)) == NULL) { 2934168404Spjd mutex_exit(&spa_namespace_lock); 2935168404Spjd return (NULL); 2936168404Spjd } 2937168404Spjd spa->spa_inject_ref++; 2938168404Spjd mutex_exit(&spa_namespace_lock); 2939168404Spjd 2940168404Spjd return (spa); 2941168404Spjd} 2942168404Spjd 2943168404Spjdvoid 2944168404Spjdspa_inject_delref(spa_t *spa) 2945168404Spjd{ 2946168404Spjd mutex_enter(&spa_namespace_lock); 2947168404Spjd spa->spa_inject_ref--; 2948168404Spjd mutex_exit(&spa_namespace_lock); 2949168404Spjd} 2950168404Spjd 2951185029Spjd/* 2952185029Spjd * Add spares device information to the nvlist. 
2953185029Spjd */ 2954168404Spjdstatic void 2955168404Spjdspa_add_spares(spa_t *spa, nvlist_t *config) 2956168404Spjd{ 2957168404Spjd nvlist_t **spares; 2958168404Spjd uint_t i, nspares; 2959168404Spjd nvlist_t *nvroot; 2960168404Spjd uint64_t guid; 2961168404Spjd vdev_stat_t *vs; 2962168404Spjd uint_t vsc; 2963168404Spjd uint64_t pool; 2964168404Spjd 2965209962Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 2966209962Smm 2967185029Spjd if (spa->spa_spares.sav_count == 0) 2968168404Spjd return; 2969168404Spjd 2970168404Spjd VERIFY(nvlist_lookup_nvlist(config, 2971168404Spjd ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 2972185029Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 2973168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2974168404Spjd if (nspares != 0) { 2975168404Spjd VERIFY(nvlist_add_nvlist_array(nvroot, 2976168404Spjd ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2977168404Spjd VERIFY(nvlist_lookup_nvlist_array(nvroot, 2978168404Spjd ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2979168404Spjd 2980168404Spjd /* 2981168404Spjd * Go through and find any spares which have since been 2982168404Spjd * repurposed as an active spare. If this is the case, update 2983168404Spjd * their status appropriately. 2984168404Spjd */ 2985168404Spjd for (i = 0; i < nspares; i++) { 2986168404Spjd VERIFY(nvlist_lookup_uint64(spares[i], 2987168404Spjd ZPOOL_CONFIG_GUID, &guid) == 0); 2988185029Spjd if (spa_spare_exists(guid, &pool, NULL) && 2989185029Spjd pool != 0ULL) { 2990168404Spjd VERIFY(nvlist_lookup_uint64_array( 2991219089Spjd spares[i], ZPOOL_CONFIG_VDEV_STATS, 2992168404Spjd (uint64_t **)&vs, &vsc) == 0); 2993168404Spjd vs->vs_state = VDEV_STATE_CANT_OPEN; 2994168404Spjd vs->vs_aux = VDEV_AUX_SPARED; 2995168404Spjd } 2996168404Spjd } 2997168404Spjd } 2998168404Spjd} 2999168404Spjd 3000185029Spjd/* 3001185029Spjd * Add l2cache device information to the nvlist, including vdev stats. 
3002185029Spjd */ 3003185029Spjdstatic void 3004185029Spjdspa_add_l2cache(spa_t *spa, nvlist_t *config) 3005185029Spjd{ 3006185029Spjd nvlist_t **l2cache; 3007185029Spjd uint_t i, j, nl2cache; 3008185029Spjd nvlist_t *nvroot; 3009185029Spjd uint64_t guid; 3010185029Spjd vdev_t *vd; 3011185029Spjd vdev_stat_t *vs; 3012185029Spjd uint_t vsc; 3013185029Spjd 3014209962Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3015209962Smm 3016185029Spjd if (spa->spa_l2cache.sav_count == 0) 3017185029Spjd return; 3018185029Spjd 3019185029Spjd VERIFY(nvlist_lookup_nvlist(config, 3020185029Spjd ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 3021185029Spjd VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 3022185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3023185029Spjd if (nl2cache != 0) { 3024185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, 3025185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3026185029Spjd VERIFY(nvlist_lookup_nvlist_array(nvroot, 3027185029Spjd ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 3028185029Spjd 3029185029Spjd /* 3030185029Spjd * Update level 2 cache device stats. 
3031185029Spjd */ 3032185029Spjd 3033185029Spjd for (i = 0; i < nl2cache; i++) { 3034185029Spjd VERIFY(nvlist_lookup_uint64(l2cache[i], 3035185029Spjd ZPOOL_CONFIG_GUID, &guid) == 0); 3036185029Spjd 3037185029Spjd vd = NULL; 3038185029Spjd for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 3039185029Spjd if (guid == 3040185029Spjd spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 3041185029Spjd vd = spa->spa_l2cache.sav_vdevs[j]; 3042185029Spjd break; 3043185029Spjd } 3044185029Spjd } 3045185029Spjd ASSERT(vd != NULL); 3046185029Spjd 3047185029Spjd VERIFY(nvlist_lookup_uint64_array(l2cache[i], 3048219089Spjd ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) 3049219089Spjd == 0); 3050185029Spjd vdev_get_stats(vd, vs); 3051185029Spjd } 3052185029Spjd } 3053185029Spjd} 3054185029Spjd 3055236884Smmstatic void 3056236884Smmspa_add_feature_stats(spa_t *spa, nvlist_t *config) 3057236884Smm{ 3058236884Smm nvlist_t *features; 3059236884Smm zap_cursor_t zc; 3060236884Smm zap_attribute_t za; 3061236884Smm 3062236884Smm ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3063236884Smm VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3064236884Smm 3065253993Smav /* We may be unable to read features if pool is suspended. 
*/ 3066253993Smav if (spa_suspended(spa)) 3067253993Smav goto out; 3068253993Smav 3069236884Smm if (spa->spa_feat_for_read_obj != 0) { 3070236884Smm for (zap_cursor_init(&zc, spa->spa_meta_objset, 3071236884Smm spa->spa_feat_for_read_obj); 3072236884Smm zap_cursor_retrieve(&zc, &za) == 0; 3073236884Smm zap_cursor_advance(&zc)) { 3074236884Smm ASSERT(za.za_integer_length == sizeof (uint64_t) && 3075236884Smm za.za_num_integers == 1); 3076236884Smm VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3077236884Smm za.za_first_integer)); 3078236884Smm } 3079236884Smm zap_cursor_fini(&zc); 3080236884Smm } 3081236884Smm 3082236884Smm if (spa->spa_feat_for_write_obj != 0) { 3083236884Smm for (zap_cursor_init(&zc, spa->spa_meta_objset, 3084236884Smm spa->spa_feat_for_write_obj); 3085236884Smm zap_cursor_retrieve(&zc, &za) == 0; 3086236884Smm zap_cursor_advance(&zc)) { 3087236884Smm ASSERT(za.za_integer_length == sizeof (uint64_t) && 3088236884Smm za.za_num_integers == 1); 3089236884Smm VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, 3090236884Smm za.za_first_integer)); 3091236884Smm } 3092236884Smm zap_cursor_fini(&zc); 3093236884Smm } 3094236884Smm 3095253993Smavout: 3096236884Smm VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, 3097236884Smm features) == 0); 3098236884Smm nvlist_free(features); 3099236884Smm} 3100236884Smm 3101168404Spjdint 3102236884Smmspa_get_stats(const char *name, nvlist_t **config, 3103236884Smm char *altroot, size_t buflen) 3104168404Spjd{ 3105168404Spjd int error; 3106168404Spjd spa_t *spa; 3107168404Spjd 3108168404Spjd *config = NULL; 3109219089Spjd error = spa_open_common(name, &spa, FTAG, NULL, config); 3110168404Spjd 3111209962Smm if (spa != NULL) { 3112209962Smm /* 3113209962Smm * This still leaves a window of inconsistency where the spares 3114209962Smm * or l2cache devices could change and the config would be 3115209962Smm * self-inconsistent. 
3116209962Smm */ 3117209962Smm spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3118168404Spjd 3119209962Smm if (*config != NULL) { 3120219089Spjd uint64_t loadtimes[2]; 3121219089Spjd 3122219089Spjd loadtimes[0] = spa->spa_loaded_ts.tv_sec; 3123219089Spjd loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 3124219089Spjd VERIFY(nvlist_add_uint64_array(*config, 3125219089Spjd ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); 3126219089Spjd 3127185029Spjd VERIFY(nvlist_add_uint64(*config, 3128209962Smm ZPOOL_CONFIG_ERRCOUNT, 3129209962Smm spa_get_errlog_size(spa)) == 0); 3130185029Spjd 3131209962Smm if (spa_suspended(spa)) 3132209962Smm VERIFY(nvlist_add_uint64(*config, 3133209962Smm ZPOOL_CONFIG_SUSPENDED, 3134209962Smm spa->spa_failmode) == 0); 3135209962Smm 3136209962Smm spa_add_spares(spa, *config); 3137209962Smm spa_add_l2cache(spa, *config); 3138236884Smm spa_add_feature_stats(spa, *config); 3139209962Smm } 3140168404Spjd } 3141168404Spjd 3142168404Spjd /* 3143168404Spjd * We want to get the alternate root even for faulted pools, so we cheat 3144168404Spjd * and call spa_lookup() directly. 3145168404Spjd */ 3146168404Spjd if (altroot) { 3147168404Spjd if (spa == NULL) { 3148168404Spjd mutex_enter(&spa_namespace_lock); 3149168404Spjd spa = spa_lookup(name); 3150168404Spjd if (spa) 3151168404Spjd spa_altroot(spa, altroot, buflen); 3152168404Spjd else 3153168404Spjd altroot[0] = '\0'; 3154168404Spjd spa = NULL; 3155168404Spjd mutex_exit(&spa_namespace_lock); 3156168404Spjd } else { 3157168404Spjd spa_altroot(spa, altroot, buflen); 3158168404Spjd } 3159168404Spjd } 3160168404Spjd 3161209962Smm if (spa != NULL) { 3162209962Smm spa_config_exit(spa, SCL_CONFIG, FTAG); 3163168404Spjd spa_close(spa, FTAG); 3164209962Smm } 3165168404Spjd 3166168404Spjd return (error); 3167168404Spjd} 3168168404Spjd 3169168404Spjd/* 3170185029Spjd * Validate that the auxiliary device array is well formed. We must have an 3171185029Spjd * array of nvlists, each which describes a valid leaf vdev. 
If this is an 3172185029Spjd * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 3173185029Spjd * specified, as long as they are well-formed. 3174168404Spjd */ 3175168404Spjdstatic int 3176185029Spjdspa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 3177185029Spjd spa_aux_vdev_t *sav, const char *config, uint64_t version, 3178185029Spjd vdev_labeltype_t label) 3179168404Spjd{ 3180185029Spjd nvlist_t **dev; 3181185029Spjd uint_t i, ndev; 3182168404Spjd vdev_t *vd; 3183168404Spjd int error; 3184168404Spjd 3185185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3186185029Spjd 3187168404Spjd /* 3188185029Spjd * It's acceptable to have no devs specified. 3189168404Spjd */ 3190185029Spjd if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 3191168404Spjd return (0); 3192168404Spjd 3193185029Spjd if (ndev == 0) 3194249195Smm return (SET_ERROR(EINVAL)); 3195168404Spjd 3196168404Spjd /* 3197185029Spjd * Make sure the pool is formatted with a version that supports this 3198185029Spjd * device type. 3199168404Spjd */ 3200185029Spjd if (spa_version(spa) < version) 3201249195Smm return (SET_ERROR(ENOTSUP)); 3202168404Spjd 3203168404Spjd /* 3204185029Spjd * Set the pending device list so we correctly handle device in-use 3205168404Spjd * checking. 3206168404Spjd */ 3207185029Spjd sav->sav_pending = dev; 3208185029Spjd sav->sav_npending = ndev; 3209168404Spjd 3210185029Spjd for (i = 0; i < ndev; i++) { 3211185029Spjd if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 3212168404Spjd mode)) != 0) 3213168404Spjd goto out; 3214168404Spjd 3215168404Spjd if (!vd->vdev_ops->vdev_op_leaf) { 3216168404Spjd vdev_free(vd); 3217249195Smm error = SET_ERROR(EINVAL); 3218168404Spjd goto out; 3219168404Spjd } 3220168404Spjd 3221185029Spjd /* 3222185029Spjd * The L2ARC currently only supports disk devices in 3223185029Spjd * kernel context. For user-level testing, we allow it. 
3224185029Spjd */ 3225185029Spjd#ifdef _KERNEL 3226185029Spjd if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && 3227185029Spjd strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { 3228249195Smm error = SET_ERROR(ENOTBLK); 3229230514Smm vdev_free(vd); 3230185029Spjd goto out; 3231185029Spjd } 3232185029Spjd#endif 3233168404Spjd vd->vdev_top = vd; 3234168404Spjd 3235168404Spjd if ((error = vdev_open(vd)) == 0 && 3236185029Spjd (error = vdev_label_init(vd, crtxg, label)) == 0) { 3237185029Spjd VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 3238168404Spjd vd->vdev_guid) == 0); 3239168404Spjd } 3240168404Spjd 3241168404Spjd vdev_free(vd); 3242168404Spjd 3243185029Spjd if (error && 3244185029Spjd (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 3245168404Spjd goto out; 3246168404Spjd else 3247168404Spjd error = 0; 3248168404Spjd } 3249168404Spjd 3250168404Spjdout: 3251185029Spjd sav->sav_pending = NULL; 3252185029Spjd sav->sav_npending = 0; 3253168404Spjd return (error); 3254168404Spjd} 3255168404Spjd 3256185029Spjdstatic int 3257185029Spjdspa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 3258185029Spjd{ 3259185029Spjd int error; 3260185029Spjd 3261185029Spjd ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3262185029Spjd 3263185029Spjd if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 3264185029Spjd &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 3265185029Spjd VDEV_LABEL_SPARE)) != 0) { 3266185029Spjd return (error); 3267185029Spjd } 3268185029Spjd 3269185029Spjd return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 3270185029Spjd &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 3271185029Spjd VDEV_LABEL_L2CACHE)); 3272185029Spjd} 3273185029Spjd 3274185029Spjdstatic void 3275185029Spjdspa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 3276185029Spjd const char *config) 3277185029Spjd{ 3278185029Spjd int i; 3279185029Spjd 3280185029Spjd if (sav->sav_config != 
NULL) { 3281185029Spjd nvlist_t **olddevs; 3282185029Spjd uint_t oldndevs; 3283185029Spjd nvlist_t **newdevs; 3284185029Spjd 3285185029Spjd /* 3286185029Spjd * Generate new dev list by concatentating with the 3287185029Spjd * current dev list. 3288185029Spjd */ 3289185029Spjd VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, 3290185029Spjd &olddevs, &oldndevs) == 0); 3291185029Spjd 3292185029Spjd newdevs = kmem_alloc(sizeof (void *) * 3293185029Spjd (ndevs + oldndevs), KM_SLEEP); 3294185029Spjd for (i = 0; i < oldndevs; i++) 3295185029Spjd VERIFY(nvlist_dup(olddevs[i], &newdevs[i], 3296185029Spjd KM_SLEEP) == 0); 3297185029Spjd for (i = 0; i < ndevs; i++) 3298185029Spjd VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], 3299185029Spjd KM_SLEEP) == 0); 3300185029Spjd 3301185029Spjd VERIFY(nvlist_remove(sav->sav_config, config, 3302185029Spjd DATA_TYPE_NVLIST_ARRAY) == 0); 3303185029Spjd 3304185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config, 3305185029Spjd config, newdevs, ndevs + oldndevs) == 0); 3306185029Spjd for (i = 0; i < oldndevs + ndevs; i++) 3307185029Spjd nvlist_free(newdevs[i]); 3308185029Spjd kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 3309185029Spjd } else { 3310185029Spjd /* 3311185029Spjd * Generate a new dev list. 
3312185029Spjd */ 3313185029Spjd VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 3314185029Spjd KM_SLEEP) == 0); 3315185029Spjd VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 3316185029Spjd devs, ndevs) == 0); 3317185029Spjd } 3318185029Spjd} 3319185029Spjd 3320168404Spjd/* 3321185029Spjd * Stop and drop level 2 ARC devices 3322185029Spjd */ 3323185029Spjdvoid 3324185029Spjdspa_l2cache_drop(spa_t *spa) 3325185029Spjd{ 3326185029Spjd vdev_t *vd; 3327185029Spjd int i; 3328185029Spjd spa_aux_vdev_t *sav = &spa->spa_l2cache; 3329185029Spjd 3330185029Spjd for (i = 0; i < sav->sav_count; i++) { 3331185029Spjd uint64_t pool; 3332185029Spjd 3333185029Spjd vd = sav->sav_vdevs[i]; 3334185029Spjd ASSERT(vd != NULL); 3335185029Spjd 3336209962Smm if (spa_l2cache_exists(vd->vdev_guid, &pool) && 3337209962Smm pool != 0ULL && l2arc_vdev_present(vd)) 3338185029Spjd l2arc_remove_vdev(vd); 3339185029Spjd } 3340185029Spjd} 3341185029Spjd 3342185029Spjd/* 3343168404Spjd * Pool Creation 3344168404Spjd */ 3345168404Spjdint 3346185029Spjdspa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 3347248571Smm nvlist_t *zplprops) 3348168404Spjd{ 3349168404Spjd spa_t *spa; 3350185029Spjd char *altroot = NULL; 3351168404Spjd vdev_t *rvd; 3352168404Spjd dsl_pool_t *dp; 3353168404Spjd dmu_tx_t *tx; 3354219089Spjd int error = 0; 3355168404Spjd uint64_t txg = TXG_INITIAL; 3356185029Spjd nvlist_t **spares, **l2cache; 3357185029Spjd uint_t nspares, nl2cache; 3358219089Spjd uint64_t version, obj; 3359236884Smm boolean_t has_features; 3360168404Spjd 3361168404Spjd /* 3362168404Spjd * If this pool already exists, return failure. 3363168404Spjd */ 3364168404Spjd mutex_enter(&spa_namespace_lock); 3365168404Spjd if (spa_lookup(pool) != NULL) { 3366168404Spjd mutex_exit(&spa_namespace_lock); 3367249195Smm return (SET_ERROR(EEXIST)); 3368168404Spjd } 3369168404Spjd 3370168404Spjd /* 3371168404Spjd * Allocate a new spa_t structure. 
3372168404Spjd */ 3373185029Spjd (void) nvlist_lookup_string(props, 3374185029Spjd zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 3375219089Spjd spa = spa_add(pool, NULL, altroot); 3376209962Smm spa_activate(spa, spa_mode_global); 3377168404Spjd 3378185029Spjd if (props && (error = spa_prop_validate(spa, props))) { 3379185029Spjd spa_deactivate(spa); 3380185029Spjd spa_remove(spa); 3381185029Spjd mutex_exit(&spa_namespace_lock); 3382185029Spjd return (error); 3383185029Spjd } 3384185029Spjd 3385236884Smm has_features = B_FALSE; 3386236884Smm for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); 3387236884Smm elem != NULL; elem = nvlist_next_nvpair(props, elem)) { 3388236884Smm if (zpool_prop_feature(nvpair_name(elem))) 3389236884Smm has_features = B_TRUE; 3390236884Smm } 3391236884Smm 3392236884Smm if (has_features || nvlist_lookup_uint64(props, 3393236884Smm zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { 3394185029Spjd version = SPA_VERSION; 3395236884Smm } 3396236884Smm ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 3397219089Spjd 3398219089Spjd spa->spa_first_txg = txg; 3399219089Spjd spa->spa_uberblock.ub_txg = txg - 1; 3400185029Spjd spa->spa_uberblock.ub_version = version; 3401168404Spjd spa->spa_ubsync = spa->spa_uberblock; 3402168404Spjd 3403168404Spjd /* 3404209962Smm * Create "The Godfather" zio to hold all async IOs 3405209962Smm */ 3406209962Smm spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 3407209962Smm ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 3408209962Smm 3409209962Smm /* 3410168404Spjd * Create the root vdev. 
3411168404Spjd */ 3412185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3413168404Spjd 3414168404Spjd error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 3415168404Spjd 3416168404Spjd ASSERT(error != 0 || rvd != NULL); 3417168404Spjd ASSERT(error != 0 || spa->spa_root_vdev == rvd); 3418168404Spjd 3419185029Spjd if (error == 0 && !zfs_allocatable_devs(nvroot)) 3420249195Smm error = SET_ERROR(EINVAL); 3421168404Spjd 3422168404Spjd if (error == 0 && 3423168404Spjd (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 3424185029Spjd (error = spa_validate_aux(spa, nvroot, txg, 3425168404Spjd VDEV_ALLOC_ADD)) == 0) { 3426219089Spjd for (int c = 0; c < rvd->vdev_children; c++) { 3427254591Sgibbs vdev_ashift_optimize(rvd->vdev_child[c]); 3428219089Spjd vdev_metaslab_set_size(rvd->vdev_child[c]); 3429219089Spjd vdev_expand(rvd->vdev_child[c], txg); 3430219089Spjd } 3431168404Spjd } 3432168404Spjd 3433185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3434168404Spjd 3435168404Spjd if (error != 0) { 3436168404Spjd spa_unload(spa); 3437168404Spjd spa_deactivate(spa); 3438168404Spjd spa_remove(spa); 3439168404Spjd mutex_exit(&spa_namespace_lock); 3440168404Spjd return (error); 3441168404Spjd } 3442168404Spjd 3443168404Spjd /* 3444168404Spjd * Get the list of spares, if specified. 
3445168404Spjd */ 3446168404Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 3447168404Spjd &spares, &nspares) == 0) { 3448185029Spjd VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 3449168404Spjd KM_SLEEP) == 0); 3450185029Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 3451168404Spjd ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3452185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3453168404Spjd spa_load_spares(spa); 3454185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3455185029Spjd spa->spa_spares.sav_sync = B_TRUE; 3456168404Spjd } 3457168404Spjd 3458185029Spjd /* 3459185029Spjd * Get the list of level 2 cache devices, if specified. 3460185029Spjd */ 3461185029Spjd if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 3462185029Spjd &l2cache, &nl2cache) == 0) { 3463185029Spjd VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 3464185029Spjd NV_UNIQUE_NAME, KM_SLEEP) == 0); 3465185029Spjd VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 3466185029Spjd ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3467185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3468185029Spjd spa_load_l2cache(spa); 3469185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3470185029Spjd spa->spa_l2cache.sav_sync = B_TRUE; 3471185029Spjd } 3472185029Spjd 3473236884Smm spa->spa_is_initializing = B_TRUE; 3474185029Spjd spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 3475168404Spjd spa->spa_meta_objset = dp->dp_meta_objset; 3476236884Smm spa->spa_is_initializing = B_FALSE; 3477168404Spjd 3478219089Spjd /* 3479219089Spjd * Create DDTs (dedup tables). 3480219089Spjd */ 3481219089Spjd ddt_create(spa); 3482219089Spjd 3483219089Spjd spa_update_dspace(spa); 3484219089Spjd 3485168404Spjd tx = dmu_tx_create_assigned(dp, txg); 3486168404Spjd 3487168404Spjd /* 3488168404Spjd * Create the pool config object. 
3489168404Spjd */ 3490168404Spjd spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 3491185029Spjd DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 3492168404Spjd DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 3493168404Spjd 3494168404Spjd if (zap_add(spa->spa_meta_objset, 3495168404Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 3496168404Spjd sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 3497168404Spjd cmn_err(CE_PANIC, "failed to add pool config"); 3498168404Spjd } 3499168404Spjd 3500236884Smm if (spa_version(spa) >= SPA_VERSION_FEATURES) 3501236884Smm spa_feature_create_zap_objects(spa, tx); 3502236884Smm 3503219089Spjd if (zap_add(spa->spa_meta_objset, 3504219089Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 3505219089Spjd sizeof (uint64_t), 1, &version, tx) != 0) { 3506219089Spjd cmn_err(CE_PANIC, "failed to add pool version"); 3507219089Spjd } 3508219089Spjd 3509185029Spjd /* Newly created pools with the right version are always deflated. */ 3510185029Spjd if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 3511185029Spjd spa->spa_deflate = TRUE; 3512185029Spjd if (zap_add(spa->spa_meta_objset, 3513185029Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 3514185029Spjd sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 3515185029Spjd cmn_err(CE_PANIC, "failed to add deflate"); 3516185029Spjd } 3517168404Spjd } 3518168404Spjd 3519168404Spjd /* 3520219089Spjd * Create the deferred-free bpobj. Turn off compression 3521168404Spjd * because sync-to-convergence takes longer if the blocksize 3522168404Spjd * keeps changing. 
3523168404Spjd */ 3524219089Spjd obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 3525219089Spjd dmu_object_set_compress(spa->spa_meta_objset, obj, 3526168404Spjd ZIO_COMPRESS_OFF, tx); 3527168404Spjd if (zap_add(spa->spa_meta_objset, 3528219089Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 3529219089Spjd sizeof (uint64_t), 1, &obj, tx) != 0) { 3530219089Spjd cmn_err(CE_PANIC, "failed to add bpobj"); 3531168404Spjd } 3532219089Spjd VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 3533219089Spjd spa->spa_meta_objset, obj)); 3534168404Spjd 3535168404Spjd /* 3536168404Spjd * Create the pool's history object. 3537168404Spjd */ 3538185029Spjd if (version >= SPA_VERSION_ZPOOL_HISTORY) 3539185029Spjd spa_history_create_obj(spa, tx); 3540168404Spjd 3541185029Spjd /* 3542185029Spjd * Set pool properties. 3543185029Spjd */ 3544185029Spjd spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 3545185029Spjd spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 3546185029Spjd spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 3547219089Spjd spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 3548219089Spjd 3549209962Smm if (props != NULL) { 3550209962Smm spa_configfile_set(spa, props, B_FALSE); 3551248571Smm spa_sync_props(props, tx); 3552209962Smm } 3553185029Spjd 3554168404Spjd dmu_tx_commit(tx); 3555168404Spjd 3556168404Spjd spa->spa_sync_on = B_TRUE; 3557168404Spjd txg_sync_start(spa->spa_dsl_pool); 3558168404Spjd 3559168404Spjd /* 3560168404Spjd * We explicitly wait for the first transaction to complete so that our 3561168404Spjd * bean counters are appropriately updated. 
3562168404Spjd */ 3563168404Spjd txg_wait_synced(spa->spa_dsl_pool, txg); 3564168404Spjd 3565185029Spjd spa_config_sync(spa, B_FALSE, B_TRUE); 3566168404Spjd 3567248571Smm spa_history_log_version(spa, "create"); 3568185029Spjd 3569208442Smm spa->spa_minref = refcount_count(&spa->spa_refcount); 3570208442Smm 3571168404Spjd mutex_exit(&spa_namespace_lock); 3572168404Spjd 3573168404Spjd return (0); 3574168404Spjd} 3575168404Spjd 3576241286Savg#ifdef _KERNEL 3577219089Spjd#if defined(sun) 3578185029Spjd/* 3579219089Spjd * Get the root pool information from the root disk, then import the root pool 3580219089Spjd * during the system boot up time. 3581185029Spjd */ 3582219089Spjdextern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 3583219089Spjd 3584219089Spjdstatic nvlist_t * 3585219089Spjdspa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) 3586185029Spjd{ 3587219089Spjd nvlist_t *config; 3588185029Spjd nvlist_t *nvtop, *nvroot; 3589185029Spjd uint64_t pgid; 3590185029Spjd 3591219089Spjd if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) 3592219089Spjd return (NULL); 3593219089Spjd 3594168404Spjd /* 3595185029Spjd * Add this top-level vdev to the child array. 3596168404Spjd */ 3597219089Spjd VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3598219089Spjd &nvtop) == 0); 3599219089Spjd VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 3600219089Spjd &pgid) == 0); 3601219089Spjd VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); 3602168404Spjd 3603185029Spjd /* 3604185029Spjd * Put this pool's top-level vdevs into a root vdev. 
3605185029Spjd */ 3606185029Spjd VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3607219089Spjd VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 3608219089Spjd VDEV_TYPE_ROOT) == 0); 3609185029Spjd VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 3610185029Spjd VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 3611185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 3612185029Spjd &nvtop, 1) == 0); 3613168404Spjd 3614168404Spjd /* 3615185029Spjd * Replace the existing vdev_tree with the new root vdev in 3616185029Spjd * this pool's configuration (remove the old, add the new). 3617168404Spjd */ 3618185029Spjd VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 3619185029Spjd nvlist_free(nvroot); 3620219089Spjd return (config); 3621185029Spjd} 3622168404Spjd 3623185029Spjd/* 3624219089Spjd * Walk the vdev tree and see if we can find a device with "better" 3625219089Spjd * configuration. A configuration is "better" if the label on that 3626219089Spjd * device has a more recent txg. 3627185029Spjd */ 3628219089Spjdstatic void 3629219089Spjdspa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) 3630185029Spjd{ 3631219089Spjd for (int c = 0; c < vd->vdev_children; c++) 3632219089Spjd spa_alt_rootvdev(vd->vdev_child[c], avd, txg); 3633185029Spjd 3634219089Spjd if (vd->vdev_ops->vdev_op_leaf) { 3635219089Spjd nvlist_t *label; 3636219089Spjd uint64_t label_txg; 3637185029Spjd 3638219089Spjd if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, 3639219089Spjd &label) != 0) 3640219089Spjd return; 3641185029Spjd 3642219089Spjd VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, 3643219089Spjd &label_txg) == 0); 3644168404Spjd 3645219089Spjd /* 3646219089Spjd * Do we have a better boot device? 
3647219089Spjd */ 3648219089Spjd if (label_txg > *txg) { 3649219089Spjd *txg = label_txg; 3650219089Spjd *avd = vd; 3651185029Spjd } 3652219089Spjd nvlist_free(label); 3653185029Spjd } 3654185029Spjd} 3655185029Spjd 3656185029Spjd/* 3657185029Spjd * Import a root pool. 3658185029Spjd * 3659185029Spjd * For x86. devpath_list will consist of devid and/or physpath name of 3660185029Spjd * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). 3661185029Spjd * The GRUB "findroot" command will return the vdev we should boot. 3662185029Spjd * 3663185029Spjd * For Sparc, devpath_list consists the physpath name of the booting device 3664185029Spjd * no matter the rootpool is a single device pool or a mirrored pool. 3665185029Spjd * e.g. 3666185029Spjd * "/pci@1f,0/ide@d/disk@0,0:a" 3667185029Spjd */ 3668185029Spjdint 3669185029Spjdspa_import_rootpool(char *devpath, char *devid) 3670185029Spjd{ 3671219089Spjd spa_t *spa; 3672219089Spjd vdev_t *rvd, *bvd, *avd = NULL; 3673219089Spjd nvlist_t *config, *nvtop; 3674219089Spjd uint64_t guid, txg; 3675185029Spjd char *pname; 3676185029Spjd int error; 3677185029Spjd 3678185029Spjd /* 3679219089Spjd * Read the label from the boot device and generate a configuration. 
3680185029Spjd */ 3681219089Spjd config = spa_generate_rootconf(devpath, devid, &guid); 3682219089Spjd#if defined(_OBP) && defined(_KERNEL) 3683219089Spjd if (config == NULL) { 3684219089Spjd if (strstr(devpath, "/iscsi/ssd") != NULL) { 3685219089Spjd /* iscsi boot */ 3686219089Spjd get_iscsi_bootpath_phy(devpath); 3687219089Spjd config = spa_generate_rootconf(devpath, devid, &guid); 3688219089Spjd } 3689219089Spjd } 3690219089Spjd#endif 3691219089Spjd if (config == NULL) { 3692236884Smm cmn_err(CE_NOTE, "Cannot read the pool label from '%s'", 3693219089Spjd devpath); 3694249195Smm return (SET_ERROR(EIO)); 3695219089Spjd } 3696185029Spjd 3697219089Spjd VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 3698219089Spjd &pname) == 0); 3699219089Spjd VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 3700185029Spjd 3701209962Smm mutex_enter(&spa_namespace_lock); 3702209962Smm if ((spa = spa_lookup(pname)) != NULL) { 3703209962Smm /* 3704209962Smm * Remove the existing root pool from the namespace so that we 3705209962Smm * can replace it with the correct config we just read in. 3706209962Smm */ 3707209962Smm spa_remove(spa); 3708209962Smm } 3709185029Spjd 3710219089Spjd spa = spa_add(pname, config, NULL); 3711209962Smm spa->spa_is_root = B_TRUE; 3712219089Spjd spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 3713209962Smm 3714219089Spjd /* 3715219089Spjd * Build up a vdev tree based on the boot device's label config. 
3716219089Spjd */ 3717219089Spjd VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3718219089Spjd &nvtop) == 0); 3719219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3720219089Spjd error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 3721219089Spjd VDEV_ALLOC_ROOTPOOL); 3722219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3723219089Spjd if (error) { 3724209962Smm mutex_exit(&spa_namespace_lock); 3725219089Spjd nvlist_free(config); 3726219089Spjd cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 3727219089Spjd pname); 3728219089Spjd return (error); 3729209962Smm } 3730209962Smm 3731219089Spjd /* 3732219089Spjd * Get the boot vdev. 3733219089Spjd */ 3734219089Spjd if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 3735219089Spjd cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", 3736219089Spjd (u_longlong_t)guid); 3737249195Smm error = SET_ERROR(ENOENT); 3738219089Spjd goto out; 3739219089Spjd } 3740209962Smm 3741219089Spjd /* 3742219089Spjd * Determine if there is a better boot device. 3743219089Spjd */ 3744219089Spjd avd = bvd; 3745219089Spjd spa_alt_rootvdev(rvd, &avd, &txg); 3746219089Spjd if (avd != bvd) { 3747219089Spjd cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " 3748219089Spjd "try booting from '%s'", avd->vdev_path); 3749249195Smm error = SET_ERROR(EINVAL); 3750219089Spjd goto out; 3751219089Spjd } 3752209962Smm 3753219089Spjd /* 3754219089Spjd * If the boot device is part of a spare vdev then ensure that 3755219089Spjd * we're booting off the active spare. 3756219089Spjd */ 3757219089Spjd if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && 3758219089Spjd !bvd->vdev_isspare) { 3759219089Spjd cmn_err(CE_NOTE, "The boot device is currently spared. 
Please " 3760219089Spjd "try booting from '%s'", 3761219089Spjd bvd->vdev_parent-> 3762219089Spjd vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); 3763249195Smm error = SET_ERROR(EINVAL); 3764219089Spjd goto out; 3765219089Spjd } 3766209962Smm 3767219089Spjd error = 0; 3768219089Spjdout: 3769219089Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3770219089Spjd vdev_free(rvd); 3771219089Spjd spa_config_exit(spa, SCL_ALL, FTAG); 3772209962Smm mutex_exit(&spa_namespace_lock); 3773209962Smm 3774219089Spjd nvlist_free(config); 3775219089Spjd return (error); 3776185029Spjd} 3777185029Spjd 3778241286Savg#else 3779241286Savg 3780243502Savgextern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs, 3781243502Savg uint64_t *count); 3782241286Savg 3783241286Savgstatic nvlist_t * 3784241286Savgspa_generate_rootconf(const char *name) 3785241286Savg{ 3786243502Savg nvlist_t **configs, **tops; 3787241286Savg nvlist_t *config; 3788243502Savg nvlist_t *best_cfg, *nvtop, *nvroot; 3789243502Savg uint64_t *holes; 3790243502Savg uint64_t best_txg; 3791243213Savg uint64_t nchildren; 3792241286Savg uint64_t pgid; 3793243502Savg uint64_t count; 3794243502Savg uint64_t i; 3795243502Savg uint_t nholes; 3796241286Savg 3797243502Savg if (vdev_geom_read_pool_label(name, &configs, &count) != 0) 3798241286Savg return (NULL); 3799241286Savg 3800243502Savg ASSERT3U(count, !=, 0); 3801243502Savg best_txg = 0; 3802243502Savg for (i = 0; i < count; i++) { 3803243502Savg uint64_t txg; 3804243502Savg 3805243502Savg VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG, 3806243502Savg &txg) == 0); 3807243502Savg if (txg > best_txg) { 3808243502Savg best_txg = txg; 3809243502Savg best_cfg = configs[i]; 3810243502Savg } 3811243502Savg } 3812243502Savg 3813241286Savg /* 3814243213Savg * Multi-vdev root pool configuration discovery is not supported yet. 
3815243213Savg */ 3816245945Savg nchildren = 1; 3817245945Savg nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren); 3818243502Savg holes = NULL; 3819243502Savg nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY, 3820243502Savg &holes, &nholes); 3821243502Savg 3822244635Savg tops = kmem_zalloc(nchildren * sizeof(void *), KM_SLEEP); 3823243502Savg for (i = 0; i < nchildren; i++) { 3824243502Savg if (i >= count) 3825243502Savg break; 3826243502Savg if (configs[i] == NULL) 3827243502Savg continue; 3828243502Savg VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE, 3829243502Savg &nvtop) == 0); 3830243502Savg nvlist_dup(nvtop, &tops[i], KM_SLEEP); 3831243213Savg } 3832243502Savg for (i = 0; holes != NULL && i < nholes; i++) { 3833243502Savg if (i >= nchildren) 3834243502Savg continue; 3835243502Savg if (tops[holes[i]] != NULL) 3836243502Savg continue; 3837243502Savg nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP); 3838243502Savg VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE, 3839243502Savg VDEV_TYPE_HOLE) == 0); 3840243502Savg VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID, 3841243502Savg holes[i]) == 0); 3842243502Savg VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID, 3843243502Savg 0) == 0); 3844243502Savg } 3845243502Savg for (i = 0; i < nchildren; i++) { 3846243502Savg if (tops[i] != NULL) 3847243502Savg continue; 3848243502Savg nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP); 3849243502Savg VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE, 3850243502Savg VDEV_TYPE_MISSING) == 0); 3851243502Savg VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID, 3852243502Savg i) == 0); 3853243502Savg VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID, 3854243502Savg 0) == 0); 3855243502Savg } 3856243213Savg 3857243213Savg /* 3858243502Savg * Create pool config based on the best vdev config. 
3859241286Savg */ 3860243502Savg nvlist_dup(best_cfg, &config, KM_SLEEP); 3861241286Savg 3862241286Savg /* 3863241286Savg * Put this pool's top-level vdevs into a root vdev. 3864241286Savg */ 3865243502Savg VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 3866243502Savg &pgid) == 0); 3867241286Savg VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3868241286Savg VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 3869241286Savg VDEV_TYPE_ROOT) == 0); 3870241286Savg VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 3871241286Savg VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 3872241286Savg VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 3873243502Savg tops, nchildren) == 0); 3874241286Savg 3875241286Savg /* 3876241286Savg * Replace the existing vdev_tree with the new root vdev in 3877241286Savg * this pool's configuration (remove the old, add the new). 3878241286Savg */ 3879241286Savg VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 3880243502Savg 3881243502Savg /* 3882243502Savg * Drop vdev config elements that should not be present at pool level. 
3883243502Savg */ 3884243502Savg nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64); 3885243502Savg nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64); 3886243502Savg 3887243502Savg for (i = 0; i < count; i++) 3888243502Savg nvlist_free(configs[i]); 3889243502Savg kmem_free(configs, count * sizeof(void *)); 3890243502Savg for (i = 0; i < nchildren; i++) 3891243502Savg nvlist_free(tops[i]); 3892243502Savg kmem_free(tops, nchildren * sizeof(void *)); 3893241286Savg nvlist_free(nvroot); 3894241286Savg return (config); 3895241286Savg} 3896241286Savg 3897241286Savgint 3898241286Savgspa_import_rootpool(const char *name) 3899241286Savg{ 3900241286Savg spa_t *spa; 3901241286Savg vdev_t *rvd, *bvd, *avd = NULL; 3902241286Savg nvlist_t *config, *nvtop; 3903241286Savg uint64_t txg; 3904241286Savg char *pname; 3905241286Savg int error; 3906241286Savg 3907241286Savg /* 3908241286Savg * Read the label from the boot device and generate a configuration. 3909241286Savg */ 3910241286Savg config = spa_generate_rootconf(name); 3911243213Savg 3912243213Savg mutex_enter(&spa_namespace_lock); 3913243213Savg if (config != NULL) { 3914243213Savg VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 3915243213Savg &pname) == 0 && strcmp(name, pname) == 0); 3916243213Savg VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) 3917243213Savg == 0); 3918243213Savg 3919243213Savg if ((spa = spa_lookup(pname)) != NULL) { 3920243213Savg /* 3921243213Savg * Remove the existing root pool from the namespace so 3922243213Savg * that we can replace it with the correct config 3923243213Savg * we just read in. 3924243213Savg */ 3925243213Savg spa_remove(spa); 3926243213Savg } 3927243213Savg spa = spa_add(pname, config, NULL); 3928243501Savg 3929243501Savg /* 3930243501Savg * Set spa_ubsync.ub_version as it can be used in vdev_alloc() 3931243501Savg * via spa_version(). 
3932243501Savg */ 3933243501Savg if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 3934243501Savg &spa->spa_ubsync.ub_version) != 0) 3935243501Savg spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 3936243213Savg } else if ((spa = spa_lookup(name)) == NULL) { 3937241286Savg cmn_err(CE_NOTE, "Cannot find the pool label for '%s'", 3938241286Savg name); 3939241286Savg return (EIO); 3940243213Savg } else { 3941243213Savg VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0); 3942241286Savg } 3943241286Savg spa->spa_is_root = B_TRUE; 3944241286Savg spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 3945241286Savg 3946241286Savg /* 3947241286Savg * Build up a vdev tree based on the boot device's label config. 3948241286Savg */ 3949241286Savg VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3950241286Savg &nvtop) == 0); 3951241286Savg spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3952241286Savg error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 3953241286Savg VDEV_ALLOC_ROOTPOOL); 3954241286Savg spa_config_exit(spa, SCL_ALL, FTAG); 3955241286Savg if (error) { 3956241286Savg mutex_exit(&spa_namespace_lock); 3957241286Savg nvlist_free(config); 3958241286Savg cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 3959241286Savg pname); 3960241286Savg return (error); 3961241286Savg } 3962241286Savg 3963241286Savg spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3964241286Savg vdev_free(rvd); 3965241286Savg spa_config_exit(spa, SCL_ALL, FTAG); 3966241286Savg mutex_exit(&spa_namespace_lock); 3967241286Savg 3968243213Savg nvlist_free(config); 3969243213Savg return (0); 3970241286Savg} 3971241286Savg 3972241286Savg#endif /* sun */ 3973219089Spjd#endif 3974219089Spjd 3975209962Smm/* 3976209962Smm * Import a non-root pool into the system. 
3977209962Smm */ 3978185029Spjdint 3979219089Spjdspa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 3980185029Spjd{ 3981209962Smm spa_t *spa; 3982209962Smm char *altroot = NULL; 3983219089Spjd spa_load_state_t state = SPA_LOAD_IMPORT; 3984219089Spjd zpool_rewind_policy_t policy; 3985219089Spjd uint64_t mode = spa_mode_global; 3986219089Spjd uint64_t readonly = B_FALSE; 3987209962Smm int error; 3988209962Smm nvlist_t *nvroot; 3989209962Smm nvlist_t **spares, **l2cache; 3990209962Smm uint_t nspares, nl2cache; 3991209962Smm 3992209962Smm /* 3993209962Smm * If a pool with this name exists, return failure. 3994209962Smm */ 3995209962Smm mutex_enter(&spa_namespace_lock); 3996219089Spjd if (spa_lookup(pool) != NULL) { 3997209962Smm mutex_exit(&spa_namespace_lock); 3998249195Smm return (SET_ERROR(EEXIST)); 3999209962Smm } 4000209962Smm 4001209962Smm /* 4002209962Smm * Create and initialize the spa structure. 4003209962Smm */ 4004209962Smm (void) nvlist_lookup_string(props, 4005209962Smm zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 4006219089Spjd (void) nvlist_lookup_uint64(props, 4007219089Spjd zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 4008219089Spjd if (readonly) 4009219089Spjd mode = FREAD; 4010219089Spjd spa = spa_add(pool, config, altroot); 4011219089Spjd spa->spa_import_flags = flags; 4012209962Smm 4013209962Smm /* 4014219089Spjd * Verbatim import - Take a pool and insert it into the namespace 4015219089Spjd * as if it had been loaded at boot. 
4016219089Spjd */ 4017219089Spjd if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 4018219089Spjd if (props != NULL) 4019219089Spjd spa_configfile_set(spa, props, B_FALSE); 4020219089Spjd 4021219089Spjd spa_config_sync(spa, B_FALSE, B_TRUE); 4022219089Spjd 4023219089Spjd mutex_exit(&spa_namespace_lock); 4024248571Smm spa_history_log_version(spa, "import"); 4025219089Spjd 4026219089Spjd return (0); 4027219089Spjd } 4028219089Spjd 4029219089Spjd spa_activate(spa, mode); 4030219089Spjd 4031219089Spjd /* 4032209962Smm * Don't start async tasks until we know everything is healthy. 4033209962Smm */ 4034209962Smm spa_async_suspend(spa); 4035209962Smm 4036219089Spjd zpool_get_rewind_policy(config, &policy); 4037219089Spjd if (policy.zrp_request & ZPOOL_DO_REWIND) 4038219089Spjd state = SPA_LOAD_RECOVER; 4039219089Spjd 4040209962Smm /* 4041209962Smm * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig 4042209962Smm * because the user-supplied config is actually the one to trust when 4043209962Smm * doing an import. 4044209962Smm */ 4045219089Spjd if (state != SPA_LOAD_RECOVER) 4046219089Spjd spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 4047209962Smm 4048219089Spjd error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, 4049219089Spjd policy.zrp_request); 4050219089Spjd 4051219089Spjd /* 4052219089Spjd * Propagate anything learned while loading the pool and pass it 4053219089Spjd * back to caller (i.e. rewind info, missing devices, etc). 4054219089Spjd */ 4055219089Spjd VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 4056219089Spjd spa->spa_load_info) == 0); 4057219089Spjd 4058209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4059209962Smm /* 4060209962Smm * Toss any existing sparelist, as it doesn't have any validity 4061209962Smm * anymore, and conflicts with spa_has_spare(). 
4062209962Smm */ 4063209962Smm if (spa->spa_spares.sav_config) { 4064209962Smm nvlist_free(spa->spa_spares.sav_config); 4065209962Smm spa->spa_spares.sav_config = NULL; 4066209962Smm spa_load_spares(spa); 4067209962Smm } 4068209962Smm if (spa->spa_l2cache.sav_config) { 4069209962Smm nvlist_free(spa->spa_l2cache.sav_config); 4070209962Smm spa->spa_l2cache.sav_config = NULL; 4071209962Smm spa_load_l2cache(spa); 4072209962Smm } 4073209962Smm 4074209962Smm VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 4075209962Smm &nvroot) == 0); 4076209962Smm if (error == 0) 4077209962Smm error = spa_validate_aux(spa, nvroot, -1ULL, 4078209962Smm VDEV_ALLOC_SPARE); 4079209962Smm if (error == 0) 4080209962Smm error = spa_validate_aux(spa, nvroot, -1ULL, 4081209962Smm VDEV_ALLOC_L2CACHE); 4082209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 4083209962Smm 4084209962Smm if (props != NULL) 4085209962Smm spa_configfile_set(spa, props, B_FALSE); 4086209962Smm 4087209962Smm if (error != 0 || (props && spa_writeable(spa) && 4088209962Smm (error = spa_prop_set(spa, props)))) { 4089209962Smm spa_unload(spa); 4090209962Smm spa_deactivate(spa); 4091209962Smm spa_remove(spa); 4092209962Smm mutex_exit(&spa_namespace_lock); 4093209962Smm return (error); 4094209962Smm } 4095209962Smm 4096209962Smm spa_async_resume(spa); 4097209962Smm 4098209962Smm /* 4099209962Smm * Override any spares and level 2 cache devices as specified by 4100209962Smm * the user, as these may have correct device names/devids, etc. 
4101209962Smm */ 4102209962Smm if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 4103209962Smm &spares, &nspares) == 0) { 4104209962Smm if (spa->spa_spares.sav_config) 4105209962Smm VERIFY(nvlist_remove(spa->spa_spares.sav_config, 4106209962Smm ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 4107209962Smm else 4108209962Smm VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 4109209962Smm NV_UNIQUE_NAME, KM_SLEEP) == 0); 4110209962Smm VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 4111209962Smm ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 4112209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4113209962Smm spa_load_spares(spa); 4114209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 4115209962Smm spa->spa_spares.sav_sync = B_TRUE; 4116209962Smm } 4117209962Smm if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 4118209962Smm &l2cache, &nl2cache) == 0) { 4119209962Smm if (spa->spa_l2cache.sav_config) 4120209962Smm VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 4121209962Smm ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 4122209962Smm else 4123209962Smm VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 4124209962Smm NV_UNIQUE_NAME, KM_SLEEP) == 0); 4125209962Smm VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 4126209962Smm ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 4127209962Smm spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4128209962Smm spa_load_l2cache(spa); 4129209962Smm spa_config_exit(spa, SCL_ALL, FTAG); 4130209962Smm spa->spa_l2cache.sav_sync = B_TRUE; 4131209962Smm } 4132209962Smm 4133219089Spjd /* 4134219089Spjd * Check for any removed devices. 4135219089Spjd */ 4136219089Spjd if (spa->spa_autoreplace) { 4137219089Spjd spa_aux_check_removed(&spa->spa_spares); 4138219089Spjd spa_aux_check_removed(&spa->spa_l2cache); 4139219089Spjd } 4140219089Spjd 4141209962Smm if (spa_writeable(spa)) { 4142209962Smm /* 4143209962Smm * Update the config cache to include the newly-imported pool. 
		 */
		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
	}

	/*
	 * It's possible that the pool was expanded while it was exported.
	 * We kick off an async task to handle this for us.
	 */
	spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);

	mutex_exit(&spa_namespace_lock);
	spa_history_log_version(spa, "import");

#ifdef __FreeBSD__
#ifdef _KERNEL
	zvol_create_minors(pool);
#endif
#endif
	return (0);
}

/*
 * Probe a pool described by 'tryconfig' without permanently adding it to
 * the namespace: the pool is loaded read-only under the reserved name
 * TRYIMPORT_NAME, its current configuration is generated, and then the
 * spa is unloaded and removed again before returning.
 *
 * Returns the generated config nvlist (caller is responsible for freeing
 * it), or NULL if 'tryconfig' lacks a pool name or pool state.
 */
nvlist_t *
spa_tryimport(nvlist_t *tryconfig)
{
	nvlist_t *config = NULL;
	char *poolname;
	spa_t *spa;
	uint64_t state;
	int error;

	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
		return (NULL);

	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
		return (NULL);

	/*
	 * Create and initialize the spa structure.  FREAD: the pool is
	 * activated read-only for the duration of the probe.
	 */
	mutex_enter(&spa_namespace_lock);
	spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
	spa_activate(spa, FREAD);

	/*
	 * Pass off the heavy lifting to spa_load().
	 * Pass TRUE for mosconfig because the user-supplied config
	 * is actually the one to trust when doing an import.
	 */
	error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE);

	/*
	 * If 'tryconfig' was at least parsable, return the current config.
	 */
	if (spa->spa_root_vdev != NULL) {
		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
		    poolname) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
		    state) == 0);
		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
		    spa->spa_uberblock.ub_timestamp) == 0);
		VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
		    spa->spa_load_info) == 0);

		/*
		 * If the bootfs property exists on this pool then we
		 * copy it out so that external consumers can tell which
		 * pools are bootable.
		 */
		if ((!error || error == EEXIST) && spa->spa_bootfs) {
			char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);

			/*
			 * We have to play games with the name since the
			 * pool was opened as TRYIMPORT_NAME: rewrite the
			 * dataset name so it is rooted at the real pool name.
			 */
			if (dsl_dsobj_to_dsname(spa_name(spa),
			    spa->spa_bootfs, tmpname) == 0) {
				char *cp;
				char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);

				cp = strchr(tmpname, '/');
				if (cp == NULL) {
					(void) strlcpy(dsname, tmpname,
					    MAXPATHLEN);
				} else {
					(void) snprintf(dsname, MAXPATHLEN,
					    "%s/%s", poolname, ++cp);
				}
				VERIFY(nvlist_add_string(config,
				    ZPOOL_CONFIG_BOOTFS, dsname) == 0);
				kmem_free(dsname, MAXPATHLEN);
			}
			kmem_free(tmpname, MAXPATHLEN);
		}

		/*
		 * Add the list of hot spares and level 2 cache devices.
		 */
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
		spa_add_spares(spa, config);
		spa_add_l2cache(spa, config);
		spa_config_exit(spa, SCL_CONFIG, FTAG);
	}

	/* Tear the temporary spa back down; the probe leaves no trace. */
	spa_unload(spa);
	spa_deactivate(spa);
	spa_remove(spa);
	mutex_exit(&spa_namespace_lock);

	return (config);
}

/*
 * Pool export/destroy
 *
 * The act of destroying or exporting a pool is very simple. We make sure there
 * is no more pending I/O and any references to the pool are gone. Then, we
 * update the pool state and sync all the labels to disk, removing the
 * configuration from the cache afterwards. If the 'hardforce' flag is set, then
 * we don't sync the labels or remove the configuration cache.
 */
static int
spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
    boolean_t force, boolean_t hardforce)
{
	spa_t *spa;

	if (oldconfig)
		*oldconfig = NULL;

	if (!(spa_mode_global & FWRITE))
		return (SET_ERROR(EROFS));

	mutex_enter(&spa_namespace_lock);
	if ((spa = spa_lookup(pool)) == NULL) {
		mutex_exit(&spa_namespace_lock);
		return (SET_ERROR(ENOENT));
	}

	/*
	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
	 * reacquire the namespace lock, and see if we can export.
	 * (The lock must be dropped around spa_async_suspend() to avoid
	 * blocking async tasks that themselves need the namespace lock.)
	 */
	spa_open_ref(spa, FTAG);
	mutex_exit(&spa_namespace_lock);
	spa_async_suspend(spa);
	mutex_enter(&spa_namespace_lock);
	spa_close(spa, FTAG);

	/*
	 * The pool will be in core if it's openable,
	 * in which case we can modify its state.
	 */
	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
		/*
		 * Objsets may be open only because they're dirty, so we
		 * have to force it to sync before checking spa_refcnt.
		 */
		txg_wait_synced(spa->spa_dsl_pool, 0);

		/*
		 * A pool cannot be exported or destroyed if there are active
		 * references.  If we are resetting a pool, allow references by
		 * fault injection handlers.
		 */
		if (!spa_refcount_zero(spa) ||
		    (spa->spa_inject_ref != 0 &&
		    new_state != POOL_STATE_UNINITIALIZED)) {
			spa_async_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (SET_ERROR(EBUSY));
		}

		/*
		 * A pool cannot be exported if it has an active shared spare.
		 * This is to prevent other pools stealing the active spare
		 * from an exported pool. At user's own will, such pool can
		 * be forcedly exported.
		 */
		if (!force && new_state == POOL_STATE_EXPORTED &&
		    spa_has_active_shared_spare(spa)) {
			spa_async_resume(spa);
			mutex_exit(&spa_namespace_lock);
			return (SET_ERROR(EXDEV));
		}

		/*
		 * We want this to be reflected on every label,
		 * so mark them all dirty.  spa_unload() will do the
		 * final sync that pushes these changes out.
		 */
		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
			spa->spa_state = new_state;
			spa->spa_final_txg = spa_last_synced_txg(spa) +
			    TXG_DEFER_SIZE + 1;
			vdev_config_dirty(spa->spa_root_vdev);
			spa_config_exit(spa, SCL_ALL, FTAG);
		}
	}

	/*
	 * NOTE(review): this event is posted unconditionally, i.e. also on
	 * the spa_reset() path (new_state == POOL_STATE_UNINITIALIZED) and
	 * for plain exports — confirm consumers expect that.
	 */
	spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);

	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
		spa_unload(spa);
		spa_deactivate(spa);
	}

	if (oldconfig && spa->spa_config)
		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);

	if (new_state != POOL_STATE_UNINITIALIZED) {
		if (!hardforce)
			spa_config_sync(spa, B_TRUE, B_TRUE);
		spa_remove(spa);
	}
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Destroy a storage pool.
 */
int
spa_destroy(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
	    B_FALSE, B_FALSE));
}

/*
 * Export a storage pool.
 */
int
spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
    boolean_t hardforce)
{
	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
	    force, hardforce));
}

/*
 * Similar to spa_export(), this unloads the spa_t without actually removing it
 * from the namespace in any way.
 */
int
spa_reset(char *pool)
{
	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
	    B_FALSE, B_FALSE));
}

/*
 * ==========================================================================
 * Device manipulation
 * ==========================================================================
 */

/*
 * Add a device to a storage pool.
 */
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
{
	uint64_t txg, id;
	int error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *tvd;
	nvlist_t **spares, **l2cache;
	uint_t nspares, nl2cache;

	ASSERT(spa_writeable(spa));

	txg = spa_vdev_enter(spa);

	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
	    VDEV_ALLOC_ADD)) != 0)
		return (spa_vdev_exit(spa, NULL, txg, error));

	spa->spa_pending_vdev = vd;	/* spa_vdev_exit() will clear this */

	/* Missing spare/l2cache arrays simply mean "none requested". */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
	    &nspares) != 0)
		nspares = 0;

	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
	    &nl2cache) != 0)
		nl2cache = 0;

	/* Reject a request that adds nothing at all. */
	if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
		return (spa_vdev_exit(spa, vd, txg, EINVAL));

	if (vd->vdev_children != 0 &&
	    (error = vdev_create(vd, txg, B_FALSE)) != 0)
		return (spa_vdev_exit(spa, vd, txg, error));

	/*
	 * We must validate the spares and l2cache devices after checking the
	 * children.  Otherwise, vdev_inuse() will blindly overwrite the spare.
	 */
	if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
		return (spa_vdev_exit(spa, vd, txg, error));

	/*
	 * Transfer each new top-level vdev from vd to rvd.
	 */
	for (int c = 0; c < vd->vdev_children; c++) {

		/*
		 * Set the vdev id to the first hole, if one exists; when no
		 * hole is found, the inner loop leaves 'id' equal to
		 * rvd->vdev_children, i.e. the next free slot.
		 */
		for (id = 0; id < rvd->vdev_children; id++) {
			if (rvd->vdev_child[id]->vdev_ishole) {
				vdev_free(rvd->vdev_child[id]);
				break;
			}
		}
		tvd = vd->vdev_child[c];
		vdev_remove_child(vd, tvd);
		tvd->vdev_id = id;
		vdev_add_child(rvd, tvd);
		vdev_config_dirty(tvd);
	}

	if (nspares != 0) {
		spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
		    ZPOOL_CONFIG_SPARES);
		spa_load_spares(spa);
		spa->spa_spares.sav_sync = B_TRUE;
	}

	if (nl2cache != 0) {
		spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
		    ZPOOL_CONFIG_L2CACHE);
		spa_load_l2cache(spa);
		spa->spa_l2cache.sav_sync = B_TRUE;
	}

	/*
	 * We have to be careful when adding new vdevs to an existing pool.
	 * If other threads start allocating from these vdevs before we
	 * sync the config cache, and we lose power, then upon reboot we may
	 * fail to open the pool because there are DVAs that the config cache
	 * can't translate.  Therefore, we first add the vdevs without
	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
	 * and then let spa_config_update() initialize the new metaslabs.
	 *
	 * spa_load() checks for added-but-not-initialized vdevs, so that
	 * if we lose power at any point in this sequence, the remaining
	 * steps will be completed the next time we load the pool.
	 */
	(void) spa_vdev_exit(spa, vd, txg, 0);

	mutex_enter(&spa_namespace_lock);
	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
	mutex_exit(&spa_namespace_lock);

	return (0);
}

/*
 * Attach a device to a mirror.  The arguments are the path to any device
 * in the mirror, and the nvroot for the new device.  If the path specifies
 * a device that is not mirrored, we automatically insert the mirror vdev.
 *
 * If 'replacing' is specified, the new device is intended to replace the
 * existing device; in this case the two devices are made into their own
 * mirror using the 'replacing' vdev, which is functionally identical to
 * the mirror vdev (it actually reuses all the same ops) but has a few
 * extra rules: you can't attach to it after it's been created, and upon
 * completion of resilvering, the first disk (the one being replaced)
 * is automatically detached.
 */
int
spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
{
	uint64_t txg, dtl_max_txg;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
	vdev_ops_t *pvops;
	char *oldvdpath, *newvdpath;
	int newvd_isspare;
	int error;

	ASSERT(spa_writeable(spa));

	txg = spa_vdev_enter(spa);

	oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);

	if (oldvd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!oldvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = oldvd->vdev_parent;

	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
	    VDEV_ALLOC_ATTACH)) != 0)
		return (spa_vdev_exit(spa, NULL, txg, EINVAL));

	if (newrootvd->vdev_children != 1)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	newvd = newrootvd->vdev_child[0];

	if (!newvd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));

	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
		return (spa_vdev_exit(spa, newrootvd, txg, error));

	/*
	 * Spares can't replace logs
	 */
	if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

	if (!replacing) {
		/*
		 * For attach, the only allowable parent is a mirror or the root
		 * vdev.
		 */
		if (pvd->vdev_ops != &vdev_mirror_ops &&
		    pvd->vdev_ops != &vdev_root_ops)
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

		pvops = &vdev_mirror_ops;
	} else {
		/*
		 * Active hot spares can only be replaced by inactive hot
		 * spares.
		 */
		if (pvd->vdev_ops == &vdev_spare_ops &&
		    oldvd->vdev_isspare &&
		    !spa_has_spare(spa, newvd->vdev_guid))
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));

		/*
		 * If the source is a hot spare, and the parent isn't already a
		 * spare, then we want to create a new hot spare.  Otherwise, we
		 * want to create a replacing vdev.  The user is not allowed to
		 * attach to a spared vdev child unless the 'isspare' state is
		 * the same (spare replaces spare, non-spare replaces
		 * non-spare).
		 */
		if (pvd->vdev_ops == &vdev_replacing_ops &&
		    spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
		} else if (pvd->vdev_ops == &vdev_spare_ops &&
		    newvd->vdev_isspare != oldvd->vdev_isspare) {
			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
		}

		if (newvd->vdev_isspare)
			pvops = &vdev_spare_ops;
		else
			pvops = &vdev_replacing_ops;
	}

	/*
	 * Make sure the new device is big enough.
	 */
	if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));

	/*
	 * The new device cannot have a higher alignment requirement
	 * than the top-level vdev.
	 */
	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));

	/*
	 * If this is an in-place replacement, update oldvd's path and devid
	 * to make it distinguishable from newvd, and unopenable from now on.
	 *
	 * NOTE(review): oldvd->vdev_path and newvd->vdev_path are
	 * dereferenced here (and by the spa_strdup() calls further down)
	 * without a NULL check; elsewhere in this file (spa_vdev_detach)
	 * vdev_path is explicitly tested against NULL before use.  Verify
	 * that a leaf vdev reached via this path always has vdev_path set.
	 */
	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
		spa_strfree(oldvd->vdev_path);
		/* "+ 5" = strlen("/old") + terminating NUL */
		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
		    KM_SLEEP);
		(void) sprintf(oldvd->vdev_path, "%s/%s",
		    newvd->vdev_path, "old");
		if (oldvd->vdev_devid != NULL) {
			spa_strfree(oldvd->vdev_devid);
			oldvd->vdev_devid = NULL;
		}
	}

	/* mark the device being resilvered */
	newvd->vdev_resilver_txg = txg;

	/*
	 * If the parent is not a mirror, or if we're replacing, insert the new
	 * mirror/replacing/spare vdev above oldvd.
	 */
	if (pvd->vdev_ops != pvops)
		pvd = vdev_add_parent(oldvd, pvops);

	ASSERT(pvd->vdev_top->vdev_parent == rvd);
	ASSERT(pvd->vdev_ops == pvops);
	ASSERT(oldvd->vdev_parent == pvd);

	/*
	 * Extract the new device from its root and add it to pvd.
	 */
	vdev_remove_child(newrootvd, newvd);
	newvd->vdev_id = pvd->vdev_children;
	newvd->vdev_crtxg = oldvd->vdev_crtxg;
	vdev_add_child(pvd, newvd);

	tvd = newvd->vdev_top;
	ASSERT(pvd->vdev_top == tvd);
	ASSERT(tvd->vdev_parent == rvd);

	vdev_config_dirty(tvd);

	/*
	 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
	 * for any dmu_sync-ed blocks.  It will propagate upward when
	 * spa_vdev_exit() calls vdev_dtl_reassess().
	 */
	dtl_max_txg = txg + TXG_CONCURRENT_STATES;

	vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
	    dtl_max_txg - TXG_INITIAL);

	if (newvd->vdev_isspare) {
		spa_spare_activate(newvd);
		spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE);
	}

	/* Copy the paths now; the vdevs may be gone after spa_vdev_exit(). */
	oldvdpath = spa_strdup(oldvd->vdev_path);
	newvdpath = spa_strdup(newvd->vdev_path);
	newvd_isspare = newvd->vdev_isspare;

	/*
	 * Mark newvd's DTL dirty in this txg.
	 */
	vdev_dirty(tvd, VDD_DTL, newvd, txg);

	/*
	 * Restart the resilver
	 */
	dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);

	/*
	 * Commit the config
	 */
	(void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);

	spa_history_log_internal(spa, "vdev attach", NULL,
	    "%s vdev=%s %s vdev=%s",
	    replacing && newvd_isspare ? "spare in" :
	    replacing ? "replace" : "attach", newvdpath,
	    replacing ? "for" : "to", oldvdpath);

	spa_strfree(oldvdpath);
	spa_strfree(newvdpath);

	if (spa->spa_bootfs)
		spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH);

	return (0);
}

/*
 * Detach a device from a mirror or replacing vdev.
 *
 * If 'replace_done' is specified, only detach if the parent
 * is a replacing vdev.
 */
int
spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
{
	uint64_t txg;
	int error;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd, *pvd, *cvd, *tvd;
	boolean_t unspare = B_FALSE;
	uint64_t unspare_guid = 0;
	char *vdpath;

	ASSERT(spa_writeable(spa));

	txg = spa_vdev_enter(spa);

	vd = spa_lookup_by_guid(spa, guid, B_FALSE);

	if (vd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = vd->vdev_parent;

	/*
	 * If the parent/child relationship is not as expected, don't do it.
	 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
	 * vdev that's replacing B with C.  The user's intent in replacing
	 * is to go from M(A,B) to M(A,C).  If the user decides to cancel
	 * the replace by detaching C, the expected behavior is to end up
	 * M(A,B).  But suppose that right after deciding to detach C,
	 * the replacement of B completes.  We would have M(A,C), and then
	 * ask to detach C, which would leave us with just A -- not what
	 * the user wanted.  To prevent this, we make sure that the
	 * parent/child relationship hasn't changed -- in this example,
	 * that C's parent is still the replacing vdev R.
	 */
	if (pvd->vdev_guid != pguid && pguid != 0)
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	 * Only 'replacing' or 'spare' vdevs can be replaced.
	 */
	if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
	    pvd->vdev_ops != &vdev_spare_ops)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
	    spa_version(spa) >= SPA_VERSION_SPARES);

	/*
	 * Only mirror, replacing, and spare vdevs support detach.
	 */
	if (pvd->vdev_ops != &vdev_replacing_ops &&
	    pvd->vdev_ops != &vdev_mirror_ops &&
	    pvd->vdev_ops != &vdev_spare_ops)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * If this device has the only valid copy of some data,
	 * we cannot safely detach it.
	 */
	if (vdev_dtl_required(vd))
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	ASSERT(pvd->vdev_children >= 2);

	/*
	 * If we are detaching the second disk from a replacing vdev, then
	 * check to see if we changed the original vdev's path to have "/old"
	 * at the end in spa_vdev_attach().  If so, undo that change now.
	 */
	if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
	    vd->vdev_path != NULL) {
		size_t len = strlen(vd->vdev_path);

		for (int c = 0; c < pvd->vdev_children; c++) {
			cvd = pvd->vdev_child[c];

			if (cvd == vd || cvd->vdev_path == NULL)
				continue;

			if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
			    strcmp(cvd->vdev_path + len, "/old") == 0) {
				spa_strfree(cvd->vdev_path);
				cvd->vdev_path = spa_strdup(vd->vdev_path);
				break;
			}
		}
	}

	/*
	 * If we are detaching the original disk from a spare, then it implies
	 * that the spare should become a real disk, and be removed from the
	 * active spare list for the pool.
	 */
	if (pvd->vdev_ops == &vdev_spare_ops &&
	    vd->vdev_id == 0 &&
	    pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
		unspare = B_TRUE;

	/*
	 * Erase the disk labels so the disk can be used for other things.
	 * This must be done after all other error cases are handled,
	 * but before we disembowel vd (so we can still do I/O to it).
	 * But if we can't do it, don't treat the error as fatal --
	 * it may be that the unwritability of the disk is the reason
	 * it's being detached!
	 */
	error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);

	/*
	 * Remove vd from its parent and compact the parent's children.
	 */
	vdev_remove_child(pvd, vd);
	vdev_compact_children(pvd);

	/*
	 * Remember one of the remaining children so we can get tvd below.
	 */
	cvd = pvd->vdev_child[pvd->vdev_children - 1];

	/*
	 * If we need to remove the remaining child from the list of hot spares,
	 * do it now, marking the vdev as no longer a spare in the process.
	 * We must do this before vdev_remove_parent(), because that can
	 * change the GUID if it creates a new toplevel GUID.  For a similar
	 * reason, we must remove the spare now, in the same txg as the detach;
	 * otherwise someone could attach a new sibling, change the GUID, and
	 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
	 */
	if (unspare) {
		ASSERT(cvd->vdev_isspare);
		spa_spare_remove(cvd);
		unspare_guid = cvd->vdev_guid;
		(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
		cvd->vdev_unspare = B_TRUE;
	}

	/*
	 * If the parent mirror/replacing vdev only has one child,
	 * the parent is no longer needed.  Remove it from the tree.
	 */
	if (pvd->vdev_children == 1) {
		if (pvd->vdev_ops == &vdev_spare_ops)
			cvd->vdev_unspare = B_FALSE;
		vdev_remove_parent(cvd);
	}


	/*
	 * We don't set tvd until now because the parent we just removed
	 * may have been the previous top-level vdev.
	 */
	tvd = cvd->vdev_top;
	ASSERT(tvd->vdev_parent == rvd);

	/*
	 * Reevaluate the parent vdev state.
	 */
	vdev_propagate_state(cvd);

	/*
	 * If the 'autoexpand' property is set on the pool then automatically
	 * try to expand the size of the pool. For example if the device we
	 * just detached was smaller than the others, it may be possible to
	 * add metaslabs (i.e. grow the pool).  We need to reopen the vdev
	 * first so that we can obtain the updated sizes of the leaf vdevs.
	 */
	if (spa->spa_autoexpand) {
		vdev_reopen(tvd);
		vdev_expand(tvd, txg);
	}

	vdev_config_dirty(tvd);

	/*
	 * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
	 * vd->vdev_detached is set and free vd's DTL object in syncing context.
	 * But first make sure we're not on any *other* txg's DTL list, to
	 * prevent vd from being accessed after it's freed.
	 *
	 * NOTE(review): spa_strdup(vd->vdev_path) assumes vdev_path is
	 * non-NULL here, while the "/old" cleanup above explicitly checks
	 * vd->vdev_path != NULL — confirm a detachable leaf always has a
	 * path at this point.
	 */
	vdpath = spa_strdup(vd->vdev_path);
	for (int t = 0; t < TXG_SIZE; t++)
		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
	vd->vdev_detached = B_TRUE;
	vdev_dirty(tvd, VDD_DTL, vd, txg);

	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);

	/* hang on to the spa before we release the lock */
	spa_open_ref(spa, FTAG);

	error = spa_vdev_exit(spa, vd, txg, 0);

	spa_history_log_internal(spa, "detach", NULL,
	    "vdev=%s", vdpath);
	spa_strfree(vdpath);

	/*
	 * If this was the removal of the original device in a hot spare vdev,
	 * then we want to go through and remove the device from the hot spare
	 * list of every other pool.
	 */
	if (unspare) {
		spa_t *altspa = NULL;

		mutex_enter(&spa_namespace_lock);
		while ((altspa = spa_next(altspa)) != NULL) {
			if (altspa->spa_state != POOL_STATE_ACTIVE ||
			    altspa == spa)
				continue;

			/*
			 * Hold altspa and drop the namespace lock across
			 * spa_vdev_remove(), which takes it itself.
			 */
			spa_open_ref(altspa, FTAG);
			mutex_exit(&spa_namespace_lock);
			(void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
			mutex_enter(&spa_namespace_lock);
			spa_close(altspa, FTAG);
		}
		mutex_exit(&spa_namespace_lock);

		/* search the rest of the vdevs for spares to remove */
		spa_vdev_resilver_done(spa);
	}

	/* all done with the spa; OK to release */
	mutex_enter(&spa_namespace_lock);
	spa_close(spa, FTAG);
	mutex_exit(&spa_namespace_lock);

	return (error);
}

/*
 * Split a set of devices from their mirrors, and create a new pool from them.
 */
int
spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
    nvlist_t *props, boolean_t exp)
{
	int error = 0;
	uint64_t txg, *glist;
	spa_t *newspa;
	uint_t c, children, lastlog;
	nvlist_t **child, *nvl, *tmp;
	dmu_tx_t *tx;
	char *altroot = NULL;
	vdev_t *rvd, **vml = NULL;		/* vdev modify list */
	boolean_t activate_slog;

	ASSERT(spa_writeable(spa));

	txg = spa_vdev_enter(spa);

	/* clear the log and flush everything up to now */
	activate_slog = spa_passivate_log(spa);
	(void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
	error = spa_offline_log(spa);
	txg = spa_vdev_config_enter(spa);

	if (activate_slog)
		spa_activate_log(spa);

	if (error != 0)
		return (spa_vdev_exit(spa, NULL, txg, error));

	/* check new spa name before going any further */
	if (spa_lookup(newname) != NULL)
		return (spa_vdev_exit(spa, NULL, txg, EEXIST));

	/*
	 * scan through all the children to ensure they're all mirrors
	 */
	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
	    nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
	    &children) != 0)
		return (spa_vdev_exit(spa, NULL, txg, EINVAL));

	/* first, check to ensure we've got the right child count */
	rvd = spa->spa_root_vdev;
	lastlog = 0;
	for (c = 0; c < rvd->vdev_children; c++) {
		vdev_t *vd = rvd->vdev_child[c];

		/* don't count the holes & logs as children */
		if (vd->vdev_islog || vd->vdev_ishole) {
			/*
			 * Remember where the trailing run of logs/holes
			 * begins; only a trailing run is excluded from the
			 * expected child count (lastlog is reset below if a
			 * normal vdev follows).
			 */
			if (lastlog == 0)
				lastlog = c;
			continue;
		}

		lastlog = 0;
	}
	if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
		return (spa_vdev_exit(spa, NULL, txg, EINVAL));

	/* next, ensure no spare or cache devices are part of the split */
	if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
	    nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
		return (spa_vdev_exit(spa, NULL, txg, EINVAL));

	vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
	glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);

	/* then, loop over each vdev and validate it */
	for (c = 0; c < children; c++) {
		uint64_t is_hole = 0;

		(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
		    &is_hole);

		if (is_hole != 0) {
			/*
			 * A hole in the request is only acceptable where the
			 * pool itself has a hole or log vdev at that slot.
			 */
			if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
			    spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
				continue;
			} else {
				error = SET_ERROR(EINVAL);
				break;
			}
		}

		/* which disk is going to be split? */
		if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
		    &glist[c]) != 0) {
			error = SET_ERROR(EINVAL);
			break;
		}

		/* look it up in the spa */
		vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
		if (vml[c] == NULL) {
			error = SET_ERROR(ENODEV);
			break;
		}

		/*
		 * make sure there's nothing stopping the split: the device
		 * must be a healthy, writeable leaf of a mirror, not a
		 * log/hole/spare/l2cache device, and must sit at the same
		 * child index in the pool as in the request.
		 */
		if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
		    vml[c]->vdev_islog ||
		    vml[c]->vdev_ishole ||
		    vml[c]->vdev_isspare ||
		    vml[c]->vdev_isl2cache ||
		    !vdev_writeable(vml[c]) ||
		    vml[c]->vdev_children != 0 ||
		    vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
		    c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
			error = SET_ERROR(EINVAL);
			break;
		}

		/* the remaining side of the mirror must stay self-sufficient */
		if (vdev_dtl_required(vml[c])) {
			error = SET_ERROR(EBUSY);
			break;
		}

		/* we need certain info from the top level */
		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
		    vml[c]->vdev_top->vdev_ms_array) == 0);
		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
		    vml[c]->vdev_top->vdev_ms_shift) == 0);
		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
		    vml[c]->vdev_top->vdev_asize) == 0);
		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
		    vml[c]->vdev_top->vdev_ashift) == 0);
	}

	if (error != 0) {
		kmem_free(vml, children * sizeof (vdev_t *));
		kmem_free(glist, children * sizeof (uint64_t));
		return (spa_vdev_exit(spa, NULL, txg, error));
	}

	/* stop writers from using the disks */
	for (c = 0; c < children; c++) {
		if (vml[c] != NULL)
			vml[c]->vdev_offline = B_TRUE;
	}
	vdev_reopen(spa->spa_root_vdev);

	/*
	 * Temporarily record the splitting vdevs in the spa config.  This
	 * will disappear once the config is regenerated.
	 */
	VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
	    glist, children) == 0);
	kmem_free(glist, children * sizeof (uint64_t));

	mutex_enter(&spa->spa_props_lock);
	VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
	    nvl) == 0);
	mutex_exit(&spa->spa_props_lock);
	spa->spa_config_splitting = nvl;
	vdev_config_dirty(spa->spa_root_vdev);

	/* configure and create the new pool */
	VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
	    exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
	    spa_version(spa)) == 0);
	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
	    spa->spa_config_txg) == 0);
	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
	    spa_generate_guid(NULL)) == 0);
	(void) nvlist_lookup_string(props,
	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);

	/* add the new pool to the namespace */
	newspa = spa_add(newname, config, altroot);
	newspa->spa_config_txg = spa->spa_config_txg;
	spa_set_log_state(newspa, SPA_LOG_CLEAR);

	/* release the spa config lock, retaining the namespace lock */
	spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);

	if (zio_injection_enabled)
		zio_handle_panic_injection(spa, FTAG, 1);

	spa_activate(newspa, spa_mode_global);
	spa_async_suspend(newspa);

#ifndef sun
	/* mark that we are creating new spa by splitting */
	newspa->spa_splitting_newspa = B_TRUE;
#endif
	/* create the new pool from the disks of the original pool */
	error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
#ifndef sun
	newspa->spa_splitting_newspa = B_FALSE;
#endif
	if (error)
		goto out;

	/* if that worked, generate a real config for the new pool */
	if (newspa->spa_root_vdev != NULL) {
		VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
		    ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
		spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
		    B_TRUE));
	}

	/* set the props */
	if (props != NULL) {
		spa_configfile_set(newspa, props, B_FALSE);
		error = spa_prop_set(newspa, props);
		if (error)
			goto out;
	}

	/* flush everything */
	txg = spa_vdev_config_enter(newspa);
	vdev_config_dirty(newspa->spa_root_vdev);
	(void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);

	if (zio_injection_enabled)
		zio_handle_panic_injection(spa, FTAG, 2);

	spa_async_resume(newspa);

	/* finally, update the original pool's config */
	txg = spa_vdev_config_enter(spa);
	tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error != 0)
		dmu_tx_abort(tx);
	/*
	 * Detach and free the split vdevs from the original pool even if
	 * the tx assignment failed; in that case only the per-vdev history
	 * logging is skipped (error is not returned from here on).
	 */
	for (c = 0; c < children; c++) {
		if (vml[c] != NULL) {
			vdev_split(vml[c]);
			if (error == 0)
				spa_history_log_internal(spa, "detach", tx,
				    "vdev=%s", vml[c]->vdev_path);
			vdev_free(vml[c]);
		}
	}
	vdev_config_dirty(spa->spa_root_vdev);
	spa->spa_config_splitting = NULL;
	nvlist_free(nvl);
	if (error == 0)
		dmu_tx_commit(tx);
	(void) spa_vdev_exit(spa, NULL, txg, 0);

	if (zio_injection_enabled)
		zio_handle_panic_injection(spa, FTAG, 3);

	/* split is complete; log a history record */
	spa_history_log_internal(newspa, "split", NULL,
	    "from pool %s", spa_name(spa));

	kmem_free(vml, children * sizeof (vdev_t *));

	/* if we're not going to mount the filesystems in userland, export */
	if (exp)
		error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
		    B_FALSE, B_FALSE);

	return (error);

out:
	/* unwind: tear down the half-created pool and re-online the disks */
	spa_unload(newspa);
	spa_deactivate(newspa);
	spa_remove(newspa);

	txg = spa_vdev_config_enter(spa);

	/* re-online all offlined disks */
	for (c = 0; c < children; c++) {
		if (vml[c] != NULL)
			vml[c]->vdev_offline = B_FALSE;
	}
	vdev_reopen(spa->spa_root_vdev);

	nvlist_free(spa->spa_config_splitting);
	spa->spa_config_splitting = NULL;
	(void) spa_vdev_exit(spa, NULL, txg, error);

	kmem_free(vml, children * sizeof (vdev_t *));
	return (error);
}

/*
 * Return the element of the nvlist array nvpp[0..count-1] whose
 * ZPOOL_CONFIG_GUID equals target_guid, or NULL if none matches.
 * Every element is required (VERIFY) to carry a GUID.
 */
static nvlist_t *
spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
{
	for (int i = 0; i < count; i++) {
		uint64_t guid;

		VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID,
		    &guid) == 0);

		if (guid == target_guid)
			return (nvpp[i]);
	}

	return (NULL);
}

/*
 * Replace the nvlist array 'name' inside 'config' with a copy that omits
 * dev_to_remove.  Used to drop a spare or l2cache entry from the aux
 * config.
 *
 * NOTE(review): when count == 1 this passes newdev == NULL with length 0
 * to nvlist_add_nvlist_array() — presumably nvlist accepts an empty
 * array here; confirm against the libnvpair contract.
 */
static void
spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
    nvlist_t *dev_to_remove)
{
	nvlist_t **newdev = NULL;

	if (count > 1)
		newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);

	for (int i = 0, j = 0; i < count; i++) {
		if (dev[i] == dev_to_remove)
			continue;
		VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0);
	}

	VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
	VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0);

	/* nvlist_add_nvlist_array copied the elements; free our dups */
	for (int i = 0; i < count - 1; i++)
		nvlist_free(newdev[i]);

	if (count > 1)
		kmem_free(newdev, (count - 1) * sizeof (void *));
}

/*
 * Evacuate the device.
 */
static int
spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
{
	uint64_t txg;
	int error = 0;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
	ASSERT(vd == vd->vdev_top);

	/*
	 * Evacuate the device.  We don't hold the config lock as writer
	 * since we need to do I/O but we do keep the
	 * spa_namespace_lock held.  Once this completes the device
	 * should no longer have any blocks allocated on it.
	 */
	if (vd->vdev_islog) {
		if (vd->vdev_stat.vs_alloc != 0)
			error = spa_offline_log(spa);
	} else {
		/* only log devices can be evacuated in this version */
		error = SET_ERROR(ENOTSUP);
	}

	if (error)
		return (error);

	/*
	 * The evacuation succeeded.  Remove any remaining MOS metadata
	 * associated with this vdev, and wait for these changes to sync.
	 */
	ASSERT0(vd->vdev_stat.vs_alloc);
	txg = spa_vdev_config_enter(spa);
	vd->vdev_removing = B_TRUE;
	vdev_dirty(vd, 0, NULL, txg);
	vdev_config_dirty(vd);
	spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);

	return (0);
}

/*
 * Complete the removal by cleaning up the namespace.
 */
static void
spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
{
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t id = vd->vdev_id;
	boolean_t last_vdev = (id == (rvd->vdev_children - 1));

	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
	ASSERT(vd == vd->vdev_top);

	/*
	 * Only remove any devices which are empty.
	 */
	if (vd->vdev_stat.vs_alloc != 0)
		return;

	(void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);

	if (list_link_active(&vd->vdev_state_dirty_node))
		vdev_state_clean(vd);
	if (list_link_active(&vd->vdev_config_dirty_node))
		vdev_config_clean(vd);

	vdev_free(vd);

	/*
	 * Either shrink the child array (if we removed the tail) or plug
	 * the vacated slot with a hole vdev to keep child ids stable.
	 */
	if (last_vdev) {
		vdev_compact_children(rvd);
	} else {
		vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
		vdev_add_child(rvd, vd);
	}
	vdev_config_dirty(rvd);

	/*
	 * Reassess the health of our root vdev.
	 */
	vdev_reopen(rvd);
}

/*
 * Remove a device from the pool -
 *
 * Removing a device from the vdev namespace requires several steps
 * and can take a significant amount of time.  As a result we use
 * the spa_vdev_config_[enter/exit] functions which allow us to
 * grab and release the spa_config_lock while still holding the namespace
 * lock.  During each step the configuration is synced out.
 *
 * Currently, this supports removing only hot spares, slogs, and level 2 ARC
 * devices.
 */
int
spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
{
	vdev_t *vd;
	metaslab_group_t *mg;
	nvlist_t **spares, **l2cache, *nv;
	uint64_t txg = 0;
	uint_t nspares, nl2cache;
	int error = 0;
	boolean_t locked = MUTEX_HELD(&spa_namespace_lock);

	ASSERT(spa_writeable(spa));

	if (!locked)
		txg = spa_vdev_enter(spa);

	vd = spa_lookup_by_guid(spa, guid, B_FALSE);

	if (spa->spa_spares.sav_vdevs != NULL &&
	    nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
	    (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
		/*
		 * Only remove the hot spare if it's not currently in use
		 * in this pool.
		 */
		if (vd == NULL || unspare) {
			spa_vdev_remove_aux(spa->spa_spares.sav_config,
			    ZPOOL_CONFIG_SPARES, spares, nspares, nv);
			spa_load_spares(spa);
			spa->spa_spares.sav_sync = B_TRUE;
		} else {
			error = SET_ERROR(EBUSY);
		}
	} else if (spa->spa_l2cache.sav_vdevs != NULL &&
	    nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
	    (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
		/*
		 * Cache devices can always be removed.
		 */
		spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
		spa_load_l2cache(spa);
		spa->spa_l2cache.sav_sync = B_TRUE;
	} else if (vd != NULL && vd->vdev_islog) {
		ASSERT(!locked);
		ASSERT(vd == vd->vdev_top);

		/*
		 * XXX - Once we have bp-rewrite this should
		 * become the common case.
		 */

		mg = vd->vdev_mg;

		/*
		 * Stop allocating from this vdev.
		 */
		metaslab_group_passivate(mg);

		/*
		 * Wait for the youngest allocations and frees to sync,
		 * and then wait for the deferral of those frees to finish.
		 */
		spa_vdev_config_exit(spa, NULL,
		    txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);

		/*
		 * Attempt to evacuate the vdev.
		 */
		error = spa_vdev_remove_evacuate(spa, vd);

		txg = spa_vdev_config_enter(spa);

		/*
		 * If we couldn't evacuate the vdev, unwind.
		 */
		if (error) {
			metaslab_group_activate(mg);
			return (spa_vdev_exit(spa, NULL, txg, error));
		}

		/*
		 * Clean up the vdev namespace.
		 */
		spa_vdev_remove_from_namespace(spa, vd);

	} else if (vd != NULL) {
		/*
		 * Normal vdevs cannot be removed (yet).
		 */
		error = SET_ERROR(ENOTSUP);
	} else {
		/*
		 * There is no vdev of any kind with the specified guid.
		 */
		error = SET_ERROR(ENOENT);
	}

	if (!locked)
		return (spa_vdev_exit(spa, NULL, txg, error));

	return (error);
}

/*
 * Find any device that's done replacing, or a vdev marked 'unspare' that's
 * currently spared, so we can detach it.
 */
static vdev_t *
spa_vdev_resilver_done_hunt(vdev_t *vd)
{
	vdev_t *newvd, *oldvd;

	/* depth-first: prefer a match anywhere below us */
	for (int c = 0; c < vd->vdev_children; c++) {
		oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
		if (oldvd != NULL)
			return (oldvd);
	}

	/*
	 * Check for a completed replacement.  We always consider the first
	 * vdev in the list to be the oldest vdev, and the last one to be
	 * the newest (see spa_vdev_attach() for how that works).  In
	 * the case where the newest vdev is faulted, we will not automatically
	 * remove it after a resilver completes.  This is OK as it will require
	 * user intervention to determine which disk the admin wishes to keep.
	 */
	if (vd->vdev_ops == &vdev_replacing_ops) {
		ASSERT(vd->vdev_children > 1);

		newvd = vd->vdev_child[vd->vdev_children - 1];
		oldvd = vd->vdev_child[0];

		if (vdev_dtl_empty(newvd, DTL_MISSING) &&
		    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
		    !vdev_dtl_required(oldvd))
			return (oldvd);
	}

	/*
	 * Check for a completed resilver with the 'unspare' flag set.
	 */
	if (vd->vdev_ops == &vdev_spare_ops) {
		vdev_t *first = vd->vdev_child[0];
		vdev_t *last = vd->vdev_child[vd->vdev_children - 1];

		if (last->vdev_unspare) {
			oldvd = first;
			newvd = last;
		} else if (first->vdev_unspare) {
			oldvd = last;
			newvd = first;
		} else {
			oldvd = NULL;
		}

		if (oldvd != NULL &&
		    vdev_dtl_empty(newvd, DTL_MISSING) &&
		    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
		    !vdev_dtl_required(oldvd))
			return (oldvd);

		/*
		 * If there are more than two spares attached to a disk,
		 * and those spares are not required, then we want to
		 * attempt to free them up now so that they can be used
		 * by other pools.  Once we're back down to a single
		 * disk+spare, we stop removing them.
		 */
		if (vd->vdev_children > 2) {
			newvd = vd->vdev_child[1];

			if (newvd->vdev_isspare && last->vdev_isspare &&
			    vdev_dtl_empty(last, DTL_MISSING) &&
			    vdev_dtl_empty(last, DTL_OUTAGE) &&
			    !vdev_dtl_required(newvd))
				return (newvd);
		}
	}

	return (NULL);
}

/*
 * Detach every vdev that spa_vdev_resilver_done_hunt() reports as done.
 * The config lock is dropped around each spa_vdev_detach() call and the
 * hunt is restarted from the root afterwards, since the tree may have
 * changed.
 */
static void
spa_vdev_resilver_done(spa_t *spa)
{
	vdev_t *vd, *pvd, *ppvd;
	uint64_t guid, sguid, pguid, ppguid;

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
		pvd = vd->vdev_parent;
		ppvd = pvd->vdev_parent;
		/* capture guids before dropping the lock; vd may go away */
		guid = vd->vdev_guid;
		pguid = pvd->vdev_guid;
		ppguid = ppvd->vdev_guid;
		sguid = 0;
		/*
		 * If we have just finished replacing a hot spared device, then
		 * we need to detach the parent's first child (the original hot
		 * spare) as well.
		 */
		if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
		    ppvd->vdev_children == 2) {
			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
			sguid = ppvd->vdev_child[1]->vdev_guid;
		}
		ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));

		spa_config_exit(spa, SCL_ALL, FTAG);
		if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
			return;
		if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
			return;
		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	}

	spa_config_exit(spa, SCL_ALL, FTAG);
}

/*
 * Update the stored path or FRU for this vdev.
 */
int
spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
    boolean_t ispath)
{
	vdev_t *vd;
	boolean_t sync = B_FALSE;

	ASSERT(spa_writeable(spa));

	spa_vdev_state_enter(spa, SCL_ALL);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_state_exit(spa, NULL, ENOENT));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

	if (ispath) {
		/*
		 * NOTE(review): unlike the FRU branch below, this does not
		 * handle vd->vdev_path == NULL before strcmp() — presumably
		 * every leaf has a path by the time this is called; confirm.
		 */
		if (strcmp(value, vd->vdev_path) != 0) {
			spa_strfree(vd->vdev_path);
			vd->vdev_path = spa_strdup(value);
			sync = B_TRUE;
		}
	} else {
		if (vd->vdev_fru == NULL) {
			vd->vdev_fru = spa_strdup(value);
			sync = B_TRUE;
		} else if (strcmp(value, vd->vdev_fru) != 0) {
			spa_strfree(vd->vdev_fru);
			vd->vdev_fru = spa_strdup(value);
			sync = B_TRUE;
		}
	}

	/* only sync the config if something actually changed */
	return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
}

/* Update the stored path for the vdev with the given guid. */
int
spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
{
	return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
}

/* Update the stored FRU for the vdev with the given guid. */
int
spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
{
	return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
}

/*
 * ==========================================================================
 * SPA Scanning
 * ==========================================================================
 */

/*
 * Cancel an in-progress scan; refuses (EBUSY) while a resilver is active.
 */
int
spa_scan_stop(spa_t *spa)
{
	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
	if (dsl_scan_resilvering(spa->spa_dsl_pool))
		return (SET_ERROR(EBUSY));
	return (dsl_scan_cancel(spa->spa_dsl_pool));
}

/*
 * Start a scan (scrub or resilver) of the given kind.
 */
int
spa_scan(spa_t *spa, pool_scan_func_t func)
{
	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);

	if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
		return (SET_ERROR(ENOTSUP));

	/*
	 * If a resilver was requested, but there is no DTL on a
	 * writeable leaf device, we have nothing to do.
	 */
	if (func == POOL_SCAN_RESILVER &&
	    !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
		spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
		return (0);
	}

	return (dsl_scan(spa->spa_dsl_pool, func));
}

/*
 * ==========================================================================
 * SPA async task processing
 * ==========================================================================
 */

/*
 * Recursively mark any vdev with vdev_remove_wanted set as REMOVED and
 * clear its error counters.
 */
static void
spa_async_remove(spa_t *spa, vdev_t *vd)
{
	if (vd->vdev_remove_wanted) {
		vd->vdev_remove_wanted = B_FALSE;
		vd->vdev_delayed_close = B_FALSE;
		vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);

		/*
		 * We want to clear the stats, but we don't want to do a full
		 * vdev_clear() as that will cause us to throw away
		 * degraded/faulted state as well as attempt to reopen the
		 * device, all of which is a waste.
		 */
		vd->vdev_stat.vs_read_errors = 0;
		vd->vdev_stat.vs_write_errors = 0;
		vd->vdev_stat.vs_checksum_errors = 0;

		vdev_state_dirty(vd->vdev_top);
	}

	for (int c = 0; c < vd->vdev_children; c++)
		spa_async_remove(spa, vd->vdev_child[c]);
}

/*
 * Recursively reopen (and thereby probe) any vdev with vdev_probe_wanted
 * set.
 */
static void
spa_async_probe(spa_t *spa, vdev_t *vd)
{
	if (vd->vdev_probe_wanted) {
		vd->vdev_probe_wanted = B_FALSE;
		vdev_reopen(vd);	/* vdev_open() does the actual probe */
	}

	for (int c = 0; c < vd->vdev_children; c++)
		spa_async_probe(spa, vd->vdev_child[c]);
}

/*
 * Post a sysevent for each leaf vdev's physical path so interested
 * listeners can autoexpand the device.  No-op unless the pool has the
 * autoexpand property set.
 */
static void
spa_async_autoexpand(spa_t *spa, vdev_t *vd)
{
	sysevent_id_t eid;
	nvlist_t *attr;
	char *physpath;

	if (!spa->spa_autoexpand)
		return;

	for (int c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];
		spa_async_autoexpand(spa, cvd);
	}

	if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
		return;

	physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
	(void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath);

	VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);

	(void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
	    ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP);

	nvlist_free(attr);
	kmem_free(physpath, MAXPATHLEN);
}

/*
 * Worker thread that services all async tasks except SPA_ASYNC_REMOVE
 * (which is handled by spa_async_thread_vd() below).  Exits when its
 * snapshot of the task mask has been processed.
 */
static void
spa_async_thread(void *arg)
{
	spa_t *spa = arg;
	int tasks;

	ASSERT(spa->spa_sync_on);

	mutex_enter(&spa->spa_async_lock);
	tasks = spa->spa_async_tasks;
	/* leave SPA_ASYNC_REMOVE pending for the vd thread */
	spa->spa_async_tasks &= SPA_ASYNC_REMOVE;
	mutex_exit(&spa->spa_async_lock);

	/*
	 * See if the config needs to be updated.
	 */
	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
		uint64_t old_space, new_space;

		mutex_enter(&spa_namespace_lock);
		old_space = metaslab_class_get_space(spa_normal_class(spa));
		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
		new_space = metaslab_class_get_space(spa_normal_class(spa));
		mutex_exit(&spa_namespace_lock);

		/*
		 * If the pool grew as a result of the config update,
		 * then log an internal history event.
		 */
		if (new_space != old_space) {
			spa_history_log_internal(spa, "vdev online", NULL,
			    "pool '%s' size: %llu(+%llu)",
			    spa_name(spa), new_space, new_space - old_space);
		}
	}

	if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
		spa_async_autoexpand(spa, spa->spa_root_vdev);
		spa_config_exit(spa, SCL_CONFIG, FTAG);
	}

	/*
	 * See if any devices need to be probed.
	 */
	if (tasks & SPA_ASYNC_PROBE) {
		spa_vdev_state_enter(spa, SCL_NONE);
		spa_async_probe(spa, spa->spa_root_vdev);
		(void) spa_vdev_state_exit(spa, NULL, 0);
	}

	/*
	 * If any devices are done replacing, detach them.
	 */
	if (tasks & SPA_ASYNC_RESILVER_DONE)
		spa_vdev_resilver_done(spa);

	/*
	 * Kick off a resilver.
	 */
	if (tasks & SPA_ASYNC_RESILVER)
		dsl_resilver_restart(spa->spa_dsl_pool, 0);

	/*
	 * Let the world know that we're done.
	 */
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_thread = NULL;
	cv_broadcast(&spa->spa_async_cv);
	mutex_exit(&spa->spa_async_lock);
	thread_exit();
}

/*
 * Worker thread dedicated to SPA_ASYNC_REMOVE processing.  Loops until
 * no further remove requests arrive while it is working.
 */
static void
spa_async_thread_vd(void *arg)
{
	spa_t *spa = arg;
	int tasks;

	ASSERT(spa->spa_sync_on);

	mutex_enter(&spa->spa_async_lock);
	tasks = spa->spa_async_tasks;
retry:
	spa->spa_async_tasks &= ~SPA_ASYNC_REMOVE;
	mutex_exit(&spa->spa_async_lock);

	/*
	 * See if any devices need to be marked REMOVED.
	 */
	if (tasks & SPA_ASYNC_REMOVE) {
		spa_vdev_state_enter(spa, SCL_NONE);
		spa_async_remove(spa, spa->spa_root_vdev);
		for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
			spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
		for (int i = 0; i < spa->spa_spares.sav_count; i++)
			spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
		(void) spa_vdev_state_exit(spa, NULL, 0);
	}

	/*
	 * Let the world know that we're done.
	 */
	mutex_enter(&spa->spa_async_lock);
	tasks = spa->spa_async_tasks;
	/* a new remove request arrived while we were working; go again */
	if ((tasks & SPA_ASYNC_REMOVE) != 0)
		goto retry;
	spa->spa_async_thread_vd = NULL;
	cv_broadcast(&spa->spa_async_cv);
	mutex_exit(&spa->spa_async_lock);
	thread_exit();
}

/*
 * Block new async threads from starting and wait for the running ones to
 * finish.
 *
 * NOTE(review): the wait condition uses && — it stops waiting as soon as
 * EITHER worker thread has exited, even though the other may still be
 * running.  Presumably this should be ||; confirm against later upstream
 * revisions of this file before relying on full quiescence here.
 */
void
spa_async_suspend(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_suspended++;
	while (spa->spa_async_thread != NULL &&
	    spa->spa_async_thread_vd != NULL)
		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
	mutex_exit(&spa->spa_async_lock);
}

/*
 * Undo one spa_async_suspend(); async threads may be dispatched again
 * once the count reaches zero.
 */
void
spa_async_resume(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	ASSERT(spa->spa_async_suspended != 0);
	spa->spa_async_suspended--;
	mutex_exit(&spa->spa_async_lock);
}

/*
 * Decide whether spa_async_thread() has work to do.  A pending config
 * update is ignored while within the retry back-off window after a failed
 * config write (spa_ccw_fail_time).  Caller holds spa_async_lock.
 */
static boolean_t
spa_async_tasks_pending(spa_t *spa)
{
	uint_t non_config_tasks;
	uint_t config_task;
	boolean_t config_task_suspended;

	/* SPA_ASYNC_REMOVE belongs to spa_async_thread_vd(), not us */
	non_config_tasks = spa->spa_async_tasks & ~(SPA_ASYNC_CONFIG_UPDATE |
	    SPA_ASYNC_REMOVE);
	config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE;
	if (spa->spa_ccw_fail_time == 0) {
		config_task_suspended = B_FALSE;
	} else {
		config_task_suspended =
		    (gethrtime() - spa->spa_ccw_fail_time) <
		    (zfs_ccw_retry_interval * NANOSEC);
	}

	return (non_config_tasks || (config_task && !config_task_suspended));
}

/*
 * Start spa_async_thread() if there is work and nothing prevents it
 * (suspension, an already-running thread, or a not-yet-mounted root).
 */
static void
spa_async_dispatch(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	if (spa_async_tasks_pending(spa) &&
	    !spa->spa_async_suspended &&
	    spa->spa_async_thread == NULL &&
	    rootdir != NULL)
		spa->spa_async_thread = thread_create(NULL, 0,
		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
	mutex_exit(&spa->spa_async_lock);
}

/*
 * Start spa_async_thread_vd() if a remove request is pending and nothing
 * prevents it (same gating conditions as spa_async_dispatch()).
 */
static void
spa_async_dispatch_vd(spa_t *spa)
{
	mutex_enter(&spa->spa_async_lock);
	if ((spa->spa_async_tasks & SPA_ASYNC_REMOVE) != 0 &&
	    !spa->spa_async_suspended &&
	    spa->spa_async_thread_vd == NULL &&
	    rootdir != NULL)
		spa->spa_async_thread_vd = thread_create(NULL, 0,
		    spa_async_thread_vd, spa, 0, &p0, TS_RUN, maxclsyspri);
	mutex_exit(&spa->spa_async_lock);
}

/*
 * Record an async task request.  Only the vd (remove) worker is kicked
 * here; spa_async_dispatch() for the other tasks is invoked from the
 * sync path elsewhere in this file.
 */
void
spa_async_request(spa_t *spa, int task)
{
	zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
	mutex_enter(&spa->spa_async_lock);
	spa->spa_async_tasks |= task;
	mutex_exit(&spa->spa_async_lock);
	spa_async_dispatch_vd(spa);
}

/*
 * ==========================================================================
 * SPA syncing routines
 * ==========================================================================
 */

/*
 * bplist iteration callback: append the block pointer to the bpobj in arg.
 */
static int
bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	bpobj_t *bpo = arg;
	bpobj_enqueue(bpo, bp, tx);
	return (0);
}

/*
 * bplist iteration callback: issue an async free of the block pointer as
 * a child of the zio in arg.
 */
static int
spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	zio_t *zio = arg;

	zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
	    BP_GET_PSIZE(bp), zio->io_flags));
	return (0);
}

static void
spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
{
	char *packed = NULL;
	size_t bufsize;
	size_t nvsize = 0;
	dmu_buf_t *db;

	VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);

	/*
	 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
	 * information.  This avoids the dbuf_will_dirty() path and
	 * saves us a pre-read to get data we don't actually care about.
5993185029Spjd */ 5994236884Smm bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); 5995185029Spjd packed = kmem_alloc(bufsize, KM_SLEEP); 5996168404Spjd 5997168404Spjd VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 5998168404Spjd KM_SLEEP) == 0); 5999185029Spjd bzero(packed + nvsize, bufsize - nvsize); 6000168404Spjd 6001185029Spjd dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 6002168404Spjd 6003185029Spjd kmem_free(packed, bufsize); 6004168404Spjd 6005168404Spjd VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 6006168404Spjd dmu_buf_will_dirty(db, tx); 6007168404Spjd *(uint64_t *)db->db_data = nvsize; 6008168404Spjd dmu_buf_rele(db, FTAG); 6009168404Spjd} 6010168404Spjd 6011168404Spjdstatic void 6012185029Spjdspa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 6013185029Spjd const char *config, const char *entry) 6014168404Spjd{ 6015168404Spjd nvlist_t *nvroot; 6016185029Spjd nvlist_t **list; 6017168404Spjd int i; 6018168404Spjd 6019185029Spjd if (!sav->sav_sync) 6020168404Spjd return; 6021168404Spjd 6022168404Spjd /* 6023185029Spjd * Update the MOS nvlist describing the list of available devices. 6024185029Spjd * spa_validate_aux() will have already made sure this nvlist is 6025185029Spjd * valid and the vdevs are labeled appropriately. 
6026168404Spjd */ 6027185029Spjd if (sav->sav_object == 0) { 6028185029Spjd sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 6029185029Spjd DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 6030185029Spjd sizeof (uint64_t), tx); 6031168404Spjd VERIFY(zap_update(spa->spa_meta_objset, 6032185029Spjd DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 6033185029Spjd &sav->sav_object, tx) == 0); 6034168404Spjd } 6035168404Spjd 6036168404Spjd VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 6037185029Spjd if (sav->sav_count == 0) { 6038185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 6039168404Spjd } else { 6040185029Spjd list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 6041185029Spjd for (i = 0; i < sav->sav_count; i++) 6042185029Spjd list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 6043219089Spjd B_FALSE, VDEV_CONFIG_L2CACHE); 6044185029Spjd VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 6045185029Spjd sav->sav_count) == 0); 6046185029Spjd for (i = 0; i < sav->sav_count; i++) 6047185029Spjd nvlist_free(list[i]); 6048185029Spjd kmem_free(list, sav->sav_count * sizeof (void *)); 6049168404Spjd } 6050168404Spjd 6051185029Spjd spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 6052168404Spjd nvlist_free(nvroot); 6053168404Spjd 6054185029Spjd sav->sav_sync = B_FALSE; 6055168404Spjd} 6056168404Spjd 6057168404Spjdstatic void 6058168404Spjdspa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 6059168404Spjd{ 6060168404Spjd nvlist_t *config; 6061168404Spjd 6062185029Spjd if (list_is_empty(&spa->spa_config_dirty_list)) 6063168404Spjd return; 6064168404Spjd 6065185029Spjd spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 6066168404Spjd 6067185029Spjd config = spa_config_generate(spa, spa->spa_root_vdev, 6068185029Spjd dmu_tx_get_txg(tx), B_FALSE); 6069185029Spjd 6070243505Smm /* 6071243505Smm * If we're upgrading the spa version then make sure that 6072243505Smm * the config object gets updated with 
the correct version. 6073243505Smm */ 6074243505Smm if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) 6075243505Smm fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 6076243505Smm spa->spa_uberblock.ub_version); 6077243505Smm 6078185029Spjd spa_config_exit(spa, SCL_STATE, FTAG); 6079185029Spjd 6080168404Spjd if (spa->spa_config_syncing) 6081168404Spjd nvlist_free(spa->spa_config_syncing); 6082168404Spjd spa->spa_config_syncing = config; 6083168404Spjd 6084168404Spjd spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 6085168404Spjd} 6086168404Spjd 6087236884Smmstatic void 6088248571Smmspa_sync_version(void *arg, dmu_tx_t *tx) 6089236884Smm{ 6090248571Smm uint64_t *versionp = arg; 6091248571Smm uint64_t version = *versionp; 6092248571Smm spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6093236884Smm 6094236884Smm /* 6095236884Smm * Setting the version is special cased when first creating the pool. 6096236884Smm */ 6097236884Smm ASSERT(tx->tx_txg != TXG_INITIAL); 6098236884Smm 6099247592Sdelphij ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 6100236884Smm ASSERT(version >= spa_version(spa)); 6101236884Smm 6102236884Smm spa->spa_uberblock.ub_version = version; 6103236884Smm vdev_config_dirty(spa->spa_root_vdev); 6104248571Smm spa_history_log_internal(spa, "set", tx, "version=%lld", version); 6105236884Smm} 6106236884Smm 6107185029Spjd/* 6108185029Spjd * Set zpool properties. 
6109185029Spjd */ 6110168404Spjdstatic void 6111248571Smmspa_sync_props(void *arg, dmu_tx_t *tx) 6112168404Spjd{ 6113248571Smm nvlist_t *nvp = arg; 6114248571Smm spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6115185029Spjd objset_t *mos = spa->spa_meta_objset; 6116236884Smm nvpair_t *elem = NULL; 6117168404Spjd 6118168404Spjd mutex_enter(&spa->spa_props_lock); 6119168404Spjd 6120185029Spjd while ((elem = nvlist_next_nvpair(nvp, elem))) { 6121236884Smm uint64_t intval; 6122236884Smm char *strval, *fname; 6123236884Smm zpool_prop_t prop; 6124236884Smm const char *propname; 6125236884Smm zprop_type_t proptype; 6126236884Smm zfeature_info_t *feature; 6127236884Smm 6128185029Spjd switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 6129236884Smm case ZPROP_INVAL: 6130236884Smm /* 6131236884Smm * We checked this earlier in spa_prop_validate(). 6132236884Smm */ 6133236884Smm ASSERT(zpool_prop_feature(nvpair_name(elem))); 6134236884Smm 6135236884Smm fname = strchr(nvpair_name(elem), '@') + 1; 6136236884Smm VERIFY3U(0, ==, zfeature_lookup_name(fname, &feature)); 6137236884Smm 6138236884Smm spa_feature_enable(spa, feature, tx); 6139248571Smm spa_history_log_internal(spa, "set", tx, 6140248571Smm "%s=enabled", nvpair_name(elem)); 6141236884Smm break; 6142236884Smm 6143185029Spjd case ZPOOL_PROP_VERSION: 6144236884Smm VERIFY(nvpair_value_uint64(elem, &intval) == 0); 6145185029Spjd /* 6146236884Smm * The version is synced seperatly before other 6147236884Smm * properties and should be correct by now. 6148185029Spjd */ 6149236884Smm ASSERT3U(spa_version(spa), >=, intval); 6150185029Spjd break; 6151168404Spjd 6152185029Spjd case ZPOOL_PROP_ALTROOT: 6153185029Spjd /* 6154185029Spjd * 'altroot' is a non-persistent property. It should 6155185029Spjd * have been set temporarily at creation or import time. 
6156185029Spjd */ 6157185029Spjd ASSERT(spa->spa_root != NULL); 6158185029Spjd break; 6159168404Spjd 6160219089Spjd case ZPOOL_PROP_READONLY: 6161185029Spjd case ZPOOL_PROP_CACHEFILE: 6162185029Spjd /* 6163219089Spjd * 'readonly' and 'cachefile' are also non-persisitent 6164219089Spjd * properties. 6165185029Spjd */ 6166168404Spjd break; 6167228103Smm case ZPOOL_PROP_COMMENT: 6168228103Smm VERIFY(nvpair_value_string(elem, &strval) == 0); 6169228103Smm if (spa->spa_comment != NULL) 6170228103Smm spa_strfree(spa->spa_comment); 6171228103Smm spa->spa_comment = spa_strdup(strval); 6172228103Smm /* 6173228103Smm * We need to dirty the configuration on all the vdevs 6174228103Smm * so that their labels get updated. It's unnecessary 6175228103Smm * to do this for pool creation since the vdev's 6176228103Smm * configuratoin has already been dirtied. 6177228103Smm */ 6178228103Smm if (tx->tx_txg != TXG_INITIAL) 6179228103Smm vdev_config_dirty(spa->spa_root_vdev); 6180248571Smm spa_history_log_internal(spa, "set", tx, 6181248571Smm "%s=%s", nvpair_name(elem), strval); 6182228103Smm break; 6183185029Spjd default: 6184185029Spjd /* 6185185029Spjd * Set pool property values in the poolprops mos object. 
6186185029Spjd */ 6187185029Spjd if (spa->spa_pool_props_object == 0) { 6188236884Smm spa->spa_pool_props_object = 6189236884Smm zap_create_link(mos, DMU_OT_POOL_PROPS, 6190185029Spjd DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 6191236884Smm tx); 6192185029Spjd } 6193185029Spjd 6194185029Spjd /* normalize the property name */ 6195185029Spjd propname = zpool_prop_to_name(prop); 6196185029Spjd proptype = zpool_prop_get_type(prop); 6197185029Spjd 6198185029Spjd if (nvpair_type(elem) == DATA_TYPE_STRING) { 6199185029Spjd ASSERT(proptype == PROP_TYPE_STRING); 6200185029Spjd VERIFY(nvpair_value_string(elem, &strval) == 0); 6201185029Spjd VERIFY(zap_update(mos, 6202185029Spjd spa->spa_pool_props_object, propname, 6203185029Spjd 1, strlen(strval) + 1, strval, tx) == 0); 6204248571Smm spa_history_log_internal(spa, "set", tx, 6205248571Smm "%s=%s", nvpair_name(elem), strval); 6206185029Spjd } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 6207185029Spjd VERIFY(nvpair_value_uint64(elem, &intval) == 0); 6208185029Spjd 6209185029Spjd if (proptype == PROP_TYPE_INDEX) { 6210185029Spjd const char *unused; 6211185029Spjd VERIFY(zpool_prop_index_to_string( 6212185029Spjd prop, intval, &unused) == 0); 6213185029Spjd } 6214185029Spjd VERIFY(zap_update(mos, 6215185029Spjd spa->spa_pool_props_object, propname, 6216185029Spjd 8, 1, &intval, tx) == 0); 6217248571Smm spa_history_log_internal(spa, "set", tx, 6218248571Smm "%s=%lld", nvpair_name(elem), intval); 6219185029Spjd } else { 6220185029Spjd ASSERT(0); /* not allowed */ 6221185029Spjd } 6222185029Spjd 6223185029Spjd switch (prop) { 6224185029Spjd case ZPOOL_PROP_DELEGATION: 6225185029Spjd spa->spa_delegation = intval; 6226185029Spjd break; 6227185029Spjd case ZPOOL_PROP_BOOTFS: 6228185029Spjd spa->spa_bootfs = intval; 6229185029Spjd break; 6230185029Spjd case ZPOOL_PROP_FAILUREMODE: 6231185029Spjd spa->spa_failmode = intval; 6232185029Spjd break; 6233219089Spjd case ZPOOL_PROP_AUTOEXPAND: 6234219089Spjd spa->spa_autoexpand = 
intval; 6235219089Spjd if (tx->tx_txg != TXG_INITIAL) 6236219089Spjd spa_async_request(spa, 6237219089Spjd SPA_ASYNC_AUTOEXPAND); 6238219089Spjd break; 6239219089Spjd case ZPOOL_PROP_DEDUPDITTO: 6240219089Spjd spa->spa_dedup_ditto = intval; 6241219089Spjd break; 6242185029Spjd default: 6243185029Spjd break; 6244185029Spjd } 6245168404Spjd } 6246185029Spjd 6247168404Spjd } 6248185029Spjd 6249185029Spjd mutex_exit(&spa->spa_props_lock); 6250168404Spjd} 6251168404Spjd 6252168404Spjd/* 6253219089Spjd * Perform one-time upgrade on-disk changes. spa_version() does not 6254219089Spjd * reflect the new version this txg, so there must be no changes this 6255219089Spjd * txg to anything that the upgrade code depends on after it executes. 6256219089Spjd * Therefore this must be called after dsl_pool_sync() does the sync 6257219089Spjd * tasks. 6258219089Spjd */ 6259219089Spjdstatic void 6260219089Spjdspa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) 6261219089Spjd{ 6262219089Spjd dsl_pool_t *dp = spa->spa_dsl_pool; 6263219089Spjd 6264219089Spjd ASSERT(spa->spa_sync_pass == 1); 6265219089Spjd 6266248571Smm rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); 6267248571Smm 6268219089Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 6269219089Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 6270219089Spjd dsl_pool_create_origin(dp, tx); 6271219089Spjd 6272219089Spjd /* Keeping the origin open increases spa_minref */ 6273219089Spjd spa->spa_minref += 3; 6274219089Spjd } 6275219089Spjd 6276219089Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 6277219089Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 6278219089Spjd dsl_pool_upgrade_clones(dp, tx); 6279219089Spjd } 6280219089Spjd 6281219089Spjd if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && 6282219089Spjd spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { 6283219089Spjd dsl_pool_upgrade_dir_clones(dp, tx); 6284219089Spjd 6285219089Spjd /* Keeping the freedir open 
		 * increases spa_minref */
		spa->spa_minref += 3;
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
		spa_feature_create_zap_objects(spa, tx);
	}
	rrw_exit(&dp->dp_config_rwlock, FTAG);
}

/*
 * Sync the specified transaction group.  New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	objset_t *mos = spa->spa_meta_objset;
	bpobj_t *defer_bpo = &spa->spa_deferred_bpobj;
	bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;
	dmu_tx_t *tx;
	int error;

	VERIFY(spa_writeable(spa));

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	/*
	 * If there are any pending vdev state changes, convert them
	 * into config changes that go out with this transaction group.
	 */
	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	while (list_head(&spa->spa_state_dirty_list) != NULL) {
		/*
		 * We need the write lock here because, for aux vdevs,
		 * calling vdev_config_dirty() modifies sav_config.
		 * This is ugly and will become unnecessary when we
		 * eliminate the aux vdev wart by integrating all vdevs
		 * into the root vdev tree.
		 */
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
		while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
			vdev_state_clean(vd);
			vdev_config_dirty(vd);
		}
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
	}
	spa_config_exit(spa, SCL_STATE, FTAG);

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * Arm the deadman timer for this sync; it is disarmed again
	 * after dmu_tx_commit() below.
	 */
	spa->spa_sync_starttime = gethrtime();
#ifdef illumos
	VERIFY(cyclic_reprogram(spa->spa_deadman_cycid,
	    spa->spa_sync_starttime + spa->spa_deadman_synctime));
#else	/* FreeBSD */
#ifdef _KERNEL
	callout_reset(&spa->spa_deadman_cycid,
	    hz * spa->spa_deadman_synctime / NANOSEC, spa_deadman, spa);
#endif
#endif

	/*
	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
	 * set spa_deflate if we have no raid-z vdevs.
	 */
	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
		int i;

		for (i = 0; i < rvd->vdev_children; i++) {
			vd = rvd->vdev_child[i];
			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
				break;
		}
		if (i == rvd->vdev_children) {
			spa->spa_deflate = TRUE;
			VERIFY(0 == zap_add(spa->spa_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
		}
	}

	/*
	 * If anything has changed in this txg, or if someone is waiting
	 * for this txg to sync (eg, spa_vdev_remove()), push the
	 * deferred frees from the previous txg.  If not, leave them
	 * alone so that we don't generate work on an otherwise idle
	 * system.
	 */
	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
	    !txg_list_empty(&dp->dp_sync_tasks, txg) ||
	    ((dsl_scan_active(dp->dp_scan) ||
	    txg_sync_waiting(dp)) && !spa_shutting_down(spa))) {
		zio_t *zio = zio_root(spa, NULL, NULL, 0);
		VERIFY3U(bpobj_iterate(defer_bpo,
		    spa_free_sync_cb, zio, tx), ==, 0);
		VERIFY0(zio_wait(zio));
	}

	/*
	 * Iterate to convergence.
	 */
	do {
		int pass = ++spa->spa_sync_pass;

		spa_sync_config_object(spa, tx);
		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
		spa_errlog_sync(spa, txg);
		dsl_pool_sync(dp, txg);

		/*
		 * In early passes frees are issued immediately; in
		 * later passes they are deferred to the next txg to
		 * bound the amount of work per pass.
		 */
		if (pass < zfs_sync_pass_deferred_free) {
			zio_t *zio = zio_root(spa, NULL, NULL, 0);
			bplist_iterate(free_bpl, spa_free_sync_cb,
			    zio, tx);
			VERIFY(zio_wait(zio) == 0);
		} else {
			bplist_iterate(free_bpl, bpobj_enqueue_cb,
			    defer_bpo, tx);
		}

		ddt_sync(spa, txg);
		dsl_scan_sync(dp, tx);

		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
			vdev_sync(vd, txg);

		if (pass == 1)
			spa_sync_upgrades(spa, tx);

	} while (dmu_objset_is_dirty(mos, txg));

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 *
	 * If there are no dirty vdevs, we sync the uberblock to a few
	 * random top-level vdevs that are known to be visible in the
	 * config cache (see spa_vdev_add() for a complete description).
	 * If there *are* dirty vdevs, sync the uberblock to all vdevs.
	 */
	for (;;) {
		/*
		 * We hold SCL_STATE to prevent vdev open/close/etc.
		 * while we're attempting to write the vdev labels.
		 */
		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

		if (list_is_empty(&spa->spa_config_dirty_list)) {
			vdev_t *svd[SPA_DVAS_PER_BP];
			int svdcount = 0;
			int children = rvd->vdev_children;
			int c0 = spa_get_random(children);

			for (int c = 0; c < children; c++) {
				vd = rvd->vdev_child[(c0 + c) % children];
				if (vd->vdev_ms_array == 0 || vd->vdev_islog)
					continue;
				svd[svdcount++] = vd;
				if (svdcount == SPA_DVAS_PER_BP)
					break;
			}
			/* On failure, retry once allowing faulted writes. */
			error = vdev_config_sync(svd, svdcount, txg, B_FALSE);
			if (error != 0)
				error = vdev_config_sync(svd, svdcount, txg,
				    B_TRUE);
		} else {
			error = vdev_config_sync(rvd->vdev_child,
			    rvd->vdev_children, txg, B_FALSE);
			if (error != 0)
				error = vdev_config_sync(rvd->vdev_child,
				    rvd->vdev_children, txg, B_TRUE);
		}

		if (error == 0)
			spa->spa_last_synced_guid = rvd->vdev_guid;

		spa_config_exit(spa, SCL_STATE, FTAG);

		if (error == 0)
			break;
		/* Label write failed: suspend I/O and retry once resumed. */
		zio_suspend(spa, NULL);
		zio_resume_wait(spa);
	}
	dmu_tx_commit(tx);

#ifdef illumos
	VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
#else	/* FreeBSD */
#ifdef _KERNEL
	callout_drain(&spa->spa_deadman_cycid);
#endif
#endif

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	spa->spa_ubsync = spa->spa_uberblock;

	dsl_pool_sync_done(dp, txg);

	/*
	 * Update usable space statistics.
	 */
	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
		vdev_sync_done(vd, txg);

	spa_update_dspace(spa);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since vdev_config_sync().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));

	spa->spa_sync_pass = 0;

	spa_config_exit(spa, SCL_CONFIG, FTAG);

	spa_handle_ignored_writes(spa);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
	spa_async_dispatch_vd(spa);
}

/*
 * Sync all pools.  We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during the
 * sync.
6548168404Spjd */ 6549168404Spjdvoid 6550168404Spjdspa_sync_allpools(void) 6551168404Spjd{ 6552168404Spjd spa_t *spa = NULL; 6553168404Spjd mutex_enter(&spa_namespace_lock); 6554168404Spjd while ((spa = spa_next(spa)) != NULL) { 6555219089Spjd if (spa_state(spa) != POOL_STATE_ACTIVE || 6556219089Spjd !spa_writeable(spa) || spa_suspended(spa)) 6557168404Spjd continue; 6558168404Spjd spa_open_ref(spa, FTAG); 6559168404Spjd mutex_exit(&spa_namespace_lock); 6560168404Spjd txg_wait_synced(spa_get_dsl(spa), 0); 6561168404Spjd mutex_enter(&spa_namespace_lock); 6562168404Spjd spa_close(spa, FTAG); 6563168404Spjd } 6564168404Spjd mutex_exit(&spa_namespace_lock); 6565168404Spjd} 6566168404Spjd 6567168404Spjd/* 6568168404Spjd * ========================================================================== 6569168404Spjd * Miscellaneous routines 6570168404Spjd * ========================================================================== 6571168404Spjd */ 6572168404Spjd 6573168404Spjd/* 6574168404Spjd * Remove all pools in the system. 6575168404Spjd */ 6576168404Spjdvoid 6577168404Spjdspa_evict_all(void) 6578168404Spjd{ 6579168404Spjd spa_t *spa; 6580168404Spjd 6581168404Spjd /* 6582168404Spjd * Remove all cached state. All pools should be closed now, 6583168404Spjd * so every spa in the AVL tree should be unreferenced. 6584168404Spjd */ 6585168404Spjd mutex_enter(&spa_namespace_lock); 6586168404Spjd while ((spa = spa_next(NULL)) != NULL) { 6587168404Spjd /* 6588168404Spjd * Stop async tasks. The async thread may need to detach 6589168404Spjd * a device that's been replaced, which requires grabbing 6590168404Spjd * spa_namespace_lock, so we must drop it here. 
6591168404Spjd */ 6592168404Spjd spa_open_ref(spa, FTAG); 6593168404Spjd mutex_exit(&spa_namespace_lock); 6594168404Spjd spa_async_suspend(spa); 6595168404Spjd mutex_enter(&spa_namespace_lock); 6596168404Spjd spa_close(spa, FTAG); 6597168404Spjd 6598168404Spjd if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 6599168404Spjd spa_unload(spa); 6600168404Spjd spa_deactivate(spa); 6601168404Spjd } 6602168404Spjd spa_remove(spa); 6603168404Spjd } 6604168404Spjd mutex_exit(&spa_namespace_lock); 6605168404Spjd} 6606168404Spjd 6607168404Spjdvdev_t * 6608209962Smmspa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 6609168404Spjd{ 6610185029Spjd vdev_t *vd; 6611185029Spjd int i; 6612185029Spjd 6613185029Spjd if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 6614185029Spjd return (vd); 6615185029Spjd 6616209962Smm if (aux) { 6617185029Spjd for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 6618185029Spjd vd = spa->spa_l2cache.sav_vdevs[i]; 6619185029Spjd if (vd->vdev_guid == guid) 6620185029Spjd return (vd); 6621185029Spjd } 6622209962Smm 6623209962Smm for (i = 0; i < spa->spa_spares.sav_count; i++) { 6624209962Smm vd = spa->spa_spares.sav_vdevs[i]; 6625209962Smm if (vd->vdev_guid == guid) 6626209962Smm return (vd); 6627209962Smm } 6628185029Spjd } 6629185029Spjd 6630185029Spjd return (NULL); 6631168404Spjd} 6632168404Spjd 6633168404Spjdvoid 6634185029Spjdspa_upgrade(spa_t *spa, uint64_t version) 6635168404Spjd{ 6636219089Spjd ASSERT(spa_writeable(spa)); 6637219089Spjd 6638185029Spjd spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6639168404Spjd 6640168404Spjd /* 6641168404Spjd * This should only be called for a non-faulted pool, and since a 6642168404Spjd * future version would result in an unopenable pool, this shouldn't be 6643168404Spjd * possible. 
6644168404Spjd */ 6645247592Sdelphij ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); 6646185029Spjd ASSERT(version >= spa->spa_uberblock.ub_version); 6647168404Spjd 6648185029Spjd spa->spa_uberblock.ub_version = version; 6649168404Spjd vdev_config_dirty(spa->spa_root_vdev); 6650168404Spjd 6651185029Spjd spa_config_exit(spa, SCL_ALL, FTAG); 6652168404Spjd 6653168404Spjd txg_wait_synced(spa_get_dsl(spa), 0); 6654168404Spjd} 6655168404Spjd 6656168404Spjdboolean_t 6657168404Spjdspa_has_spare(spa_t *spa, uint64_t guid) 6658168404Spjd{ 6659168404Spjd int i; 6660168404Spjd uint64_t spareguid; 6661185029Spjd spa_aux_vdev_t *sav = &spa->spa_spares; 6662168404Spjd 6663185029Spjd for (i = 0; i < sav->sav_count; i++) 6664185029Spjd if (sav->sav_vdevs[i]->vdev_guid == guid) 6665168404Spjd return (B_TRUE); 6666168404Spjd 6667185029Spjd for (i = 0; i < sav->sav_npending; i++) { 6668185029Spjd if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 6669185029Spjd &spareguid) == 0 && spareguid == guid) 6670168404Spjd return (B_TRUE); 6671168404Spjd } 6672168404Spjd 6673168404Spjd return (B_FALSE); 6674168404Spjd} 6675168404Spjd 6676185029Spjd/* 6677185029Spjd * Check if a pool has an active shared spare device. 
6678185029Spjd * Note: reference count of an active spare is 2, as a spare and as a replace 6679185029Spjd */ 6680185029Spjdstatic boolean_t 6681185029Spjdspa_has_active_shared_spare(spa_t *spa) 6682168404Spjd{ 6683185029Spjd int i, refcnt; 6684185029Spjd uint64_t pool; 6685185029Spjd spa_aux_vdev_t *sav = &spa->spa_spares; 6686185029Spjd 6687185029Spjd for (i = 0; i < sav->sav_count; i++) { 6688185029Spjd if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, 6689185029Spjd &refcnt) && pool != 0ULL && pool == spa_guid(spa) && 6690185029Spjd refcnt > 2) 6691185029Spjd return (B_TRUE); 6692185029Spjd } 6693185029Spjd 6694185029Spjd return (B_FALSE); 6695168404Spjd} 6696168404Spjd 6697185029Spjd/* 6698185029Spjd * Post a sysevent corresponding to the given event. The 'name' must be one of 6699185029Spjd * the event definitions in sys/sysevent/eventdefs.h. The payload will be 6700185029Spjd * filled in from the spa and (optionally) the vdev. This doesn't do anything 6701185029Spjd * in the userland libzpool, as we don't want consumers to misinterpret ztest 6702185029Spjd * or zdb as real changes. 
6703185029Spjd */ 6704185029Spjdvoid 6705185029Spjdspa_event_notify(spa_t *spa, vdev_t *vd, const char *name) 6706168404Spjd{ 6707185029Spjd#ifdef _KERNEL 6708185029Spjd sysevent_t *ev; 6709185029Spjd sysevent_attr_list_t *attr = NULL; 6710185029Spjd sysevent_value_t value; 6711185029Spjd sysevent_id_t eid; 6712168404Spjd 6713185029Spjd ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", 6714185029Spjd SE_SLEEP); 6715168404Spjd 6716185029Spjd value.value_type = SE_DATA_TYPE_STRING; 6717185029Spjd value.value.sv_string = spa_name(spa); 6718185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) 6719185029Spjd goto done; 6720168404Spjd 6721185029Spjd value.value_type = SE_DATA_TYPE_UINT64; 6722185029Spjd value.value.sv_uint64 = spa_guid(spa); 6723185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) 6724185029Spjd goto done; 6725168404Spjd 6726185029Spjd if (vd) { 6727185029Spjd value.value_type = SE_DATA_TYPE_UINT64; 6728185029Spjd value.value.sv_uint64 = vd->vdev_guid; 6729185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, 6730185029Spjd SE_SLEEP) != 0) 6731185029Spjd goto done; 6732168404Spjd 6733185029Spjd if (vd->vdev_path) { 6734185029Spjd value.value_type = SE_DATA_TYPE_STRING; 6735185029Spjd value.value.sv_string = vd->vdev_path; 6736185029Spjd if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, 6737185029Spjd &value, SE_SLEEP) != 0) 6738185029Spjd goto done; 6739168404Spjd } 6740168404Spjd } 6741168404Spjd 6742185029Spjd if (sysevent_attach_attributes(ev, attr) != 0) 6743185029Spjd goto done; 6744185029Spjd attr = NULL; 6745168404Spjd 6746185029Spjd (void) log_sysevent(ev, SE_SLEEP, &eid); 6747185029Spjd 6748185029Spjddone: 6749185029Spjd if (attr) 6750185029Spjd sysevent_free_attr(attr); 6751185029Spjd sysevent_free(ev); 6752185029Spjd#endif 6753168404Spjd} 6754