vdev_indirect.c revision 339104
1/* 2 * CDDL HEADER START 3 * 4 * This file and its contents are supplied under the terms of the 5 * Common Development and Distribution License ("CDDL"), version 1.0. 6 * You may only use this file in accordance with the terms of version 7 * 1.0 of the CDDL. 8 * 9 * A full copy of the text of the CDDL should have accompanied this 10 * source. A copy of the CDDL is also available via the Internet at 11 * http://www.illumos.org/license/CDDL. 12 * 13 * CDDL HEADER END 14 */ 15 16/* 17 * Copyright (c) 2014, 2017 by Delphix. All rights reserved. 18 */ 19 20#include <sys/zfs_context.h> 21#include <sys/spa.h> 22#include <sys/spa_impl.h> 23#include <sys/vdev_impl.h> 24#include <sys/fs/zfs.h> 25#include <sys/zio.h> 26#include <sys/metaslab.h> 27#include <sys/refcount.h> 28#include <sys/dmu.h> 29#include <sys/vdev_indirect_mapping.h> 30#include <sys/dmu_tx.h> 31#include <sys/dsl_synctask.h> 32#include <sys/zap.h> 33#include <sys/abd.h> 34#include <sys/zthr.h> 35 36/* 37 * An indirect vdev corresponds to a vdev that has been removed. Since 38 * we cannot rewrite block pointers of snapshots, etc., we keep a 39 * mapping from old location on the removed device to the new location 40 * on another device in the pool and use this mapping whenever we need 41 * to access the DVA. Unfortunately, this mapping did not respect 42 * logical block boundaries when it was first created, and so a DVA on 43 * this indirect vdev may be "split" into multiple sections that each 44 * map to a different location. As a consequence, not all DVAs can be 45 * translated to an equivalent new DVA. Instead we must provide a 46 * "vdev_remap" operation that executes a callback on each contiguous 47 * segment of the new location. This function is used in multiple ways: 48 * 49 * - reads and repair writes to this device use the callback to create 50 * a child io for each mapped segment. 51 * 52 * - frees and claims to this device use the callback to free or claim 53 * each mapped segment. 
(Note that we don't actually need to claim
 *    log blocks on indirect vdevs, because we don't allocate to
 *    removing vdevs.  However, zdb uses zio_claim() for its leak
 *    detection.)
 */

/*
 * "Big theory statement" for how we mark blocks obsolete.
 *
 * When a block on an indirect vdev is freed or remapped, a section of
 * that vdev's mapping may no longer be referenced (aka "obsolete").  We
 * keep track of how much of each mapping entry is obsolete.  When
 * an entry becomes completely obsolete, we can remove it, thus reducing
 * the memory used by the mapping.  The complete picture of obsolescence
 * is given by the following data structures, described below:
 *  - the entry-specific obsolete count
 *  - the vdev-specific obsolete spacemap
 *  - the pool-specific obsolete bpobj
 *
 * == On disk data structures used ==
 *
 * We track the obsolete space for the pool using several objects.  Each
 * of these objects is created on demand and freed when no longer
 * needed, and is assumed to be empty if it does not exist.
 * SPA_FEATURE_OBSOLETE_COUNTS includes the count of these objects.
 *
 *  - Each vic_mapping_object (associated with an indirect vdev) can
 *    have a vimp_counts_object.  This is an array of uint32_t's
 *    with the same number of entries as the vic_mapping_object.  When
 *    the mapping is condensed, entries from the vic_obsolete_sm_object
 *    (see below) are folded into the counts.  Therefore, each
 *    obsolete_counts entry tells us the number of bytes in the
 *    corresponding mapping entry that were not referenced when the
 *    mapping was last condensed.
 *
 *  - Each indirect or removing vdev can have a vic_obsolete_sm_object.
 *    This is a space map containing an alloc entry for every DVA that
 *    has been obsoleted since the last time this indirect vdev was
 *    condensed.  We use this object in order to improve performance
 *    when marking a DVA as obsolete.  Instead of modifying an arbitrary
 *    offset of the vimp_counts_object, we only need to append an entry
 *    to the end of this object.  When a DVA becomes obsolete, it is
 *    added to the obsolete space map.  This happens when the DVA is
 *    freed, remapped and not referenced by a snapshot, or the last
 *    snapshot referencing it is destroyed.
 *
 *  - Each dataset can have a ds_remap_deadlist object.  This is a
 *    deadlist object containing all blocks that were remapped in this
 *    dataset but referenced in a previous snapshot.  Blocks can *only*
 *    appear on this list if they were remapped (dsl_dataset_block_remapped);
 *    blocks that were killed in a head dataset are put on the normal
 *    ds_deadlist and marked obsolete when they are freed.
 *
 *  - The pool can have a dp_obsolete_bpobj.  This is a list of blocks
 *    in the pool that need to be marked obsolete.  When a snapshot is
 *    destroyed, we move some of the ds_remap_deadlist to the obsolete
 *    bpobj (see dsl_destroy_snapshot_handle_remaps()).  We then
 *    asynchronously process the obsolete bpobj, moving its entries to
 *    the specific vdevs' obsolete space maps.
 *
 * == Summary of how we mark blocks as obsolete ==
 *
 * - When freeing a block: if any DVA is on an indirect vdev, append to
 *   vic_obsolete_sm_object.
 * - When remapping a block, add dva to ds_remap_deadlist (if prev snap
 *   references; otherwise append to vic_obsolete_sm_object).
 * - When freeing a snapshot: move parts of ds_remap_deadlist to
 *   dp_obsolete_bpobj (same algorithm as ds_deadlist).
 * - When syncing the spa: process dp_obsolete_bpobj, moving ranges to
 *   individual vdev's vic_obsolete_sm_object.
 */

/*
 * "Big theory statement" for how we condense indirect vdevs.
 *
 * Condensing an indirect vdev's mapping is the process of determining
 * the precise counts of obsolete space for each mapping entry (by
 * integrating the obsolete spacemap into the obsolete counts) and
 * writing out a new mapping that contains only referenced entries.
 *
 * We condense a vdev when we expect the mapping to shrink (see
 * vdev_indirect_should_condense()), but only perform one condense at a
 * time to limit the memory usage.  In addition, we use a separate
 * open-context thread (spa_condense_indirect_thread) to incrementally
 * create the new mapping object in a way that minimizes the impact on
 * the rest of the system.
 *
 * == Generating a new mapping ==
 *
 * To generate a new mapping, we follow these steps:
 *
 * 1. Save the old obsolete space map and create a new mapping object
 *    (see spa_condense_indirect_start_sync()).  This initializes the
 *    spa_condensing_indirect_phys with the "previous obsolete space map",
 *    which is now read only.  Newly obsolete DVAs will be added to a
 *    new (initially empty) obsolete space map, and will not be
 *    considered as part of this condense operation.
 *
 * 2. Construct in memory the precise counts of obsolete space for each
 *    mapping entry, by incorporating the obsolete space map into the
 *    counts.  (See vdev_indirect_mapping_load_obsolete_{counts,spacemap}().)
 *
 * 3. Iterate through each mapping entry, writing to the new mapping any
 *    entries that are not completely obsolete (i.e. which don't have
 *    obsolete count == mapping length).  (See
 *    spa_condense_indirect_generate_new_mapping().)
 *
 * 4. Destroy the old mapping object and switch over to the new one
 *    (spa_condense_indirect_complete_sync).
162 * 163 * == Restarting from failure == 164 * 165 * To restart the condense when we import/open the pool, we must start 166 * at the 2nd step above: reconstruct the precise counts in memory, 167 * based on the space map + counts. Then in the 3rd step, we start 168 * iterating where we left off: at vimp_max_offset of the new mapping 169 * object. 170 */ 171 172boolean_t zfs_condense_indirect_vdevs_enable = B_TRUE; 173 174/* 175 * Condense if at least this percent of the bytes in the mapping is 176 * obsolete. With the default of 25%, the amount of space mapped 177 * will be reduced to 1% of its original size after at most 16 178 * condenses. Higher values will condense less often (causing less 179 * i/o); lower values will reduce the mapping size more quickly. 180 */ 181int zfs_indirect_condense_obsolete_pct = 25; 182 183/* 184 * Condense if the obsolete space map takes up more than this amount of 185 * space on disk (logically). This limits the amount of disk space 186 * consumed by the obsolete space map; the default of 1GB is small enough 187 * that we typically don't mind "wasting" it. 188 */ 189uint64_t zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024; 190 191/* 192 * Don't bother condensing if the mapping uses less than this amount of 193 * memory. The default of 128KB is considered a "trivial" amount of 194 * memory and not worth reducing. 195 */ 196uint64_t zfs_condense_min_mapping_bytes = 128 * 1024; 197 198/* 199 * This is used by the test suite so that it can ensure that certain 200 * actions happen while in the middle of a condense (which might otherwise 201 * complete too quickly). If used to reduce the performance impact of 202 * condensing in production, a maximum value of 1 should be sufficient. 203 */ 204int zfs_condense_indirect_commit_entry_delay_ticks = 0; 205 206/* 207 * Mark the given offset and size as being obsolete. 
208 */ 209void 210vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size) 211{ 212 spa_t *spa = vd->vdev_spa; 213 214 ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0); 215 ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); 216 ASSERT(size > 0); 217 VERIFY(vdev_indirect_mapping_entry_for_offset( 218 vd->vdev_indirect_mapping, offset) != NULL); 219 220 if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { 221 mutex_enter(&vd->vdev_obsolete_lock); 222 range_tree_add(vd->vdev_obsolete_segments, offset, size); 223 mutex_exit(&vd->vdev_obsolete_lock); 224 vdev_dirty(vd, 0, NULL, spa_syncing_txg(spa)); 225 } 226} 227 228/* 229 * Mark the DVA vdev_id:offset:size as being obsolete in the given tx. This 230 * wrapper is provided because the DMU does not know about vdev_t's and 231 * cannot directly call vdev_indirect_mark_obsolete. 232 */ 233void 234spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset, 235 uint64_t size, dmu_tx_t *tx) 236{ 237 vdev_t *vd = vdev_lookup_top(spa, vdev_id); 238 ASSERT(dmu_tx_is_syncing(tx)); 239 240 /* The DMU can only remap indirect vdevs. 
*/ 241 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); 242 vdev_indirect_mark_obsolete(vd, offset, size); 243} 244 245static spa_condensing_indirect_t * 246spa_condensing_indirect_create(spa_t *spa) 247{ 248 spa_condensing_indirect_phys_t *scip = 249 &spa->spa_condensing_indirect_phys; 250 spa_condensing_indirect_t *sci = kmem_zalloc(sizeof (*sci), KM_SLEEP); 251 objset_t *mos = spa->spa_meta_objset; 252 253 for (int i = 0; i < TXG_SIZE; i++) { 254 list_create(&sci->sci_new_mapping_entries[i], 255 sizeof (vdev_indirect_mapping_entry_t), 256 offsetof(vdev_indirect_mapping_entry_t, vime_node)); 257 } 258 259 sci->sci_new_mapping = 260 vdev_indirect_mapping_open(mos, scip->scip_next_mapping_object); 261 262 return (sci); 263} 264 265static void 266spa_condensing_indirect_destroy(spa_condensing_indirect_t *sci) 267{ 268 for (int i = 0; i < TXG_SIZE; i++) 269 list_destroy(&sci->sci_new_mapping_entries[i]); 270 271 if (sci->sci_new_mapping != NULL) 272 vdev_indirect_mapping_close(sci->sci_new_mapping); 273 274 kmem_free(sci, sizeof (*sci)); 275} 276 277boolean_t 278vdev_indirect_should_condense(vdev_t *vd) 279{ 280 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 281 spa_t *spa = vd->vdev_spa; 282 283 ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool)); 284 285 if (!zfs_condense_indirect_vdevs_enable) 286 return (B_FALSE); 287 288 /* 289 * We can only condense one indirect vdev at a time. 290 */ 291 if (spa->spa_condensing_indirect != NULL) 292 return (B_FALSE); 293 294 if (spa_shutting_down(spa)) 295 return (B_FALSE); 296 297 /* 298 * The mapping object size must not change while we are 299 * condensing, so we can only condense indirect vdevs 300 * (not vdevs that are still in the middle of being removed). 301 */ 302 if (vd->vdev_ops != &vdev_indirect_ops) 303 return (B_FALSE); 304 305 /* 306 * If nothing new has been marked obsolete, there is no 307 * point in condensing. 
308 */ 309 if (vd->vdev_obsolete_sm == NULL) { 310 ASSERT0(vdev_obsolete_sm_object(vd)); 311 return (B_FALSE); 312 } 313 314 ASSERT(vd->vdev_obsolete_sm != NULL); 315 316 ASSERT3U(vdev_obsolete_sm_object(vd), ==, 317 space_map_object(vd->vdev_obsolete_sm)); 318 319 uint64_t bytes_mapped = vdev_indirect_mapping_bytes_mapped(vim); 320 uint64_t bytes_obsolete = space_map_allocated(vd->vdev_obsolete_sm); 321 uint64_t mapping_size = vdev_indirect_mapping_size(vim); 322 uint64_t obsolete_sm_size = space_map_length(vd->vdev_obsolete_sm); 323 324 ASSERT3U(bytes_obsolete, <=, bytes_mapped); 325 326 /* 327 * If a high percentage of the bytes that are mapped have become 328 * obsolete, condense (unless the mapping is already small enough). 329 * This has a good chance of reducing the amount of memory used 330 * by the mapping. 331 */ 332 if (bytes_obsolete * 100 / bytes_mapped >= 333 zfs_indirect_condense_obsolete_pct && 334 mapping_size > zfs_condense_min_mapping_bytes) { 335 zfs_dbgmsg("should condense vdev %llu because obsolete " 336 "spacemap covers %d%% of %lluMB mapping", 337 (u_longlong_t)vd->vdev_id, 338 (int)(bytes_obsolete * 100 / bytes_mapped), 339 (u_longlong_t)bytes_mapped / 1024 / 1024); 340 return (B_TRUE); 341 } 342 343 /* 344 * If the obsolete space map takes up too much space on disk, 345 * condense in order to free up this disk space. 346 */ 347 if (obsolete_sm_size >= zfs_condense_max_obsolete_bytes) { 348 zfs_dbgmsg("should condense vdev %llu because obsolete sm " 349 "length %lluMB >= max size %lluMB", 350 (u_longlong_t)vd->vdev_id, 351 (u_longlong_t)obsolete_sm_size / 1024 / 1024, 352 (u_longlong_t)zfs_condense_max_obsolete_bytes / 353 1024 / 1024); 354 return (B_TRUE); 355 } 356 357 return (B_FALSE); 358} 359 360/* 361 * This sync task completes (finishes) a condense, deleting the old 362 * mapping and replacing it with the new one. 
363 */ 364static void 365spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx) 366{ 367 spa_condensing_indirect_t *sci = arg; 368 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 369 spa_condensing_indirect_phys_t *scip = 370 &spa->spa_condensing_indirect_phys; 371 vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev); 372 vdev_indirect_config_t *vic = &vd->vdev_indirect_config; 373 objset_t *mos = spa->spa_meta_objset; 374 vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; 375 uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping); 376 uint64_t new_count = 377 vdev_indirect_mapping_num_entries(sci->sci_new_mapping); 378 379 ASSERT(dmu_tx_is_syncing(tx)); 380 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); 381 ASSERT3P(sci, ==, spa->spa_condensing_indirect); 382 for (int i = 0; i < TXG_SIZE; i++) { 383 ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i])); 384 } 385 ASSERT(vic->vic_mapping_object != 0); 386 ASSERT3U(vd->vdev_id, ==, scip->scip_vdev); 387 ASSERT(scip->scip_next_mapping_object != 0); 388 ASSERT(scip->scip_prev_obsolete_sm_object != 0); 389 390 /* 391 * Reset vdev_indirect_mapping to refer to the new object. 
392 */ 393 rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER); 394 vdev_indirect_mapping_close(vd->vdev_indirect_mapping); 395 vd->vdev_indirect_mapping = sci->sci_new_mapping; 396 rw_exit(&vd->vdev_indirect_rwlock); 397 398 sci->sci_new_mapping = NULL; 399 vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx); 400 vic->vic_mapping_object = scip->scip_next_mapping_object; 401 scip->scip_next_mapping_object = 0; 402 403 space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx); 404 spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); 405 scip->scip_prev_obsolete_sm_object = 0; 406 407 scip->scip_vdev = 0; 408 409 VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, 410 DMU_POOL_CONDENSING_INDIRECT, tx)); 411 spa_condensing_indirect_destroy(spa->spa_condensing_indirect); 412 spa->spa_condensing_indirect = NULL; 413 414 zfs_dbgmsg("finished condense of vdev %llu in txg %llu: " 415 "new mapping object %llu has %llu entries " 416 "(was %llu entries)", 417 vd->vdev_id, dmu_tx_get_txg(tx), vic->vic_mapping_object, 418 new_count, old_count); 419 420 vdev_config_dirty(spa->spa_root_vdev); 421} 422 423/* 424 * This sync task appends entries to the new mapping object. 425 */ 426static void 427spa_condense_indirect_commit_sync(void *arg, dmu_tx_t *tx) 428{ 429 spa_condensing_indirect_t *sci = arg; 430 uint64_t txg = dmu_tx_get_txg(tx); 431 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 432 433 ASSERT(dmu_tx_is_syncing(tx)); 434 ASSERT3P(sci, ==, spa->spa_condensing_indirect); 435 436 vdev_indirect_mapping_add_entries(sci->sci_new_mapping, 437 &sci->sci_new_mapping_entries[txg & TXG_MASK], tx); 438 ASSERT(list_is_empty(&sci->sci_new_mapping_entries[txg & TXG_MASK])); 439} 440 441/* 442 * Open-context function to add one entry to the new mapping. The new 443 * entry will be remembered and written from syncing context. 
444 */ 445static void 446spa_condense_indirect_commit_entry(spa_t *spa, 447 vdev_indirect_mapping_entry_phys_t *vimep, uint32_t count) 448{ 449 spa_condensing_indirect_t *sci = spa->spa_condensing_indirect; 450 451 ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst)); 452 453 dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 454 dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count)); 455 VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); 456 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; 457 458 /* 459 * If we are the first entry committed this txg, kick off the sync 460 * task to write to the MOS on our behalf. 461 */ 462 if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) { 463 dsl_sync_task_nowait(dmu_tx_pool(tx), 464 spa_condense_indirect_commit_sync, sci, 465 0, ZFS_SPACE_CHECK_NONE, tx); 466 } 467 468 vdev_indirect_mapping_entry_t *vime = 469 kmem_alloc(sizeof (*vime), KM_SLEEP); 470 vime->vime_mapping = *vimep; 471 vime->vime_obsolete_count = count; 472 list_insert_tail(&sci->sci_new_mapping_entries[txgoff], vime); 473 474 dmu_tx_commit(tx); 475} 476 477static void 478spa_condense_indirect_generate_new_mapping(vdev_t *vd, 479 uint32_t *obsolete_counts, uint64_t start_index, zthr_t *zthr) 480{ 481 spa_t *spa = vd->vdev_spa; 482 uint64_t mapi = start_index; 483 vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; 484 uint64_t old_num_entries = 485 vdev_indirect_mapping_num_entries(old_mapping); 486 487 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); 488 ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev); 489 490 zfs_dbgmsg("starting condense of vdev %llu from index %llu", 491 (u_longlong_t)vd->vdev_id, 492 (u_longlong_t)mapi); 493 494 while (mapi < old_num_entries) { 495 496 if (zthr_iscancelled(zthr)) { 497 zfs_dbgmsg("pausing condense of vdev %llu " 498 "at index %llu", (u_longlong_t)vd->vdev_id, 499 (u_longlong_t)mapi); 500 break; 501 } 502 503 vdev_indirect_mapping_entry_phys_t *entry = 504 
&old_mapping->vim_entries[mapi]; 505 uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst); 506 ASSERT3U(obsolete_counts[mapi], <=, entry_size); 507 if (obsolete_counts[mapi] < entry_size) { 508 spa_condense_indirect_commit_entry(spa, entry, 509 obsolete_counts[mapi]); 510 511 /* 512 * This delay may be requested for testing, debugging, 513 * or performance reasons. 514 */ 515 delay(zfs_condense_indirect_commit_entry_delay_ticks); 516 } 517 518 mapi++; 519 } 520} 521 522/* ARGSUSED */ 523static boolean_t 524spa_condense_indirect_thread_check(void *arg, zthr_t *zthr) 525{ 526 spa_t *spa = arg; 527 528 return (spa->spa_condensing_indirect != NULL); 529} 530 531/* ARGSUSED */ 532static int 533spa_condense_indirect_thread(void *arg, zthr_t *zthr) 534{ 535 spa_t *spa = arg; 536 vdev_t *vd; 537 538 ASSERT3P(spa->spa_condensing_indirect, !=, NULL); 539 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 540 vd = vdev_lookup_top(spa, spa->spa_condensing_indirect_phys.scip_vdev); 541 ASSERT3P(vd, !=, NULL); 542 spa_config_exit(spa, SCL_VDEV, FTAG); 543 544 spa_condensing_indirect_t *sci = spa->spa_condensing_indirect; 545 spa_condensing_indirect_phys_t *scip = 546 &spa->spa_condensing_indirect_phys; 547 uint32_t *counts; 548 uint64_t start_index; 549 vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; 550 space_map_t *prev_obsolete_sm = NULL; 551 552 ASSERT3U(vd->vdev_id, ==, scip->scip_vdev); 553 ASSERT(scip->scip_next_mapping_object != 0); 554 ASSERT(scip->scip_prev_obsolete_sm_object != 0); 555 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); 556 557 for (int i = 0; i < TXG_SIZE; i++) { 558 /* 559 * The list must start out empty in order for the 560 * _commit_sync() sync task to be properly registered 561 * on the first call to _commit_entry(); so it's wise 562 * to double check and ensure we actually are starting 563 * with empty lists. 
564 */ 565 ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i])); 566 } 567 568 VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset, 569 scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); 570 space_map_update(prev_obsolete_sm); 571 counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping); 572 if (prev_obsolete_sm != NULL) { 573 vdev_indirect_mapping_load_obsolete_spacemap(old_mapping, 574 counts, prev_obsolete_sm); 575 } 576 space_map_close(prev_obsolete_sm); 577 578 /* 579 * Generate new mapping. Determine what index to continue from 580 * based on the max offset that we've already written in the 581 * new mapping. 582 */ 583 uint64_t max_offset = 584 vdev_indirect_mapping_max_offset(sci->sci_new_mapping); 585 if (max_offset == 0) { 586 /* We haven't written anything to the new mapping yet. */ 587 start_index = 0; 588 } else { 589 /* 590 * Pick up from where we left off. _entry_for_offset() 591 * returns a pointer into the vim_entries array. If 592 * max_offset is greater than any of the mappings 593 * contained in the table NULL will be returned and 594 * that indicates we've exhausted our iteration of the 595 * old_mapping. 596 */ 597 598 vdev_indirect_mapping_entry_phys_t *entry = 599 vdev_indirect_mapping_entry_for_offset_or_next(old_mapping, 600 max_offset); 601 602 if (entry == NULL) { 603 /* 604 * We've already written the whole new mapping. 605 * This special value will cause us to skip the 606 * generate_new_mapping step and just do the sync 607 * task to complete the condense. 
608 */ 609 start_index = UINT64_MAX; 610 } else { 611 start_index = entry - old_mapping->vim_entries; 612 ASSERT3U(start_index, <, 613 vdev_indirect_mapping_num_entries(old_mapping)); 614 } 615 } 616 617 spa_condense_indirect_generate_new_mapping(vd, counts, 618 start_index, zthr); 619 620 vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts); 621 622 /* 623 * If the zthr has received a cancellation signal while running 624 * in generate_new_mapping() or at any point after that, then bail 625 * early. We don't want to complete the condense if the spa is 626 * shutting down. 627 */ 628 if (zthr_iscancelled(zthr)) 629 return (0); 630 631 VERIFY0(dsl_sync_task(spa_name(spa), NULL, 632 spa_condense_indirect_complete_sync, sci, 0, 633 ZFS_SPACE_CHECK_EXTRA_RESERVED)); 634 635 return (0); 636 thread_exit(); 637} 638 639/* 640 * Sync task to begin the condensing process. 641 */ 642void 643spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx) 644{ 645 spa_t *spa = vd->vdev_spa; 646 spa_condensing_indirect_phys_t *scip = 647 &spa->spa_condensing_indirect_phys; 648 649 ASSERT0(scip->scip_next_mapping_object); 650 ASSERT0(scip->scip_prev_obsolete_sm_object); 651 ASSERT0(scip->scip_vdev); 652 ASSERT(dmu_tx_is_syncing(tx)); 653 ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); 654 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_OBSOLETE_COUNTS)); 655 ASSERT(vdev_indirect_mapping_num_entries(vd->vdev_indirect_mapping)); 656 657 uint64_t obsolete_sm_obj = vdev_obsolete_sm_object(vd); 658 ASSERT(obsolete_sm_obj != 0); 659 660 scip->scip_vdev = vd->vdev_id; 661 scip->scip_next_mapping_object = 662 vdev_indirect_mapping_alloc(spa->spa_meta_objset, tx); 663 664 scip->scip_prev_obsolete_sm_object = obsolete_sm_obj; 665 666 /* 667 * We don't need to allocate a new space map object, since 668 * vdev_indirect_sync_obsolete will allocate one when needed. 
669 */ 670 space_map_close(vd->vdev_obsolete_sm); 671 vd->vdev_obsolete_sm = NULL; 672 VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, 673 VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx)); 674 675 VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset, 676 DMU_POOL_DIRECTORY_OBJECT, 677 DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t), 678 sizeof (*scip) / sizeof (uint64_t), scip, tx)); 679 680 ASSERT3P(spa->spa_condensing_indirect, ==, NULL); 681 spa->spa_condensing_indirect = spa_condensing_indirect_create(spa); 682 683 zfs_dbgmsg("starting condense of vdev %llu in txg %llu: " 684 "posm=%llu nm=%llu", 685 vd->vdev_id, dmu_tx_get_txg(tx), 686 (u_longlong_t)scip->scip_prev_obsolete_sm_object, 687 (u_longlong_t)scip->scip_next_mapping_object); 688 689 zthr_wakeup(spa->spa_condense_zthr); 690} 691 692/* 693 * Sync to the given vdev's obsolete space map any segments that are no longer 694 * referenced as of the given txg. 695 * 696 * If the obsolete space map doesn't exist yet, create and open it. 
697 */ 698void 699vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx) 700{ 701 spa_t *spa = vd->vdev_spa; 702 vdev_indirect_config_t *vic = &vd->vdev_indirect_config; 703 704 ASSERT3U(vic->vic_mapping_object, !=, 0); 705 ASSERT(range_tree_space(vd->vdev_obsolete_segments) > 0); 706 ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); 707 ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)); 708 709 if (vdev_obsolete_sm_object(vd) == 0) { 710 uint64_t obsolete_sm_object = 711 space_map_alloc(spa->spa_meta_objset, 712 vdev_standard_sm_blksz, tx); 713 714 ASSERT(vd->vdev_top_zap != 0); 715 VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, 716 VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, 717 sizeof (obsolete_sm_object), 1, &obsolete_sm_object, tx)); 718 ASSERT3U(vdev_obsolete_sm_object(vd), !=, 0); 719 720 spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); 721 VERIFY0(space_map_open(&vd->vdev_obsolete_sm, 722 spa->spa_meta_objset, obsolete_sm_object, 723 0, vd->vdev_asize, 0)); 724 space_map_update(vd->vdev_obsolete_sm); 725 } 726 727 ASSERT(vd->vdev_obsolete_sm != NULL); 728 ASSERT3U(vdev_obsolete_sm_object(vd), ==, 729 space_map_object(vd->vdev_obsolete_sm)); 730 731 space_map_write(vd->vdev_obsolete_sm, 732 vd->vdev_obsolete_segments, SM_ALLOC, SM_NO_VDEVID, tx); 733 space_map_update(vd->vdev_obsolete_sm); 734 range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL); 735} 736 737int 738spa_condense_init(spa_t *spa) 739{ 740 int error = zap_lookup(spa->spa_meta_objset, 741 DMU_POOL_DIRECTORY_OBJECT, 742 DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t), 743 sizeof (spa->spa_condensing_indirect_phys) / sizeof (uint64_t), 744 &spa->spa_condensing_indirect_phys); 745 if (error == 0) { 746 if (spa_writeable(spa)) { 747 spa->spa_condensing_indirect = 748 spa_condensing_indirect_create(spa); 749 } 750 return (0); 751 } else if (error == ENOENT) { 752 return (0); 753 } else { 754 return (error); 755 } 756} 757 758void 
759spa_condense_fini(spa_t *spa) 760{ 761 if (spa->spa_condensing_indirect != NULL) { 762 spa_condensing_indirect_destroy(spa->spa_condensing_indirect); 763 spa->spa_condensing_indirect = NULL; 764 } 765} 766 767void 768spa_start_indirect_condensing_thread(spa_t *spa) 769{ 770 ASSERT3P(spa->spa_condense_zthr, ==, NULL); 771 spa->spa_condense_zthr = zthr_create(spa_condense_indirect_thread_check, 772 spa_condense_indirect_thread, spa); 773} 774 775/* 776 * Gets the obsolete spacemap object from the vdev's ZAP. 777 * Returns the spacemap object, or 0 if it wasn't in the ZAP or the ZAP doesn't 778 * exist yet. 779 */ 780int 781vdev_obsolete_sm_object(vdev_t *vd) 782{ 783 ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); 784 if (vd->vdev_top_zap == 0) { 785 return (0); 786 } 787 788 uint64_t sm_obj = 0; 789 int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, 790 VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, sizeof (sm_obj), 1, &sm_obj); 791 792 ASSERT(err == 0 || err == ENOENT); 793 794 return (sm_obj); 795} 796 797boolean_t 798vdev_obsolete_counts_are_precise(vdev_t *vd) 799{ 800 ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); 801 if (vd->vdev_top_zap == 0) { 802 return (B_FALSE); 803 } 804 805 uint64_t val = 0; 806 int err = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, 807 VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (val), 1, &val); 808 809 ASSERT(err == 0 || err == ENOENT); 810 811 return (val != 0); 812} 813 814/* ARGSUSED */ 815static void 816vdev_indirect_close(vdev_t *vd) 817{ 818} 819 820/* ARGSUSED */ 821static void 822vdev_indirect_io_done(zio_t *zio) 823{ 824} 825 826/* ARGSUSED */ 827static int 828vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, 829 uint64_t *logical_ashift, uint64_t *physical_ashift) 830{ 831 *psize = *max_psize = vd->vdev_asize + 832 VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; 833 *logical_ashift = vd->vdev_ashift; 834 *physical_ashift = vd->vdev_physical_ashift; 835 
return (0); 836} 837 838typedef struct remap_segment { 839 vdev_t *rs_vd; 840 uint64_t rs_offset; 841 uint64_t rs_asize; 842 uint64_t rs_split_offset; 843 list_node_t rs_node; 844} remap_segment_t; 845 846remap_segment_t * 847rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset) 848{ 849 remap_segment_t *rs = kmem_alloc(sizeof (remap_segment_t), KM_SLEEP); 850 rs->rs_vd = vd; 851 rs->rs_offset = offset; 852 rs->rs_asize = asize; 853 rs->rs_split_offset = split_offset; 854 return (rs); 855} 856 857/* 858 * Given an indirect vdev and an extent on that vdev, it duplicates the 859 * physical entries of the indirect mapping that correspond to the extent 860 * to a new array and returns a pointer to it. In addition, copied_entries 861 * is populated with the number of mapping entries that were duplicated. 862 * 863 * Note that the function assumes that the caller holds vdev_indirect_rwlock. 864 * This ensures that the mapping won't change due to condensing as we 865 * copy over its contents. 866 * 867 * Finally, since we are doing an allocation, it is up to the caller to 868 * free the array allocated in this function. 
869 */ 870vdev_indirect_mapping_entry_phys_t * 871vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset, 872 uint64_t asize, uint64_t *copied_entries) 873{ 874 vdev_indirect_mapping_entry_phys_t *duplicate_mappings = NULL; 875 vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; 876 uint64_t entries = 0; 877 878 ASSERT(RW_READ_HELD(&vd->vdev_indirect_rwlock)); 879 880 vdev_indirect_mapping_entry_phys_t *first_mapping = 881 vdev_indirect_mapping_entry_for_offset(vim, offset); 882 ASSERT3P(first_mapping, !=, NULL); 883 884 vdev_indirect_mapping_entry_phys_t *m = first_mapping; 885 while (asize > 0) { 886 uint64_t size = DVA_GET_ASIZE(&m->vimep_dst); 887 888 ASSERT3U(offset, >=, DVA_MAPPING_GET_SRC_OFFSET(m)); 889 ASSERT3U(offset, <, DVA_MAPPING_GET_SRC_OFFSET(m) + size); 890 891 uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m); 892 uint64_t inner_size = MIN(asize, size - inner_offset); 893 894 offset += inner_size; 895 asize -= inner_size; 896 entries++; 897 m++; 898 } 899 900 size_t copy_length = entries * sizeof (*first_mapping); 901 duplicate_mappings = kmem_alloc(copy_length, KM_SLEEP); 902 bcopy(first_mapping, duplicate_mappings, copy_length); 903 *copied_entries = entries; 904 905 return (duplicate_mappings); 906} 907 908/* 909 * Goes through the relevant indirect mappings until it hits a concrete vdev 910 * and issues the callback. On the way to the concrete vdev, if any other 911 * indirect vdevs are encountered, then the callback will also be called on 912 * each of those indirect vdevs. For example, if the segment is mapped to 913 * segment A on indirect vdev 1, and then segment A on indirect vdev 1 is 914 * mapped to segment B on concrete vdev 2, then the callback will be called on 915 * both vdev 1 and vdev 2. 916 * 917 * While the callback passed to vdev_indirect_remap() is called on every vdev 918 * the function encounters, certain callbacks only care about concrete vdevs. 
919 * These types of callbacks should return immediately and explicitly when they 920 * are called on an indirect vdev. 921 * 922 * Because there is a possibility that a DVA section in the indirect device 923 * has been split into multiple sections in our mapping, we keep track 924 * of the relevant contiguous segments of the new location (remap_segment_t) 925 * in a stack. This way we can call the callback for each of the new sections 926 * created by a single section of the indirect device. Note though, that in 927 * this scenario the callbacks in each split block won't occur in-order in 928 * terms of offset, so callers should not make any assumptions about that. 929 * 930 * For callbacks that don't handle split blocks and immediately return when 931 * they encounter them (as is the case for remap_blkptr_cb), the caller can 932 * assume that its callback will be applied from the first indirect vdev 933 * encountered to the last one and then the concrete vdev, in that order. 934 */ 935static void 936vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize, 937 void (*func)(uint64_t, vdev_t *, uint64_t, uint64_t, void *), void *arg) 938{ 939 list_t stack; 940 spa_t *spa = vd->vdev_spa; 941 942 list_create(&stack, sizeof (remap_segment_t), 943 offsetof(remap_segment_t, rs_node)); 944 945 for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0); 946 rs != NULL; rs = list_remove_head(&stack)) { 947 vdev_t *v = rs->rs_vd; 948 uint64_t num_entries = 0; 949 950 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 951 ASSERT(rs->rs_asize > 0); 952 953 /* 954 * Note: As this function can be called from open context 955 * (e.g. zio_read()), we need the following rwlock to 956 * prevent the mapping from being changed by condensing. 957 * 958 * So we grab the lock and we make a copy of the entries 959 * that are relevant to the extent that we are working on. 960 * Once that is done, we drop the lock and iterate over 961 * our copy of the mapping. 
Once we are done with the with 962 * the remap segment and we free it, we also free our copy 963 * of the indirect mapping entries that are relevant to it. 964 * 965 * This way we don't need to wait until the function is 966 * finished with a segment, to condense it. In addition, we 967 * don't need a recursive rwlock for the case that a call to 968 * vdev_indirect_remap() needs to call itself (through the 969 * codepath of its callback) for the same vdev in the middle 970 * of its execution. 971 */ 972 rw_enter(&v->vdev_indirect_rwlock, RW_READER); 973 vdev_indirect_mapping_t *vim = v->vdev_indirect_mapping; 974 ASSERT3P(vim, !=, NULL); 975 976 vdev_indirect_mapping_entry_phys_t *mapping = 977 vdev_indirect_mapping_duplicate_adjacent_entries(v, 978 rs->rs_offset, rs->rs_asize, &num_entries); 979 ASSERT3P(mapping, !=, NULL); 980 ASSERT3U(num_entries, >, 0); 981 rw_exit(&v->vdev_indirect_rwlock); 982 983 for (uint64_t i = 0; i < num_entries; i++) { 984 /* 985 * Note: the vdev_indirect_mapping can not change 986 * while we are running. It only changes while the 987 * removal is in progress, and then only from syncing 988 * context. While a removal is in progress, this 989 * function is only called for frees, which also only 990 * happen from syncing context. 
991 */ 992 vdev_indirect_mapping_entry_phys_t *m = &mapping[i]; 993 994 ASSERT3P(m, !=, NULL); 995 ASSERT3U(rs->rs_asize, >, 0); 996 997 uint64_t size = DVA_GET_ASIZE(&m->vimep_dst); 998 uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst); 999 uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst); 1000 1001 ASSERT3U(rs->rs_offset, >=, 1002 DVA_MAPPING_GET_SRC_OFFSET(m)); 1003 ASSERT3U(rs->rs_offset, <, 1004 DVA_MAPPING_GET_SRC_OFFSET(m) + size); 1005 ASSERT3U(dst_vdev, !=, v->vdev_id); 1006 1007 uint64_t inner_offset = rs->rs_offset - 1008 DVA_MAPPING_GET_SRC_OFFSET(m); 1009 uint64_t inner_size = 1010 MIN(rs->rs_asize, size - inner_offset); 1011 1012 vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev); 1013 ASSERT3P(dst_v, !=, NULL); 1014 1015 if (dst_v->vdev_ops == &vdev_indirect_ops) { 1016 list_insert_head(&stack, 1017 rs_alloc(dst_v, dst_offset + inner_offset, 1018 inner_size, rs->rs_split_offset)); 1019 1020 } 1021 1022 if ((zfs_flags & ZFS_DEBUG_INDIRECT_REMAP) && 1023 IS_P2ALIGNED(inner_size, 2 * SPA_MINBLOCKSIZE)) { 1024 /* 1025 * Note: This clause exists only solely for 1026 * testing purposes. We use it to ensure that 1027 * split blocks work and that the callbacks 1028 * using them yield the same result if issued 1029 * in reverse order. 
1030 */ 1031 uint64_t inner_half = inner_size / 2; 1032 1033 func(rs->rs_split_offset + inner_half, dst_v, 1034 dst_offset + inner_offset + inner_half, 1035 inner_half, arg); 1036 1037 func(rs->rs_split_offset, dst_v, 1038 dst_offset + inner_offset, 1039 inner_half, arg); 1040 } else { 1041 func(rs->rs_split_offset, dst_v, 1042 dst_offset + inner_offset, 1043 inner_size, arg); 1044 } 1045 1046 rs->rs_offset += inner_size; 1047 rs->rs_asize -= inner_size; 1048 rs->rs_split_offset += inner_size; 1049 } 1050 VERIFY0(rs->rs_asize); 1051 1052 kmem_free(mapping, num_entries * sizeof (*mapping)); 1053 kmem_free(rs, sizeof (remap_segment_t)); 1054 } 1055 list_destroy(&stack); 1056} 1057 1058static void 1059vdev_indirect_child_io_done(zio_t *zio) 1060{ 1061 zio_t *pio = zio->io_private; 1062 1063 mutex_enter(&pio->io_lock); 1064 pio->io_error = zio_worst_error(pio->io_error, zio->io_error); 1065 mutex_exit(&pio->io_lock); 1066 1067 abd_put(zio->io_abd); 1068} 1069 1070static void 1071vdev_indirect_io_start_cb(uint64_t split_offset, vdev_t *vd, uint64_t offset, 1072 uint64_t size, void *arg) 1073{ 1074 zio_t *zio = arg; 1075 1076 ASSERT3P(vd, !=, NULL); 1077 1078 if (vd->vdev_ops == &vdev_indirect_ops) 1079 return; 1080 1081 zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset, 1082 abd_get_offset(zio->io_abd, split_offset), 1083 size, zio->io_type, zio->io_priority, 1084 0, vdev_indirect_child_io_done, zio)); 1085} 1086 1087static void 1088vdev_indirect_io_start(zio_t *zio) 1089{ 1090 spa_t *spa = zio->io_spa; 1091 1092 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 1093 if (zio->io_type != ZIO_TYPE_READ) { 1094 ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); 1095 ASSERT((zio->io_flags & 1096 (ZIO_FLAG_SELF_HEAL | ZIO_FLAG_INDUCE_DAMAGE)) != 0); 1097 } 1098 1099 vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size, 1100 vdev_indirect_io_start_cb, zio); 1101 1102 zio_execute(zio); 1103} 1104 1105vdev_ops_t vdev_indirect_ops = { 1106 vdev_indirect_open, 1107 
vdev_indirect_close, 1108 vdev_default_asize, 1109 vdev_indirect_io_start, 1110 vdev_indirect_io_done, 1111 NULL, 1112 NULL, 1113 NULL, 1114 NULL, 1115 vdev_indirect_remap, 1116 VDEV_TYPE_INDIRECT, /* name of this vdev type */ 1117 B_FALSE /* leaf vdev */ 1118}; 1119