metaslab.c revision 307268
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2015 by Delphix. All rights reserved. 24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 25 * Copyright (c) 2014 Integros [integros.com] 26 */ 27 28#include <sys/zfs_context.h> 29#include <sys/dmu.h> 30#include <sys/dmu_tx.h> 31#include <sys/space_map.h> 32#include <sys/metaslab_impl.h> 33#include <sys/vdev_impl.h> 34#include <sys/zio.h> 35#include <sys/spa_impl.h> 36#include <sys/zfeature.h> 37 38SYSCTL_DECL(_vfs_zfs); 39SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab"); 40 41/* 42 * Allow allocations to switch to gang blocks quickly. We do this to 43 * avoid having to load lots of space_maps in a given txg. There are, 44 * however, some cases where we want to avoid "fast" ganging and instead 45 * we want to do an exhaustive search of all metaslabs on this device. 46 * Currently we don't allow any gang, slog, or dump device related allocations 47 * to "fast" gang. 48 */ 49#define CAN_FASTGANG(flags) \ 50 (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \ 51 METASLAB_GANG_AVOID))) 52 53#define METASLAB_WEIGHT_PRIMARY (1ULL << 63) 54#define METASLAB_WEIGHT_SECONDARY (1ULL << 62) 55#define METASLAB_ACTIVE_MASK \ 56 (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) 57 58uint64_t metaslab_aliquot = 512ULL << 10; 59uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ 60TUNABLE_QUAD("vfs.zfs.metaslab.gang_bang", &metaslab_gang_bang); 61SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, gang_bang, CTLFLAG_RWTUN, 62 &metaslab_gang_bang, 0, 63 "Force gang block allocation for blocks larger than or equal to this value"); 64 65/* 66 * The in-core space map representation is more compact than its on-disk form. 67 * The zfs_condense_pct determines how much more compact the in-core 68 * space_map representation must be before we compact it on-disk. 69 * Values should be greater than or equal to 100. 70 */ 71int zfs_condense_pct = 200; 72TUNABLE_INT("vfs.zfs.condense_pct", &zfs_condense_pct); 73SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN, 74 &zfs_condense_pct, 0, 75 "Condense on-disk spacemap when it is more than this many percents" 76 " of in-memory counterpart"); 77 78/* 79 * Condensing a metaslab is not guaranteed to actually reduce the amount of 80 * space used on disk. In particular, a space map uses data in increments of 81 * MAX(1 << ashift, space_map_blksize), so a metaslab might use the 82 * same number of blocks after condensing. 
Since the goal of condensing is to
 * reduce the number of IOPs required to read the space map, we only want to
 * condense when we can be sure we will reduce the number of blocks used by the
 * space map. Unfortunately, we cannot precisely compute whether or not this is
 * the case in metaslab_should_condense since we are holding ms_lock. Instead,
 * we apply the following heuristic: do not condense a spacemap unless the
 * uncondensed size consumes more than zfs_metaslab_condense_block_threshold
 * blocks.
 */
int zfs_metaslab_condense_block_threshold = 4;

/*
 * The zfs_mg_noalloc_threshold defines which metaslab groups should
 * be eligible for allocation. The value is defined as a percentage of
 * free space. Metaslab groups that have more free space than
 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
 * a metaslab group's free space is less than or equal to the
 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
 * groups are allowed to accept allocations. Gang blocks are always
 * eligible to allocate on any metaslab group. The default value of 0 means
 * no metaslab group will be excluded based on this criterion.
 */
int zfs_mg_noalloc_threshold = 0;
TUNABLE_INT("vfs.zfs.mg_noalloc_threshold", &zfs_mg_noalloc_threshold);
SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN,
    &zfs_mg_noalloc_threshold, 0,
    "Percentage of metaslab group size that should be free"
    " to make it eligible for allocation");

/*
 * Metaslab groups are considered eligible for allocations if their
 * fragmentation metric (measured as a percentage) is less than or equal to
 * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
 * then it will be skipped unless all metaslab groups within the metaslab
 * class have also crossed this threshold.
 */
int zfs_mg_fragmentation_threshold = 85;
TUNABLE_INT("vfs.zfs.mg_fragmentation_threshold", &zfs_mg_fragmentation_threshold);
SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_fragmentation_threshold, CTLFLAG_RWTUN,
    &zfs_mg_fragmentation_threshold, 0,
    "Maximum fragmentation, as a percentage, at which a metaslab group remains "
    "eligible for allocations, unless all metaslab groups within the metaslab class "
    "have also crossed this threshold");

/*
 * Allow metaslabs to keep their active state as long as their fragmentation
 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
 * active metaslab that exceeds this threshold will no longer keep its active
 * status allowing better metaslabs to be selected.
 */
int zfs_metaslab_fragmentation_threshold = 70;
TUNABLE_INT("vfs.zfs.metaslab.fragmentation_threshold",
    &zfs_metaslab_fragmentation_threshold);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_threshold, CTLFLAG_RWTUN,
    &zfs_metaslab_fragmentation_threshold, 0,
    "Maximum fragmentation percentage at which a metaslab keeps its active state");

/*
 * When set, all metaslabs are loaded when the pool is first opened.
 */
int metaslab_debug_load = 0;
TUNABLE_INT("vfs.zfs.metaslab.debug_load", &metaslab_debug_load);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_load, CTLFLAG_RWTUN,
    &metaslab_debug_load, 0,
    "Load all metaslabs when pool is first opened");

/*
 * When set will prevent metaslabs from being unloaded.
 */
int metaslab_debug_unload = 0;
TUNABLE_INT("vfs.zfs.metaslab.debug_unload", &metaslab_debug_unload);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_unload, CTLFLAG_RWTUN,
    &metaslab_debug_unload, 0,
    "Prevent metaslabs from being unloaded");

/*
 * Minimum size which forces the dynamic allocator to change
 * its allocation strategy. Once the space map cannot satisfy
 * an allocation of this size, it switches to using a more
 * aggressive strategy (i.e. search by size rather than offset).
 */
uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
TUNABLE_QUAD("vfs.zfs.metaslab.df_alloc_threshold",
    &metaslab_df_alloc_threshold);
SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN,
    &metaslab_df_alloc_threshold, 0,
    "Minimum size which forces the dynamic allocator to change its allocation strategy");

/*
 * The minimum free space, in percent, which must be available
 * in a space map to continue allocations in a first-fit fashion.
 * Once the space_map's free space drops below this level we dynamically
 * switch to using best-fit allocations.
 */
int metaslab_df_free_pct = 4;
TUNABLE_INT("vfs.zfs.metaslab.df_free_pct", &metaslab_df_free_pct);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN,
    &metaslab_df_free_pct, 0,
    "The minimum free space, in percent, which must be available in a space map to continue allocations in a first-fit fashion");

/*
 * A metaslab is considered "free" if it contains a contiguous
 * segment which is greater than metaslab_min_alloc_size.
 */
uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
TUNABLE_QUAD("vfs.zfs.metaslab.min_alloc_size",
    &metaslab_min_alloc_size);
SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, min_alloc_size, CTLFLAG_RWTUN,
    &metaslab_min_alloc_size, 0,
    "A metaslab is considered \"free\" if it contains a contiguous segment which is greater than vfs.zfs.metaslab.min_alloc_size");

/*
 * Percentage of all cpus that can be used by the metaslab taskq.
 */
int metaslab_load_pct = 50;
TUNABLE_INT("vfs.zfs.metaslab.load_pct", &metaslab_load_pct);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN,
    &metaslab_load_pct, 0,
    "Percentage of cpus that can be used by the metaslab taskq");

/*
 * Determines how many txgs a metaslab may remain loaded without having any
 * allocations from it. As long as a metaslab continues to be used we will
 * keep it loaded.
 */
int metaslab_unload_delay = TXG_SIZE * 2;
TUNABLE_INT("vfs.zfs.metaslab.unload_delay", &metaslab_unload_delay);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN,
    &metaslab_unload_delay, 0,
    "Number of TXGs that an unused metaslab can be kept in memory");

/*
 * Max number of metaslabs per group to preload.
 */
int metaslab_preload_limit = SPA_DVAS_PER_BP;
TUNABLE_INT("vfs.zfs.metaslab.preload_limit", &metaslab_preload_limit);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN,
    &metaslab_preload_limit, 0,
    "Max number of metaslabs per group to preload");

/*
 * Enable/disable preloading of metaslabs.
 */
boolean_t metaslab_preload_enabled = B_TRUE;
TUNABLE_INT("vfs.zfs.metaslab.preload_enabled", &metaslab_preload_enabled);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN,
    &metaslab_preload_enabled, 0,
    "Enable metaslab preloading");

/*
 * Enable/disable fragmentation weighting on metaslabs.
 */
boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;
TUNABLE_INT("vfs.zfs.metaslab_fragmentation_factor_enabled",
    &metaslab_fragmentation_factor_enabled);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_factor_enabled, CTLFLAG_RWTUN,
    &metaslab_fragmentation_factor_enabled, 0,
    "Enable fragmentation weighting on metaslabs");

/*
 * Enable/disable lba weighting (i.e. outer tracks are given preference).
 */
boolean_t metaslab_lba_weighting_enabled = B_TRUE;
TUNABLE_INT("vfs.zfs.metaslab.lba_weighting_enabled",
    &metaslab_lba_weighting_enabled);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, lba_weighting_enabled, CTLFLAG_RWTUN,
    &metaslab_lba_weighting_enabled, 0,
    "Enable LBA weighting (i.e. outer tracks are given preference)");

/*
 * Enable/disable metaslab group biasing.
 */
boolean_t metaslab_bias_enabled = B_TRUE;
TUNABLE_INT("vfs.zfs.metaslab.bias_enabled",
    &metaslab_bias_enabled);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, bias_enabled, CTLFLAG_RWTUN,
    &metaslab_bias_enabled, 0,
    "Enable metaslab group biasing");

static uint64_t metaslab_fragmentation(metaslab_t *);

/*
 * ==========================================================================
 * Metaslab classes
 * ==========================================================================
 */
metaslab_class_t *
metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
{
	metaslab_class_t *mc;

	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);

	mc->mc_spa = spa;
	mc->mc_rotor = NULL;
	mc->mc_ops = ops;

	return (mc);
}

void
metaslab_class_destroy(metaslab_class_t *mc)
{
	ASSERT(mc->mc_rotor == NULL);
	ASSERT(mc->mc_alloc == 0);
	ASSERT(mc->mc_deferred == 0);
	ASSERT(mc->mc_space == 0);
	ASSERT(mc->mc_dspace == 0);

	kmem_free(mc, sizeof (metaslab_class_t));
}

int
metaslab_class_validate(metaslab_class_t *mc)
{
	metaslab_group_t *mg;
	vdev_t *vd;

	/*
	 * Must hold one of the spa_config locks.
304 */ 305 ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || 306 spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); 307 308 if ((mg = mc->mc_rotor) == NULL) 309 return (0); 310 311 do { 312 vd = mg->mg_vd; 313 ASSERT(vd->vdev_mg != NULL); 314 ASSERT3P(vd->vdev_top, ==, vd); 315 ASSERT3P(mg->mg_class, ==, mc); 316 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); 317 } while ((mg = mg->mg_next) != mc->mc_rotor); 318 319 return (0); 320} 321 322void 323metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, 324 int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) 325{ 326 atomic_add_64(&mc->mc_alloc, alloc_delta); 327 atomic_add_64(&mc->mc_deferred, defer_delta); 328 atomic_add_64(&mc->mc_space, space_delta); 329 atomic_add_64(&mc->mc_dspace, dspace_delta); 330} 331 332void 333metaslab_class_minblocksize_update(metaslab_class_t *mc) 334{ 335 metaslab_group_t *mg; 336 vdev_t *vd; 337 uint64_t minashift = UINT64_MAX; 338 339 if ((mg = mc->mc_rotor) == NULL) { 340 mc->mc_minblocksize = SPA_MINBLOCKSIZE; 341 return; 342 } 343 344 do { 345 vd = mg->mg_vd; 346 if (vd->vdev_ashift < minashift) 347 minashift = vd->vdev_ashift; 348 } while ((mg = mg->mg_next) != mc->mc_rotor); 349 350 mc->mc_minblocksize = 1ULL << minashift; 351} 352 353uint64_t 354metaslab_class_get_alloc(metaslab_class_t *mc) 355{ 356 return (mc->mc_alloc); 357} 358 359uint64_t 360metaslab_class_get_deferred(metaslab_class_t *mc) 361{ 362 return (mc->mc_deferred); 363} 364 365uint64_t 366metaslab_class_get_space(metaslab_class_t *mc) 367{ 368 return (mc->mc_space); 369} 370 371uint64_t 372metaslab_class_get_dspace(metaslab_class_t *mc) 373{ 374 return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); 375} 376 377uint64_t 378metaslab_class_get_minblocksize(metaslab_class_t *mc) 379{ 380 return (mc->mc_minblocksize); 381} 382 383void 384metaslab_class_histogram_verify(metaslab_class_t *mc) 385{ 386 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 387 uint64_t *mc_hist; 388 int i; 389 390 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) 391 return; 392 393 mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, 394 KM_SLEEP); 395 396 for (int c = 0; c < rvd->vdev_children; c++) { 397 vdev_t *tvd = rvd->vdev_child[c]; 398 metaslab_group_t *mg = tvd->vdev_mg; 399 400 /* 401 * Skip any holes, uninitialized top-levels, or 402 * vdevs that are not in this metalab class. 403 */ 404 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || 405 mg->mg_class != mc) { 406 continue; 407 } 408 409 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 410 mc_hist[i] += mg->mg_histogram[i]; 411 } 412 413 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 414 VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]); 415 416 kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); 417} 418 419/* 420 * Calculate the metaslab class's fragmentation metric. The metric 421 * is weighted based on the space contribution of each metaslab group. 422 * The return value will be a number between 0 and 100 (inclusive), or 423 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the 424 * zfs_frag_table for more information about the metric. 
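The space-weighted average described in that comment can be sketched in isolation. The helper below is an editor's illustration, not part of this file: each group's fragmentation percentage and space contribution are passed in as plain arrays, whereas the real metaslab_class_fragmentation() that follows walks the top-level vdevs under the SCL_VDEV config lock.

/*
 * Editor's illustrative sketch (not part of metaslab.c): space-weighted
 * average of per-group fragmentation values.  ZFS_FRAG_INVALID is
 * propagated as soon as any group lacks a valid metric.
 */
static uint64_t
example_class_fragmentation(const uint64_t *frag, const uint64_t *space,
    int ngroups)
{
	uint64_t weighted = 0, total = 0;

	for (int i = 0; i < ngroups; i++) {
		if (frag[i] == ZFS_FRAG_INVALID)
			return (ZFS_FRAG_INVALID);
		weighted += frag[i] * space[i];
		total += space[i];
	}
	/* e.g. 30% of a 10G group and 60% of a 2G group: (300 + 120) / 12 = 35 */
	return (total == 0 ? 0 : weighted / total);
}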
425 */ 426uint64_t 427metaslab_class_fragmentation(metaslab_class_t *mc) 428{ 429 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 430 uint64_t fragmentation = 0; 431 432 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); 433 434 for (int c = 0; c < rvd->vdev_children; c++) { 435 vdev_t *tvd = rvd->vdev_child[c]; 436 metaslab_group_t *mg = tvd->vdev_mg; 437 438 /* 439 * Skip any holes, uninitialized top-levels, or 440 * vdevs that are not in this metalab class. 441 */ 442 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || 443 mg->mg_class != mc) { 444 continue; 445 } 446 447 /* 448 * If a metaslab group does not contain a fragmentation 449 * metric then just bail out. 450 */ 451 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { 452 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 453 return (ZFS_FRAG_INVALID); 454 } 455 456 /* 457 * Determine how much this metaslab_group is contributing 458 * to the overall pool fragmentation metric. 459 */ 460 fragmentation += mg->mg_fragmentation * 461 metaslab_group_get_space(mg); 462 } 463 fragmentation /= metaslab_class_get_space(mc); 464 465 ASSERT3U(fragmentation, <=, 100); 466 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 467 return (fragmentation); 468} 469 470/* 471 * Calculate the amount of expandable space that is available in 472 * this metaslab class. If a device is expanded then its expandable 473 * space will be the amount of allocatable space that is currently not 474 * part of this metaslab class. 475 */ 476uint64_t 477metaslab_class_expandable_space(metaslab_class_t *mc) 478{ 479 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 480 uint64_t space = 0; 481 482 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); 483 for (int c = 0; c < rvd->vdev_children; c++) { 484 vdev_t *tvd = rvd->vdev_child[c]; 485 metaslab_group_t *mg = tvd->vdev_mg; 486 487 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || 488 mg->mg_class != mc) { 489 continue; 490 } 491 492 /* 493 * Calculate if we have enough space to add additional 494 * metaslabs. We report the expandable space in terms 495 * of the metaslab size since that's the unit of expansion. 496 */ 497 space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize, 498 1ULL << tvd->vdev_ms_shift); 499 } 500 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 501 return (space); 502} 503 504/* 505 * ========================================================================== 506 * Metaslab groups 507 * ========================================================================== 508 */ 509static int 510metaslab_compare(const void *x1, const void *x2) 511{ 512 const metaslab_t *m1 = x1; 513 const metaslab_t *m2 = x2; 514 515 if (m1->ms_weight < m2->ms_weight) 516 return (1); 517 if (m1->ms_weight > m2->ms_weight) 518 return (-1); 519 520 /* 521 * If the weights are identical, use the offset to force uniqueness. 522 */ 523 if (m1->ms_start < m2->ms_start) 524 return (-1); 525 if (m1->ms_start > m2->ms_start) 526 return (1); 527 528 ASSERT3P(m1, ==, m2); 529 530 return (0); 531} 532 533/* 534 * Update the allocatable flag and the metaslab group's capacity. 535 * The allocatable flag is set to true if the capacity is below 536 * the zfs_mg_noalloc_threshold. If a metaslab group transitions 537 * from allocatable to non-allocatable or vice versa then the metaslab 538 * group's class is updated to reflect the transition. 
539 */ 540static void 541metaslab_group_alloc_update(metaslab_group_t *mg) 542{ 543 vdev_t *vd = mg->mg_vd; 544 metaslab_class_t *mc = mg->mg_class; 545 vdev_stat_t *vs = &vd->vdev_stat; 546 boolean_t was_allocatable; 547 548 ASSERT(vd == vd->vdev_top); 549 550 mutex_enter(&mg->mg_lock); 551 was_allocatable = mg->mg_allocatable; 552 553 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / 554 (vs->vs_space + 1); 555 556 /* 557 * A metaslab group is considered allocatable if it has plenty 558 * of free space or is not heavily fragmented. We only take 559 * fragmentation into account if the metaslab group has a valid 560 * fragmentation metric (i.e. a value between 0 and 100). 561 */ 562 mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold && 563 (mg->mg_fragmentation == ZFS_FRAG_INVALID || 564 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); 565 566 /* 567 * The mc_alloc_groups maintains a count of the number of 568 * groups in this metaslab class that are still above the 569 * zfs_mg_noalloc_threshold. This is used by the allocating 570 * threads to determine if they should avoid allocations to 571 * a given group. The allocator will avoid allocations to a group 572 * if that group has reached or is below the zfs_mg_noalloc_threshold 573 * and there are still other groups that are above the threshold. 574 * When a group transitions from allocatable to non-allocatable or 575 * vice versa we update the metaslab class to reflect that change. 576 * When the mc_alloc_groups value drops to 0 that means that all 577 * groups have reached the zfs_mg_noalloc_threshold making all groups 578 * eligible for allocations. This effectively means that all devices 579 * are balanced again. 580 */ 581 if (was_allocatable && !mg->mg_allocatable) 582 mc->mc_alloc_groups--; 583 else if (!was_allocatable && mg->mg_allocatable) 584 mc->mc_alloc_groups++; 585 586 mutex_exit(&mg->mg_lock); 587} 588 589metaslab_group_t * 590metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) 591{ 592 metaslab_group_t *mg; 593 594 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); 595 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); 596 avl_create(&mg->mg_metaslab_tree, metaslab_compare, 597 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); 598 mg->mg_vd = vd; 599 mg->mg_class = mc; 600 mg->mg_activation_count = 0; 601 602 mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, 603 minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT); 604 605 return (mg); 606} 607 608void 609metaslab_group_destroy(metaslab_group_t *mg) 610{ 611 ASSERT(mg->mg_prev == NULL); 612 ASSERT(mg->mg_next == NULL); 613 /* 614 * We may have gone below zero with the activation count 615 * either because we never activated in the first place or 616 * because we're done, and possibly removing the vdev. 
617 */ 618 ASSERT(mg->mg_activation_count <= 0); 619 620 taskq_destroy(mg->mg_taskq); 621 avl_destroy(&mg->mg_metaslab_tree); 622 mutex_destroy(&mg->mg_lock); 623 kmem_free(mg, sizeof (metaslab_group_t)); 624} 625 626void 627metaslab_group_activate(metaslab_group_t *mg) 628{ 629 metaslab_class_t *mc = mg->mg_class; 630 metaslab_group_t *mgprev, *mgnext; 631 632 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); 633 634 ASSERT(mc->mc_rotor != mg); 635 ASSERT(mg->mg_prev == NULL); 636 ASSERT(mg->mg_next == NULL); 637 ASSERT(mg->mg_activation_count <= 0); 638 639 if (++mg->mg_activation_count <= 0) 640 return; 641 642 mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); 643 metaslab_group_alloc_update(mg); 644 645 if ((mgprev = mc->mc_rotor) == NULL) { 646 mg->mg_prev = mg; 647 mg->mg_next = mg; 648 } else { 649 mgnext = mgprev->mg_next; 650 mg->mg_prev = mgprev; 651 mg->mg_next = mgnext; 652 mgprev->mg_next = mg; 653 mgnext->mg_prev = mg; 654 } 655 mc->mc_rotor = mg; 656 metaslab_class_minblocksize_update(mc); 657} 658 659void 660metaslab_group_passivate(metaslab_group_t *mg) 661{ 662 metaslab_class_t *mc = mg->mg_class; 663 metaslab_group_t *mgprev, *mgnext; 664 665 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); 666 667 if (--mg->mg_activation_count != 0) { 668 ASSERT(mc->mc_rotor != mg); 669 ASSERT(mg->mg_prev == NULL); 670 ASSERT(mg->mg_next == NULL); 671 ASSERT(mg->mg_activation_count < 0); 672 return; 673 } 674 675 taskq_wait(mg->mg_taskq); 676 metaslab_group_alloc_update(mg); 677 678 mgprev = mg->mg_prev; 679 mgnext = mg->mg_next; 680 681 if (mg == mgnext) { 682 mc->mc_rotor = NULL; 683 } else { 684 mc->mc_rotor = mgnext; 685 mgprev->mg_next = mgnext; 686 mgnext->mg_prev = mgprev; 687 } 688 689 mg->mg_prev = NULL; 690 mg->mg_next = NULL; 691 metaslab_class_minblocksize_update(mc); 692} 693 694uint64_t 695metaslab_group_get_space(metaslab_group_t *mg) 696{ 697 return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count); 698} 699 700void 701metaslab_group_histogram_verify(metaslab_group_t *mg) 702{ 703 uint64_t *mg_hist; 704 vdev_t *vd = mg->mg_vd; 705 uint64_t ashift = vd->vdev_ashift; 706 int i; 707 708 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) 709 return; 710 711 mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, 712 KM_SLEEP); 713 714 ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=, 715 SPACE_MAP_HISTOGRAM_SIZE + ashift); 716 717 for (int m = 0; m < vd->vdev_ms_count; m++) { 718 metaslab_t *msp = vd->vdev_ms[m]; 719 720 if (msp->ms_sm == NULL) 721 continue; 722 723 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) 724 mg_hist[i + ashift] += 725 msp->ms_sm->sm_phys->smp_histogram[i]; 726 } 727 728 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++) 729 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); 730 731 kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); 732} 733 734static void 735metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp) 736{ 737 metaslab_class_t *mc = mg->mg_class; 738 uint64_t ashift = mg->mg_vd->vdev_ashift; 739 740 ASSERT(MUTEX_HELD(&msp->ms_lock)); 741 if (msp->ms_sm == NULL) 742 return; 743 744 mutex_enter(&mg->mg_lock); 745 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 746 mg->mg_histogram[i + ashift] += 747 msp->ms_sm->sm_phys->smp_histogram[i]; 748 mc->mc_histogram[i + ashift] += 749 msp->ms_sm->sm_phys->smp_histogram[i]; 750 } 751 mutex_exit(&mg->mg_lock); 752} 753 754void 755metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) 756{ 757 
metaslab_class_t *mc = mg->mg_class; 758 uint64_t ashift = mg->mg_vd->vdev_ashift; 759 760 ASSERT(MUTEX_HELD(&msp->ms_lock)); 761 if (msp->ms_sm == NULL) 762 return; 763 764 mutex_enter(&mg->mg_lock); 765 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 766 ASSERT3U(mg->mg_histogram[i + ashift], >=, 767 msp->ms_sm->sm_phys->smp_histogram[i]); 768 ASSERT3U(mc->mc_histogram[i + ashift], >=, 769 msp->ms_sm->sm_phys->smp_histogram[i]); 770 771 mg->mg_histogram[i + ashift] -= 772 msp->ms_sm->sm_phys->smp_histogram[i]; 773 mc->mc_histogram[i + ashift] -= 774 msp->ms_sm->sm_phys->smp_histogram[i]; 775 } 776 mutex_exit(&mg->mg_lock); 777} 778 779static void 780metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) 781{ 782 ASSERT(msp->ms_group == NULL); 783 mutex_enter(&mg->mg_lock); 784 msp->ms_group = mg; 785 msp->ms_weight = 0; 786 avl_add(&mg->mg_metaslab_tree, msp); 787 mutex_exit(&mg->mg_lock); 788 789 mutex_enter(&msp->ms_lock); 790 metaslab_group_histogram_add(mg, msp); 791 mutex_exit(&msp->ms_lock); 792} 793 794static void 795metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) 796{ 797 mutex_enter(&msp->ms_lock); 798 metaslab_group_histogram_remove(mg, msp); 799 mutex_exit(&msp->ms_lock); 800 801 mutex_enter(&mg->mg_lock); 802 ASSERT(msp->ms_group == mg); 803 avl_remove(&mg->mg_metaslab_tree, msp); 804 msp->ms_group = NULL; 805 mutex_exit(&mg->mg_lock); 806} 807 808static void 809metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) 810{ 811 /* 812 * Although in principle the weight can be any value, in 813 * practice we do not use values in the range [1, 511]. 814 */ 815 ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0); 816 ASSERT(MUTEX_HELD(&msp->ms_lock)); 817 818 mutex_enter(&mg->mg_lock); 819 ASSERT(msp->ms_group == mg); 820 avl_remove(&mg->mg_metaslab_tree, msp); 821 msp->ms_weight = weight; 822 avl_add(&mg->mg_metaslab_tree, msp); 823 mutex_exit(&mg->mg_lock); 824} 825 826/* 827 * Calculate the fragmentation for a given metaslab group. We can use 828 * a simple average here since all metaslabs within the group must have 829 * the same size. The return value will be a value between 0 and 100 830 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this 831 * group have a fragmentation metric. 832 */ 833uint64_t 834metaslab_group_fragmentation(metaslab_group_t *mg) 835{ 836 vdev_t *vd = mg->mg_vd; 837 uint64_t fragmentation = 0; 838 uint64_t valid_ms = 0; 839 840 for (int m = 0; m < vd->vdev_ms_count; m++) { 841 metaslab_t *msp = vd->vdev_ms[m]; 842 843 if (msp->ms_fragmentation == ZFS_FRAG_INVALID) 844 continue; 845 846 valid_ms++; 847 fragmentation += msp->ms_fragmentation; 848 } 849 850 if (valid_ms <= vd->vdev_ms_count / 2) 851 return (ZFS_FRAG_INVALID); 852 853 fragmentation /= valid_ms; 854 ASSERT3U(fragmentation, <=, 100); 855 return (fragmentation); 856} 857 858/* 859 * Determine if a given metaslab group should skip allocations. A metaslab 860 * group should avoid allocations if its free capacity is less than the 861 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than 862 * zfs_mg_fragmentation_threshold and there is at least one metaslab group 863 * that can still handle allocations. 
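A simplified sketch of that rule follows; it is an editor's illustration, not part of this file. A group stays eligible while its free capacity exceeds zfs_mg_noalloc_threshold and its fragmentation, when valid, is at or below zfs_mg_fragmentation_threshold. The real metaslab_group_allocatable() that follows additionally allows allocations when the group is not in the normal class or when every group in the class has already been ruled ineligible (mc_alloc_groups == 0).

/*
 * Editor's illustrative sketch (not part of metaslab.c): the two per-group
 * thresholds described above, taking the percentages as plain arguments.
 */
static boolean_t
example_group_eligible(uint64_t free_capacity, uint64_t fragmentation)
{
	if (free_capacity <= zfs_mg_noalloc_threshold)
		return (B_FALSE);	/* not enough free space */
	if (fragmentation != ZFS_FRAG_INVALID &&
	    fragmentation > zfs_mg_fragmentation_threshold)
		return (B_FALSE);	/* too fragmented */
	return (B_TRUE);
}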
864 */ 865static boolean_t 866metaslab_group_allocatable(metaslab_group_t *mg) 867{ 868 vdev_t *vd = mg->mg_vd; 869 spa_t *spa = vd->vdev_spa; 870 metaslab_class_t *mc = mg->mg_class; 871 872 /* 873 * We use two key metrics to determine if a metaslab group is 874 * considered allocatable -- free space and fragmentation. If 875 * the free space is greater than the free space threshold and 876 * the fragmentation is less than the fragmentation threshold then 877 * consider the group allocatable. There are two case when we will 878 * not consider these key metrics. The first is if the group is 879 * associated with a slog device and the second is if all groups 880 * in this metaslab class have already been consider ineligible 881 * for allocations. 882 */ 883 return ((mg->mg_free_capacity > zfs_mg_noalloc_threshold && 884 (mg->mg_fragmentation == ZFS_FRAG_INVALID || 885 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)) || 886 mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0); 887} 888 889/* 890 * ========================================================================== 891 * Range tree callbacks 892 * ========================================================================== 893 */ 894 895/* 896 * Comparison function for the private size-ordered tree. Tree is sorted 897 * by size, larger sizes at the end of the tree. 898 */ 899static int 900metaslab_rangesize_compare(const void *x1, const void *x2) 901{ 902 const range_seg_t *r1 = x1; 903 const range_seg_t *r2 = x2; 904 uint64_t rs_size1 = r1->rs_end - r1->rs_start; 905 uint64_t rs_size2 = r2->rs_end - r2->rs_start; 906 907 if (rs_size1 < rs_size2) 908 return (-1); 909 if (rs_size1 > rs_size2) 910 return (1); 911 912 if (r1->rs_start < r2->rs_start) 913 return (-1); 914 915 if (r1->rs_start > r2->rs_start) 916 return (1); 917 918 return (0); 919} 920 921/* 922 * Create any block allocator specific components. The current allocators 923 * rely on using both a size-ordered range_tree_t and an array of uint64_t's. 924 */ 925static void 926metaslab_rt_create(range_tree_t *rt, void *arg) 927{ 928 metaslab_t *msp = arg; 929 930 ASSERT3P(rt->rt_arg, ==, msp); 931 ASSERT(msp->ms_tree == NULL); 932 933 avl_create(&msp->ms_size_tree, metaslab_rangesize_compare, 934 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); 935} 936 937/* 938 * Destroy the block allocator specific components. 939 */ 940static void 941metaslab_rt_destroy(range_tree_t *rt, void *arg) 942{ 943 metaslab_t *msp = arg; 944 945 ASSERT3P(rt->rt_arg, ==, msp); 946 ASSERT3P(msp->ms_tree, ==, rt); 947 ASSERT0(avl_numnodes(&msp->ms_size_tree)); 948 949 avl_destroy(&msp->ms_size_tree); 950} 951 952static void 953metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg) 954{ 955 metaslab_t *msp = arg; 956 957 ASSERT3P(rt->rt_arg, ==, msp); 958 ASSERT3P(msp->ms_tree, ==, rt); 959 VERIFY(!msp->ms_condensing); 960 avl_add(&msp->ms_size_tree, rs); 961} 962 963static void 964metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg) 965{ 966 metaslab_t *msp = arg; 967 968 ASSERT3P(rt->rt_arg, ==, msp); 969 ASSERT3P(msp->ms_tree, ==, rt); 970 VERIFY(!msp->ms_condensing); 971 avl_remove(&msp->ms_size_tree, rs); 972} 973 974static void 975metaslab_rt_vacate(range_tree_t *rt, void *arg) 976{ 977 metaslab_t *msp = arg; 978 979 ASSERT3P(rt->rt_arg, ==, msp); 980 ASSERT3P(msp->ms_tree, ==, rt); 981 982 /* 983 * Normally one would walk the tree freeing nodes along the way. 
984 * Since the nodes are shared with the range trees we can avoid 985 * walking all nodes and just reinitialize the avl tree. The nodes 986 * will be freed by the range tree, so we don't want to free them here. 987 */ 988 avl_create(&msp->ms_size_tree, metaslab_rangesize_compare, 989 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); 990} 991 992static range_tree_ops_t metaslab_rt_ops = { 993 metaslab_rt_create, 994 metaslab_rt_destroy, 995 metaslab_rt_add, 996 metaslab_rt_remove, 997 metaslab_rt_vacate 998}; 999 1000/* 1001 * ========================================================================== 1002 * Metaslab block operations 1003 * ========================================================================== 1004 */ 1005 1006/* 1007 * Return the maximum contiguous segment within the metaslab. 1008 */ 1009uint64_t 1010metaslab_block_maxsize(metaslab_t *msp) 1011{ 1012 avl_tree_t *t = &msp->ms_size_tree; 1013 range_seg_t *rs; 1014 1015 if (t == NULL || (rs = avl_last(t)) == NULL) 1016 return (0ULL); 1017 1018 return (rs->rs_end - rs->rs_start); 1019} 1020 1021uint64_t 1022metaslab_block_alloc(metaslab_t *msp, uint64_t size) 1023{ 1024 uint64_t start; 1025 range_tree_t *rt = msp->ms_tree; 1026 1027 VERIFY(!msp->ms_condensing); 1028 1029 start = msp->ms_ops->msop_alloc(msp, size); 1030 if (start != -1ULL) { 1031 vdev_t *vd = msp->ms_group->mg_vd; 1032 1033 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); 1034 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 1035 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); 1036 range_tree_remove(rt, start, size); 1037 } 1038 return (start); 1039} 1040 1041/* 1042 * ========================================================================== 1043 * Common allocator routines 1044 * ========================================================================== 1045 */ 1046 1047/* 1048 * This is a helper function that can be used by the allocator to find 1049 * a suitable block to allocate. This will search the specified AVL 1050 * tree looking for a block that matches the specified criteria. 1051 */ 1052static uint64_t 1053metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, 1054 uint64_t align) 1055{ 1056 range_seg_t *rs, rsearch; 1057 avl_index_t where; 1058 1059 rsearch.rs_start = *cursor; 1060 rsearch.rs_end = *cursor + size; 1061 1062 rs = avl_find(t, &rsearch, &where); 1063 if (rs == NULL) 1064 rs = avl_nearest(t, where, AVL_AFTER); 1065 1066 while (rs != NULL) { 1067 uint64_t offset = P2ROUNDUP(rs->rs_start, align); 1068 1069 if (offset + size <= rs->rs_end) { 1070 *cursor = offset + size; 1071 return (offset); 1072 } 1073 rs = AVL_NEXT(t, rs); 1074 } 1075 1076 /* 1077 * If we know we've searched the whole map (*cursor == 0), give up. 1078 * Otherwise, reset the cursor to the beginning and try again. 1079 */ 1080 if (*cursor == 0) 1081 return (-1ULL); 1082 1083 *cursor = 0; 1084 return (metaslab_block_picker(t, cursor, size, align)); 1085} 1086 1087/* 1088 * ========================================================================== 1089 * The first-fit block allocator 1090 * ========================================================================== 1091 */ 1092static uint64_t 1093metaslab_ff_alloc(metaslab_t *msp, uint64_t size) 1094{ 1095 /* 1096 * Find the largest power of 2 block size that evenly divides the 1097 * requested size. This is used to try to allocate blocks with similar 1098 * alignment from the same area of the metaslab (i.e. 
same cursor 1099 * bucket) but it does not guarantee that other allocations sizes 1100 * may exist in the same region. 1101 */ 1102 uint64_t align = size & -size; 1103 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; 1104 avl_tree_t *t = &msp->ms_tree->rt_root; 1105 1106 return (metaslab_block_picker(t, cursor, size, align)); 1107} 1108 1109static metaslab_ops_t metaslab_ff_ops = { 1110 metaslab_ff_alloc 1111}; 1112 1113/* 1114 * ========================================================================== 1115 * Dynamic block allocator - 1116 * Uses the first fit allocation scheme until space get low and then 1117 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold 1118 * and metaslab_df_free_pct to determine when to switch the allocation scheme. 1119 * ========================================================================== 1120 */ 1121static uint64_t 1122metaslab_df_alloc(metaslab_t *msp, uint64_t size) 1123{ 1124 /* 1125 * Find the largest power of 2 block size that evenly divides the 1126 * requested size. This is used to try to allocate blocks with similar 1127 * alignment from the same area of the metaslab (i.e. same cursor 1128 * bucket) but it does not guarantee that other allocations sizes 1129 * may exist in the same region. 1130 */ 1131 uint64_t align = size & -size; 1132 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; 1133 range_tree_t *rt = msp->ms_tree; 1134 avl_tree_t *t = &rt->rt_root; 1135 uint64_t max_size = metaslab_block_maxsize(msp); 1136 int free_pct = range_tree_space(rt) * 100 / msp->ms_size; 1137 1138 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1139 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree)); 1140 1141 if (max_size < size) 1142 return (-1ULL); 1143 1144 /* 1145 * If we're running low on space switch to using the size 1146 * sorted AVL tree (best-fit). 1147 */ 1148 if (max_size < metaslab_df_alloc_threshold || 1149 free_pct < metaslab_df_free_pct) { 1150 t = &msp->ms_size_tree; 1151 *cursor = 0; 1152 } 1153 1154 return (metaslab_block_picker(t, cursor, size, 1ULL)); 1155} 1156 1157static metaslab_ops_t metaslab_df_ops = { 1158 metaslab_df_alloc 1159}; 1160 1161/* 1162 * ========================================================================== 1163 * Cursor fit block allocator - 1164 * Select the largest region in the metaslab, set the cursor to the beginning 1165 * of the range and the cursor_end to the end of the range. As allocations 1166 * are made advance the cursor. Continue allocating from the cursor until 1167 * the range is exhausted and then find a new range. 
1168 * ========================================================================== 1169 */ 1170static uint64_t 1171metaslab_cf_alloc(metaslab_t *msp, uint64_t size) 1172{ 1173 range_tree_t *rt = msp->ms_tree; 1174 avl_tree_t *t = &msp->ms_size_tree; 1175 uint64_t *cursor = &msp->ms_lbas[0]; 1176 uint64_t *cursor_end = &msp->ms_lbas[1]; 1177 uint64_t offset = 0; 1178 1179 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1180 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root)); 1181 1182 ASSERT3U(*cursor_end, >=, *cursor); 1183 1184 if ((*cursor + size) > *cursor_end) { 1185 range_seg_t *rs; 1186 1187 rs = avl_last(&msp->ms_size_tree); 1188 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) 1189 return (-1ULL); 1190 1191 *cursor = rs->rs_start; 1192 *cursor_end = rs->rs_end; 1193 } 1194 1195 offset = *cursor; 1196 *cursor += size; 1197 1198 return (offset); 1199} 1200 1201static metaslab_ops_t metaslab_cf_ops = { 1202 metaslab_cf_alloc 1203}; 1204 1205/* 1206 * ========================================================================== 1207 * New dynamic fit allocator - 1208 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift 1209 * contiguous blocks. If no region is found then just use the largest segment 1210 * that remains. 1211 * ========================================================================== 1212 */ 1213 1214/* 1215 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift) 1216 * to request from the allocator. 1217 */ 1218uint64_t metaslab_ndf_clump_shift = 4; 1219 1220static uint64_t 1221metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) 1222{ 1223 avl_tree_t *t = &msp->ms_tree->rt_root; 1224 avl_index_t where; 1225 range_seg_t *rs, rsearch; 1226 uint64_t hbit = highbit64(size); 1227 uint64_t *cursor = &msp->ms_lbas[hbit - 1]; 1228 uint64_t max_size = metaslab_block_maxsize(msp); 1229 1230 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1231 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree)); 1232 1233 if (max_size < size) 1234 return (-1ULL); 1235 1236 rsearch.rs_start = *cursor; 1237 rsearch.rs_end = *cursor + size; 1238 1239 rs = avl_find(t, &rsearch, &where); 1240 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) { 1241 t = &msp->ms_size_tree; 1242 1243 rsearch.rs_start = 0; 1244 rsearch.rs_end = MIN(max_size, 1245 1ULL << (hbit + metaslab_ndf_clump_shift)); 1246 rs = avl_find(t, &rsearch, &where); 1247 if (rs == NULL) 1248 rs = avl_nearest(t, where, AVL_AFTER); 1249 ASSERT(rs != NULL); 1250 } 1251 1252 if ((rs->rs_end - rs->rs_start) >= size) { 1253 *cursor = rs->rs_start + size; 1254 return (rs->rs_start); 1255 } 1256 return (-1ULL); 1257} 1258 1259static metaslab_ops_t metaslab_ndf_ops = { 1260 metaslab_ndf_alloc 1261}; 1262 1263metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; 1264 1265/* 1266 * ========================================================================== 1267 * Metaslabs 1268 * ========================================================================== 1269 */ 1270 1271/* 1272 * Wait for any in-progress metaslab loads to complete. 
1273 */ 1274void 1275metaslab_load_wait(metaslab_t *msp) 1276{ 1277 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1278 1279 while (msp->ms_loading) { 1280 ASSERT(!msp->ms_loaded); 1281 cv_wait(&msp->ms_load_cv, &msp->ms_lock); 1282 } 1283} 1284 1285int 1286metaslab_load(metaslab_t *msp) 1287{ 1288 int error = 0; 1289 1290 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1291 ASSERT(!msp->ms_loaded); 1292 ASSERT(!msp->ms_loading); 1293 1294 msp->ms_loading = B_TRUE; 1295 1296 /* 1297 * If the space map has not been allocated yet, then treat 1298 * all the space in the metaslab as free and add it to the 1299 * ms_tree. 1300 */ 1301 if (msp->ms_sm != NULL) 1302 error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE); 1303 else 1304 range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size); 1305 1306 msp->ms_loaded = (error == 0); 1307 msp->ms_loading = B_FALSE; 1308 1309 if (msp->ms_loaded) { 1310 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1311 range_tree_walk(msp->ms_defertree[t], 1312 range_tree_remove, msp->ms_tree); 1313 } 1314 } 1315 cv_broadcast(&msp->ms_load_cv); 1316 return (error); 1317} 1318 1319void 1320metaslab_unload(metaslab_t *msp) 1321{ 1322 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1323 range_tree_vacate(msp->ms_tree, NULL, NULL); 1324 msp->ms_loaded = B_FALSE; 1325 msp->ms_weight &= ~METASLAB_ACTIVE_MASK; 1326} 1327 1328int 1329metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, 1330 metaslab_t **msp) 1331{ 1332 vdev_t *vd = mg->mg_vd; 1333 objset_t *mos = vd->vdev_spa->spa_meta_objset; 1334 metaslab_t *ms; 1335 int error; 1336 1337 ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); 1338 mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); 1339 cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); 1340 ms->ms_id = id; 1341 ms->ms_start = id << vd->vdev_ms_shift; 1342 ms->ms_size = 1ULL << vd->vdev_ms_shift; 1343 1344 /* 1345 * We only open space map objects that already exist. All others 1346 * will be opened when we finally allocate an object for it. 1347 */ 1348 if (object != 0) { 1349 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, 1350 ms->ms_size, vd->vdev_ashift, &ms->ms_lock); 1351 1352 if (error != 0) { 1353 kmem_free(ms, sizeof (metaslab_t)); 1354 return (error); 1355 } 1356 1357 ASSERT(ms->ms_sm != NULL); 1358 } 1359 1360 /* 1361 * We create the main range tree here, but we don't create the 1362 * alloctree and freetree until metaslab_sync_done(). This serves 1363 * two purposes: it allows metaslab_sync_done() to detect the 1364 * addition of new space; and for debugging, it ensures that we'd 1365 * data fault on any attempt to use this metaslab before it's ready. 1366 */ 1367 ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms, &ms->ms_lock); 1368 metaslab_group_add(mg, ms); 1369 1370 ms->ms_fragmentation = metaslab_fragmentation(ms); 1371 ms->ms_ops = mg->mg_class->mc_ops; 1372 1373 /* 1374 * If we're opening an existing pool (txg == 0) or creating 1375 * a new one (txg == TXG_INITIAL), all space is available now. 1376 * If we're adding space to an existing pool, the new space 1377 * does not become available until after this txg has synced. 1378 */ 1379 if (txg <= TXG_INITIAL) 1380 metaslab_sync_done(ms, 0); 1381 1382 /* 1383 * If metaslab_debug_load is set and we're initializing a metaslab 1384 * that has an allocated space_map object then load the its space 1385 * map so that can verify frees. 
 */
	if (metaslab_debug_load && ms->ms_sm != NULL) {
		mutex_enter(&ms->ms_lock);
		VERIFY0(metaslab_load(ms));
		mutex_exit(&ms->ms_lock);
	}

	if (txg != 0) {
		vdev_dirty(vd, 0, NULL, txg);
		vdev_dirty(vd, VDD_METASLAB, ms, txg);
	}

	*msp = ms;

	return (0);
}

void
metaslab_fini(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;

	metaslab_group_remove(mg, msp);

	mutex_enter(&msp->ms_lock);

	VERIFY(msp->ms_group == NULL);
	vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
	    0, -msp->ms_size);
	space_map_close(msp->ms_sm);

	metaslab_unload(msp);
	range_tree_destroy(msp->ms_tree);

	for (int t = 0; t < TXG_SIZE; t++) {
		range_tree_destroy(msp->ms_alloctree[t]);
		range_tree_destroy(msp->ms_freetree[t]);
	}

	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
		range_tree_destroy(msp->ms_defertree[t]);
	}

	ASSERT0(msp->ms_deferspace);

	mutex_exit(&msp->ms_lock);
	cv_destroy(&msp->ms_load_cv);
	mutex_destroy(&msp->ms_lock);

	kmem_free(msp, sizeof (metaslab_t));
}

#define FRAGMENTATION_TABLE_SIZE 17

/*
 * This table defines a segment size based fragmentation metric that will
 * allow each metaslab to derive its own fragmentation value. This is done
 * by calculating the space in each bucket of the spacemap histogram and
 * multiplying that by the fragmentation metric in this table. Doing
 * this for all buckets and dividing it by the total amount of free
 * space in this metaslab (i.e. the total free space in all buckets) gives
 * us the fragmentation metric. This means that a high fragmentation metric
 * equates to most of the free space being comprised of small segments.
 * Conversely, if the metric is low, then most of the free space is in
 * large segments. A 10% change in fragmentation equates to approximately
 * double the number of segments.
 *
 * This table defines 0% fragmented space using 16MB segments. Testing has
 * shown that segments that are greater than or equal to 16MB do not suffer
 * from drastic performance problems. Using this value, we derive the rest
 * of the table. Since the fragmentation value is never stored on disk, it
 * is possible to change these calculations in the future.
 */
int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
	100,	/* 512B	*/
	100,	/* 1K	*/
	98,	/* 2K	*/
	95,	/* 4K	*/
	90,	/* 8K	*/
	80,	/* 16K	*/
	70,	/* 32K	*/
	60,	/* 64K	*/
	50,	/* 128K	*/
	40,	/* 256K	*/
	30,	/* 512K	*/
	20,	/* 1M	*/
	15,	/* 2M	*/
	10,	/* 4M	*/
	5,	/* 8M	*/
	0	/* 16M	*/
};

/*
 * Calculate the metaslab's fragmentation metric. A return value
 * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does
 * not support this metric. Otherwise, the return value should be in the
 * range [0, 100].
 */
static uint64_t
metaslab_fragmentation(metaslab_t *msp)
{
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
	uint64_t fragmentation = 0;
	uint64_t total = 0;
	boolean_t feature_enabled = spa_feature_is_enabled(spa,
	    SPA_FEATURE_SPACEMAP_HISTOGRAM);

	if (!feature_enabled)
		return (ZFS_FRAG_INVALID);

	/*
	 * A null space map means that the entire metaslab is free
	 * and thus is not fragmented.
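To make the table above concrete, the sketch below (an editor's illustration, not part of this file) applies it to a simple per-bucket breakdown of free space: each bucket's space is weighted by the matching zfs_frag_table entry and the sum is divided by the total free space, which is the computation the table's comment describes. The real metaslab_fragmentation() continuing below does the same thing but derives its buckets from the space map's on-disk histogram, shifted by sm_shift.

/*
 * Editor's illustrative sketch (not part of metaslab.c): derive a
 * fragmentation percentage from a free-space histogram.  bucket_space[i]
 * is the amount of free space held in segments of the i-th size class,
 * where class 0 corresponds to the first zfs_frag_table entry (512B).
 */
static uint64_t
example_fragmentation(const uint64_t *bucket_space, int nbuckets)
{
	uint64_t fragmentation = 0, total = 0;

	for (int i = 0; i < nbuckets; i++) {
		int idx = MIN(i, FRAGMENTATION_TABLE_SIZE - 1);

		fragmentation += bucket_space[i] * zfs_frag_table[idx];
		total += bucket_space[i];
	}
	return (total == 0 ? 0 : fragmentation / total);
}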
1499 */ 1500 if (msp->ms_sm == NULL) 1501 return (0); 1502 1503 /* 1504 * If this metaslab's space_map has not been upgraded, flag it 1505 * so that we upgrade next time we encounter it. 1506 */ 1507 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { 1508 uint64_t txg = spa_syncing_txg(spa); 1509 vdev_t *vd = msp->ms_group->mg_vd; 1510 1511 if (spa_writeable(spa)) { 1512 msp->ms_condense_wanted = B_TRUE; 1513 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 1514 spa_dbgmsg(spa, "txg %llu, requesting force condense: " 1515 "msp %p, vd %p", txg, msp, vd); 1516 } 1517 return (ZFS_FRAG_INVALID); 1518 } 1519 1520 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 1521 uint64_t space = 0; 1522 uint8_t shift = msp->ms_sm->sm_shift; 1523 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, 1524 FRAGMENTATION_TABLE_SIZE - 1); 1525 1526 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) 1527 continue; 1528 1529 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); 1530 total += space; 1531 1532 ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE); 1533 fragmentation += space * zfs_frag_table[idx]; 1534 } 1535 1536 if (total > 0) 1537 fragmentation /= total; 1538 ASSERT3U(fragmentation, <=, 100); 1539 return (fragmentation); 1540} 1541 1542/* 1543 * Compute a weight -- a selection preference value -- for the given metaslab. 1544 * This is based on the amount of free space, the level of fragmentation, 1545 * the LBA range, and whether the metaslab is loaded. 1546 */ 1547static uint64_t 1548metaslab_weight(metaslab_t *msp) 1549{ 1550 metaslab_group_t *mg = msp->ms_group; 1551 vdev_t *vd = mg->mg_vd; 1552 uint64_t weight, space; 1553 1554 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1555 1556 /* 1557 * This vdev is in the process of being removed so there is nothing 1558 * for us to do here. 1559 */ 1560 if (vd->vdev_removing) { 1561 ASSERT0(space_map_allocated(msp->ms_sm)); 1562 ASSERT0(vd->vdev_ms_shift); 1563 return (0); 1564 } 1565 1566 /* 1567 * The baseline weight is the metaslab's free space. 1568 */ 1569 space = msp->ms_size - space_map_allocated(msp->ms_sm); 1570 1571 msp->ms_fragmentation = metaslab_fragmentation(msp); 1572 if (metaslab_fragmentation_factor_enabled && 1573 msp->ms_fragmentation != ZFS_FRAG_INVALID) { 1574 /* 1575 * Use the fragmentation information to inversely scale 1576 * down the baseline weight. We need to ensure that we 1577 * don't exclude this metaslab completely when it's 100% 1578 * fragmented. To avoid this we reduce the fragmented value 1579 * by 1. 1580 */ 1581 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100; 1582 1583 /* 1584 * If space < SPA_MINBLOCKSIZE, then we will not allocate from 1585 * this metaslab again. The fragmentation metric may have 1586 * decreased the space to something smaller than 1587 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE 1588 * so that we can consume any remaining space. 1589 */ 1590 if (space > 0 && space < SPA_MINBLOCKSIZE) 1591 space = SPA_MINBLOCKSIZE; 1592 } 1593 weight = space; 1594 1595 /* 1596 * Modern disks have uniform bit density and constant angular velocity. 1597 * Therefore, the outer recording zones are faster (higher bandwidth) 1598 * than the inner zones by the ratio of outer to inner track diameter, 1599 * which is typically around 2:1. We account for this by assigning 1600 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). 1601 * In effect, this means that we'll select the metaslab with the most 1602 * free bandwidth rather than simply the one with the most free space. 
1603 */ 1604 if (metaslab_lba_weighting_enabled) { 1605 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; 1606 ASSERT(weight >= space && weight <= 2 * space); 1607 } 1608 1609 /* 1610 * If this metaslab is one we're actively using, adjust its 1611 * weight to make it preferable to any inactive metaslab so 1612 * we'll polish it off. If the fragmentation on this metaslab 1613 * has exceed our threshold, then don't mark it active. 1614 */ 1615 if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID && 1616 msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) { 1617 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); 1618 } 1619 1620 return (weight); 1621} 1622 1623static int 1624metaslab_activate(metaslab_t *msp, uint64_t activation_weight) 1625{ 1626 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1627 1628 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { 1629 metaslab_load_wait(msp); 1630 if (!msp->ms_loaded) { 1631 int error = metaslab_load(msp); 1632 if (error) { 1633 metaslab_group_sort(msp->ms_group, msp, 0); 1634 return (error); 1635 } 1636 } 1637 1638 metaslab_group_sort(msp->ms_group, msp, 1639 msp->ms_weight | activation_weight); 1640 } 1641 ASSERT(msp->ms_loaded); 1642 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); 1643 1644 return (0); 1645} 1646 1647static void 1648metaslab_passivate(metaslab_t *msp, uint64_t size) 1649{ 1650 /* 1651 * If size < SPA_MINBLOCKSIZE, then we will not allocate from 1652 * this metaslab again. In that case, it had better be empty, 1653 * or we would be leaving space on the table. 1654 */ 1655 ASSERT(size >= SPA_MINBLOCKSIZE || range_tree_space(msp->ms_tree) == 0); 1656 metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size)); 1657 ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); 1658} 1659 1660static void 1661metaslab_preload(void *arg) 1662{ 1663 metaslab_t *msp = arg; 1664 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1665 1666 ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); 1667 1668 mutex_enter(&msp->ms_lock); 1669 metaslab_load_wait(msp); 1670 if (!msp->ms_loaded) 1671 (void) metaslab_load(msp); 1672 1673 /* 1674 * Set the ms_access_txg value so that we don't unload it right away. 1675 */ 1676 msp->ms_access_txg = spa_syncing_txg(spa) + metaslab_unload_delay + 1; 1677 mutex_exit(&msp->ms_lock); 1678} 1679 1680static void 1681metaslab_group_preload(metaslab_group_t *mg) 1682{ 1683 spa_t *spa = mg->mg_vd->vdev_spa; 1684 metaslab_t *msp; 1685 avl_tree_t *t = &mg->mg_metaslab_tree; 1686 int m = 0; 1687 1688 if (spa_shutting_down(spa) || !metaslab_preload_enabled) { 1689 taskq_wait(mg->mg_taskq); 1690 return; 1691 } 1692 1693 mutex_enter(&mg->mg_lock); 1694 /* 1695 * Load the next potential metaslabs 1696 */ 1697 msp = avl_first(t); 1698 while (msp != NULL) { 1699 metaslab_t *msp_next = AVL_NEXT(t, msp); 1700 1701 /* 1702 * We preload only the maximum number of metaslabs specified 1703 * by metaslab_preload_limit. If a metaslab is being forced 1704 * to condense then we preload it too. This will ensure 1705 * that force condensing happens in the next txg. 1706 */ 1707 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { 1708 msp = msp_next; 1709 continue; 1710 } 1711 1712 /* 1713 * We must drop the metaslab group lock here to preserve 1714 * lock ordering with the ms_lock (when grabbing both 1715 * the mg_lock and the ms_lock, the ms_lock must be taken 1716 * first). As a result, it is possible that the ordering 1717 * of the metaslabs within the avl tree may change before 1718 * we reacquire the lock. 
The metaslab cannot be removed from 1719 * the tree while we're in syncing context so it is safe to 1720 * drop the mg_lock here. If the metaslabs are reordered 1721 * nothing will break -- we just may end up loading a 1722 * less than optimal one. 1723 */ 1724 mutex_exit(&mg->mg_lock); 1725 VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload, 1726 msp, TQ_SLEEP) != 0); 1727 mutex_enter(&mg->mg_lock); 1728 msp = msp_next; 1729 } 1730 mutex_exit(&mg->mg_lock); 1731} 1732 1733/* 1734 * Determine if the space map's on-disk footprint is past our tolerance 1735 * for inefficiency. We would like to use the following criteria to make 1736 * our decision: 1737 * 1738 * 1. The size of the space map object should not dramatically increase as a 1739 * result of writing out the free space range tree. 1740 * 1741 * 2. The minimal on-disk space map representation is zfs_condense_pct/100 1742 * times the size than the free space range tree representation 1743 * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1.MB). 1744 * 1745 * 3. The on-disk size of the space map should actually decrease. 1746 * 1747 * Checking the first condition is tricky since we don't want to walk 1748 * the entire AVL tree calculating the estimated on-disk size. Instead we 1749 * use the size-ordered range tree in the metaslab and calculate the 1750 * size required to write out the largest segment in our free tree. If the 1751 * size required to represent that segment on disk is larger than the space 1752 * map object then we avoid condensing this map. 1753 * 1754 * To determine the second criterion we use a best-case estimate and assume 1755 * each segment can be represented on-disk as a single 64-bit entry. We refer 1756 * to this best-case estimate as the space map's minimal form. 1757 * 1758 * Unfortunately, we cannot compute the on-disk size of the space map in this 1759 * context because we cannot accurately compute the effects of compression, etc. 1760 * Instead, we apply the heuristic described in the block comment for 1761 * zfs_metaslab_condense_block_threshold - we only condense if the space used 1762 * is greater than a threshold number of blocks. 1763 */ 1764static boolean_t 1765metaslab_should_condense(metaslab_t *msp) 1766{ 1767 space_map_t *sm = msp->ms_sm; 1768 range_seg_t *rs; 1769 uint64_t size, entries, segsz, object_size, optimal_size, record_size; 1770 dmu_object_info_t doi; 1771 uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift; 1772 1773 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1774 ASSERT(msp->ms_loaded); 1775 1776 /* 1777 * Use the ms_size_tree range tree, which is ordered by size, to 1778 * obtain the largest segment in the free tree. We always condense 1779 * metaslabs that are empty and metaslabs for which a condense 1780 * request has been made. 1781 */ 1782 rs = avl_last(&msp->ms_size_tree); 1783 if (rs == NULL || msp->ms_condense_wanted) 1784 return (B_TRUE); 1785 1786 /* 1787 * Calculate the number of 64-bit entries this segment would 1788 * require when written to disk. If this single segment would be 1789 * larger on-disk than the entire current on-disk structure, then 1790 * clearly condensing will increase the on-disk structure size. 
1764	static boolean_t
1765	metaslab_should_condense(metaslab_t *msp)
1766	{
1767		space_map_t *sm = msp->ms_sm;
1768		range_seg_t *rs;
1769		uint64_t size, entries, segsz, object_size, optimal_size, record_size;
1770		dmu_object_info_t doi;
1771		uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift;
1772
1773		ASSERT(MUTEX_HELD(&msp->ms_lock));
1774		ASSERT(msp->ms_loaded);
1775
1776		/*
1777		 * Use the ms_size_tree range tree, which is ordered by size, to
1778		 * obtain the largest segment in the free tree. We always condense
1779		 * metaslabs that are empty and metaslabs for which a condense
1780		 * request has been made.
1781		 */
1782		rs = avl_last(&msp->ms_size_tree);
1783		if (rs == NULL || msp->ms_condense_wanted)
1784			return (B_TRUE);
1785
1786		/*
1787		 * Calculate the number of 64-bit entries this segment would
1788		 * require when written to disk. If this single segment would be
1789		 * larger on-disk than the entire current on-disk structure, then
1790		 * clearly condensing will increase the on-disk structure size.
1791		 */
1792		size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
1793		entries = size / (MIN(size, SM_RUN_MAX));
1794		segsz = entries * sizeof (uint64_t);
1795
1796		optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
1797		object_size = space_map_length(msp->ms_sm);
1798
1799		dmu_object_info_from_db(sm->sm_dbuf, &doi);
1800		record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
1801
1802		return (segsz <= object_size &&
1803		    object_size >= (optimal_size * zfs_condense_pct / 100) &&
1804		    object_size > zfs_metaslab_condense_block_threshold * record_size);
1805	}
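/*
 * A standalone sketch of the criterion-1 arithmetic above: the largest
 * free segment is converted into sm_shift-sized units and then into
 * 64-bit entries, each covering at most one maximum-length run. The
 * segment size, shift, and EXAMPLE_RUN_MAX (a stand-in for SM_RUN_MAX,
 * whose real value is not assumed here) are all hypothetical.
 */
#include <stdio.h>
#include <stdint.h>

#define	EXAMPLE_RUN_MAX	(1ULL << 15)	/* stand-in for SM_RUN_MAX */
#define	EXAMPLE_MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	uint64_t seg_start = 0;
	uint64_t seg_end = 512ULL << 20;	/* hypothetical 512MB segment */
	uint64_t sm_shift = 9;			/* hypothetical space map shift */

	uint64_t size = (seg_end - seg_start) >> sm_shift;
	uint64_t entries = size / EXAMPLE_MIN(size, EXAMPLE_RUN_MAX);
	uint64_t segsz = entries * sizeof (uint64_t);

	/* With these numbers: 32 entries, 256 bytes on disk. */
	printf("segment needs %llu entries (%llu bytes) on disk\n",
	    (unsigned long long)entries, (unsigned long long)segsz);
	return (0);
}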
1806
1807	/*
1808	 * Condense the on-disk space map representation to its minimized form.
1809	 * The minimized form consists of a small number of allocations followed by
1810	 * the entries of the free range tree.
1811	 */
1812	static void
1813	metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
1814	{
1815		spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1816		range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK];
1817		range_tree_t *condense_tree;
1818		space_map_t *sm = msp->ms_sm;
1819
1820		ASSERT(MUTEX_HELD(&msp->ms_lock));
1821		ASSERT3U(spa_sync_pass(spa), ==, 1);
1822		ASSERT(msp->ms_loaded);
1823
1824
1825		spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
1826		    "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
1827		    msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
1828		    msp->ms_group->mg_vd->vdev_spa->spa_name,
1829		    space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root),
1830		    msp->ms_condense_wanted ? "TRUE" : "FALSE");
1831
1832		msp->ms_condense_wanted = B_FALSE;
1833
1834		/*
1835		 * Create a range tree that is 100% allocated. We remove segments
1836		 * that have been freed in this txg, any deferred frees that exist,
1837		 * and any allocation in the future. Removing segments should be
1838		 * a relatively inexpensive operation since we expect these trees to
1839		 * have a small number of nodes.
1840		 */
1841		condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock);
1842		range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
1843
1844		/*
1845		 * Remove what's been freed in this txg from the condense_tree.
1846		 * Since we're in sync_pass 1, we know that all the frees from
1847		 * this txg are in the freetree.
1848		 */
1849		range_tree_walk(freetree, range_tree_remove, condense_tree);
1850
1851		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1852			range_tree_walk(msp->ms_defertree[t],
1853			    range_tree_remove, condense_tree);
1854		}
1855
1856		for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
1857			range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
1858			    range_tree_remove, condense_tree);
1859		}
1860
1861		/*
1862		 * We're about to drop the metaslab's lock thus allowing
1863		 * other consumers to change its content. Set the
1864		 * metaslab's ms_condensing flag to ensure that
1865		 * allocations on this metaslab do not occur while we're
1866		 * in the middle of committing it to disk. This is only critical
1867		 * for the ms_tree as all other range trees use per txg
1868		 * views of their content.
1869		 */
1870		msp->ms_condensing = B_TRUE;
1871
1872		mutex_exit(&msp->ms_lock);
1873		space_map_truncate(sm, tx);
1874		mutex_enter(&msp->ms_lock);
1875
1876		/*
1877		 * While we would ideally like to create a space_map representation
1878		 * that consists only of allocation records, doing so can be
1879		 * prohibitively expensive because the in-core free tree can be
1880		 * large, and therefore computationally expensive to subtract
1881		 * from the condense_tree. Instead we sync out two trees, a cheap
1882		 * allocation-only tree followed by the in-core free tree. While not
1883		 * optimal, this is typically close to optimal, and much cheaper to
1884		 * compute.
1885		 */
1886		space_map_write(sm, condense_tree, SM_ALLOC, tx);
1887		range_tree_vacate(condense_tree, NULL, NULL);
1888		range_tree_destroy(condense_tree);
1889
1890		space_map_write(sm, msp->ms_tree, SM_FREE, tx);
1891		msp->ms_condensing = B_FALSE;
1892	}
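/*
 * The condense path above starts from a single segment covering the whole
 * metaslab and removes everything that is freed, deferred, or allocated in
 * a future txg; what remains is written as ALLOC records, followed by FREE
 * records for the in-core free tree. A standalone sketch of that
 * "start full, punch holes" step, assuming the holes are already sorted,
 * non-overlapping, and contained in the metaslab (hypothetical seg_t type,
 * not the range tree code):
 */
#include <stdio.h>
#include <stdint.h>

typedef struct { uint64_t start, end; } seg_t;

/* Print the parts of [ms_start, ms_end) not covered by holes[]. */
static void
print_allocated(uint64_t ms_start, uint64_t ms_end,
    const seg_t *holes, int nholes)
{
	uint64_t cursor = ms_start;

	for (int i = 0; i < nholes; i++) {
		if (holes[i].start > cursor)
			printf("ALLOC [%llu, %llu)\n",
			    (unsigned long long)cursor,
			    (unsigned long long)holes[i].start);
		cursor = holes[i].end;
	}
	if (cursor < ms_end)
		printf("ALLOC [%llu, %llu)\n",
		    (unsigned long long)cursor,
		    (unsigned long long)ms_end);
}

int
main(void)
{
	seg_t holes[] = { { 100, 200 }, { 400, 450 } };	/* example freed ranges */

	/* Prints ALLOC [0, 100), ALLOC [200, 400), ALLOC [450, 1000). */
	print_allocated(0, 1000, holes, 2);
	return (0);
}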
1893
1894	/*
1895	 * Write a metaslab to disk in the context of the specified transaction group.
1896	 */
1897	void
1898	metaslab_sync(metaslab_t *msp, uint64_t txg)
1899	{
1900		metaslab_group_t *mg = msp->ms_group;
1901		vdev_t *vd = mg->mg_vd;
1902		spa_t *spa = vd->vdev_spa;
1903		objset_t *mos = spa_meta_objset(spa);
1904		range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
1905		range_tree_t **freetree = &msp->ms_freetree[txg & TXG_MASK];
1906		range_tree_t **freed_tree =
1907		    &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
1908		dmu_tx_t *tx;
1909		uint64_t object = space_map_object(msp->ms_sm);
1910
1911		ASSERT(!vd->vdev_ishole);
1912
1913		/*
1914		 * This metaslab has just been added so there's no work to do now.
1915		 */
1916		if (*freetree == NULL) {
1917			ASSERT3P(alloctree, ==, NULL);
1918			return;
1919		}
1920
1921		ASSERT3P(alloctree, !=, NULL);
1922		ASSERT3P(*freetree, !=, NULL);
1923		ASSERT3P(*freed_tree, !=, NULL);
1924
1925		/*
1926		 * Normally, we don't want to process a metaslab if there
1927		 * are no allocations or frees to perform. However, if the metaslab
1928		 * is being forced to condense we need to let it through.
1929		 */
1930		if (range_tree_space(alloctree) == 0 &&
1931		    range_tree_space(*freetree) == 0 &&
1932		    !msp->ms_condense_wanted)
1933			return;
1934
1935		/*
1936		 * The only state that can actually be changing concurrently with
1937		 * metaslab_sync() is the metaslab's ms_tree. No other thread can
1938		 * be modifying this txg's alloctree, freetree, freed_tree, or
1939		 * space_map_phys_t. Therefore, we only hold ms_lock to satisfy
1940		 * space_map ASSERTs. We drop it whenever we call into the DMU,
1941		 * because the DMU can call down to us (e.g. via zio_free()) at
1942		 * any time.
1943		 */
1944
1945		tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
1946
1947		if (msp->ms_sm == NULL) {
1948			uint64_t new_object;
1949
1950			new_object = space_map_alloc(mos, tx);
1951			VERIFY3U(new_object, !=, 0);
1952
1953			VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
1954			    msp->ms_start, msp->ms_size, vd->vdev_ashift,
1955			    &msp->ms_lock));
1956			ASSERT(msp->ms_sm != NULL);
1957		}
1958
1959		mutex_enter(&msp->ms_lock);
1960
1961		/*
1962		 * Note: metaslab_condense() clears the space_map's histogram.
1963		 * Therefore we must verify and remove this histogram before
1964		 * condensing.
1965		 */
1966		metaslab_group_histogram_verify(mg);
1967		metaslab_class_histogram_verify(mg->mg_class);
1968		metaslab_group_histogram_remove(mg, msp);
1969
1970		if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
1971		    metaslab_should_condense(msp)) {
1972			metaslab_condense(msp, txg, tx);
1973		} else {
1974			space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
1975			space_map_write(msp->ms_sm, *freetree, SM_FREE, tx);
1976		}
1977
1978		if (msp->ms_loaded) {
1979			/*
1980			 * When the space map is loaded, we have an accurate
1981			 * histogram in the range tree. This gives us an opportunity
1982			 * to bring the space map's histogram up-to-date so we clear
1983			 * it first before updating it.
1984			 */
1985			space_map_histogram_clear(msp->ms_sm);
1986			space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
1987		} else {
1988			/*
1989			 * Since the space map is not loaded we simply update the
1990			 * existing histogram with what was freed in this txg. This
1991			 * means that the on-disk histogram may not have an accurate
1992			 * view of the free space but it's close enough to allow
1993			 * us to make allocation decisions.
1994			 */
1995			space_map_histogram_add(msp->ms_sm, *freetree, tx);
1996		}
1997		metaslab_group_histogram_add(mg, msp);
1998		metaslab_group_histogram_verify(mg);
1999		metaslab_class_histogram_verify(mg->mg_class);
2000
2001		/*
2002		 * For sync pass 1, we avoid traversing this txg's free range tree
2003		 * and instead will just swap the pointers for freetree and
2004		 * freed_tree. We can safely do this since the freed_tree is
2005		 * guaranteed to be empty on the initial pass.
2006		 */
2007		if (spa_sync_pass(spa) == 1) {
2008			range_tree_swap(freetree, freed_tree);
2009		} else {
2010			range_tree_vacate(*freetree, range_tree_add, *freed_tree);
2011		}
2012		range_tree_vacate(alloctree, NULL, NULL);
2013
2014		ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
2015		ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
2016
2017		mutex_exit(&msp->ms_lock);
2018
2019		if (object != space_map_object(msp->ms_sm)) {
2020			object = space_map_object(msp->ms_sm);
2021			dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
2022			    msp->ms_id, sizeof (uint64_t), &object, tx);
2023		}
2024		dmu_tx_commit(tx);
2025	}
2026
2027	/*
2028	 * Called after a transaction group has completely synced to mark
2029	 * all of the metaslab's free space as usable.
2030	 */
2031	void
2032	metaslab_sync_done(metaslab_t *msp, uint64_t txg)
2033	{
2034		metaslab_group_t *mg = msp->ms_group;
2035		vdev_t *vd = mg->mg_vd;
2036		range_tree_t **freed_tree;
2037		range_tree_t **defer_tree;
2038		int64_t alloc_delta, defer_delta;
2039
2040		ASSERT(!vd->vdev_ishole);
2041
2042		mutex_enter(&msp->ms_lock);
2043
2044		/*
2045		 * If this metaslab is just becoming available, initialize its
2046		 * alloctrees, freetrees, and defertree and add its capacity to
2047		 * the vdev.
2048		 */
2049		if (msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK] == NULL) {
2050			for (int t = 0; t < TXG_SIZE; t++) {
2051				ASSERT(msp->ms_alloctree[t] == NULL);
2052				ASSERT(msp->ms_freetree[t] == NULL);
2053
2054				msp->ms_alloctree[t] = range_tree_create(NULL, msp,
2055				    &msp->ms_lock);
2056				msp->ms_freetree[t] = range_tree_create(NULL, msp,
2057				    &msp->ms_lock);
2058			}
2059
2060			for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2061				ASSERT(msp->ms_defertree[t] == NULL);
2062
2063				msp->ms_defertree[t] = range_tree_create(NULL, msp,
2064				    &msp->ms_lock);
2065			}
2066
2067			vdev_space_update(vd, 0, 0, msp->ms_size);
2068		}
2069
2070		freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
2071		defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];
2072
2073		alloc_delta = space_map_alloc_delta(msp->ms_sm);
2074		defer_delta = range_tree_space(*freed_tree) -
2075		    range_tree_space(*defer_tree);
2076
2077		vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
2078
2079		ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
2080		ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
2081
2082		/*
2083		 * If there's a metaslab_load() in progress, wait for it to complete
2084		 * so that we have a consistent view of the in-core space map.
2085 */ 2086 metaslab_load_wait(msp); 2087 2088 /* 2089 * Move the frees from the defer_tree back to the free 2090 * range tree (if it's loaded). Swap the freed_tree and the 2091 * defer_tree -- this is safe to do because we've just emptied out 2092 * the defer_tree. 2093 */ 2094 range_tree_vacate(*defer_tree, 2095 msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree); 2096 range_tree_swap(freed_tree, defer_tree); 2097 2098 space_map_update(msp->ms_sm); 2099 2100 msp->ms_deferspace += defer_delta; 2101 ASSERT3S(msp->ms_deferspace, >=, 0); 2102 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); 2103 if (msp->ms_deferspace != 0) { 2104 /* 2105 * Keep syncing this metaslab until all deferred frees 2106 * are back in circulation. 2107 */ 2108 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 2109 } 2110 2111 if (msp->ms_loaded && msp->ms_access_txg < txg) { 2112 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 2113 VERIFY0(range_tree_space( 2114 msp->ms_alloctree[(txg + t) & TXG_MASK])); 2115 } 2116 2117 if (!metaslab_debug_unload) 2118 metaslab_unload(msp); 2119 } 2120 2121 metaslab_group_sort(mg, msp, metaslab_weight(msp)); 2122 mutex_exit(&msp->ms_lock); 2123} 2124 2125void 2126metaslab_sync_reassess(metaslab_group_t *mg) 2127{ 2128 metaslab_group_alloc_update(mg); 2129 mg->mg_fragmentation = metaslab_group_fragmentation(mg); 2130 2131 /* 2132 * Preload the next potential metaslabs 2133 */ 2134 metaslab_group_preload(mg); 2135} 2136 2137static uint64_t 2138metaslab_distance(metaslab_t *msp, dva_t *dva) 2139{ 2140 uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift; 2141 uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift; 2142 uint64_t start = msp->ms_id; 2143 2144 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) 2145 return (1ULL << 63); 2146 2147 if (offset < start) 2148 return ((start - offset) << ms_shift); 2149 if (offset > start) 2150 return ((offset - start) << ms_shift); 2151 return (0); 2152} 2153 2154static uint64_t 2155metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, 2156 uint64_t txg, uint64_t min_distance, dva_t *dva, int d) 2157{ 2158 spa_t *spa = mg->mg_vd->vdev_spa; 2159 metaslab_t *msp = NULL; 2160 uint64_t offset = -1ULL; 2161 avl_tree_t *t = &mg->mg_metaslab_tree; 2162 uint64_t activation_weight; 2163 uint64_t target_distance; 2164 int i; 2165 2166 activation_weight = METASLAB_WEIGHT_PRIMARY; 2167 for (i = 0; i < d; i++) { 2168 if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 2169 activation_weight = METASLAB_WEIGHT_SECONDARY; 2170 break; 2171 } 2172 } 2173 2174 for (;;) { 2175 boolean_t was_active; 2176 2177 mutex_enter(&mg->mg_lock); 2178 for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { 2179 if (msp->ms_weight < asize) { 2180 spa_dbgmsg(spa, "%s: failed to meet weight " 2181 "requirement: vdev %llu, txg %llu, mg %p, " 2182 "msp %p, psize %llu, asize %llu, " 2183 "weight %llu", spa_name(spa), 2184 mg->mg_vd->vdev_id, txg, 2185 mg, msp, psize, asize, msp->ms_weight); 2186 mutex_exit(&mg->mg_lock); 2187 return (-1ULL); 2188 } 2189 2190 /* 2191 * If the selected metaslab is condensing, skip it. 2192 */ 2193 if (msp->ms_condensing) 2194 continue; 2195 2196 was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; 2197 if (activation_weight == METASLAB_WEIGHT_PRIMARY) 2198 break; 2199 2200 target_distance = min_distance + 2201 (space_map_allocated(msp->ms_sm) != 0 ? 
0 : 2202 min_distance >> 1); 2203 2204 for (i = 0; i < d; i++) 2205 if (metaslab_distance(msp, &dva[i]) < 2206 target_distance) 2207 break; 2208 if (i == d) 2209 break; 2210 } 2211 mutex_exit(&mg->mg_lock); 2212 if (msp == NULL) 2213 return (-1ULL); 2214 2215 mutex_enter(&msp->ms_lock); 2216 2217 /* 2218 * Ensure that the metaslab we have selected is still 2219 * capable of handling our request. It's possible that 2220 * another thread may have changed the weight while we 2221 * were blocked on the metaslab lock. 2222 */ 2223 if (msp->ms_weight < asize || (was_active && 2224 !(msp->ms_weight & METASLAB_ACTIVE_MASK) && 2225 activation_weight == METASLAB_WEIGHT_PRIMARY)) { 2226 mutex_exit(&msp->ms_lock); 2227 continue; 2228 } 2229 2230 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) && 2231 activation_weight == METASLAB_WEIGHT_PRIMARY) { 2232 metaslab_passivate(msp, 2233 msp->ms_weight & ~METASLAB_ACTIVE_MASK); 2234 mutex_exit(&msp->ms_lock); 2235 continue; 2236 } 2237 2238 if (metaslab_activate(msp, activation_weight) != 0) { 2239 mutex_exit(&msp->ms_lock); 2240 continue; 2241 } 2242 2243 /* 2244 * If this metaslab is currently condensing then pick again as 2245 * we can't manipulate this metaslab until it's committed 2246 * to disk. 2247 */ 2248 if (msp->ms_condensing) { 2249 mutex_exit(&msp->ms_lock); 2250 continue; 2251 } 2252 2253 if ((offset = metaslab_block_alloc(msp, asize)) != -1ULL) 2254 break; 2255 2256 metaslab_passivate(msp, metaslab_block_maxsize(msp)); 2257 mutex_exit(&msp->ms_lock); 2258 } 2259 2260 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) 2261 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); 2262 2263 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize); 2264 msp->ms_access_txg = txg + metaslab_unload_delay; 2265 2266 mutex_exit(&msp->ms_lock); 2267 2268 return (offset); 2269} 2270 2271/* 2272 * Allocate a block for the specified i/o. 2273 */ 2274static int 2275metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, 2276 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags) 2277{ 2278 metaslab_group_t *mg, *rotor; 2279 vdev_t *vd; 2280 int dshift = 3; 2281 int all_zero; 2282 int zio_lock = B_FALSE; 2283 boolean_t allocatable; 2284 uint64_t offset = -1ULL; 2285 uint64_t asize; 2286 uint64_t distance; 2287 2288 ASSERT(!DVA_IS_VALID(&dva[d])); 2289 2290 /* 2291 * For testing, make some blocks above a certain size be gang blocks. 2292 */ 2293 if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) 2294 return (SET_ERROR(ENOSPC)); 2295 2296 /* 2297 * Start at the rotor and loop through all mgs until we find something. 2298 * Note that there's no locking on mc_rotor or mc_aliquot because 2299 * nothing actually breaks if we miss a few updates -- we just won't 2300 * allocate quite as evenly. It all balances out over time. 2301 * 2302 * If we are doing ditto or log blocks, try to spread them across 2303 * consecutive vdevs. If we're forced to reuse a vdev before we've 2304 * allocated all of our ditto blocks, then try and spread them out on 2305 * that vdev as much as possible. If it turns out to not be possible, 2306 * gradually lower our standards until anything becomes acceptable. 2307 * Also, allocating on consecutive vdevs (as opposed to random vdevs) 2308 * gives us hope of containing our fault domains to something we're 2309 * able to reason about. Otherwise, any two top-level vdev failures 2310 * will guarantee the loss of data. 
With consecutive allocation,
2311	 * only two adjacent top-level vdev failures will result in data loss.
2312	 *
2313	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
2314	 * ourselves on the same vdev as our gang block header. That
2315	 * way, we can hope for locality in vdev_cache, plus it makes our
2316	 * fault domains something tractable.
2317	 */
2318		if (hintdva) {
2319			vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
2320
2321			/*
2322			 * It's possible the vdev we're using as the hint no
2323			 * longer exists (i.e. removed). Consult the rotor when
2324			 * all else fails.
2325			 */
2326			if (vd != NULL) {
2327				mg = vd->vdev_mg;
2328
2329				if (flags & METASLAB_HINTBP_AVOID &&
2330				    mg->mg_next != NULL)
2331					mg = mg->mg_next;
2332			} else {
2333				mg = mc->mc_rotor;
2334			}
2335		} else if (d != 0) {
2336			vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
2337			mg = vd->vdev_mg->mg_next;
2338		} else {
2339			mg = mc->mc_rotor;
2340		}
2341
2342		/*
2343		 * If the hint put us into the wrong metaslab class, or into a
2344		 * metaslab group that has been passivated, just follow the rotor.
2345		 */
2346		if (mg->mg_class != mc || mg->mg_activation_count <= 0)
2347			mg = mc->mc_rotor;
2348
2349		rotor = mg;
2350	top:
2351		all_zero = B_TRUE;
2352		do {
2353			ASSERT(mg->mg_activation_count == 1);
2354
2355			vd = mg->mg_vd;
2356
2357			/*
2358			 * Don't allocate from faulted devices.
2359			 */
2360			if (zio_lock) {
2361				spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
2362				allocatable = vdev_allocatable(vd);
2363				spa_config_exit(spa, SCL_ZIO, FTAG);
2364			} else {
2365				allocatable = vdev_allocatable(vd);
2366			}
2367
2368			/*
2369			 * Determine if the selected metaslab group is eligible
2370			 * for allocations. If we're ganging or have requested
2371			 * an allocation for the smallest gang block size
2372			 * then we don't want to avoid allocating to this
2373			 * metaslab group. If we're in this condition we should
2374			 * try to allocate from any device possible so that we
2375			 * don't inadvertently return ENOSPC and suspend the pool
2376			 * even though space is still available.
2377			 */
2378			if (allocatable && CAN_FASTGANG(flags) &&
2379			    psize > SPA_GANGBLOCKSIZE)
2380				allocatable = metaslab_group_allocatable(mg);
2381
2382			if (!allocatable)
2383				goto next;
2384
2385			/*
2386			 * Avoid writing single-copy data to a failing vdev
2387			 * unless the user instructs us that it is okay.
2388			 */
2389			if ((vd->vdev_stat.vs_write_errors > 0 ||
2390			    vd->vdev_state < VDEV_STATE_HEALTHY) &&
2391			    d == 0 && dshift == 3 && vd->vdev_children == 0) {
2392				all_zero = B_FALSE;
2393				goto next;
2394			}
2395
2396			ASSERT(mg->mg_class == mc);
2397
2398			distance = vd->vdev_asize >> dshift;
2399			if (distance <= (1ULL << vd->vdev_ms_shift))
2400				distance = 0;
2401			else
2402				all_zero = B_FALSE;
2403
2404			asize = vdev_psize_to_asize(vd, psize);
2405			ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
2406
2407			offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
2408			    dva, d);
2409			if (offset != -1ULL) {
2410				/*
2411				 * If we've just selected this metaslab group,
2412				 * figure out whether the corresponding vdev is
2413				 * over- or under-used relative to the pool,
2414				 * and set an allocation bias to even it out.
2415 */ 2416 if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { 2417 vdev_stat_t *vs = &vd->vdev_stat; 2418 int64_t vu, cu; 2419 2420 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); 2421 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); 2422 2423 /* 2424 * Calculate how much more or less we should 2425 * try to allocate from this device during 2426 * this iteration around the rotor. 2427 * For example, if a device is 80% full 2428 * and the pool is 20% full then we should 2429 * reduce allocations by 60% on this device. 2430 * 2431 * mg_bias = (20 - 80) * 512K / 100 = -307K 2432 * 2433 * This reduces allocations by 307K for this 2434 * iteration. 2435 */ 2436 mg->mg_bias = ((cu - vu) * 2437 (int64_t)mg->mg_aliquot) / 100; 2438 } else if (!metaslab_bias_enabled) { 2439 mg->mg_bias = 0; 2440 } 2441 2442 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= 2443 mg->mg_aliquot + mg->mg_bias) { 2444 mc->mc_rotor = mg->mg_next; 2445 mc->mc_aliquot = 0; 2446 } 2447 2448 DVA_SET_VDEV(&dva[d], vd->vdev_id); 2449 DVA_SET_OFFSET(&dva[d], offset); 2450 DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); 2451 DVA_SET_ASIZE(&dva[d], asize); 2452 2453 return (0); 2454 } 2455next: 2456 mc->mc_rotor = mg->mg_next; 2457 mc->mc_aliquot = 0; 2458 } while ((mg = mg->mg_next) != rotor); 2459 2460 if (!all_zero) { 2461 dshift++; 2462 ASSERT(dshift < 64); 2463 goto top; 2464 } 2465 2466 if (!allocatable && !zio_lock) { 2467 dshift = 3; 2468 zio_lock = B_TRUE; 2469 goto top; 2470 } 2471 2472 bzero(&dva[d], sizeof (dva_t)); 2473 2474 return (SET_ERROR(ENOSPC)); 2475} 2476 2477/* 2478 * Free the block represented by DVA in the context of the specified 2479 * transaction group. 2480 */ 2481static void 2482metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now) 2483{ 2484 uint64_t vdev = DVA_GET_VDEV(dva); 2485 uint64_t offset = DVA_GET_OFFSET(dva); 2486 uint64_t size = DVA_GET_ASIZE(dva); 2487 vdev_t *vd; 2488 metaslab_t *msp; 2489 2490 ASSERT(DVA_IS_VALID(dva)); 2491 2492 if (txg > spa_freeze_txg(spa)) 2493 return; 2494 2495 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 2496 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { 2497 cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", 2498 (u_longlong_t)vdev, (u_longlong_t)offset); 2499 ASSERT(0); 2500 return; 2501 } 2502 2503 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 2504 2505 if (DVA_GET_GANG(dva)) 2506 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 2507 2508 mutex_enter(&msp->ms_lock); 2509 2510 if (now) { 2511 range_tree_remove(msp->ms_alloctree[txg & TXG_MASK], 2512 offset, size); 2513 2514 VERIFY(!msp->ms_condensing); 2515 VERIFY3U(offset, >=, msp->ms_start); 2516 VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); 2517 VERIFY3U(range_tree_space(msp->ms_tree) + size, <=, 2518 msp->ms_size); 2519 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 2520 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 2521 range_tree_add(msp->ms_tree, offset, size); 2522 } else { 2523 if (range_tree_space(msp->ms_freetree[txg & TXG_MASK]) == 0) 2524 vdev_dirty(vd, VDD_METASLAB, msp, txg); 2525 range_tree_add(msp->ms_freetree[txg & TXG_MASK], 2526 offset, size); 2527 } 2528 2529 mutex_exit(&msp->ms_lock); 2530} 2531 2532/* 2533 * Intent log support: upon opening the pool after a crash, notify the SPA 2534 * of blocks that the intent log has allocated for immediate write, but 2535 * which are still considered free by the SPA because the last transaction 2536 * group didn't commit yet. 
2537 */ 2538static int 2539metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) 2540{ 2541 uint64_t vdev = DVA_GET_VDEV(dva); 2542 uint64_t offset = DVA_GET_OFFSET(dva); 2543 uint64_t size = DVA_GET_ASIZE(dva); 2544 vdev_t *vd; 2545 metaslab_t *msp; 2546 int error = 0; 2547 2548 ASSERT(DVA_IS_VALID(dva)); 2549 2550 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 2551 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) 2552 return (SET_ERROR(ENXIO)); 2553 2554 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 2555 2556 if (DVA_GET_GANG(dva)) 2557 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 2558 2559 mutex_enter(&msp->ms_lock); 2560 2561 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) 2562 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); 2563 2564 if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size)) 2565 error = SET_ERROR(ENOENT); 2566 2567 if (error || txg == 0) { /* txg == 0 indicates dry run */ 2568 mutex_exit(&msp->ms_lock); 2569 return (error); 2570 } 2571 2572 VERIFY(!msp->ms_condensing); 2573 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 2574 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 2575 VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size); 2576 range_tree_remove(msp->ms_tree, offset, size); 2577 2578 if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ 2579 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) 2580 vdev_dirty(vd, VDD_METASLAB, msp, txg); 2581 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size); 2582 } 2583 2584 mutex_exit(&msp->ms_lock); 2585 2586 return (0); 2587} 2588 2589int 2590metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, 2591 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags) 2592{ 2593 dva_t *dva = bp->blk_dva; 2594 dva_t *hintdva = hintbp->blk_dva; 2595 int error = 0; 2596 2597 ASSERT(bp->blk_birth == 0); 2598 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); 2599 2600 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 2601 2602 if (mc->mc_rotor == NULL) { /* no vdevs in this class */ 2603 spa_config_exit(spa, SCL_ALLOC, FTAG); 2604 return (SET_ERROR(ENOSPC)); 2605 } 2606 2607 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); 2608 ASSERT(BP_GET_NDVAS(bp) == 0); 2609 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); 2610 2611 for (int d = 0; d < ndvas; d++) { 2612 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, 2613 txg, flags); 2614 if (error != 0) { 2615 for (d--; d >= 0; d--) { 2616 metaslab_free_dva(spa, &dva[d], txg, B_TRUE); 2617 bzero(&dva[d], sizeof (dva_t)); 2618 } 2619 spa_config_exit(spa, SCL_ALLOC, FTAG); 2620 return (error); 2621 } 2622 } 2623 ASSERT(error == 0); 2624 ASSERT(BP_GET_NDVAS(bp) == ndvas); 2625 2626 spa_config_exit(spa, SCL_ALLOC, FTAG); 2627 2628 BP_SET_BIRTH(bp, txg, txg); 2629 2630 return (0); 2631} 2632 2633void 2634metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) 2635{ 2636 const dva_t *dva = bp->blk_dva; 2637 int ndvas = BP_GET_NDVAS(bp); 2638 2639 ASSERT(!BP_IS_HOLE(bp)); 2640 ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); 2641 2642 spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); 2643 2644 for (int d = 0; d < ndvas; d++) 2645 metaslab_free_dva(spa, &dva[d], txg, now); 2646 2647 spa_config_exit(spa, SCL_FREE, FTAG); 2648} 2649 2650int 2651metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) 2652{ 2653 const dva_t *dva = bp->blk_dva; 2654 int ndvas = BP_GET_NDVAS(bp); 2655 int error = 0; 2656 2657 ASSERT(!BP_IS_HOLE(bp)); 2658 2659 if 
(txg != 0) { 2660 /* 2661 * First do a dry run to make sure all DVAs are claimable, 2662 * so we don't have to unwind from partial failures below. 2663 */ 2664 if ((error = metaslab_claim(spa, bp, 0)) != 0) 2665 return (error); 2666 } 2667 2668 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 2669 2670 for (int d = 0; d < ndvas; d++) 2671 if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) 2672 break; 2673 2674 spa_config_exit(spa, SCL_ALLOC, FTAG); 2675 2676 ASSERT(error == 0 || txg == 0); 2677 2678 return (error); 2679} 2680 2681void 2682metaslab_check_free(spa_t *spa, const blkptr_t *bp) 2683{ 2684 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) 2685 return; 2686 2687 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 2688 for (int i = 0; i < BP_GET_NDVAS(bp); i++) { 2689 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]); 2690 vdev_t *vd = vdev_lookup_top(spa, vdev); 2691 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); 2692 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); 2693 metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 2694 2695 if (msp->ms_loaded) 2696 range_tree_verify(msp->ms_tree, offset, size); 2697 2698 for (int j = 0; j < TXG_SIZE; j++) 2699 range_tree_verify(msp->ms_freetree[j], offset, size); 2700 for (int j = 0; j < TXG_DEFER_SIZE; j++) 2701 range_tree_verify(msp->ms_defertree[j], offset, size); 2702 } 2703 spa_config_exit(spa, SCL_VDEV, FTAG); 2704} 2705
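/*
 * metaslab_claim() above validates every DVA with a txg of 0 before
 * claiming any of them, so a failure never leaves a partially claimed
 * block pointer. Below is a small standalone sketch of that
 * check-then-commit shape with a hypothetical per-item callback
 * (apply_fn_t, apply_all_or_nothing); it illustrates the pattern only
 * and is not one of the ZFS entry points.
 */
#include <stddef.h>

typedef int (*apply_fn_t)(int item, int commit);

/* Apply 'fn' to every item, but only after a full dry run succeeds. */
static int
apply_all_or_nothing(const int *items, size_t n, apply_fn_t fn)
{
	int err = 0;

	/* Dry run: stop at the first item that cannot be applied. */
	for (size_t i = 0; i < n && err == 0; i++)
		err = fn(items[i], 0);
	if (err != 0)
		return (err);

	/* Commit: the dry run makes failure here unexpected. */
	for (size_t i = 0; i < n && err == 0; i++)
		err = fn(items[i], 1);
	return (err);
}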