metaslab.c revision 297112
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2015 by Delphix. All rights reserved. 24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 25 * Copyright (c) 2014 Integros [integros.com] 26 */ 27 28#include <sys/zfs_context.h> 29#include <sys/dmu.h> 30#include <sys/dmu_tx.h> 31#include <sys/space_map.h> 32#include <sys/metaslab_impl.h> 33#include <sys/vdev_impl.h> 34#include <sys/zio.h> 35#include <sys/spa_impl.h> 36#include <sys/zfeature.h> 37 38SYSCTL_DECL(_vfs_zfs); 39SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab"); 40 41/* 42 * Allow allocations to switch to gang blocks quickly. We do this to 43 * avoid having to load lots of space_maps in a given txg. There are, 44 * however, some cases where we want to avoid "fast" ganging and instead 45 * we want to do an exhaustive search of all metaslabs on this device. 46 * Currently we don't allow any gang, slog, or dump device related allocations 47 * to "fast" gang. 48 */ 49#define CAN_FASTGANG(flags) \ 50 (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \ 51 METASLAB_GANG_AVOID))) 52 53#define METASLAB_WEIGHT_PRIMARY (1ULL << 63) 54#define METASLAB_WEIGHT_SECONDARY (1ULL << 62) 55#define METASLAB_ACTIVE_MASK \ 56 (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) 57 58uint64_t metaslab_aliquot = 512ULL << 10; 59uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ 60TUNABLE_QUAD("vfs.zfs.metaslab.gang_bang", &metaslab_gang_bang); 61SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, gang_bang, CTLFLAG_RWTUN, 62 &metaslab_gang_bang, 0, 63 "Force gang block allocation for blocks larger than or equal to this value"); 64 65/* 66 * The in-core space map representation is more compact than its on-disk form. 67 * The zfs_condense_pct determines how much more compact the in-core 68 * space_map representation must be before we compact it on-disk. 69 * Values should be greater than or equal to 100. 70 */ 71int zfs_condense_pct = 200; 72TUNABLE_INT("vfs.zfs.condense_pct", &zfs_condense_pct); 73SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN, 74 &zfs_condense_pct, 0, 75 "Condense on-disk spacemap when it is more than this many percents" 76 " of in-memory counterpart"); 77 78/* 79 * Condensing a metaslab is not guaranteed to actually reduce the amount of 80 * space used on disk. In particular, a space map uses data in increments of 81 * MAX(1 << ashift, space_map_blksize), so a metaslab might use the 82 * same number of blocks after condensing. 
Since the goal of condensing is to 83 * reduce the number of IOPs required to read the space map, we only want to 84 * condense when we can be sure we will reduce the number of blocks used by the 85 * space map. Unfortunately, we cannot precisely compute whether or not this is 86 * the case in metaslab_should_condense since we are holding ms_lock. Instead, 87 * we apply the following heuristic: do not condense a spacemap unless the 88 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold 89 * blocks. 90 */ 91int zfs_metaslab_condense_block_threshold = 4; 92 93/* 94 * The zfs_mg_noalloc_threshold defines which metaslab groups should 95 * be eligible for allocation. The value is defined as a percentage of 96 * free space. Metaslab groups that have more free space than 97 * zfs_mg_noalloc_threshold are always eligible for allocations. Once 98 * a metaslab group's free space is less than or equal to the 99 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that 100 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold. 101 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all 102 * groups are allowed to accept allocations. Gang blocks are always 103 * eligible to allocate on any metaslab group. The default value of 0 means 104 * no metaslab group will be excluded based on this criterion. 105 */ 106int zfs_mg_noalloc_threshold = 0; 107TUNABLE_INT("vfs.zfs.mg_noalloc_threshold", &zfs_mg_noalloc_threshold); 108SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN, 109 &zfs_mg_noalloc_threshold, 0, 110 "Percentage of metaslab group size that should be free" 111 " to make it eligible for allocation"); 112 113/* 114 * Metaslab groups are considered eligible for allocations if their 115 * fragmenation metric (measured as a percentage) is less than or equal to 116 * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold 117 * then it will be skipped unless all metaslab groups within the metaslab 118 * class have also crossed this threshold. 119 */ 120int zfs_mg_fragmentation_threshold = 85; 121TUNABLE_INT("vfs.zfs.mg_fragmentation_threshold", &zfs_mg_fragmentation_threshold); 122SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_fragmentation_threshold, CTLFLAG_RWTUN, 123 &zfs_mg_fragmentation_threshold, 0, 124 "Percentage of metaslab group size that should be considered " 125 "eligible for allocations unless all metaslab groups within the metaslab class " 126 "have also crossed this threshold"); 127 128/* 129 * Allow metaslabs to keep their active state as long as their fragmentation 130 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An 131 * active metaslab that exceeds this threshold will no longer keep its active 132 * status allowing better metaslabs to be selected. 133 */ 134int zfs_metaslab_fragmentation_threshold = 70; 135TUNABLE_INT("vfs.zfs.metaslab.fragmentation_threshold", 136 &zfs_metaslab_fragmentation_threshold); 137SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_threshold, CTLFLAG_RWTUN, 138 &zfs_metaslab_fragmentation_threshold, 0, 139 "Maximum percentage of metaslab fragmentation level to keep their active state"); 140 141/* 142 * When set will load all metaslabs when pool is first opened. 
143 */ 144int metaslab_debug_load = 0; 145TUNABLE_INT("vfs.zfs.metaslab.debug_load", &metaslab_debug_load); 146SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_load, CTLFLAG_RWTUN, 147 &metaslab_debug_load, 0, 148 "Load all metaslabs when pool is first opened"); 149 150/* 151 * When set will prevent metaslabs from being unloaded. 152 */ 153int metaslab_debug_unload = 0; 154TUNABLE_INT("vfs.zfs.metaslab.debug_unload", &metaslab_debug_unload); 155SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_unload, CTLFLAG_RWTUN, 156 &metaslab_debug_unload, 0, 157 "Prevent metaslabs from being unloaded"); 158 159/* 160 * Minimum size which forces the dynamic allocator to change 161 * its allocation strategy. Once the space map cannot satisfy 162 * an allocation of this size then it switches to using a more 163 * aggressive strategy (i.e. search by size rather than offset). 164 */ 165uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE; 166TUNABLE_QUAD("vfs.zfs.metaslab.df_alloc_threshold", 167 &metaslab_df_alloc_threshold); 168SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN, 169 &metaslab_df_alloc_threshold, 0, 170 "Minimum size which forces the dynamic allocator to change its allocation strategy"); 171 172/* 173 * The minimum free space, in percent, which must be available 174 * in a space map to continue allocations in a first-fit fashion. 175 * Once the space_map's free space drops below this level we dynamically 176 * switch to using best-fit allocations. 177 */ 178int metaslab_df_free_pct = 4; 179TUNABLE_INT("vfs.zfs.metaslab.df_free_pct", &metaslab_df_free_pct); 180SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN, 181 &metaslab_df_free_pct, 0, 182 "The minimum free space, in percent, which must be available in a space map to continue allocations in a first-fit fashion"); 183 184/* 185 * A metaslab is considered "free" if it contains a contiguous 186 * segment which is greater than metaslab_min_alloc_size. 187 */ 188uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS; 189TUNABLE_QUAD("vfs.zfs.metaslab.min_alloc_size", 190 &metaslab_min_alloc_size); 191SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, min_alloc_size, CTLFLAG_RWTUN, 192 &metaslab_min_alloc_size, 0, 193 "A metaslab is considered \"free\" if it contains a contiguous segment which is greater than vfs.zfs.metaslab.min_alloc_size"); 194 195/* 196 * Percentage of all cpus that can be used by the metaslab taskq. 197 */ 198int metaslab_load_pct = 50; 199TUNABLE_INT("vfs.zfs.metaslab.load_pct", &metaslab_load_pct); 200SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN, 201 &metaslab_load_pct, 0, 202 "Percentage of cpus that can be used by the metaslab taskq"); 203 204/* 205 * Determines how many txgs a metaslab may remain loaded without having any 206 * allocations from it. As long as a metaslab continues to be used we will 207 * keep it loaded. 208 */ 209int metaslab_unload_delay = TXG_SIZE * 2; 210TUNABLE_INT("vfs.zfs.metaslab.unload_delay", &metaslab_unload_delay); 211SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN, 212 &metaslab_unload_delay, 0, 213 "Number of TXGs that an unused metaslab can be kept in memory"); 214 215/* 216 * Max number of metaslabs per group to preload.
217 */ 218int metaslab_preload_limit = SPA_DVAS_PER_BP; 219TUNABLE_INT("vfs.zfs.metaslab.preload_limit", &metaslab_preload_limit); 220SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN, 221 &metaslab_preload_limit, 0, 222 "Max number of metaslabs per group to preload"); 223 224/* 225 * Enable/disable preloading of metaslabs. 226 */ 227boolean_t metaslab_preload_enabled = B_TRUE; 228TUNABLE_INT("vfs.zfs.metaslab.preload_enabled", &metaslab_preload_enabled); 229SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN, 230 &metaslab_preload_enabled, 0, 231 "Enable preloading of metaslabs"); 232 233/* 234 * Enable/disable fragmentation weighting on metaslabs. 235 */ 236boolean_t metaslab_fragmentation_factor_enabled = B_TRUE; 237TUNABLE_INT("vfs.zfs.metaslab_fragmentation_factor_enabled", 238 &metaslab_fragmentation_factor_enabled); 239SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_factor_enabled, CTLFLAG_RWTUN, 240 &metaslab_fragmentation_factor_enabled, 0, 241 "Enable fragmentation weighting on metaslabs"); 242 243/* 244 * Enable/disable lba weighting (i.e. outer tracks are given preference). 245 */ 246boolean_t metaslab_lba_weighting_enabled = B_TRUE; 247TUNABLE_INT("vfs.zfs.metaslab.lba_weighting_enabled", 248 &metaslab_lba_weighting_enabled); 249SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, lba_weighting_enabled, CTLFLAG_RWTUN, 250 &metaslab_lba_weighting_enabled, 0, 251 "Enable LBA weighting (i.e. outer tracks are given preference)"); 252 253/* 254 * Enable/disable metaslab group biasing. 255 */ 256boolean_t metaslab_bias_enabled = B_TRUE; 257TUNABLE_INT("vfs.zfs.metaslab.bias_enabled", 258 &metaslab_bias_enabled); 259SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, bias_enabled, CTLFLAG_RWTUN, 260 &metaslab_bias_enabled, 0, 261 "Enable metaslab group biasing"); 262 263static uint64_t metaslab_fragmentation(metaslab_t *); 264 265/* 266 * ========================================================================== 267 * Metaslab classes 268 * ========================================================================== 269 */ 270metaslab_class_t * 271metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) 272{ 273 metaslab_class_t *mc; 274 275 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); 276 277 mc->mc_spa = spa; 278 mc->mc_rotor = NULL; 279 mc->mc_ops = ops; 280 281 return (mc); 282} 283 284void 285metaslab_class_destroy(metaslab_class_t *mc) 286{ 287 ASSERT(mc->mc_rotor == NULL); 288 ASSERT(mc->mc_alloc == 0); 289 ASSERT(mc->mc_deferred == 0); 290 ASSERT(mc->mc_space == 0); 291 ASSERT(mc->mc_dspace == 0); 292 293 kmem_free(mc, sizeof (metaslab_class_t)); 294} 295 296int 297metaslab_class_validate(metaslab_class_t *mc) 298{ 299 metaslab_group_t *mg; 300 vdev_t *vd; 301 302 /* 303 * Must hold one of the spa_config locks.
304 */ 305 ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || 306 spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); 307 308 if ((mg = mc->mc_rotor) == NULL) 309 return (0); 310 311 do { 312 vd = mg->mg_vd; 313 ASSERT(vd->vdev_mg != NULL); 314 ASSERT3P(vd->vdev_top, ==, vd); 315 ASSERT3P(mg->mg_class, ==, mc); 316 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); 317 } while ((mg = mg->mg_next) != mc->mc_rotor); 318 319 return (0); 320} 321 322void 323metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, 324 int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) 325{ 326 atomic_add_64(&mc->mc_alloc, alloc_delta); 327 atomic_add_64(&mc->mc_deferred, defer_delta); 328 atomic_add_64(&mc->mc_space, space_delta); 329 atomic_add_64(&mc->mc_dspace, dspace_delta); 330} 331 332void 333metaslab_class_minblocksize_update(metaslab_class_t *mc) 334{ 335 metaslab_group_t *mg; 336 vdev_t *vd; 337 uint64_t minashift = UINT64_MAX; 338 339 if ((mg = mc->mc_rotor) == NULL) { 340 mc->mc_minblocksize = SPA_MINBLOCKSIZE; 341 return; 342 } 343 344 do { 345 vd = mg->mg_vd; 346 if (vd->vdev_ashift < minashift) 347 minashift = vd->vdev_ashift; 348 } while ((mg = mg->mg_next) != mc->mc_rotor); 349 350 mc->mc_minblocksize = 1ULL << minashift; 351} 352 353uint64_t 354metaslab_class_get_alloc(metaslab_class_t *mc) 355{ 356 return (mc->mc_alloc); 357} 358 359uint64_t 360metaslab_class_get_deferred(metaslab_class_t *mc) 361{ 362 return (mc->mc_deferred); 363} 364 365uint64_t 366metaslab_class_get_space(metaslab_class_t *mc) 367{ 368 return (mc->mc_space); 369} 370 371uint64_t 372metaslab_class_get_dspace(metaslab_class_t *mc) 373{ 374 return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); 375} 376 377uint64_t 378metaslab_class_get_minblocksize(metaslab_class_t *mc) 379{ 380 return (mc->mc_minblocksize); 381} 382 383void 384metaslab_class_histogram_verify(metaslab_class_t *mc) 385{ 386 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 387 uint64_t *mc_hist; 388 int i; 389 390 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) 391 return; 392 393 mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, 394 KM_SLEEP); 395 396 for (int c = 0; c < rvd->vdev_children; c++) { 397 vdev_t *tvd = rvd->vdev_child[c]; 398 metaslab_group_t *mg = tvd->vdev_mg; 399 400 /* 401 * Skip any holes, uninitialized top-levels, or 402 * vdevs that are not in this metalab class. 403 */ 404 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || 405 mg->mg_class != mc) { 406 continue; 407 } 408 409 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 410 mc_hist[i] += mg->mg_histogram[i]; 411 } 412 413 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 414 VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]); 415 416 kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); 417} 418 419/* 420 * Calculate the metaslab class's fragmentation metric. The metric 421 * is weighted based on the space contribution of each metaslab group. 422 * The return value will be a number between 0 and 100 (inclusive), or 423 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the 424 * zfs_frag_table for more information about the metric. 
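 *
 * For example (illustrative figures only): if one metaslab group
 * contributes 75% of the class's space at fragmentation 20 and another
 * contributes the remaining 25% at fragmentation 60, the class metric is
 * (20 * 0.75) + (60 * 0.25) = 30.  If any group in the class still reports
 * ZFS_FRAG_INVALID, the class metric is ZFS_FRAG_INVALID as well.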
425 */ 426uint64_t 427metaslab_class_fragmentation(metaslab_class_t *mc) 428{ 429 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 430 uint64_t fragmentation = 0; 431 432 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); 433 434 for (int c = 0; c < rvd->vdev_children; c++) { 435 vdev_t *tvd = rvd->vdev_child[c]; 436 metaslab_group_t *mg = tvd->vdev_mg; 437 438 /* 439 * Skip any holes, uninitialized top-levels, or 440 * vdevs that are not in this metaslab class. 441 */ 442 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || 443 mg->mg_class != mc) { 444 continue; 445 } 446 447 /* 448 * If a metaslab group does not contain a fragmentation 449 * metric then just bail out. 450 */ 451 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { 452 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 453 return (ZFS_FRAG_INVALID); 454 } 455 456 /* 457 * Determine how much this metaslab_group is contributing 458 * to the overall pool fragmentation metric. 459 */ 460 fragmentation += mg->mg_fragmentation * 461 metaslab_group_get_space(mg); 462 } 463 fragmentation /= metaslab_class_get_space(mc); 464 465 ASSERT3U(fragmentation, <=, 100); 466 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 467 return (fragmentation); 468} 469 470/* 471 * Calculate the amount of expandable space that is available in 472 * this metaslab class. If a device is expanded then its expandable 473 * space will be the amount of allocatable space that is currently not 474 * part of this metaslab class. 475 */ 476uint64_t 477metaslab_class_expandable_space(metaslab_class_t *mc) 478{ 479 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 480 uint64_t space = 0; 481 482 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); 483 for (int c = 0; c < rvd->vdev_children; c++) { 484 vdev_t *tvd = rvd->vdev_child[c]; 485 metaslab_group_t *mg = tvd->vdev_mg; 486 487 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || 488 mg->mg_class != mc) { 489 continue; 490 } 491 492 space += tvd->vdev_max_asize - tvd->vdev_asize; 493 } 494 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 495 return (space); 496} 497 498/* 499 * ========================================================================== 500 * Metaslab groups 501 * ========================================================================== 502 */ 503static int 504metaslab_compare(const void *x1, const void *x2) 505{ 506 const metaslab_t *m1 = x1; 507 const metaslab_t *m2 = x2; 508 509 if (m1->ms_weight < m2->ms_weight) 510 return (1); 511 if (m1->ms_weight > m2->ms_weight) 512 return (-1); 513 514 /* 515 * If the weights are identical, use the offset to force uniqueness. 516 */ 517 if (m1->ms_start < m2->ms_start) 518 return (-1); 519 if (m1->ms_start > m2->ms_start) 520 return (1); 521 522 ASSERT3P(m1, ==, m2); 523 524 return (0); 525} 526 527/* 528 * Update the allocatable flag and the metaslab group's capacity. 529 * The allocatable flag is set to true if the group's free capacity 530 * is above the zfs_mg_noalloc_threshold. If a metaslab group transitions 531 * from allocatable to non-allocatable or vice versa then the metaslab 532 * group's class is updated to reflect the transition.
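 *
 * For example (illustrative figures only): a group whose vdev reports
 * vs_space = 1000GB and vs_alloc = 950GB has a free capacity of about 5
 * percent.  With the default zfs_mg_noalloc_threshold of 0 it remains
 * allocatable as long as its fragmentation metric is either invalid or
 * does not exceed zfs_mg_fragmentation_threshold (85 by default).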
533 */ 534static void 535metaslab_group_alloc_update(metaslab_group_t *mg) 536{ 537 vdev_t *vd = mg->mg_vd; 538 metaslab_class_t *mc = mg->mg_class; 539 vdev_stat_t *vs = &vd->vdev_stat; 540 boolean_t was_allocatable; 541 542 ASSERT(vd == vd->vdev_top); 543 544 mutex_enter(&mg->mg_lock); 545 was_allocatable = mg->mg_allocatable; 546 547 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / 548 (vs->vs_space + 1); 549 550 /* 551 * A metaslab group is considered allocatable if it has plenty 552 * of free space or is not heavily fragmented. We only take 553 * fragmentation into account if the metaslab group has a valid 554 * fragmentation metric (i.e. a value between 0 and 100). 555 */ 556 mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold && 557 (mg->mg_fragmentation == ZFS_FRAG_INVALID || 558 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); 559 560 /* 561 * The mc_alloc_groups maintains a count of the number of 562 * groups in this metaslab class that are still above the 563 * zfs_mg_noalloc_threshold. This is used by the allocating 564 * threads to determine if they should avoid allocations to 565 * a given group. The allocator will avoid allocations to a group 566 * if that group has reached or is below the zfs_mg_noalloc_threshold 567 * and there are still other groups that are above the threshold. 568 * When a group transitions from allocatable to non-allocatable or 569 * vice versa we update the metaslab class to reflect that change. 570 * When the mc_alloc_groups value drops to 0 that means that all 571 * groups have reached the zfs_mg_noalloc_threshold making all groups 572 * eligible for allocations. This effectively means that all devices 573 * are balanced again. 574 */ 575 if (was_allocatable && !mg->mg_allocatable) 576 mc->mc_alloc_groups--; 577 else if (!was_allocatable && mg->mg_allocatable) 578 mc->mc_alloc_groups++; 579 580 mutex_exit(&mg->mg_lock); 581} 582 583metaslab_group_t * 584metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) 585{ 586 metaslab_group_t *mg; 587 588 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); 589 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); 590 avl_create(&mg->mg_metaslab_tree, metaslab_compare, 591 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); 592 mg->mg_vd = vd; 593 mg->mg_class = mc; 594 mg->mg_activation_count = 0; 595 596 mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, 597 minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT); 598 599 return (mg); 600} 601 602void 603metaslab_group_destroy(metaslab_group_t *mg) 604{ 605 ASSERT(mg->mg_prev == NULL); 606 ASSERT(mg->mg_next == NULL); 607 /* 608 * We may have gone below zero with the activation count 609 * either because we never activated in the first place or 610 * because we're done, and possibly removing the vdev. 
611 */ 612 ASSERT(mg->mg_activation_count <= 0); 613 614 taskq_destroy(mg->mg_taskq); 615 avl_destroy(&mg->mg_metaslab_tree); 616 mutex_destroy(&mg->mg_lock); 617 kmem_free(mg, sizeof (metaslab_group_t)); 618} 619 620void 621metaslab_group_activate(metaslab_group_t *mg) 622{ 623 metaslab_class_t *mc = mg->mg_class; 624 metaslab_group_t *mgprev, *mgnext; 625 626 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); 627 628 ASSERT(mc->mc_rotor != mg); 629 ASSERT(mg->mg_prev == NULL); 630 ASSERT(mg->mg_next == NULL); 631 ASSERT(mg->mg_activation_count <= 0); 632 633 if (++mg->mg_activation_count <= 0) 634 return; 635 636 mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); 637 metaslab_group_alloc_update(mg); 638 639 if ((mgprev = mc->mc_rotor) == NULL) { 640 mg->mg_prev = mg; 641 mg->mg_next = mg; 642 } else { 643 mgnext = mgprev->mg_next; 644 mg->mg_prev = mgprev; 645 mg->mg_next = mgnext; 646 mgprev->mg_next = mg; 647 mgnext->mg_prev = mg; 648 } 649 mc->mc_rotor = mg; 650 metaslab_class_minblocksize_update(mc); 651} 652 653void 654metaslab_group_passivate(metaslab_group_t *mg) 655{ 656 metaslab_class_t *mc = mg->mg_class; 657 metaslab_group_t *mgprev, *mgnext; 658 659 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); 660 661 if (--mg->mg_activation_count != 0) { 662 ASSERT(mc->mc_rotor != mg); 663 ASSERT(mg->mg_prev == NULL); 664 ASSERT(mg->mg_next == NULL); 665 ASSERT(mg->mg_activation_count < 0); 666 return; 667 } 668 669 taskq_wait(mg->mg_taskq); 670 metaslab_group_alloc_update(mg); 671 672 mgprev = mg->mg_prev; 673 mgnext = mg->mg_next; 674 675 if (mg == mgnext) { 676 mc->mc_rotor = NULL; 677 } else { 678 mc->mc_rotor = mgnext; 679 mgprev->mg_next = mgnext; 680 mgnext->mg_prev = mgprev; 681 } 682 683 mg->mg_prev = NULL; 684 mg->mg_next = NULL; 685 metaslab_class_minblocksize_update(mc); 686} 687 688uint64_t 689metaslab_group_get_space(metaslab_group_t *mg) 690{ 691 return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count); 692} 693 694void 695metaslab_group_histogram_verify(metaslab_group_t *mg) 696{ 697 uint64_t *mg_hist; 698 vdev_t *vd = mg->mg_vd; 699 uint64_t ashift = vd->vdev_ashift; 700 int i; 701 702 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) 703 return; 704 705 mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, 706 KM_SLEEP); 707 708 ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=, 709 SPACE_MAP_HISTOGRAM_SIZE + ashift); 710 711 for (int m = 0; m < vd->vdev_ms_count; m++) { 712 metaslab_t *msp = vd->vdev_ms[m]; 713 714 if (msp->ms_sm == NULL) 715 continue; 716 717 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) 718 mg_hist[i + ashift] += 719 msp->ms_sm->sm_phys->smp_histogram[i]; 720 } 721 722 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++) 723 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); 724 725 kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); 726} 727 728static void 729metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp) 730{ 731 metaslab_class_t *mc = mg->mg_class; 732 uint64_t ashift = mg->mg_vd->vdev_ashift; 733 734 ASSERT(MUTEX_HELD(&msp->ms_lock)); 735 if (msp->ms_sm == NULL) 736 return; 737 738 mutex_enter(&mg->mg_lock); 739 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 740 mg->mg_histogram[i + ashift] += 741 msp->ms_sm->sm_phys->smp_histogram[i]; 742 mc->mc_histogram[i + ashift] += 743 msp->ms_sm->sm_phys->smp_histogram[i]; 744 } 745 mutex_exit(&mg->mg_lock); 746} 747 748void 749metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) 750{ 751 
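	/*
	 * Subtract this metaslab's space map histogram from its group's
	 * and class's histograms; this is the inverse of
	 * metaslab_group_histogram_add() above.
	 */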
metaslab_class_t *mc = mg->mg_class; 752 uint64_t ashift = mg->mg_vd->vdev_ashift; 753 754 ASSERT(MUTEX_HELD(&msp->ms_lock)); 755 if (msp->ms_sm == NULL) 756 return; 757 758 mutex_enter(&mg->mg_lock); 759 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 760 ASSERT3U(mg->mg_histogram[i + ashift], >=, 761 msp->ms_sm->sm_phys->smp_histogram[i]); 762 ASSERT3U(mc->mc_histogram[i + ashift], >=, 763 msp->ms_sm->sm_phys->smp_histogram[i]); 764 765 mg->mg_histogram[i + ashift] -= 766 msp->ms_sm->sm_phys->smp_histogram[i]; 767 mc->mc_histogram[i + ashift] -= 768 msp->ms_sm->sm_phys->smp_histogram[i]; 769 } 770 mutex_exit(&mg->mg_lock); 771} 772 773static void 774metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) 775{ 776 ASSERT(msp->ms_group == NULL); 777 mutex_enter(&mg->mg_lock); 778 msp->ms_group = mg; 779 msp->ms_weight = 0; 780 avl_add(&mg->mg_metaslab_tree, msp); 781 mutex_exit(&mg->mg_lock); 782 783 mutex_enter(&msp->ms_lock); 784 metaslab_group_histogram_add(mg, msp); 785 mutex_exit(&msp->ms_lock); 786} 787 788static void 789metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) 790{ 791 mutex_enter(&msp->ms_lock); 792 metaslab_group_histogram_remove(mg, msp); 793 mutex_exit(&msp->ms_lock); 794 795 mutex_enter(&mg->mg_lock); 796 ASSERT(msp->ms_group == mg); 797 avl_remove(&mg->mg_metaslab_tree, msp); 798 msp->ms_group = NULL; 799 mutex_exit(&mg->mg_lock); 800} 801 802static void 803metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) 804{ 805 /* 806 * Although in principle the weight can be any value, in 807 * practice we do not use values in the range [1, 511]. 808 */ 809 ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0); 810 ASSERT(MUTEX_HELD(&msp->ms_lock)); 811 812 mutex_enter(&mg->mg_lock); 813 ASSERT(msp->ms_group == mg); 814 avl_remove(&mg->mg_metaslab_tree, msp); 815 msp->ms_weight = weight; 816 avl_add(&mg->mg_metaslab_tree, msp); 817 mutex_exit(&mg->mg_lock); 818} 819 820/* 821 * Calculate the fragmentation for a given metaslab group. We can use 822 * a simple average here since all metaslabs within the group must have 823 * the same size. The return value will be a value between 0 and 100 824 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this 825 * group have a fragmentation metric. 826 */ 827uint64_t 828metaslab_group_fragmentation(metaslab_group_t *mg) 829{ 830 vdev_t *vd = mg->mg_vd; 831 uint64_t fragmentation = 0; 832 uint64_t valid_ms = 0; 833 834 for (int m = 0; m < vd->vdev_ms_count; m++) { 835 metaslab_t *msp = vd->vdev_ms[m]; 836 837 if (msp->ms_fragmentation == ZFS_FRAG_INVALID) 838 continue; 839 840 valid_ms++; 841 fragmentation += msp->ms_fragmentation; 842 } 843 844 if (valid_ms <= vd->vdev_ms_count / 2) 845 return (ZFS_FRAG_INVALID); 846 847 fragmentation /= valid_ms; 848 ASSERT3U(fragmentation, <=, 100); 849 return (fragmentation); 850} 851 852/* 853 * Determine if a given metaslab group should skip allocations. A metaslab 854 * group should avoid allocations if its free capacity is less than the 855 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than 856 * zfs_mg_fragmentation_threshold and there is at least one metaslab group 857 * that can still handle allocations. 
858 */ 859static boolean_t 860metaslab_group_allocatable(metaslab_group_t *mg) 861{ 862 vdev_t *vd = mg->mg_vd; 863 spa_t *spa = vd->vdev_spa; 864 metaslab_class_t *mc = mg->mg_class; 865 866 /* 867 * We use two key metrics to determine if a metaslab group is 868 * considered allocatable -- free space and fragmentation. If 869 * the free space is greater than the free space threshold and 870 * the fragmentation is less than the fragmentation threshold then 871 * consider the group allocatable. There are two case when we will 872 * not consider these key metrics. The first is if the group is 873 * associated with a slog device and the second is if all groups 874 * in this metaslab class have already been consider ineligible 875 * for allocations. 876 */ 877 return ((mg->mg_free_capacity > zfs_mg_noalloc_threshold && 878 (mg->mg_fragmentation == ZFS_FRAG_INVALID || 879 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)) || 880 mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0); 881} 882 883/* 884 * ========================================================================== 885 * Range tree callbacks 886 * ========================================================================== 887 */ 888 889/* 890 * Comparison function for the private size-ordered tree. Tree is sorted 891 * by size, larger sizes at the end of the tree. 892 */ 893static int 894metaslab_rangesize_compare(const void *x1, const void *x2) 895{ 896 const range_seg_t *r1 = x1; 897 const range_seg_t *r2 = x2; 898 uint64_t rs_size1 = r1->rs_end - r1->rs_start; 899 uint64_t rs_size2 = r2->rs_end - r2->rs_start; 900 901 if (rs_size1 < rs_size2) 902 return (-1); 903 if (rs_size1 > rs_size2) 904 return (1); 905 906 if (r1->rs_start < r2->rs_start) 907 return (-1); 908 909 if (r1->rs_start > r2->rs_start) 910 return (1); 911 912 return (0); 913} 914 915/* 916 * Create any block allocator specific components. The current allocators 917 * rely on using both a size-ordered range_tree_t and an array of uint64_t's. 918 */ 919static void 920metaslab_rt_create(range_tree_t *rt, void *arg) 921{ 922 metaslab_t *msp = arg; 923 924 ASSERT3P(rt->rt_arg, ==, msp); 925 ASSERT(msp->ms_tree == NULL); 926 927 avl_create(&msp->ms_size_tree, metaslab_rangesize_compare, 928 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); 929} 930 931/* 932 * Destroy the block allocator specific components. 933 */ 934static void 935metaslab_rt_destroy(range_tree_t *rt, void *arg) 936{ 937 metaslab_t *msp = arg; 938 939 ASSERT3P(rt->rt_arg, ==, msp); 940 ASSERT3P(msp->ms_tree, ==, rt); 941 ASSERT0(avl_numnodes(&msp->ms_size_tree)); 942 943 avl_destroy(&msp->ms_size_tree); 944} 945 946static void 947metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg) 948{ 949 metaslab_t *msp = arg; 950 951 ASSERT3P(rt->rt_arg, ==, msp); 952 ASSERT3P(msp->ms_tree, ==, rt); 953 VERIFY(!msp->ms_condensing); 954 avl_add(&msp->ms_size_tree, rs); 955} 956 957static void 958metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg) 959{ 960 metaslab_t *msp = arg; 961 962 ASSERT3P(rt->rt_arg, ==, msp); 963 ASSERT3P(msp->ms_tree, ==, rt); 964 VERIFY(!msp->ms_condensing); 965 avl_remove(&msp->ms_size_tree, rs); 966} 967 968static void 969metaslab_rt_vacate(range_tree_t *rt, void *arg) 970{ 971 metaslab_t *msp = arg; 972 973 ASSERT3P(rt->rt_arg, ==, msp); 974 ASSERT3P(msp->ms_tree, ==, rt); 975 976 /* 977 * Normally one would walk the tree freeing nodes along the way. 
978 * Since the nodes are shared with the range trees we can avoid 979 * walking all nodes and just reinitialize the avl tree. The nodes 980 * will be freed by the range tree, so we don't want to free them here. 981 */ 982 avl_create(&msp->ms_size_tree, metaslab_rangesize_compare, 983 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); 984} 985 986static range_tree_ops_t metaslab_rt_ops = { 987 metaslab_rt_create, 988 metaslab_rt_destroy, 989 metaslab_rt_add, 990 metaslab_rt_remove, 991 metaslab_rt_vacate 992}; 993 994/* 995 * ========================================================================== 996 * Metaslab block operations 997 * ========================================================================== 998 */ 999 1000/* 1001 * Return the maximum contiguous segment within the metaslab. 1002 */ 1003uint64_t 1004metaslab_block_maxsize(metaslab_t *msp) 1005{ 1006 avl_tree_t *t = &msp->ms_size_tree; 1007 range_seg_t *rs; 1008 1009 if (t == NULL || (rs = avl_last(t)) == NULL) 1010 return (0ULL); 1011 1012 return (rs->rs_end - rs->rs_start); 1013} 1014 1015uint64_t 1016metaslab_block_alloc(metaslab_t *msp, uint64_t size) 1017{ 1018 uint64_t start; 1019 range_tree_t *rt = msp->ms_tree; 1020 1021 VERIFY(!msp->ms_condensing); 1022 1023 start = msp->ms_ops->msop_alloc(msp, size); 1024 if (start != -1ULL) { 1025 vdev_t *vd = msp->ms_group->mg_vd; 1026 1027 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); 1028 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 1029 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); 1030 range_tree_remove(rt, start, size); 1031 } 1032 return (start); 1033} 1034 1035/* 1036 * ========================================================================== 1037 * Common allocator routines 1038 * ========================================================================== 1039 */ 1040 1041/* 1042 * This is a helper function that can be used by the allocator to find 1043 * a suitable block to allocate. This will search the specified AVL 1044 * tree looking for a block that matches the specified criteria. 1045 */ 1046static uint64_t 1047metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, 1048 uint64_t align) 1049{ 1050 range_seg_t *rs, rsearch; 1051 avl_index_t where; 1052 1053 rsearch.rs_start = *cursor; 1054 rsearch.rs_end = *cursor + size; 1055 1056 rs = avl_find(t, &rsearch, &where); 1057 if (rs == NULL) 1058 rs = avl_nearest(t, where, AVL_AFTER); 1059 1060 while (rs != NULL) { 1061 uint64_t offset = P2ROUNDUP(rs->rs_start, align); 1062 1063 if (offset + size <= rs->rs_end) { 1064 *cursor = offset + size; 1065 return (offset); 1066 } 1067 rs = AVL_NEXT(t, rs); 1068 } 1069 1070 /* 1071 * If we know we've searched the whole map (*cursor == 0), give up. 1072 * Otherwise, reset the cursor to the beginning and try again. 1073 */ 1074 if (*cursor == 0) 1075 return (-1ULL); 1076 1077 *cursor = 0; 1078 return (metaslab_block_picker(t, cursor, size, align)); 1079} 1080 1081/* 1082 * ========================================================================== 1083 * The first-fit block allocator 1084 * ========================================================================== 1085 */ 1086static uint64_t 1087metaslab_ff_alloc(metaslab_t *msp, uint64_t size) 1088{ 1089 /* 1090 * Find the largest power of 2 block size that evenly divides the 1091 * requested size. This is used to try to allocate blocks with similar 1092 * alignment from the same area of the metaslab (i.e. 
same cursor 1093 * bucket) but it does not guarantee that other allocations sizes 1094 * may exist in the same region. 1095 */ 1096 uint64_t align = size & -size; 1097 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; 1098 avl_tree_t *t = &msp->ms_tree->rt_root; 1099 1100 return (metaslab_block_picker(t, cursor, size, align)); 1101} 1102 1103static metaslab_ops_t metaslab_ff_ops = { 1104 metaslab_ff_alloc 1105}; 1106 1107/* 1108 * ========================================================================== 1109 * Dynamic block allocator - 1110 * Uses the first fit allocation scheme until space get low and then 1111 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold 1112 * and metaslab_df_free_pct to determine when to switch the allocation scheme. 1113 * ========================================================================== 1114 */ 1115static uint64_t 1116metaslab_df_alloc(metaslab_t *msp, uint64_t size) 1117{ 1118 /* 1119 * Find the largest power of 2 block size that evenly divides the 1120 * requested size. This is used to try to allocate blocks with similar 1121 * alignment from the same area of the metaslab (i.e. same cursor 1122 * bucket) but it does not guarantee that other allocations sizes 1123 * may exist in the same region. 1124 */ 1125 uint64_t align = size & -size; 1126 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; 1127 range_tree_t *rt = msp->ms_tree; 1128 avl_tree_t *t = &rt->rt_root; 1129 uint64_t max_size = metaslab_block_maxsize(msp); 1130 int free_pct = range_tree_space(rt) * 100 / msp->ms_size; 1131 1132 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1133 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree)); 1134 1135 if (max_size < size) 1136 return (-1ULL); 1137 1138 /* 1139 * If we're running low on space switch to using the size 1140 * sorted AVL tree (best-fit). 1141 */ 1142 if (max_size < metaslab_df_alloc_threshold || 1143 free_pct < metaslab_df_free_pct) { 1144 t = &msp->ms_size_tree; 1145 *cursor = 0; 1146 } 1147 1148 return (metaslab_block_picker(t, cursor, size, 1ULL)); 1149} 1150 1151static metaslab_ops_t metaslab_df_ops = { 1152 metaslab_df_alloc 1153}; 1154 1155/* 1156 * ========================================================================== 1157 * Cursor fit block allocator - 1158 * Select the largest region in the metaslab, set the cursor to the beginning 1159 * of the range and the cursor_end to the end of the range. As allocations 1160 * are made advance the cursor. Continue allocating from the cursor until 1161 * the range is exhausted and then find a new range. 
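 *
 * For example (illustrative figures only): if the largest free segment is
 * [10MB, 14MB), the cursor is set to 10MB and cursor_end to 14MB; each
 * allocation then simply advances the cursor, and only once a request no
 * longer fits before cursor_end is a new largest segment taken from
 * ms_size_tree.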
1162 * ========================================================================== 1163 */ 1164static uint64_t 1165metaslab_cf_alloc(metaslab_t *msp, uint64_t size) 1166{ 1167 range_tree_t *rt = msp->ms_tree; 1168 avl_tree_t *t = &msp->ms_size_tree; 1169 uint64_t *cursor = &msp->ms_lbas[0]; 1170 uint64_t *cursor_end = &msp->ms_lbas[1]; 1171 uint64_t offset = 0; 1172 1173 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1174 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root)); 1175 1176 ASSERT3U(*cursor_end, >=, *cursor); 1177 1178 if ((*cursor + size) > *cursor_end) { 1179 range_seg_t *rs; 1180 1181 rs = avl_last(&msp->ms_size_tree); 1182 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) 1183 return (-1ULL); 1184 1185 *cursor = rs->rs_start; 1186 *cursor_end = rs->rs_end; 1187 } 1188 1189 offset = *cursor; 1190 *cursor += size; 1191 1192 return (offset); 1193} 1194 1195static metaslab_ops_t metaslab_cf_ops = { 1196 metaslab_cf_alloc 1197}; 1198 1199/* 1200 * ========================================================================== 1201 * New dynamic fit allocator - 1202 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift 1203 * contiguous blocks. If no region is found then just use the largest segment 1204 * that remains. 1205 * ========================================================================== 1206 */ 1207 1208/* 1209 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift) 1210 * to request from the allocator. 1211 */ 1212uint64_t metaslab_ndf_clump_shift = 4; 1213 1214static uint64_t 1215metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) 1216{ 1217 avl_tree_t *t = &msp->ms_tree->rt_root; 1218 avl_index_t where; 1219 range_seg_t *rs, rsearch; 1220 uint64_t hbit = highbit64(size); 1221 uint64_t *cursor = &msp->ms_lbas[hbit - 1]; 1222 uint64_t max_size = metaslab_block_maxsize(msp); 1223 1224 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1225 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree)); 1226 1227 if (max_size < size) 1228 return (-1ULL); 1229 1230 rsearch.rs_start = *cursor; 1231 rsearch.rs_end = *cursor + size; 1232 1233 rs = avl_find(t, &rsearch, &where); 1234 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) { 1235 t = &msp->ms_size_tree; 1236 1237 rsearch.rs_start = 0; 1238 rsearch.rs_end = MIN(max_size, 1239 1ULL << (hbit + metaslab_ndf_clump_shift)); 1240 rs = avl_find(t, &rsearch, &where); 1241 if (rs == NULL) 1242 rs = avl_nearest(t, where, AVL_AFTER); 1243 ASSERT(rs != NULL); 1244 } 1245 1246 if ((rs->rs_end - rs->rs_start) >= size) { 1247 *cursor = rs->rs_start + size; 1248 return (rs->rs_start); 1249 } 1250 return (-1ULL); 1251} 1252 1253static metaslab_ops_t metaslab_ndf_ops = { 1254 metaslab_ndf_alloc 1255}; 1256 1257metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; 1258 1259/* 1260 * ========================================================================== 1261 * Metaslabs 1262 * ========================================================================== 1263 */ 1264 1265/* 1266 * Wait for any in-progress metaslab loads to complete. 
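 *
 * Callers hold ms_lock; the typical pattern (see metaslab_activate() and
 * metaslab_preload() below) is:
 *
 *	metaslab_load_wait(msp);
 *	if (!msp->ms_loaded)
 *		error = metaslab_load(msp);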
1267 */ 1268void 1269metaslab_load_wait(metaslab_t *msp) 1270{ 1271 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1272 1273 while (msp->ms_loading) { 1274 ASSERT(!msp->ms_loaded); 1275 cv_wait(&msp->ms_load_cv, &msp->ms_lock); 1276 } 1277} 1278 1279int 1280metaslab_load(metaslab_t *msp) 1281{ 1282 int error = 0; 1283 1284 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1285 ASSERT(!msp->ms_loaded); 1286 ASSERT(!msp->ms_loading); 1287 1288 msp->ms_loading = B_TRUE; 1289 1290 /* 1291 * If the space map has not been allocated yet, then treat 1292 * all the space in the metaslab as free and add it to the 1293 * ms_tree. 1294 */ 1295 if (msp->ms_sm != NULL) 1296 error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE); 1297 else 1298 range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size); 1299 1300 msp->ms_loaded = (error == 0); 1301 msp->ms_loading = B_FALSE; 1302 1303 if (msp->ms_loaded) { 1304 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1305 range_tree_walk(msp->ms_defertree[t], 1306 range_tree_remove, msp->ms_tree); 1307 } 1308 } 1309 cv_broadcast(&msp->ms_load_cv); 1310 return (error); 1311} 1312 1313void 1314metaslab_unload(metaslab_t *msp) 1315{ 1316 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1317 range_tree_vacate(msp->ms_tree, NULL, NULL); 1318 msp->ms_loaded = B_FALSE; 1319 msp->ms_weight &= ~METASLAB_ACTIVE_MASK; 1320} 1321 1322int 1323metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, 1324 metaslab_t **msp) 1325{ 1326 vdev_t *vd = mg->mg_vd; 1327 objset_t *mos = vd->vdev_spa->spa_meta_objset; 1328 metaslab_t *ms; 1329 int error; 1330 1331 ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); 1332 mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); 1333 cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); 1334 ms->ms_id = id; 1335 ms->ms_start = id << vd->vdev_ms_shift; 1336 ms->ms_size = 1ULL << vd->vdev_ms_shift; 1337 1338 /* 1339 * We only open space map objects that already exist. All others 1340 * will be opened when we finally allocate an object for it. 1341 */ 1342 if (object != 0) { 1343 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, 1344 ms->ms_size, vd->vdev_ashift, &ms->ms_lock); 1345 1346 if (error != 0) { 1347 kmem_free(ms, sizeof (metaslab_t)); 1348 return (error); 1349 } 1350 1351 ASSERT(ms->ms_sm != NULL); 1352 } 1353 1354 /* 1355 * We create the main range tree here, but we don't create the 1356 * alloctree and freetree until metaslab_sync_done(). This serves 1357 * two purposes: it allows metaslab_sync_done() to detect the 1358 * addition of new space; and for debugging, it ensures that we'd 1359 * data fault on any attempt to use this metaslab before it's ready. 1360 */ 1361 ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms, &ms->ms_lock); 1362 metaslab_group_add(mg, ms); 1363 1364 ms->ms_fragmentation = metaslab_fragmentation(ms); 1365 ms->ms_ops = mg->mg_class->mc_ops; 1366 1367 /* 1368 * If we're opening an existing pool (txg == 0) or creating 1369 * a new one (txg == TXG_INITIAL), all space is available now. 1370 * If we're adding space to an existing pool, the new space 1371 * does not become available until after this txg has synced. 1372 */ 1373 if (txg <= TXG_INITIAL) 1374 metaslab_sync_done(ms, 0); 1375 1376 /* 1377 * If metaslab_debug_load is set and we're initializing a metaslab 1378 * that has an allocated space_map object then load the its space 1379 * map so that can verify frees. 
1380 */ 1381 if (metaslab_debug_load && ms->ms_sm != NULL) { 1382 mutex_enter(&ms->ms_lock); 1383 VERIFY0(metaslab_load(ms)); 1384 mutex_exit(&ms->ms_lock); 1385 } 1386 1387 if (txg != 0) { 1388 vdev_dirty(vd, 0, NULL, txg); 1389 vdev_dirty(vd, VDD_METASLAB, ms, txg); 1390 } 1391 1392 *msp = ms; 1393 1394 return (0); 1395} 1396 1397void 1398metaslab_fini(metaslab_t *msp) 1399{ 1400 metaslab_group_t *mg = msp->ms_group; 1401 1402 metaslab_group_remove(mg, msp); 1403 1404 mutex_enter(&msp->ms_lock); 1405 1406 VERIFY(msp->ms_group == NULL); 1407 vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm), 1408 0, -msp->ms_size); 1409 space_map_close(msp->ms_sm); 1410 1411 metaslab_unload(msp); 1412 range_tree_destroy(msp->ms_tree); 1413 1414 for (int t = 0; t < TXG_SIZE; t++) { 1415 range_tree_destroy(msp->ms_alloctree[t]); 1416 range_tree_destroy(msp->ms_freetree[t]); 1417 } 1418 1419 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1420 range_tree_destroy(msp->ms_defertree[t]); 1421 } 1422 1423 ASSERT0(msp->ms_deferspace); 1424 1425 mutex_exit(&msp->ms_lock); 1426 cv_destroy(&msp->ms_load_cv); 1427 mutex_destroy(&msp->ms_lock); 1428 1429 kmem_free(msp, sizeof (metaslab_t)); 1430} 1431 1432#define FRAGMENTATION_TABLE_SIZE 17 1433 1434/* 1435 * This table defines a segment size based fragmentation metric that will 1436 * allow each metaslab to derive its own fragmentation value. This is done 1437 * by calculating the space in each bucket of the spacemap histogram and 1438 * multiplying that by the fragmetation metric in this table. Doing 1439 * this for all buckets and dividing it by the total amount of free 1440 * space in this metaslab (i.e. the total free space in all buckets) gives 1441 * us the fragmentation metric. This means that a high fragmentation metric 1442 * equates to most of the free space being comprised of small segments. 1443 * Conversely, if the metric is low, then most of the free space is in 1444 * large segments. A 10% change in fragmentation equates to approximately 1445 * double the number of segments. 1446 * 1447 * This table defines 0% fragmented space using 16MB segments. Testing has 1448 * shown that segments that are greater than or equal to 16MB do not suffer 1449 * from drastic performance problems. Using this value, we derive the rest 1450 * of the table. Since the fragmentation value is never stored on disk, it 1451 * is possible to change these calculations in the future. 1452 */ 1453int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { 1454 100, /* 512B */ 1455 100, /* 1K */ 1456 98, /* 2K */ 1457 95, /* 4K */ 1458 90, /* 8K */ 1459 80, /* 16K */ 1460 70, /* 32K */ 1461 60, /* 64K */ 1462 50, /* 128K */ 1463 40, /* 256K */ 1464 30, /* 512K */ 1465 20, /* 1M */ 1466 15, /* 2M */ 1467 10, /* 4M */ 1468 5, /* 8M */ 1469 0 /* 16M */ 1470}; 1471 1472/* 1473 * Calclate the metaslab's fragmentation metric. A return value 1474 * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does 1475 * not support this metric. Otherwise, the return value should be in the 1476 * range [0, 100]. 1477 */ 1478static uint64_t 1479metaslab_fragmentation(metaslab_t *msp) 1480{ 1481 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1482 uint64_t fragmentation = 0; 1483 uint64_t total = 0; 1484 boolean_t feature_enabled = spa_feature_is_enabled(spa, 1485 SPA_FEATURE_SPACEMAP_HISTOGRAM); 1486 1487 if (!feature_enabled) 1488 return (ZFS_FRAG_INVALID); 1489 1490 /* 1491 * A null space map means that the entire metaslab is free 1492 * and thus is not fragmented. 
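 *
 * Otherwise the metric is a space-weighted average of the zfs_frag_table
 * factors for the space map histogram buckets.  For example (illustrative
 * figures only): a metaslab with 60% of its free space in ~1MB segments
 * (factor 20) and 40% in ~8KB segments (factor 90) reports
 * 0.6 * 20 + 0.4 * 90 = 48.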
1493 */ 1494 if (msp->ms_sm == NULL) 1495 return (0); 1496 1497 /* 1498 * If this metaslab's space_map has not been upgraded, flag it 1499 * so that we upgrade next time we encounter it. 1500 */ 1501 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { 1502 uint64_t txg = spa_syncing_txg(spa); 1503 vdev_t *vd = msp->ms_group->mg_vd; 1504 1505 if (spa_writeable(spa)) { 1506 msp->ms_condense_wanted = B_TRUE; 1507 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 1508 spa_dbgmsg(spa, "txg %llu, requesting force condense: " 1509 "msp %p, vd %p", txg, msp, vd); 1510 } 1511 return (ZFS_FRAG_INVALID); 1512 } 1513 1514 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 1515 uint64_t space = 0; 1516 uint8_t shift = msp->ms_sm->sm_shift; 1517 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, 1518 FRAGMENTATION_TABLE_SIZE - 1); 1519 1520 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) 1521 continue; 1522 1523 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); 1524 total += space; 1525 1526 ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE); 1527 fragmentation += space * zfs_frag_table[idx]; 1528 } 1529 1530 if (total > 0) 1531 fragmentation /= total; 1532 ASSERT3U(fragmentation, <=, 100); 1533 return (fragmentation); 1534} 1535 1536/* 1537 * Compute a weight -- a selection preference value -- for the given metaslab. 1538 * This is based on the amount of free space, the level of fragmentation, 1539 * the LBA range, and whether the metaslab is loaded. 1540 */ 1541static uint64_t 1542metaslab_weight(metaslab_t *msp) 1543{ 1544 metaslab_group_t *mg = msp->ms_group; 1545 vdev_t *vd = mg->mg_vd; 1546 uint64_t weight, space; 1547 1548 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1549 1550 /* 1551 * This vdev is in the process of being removed so there is nothing 1552 * for us to do here. 1553 */ 1554 if (vd->vdev_removing) { 1555 ASSERT0(space_map_allocated(msp->ms_sm)); 1556 ASSERT0(vd->vdev_ms_shift); 1557 return (0); 1558 } 1559 1560 /* 1561 * The baseline weight is the metaslab's free space. 1562 */ 1563 space = msp->ms_size - space_map_allocated(msp->ms_sm); 1564 1565 msp->ms_fragmentation = metaslab_fragmentation(msp); 1566 if (metaslab_fragmentation_factor_enabled && 1567 msp->ms_fragmentation != ZFS_FRAG_INVALID) { 1568 /* 1569 * Use the fragmentation information to inversely scale 1570 * down the baseline weight. We need to ensure that we 1571 * don't exclude this metaslab completely when it's 100% 1572 * fragmented. To avoid this we reduce the fragmented value 1573 * by 1. 1574 */ 1575 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100; 1576 1577 /* 1578 * If space < SPA_MINBLOCKSIZE, then we will not allocate from 1579 * this metaslab again. The fragmentation metric may have 1580 * decreased the space to something smaller than 1581 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE 1582 * so that we can consume any remaining space. 1583 */ 1584 if (space > 0 && space < SPA_MINBLOCKSIZE) 1585 space = SPA_MINBLOCKSIZE; 1586 } 1587 weight = space; 1588 1589 /* 1590 * Modern disks have uniform bit density and constant angular velocity. 1591 * Therefore, the outer recording zones are faster (higher bandwidth) 1592 * than the inner zones by the ratio of outer to inner track diameter, 1593 * which is typically around 2:1. We account for this by assigning 1594 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). 1595 * In effect, this means that we'll select the metaslab with the most 1596 * free bandwidth rather than simply the one with the most free space. 
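 *
 * For example, on a top-level vdev with 200 metaslabs the computation
 * below gives metaslab 0 twice its base weight, metaslab 100 roughly 1.5
 * times, and metaslab 199 essentially its base weight.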
1597 */ 1598 if (metaslab_lba_weighting_enabled) { 1599 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; 1600 ASSERT(weight >= space && weight <= 2 * space); 1601 } 1602 1603 /* 1604 * If this metaslab is one we're actively using, adjust its 1605 * weight to make it preferable to any inactive metaslab so 1606 * we'll polish it off. If the fragmentation on this metaslab 1607 * has exceed our threshold, then don't mark it active. 1608 */ 1609 if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID && 1610 msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) { 1611 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); 1612 } 1613 1614 return (weight); 1615} 1616 1617static int 1618metaslab_activate(metaslab_t *msp, uint64_t activation_weight) 1619{ 1620 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1621 1622 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { 1623 metaslab_load_wait(msp); 1624 if (!msp->ms_loaded) { 1625 int error = metaslab_load(msp); 1626 if (error) { 1627 metaslab_group_sort(msp->ms_group, msp, 0); 1628 return (error); 1629 } 1630 } 1631 1632 metaslab_group_sort(msp->ms_group, msp, 1633 msp->ms_weight | activation_weight); 1634 } 1635 ASSERT(msp->ms_loaded); 1636 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); 1637 1638 return (0); 1639} 1640 1641static void 1642metaslab_passivate(metaslab_t *msp, uint64_t size) 1643{ 1644 /* 1645 * If size < SPA_MINBLOCKSIZE, then we will not allocate from 1646 * this metaslab again. In that case, it had better be empty, 1647 * or we would be leaving space on the table. 1648 */ 1649 ASSERT(size >= SPA_MINBLOCKSIZE || range_tree_space(msp->ms_tree) == 0); 1650 metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size)); 1651 ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); 1652} 1653 1654static void 1655metaslab_preload(void *arg) 1656{ 1657 metaslab_t *msp = arg; 1658 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1659 1660 ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); 1661 1662 mutex_enter(&msp->ms_lock); 1663 metaslab_load_wait(msp); 1664 if (!msp->ms_loaded) 1665 (void) metaslab_load(msp); 1666 1667 /* 1668 * Set the ms_access_txg value so that we don't unload it right away. 1669 */ 1670 msp->ms_access_txg = spa_syncing_txg(spa) + metaslab_unload_delay + 1; 1671 mutex_exit(&msp->ms_lock); 1672} 1673 1674static void 1675metaslab_group_preload(metaslab_group_t *mg) 1676{ 1677 spa_t *spa = mg->mg_vd->vdev_spa; 1678 metaslab_t *msp; 1679 avl_tree_t *t = &mg->mg_metaslab_tree; 1680 int m = 0; 1681 1682 if (spa_shutting_down(spa) || !metaslab_preload_enabled) { 1683 taskq_wait(mg->mg_taskq); 1684 return; 1685 } 1686 1687 mutex_enter(&mg->mg_lock); 1688 /* 1689 * Load the next potential metaslabs 1690 */ 1691 msp = avl_first(t); 1692 while (msp != NULL) { 1693 metaslab_t *msp_next = AVL_NEXT(t, msp); 1694 1695 /* 1696 * We preload only the maximum number of metaslabs specified 1697 * by metaslab_preload_limit. If a metaslab is being forced 1698 * to condense then we preload it too. This will ensure 1699 * that force condensing happens in the next txg. 1700 */ 1701 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { 1702 msp = msp_next; 1703 continue; 1704 } 1705 1706 /* 1707 * We must drop the metaslab group lock here to preserve 1708 * lock ordering with the ms_lock (when grabbing both 1709 * the mg_lock and the ms_lock, the ms_lock must be taken 1710 * first). As a result, it is possible that the ordering 1711 * of the metaslabs within the avl tree may change before 1712 * we reacquire the lock. 
The metaslab cannot be removed from 1713 * the tree while we're in syncing context so it is safe to 1714 * drop the mg_lock here. If the metaslabs are reordered 1715 * nothing will break -- we just may end up loading a 1716 * less than optimal one. 1717 */ 1718 mutex_exit(&mg->mg_lock); 1719 VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload, 1720 msp, TQ_SLEEP) != 0); 1721 mutex_enter(&mg->mg_lock); 1722 msp = msp_next; 1723 } 1724 mutex_exit(&mg->mg_lock); 1725} 1726 1727/* 1728 * Determine if the space map's on-disk footprint is past our tolerance 1729 * for inefficiency. We would like to use the following criteria to make 1730 * our decision: 1731 * 1732 * 1. The size of the space map object should not dramatically increase as a 1733 * result of writing out the free space range tree. 1734 * 1735 * 2. The minimal on-disk space map representation is zfs_condense_pct/100 1736 * times the size of the free space range tree representation 1737 * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB). 1738 * 1739 * 3. The on-disk size of the space map should actually decrease. 1740 * 1741 * Checking the first condition is tricky since we don't want to walk 1742 * the entire AVL tree calculating the estimated on-disk size. Instead we 1743 * use the size-ordered range tree in the metaslab and calculate the 1744 * size required to write out the largest segment in our free tree. If the 1745 * size required to represent that segment on disk is larger than the space 1746 * map object then we avoid condensing this map. 1747 * 1748 * To determine the second criterion we use a best-case estimate and assume 1749 * each segment can be represented on-disk as a single 64-bit entry. We refer 1750 * to this best-case estimate as the space map's minimal form. 1751 * 1752 * Unfortunately, we cannot compute the on-disk size of the space map in this 1753 * context because we cannot accurately compute the effects of compression, etc. 1754 * Instead, we apply the heuristic described in the block comment for 1755 * zfs_metaslab_condense_block_threshold - we only condense if the space used 1756 * is greater than a threshold number of blocks. 1757 */ 1758static boolean_t 1759metaslab_should_condense(metaslab_t *msp) 1760{ 1761 space_map_t *sm = msp->ms_sm; 1762 range_seg_t *rs; 1763 uint64_t size, entries, segsz, object_size, optimal_size, record_size; 1764 dmu_object_info_t doi; 1765 uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift; 1766 1767 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1768 ASSERT(msp->ms_loaded); 1769 1770 /* 1771 * Use the ms_size_tree range tree, which is ordered by size, to 1772 * obtain the largest segment in the free tree. We always condense 1773 * metaslabs that are empty and metaslabs for which a condense 1774 * request has been made. 1775 */ 1776 rs = avl_last(&msp->ms_size_tree); 1777 if (rs == NULL || msp->ms_condense_wanted) 1778 return (B_TRUE); 1779 1780 /* 1781 * Calculate the number of 64-bit entries this segment would 1782 * require when written to disk. If this single segment would be 1783 * larger on-disk than the entire current on-disk structure, then 1784 * clearly condensing will increase the on-disk structure size.
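 *
 * With the defaults (zfs_condense_pct = 200 and
 * zfs_metaslab_condense_block_threshold = 4) this means we only condense
 * when the on-disk space map is at least twice the size of its minimal
 * form, spans more than four blocks, and when writing out the single
 * largest free segment would not by itself exceed the current object size.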
1785 */ 1786 size = (rs->rs_end - rs->rs_start) >> sm->sm_shift; 1787 entries = size / (MIN(size, SM_RUN_MAX)); 1788 segsz = entries * sizeof (uint64_t); 1789 1790 optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root); 1791 object_size = space_map_length(msp->ms_sm); 1792 1793 dmu_object_info_from_db(sm->sm_dbuf, &doi); 1794 record_size = MAX(doi.doi_data_block_size, vdev_blocksize); 1795 1796 return (segsz <= object_size && 1797 object_size >= (optimal_size * zfs_condense_pct / 100) && 1798 object_size > zfs_metaslab_condense_block_threshold * record_size); 1799} 1800 1801/* 1802 * Condense the on-disk space map representation to its minimized form. 1803 * The minimized form consists of a small number of allocations followed by 1804 * the entries of the free range tree. 1805 */ 1806static void 1807metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) 1808{ 1809 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1810 range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK]; 1811 range_tree_t *condense_tree; 1812 space_map_t *sm = msp->ms_sm; 1813 1814 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1815 ASSERT3U(spa_sync_pass(spa), ==, 1); 1816 ASSERT(msp->ms_loaded); 1817 1818 1819 spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, vdev id %llu, " 1820 "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg, 1821 msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id, 1822 msp->ms_group->mg_vd->vdev_spa->spa_name, 1823 space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root), 1824 msp->ms_condense_wanted ? "TRUE" : "FALSE"); 1825 1826 msp->ms_condense_wanted = B_FALSE; 1827 1828 /* 1829 * Create a range tree that is 100% allocated. We remove segments 1830 * that have been freed in this txg, any deferred frees that exist, 1831 * and any allocation in the future. Removing segments should be 1832 * a relatively inexpensive operation since we expect these trees to 1833 * have a small number of nodes. 1834 */ 1835 condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock); 1836 range_tree_add(condense_tree, msp->ms_start, msp->ms_size); 1837 1838 /* 1839 * Remove what's been freed in this txg from the condense_tree. 1840 * Since we're in sync_pass 1, we know that all the frees from 1841 * this txg are in the freetree. 1842 */ 1843 range_tree_walk(freetree, range_tree_remove, condense_tree); 1844 1845 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1846 range_tree_walk(msp->ms_defertree[t], 1847 range_tree_remove, condense_tree); 1848 } 1849 1850 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 1851 range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK], 1852 range_tree_remove, condense_tree); 1853 } 1854 1855 /* 1856 * We're about to drop the metaslab's lock thus allowing 1857 * other consumers to change its content. Set the 1858 * metaslab's ms_condensing flag to ensure that 1859 * allocations on this metaslab do not occur while we're 1860 * in the middle of committing it to disk. This is only critical 1861 * for the ms_tree as all other range trees use per txg 1862 * views of their content. 1863 */ 1864 msp->ms_condensing = B_TRUE; 1865 1866 mutex_exit(&msp->ms_lock); 1867 space_map_truncate(sm, tx); 1868 mutex_enter(&msp->ms_lock); 1869 1870 /* 1871 * While we would ideally like to create a space_map representation 1872 * that consists only of allocation records, doing so can be 1873 * prohibitively expensive because the in-core free tree can be 1874 * large, and therefore computationally expensive to subtract 1875 * from the condense_tree.
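 * (Subtracting it would mean walking every segment of the free tree and removing each one from the condense_tree, roughly O(n log n) AVL work for a potentially very large n.)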
Instead we sync out two trees, a cheap 1876 * allocation only tree followed by the in-core free tree. While not 1877 * optimal, this is typically close to optimal, and much cheaper to 1878 * compute. 1879 */ 1880 space_map_write(sm, condense_tree, SM_ALLOC, tx); 1881 range_tree_vacate(condense_tree, NULL, NULL); 1882 range_tree_destroy(condense_tree); 1883 1884 space_map_write(sm, msp->ms_tree, SM_FREE, tx); 1885 msp->ms_condensing = B_FALSE; 1886} 1887 1888/* 1889 * Write a metaslab to disk in the context of the specified transaction group. 1890 */ 1891void 1892metaslab_sync(metaslab_t *msp, uint64_t txg) 1893{ 1894 metaslab_group_t *mg = msp->ms_group; 1895 vdev_t *vd = mg->mg_vd; 1896 spa_t *spa = vd->vdev_spa; 1897 objset_t *mos = spa_meta_objset(spa); 1898 range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK]; 1899 range_tree_t **freetree = &msp->ms_freetree[txg & TXG_MASK]; 1900 range_tree_t **freed_tree = 1901 &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK]; 1902 dmu_tx_t *tx; 1903 uint64_t object = space_map_object(msp->ms_sm); 1904 1905 ASSERT(!vd->vdev_ishole); 1906 1907 /* 1908 * This metaslab has just been added so there's no work to do now. 1909 */ 1910 if (*freetree == NULL) { 1911 ASSERT3P(alloctree, ==, NULL); 1912 return; 1913 } 1914 1915 ASSERT3P(alloctree, !=, NULL); 1916 ASSERT3P(*freetree, !=, NULL); 1917 ASSERT3P(*freed_tree, !=, NULL); 1918 1919 /* 1920 * Normally, we don't want to process a metaslab if there 1921 * are no allocations or frees to perform. However, if the metaslab 1922 * is being forced to condense we need to let it through. 1923 */ 1924 if (range_tree_space(alloctree) == 0 && 1925 range_tree_space(*freetree) == 0 && 1926 !msp->ms_condense_wanted) 1927 return; 1928 1929 /* 1930 * The only state that can actually be changing concurrently with 1931 * metaslab_sync() is the metaslab's ms_tree. No other thread can 1932 * be modifying this txg's alloctree, freetree, freed_tree, or 1933 * space_map_phys_t. Therefore, we only hold ms_lock to satisfy 1934 * space_map ASSERTs. We drop it whenever we call into the DMU, 1935 * because the DMU can call down to us (e.g. via zio_free()) at 1936 * any time. 1937 */ 1938 1939 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 1940 1941 if (msp->ms_sm == NULL) { 1942 uint64_t new_object; 1943 1944 new_object = space_map_alloc(mos, tx); 1945 VERIFY3U(new_object, !=, 0); 1946 1947 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, 1948 msp->ms_start, msp->ms_size, vd->vdev_ashift, 1949 &msp->ms_lock)); 1950 ASSERT(msp->ms_sm != NULL); 1951 } 1952 1953 mutex_enter(&msp->ms_lock); 1954 1955 /* 1956 * Note: metaslab_condense() clears the space_map's histogram. 1957 * Therefore we must verify and remove this histogram before 1958 * condensing. 1959 */ 1960 metaslab_group_histogram_verify(mg); 1961 metaslab_class_histogram_verify(mg->mg_class); 1962 metaslab_group_histogram_remove(mg, msp); 1963 1964 if (msp->ms_loaded && spa_sync_pass(spa) == 1 && 1965 metaslab_should_condense(msp)) { 1966 metaslab_condense(msp, txg, tx); 1967 } else { 1968 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx); 1969 space_map_write(msp->ms_sm, *freetree, SM_FREE, tx); 1970 } 1971 1972 if (msp->ms_loaded) { 1973 /* 1974 * When the space map is loaded, we have an accurate 1975 * histogram in the range tree. This gives us an opportunity 1976 * to bring the space map's histogram up-to-date so we clear 1977 * it first before updating it.
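 * Skipping the clear would double-count: the histogram already on disk covers much of the same free space that the range tree is about to contribute.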
1978 */ 1979 space_map_histogram_clear(msp->ms_sm); 1980 space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx); 1981 } else { 1982 /* 1983 * Since the space map is not loaded we simply update the 1984 * existing histogram with what was freed in this txg. This 1985 * means that the on-disk histogram may not have an accurate 1986 * view of the free space but it's close enough to allow 1987 * us to make allocation decisions. 1988 */ 1989 space_map_histogram_add(msp->ms_sm, *freetree, tx); 1990 } 1991 metaslab_group_histogram_add(mg, msp); 1992 metaslab_group_histogram_verify(mg); 1993 metaslab_class_histogram_verify(mg->mg_class); 1994 1995 /* 1996 * For sync pass 1, we avoid traversing this txg's free range tree 1997 * and instead will just swap the pointers for freetree and 1998 * freed_tree. We can safely do this since the freed_tree is 1999 * guaranteed to be empty on the initial pass. 2000 */ 2001 if (spa_sync_pass(spa) == 1) { 2002 range_tree_swap(freetree, freed_tree); 2003 } else { 2004 range_tree_vacate(*freetree, range_tree_add, *freed_tree); 2005 } 2006 range_tree_vacate(alloctree, NULL, NULL); 2007 2008 ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); 2009 ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK])); 2010 2011 mutex_exit(&msp->ms_lock); 2012 2013 if (object != space_map_object(msp->ms_sm)) { 2014 object = space_map_object(msp->ms_sm); 2015 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * 2016 msp->ms_id, sizeof (uint64_t), &object, tx); 2017 } 2018 dmu_tx_commit(tx); 2019} 2020 2021/* 2022 * Called after a transaction group has completely synced to mark 2023 * all of the metaslab's free space as usable. 2024 */ 2025void 2026metaslab_sync_done(metaslab_t *msp, uint64_t txg) 2027{ 2028 metaslab_group_t *mg = msp->ms_group; 2029 vdev_t *vd = mg->mg_vd; 2030 range_tree_t **freed_tree; 2031 range_tree_t **defer_tree; 2032 int64_t alloc_delta, defer_delta; 2033 2034 ASSERT(!vd->vdev_ishole); 2035 2036 mutex_enter(&msp->ms_lock); 2037 2038 /* 2039 * If this metaslab is just becoming available, initialize its 2040 * alloctrees, freetrees, and defertree and add its capacity to 2041 * the vdev. 2042 */ 2043 if (msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK] == NULL) { 2044 for (int t = 0; t < TXG_SIZE; t++) { 2045 ASSERT(msp->ms_alloctree[t] == NULL); 2046 ASSERT(msp->ms_freetree[t] == NULL); 2047 2048 msp->ms_alloctree[t] = range_tree_create(NULL, msp, 2049 &msp->ms_lock); 2050 msp->ms_freetree[t] = range_tree_create(NULL, msp, 2051 &msp->ms_lock); 2052 } 2053 2054 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2055 ASSERT(msp->ms_defertree[t] == NULL); 2056 2057 msp->ms_defertree[t] = range_tree_create(NULL, msp, 2058 &msp->ms_lock); 2059 } 2060 2061 vdev_space_update(vd, 0, 0, msp->ms_size); 2062 } 2063 2064 freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK]; 2065 defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE]; 2066 2067 alloc_delta = space_map_alloc_delta(msp->ms_sm); 2068 defer_delta = range_tree_space(*freed_tree) - 2069 range_tree_space(*defer_tree); 2070 2071 vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); 2072 2073 ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); 2074 ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK])); 2075 2076 /* 2077 * If there's a metaslab_load() in progress, wait for it to complete 2078 * so that we have a consistent view of the in-core space map.
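 * (metaslab_load() also subtracts the current defertrees from the freshly loaded ms_tree, so letting a load race with the defer_tree manipulation below could leave the two views disagreeing about those segments.)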
2079 */ 2080 metaslab_load_wait(msp); 2081 2082 /* 2083 * Move the frees from the defer_tree back to the free 2084 * range tree (if it's loaded). Swap the freed_tree and the 2085 * defer_tree -- this is safe to do because we've just emptied out 2086 * the defer_tree. 2087 */ 2088 range_tree_vacate(*defer_tree, 2089 msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree); 2090 range_tree_swap(freed_tree, defer_tree); 2091 2092 space_map_update(msp->ms_sm); 2093 2094 msp->ms_deferspace += defer_delta; 2095 ASSERT3S(msp->ms_deferspace, >=, 0); 2096 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); 2097 if (msp->ms_deferspace != 0) { 2098 /* 2099 * Keep syncing this metaslab until all deferred frees 2100 * are back in circulation. 2101 */ 2102 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 2103 } 2104 2105 if (msp->ms_loaded && msp->ms_access_txg < txg) { 2106 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 2107 VERIFY0(range_tree_space( 2108 msp->ms_alloctree[(txg + t) & TXG_MASK])); 2109 } 2110 2111 if (!metaslab_debug_unload) 2112 metaslab_unload(msp); 2113 } 2114 2115 metaslab_group_sort(mg, msp, metaslab_weight(msp)); 2116 mutex_exit(&msp->ms_lock); 2117} 2118 2119void 2120metaslab_sync_reassess(metaslab_group_t *mg) 2121{ 2122 metaslab_group_alloc_update(mg); 2123 mg->mg_fragmentation = metaslab_group_fragmentation(mg); 2124 2125 /* 2126 * Preload the next potential metaslabs 2127 */ 2128 metaslab_group_preload(mg); 2129} 2130 2131static uint64_t 2132metaslab_distance(metaslab_t *msp, dva_t *dva) 2133{ 2134 uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift; 2135 uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift; 2136 uint64_t start = msp->ms_id; 2137 2138 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) 2139 return (1ULL << 63); 2140 2141 if (offset < start) 2142 return ((start - offset) << ms_shift); 2143 if (offset > start) 2144 return ((offset - start) << ms_shift); 2145 return (0); 2146} 2147 2148static uint64_t 2149metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, 2150 uint64_t txg, uint64_t min_distance, dva_t *dva, int d) 2151{ 2152 spa_t *spa = mg->mg_vd->vdev_spa; 2153 metaslab_t *msp = NULL; 2154 uint64_t offset = -1ULL; 2155 avl_tree_t *t = &mg->mg_metaslab_tree; 2156 uint64_t activation_weight; 2157 uint64_t target_distance; 2158 int i; 2159 2160 activation_weight = METASLAB_WEIGHT_PRIMARY; 2161 for (i = 0; i < d; i++) { 2162 if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 2163 activation_weight = METASLAB_WEIGHT_SECONDARY; 2164 break; 2165 } 2166 } 2167 2168 for (;;) { 2169 boolean_t was_active; 2170 2171 mutex_enter(&mg->mg_lock); 2172 for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { 2173 if (msp->ms_weight < asize) { 2174 spa_dbgmsg(spa, "%s: failed to meet weight " 2175 "requirement: vdev %llu, txg %llu, mg %p, " 2176 "msp %p, psize %llu, asize %llu, " 2177 "weight %llu", spa_name(spa), 2178 mg->mg_vd->vdev_id, txg, 2179 mg, msp, psize, asize, msp->ms_weight); 2180 mutex_exit(&mg->mg_lock); 2181 return (-1ULL); 2182 } 2183 2184 /* 2185 * If the selected metaslab is condensing, skip it. 2186 */ 2187 if (msp->ms_condensing) 2188 continue; 2189 2190 was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; 2191 if (activation_weight == METASLAB_WEIGHT_PRIMARY) 2192 break; 2193 2194 target_distance = min_distance + 2195 (space_map_allocated(msp->ms_sm) != 0 ? 
0 : 2196 min_distance >> 1); 2197 2198 for (i = 0; i < d; i++) 2199 if (metaslab_distance(msp, &dva[i]) < 2200 target_distance) 2201 break; 2202 if (i == d) 2203 break; 2204 } 2205 mutex_exit(&mg->mg_lock); 2206 if (msp == NULL) 2207 return (-1ULL); 2208 2209 mutex_enter(&msp->ms_lock); 2210 2211 /* 2212 * Ensure that the metaslab we have selected is still 2213 * capable of handling our request. It's possible that 2214 * another thread may have changed the weight while we 2215 * were blocked on the metaslab lock. 2216 */ 2217 if (msp->ms_weight < asize || (was_active && 2218 !(msp->ms_weight & METASLAB_ACTIVE_MASK) && 2219 activation_weight == METASLAB_WEIGHT_PRIMARY)) { 2220 mutex_exit(&msp->ms_lock); 2221 continue; 2222 } 2223 2224 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) && 2225 activation_weight == METASLAB_WEIGHT_PRIMARY) { 2226 metaslab_passivate(msp, 2227 msp->ms_weight & ~METASLAB_ACTIVE_MASK); 2228 mutex_exit(&msp->ms_lock); 2229 continue; 2230 } 2231 2232 if (metaslab_activate(msp, activation_weight) != 0) { 2233 mutex_exit(&msp->ms_lock); 2234 continue; 2235 } 2236 2237 /* 2238 * If this metaslab is currently condensing then pick again as 2239 * we can't manipulate this metaslab until it's committed 2240 * to disk. 2241 */ 2242 if (msp->ms_condensing) { 2243 mutex_exit(&msp->ms_lock); 2244 continue; 2245 } 2246 2247 if ((offset = metaslab_block_alloc(msp, asize)) != -1ULL) 2248 break; 2249 2250 metaslab_passivate(msp, metaslab_block_maxsize(msp)); 2251 mutex_exit(&msp->ms_lock); 2252 } 2253 2254 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) 2255 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); 2256 2257 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize); 2258 msp->ms_access_txg = txg + metaslab_unload_delay; 2259 2260 mutex_exit(&msp->ms_lock); 2261 2262 return (offset); 2263} 2264 2265/* 2266 * Allocate a block for the specified i/o. 2267 */ 2268static int 2269metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, 2270 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags) 2271{ 2272 metaslab_group_t *mg, *rotor; 2273 vdev_t *vd; 2274 int dshift = 3; 2275 int all_zero; 2276 int zio_lock = B_FALSE; 2277 boolean_t allocatable; 2278 uint64_t offset = -1ULL; 2279 uint64_t asize; 2280 uint64_t distance; 2281 2282 ASSERT(!DVA_IS_VALID(&dva[d])); 2283 2284 /* 2285 * For testing, make some blocks above a certain size be gang blocks. 2286 */ 2287 if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) 2288 return (SET_ERROR(ENOSPC)); 2289 2290 /* 2291 * Start at the rotor and loop through all mgs until we find something. 2292 * Note that there's no locking on mc_rotor or mc_aliquot because 2293 * nothing actually breaks if we miss a few updates -- we just won't 2294 * allocate quite as evenly. It all balances out over time. 2295 * 2296 * If we are doing ditto or log blocks, try to spread them across 2297 * consecutive vdevs. If we're forced to reuse a vdev before we've 2298 * allocated all of our ditto blocks, then try and spread them out on 2299 * that vdev as much as possible. If it turns out to not be possible, 2300 * gradually lower our standards until anything becomes acceptable. 2301 * Also, allocating on consecutive vdevs (as opposed to random vdevs) 2302 * gives us hope of containing our fault domains to something we're 2303 * able to reason about. Otherwise, any two top-level vdev failures 2304 * will guarantee the loss of data. 
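 * (With random placement, over enough allocations every pair of top-level vdevs ends up holding both copies of some block, so losing any two of them loses data somewhere.)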
With consecutive allocation, 2305 * only two adjacent top-level vdev failures will result in data loss. 2306 * 2307 * If we are doing gang blocks (hintdva is non-NULL), try to keep 2308 * ourselves on the same vdev as our gang block header. That 2309 * way, we can hope for locality in vdev_cache, plus it makes our 2310 * fault domains something tractable. 2311 */ 2312 if (hintdva) { 2313 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); 2314 2315 /* 2316 * It's possible the vdev we're using as the hint no 2317 * longer exists (i.e. removed). Consult the rotor when 2318 * all else fails. 2319 */ 2320 if (vd != NULL) { 2321 mg = vd->vdev_mg; 2322 2323 if (flags & METASLAB_HINTBP_AVOID && 2324 mg->mg_next != NULL) 2325 mg = mg->mg_next; 2326 } else { 2327 mg = mc->mc_rotor; 2328 } 2329 } else if (d != 0) { 2330 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); 2331 mg = vd->vdev_mg->mg_next; 2332 } else { 2333 mg = mc->mc_rotor; 2334 } 2335 2336 /* 2337 * If the hint put us into the wrong metaslab class, or into a 2338 * metaslab group that has been passivated, just follow the rotor. 2339 */ 2340 if (mg->mg_class != mc || mg->mg_activation_count <= 0) 2341 mg = mc->mc_rotor; 2342 2343 rotor = mg; 2344top: 2345 all_zero = B_TRUE; 2346 do { 2347 ASSERT(mg->mg_activation_count == 1); 2348 2349 vd = mg->mg_vd; 2350 2351 /* 2352 * Don't allocate from faulted devices. 2353 */ 2354 if (zio_lock) { 2355 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); 2356 allocatable = vdev_allocatable(vd); 2357 spa_config_exit(spa, SCL_ZIO, FTAG); 2358 } else { 2359 allocatable = vdev_allocatable(vd); 2360 } 2361 2362 /* 2363 * Determine if the selected metaslab group is eligible 2364 * for allocations. If we're ganging or have requested 2365 * an allocation for the smallest gang block size 2366 * then we don't want to avoid allocating to this 2367 * metaslab group. If we're in this condition we should 2368 * try to allocate from any device possible so that we 2369 * don't inadvertently return ENOSPC and suspend the pool 2370 * even though space is still available. 2371 */ 2372 if (allocatable && CAN_FASTGANG(flags) && 2373 psize > SPA_GANGBLOCKSIZE) 2374 allocatable = metaslab_group_allocatable(mg); 2375 2376 if (!allocatable) 2377 goto next; 2378 2379 /* 2380 * Avoid writing single-copy data to a failing vdev 2381 * unless the user instructs us that it is okay. 2382 */ 2383 if ((vd->vdev_stat.vs_write_errors > 0 || 2384 vd->vdev_state < VDEV_STATE_HEALTHY) && 2385 d == 0 && dshift == 3 && vd->vdev_children == 0) { 2386 all_zero = B_FALSE; 2387 goto next; 2388 } 2389 2390 ASSERT(mg->mg_class == mc); 2391 2392 distance = vd->vdev_asize >> dshift; 2393 if (distance <= (1ULL << vd->vdev_ms_shift)) 2394 distance = 0; 2395 else 2396 all_zero = B_FALSE; 2397 2398 asize = vdev_psize_to_asize(vd, psize); 2399 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); 2400 2401 offset = metaslab_group_alloc(mg, psize, asize, txg, distance, 2402 dva, d); 2403 if (offset != -1ULL) { 2404 /* 2405 * If we've just selected this metaslab group, 2406 * figure out whether the corresponding vdev is 2407 * over- or under-used relative to the pool, 2408 * and set an allocation bias to even it out.
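 * The bias is a byte count added to the group's share of the rotor below, so an over-full vdev surrenders part of its aliquot on each pass around the rotor while an under-full one picks up extra.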
2409 */ 2410 if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { 2411 vdev_stat_t *vs = &vd->vdev_stat; 2412 int64_t vu, cu; 2413 2414 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); 2415 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); 2416 2417 /* 2418 * Calculate how much more or less we should 2419 * try to allocate from this device during 2420 * this iteration around the rotor. 2421 * For example, if a device is 80% full 2422 * and the pool is 20% full then we should 2423 * reduce allocations by 60% on this device. 2424 * 2425 * mg_bias = (20 - 80) * 512K / 100 = -307K 2426 * 2427 * This reduces allocations by 307K for this 2428 * iteration. 2429 */ 2430 mg->mg_bias = ((cu - vu) * 2431 (int64_t)mg->mg_aliquot) / 100; 2432 } else if (!metaslab_bias_enabled) { 2433 mg->mg_bias = 0; 2434 } 2435 2436 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= 2437 mg->mg_aliquot + mg->mg_bias) { 2438 mc->mc_rotor = mg->mg_next; 2439 mc->mc_aliquot = 0; 2440 } 2441 2442 DVA_SET_VDEV(&dva[d], vd->vdev_id); 2443 DVA_SET_OFFSET(&dva[d], offset); 2444 DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); 2445 DVA_SET_ASIZE(&dva[d], asize); 2446 2447 return (0); 2448 } 2449next: 2450 mc->mc_rotor = mg->mg_next; 2451 mc->mc_aliquot = 0; 2452 } while ((mg = mg->mg_next) != rotor); 2453 2454 if (!all_zero) { 2455 dshift++; 2456 ASSERT(dshift < 64); 2457 goto top; 2458 } 2459 2460 if (!allocatable && !zio_lock) { 2461 dshift = 3; 2462 zio_lock = B_TRUE; 2463 goto top; 2464 } 2465 2466 bzero(&dva[d], sizeof (dva_t)); 2467 2468 return (SET_ERROR(ENOSPC)); 2469} 2470 2471/* 2472 * Free the block represented by DVA in the context of the specified 2473 * transaction group. 2474 */ 2475static void 2476metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now) 2477{ 2478 uint64_t vdev = DVA_GET_VDEV(dva); 2479 uint64_t offset = DVA_GET_OFFSET(dva); 2480 uint64_t size = DVA_GET_ASIZE(dva); 2481 vdev_t *vd; 2482 metaslab_t *msp; 2483 2484 ASSERT(DVA_IS_VALID(dva)); 2485 2486 if (txg > spa_freeze_txg(spa)) 2487 return; 2488 2489 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 2490 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { 2491 cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", 2492 (u_longlong_t)vdev, (u_longlong_t)offset); 2493 ASSERT(0); 2494 return; 2495 } 2496 2497 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 2498 2499 if (DVA_GET_GANG(dva)) 2500 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 2501 2502 mutex_enter(&msp->ms_lock); 2503 2504 if (now) { 2505 range_tree_remove(msp->ms_alloctree[txg & TXG_MASK], 2506 offset, size); 2507 2508 VERIFY(!msp->ms_condensing); 2509 VERIFY3U(offset, >=, msp->ms_start); 2510 VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); 2511 VERIFY3U(range_tree_space(msp->ms_tree) + size, <=, 2512 msp->ms_size); 2513 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 2514 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 2515 range_tree_add(msp->ms_tree, offset, size); 2516 } else { 2517 if (range_tree_space(msp->ms_freetree[txg & TXG_MASK]) == 0) 2518 vdev_dirty(vd, VDD_METASLAB, msp, txg); 2519 range_tree_add(msp->ms_freetree[txg & TXG_MASK], 2520 offset, size); 2521 } 2522 2523 mutex_exit(&msp->ms_lock); 2524} 2525 2526/* 2527 * Intent log support: upon opening the pool after a crash, notify the SPA 2528 * of blocks that the intent log has allocated for immediate write, but 2529 * which are still considered free by the SPA because the last transaction 2530 * group didn't commit yet. 
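 * Claiming such a block removes it from the metaslab's ms_tree again and records it in the current txg's alloctree so the next sync makes the allocation permanent; a txg of 0 is a dry run that only verifies the block is still free.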
2531 */ 2532static int 2533metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) 2534{ 2535 uint64_t vdev = DVA_GET_VDEV(dva); 2536 uint64_t offset = DVA_GET_OFFSET(dva); 2537 uint64_t size = DVA_GET_ASIZE(dva); 2538 vdev_t *vd; 2539 metaslab_t *msp; 2540 int error = 0; 2541 2542 ASSERT(DVA_IS_VALID(dva)); 2543 2544 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 2545 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) 2546 return (SET_ERROR(ENXIO)); 2547 2548 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 2549 2550 if (DVA_GET_GANG(dva)) 2551 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 2552 2553 mutex_enter(&msp->ms_lock); 2554 2555 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) 2556 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); 2557 2558 if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size)) 2559 error = SET_ERROR(ENOENT); 2560 2561 if (error || txg == 0) { /* txg == 0 indicates dry run */ 2562 mutex_exit(&msp->ms_lock); 2563 return (error); 2564 } 2565 2566 VERIFY(!msp->ms_condensing); 2567 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 2568 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 2569 VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size); 2570 range_tree_remove(msp->ms_tree, offset, size); 2571 2572 if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ 2573 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) 2574 vdev_dirty(vd, VDD_METASLAB, msp, txg); 2575 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size); 2576 } 2577 2578 mutex_exit(&msp->ms_lock); 2579 2580 return (0); 2581} 2582 2583int 2584metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, 2585 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags) 2586{ 2587 dva_t *dva = bp->blk_dva; 2588 dva_t *hintdva = hintbp->blk_dva; 2589 int error = 0; 2590 2591 ASSERT(bp->blk_birth == 0); 2592 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); 2593 2594 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 2595 2596 if (mc->mc_rotor == NULL) { /* no vdevs in this class */ 2597 spa_config_exit(spa, SCL_ALLOC, FTAG); 2598 return (SET_ERROR(ENOSPC)); 2599 } 2600 2601 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); 2602 ASSERT(BP_GET_NDVAS(bp) == 0); 2603 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); 2604 2605 for (int d = 0; d < ndvas; d++) { 2606 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, 2607 txg, flags); 2608 if (error != 0) { 2609 for (d--; d >= 0; d--) { 2610 metaslab_free_dva(spa, &dva[d], txg, B_TRUE); 2611 bzero(&dva[d], sizeof (dva_t)); 2612 } 2613 spa_config_exit(spa, SCL_ALLOC, FTAG); 2614 return (error); 2615 } 2616 } 2617 ASSERT(error == 0); 2618 ASSERT(BP_GET_NDVAS(bp) == ndvas); 2619 2620 spa_config_exit(spa, SCL_ALLOC, FTAG); 2621 2622 BP_SET_BIRTH(bp, txg, txg); 2623 2624 return (0); 2625} 2626 2627void 2628metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) 2629{ 2630 const dva_t *dva = bp->blk_dva; 2631 int ndvas = BP_GET_NDVAS(bp); 2632 2633 ASSERT(!BP_IS_HOLE(bp)); 2634 ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); 2635 2636 spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); 2637 2638 for (int d = 0; d < ndvas; d++) 2639 metaslab_free_dva(spa, &dva[d], txg, now); 2640 2641 spa_config_exit(spa, SCL_FREE, FTAG); 2642} 2643 2644int 2645metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) 2646{ 2647 const dva_t *dva = bp->blk_dva; 2648 int ndvas = BP_GET_NDVAS(bp); 2649 int error = 0; 2650 2651 ASSERT(!BP_IS_HOLE(bp)); 2652 2653 if 
(txg != 0) { 2654 /* 2655 * First do a dry run to make sure all DVAs are claimable, 2656 * so we don't have to unwind from partial failures below. 2657 */ 2658 if ((error = metaslab_claim(spa, bp, 0)) != 0) 2659 return (error); 2660 } 2661 2662 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 2663 2664 for (int d = 0; d < ndvas; d++) 2665 if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) 2666 break; 2667 2668 spa_config_exit(spa, SCL_ALLOC, FTAG); 2669 2670 ASSERT(error == 0 || txg == 0); 2671 2672 return (error); 2673} 2674 2675void 2676metaslab_check_free(spa_t *spa, const blkptr_t *bp) 2677{ 2678 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) 2679 return; 2680 2681 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 2682 for (int i = 0; i < BP_GET_NDVAS(bp); i++) { 2683 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]); 2684 vdev_t *vd = vdev_lookup_top(spa, vdev); 2685 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); 2686 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); 2687 metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 2688 2689 if (msp->ms_loaded) 2690 range_tree_verify(msp->ms_tree, offset, size); 2691 2692 for (int j = 0; j < TXG_SIZE; j++) 2693 range_tree_verify(msp->ms_freetree[j], offset, size); 2694 for (int j = 0; j < TXG_DEFER_SIZE; j++) 2695 range_tree_verify(msp->ms_defertree[j], offset, size); 2696 } 2697 spa_config_exit(spa, SCL_VDEV, FTAG); 2698} 2699