metaslab.c revision 307279
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/space_map.h>
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/spa_impl.h>
#include <sys/zfeature.h>

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab");

#define	GANG_ALLOCATION(flags) \
	((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))

#define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
#define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
#define	METASLAB_ACTIVE_MASK		\
	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)

uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
TUNABLE_QUAD("vfs.zfs.metaslab.gang_bang", &metaslab_gang_bang);
SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, gang_bang, CTLFLAG_RWTUN,
    &metaslab_gang_bang, 0,
    "Force gang block allocation for blocks larger than or equal to this value");

/*
 * The in-core space map representation is more compact than its on-disk form.
 * The zfs_condense_pct determines how much more compact the in-core
 * space map representation must be before we compact it on-disk.
 * Values should be greater than or equal to 100.
 */
int zfs_condense_pct = 200;
TUNABLE_INT("vfs.zfs.condense_pct", &zfs_condense_pct);
SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN,
    &zfs_condense_pct, 0,
    "Condense on-disk spacemap when it is more than this many percent"
    " of the in-memory counterpart");

/*
 * Condensing a metaslab is not guaranteed to actually reduce the amount of
 * space used on disk. In particular, a space map uses data in increments of
 * MAX(1 << ashift, space_map_blksize), so a metaslab might use the
 * same number of blocks after condensing. Since the goal of condensing is to
 * reduce the number of IOPs required to read the space map, we only want to
 * condense when we can be sure we will reduce the number of blocks used by the
 * space map. Unfortunately, we cannot precisely compute whether or not this is
 * the case in metaslab_should_condense since we are holding ms_lock. Instead,
 * we apply the following heuristic: do not condense a spacemap unless the
 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
 * blocks.
 */
int zfs_metaslab_condense_block_threshold = 4;
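
/*
 * For illustration (hypothetical numbers): space map blocks are sized as
 * MAX(1 << ashift, space_map_blksize), so with a 4K space map block size
 * and the default threshold of 4 above, a space map must consume more
 * than 4 * 4K = 16K on disk before metaslab_should_condense() will
 * consider condensing it.
 */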

/*
 * The zfs_mg_noalloc_threshold defines which metaslab groups should
 * be eligible for allocation. The value is defined as a percentage of
 * free space. Metaslab groups that have more free space than
 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
 * a metaslab group's free space is less than or equal to the
 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
 * groups are allowed to accept allocations. Gang blocks are always
 * eligible to allocate on any metaslab group. The default value of 0 means
 * no metaslab group will be excluded based on this criterion.
 */
int zfs_mg_noalloc_threshold = 0;
TUNABLE_INT("vfs.zfs.mg_noalloc_threshold", &zfs_mg_noalloc_threshold);
SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN,
    &zfs_mg_noalloc_threshold, 0,
    "Percentage of metaslab group size that should be free"
    " to make it eligible for allocation");

/*
 * Metaslab groups are considered eligible for allocations if their
 * fragmentation metric (measured as a percentage) is less than or equal to
 * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
 * then it will be skipped unless all metaslab groups within the metaslab
 * class have also crossed this threshold.
 */
int zfs_mg_fragmentation_threshold = 85;
TUNABLE_INT("vfs.zfs.mg_fragmentation_threshold",
    &zfs_mg_fragmentation_threshold);
SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_fragmentation_threshold, CTLFLAG_RWTUN,
    &zfs_mg_fragmentation_threshold, 0,
    "Skip metaslab groups whose fragmentation percentage exceeds this value"
    " unless all groups in the class have crossed the threshold");

/*
 * Allow metaslabs to keep their active state as long as their fragmentation
 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
 * active metaslab that exceeds this threshold will no longer keep its active
 * status allowing better metaslabs to be selected.
 */
int zfs_metaslab_fragmentation_threshold = 70;
TUNABLE_INT("vfs.zfs.metaslab.fragmentation_threshold",
    &zfs_metaslab_fragmentation_threshold);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_threshold, CTLFLAG_RWTUN,
    &zfs_metaslab_fragmentation_threshold, 0,
    "Maximum fragmentation percentage at which a metaslab keeps its"
    " active state");

/*
 * When set will load all metaslabs when pool is first opened.
 */
int metaslab_debug_load = 0;
TUNABLE_INT("vfs.zfs.metaslab.debug_load", &metaslab_debug_load);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_load, CTLFLAG_RWTUN,
    &metaslab_debug_load, 0,
    "Load all metaslabs when pool is first opened");

/*
 * When set will prevent metaslabs from being unloaded.
 */
int metaslab_debug_unload = 0;
TUNABLE_INT("vfs.zfs.metaslab.debug_unload", &metaslab_debug_unload);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_unload, CTLFLAG_RWTUN,
    &metaslab_debug_unload, 0,
    "Prevent metaslabs from being unloaded");

/*
 * Minimum size which forces the dynamic allocator to change
 * its allocation strategy. Once the space map cannot satisfy
 * an allocation of this size then it switches to using a more
 * aggressive strategy (i.e. search by size rather than offset).
 */
uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
TUNABLE_QUAD("vfs.zfs.metaslab.df_alloc_threshold",
    &metaslab_df_alloc_threshold);
SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN,
    &metaslab_df_alloc_threshold, 0,
    "Minimum size which forces the dynamic allocator to change its"
    " allocation strategy");

/*
 * The minimum free space, in percent, which must be available
 * in a space map to continue allocations in a first-fit fashion.
 * Once the space map's free space drops below this level we dynamically
 * switch to using best-fit allocations.
 */
int metaslab_df_free_pct = 4;
TUNABLE_INT("vfs.zfs.metaslab.df_free_pct", &metaslab_df_free_pct);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN,
    &metaslab_df_free_pct, 0,
    "The minimum free space, in percent, which must be available in a"
    " space map to continue allocations in a first-fit fashion");

/*
 * A metaslab is considered "free" if it contains a contiguous
 * segment which is greater than metaslab_min_alloc_size.
 */
uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
TUNABLE_QUAD("vfs.zfs.metaslab.min_alloc_size",
    &metaslab_min_alloc_size);
SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, min_alloc_size, CTLFLAG_RWTUN,
    &metaslab_min_alloc_size, 0,
    "A metaslab is considered \"free\" if it contains a contiguous"
    " segment which is greater than vfs.zfs.metaslab.min_alloc_size");

/*
 * Percentage of all cpus that can be used by the metaslab taskq.
 */
int metaslab_load_pct = 50;
TUNABLE_INT("vfs.zfs.metaslab.load_pct", &metaslab_load_pct);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN,
    &metaslab_load_pct, 0,
    "Percentage of cpus that can be used by the metaslab taskq");

/*
 * Determines how many txgs a metaslab may remain loaded without having any
 * allocations from it. As long as a metaslab continues to be used we will
 * keep it loaded.
 */
int metaslab_unload_delay = TXG_SIZE * 2;
TUNABLE_INT("vfs.zfs.metaslab.unload_delay", &metaslab_unload_delay);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN,
    &metaslab_unload_delay, 0,
    "Number of TXGs that an unused metaslab can be kept in memory");

/*
 * Max number of metaslabs per group to preload.
 */
int metaslab_preload_limit = SPA_DVAS_PER_BP;
TUNABLE_INT("vfs.zfs.metaslab.preload_limit", &metaslab_preload_limit);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN,
    &metaslab_preload_limit, 0,
    "Max number of metaslabs per group to preload");

/*
 * Enable/disable preloading of metaslabs.
 */
boolean_t metaslab_preload_enabled = B_TRUE;
TUNABLE_INT("vfs.zfs.metaslab.preload_enabled", &metaslab_preload_enabled);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN,
    &metaslab_preload_enabled, 0,
    "Enable/disable metaslab preloading");
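
/*
 * The tunables above are registered with both TUNABLE_* and CTLFLAG_RWTUN,
 * so they can be set at boot from /boot/loader.conf, for example
 * (illustrative values only):
 *
 *	vfs.zfs.metaslab.debug_load=1
 *	vfs.zfs.metaslab.preload_enabled=0
 *
 * or adjusted at runtime with sysctl(8).
 */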

/*
 * Enable/disable fragmentation weighting on metaslabs.
 */
boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;
TUNABLE_INT("vfs.zfs.metaslab_fragmentation_factor_enabled",
    &metaslab_fragmentation_factor_enabled);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_factor_enabled,
    CTLFLAG_RWTUN, &metaslab_fragmentation_factor_enabled, 0,
    "Enable fragmentation weighting on metaslabs");

/*
 * Enable/disable lba weighting (i.e. outer tracks are given preference).
 */
boolean_t metaslab_lba_weighting_enabled = B_TRUE;
TUNABLE_INT("vfs.zfs.metaslab.lba_weighting_enabled",
    &metaslab_lba_weighting_enabled);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, lba_weighting_enabled, CTLFLAG_RWTUN,
    &metaslab_lba_weighting_enabled, 0,
    "Enable LBA weighting (i.e. outer tracks are given preference)");

/*
 * Enable/disable metaslab group biasing.
 */
boolean_t metaslab_bias_enabled = B_TRUE;
TUNABLE_INT("vfs.zfs.metaslab.bias_enabled",
    &metaslab_bias_enabled);
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, bias_enabled, CTLFLAG_RWTUN,
    &metaslab_bias_enabled, 0,
    "Enable metaslab group biasing");

static uint64_t metaslab_fragmentation(metaslab_t *);

/*
 * ==========================================================================
 * Metaslab classes
 * ==========================================================================
 */
metaslab_class_t *
metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
{
	metaslab_class_t *mc;

	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);

	mc->mc_spa = spa;
	mc->mc_rotor = NULL;
	mc->mc_ops = ops;
	mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
	refcount_create_tracked(&mc->mc_alloc_slots);

	return (mc);
}

void
metaslab_class_destroy(metaslab_class_t *mc)
{
	ASSERT(mc->mc_rotor == NULL);
	ASSERT(mc->mc_alloc == 0);
	ASSERT(mc->mc_deferred == 0);
	ASSERT(mc->mc_space == 0);
	ASSERT(mc->mc_dspace == 0);

	refcount_destroy(&mc->mc_alloc_slots);
	mutex_destroy(&mc->mc_lock);
	kmem_free(mc, sizeof (metaslab_class_t));
}

int
metaslab_class_validate(metaslab_class_t *mc)
{
	metaslab_group_t *mg;
	vdev_t *vd;

	/*
	 * Must hold one of the spa_config locks.
	 */
	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));

	if ((mg = mc->mc_rotor) == NULL)
		return (0);

	do {
		vd = mg->mg_vd;
		ASSERT(vd->vdev_mg != NULL);
		ASSERT3P(vd->vdev_top, ==, vd);
		ASSERT3P(mg->mg_class, ==, mc);
		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
	} while ((mg = mg->mg_next) != mc->mc_rotor);

	return (0);
}

void
metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
    int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
{
	atomic_add_64(&mc->mc_alloc, alloc_delta);
	atomic_add_64(&mc->mc_deferred, defer_delta);
	atomic_add_64(&mc->mc_space, space_delta);
	atomic_add_64(&mc->mc_dspace, dspace_delta);
}

void
metaslab_class_minblocksize_update(metaslab_class_t *mc)
{
	metaslab_group_t *mg;
	vdev_t *vd;
	uint64_t minashift = UINT64_MAX;

	if ((mg = mc->mc_rotor) == NULL) {
		mc->mc_minblocksize = SPA_MINBLOCKSIZE;
		return;
	}

	do {
		vd = mg->mg_vd;
		if (vd->vdev_ashift < minashift)
			minashift = vd->vdev_ashift;
	} while ((mg = mg->mg_next) != mc->mc_rotor);

	mc->mc_minblocksize = 1ULL << minashift;
}

uint64_t
metaslab_class_get_alloc(metaslab_class_t *mc)
{
	return (mc->mc_alloc);
}

uint64_t
metaslab_class_get_deferred(metaslab_class_t *mc)
{
	return (mc->mc_deferred);
}

uint64_t
metaslab_class_get_space(metaslab_class_t *mc)
{
	return (mc->mc_space);
}

uint64_t
metaslab_class_get_dspace(metaslab_class_t *mc)
{
	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
}

uint64_t
metaslab_class_get_minblocksize(metaslab_class_t *mc)
{
	return (mc->mc_minblocksize);
}

void
metaslab_class_histogram_verify(metaslab_class_t *mc)
{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t *mc_hist;
	int i;

	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
		return;

	mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
	    KM_SLEEP);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		/*
		 * Skip any holes, uninitialized top-levels, or
		 * vdevs that are not in this metaslab class.
		 */
		if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
			mc_hist[i] += mg->mg_histogram[i];
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
		VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);

	kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
}

/*
 * Calculate the metaslab class's fragmentation metric. The metric
 * is weighted based on the space contribution of each metaslab group.
 * The return value will be a number between 0 and 100 (inclusive), or
 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
 * zfs_frag_table for more information about the metric.
 */
uint64_t
metaslab_class_fragmentation(metaslab_class_t *mc)
{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t fragmentation = 0;

	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		/*
		 * Skip any holes, uninitialized top-levels, or
		 * vdevs that are not in this metaslab class.
		 */
		if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		/*
		 * If a metaslab group does not contain a fragmentation
		 * metric then just bail out.
		 */
		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
			spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
			return (ZFS_FRAG_INVALID);
		}

		/*
		 * Determine how much this metaslab_group is contributing
		 * to the overall pool fragmentation metric.
		 */
		fragmentation += mg->mg_fragmentation *
		    metaslab_group_get_space(mg);
	}
	fragmentation /= metaslab_class_get_space(mc);

	ASSERT3U(fragmentation, <=, 100);
	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
	return (fragmentation);
}
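
/*
 * Worked example of the weighted average above (hypothetical numbers,
 * ignoring deflation): a class with a 10GB group at 20% fragmentation
 * and a 30GB group at 60% fragmentation yields
 * (20 * 10 + 60 * 30) / 40 = 50% overall.
 */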

/*
 * Calculate the amount of expandable space that is available in
 * this metaslab class. If a device is expanded then its expandable
 * space will be the amount of allocatable space that is currently not
 * part of this metaslab class.
 */
uint64_t
metaslab_class_expandable_space(metaslab_class_t *mc)
{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t space = 0;

	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		/*
		 * Calculate if we have enough space to add additional
		 * metaslabs. We report the expandable space in terms
		 * of the metaslab size since that's the unit of expansion.
		 */
		space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize,
		    1ULL << tvd->vdev_ms_shift);
	}
	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
	return (space);
}

/*
 * ==========================================================================
 * Metaslab groups
 * ==========================================================================
 */
static int
metaslab_compare(const void *x1, const void *x2)
{
	const metaslab_t *m1 = x1;
	const metaslab_t *m2 = x2;

	if (m1->ms_weight < m2->ms_weight)
		return (1);
	if (m1->ms_weight > m2->ms_weight)
		return (-1);

	/*
	 * If the weights are identical, use the offset to force uniqueness.
	 */
	if (m1->ms_start < m2->ms_start)
		return (-1);
	if (m1->ms_start > m2->ms_start)
		return (1);

	ASSERT3P(m1, ==, m2);

	return (0);
}

/*
 * Update the allocatable flag and the metaslab group's capacity.
 * The allocatable flag is set to true if the capacity is above
 * the zfs_mg_noalloc_threshold and the fragmentation value is not
 * greater than zfs_mg_fragmentation_threshold. If a metaslab group
 * transitions from allocatable to non-allocatable or vice versa then the
 * metaslab group's class is updated to reflect the transition.
 */
static void
metaslab_group_alloc_update(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	metaslab_class_t *mc = mg->mg_class;
	vdev_stat_t *vs = &vd->vdev_stat;
	boolean_t was_allocatable;
	boolean_t was_initialized;

	ASSERT(vd == vd->vdev_top);

	mutex_enter(&mg->mg_lock);
	was_allocatable = mg->mg_allocatable;
	was_initialized = mg->mg_initialized;

	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
	    (vs->vs_space + 1);

	mutex_enter(&mc->mc_lock);

	/*
	 * If the metaslab group was just added then it won't
	 * have any space until we finish syncing out this txg.
	 * At that point we will consider it initialized and available
	 * for allocations. We also don't consider non-activated
	 * metaslab groups (e.g. vdevs that are in the middle of being removed)
	 * to be initialized, because they can't be used for allocation.
	 */
	mg->mg_initialized = metaslab_group_initialized(mg);
	if (!was_initialized && mg->mg_initialized) {
		mc->mc_groups++;
	} else if (was_initialized && !mg->mg_initialized) {
		ASSERT3U(mc->mc_groups, >, 0);
		mc->mc_groups--;
	}
	if (mg->mg_initialized)
		mg->mg_no_free_space = B_FALSE;

	/*
	 * A metaslab group is considered allocatable if it has plenty
	 * of free space or is not heavily fragmented. We only take
	 * fragmentation into account if the metaslab group has a valid
	 * fragmentation metric (i.e. a value between 0 and 100).
	 */
	mg->mg_allocatable = (mg->mg_activation_count > 0 &&
	    mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
	    (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
	    mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));

	/*
	 * The mc_alloc_groups value maintains a count of the number of
	 * groups in this metaslab class that are still above the
	 * zfs_mg_noalloc_threshold. This is used by the allocating
	 * threads to determine if they should avoid allocations to
	 * a given group. The allocator will avoid allocations to a group
	 * if that group has reached or is below the zfs_mg_noalloc_threshold
	 * and there are still other groups that are above the threshold.
	 * When a group transitions from allocatable to non-allocatable or
	 * vice versa we update the metaslab class to reflect that change.
	 * When the mc_alloc_groups value drops to 0 that means that all
	 * groups have reached the zfs_mg_noalloc_threshold making all groups
	 * eligible for allocations. This effectively means that all devices
	 * are balanced again.
	 */
	if (was_allocatable && !mg->mg_allocatable)
		mc->mc_alloc_groups--;
	else if (!was_allocatable && mg->mg_allocatable)
		mc->mc_alloc_groups++;
	mutex_exit(&mc->mc_lock);

	mutex_exit(&mg->mg_lock);
}
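
/*
 * Free capacity example (hypothetical numbers): a 1TB top-level vdev
 * with 850GB allocated has an mg_free_capacity of roughly 15. If
 * zfs_mg_noalloc_threshold were set to 30, this group would be skipped
 * as long as at least one other group in the class remained above 30.
 */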

metaslab_group_t *
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
{
	metaslab_group_t *mg;

	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
	mg->mg_vd = vd;
	mg->mg_class = mc;
	mg->mg_activation_count = 0;
	mg->mg_initialized = B_FALSE;
	mg->mg_no_free_space = B_TRUE;
	refcount_create_tracked(&mg->mg_alloc_queue_depth);

	mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
	    minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);

	return (mg);
}

void
metaslab_group_destroy(metaslab_group_t *mg)
{
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	/*
	 * We may have gone below zero with the activation count
	 * either because we never activated in the first place or
	 * because we're done, and possibly removing the vdev.
	 */
	ASSERT(mg->mg_activation_count <= 0);

	taskq_destroy(mg->mg_taskq);
	avl_destroy(&mg->mg_metaslab_tree);
	mutex_destroy(&mg->mg_lock);
	refcount_destroy(&mg->mg_alloc_queue_depth);
	kmem_free(mg, sizeof (metaslab_group_t));
}

void
metaslab_group_activate(metaslab_group_t *mg)
{
	metaslab_class_t *mc = mg->mg_class;
	metaslab_group_t *mgprev, *mgnext;

	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));

	ASSERT(mc->mc_rotor != mg);
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	ASSERT(mg->mg_activation_count <= 0);

	if (++mg->mg_activation_count <= 0)
		return;

	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
	metaslab_group_alloc_update(mg);

	if ((mgprev = mc->mc_rotor) == NULL) {
		mg->mg_prev = mg;
		mg->mg_next = mg;
	} else {
		mgnext = mgprev->mg_next;
		mg->mg_prev = mgprev;
		mg->mg_next = mgnext;
		mgprev->mg_next = mg;
		mgnext->mg_prev = mg;
	}
	mc->mc_rotor = mg;
	metaslab_class_minblocksize_update(mc);
}

void
metaslab_group_passivate(metaslab_group_t *mg)
{
	metaslab_class_t *mc = mg->mg_class;
	metaslab_group_t *mgprev, *mgnext;

	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));

	if (--mg->mg_activation_count != 0) {
		ASSERT(mc->mc_rotor != mg);
		ASSERT(mg->mg_prev == NULL);
		ASSERT(mg->mg_next == NULL);
		ASSERT(mg->mg_activation_count < 0);
		return;
	}

	taskq_wait(mg->mg_taskq);
	metaslab_group_alloc_update(mg);

	mgprev = mg->mg_prev;
	mgnext = mg->mg_next;

	if (mg == mgnext) {
		mc->mc_rotor = NULL;
	} else {
		mc->mc_rotor = mgnext;
		mgprev->mg_next = mgnext;
		mgnext->mg_prev = mgprev;
	}

	mg->mg_prev = NULL;
	mg->mg_next = NULL;
	metaslab_class_minblocksize_update(mc);
}

boolean_t
metaslab_group_initialized(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	vdev_stat_t *vs = &vd->vdev_stat;

	return (vs->vs_space != 0 && mg->mg_activation_count > 0);
}

uint64_t
metaslab_group_get_space(metaslab_group_t *mg)
{
	return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
}
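
/*
 * For example (hypothetical geometry): a top-level vdev with
 * vdev_ms_shift = 34 (16GB metaslabs) and vdev_ms_count = 128 reports
 * 128 * 16GB = 2TB of group space.
 */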

void
metaslab_group_histogram_verify(metaslab_group_t *mg)
{
	uint64_t *mg_hist;
	vdev_t *vd = mg->mg_vd;
	uint64_t ashift = vd->vdev_ashift;
	int i;

	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
		return;

	mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
	    KM_SLEEP);

	ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
	    SPACE_MAP_HISTOGRAM_SIZE + ashift);

	for (int m = 0; m < vd->vdev_ms_count; m++) {
		metaslab_t *msp = vd->vdev_ms[m];

		if (msp->ms_sm == NULL)
			continue;

		for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
			mg_hist[i + ashift] +=
			    msp->ms_sm->sm_phys->smp_histogram[i];
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
		VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);

	kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
}

static void
metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
{
	metaslab_class_t *mc = mg->mg_class;
	uint64_t ashift = mg->mg_vd->vdev_ashift;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	if (msp->ms_sm == NULL)
		return;

	mutex_enter(&mg->mg_lock);
	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
		mg->mg_histogram[i + ashift] +=
		    msp->ms_sm->sm_phys->smp_histogram[i];
		mc->mc_histogram[i + ashift] +=
		    msp->ms_sm->sm_phys->smp_histogram[i];
	}
	mutex_exit(&mg->mg_lock);
}

void
metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	metaslab_class_t *mc = mg->mg_class;
	uint64_t ashift = mg->mg_vd->vdev_ashift;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	if (msp->ms_sm == NULL)
		return;

	mutex_enter(&mg->mg_lock);
	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
		ASSERT3U(mg->mg_histogram[i + ashift], >=,
		    msp->ms_sm->sm_phys->smp_histogram[i]);
		ASSERT3U(mc->mc_histogram[i + ashift], >=,
		    msp->ms_sm->sm_phys->smp_histogram[i]);

		mg->mg_histogram[i + ashift] -=
		    msp->ms_sm->sm_phys->smp_histogram[i];
		mc->mc_histogram[i + ashift] -=
		    msp->ms_sm->sm_phys->smp_histogram[i];
	}
	mutex_exit(&mg->mg_lock);
}

static void
metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
{
	ASSERT(msp->ms_group == NULL);
	mutex_enter(&mg->mg_lock);
	msp->ms_group = mg;
	msp->ms_weight = 0;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);

	mutex_enter(&msp->ms_lock);
	metaslab_group_histogram_add(mg, msp);
	mutex_exit(&msp->ms_lock);
}

static void
metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	mutex_enter(&msp->ms_lock);
	metaslab_group_histogram_remove(mg, msp);
	mutex_exit(&msp->ms_lock);

	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_group = NULL;
	mutex_exit(&mg->mg_lock);
}

static void
metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
	/*
	 * Although in principle the weight can be any value, in
	 * practice we do not use values in the range [1, 511].
	 */
	ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_weight = weight;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}

/*
 * Calculate the fragmentation for a given metaslab group. We can use
 * a simple average here since all metaslabs within the group must have
 * the same size. The return value will be a value between 0 and 100
 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in
 * this group have a fragmentation metric.
 */
uint64_t
metaslab_group_fragmentation(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	uint64_t fragmentation = 0;
	uint64_t valid_ms = 0;

	for (int m = 0; m < vd->vdev_ms_count; m++) {
		metaslab_t *msp = vd->vdev_ms[m];

		if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
			continue;

		valid_ms++;
		fragmentation += msp->ms_fragmentation;
	}

	if (valid_ms <= vd->vdev_ms_count / 2)
		return (ZFS_FRAG_INVALID);

	fragmentation /= valid_ms;
	ASSERT3U(fragmentation, <=, 100);
	return (fragmentation);
}

/*
 * Determine if a given metaslab group should skip allocations. A metaslab
 * group should avoid allocations if its free capacity is less than the
 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
 * zfs_mg_fragmentation_threshold and there is at least one metaslab group
 * that can still handle allocations. If the allocation throttle is enabled
 * then we skip allocations to devices that have reached their maximum
 * allocation queue depth unless the selected metaslab group is the only
 * eligible group remaining.
 */
static boolean_t
metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
    uint64_t psize)
{
	spa_t *spa = mg->mg_vd->vdev_spa;
	metaslab_class_t *mc = mg->mg_class;

	/*
	 * We can only consider skipping this metaslab group if it's
	 * in the normal metaslab class and there are other metaslab
	 * groups to select from. Otherwise, we always consider it eligible
	 * for allocations.
	 */
	if (mc != spa_normal_class(spa) || mc->mc_groups <= 1)
		return (B_TRUE);

	/*
	 * If the metaslab group's mg_allocatable flag is set (see comments
	 * in metaslab_group_alloc_update() for more information) and
	 * the allocation throttle is disabled then allow allocations to this
	 * device. However, if the allocation throttle is enabled then
	 * check if we have reached our allocation limit (mg_alloc_queue_depth)
	 * to determine if we should allow allocations to this metaslab group.
	 * If all metaslab groups are no longer considered allocatable
	 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
	 * gang block size then we allow allocations on this metaslab group
	 * regardless of the mg_allocatable or throttle settings.
	 */
	if (mg->mg_allocatable) {
		metaslab_group_t *mgp;
		int64_t qdepth;
		uint64_t qmax = mg->mg_max_alloc_queue_depth;

		if (!mc->mc_alloc_throttle_enabled)
			return (B_TRUE);

		/*
		 * If this metaslab group does not have any free space, then
		 * there is no point in looking further.
		 */
		if (mg->mg_no_free_space)
			return (B_FALSE);

		qdepth = refcount_count(&mg->mg_alloc_queue_depth);

		/*
		 * If this metaslab group is below its qmax or it's
		 * the only allocatable metaslab group, then attempt
		 * to allocate from it.
		 */
		if (qdepth < qmax || mc->mc_alloc_groups == 1)
			return (B_TRUE);
		ASSERT3U(mc->mc_alloc_groups, >, 1);

		/*
		 * Since this metaslab group is at or over its qmax, we
		 * need to determine if there are metaslab groups after this
		 * one that might be able to handle this allocation. This is
		 * racy since we can't hold the locks for all metaslab
		 * groups at the same time when we make this check.
		 */
		for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
			qmax = mgp->mg_max_alloc_queue_depth;

			qdepth = refcount_count(&mgp->mg_alloc_queue_depth);

			/*
			 * If there is another metaslab group that
			 * might be able to handle the allocation, then
			 * we return false so that we skip this group.
			 */
			if (qdepth < qmax && !mgp->mg_no_free_space)
				return (B_FALSE);
		}

		/*
		 * We didn't find another group to handle the allocation
		 * so we can't skip this metaslab group even though
		 * we are at or over our qmax.
		 */
		return (B_TRUE);

	} else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
		return (B_TRUE);
	}
	return (B_FALSE);
}

/*
 * ==========================================================================
 * Range tree callbacks
 * ==========================================================================
 */

/*
 * Comparison function for the private size-ordered tree. Tree is sorted
 * by size, larger sizes at the end of the tree.
 */
static int
metaslab_rangesize_compare(const void *x1, const void *x2)
{
	const range_seg_t *r1 = x1;
	const range_seg_t *r2 = x2;
	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
	uint64_t rs_size2 = r2->rs_end - r2->rs_start;

	if (rs_size1 < rs_size2)
		return (-1);
	if (rs_size1 > rs_size2)
		return (1);

	if (r1->rs_start < r2->rs_start)
		return (-1);

	if (r1->rs_start > r2->rs_start)
		return (1);

	return (0);
}

/*
 * Create any block allocator specific components. The current allocators
 * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
 */
static void
metaslab_rt_create(range_tree_t *rt, void *arg)
{
	metaslab_t *msp = arg;

	ASSERT3P(rt->rt_arg, ==, msp);
	ASSERT(msp->ms_tree == NULL);

	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
}

/*
 * Destroy the block allocator specific components.
 */
static void
metaslab_rt_destroy(range_tree_t *rt, void *arg)
{
	metaslab_t *msp = arg;

	ASSERT3P(rt->rt_arg, ==, msp);
	ASSERT3P(msp->ms_tree, ==, rt);
	ASSERT0(avl_numnodes(&msp->ms_size_tree));

	avl_destroy(&msp->ms_size_tree);
}

static void
metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
{
	metaslab_t *msp = arg;

	ASSERT3P(rt->rt_arg, ==, msp);
	ASSERT3P(msp->ms_tree, ==, rt);
	VERIFY(!msp->ms_condensing);
	avl_add(&msp->ms_size_tree, rs);
}

static void
metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
{
	metaslab_t *msp = arg;

	ASSERT3P(rt->rt_arg, ==, msp);
	ASSERT3P(msp->ms_tree, ==, rt);
	VERIFY(!msp->ms_condensing);
	avl_remove(&msp->ms_size_tree, rs);
}

static void
metaslab_rt_vacate(range_tree_t *rt, void *arg)
{
	metaslab_t *msp = arg;

	ASSERT3P(rt->rt_arg, ==, msp);
	ASSERT3P(msp->ms_tree, ==, rt);

	/*
	 * Normally one would walk the tree freeing nodes along the way.
	 * Since the nodes are shared with the range trees we can avoid
	 * walking all nodes and just reinitialize the avl tree. The nodes
	 * will be freed by the range tree, so we don't want to free them here.
	 */
	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
}

static range_tree_ops_t metaslab_rt_ops = {
	metaslab_rt_create,
	metaslab_rt_destroy,
	metaslab_rt_add,
	metaslab_rt_remove,
	metaslab_rt_vacate
};

/*
 * ==========================================================================
 * Metaslab block operations
 * ==========================================================================
 */

/*
 * Return the maximum contiguous segment within the metaslab.
 */
uint64_t
metaslab_block_maxsize(metaslab_t *msp)
{
	avl_tree_t *t = &msp->ms_size_tree;
	range_seg_t *rs;

	if (t == NULL || (rs = avl_last(t)) == NULL)
		return (0ULL);

	return (rs->rs_end - rs->rs_start);
}

uint64_t
metaslab_block_alloc(metaslab_t *msp, uint64_t size)
{
	uint64_t start;
	range_tree_t *rt = msp->ms_tree;

	VERIFY(!msp->ms_condensing);

	start = msp->ms_ops->msop_alloc(msp, size);
	if (start != -1ULL) {
		vdev_t *vd = msp->ms_group->mg_vd;

		VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
		VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
		range_tree_remove(rt, start, size);
	}
	return (start);
}

/*
 * ==========================================================================
 * Common allocator routines
 * ==========================================================================
 */

/*
 * This is a helper function that can be used by the allocator to find
 * a suitable block to allocate. This will search the specified AVL
 * tree looking for a block that matches the specified criteria.
 */
static uint64_t
metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
    uint64_t align)
{
	range_seg_t *rs, rsearch;
	avl_index_t where;

	rsearch.rs_start = *cursor;
	rsearch.rs_end = *cursor + size;

	rs = avl_find(t, &rsearch, &where);
	if (rs == NULL)
		rs = avl_nearest(t, where, AVL_AFTER);

	while (rs != NULL) {
		uint64_t offset = P2ROUNDUP(rs->rs_start, align);

		if (offset + size <= rs->rs_end) {
			*cursor = offset + size;
			return (offset);
		}
		rs = AVL_NEXT(t, rs);
	}

	/*
	 * If we know we've searched the whole map (*cursor == 0), give up.
	 * Otherwise, reset the cursor to the beginning and try again.
	 */
	if (*cursor == 0)
		return (-1ULL);

	*cursor = 0;
	return (metaslab_block_picker(t, cursor, size, align));
}
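
/*
 * Picker walk-through (hypothetical segment): with size = 0x2000 (8K),
 * align = 0x2000 and a free segment [0x11000, 0x20000), the start is
 * rounded up to P2ROUNDUP(0x11000, 0x2000) = 0x12000. Since
 * 0x12000 + 0x2000 <= 0x20000 the allocation succeeds at 0x12000 and
 * the cursor advances to 0x14000 for the next search. When the tail of
 * the tree cannot satisfy the request, the cursor resets to 0 and the
 * search retries once from the beginning before returning -1ULL.
 */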

/*
 * ==========================================================================
 * The first-fit block allocator
 * ==========================================================================
 */
static uint64_t
metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
{
	/*
	 * Find the largest power of 2 block size that evenly divides the
	 * requested size. This is used to try to allocate blocks with similar
	 * alignment from the same area of the metaslab (i.e. same cursor
	 * bucket) but it does not guarantee that other allocation sizes
	 * may not exist in the same region.
	 */
	uint64_t align = size & -size;
	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
	avl_tree_t *t = &msp->ms_tree->rt_root;

	return (metaslab_block_picker(t, cursor, size, align));
}

static metaslab_ops_t metaslab_ff_ops = {
	metaslab_ff_alloc
};

/*
 * ==========================================================================
 * Dynamic block allocator -
 * Uses the first fit allocation scheme until space gets low and then
 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
 * ==========================================================================
 */
static uint64_t
metaslab_df_alloc(metaslab_t *msp, uint64_t size)
{
	/*
	 * Find the largest power of 2 block size that evenly divides the
	 * requested size. This is used to try to allocate blocks with similar
	 * alignment from the same area of the metaslab (i.e. same cursor
	 * bucket) but it does not guarantee that other allocation sizes
	 * may not exist in the same region.
	 */
	uint64_t align = size & -size;
	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
	range_tree_t *rt = msp->ms_tree;
	avl_tree_t *t = &rt->rt_root;
	uint64_t max_size = metaslab_block_maxsize(msp);
	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));

	if (max_size < size)
		return (-1ULL);

	/*
	 * If we're running low on space switch to using the size
	 * sorted AVL tree (best-fit).
	 */
	if (max_size < metaslab_df_alloc_threshold ||
	    free_pct < metaslab_df_free_pct) {
		t = &msp->ms_size_tree;
		*cursor = 0;
	}

	return (metaslab_block_picker(t, cursor, size, 1ULL));
}

static metaslab_ops_t metaslab_df_ops = {
	metaslab_df_alloc
};
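
/*
 * Switch-over example (hypothetical metaslab): with the default
 * metaslab_df_free_pct of 4, a 10GB metaslab falls back to the
 * size-sorted (best-fit) tree once less than ~410MB remains free, or
 * earlier if its largest free segment drops below
 * metaslab_df_alloc_threshold (SPA_OLD_MAXBLOCKSIZE, 128K, by default).
 */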

/*
 * ==========================================================================
 * Cursor fit block allocator -
 * Select the largest region in the metaslab, set the cursor to the beginning
 * of the range and the cursor_end to the end of the range. As allocations
 * are made advance the cursor. Continue allocating from the cursor until
 * the range is exhausted and then find a new range.
 * ==========================================================================
 */
static uint64_t
metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
{
	range_tree_t *rt = msp->ms_tree;
	avl_tree_t *t = &msp->ms_size_tree;
	uint64_t *cursor = &msp->ms_lbas[0];
	uint64_t *cursor_end = &msp->ms_lbas[1];
	uint64_t offset = 0;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));

	ASSERT3U(*cursor_end, >=, *cursor);

	if ((*cursor + size) > *cursor_end) {
		range_seg_t *rs;

		rs = avl_last(&msp->ms_size_tree);
		if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
			return (-1ULL);

		*cursor = rs->rs_start;
		*cursor_end = rs->rs_end;
	}

	offset = *cursor;
	*cursor += size;

	return (offset);
}

static metaslab_ops_t metaslab_cf_ops = {
	metaslab_cf_alloc
};

/*
 * ==========================================================================
 * New dynamic fit allocator -
 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
 * contiguous blocks. If no region is found then just use the largest segment
 * that remains.
 * ==========================================================================
 */

/*
 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
 * to request from the allocator.
 */
uint64_t metaslab_ndf_clump_shift = 4;

static uint64_t
metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
{
	avl_tree_t *t = &msp->ms_tree->rt_root;
	avl_index_t where;
	range_seg_t *rs, rsearch;
	uint64_t hbit = highbit64(size);
	uint64_t *cursor = &msp->ms_lbas[hbit - 1];
	uint64_t max_size = metaslab_block_maxsize(msp);

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));

	if (max_size < size)
		return (-1ULL);

	rsearch.rs_start = *cursor;
	rsearch.rs_end = *cursor + size;

	rs = avl_find(t, &rsearch, &where);
	if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
		t = &msp->ms_size_tree;

		rsearch.rs_start = 0;
		rsearch.rs_end = MIN(max_size,
		    1ULL << (hbit + metaslab_ndf_clump_shift));
		rs = avl_find(t, &rsearch, &where);
		if (rs == NULL)
			rs = avl_nearest(t, where, AVL_AFTER);
		ASSERT(rs != NULL);
	}

	if ((rs->rs_end - rs->rs_start) >= size) {
		*cursor = rs->rs_start + size;
		return (rs->rs_start);
	}
	return (-1ULL);
}

static metaslab_ops_t metaslab_ndf_ops = {
	metaslab_ndf_alloc
};

metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;

/*
 * ==========================================================================
 * Metaslabs
 * ==========================================================================
 */

/*
 * Wait for any in-progress metaslab loads to complete.
 */
void
metaslab_load_wait(metaslab_t *msp)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	while (msp->ms_loading) {
		ASSERT(!msp->ms_loaded);
		cv_wait(&msp->ms_load_cv, &msp->ms_lock);
	}
}

int
metaslab_load(metaslab_t *msp)
{
	int error = 0;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT(!msp->ms_loaded);
	ASSERT(!msp->ms_loading);

	msp->ms_loading = B_TRUE;

	/*
	 * If the space map has not been allocated yet, then treat
	 * all the space in the metaslab as free and add it to the
	 * ms_tree.
	 */
	if (msp->ms_sm != NULL)
		error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
	else
		range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);

	msp->ms_loaded = (error == 0);
	msp->ms_loading = B_FALSE;

	if (msp->ms_loaded) {
		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
			range_tree_walk(msp->ms_defertree[t],
			    range_tree_remove, msp->ms_tree);
		}
	}
	cv_broadcast(&msp->ms_load_cv);
	return (error);
}
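
/*
 * Note that after a successful load the segments still sitting in the
 * defer trees are removed from ms_tree: space freed in a recent txg is
 * not eligible for reallocation until TXG_DEFER_SIZE txgs have passed,
 * so it must not appear free while it is still deferred.
 */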

void
metaslab_unload(metaslab_t *msp)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));
	range_tree_vacate(msp->ms_tree, NULL, NULL);
	msp->ms_loaded = B_FALSE;
	msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
}

int
metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
    metaslab_t **msp)
{
	vdev_t *vd = mg->mg_vd;
	objset_t *mos = vd->vdev_spa->spa_meta_objset;
	metaslab_t *ms;
	int error;

	ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
	mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
	ms->ms_id = id;
	ms->ms_start = id << vd->vdev_ms_shift;
	ms->ms_size = 1ULL << vd->vdev_ms_shift;

	/*
	 * We only open space map objects that already exist. All others
	 * will be opened when we finally allocate an object for it.
	 */
	if (object != 0) {
		error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
		    ms->ms_size, vd->vdev_ashift, &ms->ms_lock);

		if (error != 0) {
			kmem_free(ms, sizeof (metaslab_t));
			return (error);
		}

		ASSERT(ms->ms_sm != NULL);
	}

	/*
	 * We create the main range tree here, but we don't create the
	 * alloctree and freetree until metaslab_sync_done(). This serves
	 * two purposes: it allows metaslab_sync_done() to detect the
	 * addition of new space; and for debugging, it ensures that we'd
	 * data fault on any attempt to use this metaslab before it's ready.
	 */
	ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms, &ms->ms_lock);
	metaslab_group_add(mg, ms);

	ms->ms_fragmentation = metaslab_fragmentation(ms);
	ms->ms_ops = mg->mg_class->mc_ops;

	/*
	 * If we're opening an existing pool (txg == 0) or creating
	 * a new one (txg == TXG_INITIAL), all space is available now.
	 * If we're adding space to an existing pool, the new space
	 * does not become available until after this txg has synced.
	 */
	if (txg <= TXG_INITIAL)
		metaslab_sync_done(ms, 0);

	/*
	 * If metaslab_debug_load is set and we're initializing a metaslab
	 * that has an allocated space_map object then load its space
	 * map so that we can verify frees.
	 */
	if (metaslab_debug_load && ms->ms_sm != NULL) {
		mutex_enter(&ms->ms_lock);
		VERIFY0(metaslab_load(ms));
		mutex_exit(&ms->ms_lock);
	}

	if (txg != 0) {
		vdev_dirty(vd, 0, NULL, txg);
		vdev_dirty(vd, VDD_METASLAB, ms, txg);
	}

	*msp = ms;

	return (0);
}

void
metaslab_fini(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;

	metaslab_group_remove(mg, msp);

	mutex_enter(&msp->ms_lock);

	VERIFY(msp->ms_group == NULL);
	vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
	    0, -msp->ms_size);
	space_map_close(msp->ms_sm);

	metaslab_unload(msp);
	range_tree_destroy(msp->ms_tree);

	for (int t = 0; t < TXG_SIZE; t++) {
		range_tree_destroy(msp->ms_alloctree[t]);
		range_tree_destroy(msp->ms_freetree[t]);
	}

	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
		range_tree_destroy(msp->ms_defertree[t]);
	}

	ASSERT0(msp->ms_deferspace);

	mutex_exit(&msp->ms_lock);
	cv_destroy(&msp->ms_load_cv);
	mutex_destroy(&msp->ms_lock);

	kmem_free(msp, sizeof (metaslab_t));
}

#define	FRAGMENTATION_TABLE_SIZE	17

/*
 * This table defines a segment size based fragmentation metric that will
 * allow each metaslab to derive its own fragmentation value. This is done
 * by calculating the space in each bucket of the spacemap histogram and
 * multiplying that by the fragmentation metric in this table. Doing
 * this for all buckets and dividing it by the total amount of free
 * space in this metaslab (i.e. the total free space in all buckets) gives
 * us the fragmentation metric. This means that a high fragmentation metric
 * equates to most of the free space being comprised of small segments.
 * Conversely, if the metric is low, then most of the free space is in
 * large segments. A 10% change in fragmentation equates to approximately
 * double the number of segments.
 *
 * This table defines 0% fragmented space using 16MB segments. Testing has
 * shown that segments that are greater than or equal to 16MB do not suffer
 * from drastic performance problems. Using this value, we derive the rest
 * of the table. Since the fragmentation value is never stored on disk, it
 * is possible to change these calculations in the future.
 */
int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
	100,	/* 512B	*/
	100,	/* 1K	*/
	98,	/* 2K	*/
	95,	/* 4K	*/
	90,	/* 8K	*/
	80,	/* 16K	*/
	70,	/* 32K	*/
	60,	/* 64K	*/
	50,	/* 128K	*/
	40,	/* 256K	*/
	30,	/* 512K	*/
	20,	/* 1M	*/
	15,	/* 2M	*/
	10,	/* 4M	*/
	5,	/* 8M	*/
	0	/* 16M	*/
};
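
/*
 * Worked example (hypothetical histogram): a metaslab with 100MB of
 * free space in 4K segments (factor 95) and 100MB in 1M segments
 * (factor 20) scores (100 * 95 + 100 * 20) / 200 = ~57, i.e. moderately
 * fragmented. If all 200MB sat in segments of 16MB or larger, the
 * metric would be 0.
 */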

/*
 * Calculate the metaslab's fragmentation metric. A return value
 * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does
 * not support this metric. Otherwise, the return value should be in the
 * range [0, 100].
 */
static uint64_t
metaslab_fragmentation(metaslab_t *msp)
{
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
	uint64_t fragmentation = 0;
	uint64_t total = 0;
	boolean_t feature_enabled = spa_feature_is_enabled(spa,
	    SPA_FEATURE_SPACEMAP_HISTOGRAM);

	if (!feature_enabled)
		return (ZFS_FRAG_INVALID);

	/*
	 * A null space map means that the entire metaslab is free
	 * and thus is not fragmented.
	 */
	if (msp->ms_sm == NULL)
		return (0);

	/*
	 * If this metaslab's space_map has not been upgraded, flag it
	 * so that we upgrade next time we encounter it.
	 */
	if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
		uint64_t txg = spa_syncing_txg(spa);
		vdev_t *vd = msp->ms_group->mg_vd;

		if (spa_writeable(spa)) {
			msp->ms_condense_wanted = B_TRUE;
			vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
			spa_dbgmsg(spa, "txg %llu, requesting force condense: "
			    "msp %p, vd %p", txg, msp, vd);
		}
		return (ZFS_FRAG_INVALID);
	}

	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
		uint64_t space = 0;
		uint8_t shift = msp->ms_sm->sm_shift;
		int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
		    FRAGMENTATION_TABLE_SIZE - 1);

		if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
			continue;

		space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
		total += space;

		ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
		fragmentation += space * zfs_frag_table[idx];
	}

	if (total > 0)
		fragmentation /= total;
	ASSERT3U(fragmentation, <=, 100);
	return (fragmentation);
}

/*
 * Compute a weight -- a selection preference value -- for the given metaslab.
 * This is based on the amount of free space, the level of fragmentation,
 * the LBA range, and whether the metaslab is loaded.
 */
static uint64_t
metaslab_weight(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;
	vdev_t *vd = mg->mg_vd;
	uint64_t weight, space;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/*
	 * This vdev is in the process of being removed so there is nothing
	 * for us to do here.
	 */
	if (vd->vdev_removing) {
		ASSERT0(space_map_allocated(msp->ms_sm));
		ASSERT0(vd->vdev_ms_shift);
		return (0);
	}

	/*
	 * The baseline weight is the metaslab's free space.
	 */
	space = msp->ms_size - space_map_allocated(msp->ms_sm);

	msp->ms_fragmentation = metaslab_fragmentation(msp);
	if (metaslab_fragmentation_factor_enabled &&
	    msp->ms_fragmentation != ZFS_FRAG_INVALID) {
		/*
		 * Use the fragmentation information to inversely scale
		 * down the baseline weight. We need to ensure that we
		 * don't exclude this metaslab completely when it's 100%
		 * fragmented. To avoid this we reduce the fragmented value
		 * by 1.
		 */
		space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;

		/*
		 * If space < SPA_MINBLOCKSIZE, then we will not allocate from
		 * this metaslab again. The fragmentation metric may have
		 * decreased the space to something smaller than
		 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
		 * so that we can consume any remaining space.
		 */
		if (space > 0 && space < SPA_MINBLOCKSIZE)
			space = SPA_MINBLOCKSIZE;
	}
	weight = space;

	/*
	 * Modern disks have uniform bit density and constant angular velocity.
	 * Therefore, the outer recording zones are faster (higher bandwidth)
	 * than the inner zones by the ratio of outer to inner track diameter,
	 * which is typically around 2:1. We account for this by assigning
	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
	 * In effect, this means that we'll select the metaslab with the most
	 * free bandwidth rather than simply the one with the most free space.
	 */
	if (metaslab_lba_weighting_enabled) {
		weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
		ASSERT(weight >= space && weight <= 2 * space);
	}

	/*
	 * If this metaslab is one we're actively using, adjust its
	 * weight to make it preferable to any inactive metaslab so
	 * we'll polish it off. If the fragmentation on this metaslab
	 * has exceeded our threshold, then don't mark it active.
	 */
	if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
	    msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
	}

	return (weight);
}
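
/*
 * LBA weighting example (hypothetical layout): with 10GB of free space
 * and 100 metaslabs per vdev, metaslab 0 weighs in at 20GB (2x),
 * metaslab 50 at 15GB (1.5x), and metaslab 99 at roughly 10.1GB (~1x),
 * biasing allocations toward the faster outer tracks.
 */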
1754 */
1755 ASSERT(size >= SPA_MINBLOCKSIZE || range_tree_space(msp->ms_tree) == 0);
1756 metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
1757 ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
1758}
1759
1760static void
1761metaslab_preload(void *arg)
1762{
1763 metaslab_t *msp = arg;
1764 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1765
1766 ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
1767
1768 mutex_enter(&msp->ms_lock);
1769 metaslab_load_wait(msp);
1770 if (!msp->ms_loaded)
1771 (void) metaslab_load(msp);
1772
1773 /*
1774 * Set the ms_access_txg value so that we don't unload it right away.
1775 */
1776 msp->ms_access_txg = spa_syncing_txg(spa) + metaslab_unload_delay + 1;
1777 mutex_exit(&msp->ms_lock);
1778}
1779
1780static void
1781metaslab_group_preload(metaslab_group_t *mg)
1782{
1783 spa_t *spa = mg->mg_vd->vdev_spa;
1784 metaslab_t *msp;
1785 avl_tree_t *t = &mg->mg_metaslab_tree;
1786 int m = 0;
1787
1788 if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
1789 taskq_wait(mg->mg_taskq);
1790 return;
1791 }
1792
1793 mutex_enter(&mg->mg_lock);
1794 /*
1795 * Load the next potential metaslabs
1796 */
1797 msp = avl_first(t);
1798 while (msp != NULL) {
1799 metaslab_t *msp_next = AVL_NEXT(t, msp);
1800
1801 /*
1802 * We preload only the maximum number of metaslabs specified
1803 * by metaslab_preload_limit. If a metaslab is being forced
1804 * to condense then we preload it too. This will ensure
1805 * that force condensing happens in the next txg.
1806 */
1807 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
1808 msp = msp_next;
1809 continue;
1810 }
1811
1812 /*
1813 * We must drop the metaslab group lock here to preserve
1814 * lock ordering with the ms_lock (when grabbing both
1815 * the mg_lock and the ms_lock, the ms_lock must be taken
1816 * first). As a result, it is possible that the ordering
1817 * of the metaslabs within the avl tree may change before
1818 * we reacquire the lock. The metaslab cannot be removed from
1819 * the tree while we're in syncing context so it is safe to
1820 * drop the mg_lock here. If the metaslabs are reordered
1821 * nothing will break -- we just may end up loading a
1822 * less than optimal one.
1823 */
1824 mutex_exit(&mg->mg_lock);
1825 VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
1826 msp, TQ_SLEEP) != 0);
1827 mutex_enter(&mg->mg_lock);
1828 msp = msp_next;
1829 }
1830 mutex_exit(&mg->mg_lock);
1831}
1832
1833/*
1834 * Determine if the space map's on-disk footprint is past our tolerance
1835 * for inefficiency. We would like to use the following criteria to make
1836 * our decision:
1837 *
1838 * 1. The size of the space map object should not dramatically increase as a
1839 * result of writing out the free space range tree.
1840 *
1841 * 2. The minimal on-disk space map representation is zfs_condense_pct/100
1842 * times the size of the free space range tree representation
1843 * (e.g. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
1844 *
1845 * 3. The on-disk size of the space map should actually decrease.
1846 *
1847 * Checking the first condition is tricky since we don't want to walk
1848 * the entire AVL tree calculating the estimated on-disk size. Instead we
1849 * use the size-ordered range tree in the metaslab and calculate the
1850 * size required to write out the largest segment in our free tree. If the
1851 * size required to represent that segment on disk is larger than the space
1852 * map object then we avoid condensing this map.
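 *
 * As a sketch of that first check with hypothetical numbers: if the
 * largest free segment would require four 8-byte entries on disk
 * (segsz = 32 bytes) while the existing space map object is only 24
 * bytes long, rewriting the map could only grow it, so
 * metaslab_should_condense() returns B_FALSE.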
1853 *
1854 * To determine the second criterion we use a best-case estimate and assume
1855 * each segment can be represented on-disk as a single 64-bit entry. We refer
1856 * to this best-case estimate as the space map's minimal form.
1857 *
1858 * Unfortunately, we cannot compute the on-disk size of the space map in this
1859 * context because we cannot accurately compute the effects of compression, etc.
1860 * Instead, we apply the heuristic described in the block comment for
1861 * zfs_metaslab_condense_block_threshold - we only condense if the space used
1862 * is greater than a threshold number of blocks.
1863 */
1864static boolean_t
1865metaslab_should_condense(metaslab_t *msp)
1866{
1867 space_map_t *sm = msp->ms_sm;
1868 range_seg_t *rs;
1869 uint64_t size, entries, segsz, object_size, optimal_size, record_size;
1870 dmu_object_info_t doi;
1871 uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift;
1872
1873 ASSERT(MUTEX_HELD(&msp->ms_lock));
1874 ASSERT(msp->ms_loaded);
1875
1876 /*
1877 * Use the ms_size_tree range tree, which is ordered by size, to
1878 * obtain the largest segment in the free tree. We always condense
1879 * metaslabs that are empty and metaslabs for which a condense
1880 * request has been made.
1881 */
1882 rs = avl_last(&msp->ms_size_tree);
1883 if (rs == NULL || msp->ms_condense_wanted)
1884 return (B_TRUE);
1885
1886 /*
1887 * Calculate the number of 64-bit entries this segment would
1888 * require when written to disk. If this single segment would be
1889 * larger on-disk than the entire current on-disk structure, then
1890 * clearly condensing will increase the on-disk structure size.
1891 */
1892 size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
1893 entries = size / (MIN(size, SM_RUN_MAX));
1894 segsz = entries * sizeof (uint64_t);
1895
1896 optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
1897 object_size = space_map_length(msp->ms_sm);
1898
1899 dmu_object_info_from_db(sm->sm_dbuf, &doi);
1900 record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
1901
1902 return (segsz <= object_size &&
1903 object_size >= (optimal_size * zfs_condense_pct / 100) &&
1904 object_size > zfs_metaslab_condense_block_threshold * record_size);
1905}
1906
1907/*
1908 * Condense the on-disk space map representation to its minimized form.
1909 * The minimized form consists of a small number of allocations followed by
1910 * the entries of the free range tree.
1911 */
1912static void
1913metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
1914{
1915 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1916 range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK];
1917 range_tree_t *condense_tree;
1918 space_map_t *sm = msp->ms_sm;
1919
1920 ASSERT(MUTEX_HELD(&msp->ms_lock));
1921 ASSERT3U(spa_sync_pass(spa), ==, 1);
1922 ASSERT(msp->ms_loaded);
1923
1924
1925 spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
1926 "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
1927 msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
1928 msp->ms_group->mg_vd->vdev_spa->spa_name,
1929 space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root),
1930 msp->ms_condense_wanted ? "TRUE" : "FALSE");
1931
1932 msp->ms_condense_wanted = B_FALSE;
1933
1934 /*
1935 * Create a range tree that is 100% allocated. We remove segments
1936 * that have been freed in this txg, any deferred frees that exist,
1937 * and any allocation in the future.
Removing segments should be
1938 * a relatively inexpensive operation since we expect these trees to
1939 * have a small number of nodes.
1940 */
1941 condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock);
1942 range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
1943
1944 /*
1945 * Remove what's been freed in this txg from the condense_tree.
1946 * Since we're in sync_pass 1, we know that all the frees from
1947 * this txg are in the freetree.
1948 */
1949 range_tree_walk(freetree, range_tree_remove, condense_tree);
1950
1951 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1952 range_tree_walk(msp->ms_defertree[t],
1953 range_tree_remove, condense_tree);
1954 }
1955
1956 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
1957 range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
1958 range_tree_remove, condense_tree);
1959 }
1960
1961 /*
1962 * We're about to drop the metaslab's lock thus allowing
1963 * other consumers to change its content. Set the
1964 * metaslab's ms_condensing flag to ensure that
1965 * allocations on this metaslab do not occur while we're
1966 * in the middle of committing it to disk. This is only critical
1967 * for the ms_tree as all other range trees use per txg
1968 * views of their content.
1969 */
1970 msp->ms_condensing = B_TRUE;
1971
1972 mutex_exit(&msp->ms_lock);
1973 space_map_truncate(sm, tx);
1974 mutex_enter(&msp->ms_lock);
1975
1976 /*
1977 * While we would ideally like to create a space_map representation
1978 * that consists only of allocation records, doing so can be
1979 * prohibitively expensive because the in-core free tree can be
1980 * large, and therefore computationally expensive to subtract
1981 * from the condense_tree. Instead we sync out two trees, a cheap
1982 * allocation only tree followed by the in-core free tree. While not
1983 * optimal, this is typically close to optimal, and much cheaper to
1984 * compute.
1985 */
1986 space_map_write(sm, condense_tree, SM_ALLOC, tx);
1987 range_tree_vacate(condense_tree, NULL, NULL);
1988 range_tree_destroy(condense_tree);
1989
1990 space_map_write(sm, msp->ms_tree, SM_FREE, tx);
1991 msp->ms_condensing = B_FALSE;
1992}
1993
1994/*
1995 * Write a metaslab to disk in the context of the specified transaction group.
1996 */
1997void
1998metaslab_sync(metaslab_t *msp, uint64_t txg)
1999{
2000 metaslab_group_t *mg = msp->ms_group;
2001 vdev_t *vd = mg->mg_vd;
2002 spa_t *spa = vd->vdev_spa;
2003 objset_t *mos = spa_meta_objset(spa);
2004 range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
2005 range_tree_t **freetree = &msp->ms_freetree[txg & TXG_MASK];
2006 range_tree_t **freed_tree =
2007 &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
2008 dmu_tx_t *tx;
2009 uint64_t object = space_map_object(msp->ms_sm);
2010
2011 ASSERT(!vd->vdev_ishole);
2012
2013 /*
2014 * This metaslab has just been added so there's no work to do now.
2015 */
2016 if (*freetree == NULL) {
2017 ASSERT3P(alloctree, ==, NULL);
2018 return;
2019 }
2020
2021 ASSERT3P(alloctree, !=, NULL);
2022 ASSERT3P(*freetree, !=, NULL);
2023 ASSERT3P(*freed_tree, !=, NULL);
2024
2025 /*
2026 * Normally, we don't want to process a metaslab if there
2027 * are no allocations or frees to perform. However, if the metaslab
2028 * is being forced to condense we need to let it through.
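 *
 * (A forced condense, as requested by metaslab_fragmentation() above
 * when it encounters an old-format space map, dirties the metaslab
 * via vdev_dirty() without adding anything to the alloc or free
 * trees; without the ms_condense_wanted clause below, that request
 * would never be serviced.)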
2029 */
2030 if (range_tree_space(alloctree) == 0 &&
2031 range_tree_space(*freetree) == 0 &&
2032 !msp->ms_condense_wanted)
2033 return;
2034
2035 /*
2036 * The only state that can actually be changing concurrently with
2037 * metaslab_sync() is the metaslab's ms_tree. No other thread can
2038 * be modifying this txg's alloctree, freetree, freed_tree, or
2039 * space_map_phys_t. Therefore, we only hold ms_lock to satisfy
2040 * space_map ASSERTs. We drop it whenever we call into the DMU,
2041 * because the DMU can call down to us (e.g. via zio_free()) at
2042 * any time.
2043 */
2044
2045 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
2046
2047 if (msp->ms_sm == NULL) {
2048 uint64_t new_object;
2049
2050 new_object = space_map_alloc(mos, tx);
2051 VERIFY3U(new_object, !=, 0);
2052
2053 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
2054 msp->ms_start, msp->ms_size, vd->vdev_ashift,
2055 &msp->ms_lock));
2056 ASSERT(msp->ms_sm != NULL);
2057 }
2058
2059 mutex_enter(&msp->ms_lock);
2060
2061 /*
2062 * Note: metaslab_condense() clears the space_map's histogram.
2063 * Therefore we must verify and remove this histogram before
2064 * condensing.
2065 */
2066 metaslab_group_histogram_verify(mg);
2067 metaslab_class_histogram_verify(mg->mg_class);
2068 metaslab_group_histogram_remove(mg, msp);
2069
2070 if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
2071 metaslab_should_condense(msp)) {
2072 metaslab_condense(msp, txg, tx);
2073 } else {
2074 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
2075 space_map_write(msp->ms_sm, *freetree, SM_FREE, tx);
2076 }
2077
2078 if (msp->ms_loaded) {
2079 /*
2080 * When the space map is loaded, we have an accurate
2081 * histogram in the range tree. This gives us an opportunity
2082 * to bring the space map's histogram up-to-date so we clear
2083 * it first before updating it.
2084 */
2085 space_map_histogram_clear(msp->ms_sm);
2086 space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
2087 } else {
2088 /*
2089 * Since the space map is not loaded we simply update the
2090 * existing histogram with what was freed in this txg. This
2091 * means that the on-disk histogram may not have an accurate
2092 * view of the free space but it's close enough to allow
2093 * us to make allocation decisions.
2094 */
2095 space_map_histogram_add(msp->ms_sm, *freetree, tx);
2096 }
2097 metaslab_group_histogram_add(mg, msp);
2098 metaslab_group_histogram_verify(mg);
2099 metaslab_class_histogram_verify(mg->mg_class);
2100
2101 /*
2102 * For sync pass 1, we avoid traversing this txg's free range tree
2103 * and instead will just swap the pointers for freetree and
2104 * freed_tree. We can safely do this since the freed_tree is
2105 * guaranteed to be empty on the initial pass.
2106 */
2107 if (spa_sync_pass(spa) == 1) {
2108 range_tree_swap(freetree, freed_tree);
2109 } else {
2110 range_tree_vacate(*freetree, range_tree_add, *freed_tree);
2111 }
2112 range_tree_vacate(alloctree, NULL, NULL);
2113
2114 ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
2115 ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
2116
2117 mutex_exit(&msp->ms_lock);
2118
2119 if (object != space_map_object(msp->ms_sm)) {
2120 object = space_map_object(msp->ms_sm);
2121 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
2122 msp->ms_id, sizeof (uint64_t), &object, tx);
2123 }
2124 dmu_tx_commit(tx);
2125}
2126
2127/*
2128 * Called after a transaction group has completely synced to mark
2129 * all of the metaslab's free space as usable.
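 *
 * Schematically, a free travels through the trees as follows (sketch
 * added for clarity, using the names in this file):
 *
 *	freetree[txg & TXG_MASK] --(metaslab_sync)--> freed_tree
 *	freed_tree --(metaslab_sync_done)--> defertree
 *	defertree --(TXG_DEFER_SIZE txgs later)--> ms_tree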
2130 */ 2131void 2132metaslab_sync_done(metaslab_t *msp, uint64_t txg) 2133{ 2134 metaslab_group_t *mg = msp->ms_group; 2135 vdev_t *vd = mg->mg_vd; 2136 range_tree_t **freed_tree; 2137 range_tree_t **defer_tree; 2138 int64_t alloc_delta, defer_delta; 2139 2140 ASSERT(!vd->vdev_ishole); 2141 2142 mutex_enter(&msp->ms_lock); 2143 2144 /* 2145 * If this metaslab is just becoming available, initialize its 2146 * alloctrees, freetrees, and defertree and add its capacity to 2147 * the vdev. 2148 */ 2149 if (msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK] == NULL) { 2150 for (int t = 0; t < TXG_SIZE; t++) { 2151 ASSERT(msp->ms_alloctree[t] == NULL); 2152 ASSERT(msp->ms_freetree[t] == NULL); 2153 2154 msp->ms_alloctree[t] = range_tree_create(NULL, msp, 2155 &msp->ms_lock); 2156 msp->ms_freetree[t] = range_tree_create(NULL, msp, 2157 &msp->ms_lock); 2158 } 2159 2160 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2161 ASSERT(msp->ms_defertree[t] == NULL); 2162 2163 msp->ms_defertree[t] = range_tree_create(NULL, msp, 2164 &msp->ms_lock); 2165 } 2166 2167 vdev_space_update(vd, 0, 0, msp->ms_size); 2168 } 2169 2170 freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK]; 2171 defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE]; 2172 2173 alloc_delta = space_map_alloc_delta(msp->ms_sm); 2174 defer_delta = range_tree_space(*freed_tree) - 2175 range_tree_space(*defer_tree); 2176 2177 vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); 2178 2179 ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); 2180 ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK])); 2181 2182 /* 2183 * If there's a metaslab_load() in progress, wait for it to complete 2184 * so that we have a consistent view of the in-core space map. 2185 */ 2186 metaslab_load_wait(msp); 2187 2188 /* 2189 * Move the frees from the defer_tree back to the free 2190 * range tree (if it's loaded). Swap the freed_tree and the 2191 * defer_tree -- this is safe to do because we've just emptied out 2192 * the defer_tree. 2193 */ 2194 range_tree_vacate(*defer_tree, 2195 msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree); 2196 range_tree_swap(freed_tree, defer_tree); 2197 2198 space_map_update(msp->ms_sm); 2199 2200 msp->ms_deferspace += defer_delta; 2201 ASSERT3S(msp->ms_deferspace, >=, 0); 2202 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); 2203 if (msp->ms_deferspace != 0) { 2204 /* 2205 * Keep syncing this metaslab until all deferred frees 2206 * are back in circulation. 
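 *
 * For example, with hypothetical sizes: if 12MB was freed in this
 * txg and 8MB of previously deferred frees were just returned to
 * ms_tree, then defer_delta = 12MB - 8MB = 4MB and ms_deferspace
 * grows by 4MB; the metaslab keeps getting dirtied until that
 * balance drains back to zero.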
2207 */ 2208 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 2209 } 2210 2211 if (msp->ms_loaded && msp->ms_access_txg < txg) { 2212 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 2213 VERIFY0(range_tree_space( 2214 msp->ms_alloctree[(txg + t) & TXG_MASK])); 2215 } 2216 2217 if (!metaslab_debug_unload) 2218 metaslab_unload(msp); 2219 } 2220 2221 metaslab_group_sort(mg, msp, metaslab_weight(msp)); 2222 mutex_exit(&msp->ms_lock); 2223} 2224 2225void 2226metaslab_sync_reassess(metaslab_group_t *mg) 2227{ 2228 metaslab_group_alloc_update(mg); 2229 mg->mg_fragmentation = metaslab_group_fragmentation(mg); 2230 2231 /* 2232 * Preload the next potential metaslabs 2233 */ 2234 metaslab_group_preload(mg); 2235} 2236 2237static uint64_t 2238metaslab_distance(metaslab_t *msp, dva_t *dva) 2239{ 2240 uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift; 2241 uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift; 2242 uint64_t start = msp->ms_id; 2243 2244 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) 2245 return (1ULL << 63); 2246 2247 if (offset < start) 2248 return ((start - offset) << ms_shift); 2249 if (offset > start) 2250 return ((offset - start) << ms_shift); 2251 return (0); 2252} 2253 2254/* 2255 * ========================================================================== 2256 * Metaslab block operations 2257 * ========================================================================== 2258 */ 2259 2260static void 2261metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags) 2262{ 2263 if (!(flags & METASLAB_ASYNC_ALLOC) || 2264 flags & METASLAB_DONT_THROTTLE) 2265 return; 2266 2267 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 2268 if (!mg->mg_class->mc_alloc_throttle_enabled) 2269 return; 2270 2271 (void) refcount_add(&mg->mg_alloc_queue_depth, tag); 2272} 2273 2274void 2275metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags) 2276{ 2277 if (!(flags & METASLAB_ASYNC_ALLOC) || 2278 flags & METASLAB_DONT_THROTTLE) 2279 return; 2280 2281 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 2282 if (!mg->mg_class->mc_alloc_throttle_enabled) 2283 return; 2284 2285 (void) refcount_remove(&mg->mg_alloc_queue_depth, tag); 2286} 2287 2288void 2289metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag) 2290{ 2291#ifdef ZFS_DEBUG 2292 const dva_t *dva = bp->blk_dva; 2293 int ndvas = BP_GET_NDVAS(bp); 2294 2295 for (int d = 0; d < ndvas; d++) { 2296 uint64_t vdev = DVA_GET_VDEV(&dva[d]); 2297 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; 2298 VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag)); 2299 } 2300#endif 2301} 2302 2303static uint64_t 2304metaslab_group_alloc(metaslab_group_t *mg, uint64_t asize, 2305 uint64_t txg, uint64_t min_distance, dva_t *dva, int d) 2306{ 2307 spa_t *spa = mg->mg_vd->vdev_spa; 2308 metaslab_t *msp = NULL; 2309 uint64_t offset = -1ULL; 2310 avl_tree_t *t = &mg->mg_metaslab_tree; 2311 uint64_t activation_weight; 2312 uint64_t target_distance; 2313 int i; 2314 2315 activation_weight = METASLAB_WEIGHT_PRIMARY; 2316 for (i = 0; i < d; i++) { 2317 if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 2318 activation_weight = METASLAB_WEIGHT_SECONDARY; 2319 break; 2320 } 2321 } 2322 2323 for (;;) { 2324 boolean_t was_active; 2325 2326 mutex_enter(&mg->mg_lock); 2327 for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { 2328 if (msp->ms_weight < asize) { 2329 spa_dbgmsg(spa, "%s: failed to meet weight " 2330 "requirement: vdev %llu, txg %llu, mg %p, " 2331 "msp %p, asize 
%llu, " 2332 "weight %llu", spa_name(spa), 2333 mg->mg_vd->vdev_id, txg, 2334 mg, msp, asize, msp->ms_weight); 2335 mutex_exit(&mg->mg_lock); 2336 return (-1ULL); 2337 } 2338 2339 /* 2340 * If the selected metaslab is condensing, skip it. 2341 */ 2342 if (msp->ms_condensing) 2343 continue; 2344 2345 was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; 2346 if (activation_weight == METASLAB_WEIGHT_PRIMARY) 2347 break; 2348 2349 target_distance = min_distance + 2350 (space_map_allocated(msp->ms_sm) != 0 ? 0 : 2351 min_distance >> 1); 2352 2353 for (i = 0; i < d; i++) 2354 if (metaslab_distance(msp, &dva[i]) < 2355 target_distance) 2356 break; 2357 if (i == d) 2358 break; 2359 } 2360 mutex_exit(&mg->mg_lock); 2361 if (msp == NULL) 2362 return (-1ULL); 2363 2364 mutex_enter(&msp->ms_lock); 2365 2366 /* 2367 * Ensure that the metaslab we have selected is still 2368 * capable of handling our request. It's possible that 2369 * another thread may have changed the weight while we 2370 * were blocked on the metaslab lock. 2371 */ 2372 if (msp->ms_weight < asize || (was_active && 2373 !(msp->ms_weight & METASLAB_ACTIVE_MASK) && 2374 activation_weight == METASLAB_WEIGHT_PRIMARY)) { 2375 mutex_exit(&msp->ms_lock); 2376 continue; 2377 } 2378 2379 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) && 2380 activation_weight == METASLAB_WEIGHT_PRIMARY) { 2381 metaslab_passivate(msp, 2382 msp->ms_weight & ~METASLAB_ACTIVE_MASK); 2383 mutex_exit(&msp->ms_lock); 2384 continue; 2385 } 2386 2387 if (metaslab_activate(msp, activation_weight) != 0) { 2388 mutex_exit(&msp->ms_lock); 2389 continue; 2390 } 2391 2392 /* 2393 * If this metaslab is currently condensing then pick again as 2394 * we can't manipulate this metaslab until it's committed 2395 * to disk. 2396 */ 2397 if (msp->ms_condensing) { 2398 mutex_exit(&msp->ms_lock); 2399 continue; 2400 } 2401 2402 if ((offset = metaslab_block_alloc(msp, asize)) != -1ULL) 2403 break; 2404 2405 metaslab_passivate(msp, metaslab_block_maxsize(msp)); 2406 mutex_exit(&msp->ms_lock); 2407 } 2408 2409 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) 2410 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); 2411 2412 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize); 2413 msp->ms_access_txg = txg + metaslab_unload_delay; 2414 2415 mutex_exit(&msp->ms_lock); 2416 return (offset); 2417} 2418 2419/* 2420 * Allocate a block for the specified i/o. 2421 */ 2422static int 2423metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, 2424 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags) 2425{ 2426 metaslab_group_t *mg, *rotor; 2427 vdev_t *vd; 2428 int dshift = 3; 2429 int all_zero; 2430 int zio_lock = B_FALSE; 2431 boolean_t allocatable; 2432 uint64_t asize; 2433 uint64_t distance; 2434 2435 ASSERT(!DVA_IS_VALID(&dva[d])); 2436 2437 /* 2438 * For testing, make some blocks above a certain size be gang blocks. 2439 */ 2440 if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) 2441 return (SET_ERROR(ENOSPC)); 2442 2443 /* 2444 * Start at the rotor and loop through all mgs until we find something. 2445 * Note that there's no locking on mc_rotor or mc_aliquot because 2446 * nothing actually breaks if we miss a few updates -- we just won't 2447 * allocate quite as evenly. It all balances out over time. 2448 * 2449 * If we are doing ditto or log blocks, try to spread them across 2450 * consecutive vdevs. 
If we're forced to reuse a vdev before we've 2451 * allocated all of our ditto blocks, then try and spread them out on 2452 * that vdev as much as possible. If it turns out to not be possible, 2453 * gradually lower our standards until anything becomes acceptable. 2454 * Also, allocating on consecutive vdevs (as opposed to random vdevs) 2455 * gives us hope of containing our fault domains to something we're 2456 * able to reason about. Otherwise, any two top-level vdev failures 2457 * will guarantee the loss of data. With consecutive allocation, 2458 * only two adjacent top-level vdev failures will result in data loss. 2459 * 2460 * If we are doing gang blocks (hintdva is non-NULL), try to keep 2461 * ourselves on the same vdev as our gang block header. That 2462 * way, we can hope for locality in vdev_cache, plus it makes our 2463 * fault domains something tractable. 2464 */ 2465 if (hintdva) { 2466 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); 2467 2468 /* 2469 * It's possible the vdev we're using as the hint no 2470 * longer exists (i.e. removed). Consult the rotor when 2471 * all else fails. 2472 */ 2473 if (vd != NULL) { 2474 mg = vd->vdev_mg; 2475 2476 if (flags & METASLAB_HINTBP_AVOID && 2477 mg->mg_next != NULL) 2478 mg = mg->mg_next; 2479 } else { 2480 mg = mc->mc_rotor; 2481 } 2482 } else if (d != 0) { 2483 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); 2484 mg = vd->vdev_mg->mg_next; 2485 } else { 2486 mg = mc->mc_rotor; 2487 } 2488 2489 /* 2490 * If the hint put us into the wrong metaslab class, or into a 2491 * metaslab group that has been passivated, just follow the rotor. 2492 */ 2493 if (mg->mg_class != mc || mg->mg_activation_count <= 0) 2494 mg = mc->mc_rotor; 2495 2496 rotor = mg; 2497top: 2498 all_zero = B_TRUE; 2499 do { 2500 ASSERT(mg->mg_activation_count == 1); 2501 vd = mg->mg_vd; 2502 2503 /* 2504 * Don't allocate from faulted devices. 2505 */ 2506 if (zio_lock) { 2507 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); 2508 allocatable = vdev_allocatable(vd); 2509 spa_config_exit(spa, SCL_ZIO, FTAG); 2510 } else { 2511 allocatable = vdev_allocatable(vd); 2512 } 2513 2514 /* 2515 * Determine if the selected metaslab group is eligible 2516 * for allocations. If we're ganging then don't allow 2517 * this metaslab group to skip allocations since that would 2518 * inadvertently return ENOSPC and suspend the pool 2519 * even though space is still available. 2520 */ 2521 if (allocatable && !GANG_ALLOCATION(flags) && !zio_lock) { 2522 allocatable = metaslab_group_allocatable(mg, rotor, 2523 psize); 2524 } 2525 2526 if (!allocatable) 2527 goto next; 2528 2529 ASSERT(mg->mg_initialized); 2530 2531 /* 2532 * Avoid writing single-copy data to a failing vdev. 
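 *
 * ("Failing" below means vs_write_errors > 0 or a vdev_state below
 * VDEV_STATE_HEALTHY. The d == 0, dshift == 3 and
 * vdev_children == 0 guards restrict the skip to the first copy of
 * the data on non-redundant top-level vdevs, and only before we have
 * begun relaxing our placement standards.)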
2533 */ 2534 if ((vd->vdev_stat.vs_write_errors > 0 || 2535 vd->vdev_state < VDEV_STATE_HEALTHY) && 2536 d == 0 && dshift == 3 && vd->vdev_children == 0) { 2537 all_zero = B_FALSE; 2538 goto next; 2539 } 2540 2541 ASSERT(mg->mg_class == mc); 2542 2543 distance = vd->vdev_asize >> dshift; 2544 if (distance <= (1ULL << vd->vdev_ms_shift)) 2545 distance = 0; 2546 else 2547 all_zero = B_FALSE; 2548 2549 asize = vdev_psize_to_asize(vd, psize); 2550 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); 2551 2552 uint64_t offset = metaslab_group_alloc(mg, asize, txg, 2553 distance, dva, d); 2554 2555 mutex_enter(&mg->mg_lock); 2556 if (offset == -1ULL) { 2557 mg->mg_failed_allocations++; 2558 if (asize == SPA_GANGBLOCKSIZE) { 2559 /* 2560 * This metaslab group was unable to allocate 2561 * the minimum gang block size so it must be 2562 * out of space. We must notify the allocation 2563 * throttle to start skipping allocation 2564 * attempts to this metaslab group until more 2565 * space becomes available. 2566 * 2567 * Note: this failure cannot be caused by the 2568 * allocation throttle since the allocation 2569 * throttle is only responsible for skipping 2570 * devices and not failing block allocations. 2571 */ 2572 mg->mg_no_free_space = B_TRUE; 2573 } 2574 } 2575 mg->mg_allocations++; 2576 mutex_exit(&mg->mg_lock); 2577 2578 if (offset != -1ULL) { 2579 /* 2580 * If we've just selected this metaslab group, 2581 * figure out whether the corresponding vdev is 2582 * over- or under-used relative to the pool, 2583 * and set an allocation bias to even it out. 2584 */ 2585 if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { 2586 vdev_stat_t *vs = &vd->vdev_stat; 2587 int64_t vu, cu; 2588 2589 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); 2590 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); 2591 2592 /* 2593 * Calculate how much more or less we should 2594 * try to allocate from this device during 2595 * this iteration around the rotor. 2596 * For example, if a device is 80% full 2597 * and the pool is 20% full then we should 2598 * reduce allocations by 60% on this device. 2599 * 2600 * mg_bias = (20 - 80) * 512K / 100 = -307K 2601 * 2602 * This reduces allocations by 307K for this 2603 * iteration. 2604 */ 2605 mg->mg_bias = ((cu - vu) * 2606 (int64_t)mg->mg_aliquot) / 100; 2607 } else if (!metaslab_bias_enabled) { 2608 mg->mg_bias = 0; 2609 } 2610 2611 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= 2612 mg->mg_aliquot + mg->mg_bias) { 2613 mc->mc_rotor = mg->mg_next; 2614 mc->mc_aliquot = 0; 2615 } 2616 2617 DVA_SET_VDEV(&dva[d], vd->vdev_id); 2618 DVA_SET_OFFSET(&dva[d], offset); 2619 DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); 2620 DVA_SET_ASIZE(&dva[d], asize); 2621 2622 return (0); 2623 } 2624next: 2625 mc->mc_rotor = mg->mg_next; 2626 mc->mc_aliquot = 0; 2627 } while ((mg = mg->mg_next) != rotor); 2628 2629 if (!all_zero) { 2630 dshift++; 2631 ASSERT(dshift < 64); 2632 goto top; 2633 } 2634 2635 if (!allocatable && !zio_lock) { 2636 dshift = 3; 2637 zio_lock = B_TRUE; 2638 goto top; 2639 } 2640 2641 bzero(&dva[d], sizeof (dva_t)); 2642 2643 return (SET_ERROR(ENOSPC)); 2644} 2645 2646/* 2647 * Free the block represented by DVA in the context of the specified 2648 * transaction group. 
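 *
 * When "now" is B_TRUE the free is applied immediately: the range is
 * removed from this txg's alloctree and returned straight to
 * ms_tree, which is how metaslab_alloc() below unwinds a partially
 * failed multi-DVA allocation. Otherwise the range is queued in
 * freetree[txg & TXG_MASK] and flows through the sync/defer pipeline
 * sketched above metaslab_sync_done().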
2649 */ 2650static void 2651metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now) 2652{ 2653 uint64_t vdev = DVA_GET_VDEV(dva); 2654 uint64_t offset = DVA_GET_OFFSET(dva); 2655 uint64_t size = DVA_GET_ASIZE(dva); 2656 vdev_t *vd; 2657 metaslab_t *msp; 2658 2659 ASSERT(DVA_IS_VALID(dva)); 2660 2661 if (txg > spa_freeze_txg(spa)) 2662 return; 2663 2664 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 2665 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { 2666 cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", 2667 (u_longlong_t)vdev, (u_longlong_t)offset); 2668 ASSERT(0); 2669 return; 2670 } 2671 2672 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 2673 2674 if (DVA_GET_GANG(dva)) 2675 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 2676 2677 mutex_enter(&msp->ms_lock); 2678 2679 if (now) { 2680 range_tree_remove(msp->ms_alloctree[txg & TXG_MASK], 2681 offset, size); 2682 2683 VERIFY(!msp->ms_condensing); 2684 VERIFY3U(offset, >=, msp->ms_start); 2685 VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); 2686 VERIFY3U(range_tree_space(msp->ms_tree) + size, <=, 2687 msp->ms_size); 2688 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 2689 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 2690 range_tree_add(msp->ms_tree, offset, size); 2691 } else { 2692 if (range_tree_space(msp->ms_freetree[txg & TXG_MASK]) == 0) 2693 vdev_dirty(vd, VDD_METASLAB, msp, txg); 2694 range_tree_add(msp->ms_freetree[txg & TXG_MASK], 2695 offset, size); 2696 } 2697 2698 mutex_exit(&msp->ms_lock); 2699} 2700 2701/* 2702 * Intent log support: upon opening the pool after a crash, notify the SPA 2703 * of blocks that the intent log has allocated for immediate write, but 2704 * which are still considered free by the SPA because the last transaction 2705 * group didn't commit yet. 2706 */ 2707static int 2708metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) 2709{ 2710 uint64_t vdev = DVA_GET_VDEV(dva); 2711 uint64_t offset = DVA_GET_OFFSET(dva); 2712 uint64_t size = DVA_GET_ASIZE(dva); 2713 vdev_t *vd; 2714 metaslab_t *msp; 2715 int error = 0; 2716 2717 ASSERT(DVA_IS_VALID(dva)); 2718 2719 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 2720 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) 2721 return (SET_ERROR(ENXIO)); 2722 2723 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 2724 2725 if (DVA_GET_GANG(dva)) 2726 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 2727 2728 mutex_enter(&msp->ms_lock); 2729 2730 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) 2731 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); 2732 2733 if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size)) 2734 error = SET_ERROR(ENOENT); 2735 2736 if (error || txg == 0) { /* txg == 0 indicates dry run */ 2737 mutex_exit(&msp->ms_lock); 2738 return (error); 2739 } 2740 2741 VERIFY(!msp->ms_condensing); 2742 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 2743 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 2744 VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size); 2745 range_tree_remove(msp->ms_tree, offset, size); 2746 2747 if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ 2748 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) 2749 vdev_dirty(vd, VDD_METASLAB, msp, txg); 2750 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size); 2751 } 2752 2753 mutex_exit(&msp->ms_lock); 2754 2755 return (0); 2756} 2757 2758/* 2759 * Reserve some allocation slots. 
The reservation system must be called 2760 * before we call into the allocator. If there aren't any available slots 2761 * then the I/O will be throttled until an I/O completes and its slots are 2762 * freed up. The function returns true if it was successful in placing 2763 * the reservation. 2764 */ 2765boolean_t 2766metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio, 2767 int flags) 2768{ 2769 uint64_t available_slots = 0; 2770 boolean_t slot_reserved = B_FALSE; 2771 2772 ASSERT(mc->mc_alloc_throttle_enabled); 2773 mutex_enter(&mc->mc_lock); 2774 2775 uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots); 2776 if (reserved_slots < mc->mc_alloc_max_slots) 2777 available_slots = mc->mc_alloc_max_slots - reserved_slots; 2778 2779 if (slots <= available_slots || GANG_ALLOCATION(flags)) { 2780 /* 2781 * We reserve the slots individually so that we can unreserve 2782 * them individually when an I/O completes. 2783 */ 2784 for (int d = 0; d < slots; d++) { 2785 reserved_slots = refcount_add(&mc->mc_alloc_slots, zio); 2786 } 2787 zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; 2788 slot_reserved = B_TRUE; 2789 } 2790 2791 mutex_exit(&mc->mc_lock); 2792 return (slot_reserved); 2793} 2794 2795void 2796metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio) 2797{ 2798 ASSERT(mc->mc_alloc_throttle_enabled); 2799 mutex_enter(&mc->mc_lock); 2800 for (int d = 0; d < slots; d++) { 2801 (void) refcount_remove(&mc->mc_alloc_slots, zio); 2802 } 2803 mutex_exit(&mc->mc_lock); 2804} 2805 2806int 2807metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, 2808 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, zio_t *zio) 2809{ 2810 dva_t *dva = bp->blk_dva; 2811 dva_t *hintdva = hintbp->blk_dva; 2812 int error = 0; 2813 2814 ASSERT(bp->blk_birth == 0); 2815 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); 2816 2817 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 2818 2819 if (mc->mc_rotor == NULL) { /* no vdevs in this class */ 2820 spa_config_exit(spa, SCL_ALLOC, FTAG); 2821 return (SET_ERROR(ENOSPC)); 2822 } 2823 2824 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); 2825 ASSERT(BP_GET_NDVAS(bp) == 0); 2826 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); 2827 2828 for (int d = 0; d < ndvas; d++) { 2829 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, 2830 txg, flags); 2831 if (error != 0) { 2832 for (d--; d >= 0; d--) { 2833 metaslab_free_dva(spa, &dva[d], txg, B_TRUE); 2834 metaslab_group_alloc_decrement(spa, 2835 DVA_GET_VDEV(&dva[d]), zio, flags); 2836 bzero(&dva[d], sizeof (dva_t)); 2837 } 2838 spa_config_exit(spa, SCL_ALLOC, FTAG); 2839 return (error); 2840 } else { 2841 /* 2842 * Update the metaslab group's queue depth 2843 * based on the newly allocated dva. 
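 *
 * (Each allocated DVA holds one reference on the group's
 * mg_alloc_queue_depth refcount, tagged by the zio.
 * metaslab_group_alloc_decrement() releases it when the allocation
 * is unwound above, and is expected to be called by the I/O pipeline
 * once the zio completes.)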
2844 */ 2845 metaslab_group_alloc_increment(spa, 2846 DVA_GET_VDEV(&dva[d]), zio, flags); 2847 } 2848 2849 } 2850 ASSERT(error == 0); 2851 ASSERT(BP_GET_NDVAS(bp) == ndvas); 2852 2853 spa_config_exit(spa, SCL_ALLOC, FTAG); 2854 2855 BP_SET_BIRTH(bp, txg, txg); 2856 2857 return (0); 2858} 2859 2860void 2861metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) 2862{ 2863 const dva_t *dva = bp->blk_dva; 2864 int ndvas = BP_GET_NDVAS(bp); 2865 2866 ASSERT(!BP_IS_HOLE(bp)); 2867 ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); 2868 2869 spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); 2870 2871 for (int d = 0; d < ndvas; d++) 2872 metaslab_free_dva(spa, &dva[d], txg, now); 2873 2874 spa_config_exit(spa, SCL_FREE, FTAG); 2875} 2876 2877int 2878metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) 2879{ 2880 const dva_t *dva = bp->blk_dva; 2881 int ndvas = BP_GET_NDVAS(bp); 2882 int error = 0; 2883 2884 ASSERT(!BP_IS_HOLE(bp)); 2885 2886 if (txg != 0) { 2887 /* 2888 * First do a dry run to make sure all DVAs are claimable, 2889 * so we don't have to unwind from partial failures below. 2890 */ 2891 if ((error = metaslab_claim(spa, bp, 0)) != 0) 2892 return (error); 2893 } 2894 2895 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 2896 2897 for (int d = 0; d < ndvas; d++) 2898 if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) 2899 break; 2900 2901 spa_config_exit(spa, SCL_ALLOC, FTAG); 2902 2903 ASSERT(error == 0 || txg == 0); 2904 2905 return (error); 2906} 2907 2908void 2909metaslab_check_free(spa_t *spa, const blkptr_t *bp) 2910{ 2911 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) 2912 return; 2913 2914 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 2915 for (int i = 0; i < BP_GET_NDVAS(bp); i++) { 2916 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]); 2917 vdev_t *vd = vdev_lookup_top(spa, vdev); 2918 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); 2919 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); 2920 metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 2921 2922 if (msp->ms_loaded) 2923 range_tree_verify(msp->ms_tree, offset, size); 2924 2925 for (int j = 0; j < TXG_SIZE; j++) 2926 range_tree_verify(msp->ms_freetree[j], offset, size); 2927 for (int j = 0; j < TXG_DEFER_SIZE; j++) 2928 range_tree_verify(msp->ms_defertree[j], offset, size); 2929 } 2930 spa_config_exit(spa, SCL_VDEV, FTAG); 2931} 2932
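
/*
 * Editorial usage sketch (hypothetical caller, not part of the
 * original file): metaslab_check_free() verifies that a block about
 * to be freed is not already present in any of the free, freed, or
 * defer trees -- i.e. that it is not being double-freed:
 *
 *	// hypothetical free path
 *	metaslab_check_free(spa, bp);	// no-op unless zfs_flags has
 *					// ZFS_DEBUG_ZIO_FREE set
 *	metaslab_free(spa, bp, txg, B_FALSE);
 *
 * When the flag is clear the check returns immediately, so it costs
 * essentially nothing in normal operation.
 */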