metaslab.c revision 269773
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2014 by Delphix. All rights reserved. 24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 25 */ 26 27#include <sys/zfs_context.h> 28#include <sys/dmu.h> 29#include <sys/dmu_tx.h> 30#include <sys/space_map.h> 31#include <sys/metaslab_impl.h> 32#include <sys/vdev_impl.h> 33#include <sys/zio.h> 34#include <sys/spa_impl.h> 35#include <sys/zfeature.h> 36 37SYSCTL_DECL(_vfs_zfs); 38SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab"); 39 40/* 41 * Allow allocations to switch to gang blocks quickly. We do this to 42 * avoid having to load lots of space_maps in a given txg. There are, 43 * however, some cases where we want to avoid "fast" ganging and instead 44 * we want to do an exhaustive search of all metaslabs on this device. 45 * Currently we don't allow any gang, slog, or dump device related allocations 46 * to "fast" gang. 47 */ 48#define CAN_FASTGANG(flags) \ 49 (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \ 50 METASLAB_GANG_AVOID))) 51 52#define METASLAB_WEIGHT_PRIMARY (1ULL << 63) 53#define METASLAB_WEIGHT_SECONDARY (1ULL << 62) 54#define METASLAB_ACTIVE_MASK \ 55 (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) 56 57uint64_t metaslab_aliquot = 512ULL << 10; 58uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ 59TUNABLE_QUAD("vfs.zfs.metaslab.gang_bang", &metaslab_gang_bang); 60SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, gang_bang, CTLFLAG_RWTUN, 61 &metaslab_gang_bang, 0, 62 "Force gang block allocation for blocks larger than or equal to this value"); 63 64/* 65 * The in-core space map representation is more compact than its on-disk form. 66 * The zfs_condense_pct determines how much more compact the in-core 67 * space_map representation must be before we compact it on-disk. 68 * Values should be greater than or equal to 100. 69 */ 70int zfs_condense_pct = 200; 71TUNABLE_INT("vfs.zfs.condense_pct", &zfs_condense_pct); 72SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN, 73 &zfs_condense_pct, 0, 74 "Condense on-disk spacemap when it is more than this many percents" 75 " of in-memory counterpart"); 76 77/* 78 * Condensing a metaslab is not guaranteed to actually reduce the amount of 79 * space used on disk. In particular, a space map uses data in increments of 80 * MAX(1 << ashift, SPACE_MAP_INITIAL_BLOCKSIZE), so a metaslab might use the 81 * same number of blocks after condensing. Since the goal of condensing is to 82 * reduce the number of IOPs required to read the space map, we only want to 83 * condense when we can be sure we will reduce the number of blocks used by the 84 * space map. Unfortunately, we cannot precisely compute whether or not this is 85 * the case in metaslab_should_condense since we are holding ms_lock. Instead, 86 * we apply the following heuristic: do not condense a spacemap unless the 87 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold 88 * blocks. 89 */ 90int zfs_metaslab_condense_block_threshold = 4; 91 92/* 93 * The zfs_mg_noalloc_threshold defines which metaslab groups should 94 * be eligible for allocation. The value is defined as a percentage of 95 * free space. Metaslab groups that have more free space than 96 * zfs_mg_noalloc_threshold are always eligible for allocations. Once 97 * a metaslab group's free space is less than or equal to the 98 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that 99 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold. 100 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all 101 * groups are allowed to accept allocations. Gang blocks are always 102 * eligible to allocate on any metaslab group. The default value of 0 means 103 * no metaslab group will be excluded based on this criterion. 104 */ 105int zfs_mg_noalloc_threshold = 0; 106TUNABLE_INT("vfs.zfs.mg_noalloc_threshold", &zfs_mg_noalloc_threshold); 107SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN, 108 &zfs_mg_noalloc_threshold, 0, 109 "Percentage of metaslab group size that should be free" 110 " to make it eligible for allocation"); 111 112/* 113 * Metaslab groups are considered eligible for allocations if their 114 * fragmenation metric (measured as a percentage) is less than or equal to 115 * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold 116 * then it will be skipped unless all metaslab groups within the metaslab 117 * class have also crossed this threshold. 118 */ 119int zfs_mg_fragmentation_threshold = 85; 120 121/* 122 * Allow metaslabs to keep their active state as long as their fragmentation 123 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An 124 * active metaslab that exceeds this threshold will no longer keep its active 125 * status allowing better metaslabs to be selected. 126 */ 127int zfs_metaslab_fragmentation_threshold = 70; 128 129/* 130 * When set will load all metaslabs when pool is first opened. 131 */ 132int metaslab_debug_load = 0; 133TUNABLE_INT("vfs.zfs.metaslab.debug_load", &metaslab_debug_load); 134SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_load, CTLFLAG_RWTUN, 135 &metaslab_debug_load, 0, 136 "Load all metaslabs when pool is first opened"); 137 138/* 139 * When set will prevent metaslabs from being unloaded. 140 */ 141int metaslab_debug_unload = 0; 142TUNABLE_INT("vfs.zfs.metaslab.debug_unload", &metaslab_debug_unload); 143SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_unload, CTLFLAG_RWTUN, 144 &metaslab_debug_unload, 0, 145 "Prevent metaslabs from being unloaded"); 146 147/* 148 * Minimum size which forces the dynamic allocator to change 149 * it's allocation strategy. Once the space map cannot satisfy 150 * an allocation of this size then it switches to using more 151 * aggressive strategy (i.e search by size rather than offset). 152 */ 153uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE; 154TUNABLE_QUAD("vfs.zfs.metaslab.df_alloc_threshold", 155 &metaslab_df_alloc_threshold); 156SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN, 157 &metaslab_df_alloc_threshold, 0, 158 "Minimum size which forces the dynamic allocator to change it's allocation strategy"); 159 160/* 161 * The minimum free space, in percent, which must be available 162 * in a space map to continue allocations in a first-fit fashion. 163 * Once the space_map's free space drops below this level we dynamically 164 * switch to using best-fit allocations. 165 */ 166int metaslab_df_free_pct = 4; 167TUNABLE_INT("vfs.zfs.metaslab.df_free_pct", &metaslab_df_free_pct); 168SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN, 169 &metaslab_df_free_pct, 0, 170 "The minimum free space, in percent, which must be available in a space map to continue allocations in a first-fit fashion"); 171 172/* 173 * A metaslab is considered "free" if it contains a contiguous 174 * segment which is greater than metaslab_min_alloc_size. 175 */ 176uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS; 177TUNABLE_QUAD("vfs.zfs.metaslab.min_alloc_size", 178 &metaslab_min_alloc_size); 179SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, min_alloc_size, CTLFLAG_RWTUN, 180 &metaslab_min_alloc_size, 0, 181 "A metaslab is considered \"free\" if it contains a contiguous segment which is greater than vfs.zfs.metaslab.min_alloc_size"); 182 183/* 184 * Percentage of all cpus that can be used by the metaslab taskq. 185 */ 186int metaslab_load_pct = 50; 187TUNABLE_INT("vfs.zfs.metaslab.load_pct", &metaslab_load_pct); 188SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN, 189 &metaslab_load_pct, 0, 190 "Percentage of cpus that can be used by the metaslab taskq"); 191 192/* 193 * Determines how many txgs a metaslab may remain loaded without having any 194 * allocations from it. As long as a metaslab continues to be used we will 195 * keep it loaded. 196 */ 197int metaslab_unload_delay = TXG_SIZE * 2; 198TUNABLE_INT("vfs.zfs.metaslab.unload_delay", &metaslab_unload_delay); 199SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN, 200 &metaslab_unload_delay, 0, 201 "Number of TXGs that an unused metaslab can be kept in memory"); 202 203/* 204 * Max number of metaslabs per group to preload. 205 */ 206int metaslab_preload_limit = SPA_DVAS_PER_BP; 207TUNABLE_INT("vfs.zfs.metaslab.preload_limit", &metaslab_preload_limit); 208SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN, 209 &metaslab_preload_limit, 0, 210 "Max number of metaslabs per group to preload"); 211 212/* 213 * Enable/disable preloading of metaslab. 214 */ 215boolean_t metaslab_preload_enabled = B_TRUE; 216TUNABLE_INT("vfs.zfs.metaslab.preload_enabled", &metaslab_preload_enabled); 217SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN, 218 &metaslab_preload_enabled, 0, 219 "Max number of metaslabs per group to preload"); 220 221/* 222 * Enable/disable fragmentation weighting on metaslabs. 223 */ 224boolean_t metaslab_fragmentation_factor_enabled = B_TRUE; 225TUNABLE_INT("vfs.zfs.metaslab_fragmentation_factor_enabled", 226 &metaslab_fragmentation_factor_enabled); 227SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_factor_enabled, CTLFLAG_RWTUN, 228 &metaslab_fragmentation_factor_enabled, 0, 229 "Enable fragmentation weighting on metaslabs"); 230 231/* 232 * Enable/disable lba weighting (i.e. outer tracks are given preference). 233 */ 234boolean_t metaslab_lba_weighting_enabled = B_TRUE; 235TUNABLE_INT("vfs.zfs.metaslab.lba_weighting_enabled", 236 &metaslab_lba_weighting_enabled); 237SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, lba_weighting_enabled, CTLFLAG_RWTUN, 238 &metaslab_lba_weighting_enabled, 0, 239 "Enable LBA weighting (i.e. outer tracks are given preference)"); 240 241/* 242 * Enable/disable metaslab group biasing. 243 */ 244boolean_t metaslab_bias_enabled = B_TRUE; 245TUNABLE_INT("vfs.zfs.metaslab.bias_enabled", 246 &metaslab_bias_enabled); 247SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, bias_enabled, CTLFLAG_RWTUN, 248 &metaslab_bias_enabled, 0, 249 "Enable metaslab group biasing"); 250 251static uint64_t metaslab_fragmentation(metaslab_t *); 252 253/* 254 * ========================================================================== 255 * Metaslab classes 256 * ========================================================================== 257 */ 258metaslab_class_t * 259metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) 260{ 261 metaslab_class_t *mc; 262 263 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); 264 265 mc->mc_spa = spa; 266 mc->mc_rotor = NULL; 267 mc->mc_ops = ops; 268 269 return (mc); 270} 271 272void 273metaslab_class_destroy(metaslab_class_t *mc) 274{ 275 ASSERT(mc->mc_rotor == NULL); 276 ASSERT(mc->mc_alloc == 0); 277 ASSERT(mc->mc_deferred == 0); 278 ASSERT(mc->mc_space == 0); 279 ASSERT(mc->mc_dspace == 0); 280 281 kmem_free(mc, sizeof (metaslab_class_t)); 282} 283 284int 285metaslab_class_validate(metaslab_class_t *mc) 286{ 287 metaslab_group_t *mg; 288 vdev_t *vd; 289 290 /* 291 * Must hold one of the spa_config locks. 292 */ 293 ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || 294 spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); 295 296 if ((mg = mc->mc_rotor) == NULL) 297 return (0); 298 299 do { 300 vd = mg->mg_vd; 301 ASSERT(vd->vdev_mg != NULL); 302 ASSERT3P(vd->vdev_top, ==, vd); 303 ASSERT3P(mg->mg_class, ==, mc); 304 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); 305 } while ((mg = mg->mg_next) != mc->mc_rotor); 306 307 return (0); 308} 309 310void 311metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, 312 int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) 313{ 314 atomic_add_64(&mc->mc_alloc, alloc_delta); 315 atomic_add_64(&mc->mc_deferred, defer_delta); 316 atomic_add_64(&mc->mc_space, space_delta); 317 atomic_add_64(&mc->mc_dspace, dspace_delta); 318} 319 320void 321metaslab_class_minblocksize_update(metaslab_class_t *mc) 322{ 323 metaslab_group_t *mg; 324 vdev_t *vd; 325 uint64_t minashift = UINT64_MAX; 326 327 if ((mg = mc->mc_rotor) == NULL) { 328 mc->mc_minblocksize = SPA_MINBLOCKSIZE; 329 return; 330 } 331 332 do { 333 vd = mg->mg_vd; 334 if (vd->vdev_ashift < minashift) 335 minashift = vd->vdev_ashift; 336 } while ((mg = mg->mg_next) != mc->mc_rotor); 337 338 mc->mc_minblocksize = 1ULL << minashift; 339} 340 341uint64_t 342metaslab_class_get_alloc(metaslab_class_t *mc) 343{ 344 return (mc->mc_alloc); 345} 346 347uint64_t 348metaslab_class_get_deferred(metaslab_class_t *mc) 349{ 350 return (mc->mc_deferred); 351} 352 353uint64_t 354metaslab_class_get_space(metaslab_class_t *mc) 355{ 356 return (mc->mc_space); 357} 358 359uint64_t 360metaslab_class_get_dspace(metaslab_class_t *mc) 361{ 362 return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); 363} 364 365uint64_t 366metaslab_class_get_minblocksize(metaslab_class_t *mc) 367{ 368 return (mc->mc_minblocksize); 369} 370 371void 372metaslab_class_histogram_verify(metaslab_class_t *mc) 373{ 374 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 375 uint64_t *mc_hist; 376 int i; 377 378 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) 379 return; 380 381 mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, 382 KM_SLEEP); 383 384 for (int c = 0; c < rvd->vdev_children; c++) { 385 vdev_t *tvd = rvd->vdev_child[c]; 386 metaslab_group_t *mg = tvd->vdev_mg; 387 388 /* 389 * Skip any holes, uninitialized top-levels, or 390 * vdevs that are not in this metalab class. 391 */ 392 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || 393 mg->mg_class != mc) { 394 continue; 395 } 396 397 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 398 mc_hist[i] += mg->mg_histogram[i]; 399 } 400 401 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 402 VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]); 403 404 kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); 405} 406 407/* 408 * Calculate the metaslab class's fragmentation metric. The metric 409 * is weighted based on the space contribution of each metaslab group. 410 * The return value will be a number between 0 and 100 (inclusive), or 411 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the 412 * zfs_frag_table for more information about the metric. 413 */ 414uint64_t 415metaslab_class_fragmentation(metaslab_class_t *mc) 416{ 417 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 418 uint64_t fragmentation = 0; 419 420 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); 421 422 for (int c = 0; c < rvd->vdev_children; c++) { 423 vdev_t *tvd = rvd->vdev_child[c]; 424 metaslab_group_t *mg = tvd->vdev_mg; 425 426 /* 427 * Skip any holes, uninitialized top-levels, or 428 * vdevs that are not in this metalab class. 429 */ 430 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || 431 mg->mg_class != mc) { 432 continue; 433 } 434 435 /* 436 * If a metaslab group does not contain a fragmentation 437 * metric then just bail out. 438 */ 439 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { 440 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 441 return (ZFS_FRAG_INVALID); 442 } 443 444 /* 445 * Determine how much this metaslab_group is contributing 446 * to the overall pool fragmentation metric. 447 */ 448 fragmentation += mg->mg_fragmentation * 449 metaslab_group_get_space(mg); 450 } 451 fragmentation /= metaslab_class_get_space(mc); 452 453 ASSERT3U(fragmentation, <=, 100); 454 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 455 return (fragmentation); 456} 457 458/* 459 * Calculate the amount of expandable space that is available in 460 * this metaslab class. If a device is expanded then its expandable 461 * space will be the amount of allocatable space that is currently not 462 * part of this metaslab class. 463 */ 464uint64_t 465metaslab_class_expandable_space(metaslab_class_t *mc) 466{ 467 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 468 uint64_t space = 0; 469 470 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); 471 for (int c = 0; c < rvd->vdev_children; c++) { 472 vdev_t *tvd = rvd->vdev_child[c]; 473 metaslab_group_t *mg = tvd->vdev_mg; 474 475 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || 476 mg->mg_class != mc) { 477 continue; 478 } 479 480 space += tvd->vdev_max_asize - tvd->vdev_asize; 481 } 482 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 483 return (space); 484} 485 486/* 487 * ========================================================================== 488 * Metaslab groups 489 * ========================================================================== 490 */ 491static int 492metaslab_compare(const void *x1, const void *x2) 493{ 494 const metaslab_t *m1 = x1; 495 const metaslab_t *m2 = x2; 496 497 if (m1->ms_weight < m2->ms_weight) 498 return (1); 499 if (m1->ms_weight > m2->ms_weight) 500 return (-1); 501 502 /* 503 * If the weights are identical, use the offset to force uniqueness. 504 */ 505 if (m1->ms_start < m2->ms_start) 506 return (-1); 507 if (m1->ms_start > m2->ms_start) 508 return (1); 509 510 ASSERT3P(m1, ==, m2); 511 512 return (0); 513} 514 515/* 516 * Update the allocatable flag and the metaslab group's capacity. 517 * The allocatable flag is set to true if the capacity is below 518 * the zfs_mg_noalloc_threshold. If a metaslab group transitions 519 * from allocatable to non-allocatable or vice versa then the metaslab 520 * group's class is updated to reflect the transition. 521 */ 522static void 523metaslab_group_alloc_update(metaslab_group_t *mg) 524{ 525 vdev_t *vd = mg->mg_vd; 526 metaslab_class_t *mc = mg->mg_class; 527 vdev_stat_t *vs = &vd->vdev_stat; 528 boolean_t was_allocatable; 529 530 ASSERT(vd == vd->vdev_top); 531 532 mutex_enter(&mg->mg_lock); 533 was_allocatable = mg->mg_allocatable; 534 535 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / 536 (vs->vs_space + 1); 537 538 /* 539 * A metaslab group is considered allocatable if it has plenty 540 * of free space or is not heavily fragmented. We only take 541 * fragmentation into account if the metaslab group has a valid 542 * fragmentation metric (i.e. a value between 0 and 100). 543 */ 544 mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold && 545 (mg->mg_fragmentation == ZFS_FRAG_INVALID || 546 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); 547 548 /* 549 * The mc_alloc_groups maintains a count of the number of 550 * groups in this metaslab class that are still above the 551 * zfs_mg_noalloc_threshold. This is used by the allocating 552 * threads to determine if they should avoid allocations to 553 * a given group. The allocator will avoid allocations to a group 554 * if that group has reached or is below the zfs_mg_noalloc_threshold 555 * and there are still other groups that are above the threshold. 556 * When a group transitions from allocatable to non-allocatable or 557 * vice versa we update the metaslab class to reflect that change. 558 * When the mc_alloc_groups value drops to 0 that means that all 559 * groups have reached the zfs_mg_noalloc_threshold making all groups 560 * eligible for allocations. This effectively means that all devices 561 * are balanced again. 562 */ 563 if (was_allocatable && !mg->mg_allocatable) 564 mc->mc_alloc_groups--; 565 else if (!was_allocatable && mg->mg_allocatable) 566 mc->mc_alloc_groups++; 567 568 mutex_exit(&mg->mg_lock); 569} 570 571metaslab_group_t * 572metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) 573{ 574 metaslab_group_t *mg; 575 576 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); 577 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); 578 avl_create(&mg->mg_metaslab_tree, metaslab_compare, 579 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); 580 mg->mg_vd = vd; 581 mg->mg_class = mc; 582 mg->mg_activation_count = 0; 583 584 mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, 585 minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT); 586 587 return (mg); 588} 589 590void 591metaslab_group_destroy(metaslab_group_t *mg) 592{ 593 ASSERT(mg->mg_prev == NULL); 594 ASSERT(mg->mg_next == NULL); 595 /* 596 * We may have gone below zero with the activation count 597 * either because we never activated in the first place or 598 * because we're done, and possibly removing the vdev. 599 */ 600 ASSERT(mg->mg_activation_count <= 0); 601 602 taskq_destroy(mg->mg_taskq); 603 avl_destroy(&mg->mg_metaslab_tree); 604 mutex_destroy(&mg->mg_lock); 605 kmem_free(mg, sizeof (metaslab_group_t)); 606} 607 608void 609metaslab_group_activate(metaslab_group_t *mg) 610{ 611 metaslab_class_t *mc = mg->mg_class; 612 metaslab_group_t *mgprev, *mgnext; 613 614 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); 615 616 ASSERT(mc->mc_rotor != mg); 617 ASSERT(mg->mg_prev == NULL); 618 ASSERT(mg->mg_next == NULL); 619 ASSERT(mg->mg_activation_count <= 0); 620 621 if (++mg->mg_activation_count <= 0) 622 return; 623 624 mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); 625 metaslab_group_alloc_update(mg); 626 627 if ((mgprev = mc->mc_rotor) == NULL) { 628 mg->mg_prev = mg; 629 mg->mg_next = mg; 630 } else { 631 mgnext = mgprev->mg_next; 632 mg->mg_prev = mgprev; 633 mg->mg_next = mgnext; 634 mgprev->mg_next = mg; 635 mgnext->mg_prev = mg; 636 } 637 mc->mc_rotor = mg; 638 metaslab_class_minblocksize_update(mc); 639} 640 641void 642metaslab_group_passivate(metaslab_group_t *mg) 643{ 644 metaslab_class_t *mc = mg->mg_class; 645 metaslab_group_t *mgprev, *mgnext; 646 647 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); 648 649 if (--mg->mg_activation_count != 0) { 650 ASSERT(mc->mc_rotor != mg); 651 ASSERT(mg->mg_prev == NULL); 652 ASSERT(mg->mg_next == NULL); 653 ASSERT(mg->mg_activation_count < 0); 654 return; 655 } 656 657 taskq_wait(mg->mg_taskq); 658 metaslab_group_alloc_update(mg); 659 660 mgprev = mg->mg_prev; 661 mgnext = mg->mg_next; 662 663 if (mg == mgnext) { 664 mc->mc_rotor = NULL; 665 } else { 666 mc->mc_rotor = mgnext; 667 mgprev->mg_next = mgnext; 668 mgnext->mg_prev = mgprev; 669 } 670 671 mg->mg_prev = NULL; 672 mg->mg_next = NULL; 673 metaslab_class_minblocksize_update(mc); 674} 675 676uint64_t 677metaslab_group_get_space(metaslab_group_t *mg) 678{ 679 return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count); 680} 681 682void 683metaslab_group_histogram_verify(metaslab_group_t *mg) 684{ 685 uint64_t *mg_hist; 686 vdev_t *vd = mg->mg_vd; 687 uint64_t ashift = vd->vdev_ashift; 688 int i; 689 690 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) 691 return; 692 693 mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, 694 KM_SLEEP); 695 696 ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=, 697 SPACE_MAP_HISTOGRAM_SIZE + ashift); 698 699 for (int m = 0; m < vd->vdev_ms_count; m++) { 700 metaslab_t *msp = vd->vdev_ms[m]; 701 702 if (msp->ms_sm == NULL) 703 continue; 704 705 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) 706 mg_hist[i + ashift] += 707 msp->ms_sm->sm_phys->smp_histogram[i]; 708 } 709 710 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++) 711 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); 712 713 kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); 714} 715 716static void 717metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp) 718{ 719 metaslab_class_t *mc = mg->mg_class; 720 uint64_t ashift = mg->mg_vd->vdev_ashift; 721 722 ASSERT(MUTEX_HELD(&msp->ms_lock)); 723 if (msp->ms_sm == NULL) 724 return; 725 726 mutex_enter(&mg->mg_lock); 727 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 728 mg->mg_histogram[i + ashift] += 729 msp->ms_sm->sm_phys->smp_histogram[i]; 730 mc->mc_histogram[i + ashift] += 731 msp->ms_sm->sm_phys->smp_histogram[i]; 732 } 733 mutex_exit(&mg->mg_lock); 734} 735 736void 737metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) 738{ 739 metaslab_class_t *mc = mg->mg_class; 740 uint64_t ashift = mg->mg_vd->vdev_ashift; 741 742 ASSERT(MUTEX_HELD(&msp->ms_lock)); 743 if (msp->ms_sm == NULL) 744 return; 745 746 mutex_enter(&mg->mg_lock); 747 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 748 ASSERT3U(mg->mg_histogram[i + ashift], >=, 749 msp->ms_sm->sm_phys->smp_histogram[i]); 750 ASSERT3U(mc->mc_histogram[i + ashift], >=, 751 msp->ms_sm->sm_phys->smp_histogram[i]); 752 753 mg->mg_histogram[i + ashift] -= 754 msp->ms_sm->sm_phys->smp_histogram[i]; 755 mc->mc_histogram[i + ashift] -= 756 msp->ms_sm->sm_phys->smp_histogram[i]; 757 } 758 mutex_exit(&mg->mg_lock); 759} 760 761static void 762metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) 763{ 764 ASSERT(msp->ms_group == NULL); 765 mutex_enter(&mg->mg_lock); 766 msp->ms_group = mg; 767 msp->ms_weight = 0; 768 avl_add(&mg->mg_metaslab_tree, msp); 769 mutex_exit(&mg->mg_lock); 770 771 mutex_enter(&msp->ms_lock); 772 metaslab_group_histogram_add(mg, msp); 773 mutex_exit(&msp->ms_lock); 774} 775 776static void 777metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) 778{ 779 mutex_enter(&msp->ms_lock); 780 metaslab_group_histogram_remove(mg, msp); 781 mutex_exit(&msp->ms_lock); 782 783 mutex_enter(&mg->mg_lock); 784 ASSERT(msp->ms_group == mg); 785 avl_remove(&mg->mg_metaslab_tree, msp); 786 msp->ms_group = NULL; 787 mutex_exit(&mg->mg_lock); 788} 789 790static void 791metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) 792{ 793 /* 794 * Although in principle the weight can be any value, in 795 * practice we do not use values in the range [1, 511]. 796 */ 797 ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0); 798 ASSERT(MUTEX_HELD(&msp->ms_lock)); 799 800 mutex_enter(&mg->mg_lock); 801 ASSERT(msp->ms_group == mg); 802 avl_remove(&mg->mg_metaslab_tree, msp); 803 msp->ms_weight = weight; 804 avl_add(&mg->mg_metaslab_tree, msp); 805 mutex_exit(&mg->mg_lock); 806} 807 808/* 809 * Calculate the fragmentation for a given metaslab group. We can use 810 * a simple average here since all metaslabs within the group must have 811 * the same size. The return value will be a value between 0 and 100 812 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this 813 * group have a fragmentation metric. 814 */ 815uint64_t 816metaslab_group_fragmentation(metaslab_group_t *mg) 817{ 818 vdev_t *vd = mg->mg_vd; 819 uint64_t fragmentation = 0; 820 uint64_t valid_ms = 0; 821 822 for (int m = 0; m < vd->vdev_ms_count; m++) { 823 metaslab_t *msp = vd->vdev_ms[m]; 824 825 if (msp->ms_fragmentation == ZFS_FRAG_INVALID) 826 continue; 827 828 valid_ms++; 829 fragmentation += msp->ms_fragmentation; 830 } 831 832 if (valid_ms <= vd->vdev_ms_count / 2) 833 return (ZFS_FRAG_INVALID); 834 835 fragmentation /= valid_ms; 836 ASSERT3U(fragmentation, <=, 100); 837 return (fragmentation); 838} 839 840/* 841 * Determine if a given metaslab group should skip allocations. A metaslab 842 * group should avoid allocations if its free capacity is less than the 843 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than 844 * zfs_mg_fragmentation_threshold and there is at least one metaslab group 845 * that can still handle allocations. 846 */ 847static boolean_t 848metaslab_group_allocatable(metaslab_group_t *mg) 849{ 850 vdev_t *vd = mg->mg_vd; 851 spa_t *spa = vd->vdev_spa; 852 metaslab_class_t *mc = mg->mg_class; 853 854 /* 855 * We use two key metrics to determine if a metaslab group is 856 * considered allocatable -- free space and fragmentation. If 857 * the free space is greater than the free space threshold and 858 * the fragmentation is less than the fragmentation threshold then 859 * consider the group allocatable. There are two case when we will 860 * not consider these key metrics. The first is if the group is 861 * associated with a slog device and the second is if all groups 862 * in this metaslab class have already been consider ineligible 863 * for allocations. 864 */ 865 return ((mg->mg_free_capacity > zfs_mg_noalloc_threshold && 866 (mg->mg_fragmentation == ZFS_FRAG_INVALID || 867 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)) || 868 mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0); 869} 870 871/* 872 * ========================================================================== 873 * Range tree callbacks 874 * ========================================================================== 875 */ 876 877/* 878 * Comparison function for the private size-ordered tree. Tree is sorted 879 * by size, larger sizes at the end of the tree. 880 */ 881static int 882metaslab_rangesize_compare(const void *x1, const void *x2) 883{ 884 const range_seg_t *r1 = x1; 885 const range_seg_t *r2 = x2; 886 uint64_t rs_size1 = r1->rs_end - r1->rs_start; 887 uint64_t rs_size2 = r2->rs_end - r2->rs_start; 888 889 if (rs_size1 < rs_size2) 890 return (-1); 891 if (rs_size1 > rs_size2) 892 return (1); 893 894 if (r1->rs_start < r2->rs_start) 895 return (-1); 896 897 if (r1->rs_start > r2->rs_start) 898 return (1); 899 900 return (0); 901} 902 903/* 904 * Create any block allocator specific components. The current allocators 905 * rely on using both a size-ordered range_tree_t and an array of uint64_t's. 906 */ 907static void 908metaslab_rt_create(range_tree_t *rt, void *arg) 909{ 910 metaslab_t *msp = arg; 911 912 ASSERT3P(rt->rt_arg, ==, msp); 913 ASSERT(msp->ms_tree == NULL); 914 915 avl_create(&msp->ms_size_tree, metaslab_rangesize_compare, 916 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); 917} 918 919/* 920 * Destroy the block allocator specific components. 921 */ 922static void 923metaslab_rt_destroy(range_tree_t *rt, void *arg) 924{ 925 metaslab_t *msp = arg; 926 927 ASSERT3P(rt->rt_arg, ==, msp); 928 ASSERT3P(msp->ms_tree, ==, rt); 929 ASSERT0(avl_numnodes(&msp->ms_size_tree)); 930 931 avl_destroy(&msp->ms_size_tree); 932} 933 934static void 935metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg) 936{ 937 metaslab_t *msp = arg; 938 939 ASSERT3P(rt->rt_arg, ==, msp); 940 ASSERT3P(msp->ms_tree, ==, rt); 941 VERIFY(!msp->ms_condensing); 942 avl_add(&msp->ms_size_tree, rs); 943} 944 945static void 946metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg) 947{ 948 metaslab_t *msp = arg; 949 950 ASSERT3P(rt->rt_arg, ==, msp); 951 ASSERT3P(msp->ms_tree, ==, rt); 952 VERIFY(!msp->ms_condensing); 953 avl_remove(&msp->ms_size_tree, rs); 954} 955 956static void 957metaslab_rt_vacate(range_tree_t *rt, void *arg) 958{ 959 metaslab_t *msp = arg; 960 961 ASSERT3P(rt->rt_arg, ==, msp); 962 ASSERT3P(msp->ms_tree, ==, rt); 963 964 /* 965 * Normally one would walk the tree freeing nodes along the way. 966 * Since the nodes are shared with the range trees we can avoid 967 * walking all nodes and just reinitialize the avl tree. The nodes 968 * will be freed by the range tree, so we don't want to free them here. 969 */ 970 avl_create(&msp->ms_size_tree, metaslab_rangesize_compare, 971 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); 972} 973 974static range_tree_ops_t metaslab_rt_ops = { 975 metaslab_rt_create, 976 metaslab_rt_destroy, 977 metaslab_rt_add, 978 metaslab_rt_remove, 979 metaslab_rt_vacate 980}; 981 982/* 983 * ========================================================================== 984 * Metaslab block operations 985 * ========================================================================== 986 */ 987 988/* 989 * Return the maximum contiguous segment within the metaslab. 990 */ 991uint64_t 992metaslab_block_maxsize(metaslab_t *msp) 993{ 994 avl_tree_t *t = &msp->ms_size_tree; 995 range_seg_t *rs; 996 997 if (t == NULL || (rs = avl_last(t)) == NULL) 998 return (0ULL); 999 1000 return (rs->rs_end - rs->rs_start); 1001} 1002 1003uint64_t 1004metaslab_block_alloc(metaslab_t *msp, uint64_t size) 1005{ 1006 uint64_t start; 1007 range_tree_t *rt = msp->ms_tree; 1008 1009 VERIFY(!msp->ms_condensing); 1010 1011 start = msp->ms_ops->msop_alloc(msp, size); 1012 if (start != -1ULL) { 1013 vdev_t *vd = msp->ms_group->mg_vd; 1014 1015 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); 1016 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 1017 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); 1018 range_tree_remove(rt, start, size); 1019 } 1020 return (start); 1021} 1022 1023/* 1024 * ========================================================================== 1025 * Common allocator routines 1026 * ========================================================================== 1027 */ 1028 1029/* 1030 * This is a helper function that can be used by the allocator to find 1031 * a suitable block to allocate. This will search the specified AVL 1032 * tree looking for a block that matches the specified criteria. 1033 */ 1034static uint64_t 1035metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, 1036 uint64_t align) 1037{ 1038 range_seg_t *rs, rsearch; 1039 avl_index_t where; 1040 1041 rsearch.rs_start = *cursor; 1042 rsearch.rs_end = *cursor + size; 1043 1044 rs = avl_find(t, &rsearch, &where); 1045 if (rs == NULL) 1046 rs = avl_nearest(t, where, AVL_AFTER); 1047 1048 while (rs != NULL) { 1049 uint64_t offset = P2ROUNDUP(rs->rs_start, align); 1050 1051 if (offset + size <= rs->rs_end) { 1052 *cursor = offset + size; 1053 return (offset); 1054 } 1055 rs = AVL_NEXT(t, rs); 1056 } 1057 1058 /* 1059 * If we know we've searched the whole map (*cursor == 0), give up. 1060 * Otherwise, reset the cursor to the beginning and try again. 1061 */ 1062 if (*cursor == 0) 1063 return (-1ULL); 1064 1065 *cursor = 0; 1066 return (metaslab_block_picker(t, cursor, size, align)); 1067} 1068 1069/* 1070 * ========================================================================== 1071 * The first-fit block allocator 1072 * ========================================================================== 1073 */ 1074static uint64_t 1075metaslab_ff_alloc(metaslab_t *msp, uint64_t size) 1076{ 1077 /* 1078 * Find the largest power of 2 block size that evenly divides the 1079 * requested size. This is used to try to allocate blocks with similar 1080 * alignment from the same area of the metaslab (i.e. same cursor 1081 * bucket) but it does not guarantee that other allocations sizes 1082 * may exist in the same region. 1083 */ 1084 uint64_t align = size & -size; 1085 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; 1086 avl_tree_t *t = &msp->ms_tree->rt_root; 1087 1088 return (metaslab_block_picker(t, cursor, size, align)); 1089} 1090 1091static metaslab_ops_t metaslab_ff_ops = { 1092 metaslab_ff_alloc 1093}; 1094 1095/* 1096 * ========================================================================== 1097 * Dynamic block allocator - 1098 * Uses the first fit allocation scheme until space get low and then 1099 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold 1100 * and metaslab_df_free_pct to determine when to switch the allocation scheme. 1101 * ========================================================================== 1102 */ 1103static uint64_t 1104metaslab_df_alloc(metaslab_t *msp, uint64_t size) 1105{ 1106 /* 1107 * Find the largest power of 2 block size that evenly divides the 1108 * requested size. This is used to try to allocate blocks with similar 1109 * alignment from the same area of the metaslab (i.e. same cursor 1110 * bucket) but it does not guarantee that other allocations sizes 1111 * may exist in the same region. 1112 */ 1113 uint64_t align = size & -size; 1114 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; 1115 range_tree_t *rt = msp->ms_tree; 1116 avl_tree_t *t = &rt->rt_root; 1117 uint64_t max_size = metaslab_block_maxsize(msp); 1118 int free_pct = range_tree_space(rt) * 100 / msp->ms_size; 1119 1120 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1121 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree)); 1122 1123 if (max_size < size) 1124 return (-1ULL); 1125 1126 /* 1127 * If we're running low on space switch to using the size 1128 * sorted AVL tree (best-fit). 1129 */ 1130 if (max_size < metaslab_df_alloc_threshold || 1131 free_pct < metaslab_df_free_pct) { 1132 t = &msp->ms_size_tree; 1133 *cursor = 0; 1134 } 1135 1136 return (metaslab_block_picker(t, cursor, size, 1ULL)); 1137} 1138 1139static metaslab_ops_t metaslab_df_ops = { 1140 metaslab_df_alloc 1141}; 1142 1143/* 1144 * ========================================================================== 1145 * Cursor fit block allocator - 1146 * Select the largest region in the metaslab, set the cursor to the beginning 1147 * of the range and the cursor_end to the end of the range. As allocations 1148 * are made advance the cursor. Continue allocating from the cursor until 1149 * the range is exhausted and then find a new range. 1150 * ========================================================================== 1151 */ 1152static uint64_t 1153metaslab_cf_alloc(metaslab_t *msp, uint64_t size) 1154{ 1155 range_tree_t *rt = msp->ms_tree; 1156 avl_tree_t *t = &msp->ms_size_tree; 1157 uint64_t *cursor = &msp->ms_lbas[0]; 1158 uint64_t *cursor_end = &msp->ms_lbas[1]; 1159 uint64_t offset = 0; 1160 1161 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1162 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root)); 1163 1164 ASSERT3U(*cursor_end, >=, *cursor); 1165 1166 if ((*cursor + size) > *cursor_end) { 1167 range_seg_t *rs; 1168 1169 rs = avl_last(&msp->ms_size_tree); 1170 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) 1171 return (-1ULL); 1172 1173 *cursor = rs->rs_start; 1174 *cursor_end = rs->rs_end; 1175 } 1176 1177 offset = *cursor; 1178 *cursor += size; 1179 1180 return (offset); 1181} 1182 1183static metaslab_ops_t metaslab_cf_ops = { 1184 metaslab_cf_alloc 1185}; 1186 1187/* 1188 * ========================================================================== 1189 * New dynamic fit allocator - 1190 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift 1191 * contiguous blocks. If no region is found then just use the largest segment 1192 * that remains. 1193 * ========================================================================== 1194 */ 1195 1196/* 1197 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift) 1198 * to request from the allocator. 1199 */ 1200uint64_t metaslab_ndf_clump_shift = 4; 1201 1202static uint64_t 1203metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) 1204{ 1205 avl_tree_t *t = &msp->ms_tree->rt_root; 1206 avl_index_t where; 1207 range_seg_t *rs, rsearch; 1208 uint64_t hbit = highbit64(size); 1209 uint64_t *cursor = &msp->ms_lbas[hbit - 1]; 1210 uint64_t max_size = metaslab_block_maxsize(msp); 1211 1212 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1213 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree)); 1214 1215 if (max_size < size) 1216 return (-1ULL); 1217 1218 rsearch.rs_start = *cursor; 1219 rsearch.rs_end = *cursor + size; 1220 1221 rs = avl_find(t, &rsearch, &where); 1222 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) { 1223 t = &msp->ms_size_tree; 1224 1225 rsearch.rs_start = 0; 1226 rsearch.rs_end = MIN(max_size, 1227 1ULL << (hbit + metaslab_ndf_clump_shift)); 1228 rs = avl_find(t, &rsearch, &where); 1229 if (rs == NULL) 1230 rs = avl_nearest(t, where, AVL_AFTER); 1231 ASSERT(rs != NULL); 1232 } 1233 1234 if ((rs->rs_end - rs->rs_start) >= size) { 1235 *cursor = rs->rs_start + size; 1236 return (rs->rs_start); 1237 } 1238 return (-1ULL); 1239} 1240 1241static metaslab_ops_t metaslab_ndf_ops = { 1242 metaslab_ndf_alloc 1243}; 1244 1245metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; 1246 1247/* 1248 * ========================================================================== 1249 * Metaslabs 1250 * ========================================================================== 1251 */ 1252 1253/* 1254 * Wait for any in-progress metaslab loads to complete. 1255 */ 1256void 1257metaslab_load_wait(metaslab_t *msp) 1258{ 1259 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1260 1261 while (msp->ms_loading) { 1262 ASSERT(!msp->ms_loaded); 1263 cv_wait(&msp->ms_load_cv, &msp->ms_lock); 1264 } 1265} 1266 1267int 1268metaslab_load(metaslab_t *msp) 1269{ 1270 int error = 0; 1271 1272 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1273 ASSERT(!msp->ms_loaded); 1274 ASSERT(!msp->ms_loading); 1275 1276 msp->ms_loading = B_TRUE; 1277 1278 /* 1279 * If the space map has not been allocated yet, then treat 1280 * all the space in the metaslab as free and add it to the 1281 * ms_tree. 1282 */ 1283 if (msp->ms_sm != NULL) 1284 error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE); 1285 else 1286 range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size); 1287 1288 msp->ms_loaded = (error == 0); 1289 msp->ms_loading = B_FALSE; 1290 1291 if (msp->ms_loaded) { 1292 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1293 range_tree_walk(msp->ms_defertree[t], 1294 range_tree_remove, msp->ms_tree); 1295 } 1296 } 1297 cv_broadcast(&msp->ms_load_cv); 1298 return (error); 1299} 1300 1301void 1302metaslab_unload(metaslab_t *msp) 1303{ 1304 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1305 range_tree_vacate(msp->ms_tree, NULL, NULL); 1306 msp->ms_loaded = B_FALSE; 1307 msp->ms_weight &= ~METASLAB_ACTIVE_MASK; 1308} 1309 1310metaslab_t * 1311metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg) 1312{ 1313 vdev_t *vd = mg->mg_vd; 1314 objset_t *mos = vd->vdev_spa->spa_meta_objset; 1315 metaslab_t *msp; 1316 1317 msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); 1318 mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL); 1319 cv_init(&msp->ms_load_cv, NULL, CV_DEFAULT, NULL); 1320 msp->ms_id = id; 1321 msp->ms_start = id << vd->vdev_ms_shift; 1322 msp->ms_size = 1ULL << vd->vdev_ms_shift; 1323 1324 /* 1325 * We only open space map objects that already exist. All others 1326 * will be opened when we finally allocate an object for it. 1327 */ 1328 if (object != 0) { 1329 VERIFY0(space_map_open(&msp->ms_sm, mos, object, msp->ms_start, 1330 msp->ms_size, vd->vdev_ashift, &msp->ms_lock)); 1331 ASSERT(msp->ms_sm != NULL); 1332 } 1333 1334 /* 1335 * We create the main range tree here, but we don't create the 1336 * alloctree and freetree until metaslab_sync_done(). This serves 1337 * two purposes: it allows metaslab_sync_done() to detect the 1338 * addition of new space; and for debugging, it ensures that we'd 1339 * data fault on any attempt to use this metaslab before it's ready. 1340 */ 1341 msp->ms_tree = range_tree_create(&metaslab_rt_ops, msp, &msp->ms_lock); 1342 metaslab_group_add(mg, msp); 1343 1344 msp->ms_fragmentation = metaslab_fragmentation(msp); 1345 msp->ms_ops = mg->mg_class->mc_ops; 1346 1347 /* 1348 * If we're opening an existing pool (txg == 0) or creating 1349 * a new one (txg == TXG_INITIAL), all space is available now. 1350 * If we're adding space to an existing pool, the new space 1351 * does not become available until after this txg has synced. 1352 */ 1353 if (txg <= TXG_INITIAL) 1354 metaslab_sync_done(msp, 0); 1355 1356 /* 1357 * If metaslab_debug_load is set and we're initializing a metaslab 1358 * that has an allocated space_map object then load the its space 1359 * map so that can verify frees. 1360 */ 1361 if (metaslab_debug_load && msp->ms_sm != NULL) { 1362 mutex_enter(&msp->ms_lock); 1363 VERIFY0(metaslab_load(msp)); 1364 mutex_exit(&msp->ms_lock); 1365 } 1366 1367 if (txg != 0) { 1368 vdev_dirty(vd, 0, NULL, txg); 1369 vdev_dirty(vd, VDD_METASLAB, msp, txg); 1370 } 1371 1372 return (msp); 1373} 1374 1375void 1376metaslab_fini(metaslab_t *msp) 1377{ 1378 metaslab_group_t *mg = msp->ms_group; 1379 1380 metaslab_group_remove(mg, msp); 1381 1382 mutex_enter(&msp->ms_lock); 1383 1384 VERIFY(msp->ms_group == NULL); 1385 vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm), 1386 0, -msp->ms_size); 1387 space_map_close(msp->ms_sm); 1388 1389 metaslab_unload(msp); 1390 range_tree_destroy(msp->ms_tree); 1391 1392 for (int t = 0; t < TXG_SIZE; t++) { 1393 range_tree_destroy(msp->ms_alloctree[t]); 1394 range_tree_destroy(msp->ms_freetree[t]); 1395 } 1396 1397 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1398 range_tree_destroy(msp->ms_defertree[t]); 1399 } 1400 1401 ASSERT0(msp->ms_deferspace); 1402 1403 mutex_exit(&msp->ms_lock); 1404 cv_destroy(&msp->ms_load_cv); 1405 mutex_destroy(&msp->ms_lock); 1406 1407 kmem_free(msp, sizeof (metaslab_t)); 1408} 1409 1410#define FRAGMENTATION_TABLE_SIZE 17 1411 1412/* 1413 * This table defines a segment size based fragmentation metric that will 1414 * allow each metaslab to derive its own fragmentation value. This is done 1415 * by calculating the space in each bucket of the spacemap histogram and 1416 * multiplying that by the fragmetation metric in this table. Doing 1417 * this for all buckets and dividing it by the total amount of free 1418 * space in this metaslab (i.e. the total free space in all buckets) gives 1419 * us the fragmentation metric. This means that a high fragmentation metric 1420 * equates to most of the free space being comprised of small segments. 1421 * Conversely, if the metric is low, then most of the free space is in 1422 * large segments. A 10% change in fragmentation equates to approximately 1423 * double the number of segments. 1424 * 1425 * This table defines 0% fragmented space using 16MB segments. Testing has 1426 * shown that segments that are greater than or equal to 16MB do not suffer 1427 * from drastic performance problems. Using this value, we derive the rest 1428 * of the table. Since the fragmentation value is never stored on disk, it 1429 * is possible to change these calculations in the future. 1430 */ 1431int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { 1432 100, /* 512B */ 1433 100, /* 1K */ 1434 98, /* 2K */ 1435 95, /* 4K */ 1436 90, /* 8K */ 1437 80, /* 16K */ 1438 70, /* 32K */ 1439 60, /* 64K */ 1440 50, /* 128K */ 1441 40, /* 256K */ 1442 30, /* 512K */ 1443 20, /* 1M */ 1444 15, /* 2M */ 1445 10, /* 4M */ 1446 5, /* 8M */ 1447 0 /* 16M */ 1448}; 1449 1450/* 1451 * Calclate the metaslab's fragmentation metric. A return value 1452 * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does 1453 * not support this metric. Otherwise, the return value should be in the 1454 * range [0, 100]. 1455 */ 1456static uint64_t 1457metaslab_fragmentation(metaslab_t *msp) 1458{ 1459 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1460 uint64_t fragmentation = 0; 1461 uint64_t total = 0; 1462 boolean_t feature_enabled = spa_feature_is_enabled(spa, 1463 SPA_FEATURE_SPACEMAP_HISTOGRAM); 1464 1465 if (!feature_enabled) 1466 return (ZFS_FRAG_INVALID); 1467 1468 /* 1469 * A null space map means that the entire metaslab is free 1470 * and thus is not fragmented. 1471 */ 1472 if (msp->ms_sm == NULL) 1473 return (0); 1474 1475 /* 1476 * If this metaslab's space_map has not been upgraded, flag it 1477 * so that we upgrade next time we encounter it. 1478 */ 1479 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { 1480 uint64_t txg = spa_syncing_txg(spa); 1481 vdev_t *vd = msp->ms_group->mg_vd; 1482 1483 msp->ms_condense_wanted = B_TRUE; 1484 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 1485 spa_dbgmsg(spa, "txg %llu, requesting force condense: " 1486 "msp %p, vd %p", txg, msp, vd); 1487 return (ZFS_FRAG_INVALID); 1488 } 1489 1490 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 1491 uint64_t space = 0; 1492 uint8_t shift = msp->ms_sm->sm_shift; 1493 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, 1494 FRAGMENTATION_TABLE_SIZE - 1); 1495 1496 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) 1497 continue; 1498 1499 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); 1500 total += space; 1501 1502 ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE); 1503 fragmentation += space * zfs_frag_table[idx]; 1504 } 1505 1506 if (total > 0) 1507 fragmentation /= total; 1508 ASSERT3U(fragmentation, <=, 100); 1509 return (fragmentation); 1510} 1511 1512/* 1513 * Compute a weight -- a selection preference value -- for the given metaslab. 1514 * This is based on the amount of free space, the level of fragmentation, 1515 * the LBA range, and whether the metaslab is loaded. 1516 */ 1517static uint64_t 1518metaslab_weight(metaslab_t *msp) 1519{ 1520 metaslab_group_t *mg = msp->ms_group; 1521 vdev_t *vd = mg->mg_vd; 1522 uint64_t weight, space; 1523 1524 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1525 1526 /* 1527 * This vdev is in the process of being removed so there is nothing 1528 * for us to do here. 1529 */ 1530 if (vd->vdev_removing) { 1531 ASSERT0(space_map_allocated(msp->ms_sm)); 1532 ASSERT0(vd->vdev_ms_shift); 1533 return (0); 1534 } 1535 1536 /* 1537 * The baseline weight is the metaslab's free space. 1538 */ 1539 space = msp->ms_size - space_map_allocated(msp->ms_sm); 1540 1541 msp->ms_fragmentation = metaslab_fragmentation(msp); 1542 if (metaslab_fragmentation_factor_enabled && 1543 msp->ms_fragmentation != ZFS_FRAG_INVALID) { 1544 /* 1545 * Use the fragmentation information to inversely scale 1546 * down the baseline weight. We need to ensure that we 1547 * don't exclude this metaslab completely when it's 100% 1548 * fragmented. To avoid this we reduce the fragmented value 1549 * by 1. 1550 */ 1551 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100; 1552 1553 /* 1554 * If space < SPA_MINBLOCKSIZE, then we will not allocate from 1555 * this metaslab again. The fragmentation metric may have 1556 * decreased the space to something smaller than 1557 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE 1558 * so that we can consume any remaining space. 1559 */ 1560 if (space > 0 && space < SPA_MINBLOCKSIZE) 1561 space = SPA_MINBLOCKSIZE; 1562 } 1563 weight = space; 1564 1565 /* 1566 * Modern disks have uniform bit density and constant angular velocity. 1567 * Therefore, the outer recording zones are faster (higher bandwidth) 1568 * than the inner zones by the ratio of outer to inner track diameter, 1569 * which is typically around 2:1. We account for this by assigning 1570 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). 1571 * In effect, this means that we'll select the metaslab with the most 1572 * free bandwidth rather than simply the one with the most free space. 1573 */ 1574 if (metaslab_lba_weighting_enabled) { 1575 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; 1576 ASSERT(weight >= space && weight <= 2 * space); 1577 } 1578 1579 /* 1580 * If this metaslab is one we're actively using, adjust its 1581 * weight to make it preferable to any inactive metaslab so 1582 * we'll polish it off. If the fragmentation on this metaslab 1583 * has exceed our threshold, then don't mark it active. 1584 */ 1585 if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID && 1586 msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) { 1587 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); 1588 } 1589 1590 return (weight); 1591} 1592 1593static int 1594metaslab_activate(metaslab_t *msp, uint64_t activation_weight) 1595{ 1596 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1597 1598 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { 1599 metaslab_load_wait(msp); 1600 if (!msp->ms_loaded) { 1601 int error = metaslab_load(msp); 1602 if (error) { 1603 metaslab_group_sort(msp->ms_group, msp, 0); 1604 return (error); 1605 } 1606 } 1607 1608 metaslab_group_sort(msp->ms_group, msp, 1609 msp->ms_weight | activation_weight); 1610 } 1611 ASSERT(msp->ms_loaded); 1612 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); 1613 1614 return (0); 1615} 1616 1617static void 1618metaslab_passivate(metaslab_t *msp, uint64_t size) 1619{ 1620 /* 1621 * If size < SPA_MINBLOCKSIZE, then we will not allocate from 1622 * this metaslab again. In that case, it had better be empty, 1623 * or we would be leaving space on the table. 1624 */ 1625 ASSERT(size >= SPA_MINBLOCKSIZE || range_tree_space(msp->ms_tree) == 0); 1626 metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size)); 1627 ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); 1628} 1629 1630static void 1631metaslab_preload(void *arg) 1632{ 1633 metaslab_t *msp = arg; 1634 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1635 1636 ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); 1637 1638 mutex_enter(&msp->ms_lock); 1639 metaslab_load_wait(msp); 1640 if (!msp->ms_loaded) 1641 (void) metaslab_load(msp); 1642 1643 /* 1644 * Set the ms_access_txg value so that we don't unload it right away. 1645 */ 1646 msp->ms_access_txg = spa_syncing_txg(spa) + metaslab_unload_delay + 1; 1647 mutex_exit(&msp->ms_lock); 1648} 1649 1650static void 1651metaslab_group_preload(metaslab_group_t *mg) 1652{ 1653 spa_t *spa = mg->mg_vd->vdev_spa; 1654 metaslab_t *msp; 1655 avl_tree_t *t = &mg->mg_metaslab_tree; 1656 int m = 0; 1657 1658 if (spa_shutting_down(spa) || !metaslab_preload_enabled) { 1659 taskq_wait(mg->mg_taskq); 1660 return; 1661 } 1662 1663 mutex_enter(&mg->mg_lock); 1664 /* 1665 * Load the next potential metaslabs 1666 */ 1667 msp = avl_first(t); 1668 while (msp != NULL) { 1669 metaslab_t *msp_next = AVL_NEXT(t, msp); 1670 1671 /* 1672 * We preload only the maximum number of metaslabs specified 1673 * by metaslab_preload_limit. If a metaslab is being forced 1674 * to condense then we preload it too. This will ensure 1675 * that force condensing happens in the next txg. 1676 */ 1677 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { 1678 msp = msp_next; 1679 continue; 1680 } 1681 1682 /* 1683 * We must drop the metaslab group lock here to preserve 1684 * lock ordering with the ms_lock (when grabbing both 1685 * the mg_lock and the ms_lock, the ms_lock must be taken 1686 * first). As a result, it is possible that the ordering 1687 * of the metaslabs within the avl tree may change before 1688 * we reacquire the lock. The metaslab cannot be removed from 1689 * the tree while we're in syncing context so it is safe to 1690 * drop the mg_lock here. If the metaslabs are reordered 1691 * nothing will break -- we just may end up loading a 1692 * less than optimal one. 1693 */ 1694 mutex_exit(&mg->mg_lock); 1695 VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload, 1696 msp, TQ_SLEEP) != 0); 1697 mutex_enter(&mg->mg_lock); 1698 msp = msp_next; 1699 } 1700 mutex_exit(&mg->mg_lock); 1701} 1702 1703/* 1704 * Determine if the space map's on-disk footprint is past our tolerance 1705 * for inefficiency. We would like to use the following criteria to make 1706 * our decision: 1707 * 1708 * 1. The size of the space map object should not dramatically increase as a 1709 * result of writing out the free space range tree. 1710 * 1711 * 2. The minimal on-disk space map representation is zfs_condense_pct/100 1712 * times the size than the free space range tree representation 1713 * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1.MB). 1714 * 1715 * 3. The on-disk size of the space map should actually decrease. 1716 * 1717 * Checking the first condition is tricky since we don't want to walk 1718 * the entire AVL tree calculating the estimated on-disk size. Instead we 1719 * use the size-ordered range tree in the metaslab and calculate the 1720 * size required to write out the largest segment in our free tree. If the 1721 * size required to represent that segment on disk is larger than the space 1722 * map object then we avoid condensing this map. 1723 * 1724 * To determine the second criterion we use a best-case estimate and assume 1725 * each segment can be represented on-disk as a single 64-bit entry. We refer 1726 * to this best-case estimate as the space map's minimal form. 1727 * 1728 * Unfortunately, we cannot compute the on-disk size of the space map in this 1729 * context because we cannot accurately compute the effects of compression, etc. 1730 * Instead, we apply the heuristic described in the block comment for 1731 * zfs_metaslab_condense_block_threshold - we only condense if the space used 1732 * is greater than a threshold number of blocks. 1733 */ 1734static boolean_t 1735metaslab_should_condense(metaslab_t *msp) 1736{ 1737 space_map_t *sm = msp->ms_sm; 1738 range_seg_t *rs; 1739 uint64_t size, entries, segsz, object_size, optimal_size, record_size; 1740 dmu_object_info_t doi; 1741 uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift; 1742 1743 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1744 ASSERT(msp->ms_loaded); 1745 1746 /* 1747 * Use the ms_size_tree range tree, which is ordered by size, to 1748 * obtain the largest segment in the free tree. We always condense 1749 * metaslabs that are empty and metaslabs for which a condense 1750 * request has been made. 1751 */ 1752 rs = avl_last(&msp->ms_size_tree); 1753 if (rs == NULL || msp->ms_condense_wanted) 1754 return (B_TRUE); 1755 1756 /* 1757 * Calculate the number of 64-bit entries this segment would 1758 * require when written to disk. If this single segment would be 1759 * larger on-disk than the entire current on-disk structure, then 1760 * clearly condensing will increase the on-disk structure size. 1761 */ 1762 size = (rs->rs_end - rs->rs_start) >> sm->sm_shift; 1763 entries = size / (MIN(size, SM_RUN_MAX)); 1764 segsz = entries * sizeof (uint64_t); 1765 1766 optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root); 1767 object_size = space_map_length(msp->ms_sm); 1768 1769 dmu_object_info_from_db(sm->sm_dbuf, &doi); 1770 record_size = MAX(doi.doi_data_block_size, vdev_blocksize); 1771 1772 return (segsz <= object_size && 1773 object_size >= (optimal_size * zfs_condense_pct / 100) && 1774 object_size > zfs_metaslab_condense_block_threshold * record_size); 1775} 1776 1777/* 1778 * Condense the on-disk space map representation to its minimized form. 1779 * The minimized form consists of a small number of allocations followed by 1780 * the entries of the free range tree. 1781 */ 1782static void 1783metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) 1784{ 1785 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1786 range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK]; 1787 range_tree_t *condense_tree; 1788 space_map_t *sm = msp->ms_sm; 1789 1790 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1791 ASSERT3U(spa_sync_pass(spa), ==, 1); 1792 ASSERT(msp->ms_loaded); 1793 1794 1795 spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, " 1796 "smp size %llu, segments %lu, forcing condense=%s", txg, 1797 msp->ms_id, msp, space_map_length(msp->ms_sm), 1798 avl_numnodes(&msp->ms_tree->rt_root), 1799 msp->ms_condense_wanted ? "TRUE" : "FALSE"); 1800 1801 msp->ms_condense_wanted = B_FALSE; 1802 1803 /* 1804 * Create an range tree that is 100% allocated. We remove segments 1805 * that have been freed in this txg, any deferred frees that exist, 1806 * and any allocation in the future. Removing segments should be 1807 * a relatively inexpensive operation since we expect these trees to 1808 * have a small number of nodes. 1809 */ 1810 condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock); 1811 range_tree_add(condense_tree, msp->ms_start, msp->ms_size); 1812 1813 /* 1814 * Remove what's been freed in this txg from the condense_tree. 1815 * Since we're in sync_pass 1, we know that all the frees from 1816 * this txg are in the freetree. 1817 */ 1818 range_tree_walk(freetree, range_tree_remove, condense_tree); 1819 1820 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1821 range_tree_walk(msp->ms_defertree[t], 1822 range_tree_remove, condense_tree); 1823 } 1824 1825 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 1826 range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK], 1827 range_tree_remove, condense_tree); 1828 } 1829 1830 /* 1831 * We're about to drop the metaslab's lock thus allowing 1832 * other consumers to change it's content. Set the 1833 * metaslab's ms_condensing flag to ensure that 1834 * allocations on this metaslab do not occur while we're 1835 * in the middle of committing it to disk. This is only critical 1836 * for the ms_tree as all other range trees use per txg 1837 * views of their content. 1838 */ 1839 msp->ms_condensing = B_TRUE; 1840 1841 mutex_exit(&msp->ms_lock); 1842 space_map_truncate(sm, tx); 1843 mutex_enter(&msp->ms_lock); 1844 1845 /* 1846 * While we would ideally like to create a space_map representation 1847 * that consists only of allocation records, doing so can be 1848 * prohibitively expensive because the in-core free tree can be 1849 * large, and therefore computationally expensive to subtract 1850 * from the condense_tree. Instead we sync out two trees, a cheap 1851 * allocation only tree followed by the in-core free tree. While not 1852 * optimal, this is typically close to optimal, and much cheaper to 1853 * compute. 1854 */ 1855 space_map_write(sm, condense_tree, SM_ALLOC, tx); 1856 range_tree_vacate(condense_tree, NULL, NULL); 1857 range_tree_destroy(condense_tree); 1858 1859 space_map_write(sm, msp->ms_tree, SM_FREE, tx); 1860 msp->ms_condensing = B_FALSE; 1861} 1862 1863/* 1864 * Write a metaslab to disk in the context of the specified transaction group. 1865 */ 1866void 1867metaslab_sync(metaslab_t *msp, uint64_t txg) 1868{ 1869 metaslab_group_t *mg = msp->ms_group; 1870 vdev_t *vd = mg->mg_vd; 1871 spa_t *spa = vd->vdev_spa; 1872 objset_t *mos = spa_meta_objset(spa); 1873 range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK]; 1874 range_tree_t **freetree = &msp->ms_freetree[txg & TXG_MASK]; 1875 range_tree_t **freed_tree = 1876 &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK]; 1877 dmu_tx_t *tx; 1878 uint64_t object = space_map_object(msp->ms_sm); 1879 1880 ASSERT(!vd->vdev_ishole); 1881 1882 /* 1883 * This metaslab has just been added so there's no work to do now. 1884 */ 1885 if (*freetree == NULL) { 1886 ASSERT3P(alloctree, ==, NULL); 1887 return; 1888 } 1889 1890 ASSERT3P(alloctree, !=, NULL); 1891 ASSERT3P(*freetree, !=, NULL); 1892 ASSERT3P(*freed_tree, !=, NULL); 1893 1894 /* 1895 * Normally, we don't want to process a metaslab if there 1896 * are no allocations or frees to perform. However, if the metaslab 1897 * is being forced to condense we need to let it through. 1898 */ 1899 if (range_tree_space(alloctree) == 0 && 1900 range_tree_space(*freetree) == 0 && 1901 !msp->ms_condense_wanted) 1902 return; 1903 1904 /* 1905 * The only state that can actually be changing concurrently with 1906 * metaslab_sync() is the metaslab's ms_tree. No other thread can 1907 * be modifying this txg's alloctree, freetree, freed_tree, or 1908 * space_map_phys_t. Therefore, we only hold ms_lock to satify 1909 * space_map ASSERTs. We drop it whenever we call into the DMU, 1910 * because the DMU can call down to us (e.g. via zio_free()) at 1911 * any time. 1912 */ 1913 1914 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 1915 1916 if (msp->ms_sm == NULL) { 1917 uint64_t new_object; 1918 1919 new_object = space_map_alloc(mos, tx); 1920 VERIFY3U(new_object, !=, 0); 1921 1922 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, 1923 msp->ms_start, msp->ms_size, vd->vdev_ashift, 1924 &msp->ms_lock)); 1925 ASSERT(msp->ms_sm != NULL); 1926 } 1927 1928 mutex_enter(&msp->ms_lock); 1929 1930 if (msp->ms_loaded && spa_sync_pass(spa) == 1 && 1931 metaslab_should_condense(msp)) { 1932 metaslab_condense(msp, txg, tx); 1933 } else { 1934 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx); 1935 space_map_write(msp->ms_sm, *freetree, SM_FREE, tx); 1936 } 1937 1938 metaslab_group_histogram_verify(mg); 1939 metaslab_class_histogram_verify(mg->mg_class); 1940 metaslab_group_histogram_remove(mg, msp); 1941 if (msp->ms_loaded) { 1942 /* 1943 * When the space map is loaded, we have an accruate 1944 * histogram in the range tree. This gives us an opportunity 1945 * to bring the space map's histogram up-to-date so we clear 1946 * it first before updating it. 1947 */ 1948 space_map_histogram_clear(msp->ms_sm); 1949 space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx); 1950 } else { 1951 /* 1952 * Since the space map is not loaded we simply update the 1953 * exisiting histogram with what was freed in this txg. This 1954 * means that the on-disk histogram may not have an accurate 1955 * view of the free space but it's close enough to allow 1956 * us to make allocation decisions. 1957 */ 1958 space_map_histogram_add(msp->ms_sm, *freetree, tx); 1959 } 1960 metaslab_group_histogram_add(mg, msp); 1961 metaslab_group_histogram_verify(mg); 1962 metaslab_class_histogram_verify(mg->mg_class); 1963 1964 /* 1965 * For sync pass 1, we avoid traversing this txg's free range tree 1966 * and instead will just swap the pointers for freetree and 1967 * freed_tree. We can safely do this since the freed_tree is 1968 * guaranteed to be empty on the initial pass. 1969 */ 1970 if (spa_sync_pass(spa) == 1) { 1971 range_tree_swap(freetree, freed_tree); 1972 } else { 1973 range_tree_vacate(*freetree, range_tree_add, *freed_tree); 1974 } 1975 range_tree_vacate(alloctree, NULL, NULL); 1976 1977 ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); 1978 ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK])); 1979 1980 mutex_exit(&msp->ms_lock); 1981 1982 if (object != space_map_object(msp->ms_sm)) { 1983 object = space_map_object(msp->ms_sm); 1984 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * 1985 msp->ms_id, sizeof (uint64_t), &object, tx); 1986 } 1987 dmu_tx_commit(tx); 1988} 1989 1990/* 1991 * Called after a transaction group has completely synced to mark 1992 * all of the metaslab's free space as usable. 1993 */ 1994void 1995metaslab_sync_done(metaslab_t *msp, uint64_t txg) 1996{ 1997 metaslab_group_t *mg = msp->ms_group; 1998 vdev_t *vd = mg->mg_vd; 1999 range_tree_t **freed_tree; 2000 range_tree_t **defer_tree; 2001 int64_t alloc_delta, defer_delta; 2002 2003 ASSERT(!vd->vdev_ishole); 2004 2005 mutex_enter(&msp->ms_lock); 2006 2007 /* 2008 * If this metaslab is just becoming available, initialize its 2009 * alloctrees, freetrees, and defertree and add its capacity to 2010 * the vdev. 2011 */ 2012 if (msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK] == NULL) { 2013 for (int t = 0; t < TXG_SIZE; t++) { 2014 ASSERT(msp->ms_alloctree[t] == NULL); 2015 ASSERT(msp->ms_freetree[t] == NULL); 2016 2017 msp->ms_alloctree[t] = range_tree_create(NULL, msp, 2018 &msp->ms_lock); 2019 msp->ms_freetree[t] = range_tree_create(NULL, msp, 2020 &msp->ms_lock); 2021 } 2022 2023 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2024 ASSERT(msp->ms_defertree[t] == NULL); 2025 2026 msp->ms_defertree[t] = range_tree_create(NULL, msp, 2027 &msp->ms_lock); 2028 } 2029 2030 vdev_space_update(vd, 0, 0, msp->ms_size); 2031 } 2032 2033 freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK]; 2034 defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE]; 2035 2036 alloc_delta = space_map_alloc_delta(msp->ms_sm); 2037 defer_delta = range_tree_space(*freed_tree) - 2038 range_tree_space(*defer_tree); 2039 2040 vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); 2041 2042 ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); 2043 ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK])); 2044 2045 /* 2046 * If there's a metaslab_load() in progress, wait for it to complete 2047 * so that we have a consistent view of the in-core space map. 2048 */ 2049 metaslab_load_wait(msp); 2050 2051 /* 2052 * Move the frees from the defer_tree back to the free 2053 * range tree (if it's loaded). Swap the freed_tree and the 2054 * defer_tree -- this is safe to do because we've just emptied out 2055 * the defer_tree. 2056 */ 2057 range_tree_vacate(*defer_tree, 2058 msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree); 2059 range_tree_swap(freed_tree, defer_tree); 2060 2061 space_map_update(msp->ms_sm); 2062 2063 msp->ms_deferspace += defer_delta; 2064 ASSERT3S(msp->ms_deferspace, >=, 0); 2065 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); 2066 if (msp->ms_deferspace != 0) { 2067 /* 2068 * Keep syncing this metaslab until all deferred frees 2069 * are back in circulation. 2070 */ 2071 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 2072 } 2073 2074 if (msp->ms_loaded && msp->ms_access_txg < txg) { 2075 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 2076 VERIFY0(range_tree_space( 2077 msp->ms_alloctree[(txg + t) & TXG_MASK])); 2078 } 2079 2080 if (!metaslab_debug_unload) 2081 metaslab_unload(msp); 2082 } 2083 2084 metaslab_group_sort(mg, msp, metaslab_weight(msp)); 2085 mutex_exit(&msp->ms_lock); 2086} 2087 2088void 2089metaslab_sync_reassess(metaslab_group_t *mg) 2090{ 2091 metaslab_group_alloc_update(mg); 2092 mg->mg_fragmentation = metaslab_group_fragmentation(mg); 2093 2094 /* 2095 * Preload the next potential metaslabs 2096 */ 2097 metaslab_group_preload(mg); 2098} 2099 2100static uint64_t 2101metaslab_distance(metaslab_t *msp, dva_t *dva) 2102{ 2103 uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift; 2104 uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift; 2105 uint64_t start = msp->ms_id; 2106 2107 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) 2108 return (1ULL << 63); 2109 2110 if (offset < start) 2111 return ((start - offset) << ms_shift); 2112 if (offset > start) 2113 return ((offset - start) << ms_shift); 2114 return (0); 2115} 2116 2117static uint64_t 2118metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, 2119 uint64_t txg, uint64_t min_distance, dva_t *dva, int d) 2120{ 2121 spa_t *spa = mg->mg_vd->vdev_spa; 2122 metaslab_t *msp = NULL; 2123 uint64_t offset = -1ULL; 2124 avl_tree_t *t = &mg->mg_metaslab_tree; 2125 uint64_t activation_weight; 2126 uint64_t target_distance; 2127 int i; 2128 2129 activation_weight = METASLAB_WEIGHT_PRIMARY; 2130 for (i = 0; i < d; i++) { 2131 if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 2132 activation_weight = METASLAB_WEIGHT_SECONDARY; 2133 break; 2134 } 2135 } 2136 2137 for (;;) { 2138 boolean_t was_active; 2139 2140 mutex_enter(&mg->mg_lock); 2141 for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { 2142 if (msp->ms_weight < asize) { 2143 spa_dbgmsg(spa, "%s: failed to meet weight " 2144 "requirement: vdev %llu, txg %llu, mg %p, " 2145 "msp %p, psize %llu, asize %llu, " 2146 "weight %llu", spa_name(spa), 2147 mg->mg_vd->vdev_id, txg, 2148 mg, msp, psize, asize, msp->ms_weight); 2149 mutex_exit(&mg->mg_lock); 2150 return (-1ULL); 2151 } 2152 2153 /* 2154 * If the selected metaslab is condensing, skip it. 2155 */ 2156 if (msp->ms_condensing) 2157 continue; 2158 2159 was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; 2160 if (activation_weight == METASLAB_WEIGHT_PRIMARY) 2161 break; 2162 2163 target_distance = min_distance + 2164 (space_map_allocated(msp->ms_sm) != 0 ? 0 : 2165 min_distance >> 1); 2166 2167 for (i = 0; i < d; i++) 2168 if (metaslab_distance(msp, &dva[i]) < 2169 target_distance) 2170 break; 2171 if (i == d) 2172 break; 2173 } 2174 mutex_exit(&mg->mg_lock); 2175 if (msp == NULL) 2176 return (-1ULL); 2177 2178 mutex_enter(&msp->ms_lock); 2179 2180 /* 2181 * Ensure that the metaslab we have selected is still 2182 * capable of handling our request. It's possible that 2183 * another thread may have changed the weight while we 2184 * were blocked on the metaslab lock. 2185 */ 2186 if (msp->ms_weight < asize || (was_active && 2187 !(msp->ms_weight & METASLAB_ACTIVE_MASK) && 2188 activation_weight == METASLAB_WEIGHT_PRIMARY)) { 2189 mutex_exit(&msp->ms_lock); 2190 continue; 2191 } 2192 2193 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) && 2194 activation_weight == METASLAB_WEIGHT_PRIMARY) { 2195 metaslab_passivate(msp, 2196 msp->ms_weight & ~METASLAB_ACTIVE_MASK); 2197 mutex_exit(&msp->ms_lock); 2198 continue; 2199 } 2200 2201 if (metaslab_activate(msp, activation_weight) != 0) { 2202 mutex_exit(&msp->ms_lock); 2203 continue; 2204 } 2205 2206 /* 2207 * If this metaslab is currently condensing then pick again as 2208 * we can't manipulate this metaslab until it's committed 2209 * to disk. 2210 */ 2211 if (msp->ms_condensing) { 2212 mutex_exit(&msp->ms_lock); 2213 continue; 2214 } 2215 2216 if ((offset = metaslab_block_alloc(msp, asize)) != -1ULL) 2217 break; 2218 2219 metaslab_passivate(msp, metaslab_block_maxsize(msp)); 2220 mutex_exit(&msp->ms_lock); 2221 } 2222 2223 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) 2224 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); 2225 2226 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize); 2227 msp->ms_access_txg = txg + metaslab_unload_delay; 2228 2229 mutex_exit(&msp->ms_lock); 2230 2231 return (offset); 2232} 2233 2234/* 2235 * Allocate a block for the specified i/o. 2236 */ 2237static int 2238metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, 2239 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags) 2240{ 2241 metaslab_group_t *mg, *rotor; 2242 vdev_t *vd; 2243 int dshift = 3; 2244 int all_zero; 2245 int zio_lock = B_FALSE; 2246 boolean_t allocatable; 2247 uint64_t offset = -1ULL; 2248 uint64_t asize; 2249 uint64_t distance; 2250 2251 ASSERT(!DVA_IS_VALID(&dva[d])); 2252 2253 /* 2254 * For testing, make some blocks above a certain size be gang blocks. 2255 */ 2256 if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) 2257 return (SET_ERROR(ENOSPC)); 2258 2259 /* 2260 * Start at the rotor and loop through all mgs until we find something. 2261 * Note that there's no locking on mc_rotor or mc_aliquot because 2262 * nothing actually breaks if we miss a few updates -- we just won't 2263 * allocate quite as evenly. It all balances out over time. 2264 * 2265 * If we are doing ditto or log blocks, try to spread them across 2266 * consecutive vdevs. If we're forced to reuse a vdev before we've 2267 * allocated all of our ditto blocks, then try and spread them out on 2268 * that vdev as much as possible. If it turns out to not be possible, 2269 * gradually lower our standards until anything becomes acceptable. 2270 * Also, allocating on consecutive vdevs (as opposed to random vdevs) 2271 * gives us hope of containing our fault domains to something we're 2272 * able to reason about. Otherwise, any two top-level vdev failures 2273 * will guarantee the loss of data. With consecutive allocation, 2274 * only two adjacent top-level vdev failures will result in data loss. 2275 * 2276 * If we are doing gang blocks (hintdva is non-NULL), try to keep 2277 * ourselves on the same vdev as our gang block header. That 2278 * way, we can hope for locality in vdev_cache, plus it makes our 2279 * fault domains something tractable. 2280 */ 2281 if (hintdva) { 2282 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); 2283 2284 /* 2285 * It's possible the vdev we're using as the hint no 2286 * longer exists (i.e. removed). Consult the rotor when 2287 * all else fails. 2288 */ 2289 if (vd != NULL) { 2290 mg = vd->vdev_mg; 2291 2292 if (flags & METASLAB_HINTBP_AVOID && 2293 mg->mg_next != NULL) 2294 mg = mg->mg_next; 2295 } else { 2296 mg = mc->mc_rotor; 2297 } 2298 } else if (d != 0) { 2299 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); 2300 mg = vd->vdev_mg->mg_next; 2301 } else { 2302 mg = mc->mc_rotor; 2303 } 2304 2305 /* 2306 * If the hint put us into the wrong metaslab class, or into a 2307 * metaslab group that has been passivated, just follow the rotor. 2308 */ 2309 if (mg->mg_class != mc || mg->mg_activation_count <= 0) 2310 mg = mc->mc_rotor; 2311 2312 rotor = mg; 2313top: 2314 all_zero = B_TRUE; 2315 do { 2316 ASSERT(mg->mg_activation_count == 1); 2317 2318 vd = mg->mg_vd; 2319 2320 /* 2321 * Don't allocate from faulted devices. 2322 */ 2323 if (zio_lock) { 2324 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); 2325 allocatable = vdev_allocatable(vd); 2326 spa_config_exit(spa, SCL_ZIO, FTAG); 2327 } else { 2328 allocatable = vdev_allocatable(vd); 2329 } 2330 2331 /* 2332 * Determine if the selected metaslab group is eligible 2333 * for allocations. If we're ganging or have requested 2334 * an allocation for the smallest gang block size 2335 * then we don't want to avoid allocating to the this 2336 * metaslab group. If we're in this condition we should 2337 * try to allocate from any device possible so that we 2338 * don't inadvertently return ENOSPC and suspend the pool 2339 * even though space is still available. 2340 */ 2341 if (allocatable && CAN_FASTGANG(flags) && 2342 psize > SPA_GANGBLOCKSIZE) 2343 allocatable = metaslab_group_allocatable(mg); 2344 2345 if (!allocatable) 2346 goto next; 2347 2348 /* 2349 * Avoid writing single-copy data to a failing vdev 2350 * unless the user instructs us that it is okay. 2351 */ 2352 if ((vd->vdev_stat.vs_write_errors > 0 || 2353 vd->vdev_state < VDEV_STATE_HEALTHY) && 2354 d == 0 && dshift == 3 && vd->vdev_children == 0) { 2355 all_zero = B_FALSE; 2356 goto next; 2357 } 2358 2359 ASSERT(mg->mg_class == mc); 2360 2361 distance = vd->vdev_asize >> dshift; 2362 if (distance <= (1ULL << vd->vdev_ms_shift)) 2363 distance = 0; 2364 else 2365 all_zero = B_FALSE; 2366 2367 asize = vdev_psize_to_asize(vd, psize); 2368 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); 2369 2370 offset = metaslab_group_alloc(mg, psize, asize, txg, distance, 2371 dva, d); 2372 if (offset != -1ULL) { 2373 /* 2374 * If we've just selected this metaslab group, 2375 * figure out whether the corresponding vdev is 2376 * over- or under-used relative to the pool, 2377 * and set an allocation bias to even it out. 2378 */ 2379 if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { 2380 vdev_stat_t *vs = &vd->vdev_stat; 2381 int64_t vu, cu; 2382 2383 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); 2384 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); 2385 2386 /* 2387 * Calculate how much more or less we should 2388 * try to allocate from this device during 2389 * this iteration around the rotor. 2390 * For example, if a device is 80% full 2391 * and the pool is 20% full then we should 2392 * reduce allocations by 60% on this device. 2393 * 2394 * mg_bias = (20 - 80) * 512K / 100 = -307K 2395 * 2396 * This reduces allocations by 307K for this 2397 * iteration. 2398 */ 2399 mg->mg_bias = ((cu - vu) * 2400 (int64_t)mg->mg_aliquot) / 100; 2401 } else if (!metaslab_bias_enabled) { 2402 mg->mg_bias = 0; 2403 } 2404 2405 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= 2406 mg->mg_aliquot + mg->mg_bias) { 2407 mc->mc_rotor = mg->mg_next; 2408 mc->mc_aliquot = 0; 2409 } 2410 2411 DVA_SET_VDEV(&dva[d], vd->vdev_id); 2412 DVA_SET_OFFSET(&dva[d], offset); 2413 DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); 2414 DVA_SET_ASIZE(&dva[d], asize); 2415 2416 return (0); 2417 } 2418next: 2419 mc->mc_rotor = mg->mg_next; 2420 mc->mc_aliquot = 0; 2421 } while ((mg = mg->mg_next) != rotor); 2422 2423 if (!all_zero) { 2424 dshift++; 2425 ASSERT(dshift < 64); 2426 goto top; 2427 } 2428 2429 if (!allocatable && !zio_lock) { 2430 dshift = 3; 2431 zio_lock = B_TRUE; 2432 goto top; 2433 } 2434 2435 bzero(&dva[d], sizeof (dva_t)); 2436 2437 return (SET_ERROR(ENOSPC)); 2438} 2439 2440/* 2441 * Free the block represented by DVA in the context of the specified 2442 * transaction group. 2443 */ 2444static void 2445metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now) 2446{ 2447 uint64_t vdev = DVA_GET_VDEV(dva); 2448 uint64_t offset = DVA_GET_OFFSET(dva); 2449 uint64_t size = DVA_GET_ASIZE(dva); 2450 vdev_t *vd; 2451 metaslab_t *msp; 2452 2453 ASSERT(DVA_IS_VALID(dva)); 2454 2455 if (txg > spa_freeze_txg(spa)) 2456 return; 2457 2458 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 2459 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { 2460 cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", 2461 (u_longlong_t)vdev, (u_longlong_t)offset); 2462 ASSERT(0); 2463 return; 2464 } 2465 2466 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 2467 2468 if (DVA_GET_GANG(dva)) 2469 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 2470 2471 mutex_enter(&msp->ms_lock); 2472 2473 if (now) { 2474 range_tree_remove(msp->ms_alloctree[txg & TXG_MASK], 2475 offset, size); 2476 2477 VERIFY(!msp->ms_condensing); 2478 VERIFY3U(offset, >=, msp->ms_start); 2479 VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); 2480 VERIFY3U(range_tree_space(msp->ms_tree) + size, <=, 2481 msp->ms_size); 2482 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 2483 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 2484 range_tree_add(msp->ms_tree, offset, size); 2485 } else { 2486 if (range_tree_space(msp->ms_freetree[txg & TXG_MASK]) == 0) 2487 vdev_dirty(vd, VDD_METASLAB, msp, txg); 2488 range_tree_add(msp->ms_freetree[txg & TXG_MASK], 2489 offset, size); 2490 } 2491 2492 mutex_exit(&msp->ms_lock); 2493} 2494 2495/* 2496 * Intent log support: upon opening the pool after a crash, notify the SPA 2497 * of blocks that the intent log has allocated for immediate write, but 2498 * which are still considered free by the SPA because the last transaction 2499 * group didn't commit yet. 2500 */ 2501static int 2502metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) 2503{ 2504 uint64_t vdev = DVA_GET_VDEV(dva); 2505 uint64_t offset = DVA_GET_OFFSET(dva); 2506 uint64_t size = DVA_GET_ASIZE(dva); 2507 vdev_t *vd; 2508 metaslab_t *msp; 2509 int error = 0; 2510 2511 ASSERT(DVA_IS_VALID(dva)); 2512 2513 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 2514 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) 2515 return (SET_ERROR(ENXIO)); 2516 2517 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 2518 2519 if (DVA_GET_GANG(dva)) 2520 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 2521 2522 mutex_enter(&msp->ms_lock); 2523 2524 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) 2525 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); 2526 2527 if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size)) 2528 error = SET_ERROR(ENOENT); 2529 2530 if (error || txg == 0) { /* txg == 0 indicates dry run */ 2531 mutex_exit(&msp->ms_lock); 2532 return (error); 2533 } 2534 2535 VERIFY(!msp->ms_condensing); 2536 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 2537 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 2538 VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size); 2539 range_tree_remove(msp->ms_tree, offset, size); 2540 2541 if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ 2542 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) 2543 vdev_dirty(vd, VDD_METASLAB, msp, txg); 2544 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size); 2545 } 2546 2547 mutex_exit(&msp->ms_lock); 2548 2549 return (0); 2550} 2551 2552int 2553metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, 2554 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags) 2555{ 2556 dva_t *dva = bp->blk_dva; 2557 dva_t *hintdva = hintbp->blk_dva; 2558 int error = 0; 2559 2560 ASSERT(bp->blk_birth == 0); 2561 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); 2562 2563 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 2564 2565 if (mc->mc_rotor == NULL) { /* no vdevs in this class */ 2566 spa_config_exit(spa, SCL_ALLOC, FTAG); 2567 return (SET_ERROR(ENOSPC)); 2568 } 2569 2570 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); 2571 ASSERT(BP_GET_NDVAS(bp) == 0); 2572 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); 2573 2574 for (int d = 0; d < ndvas; d++) { 2575 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, 2576 txg, flags); 2577 if (error != 0) { 2578 for (d--; d >= 0; d--) { 2579 metaslab_free_dva(spa, &dva[d], txg, B_TRUE); 2580 bzero(&dva[d], sizeof (dva_t)); 2581 } 2582 spa_config_exit(spa, SCL_ALLOC, FTAG); 2583 return (error); 2584 } 2585 } 2586 ASSERT(error == 0); 2587 ASSERT(BP_GET_NDVAS(bp) == ndvas); 2588 2589 spa_config_exit(spa, SCL_ALLOC, FTAG); 2590 2591 BP_SET_BIRTH(bp, txg, txg); 2592 2593 return (0); 2594} 2595 2596void 2597metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) 2598{ 2599 const dva_t *dva = bp->blk_dva; 2600 int ndvas = BP_GET_NDVAS(bp); 2601 2602 ASSERT(!BP_IS_HOLE(bp)); 2603 ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); 2604 2605 spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); 2606 2607 for (int d = 0; d < ndvas; d++) 2608 metaslab_free_dva(spa, &dva[d], txg, now); 2609 2610 spa_config_exit(spa, SCL_FREE, FTAG); 2611} 2612 2613int 2614metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) 2615{ 2616 const dva_t *dva = bp->blk_dva; 2617 int ndvas = BP_GET_NDVAS(bp); 2618 int error = 0; 2619 2620 ASSERT(!BP_IS_HOLE(bp)); 2621 2622 if (txg != 0) { 2623 /* 2624 * First do a dry run to make sure all DVAs are claimable, 2625 * so we don't have to unwind from partial failures below. 2626 */ 2627 if ((error = metaslab_claim(spa, bp, 0)) != 0) 2628 return (error); 2629 } 2630 2631 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 2632 2633 for (int d = 0; d < ndvas; d++) 2634 if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) 2635 break; 2636 2637 spa_config_exit(spa, SCL_ALLOC, FTAG); 2638 2639 ASSERT(error == 0 || txg == 0); 2640 2641 return (error); 2642} 2643 2644void 2645metaslab_check_free(spa_t *spa, const blkptr_t *bp) 2646{ 2647 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) 2648 return; 2649 2650 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 2651 for (int i = 0; i < BP_GET_NDVAS(bp); i++) { 2652 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]); 2653 vdev_t *vd = vdev_lookup_top(spa, vdev); 2654 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); 2655 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); 2656 metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 2657 2658 if (msp->ms_loaded) 2659 range_tree_verify(msp->ms_tree, offset, size); 2660 2661 for (int j = 0; j < TXG_SIZE; j++) 2662 range_tree_verify(msp->ms_freetree[j], offset, size); 2663 for (int j = 0; j < TXG_DEFER_SIZE; j++) 2664 range_tree_verify(msp->ms_defertree[j], offset, size); 2665 } 2666 spa_config_exit(spa, SCL_VDEV, FTAG); 2667} 2668