metaslab.c revision 269774
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2014 by Delphix. All rights reserved. 24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 25 */ 26 27#include <sys/zfs_context.h> 28#include <sys/dmu.h> 29#include <sys/dmu_tx.h> 30#include <sys/space_map.h> 31#include <sys/metaslab_impl.h> 32#include <sys/vdev_impl.h> 33#include <sys/zio.h> 34#include <sys/spa_impl.h> 35#include <sys/zfeature.h> 36 37SYSCTL_DECL(_vfs_zfs); 38SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab"); 39 40/* 41 * Allow allocations to switch to gang blocks quickly. We do this to 42 * avoid having to load lots of space_maps in a given txg. There are, 43 * however, some cases where we want to avoid "fast" ganging and instead 44 * we want to do an exhaustive search of all metaslabs on this device. 45 * Currently we don't allow any gang, slog, or dump device related allocations 46 * to "fast" gang. 
47 */ 48#define CAN_FASTGANG(flags) \ 49 (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \ 50 METASLAB_GANG_AVOID))) 51 52#define METASLAB_WEIGHT_PRIMARY (1ULL << 63) 53#define METASLAB_WEIGHT_SECONDARY (1ULL << 62) 54#define METASLAB_ACTIVE_MASK \ 55 (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) 56 57uint64_t metaslab_aliquot = 512ULL << 10; 58uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ 59TUNABLE_QUAD("vfs.zfs.metaslab.gang_bang", &metaslab_gang_bang); 60SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, gang_bang, CTLFLAG_RWTUN, 61 &metaslab_gang_bang, 0, 62 "Force gang block allocation for blocks larger than or equal to this value"); 63 64/* 65 * The in-core space map representation is more compact than its on-disk form. 66 * The zfs_condense_pct determines how much more compact the in-core 67 * space_map representation must be before we compact it on-disk. 68 * Values should be greater than or equal to 100. 69 */ 70int zfs_condense_pct = 200; 71TUNABLE_INT("vfs.zfs.condense_pct", &zfs_condense_pct); 72SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN, 73 &zfs_condense_pct, 0, 74 "Condense on-disk spacemap when it is more than this many percents" 75 " of in-memory counterpart"); 76 77/* 78 * Condensing a metaslab is not guaranteed to actually reduce the amount of 79 * space used on disk. In particular, a space map uses data in increments of 80 * MAX(1 << ashift, SPACE_MAP_INITIAL_BLOCKSIZE), so a metaslab might use the 81 * same number of blocks after condensing. Since the goal of condensing is to 82 * reduce the number of IOPs required to read the space map, we only want to 83 * condense when we can be sure we will reduce the number of blocks used by the 84 * space map. Unfortunately, we cannot precisely compute whether or not this is 85 * the case in metaslab_should_condense since we are holding ms_lock. 
Instead, 86 * we apply the following heuristic: do not condense a spacemap unless the 87 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold 88 * blocks. 89 */ 90int zfs_metaslab_condense_block_threshold = 4; 91 92/* 93 * The zfs_mg_noalloc_threshold defines which metaslab groups should 94 * be eligible for allocation. The value is defined as a percentage of 95 * free space. Metaslab groups that have more free space than 96 * zfs_mg_noalloc_threshold are always eligible for allocations. Once 97 * a metaslab group's free space is less than or equal to the 98 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that 99 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold. 100 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all 101 * groups are allowed to accept allocations. Gang blocks are always 102 * eligible to allocate on any metaslab group. The default value of 0 means 103 * no metaslab group will be excluded based on this criterion. 104 */ 105int zfs_mg_noalloc_threshold = 0; 106TUNABLE_INT("vfs.zfs.mg_noalloc_threshold", &zfs_mg_noalloc_threshold); 107SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN, 108 &zfs_mg_noalloc_threshold, 0, 109 "Percentage of metaslab group size that should be free" 110 " to make it eligible for allocation"); 111 112/* 113 * Metaslab groups are considered eligible for allocations if their 114 * fragmenation metric (measured as a percentage) is less than or equal to 115 * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold 116 * then it will be skipped unless all metaslab groups within the metaslab 117 * class have also crossed this threshold. 
118 */ 119int zfs_mg_fragmentation_threshold = 85; 120TUNABLE_INT("vfs.zfs.mg_fragmentation_threshold", &zfs_mg_fragmentation_threshold); 121SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_fragmentation_threshold, CTLFLAG_RWTUN, 122 &zfs_mg_fragmentation_threshold, 0, 123 "Percentage of metaslab group size that should be considered " 124 "eligible for allocations unless all metaslab groups within the metaslab class " 125 "have also crossed this threshold"); 126 127/* 128 * Allow metaslabs to keep their active state as long as their fragmentation 129 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An 130 * active metaslab that exceeds this threshold will no longer keep its active 131 * status allowing better metaslabs to be selected. 132 */ 133int zfs_metaslab_fragmentation_threshold = 70; 134TUNABLE_INT("vfs.zfs.metaslab.fragmentation_threshold", 135 &zfs_metaslab_fragmentation_threshold); 136SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_threshold, CTLFLAG_RWTUN, 137 &zfs_metaslab_fragmentation_threshold, 0, 138 "Maximum percentage of metaslab fragmentation level to keep their active state"); 139 140/* 141 * When set will load all metaslabs when pool is first opened. 142 */ 143int metaslab_debug_load = 0; 144TUNABLE_INT("vfs.zfs.metaslab.debug_load", &metaslab_debug_load); 145SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_load, CTLFLAG_RWTUN, 146 &metaslab_debug_load, 0, 147 "Load all metaslabs when pool is first opened"); 148 149/* 150 * When set will prevent metaslabs from being unloaded. 151 */ 152int metaslab_debug_unload = 0; 153TUNABLE_INT("vfs.zfs.metaslab.debug_unload", &metaslab_debug_unload); 154SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_unload, CTLFLAG_RWTUN, 155 &metaslab_debug_unload, 0, 156 "Prevent metaslabs from being unloaded"); 157 158/* 159 * Minimum size which forces the dynamic allocator to change 160 * it's allocation strategy. 
Once the space map cannot satisfy 161 * an allocation of this size then it switches to using more 162 * aggressive strategy (i.e search by size rather than offset). 163 */ 164uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE; 165TUNABLE_QUAD("vfs.zfs.metaslab.df_alloc_threshold", 166 &metaslab_df_alloc_threshold); 167SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN, 168 &metaslab_df_alloc_threshold, 0, 169 "Minimum size which forces the dynamic allocator to change it's allocation strategy"); 170 171/* 172 * The minimum free space, in percent, which must be available 173 * in a space map to continue allocations in a first-fit fashion. 174 * Once the space_map's free space drops below this level we dynamically 175 * switch to using best-fit allocations. 176 */ 177int metaslab_df_free_pct = 4; 178TUNABLE_INT("vfs.zfs.metaslab.df_free_pct", &metaslab_df_free_pct); 179SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN, 180 &metaslab_df_free_pct, 0, 181 "The minimum free space, in percent, which must be available in a space map to continue allocations in a first-fit fashion"); 182 183/* 184 * A metaslab is considered "free" if it contains a contiguous 185 * segment which is greater than metaslab_min_alloc_size. 186 */ 187uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS; 188TUNABLE_QUAD("vfs.zfs.metaslab.min_alloc_size", 189 &metaslab_min_alloc_size); 190SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, min_alloc_size, CTLFLAG_RWTUN, 191 &metaslab_min_alloc_size, 0, 192 "A metaslab is considered \"free\" if it contains a contiguous segment which is greater than vfs.zfs.metaslab.min_alloc_size"); 193 194/* 195 * Percentage of all cpus that can be used by the metaslab taskq. 
196 */ 197int metaslab_load_pct = 50; 198TUNABLE_INT("vfs.zfs.metaslab.load_pct", &metaslab_load_pct); 199SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN, 200 &metaslab_load_pct, 0, 201 "Percentage of cpus that can be used by the metaslab taskq"); 202 203/* 204 * Determines how many txgs a metaslab may remain loaded without having any 205 * allocations from it. As long as a metaslab continues to be used we will 206 * keep it loaded. 207 */ 208int metaslab_unload_delay = TXG_SIZE * 2; 209TUNABLE_INT("vfs.zfs.metaslab.unload_delay", &metaslab_unload_delay); 210SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN, 211 &metaslab_unload_delay, 0, 212 "Number of TXGs that an unused metaslab can be kept in memory"); 213 214/* 215 * Max number of metaslabs per group to preload. 216 */ 217int metaslab_preload_limit = SPA_DVAS_PER_BP; 218TUNABLE_INT("vfs.zfs.metaslab.preload_limit", &metaslab_preload_limit); 219SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN, 220 &metaslab_preload_limit, 0, 221 "Max number of metaslabs per group to preload"); 222 223/* 224 * Enable/disable preloading of metaslab. 225 */ 226boolean_t metaslab_preload_enabled = B_TRUE; 227TUNABLE_INT("vfs.zfs.metaslab.preload_enabled", &metaslab_preload_enabled); 228SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN, 229 &metaslab_preload_enabled, 0, 230 "Max number of metaslabs per group to preload"); 231 232/* 233 * Enable/disable fragmentation weighting on metaslabs. 234 */ 235boolean_t metaslab_fragmentation_factor_enabled = B_TRUE; 236TUNABLE_INT("vfs.zfs.metaslab_fragmentation_factor_enabled", 237 &metaslab_fragmentation_factor_enabled); 238SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_factor_enabled, CTLFLAG_RWTUN, 239 &metaslab_fragmentation_factor_enabled, 0, 240 "Enable fragmentation weighting on metaslabs"); 241 242/* 243 * Enable/disable lba weighting (i.e. outer tracks are given preference). 
244 */ 245boolean_t metaslab_lba_weighting_enabled = B_TRUE; 246TUNABLE_INT("vfs.zfs.metaslab.lba_weighting_enabled", 247 &metaslab_lba_weighting_enabled); 248SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, lba_weighting_enabled, CTLFLAG_RWTUN, 249 &metaslab_lba_weighting_enabled, 0, 250 "Enable LBA weighting (i.e. outer tracks are given preference)"); 251 252/* 253 * Enable/disable metaslab group biasing. 254 */ 255boolean_t metaslab_bias_enabled = B_TRUE; 256TUNABLE_INT("vfs.zfs.metaslab.bias_enabled", 257 &metaslab_bias_enabled); 258SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, bias_enabled, CTLFLAG_RWTUN, 259 &metaslab_bias_enabled, 0, 260 "Enable metaslab group biasing"); 261 262static uint64_t metaslab_fragmentation(metaslab_t *); 263 264/* 265 * ========================================================================== 266 * Metaslab classes 267 * ========================================================================== 268 */ 269metaslab_class_t * 270metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) 271{ 272 metaslab_class_t *mc; 273 274 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); 275 276 mc->mc_spa = spa; 277 mc->mc_rotor = NULL; 278 mc->mc_ops = ops; 279 280 return (mc); 281} 282 283void 284metaslab_class_destroy(metaslab_class_t *mc) 285{ 286 ASSERT(mc->mc_rotor == NULL); 287 ASSERT(mc->mc_alloc == 0); 288 ASSERT(mc->mc_deferred == 0); 289 ASSERT(mc->mc_space == 0); 290 ASSERT(mc->mc_dspace == 0); 291 292 kmem_free(mc, sizeof (metaslab_class_t)); 293} 294 295int 296metaslab_class_validate(metaslab_class_t *mc) 297{ 298 metaslab_group_t *mg; 299 vdev_t *vd; 300 301 /* 302 * Must hold one of the spa_config locks. 
303 */ 304 ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || 305 spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); 306 307 if ((mg = mc->mc_rotor) == NULL) 308 return (0); 309 310 do { 311 vd = mg->mg_vd; 312 ASSERT(vd->vdev_mg != NULL); 313 ASSERT3P(vd->vdev_top, ==, vd); 314 ASSERT3P(mg->mg_class, ==, mc); 315 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); 316 } while ((mg = mg->mg_next) != mc->mc_rotor); 317 318 return (0); 319} 320 321void 322metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, 323 int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) 324{ 325 atomic_add_64(&mc->mc_alloc, alloc_delta); 326 atomic_add_64(&mc->mc_deferred, defer_delta); 327 atomic_add_64(&mc->mc_space, space_delta); 328 atomic_add_64(&mc->mc_dspace, dspace_delta); 329} 330 331void 332metaslab_class_minblocksize_update(metaslab_class_t *mc) 333{ 334 metaslab_group_t *mg; 335 vdev_t *vd; 336 uint64_t minashift = UINT64_MAX; 337 338 if ((mg = mc->mc_rotor) == NULL) { 339 mc->mc_minblocksize = SPA_MINBLOCKSIZE; 340 return; 341 } 342 343 do { 344 vd = mg->mg_vd; 345 if (vd->vdev_ashift < minashift) 346 minashift = vd->vdev_ashift; 347 } while ((mg = mg->mg_next) != mc->mc_rotor); 348 349 mc->mc_minblocksize = 1ULL << minashift; 350} 351 352uint64_t 353metaslab_class_get_alloc(metaslab_class_t *mc) 354{ 355 return (mc->mc_alloc); 356} 357 358uint64_t 359metaslab_class_get_deferred(metaslab_class_t *mc) 360{ 361 return (mc->mc_deferred); 362} 363 364uint64_t 365metaslab_class_get_space(metaslab_class_t *mc) 366{ 367 return (mc->mc_space); 368} 369 370uint64_t 371metaslab_class_get_dspace(metaslab_class_t *mc) 372{ 373 return (spa_deflate(mc->mc_spa) ? 
mc->mc_dspace : mc->mc_space); 374} 375 376uint64_t 377metaslab_class_get_minblocksize(metaslab_class_t *mc) 378{ 379 return (mc->mc_minblocksize); 380} 381 382void 383metaslab_class_histogram_verify(metaslab_class_t *mc) 384{ 385 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 386 uint64_t *mc_hist; 387 int i; 388 389 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) 390 return; 391 392 mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, 393 KM_SLEEP); 394 395 for (int c = 0; c < rvd->vdev_children; c++) { 396 vdev_t *tvd = rvd->vdev_child[c]; 397 metaslab_group_t *mg = tvd->vdev_mg; 398 399 /* 400 * Skip any holes, uninitialized top-levels, or 401 * vdevs that are not in this metalab class. 402 */ 403 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || 404 mg->mg_class != mc) { 405 continue; 406 } 407 408 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 409 mc_hist[i] += mg->mg_histogram[i]; 410 } 411 412 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 413 VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]); 414 415 kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); 416} 417 418/* 419 * Calculate the metaslab class's fragmentation metric. The metric 420 * is weighted based on the space contribution of each metaslab group. 421 * The return value will be a number between 0 and 100 (inclusive), or 422 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the 423 * zfs_frag_table for more information about the metric. 424 */ 425uint64_t 426metaslab_class_fragmentation(metaslab_class_t *mc) 427{ 428 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 429 uint64_t fragmentation = 0; 430 431 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); 432 433 for (int c = 0; c < rvd->vdev_children; c++) { 434 vdev_t *tvd = rvd->vdev_child[c]; 435 metaslab_group_t *mg = tvd->vdev_mg; 436 437 /* 438 * Skip any holes, uninitialized top-levels, or 439 * vdevs that are not in this metalab class. 
440 */ 441 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || 442 mg->mg_class != mc) { 443 continue; 444 } 445 446 /* 447 * If a metaslab group does not contain a fragmentation 448 * metric then just bail out. 449 */ 450 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { 451 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 452 return (ZFS_FRAG_INVALID); 453 } 454 455 /* 456 * Determine how much this metaslab_group is contributing 457 * to the overall pool fragmentation metric. 458 */ 459 fragmentation += mg->mg_fragmentation * 460 metaslab_group_get_space(mg); 461 } 462 fragmentation /= metaslab_class_get_space(mc); 463 464 ASSERT3U(fragmentation, <=, 100); 465 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 466 return (fragmentation); 467} 468 469/* 470 * Calculate the amount of expandable space that is available in 471 * this metaslab class. If a device is expanded then its expandable 472 * space will be the amount of allocatable space that is currently not 473 * part of this metaslab class. 474 */ 475uint64_t 476metaslab_class_expandable_space(metaslab_class_t *mc) 477{ 478 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 479 uint64_t space = 0; 480 481 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); 482 for (int c = 0; c < rvd->vdev_children; c++) { 483 vdev_t *tvd = rvd->vdev_child[c]; 484 metaslab_group_t *mg = tvd->vdev_mg; 485 486 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || 487 mg->mg_class != mc) { 488 continue; 489 } 490 491 space += tvd->vdev_max_asize - tvd->vdev_asize; 492 } 493 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 494 return (space); 495} 496 497/* 498 * ========================================================================== 499 * Metaslab groups 500 * ========================================================================== 501 */ 502static int 503metaslab_compare(const void *x1, const void *x2) 504{ 505 const metaslab_t *m1 = x1; 506 const metaslab_t *m2 = x2; 507 508 if (m1->ms_weight < m2->ms_weight) 509 return (1); 510 if 
(m1->ms_weight > m2->ms_weight) 511 return (-1); 512 513 /* 514 * If the weights are identical, use the offset to force uniqueness. 515 */ 516 if (m1->ms_start < m2->ms_start) 517 return (-1); 518 if (m1->ms_start > m2->ms_start) 519 return (1); 520 521 ASSERT3P(m1, ==, m2); 522 523 return (0); 524} 525 526/* 527 * Update the allocatable flag and the metaslab group's capacity. 528 * The allocatable flag is set to true if the capacity is below 529 * the zfs_mg_noalloc_threshold. If a metaslab group transitions 530 * from allocatable to non-allocatable or vice versa then the metaslab 531 * group's class is updated to reflect the transition. 532 */ 533static void 534metaslab_group_alloc_update(metaslab_group_t *mg) 535{ 536 vdev_t *vd = mg->mg_vd; 537 metaslab_class_t *mc = mg->mg_class; 538 vdev_stat_t *vs = &vd->vdev_stat; 539 boolean_t was_allocatable; 540 541 ASSERT(vd == vd->vdev_top); 542 543 mutex_enter(&mg->mg_lock); 544 was_allocatable = mg->mg_allocatable; 545 546 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / 547 (vs->vs_space + 1); 548 549 /* 550 * A metaslab group is considered allocatable if it has plenty 551 * of free space or is not heavily fragmented. We only take 552 * fragmentation into account if the metaslab group has a valid 553 * fragmentation metric (i.e. a value between 0 and 100). 554 */ 555 mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold && 556 (mg->mg_fragmentation == ZFS_FRAG_INVALID || 557 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); 558 559 /* 560 * The mc_alloc_groups maintains a count of the number of 561 * groups in this metaslab class that are still above the 562 * zfs_mg_noalloc_threshold. This is used by the allocating 563 * threads to determine if they should avoid allocations to 564 * a given group. 
The allocator will avoid allocations to a group 565 * if that group has reached or is below the zfs_mg_noalloc_threshold 566 * and there are still other groups that are above the threshold. 567 * When a group transitions from allocatable to non-allocatable or 568 * vice versa we update the metaslab class to reflect that change. 569 * When the mc_alloc_groups value drops to 0 that means that all 570 * groups have reached the zfs_mg_noalloc_threshold making all groups 571 * eligible for allocations. This effectively means that all devices 572 * are balanced again. 573 */ 574 if (was_allocatable && !mg->mg_allocatable) 575 mc->mc_alloc_groups--; 576 else if (!was_allocatable && mg->mg_allocatable) 577 mc->mc_alloc_groups++; 578 579 mutex_exit(&mg->mg_lock); 580} 581 582metaslab_group_t * 583metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) 584{ 585 metaslab_group_t *mg; 586 587 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); 588 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); 589 avl_create(&mg->mg_metaslab_tree, metaslab_compare, 590 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); 591 mg->mg_vd = vd; 592 mg->mg_class = mc; 593 mg->mg_activation_count = 0; 594 595 mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, 596 minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT); 597 598 return (mg); 599} 600 601void 602metaslab_group_destroy(metaslab_group_t *mg) 603{ 604 ASSERT(mg->mg_prev == NULL); 605 ASSERT(mg->mg_next == NULL); 606 /* 607 * We may have gone below zero with the activation count 608 * either because we never activated in the first place or 609 * because we're done, and possibly removing the vdev. 
610 */ 611 ASSERT(mg->mg_activation_count <= 0); 612 613 taskq_destroy(mg->mg_taskq); 614 avl_destroy(&mg->mg_metaslab_tree); 615 mutex_destroy(&mg->mg_lock); 616 kmem_free(mg, sizeof (metaslab_group_t)); 617} 618 619void 620metaslab_group_activate(metaslab_group_t *mg) 621{ 622 metaslab_class_t *mc = mg->mg_class; 623 metaslab_group_t *mgprev, *mgnext; 624 625 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); 626 627 ASSERT(mc->mc_rotor != mg); 628 ASSERT(mg->mg_prev == NULL); 629 ASSERT(mg->mg_next == NULL); 630 ASSERT(mg->mg_activation_count <= 0); 631 632 if (++mg->mg_activation_count <= 0) 633 return; 634 635 mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); 636 metaslab_group_alloc_update(mg); 637 638 if ((mgprev = mc->mc_rotor) == NULL) { 639 mg->mg_prev = mg; 640 mg->mg_next = mg; 641 } else { 642 mgnext = mgprev->mg_next; 643 mg->mg_prev = mgprev; 644 mg->mg_next = mgnext; 645 mgprev->mg_next = mg; 646 mgnext->mg_prev = mg; 647 } 648 mc->mc_rotor = mg; 649 metaslab_class_minblocksize_update(mc); 650} 651 652void 653metaslab_group_passivate(metaslab_group_t *mg) 654{ 655 metaslab_class_t *mc = mg->mg_class; 656 metaslab_group_t *mgprev, *mgnext; 657 658 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); 659 660 if (--mg->mg_activation_count != 0) { 661 ASSERT(mc->mc_rotor != mg); 662 ASSERT(mg->mg_prev == NULL); 663 ASSERT(mg->mg_next == NULL); 664 ASSERT(mg->mg_activation_count < 0); 665 return; 666 } 667 668 taskq_wait(mg->mg_taskq); 669 metaslab_group_alloc_update(mg); 670 671 mgprev = mg->mg_prev; 672 mgnext = mg->mg_next; 673 674 if (mg == mgnext) { 675 mc->mc_rotor = NULL; 676 } else { 677 mc->mc_rotor = mgnext; 678 mgprev->mg_next = mgnext; 679 mgnext->mg_prev = mgprev; 680 } 681 682 mg->mg_prev = NULL; 683 mg->mg_next = NULL; 684 metaslab_class_minblocksize_update(mc); 685} 686 687uint64_t 688metaslab_group_get_space(metaslab_group_t *mg) 689{ 690 return ((1ULL << mg->mg_vd->vdev_ms_shift) * 
mg->mg_vd->vdev_ms_count); 691} 692 693void 694metaslab_group_histogram_verify(metaslab_group_t *mg) 695{ 696 uint64_t *mg_hist; 697 vdev_t *vd = mg->mg_vd; 698 uint64_t ashift = vd->vdev_ashift; 699 int i; 700 701 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) 702 return; 703 704 mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, 705 KM_SLEEP); 706 707 ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=, 708 SPACE_MAP_HISTOGRAM_SIZE + ashift); 709 710 for (int m = 0; m < vd->vdev_ms_count; m++) { 711 metaslab_t *msp = vd->vdev_ms[m]; 712 713 if (msp->ms_sm == NULL) 714 continue; 715 716 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) 717 mg_hist[i + ashift] += 718 msp->ms_sm->sm_phys->smp_histogram[i]; 719 } 720 721 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++) 722 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); 723 724 kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); 725} 726 727static void 728metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp) 729{ 730 metaslab_class_t *mc = mg->mg_class; 731 uint64_t ashift = mg->mg_vd->vdev_ashift; 732 733 ASSERT(MUTEX_HELD(&msp->ms_lock)); 734 if (msp->ms_sm == NULL) 735 return; 736 737 mutex_enter(&mg->mg_lock); 738 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 739 mg->mg_histogram[i + ashift] += 740 msp->ms_sm->sm_phys->smp_histogram[i]; 741 mc->mc_histogram[i + ashift] += 742 msp->ms_sm->sm_phys->smp_histogram[i]; 743 } 744 mutex_exit(&mg->mg_lock); 745} 746 747void 748metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) 749{ 750 metaslab_class_t *mc = mg->mg_class; 751 uint64_t ashift = mg->mg_vd->vdev_ashift; 752 753 ASSERT(MUTEX_HELD(&msp->ms_lock)); 754 if (msp->ms_sm == NULL) 755 return; 756 757 mutex_enter(&mg->mg_lock); 758 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 759 ASSERT3U(mg->mg_histogram[i + ashift], >=, 760 msp->ms_sm->sm_phys->smp_histogram[i]); 761 ASSERT3U(mc->mc_histogram[i + ashift], >=, 762 
msp->ms_sm->sm_phys->smp_histogram[i]); 763 764 mg->mg_histogram[i + ashift] -= 765 msp->ms_sm->sm_phys->smp_histogram[i]; 766 mc->mc_histogram[i + ashift] -= 767 msp->ms_sm->sm_phys->smp_histogram[i]; 768 } 769 mutex_exit(&mg->mg_lock); 770} 771 772static void 773metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) 774{ 775 ASSERT(msp->ms_group == NULL); 776 mutex_enter(&mg->mg_lock); 777 msp->ms_group = mg; 778 msp->ms_weight = 0; 779 avl_add(&mg->mg_metaslab_tree, msp); 780 mutex_exit(&mg->mg_lock); 781 782 mutex_enter(&msp->ms_lock); 783 metaslab_group_histogram_add(mg, msp); 784 mutex_exit(&msp->ms_lock); 785} 786 787static void 788metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) 789{ 790 mutex_enter(&msp->ms_lock); 791 metaslab_group_histogram_remove(mg, msp); 792 mutex_exit(&msp->ms_lock); 793 794 mutex_enter(&mg->mg_lock); 795 ASSERT(msp->ms_group == mg); 796 avl_remove(&mg->mg_metaslab_tree, msp); 797 msp->ms_group = NULL; 798 mutex_exit(&mg->mg_lock); 799} 800 801static void 802metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) 803{ 804 /* 805 * Although in principle the weight can be any value, in 806 * practice we do not use values in the range [1, 511]. 807 */ 808 ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0); 809 ASSERT(MUTEX_HELD(&msp->ms_lock)); 810 811 mutex_enter(&mg->mg_lock); 812 ASSERT(msp->ms_group == mg); 813 avl_remove(&mg->mg_metaslab_tree, msp); 814 msp->ms_weight = weight; 815 avl_add(&mg->mg_metaslab_tree, msp); 816 mutex_exit(&mg->mg_lock); 817} 818 819/* 820 * Calculate the fragmentation for a given metaslab group. We can use 821 * a simple average here since all metaslabs within the group must have 822 * the same size. The return value will be a value between 0 and 100 823 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this 824 * group have a fragmentation metric. 
825 */ 826uint64_t 827metaslab_group_fragmentation(metaslab_group_t *mg) 828{ 829 vdev_t *vd = mg->mg_vd; 830 uint64_t fragmentation = 0; 831 uint64_t valid_ms = 0; 832 833 for (int m = 0; m < vd->vdev_ms_count; m++) { 834 metaslab_t *msp = vd->vdev_ms[m]; 835 836 if (msp->ms_fragmentation == ZFS_FRAG_INVALID) 837 continue; 838 839 valid_ms++; 840 fragmentation += msp->ms_fragmentation; 841 } 842 843 if (valid_ms <= vd->vdev_ms_count / 2) 844 return (ZFS_FRAG_INVALID); 845 846 fragmentation /= valid_ms; 847 ASSERT3U(fragmentation, <=, 100); 848 return (fragmentation); 849} 850 851/* 852 * Determine if a given metaslab group should skip allocations. A metaslab 853 * group should avoid allocations if its free capacity is less than the 854 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than 855 * zfs_mg_fragmentation_threshold and there is at least one metaslab group 856 * that can still handle allocations. 857 */ 858static boolean_t 859metaslab_group_allocatable(metaslab_group_t *mg) 860{ 861 vdev_t *vd = mg->mg_vd; 862 spa_t *spa = vd->vdev_spa; 863 metaslab_class_t *mc = mg->mg_class; 864 865 /* 866 * We use two key metrics to determine if a metaslab group is 867 * considered allocatable -- free space and fragmentation. If 868 * the free space is greater than the free space threshold and 869 * the fragmentation is less than the fragmentation threshold then 870 * consider the group allocatable. There are two case when we will 871 * not consider these key metrics. The first is if the group is 872 * associated with a slog device and the second is if all groups 873 * in this metaslab class have already been consider ineligible 874 * for allocations. 
875 */ 876 return ((mg->mg_free_capacity > zfs_mg_noalloc_threshold && 877 (mg->mg_fragmentation == ZFS_FRAG_INVALID || 878 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)) || 879 mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0); 880} 881 882/* 883 * ========================================================================== 884 * Range tree callbacks 885 * ========================================================================== 886 */ 887 888/* 889 * Comparison function for the private size-ordered tree. Tree is sorted 890 * by size, larger sizes at the end of the tree. 891 */ 892static int 893metaslab_rangesize_compare(const void *x1, const void *x2) 894{ 895 const range_seg_t *r1 = x1; 896 const range_seg_t *r2 = x2; 897 uint64_t rs_size1 = r1->rs_end - r1->rs_start; 898 uint64_t rs_size2 = r2->rs_end - r2->rs_start; 899 900 if (rs_size1 < rs_size2) 901 return (-1); 902 if (rs_size1 > rs_size2) 903 return (1); 904 905 if (r1->rs_start < r2->rs_start) 906 return (-1); 907 908 if (r1->rs_start > r2->rs_start) 909 return (1); 910 911 return (0); 912} 913 914/* 915 * Create any block allocator specific components. The current allocators 916 * rely on using both a size-ordered range_tree_t and an array of uint64_t's. 917 */ 918static void 919metaslab_rt_create(range_tree_t *rt, void *arg) 920{ 921 metaslab_t *msp = arg; 922 923 ASSERT3P(rt->rt_arg, ==, msp); 924 ASSERT(msp->ms_tree == NULL); 925 926 avl_create(&msp->ms_size_tree, metaslab_rangesize_compare, 927 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); 928} 929 930/* 931 * Destroy the block allocator specific components. 
932 */ 933static void 934metaslab_rt_destroy(range_tree_t *rt, void *arg) 935{ 936 metaslab_t *msp = arg; 937 938 ASSERT3P(rt->rt_arg, ==, msp); 939 ASSERT3P(msp->ms_tree, ==, rt); 940 ASSERT0(avl_numnodes(&msp->ms_size_tree)); 941 942 avl_destroy(&msp->ms_size_tree); 943} 944 945static void 946metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg) 947{ 948 metaslab_t *msp = arg; 949 950 ASSERT3P(rt->rt_arg, ==, msp); 951 ASSERT3P(msp->ms_tree, ==, rt); 952 VERIFY(!msp->ms_condensing); 953 avl_add(&msp->ms_size_tree, rs); 954} 955 956static void 957metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg) 958{ 959 metaslab_t *msp = arg; 960 961 ASSERT3P(rt->rt_arg, ==, msp); 962 ASSERT3P(msp->ms_tree, ==, rt); 963 VERIFY(!msp->ms_condensing); 964 avl_remove(&msp->ms_size_tree, rs); 965} 966 967static void 968metaslab_rt_vacate(range_tree_t *rt, void *arg) 969{ 970 metaslab_t *msp = arg; 971 972 ASSERT3P(rt->rt_arg, ==, msp); 973 ASSERT3P(msp->ms_tree, ==, rt); 974 975 /* 976 * Normally one would walk the tree freeing nodes along the way. 977 * Since the nodes are shared with the range trees we can avoid 978 * walking all nodes and just reinitialize the avl tree. The nodes 979 * will be freed by the range tree, so we don't want to free them here. 980 */ 981 avl_create(&msp->ms_size_tree, metaslab_rangesize_compare, 982 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); 983} 984 985static range_tree_ops_t metaslab_rt_ops = { 986 metaslab_rt_create, 987 metaslab_rt_destroy, 988 metaslab_rt_add, 989 metaslab_rt_remove, 990 metaslab_rt_vacate 991}; 992 993/* 994 * ========================================================================== 995 * Metaslab block operations 996 * ========================================================================== 997 */ 998 999/* 1000 * Return the maximum contiguous segment within the metaslab. 
1001 */ 1002uint64_t 1003metaslab_block_maxsize(metaslab_t *msp) 1004{ 1005 avl_tree_t *t = &msp->ms_size_tree; 1006 range_seg_t *rs; 1007 1008 if (t == NULL || (rs = avl_last(t)) == NULL) 1009 return (0ULL); 1010 1011 return (rs->rs_end - rs->rs_start); 1012} 1013 1014uint64_t 1015metaslab_block_alloc(metaslab_t *msp, uint64_t size) 1016{ 1017 uint64_t start; 1018 range_tree_t *rt = msp->ms_tree; 1019 1020 VERIFY(!msp->ms_condensing); 1021 1022 start = msp->ms_ops->msop_alloc(msp, size); 1023 if (start != -1ULL) { 1024 vdev_t *vd = msp->ms_group->mg_vd; 1025 1026 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); 1027 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 1028 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); 1029 range_tree_remove(rt, start, size); 1030 } 1031 return (start); 1032} 1033 1034/* 1035 * ========================================================================== 1036 * Common allocator routines 1037 * ========================================================================== 1038 */ 1039 1040/* 1041 * This is a helper function that can be used by the allocator to find 1042 * a suitable block to allocate. This will search the specified AVL 1043 * tree looking for a block that matches the specified criteria. 1044 */ 1045static uint64_t 1046metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, 1047 uint64_t align) 1048{ 1049 range_seg_t *rs, rsearch; 1050 avl_index_t where; 1051 1052 rsearch.rs_start = *cursor; 1053 rsearch.rs_end = *cursor + size; 1054 1055 rs = avl_find(t, &rsearch, &where); 1056 if (rs == NULL) 1057 rs = avl_nearest(t, where, AVL_AFTER); 1058 1059 while (rs != NULL) { 1060 uint64_t offset = P2ROUNDUP(rs->rs_start, align); 1061 1062 if (offset + size <= rs->rs_end) { 1063 *cursor = offset + size; 1064 return (offset); 1065 } 1066 rs = AVL_NEXT(t, rs); 1067 } 1068 1069 /* 1070 * If we know we've searched the whole map (*cursor == 0), give up. 
1071 * Otherwise, reset the cursor to the beginning and try again. 1072 */ 1073 if (*cursor == 0) 1074 return (-1ULL); 1075 1076 *cursor = 0; 1077 return (metaslab_block_picker(t, cursor, size, align)); 1078} 1079 1080/* 1081 * ========================================================================== 1082 * The first-fit block allocator 1083 * ========================================================================== 1084 */ 1085static uint64_t 1086metaslab_ff_alloc(metaslab_t *msp, uint64_t size) 1087{ 1088 /* 1089 * Find the largest power of 2 block size that evenly divides the 1090 * requested size. This is used to try to allocate blocks with similar 1091 * alignment from the same area of the metaslab (i.e. same cursor 1092 * bucket) but it does not guarantee that other allocations sizes 1093 * may exist in the same region. 1094 */ 1095 uint64_t align = size & -size; 1096 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; 1097 avl_tree_t *t = &msp->ms_tree->rt_root; 1098 1099 return (metaslab_block_picker(t, cursor, size, align)); 1100} 1101 1102static metaslab_ops_t metaslab_ff_ops = { 1103 metaslab_ff_alloc 1104}; 1105 1106/* 1107 * ========================================================================== 1108 * Dynamic block allocator - 1109 * Uses the first fit allocation scheme until space get low and then 1110 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold 1111 * and metaslab_df_free_pct to determine when to switch the allocation scheme. 1112 * ========================================================================== 1113 */ 1114static uint64_t 1115metaslab_df_alloc(metaslab_t *msp, uint64_t size) 1116{ 1117 /* 1118 * Find the largest power of 2 block size that evenly divides the 1119 * requested size. This is used to try to allocate blocks with similar 1120 * alignment from the same area of the metaslab (i.e. 
same cursor 1121 * bucket) but it does not guarantee that other allocations sizes 1122 * may exist in the same region. 1123 */ 1124 uint64_t align = size & -size; 1125 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; 1126 range_tree_t *rt = msp->ms_tree; 1127 avl_tree_t *t = &rt->rt_root; 1128 uint64_t max_size = metaslab_block_maxsize(msp); 1129 int free_pct = range_tree_space(rt) * 100 / msp->ms_size; 1130 1131 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1132 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree)); 1133 1134 if (max_size < size) 1135 return (-1ULL); 1136 1137 /* 1138 * If we're running low on space switch to using the size 1139 * sorted AVL tree (best-fit). 1140 */ 1141 if (max_size < metaslab_df_alloc_threshold || 1142 free_pct < metaslab_df_free_pct) { 1143 t = &msp->ms_size_tree; 1144 *cursor = 0; 1145 } 1146 1147 return (metaslab_block_picker(t, cursor, size, 1ULL)); 1148} 1149 1150static metaslab_ops_t metaslab_df_ops = { 1151 metaslab_df_alloc 1152}; 1153 1154/* 1155 * ========================================================================== 1156 * Cursor fit block allocator - 1157 * Select the largest region in the metaslab, set the cursor to the beginning 1158 * of the range and the cursor_end to the end of the range. As allocations 1159 * are made advance the cursor. Continue allocating from the cursor until 1160 * the range is exhausted and then find a new range. 
1161 * ========================================================================== 1162 */ 1163static uint64_t 1164metaslab_cf_alloc(metaslab_t *msp, uint64_t size) 1165{ 1166 range_tree_t *rt = msp->ms_tree; 1167 avl_tree_t *t = &msp->ms_size_tree; 1168 uint64_t *cursor = &msp->ms_lbas[0]; 1169 uint64_t *cursor_end = &msp->ms_lbas[1]; 1170 uint64_t offset = 0; 1171 1172 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1173 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root)); 1174 1175 ASSERT3U(*cursor_end, >=, *cursor); 1176 1177 if ((*cursor + size) > *cursor_end) { 1178 range_seg_t *rs; 1179 1180 rs = avl_last(&msp->ms_size_tree); 1181 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) 1182 return (-1ULL); 1183 1184 *cursor = rs->rs_start; 1185 *cursor_end = rs->rs_end; 1186 } 1187 1188 offset = *cursor; 1189 *cursor += size; 1190 1191 return (offset); 1192} 1193 1194static metaslab_ops_t metaslab_cf_ops = { 1195 metaslab_cf_alloc 1196}; 1197 1198/* 1199 * ========================================================================== 1200 * New dynamic fit allocator - 1201 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift 1202 * contiguous blocks. If no region is found then just use the largest segment 1203 * that remains. 1204 * ========================================================================== 1205 */ 1206 1207/* 1208 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift) 1209 * to request from the allocator. 
1210 */ 1211uint64_t metaslab_ndf_clump_shift = 4; 1212 1213static uint64_t 1214metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) 1215{ 1216 avl_tree_t *t = &msp->ms_tree->rt_root; 1217 avl_index_t where; 1218 range_seg_t *rs, rsearch; 1219 uint64_t hbit = highbit64(size); 1220 uint64_t *cursor = &msp->ms_lbas[hbit - 1]; 1221 uint64_t max_size = metaslab_block_maxsize(msp); 1222 1223 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1224 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree)); 1225 1226 if (max_size < size) 1227 return (-1ULL); 1228 1229 rsearch.rs_start = *cursor; 1230 rsearch.rs_end = *cursor + size; 1231 1232 rs = avl_find(t, &rsearch, &where); 1233 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) { 1234 t = &msp->ms_size_tree; 1235 1236 rsearch.rs_start = 0; 1237 rsearch.rs_end = MIN(max_size, 1238 1ULL << (hbit + metaslab_ndf_clump_shift)); 1239 rs = avl_find(t, &rsearch, &where); 1240 if (rs == NULL) 1241 rs = avl_nearest(t, where, AVL_AFTER); 1242 ASSERT(rs != NULL); 1243 } 1244 1245 if ((rs->rs_end - rs->rs_start) >= size) { 1246 *cursor = rs->rs_start + size; 1247 return (rs->rs_start); 1248 } 1249 return (-1ULL); 1250} 1251 1252static metaslab_ops_t metaslab_ndf_ops = { 1253 metaslab_ndf_alloc 1254}; 1255 1256metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; 1257 1258/* 1259 * ========================================================================== 1260 * Metaslabs 1261 * ========================================================================== 1262 */ 1263 1264/* 1265 * Wait for any in-progress metaslab loads to complete. 
 */
void
metaslab_load_wait(metaslab_t *msp)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/* Sleep on ms_load_cv (dropping ms_lock) until ms_loading clears. */
	while (msp->ms_loading) {
		ASSERT(!msp->ms_loaded);
		cv_wait(&msp->ms_load_cv, &msp->ms_lock);
	}
}

/*
 * Populate the metaslab's in-core free range tree (ms_tree).
 *
 * The caller must hold ms_lock, and the metaslab must be neither loaded nor
 * in the middle of a load (all asserted). Returns 0 on success or the error
 * from space_map_load(); ms_loaded is set only on success. Waiters on
 * ms_load_cv are woken in either case.
 */
int
metaslab_load(metaslab_t *msp)
{
	int error = 0;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT(!msp->ms_loaded);
	ASSERT(!msp->ms_loading);

	msp->ms_loading = B_TRUE;

	/*
	 * If the space map has not been allocated yet, then treat
	 * all the space in the metaslab as free and add it to the
	 * ms_tree.
	 */
	if (msp->ms_sm != NULL)
		error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
	else
		range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);

	msp->ms_loaded = (error == 0);
	msp->ms_loading = B_FALSE;

	if (msp->ms_loaded) {
		/*
		 * Segments sitting in the defer trees are removed from the
		 * free tree that was just loaded.
		 */
		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
			range_tree_walk(msp->ms_defertree[t],
			    range_tree_remove, msp->ms_tree);
		}
	}
	cv_broadcast(&msp->ms_load_cv);
	return (error);
}

/*
 * Discard the in-core free range tree, mark the metaslab as not loaded and
 * clear the active (primary/secondary) bits from its weight. The caller must
 * hold ms_lock.
 */
void
metaslab_unload(metaslab_t *msp)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));
	range_tree_vacate(msp->ms_tree, NULL, NULL);
	msp->ms_loaded = B_FALSE;
	msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
}

/*
 * Allocate and initialize metaslab number 'id' of the group 'mg'.
 *
 * 'object' is the space map object backing this metaslab, or 0 if none has
 * been allocated yet. 'txg' selects how the new space becomes available (see
 * the comments below). Returns the new metaslab, which has been added to the
 * group.
 */
metaslab_t *
metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg)
{
	vdev_t *vd = mg->mg_vd;
	objset_t *mos = vd->vdev_spa->spa_meta_objset;
	metaslab_t *msp;

	msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
	mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&msp->ms_load_cv, NULL, CV_DEFAULT, NULL);
	msp->ms_id = id;
	/* Each metaslab covers 2^vdev_ms_shift bytes of the vdev. */
	msp->ms_start = id << vd->vdev_ms_shift;
	msp->ms_size = 1ULL << vd->vdev_ms_shift;

	/*
	 * We only open space map objects that already exist. All others
	 * will be opened when we finally allocate an object for it.
	 */
	if (object != 0) {
		VERIFY0(space_map_open(&msp->ms_sm, mos, object, msp->ms_start,
		    msp->ms_size, vd->vdev_ashift, &msp->ms_lock));
		ASSERT(msp->ms_sm != NULL);
	}

	/*
	 * We create the main range tree here, but we don't create the
	 * alloctree and freetree until metaslab_sync_done(). This serves
	 * two purposes: it allows metaslab_sync_done() to detect the
	 * addition of new space; and for debugging, it ensures that we'd
	 * data fault on any attempt to use this metaslab before it's ready.
	 */
	msp->ms_tree = range_tree_create(&metaslab_rt_ops, msp, &msp->ms_lock);
	metaslab_group_add(mg, msp);

	msp->ms_fragmentation = metaslab_fragmentation(msp);
	msp->ms_ops = mg->mg_class->mc_ops;

	/*
	 * If we're opening an existing pool (txg == 0) or creating
	 * a new one (txg == TXG_INITIAL), all space is available now.
	 * If we're adding space to an existing pool, the new space
	 * does not become available until after this txg has synced.
	 */
	if (txg <= TXG_INITIAL)
		metaslab_sync_done(msp, 0);

	/*
	 * If metaslab_debug_load is set and we're initializing a metaslab
	 * that has an allocated space_map object then load its space
	 * map so that we can verify frees.
	 */
	if (metaslab_debug_load && msp->ms_sm != NULL) {
		mutex_enter(&msp->ms_lock);
		VERIFY0(metaslab_load(msp));
		mutex_exit(&msp->ms_lock);
	}

	if (txg != 0) {
		vdev_dirty(vd, 0, NULL, txg);
		vdev_dirty(vd, VDD_METASLAB, msp, txg);
	}

	return (msp);
}

/*
 * Tear down a metaslab: remove it from its group, subtract its space from
 * the vdev's accounting, close its space map, destroy all of its range trees
 * and free the structure itself.
 */
void
metaslab_fini(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;

	metaslab_group_remove(mg, msp);

	mutex_enter(&msp->ms_lock);

	/* metaslab_group_remove() is expected to have cleared ms_group. */
	VERIFY(msp->ms_group == NULL);
	vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
	    0, -msp->ms_size);
	space_map_close(msp->ms_sm);

	metaslab_unload(msp);
	range_tree_destroy(msp->ms_tree);

	for (int t = 0; t < TXG_SIZE; t++) {
		range_tree_destroy(msp->ms_alloctree[t]);
		range_tree_destroy(msp->ms_freetree[t]);
	}

	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
		range_tree_destroy(msp->ms_defertree[t]);
	}

	ASSERT0(msp->ms_deferspace);

	mutex_exit(&msp->ms_lock);
	cv_destroy(&msp->ms_load_cv);
	mutex_destroy(&msp->ms_lock);

	kmem_free(msp, sizeof (metaslab_t));
}

#define	FRAGMENTATION_TABLE_SIZE	17

/*
 * This table defines a segment size based fragmentation metric that will
 * allow each metaslab to derive its own fragmentation value. This is done
 * by calculating the space in each bucket of the spacemap histogram and
 * multiplying that by the fragmentation metric in this table. Doing
 * this for all buckets and dividing it by the total amount of free
 * space in this metaslab (i.e. the total free space in all buckets) gives
 * us the fragmentation metric. This means that a high fragmentation metric
 * equates to most of the free space being comprised of small segments.
 * Conversely, if the metric is low, then most of the free space is in
 * large segments.
A 10% change in fragmentation equates to approximately 1434 * double the number of segments. 1435 * 1436 * This table defines 0% fragmented space using 16MB segments. Testing has 1437 * shown that segments that are greater than or equal to 16MB do not suffer 1438 * from drastic performance problems. Using this value, we derive the rest 1439 * of the table. Since the fragmentation value is never stored on disk, it 1440 * is possible to change these calculations in the future. 1441 */ 1442int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { 1443 100, /* 512B */ 1444 100, /* 1K */ 1445 98, /* 2K */ 1446 95, /* 4K */ 1447 90, /* 8K */ 1448 80, /* 16K */ 1449 70, /* 32K */ 1450 60, /* 64K */ 1451 50, /* 128K */ 1452 40, /* 256K */ 1453 30, /* 512K */ 1454 20, /* 1M */ 1455 15, /* 2M */ 1456 10, /* 4M */ 1457 5, /* 8M */ 1458 0 /* 16M */ 1459}; 1460 1461/* 1462 * Calclate the metaslab's fragmentation metric. A return value 1463 * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does 1464 * not support this metric. Otherwise, the return value should be in the 1465 * range [0, 100]. 1466 */ 1467static uint64_t 1468metaslab_fragmentation(metaslab_t *msp) 1469{ 1470 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1471 uint64_t fragmentation = 0; 1472 uint64_t total = 0; 1473 boolean_t feature_enabled = spa_feature_is_enabled(spa, 1474 SPA_FEATURE_SPACEMAP_HISTOGRAM); 1475 1476 if (!feature_enabled) 1477 return (ZFS_FRAG_INVALID); 1478 1479 /* 1480 * A null space map means that the entire metaslab is free 1481 * and thus is not fragmented. 1482 */ 1483 if (msp->ms_sm == NULL) 1484 return (0); 1485 1486 /* 1487 * If this metaslab's space_map has not been upgraded, flag it 1488 * so that we upgrade next time we encounter it. 
1489 */ 1490 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { 1491 uint64_t txg = spa_syncing_txg(spa); 1492 vdev_t *vd = msp->ms_group->mg_vd; 1493 1494 msp->ms_condense_wanted = B_TRUE; 1495 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 1496 spa_dbgmsg(spa, "txg %llu, requesting force condense: " 1497 "msp %p, vd %p", txg, msp, vd); 1498 return (ZFS_FRAG_INVALID); 1499 } 1500 1501 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 1502 uint64_t space = 0; 1503 uint8_t shift = msp->ms_sm->sm_shift; 1504 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, 1505 FRAGMENTATION_TABLE_SIZE - 1); 1506 1507 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) 1508 continue; 1509 1510 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); 1511 total += space; 1512 1513 ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE); 1514 fragmentation += space * zfs_frag_table[idx]; 1515 } 1516 1517 if (total > 0) 1518 fragmentation /= total; 1519 ASSERT3U(fragmentation, <=, 100); 1520 return (fragmentation); 1521} 1522 1523/* 1524 * Compute a weight -- a selection preference value -- for the given metaslab. 1525 * This is based on the amount of free space, the level of fragmentation, 1526 * the LBA range, and whether the metaslab is loaded. 1527 */ 1528static uint64_t 1529metaslab_weight(metaslab_t *msp) 1530{ 1531 metaslab_group_t *mg = msp->ms_group; 1532 vdev_t *vd = mg->mg_vd; 1533 uint64_t weight, space; 1534 1535 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1536 1537 /* 1538 * This vdev is in the process of being removed so there is nothing 1539 * for us to do here. 1540 */ 1541 if (vd->vdev_removing) { 1542 ASSERT0(space_map_allocated(msp->ms_sm)); 1543 ASSERT0(vd->vdev_ms_shift); 1544 return (0); 1545 } 1546 1547 /* 1548 * The baseline weight is the metaslab's free space. 
1549 */ 1550 space = msp->ms_size - space_map_allocated(msp->ms_sm); 1551 1552 msp->ms_fragmentation = metaslab_fragmentation(msp); 1553 if (metaslab_fragmentation_factor_enabled && 1554 msp->ms_fragmentation != ZFS_FRAG_INVALID) { 1555 /* 1556 * Use the fragmentation information to inversely scale 1557 * down the baseline weight. We need to ensure that we 1558 * don't exclude this metaslab completely when it's 100% 1559 * fragmented. To avoid this we reduce the fragmented value 1560 * by 1. 1561 */ 1562 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100; 1563 1564 /* 1565 * If space < SPA_MINBLOCKSIZE, then we will not allocate from 1566 * this metaslab again. The fragmentation metric may have 1567 * decreased the space to something smaller than 1568 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE 1569 * so that we can consume any remaining space. 1570 */ 1571 if (space > 0 && space < SPA_MINBLOCKSIZE) 1572 space = SPA_MINBLOCKSIZE; 1573 } 1574 weight = space; 1575 1576 /* 1577 * Modern disks have uniform bit density and constant angular velocity. 1578 * Therefore, the outer recording zones are faster (higher bandwidth) 1579 * than the inner zones by the ratio of outer to inner track diameter, 1580 * which is typically around 2:1. We account for this by assigning 1581 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). 1582 * In effect, this means that we'll select the metaslab with the most 1583 * free bandwidth rather than simply the one with the most free space. 1584 */ 1585 if (metaslab_lba_weighting_enabled) { 1586 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; 1587 ASSERT(weight >= space && weight <= 2 * space); 1588 } 1589 1590 /* 1591 * If this metaslab is one we're actively using, adjust its 1592 * weight to make it preferable to any inactive metaslab so 1593 * we'll polish it off. If the fragmentation on this metaslab 1594 * has exceed our threshold, then don't mark it active. 
1595 */ 1596 if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID && 1597 msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) { 1598 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); 1599 } 1600 1601 return (weight); 1602} 1603 1604static int 1605metaslab_activate(metaslab_t *msp, uint64_t activation_weight) 1606{ 1607 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1608 1609 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { 1610 metaslab_load_wait(msp); 1611 if (!msp->ms_loaded) { 1612 int error = metaslab_load(msp); 1613 if (error) { 1614 metaslab_group_sort(msp->ms_group, msp, 0); 1615 return (error); 1616 } 1617 } 1618 1619 metaslab_group_sort(msp->ms_group, msp, 1620 msp->ms_weight | activation_weight); 1621 } 1622 ASSERT(msp->ms_loaded); 1623 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); 1624 1625 return (0); 1626} 1627 1628static void 1629metaslab_passivate(metaslab_t *msp, uint64_t size) 1630{ 1631 /* 1632 * If size < SPA_MINBLOCKSIZE, then we will not allocate from 1633 * this metaslab again. In that case, it had better be empty, 1634 * or we would be leaving space on the table. 1635 */ 1636 ASSERT(size >= SPA_MINBLOCKSIZE || range_tree_space(msp->ms_tree) == 0); 1637 metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size)); 1638 ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); 1639} 1640 1641static void 1642metaslab_preload(void *arg) 1643{ 1644 metaslab_t *msp = arg; 1645 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1646 1647 ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); 1648 1649 mutex_enter(&msp->ms_lock); 1650 metaslab_load_wait(msp); 1651 if (!msp->ms_loaded) 1652 (void) metaslab_load(msp); 1653 1654 /* 1655 * Set the ms_access_txg value so that we don't unload it right away. 
1656 */ 1657 msp->ms_access_txg = spa_syncing_txg(spa) + metaslab_unload_delay + 1; 1658 mutex_exit(&msp->ms_lock); 1659} 1660 1661static void 1662metaslab_group_preload(metaslab_group_t *mg) 1663{ 1664 spa_t *spa = mg->mg_vd->vdev_spa; 1665 metaslab_t *msp; 1666 avl_tree_t *t = &mg->mg_metaslab_tree; 1667 int m = 0; 1668 1669 if (spa_shutting_down(spa) || !metaslab_preload_enabled) { 1670 taskq_wait(mg->mg_taskq); 1671 return; 1672 } 1673 1674 mutex_enter(&mg->mg_lock); 1675 /* 1676 * Load the next potential metaslabs 1677 */ 1678 msp = avl_first(t); 1679 while (msp != NULL) { 1680 metaslab_t *msp_next = AVL_NEXT(t, msp); 1681 1682 /* 1683 * We preload only the maximum number of metaslabs specified 1684 * by metaslab_preload_limit. If a metaslab is being forced 1685 * to condense then we preload it too. This will ensure 1686 * that force condensing happens in the next txg. 1687 */ 1688 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { 1689 msp = msp_next; 1690 continue; 1691 } 1692 1693 /* 1694 * We must drop the metaslab group lock here to preserve 1695 * lock ordering with the ms_lock (when grabbing both 1696 * the mg_lock and the ms_lock, the ms_lock must be taken 1697 * first). As a result, it is possible that the ordering 1698 * of the metaslabs within the avl tree may change before 1699 * we reacquire the lock. The metaslab cannot be removed from 1700 * the tree while we're in syncing context so it is safe to 1701 * drop the mg_lock here. If the metaslabs are reordered 1702 * nothing will break -- we just may end up loading a 1703 * less than optimal one. 1704 */ 1705 mutex_exit(&mg->mg_lock); 1706 VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload, 1707 msp, TQ_SLEEP) != 0); 1708 mutex_enter(&mg->mg_lock); 1709 msp = msp_next; 1710 } 1711 mutex_exit(&mg->mg_lock); 1712} 1713 1714/* 1715 * Determine if the space map's on-disk footprint is past our tolerance 1716 * for inefficiency. 
 * We would like to use the following criteria to make our decision:
 *
 * 1. The size of the space map object should not dramatically increase as a
 * result of writing out the free space range tree.
 *
 * 2. The minimal on-disk space map representation is zfs_condense_pct/100
 * times the size of the free space range tree representation
 * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
 *
 * 3. The on-disk size of the space map should actually decrease.
 *
 * Checking the first condition is tricky since we don't want to walk
 * the entire AVL tree calculating the estimated on-disk size. Instead we
 * use the size-ordered range tree in the metaslab and calculate the
 * size required to write out the largest segment in our free tree. If the
 * size required to represent that segment on disk is larger than the space
 * map object then we avoid condensing this map.
 *
 * To determine the second criterion we use a best-case estimate and assume
 * each segment can be represented on-disk as a single 64-bit entry. We refer
 * to this best-case estimate as the space map's minimal form.
 *
 * Unfortunately, we cannot compute the on-disk size of the space map in this
 * context because we cannot accurately compute the effects of compression, etc.
 * Instead, we apply the heuristic described in the block comment for
 * zfs_metaslab_condense_block_threshold - we only condense if the space used
 * is greater than a threshold number of blocks.
1744 */ 1745static boolean_t 1746metaslab_should_condense(metaslab_t *msp) 1747{ 1748 space_map_t *sm = msp->ms_sm; 1749 range_seg_t *rs; 1750 uint64_t size, entries, segsz, object_size, optimal_size, record_size; 1751 dmu_object_info_t doi; 1752 uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift; 1753 1754 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1755 ASSERT(msp->ms_loaded); 1756 1757 /* 1758 * Use the ms_size_tree range tree, which is ordered by size, to 1759 * obtain the largest segment in the free tree. We always condense 1760 * metaslabs that are empty and metaslabs for which a condense 1761 * request has been made. 1762 */ 1763 rs = avl_last(&msp->ms_size_tree); 1764 if (rs == NULL || msp->ms_condense_wanted) 1765 return (B_TRUE); 1766 1767 /* 1768 * Calculate the number of 64-bit entries this segment would 1769 * require when written to disk. If this single segment would be 1770 * larger on-disk than the entire current on-disk structure, then 1771 * clearly condensing will increase the on-disk structure size. 1772 */ 1773 size = (rs->rs_end - rs->rs_start) >> sm->sm_shift; 1774 entries = size / (MIN(size, SM_RUN_MAX)); 1775 segsz = entries * sizeof (uint64_t); 1776 1777 optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root); 1778 object_size = space_map_length(msp->ms_sm); 1779 1780 dmu_object_info_from_db(sm->sm_dbuf, &doi); 1781 record_size = MAX(doi.doi_data_block_size, vdev_blocksize); 1782 1783 return (segsz <= object_size && 1784 object_size >= (optimal_size * zfs_condense_pct / 100) && 1785 object_size > zfs_metaslab_condense_block_threshold * record_size); 1786} 1787 1788/* 1789 * Condense the on-disk space map representation to its minimized form. 1790 * The minimized form consists of a small number of allocations followed by 1791 * the entries of the free range tree. 
1792 */ 1793static void 1794metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) 1795{ 1796 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1797 range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK]; 1798 range_tree_t *condense_tree; 1799 space_map_t *sm = msp->ms_sm; 1800 1801 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1802 ASSERT3U(spa_sync_pass(spa), ==, 1); 1803 ASSERT(msp->ms_loaded); 1804 1805 1806 spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, " 1807 "smp size %llu, segments %lu, forcing condense=%s", txg, 1808 msp->ms_id, msp, space_map_length(msp->ms_sm), 1809 avl_numnodes(&msp->ms_tree->rt_root), 1810 msp->ms_condense_wanted ? "TRUE" : "FALSE"); 1811 1812 msp->ms_condense_wanted = B_FALSE; 1813 1814 /* 1815 * Create an range tree that is 100% allocated. We remove segments 1816 * that have been freed in this txg, any deferred frees that exist, 1817 * and any allocation in the future. Removing segments should be 1818 * a relatively inexpensive operation since we expect these trees to 1819 * have a small number of nodes. 1820 */ 1821 condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock); 1822 range_tree_add(condense_tree, msp->ms_start, msp->ms_size); 1823 1824 /* 1825 * Remove what's been freed in this txg from the condense_tree. 1826 * Since we're in sync_pass 1, we know that all the frees from 1827 * this txg are in the freetree. 1828 */ 1829 range_tree_walk(freetree, range_tree_remove, condense_tree); 1830 1831 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1832 range_tree_walk(msp->ms_defertree[t], 1833 range_tree_remove, condense_tree); 1834 } 1835 1836 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 1837 range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK], 1838 range_tree_remove, condense_tree); 1839 } 1840 1841 /* 1842 * We're about to drop the metaslab's lock thus allowing 1843 * other consumers to change it's content. 
Set the 1844 * metaslab's ms_condensing flag to ensure that 1845 * allocations on this metaslab do not occur while we're 1846 * in the middle of committing it to disk. This is only critical 1847 * for the ms_tree as all other range trees use per txg 1848 * views of their content. 1849 */ 1850 msp->ms_condensing = B_TRUE; 1851 1852 mutex_exit(&msp->ms_lock); 1853 space_map_truncate(sm, tx); 1854 mutex_enter(&msp->ms_lock); 1855 1856 /* 1857 * While we would ideally like to create a space_map representation 1858 * that consists only of allocation records, doing so can be 1859 * prohibitively expensive because the in-core free tree can be 1860 * large, and therefore computationally expensive to subtract 1861 * from the condense_tree. Instead we sync out two trees, a cheap 1862 * allocation only tree followed by the in-core free tree. While not 1863 * optimal, this is typically close to optimal, and much cheaper to 1864 * compute. 1865 */ 1866 space_map_write(sm, condense_tree, SM_ALLOC, tx); 1867 range_tree_vacate(condense_tree, NULL, NULL); 1868 range_tree_destroy(condense_tree); 1869 1870 space_map_write(sm, msp->ms_tree, SM_FREE, tx); 1871 msp->ms_condensing = B_FALSE; 1872} 1873 1874/* 1875 * Write a metaslab to disk in the context of the specified transaction group. 1876 */ 1877void 1878metaslab_sync(metaslab_t *msp, uint64_t txg) 1879{ 1880 metaslab_group_t *mg = msp->ms_group; 1881 vdev_t *vd = mg->mg_vd; 1882 spa_t *spa = vd->vdev_spa; 1883 objset_t *mos = spa_meta_objset(spa); 1884 range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK]; 1885 range_tree_t **freetree = &msp->ms_freetree[txg & TXG_MASK]; 1886 range_tree_t **freed_tree = 1887 &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK]; 1888 dmu_tx_t *tx; 1889 uint64_t object = space_map_object(msp->ms_sm); 1890 1891 ASSERT(!vd->vdev_ishole); 1892 1893 /* 1894 * This metaslab has just been added so there's no work to do now. 
1895 */ 1896 if (*freetree == NULL) { 1897 ASSERT3P(alloctree, ==, NULL); 1898 return; 1899 } 1900 1901 ASSERT3P(alloctree, !=, NULL); 1902 ASSERT3P(*freetree, !=, NULL); 1903 ASSERT3P(*freed_tree, !=, NULL); 1904 1905 /* 1906 * Normally, we don't want to process a metaslab if there 1907 * are no allocations or frees to perform. However, if the metaslab 1908 * is being forced to condense we need to let it through. 1909 */ 1910 if (range_tree_space(alloctree) == 0 && 1911 range_tree_space(*freetree) == 0 && 1912 !msp->ms_condense_wanted) 1913 return; 1914 1915 /* 1916 * The only state that can actually be changing concurrently with 1917 * metaslab_sync() is the metaslab's ms_tree. No other thread can 1918 * be modifying this txg's alloctree, freetree, freed_tree, or 1919 * space_map_phys_t. Therefore, we only hold ms_lock to satify 1920 * space_map ASSERTs. We drop it whenever we call into the DMU, 1921 * because the DMU can call down to us (e.g. via zio_free()) at 1922 * any time. 1923 */ 1924 1925 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 1926 1927 if (msp->ms_sm == NULL) { 1928 uint64_t new_object; 1929 1930 new_object = space_map_alloc(mos, tx); 1931 VERIFY3U(new_object, !=, 0); 1932 1933 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, 1934 msp->ms_start, msp->ms_size, vd->vdev_ashift, 1935 &msp->ms_lock)); 1936 ASSERT(msp->ms_sm != NULL); 1937 } 1938 1939 mutex_enter(&msp->ms_lock); 1940 1941 if (msp->ms_loaded && spa_sync_pass(spa) == 1 && 1942 metaslab_should_condense(msp)) { 1943 metaslab_condense(msp, txg, tx); 1944 } else { 1945 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx); 1946 space_map_write(msp->ms_sm, *freetree, SM_FREE, tx); 1947 } 1948 1949 metaslab_group_histogram_verify(mg); 1950 metaslab_class_histogram_verify(mg->mg_class); 1951 metaslab_group_histogram_remove(mg, msp); 1952 if (msp->ms_loaded) { 1953 /* 1954 * When the space map is loaded, we have an accruate 1955 * histogram in the range tree. 
This gives us an opportunity 1956 * to bring the space map's histogram up-to-date so we clear 1957 * it first before updating it. 1958 */ 1959 space_map_histogram_clear(msp->ms_sm); 1960 space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx); 1961 } else { 1962 /* 1963 * Since the space map is not loaded we simply update the 1964 * exisiting histogram with what was freed in this txg. This 1965 * means that the on-disk histogram may not have an accurate 1966 * view of the free space but it's close enough to allow 1967 * us to make allocation decisions. 1968 */ 1969 space_map_histogram_add(msp->ms_sm, *freetree, tx); 1970 } 1971 metaslab_group_histogram_add(mg, msp); 1972 metaslab_group_histogram_verify(mg); 1973 metaslab_class_histogram_verify(mg->mg_class); 1974 1975 /* 1976 * For sync pass 1, we avoid traversing this txg's free range tree 1977 * and instead will just swap the pointers for freetree and 1978 * freed_tree. We can safely do this since the freed_tree is 1979 * guaranteed to be empty on the initial pass. 1980 */ 1981 if (spa_sync_pass(spa) == 1) { 1982 range_tree_swap(freetree, freed_tree); 1983 } else { 1984 range_tree_vacate(*freetree, range_tree_add, *freed_tree); 1985 } 1986 range_tree_vacate(alloctree, NULL, NULL); 1987 1988 ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); 1989 ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK])); 1990 1991 mutex_exit(&msp->ms_lock); 1992 1993 if (object != space_map_object(msp->ms_sm)) { 1994 object = space_map_object(msp->ms_sm); 1995 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * 1996 msp->ms_id, sizeof (uint64_t), &object, tx); 1997 } 1998 dmu_tx_commit(tx); 1999} 2000 2001/* 2002 * Called after a transaction group has completely synced to mark 2003 * all of the metaslab's free space as usable. 
/*
 * Called after a transaction group has completely synced to mark
 * all of the metaslab's free space as usable.
 *
 * In order, under ms_lock, this:
 *   - creates the per-txg alloc/free trees and the defer trees the first
 *     time it runs for a metaslab, crediting ms_size to the vdev;
 *   - folds the space map's alloc delta and the change in deferred space
 *     into the vdev's space accounting;
 *   - empties this txg's defer tree (into ms_tree when the metaslab is
 *     loaded) and swaps it with the tree of blocks freed in this txg;
 *   - re-dirties the metaslab for txg + 1 while any deferred space remains;
 *   - unloads the metaslab once ms_access_txg has passed, unless
 *     metaslab_debug_unload is set;
 *   - re-sorts the metaslab within its group using a fresh weight.
 */
void
metaslab_sync_done(metaslab_t *msp, uint64_t txg)
{
	metaslab_group_t *mg = msp->ms_group;
	vdev_t *vd = mg->mg_vd;
	range_tree_t **freed_tree;
	range_tree_t **defer_tree;
	int64_t alloc_delta, defer_delta;

	ASSERT(!vd->vdev_ishole);

	mutex_enter(&msp->ms_lock);

	/*
	 * If this metaslab is just becoming available, initialize its
	 * alloctrees, freetrees, and defertree and add its capacity to
	 * the vdev.
	 */
	if (msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK] == NULL) {
		for (int t = 0; t < TXG_SIZE; t++) {
			ASSERT(msp->ms_alloctree[t] == NULL);
			ASSERT(msp->ms_freetree[t] == NULL);

			msp->ms_alloctree[t] = range_tree_create(NULL, msp,
			    &msp->ms_lock);
			msp->ms_freetree[t] = range_tree_create(NULL, msp,
			    &msp->ms_lock);
		}

		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
			ASSERT(msp->ms_defertree[t] == NULL);

			msp->ms_defertree[t] = range_tree_create(NULL, msp,
			    &msp->ms_lock);
		}

		vdev_space_update(vd, 0, 0, msp->ms_size);
	}

	freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
	defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];

	/*
	 * defer_delta is the net change in deferred space: what was freed
	 * in this txg (and is about to be deferred) minus what is about to
	 * be vacated from the defer tree below.
	 */
	alloc_delta = space_map_alloc_delta(msp->ms_sm);
	defer_delta = range_tree_space(*freed_tree) -
	    range_tree_space(*defer_tree);

	vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);

	/* This txg's alloc and free trees must already be empty here. */
	ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
	ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));

	/*
	 * If there's a metaslab_load() in progress, wait for it to complete
	 * so that we have a consistent view of the in-core space map.
	 */
	metaslab_load_wait(msp);

	/*
	 * Move the frees from the defer_tree back to the free
	 * range tree (if it's loaded). Swap the freed_tree and the
	 * defer_tree -- this is safe to do because we've just emptied out
	 * the defer_tree.
	 */
	range_tree_vacate(*defer_tree,
	    msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
	range_tree_swap(freed_tree, defer_tree);

	space_map_update(msp->ms_sm);

	msp->ms_deferspace += defer_delta;
	ASSERT3S(msp->ms_deferspace, >=, 0);
	ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
	if (msp->ms_deferspace != 0) {
		/*
		 * Keep syncing this metaslab until all deferred frees
		 * are back in circulation.
		 */
		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
	}

	/*
	 * A loaded metaslab whose ms_access_txg has passed is a candidate
	 * for unloading; before doing so, verify that none of the alloc
	 * trees for the following TXG_CONCURRENT_STATES - 1 txgs hold space.
	 */
	if (msp->ms_loaded && msp->ms_access_txg < txg) {
		for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
			VERIFY0(range_tree_space(
			    msp->ms_alloctree[(txg + t) & TXG_MASK]));
		}

		if (!metaslab_debug_unload)
			metaslab_unload(msp);
	}

	metaslab_group_sort(mg, msp, metaslab_weight(msp));
	mutex_exit(&msp->ms_lock);
}

/*
 * Refresh the allocation state of a metaslab group: update the group via
 * metaslab_group_alloc_update(), recompute and store its fragmentation
 * metric in mg_fragmentation, and start preloading metaslabs.
 */
void
metaslab_sync_reassess(metaslab_group_t *mg)
{
	metaslab_group_alloc_update(mg);
	mg->mg_fragmentation = metaslab_group_fragmentation(mg);

	/*
	 * Preload the next potential metaslabs
	 */
	metaslab_group_preload(mg);
}

/*
 * Return the distance between metaslab msp and the metaslab that contains
 * the offset of dva.
 *
 * Both positions are reduced to metaslab indices (the DVA offset shifted
 * right by vdev_ms_shift, and msp's ms_id); the absolute difference of the
 * indices is shifted back left by vdev_ms_shift, so the result is in the
 * same units as DVA_GET_OFFSET().  A DVA on a different vdev than msp
 * yields 1ULL << 63; a DVA inside msp itself yields 0.
 */
static uint64_t
metaslab_distance(metaslab_t *msp, dva_t *dva)
{
	uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
	uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
	uint64_t start = msp->ms_id;

	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
		return (1ULL << 63);

	if (offset < start)
		return ((start - offset) << ms_shift);
	if (offset > start)
		return ((offset - start) << ms_shift);
	return (0);
}

/*
 * Allocate a segment of asize from one of the metaslabs of group mg on
 * behalf of transaction group txg.
 *
 * dva[0 .. d-1] are consulted in two ways: if any of them is on this
 * group's vdev the chosen metaslab is activated with
 * METASLAB_WEIGHT_SECONDARY (otherwise METASLAB_WEIGHT_PRIMARY), and for
 * secondary activations a candidate metaslab is accepted only if it is at
 * least target_distance away from every one of those DVAs.
 * NOTE(review): presumably dva[0 .. d-1] are the copies already allocated
 * for the same block -- confirm against the callers.
 *
 * psize is only used in the debug message.
 *
 * Returns the allocated offset, or -1ULL when no metaslab in the group
 * can satisfy the request.  On success the range is recorded in the
 * metaslab's alloctree for txg and the metaslab is dirtied if that tree
 * was previously empty.
 */
static uint64_t
metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
    uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
{
	spa_t *spa = mg->mg_vd->vdev_spa;
	metaslab_t *msp = NULL;
	uint64_t offset = -1ULL;
	avl_tree_t *t = &mg->mg_metaslab_tree;
	uint64_t activation_weight;
	uint64_t target_distance;
	int i;

	activation_weight = METASLAB_WEIGHT_PRIMARY;
	for (i = 0; i < d; i++) {
		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
			activation_weight = METASLAB_WEIGHT_SECONDARY;
			break;
		}
	}

	/*
	 * Each iteration picks a candidate under mg_lock, then re-validates
	 * and tries to allocate from it under its own ms_lock.  The loop is
	 * left either by returning -1ULL or, on success, via the break
	 * after metaslab_block_alloc() with ms_lock still held.
	 */
	for (;;) {
		/* Only meaningful when the scan below stops on a metaslab. */
		boolean_t was_active;

		mutex_enter(&mg->mg_lock);
		for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
			/*
			 * Give up on the whole group at the first metaslab
			 * whose weight is below asize.
			 * NOTE(review): this presumes the tree is ordered
			 * by descending weight -- confirm.
			 */
			if (msp->ms_weight < asize) {
				spa_dbgmsg(spa, "%s: failed to meet weight "
				    "requirement: vdev %llu, txg %llu, mg %p, "
				    "msp %p, psize %llu, asize %llu, "
				    "weight %llu", spa_name(spa),
				    mg->mg_vd->vdev_id, txg,
				    mg, msp, psize, asize, msp->ms_weight);
				mutex_exit(&mg->mg_lock);
				return (-1ULL);
			}

			/*
			 * If the selected metaslab is condensing, skip it.
			 */
			if (msp->ms_condensing)
				continue;

			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
				break;

			/*
			 * Secondary activation: a metaslab with nothing
			 * allocated in its space map must be half again as
			 * far away as one that already has allocations.
			 */
			target_distance = min_distance +
			    (space_map_allocated(msp->ms_sm) != 0 ? 0 :
			    min_distance >> 1);

			for (i = 0; i < d; i++)
				if (metaslab_distance(msp, &dva[i]) <
				    target_distance)
					break;
			/* i == d: no DVA was closer than target_distance. */
			if (i == d)
				break;
		}
		mutex_exit(&mg->mg_lock);
		if (msp == NULL)
			return (-1ULL);

		mutex_enter(&msp->ms_lock);

		/*
		 * Ensure that the metaslab we have selected is still
		 * capable of handling our request. It's possible that
		 * another thread may have changed the weight while we
		 * were blocked on the metaslab lock.
		 */
		if (msp->ms_weight < asize || (was_active &&
		    !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
		    activation_weight == METASLAB_WEIGHT_PRIMARY)) {
			mutex_exit(&msp->ms_lock);
			continue;
		}

		/*
		 * A metaslab that is currently a secondary is not used for
		 * a primary allocation: strip its active bits and pick
		 * again.
		 */
		if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
			metaslab_passivate(msp,
			    msp->ms_weight & ~METASLAB_ACTIVE_MASK);
			mutex_exit(&msp->ms_lock);
			continue;
		}

		if (metaslab_activate(msp, activation_weight) != 0) {
			mutex_exit(&msp->ms_lock);
			continue;
		}

		/*
		 * If this metaslab is currently condensing then pick again as
		 * we can't manipulate this metaslab until it's committed
		 * to disk.
		 */
		if (msp->ms_condensing) {
			mutex_exit(&msp->ms_lock);
			continue;
		}

		if ((offset = metaslab_block_alloc(msp, asize)) != -1ULL)
			break;

		/*
		 * The allocation failed: passivate the metaslab with
		 * metaslab_block_maxsize() as its new weight and retry.
		 */
		metaslab_passivate(msp, metaslab_block_maxsize(msp));
		mutex_exit(&msp->ms_lock);
	}

	/* ms_lock is still held here (we left the loop via break). */
	if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);

	range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize);
	msp->ms_access_txg = txg + metaslab_unload_delay;

	mutex_exit(&msp->ms_lock);

	return (offset);
}

/*
 * Allocate a block for the specified i/o.
 *
 * Fills in dva[d] (vdev, offset, gang bit, asize) for a block of physical
 * size psize from metaslab class mc in transaction group txg.  Returns 0
 * on success.  On failure returns ENOSPC; when every pass over the
 * metaslab groups has been exhausted dva[d] is also zeroed.
 *
 * The groups are walked in a ring starting from a group chosen from
 * hintdva, from dva[d - 1], or from the class rotor.  The ring is walked
 * repeatedly: each pass increments dshift, which halves the minimum
 * distance (vdev_asize >> dshift) requested from metaslab_group_alloc(),
 * until that distance is no larger than one metaslab on every vdev tried.
 * If the last vdev examined was not allocatable, one more series of passes
 * is made with vdev_allocatable() evaluated under the SCL_ZIO config lock.
 */
static int
metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
    dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
{
	metaslab_group_t *mg, *rotor;
	vdev_t *vd;
	int dshift = 3;
	int all_zero;
	int zio_lock = B_FALSE;
	boolean_t allocatable;
	uint64_t offset = -1ULL;
	uint64_t asize;
	uint64_t distance;

	ASSERT(!DVA_IS_VALID(&dva[d]));

	/*
	 * For testing, make some blocks above a certain size be gang blocks.
	 */
	if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
		return (SET_ERROR(ENOSPC));

	/*
	 * Start at the rotor and loop through all mgs until we find something.
	 * Note that there's no locking on mc_rotor or mc_aliquot because
	 * nothing actually breaks if we miss a few updates -- we just won't
	 * allocate quite as evenly.  It all balances out over time.
	 *
	 * If we are doing ditto or log blocks, try to spread them across
	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
	 * allocated all of our ditto blocks, then try and spread them out on
	 * that vdev as much as possible.  If it turns out to not be possible,
	 * gradually lower our standards until anything becomes acceptable.
	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
	 * gives us hope of containing our fault domains to something we're
	 * able to reason about.  Otherwise, any two top-level vdev failures
	 * will guarantee the loss of data.  With consecutive allocation,
	 * only two adjacent top-level vdev failures will result in data loss.
	 *
	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
	 * ourselves on the same vdev as our gang block header.  That
	 * way, we can hope for locality in vdev_cache, plus it makes our
	 * fault domains something tractable.
	 */
	if (hintdva) {
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));

		/*
		 * It's possible the vdev we're using as the hint no
		 * longer exists (i.e. removed). Consult the rotor when
		 * all else fails.
		 */
		if (vd != NULL) {
			mg = vd->vdev_mg;

			if (flags & METASLAB_HINTBP_AVOID &&
			    mg->mg_next != NULL)
				mg = mg->mg_next;
		} else {
			mg = mc->mc_rotor;
		}
	} else if (d != 0) {
		/* Continue with the group after the previous copy's vdev. */
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
		mg = vd->vdev_mg->mg_next;
	} else {
		mg = mc->mc_rotor;
	}

	/*
	 * If the hint put us into the wrong metaslab class, or into a
	 * metaslab group that has been passivated, just follow the rotor.
	 */
	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
		mg = mc->mc_rotor;

	rotor = mg;
top:
	/*
	 * all_zero stays true only if this pass neither skipped a failing
	 * vdev nor asked any group for a non-zero minimum distance; when it
	 * is false another pass with a larger dshift is worthwhile.
	 */
	all_zero = B_TRUE;
	do {
		ASSERT(mg->mg_activation_count == 1);

		vd = mg->mg_vd;

		/*
		 * Don't allocate from faulted devices.
		 */
		if (zio_lock) {
			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
			allocatable = vdev_allocatable(vd);
			spa_config_exit(spa, SCL_ZIO, FTAG);
		} else {
			allocatable = vdev_allocatable(vd);
		}

		/*
		 * Determine if the selected metaslab group is eligible
		 * for allocations. If we're ganging or have requested
		 * an allocation for the smallest gang block size
		 * then we don't want to avoid allocating to this
		 * metaslab group. If we're in this condition we should
		 * try to allocate from any device possible so that we
		 * don't inadvertently return ENOSPC and suspend the pool
		 * even though space is still available.
		 */
		if (allocatable && CAN_FASTGANG(flags) &&
		    psize > SPA_GANGBLOCKSIZE)
			allocatable = metaslab_group_allocatable(mg);

		if (!allocatable)
			goto next;

		/*
		 * Avoid writing single-copy data to a failing vdev
		 * unless the user instructs us that it is okay.
		 *
		 * This only applies to the first DVA (d == 0) on the first
		 * pass (dshift == 3) and to vdevs without children; clearing
		 * all_zero guarantees a later pass that will accept it.
		 */
		if ((vd->vdev_stat.vs_write_errors > 0 ||
		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
		    d == 0 && dshift == 3 && vd->vdev_children == 0) {
			all_zero = B_FALSE;
			goto next;
		}

		ASSERT(mg->mg_class == mc);

		/*
		 * Minimum distance from the other copies on this vdev; once
		 * it is no larger than a single metaslab it is dropped to 0.
		 */
		distance = vd->vdev_asize >> dshift;
		if (distance <= (1ULL << vd->vdev_ms_shift))
			distance = 0;
		else
			all_zero = B_FALSE;

		asize = vdev_psize_to_asize(vd, psize);
		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);

		offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
		    dva, d);
		if (offset != -1ULL) {
			/*
			 * If we've just selected this metaslab group,
			 * figure out whether the corresponding vdev is
			 * over- or under-used relative to the pool,
			 * and set an allocation bias to even it out.
			 */
			if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
				vdev_stat_t *vs = &vd->vdev_stat;
				int64_t vu, cu;

				/*
				 * Percent used of the vdev (vu) and of the
				 * class (cu); the + 1 avoids dividing by 0.
				 */
				vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
				cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);

				/*
				 * Calculate how much more or less we should
				 * try to allocate from this device during
				 * this iteration around the rotor.
				 * For example, if a device is 80% full
				 * and the pool is 20% full then we should
				 * reduce allocations by 60% on this device.
				 *
				 * mg_bias = (20 - 80) * 512K / 100 = -307K
				 *
				 * This reduces allocations by 307K for this
				 * iteration.
				 */
				mg->mg_bias = ((cu - vu) *
				    (int64_t)mg->mg_aliquot) / 100;
			} else if (!metaslab_bias_enabled) {
				mg->mg_bias = 0;
			}

			/*
			 * Advance the rotor once this group has received its
			 * (biased) aliquot.
			 */
			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
			    mg->mg_aliquot + mg->mg_bias) {
				mc->mc_rotor = mg->mg_next;
				mc->mc_aliquot = 0;
			}

			DVA_SET_VDEV(&dva[d], vd->vdev_id);
			DVA_SET_OFFSET(&dva[d], offset);
			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
			DVA_SET_ASIZE(&dva[d], asize);

			return (0);
		}
next:
		/* This group yielded nothing: move the rotor past it. */
		mc->mc_rotor = mg->mg_next;
		mc->mc_aliquot = 0;
	} while ((mg = mg->mg_next) != rotor);

	/* Relax the distance requirement and walk the ring again. */
	if (!all_zero) {
		dshift++;
		ASSERT(dshift < 64);
		goto top;
	}

	/*
	 * 'allocatable' reflects the last group visited.  If it was not
	 * allocatable, start over once with vdev_allocatable() evaluated
	 * under the SCL_ZIO config lock.
	 */
	if (!allocatable && !zio_lock) {
		dshift = 3;
		zio_lock = B_TRUE;
		goto top;
	}

	bzero(&dva[d], sizeof (dva_t));

	return (SET_ERROR(ENOSPC));
}
2454 */ 2455static void 2456metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now) 2457{ 2458 uint64_t vdev = DVA_GET_VDEV(dva); 2459 uint64_t offset = DVA_GET_OFFSET(dva); 2460 uint64_t size = DVA_GET_ASIZE(dva); 2461 vdev_t *vd; 2462 metaslab_t *msp; 2463 2464 ASSERT(DVA_IS_VALID(dva)); 2465 2466 if (txg > spa_freeze_txg(spa)) 2467 return; 2468 2469 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 2470 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { 2471 cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", 2472 (u_longlong_t)vdev, (u_longlong_t)offset); 2473 ASSERT(0); 2474 return; 2475 } 2476 2477 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 2478 2479 if (DVA_GET_GANG(dva)) 2480 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 2481 2482 mutex_enter(&msp->ms_lock); 2483 2484 if (now) { 2485 range_tree_remove(msp->ms_alloctree[txg & TXG_MASK], 2486 offset, size); 2487 2488 VERIFY(!msp->ms_condensing); 2489 VERIFY3U(offset, >=, msp->ms_start); 2490 VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); 2491 VERIFY3U(range_tree_space(msp->ms_tree) + size, <=, 2492 msp->ms_size); 2493 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 2494 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 2495 range_tree_add(msp->ms_tree, offset, size); 2496 } else { 2497 if (range_tree_space(msp->ms_freetree[txg & TXG_MASK]) == 0) 2498 vdev_dirty(vd, VDD_METASLAB, msp, txg); 2499 range_tree_add(msp->ms_freetree[txg & TXG_MASK], 2500 offset, size); 2501 } 2502 2503 mutex_exit(&msp->ms_lock); 2504} 2505 2506/* 2507 * Intent log support: upon opening the pool after a crash, notify the SPA 2508 * of blocks that the intent log has allocated for immediate write, but 2509 * which are still considered free by the SPA because the last transaction 2510 * group didn't commit yet. 
2511 */ 2512static int 2513metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) 2514{ 2515 uint64_t vdev = DVA_GET_VDEV(dva); 2516 uint64_t offset = DVA_GET_OFFSET(dva); 2517 uint64_t size = DVA_GET_ASIZE(dva); 2518 vdev_t *vd; 2519 metaslab_t *msp; 2520 int error = 0; 2521 2522 ASSERT(DVA_IS_VALID(dva)); 2523 2524 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 2525 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) 2526 return (SET_ERROR(ENXIO)); 2527 2528 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 2529 2530 if (DVA_GET_GANG(dva)) 2531 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 2532 2533 mutex_enter(&msp->ms_lock); 2534 2535 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) 2536 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); 2537 2538 if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size)) 2539 error = SET_ERROR(ENOENT); 2540 2541 if (error || txg == 0) { /* txg == 0 indicates dry run */ 2542 mutex_exit(&msp->ms_lock); 2543 return (error); 2544 } 2545 2546 VERIFY(!msp->ms_condensing); 2547 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 2548 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 2549 VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size); 2550 range_tree_remove(msp->ms_tree, offset, size); 2551 2552 if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ 2553 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) 2554 vdev_dirty(vd, VDD_METASLAB, msp, txg); 2555 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size); 2556 } 2557 2558 mutex_exit(&msp->ms_lock); 2559 2560 return (0); 2561} 2562 2563int 2564metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, 2565 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags) 2566{ 2567 dva_t *dva = bp->blk_dva; 2568 dva_t *hintdva = hintbp->blk_dva; 2569 int error = 0; 2570 2571 ASSERT(bp->blk_birth == 0); 2572 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); 2573 2574 spa_config_enter(spa, SCL_ALLOC, FTAG, 
RW_READER); 2575 2576 if (mc->mc_rotor == NULL) { /* no vdevs in this class */ 2577 spa_config_exit(spa, SCL_ALLOC, FTAG); 2578 return (SET_ERROR(ENOSPC)); 2579 } 2580 2581 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); 2582 ASSERT(BP_GET_NDVAS(bp) == 0); 2583 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); 2584 2585 for (int d = 0; d < ndvas; d++) { 2586 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, 2587 txg, flags); 2588 if (error != 0) { 2589 for (d--; d >= 0; d--) { 2590 metaslab_free_dva(spa, &dva[d], txg, B_TRUE); 2591 bzero(&dva[d], sizeof (dva_t)); 2592 } 2593 spa_config_exit(spa, SCL_ALLOC, FTAG); 2594 return (error); 2595 } 2596 } 2597 ASSERT(error == 0); 2598 ASSERT(BP_GET_NDVAS(bp) == ndvas); 2599 2600 spa_config_exit(spa, SCL_ALLOC, FTAG); 2601 2602 BP_SET_BIRTH(bp, txg, txg); 2603 2604 return (0); 2605} 2606 2607void 2608metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) 2609{ 2610 const dva_t *dva = bp->blk_dva; 2611 int ndvas = BP_GET_NDVAS(bp); 2612 2613 ASSERT(!BP_IS_HOLE(bp)); 2614 ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); 2615 2616 spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); 2617 2618 for (int d = 0; d < ndvas; d++) 2619 metaslab_free_dva(spa, &dva[d], txg, now); 2620 2621 spa_config_exit(spa, SCL_FREE, FTAG); 2622} 2623 2624int 2625metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) 2626{ 2627 const dva_t *dva = bp->blk_dva; 2628 int ndvas = BP_GET_NDVAS(bp); 2629 int error = 0; 2630 2631 ASSERT(!BP_IS_HOLE(bp)); 2632 2633 if (txg != 0) { 2634 /* 2635 * First do a dry run to make sure all DVAs are claimable, 2636 * so we don't have to unwind from partial failures below. 
2637 */ 2638 if ((error = metaslab_claim(spa, bp, 0)) != 0) 2639 return (error); 2640 } 2641 2642 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 2643 2644 for (int d = 0; d < ndvas; d++) 2645 if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) 2646 break; 2647 2648 spa_config_exit(spa, SCL_ALLOC, FTAG); 2649 2650 ASSERT(error == 0 || txg == 0); 2651 2652 return (error); 2653} 2654 2655void 2656metaslab_check_free(spa_t *spa, const blkptr_t *bp) 2657{ 2658 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) 2659 return; 2660 2661 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 2662 for (int i = 0; i < BP_GET_NDVAS(bp); i++) { 2663 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]); 2664 vdev_t *vd = vdev_lookup_top(spa, vdev); 2665 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); 2666 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); 2667 metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 2668 2669 if (msp->ms_loaded) 2670 range_tree_verify(msp->ms_tree, offset, size); 2671 2672 for (int j = 0; j < TXG_SIZE; j++) 2673 range_tree_verify(msp->ms_freetree[j], offset, size); 2674 for (int j = 0; j < TXG_DEFER_SIZE; j++) 2675 range_tree_verify(msp->ms_defertree[j], offset, size); 2676 } 2677 spa_config_exit(spa, SCL_VDEV, FTAG); 2678} 2679