metaslab.c revision 290753
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2015 by Delphix. All rights reserved. 24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 25 */ 26 27#include <sys/zfs_context.h> 28#include <sys/dmu.h> 29#include <sys/dmu_tx.h> 30#include <sys/space_map.h> 31#include <sys/metaslab_impl.h> 32#include <sys/vdev_impl.h> 33#include <sys/zio.h> 34#include <sys/spa_impl.h> 35#include <sys/zfeature.h> 36 37SYSCTL_DECL(_vfs_zfs); 38SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab"); 39 40/* 41 * Allow allocations to switch to gang blocks quickly. We do this to 42 * avoid having to load lots of space_maps in a given txg. There are, 43 * however, some cases where we want to avoid "fast" ganging and instead 44 * we want to do an exhaustive search of all metaslabs on this device. 45 * Currently we don't allow any gang, slog, or dump device related allocations 46 * to "fast" gang. 47 */ 48#define CAN_FASTGANG(flags) \ 49 (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \ 50 METASLAB_GANG_AVOID))) 51 52#define METASLAB_WEIGHT_PRIMARY (1ULL << 63) 53#define METASLAB_WEIGHT_SECONDARY (1ULL << 62) 54#define METASLAB_ACTIVE_MASK \ 55 (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) 56 57uint64_t metaslab_aliquot = 512ULL << 10; 58uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ 59TUNABLE_QUAD("vfs.zfs.metaslab.gang_bang", &metaslab_gang_bang); 60SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, gang_bang, CTLFLAG_RWTUN, 61 &metaslab_gang_bang, 0, 62 "Force gang block allocation for blocks larger than or equal to this value"); 63 64/* 65 * The in-core space map representation is more compact than its on-disk form. 66 * The zfs_condense_pct determines how much more compact the in-core 67 * space_map representation must be before we compact it on-disk. 68 * Values should be greater than or equal to 100. 69 */ 70int zfs_condense_pct = 200; 71TUNABLE_INT("vfs.zfs.condense_pct", &zfs_condense_pct); 72SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN, 73 &zfs_condense_pct, 0, 74 "Condense on-disk spacemap when it is more than this many percents" 75 " of in-memory counterpart"); 76 77/* 78 * Condensing a metaslab is not guaranteed to actually reduce the amount of 79 * space used on disk. In particular, a space map uses data in increments of 80 * MAX(1 << ashift, space_map_blksize), so a metaslab might use the 81 * same number of blocks after condensing. 
Since the goal of condensing is to 82 * reduce the number of IOPs required to read the space map, we only want to 83 * condense when we can be sure we will reduce the number of blocks used by the 84 * space map. Unfortunately, we cannot precisely compute whether or not this is 85 * the case in metaslab_should_condense since we are holding ms_lock. Instead, 86 * we apply the following heuristic: do not condense a spacemap unless the 87 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold 88 * blocks. 89 */ 90int zfs_metaslab_condense_block_threshold = 4; 91 92/* 93 * The zfs_mg_noalloc_threshold defines which metaslab groups should 94 * be eligible for allocation. The value is defined as a percentage of 95 * free space. Metaslab groups that have more free space than 96 * zfs_mg_noalloc_threshold are always eligible for allocations. Once 97 * a metaslab group's free space is less than or equal to the 98 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that 99 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold. 100 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all 101 * groups are allowed to accept allocations. Gang blocks are always 102 * eligible to allocate on any metaslab group. The default value of 0 means 103 * no metaslab group will be excluded based on this criterion. 104 */ 105int zfs_mg_noalloc_threshold = 0; 106TUNABLE_INT("vfs.zfs.mg_noalloc_threshold", &zfs_mg_noalloc_threshold); 107SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN, 108 &zfs_mg_noalloc_threshold, 0, 109 "Percentage of metaslab group size that should be free" 110 " to make it eligible for allocation"); 111 112/* 113 * Metaslab groups are considered eligible for allocations if their 114 * fragmenation metric (measured as a percentage) is less than or equal to 115 * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold 116 * then it will be skipped unless all metaslab groups within the metaslab 117 * class have also crossed this threshold. 118 */ 119int zfs_mg_fragmentation_threshold = 85; 120TUNABLE_INT("vfs.zfs.mg_fragmentation_threshold", &zfs_mg_fragmentation_threshold); 121SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_fragmentation_threshold, CTLFLAG_RWTUN, 122 &zfs_mg_fragmentation_threshold, 0, 123 "Percentage of metaslab group size that should be considered " 124 "eligible for allocations unless all metaslab groups within the metaslab class " 125 "have also crossed this threshold"); 126 127/* 128 * Allow metaslabs to keep their active state as long as their fragmentation 129 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An 130 * active metaslab that exceeds this threshold will no longer keep its active 131 * status allowing better metaslabs to be selected. 132 */ 133int zfs_metaslab_fragmentation_threshold = 70; 134TUNABLE_INT("vfs.zfs.metaslab.fragmentation_threshold", 135 &zfs_metaslab_fragmentation_threshold); 136SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_threshold, CTLFLAG_RWTUN, 137 &zfs_metaslab_fragmentation_threshold, 0, 138 "Maximum percentage of metaslab fragmentation level to keep their active state"); 139 140/* 141 * When set will load all metaslabs when pool is first opened. 
142 */ 143int metaslab_debug_load = 0; 144TUNABLE_INT("vfs.zfs.metaslab.debug_load", &metaslab_debug_load); 145SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_load, CTLFLAG_RWTUN, 146 &metaslab_debug_load, 0, 147 "Load all metaslabs when pool is first opened"); 148 149/* 150 * When set will prevent metaslabs from being unloaded. 151 */ 152int metaslab_debug_unload = 0; 153TUNABLE_INT("vfs.zfs.metaslab.debug_unload", &metaslab_debug_unload); 154SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_unload, CTLFLAG_RWTUN, 155 &metaslab_debug_unload, 0, 156 "Prevent metaslabs from being unloaded"); 157 158/* 159 * Minimum size which forces the dynamic allocator to change 160 * it's allocation strategy. Once the space map cannot satisfy 161 * an allocation of this size then it switches to using more 162 * aggressive strategy (i.e search by size rather than offset). 163 */ 164uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE; 165TUNABLE_QUAD("vfs.zfs.metaslab.df_alloc_threshold", 166 &metaslab_df_alloc_threshold); 167SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN, 168 &metaslab_df_alloc_threshold, 0, 169 "Minimum size which forces the dynamic allocator to change it's allocation strategy"); 170 171/* 172 * The minimum free space, in percent, which must be available 173 * in a space map to continue allocations in a first-fit fashion. 174 * Once the space_map's free space drops below this level we dynamically 175 * switch to using best-fit allocations. 176 */ 177int metaslab_df_free_pct = 4; 178TUNABLE_INT("vfs.zfs.metaslab.df_free_pct", &metaslab_df_free_pct); 179SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN, 180 &metaslab_df_free_pct, 0, 181 "The minimum free space, in percent, which must be available in a space map to continue allocations in a first-fit fashion"); 182 183/* 184 * A metaslab is considered "free" if it contains a contiguous 185 * segment which is greater than metaslab_min_alloc_size. 186 */ 187uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS; 188TUNABLE_QUAD("vfs.zfs.metaslab.min_alloc_size", 189 &metaslab_min_alloc_size); 190SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, min_alloc_size, CTLFLAG_RWTUN, 191 &metaslab_min_alloc_size, 0, 192 "A metaslab is considered \"free\" if it contains a contiguous segment which is greater than vfs.zfs.metaslab.min_alloc_size"); 193 194/* 195 * Percentage of all cpus that can be used by the metaslab taskq. 196 */ 197int metaslab_load_pct = 50; 198TUNABLE_INT("vfs.zfs.metaslab.load_pct", &metaslab_load_pct); 199SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN, 200 &metaslab_load_pct, 0, 201 "Percentage of cpus that can be used by the metaslab taskq"); 202 203/* 204 * Determines how many txgs a metaslab may remain loaded without having any 205 * allocations from it. As long as a metaslab continues to be used we will 206 * keep it loaded. 207 */ 208int metaslab_unload_delay = TXG_SIZE * 2; 209TUNABLE_INT("vfs.zfs.metaslab.unload_delay", &metaslab_unload_delay); 210SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN, 211 &metaslab_unload_delay, 0, 212 "Number of TXGs that an unused metaslab can be kept in memory"); 213 214/* 215 * Max number of metaslabs per group to preload. 
216 */ 217int metaslab_preload_limit = SPA_DVAS_PER_BP; 218TUNABLE_INT("vfs.zfs.metaslab.preload_limit", &metaslab_preload_limit); 219SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN, 220 &metaslab_preload_limit, 0, 221 "Max number of metaslabs per group to preload"); 222 223/* 224 * Enable/disable preloading of metaslab. 225 */ 226boolean_t metaslab_preload_enabled = B_TRUE; 227TUNABLE_INT("vfs.zfs.metaslab.preload_enabled", &metaslab_preload_enabled); 228SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN, 229 &metaslab_preload_enabled, 0, 230 "Max number of metaslabs per group to preload"); 231 232/* 233 * Enable/disable fragmentation weighting on metaslabs. 234 */ 235boolean_t metaslab_fragmentation_factor_enabled = B_TRUE; 236TUNABLE_INT("vfs.zfs.metaslab_fragmentation_factor_enabled", 237 &metaslab_fragmentation_factor_enabled); 238SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_factor_enabled, CTLFLAG_RWTUN, 239 &metaslab_fragmentation_factor_enabled, 0, 240 "Enable fragmentation weighting on metaslabs"); 241 242/* 243 * Enable/disable lba weighting (i.e. outer tracks are given preference). 244 */ 245boolean_t metaslab_lba_weighting_enabled = B_TRUE; 246TUNABLE_INT("vfs.zfs.metaslab.lba_weighting_enabled", 247 &metaslab_lba_weighting_enabled); 248SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, lba_weighting_enabled, CTLFLAG_RWTUN, 249 &metaslab_lba_weighting_enabled, 0, 250 "Enable LBA weighting (i.e. outer tracks are given preference)"); 251 252/* 253 * Enable/disable metaslab group biasing. 254 */ 255boolean_t metaslab_bias_enabled = B_TRUE; 256TUNABLE_INT("vfs.zfs.metaslab.bias_enabled", 257 &metaslab_bias_enabled); 258SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, bias_enabled, CTLFLAG_RWTUN, 259 &metaslab_bias_enabled, 0, 260 "Enable metaslab group biasing"); 261 262static uint64_t metaslab_fragmentation(metaslab_t *); 263 264/* 265 * ========================================================================== 266 * Metaslab classes 267 * ========================================================================== 268 */ 269metaslab_class_t * 270metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) 271{ 272 metaslab_class_t *mc; 273 274 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); 275 276 mc->mc_spa = spa; 277 mc->mc_rotor = NULL; 278 mc->mc_ops = ops; 279 280 return (mc); 281} 282 283void 284metaslab_class_destroy(metaslab_class_t *mc) 285{ 286 ASSERT(mc->mc_rotor == NULL); 287 ASSERT(mc->mc_alloc == 0); 288 ASSERT(mc->mc_deferred == 0); 289 ASSERT(mc->mc_space == 0); 290 ASSERT(mc->mc_dspace == 0); 291 292 kmem_free(mc, sizeof (metaslab_class_t)); 293} 294 295int 296metaslab_class_validate(metaslab_class_t *mc) 297{ 298 metaslab_group_t *mg; 299 vdev_t *vd; 300 301 /* 302 * Must hold one of the spa_config locks. 
303 */ 304 ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || 305 spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); 306 307 if ((mg = mc->mc_rotor) == NULL) 308 return (0); 309 310 do { 311 vd = mg->mg_vd; 312 ASSERT(vd->vdev_mg != NULL); 313 ASSERT3P(vd->vdev_top, ==, vd); 314 ASSERT3P(mg->mg_class, ==, mc); 315 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); 316 } while ((mg = mg->mg_next) != mc->mc_rotor); 317 318 return (0); 319} 320 321void 322metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, 323 int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) 324{ 325 atomic_add_64(&mc->mc_alloc, alloc_delta); 326 atomic_add_64(&mc->mc_deferred, defer_delta); 327 atomic_add_64(&mc->mc_space, space_delta); 328 atomic_add_64(&mc->mc_dspace, dspace_delta); 329} 330 331void 332metaslab_class_minblocksize_update(metaslab_class_t *mc) 333{ 334 metaslab_group_t *mg; 335 vdev_t *vd; 336 uint64_t minashift = UINT64_MAX; 337 338 if ((mg = mc->mc_rotor) == NULL) { 339 mc->mc_minblocksize = SPA_MINBLOCKSIZE; 340 return; 341 } 342 343 do { 344 vd = mg->mg_vd; 345 if (vd->vdev_ashift < minashift) 346 minashift = vd->vdev_ashift; 347 } while ((mg = mg->mg_next) != mc->mc_rotor); 348 349 mc->mc_minblocksize = 1ULL << minashift; 350} 351 352uint64_t 353metaslab_class_get_alloc(metaslab_class_t *mc) 354{ 355 return (mc->mc_alloc); 356} 357 358uint64_t 359metaslab_class_get_deferred(metaslab_class_t *mc) 360{ 361 return (mc->mc_deferred); 362} 363 364uint64_t 365metaslab_class_get_space(metaslab_class_t *mc) 366{ 367 return (mc->mc_space); 368} 369 370uint64_t 371metaslab_class_get_dspace(metaslab_class_t *mc) 372{ 373 return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); 374} 375 376uint64_t 377metaslab_class_get_minblocksize(metaslab_class_t *mc) 378{ 379 return (mc->mc_minblocksize); 380} 381 382void 383metaslab_class_histogram_verify(metaslab_class_t *mc) 384{ 385 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 386 uint64_t *mc_hist; 387 int i; 388 389 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) 390 return; 391 392 mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, 393 KM_SLEEP); 394 395 for (int c = 0; c < rvd->vdev_children; c++) { 396 vdev_t *tvd = rvd->vdev_child[c]; 397 metaslab_group_t *mg = tvd->vdev_mg; 398 399 /* 400 * Skip any holes, uninitialized top-levels, or 401 * vdevs that are not in this metalab class. 402 */ 403 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || 404 mg->mg_class != mc) { 405 continue; 406 } 407 408 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 409 mc_hist[i] += mg->mg_histogram[i]; 410 } 411 412 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 413 VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]); 414 415 kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); 416} 417 418/* 419 * Calculate the metaslab class's fragmentation metric. The metric 420 * is weighted based on the space contribution of each metaslab group. 421 * The return value will be a number between 0 and 100 (inclusive), or 422 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the 423 * zfs_frag_table for more information about the metric. 
424 */ 425uint64_t 426metaslab_class_fragmentation(metaslab_class_t *mc) 427{ 428 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 429 uint64_t fragmentation = 0; 430 431 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); 432 433 for (int c = 0; c < rvd->vdev_children; c++) { 434 vdev_t *tvd = rvd->vdev_child[c]; 435 metaslab_group_t *mg = tvd->vdev_mg; 436 437 /* 438 * Skip any holes, uninitialized top-levels, or 439 * vdevs that are not in this metalab class. 440 */ 441 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || 442 mg->mg_class != mc) { 443 continue; 444 } 445 446 /* 447 * If a metaslab group does not contain a fragmentation 448 * metric then just bail out. 449 */ 450 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { 451 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 452 return (ZFS_FRAG_INVALID); 453 } 454 455 /* 456 * Determine how much this metaslab_group is contributing 457 * to the overall pool fragmentation metric. 458 */ 459 fragmentation += mg->mg_fragmentation * 460 metaslab_group_get_space(mg); 461 } 462 fragmentation /= metaslab_class_get_space(mc); 463 464 ASSERT3U(fragmentation, <=, 100); 465 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 466 return (fragmentation); 467} 468 469/* 470 * Calculate the amount of expandable space that is available in 471 * this metaslab class. If a device is expanded then its expandable 472 * space will be the amount of allocatable space that is currently not 473 * part of this metaslab class. 474 */ 475uint64_t 476metaslab_class_expandable_space(metaslab_class_t *mc) 477{ 478 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 479 uint64_t space = 0; 480 481 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); 482 for (int c = 0; c < rvd->vdev_children; c++) { 483 vdev_t *tvd = rvd->vdev_child[c]; 484 metaslab_group_t *mg = tvd->vdev_mg; 485 486 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || 487 mg->mg_class != mc) { 488 continue; 489 } 490 491 space += tvd->vdev_max_asize - tvd->vdev_asize; 492 } 493 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 494 return (space); 495} 496 497/* 498 * ========================================================================== 499 * Metaslab groups 500 * ========================================================================== 501 */ 502static int 503metaslab_compare(const void *x1, const void *x2) 504{ 505 const metaslab_t *m1 = x1; 506 const metaslab_t *m2 = x2; 507 508 if (m1->ms_weight < m2->ms_weight) 509 return (1); 510 if (m1->ms_weight > m2->ms_weight) 511 return (-1); 512 513 /* 514 * If the weights are identical, use the offset to force uniqueness. 515 */ 516 if (m1->ms_start < m2->ms_start) 517 return (-1); 518 if (m1->ms_start > m2->ms_start) 519 return (1); 520 521 ASSERT3P(m1, ==, m2); 522 523 return (0); 524} 525 526/* 527 * Update the allocatable flag and the metaslab group's capacity. 528 * The allocatable flag is set to true if the capacity is below 529 * the zfs_mg_noalloc_threshold. If a metaslab group transitions 530 * from allocatable to non-allocatable or vice versa then the metaslab 531 * group's class is updated to reflect the transition. 
532 */ 533static void 534metaslab_group_alloc_update(metaslab_group_t *mg) 535{ 536 vdev_t *vd = mg->mg_vd; 537 metaslab_class_t *mc = mg->mg_class; 538 vdev_stat_t *vs = &vd->vdev_stat; 539 boolean_t was_allocatable; 540 541 ASSERT(vd == vd->vdev_top); 542 543 mutex_enter(&mg->mg_lock); 544 was_allocatable = mg->mg_allocatable; 545 546 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / 547 (vs->vs_space + 1); 548 549 /* 550 * A metaslab group is considered allocatable if it has plenty 551 * of free space or is not heavily fragmented. We only take 552 * fragmentation into account if the metaslab group has a valid 553 * fragmentation metric (i.e. a value between 0 and 100). 554 */ 555 mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold && 556 (mg->mg_fragmentation == ZFS_FRAG_INVALID || 557 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); 558 559 /* 560 * The mc_alloc_groups maintains a count of the number of 561 * groups in this metaslab class that are still above the 562 * zfs_mg_noalloc_threshold. This is used by the allocating 563 * threads to determine if they should avoid allocations to 564 * a given group. The allocator will avoid allocations to a group 565 * if that group has reached or is below the zfs_mg_noalloc_threshold 566 * and there are still other groups that are above the threshold. 567 * When a group transitions from allocatable to non-allocatable or 568 * vice versa we update the metaslab class to reflect that change. 569 * When the mc_alloc_groups value drops to 0 that means that all 570 * groups have reached the zfs_mg_noalloc_threshold making all groups 571 * eligible for allocations. This effectively means that all devices 572 * are balanced again. 573 */ 574 if (was_allocatable && !mg->mg_allocatable) 575 mc->mc_alloc_groups--; 576 else if (!was_allocatable && mg->mg_allocatable) 577 mc->mc_alloc_groups++; 578 579 mutex_exit(&mg->mg_lock); 580} 581 582metaslab_group_t * 583metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) 584{ 585 metaslab_group_t *mg; 586 587 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); 588 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); 589 avl_create(&mg->mg_metaslab_tree, metaslab_compare, 590 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); 591 mg->mg_vd = vd; 592 mg->mg_class = mc; 593 mg->mg_activation_count = 0; 594 595 mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, 596 minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT); 597 598 return (mg); 599} 600 601void 602metaslab_group_destroy(metaslab_group_t *mg) 603{ 604 ASSERT(mg->mg_prev == NULL); 605 ASSERT(mg->mg_next == NULL); 606 /* 607 * We may have gone below zero with the activation count 608 * either because we never activated in the first place or 609 * because we're done, and possibly removing the vdev. 
610 */ 611 ASSERT(mg->mg_activation_count <= 0); 612 613 taskq_destroy(mg->mg_taskq); 614 avl_destroy(&mg->mg_metaslab_tree); 615 mutex_destroy(&mg->mg_lock); 616 kmem_free(mg, sizeof (metaslab_group_t)); 617} 618 619void 620metaslab_group_activate(metaslab_group_t *mg) 621{ 622 metaslab_class_t *mc = mg->mg_class; 623 metaslab_group_t *mgprev, *mgnext; 624 625 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); 626 627 ASSERT(mc->mc_rotor != mg); 628 ASSERT(mg->mg_prev == NULL); 629 ASSERT(mg->mg_next == NULL); 630 ASSERT(mg->mg_activation_count <= 0); 631 632 if (++mg->mg_activation_count <= 0) 633 return; 634 635 mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); 636 metaslab_group_alloc_update(mg); 637 638 if ((mgprev = mc->mc_rotor) == NULL) { 639 mg->mg_prev = mg; 640 mg->mg_next = mg; 641 } else { 642 mgnext = mgprev->mg_next; 643 mg->mg_prev = mgprev; 644 mg->mg_next = mgnext; 645 mgprev->mg_next = mg; 646 mgnext->mg_prev = mg; 647 } 648 mc->mc_rotor = mg; 649 metaslab_class_minblocksize_update(mc); 650} 651 652void 653metaslab_group_passivate(metaslab_group_t *mg) 654{ 655 metaslab_class_t *mc = mg->mg_class; 656 metaslab_group_t *mgprev, *mgnext; 657 658 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); 659 660 if (--mg->mg_activation_count != 0) { 661 ASSERT(mc->mc_rotor != mg); 662 ASSERT(mg->mg_prev == NULL); 663 ASSERT(mg->mg_next == NULL); 664 ASSERT(mg->mg_activation_count < 0); 665 return; 666 } 667 668 taskq_wait(mg->mg_taskq); 669 metaslab_group_alloc_update(mg); 670 671 mgprev = mg->mg_prev; 672 mgnext = mg->mg_next; 673 674 if (mg == mgnext) { 675 mc->mc_rotor = NULL; 676 } else { 677 mc->mc_rotor = mgnext; 678 mgprev->mg_next = mgnext; 679 mgnext->mg_prev = mgprev; 680 } 681 682 mg->mg_prev = NULL; 683 mg->mg_next = NULL; 684 metaslab_class_minblocksize_update(mc); 685} 686 687uint64_t 688metaslab_group_get_space(metaslab_group_t *mg) 689{ 690 return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count); 691} 692 693void 694metaslab_group_histogram_verify(metaslab_group_t *mg) 695{ 696 uint64_t *mg_hist; 697 vdev_t *vd = mg->mg_vd; 698 uint64_t ashift = vd->vdev_ashift; 699 int i; 700 701 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) 702 return; 703 704 mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, 705 KM_SLEEP); 706 707 ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=, 708 SPACE_MAP_HISTOGRAM_SIZE + ashift); 709 710 for (int m = 0; m < vd->vdev_ms_count; m++) { 711 metaslab_t *msp = vd->vdev_ms[m]; 712 713 if (msp->ms_sm == NULL) 714 continue; 715 716 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) 717 mg_hist[i + ashift] += 718 msp->ms_sm->sm_phys->smp_histogram[i]; 719 } 720 721 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++) 722 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); 723 724 kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); 725} 726 727static void 728metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp) 729{ 730 metaslab_class_t *mc = mg->mg_class; 731 uint64_t ashift = mg->mg_vd->vdev_ashift; 732 733 ASSERT(MUTEX_HELD(&msp->ms_lock)); 734 if (msp->ms_sm == NULL) 735 return; 736 737 mutex_enter(&mg->mg_lock); 738 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 739 mg->mg_histogram[i + ashift] += 740 msp->ms_sm->sm_phys->smp_histogram[i]; 741 mc->mc_histogram[i + ashift] += 742 msp->ms_sm->sm_phys->smp_histogram[i]; 743 } 744 mutex_exit(&mg->mg_lock); 745} 746 747void 748metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) 749{ 750 
metaslab_class_t *mc = mg->mg_class; 751 uint64_t ashift = mg->mg_vd->vdev_ashift; 752 753 ASSERT(MUTEX_HELD(&msp->ms_lock)); 754 if (msp->ms_sm == NULL) 755 return; 756 757 mutex_enter(&mg->mg_lock); 758 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 759 ASSERT3U(mg->mg_histogram[i + ashift], >=, 760 msp->ms_sm->sm_phys->smp_histogram[i]); 761 ASSERT3U(mc->mc_histogram[i + ashift], >=, 762 msp->ms_sm->sm_phys->smp_histogram[i]); 763 764 mg->mg_histogram[i + ashift] -= 765 msp->ms_sm->sm_phys->smp_histogram[i]; 766 mc->mc_histogram[i + ashift] -= 767 msp->ms_sm->sm_phys->smp_histogram[i]; 768 } 769 mutex_exit(&mg->mg_lock); 770} 771 772static void 773metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) 774{ 775 ASSERT(msp->ms_group == NULL); 776 mutex_enter(&mg->mg_lock); 777 msp->ms_group = mg; 778 msp->ms_weight = 0; 779 avl_add(&mg->mg_metaslab_tree, msp); 780 mutex_exit(&mg->mg_lock); 781 782 mutex_enter(&msp->ms_lock); 783 metaslab_group_histogram_add(mg, msp); 784 mutex_exit(&msp->ms_lock); 785} 786 787static void 788metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) 789{ 790 mutex_enter(&msp->ms_lock); 791 metaslab_group_histogram_remove(mg, msp); 792 mutex_exit(&msp->ms_lock); 793 794 mutex_enter(&mg->mg_lock); 795 ASSERT(msp->ms_group == mg); 796 avl_remove(&mg->mg_metaslab_tree, msp); 797 msp->ms_group = NULL; 798 mutex_exit(&mg->mg_lock); 799} 800 801static void 802metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) 803{ 804 /* 805 * Although in principle the weight can be any value, in 806 * practice we do not use values in the range [1, 511]. 807 */ 808 ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0); 809 ASSERT(MUTEX_HELD(&msp->ms_lock)); 810 811 mutex_enter(&mg->mg_lock); 812 ASSERT(msp->ms_group == mg); 813 avl_remove(&mg->mg_metaslab_tree, msp); 814 msp->ms_weight = weight; 815 avl_add(&mg->mg_metaslab_tree, msp); 816 mutex_exit(&mg->mg_lock); 817} 818 819/* 820 * Calculate the fragmentation for a given metaslab group. We can use 821 * a simple average here since all metaslabs within the group must have 822 * the same size. The return value will be a value between 0 and 100 823 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this 824 * group have a fragmentation metric. 825 */ 826uint64_t 827metaslab_group_fragmentation(metaslab_group_t *mg) 828{ 829 vdev_t *vd = mg->mg_vd; 830 uint64_t fragmentation = 0; 831 uint64_t valid_ms = 0; 832 833 for (int m = 0; m < vd->vdev_ms_count; m++) { 834 metaslab_t *msp = vd->vdev_ms[m]; 835 836 if (msp->ms_fragmentation == ZFS_FRAG_INVALID) 837 continue; 838 839 valid_ms++; 840 fragmentation += msp->ms_fragmentation; 841 } 842 843 if (valid_ms <= vd->vdev_ms_count / 2) 844 return (ZFS_FRAG_INVALID); 845 846 fragmentation /= valid_ms; 847 ASSERT3U(fragmentation, <=, 100); 848 return (fragmentation); 849} 850 851/* 852 * Determine if a given metaslab group should skip allocations. A metaslab 853 * group should avoid allocations if its free capacity is less than the 854 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than 855 * zfs_mg_fragmentation_threshold and there is at least one metaslab group 856 * that can still handle allocations. 
857 */ 858static boolean_t 859metaslab_group_allocatable(metaslab_group_t *mg) 860{ 861 vdev_t *vd = mg->mg_vd; 862 spa_t *spa = vd->vdev_spa; 863 metaslab_class_t *mc = mg->mg_class; 864 865 /* 866 * We use two key metrics to determine if a metaslab group is 867 * considered allocatable -- free space and fragmentation. If 868 * the free space is greater than the free space threshold and 869 * the fragmentation is less than the fragmentation threshold then 870 * consider the group allocatable. There are two case when we will 871 * not consider these key metrics. The first is if the group is 872 * associated with a slog device and the second is if all groups 873 * in this metaslab class have already been consider ineligible 874 * for allocations. 875 */ 876 return ((mg->mg_free_capacity > zfs_mg_noalloc_threshold && 877 (mg->mg_fragmentation == ZFS_FRAG_INVALID || 878 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)) || 879 mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0); 880} 881 882/* 883 * ========================================================================== 884 * Range tree callbacks 885 * ========================================================================== 886 */ 887 888/* 889 * Comparison function for the private size-ordered tree. Tree is sorted 890 * by size, larger sizes at the end of the tree. 891 */ 892static int 893metaslab_rangesize_compare(const void *x1, const void *x2) 894{ 895 const range_seg_t *r1 = x1; 896 const range_seg_t *r2 = x2; 897 uint64_t rs_size1 = r1->rs_end - r1->rs_start; 898 uint64_t rs_size2 = r2->rs_end - r2->rs_start; 899 900 if (rs_size1 < rs_size2) 901 return (-1); 902 if (rs_size1 > rs_size2) 903 return (1); 904 905 if (r1->rs_start < r2->rs_start) 906 return (-1); 907 908 if (r1->rs_start > r2->rs_start) 909 return (1); 910 911 return (0); 912} 913 914/* 915 * Create any block allocator specific components. The current allocators 916 * rely on using both a size-ordered range_tree_t and an array of uint64_t's. 917 */ 918static void 919metaslab_rt_create(range_tree_t *rt, void *arg) 920{ 921 metaslab_t *msp = arg; 922 923 ASSERT3P(rt->rt_arg, ==, msp); 924 ASSERT(msp->ms_tree == NULL); 925 926 avl_create(&msp->ms_size_tree, metaslab_rangesize_compare, 927 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); 928} 929 930/* 931 * Destroy the block allocator specific components. 932 */ 933static void 934metaslab_rt_destroy(range_tree_t *rt, void *arg) 935{ 936 metaslab_t *msp = arg; 937 938 ASSERT3P(rt->rt_arg, ==, msp); 939 ASSERT3P(msp->ms_tree, ==, rt); 940 ASSERT0(avl_numnodes(&msp->ms_size_tree)); 941 942 avl_destroy(&msp->ms_size_tree); 943} 944 945static void 946metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg) 947{ 948 metaslab_t *msp = arg; 949 950 ASSERT3P(rt->rt_arg, ==, msp); 951 ASSERT3P(msp->ms_tree, ==, rt); 952 VERIFY(!msp->ms_condensing); 953 avl_add(&msp->ms_size_tree, rs); 954} 955 956static void 957metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg) 958{ 959 metaslab_t *msp = arg; 960 961 ASSERT3P(rt->rt_arg, ==, msp); 962 ASSERT3P(msp->ms_tree, ==, rt); 963 VERIFY(!msp->ms_condensing); 964 avl_remove(&msp->ms_size_tree, rs); 965} 966 967static void 968metaslab_rt_vacate(range_tree_t *rt, void *arg) 969{ 970 metaslab_t *msp = arg; 971 972 ASSERT3P(rt->rt_arg, ==, msp); 973 ASSERT3P(msp->ms_tree, ==, rt); 974 975 /* 976 * Normally one would walk the tree freeing nodes along the way. 
977 * Since the nodes are shared with the range trees we can avoid 978 * walking all nodes and just reinitialize the avl tree. The nodes 979 * will be freed by the range tree, so we don't want to free them here. 980 */ 981 avl_create(&msp->ms_size_tree, metaslab_rangesize_compare, 982 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); 983} 984 985static range_tree_ops_t metaslab_rt_ops = { 986 metaslab_rt_create, 987 metaslab_rt_destroy, 988 metaslab_rt_add, 989 metaslab_rt_remove, 990 metaslab_rt_vacate 991}; 992 993/* 994 * ========================================================================== 995 * Metaslab block operations 996 * ========================================================================== 997 */ 998 999/* 1000 * Return the maximum contiguous segment within the metaslab. 1001 */ 1002uint64_t 1003metaslab_block_maxsize(metaslab_t *msp) 1004{ 1005 avl_tree_t *t = &msp->ms_size_tree; 1006 range_seg_t *rs; 1007 1008 if (t == NULL || (rs = avl_last(t)) == NULL) 1009 return (0ULL); 1010 1011 return (rs->rs_end - rs->rs_start); 1012} 1013 1014uint64_t 1015metaslab_block_alloc(metaslab_t *msp, uint64_t size) 1016{ 1017 uint64_t start; 1018 range_tree_t *rt = msp->ms_tree; 1019 1020 VERIFY(!msp->ms_condensing); 1021 1022 start = msp->ms_ops->msop_alloc(msp, size); 1023 if (start != -1ULL) { 1024 vdev_t *vd = msp->ms_group->mg_vd; 1025 1026 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); 1027 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 1028 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); 1029 range_tree_remove(rt, start, size); 1030 } 1031 return (start); 1032} 1033 1034/* 1035 * ========================================================================== 1036 * Common allocator routines 1037 * ========================================================================== 1038 */ 1039 1040/* 1041 * This is a helper function that can be used by the allocator to find 1042 * a suitable block to allocate. This will search the specified AVL 1043 * tree looking for a block that matches the specified criteria. 1044 */ 1045static uint64_t 1046metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, 1047 uint64_t align) 1048{ 1049 range_seg_t *rs, rsearch; 1050 avl_index_t where; 1051 1052 rsearch.rs_start = *cursor; 1053 rsearch.rs_end = *cursor + size; 1054 1055 rs = avl_find(t, &rsearch, &where); 1056 if (rs == NULL) 1057 rs = avl_nearest(t, where, AVL_AFTER); 1058 1059 while (rs != NULL) { 1060 uint64_t offset = P2ROUNDUP(rs->rs_start, align); 1061 1062 if (offset + size <= rs->rs_end) { 1063 *cursor = offset + size; 1064 return (offset); 1065 } 1066 rs = AVL_NEXT(t, rs); 1067 } 1068 1069 /* 1070 * If we know we've searched the whole map (*cursor == 0), give up. 1071 * Otherwise, reset the cursor to the beginning and try again. 1072 */ 1073 if (*cursor == 0) 1074 return (-1ULL); 1075 1076 *cursor = 0; 1077 return (metaslab_block_picker(t, cursor, size, align)); 1078} 1079 1080/* 1081 * ========================================================================== 1082 * The first-fit block allocator 1083 * ========================================================================== 1084 */ 1085static uint64_t 1086metaslab_ff_alloc(metaslab_t *msp, uint64_t size) 1087{ 1088 /* 1089 * Find the largest power of 2 block size that evenly divides the 1090 * requested size. This is used to try to allocate blocks with similar 1091 * alignment from the same area of the metaslab (i.e. 
same cursor 1092 * bucket) but it does not guarantee that other allocations sizes 1093 * may exist in the same region. 1094 */ 1095 uint64_t align = size & -size; 1096 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; 1097 avl_tree_t *t = &msp->ms_tree->rt_root; 1098 1099 return (metaslab_block_picker(t, cursor, size, align)); 1100} 1101 1102static metaslab_ops_t metaslab_ff_ops = { 1103 metaslab_ff_alloc 1104}; 1105 1106/* 1107 * ========================================================================== 1108 * Dynamic block allocator - 1109 * Uses the first fit allocation scheme until space get low and then 1110 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold 1111 * and metaslab_df_free_pct to determine when to switch the allocation scheme. 1112 * ========================================================================== 1113 */ 1114static uint64_t 1115metaslab_df_alloc(metaslab_t *msp, uint64_t size) 1116{ 1117 /* 1118 * Find the largest power of 2 block size that evenly divides the 1119 * requested size. This is used to try to allocate blocks with similar 1120 * alignment from the same area of the metaslab (i.e. same cursor 1121 * bucket) but it does not guarantee that other allocations sizes 1122 * may exist in the same region. 1123 */ 1124 uint64_t align = size & -size; 1125 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; 1126 range_tree_t *rt = msp->ms_tree; 1127 avl_tree_t *t = &rt->rt_root; 1128 uint64_t max_size = metaslab_block_maxsize(msp); 1129 int free_pct = range_tree_space(rt) * 100 / msp->ms_size; 1130 1131 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1132 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree)); 1133 1134 if (max_size < size) 1135 return (-1ULL); 1136 1137 /* 1138 * If we're running low on space switch to using the size 1139 * sorted AVL tree (best-fit). 1140 */ 1141 if (max_size < metaslab_df_alloc_threshold || 1142 free_pct < metaslab_df_free_pct) { 1143 t = &msp->ms_size_tree; 1144 *cursor = 0; 1145 } 1146 1147 return (metaslab_block_picker(t, cursor, size, 1ULL)); 1148} 1149 1150static metaslab_ops_t metaslab_df_ops = { 1151 metaslab_df_alloc 1152}; 1153 1154/* 1155 * ========================================================================== 1156 * Cursor fit block allocator - 1157 * Select the largest region in the metaslab, set the cursor to the beginning 1158 * of the range and the cursor_end to the end of the range. As allocations 1159 * are made advance the cursor. Continue allocating from the cursor until 1160 * the range is exhausted and then find a new range. 
1161 * ========================================================================== 1162 */ 1163static uint64_t 1164metaslab_cf_alloc(metaslab_t *msp, uint64_t size) 1165{ 1166 range_tree_t *rt = msp->ms_tree; 1167 avl_tree_t *t = &msp->ms_size_tree; 1168 uint64_t *cursor = &msp->ms_lbas[0]; 1169 uint64_t *cursor_end = &msp->ms_lbas[1]; 1170 uint64_t offset = 0; 1171 1172 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1173 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root)); 1174 1175 ASSERT3U(*cursor_end, >=, *cursor); 1176 1177 if ((*cursor + size) > *cursor_end) { 1178 range_seg_t *rs; 1179 1180 rs = avl_last(&msp->ms_size_tree); 1181 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) 1182 return (-1ULL); 1183 1184 *cursor = rs->rs_start; 1185 *cursor_end = rs->rs_end; 1186 } 1187 1188 offset = *cursor; 1189 *cursor += size; 1190 1191 return (offset); 1192} 1193 1194static metaslab_ops_t metaslab_cf_ops = { 1195 metaslab_cf_alloc 1196}; 1197 1198/* 1199 * ========================================================================== 1200 * New dynamic fit allocator - 1201 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift 1202 * contiguous blocks. If no region is found then just use the largest segment 1203 * that remains. 1204 * ========================================================================== 1205 */ 1206 1207/* 1208 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift) 1209 * to request from the allocator. 1210 */ 1211uint64_t metaslab_ndf_clump_shift = 4; 1212 1213static uint64_t 1214metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) 1215{ 1216 avl_tree_t *t = &msp->ms_tree->rt_root; 1217 avl_index_t where; 1218 range_seg_t *rs, rsearch; 1219 uint64_t hbit = highbit64(size); 1220 uint64_t *cursor = &msp->ms_lbas[hbit - 1]; 1221 uint64_t max_size = metaslab_block_maxsize(msp); 1222 1223 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1224 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree)); 1225 1226 if (max_size < size) 1227 return (-1ULL); 1228 1229 rsearch.rs_start = *cursor; 1230 rsearch.rs_end = *cursor + size; 1231 1232 rs = avl_find(t, &rsearch, &where); 1233 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) { 1234 t = &msp->ms_size_tree; 1235 1236 rsearch.rs_start = 0; 1237 rsearch.rs_end = MIN(max_size, 1238 1ULL << (hbit + metaslab_ndf_clump_shift)); 1239 rs = avl_find(t, &rsearch, &where); 1240 if (rs == NULL) 1241 rs = avl_nearest(t, where, AVL_AFTER); 1242 ASSERT(rs != NULL); 1243 } 1244 1245 if ((rs->rs_end - rs->rs_start) >= size) { 1246 *cursor = rs->rs_start + size; 1247 return (rs->rs_start); 1248 } 1249 return (-1ULL); 1250} 1251 1252static metaslab_ops_t metaslab_ndf_ops = { 1253 metaslab_ndf_alloc 1254}; 1255 1256metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; 1257 1258/* 1259 * ========================================================================== 1260 * Metaslabs 1261 * ========================================================================== 1262 */ 1263 1264/* 1265 * Wait for any in-progress metaslab loads to complete. 
1266 */ 1267void 1268metaslab_load_wait(metaslab_t *msp) 1269{ 1270 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1271 1272 while (msp->ms_loading) { 1273 ASSERT(!msp->ms_loaded); 1274 cv_wait(&msp->ms_load_cv, &msp->ms_lock); 1275 } 1276} 1277 1278int 1279metaslab_load(metaslab_t *msp) 1280{ 1281 int error = 0; 1282 1283 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1284 ASSERT(!msp->ms_loaded); 1285 ASSERT(!msp->ms_loading); 1286 1287 msp->ms_loading = B_TRUE; 1288 1289 /* 1290 * If the space map has not been allocated yet, then treat 1291 * all the space in the metaslab as free and add it to the 1292 * ms_tree. 1293 */ 1294 if (msp->ms_sm != NULL) 1295 error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE); 1296 else 1297 range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size); 1298 1299 msp->ms_loaded = (error == 0); 1300 msp->ms_loading = B_FALSE; 1301 1302 if (msp->ms_loaded) { 1303 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1304 range_tree_walk(msp->ms_defertree[t], 1305 range_tree_remove, msp->ms_tree); 1306 } 1307 } 1308 cv_broadcast(&msp->ms_load_cv); 1309 return (error); 1310} 1311 1312void 1313metaslab_unload(metaslab_t *msp) 1314{ 1315 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1316 range_tree_vacate(msp->ms_tree, NULL, NULL); 1317 msp->ms_loaded = B_FALSE; 1318 msp->ms_weight &= ~METASLAB_ACTIVE_MASK; 1319} 1320 1321int 1322metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, 1323 metaslab_t **msp) 1324{ 1325 vdev_t *vd = mg->mg_vd; 1326 objset_t *mos = vd->vdev_spa->spa_meta_objset; 1327 metaslab_t *ms; 1328 int error; 1329 1330 ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); 1331 mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); 1332 cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); 1333 ms->ms_id = id; 1334 ms->ms_start = id << vd->vdev_ms_shift; 1335 ms->ms_size = 1ULL << vd->vdev_ms_shift; 1336 1337 /* 1338 * We only open space map objects that already exist. All others 1339 * will be opened when we finally allocate an object for it. 1340 */ 1341 if (object != 0) { 1342 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, 1343 ms->ms_size, vd->vdev_ashift, &ms->ms_lock); 1344 1345 if (error != 0) { 1346 kmem_free(ms, sizeof (metaslab_t)); 1347 return (error); 1348 } 1349 1350 ASSERT(ms->ms_sm != NULL); 1351 } 1352 1353 /* 1354 * We create the main range tree here, but we don't create the 1355 * alloctree and freetree until metaslab_sync_done(). This serves 1356 * two purposes: it allows metaslab_sync_done() to detect the 1357 * addition of new space; and for debugging, it ensures that we'd 1358 * data fault on any attempt to use this metaslab before it's ready. 1359 */ 1360 ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms, &ms->ms_lock); 1361 metaslab_group_add(mg, ms); 1362 1363 ms->ms_fragmentation = metaslab_fragmentation(ms); 1364 ms->ms_ops = mg->mg_class->mc_ops; 1365 1366 /* 1367 * If we're opening an existing pool (txg == 0) or creating 1368 * a new one (txg == TXG_INITIAL), all space is available now. 1369 * If we're adding space to an existing pool, the new space 1370 * does not become available until after this txg has synced. 1371 */ 1372 if (txg <= TXG_INITIAL) 1373 metaslab_sync_done(ms, 0); 1374 1375 /* 1376 * If metaslab_debug_load is set and we're initializing a metaslab 1377 * that has an allocated space_map object then load the its space 1378 * map so that can verify frees. 
1379 */ 1380 if (metaslab_debug_load && ms->ms_sm != NULL) { 1381 mutex_enter(&ms->ms_lock); 1382 VERIFY0(metaslab_load(ms)); 1383 mutex_exit(&ms->ms_lock); 1384 } 1385 1386 if (txg != 0) { 1387 vdev_dirty(vd, 0, NULL, txg); 1388 vdev_dirty(vd, VDD_METASLAB, ms, txg); 1389 } 1390 1391 *msp = ms; 1392 1393 return (0); 1394} 1395 1396void 1397metaslab_fini(metaslab_t *msp) 1398{ 1399 metaslab_group_t *mg = msp->ms_group; 1400 1401 metaslab_group_remove(mg, msp); 1402 1403 mutex_enter(&msp->ms_lock); 1404 1405 VERIFY(msp->ms_group == NULL); 1406 vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm), 1407 0, -msp->ms_size); 1408 space_map_close(msp->ms_sm); 1409 1410 metaslab_unload(msp); 1411 range_tree_destroy(msp->ms_tree); 1412 1413 for (int t = 0; t < TXG_SIZE; t++) { 1414 range_tree_destroy(msp->ms_alloctree[t]); 1415 range_tree_destroy(msp->ms_freetree[t]); 1416 } 1417 1418 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1419 range_tree_destroy(msp->ms_defertree[t]); 1420 } 1421 1422 ASSERT0(msp->ms_deferspace); 1423 1424 mutex_exit(&msp->ms_lock); 1425 cv_destroy(&msp->ms_load_cv); 1426 mutex_destroy(&msp->ms_lock); 1427 1428 kmem_free(msp, sizeof (metaslab_t)); 1429} 1430 1431#define FRAGMENTATION_TABLE_SIZE 17 1432 1433/* 1434 * This table defines a segment size based fragmentation metric that will 1435 * allow each metaslab to derive its own fragmentation value. This is done 1436 * by calculating the space in each bucket of the spacemap histogram and 1437 * multiplying that by the fragmetation metric in this table. Doing 1438 * this for all buckets and dividing it by the total amount of free 1439 * space in this metaslab (i.e. the total free space in all buckets) gives 1440 * us the fragmentation metric. This means that a high fragmentation metric 1441 * equates to most of the free space being comprised of small segments. 1442 * Conversely, if the metric is low, then most of the free space is in 1443 * large segments. A 10% change in fragmentation equates to approximately 1444 * double the number of segments. 1445 * 1446 * This table defines 0% fragmented space using 16MB segments. Testing has 1447 * shown that segments that are greater than or equal to 16MB do not suffer 1448 * from drastic performance problems. Using this value, we derive the rest 1449 * of the table. Since the fragmentation value is never stored on disk, it 1450 * is possible to change these calculations in the future. 1451 */ 1452int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { 1453 100, /* 512B */ 1454 100, /* 1K */ 1455 98, /* 2K */ 1456 95, /* 4K */ 1457 90, /* 8K */ 1458 80, /* 16K */ 1459 70, /* 32K */ 1460 60, /* 64K */ 1461 50, /* 128K */ 1462 40, /* 256K */ 1463 30, /* 512K */ 1464 20, /* 1M */ 1465 15, /* 2M */ 1466 10, /* 4M */ 1467 5, /* 8M */ 1468 0 /* 16M */ 1469}; 1470 1471/* 1472 * Calclate the metaslab's fragmentation metric. A return value 1473 * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does 1474 * not support this metric. Otherwise, the return value should be in the 1475 * range [0, 100]. 1476 */ 1477static uint64_t 1478metaslab_fragmentation(metaslab_t *msp) 1479{ 1480 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1481 uint64_t fragmentation = 0; 1482 uint64_t total = 0; 1483 boolean_t feature_enabled = spa_feature_is_enabled(spa, 1484 SPA_FEATURE_SPACEMAP_HISTOGRAM); 1485 1486 if (!feature_enabled) 1487 return (ZFS_FRAG_INVALID); 1488 1489 /* 1490 * A null space map means that the entire metaslab is free 1491 * and thus is not fragmented. 
1492 */ 1493 if (msp->ms_sm == NULL) 1494 return (0); 1495 1496 /* 1497 * If this metaslab's space_map has not been upgraded, flag it 1498 * so that we upgrade next time we encounter it. 1499 */ 1500 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { 1501 uint64_t txg = spa_syncing_txg(spa); 1502 vdev_t *vd = msp->ms_group->mg_vd; 1503 1504 if (spa_writeable(spa)) { 1505 msp->ms_condense_wanted = B_TRUE; 1506 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 1507 spa_dbgmsg(spa, "txg %llu, requesting force condense: " 1508 "msp %p, vd %p", txg, msp, vd); 1509 } 1510 return (ZFS_FRAG_INVALID); 1511 } 1512 1513 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 1514 uint64_t space = 0; 1515 uint8_t shift = msp->ms_sm->sm_shift; 1516 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, 1517 FRAGMENTATION_TABLE_SIZE - 1); 1518 1519 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) 1520 continue; 1521 1522 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); 1523 total += space; 1524 1525 ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE); 1526 fragmentation += space * zfs_frag_table[idx]; 1527 } 1528 1529 if (total > 0) 1530 fragmentation /= total; 1531 ASSERT3U(fragmentation, <=, 100); 1532 return (fragmentation); 1533} 1534 1535/* 1536 * Compute a weight -- a selection preference value -- for the given metaslab. 1537 * This is based on the amount of free space, the level of fragmentation, 1538 * the LBA range, and whether the metaslab is loaded. 1539 */ 1540static uint64_t 1541metaslab_weight(metaslab_t *msp) 1542{ 1543 metaslab_group_t *mg = msp->ms_group; 1544 vdev_t *vd = mg->mg_vd; 1545 uint64_t weight, space; 1546 1547 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1548 1549 /* 1550 * This vdev is in the process of being removed so there is nothing 1551 * for us to do here. 1552 */ 1553 if (vd->vdev_removing) { 1554 ASSERT0(space_map_allocated(msp->ms_sm)); 1555 ASSERT0(vd->vdev_ms_shift); 1556 return (0); 1557 } 1558 1559 /* 1560 * The baseline weight is the metaslab's free space. 1561 */ 1562 space = msp->ms_size - space_map_allocated(msp->ms_sm); 1563 1564 msp->ms_fragmentation = metaslab_fragmentation(msp); 1565 if (metaslab_fragmentation_factor_enabled && 1566 msp->ms_fragmentation != ZFS_FRAG_INVALID) { 1567 /* 1568 * Use the fragmentation information to inversely scale 1569 * down the baseline weight. We need to ensure that we 1570 * don't exclude this metaslab completely when it's 100% 1571 * fragmented. To avoid this we reduce the fragmented value 1572 * by 1. 1573 */ 1574 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100; 1575 1576 /* 1577 * If space < SPA_MINBLOCKSIZE, then we will not allocate from 1578 * this metaslab again. The fragmentation metric may have 1579 * decreased the space to something smaller than 1580 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE 1581 * so that we can consume any remaining space. 1582 */ 1583 if (space > 0 && space < SPA_MINBLOCKSIZE) 1584 space = SPA_MINBLOCKSIZE; 1585 } 1586 weight = space; 1587 1588 /* 1589 * Modern disks have uniform bit density and constant angular velocity. 1590 * Therefore, the outer recording zones are faster (higher bandwidth) 1591 * than the inner zones by the ratio of outer to inner track diameter, 1592 * which is typically around 2:1. We account for this by assigning 1593 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). 1594 * In effect, this means that we'll select the metaslab with the most 1595 * free bandwidth rather than simply the one with the most free space. 
1596 */ 1597 if (metaslab_lba_weighting_enabled) { 1598 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; 1599 ASSERT(weight >= space && weight <= 2 * space); 1600 } 1601 1602 /* 1603 * If this metaslab is one we're actively using, adjust its 1604 * weight to make it preferable to any inactive metaslab so 1605 * we'll polish it off. If the fragmentation on this metaslab 1606 * has exceed our threshold, then don't mark it active. 1607 */ 1608 if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID && 1609 msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) { 1610 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); 1611 } 1612 1613 return (weight); 1614} 1615 1616static int 1617metaslab_activate(metaslab_t *msp, uint64_t activation_weight) 1618{ 1619 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1620 1621 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { 1622 metaslab_load_wait(msp); 1623 if (!msp->ms_loaded) { 1624 int error = metaslab_load(msp); 1625 if (error) { 1626 metaslab_group_sort(msp->ms_group, msp, 0); 1627 return (error); 1628 } 1629 } 1630 1631 metaslab_group_sort(msp->ms_group, msp, 1632 msp->ms_weight | activation_weight); 1633 } 1634 ASSERT(msp->ms_loaded); 1635 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); 1636 1637 return (0); 1638} 1639 1640static void 1641metaslab_passivate(metaslab_t *msp, uint64_t size) 1642{ 1643 /* 1644 * If size < SPA_MINBLOCKSIZE, then we will not allocate from 1645 * this metaslab again. In that case, it had better be empty, 1646 * or we would be leaving space on the table. 1647 */ 1648 ASSERT(size >= SPA_MINBLOCKSIZE || range_tree_space(msp->ms_tree) == 0); 1649 metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size)); 1650 ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); 1651} 1652 1653static void 1654metaslab_preload(void *arg) 1655{ 1656 metaslab_t *msp = arg; 1657 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1658 1659 ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); 1660 1661 mutex_enter(&msp->ms_lock); 1662 metaslab_load_wait(msp); 1663 if (!msp->ms_loaded) 1664 (void) metaslab_load(msp); 1665 1666 /* 1667 * Set the ms_access_txg value so that we don't unload it right away. 1668 */ 1669 msp->ms_access_txg = spa_syncing_txg(spa) + metaslab_unload_delay + 1; 1670 mutex_exit(&msp->ms_lock); 1671} 1672 1673static void 1674metaslab_group_preload(metaslab_group_t *mg) 1675{ 1676 spa_t *spa = mg->mg_vd->vdev_spa; 1677 metaslab_t *msp; 1678 avl_tree_t *t = &mg->mg_metaslab_tree; 1679 int m = 0; 1680 1681 if (spa_shutting_down(spa) || !metaslab_preload_enabled) { 1682 taskq_wait(mg->mg_taskq); 1683 return; 1684 } 1685 1686 mutex_enter(&mg->mg_lock); 1687 /* 1688 * Load the next potential metaslabs 1689 */ 1690 msp = avl_first(t); 1691 while (msp != NULL) { 1692 metaslab_t *msp_next = AVL_NEXT(t, msp); 1693 1694 /* 1695 * We preload only the maximum number of metaslabs specified 1696 * by metaslab_preload_limit. If a metaslab is being forced 1697 * to condense then we preload it too. This will ensure 1698 * that force condensing happens in the next txg. 1699 */ 1700 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { 1701 msp = msp_next; 1702 continue; 1703 } 1704 1705 /* 1706 * We must drop the metaslab group lock here to preserve 1707 * lock ordering with the ms_lock (when grabbing both 1708 * the mg_lock and the ms_lock, the ms_lock must be taken 1709 * first). As a result, it is possible that the ordering 1710 * of the metaslabs within the avl tree may change before 1711 * we reacquire the lock. 
The metaslab cannot be removed from 1712 * the tree while we're in syncing context so it is safe to 1713 * drop the mg_lock here. If the metaslabs are reordered 1714 * nothing will break -- we just may end up loading a 1715 * less than optimal one. 1716 */ 1717 mutex_exit(&mg->mg_lock); 1718 VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload, 1719 msp, TQ_SLEEP) != 0); 1720 mutex_enter(&mg->mg_lock); 1721 msp = msp_next; 1722 } 1723 mutex_exit(&mg->mg_lock); 1724} 1725 1726/* 1727 * Determine if the space map's on-disk footprint is past our tolerance 1728 * for inefficiency. We would like to use the following criteria to make 1729 * our decision: 1730 * 1731 * 1. The size of the space map object should not dramatically increase as a 1732 * result of writing out the free space range tree. 1733 * 1734 * 2. The minimal on-disk space map representation is zfs_condense_pct/100 1735 * times the size than the free space range tree representation 1736 * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1.MB). 1737 * 1738 * 3. The on-disk size of the space map should actually decrease. 1739 * 1740 * Checking the first condition is tricky since we don't want to walk 1741 * the entire AVL tree calculating the estimated on-disk size. Instead we 1742 * use the size-ordered range tree in the metaslab and calculate the 1743 * size required to write out the largest segment in our free tree. If the 1744 * size required to represent that segment on disk is larger than the space 1745 * map object then we avoid condensing this map. 1746 * 1747 * To determine the second criterion we use a best-case estimate and assume 1748 * each segment can be represented on-disk as a single 64-bit entry. We refer 1749 * to this best-case estimate as the space map's minimal form. 1750 * 1751 * Unfortunately, we cannot compute the on-disk size of the space map in this 1752 * context because we cannot accurately compute the effects of compression, etc. 1753 * Instead, we apply the heuristic described in the block comment for 1754 * zfs_metaslab_condense_block_threshold - we only condense if the space used 1755 * is greater than a threshold number of blocks. 1756 */ 1757static boolean_t 1758metaslab_should_condense(metaslab_t *msp) 1759{ 1760 space_map_t *sm = msp->ms_sm; 1761 range_seg_t *rs; 1762 uint64_t size, entries, segsz, object_size, optimal_size, record_size; 1763 dmu_object_info_t doi; 1764 uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift; 1765 1766 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1767 ASSERT(msp->ms_loaded); 1768 1769 /* 1770 * Use the ms_size_tree range tree, which is ordered by size, to 1771 * obtain the largest segment in the free tree. We always condense 1772 * metaslabs that are empty and metaslabs for which a condense 1773 * request has been made. 1774 */ 1775 rs = avl_last(&msp->ms_size_tree); 1776 if (rs == NULL || msp->ms_condense_wanted) 1777 return (B_TRUE); 1778 1779 /* 1780 * Calculate the number of 64-bit entries this segment would 1781 * require when written to disk. If this single segment would be 1782 * larger on-disk than the entire current on-disk structure, then 1783 * clearly condensing will increase the on-disk structure size. 
1784 */ 1785 size = (rs->rs_end - rs->rs_start) >> sm->sm_shift; 1786 entries = size / (MIN(size, SM_RUN_MAX)); 1787 segsz = entries * sizeof (uint64_t); 1788 1789 optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root); 1790 object_size = space_map_length(msp->ms_sm); 1791 1792 dmu_object_info_from_db(sm->sm_dbuf, &doi); 1793 record_size = MAX(doi.doi_data_block_size, vdev_blocksize); 1794 1795 return (segsz <= object_size && 1796 object_size >= (optimal_size * zfs_condense_pct / 100) && 1797 object_size > zfs_metaslab_condense_block_threshold * record_size); 1798} 1799 1800/* 1801 * Condense the on-disk space map representation to its minimized form. 1802 * The minimized form consists of a small number of allocations followed by 1803 * the entries of the free range tree. 1804 */ 1805static void 1806metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) 1807{ 1808 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1809 range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK]; 1810 range_tree_t *condense_tree; 1811 space_map_t *sm = msp->ms_sm; 1812 1813 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1814 ASSERT3U(spa_sync_pass(spa), ==, 1); 1815 ASSERT(msp->ms_loaded); 1816 1817 1818 spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, vdev id %llu, " 1819 "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg, 1820 msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id, 1821 msp->ms_group->mg_vd->vdev_spa->spa_name, 1822 space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root), 1823 msp->ms_condense_wanted ? "TRUE" : "FALSE"); 1824 1825 msp->ms_condense_wanted = B_FALSE; 1826 1827 /* 1828 * Create a range tree that is 100% allocated. We remove segments 1829 * that have been freed in this txg, any deferred frees that exist, 1830 * and any allocations from future txgs. Removing segments should be 1831 * a relatively inexpensive operation since we expect these trees to 1832 * have a small number of nodes. 1833 */ 1834 condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock); 1835 range_tree_add(condense_tree, msp->ms_start, msp->ms_size); 1836 1837 /* 1838 * Remove what's been freed in this txg from the condense_tree. 1839 * Since we're in sync_pass 1, we know that all the frees from 1840 * this txg are in the freetree. 1841 */ 1842 range_tree_walk(freetree, range_tree_remove, condense_tree); 1843 1844 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1845 range_tree_walk(msp->ms_defertree[t], 1846 range_tree_remove, condense_tree); 1847 } 1848 1849 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 1850 range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK], 1851 range_tree_remove, condense_tree); 1852 } 1853 1854 /* 1855 * We're about to drop the metaslab's lock thus allowing 1856 * other consumers to change its content. Set the 1857 * metaslab's ms_condensing flag to ensure that 1858 * allocations on this metaslab do not occur while we're 1859 * in the middle of committing it to disk. This is only critical 1860 * for the ms_tree as all other range trees use per-txg 1861 * views of their content. 1862 */ 1863 msp->ms_condensing = B_TRUE; 1864 1865 mutex_exit(&msp->ms_lock); 1866 space_map_truncate(sm, tx); 1867 mutex_enter(&msp->ms_lock); 1868 1869 /* 1870 * While we would ideally like to create a space_map representation 1871 * that consists only of allocation records, doing so can be 1872 * prohibitively expensive because the in-core free tree can be 1873 * large, and therefore computationally expensive to subtract 1874 * from the condense_tree.
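(Every segment of the free tree would have to be removed from condense_tree individually, and a fragmented metaslab can carry a very large number of segments.)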
Instead we sync out two trees, a cheap 1875 * allocation-only tree followed by the in-core free tree. While not 1876 * optimal, this is typically close to optimal, and much cheaper to 1877 * compute. 1878 */ 1879 space_map_write(sm, condense_tree, SM_ALLOC, tx); 1880 range_tree_vacate(condense_tree, NULL, NULL); 1881 range_tree_destroy(condense_tree); 1882 1883 space_map_write(sm, msp->ms_tree, SM_FREE, tx); 1884 msp->ms_condensing = B_FALSE; 1885} 1886 1887/* 1888 * Write a metaslab to disk in the context of the specified transaction group. 1889 */ 1890void 1891metaslab_sync(metaslab_t *msp, uint64_t txg) 1892{ 1893 metaslab_group_t *mg = msp->ms_group; 1894 vdev_t *vd = mg->mg_vd; 1895 spa_t *spa = vd->vdev_spa; 1896 objset_t *mos = spa_meta_objset(spa); 1897 range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK]; 1898 range_tree_t **freetree = &msp->ms_freetree[txg & TXG_MASK]; 1899 range_tree_t **freed_tree = 1900 &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK]; 1901 dmu_tx_t *tx; 1902 uint64_t object = space_map_object(msp->ms_sm); 1903 1904 ASSERT(!vd->vdev_ishole); 1905 1906 /* 1907 * This metaslab has just been added so there's no work to do now. 1908 */ 1909 if (*freetree == NULL) { 1910 ASSERT3P(alloctree, ==, NULL); 1911 return; 1912 } 1913 1914 ASSERT3P(alloctree, !=, NULL); 1915 ASSERT3P(*freetree, !=, NULL); 1916 ASSERT3P(*freed_tree, !=, NULL); 1917 1918 /* 1919 * Normally, we don't want to process a metaslab if there 1920 * are no allocations or frees to perform. However, if the metaslab 1921 * is being forced to condense we need to let it through. 1922 */ 1923 if (range_tree_space(alloctree) == 0 && 1924 range_tree_space(*freetree) == 0 && 1925 !msp->ms_condense_wanted) 1926 return; 1927 1928 /* 1929 * The only state that can actually be changing concurrently with 1930 * metaslab_sync() is the metaslab's ms_tree. No other thread can 1931 * be modifying this txg's alloctree, freetree, freed_tree, or 1932 * space_map_phys_t. Therefore, we only hold ms_lock to satisfy 1933 * space_map ASSERTs. We drop it whenever we call into the DMU, 1934 * because the DMU can call down to us (e.g. via zio_free()) at 1935 * any time. 1936 */ 1937 1938 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 1939 1940 if (msp->ms_sm == NULL) { 1941 uint64_t new_object; 1942 1943 new_object = space_map_alloc(mos, tx); 1944 VERIFY3U(new_object, !=, 0); 1945 1946 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, 1947 msp->ms_start, msp->ms_size, vd->vdev_ashift, 1948 &msp->ms_lock)); 1949 ASSERT(msp->ms_sm != NULL); 1950 } 1951 1952 mutex_enter(&msp->ms_lock); 1953 1954 /* 1955 * Note: metaslab_condense() clears the space_map's histogram. 1956 * Therefore we must verify and remove this histogram before 1957 * condensing. 1958 */ 1959 metaslab_group_histogram_verify(mg); 1960 metaslab_class_histogram_verify(mg->mg_class); 1961 metaslab_group_histogram_remove(mg, msp); 1962 1963 if (msp->ms_loaded && spa_sync_pass(spa) == 1 && 1964 metaslab_should_condense(msp)) { 1965 metaslab_condense(msp, txg, tx); 1966 } else { 1967 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx); 1968 space_map_write(msp->ms_sm, *freetree, SM_FREE, tx); 1969 } 1970 1971 if (msp->ms_loaded) { 1972 /* 1973 * When the space map is loaded, we have an accurate 1974 * histogram in the range tree. This gives us an opportunity 1975 * to bring the space map's histogram up-to-date so we clear 1976 * it first before updating it.
1977 */ 1978 space_map_histogram_clear(msp->ms_sm); 1979 space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx); 1980 } else { 1981 /* 1982 * Since the space map is not loaded we simply update the 1983 * existing histogram with what was freed in this txg. This 1984 * means that the on-disk histogram may not have an accurate 1985 * view of the free space but it's close enough to allow 1986 * us to make allocation decisions. 1987 */ 1988 space_map_histogram_add(msp->ms_sm, *freetree, tx); 1989 } 1990 metaslab_group_histogram_add(mg, msp); 1991 metaslab_group_histogram_verify(mg); 1992 metaslab_class_histogram_verify(mg->mg_class); 1993 1994 /* 1995 * For sync pass 1, we avoid traversing this txg's free range tree 1996 * and instead will just swap the pointers for freetree and 1997 * freed_tree. We can safely do this since the freed_tree is 1998 * guaranteed to be empty on the initial pass. 1999 */ 2000 if (spa_sync_pass(spa) == 1) { 2001 range_tree_swap(freetree, freed_tree); 2002 } else { 2003 range_tree_vacate(*freetree, range_tree_add, *freed_tree); 2004 } 2005 range_tree_vacate(alloctree, NULL, NULL); 2006 2007 ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); 2008 ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK])); 2009 2010 mutex_exit(&msp->ms_lock); 2011 2012 if (object != space_map_object(msp->ms_sm)) { 2013 object = space_map_object(msp->ms_sm); 2014 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * 2015 msp->ms_id, sizeof (uint64_t), &object, tx); 2016 } 2017 dmu_tx_commit(tx); 2018} 2019 2020/* 2021 * Called after a transaction group has completely synced to mark 2022 * all of the metaslab's free space as usable. 2023 */ 2024void 2025metaslab_sync_done(metaslab_t *msp, uint64_t txg) 2026{ 2027 metaslab_group_t *mg = msp->ms_group; 2028 vdev_t *vd = mg->mg_vd; 2029 range_tree_t **freed_tree; 2030 range_tree_t **defer_tree; 2031 int64_t alloc_delta, defer_delta; 2032 2033 ASSERT(!vd->vdev_ishole); 2034 2035 mutex_enter(&msp->ms_lock); 2036 2037 /* 2038 * If this metaslab is just becoming available, initialize its 2039 * alloctrees, freetrees, and defertrees and add its capacity to 2040 * the vdev. 2041 */ 2042 if (msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK] == NULL) { 2043 for (int t = 0; t < TXG_SIZE; t++) { 2044 ASSERT(msp->ms_alloctree[t] == NULL); 2045 ASSERT(msp->ms_freetree[t] == NULL); 2046 2047 msp->ms_alloctree[t] = range_tree_create(NULL, msp, 2048 &msp->ms_lock); 2049 msp->ms_freetree[t] = range_tree_create(NULL, msp, 2050 &msp->ms_lock); 2051 } 2052 2053 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2054 ASSERT(msp->ms_defertree[t] == NULL); 2055 2056 msp->ms_defertree[t] = range_tree_create(NULL, msp, 2057 &msp->ms_lock); 2058 } 2059 2060 vdev_space_update(vd, 0, 0, msp->ms_size); 2061 } 2062 2063 freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK]; 2064 defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE]; 2065 2066 alloc_delta = space_map_alloc_delta(msp->ms_sm); 2067 defer_delta = range_tree_space(*freed_tree) - 2068 range_tree_space(*defer_tree); 2069 2070 vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); 2071 2072 ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); 2073 ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK])); 2074 2075 /* 2076 * If there's a metaslab_load() in progress, wait for it to complete 2077 * so that we have a consistent view of the in-core space map.
2078 */ 2079 metaslab_load_wait(msp); 2080 2081 /* 2082 * Move the frees from the defer_tree back to the free 2083 * range tree (if it's loaded). Swap the freed_tree and the 2084 * defer_tree -- this is safe to do because we've just emptied out 2085 * the defer_tree. 2086 */ 2087 range_tree_vacate(*defer_tree, 2088 msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree); 2089 range_tree_swap(freed_tree, defer_tree); 2090 2091 space_map_update(msp->ms_sm); 2092 2093 msp->ms_deferspace += defer_delta; 2094 ASSERT3S(msp->ms_deferspace, >=, 0); 2095 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); 2096 if (msp->ms_deferspace != 0) { 2097 /* 2098 * Keep syncing this metaslab until all deferred frees 2099 * are back in circulation. 2100 */ 2101 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 2102 } 2103 2104 if (msp->ms_loaded && msp->ms_access_txg < txg) { 2105 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 2106 VERIFY0(range_tree_space( 2107 msp->ms_alloctree[(txg + t) & TXG_MASK])); 2108 } 2109 2110 if (!metaslab_debug_unload) 2111 metaslab_unload(msp); 2112 } 2113 2114 metaslab_group_sort(mg, msp, metaslab_weight(msp)); 2115 mutex_exit(&msp->ms_lock); 2116} 2117 2118void 2119metaslab_sync_reassess(metaslab_group_t *mg) 2120{ 2121 metaslab_group_alloc_update(mg); 2122 mg->mg_fragmentation = metaslab_group_fragmentation(mg); 2123 2124 /* 2125 * Preload the next potential metaslabs 2126 */ 2127 metaslab_group_preload(mg); 2128} 2129 2130static uint64_t 2131metaslab_distance(metaslab_t *msp, dva_t *dva) 2132{ 2133 uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift; 2134 uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift; 2135 uint64_t start = msp->ms_id; 2136 2137 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) 2138 return (1ULL << 63); 2139 2140 if (offset < start) 2141 return ((start - offset) << ms_shift); 2142 if (offset > start) 2143 return ((offset - start) << ms_shift); 2144 return (0); 2145} 2146 2147static uint64_t 2148metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, 2149 uint64_t txg, uint64_t min_distance, dva_t *dva, int d) 2150{ 2151 spa_t *spa = mg->mg_vd->vdev_spa; 2152 metaslab_t *msp = NULL; 2153 uint64_t offset = -1ULL; 2154 avl_tree_t *t = &mg->mg_metaslab_tree; 2155 uint64_t activation_weight; 2156 uint64_t target_distance; 2157 int i; 2158 2159 activation_weight = METASLAB_WEIGHT_PRIMARY; 2160 for (i = 0; i < d; i++) { 2161 if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 2162 activation_weight = METASLAB_WEIGHT_SECONDARY; 2163 break; 2164 } 2165 } 2166 2167 for (;;) { 2168 boolean_t was_active; 2169 2170 mutex_enter(&mg->mg_lock); 2171 for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { 2172 if (msp->ms_weight < asize) { 2173 spa_dbgmsg(spa, "%s: failed to meet weight " 2174 "requirement: vdev %llu, txg %llu, mg %p, " 2175 "msp %p, psize %llu, asize %llu, " 2176 "weight %llu", spa_name(spa), 2177 mg->mg_vd->vdev_id, txg, 2178 mg, msp, psize, asize, msp->ms_weight); 2179 mutex_exit(&mg->mg_lock); 2180 return (-1ULL); 2181 } 2182 2183 /* 2184 * If the selected metaslab is condensing, skip it. 2185 */ 2186 if (msp->ms_condensing) 2187 continue; 2188 2189 was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; 2190 if (activation_weight == METASLAB_WEIGHT_PRIMARY) 2191 break; 2192 2193 target_distance = min_distance + 2194 (space_map_allocated(msp->ms_sm) != 0 ? 
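/* an unused metaslab must be at least 1.5x min_distance away */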
0 : 2195 min_distance >> 1); 2196 2197 for (i = 0; i < d; i++) 2198 if (metaslab_distance(msp, &dva[i]) < 2199 target_distance) 2200 break; 2201 if (i == d) 2202 break; 2203 } 2204 mutex_exit(&mg->mg_lock); 2205 if (msp == NULL) 2206 return (-1ULL); 2207 2208 mutex_enter(&msp->ms_lock); 2209 2210 /* 2211 * Ensure that the metaslab we have selected is still 2212 * capable of handling our request. It's possible that 2213 * another thread may have changed the weight while we 2214 * were blocked on the metaslab lock. 2215 */ 2216 if (msp->ms_weight < asize || (was_active && 2217 !(msp->ms_weight & METASLAB_ACTIVE_MASK) && 2218 activation_weight == METASLAB_WEIGHT_PRIMARY)) { 2219 mutex_exit(&msp->ms_lock); 2220 continue; 2221 } 2222 2223 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) && 2224 activation_weight == METASLAB_WEIGHT_PRIMARY) { 2225 metaslab_passivate(msp, 2226 msp->ms_weight & ~METASLAB_ACTIVE_MASK); 2227 mutex_exit(&msp->ms_lock); 2228 continue; 2229 } 2230 2231 if (metaslab_activate(msp, activation_weight) != 0) { 2232 mutex_exit(&msp->ms_lock); 2233 continue; 2234 } 2235 2236 /* 2237 * If this metaslab is currently condensing then pick again as 2238 * we can't manipulate this metaslab until it's committed 2239 * to disk. 2240 */ 2241 if (msp->ms_condensing) { 2242 mutex_exit(&msp->ms_lock); 2243 continue; 2244 } 2245 2246 if ((offset = metaslab_block_alloc(msp, asize)) != -1ULL) 2247 break; 2248 2249 metaslab_passivate(msp, metaslab_block_maxsize(msp)); 2250 mutex_exit(&msp->ms_lock); 2251 } 2252 2253 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) 2254 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); 2255 2256 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize); 2257 msp->ms_access_txg = txg + metaslab_unload_delay; 2258 2259 mutex_exit(&msp->ms_lock); 2260 2261 return (offset); 2262} 2263 2264/* 2265 * Allocate a block for the specified i/o. 2266 */ 2267static int 2268metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, 2269 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags) 2270{ 2271 metaslab_group_t *mg, *rotor; 2272 vdev_t *vd; 2273 int dshift = 3; 2274 int all_zero; 2275 int zio_lock = B_FALSE; 2276 boolean_t allocatable; 2277 uint64_t offset = -1ULL; 2278 uint64_t asize; 2279 uint64_t distance; 2280 2281 ASSERT(!DVA_IS_VALID(&dva[d])); 2282 2283 /* 2284 * For testing, make some blocks above a certain size be gang blocks. 2285 */ 2286 if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) 2287 return (SET_ERROR(ENOSPC)); 2288 2289 /* 2290 * Start at the rotor and loop through all mgs until we find something. 2291 * Note that there's no locking on mc_rotor or mc_aliquot because 2292 * nothing actually breaks if we miss a few updates -- we just won't 2293 * allocate quite as evenly. It all balances out over time. 2294 * 2295 * If we are doing ditto or log blocks, try to spread them across 2296 * consecutive vdevs. If we're forced to reuse a vdev before we've 2297 * allocated all of our ditto blocks, then try and spread them out on 2298 * that vdev as much as possible. If it turns out to not be possible, 2299 * gradually lower our standards until anything becomes acceptable. 2300 * Also, allocating on consecutive vdevs (as opposed to random vdevs) 2301 * gives us hope of containing our fault domains to something we're 2302 * able to reason about. Otherwise, any two top-level vdev failures 2303 * will guarantee the loss of data. 
With consecutive allocation, 2304 * only two adjacent top-level vdev failures will result in data loss. 2305 * 2306 * If we are doing gang blocks (hintdva is non-NULL), try to keep 2307 * ourselves on the same vdev as our gang block header. That 2308 * way, we can hope for locality in vdev_cache, plus it makes our 2309 * fault domains something tractable. 2310 */ 2311 if (hintdva) { 2312 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); 2313 2314 /* 2315 * It's possible the vdev we're using as the hint no 2316 * longer exists (i.e. removed). Consult the rotor when 2317 * all else fails. 2318 */ 2319 if (vd != NULL) { 2320 mg = vd->vdev_mg; 2321 2322 if (flags & METASLAB_HINTBP_AVOID && 2323 mg->mg_next != NULL) 2324 mg = mg->mg_next; 2325 } else { 2326 mg = mc->mc_rotor; 2327 } 2328 } else if (d != 0) { 2329 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); 2330 mg = vd->vdev_mg->mg_next; 2331 } else { 2332 mg = mc->mc_rotor; 2333 } 2334 2335 /* 2336 * If the hint put us into the wrong metaslab class, or into a 2337 * metaslab group that has been passivated, just follow the rotor. 2338 */ 2339 if (mg->mg_class != mc || mg->mg_activation_count <= 0) 2340 mg = mc->mc_rotor; 2341 2342 rotor = mg; 2343top: 2344 all_zero = B_TRUE; 2345 do { 2346 ASSERT(mg->mg_activation_count == 1); 2347 2348 vd = mg->mg_vd; 2349 2350 /* 2351 * Don't allocate from faulted devices. 2352 */ 2353 if (zio_lock) { 2354 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); 2355 allocatable = vdev_allocatable(vd); 2356 spa_config_exit(spa, SCL_ZIO, FTAG); 2357 } else { 2358 allocatable = vdev_allocatable(vd); 2359 } 2360 2361 /* 2362 * Determine if the selected metaslab group is eligible 2363 * for allocations. If we're ganging or have requested 2364 * an allocation for the smallest gang block size 2365 * then we don't want to avoid allocating to this 2366 * metaslab group. If we're in this condition we should 2367 * try to allocate from any device possible so that we 2368 * don't inadvertently return ENOSPC and suspend the pool 2369 * even though space is still available. 2370 */ 2371 if (allocatable && CAN_FASTGANG(flags) && 2372 psize > SPA_GANGBLOCKSIZE) 2373 allocatable = metaslab_group_allocatable(mg); 2374 2375 if (!allocatable) 2376 goto next; 2377 2378 /* 2379 * Avoid writing single-copy data to a failing vdev 2380 * unless the user instructs us that it is okay. 2381 */ 2382 if ((vd->vdev_stat.vs_write_errors > 0 || 2383 vd->vdev_state < VDEV_STATE_HEALTHY) && 2384 d == 0 && dshift == 3 && vd->vdev_children == 0) { 2385 all_zero = B_FALSE; 2386 goto next; 2387 } 2388 2389 ASSERT(mg->mg_class == mc); 2390 2391 distance = vd->vdev_asize >> dshift; 2392 if (distance <= (1ULL << vd->vdev_ms_shift)) 2393 distance = 0; 2394 else 2395 all_zero = B_FALSE; 2396 2397 asize = vdev_psize_to_asize(vd, psize); 2398 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); 2399 2400 offset = metaslab_group_alloc(mg, psize, asize, txg, distance, 2401 dva, d); 2402 if (offset != -1ULL) { 2403 /* 2404 * If we've just selected this metaslab group, 2405 * figure out whether the corresponding vdev is 2406 * over- or under-used relative to the pool, 2407 * and set an allocation bias to even it out.
2408 */ 2409 if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { 2410 vdev_stat_t *vs = &vd->vdev_stat; 2411 int64_t vu, cu; 2412 2413 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); 2414 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); 2415 2416 /* 2417 * Calculate how much more or less we should 2418 * try to allocate from this device during 2419 * this iteration around the rotor. 2420 * For example, if a device is 80% full 2421 * and the pool is 20% full then we should 2422 * reduce allocations by 60% on this device. 2423 * 2424 * mg_bias = (20 - 80) * 512K / 100 = -307K 2425 * 2426 * This reduces allocations by 307K for this 2427 * iteration. 2428 */ 2429 mg->mg_bias = ((cu - vu) * 2430 (int64_t)mg->mg_aliquot) / 100; 2431 } else if (!metaslab_bias_enabled) { 2432 mg->mg_bias = 0; 2433 } 2434 2435 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= 2436 mg->mg_aliquot + mg->mg_bias) { 2437 mc->mc_rotor = mg->mg_next; 2438 mc->mc_aliquot = 0; 2439 } 2440 2441 DVA_SET_VDEV(&dva[d], vd->vdev_id); 2442 DVA_SET_OFFSET(&dva[d], offset); 2443 DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); 2444 DVA_SET_ASIZE(&dva[d], asize); 2445 2446 return (0); 2447 } 2448next: 2449 mc->mc_rotor = mg->mg_next; 2450 mc->mc_aliquot = 0; 2451 } while ((mg = mg->mg_next) != rotor); 2452 2453 if (!all_zero) { 2454 dshift++; 2455 ASSERT(dshift < 64); 2456 goto top; 2457 } 2458 2459 if (!allocatable && !zio_lock) { 2460 dshift = 3; 2461 zio_lock = B_TRUE; 2462 goto top; 2463 } 2464 2465 bzero(&dva[d], sizeof (dva_t)); 2466 2467 return (SET_ERROR(ENOSPC)); 2468} 2469 2470/* 2471 * Free the block represented by DVA in the context of the specified 2472 * transaction group. 2473 */ 2474static void 2475metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now) 2476{ 2477 uint64_t vdev = DVA_GET_VDEV(dva); 2478 uint64_t offset = DVA_GET_OFFSET(dva); 2479 uint64_t size = DVA_GET_ASIZE(dva); 2480 vdev_t *vd; 2481 metaslab_t *msp; 2482 2483 ASSERT(DVA_IS_VALID(dva)); 2484 2485 if (txg > spa_freeze_txg(spa)) 2486 return; 2487 2488 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 2489 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { 2490 cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", 2491 (u_longlong_t)vdev, (u_longlong_t)offset); 2492 ASSERT(0); 2493 return; 2494 } 2495 2496 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 2497 2498 if (DVA_GET_GANG(dva)) 2499 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 2500 2501 mutex_enter(&msp->ms_lock); 2502 2503 if (now) { 2504 range_tree_remove(msp->ms_alloctree[txg & TXG_MASK], 2505 offset, size); 2506 2507 VERIFY(!msp->ms_condensing); 2508 VERIFY3U(offset, >=, msp->ms_start); 2509 VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); 2510 VERIFY3U(range_tree_space(msp->ms_tree) + size, <=, 2511 msp->ms_size); 2512 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 2513 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 2514 range_tree_add(msp->ms_tree, offset, size); 2515 } else { 2516 if (range_tree_space(msp->ms_freetree[txg & TXG_MASK]) == 0) 2517 vdev_dirty(vd, VDD_METASLAB, msp, txg); 2518 range_tree_add(msp->ms_freetree[txg & TXG_MASK], 2519 offset, size); 2520 } 2521 2522 mutex_exit(&msp->ms_lock); 2523} 2524 2525/* 2526 * Intent log support: upon opening the pool after a crash, notify the SPA 2527 * of blocks that the intent log has allocated for immediate write, but 2528 * which are still considered free by the SPA because the last transaction 2529 * group didn't commit yet. 
2530 */ 2531static int 2532metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) 2533{ 2534 uint64_t vdev = DVA_GET_VDEV(dva); 2535 uint64_t offset = DVA_GET_OFFSET(dva); 2536 uint64_t size = DVA_GET_ASIZE(dva); 2537 vdev_t *vd; 2538 metaslab_t *msp; 2539 int error = 0; 2540 2541 ASSERT(DVA_IS_VALID(dva)); 2542 2543 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 2544 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) 2545 return (SET_ERROR(ENXIO)); 2546 2547 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 2548 2549 if (DVA_GET_GANG(dva)) 2550 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 2551 2552 mutex_enter(&msp->ms_lock); 2553 2554 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) 2555 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); 2556 2557 if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size)) 2558 error = SET_ERROR(ENOENT); 2559 2560 if (error || txg == 0) { /* txg == 0 indicates dry run */ 2561 mutex_exit(&msp->ms_lock); 2562 return (error); 2563 } 2564 2565 VERIFY(!msp->ms_condensing); 2566 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 2567 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 2568 VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size); 2569 range_tree_remove(msp->ms_tree, offset, size); 2570 2571 if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ 2572 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) 2573 vdev_dirty(vd, VDD_METASLAB, msp, txg); 2574 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size); 2575 } 2576 2577 mutex_exit(&msp->ms_lock); 2578 2579 return (0); 2580} 2581 2582int 2583metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, 2584 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags) 2585{ 2586 dva_t *dva = bp->blk_dva; 2587 dva_t *hintdva = hintbp->blk_dva; 2588 int error = 0; 2589 2590 ASSERT(bp->blk_birth == 0); 2591 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); 2592 2593 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 2594 2595 if (mc->mc_rotor == NULL) { /* no vdevs in this class */ 2596 spa_config_exit(spa, SCL_ALLOC, FTAG); 2597 return (SET_ERROR(ENOSPC)); 2598 } 2599 2600 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); 2601 ASSERT(BP_GET_NDVAS(bp) == 0); 2602 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); 2603 2604 for (int d = 0; d < ndvas; d++) { 2605 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, 2606 txg, flags); 2607 if (error != 0) { 2608 for (d--; d >= 0; d--) { 2609 metaslab_free_dva(spa, &dva[d], txg, B_TRUE); 2610 bzero(&dva[d], sizeof (dva_t)); 2611 } 2612 spa_config_exit(spa, SCL_ALLOC, FTAG); 2613 return (error); 2614 } 2615 } 2616 ASSERT(error == 0); 2617 ASSERT(BP_GET_NDVAS(bp) == ndvas); 2618 2619 spa_config_exit(spa, SCL_ALLOC, FTAG); 2620 2621 BP_SET_BIRTH(bp, txg, txg); 2622 2623 return (0); 2624} 2625 2626void 2627metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) 2628{ 2629 const dva_t *dva = bp->blk_dva; 2630 int ndvas = BP_GET_NDVAS(bp); 2631 2632 ASSERT(!BP_IS_HOLE(bp)); 2633 ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); 2634 2635 spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); 2636 2637 for (int d = 0; d < ndvas; d++) 2638 metaslab_free_dva(spa, &dva[d], txg, now); 2639 2640 spa_config_exit(spa, SCL_FREE, FTAG); 2641} 2642 2643int 2644metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) 2645{ 2646 const dva_t *dva = bp->blk_dva; 2647 int ndvas = BP_GET_NDVAS(bp); 2648 int error = 0; 2649 2650 ASSERT(!BP_IS_HOLE(bp)); 2651 2652 if 
(txg != 0) { 2653 /* 2654 * First do a dry run to make sure all DVAs are claimable, 2655 * so we don't have to unwind from partial failures below. 2656 */ 2657 if ((error = metaslab_claim(spa, bp, 0)) != 0) 2658 return (error); 2659 } 2660 2661 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 2662 2663 for (int d = 0; d < ndvas; d++) 2664 if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) 2665 break; 2666 2667 spa_config_exit(spa, SCL_ALLOC, FTAG); 2668 2669 ASSERT(error == 0 || txg == 0); 2670 2671 return (error); 2672} 2673 2674void 2675metaslab_check_free(spa_t *spa, const blkptr_t *bp) 2676{ 2677 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) 2678 return; 2679 2680 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 2681 for (int i = 0; i < BP_GET_NDVAS(bp); i++) { 2682 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]); 2683 vdev_t *vd = vdev_lookup_top(spa, vdev); 2684 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); 2685 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); 2686 metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 2687 2688 if (msp->ms_loaded) 2689 range_tree_verify(msp->ms_tree, offset, size); 2690 2691 for (int j = 0; j < TXG_SIZE; j++) 2692 range_tree_verify(msp->ms_freetree[j], offset, size); 2693 for (int j = 0; j < TXG_DEFER_SIZE; j++) 2694 range_tree_verify(msp->ms_defertree[j], offset, size); 2695 } 2696 spa_config_exit(spa, SCL_VDEV, FTAG); 2697} 2698
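/*
 * The sketch below is illustrative only and is not compiled (note the
 * #if 0).  It shows, in roughly the shape used by zio_dva_allocate() and
 * zio_free(), how the exported entry points above fit together.  The
 * function name example_alloc_free() and its simplified error handling
 * are invented for this example and exist nowhere else in the tree.
 */
#if 0
static int
example_alloc_free(spa_t *spa, blkptr_t *bp, uint64_t psize, int copies,
    uint64_t txg)
{
	metaslab_class_t *mc = spa_normal_class(spa);
	int error;

	/*
	 * bp must be a freshly zeroed block pointer; metaslab_alloc()
	 * asserts that it carries no birth txg and no DVAs yet.  Ask for
	 * 'copies' DVAs from the normal class, with no hint block pointer
	 * and no gang-related flags.
	 */
	error = metaslab_alloc(spa, mc, psize, bp, copies, txg, NULL, 0);
	if (error != 0) {
		/*
		 * ENOSPC is what drives the real callers to fall back to
		 * gang blocks (see zio_write_gang_block()).
		 */
		return (error);
	}

	/* ... the block described by bp would be written here ... */

	/*
	 * Freeing adds the space to this txg's freetree; it returns to
	 * ms_tree, and thus becomes allocatable again, only after the
	 * deferred-free window handled by metaslab_sync_done() has passed.
	 */
	metaslab_free(spa, bp, txg, B_FALSE);

	return (0);
}
#endif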