metaslab.c revision 276081
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2014 by Delphix. All rights reserved. 24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 25 */ 26 27#include <sys/zfs_context.h> 28#include <sys/dmu.h> 29#include <sys/dmu_tx.h> 30#include <sys/space_map.h> 31#include <sys/metaslab_impl.h> 32#include <sys/vdev_impl.h> 33#include <sys/zio.h> 34#include <sys/spa_impl.h> 35#include <sys/zfeature.h> 36 37SYSCTL_DECL(_vfs_zfs); 38SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab"); 39 40/* 41 * Allow allocations to switch to gang blocks quickly. We do this to 42 * avoid having to load lots of space_maps in a given txg. There are, 43 * however, some cases where we want to avoid "fast" ganging and instead 44 * we want to do an exhaustive search of all metaslabs on this device. 45 * Currently we don't allow any gang, slog, or dump device related allocations 46 * to "fast" gang. 
47 */ 48#define CAN_FASTGANG(flags) \ 49 (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \ 50 METASLAB_GANG_AVOID))) 51 52#define METASLAB_WEIGHT_PRIMARY (1ULL << 63) 53#define METASLAB_WEIGHT_SECONDARY (1ULL << 62) 54#define METASLAB_ACTIVE_MASK \ 55 (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) 56 57uint64_t metaslab_aliquot = 512ULL << 10; 58uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ 59TUNABLE_QUAD("vfs.zfs.metaslab.gang_bang", &metaslab_gang_bang); 60SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, gang_bang, CTLFLAG_RWTUN, 61 &metaslab_gang_bang, 0, 62 "Force gang block allocation for blocks larger than or equal to this value"); 63 64/* 65 * The in-core space map representation is more compact than its on-disk form. 66 * The zfs_condense_pct determines how much more compact the in-core 67 * space_map representation must be before we compact it on-disk. 68 * Values should be greater than or equal to 100. 69 */ 70int zfs_condense_pct = 200; 71TUNABLE_INT("vfs.zfs.condense_pct", &zfs_condense_pct); 72SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN, 73 &zfs_condense_pct, 0, 74 "Condense on-disk spacemap when it is more than this many percents" 75 " of in-memory counterpart"); 76 77/* 78 * Condensing a metaslab is not guaranteed to actually reduce the amount of 79 * space used on disk. In particular, a space map uses data in increments of 80 * MAX(1 << ashift, space_map_blksize), so a metaslab might use the 81 * same number of blocks after condensing. Since the goal of condensing is to 82 * reduce the number of IOPs required to read the space map, we only want to 83 * condense when we can be sure we will reduce the number of blocks used by the 84 * space map. Unfortunately, we cannot precisely compute whether or not this is 85 * the case in metaslab_should_condense since we are holding ms_lock. 
Instead, 86 * we apply the following heuristic: do not condense a spacemap unless the 87 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold 88 * blocks. 89 */ 90int zfs_metaslab_condense_block_threshold = 4; 91 92/* 93 * The zfs_mg_noalloc_threshold defines which metaslab groups should 94 * be eligible for allocation. The value is defined as a percentage of 95 * free space. Metaslab groups that have more free space than 96 * zfs_mg_noalloc_threshold are always eligible for allocations. Once 97 * a metaslab group's free space is less than or equal to the 98 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that 99 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold. 100 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all 101 * groups are allowed to accept allocations. Gang blocks are always 102 * eligible to allocate on any metaslab group. The default value of 0 means 103 * no metaslab group will be excluded based on this criterion. 104 */ 105int zfs_mg_noalloc_threshold = 0; 106TUNABLE_INT("vfs.zfs.mg_noalloc_threshold", &zfs_mg_noalloc_threshold); 107SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN, 108 &zfs_mg_noalloc_threshold, 0, 109 "Percentage of metaslab group size that should be free" 110 " to make it eligible for allocation"); 111 112/* 113 * Metaslab groups are considered eligible for allocations if their 114 * fragmenation metric (measured as a percentage) is less than or equal to 115 * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold 116 * then it will be skipped unless all metaslab groups within the metaslab 117 * class have also crossed this threshold. 
118 */ 119int zfs_mg_fragmentation_threshold = 85; 120TUNABLE_INT("vfs.zfs.mg_fragmentation_threshold", &zfs_mg_fragmentation_threshold); 121SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_fragmentation_threshold, CTLFLAG_RWTUN, 122 &zfs_mg_fragmentation_threshold, 0, 123 "Percentage of metaslab group size that should be considered " 124 "eligible for allocations unless all metaslab groups within the metaslab class " 125 "have also crossed this threshold"); 126 127/* 128 * Allow metaslabs to keep their active state as long as their fragmentation 129 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An 130 * active metaslab that exceeds this threshold will no longer keep its active 131 * status allowing better metaslabs to be selected. 132 */ 133int zfs_metaslab_fragmentation_threshold = 70; 134TUNABLE_INT("vfs.zfs.metaslab.fragmentation_threshold", 135 &zfs_metaslab_fragmentation_threshold); 136SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_threshold, CTLFLAG_RWTUN, 137 &zfs_metaslab_fragmentation_threshold, 0, 138 "Maximum percentage of metaslab fragmentation level to keep their active state"); 139 140/* 141 * When set will load all metaslabs when pool is first opened. 142 */ 143int metaslab_debug_load = 0; 144TUNABLE_INT("vfs.zfs.metaslab.debug_load", &metaslab_debug_load); 145SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_load, CTLFLAG_RWTUN, 146 &metaslab_debug_load, 0, 147 "Load all metaslabs when pool is first opened"); 148 149/* 150 * When set will prevent metaslabs from being unloaded. 151 */ 152int metaslab_debug_unload = 0; 153TUNABLE_INT("vfs.zfs.metaslab.debug_unload", &metaslab_debug_unload); 154SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_unload, CTLFLAG_RWTUN, 155 &metaslab_debug_unload, 0, 156 "Prevent metaslabs from being unloaded"); 157 158/* 159 * Minimum size which forces the dynamic allocator to change 160 * it's allocation strategy. 
Once the space map cannot satisfy 161 * an allocation of this size then it switches to using more 162 * aggressive strategy (i.e search by size rather than offset). 163 */ 164uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE; 165TUNABLE_QUAD("vfs.zfs.metaslab.df_alloc_threshold", 166 &metaslab_df_alloc_threshold); 167SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN, 168 &metaslab_df_alloc_threshold, 0, 169 "Minimum size which forces the dynamic allocator to change it's allocation strategy"); 170 171/* 172 * The minimum free space, in percent, which must be available 173 * in a space map to continue allocations in a first-fit fashion. 174 * Once the space_map's free space drops below this level we dynamically 175 * switch to using best-fit allocations. 176 */ 177int metaslab_df_free_pct = 4; 178TUNABLE_INT("vfs.zfs.metaslab.df_free_pct", &metaslab_df_free_pct); 179SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN, 180 &metaslab_df_free_pct, 0, 181 "The minimum free space, in percent, which must be available in a space map to continue allocations in a first-fit fashion"); 182 183/* 184 * A metaslab is considered "free" if it contains a contiguous 185 * segment which is greater than metaslab_min_alloc_size. 186 */ 187uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS; 188TUNABLE_QUAD("vfs.zfs.metaslab.min_alloc_size", 189 &metaslab_min_alloc_size); 190SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, min_alloc_size, CTLFLAG_RWTUN, 191 &metaslab_min_alloc_size, 0, 192 "A metaslab is considered \"free\" if it contains a contiguous segment which is greater than vfs.zfs.metaslab.min_alloc_size"); 193 194/* 195 * Percentage of all cpus that can be used by the metaslab taskq. 
196 */ 197int metaslab_load_pct = 50; 198TUNABLE_INT("vfs.zfs.metaslab.load_pct", &metaslab_load_pct); 199SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN, 200 &metaslab_load_pct, 0, 201 "Percentage of cpus that can be used by the metaslab taskq"); 202 203/* 204 * Determines how many txgs a metaslab may remain loaded without having any 205 * allocations from it. As long as a metaslab continues to be used we will 206 * keep it loaded. 207 */ 208int metaslab_unload_delay = TXG_SIZE * 2; 209TUNABLE_INT("vfs.zfs.metaslab.unload_delay", &metaslab_unload_delay); 210SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN, 211 &metaslab_unload_delay, 0, 212 "Number of TXGs that an unused metaslab can be kept in memory"); 213 214/* 215 * Max number of metaslabs per group to preload. 216 */ 217int metaslab_preload_limit = SPA_DVAS_PER_BP; 218TUNABLE_INT("vfs.zfs.metaslab.preload_limit", &metaslab_preload_limit); 219SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN, 220 &metaslab_preload_limit, 0, 221 "Max number of metaslabs per group to preload"); 222 223/* 224 * Enable/disable preloading of metaslab. 225 */ 226boolean_t metaslab_preload_enabled = B_TRUE; 227TUNABLE_INT("vfs.zfs.metaslab.preload_enabled", &metaslab_preload_enabled); 228SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN, 229 &metaslab_preload_enabled, 0, 230 "Max number of metaslabs per group to preload"); 231 232/* 233 * Enable/disable fragmentation weighting on metaslabs. 234 */ 235boolean_t metaslab_fragmentation_factor_enabled = B_TRUE; 236TUNABLE_INT("vfs.zfs.metaslab_fragmentation_factor_enabled", 237 &metaslab_fragmentation_factor_enabled); 238SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_factor_enabled, CTLFLAG_RWTUN, 239 &metaslab_fragmentation_factor_enabled, 0, 240 "Enable fragmentation weighting on metaslabs"); 241 242/* 243 * Enable/disable lba weighting (i.e. outer tracks are given preference). 
244 */ 245boolean_t metaslab_lba_weighting_enabled = B_TRUE; 246TUNABLE_INT("vfs.zfs.metaslab.lba_weighting_enabled", 247 &metaslab_lba_weighting_enabled); 248SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, lba_weighting_enabled, CTLFLAG_RWTUN, 249 &metaslab_lba_weighting_enabled, 0, 250 "Enable LBA weighting (i.e. outer tracks are given preference)"); 251 252/* 253 * Enable/disable metaslab group biasing. 254 */ 255boolean_t metaslab_bias_enabled = B_TRUE; 256TUNABLE_INT("vfs.zfs.metaslab.bias_enabled", 257 &metaslab_bias_enabled); 258SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, bias_enabled, CTLFLAG_RWTUN, 259 &metaslab_bias_enabled, 0, 260 "Enable metaslab group biasing"); 261 262static uint64_t metaslab_fragmentation(metaslab_t *); 263 264/* 265 * ========================================================================== 266 * Metaslab classes 267 * ========================================================================== 268 */ 269metaslab_class_t * 270metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) 271{ 272 metaslab_class_t *mc; 273 274 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); 275 276 mc->mc_spa = spa; 277 mc->mc_rotor = NULL; 278 mc->mc_ops = ops; 279 280 return (mc); 281} 282 283void 284metaslab_class_destroy(metaslab_class_t *mc) 285{ 286 ASSERT(mc->mc_rotor == NULL); 287 ASSERT(mc->mc_alloc == 0); 288 ASSERT(mc->mc_deferred == 0); 289 ASSERT(mc->mc_space == 0); 290 ASSERT(mc->mc_dspace == 0); 291 292 kmem_free(mc, sizeof (metaslab_class_t)); 293} 294 295int 296metaslab_class_validate(metaslab_class_t *mc) 297{ 298 metaslab_group_t *mg; 299 vdev_t *vd; 300 301 /* 302 * Must hold one of the spa_config locks. 
303 */ 304 ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || 305 spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); 306 307 if ((mg = mc->mc_rotor) == NULL) 308 return (0); 309 310 do { 311 vd = mg->mg_vd; 312 ASSERT(vd->vdev_mg != NULL); 313 ASSERT3P(vd->vdev_top, ==, vd); 314 ASSERT3P(mg->mg_class, ==, mc); 315 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); 316 } while ((mg = mg->mg_next) != mc->mc_rotor); 317 318 return (0); 319} 320 321void 322metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, 323 int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) 324{ 325 atomic_add_64(&mc->mc_alloc, alloc_delta); 326 atomic_add_64(&mc->mc_deferred, defer_delta); 327 atomic_add_64(&mc->mc_space, space_delta); 328 atomic_add_64(&mc->mc_dspace, dspace_delta); 329} 330 331void 332metaslab_class_minblocksize_update(metaslab_class_t *mc) 333{ 334 metaslab_group_t *mg; 335 vdev_t *vd; 336 uint64_t minashift = UINT64_MAX; 337 338 if ((mg = mc->mc_rotor) == NULL) { 339 mc->mc_minblocksize = SPA_MINBLOCKSIZE; 340 return; 341 } 342 343 do { 344 vd = mg->mg_vd; 345 if (vd->vdev_ashift < minashift) 346 minashift = vd->vdev_ashift; 347 } while ((mg = mg->mg_next) != mc->mc_rotor); 348 349 mc->mc_minblocksize = 1ULL << minashift; 350} 351 352uint64_t 353metaslab_class_get_alloc(metaslab_class_t *mc) 354{ 355 return (mc->mc_alloc); 356} 357 358uint64_t 359metaslab_class_get_deferred(metaslab_class_t *mc) 360{ 361 return (mc->mc_deferred); 362} 363 364uint64_t 365metaslab_class_get_space(metaslab_class_t *mc) 366{ 367 return (mc->mc_space); 368} 369 370uint64_t 371metaslab_class_get_dspace(metaslab_class_t *mc) 372{ 373 return (spa_deflate(mc->mc_spa) ? 
mc->mc_dspace : mc->mc_space); 374} 375 376uint64_t 377metaslab_class_get_minblocksize(metaslab_class_t *mc) 378{ 379 return (mc->mc_minblocksize); 380} 381 382void 383metaslab_class_histogram_verify(metaslab_class_t *mc) 384{ 385 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 386 uint64_t *mc_hist; 387 int i; 388 389 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) 390 return; 391 392 mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, 393 KM_SLEEP); 394 395 for (int c = 0; c < rvd->vdev_children; c++) { 396 vdev_t *tvd = rvd->vdev_child[c]; 397 metaslab_group_t *mg = tvd->vdev_mg; 398 399 /* 400 * Skip any holes, uninitialized top-levels, or 401 * vdevs that are not in this metalab class. 402 */ 403 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || 404 mg->mg_class != mc) { 405 continue; 406 } 407 408 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 409 mc_hist[i] += mg->mg_histogram[i]; 410 } 411 412 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 413 VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]); 414 415 kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); 416} 417 418/* 419 * Calculate the metaslab class's fragmentation metric. The metric 420 * is weighted based on the space contribution of each metaslab group. 421 * The return value will be a number between 0 and 100 (inclusive), or 422 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the 423 * zfs_frag_table for more information about the metric. 424 */ 425uint64_t 426metaslab_class_fragmentation(metaslab_class_t *mc) 427{ 428 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 429 uint64_t fragmentation = 0; 430 431 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); 432 433 for (int c = 0; c < rvd->vdev_children; c++) { 434 vdev_t *tvd = rvd->vdev_child[c]; 435 metaslab_group_t *mg = tvd->vdev_mg; 436 437 /* 438 * Skip any holes, uninitialized top-levels, or 439 * vdevs that are not in this metalab class. 
440 */ 441 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || 442 mg->mg_class != mc) { 443 continue; 444 } 445 446 /* 447 * If a metaslab group does not contain a fragmentation 448 * metric then just bail out. 449 */ 450 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { 451 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 452 return (ZFS_FRAG_INVALID); 453 } 454 455 /* 456 * Determine how much this metaslab_group is contributing 457 * to the overall pool fragmentation metric. 458 */ 459 fragmentation += mg->mg_fragmentation * 460 metaslab_group_get_space(mg); 461 } 462 fragmentation /= metaslab_class_get_space(mc); 463 464 ASSERT3U(fragmentation, <=, 100); 465 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 466 return (fragmentation); 467} 468 469/* 470 * Calculate the amount of expandable space that is available in 471 * this metaslab class. If a device is expanded then its expandable 472 * space will be the amount of allocatable space that is currently not 473 * part of this metaslab class. 474 */ 475uint64_t 476metaslab_class_expandable_space(metaslab_class_t *mc) 477{ 478 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 479 uint64_t space = 0; 480 481 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); 482 for (int c = 0; c < rvd->vdev_children; c++) { 483 vdev_t *tvd = rvd->vdev_child[c]; 484 metaslab_group_t *mg = tvd->vdev_mg; 485 486 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || 487 mg->mg_class != mc) { 488 continue; 489 } 490 491 space += tvd->vdev_max_asize - tvd->vdev_asize; 492 } 493 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 494 return (space); 495} 496 497/* 498 * ========================================================================== 499 * Metaslab groups 500 * ========================================================================== 501 */ 502static int 503metaslab_compare(const void *x1, const void *x2) 504{ 505 const metaslab_t *m1 = x1; 506 const metaslab_t *m2 = x2; 507 508 if (m1->ms_weight < m2->ms_weight) 509 return (1); 510 if 
(m1->ms_weight > m2->ms_weight) 511 return (-1); 512 513 /* 514 * If the weights are identical, use the offset to force uniqueness. 515 */ 516 if (m1->ms_start < m2->ms_start) 517 return (-1); 518 if (m1->ms_start > m2->ms_start) 519 return (1); 520 521 ASSERT3P(m1, ==, m2); 522 523 return (0); 524} 525 526/* 527 * Update the allocatable flag and the metaslab group's capacity. 528 * The allocatable flag is set to true if the capacity is below 529 * the zfs_mg_noalloc_threshold. If a metaslab group transitions 530 * from allocatable to non-allocatable or vice versa then the metaslab 531 * group's class is updated to reflect the transition. 532 */ 533static void 534metaslab_group_alloc_update(metaslab_group_t *mg) 535{ 536 vdev_t *vd = mg->mg_vd; 537 metaslab_class_t *mc = mg->mg_class; 538 vdev_stat_t *vs = &vd->vdev_stat; 539 boolean_t was_allocatable; 540 541 ASSERT(vd == vd->vdev_top); 542 543 mutex_enter(&mg->mg_lock); 544 was_allocatable = mg->mg_allocatable; 545 546 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / 547 (vs->vs_space + 1); 548 549 /* 550 * A metaslab group is considered allocatable if it has plenty 551 * of free space or is not heavily fragmented. We only take 552 * fragmentation into account if the metaslab group has a valid 553 * fragmentation metric (i.e. a value between 0 and 100). 554 */ 555 mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold && 556 (mg->mg_fragmentation == ZFS_FRAG_INVALID || 557 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); 558 559 /* 560 * The mc_alloc_groups maintains a count of the number of 561 * groups in this metaslab class that are still above the 562 * zfs_mg_noalloc_threshold. This is used by the allocating 563 * threads to determine if they should avoid allocations to 564 * a given group. 
The allocator will avoid allocations to a group 565 * if that group has reached or is below the zfs_mg_noalloc_threshold 566 * and there are still other groups that are above the threshold. 567 * When a group transitions from allocatable to non-allocatable or 568 * vice versa we update the metaslab class to reflect that change. 569 * When the mc_alloc_groups value drops to 0 that means that all 570 * groups have reached the zfs_mg_noalloc_threshold making all groups 571 * eligible for allocations. This effectively means that all devices 572 * are balanced again. 573 */ 574 if (was_allocatable && !mg->mg_allocatable) 575 mc->mc_alloc_groups--; 576 else if (!was_allocatable && mg->mg_allocatable) 577 mc->mc_alloc_groups++; 578 579 mutex_exit(&mg->mg_lock); 580} 581 582metaslab_group_t * 583metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) 584{ 585 metaslab_group_t *mg; 586 587 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); 588 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); 589 avl_create(&mg->mg_metaslab_tree, metaslab_compare, 590 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); 591 mg->mg_vd = vd; 592 mg->mg_class = mc; 593 mg->mg_activation_count = 0; 594 595 mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, 596 minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT); 597 598 return (mg); 599} 600 601void 602metaslab_group_destroy(metaslab_group_t *mg) 603{ 604 ASSERT(mg->mg_prev == NULL); 605 ASSERT(mg->mg_next == NULL); 606 /* 607 * We may have gone below zero with the activation count 608 * either because we never activated in the first place or 609 * because we're done, and possibly removing the vdev. 
610 */ 611 ASSERT(mg->mg_activation_count <= 0); 612 613 taskq_destroy(mg->mg_taskq); 614 avl_destroy(&mg->mg_metaslab_tree); 615 mutex_destroy(&mg->mg_lock); 616 kmem_free(mg, sizeof (metaslab_group_t)); 617} 618 619void 620metaslab_group_activate(metaslab_group_t *mg) 621{ 622 metaslab_class_t *mc = mg->mg_class; 623 metaslab_group_t *mgprev, *mgnext; 624 625 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); 626 627 ASSERT(mc->mc_rotor != mg); 628 ASSERT(mg->mg_prev == NULL); 629 ASSERT(mg->mg_next == NULL); 630 ASSERT(mg->mg_activation_count <= 0); 631 632 if (++mg->mg_activation_count <= 0) 633 return; 634 635 mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); 636 metaslab_group_alloc_update(mg); 637 638 if ((mgprev = mc->mc_rotor) == NULL) { 639 mg->mg_prev = mg; 640 mg->mg_next = mg; 641 } else { 642 mgnext = mgprev->mg_next; 643 mg->mg_prev = mgprev; 644 mg->mg_next = mgnext; 645 mgprev->mg_next = mg; 646 mgnext->mg_prev = mg; 647 } 648 mc->mc_rotor = mg; 649 metaslab_class_minblocksize_update(mc); 650} 651 652void 653metaslab_group_passivate(metaslab_group_t *mg) 654{ 655 metaslab_class_t *mc = mg->mg_class; 656 metaslab_group_t *mgprev, *mgnext; 657 658 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); 659 660 if (--mg->mg_activation_count != 0) { 661 ASSERT(mc->mc_rotor != mg); 662 ASSERT(mg->mg_prev == NULL); 663 ASSERT(mg->mg_next == NULL); 664 ASSERT(mg->mg_activation_count < 0); 665 return; 666 } 667 668 taskq_wait(mg->mg_taskq); 669 metaslab_group_alloc_update(mg); 670 671 mgprev = mg->mg_prev; 672 mgnext = mg->mg_next; 673 674 if (mg == mgnext) { 675 mc->mc_rotor = NULL; 676 } else { 677 mc->mc_rotor = mgnext; 678 mgprev->mg_next = mgnext; 679 mgnext->mg_prev = mgprev; 680 } 681 682 mg->mg_prev = NULL; 683 mg->mg_next = NULL; 684 metaslab_class_minblocksize_update(mc); 685} 686 687uint64_t 688metaslab_group_get_space(metaslab_group_t *mg) 689{ 690 return ((1ULL << mg->mg_vd->vdev_ms_shift) * 
mg->mg_vd->vdev_ms_count); 691} 692 693void 694metaslab_group_histogram_verify(metaslab_group_t *mg) 695{ 696 uint64_t *mg_hist; 697 vdev_t *vd = mg->mg_vd; 698 uint64_t ashift = vd->vdev_ashift; 699 int i; 700 701 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) 702 return; 703 704 mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, 705 KM_SLEEP); 706 707 ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=, 708 SPACE_MAP_HISTOGRAM_SIZE + ashift); 709 710 for (int m = 0; m < vd->vdev_ms_count; m++) { 711 metaslab_t *msp = vd->vdev_ms[m]; 712 713 if (msp->ms_sm == NULL) 714 continue; 715 716 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) 717 mg_hist[i + ashift] += 718 msp->ms_sm->sm_phys->smp_histogram[i]; 719 } 720 721 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++) 722 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); 723 724 kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); 725} 726 727static void 728metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp) 729{ 730 metaslab_class_t *mc = mg->mg_class; 731 uint64_t ashift = mg->mg_vd->vdev_ashift; 732 733 ASSERT(MUTEX_HELD(&msp->ms_lock)); 734 if (msp->ms_sm == NULL) 735 return; 736 737 mutex_enter(&mg->mg_lock); 738 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 739 mg->mg_histogram[i + ashift] += 740 msp->ms_sm->sm_phys->smp_histogram[i]; 741 mc->mc_histogram[i + ashift] += 742 msp->ms_sm->sm_phys->smp_histogram[i]; 743 } 744 mutex_exit(&mg->mg_lock); 745} 746 747void 748metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) 749{ 750 metaslab_class_t *mc = mg->mg_class; 751 uint64_t ashift = mg->mg_vd->vdev_ashift; 752 753 ASSERT(MUTEX_HELD(&msp->ms_lock)); 754 if (msp->ms_sm == NULL) 755 return; 756 757 mutex_enter(&mg->mg_lock); 758 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 759 ASSERT3U(mg->mg_histogram[i + ashift], >=, 760 msp->ms_sm->sm_phys->smp_histogram[i]); 761 ASSERT3U(mc->mc_histogram[i + ashift], >=, 762 
msp->ms_sm->sm_phys->smp_histogram[i]); 763 764 mg->mg_histogram[i + ashift] -= 765 msp->ms_sm->sm_phys->smp_histogram[i]; 766 mc->mc_histogram[i + ashift] -= 767 msp->ms_sm->sm_phys->smp_histogram[i]; 768 } 769 mutex_exit(&mg->mg_lock); 770} 771 772static void 773metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) 774{ 775 ASSERT(msp->ms_group == NULL); 776 mutex_enter(&mg->mg_lock); 777 msp->ms_group = mg; 778 msp->ms_weight = 0; 779 avl_add(&mg->mg_metaslab_tree, msp); 780 mutex_exit(&mg->mg_lock); 781 782 mutex_enter(&msp->ms_lock); 783 metaslab_group_histogram_add(mg, msp); 784 mutex_exit(&msp->ms_lock); 785} 786 787static void 788metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) 789{ 790 mutex_enter(&msp->ms_lock); 791 metaslab_group_histogram_remove(mg, msp); 792 mutex_exit(&msp->ms_lock); 793 794 mutex_enter(&mg->mg_lock); 795 ASSERT(msp->ms_group == mg); 796 avl_remove(&mg->mg_metaslab_tree, msp); 797 msp->ms_group = NULL; 798 mutex_exit(&mg->mg_lock); 799} 800 801static void 802metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) 803{ 804 /* 805 * Although in principle the weight can be any value, in 806 * practice we do not use values in the range [1, 511]. 807 */ 808 ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0); 809 ASSERT(MUTEX_HELD(&msp->ms_lock)); 810 811 mutex_enter(&mg->mg_lock); 812 ASSERT(msp->ms_group == mg); 813 avl_remove(&mg->mg_metaslab_tree, msp); 814 msp->ms_weight = weight; 815 avl_add(&mg->mg_metaslab_tree, msp); 816 mutex_exit(&mg->mg_lock); 817} 818 819/* 820 * Calculate the fragmentation for a given metaslab group. We can use 821 * a simple average here since all metaslabs within the group must have 822 * the same size. The return value will be a value between 0 and 100 823 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this 824 * group have a fragmentation metric. 
825 */ 826uint64_t 827metaslab_group_fragmentation(metaslab_group_t *mg) 828{ 829 vdev_t *vd = mg->mg_vd; 830 uint64_t fragmentation = 0; 831 uint64_t valid_ms = 0; 832 833 for (int m = 0; m < vd->vdev_ms_count; m++) { 834 metaslab_t *msp = vd->vdev_ms[m]; 835 836 if (msp->ms_fragmentation == ZFS_FRAG_INVALID) 837 continue; 838 839 valid_ms++; 840 fragmentation += msp->ms_fragmentation; 841 } 842 843 if (valid_ms <= vd->vdev_ms_count / 2) 844 return (ZFS_FRAG_INVALID); 845 846 fragmentation /= valid_ms; 847 ASSERT3U(fragmentation, <=, 100); 848 return (fragmentation); 849} 850 851/* 852 * Determine if a given metaslab group should skip allocations. A metaslab 853 * group should avoid allocations if its free capacity is less than the 854 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than 855 * zfs_mg_fragmentation_threshold and there is at least one metaslab group 856 * that can still handle allocations. 857 */ 858static boolean_t 859metaslab_group_allocatable(metaslab_group_t *mg) 860{ 861 vdev_t *vd = mg->mg_vd; 862 spa_t *spa = vd->vdev_spa; 863 metaslab_class_t *mc = mg->mg_class; 864 865 /* 866 * We use two key metrics to determine if a metaslab group is 867 * considered allocatable -- free space and fragmentation. If 868 * the free space is greater than the free space threshold and 869 * the fragmentation is less than the fragmentation threshold then 870 * consider the group allocatable. There are two case when we will 871 * not consider these key metrics. The first is if the group is 872 * associated with a slog device and the second is if all groups 873 * in this metaslab class have already been consider ineligible 874 * for allocations. 
875 */ 876 return ((mg->mg_free_capacity > zfs_mg_noalloc_threshold && 877 (mg->mg_fragmentation == ZFS_FRAG_INVALID || 878 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)) || 879 mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0); 880} 881 882/* 883 * ========================================================================== 884 * Range tree callbacks 885 * ========================================================================== 886 */ 887 888/* 889 * Comparison function for the private size-ordered tree. Tree is sorted 890 * by size, larger sizes at the end of the tree. 891 */ 892static int 893metaslab_rangesize_compare(const void *x1, const void *x2) 894{ 895 const range_seg_t *r1 = x1; 896 const range_seg_t *r2 = x2; 897 uint64_t rs_size1 = r1->rs_end - r1->rs_start; 898 uint64_t rs_size2 = r2->rs_end - r2->rs_start; 899 900 if (rs_size1 < rs_size2) 901 return (-1); 902 if (rs_size1 > rs_size2) 903 return (1); 904 905 if (r1->rs_start < r2->rs_start) 906 return (-1); 907 908 if (r1->rs_start > r2->rs_start) 909 return (1); 910 911 return (0); 912} 913 914/* 915 * Create any block allocator specific components. The current allocators 916 * rely on using both a size-ordered range_tree_t and an array of uint64_t's. 917 */ 918static void 919metaslab_rt_create(range_tree_t *rt, void *arg) 920{ 921 metaslab_t *msp = arg; 922 923 ASSERT3P(rt->rt_arg, ==, msp); 924 ASSERT(msp->ms_tree == NULL); 925 926 avl_create(&msp->ms_size_tree, metaslab_rangesize_compare, 927 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); 928} 929 930/* 931 * Destroy the block allocator specific components. 
932 */ 933static void 934metaslab_rt_destroy(range_tree_t *rt, void *arg) 935{ 936 metaslab_t *msp = arg; 937 938 ASSERT3P(rt->rt_arg, ==, msp); 939 ASSERT3P(msp->ms_tree, ==, rt); 940 ASSERT0(avl_numnodes(&msp->ms_size_tree)); 941 942 avl_destroy(&msp->ms_size_tree); 943} 944 945static void 946metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg) 947{ 948 metaslab_t *msp = arg; 949 950 ASSERT3P(rt->rt_arg, ==, msp); 951 ASSERT3P(msp->ms_tree, ==, rt); 952 VERIFY(!msp->ms_condensing); 953 avl_add(&msp->ms_size_tree, rs); 954} 955 956static void 957metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg) 958{ 959 metaslab_t *msp = arg; 960 961 ASSERT3P(rt->rt_arg, ==, msp); 962 ASSERT3P(msp->ms_tree, ==, rt); 963 VERIFY(!msp->ms_condensing); 964 avl_remove(&msp->ms_size_tree, rs); 965} 966 967static void 968metaslab_rt_vacate(range_tree_t *rt, void *arg) 969{ 970 metaslab_t *msp = arg; 971 972 ASSERT3P(rt->rt_arg, ==, msp); 973 ASSERT3P(msp->ms_tree, ==, rt); 974 975 /* 976 * Normally one would walk the tree freeing nodes along the way. 977 * Since the nodes are shared with the range trees we can avoid 978 * walking all nodes and just reinitialize the avl tree. The nodes 979 * will be freed by the range tree, so we don't want to free them here. 980 */ 981 avl_create(&msp->ms_size_tree, metaslab_rangesize_compare, 982 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); 983} 984 985static range_tree_ops_t metaslab_rt_ops = { 986 metaslab_rt_create, 987 metaslab_rt_destroy, 988 metaslab_rt_add, 989 metaslab_rt_remove, 990 metaslab_rt_vacate 991}; 992 993/* 994 * ========================================================================== 995 * Metaslab block operations 996 * ========================================================================== 997 */ 998 999/* 1000 * Return the maximum contiguous segment within the metaslab. 
1001 */ 1002uint64_t 1003metaslab_block_maxsize(metaslab_t *msp) 1004{ 1005 avl_tree_t *t = &msp->ms_size_tree; 1006 range_seg_t *rs; 1007 1008 if (t == NULL || (rs = avl_last(t)) == NULL) 1009 return (0ULL); 1010 1011 return (rs->rs_end - rs->rs_start); 1012} 1013 1014uint64_t 1015metaslab_block_alloc(metaslab_t *msp, uint64_t size) 1016{ 1017 uint64_t start; 1018 range_tree_t *rt = msp->ms_tree; 1019 1020 VERIFY(!msp->ms_condensing); 1021 1022 start = msp->ms_ops->msop_alloc(msp, size); 1023 if (start != -1ULL) { 1024 vdev_t *vd = msp->ms_group->mg_vd; 1025 1026 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); 1027 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 1028 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); 1029 range_tree_remove(rt, start, size); 1030 } 1031 return (start); 1032} 1033 1034/* 1035 * ========================================================================== 1036 * Common allocator routines 1037 * ========================================================================== 1038 */ 1039 1040/* 1041 * This is a helper function that can be used by the allocator to find 1042 * a suitable block to allocate. This will search the specified AVL 1043 * tree looking for a block that matches the specified criteria. 1044 */ 1045static uint64_t 1046metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, 1047 uint64_t align) 1048{ 1049 range_seg_t *rs, rsearch; 1050 avl_index_t where; 1051 1052 rsearch.rs_start = *cursor; 1053 rsearch.rs_end = *cursor + size; 1054 1055 rs = avl_find(t, &rsearch, &where); 1056 if (rs == NULL) 1057 rs = avl_nearest(t, where, AVL_AFTER); 1058 1059 while (rs != NULL) { 1060 uint64_t offset = P2ROUNDUP(rs->rs_start, align); 1061 1062 if (offset + size <= rs->rs_end) { 1063 *cursor = offset + size; 1064 return (offset); 1065 } 1066 rs = AVL_NEXT(t, rs); 1067 } 1068 1069 /* 1070 * If we know we've searched the whole map (*cursor == 0), give up. 
1071 * Otherwise, reset the cursor to the beginning and try again. 1072 */ 1073 if (*cursor == 0) 1074 return (-1ULL); 1075 1076 *cursor = 0; 1077 return (metaslab_block_picker(t, cursor, size, align)); 1078} 1079 1080/* 1081 * ========================================================================== 1082 * The first-fit block allocator 1083 * ========================================================================== 1084 */ 1085static uint64_t 1086metaslab_ff_alloc(metaslab_t *msp, uint64_t size) 1087{ 1088 /* 1089 * Find the largest power of 2 block size that evenly divides the 1090 * requested size. This is used to try to allocate blocks with similar 1091 * alignment from the same area of the metaslab (i.e. same cursor 1092 * bucket) but it does not guarantee that other allocations sizes 1093 * may exist in the same region. 1094 */ 1095 uint64_t align = size & -size; 1096 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; 1097 avl_tree_t *t = &msp->ms_tree->rt_root; 1098 1099 return (metaslab_block_picker(t, cursor, size, align)); 1100} 1101 1102static metaslab_ops_t metaslab_ff_ops = { 1103 metaslab_ff_alloc 1104}; 1105 1106/* 1107 * ========================================================================== 1108 * Dynamic block allocator - 1109 * Uses the first fit allocation scheme until space get low and then 1110 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold 1111 * and metaslab_df_free_pct to determine when to switch the allocation scheme. 1112 * ========================================================================== 1113 */ 1114static uint64_t 1115metaslab_df_alloc(metaslab_t *msp, uint64_t size) 1116{ 1117 /* 1118 * Find the largest power of 2 block size that evenly divides the 1119 * requested size. This is used to try to allocate blocks with similar 1120 * alignment from the same area of the metaslab (i.e. 
same cursor 1121 * bucket) but it does not guarantee that other allocations sizes 1122 * may exist in the same region. 1123 */ 1124 uint64_t align = size & -size; 1125 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; 1126 range_tree_t *rt = msp->ms_tree; 1127 avl_tree_t *t = &rt->rt_root; 1128 uint64_t max_size = metaslab_block_maxsize(msp); 1129 int free_pct = range_tree_space(rt) * 100 / msp->ms_size; 1130 1131 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1132 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree)); 1133 1134 if (max_size < size) 1135 return (-1ULL); 1136 1137 /* 1138 * If we're running low on space switch to using the size 1139 * sorted AVL tree (best-fit). 1140 */ 1141 if (max_size < metaslab_df_alloc_threshold || 1142 free_pct < metaslab_df_free_pct) { 1143 t = &msp->ms_size_tree; 1144 *cursor = 0; 1145 } 1146 1147 return (metaslab_block_picker(t, cursor, size, 1ULL)); 1148} 1149 1150static metaslab_ops_t metaslab_df_ops = { 1151 metaslab_df_alloc 1152}; 1153 1154/* 1155 * ========================================================================== 1156 * Cursor fit block allocator - 1157 * Select the largest region in the metaslab, set the cursor to the beginning 1158 * of the range and the cursor_end to the end of the range. As allocations 1159 * are made advance the cursor. Continue allocating from the cursor until 1160 * the range is exhausted and then find a new range. 
1161 * ========================================================================== 1162 */ 1163static uint64_t 1164metaslab_cf_alloc(metaslab_t *msp, uint64_t size) 1165{ 1166 range_tree_t *rt = msp->ms_tree; 1167 avl_tree_t *t = &msp->ms_size_tree; 1168 uint64_t *cursor = &msp->ms_lbas[0]; 1169 uint64_t *cursor_end = &msp->ms_lbas[1]; 1170 uint64_t offset = 0; 1171 1172 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1173 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root)); 1174 1175 ASSERT3U(*cursor_end, >=, *cursor); 1176 1177 if ((*cursor + size) > *cursor_end) { 1178 range_seg_t *rs; 1179 1180 rs = avl_last(&msp->ms_size_tree); 1181 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) 1182 return (-1ULL); 1183 1184 *cursor = rs->rs_start; 1185 *cursor_end = rs->rs_end; 1186 } 1187 1188 offset = *cursor; 1189 *cursor += size; 1190 1191 return (offset); 1192} 1193 1194static metaslab_ops_t metaslab_cf_ops = { 1195 metaslab_cf_alloc 1196}; 1197 1198/* 1199 * ========================================================================== 1200 * New dynamic fit allocator - 1201 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift 1202 * contiguous blocks. If no region is found then just use the largest segment 1203 * that remains. 1204 * ========================================================================== 1205 */ 1206 1207/* 1208 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift) 1209 * to request from the allocator. 
1210 */ 1211uint64_t metaslab_ndf_clump_shift = 4; 1212 1213static uint64_t 1214metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) 1215{ 1216 avl_tree_t *t = &msp->ms_tree->rt_root; 1217 avl_index_t where; 1218 range_seg_t *rs, rsearch; 1219 uint64_t hbit = highbit64(size); 1220 uint64_t *cursor = &msp->ms_lbas[hbit - 1]; 1221 uint64_t max_size = metaslab_block_maxsize(msp); 1222 1223 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1224 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree)); 1225 1226 if (max_size < size) 1227 return (-1ULL); 1228 1229 rsearch.rs_start = *cursor; 1230 rsearch.rs_end = *cursor + size; 1231 1232 rs = avl_find(t, &rsearch, &where); 1233 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) { 1234 t = &msp->ms_size_tree; 1235 1236 rsearch.rs_start = 0; 1237 rsearch.rs_end = MIN(max_size, 1238 1ULL << (hbit + metaslab_ndf_clump_shift)); 1239 rs = avl_find(t, &rsearch, &where); 1240 if (rs == NULL) 1241 rs = avl_nearest(t, where, AVL_AFTER); 1242 ASSERT(rs != NULL); 1243 } 1244 1245 if ((rs->rs_end - rs->rs_start) >= size) { 1246 *cursor = rs->rs_start + size; 1247 return (rs->rs_start); 1248 } 1249 return (-1ULL); 1250} 1251 1252static metaslab_ops_t metaslab_ndf_ops = { 1253 metaslab_ndf_alloc 1254}; 1255 1256metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; 1257 1258/* 1259 * ========================================================================== 1260 * Metaslabs 1261 * ========================================================================== 1262 */ 1263 1264/* 1265 * Wait for any in-progress metaslab loads to complete. 
1266 */ 1267void 1268metaslab_load_wait(metaslab_t *msp) 1269{ 1270 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1271 1272 while (msp->ms_loading) { 1273 ASSERT(!msp->ms_loaded); 1274 cv_wait(&msp->ms_load_cv, &msp->ms_lock); 1275 } 1276} 1277 1278int 1279metaslab_load(metaslab_t *msp) 1280{ 1281 int error = 0; 1282 1283 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1284 ASSERT(!msp->ms_loaded); 1285 ASSERT(!msp->ms_loading); 1286 1287 msp->ms_loading = B_TRUE; 1288 1289 /* 1290 * If the space map has not been allocated yet, then treat 1291 * all the space in the metaslab as free and add it to the 1292 * ms_tree. 1293 */ 1294 if (msp->ms_sm != NULL) 1295 error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE); 1296 else 1297 range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size); 1298 1299 msp->ms_loaded = (error == 0); 1300 msp->ms_loading = B_FALSE; 1301 1302 if (msp->ms_loaded) { 1303 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1304 range_tree_walk(msp->ms_defertree[t], 1305 range_tree_remove, msp->ms_tree); 1306 } 1307 } 1308 cv_broadcast(&msp->ms_load_cv); 1309 return (error); 1310} 1311 1312void 1313metaslab_unload(metaslab_t *msp) 1314{ 1315 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1316 range_tree_vacate(msp->ms_tree, NULL, NULL); 1317 msp->ms_loaded = B_FALSE; 1318 msp->ms_weight &= ~METASLAB_ACTIVE_MASK; 1319} 1320 1321metaslab_t * 1322metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg) 1323{ 1324 vdev_t *vd = mg->mg_vd; 1325 objset_t *mos = vd->vdev_spa->spa_meta_objset; 1326 metaslab_t *msp; 1327 1328 msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); 1329 mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL); 1330 cv_init(&msp->ms_load_cv, NULL, CV_DEFAULT, NULL); 1331 msp->ms_id = id; 1332 msp->ms_start = id << vd->vdev_ms_shift; 1333 msp->ms_size = 1ULL << vd->vdev_ms_shift; 1334 1335 /* 1336 * We only open space map objects that already exist. All others 1337 * will be opened when we finally allocate an object for it. 
1338 */ 1339 if (object != 0) { 1340 VERIFY0(space_map_open(&msp->ms_sm, mos, object, msp->ms_start, 1341 msp->ms_size, vd->vdev_ashift, &msp->ms_lock)); 1342 ASSERT(msp->ms_sm != NULL); 1343 } 1344 1345 /* 1346 * We create the main range tree here, but we don't create the 1347 * alloctree and freetree until metaslab_sync_done(). This serves 1348 * two purposes: it allows metaslab_sync_done() to detect the 1349 * addition of new space; and for debugging, it ensures that we'd 1350 * data fault on any attempt to use this metaslab before it's ready. 1351 */ 1352 msp->ms_tree = range_tree_create(&metaslab_rt_ops, msp, &msp->ms_lock); 1353 metaslab_group_add(mg, msp); 1354 1355 msp->ms_fragmentation = metaslab_fragmentation(msp); 1356 msp->ms_ops = mg->mg_class->mc_ops; 1357 1358 /* 1359 * If we're opening an existing pool (txg == 0) or creating 1360 * a new one (txg == TXG_INITIAL), all space is available now. 1361 * If we're adding space to an existing pool, the new space 1362 * does not become available until after this txg has synced. 1363 */ 1364 if (txg <= TXG_INITIAL) 1365 metaslab_sync_done(msp, 0); 1366 1367 /* 1368 * If metaslab_debug_load is set and we're initializing a metaslab 1369 * that has an allocated space_map object then load the its space 1370 * map so that can verify frees. 
1371 */ 1372 if (metaslab_debug_load && msp->ms_sm != NULL) { 1373 mutex_enter(&msp->ms_lock); 1374 VERIFY0(metaslab_load(msp)); 1375 mutex_exit(&msp->ms_lock); 1376 } 1377 1378 if (txg != 0) { 1379 vdev_dirty(vd, 0, NULL, txg); 1380 vdev_dirty(vd, VDD_METASLAB, msp, txg); 1381 } 1382 1383 return (msp); 1384} 1385 1386void 1387metaslab_fini(metaslab_t *msp) 1388{ 1389 metaslab_group_t *mg = msp->ms_group; 1390 1391 metaslab_group_remove(mg, msp); 1392 1393 mutex_enter(&msp->ms_lock); 1394 1395 VERIFY(msp->ms_group == NULL); 1396 vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm), 1397 0, -msp->ms_size); 1398 space_map_close(msp->ms_sm); 1399 1400 metaslab_unload(msp); 1401 range_tree_destroy(msp->ms_tree); 1402 1403 for (int t = 0; t < TXG_SIZE; t++) { 1404 range_tree_destroy(msp->ms_alloctree[t]); 1405 range_tree_destroy(msp->ms_freetree[t]); 1406 } 1407 1408 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1409 range_tree_destroy(msp->ms_defertree[t]); 1410 } 1411 1412 ASSERT0(msp->ms_deferspace); 1413 1414 mutex_exit(&msp->ms_lock); 1415 cv_destroy(&msp->ms_load_cv); 1416 mutex_destroy(&msp->ms_lock); 1417 1418 kmem_free(msp, sizeof (metaslab_t)); 1419} 1420 1421#define FRAGMENTATION_TABLE_SIZE 17 1422 1423/* 1424 * This table defines a segment size based fragmentation metric that will 1425 * allow each metaslab to derive its own fragmentation value. This is done 1426 * by calculating the space in each bucket of the spacemap histogram and 1427 * multiplying that by the fragmetation metric in this table. Doing 1428 * this for all buckets and dividing it by the total amount of free 1429 * space in this metaslab (i.e. the total free space in all buckets) gives 1430 * us the fragmentation metric. This means that a high fragmentation metric 1431 * equates to most of the free space being comprised of small segments. 1432 * Conversely, if the metric is low, then most of the free space is in 1433 * large segments. 
A 10% change in fragmentation equates to approximately 1434 * double the number of segments. 1435 * 1436 * This table defines 0% fragmented space using 16MB segments. Testing has 1437 * shown that segments that are greater than or equal to 16MB do not suffer 1438 * from drastic performance problems. Using this value, we derive the rest 1439 * of the table. Since the fragmentation value is never stored on disk, it 1440 * is possible to change these calculations in the future. 1441 */ 1442int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { 1443 100, /* 512B */ 1444 100, /* 1K */ 1445 98, /* 2K */ 1446 95, /* 4K */ 1447 90, /* 8K */ 1448 80, /* 16K */ 1449 70, /* 32K */ 1450 60, /* 64K */ 1451 50, /* 128K */ 1452 40, /* 256K */ 1453 30, /* 512K */ 1454 20, /* 1M */ 1455 15, /* 2M */ 1456 10, /* 4M */ 1457 5, /* 8M */ 1458 0 /* 16M */ 1459}; 1460 1461/* 1462 * Calclate the metaslab's fragmentation metric. A return value 1463 * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does 1464 * not support this metric. Otherwise, the return value should be in the 1465 * range [0, 100]. 1466 */ 1467static uint64_t 1468metaslab_fragmentation(metaslab_t *msp) 1469{ 1470 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1471 uint64_t fragmentation = 0; 1472 uint64_t total = 0; 1473 boolean_t feature_enabled = spa_feature_is_enabled(spa, 1474 SPA_FEATURE_SPACEMAP_HISTOGRAM); 1475 1476 if (!feature_enabled) 1477 return (ZFS_FRAG_INVALID); 1478 1479 /* 1480 * A null space map means that the entire metaslab is free 1481 * and thus is not fragmented. 1482 */ 1483 if (msp->ms_sm == NULL) 1484 return (0); 1485 1486 /* 1487 * If this metaslab's space_map has not been upgraded, flag it 1488 * so that we upgrade next time we encounter it. 
1489 */ 1490 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { 1491 uint64_t txg = spa_syncing_txg(spa); 1492 vdev_t *vd = msp->ms_group->mg_vd; 1493 1494 if (spa_writeable(spa)) { 1495 msp->ms_condense_wanted = B_TRUE; 1496 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 1497 spa_dbgmsg(spa, "txg %llu, requesting force condense: " 1498 "msp %p, vd %p", txg, msp, vd); 1499 } 1500 return (ZFS_FRAG_INVALID); 1501 } 1502 1503 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 1504 uint64_t space = 0; 1505 uint8_t shift = msp->ms_sm->sm_shift; 1506 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, 1507 FRAGMENTATION_TABLE_SIZE - 1); 1508 1509 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) 1510 continue; 1511 1512 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); 1513 total += space; 1514 1515 ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE); 1516 fragmentation += space * zfs_frag_table[idx]; 1517 } 1518 1519 if (total > 0) 1520 fragmentation /= total; 1521 ASSERT3U(fragmentation, <=, 100); 1522 return (fragmentation); 1523} 1524 1525/* 1526 * Compute a weight -- a selection preference value -- for the given metaslab. 1527 * This is based on the amount of free space, the level of fragmentation, 1528 * the LBA range, and whether the metaslab is loaded. 1529 */ 1530static uint64_t 1531metaslab_weight(metaslab_t *msp) 1532{ 1533 metaslab_group_t *mg = msp->ms_group; 1534 vdev_t *vd = mg->mg_vd; 1535 uint64_t weight, space; 1536 1537 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1538 1539 /* 1540 * This vdev is in the process of being removed so there is nothing 1541 * for us to do here. 1542 */ 1543 if (vd->vdev_removing) { 1544 ASSERT0(space_map_allocated(msp->ms_sm)); 1545 ASSERT0(vd->vdev_ms_shift); 1546 return (0); 1547 } 1548 1549 /* 1550 * The baseline weight is the metaslab's free space. 
1551 */ 1552 space = msp->ms_size - space_map_allocated(msp->ms_sm); 1553 1554 msp->ms_fragmentation = metaslab_fragmentation(msp); 1555 if (metaslab_fragmentation_factor_enabled && 1556 msp->ms_fragmentation != ZFS_FRAG_INVALID) { 1557 /* 1558 * Use the fragmentation information to inversely scale 1559 * down the baseline weight. We need to ensure that we 1560 * don't exclude this metaslab completely when it's 100% 1561 * fragmented. To avoid this we reduce the fragmented value 1562 * by 1. 1563 */ 1564 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100; 1565 1566 /* 1567 * If space < SPA_MINBLOCKSIZE, then we will not allocate from 1568 * this metaslab again. The fragmentation metric may have 1569 * decreased the space to something smaller than 1570 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE 1571 * so that we can consume any remaining space. 1572 */ 1573 if (space > 0 && space < SPA_MINBLOCKSIZE) 1574 space = SPA_MINBLOCKSIZE; 1575 } 1576 weight = space; 1577 1578 /* 1579 * Modern disks have uniform bit density and constant angular velocity. 1580 * Therefore, the outer recording zones are faster (higher bandwidth) 1581 * than the inner zones by the ratio of outer to inner track diameter, 1582 * which is typically around 2:1. We account for this by assigning 1583 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). 1584 * In effect, this means that we'll select the metaslab with the most 1585 * free bandwidth rather than simply the one with the most free space. 1586 */ 1587 if (metaslab_lba_weighting_enabled) { 1588 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; 1589 ASSERT(weight >= space && weight <= 2 * space); 1590 } 1591 1592 /* 1593 * If this metaslab is one we're actively using, adjust its 1594 * weight to make it preferable to any inactive metaslab so 1595 * we'll polish it off. If the fragmentation on this metaslab 1596 * has exceed our threshold, then don't mark it active. 
1597 */ 1598 if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID && 1599 msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) { 1600 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); 1601 } 1602 1603 return (weight); 1604} 1605 1606static int 1607metaslab_activate(metaslab_t *msp, uint64_t activation_weight) 1608{ 1609 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1610 1611 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { 1612 metaslab_load_wait(msp); 1613 if (!msp->ms_loaded) { 1614 int error = metaslab_load(msp); 1615 if (error) { 1616 metaslab_group_sort(msp->ms_group, msp, 0); 1617 return (error); 1618 } 1619 } 1620 1621 metaslab_group_sort(msp->ms_group, msp, 1622 msp->ms_weight | activation_weight); 1623 } 1624 ASSERT(msp->ms_loaded); 1625 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); 1626 1627 return (0); 1628} 1629 1630static void 1631metaslab_passivate(metaslab_t *msp, uint64_t size) 1632{ 1633 /* 1634 * If size < SPA_MINBLOCKSIZE, then we will not allocate from 1635 * this metaslab again. In that case, it had better be empty, 1636 * or we would be leaving space on the table. 1637 */ 1638 ASSERT(size >= SPA_MINBLOCKSIZE || range_tree_space(msp->ms_tree) == 0); 1639 metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size)); 1640 ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); 1641} 1642 1643static void 1644metaslab_preload(void *arg) 1645{ 1646 metaslab_t *msp = arg; 1647 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1648 1649 ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); 1650 1651 mutex_enter(&msp->ms_lock); 1652 metaslab_load_wait(msp); 1653 if (!msp->ms_loaded) 1654 (void) metaslab_load(msp); 1655 1656 /* 1657 * Set the ms_access_txg value so that we don't unload it right away. 
1658 */ 1659 msp->ms_access_txg = spa_syncing_txg(spa) + metaslab_unload_delay + 1; 1660 mutex_exit(&msp->ms_lock); 1661} 1662 1663static void 1664metaslab_group_preload(metaslab_group_t *mg) 1665{ 1666 spa_t *spa = mg->mg_vd->vdev_spa; 1667 metaslab_t *msp; 1668 avl_tree_t *t = &mg->mg_metaslab_tree; 1669 int m = 0; 1670 1671 if (spa_shutting_down(spa) || !metaslab_preload_enabled) { 1672 taskq_wait(mg->mg_taskq); 1673 return; 1674 } 1675 1676 mutex_enter(&mg->mg_lock); 1677 /* 1678 * Load the next potential metaslabs 1679 */ 1680 msp = avl_first(t); 1681 while (msp != NULL) { 1682 metaslab_t *msp_next = AVL_NEXT(t, msp); 1683 1684 /* 1685 * We preload only the maximum number of metaslabs specified 1686 * by metaslab_preload_limit. If a metaslab is being forced 1687 * to condense then we preload it too. This will ensure 1688 * that force condensing happens in the next txg. 1689 */ 1690 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { 1691 msp = msp_next; 1692 continue; 1693 } 1694 1695 /* 1696 * We must drop the metaslab group lock here to preserve 1697 * lock ordering with the ms_lock (when grabbing both 1698 * the mg_lock and the ms_lock, the ms_lock must be taken 1699 * first). As a result, it is possible that the ordering 1700 * of the metaslabs within the avl tree may change before 1701 * we reacquire the lock. The metaslab cannot be removed from 1702 * the tree while we're in syncing context so it is safe to 1703 * drop the mg_lock here. If the metaslabs are reordered 1704 * nothing will break -- we just may end up loading a 1705 * less than optimal one. 1706 */ 1707 mutex_exit(&mg->mg_lock); 1708 VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload, 1709 msp, TQ_SLEEP) != 0); 1710 mutex_enter(&mg->mg_lock); 1711 msp = msp_next; 1712 } 1713 mutex_exit(&mg->mg_lock); 1714} 1715 1716/* 1717 * Determine if the space map's on-disk footprint is past our tolerance 1718 * for inefficiency. 
* We would like to use the following criteria to make
 * our decision:
 *
 * 1. The size of the space map object should not dramatically increase as a
 * result of writing out the free space range tree.
 *
 * 2. The minimal on-disk space map representation is zfs_condense_pct/100
 * times the size of the free space range tree representation
 * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
 *
 * 3. The on-disk size of the space map should actually decrease.
 *
 * Checking the first condition is tricky since we don't want to walk
 * the entire AVL tree calculating the estimated on-disk size. Instead we
 * use the size-ordered range tree in the metaslab and calculate the
 * size required to write out the largest segment in our free tree. If the
 * size required to represent that segment on disk is larger than the space
 * map object then we avoid condensing this map.
 *
 * To determine the second criterion we use a best-case estimate and assume
 * each segment can be represented on-disk as a single 64-bit entry. We refer
 * to this best-case estimate as the space map's minimal form.
 *
 * Unfortunately, we cannot compute the on-disk size of the space map in this
 * context because we cannot accurately compute the effects of compression, etc.
 * Instead, we apply the heuristic described in the block comment for
 * zfs_metaslab_condense_block_threshold - we only condense if the space used
 * is greater than a threshold number of blocks.
1746 */ 1747static boolean_t 1748metaslab_should_condense(metaslab_t *msp) 1749{ 1750 space_map_t *sm = msp->ms_sm; 1751 range_seg_t *rs; 1752 uint64_t size, entries, segsz, object_size, optimal_size, record_size; 1753 dmu_object_info_t doi; 1754 uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift; 1755 1756 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1757 ASSERT(msp->ms_loaded); 1758 1759 /* 1760 * Use the ms_size_tree range tree, which is ordered by size, to 1761 * obtain the largest segment in the free tree. We always condense 1762 * metaslabs that are empty and metaslabs for which a condense 1763 * request has been made. 1764 */ 1765 rs = avl_last(&msp->ms_size_tree); 1766 if (rs == NULL || msp->ms_condense_wanted) 1767 return (B_TRUE); 1768 1769 /* 1770 * Calculate the number of 64-bit entries this segment would 1771 * require when written to disk. If this single segment would be 1772 * larger on-disk than the entire current on-disk structure, then 1773 * clearly condensing will increase the on-disk structure size. 1774 */ 1775 size = (rs->rs_end - rs->rs_start) >> sm->sm_shift; 1776 entries = size / (MIN(size, SM_RUN_MAX)); 1777 segsz = entries * sizeof (uint64_t); 1778 1779 optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root); 1780 object_size = space_map_length(msp->ms_sm); 1781 1782 dmu_object_info_from_db(sm->sm_dbuf, &doi); 1783 record_size = MAX(doi.doi_data_block_size, vdev_blocksize); 1784 1785 return (segsz <= object_size && 1786 object_size >= (optimal_size * zfs_condense_pct / 100) && 1787 object_size > zfs_metaslab_condense_block_threshold * record_size); 1788} 1789 1790/* 1791 * Condense the on-disk space map representation to its minimized form. 1792 * The minimized form consists of a small number of allocations followed by 1793 * the entries of the free range tree. 
1794 */ 1795static void 1796metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) 1797{ 1798 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1799 range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK]; 1800 range_tree_t *condense_tree; 1801 space_map_t *sm = msp->ms_sm; 1802 1803 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1804 ASSERT3U(spa_sync_pass(spa), ==, 1); 1805 ASSERT(msp->ms_loaded); 1806 1807 1808 spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, " 1809 "smp size %llu, segments %lu, forcing condense=%s", txg, 1810 msp->ms_id, msp, space_map_length(msp->ms_sm), 1811 avl_numnodes(&msp->ms_tree->rt_root), 1812 msp->ms_condense_wanted ? "TRUE" : "FALSE"); 1813 1814 msp->ms_condense_wanted = B_FALSE; 1815 1816 /* 1817 * Create an range tree that is 100% allocated. We remove segments 1818 * that have been freed in this txg, any deferred frees that exist, 1819 * and any allocation in the future. Removing segments should be 1820 * a relatively inexpensive operation since we expect these trees to 1821 * have a small number of nodes. 1822 */ 1823 condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock); 1824 range_tree_add(condense_tree, msp->ms_start, msp->ms_size); 1825 1826 /* 1827 * Remove what's been freed in this txg from the condense_tree. 1828 * Since we're in sync_pass 1, we know that all the frees from 1829 * this txg are in the freetree. 1830 */ 1831 range_tree_walk(freetree, range_tree_remove, condense_tree); 1832 1833 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1834 range_tree_walk(msp->ms_defertree[t], 1835 range_tree_remove, condense_tree); 1836 } 1837 1838 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 1839 range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK], 1840 range_tree_remove, condense_tree); 1841 } 1842 1843 /* 1844 * We're about to drop the metaslab's lock thus allowing 1845 * other consumers to change it's content. 
Set the 1846 * metaslab's ms_condensing flag to ensure that 1847 * allocations on this metaslab do not occur while we're 1848 * in the middle of committing it to disk. This is only critical 1849 * for the ms_tree as all other range trees use per txg 1850 * views of their content. 1851 */ 1852 msp->ms_condensing = B_TRUE; 1853 1854 mutex_exit(&msp->ms_lock); 1855 space_map_truncate(sm, tx); 1856 mutex_enter(&msp->ms_lock); 1857 1858 /* 1859 * While we would ideally like to create a space_map representation 1860 * that consists only of allocation records, doing so can be 1861 * prohibitively expensive because the in-core free tree can be 1862 * large, and therefore computationally expensive to subtract 1863 * from the condense_tree. Instead we sync out two trees, a cheap 1864 * allocation only tree followed by the in-core free tree. While not 1865 * optimal, this is typically close to optimal, and much cheaper to 1866 * compute. 1867 */ 1868 space_map_write(sm, condense_tree, SM_ALLOC, tx); 1869 range_tree_vacate(condense_tree, NULL, NULL); 1870 range_tree_destroy(condense_tree); 1871 1872 space_map_write(sm, msp->ms_tree, SM_FREE, tx); 1873 msp->ms_condensing = B_FALSE; 1874} 1875 1876/* 1877 * Write a metaslab to disk in the context of the specified transaction group. 1878 */ 1879void 1880metaslab_sync(metaslab_t *msp, uint64_t txg) 1881{ 1882 metaslab_group_t *mg = msp->ms_group; 1883 vdev_t *vd = mg->mg_vd; 1884 spa_t *spa = vd->vdev_spa; 1885 objset_t *mos = spa_meta_objset(spa); 1886 range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK]; 1887 range_tree_t **freetree = &msp->ms_freetree[txg & TXG_MASK]; 1888 range_tree_t **freed_tree = 1889 &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK]; 1890 dmu_tx_t *tx; 1891 uint64_t object = space_map_object(msp->ms_sm); 1892 1893 ASSERT(!vd->vdev_ishole); 1894 1895 /* 1896 * This metaslab has just been added so there's no work to do now. 
1897 */ 1898 if (*freetree == NULL) { 1899 ASSERT3P(alloctree, ==, NULL); 1900 return; 1901 } 1902 1903 ASSERT3P(alloctree, !=, NULL); 1904 ASSERT3P(*freetree, !=, NULL); 1905 ASSERT3P(*freed_tree, !=, NULL); 1906 1907 /* 1908 * Normally, we don't want to process a metaslab if there 1909 * are no allocations or frees to perform. However, if the metaslab 1910 * is being forced to condense we need to let it through. 1911 */ 1912 if (range_tree_space(alloctree) == 0 && 1913 range_tree_space(*freetree) == 0 && 1914 !msp->ms_condense_wanted) 1915 return; 1916 1917 /* 1918 * The only state that can actually be changing concurrently with 1919 * metaslab_sync() is the metaslab's ms_tree. No other thread can 1920 * be modifying this txg's alloctree, freetree, freed_tree, or 1921 * space_map_phys_t. Therefore, we only hold ms_lock to satify 1922 * space_map ASSERTs. We drop it whenever we call into the DMU, 1923 * because the DMU can call down to us (e.g. via zio_free()) at 1924 * any time. 1925 */ 1926 1927 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 1928 1929 if (msp->ms_sm == NULL) { 1930 uint64_t new_object; 1931 1932 new_object = space_map_alloc(mos, tx); 1933 VERIFY3U(new_object, !=, 0); 1934 1935 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, 1936 msp->ms_start, msp->ms_size, vd->vdev_ashift, 1937 &msp->ms_lock)); 1938 ASSERT(msp->ms_sm != NULL); 1939 } 1940 1941 mutex_enter(&msp->ms_lock); 1942 1943 /* 1944 * Note: metaslab_condense() clears the space_map's histogram. 1945 * Therefore we must verify and remove this histogram before 1946 * condensing. 
1947 */ 1948 metaslab_group_histogram_verify(mg); 1949 metaslab_class_histogram_verify(mg->mg_class); 1950 metaslab_group_histogram_remove(mg, msp); 1951 1952 if (msp->ms_loaded && spa_sync_pass(spa) == 1 && 1953 metaslab_should_condense(msp)) { 1954 metaslab_condense(msp, txg, tx); 1955 } else { 1956 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx); 1957 space_map_write(msp->ms_sm, *freetree, SM_FREE, tx); 1958 } 1959 1960 if (msp->ms_loaded) { 1961 /* 1962 * When the space map is loaded, we have an accruate 1963 * histogram in the range tree. This gives us an opportunity 1964 * to bring the space map's histogram up-to-date so we clear 1965 * it first before updating it. 1966 */ 1967 space_map_histogram_clear(msp->ms_sm); 1968 space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx); 1969 } else { 1970 /* 1971 * Since the space map is not loaded we simply update the 1972 * exisiting histogram with what was freed in this txg. This 1973 * means that the on-disk histogram may not have an accurate 1974 * view of the free space but it's close enough to allow 1975 * us to make allocation decisions. 1976 */ 1977 space_map_histogram_add(msp->ms_sm, *freetree, tx); 1978 } 1979 metaslab_group_histogram_add(mg, msp); 1980 metaslab_group_histogram_verify(mg); 1981 metaslab_class_histogram_verify(mg->mg_class); 1982 1983 /* 1984 * For sync pass 1, we avoid traversing this txg's free range tree 1985 * and instead will just swap the pointers for freetree and 1986 * freed_tree. We can safely do this since the freed_tree is 1987 * guaranteed to be empty on the initial pass. 
1988 */ 1989 if (spa_sync_pass(spa) == 1) { 1990 range_tree_swap(freetree, freed_tree); 1991 } else { 1992 range_tree_vacate(*freetree, range_tree_add, *freed_tree); 1993 } 1994 range_tree_vacate(alloctree, NULL, NULL); 1995 1996 ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); 1997 ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK])); 1998 1999 mutex_exit(&msp->ms_lock); 2000 2001 if (object != space_map_object(msp->ms_sm)) { 2002 object = space_map_object(msp->ms_sm); 2003 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * 2004 msp->ms_id, sizeof (uint64_t), &object, tx); 2005 } 2006 dmu_tx_commit(tx); 2007} 2008 2009/* 2010 * Called after a transaction group has completely synced to mark 2011 * all of the metaslab's free space as usable. 2012 */ 2013void 2014metaslab_sync_done(metaslab_t *msp, uint64_t txg) 2015{ 2016 metaslab_group_t *mg = msp->ms_group; 2017 vdev_t *vd = mg->mg_vd; 2018 range_tree_t **freed_tree; 2019 range_tree_t **defer_tree; 2020 int64_t alloc_delta, defer_delta; 2021 2022 ASSERT(!vd->vdev_ishole); 2023 2024 mutex_enter(&msp->ms_lock); 2025 2026 /* 2027 * If this metaslab is just becoming available, initialize its 2028 * alloctrees, freetrees, and defertree and add its capacity to 2029 * the vdev. 
2030 */ 2031 if (msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK] == NULL) { 2032 for (int t = 0; t < TXG_SIZE; t++) { 2033 ASSERT(msp->ms_alloctree[t] == NULL); 2034 ASSERT(msp->ms_freetree[t] == NULL); 2035 2036 msp->ms_alloctree[t] = range_tree_create(NULL, msp, 2037 &msp->ms_lock); 2038 msp->ms_freetree[t] = range_tree_create(NULL, msp, 2039 &msp->ms_lock); 2040 } 2041 2042 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2043 ASSERT(msp->ms_defertree[t] == NULL); 2044 2045 msp->ms_defertree[t] = range_tree_create(NULL, msp, 2046 &msp->ms_lock); 2047 } 2048 2049 vdev_space_update(vd, 0, 0, msp->ms_size); 2050 } 2051 2052 freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK]; 2053 defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE]; 2054 2055 alloc_delta = space_map_alloc_delta(msp->ms_sm); 2056 defer_delta = range_tree_space(*freed_tree) - 2057 range_tree_space(*defer_tree); 2058 2059 vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); 2060 2061 ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); 2062 ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK])); 2063 2064 /* 2065 * If there's a metaslab_load() in progress, wait for it to complete 2066 * so that we have a consistent view of the in-core space map. 2067 */ 2068 metaslab_load_wait(msp); 2069 2070 /* 2071 * Move the frees from the defer_tree back to the free 2072 * range tree (if it's loaded). Swap the freed_tree and the 2073 * defer_tree -- this is safe to do because we've just emptied out 2074 * the defer_tree. 2075 */ 2076 range_tree_vacate(*defer_tree, 2077 msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree); 2078 range_tree_swap(freed_tree, defer_tree); 2079 2080 space_map_update(msp->ms_sm); 2081 2082 msp->ms_deferspace += defer_delta; 2083 ASSERT3S(msp->ms_deferspace, >=, 0); 2084 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); 2085 if (msp->ms_deferspace != 0) { 2086 /* 2087 * Keep syncing this metaslab until all deferred frees 2088 * are back in circulation. 
2089 */ 2090 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 2091 } 2092 2093 if (msp->ms_loaded && msp->ms_access_txg < txg) { 2094 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 2095 VERIFY0(range_tree_space( 2096 msp->ms_alloctree[(txg + t) & TXG_MASK])); 2097 } 2098 2099 if (!metaslab_debug_unload) 2100 metaslab_unload(msp); 2101 } 2102 2103 metaslab_group_sort(mg, msp, metaslab_weight(msp)); 2104 mutex_exit(&msp->ms_lock); 2105} 2106 2107void 2108metaslab_sync_reassess(metaslab_group_t *mg) 2109{ 2110 metaslab_group_alloc_update(mg); 2111 mg->mg_fragmentation = metaslab_group_fragmentation(mg); 2112 2113 /* 2114 * Preload the next potential metaslabs 2115 */ 2116 metaslab_group_preload(mg); 2117} 2118 2119static uint64_t 2120metaslab_distance(metaslab_t *msp, dva_t *dva) 2121{ 2122 uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift; 2123 uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift; 2124 uint64_t start = msp->ms_id; 2125 2126 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) 2127 return (1ULL << 63); 2128 2129 if (offset < start) 2130 return ((start - offset) << ms_shift); 2131 if (offset > start) 2132 return ((offset - start) << ms_shift); 2133 return (0); 2134} 2135 2136static uint64_t 2137metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, 2138 uint64_t txg, uint64_t min_distance, dva_t *dva, int d) 2139{ 2140 spa_t *spa = mg->mg_vd->vdev_spa; 2141 metaslab_t *msp = NULL; 2142 uint64_t offset = -1ULL; 2143 avl_tree_t *t = &mg->mg_metaslab_tree; 2144 uint64_t activation_weight; 2145 uint64_t target_distance; 2146 int i; 2147 2148 activation_weight = METASLAB_WEIGHT_PRIMARY; 2149 for (i = 0; i < d; i++) { 2150 if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 2151 activation_weight = METASLAB_WEIGHT_SECONDARY; 2152 break; 2153 } 2154 } 2155 2156 for (;;) { 2157 boolean_t was_active; 2158 2159 mutex_enter(&mg->mg_lock); 2160 for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { 2161 if (msp->ms_weight < asize) { 2162 
spa_dbgmsg(spa, "%s: failed to meet weight " 2163 "requirement: vdev %llu, txg %llu, mg %p, " 2164 "msp %p, psize %llu, asize %llu, " 2165 "weight %llu", spa_name(spa), 2166 mg->mg_vd->vdev_id, txg, 2167 mg, msp, psize, asize, msp->ms_weight); 2168 mutex_exit(&mg->mg_lock); 2169 return (-1ULL); 2170 } 2171 2172 /* 2173 * If the selected metaslab is condensing, skip it. 2174 */ 2175 if (msp->ms_condensing) 2176 continue; 2177 2178 was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; 2179 if (activation_weight == METASLAB_WEIGHT_PRIMARY) 2180 break; 2181 2182 target_distance = min_distance + 2183 (space_map_allocated(msp->ms_sm) != 0 ? 0 : 2184 min_distance >> 1); 2185 2186 for (i = 0; i < d; i++) 2187 if (metaslab_distance(msp, &dva[i]) < 2188 target_distance) 2189 break; 2190 if (i == d) 2191 break; 2192 } 2193 mutex_exit(&mg->mg_lock); 2194 if (msp == NULL) 2195 return (-1ULL); 2196 2197 mutex_enter(&msp->ms_lock); 2198 2199 /* 2200 * Ensure that the metaslab we have selected is still 2201 * capable of handling our request. It's possible that 2202 * another thread may have changed the weight while we 2203 * were blocked on the metaslab lock. 2204 */ 2205 if (msp->ms_weight < asize || (was_active && 2206 !(msp->ms_weight & METASLAB_ACTIVE_MASK) && 2207 activation_weight == METASLAB_WEIGHT_PRIMARY)) { 2208 mutex_exit(&msp->ms_lock); 2209 continue; 2210 } 2211 2212 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) && 2213 activation_weight == METASLAB_WEIGHT_PRIMARY) { 2214 metaslab_passivate(msp, 2215 msp->ms_weight & ~METASLAB_ACTIVE_MASK); 2216 mutex_exit(&msp->ms_lock); 2217 continue; 2218 } 2219 2220 if (metaslab_activate(msp, activation_weight) != 0) { 2221 mutex_exit(&msp->ms_lock); 2222 continue; 2223 } 2224 2225 /* 2226 * If this metaslab is currently condensing then pick again as 2227 * we can't manipulate this metaslab until it's committed 2228 * to disk. 
2229 */ 2230 if (msp->ms_condensing) { 2231 mutex_exit(&msp->ms_lock); 2232 continue; 2233 } 2234 2235 if ((offset = metaslab_block_alloc(msp, asize)) != -1ULL) 2236 break; 2237 2238 metaslab_passivate(msp, metaslab_block_maxsize(msp)); 2239 mutex_exit(&msp->ms_lock); 2240 } 2241 2242 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) 2243 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); 2244 2245 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize); 2246 msp->ms_access_txg = txg + metaslab_unload_delay; 2247 2248 mutex_exit(&msp->ms_lock); 2249 2250 return (offset); 2251} 2252 2253/* 2254 * Allocate a block for the specified i/o. 2255 */ 2256static int 2257metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, 2258 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags) 2259{ 2260 metaslab_group_t *mg, *rotor; 2261 vdev_t *vd; 2262 int dshift = 3; 2263 int all_zero; 2264 int zio_lock = B_FALSE; 2265 boolean_t allocatable; 2266 uint64_t offset = -1ULL; 2267 uint64_t asize; 2268 uint64_t distance; 2269 2270 ASSERT(!DVA_IS_VALID(&dva[d])); 2271 2272 /* 2273 * For testing, make some blocks above a certain size be gang blocks. 2274 */ 2275 if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) 2276 return (SET_ERROR(ENOSPC)); 2277 2278 /* 2279 * Start at the rotor and loop through all mgs until we find something. 2280 * Note that there's no locking on mc_rotor or mc_aliquot because 2281 * nothing actually breaks if we miss a few updates -- we just won't 2282 * allocate quite as evenly. It all balances out over time. 2283 * 2284 * If we are doing ditto or log blocks, try to spread them across 2285 * consecutive vdevs. If we're forced to reuse a vdev before we've 2286 * allocated all of our ditto blocks, then try and spread them out on 2287 * that vdev as much as possible. If it turns out to not be possible, 2288 * gradually lower our standards until anything becomes acceptable. 
2289 * Also, allocating on consecutive vdevs (as opposed to random vdevs) 2290 * gives us hope of containing our fault domains to something we're 2291 * able to reason about. Otherwise, any two top-level vdev failures 2292 * will guarantee the loss of data. With consecutive allocation, 2293 * only two adjacent top-level vdev failures will result in data loss. 2294 * 2295 * If we are doing gang blocks (hintdva is non-NULL), try to keep 2296 * ourselves on the same vdev as our gang block header. That 2297 * way, we can hope for locality in vdev_cache, plus it makes our 2298 * fault domains something tractable. 2299 */ 2300 if (hintdva) { 2301 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); 2302 2303 /* 2304 * It's possible the vdev we're using as the hint no 2305 * longer exists (i.e. removed). Consult the rotor when 2306 * all else fails. 2307 */ 2308 if (vd != NULL) { 2309 mg = vd->vdev_mg; 2310 2311 if (flags & METASLAB_HINTBP_AVOID && 2312 mg->mg_next != NULL) 2313 mg = mg->mg_next; 2314 } else { 2315 mg = mc->mc_rotor; 2316 } 2317 } else if (d != 0) { 2318 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); 2319 mg = vd->vdev_mg->mg_next; 2320 } else { 2321 mg = mc->mc_rotor; 2322 } 2323 2324 /* 2325 * If the hint put us into the wrong metaslab class, or into a 2326 * metaslab group that has been passivated, just follow the rotor. 2327 */ 2328 if (mg->mg_class != mc || mg->mg_activation_count <= 0) 2329 mg = mc->mc_rotor; 2330 2331 rotor = mg; 2332top: 2333 all_zero = B_TRUE; 2334 do { 2335 ASSERT(mg->mg_activation_count == 1); 2336 2337 vd = mg->mg_vd; 2338 2339 /* 2340 * Don't allocate from faulted devices. 2341 */ 2342 if (zio_lock) { 2343 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); 2344 allocatable = vdev_allocatable(vd); 2345 spa_config_exit(spa, SCL_ZIO, FTAG); 2346 } else { 2347 allocatable = vdev_allocatable(vd); 2348 } 2349 2350 /* 2351 * Determine if the selected metaslab group is eligible 2352 * for allocations. 
If we're ganging or have requested 2353 * an allocation for the smallest gang block size 2354 * then we don't want to avoid allocating to the this 2355 * metaslab group. If we're in this condition we should 2356 * try to allocate from any device possible so that we 2357 * don't inadvertently return ENOSPC and suspend the pool 2358 * even though space is still available. 2359 */ 2360 if (allocatable && CAN_FASTGANG(flags) && 2361 psize > SPA_GANGBLOCKSIZE) 2362 allocatable = metaslab_group_allocatable(mg); 2363 2364 if (!allocatable) 2365 goto next; 2366 2367 /* 2368 * Avoid writing single-copy data to a failing vdev 2369 * unless the user instructs us that it is okay. 2370 */ 2371 if ((vd->vdev_stat.vs_write_errors > 0 || 2372 vd->vdev_state < VDEV_STATE_HEALTHY) && 2373 d == 0 && dshift == 3 && vd->vdev_children == 0) { 2374 all_zero = B_FALSE; 2375 goto next; 2376 } 2377 2378 ASSERT(mg->mg_class == mc); 2379 2380 distance = vd->vdev_asize >> dshift; 2381 if (distance <= (1ULL << vd->vdev_ms_shift)) 2382 distance = 0; 2383 else 2384 all_zero = B_FALSE; 2385 2386 asize = vdev_psize_to_asize(vd, psize); 2387 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); 2388 2389 offset = metaslab_group_alloc(mg, psize, asize, txg, distance, 2390 dva, d); 2391 if (offset != -1ULL) { 2392 /* 2393 * If we've just selected this metaslab group, 2394 * figure out whether the corresponding vdev is 2395 * over- or under-used relative to the pool, 2396 * and set an allocation bias to even it out. 2397 */ 2398 if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { 2399 vdev_stat_t *vs = &vd->vdev_stat; 2400 int64_t vu, cu; 2401 2402 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); 2403 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); 2404 2405 /* 2406 * Calculate how much more or less we should 2407 * try to allocate from this device during 2408 * this iteration around the rotor. 
2409 * For example, if a device is 80% full 2410 * and the pool is 20% full then we should 2411 * reduce allocations by 60% on this device. 2412 * 2413 * mg_bias = (20 - 80) * 512K / 100 = -307K 2414 * 2415 * This reduces allocations by 307K for this 2416 * iteration. 2417 */ 2418 mg->mg_bias = ((cu - vu) * 2419 (int64_t)mg->mg_aliquot) / 100; 2420 } else if (!metaslab_bias_enabled) { 2421 mg->mg_bias = 0; 2422 } 2423 2424 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= 2425 mg->mg_aliquot + mg->mg_bias) { 2426 mc->mc_rotor = mg->mg_next; 2427 mc->mc_aliquot = 0; 2428 } 2429 2430 DVA_SET_VDEV(&dva[d], vd->vdev_id); 2431 DVA_SET_OFFSET(&dva[d], offset); 2432 DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); 2433 DVA_SET_ASIZE(&dva[d], asize); 2434 2435 return (0); 2436 } 2437next: 2438 mc->mc_rotor = mg->mg_next; 2439 mc->mc_aliquot = 0; 2440 } while ((mg = mg->mg_next) != rotor); 2441 2442 if (!all_zero) { 2443 dshift++; 2444 ASSERT(dshift < 64); 2445 goto top; 2446 } 2447 2448 if (!allocatable && !zio_lock) { 2449 dshift = 3; 2450 zio_lock = B_TRUE; 2451 goto top; 2452 } 2453 2454 bzero(&dva[d], sizeof (dva_t)); 2455 2456 return (SET_ERROR(ENOSPC)); 2457} 2458 2459/* 2460 * Free the block represented by DVA in the context of the specified 2461 * transaction group. 
2462 */ 2463static void 2464metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now) 2465{ 2466 uint64_t vdev = DVA_GET_VDEV(dva); 2467 uint64_t offset = DVA_GET_OFFSET(dva); 2468 uint64_t size = DVA_GET_ASIZE(dva); 2469 vdev_t *vd; 2470 metaslab_t *msp; 2471 2472 ASSERT(DVA_IS_VALID(dva)); 2473 2474 if (txg > spa_freeze_txg(spa)) 2475 return; 2476 2477 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 2478 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { 2479 cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", 2480 (u_longlong_t)vdev, (u_longlong_t)offset); 2481 ASSERT(0); 2482 return; 2483 } 2484 2485 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 2486 2487 if (DVA_GET_GANG(dva)) 2488 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 2489 2490 mutex_enter(&msp->ms_lock); 2491 2492 if (now) { 2493 range_tree_remove(msp->ms_alloctree[txg & TXG_MASK], 2494 offset, size); 2495 2496 VERIFY(!msp->ms_condensing); 2497 VERIFY3U(offset, >=, msp->ms_start); 2498 VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); 2499 VERIFY3U(range_tree_space(msp->ms_tree) + size, <=, 2500 msp->ms_size); 2501 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 2502 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 2503 range_tree_add(msp->ms_tree, offset, size); 2504 } else { 2505 if (range_tree_space(msp->ms_freetree[txg & TXG_MASK]) == 0) 2506 vdev_dirty(vd, VDD_METASLAB, msp, txg); 2507 range_tree_add(msp->ms_freetree[txg & TXG_MASK], 2508 offset, size); 2509 } 2510 2511 mutex_exit(&msp->ms_lock); 2512} 2513 2514/* 2515 * Intent log support: upon opening the pool after a crash, notify the SPA 2516 * of blocks that the intent log has allocated for immediate write, but 2517 * which are still considered free by the SPA because the last transaction 2518 * group didn't commit yet. 
2519 */ 2520static int 2521metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) 2522{ 2523 uint64_t vdev = DVA_GET_VDEV(dva); 2524 uint64_t offset = DVA_GET_OFFSET(dva); 2525 uint64_t size = DVA_GET_ASIZE(dva); 2526 vdev_t *vd; 2527 metaslab_t *msp; 2528 int error = 0; 2529 2530 ASSERT(DVA_IS_VALID(dva)); 2531 2532 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 2533 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) 2534 return (SET_ERROR(ENXIO)); 2535 2536 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 2537 2538 if (DVA_GET_GANG(dva)) 2539 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 2540 2541 mutex_enter(&msp->ms_lock); 2542 2543 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) 2544 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); 2545 2546 if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size)) 2547 error = SET_ERROR(ENOENT); 2548 2549 if (error || txg == 0) { /* txg == 0 indicates dry run */ 2550 mutex_exit(&msp->ms_lock); 2551 return (error); 2552 } 2553 2554 VERIFY(!msp->ms_condensing); 2555 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 2556 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 2557 VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size); 2558 range_tree_remove(msp->ms_tree, offset, size); 2559 2560 if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ 2561 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) 2562 vdev_dirty(vd, VDD_METASLAB, msp, txg); 2563 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size); 2564 } 2565 2566 mutex_exit(&msp->ms_lock); 2567 2568 return (0); 2569} 2570 2571int 2572metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, 2573 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags) 2574{ 2575 dva_t *dva = bp->blk_dva; 2576 dva_t *hintdva = hintbp->blk_dva; 2577 int error = 0; 2578 2579 ASSERT(bp->blk_birth == 0); 2580 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); 2581 2582 spa_config_enter(spa, SCL_ALLOC, FTAG, 
RW_READER); 2583 2584 if (mc->mc_rotor == NULL) { /* no vdevs in this class */ 2585 spa_config_exit(spa, SCL_ALLOC, FTAG); 2586 return (SET_ERROR(ENOSPC)); 2587 } 2588 2589 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); 2590 ASSERT(BP_GET_NDVAS(bp) == 0); 2591 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); 2592 2593 for (int d = 0; d < ndvas; d++) { 2594 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, 2595 txg, flags); 2596 if (error != 0) { 2597 for (d--; d >= 0; d--) { 2598 metaslab_free_dva(spa, &dva[d], txg, B_TRUE); 2599 bzero(&dva[d], sizeof (dva_t)); 2600 } 2601 spa_config_exit(spa, SCL_ALLOC, FTAG); 2602 return (error); 2603 } 2604 } 2605 ASSERT(error == 0); 2606 ASSERT(BP_GET_NDVAS(bp) == ndvas); 2607 2608 spa_config_exit(spa, SCL_ALLOC, FTAG); 2609 2610 BP_SET_BIRTH(bp, txg, txg); 2611 2612 return (0); 2613} 2614 2615void 2616metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) 2617{ 2618 const dva_t *dva = bp->blk_dva; 2619 int ndvas = BP_GET_NDVAS(bp); 2620 2621 ASSERT(!BP_IS_HOLE(bp)); 2622 ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); 2623 2624 spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); 2625 2626 for (int d = 0; d < ndvas; d++) 2627 metaslab_free_dva(spa, &dva[d], txg, now); 2628 2629 spa_config_exit(spa, SCL_FREE, FTAG); 2630} 2631 2632int 2633metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) 2634{ 2635 const dva_t *dva = bp->blk_dva; 2636 int ndvas = BP_GET_NDVAS(bp); 2637 int error = 0; 2638 2639 ASSERT(!BP_IS_HOLE(bp)); 2640 2641 if (txg != 0) { 2642 /* 2643 * First do a dry run to make sure all DVAs are claimable, 2644 * so we don't have to unwind from partial failures below. 
2645 */ 2646 if ((error = metaslab_claim(spa, bp, 0)) != 0) 2647 return (error); 2648 } 2649 2650 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 2651 2652 for (int d = 0; d < ndvas; d++) 2653 if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) 2654 break; 2655 2656 spa_config_exit(spa, SCL_ALLOC, FTAG); 2657 2658 ASSERT(error == 0 || txg == 0); 2659 2660 return (error); 2661} 2662 2663void 2664metaslab_check_free(spa_t *spa, const blkptr_t *bp) 2665{ 2666 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) 2667 return; 2668 2669 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 2670 for (int i = 0; i < BP_GET_NDVAS(bp); i++) { 2671 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]); 2672 vdev_t *vd = vdev_lookup_top(spa, vdev); 2673 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); 2674 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); 2675 metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 2676 2677 if (msp->ms_loaded) 2678 range_tree_verify(msp->ms_tree, offset, size); 2679 2680 for (int j = 0; j < TXG_SIZE; j++) 2681 range_tree_verify(msp->ms_freetree[j], offset, size); 2682 for (int j = 0; j < TXG_DEFER_SIZE; j++) 2683 range_tree_verify(msp->ms_defertree[j], offset, size); 2684 } 2685 spa_config_exit(spa, SCL_VDEV, FTAG); 2686} 2687