/* metaslab.c revision 277553 */
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011, 2014 by Delphix. All rights reserved. 24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 25 */ 26 27#include <sys/zfs_context.h> 28#include <sys/dmu.h> 29#include <sys/dmu_tx.h> 30#include <sys/space_map.h> 31#include <sys/metaslab_impl.h> 32#include <sys/vdev_impl.h> 33#include <sys/zio.h> 34#include <sys/spa_impl.h> 35#include <sys/zfeature.h> 36 37SYSCTL_DECL(_vfs_zfs); 38SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab"); 39 40/* 41 * Allow allocations to switch to gang blocks quickly. We do this to 42 * avoid having to load lots of space_maps in a given txg. There are, 43 * however, some cases where we want to avoid "fast" ganging and instead 44 * we want to do an exhaustive search of all metaslabs on this device. 45 * Currently we don't allow any gang, slog, or dump device related allocations 46 * to "fast" gang. 
47 */ 48#define CAN_FASTGANG(flags) \ 49 (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \ 50 METASLAB_GANG_AVOID))) 51 52#define METASLAB_WEIGHT_PRIMARY (1ULL << 63) 53#define METASLAB_WEIGHT_SECONDARY (1ULL << 62) 54#define METASLAB_ACTIVE_MASK \ 55 (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) 56 57uint64_t metaslab_aliquot = 512ULL << 10; 58uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ 59TUNABLE_QUAD("vfs.zfs.metaslab.gang_bang", &metaslab_gang_bang); 60SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, gang_bang, CTLFLAG_RWTUN, 61 &metaslab_gang_bang, 0, 62 "Force gang block allocation for blocks larger than or equal to this value"); 63 64/* 65 * The in-core space map representation is more compact than its on-disk form. 66 * The zfs_condense_pct determines how much more compact the in-core 67 * space_map representation must be before we compact it on-disk. 68 * Values should be greater than or equal to 100. 69 */ 70int zfs_condense_pct = 200; 71TUNABLE_INT("vfs.zfs.condense_pct", &zfs_condense_pct); 72SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN, 73 &zfs_condense_pct, 0, 74 "Condense on-disk spacemap when it is more than this many percents" 75 " of in-memory counterpart"); 76 77/* 78 * Condensing a metaslab is not guaranteed to actually reduce the amount of 79 * space used on disk. In particular, a space map uses data in increments of 80 * MAX(1 << ashift, space_map_blksize), so a metaslab might use the 81 * same number of blocks after condensing. Since the goal of condensing is to 82 * reduce the number of IOPs required to read the space map, we only want to 83 * condense when we can be sure we will reduce the number of blocks used by the 84 * space map. Unfortunately, we cannot precisely compute whether or not this is 85 * the case in metaslab_should_condense since we are holding ms_lock. 
Instead, 86 * we apply the following heuristic: do not condense a spacemap unless the 87 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold 88 * blocks. 89 */ 90int zfs_metaslab_condense_block_threshold = 4; 91 92/* 93 * The zfs_mg_noalloc_threshold defines which metaslab groups should 94 * be eligible for allocation. The value is defined as a percentage of 95 * free space. Metaslab groups that have more free space than 96 * zfs_mg_noalloc_threshold are always eligible for allocations. Once 97 * a metaslab group's free space is less than or equal to the 98 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that 99 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold. 100 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all 101 * groups are allowed to accept allocations. Gang blocks are always 102 * eligible to allocate on any metaslab group. The default value of 0 means 103 * no metaslab group will be excluded based on this criterion. 104 */ 105int zfs_mg_noalloc_threshold = 0; 106TUNABLE_INT("vfs.zfs.mg_noalloc_threshold", &zfs_mg_noalloc_threshold); 107SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN, 108 &zfs_mg_noalloc_threshold, 0, 109 "Percentage of metaslab group size that should be free" 110 " to make it eligible for allocation"); 111 112/* 113 * Metaslab groups are considered eligible for allocations if their 114 * fragmenation metric (measured as a percentage) is less than or equal to 115 * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold 116 * then it will be skipped unless all metaslab groups within the metaslab 117 * class have also crossed this threshold. 
118 */ 119int zfs_mg_fragmentation_threshold = 85; 120TUNABLE_INT("vfs.zfs.mg_fragmentation_threshold", &zfs_mg_fragmentation_threshold); 121SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_fragmentation_threshold, CTLFLAG_RWTUN, 122 &zfs_mg_fragmentation_threshold, 0, 123 "Percentage of metaslab group size that should be considered " 124 "eligible for allocations unless all metaslab groups within the metaslab class " 125 "have also crossed this threshold"); 126 127/* 128 * Allow metaslabs to keep their active state as long as their fragmentation 129 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An 130 * active metaslab that exceeds this threshold will no longer keep its active 131 * status allowing better metaslabs to be selected. 132 */ 133int zfs_metaslab_fragmentation_threshold = 70; 134TUNABLE_INT("vfs.zfs.metaslab.fragmentation_threshold", 135 &zfs_metaslab_fragmentation_threshold); 136SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_threshold, CTLFLAG_RWTUN, 137 &zfs_metaslab_fragmentation_threshold, 0, 138 "Maximum percentage of metaslab fragmentation level to keep their active state"); 139 140/* 141 * When set will load all metaslabs when pool is first opened. 142 */ 143int metaslab_debug_load = 0; 144TUNABLE_INT("vfs.zfs.metaslab.debug_load", &metaslab_debug_load); 145SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_load, CTLFLAG_RWTUN, 146 &metaslab_debug_load, 0, 147 "Load all metaslabs when pool is first opened"); 148 149/* 150 * When set will prevent metaslabs from being unloaded. 151 */ 152int metaslab_debug_unload = 0; 153TUNABLE_INT("vfs.zfs.metaslab.debug_unload", &metaslab_debug_unload); 154SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_unload, CTLFLAG_RWTUN, 155 &metaslab_debug_unload, 0, 156 "Prevent metaslabs from being unloaded"); 157 158/* 159 * Minimum size which forces the dynamic allocator to change 160 * it's allocation strategy. 
Once the space map cannot satisfy 161 * an allocation of this size then it switches to using more 162 * aggressive strategy (i.e search by size rather than offset). 163 */ 164uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE; 165TUNABLE_QUAD("vfs.zfs.metaslab.df_alloc_threshold", 166 &metaslab_df_alloc_threshold); 167SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN, 168 &metaslab_df_alloc_threshold, 0, 169 "Minimum size which forces the dynamic allocator to change it's allocation strategy"); 170 171/* 172 * The minimum free space, in percent, which must be available 173 * in a space map to continue allocations in a first-fit fashion. 174 * Once the space_map's free space drops below this level we dynamically 175 * switch to using best-fit allocations. 176 */ 177int metaslab_df_free_pct = 4; 178TUNABLE_INT("vfs.zfs.metaslab.df_free_pct", &metaslab_df_free_pct); 179SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN, 180 &metaslab_df_free_pct, 0, 181 "The minimum free space, in percent, which must be available in a space map to continue allocations in a first-fit fashion"); 182 183/* 184 * A metaslab is considered "free" if it contains a contiguous 185 * segment which is greater than metaslab_min_alloc_size. 186 */ 187uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS; 188TUNABLE_QUAD("vfs.zfs.metaslab.min_alloc_size", 189 &metaslab_min_alloc_size); 190SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, min_alloc_size, CTLFLAG_RWTUN, 191 &metaslab_min_alloc_size, 0, 192 "A metaslab is considered \"free\" if it contains a contiguous segment which is greater than vfs.zfs.metaslab.min_alloc_size"); 193 194/* 195 * Percentage of all cpus that can be used by the metaslab taskq. 
196 */ 197int metaslab_load_pct = 50; 198TUNABLE_INT("vfs.zfs.metaslab.load_pct", &metaslab_load_pct); 199SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN, 200 &metaslab_load_pct, 0, 201 "Percentage of cpus that can be used by the metaslab taskq"); 202 203/* 204 * Determines how many txgs a metaslab may remain loaded without having any 205 * allocations from it. As long as a metaslab continues to be used we will 206 * keep it loaded. 207 */ 208int metaslab_unload_delay = TXG_SIZE * 2; 209TUNABLE_INT("vfs.zfs.metaslab.unload_delay", &metaslab_unload_delay); 210SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN, 211 &metaslab_unload_delay, 0, 212 "Number of TXGs that an unused metaslab can be kept in memory"); 213 214/* 215 * Max number of metaslabs per group to preload. 216 */ 217int metaslab_preload_limit = SPA_DVAS_PER_BP; 218TUNABLE_INT("vfs.zfs.metaslab.preload_limit", &metaslab_preload_limit); 219SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN, 220 &metaslab_preload_limit, 0, 221 "Max number of metaslabs per group to preload"); 222 223/* 224 * Enable/disable preloading of metaslab. 225 */ 226boolean_t metaslab_preload_enabled = B_TRUE; 227TUNABLE_INT("vfs.zfs.metaslab.preload_enabled", &metaslab_preload_enabled); 228SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN, 229 &metaslab_preload_enabled, 0, 230 "Max number of metaslabs per group to preload"); 231 232/* 233 * Enable/disable fragmentation weighting on metaslabs. 234 */ 235boolean_t metaslab_fragmentation_factor_enabled = B_TRUE; 236TUNABLE_INT("vfs.zfs.metaslab_fragmentation_factor_enabled", 237 &metaslab_fragmentation_factor_enabled); 238SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_factor_enabled, CTLFLAG_RWTUN, 239 &metaslab_fragmentation_factor_enabled, 0, 240 "Enable fragmentation weighting on metaslabs"); 241 242/* 243 * Enable/disable lba weighting (i.e. outer tracks are given preference). 
244 */ 245boolean_t metaslab_lba_weighting_enabled = B_TRUE; 246TUNABLE_INT("vfs.zfs.metaslab.lba_weighting_enabled", 247 &metaslab_lba_weighting_enabled); 248SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, lba_weighting_enabled, CTLFLAG_RWTUN, 249 &metaslab_lba_weighting_enabled, 0, 250 "Enable LBA weighting (i.e. outer tracks are given preference)"); 251 252/* 253 * Enable/disable metaslab group biasing. 254 */ 255boolean_t metaslab_bias_enabled = B_TRUE; 256TUNABLE_INT("vfs.zfs.metaslab.bias_enabled", 257 &metaslab_bias_enabled); 258SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, bias_enabled, CTLFLAG_RWTUN, 259 &metaslab_bias_enabled, 0, 260 "Enable metaslab group biasing"); 261 262static uint64_t metaslab_fragmentation(metaslab_t *); 263 264/* 265 * ========================================================================== 266 * Metaslab classes 267 * ========================================================================== 268 */ 269metaslab_class_t * 270metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) 271{ 272 metaslab_class_t *mc; 273 274 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); 275 276 mc->mc_spa = spa; 277 mc->mc_rotor = NULL; 278 mc->mc_ops = ops; 279 280 return (mc); 281} 282 283void 284metaslab_class_destroy(metaslab_class_t *mc) 285{ 286 ASSERT(mc->mc_rotor == NULL); 287 ASSERT(mc->mc_alloc == 0); 288 ASSERT(mc->mc_deferred == 0); 289 ASSERT(mc->mc_space == 0); 290 ASSERT(mc->mc_dspace == 0); 291 292 kmem_free(mc, sizeof (metaslab_class_t)); 293} 294 295int 296metaslab_class_validate(metaslab_class_t *mc) 297{ 298 metaslab_group_t *mg; 299 vdev_t *vd; 300 301 /* 302 * Must hold one of the spa_config locks. 
303 */ 304 ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || 305 spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); 306 307 if ((mg = mc->mc_rotor) == NULL) 308 return (0); 309 310 do { 311 vd = mg->mg_vd; 312 ASSERT(vd->vdev_mg != NULL); 313 ASSERT3P(vd->vdev_top, ==, vd); 314 ASSERT3P(mg->mg_class, ==, mc); 315 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); 316 } while ((mg = mg->mg_next) != mc->mc_rotor); 317 318 return (0); 319} 320 321void 322metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, 323 int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) 324{ 325 atomic_add_64(&mc->mc_alloc, alloc_delta); 326 atomic_add_64(&mc->mc_deferred, defer_delta); 327 atomic_add_64(&mc->mc_space, space_delta); 328 atomic_add_64(&mc->mc_dspace, dspace_delta); 329} 330 331void 332metaslab_class_minblocksize_update(metaslab_class_t *mc) 333{ 334 metaslab_group_t *mg; 335 vdev_t *vd; 336 uint64_t minashift = UINT64_MAX; 337 338 if ((mg = mc->mc_rotor) == NULL) { 339 mc->mc_minblocksize = SPA_MINBLOCKSIZE; 340 return; 341 } 342 343 do { 344 vd = mg->mg_vd; 345 if (vd->vdev_ashift < minashift) 346 minashift = vd->vdev_ashift; 347 } while ((mg = mg->mg_next) != mc->mc_rotor); 348 349 mc->mc_minblocksize = 1ULL << minashift; 350} 351 352uint64_t 353metaslab_class_get_alloc(metaslab_class_t *mc) 354{ 355 return (mc->mc_alloc); 356} 357 358uint64_t 359metaslab_class_get_deferred(metaslab_class_t *mc) 360{ 361 return (mc->mc_deferred); 362} 363 364uint64_t 365metaslab_class_get_space(metaslab_class_t *mc) 366{ 367 return (mc->mc_space); 368} 369 370uint64_t 371metaslab_class_get_dspace(metaslab_class_t *mc) 372{ 373 return (spa_deflate(mc->mc_spa) ? 
mc->mc_dspace : mc->mc_space); 374} 375 376uint64_t 377metaslab_class_get_minblocksize(metaslab_class_t *mc) 378{ 379 return (mc->mc_minblocksize); 380} 381 382void 383metaslab_class_histogram_verify(metaslab_class_t *mc) 384{ 385 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 386 uint64_t *mc_hist; 387 int i; 388 389 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) 390 return; 391 392 mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, 393 KM_SLEEP); 394 395 for (int c = 0; c < rvd->vdev_children; c++) { 396 vdev_t *tvd = rvd->vdev_child[c]; 397 metaslab_group_t *mg = tvd->vdev_mg; 398 399 /* 400 * Skip any holes, uninitialized top-levels, or 401 * vdevs that are not in this metalab class. 402 */ 403 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || 404 mg->mg_class != mc) { 405 continue; 406 } 407 408 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 409 mc_hist[i] += mg->mg_histogram[i]; 410 } 411 412 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 413 VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]); 414 415 kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); 416} 417 418/* 419 * Calculate the metaslab class's fragmentation metric. The metric 420 * is weighted based on the space contribution of each metaslab group. 421 * The return value will be a number between 0 and 100 (inclusive), or 422 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the 423 * zfs_frag_table for more information about the metric. 424 */ 425uint64_t 426metaslab_class_fragmentation(metaslab_class_t *mc) 427{ 428 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 429 uint64_t fragmentation = 0; 430 431 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); 432 433 for (int c = 0; c < rvd->vdev_children; c++) { 434 vdev_t *tvd = rvd->vdev_child[c]; 435 metaslab_group_t *mg = tvd->vdev_mg; 436 437 /* 438 * Skip any holes, uninitialized top-levels, or 439 * vdevs that are not in this metalab class. 
440 */ 441 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || 442 mg->mg_class != mc) { 443 continue; 444 } 445 446 /* 447 * If a metaslab group does not contain a fragmentation 448 * metric then just bail out. 449 */ 450 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { 451 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 452 return (ZFS_FRAG_INVALID); 453 } 454 455 /* 456 * Determine how much this metaslab_group is contributing 457 * to the overall pool fragmentation metric. 458 */ 459 fragmentation += mg->mg_fragmentation * 460 metaslab_group_get_space(mg); 461 } 462 fragmentation /= metaslab_class_get_space(mc); 463 464 ASSERT3U(fragmentation, <=, 100); 465 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 466 return (fragmentation); 467} 468 469/* 470 * Calculate the amount of expandable space that is available in 471 * this metaslab class. If a device is expanded then its expandable 472 * space will be the amount of allocatable space that is currently not 473 * part of this metaslab class. 474 */ 475uint64_t 476metaslab_class_expandable_space(metaslab_class_t *mc) 477{ 478 vdev_t *rvd = mc->mc_spa->spa_root_vdev; 479 uint64_t space = 0; 480 481 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); 482 for (int c = 0; c < rvd->vdev_children; c++) { 483 vdev_t *tvd = rvd->vdev_child[c]; 484 metaslab_group_t *mg = tvd->vdev_mg; 485 486 if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || 487 mg->mg_class != mc) { 488 continue; 489 } 490 491 space += tvd->vdev_max_asize - tvd->vdev_asize; 492 } 493 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); 494 return (space); 495} 496 497/* 498 * ========================================================================== 499 * Metaslab groups 500 * ========================================================================== 501 */ 502static int 503metaslab_compare(const void *x1, const void *x2) 504{ 505 const metaslab_t *m1 = x1; 506 const metaslab_t *m2 = x2; 507 508 if (m1->ms_weight < m2->ms_weight) 509 return (1); 510 if 
(m1->ms_weight > m2->ms_weight) 511 return (-1); 512 513 /* 514 * If the weights are identical, use the offset to force uniqueness. 515 */ 516 if (m1->ms_start < m2->ms_start) 517 return (-1); 518 if (m1->ms_start > m2->ms_start) 519 return (1); 520 521 ASSERT3P(m1, ==, m2); 522 523 return (0); 524} 525 526/* 527 * Update the allocatable flag and the metaslab group's capacity. 528 * The allocatable flag is set to true if the capacity is below 529 * the zfs_mg_noalloc_threshold. If a metaslab group transitions 530 * from allocatable to non-allocatable or vice versa then the metaslab 531 * group's class is updated to reflect the transition. 532 */ 533static void 534metaslab_group_alloc_update(metaslab_group_t *mg) 535{ 536 vdev_t *vd = mg->mg_vd; 537 metaslab_class_t *mc = mg->mg_class; 538 vdev_stat_t *vs = &vd->vdev_stat; 539 boolean_t was_allocatable; 540 541 ASSERT(vd == vd->vdev_top); 542 543 mutex_enter(&mg->mg_lock); 544 was_allocatable = mg->mg_allocatable; 545 546 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / 547 (vs->vs_space + 1); 548 549 /* 550 * A metaslab group is considered allocatable if it has plenty 551 * of free space or is not heavily fragmented. We only take 552 * fragmentation into account if the metaslab group has a valid 553 * fragmentation metric (i.e. a value between 0 and 100). 554 */ 555 mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold && 556 (mg->mg_fragmentation == ZFS_FRAG_INVALID || 557 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); 558 559 /* 560 * The mc_alloc_groups maintains a count of the number of 561 * groups in this metaslab class that are still above the 562 * zfs_mg_noalloc_threshold. This is used by the allocating 563 * threads to determine if they should avoid allocations to 564 * a given group. 
The allocator will avoid allocations to a group 565 * if that group has reached or is below the zfs_mg_noalloc_threshold 566 * and there are still other groups that are above the threshold. 567 * When a group transitions from allocatable to non-allocatable or 568 * vice versa we update the metaslab class to reflect that change. 569 * When the mc_alloc_groups value drops to 0 that means that all 570 * groups have reached the zfs_mg_noalloc_threshold making all groups 571 * eligible for allocations. This effectively means that all devices 572 * are balanced again. 573 */ 574 if (was_allocatable && !mg->mg_allocatable) 575 mc->mc_alloc_groups--; 576 else if (!was_allocatable && mg->mg_allocatable) 577 mc->mc_alloc_groups++; 578 579 mutex_exit(&mg->mg_lock); 580} 581 582metaslab_group_t * 583metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) 584{ 585 metaslab_group_t *mg; 586 587 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); 588 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); 589 avl_create(&mg->mg_metaslab_tree, metaslab_compare, 590 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); 591 mg->mg_vd = vd; 592 mg->mg_class = mc; 593 mg->mg_activation_count = 0; 594 595 mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, 596 minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT); 597 598 return (mg); 599} 600 601void 602metaslab_group_destroy(metaslab_group_t *mg) 603{ 604 ASSERT(mg->mg_prev == NULL); 605 ASSERT(mg->mg_next == NULL); 606 /* 607 * We may have gone below zero with the activation count 608 * either because we never activated in the first place or 609 * because we're done, and possibly removing the vdev. 
610 */ 611 ASSERT(mg->mg_activation_count <= 0); 612 613 taskq_destroy(mg->mg_taskq); 614 avl_destroy(&mg->mg_metaslab_tree); 615 mutex_destroy(&mg->mg_lock); 616 kmem_free(mg, sizeof (metaslab_group_t)); 617} 618 619void 620metaslab_group_activate(metaslab_group_t *mg) 621{ 622 metaslab_class_t *mc = mg->mg_class; 623 metaslab_group_t *mgprev, *mgnext; 624 625 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); 626 627 ASSERT(mc->mc_rotor != mg); 628 ASSERT(mg->mg_prev == NULL); 629 ASSERT(mg->mg_next == NULL); 630 ASSERT(mg->mg_activation_count <= 0); 631 632 if (++mg->mg_activation_count <= 0) 633 return; 634 635 mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); 636 metaslab_group_alloc_update(mg); 637 638 if ((mgprev = mc->mc_rotor) == NULL) { 639 mg->mg_prev = mg; 640 mg->mg_next = mg; 641 } else { 642 mgnext = mgprev->mg_next; 643 mg->mg_prev = mgprev; 644 mg->mg_next = mgnext; 645 mgprev->mg_next = mg; 646 mgnext->mg_prev = mg; 647 } 648 mc->mc_rotor = mg; 649 metaslab_class_minblocksize_update(mc); 650} 651 652void 653metaslab_group_passivate(metaslab_group_t *mg) 654{ 655 metaslab_class_t *mc = mg->mg_class; 656 metaslab_group_t *mgprev, *mgnext; 657 658 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); 659 660 if (--mg->mg_activation_count != 0) { 661 ASSERT(mc->mc_rotor != mg); 662 ASSERT(mg->mg_prev == NULL); 663 ASSERT(mg->mg_next == NULL); 664 ASSERT(mg->mg_activation_count < 0); 665 return; 666 } 667 668 taskq_wait(mg->mg_taskq); 669 metaslab_group_alloc_update(mg); 670 671 mgprev = mg->mg_prev; 672 mgnext = mg->mg_next; 673 674 if (mg == mgnext) { 675 mc->mc_rotor = NULL; 676 } else { 677 mc->mc_rotor = mgnext; 678 mgprev->mg_next = mgnext; 679 mgnext->mg_prev = mgprev; 680 } 681 682 mg->mg_prev = NULL; 683 mg->mg_next = NULL; 684 metaslab_class_minblocksize_update(mc); 685} 686 687uint64_t 688metaslab_group_get_space(metaslab_group_t *mg) 689{ 690 return ((1ULL << mg->mg_vd->vdev_ms_shift) * 
mg->mg_vd->vdev_ms_count); 691} 692 693void 694metaslab_group_histogram_verify(metaslab_group_t *mg) 695{ 696 uint64_t *mg_hist; 697 vdev_t *vd = mg->mg_vd; 698 uint64_t ashift = vd->vdev_ashift; 699 int i; 700 701 if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) 702 return; 703 704 mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, 705 KM_SLEEP); 706 707 ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=, 708 SPACE_MAP_HISTOGRAM_SIZE + ashift); 709 710 for (int m = 0; m < vd->vdev_ms_count; m++) { 711 metaslab_t *msp = vd->vdev_ms[m]; 712 713 if (msp->ms_sm == NULL) 714 continue; 715 716 for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) 717 mg_hist[i + ashift] += 718 msp->ms_sm->sm_phys->smp_histogram[i]; 719 } 720 721 for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++) 722 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); 723 724 kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); 725} 726 727static void 728metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp) 729{ 730 metaslab_class_t *mc = mg->mg_class; 731 uint64_t ashift = mg->mg_vd->vdev_ashift; 732 733 ASSERT(MUTEX_HELD(&msp->ms_lock)); 734 if (msp->ms_sm == NULL) 735 return; 736 737 mutex_enter(&mg->mg_lock); 738 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 739 mg->mg_histogram[i + ashift] += 740 msp->ms_sm->sm_phys->smp_histogram[i]; 741 mc->mc_histogram[i + ashift] += 742 msp->ms_sm->sm_phys->smp_histogram[i]; 743 } 744 mutex_exit(&mg->mg_lock); 745} 746 747void 748metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) 749{ 750 metaslab_class_t *mc = mg->mg_class; 751 uint64_t ashift = mg->mg_vd->vdev_ashift; 752 753 ASSERT(MUTEX_HELD(&msp->ms_lock)); 754 if (msp->ms_sm == NULL) 755 return; 756 757 mutex_enter(&mg->mg_lock); 758 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { 759 ASSERT3U(mg->mg_histogram[i + ashift], >=, 760 msp->ms_sm->sm_phys->smp_histogram[i]); 761 ASSERT3U(mc->mc_histogram[i + ashift], >=, 762 
msp->ms_sm->sm_phys->smp_histogram[i]); 763 764 mg->mg_histogram[i + ashift] -= 765 msp->ms_sm->sm_phys->smp_histogram[i]; 766 mc->mc_histogram[i + ashift] -= 767 msp->ms_sm->sm_phys->smp_histogram[i]; 768 } 769 mutex_exit(&mg->mg_lock); 770} 771 772static void 773metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) 774{ 775 ASSERT(msp->ms_group == NULL); 776 mutex_enter(&mg->mg_lock); 777 msp->ms_group = mg; 778 msp->ms_weight = 0; 779 avl_add(&mg->mg_metaslab_tree, msp); 780 mutex_exit(&mg->mg_lock); 781 782 mutex_enter(&msp->ms_lock); 783 metaslab_group_histogram_add(mg, msp); 784 mutex_exit(&msp->ms_lock); 785} 786 787static void 788metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) 789{ 790 mutex_enter(&msp->ms_lock); 791 metaslab_group_histogram_remove(mg, msp); 792 mutex_exit(&msp->ms_lock); 793 794 mutex_enter(&mg->mg_lock); 795 ASSERT(msp->ms_group == mg); 796 avl_remove(&mg->mg_metaslab_tree, msp); 797 msp->ms_group = NULL; 798 mutex_exit(&mg->mg_lock); 799} 800 801static void 802metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) 803{ 804 /* 805 * Although in principle the weight can be any value, in 806 * practice we do not use values in the range [1, 511]. 807 */ 808 ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0); 809 ASSERT(MUTEX_HELD(&msp->ms_lock)); 810 811 mutex_enter(&mg->mg_lock); 812 ASSERT(msp->ms_group == mg); 813 avl_remove(&mg->mg_metaslab_tree, msp); 814 msp->ms_weight = weight; 815 avl_add(&mg->mg_metaslab_tree, msp); 816 mutex_exit(&mg->mg_lock); 817} 818 819/* 820 * Calculate the fragmentation for a given metaslab group. We can use 821 * a simple average here since all metaslabs within the group must have 822 * the same size. The return value will be a value between 0 and 100 823 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this 824 * group have a fragmentation metric. 
825 */ 826uint64_t 827metaslab_group_fragmentation(metaslab_group_t *mg) 828{ 829 vdev_t *vd = mg->mg_vd; 830 uint64_t fragmentation = 0; 831 uint64_t valid_ms = 0; 832 833 for (int m = 0; m < vd->vdev_ms_count; m++) { 834 metaslab_t *msp = vd->vdev_ms[m]; 835 836 if (msp->ms_fragmentation == ZFS_FRAG_INVALID) 837 continue; 838 839 valid_ms++; 840 fragmentation += msp->ms_fragmentation; 841 } 842 843 if (valid_ms <= vd->vdev_ms_count / 2) 844 return (ZFS_FRAG_INVALID); 845 846 fragmentation /= valid_ms; 847 ASSERT3U(fragmentation, <=, 100); 848 return (fragmentation); 849} 850 851/* 852 * Determine if a given metaslab group should skip allocations. A metaslab 853 * group should avoid allocations if its free capacity is less than the 854 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than 855 * zfs_mg_fragmentation_threshold and there is at least one metaslab group 856 * that can still handle allocations. 857 */ 858static boolean_t 859metaslab_group_allocatable(metaslab_group_t *mg) 860{ 861 vdev_t *vd = mg->mg_vd; 862 spa_t *spa = vd->vdev_spa; 863 metaslab_class_t *mc = mg->mg_class; 864 865 /* 866 * We use two key metrics to determine if a metaslab group is 867 * considered allocatable -- free space and fragmentation. If 868 * the free space is greater than the free space threshold and 869 * the fragmentation is less than the fragmentation threshold then 870 * consider the group allocatable. There are two case when we will 871 * not consider these key metrics. The first is if the group is 872 * associated with a slog device and the second is if all groups 873 * in this metaslab class have already been consider ineligible 874 * for allocations. 
875 */ 876 return ((mg->mg_free_capacity > zfs_mg_noalloc_threshold && 877 (mg->mg_fragmentation == ZFS_FRAG_INVALID || 878 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)) || 879 mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0); 880} 881 882/* 883 * ========================================================================== 884 * Range tree callbacks 885 * ========================================================================== 886 */ 887 888/* 889 * Comparison function for the private size-ordered tree. Tree is sorted 890 * by size, larger sizes at the end of the tree. 891 */ 892static int 893metaslab_rangesize_compare(const void *x1, const void *x2) 894{ 895 const range_seg_t *r1 = x1; 896 const range_seg_t *r2 = x2; 897 uint64_t rs_size1 = r1->rs_end - r1->rs_start; 898 uint64_t rs_size2 = r2->rs_end - r2->rs_start; 899 900 if (rs_size1 < rs_size2) 901 return (-1); 902 if (rs_size1 > rs_size2) 903 return (1); 904 905 if (r1->rs_start < r2->rs_start) 906 return (-1); 907 908 if (r1->rs_start > r2->rs_start) 909 return (1); 910 911 return (0); 912} 913 914/* 915 * Create any block allocator specific components. The current allocators 916 * rely on using both a size-ordered range_tree_t and an array of uint64_t's. 917 */ 918static void 919metaslab_rt_create(range_tree_t *rt, void *arg) 920{ 921 metaslab_t *msp = arg; 922 923 ASSERT3P(rt->rt_arg, ==, msp); 924 ASSERT(msp->ms_tree == NULL); 925 926 avl_create(&msp->ms_size_tree, metaslab_rangesize_compare, 927 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); 928} 929 930/* 931 * Destroy the block allocator specific components. 
932 */ 933static void 934metaslab_rt_destroy(range_tree_t *rt, void *arg) 935{ 936 metaslab_t *msp = arg; 937 938 ASSERT3P(rt->rt_arg, ==, msp); 939 ASSERT3P(msp->ms_tree, ==, rt); 940 ASSERT0(avl_numnodes(&msp->ms_size_tree)); 941 942 avl_destroy(&msp->ms_size_tree); 943} 944 945static void 946metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg) 947{ 948 metaslab_t *msp = arg; 949 950 ASSERT3P(rt->rt_arg, ==, msp); 951 ASSERT3P(msp->ms_tree, ==, rt); 952 VERIFY(!msp->ms_condensing); 953 avl_add(&msp->ms_size_tree, rs); 954} 955 956static void 957metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg) 958{ 959 metaslab_t *msp = arg; 960 961 ASSERT3P(rt->rt_arg, ==, msp); 962 ASSERT3P(msp->ms_tree, ==, rt); 963 VERIFY(!msp->ms_condensing); 964 avl_remove(&msp->ms_size_tree, rs); 965} 966 967static void 968metaslab_rt_vacate(range_tree_t *rt, void *arg) 969{ 970 metaslab_t *msp = arg; 971 972 ASSERT3P(rt->rt_arg, ==, msp); 973 ASSERT3P(msp->ms_tree, ==, rt); 974 975 /* 976 * Normally one would walk the tree freeing nodes along the way. 977 * Since the nodes are shared with the range trees we can avoid 978 * walking all nodes and just reinitialize the avl tree. The nodes 979 * will be freed by the range tree, so we don't want to free them here. 980 */ 981 avl_create(&msp->ms_size_tree, metaslab_rangesize_compare, 982 sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); 983} 984 985static range_tree_ops_t metaslab_rt_ops = { 986 metaslab_rt_create, 987 metaslab_rt_destroy, 988 metaslab_rt_add, 989 metaslab_rt_remove, 990 metaslab_rt_vacate 991}; 992 993/* 994 * ========================================================================== 995 * Metaslab block operations 996 * ========================================================================== 997 */ 998 999/* 1000 * Return the maximum contiguous segment within the metaslab. 
1001 */ 1002uint64_t 1003metaslab_block_maxsize(metaslab_t *msp) 1004{ 1005 avl_tree_t *t = &msp->ms_size_tree; 1006 range_seg_t *rs; 1007 1008 if (t == NULL || (rs = avl_last(t)) == NULL) 1009 return (0ULL); 1010 1011 return (rs->rs_end - rs->rs_start); 1012} 1013 1014uint64_t 1015metaslab_block_alloc(metaslab_t *msp, uint64_t size) 1016{ 1017 uint64_t start; 1018 range_tree_t *rt = msp->ms_tree; 1019 1020 VERIFY(!msp->ms_condensing); 1021 1022 start = msp->ms_ops->msop_alloc(msp, size); 1023 if (start != -1ULL) { 1024 vdev_t *vd = msp->ms_group->mg_vd; 1025 1026 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); 1027 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 1028 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); 1029 range_tree_remove(rt, start, size); 1030 } 1031 return (start); 1032} 1033 1034/* 1035 * ========================================================================== 1036 * Common allocator routines 1037 * ========================================================================== 1038 */ 1039 1040/* 1041 * This is a helper function that can be used by the allocator to find 1042 * a suitable block to allocate. This will search the specified AVL 1043 * tree looking for a block that matches the specified criteria. 1044 */ 1045static uint64_t 1046metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, 1047 uint64_t align) 1048{ 1049 range_seg_t *rs, rsearch; 1050 avl_index_t where; 1051 1052 rsearch.rs_start = *cursor; 1053 rsearch.rs_end = *cursor + size; 1054 1055 rs = avl_find(t, &rsearch, &where); 1056 if (rs == NULL) 1057 rs = avl_nearest(t, where, AVL_AFTER); 1058 1059 while (rs != NULL) { 1060 uint64_t offset = P2ROUNDUP(rs->rs_start, align); 1061 1062 if (offset + size <= rs->rs_end) { 1063 *cursor = offset + size; 1064 return (offset); 1065 } 1066 rs = AVL_NEXT(t, rs); 1067 } 1068 1069 /* 1070 * If we know we've searched the whole map (*cursor == 0), give up. 
1071 * Otherwise, reset the cursor to the beginning and try again. 1072 */ 1073 if (*cursor == 0) 1074 return (-1ULL); 1075 1076 *cursor = 0; 1077 return (metaslab_block_picker(t, cursor, size, align)); 1078} 1079 1080/* 1081 * ========================================================================== 1082 * The first-fit block allocator 1083 * ========================================================================== 1084 */ 1085static uint64_t 1086metaslab_ff_alloc(metaslab_t *msp, uint64_t size) 1087{ 1088 /* 1089 * Find the largest power of 2 block size that evenly divides the 1090 * requested size. This is used to try to allocate blocks with similar 1091 * alignment from the same area of the metaslab (i.e. same cursor 1092 * bucket) but it does not guarantee that other allocations sizes 1093 * may exist in the same region. 1094 */ 1095 uint64_t align = size & -size; 1096 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; 1097 avl_tree_t *t = &msp->ms_tree->rt_root; 1098 1099 return (metaslab_block_picker(t, cursor, size, align)); 1100} 1101 1102static metaslab_ops_t metaslab_ff_ops = { 1103 metaslab_ff_alloc 1104}; 1105 1106/* 1107 * ========================================================================== 1108 * Dynamic block allocator - 1109 * Uses the first fit allocation scheme until space get low and then 1110 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold 1111 * and metaslab_df_free_pct to determine when to switch the allocation scheme. 1112 * ========================================================================== 1113 */ 1114static uint64_t 1115metaslab_df_alloc(metaslab_t *msp, uint64_t size) 1116{ 1117 /* 1118 * Find the largest power of 2 block size that evenly divides the 1119 * requested size. This is used to try to allocate blocks with similar 1120 * alignment from the same area of the metaslab (i.e. 
same cursor 1121 * bucket) but it does not guarantee that other allocations sizes 1122 * may exist in the same region. 1123 */ 1124 uint64_t align = size & -size; 1125 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; 1126 range_tree_t *rt = msp->ms_tree; 1127 avl_tree_t *t = &rt->rt_root; 1128 uint64_t max_size = metaslab_block_maxsize(msp); 1129 int free_pct = range_tree_space(rt) * 100 / msp->ms_size; 1130 1131 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1132 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree)); 1133 1134 if (max_size < size) 1135 return (-1ULL); 1136 1137 /* 1138 * If we're running low on space switch to using the size 1139 * sorted AVL tree (best-fit). 1140 */ 1141 if (max_size < metaslab_df_alloc_threshold || 1142 free_pct < metaslab_df_free_pct) { 1143 t = &msp->ms_size_tree; 1144 *cursor = 0; 1145 } 1146 1147 return (metaslab_block_picker(t, cursor, size, 1ULL)); 1148} 1149 1150static metaslab_ops_t metaslab_df_ops = { 1151 metaslab_df_alloc 1152}; 1153 1154/* 1155 * ========================================================================== 1156 * Cursor fit block allocator - 1157 * Select the largest region in the metaslab, set the cursor to the beginning 1158 * of the range and the cursor_end to the end of the range. As allocations 1159 * are made advance the cursor. Continue allocating from the cursor until 1160 * the range is exhausted and then find a new range. 
1161 * ========================================================================== 1162 */ 1163static uint64_t 1164metaslab_cf_alloc(metaslab_t *msp, uint64_t size) 1165{ 1166 range_tree_t *rt = msp->ms_tree; 1167 avl_tree_t *t = &msp->ms_size_tree; 1168 uint64_t *cursor = &msp->ms_lbas[0]; 1169 uint64_t *cursor_end = &msp->ms_lbas[1]; 1170 uint64_t offset = 0; 1171 1172 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1173 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root)); 1174 1175 ASSERT3U(*cursor_end, >=, *cursor); 1176 1177 if ((*cursor + size) > *cursor_end) { 1178 range_seg_t *rs; 1179 1180 rs = avl_last(&msp->ms_size_tree); 1181 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) 1182 return (-1ULL); 1183 1184 *cursor = rs->rs_start; 1185 *cursor_end = rs->rs_end; 1186 } 1187 1188 offset = *cursor; 1189 *cursor += size; 1190 1191 return (offset); 1192} 1193 1194static metaslab_ops_t metaslab_cf_ops = { 1195 metaslab_cf_alloc 1196}; 1197 1198/* 1199 * ========================================================================== 1200 * New dynamic fit allocator - 1201 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift 1202 * contiguous blocks. If no region is found then just use the largest segment 1203 * that remains. 1204 * ========================================================================== 1205 */ 1206 1207/* 1208 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift) 1209 * to request from the allocator. 
1210 */ 1211uint64_t metaslab_ndf_clump_shift = 4; 1212 1213static uint64_t 1214metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) 1215{ 1216 avl_tree_t *t = &msp->ms_tree->rt_root; 1217 avl_index_t where; 1218 range_seg_t *rs, rsearch; 1219 uint64_t hbit = highbit64(size); 1220 uint64_t *cursor = &msp->ms_lbas[hbit - 1]; 1221 uint64_t max_size = metaslab_block_maxsize(msp); 1222 1223 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1224 ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree)); 1225 1226 if (max_size < size) 1227 return (-1ULL); 1228 1229 rsearch.rs_start = *cursor; 1230 rsearch.rs_end = *cursor + size; 1231 1232 rs = avl_find(t, &rsearch, &where); 1233 if (rs == NULL || (rs->rs_end - rs->rs_start) < size) { 1234 t = &msp->ms_size_tree; 1235 1236 rsearch.rs_start = 0; 1237 rsearch.rs_end = MIN(max_size, 1238 1ULL << (hbit + metaslab_ndf_clump_shift)); 1239 rs = avl_find(t, &rsearch, &where); 1240 if (rs == NULL) 1241 rs = avl_nearest(t, where, AVL_AFTER); 1242 ASSERT(rs != NULL); 1243 } 1244 1245 if ((rs->rs_end - rs->rs_start) >= size) { 1246 *cursor = rs->rs_start + size; 1247 return (rs->rs_start); 1248 } 1249 return (-1ULL); 1250} 1251 1252static metaslab_ops_t metaslab_ndf_ops = { 1253 metaslab_ndf_alloc 1254}; 1255 1256metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; 1257 1258/* 1259 * ========================================================================== 1260 * Metaslabs 1261 * ========================================================================== 1262 */ 1263 1264/* 1265 * Wait for any in-progress metaslab loads to complete. 
1266 */ 1267void 1268metaslab_load_wait(metaslab_t *msp) 1269{ 1270 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1271 1272 while (msp->ms_loading) { 1273 ASSERT(!msp->ms_loaded); 1274 cv_wait(&msp->ms_load_cv, &msp->ms_lock); 1275 } 1276} 1277 1278int 1279metaslab_load(metaslab_t *msp) 1280{ 1281 int error = 0; 1282 1283 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1284 ASSERT(!msp->ms_loaded); 1285 ASSERT(!msp->ms_loading); 1286 1287 msp->ms_loading = B_TRUE; 1288 1289 /* 1290 * If the space map has not been allocated yet, then treat 1291 * all the space in the metaslab as free and add it to the 1292 * ms_tree. 1293 */ 1294 if (msp->ms_sm != NULL) 1295 error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE); 1296 else 1297 range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size); 1298 1299 msp->ms_loaded = (error == 0); 1300 msp->ms_loading = B_FALSE; 1301 1302 if (msp->ms_loaded) { 1303 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1304 range_tree_walk(msp->ms_defertree[t], 1305 range_tree_remove, msp->ms_tree); 1306 } 1307 } 1308 cv_broadcast(&msp->ms_load_cv); 1309 return (error); 1310} 1311 1312void 1313metaslab_unload(metaslab_t *msp) 1314{ 1315 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1316 range_tree_vacate(msp->ms_tree, NULL, NULL); 1317 msp->ms_loaded = B_FALSE; 1318 msp->ms_weight &= ~METASLAB_ACTIVE_MASK; 1319} 1320 1321int 1322metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, 1323 metaslab_t **msp) 1324{ 1325 vdev_t *vd = mg->mg_vd; 1326 objset_t *mos = vd->vdev_spa->spa_meta_objset; 1327 metaslab_t *ms; 1328 int error; 1329 1330 ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); 1331 mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); 1332 cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); 1333 ms->ms_id = id; 1334 ms->ms_start = id << vd->vdev_ms_shift; 1335 ms->ms_size = 1ULL << vd->vdev_ms_shift; 1336 1337 /* 1338 * We only open space map objects that already exist. All others 1339 * will be opened when we finally allocate an object for it. 
1340 */ 1341 if (object != 0) { 1342 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, 1343 ms->ms_size, vd->vdev_ashift, &ms->ms_lock); 1344 1345 if (error != 0) { 1346 kmem_free(ms, sizeof (metaslab_t)); 1347 return (error); 1348 } 1349 1350 ASSERT(ms->ms_sm != NULL); 1351 } 1352 1353 /* 1354 * We create the main range tree here, but we don't create the 1355 * alloctree and freetree until metaslab_sync_done(). This serves 1356 * two purposes: it allows metaslab_sync_done() to detect the 1357 * addition of new space; and for debugging, it ensures that we'd 1358 * data fault on any attempt to use this metaslab before it's ready. 1359 */ 1360 ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms, &ms->ms_lock); 1361 metaslab_group_add(mg, ms); 1362 1363 ms->ms_fragmentation = metaslab_fragmentation(ms); 1364 ms->ms_ops = mg->mg_class->mc_ops; 1365 1366 /* 1367 * If we're opening an existing pool (txg == 0) or creating 1368 * a new one (txg == TXG_INITIAL), all space is available now. 1369 * If we're adding space to an existing pool, the new space 1370 * does not become available until after this txg has synced. 1371 */ 1372 if (txg <= TXG_INITIAL) 1373 metaslab_sync_done(ms, 0); 1374 1375 /* 1376 * If metaslab_debug_load is set and we're initializing a metaslab 1377 * that has an allocated space_map object then load the its space 1378 * map so that can verify frees. 
1379 */ 1380 if (metaslab_debug_load && ms->ms_sm != NULL) { 1381 mutex_enter(&ms->ms_lock); 1382 VERIFY0(metaslab_load(ms)); 1383 mutex_exit(&ms->ms_lock); 1384 } 1385 1386 if (txg != 0) { 1387 vdev_dirty(vd, 0, NULL, txg); 1388 vdev_dirty(vd, VDD_METASLAB, ms, txg); 1389 } 1390 1391 *msp = ms; 1392 1393 return (0); 1394} 1395 1396void 1397metaslab_fini(metaslab_t *msp) 1398{ 1399 metaslab_group_t *mg = msp->ms_group; 1400 1401 metaslab_group_remove(mg, msp); 1402 1403 mutex_enter(&msp->ms_lock); 1404 1405 VERIFY(msp->ms_group == NULL); 1406 vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm), 1407 0, -msp->ms_size); 1408 space_map_close(msp->ms_sm); 1409 1410 metaslab_unload(msp); 1411 range_tree_destroy(msp->ms_tree); 1412 1413 for (int t = 0; t < TXG_SIZE; t++) { 1414 range_tree_destroy(msp->ms_alloctree[t]); 1415 range_tree_destroy(msp->ms_freetree[t]); 1416 } 1417 1418 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1419 range_tree_destroy(msp->ms_defertree[t]); 1420 } 1421 1422 ASSERT0(msp->ms_deferspace); 1423 1424 mutex_exit(&msp->ms_lock); 1425 cv_destroy(&msp->ms_load_cv); 1426 mutex_destroy(&msp->ms_lock); 1427 1428 kmem_free(msp, sizeof (metaslab_t)); 1429} 1430 1431#define FRAGMENTATION_TABLE_SIZE 17 1432 1433/* 1434 * This table defines a segment size based fragmentation metric that will 1435 * allow each metaslab to derive its own fragmentation value. This is done 1436 * by calculating the space in each bucket of the spacemap histogram and 1437 * multiplying that by the fragmetation metric in this table. Doing 1438 * this for all buckets and dividing it by the total amount of free 1439 * space in this metaslab (i.e. the total free space in all buckets) gives 1440 * us the fragmentation metric. This means that a high fragmentation metric 1441 * equates to most of the free space being comprised of small segments. 1442 * Conversely, if the metric is low, then most of the free space is in 1443 * large segments. 
 * A 10% change in fragmentation equates to approximately
 * double the number of segments.
 *
 * This table defines 0% fragmented space using 16MB segments. Testing has
 * shown that segments that are greater than or equal to 16MB do not suffer
 * from drastic performance problems. Using this value, we derive the rest
 * of the table. Since the fragmentation value is never stored on disk, it
 * is possible to change these calculations in the future.
 *
 * Only 16 of the FRAGMENTATION_TABLE_SIZE entries are listed explicitly;
 * any remaining entry is zero-initialized, i.e. it also counts as 0%.
 */
int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
	100,	/* 512B	*/
	100,	/* 1K	*/
	98,	/* 2K	*/
	95,	/* 4K	*/
	90,	/* 8K	*/
	80,	/* 16K	*/
	70,	/* 32K	*/
	60,	/* 64K	*/
	50,	/* 128K	*/
	40,	/* 256K	*/
	30,	/* 512K	*/
	20,	/* 1M	*/
	15,	/* 2M	*/
	10,	/* 4M	*/
	5,	/* 8M	*/
	0	/* 16M	*/
};

/*
 * Calculate the metaslab's fragmentation metric. A return value
 * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does
 * not support this metric. Otherwise, the return value should be in the
 * range [0, 100].
 */
static uint64_t
metaslab_fragmentation(metaslab_t *msp)
{
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
	uint64_t fragmentation = 0;
	uint64_t total = 0;
	boolean_t feature_enabled = spa_feature_is_enabled(spa,
	    SPA_FEATURE_SPACEMAP_HISTOGRAM);

	if (!feature_enabled)
		return (ZFS_FRAG_INVALID);

	/*
	 * A null space map means that the entire metaslab is free
	 * and thus is not fragmented.
	 */
	if (msp->ms_sm == NULL)
		return (0);

	/*
	 * If this metaslab's space_map has not been upgraded, flag it
	 * so that we upgrade next time we encounter it.
	 */
	if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
		uint64_t txg = spa_syncing_txg(spa);
		vdev_t *vd = msp->ms_group->mg_vd;

		/* Request a forced condense, but only on a writeable pool. */
		if (spa_writeable(spa)) {
			msp->ms_condense_wanted = B_TRUE;
			vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
			spa_dbgmsg(spa, "txg %llu, requesting force condense: "
			    "msp %p, vd %p", txg, msp, vd);
		}
		return (ZFS_FRAG_INVALID);
	}

	/*
	 * Weighted average over the histogram: each bucket's free space is
	 * multiplied by the table entry for its segment size, and the sum
	 * is divided by the total free space.
	 */
	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
		uint64_t space = 0;
		uint8_t shift = msp->ms_sm->sm_shift;
		/* Buckets past the end of the table use its last entry. */
		int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
		    FRAGMENTATION_TABLE_SIZE - 1);

		if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
			continue;

		/* Segment count in bucket i times 2^(i + shift) bytes. */
		space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
		total += space;

		ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
		fragmentation += space * zfs_frag_table[idx];
	}

	if (total > 0)
		fragmentation /= total;
	ASSERT3U(fragmentation, <=, 100);
	return (fragmentation);
}

/*
 * Compute a weight -- a selection preference value -- for the given metaslab.
 * This is based on the amount of free space, the level of fragmentation,
 * the LBA range, and whether the metaslab is loaded.
 *
 * The caller must hold ms_lock. As a side effect, ms_fragmentation is
 * refreshed from metaslab_fragmentation().
 */
static uint64_t
metaslab_weight(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;
	vdev_t *vd = mg->mg_vd;
	uint64_t weight, space;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/*
	 * This vdev is in the process of being removed so there is nothing
	 * for us to do here.
	 */
	if (vd->vdev_removing) {
		ASSERT0(space_map_allocated(msp->ms_sm));
		ASSERT0(vd->vdev_ms_shift);
		return (0);
	}

	/*
	 * The baseline weight is the metaslab's free space.
	 */
	space = msp->ms_size - space_map_allocated(msp->ms_sm);

	msp->ms_fragmentation = metaslab_fragmentation(msp);
	if (metaslab_fragmentation_factor_enabled &&
	    msp->ms_fragmentation != ZFS_FRAG_INVALID) {
		/*
		 * Use the fragmentation information to inversely scale
		 * down the baseline weight. We need to ensure that we
		 * don't exclude this metaslab completely when it's 100%
		 * fragmented. To avoid this we reduce the fragmented value
		 * by 1.
		 *
		 * NOTE(review): if ms_fragmentation is an unsigned 64-bit
		 * field (metaslab_fragmentation() returns uint64_t), a value
		 * of 0 makes "ms_fragmentation - 1" wrap around and the
		 * factor below evaluate to 101 -- confirm this is intended.
		 */
		space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;

		/*
		 * If space < SPA_MINBLOCKSIZE, then we will not allocate from
		 * this metaslab again. The fragmentation metric may have
		 * decreased the space to something smaller than
		 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
		 * so that we can consume any remaining space.
		 */
		if (space > 0 && space < SPA_MINBLOCKSIZE)
			space = SPA_MINBLOCKSIZE;
	}
	weight = space;

	/*
	 * Modern disks have uniform bit density and constant angular velocity.
	 * Therefore, the outer recording zones are faster (higher bandwidth)
	 * than the inner zones by the ratio of outer to inner track diameter,
	 * which is typically around 2:1. We account for this by assigning
	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
	 * In effect, this means that we'll select the metaslab with the most
	 * free bandwidth rather than simply the one with the most free space.
	 */
	if (metaslab_lba_weighting_enabled) {
		weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
		ASSERT(weight >= space && weight <= 2 * space);
	}

	/*
	 * If this metaslab is one we're actively using, adjust its
	 * weight to make it preferable to any inactive metaslab so
	 * we'll polish it off. If the fragmentation on this metaslab
	 * has exceeded our threshold, then don't mark it active.
	 */
	if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
	    msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
	}

	return (weight);
}

/*
 * Make sure the metaslab is loaded and carries an active weight bit.
 *
 * If neither active bit is set, any in-progress load is waited for, the
 * metaslab is loaded if necessary, and it is re-sorted in its group with
 * activation_weight OR-ed into its weight. If loading fails, the metaslab
 * is re-sorted with weight 0 and the load error is returned. Returns 0 on
 * success. The caller must hold ms_lock.
 */
static int
metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
		metaslab_load_wait(msp);
		if (!msp->ms_loaded) {
			int error = metaslab_load(msp);
			if (error) {
				metaslab_group_sort(msp->ms_group, msp, 0);
				return (error);
			}
		}

		metaslab_group_sort(msp->ms_group, msp,
		    msp->ms_weight | activation_weight);
	}
	ASSERT(msp->ms_loaded);
	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);

	return (0);
}

/*
 * Re-sort the metaslab in its group with a weight of
 * MIN(ms_weight, size); the result is asserted to have no active bits set.
 */
static void
metaslab_passivate(metaslab_t *msp, uint64_t size)
{
	/*
	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
	 * this metaslab again. In that case, it had better be empty,
	 * or we would be leaving space on the table.
	 */
	ASSERT(size >= SPA_MINBLOCKSIZE || range_tree_space(msp->ms_tree) == 0);
	metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
}

/*
 * Task queue callback (dispatched by metaslab_group_preload()): load the
 * metaslab passed in arg unless it is already loaded. A load failure is
 * deliberately ignored here.
 */
static void
metaslab_preload(void *arg)
{
	metaslab_t *msp = arg;
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;

	ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));

	mutex_enter(&msp->ms_lock);
	metaslab_load_wait(msp);
	if (!msp->ms_loaded)
		(void) metaslab_load(msp);

	/*
	 * Set the ms_access_txg value so that we don't unload it right away.
	 */
	msp->ms_access_txg = spa_syncing_txg(spa) + metaslab_unload_delay + 1;
	mutex_exit(&msp->ms_lock);
}

/*
 * Dispatch preload tasks for the first metaslab_preload_limit metaslabs in
 * the group's metaslab tree, plus any metaslab with a pending forced
 * condense. If the pool is shutting down or preloading is disabled, no new
 * work is dispatched; the function only waits for the task queue.
 */
static void
metaslab_group_preload(metaslab_group_t *mg)
{
	spa_t *spa = mg->mg_vd->vdev_spa;
	metaslab_t *msp;
	avl_tree_t *t = &mg->mg_metaslab_tree;
	int m = 0;

	if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
		taskq_wait(mg->mg_taskq);
		return;
	}

	mutex_enter(&mg->mg_lock);
	/*
	 * Load the next potential metaslabs
	 */
	msp = avl_first(t);
	while (msp != NULL) {
		/* Fetched now, because mg_lock is dropped further down. */
		metaslab_t *msp_next = AVL_NEXT(t, msp);

		/*
		 * We preload only the maximum number of metaslabs specified
		 * by metaslab_preload_limit. If a metaslab is being forced
		 * to condense then we preload it too. This will ensure
		 * that force condensing happens in the next txg.
		 */
		if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
			msp = msp_next;
			continue;
		}

		/*
		 * We must drop the metaslab group lock here to preserve
		 * lock ordering with the ms_lock (when grabbing both
		 * the mg_lock and the ms_lock, the ms_lock must be taken
		 * first). As a result, it is possible that the ordering
		 * of the metaslabs within the avl tree may change before
		 * we reacquire the lock. The metaslab cannot be removed from
		 * the tree while we're in syncing context so it is safe to
		 * drop the mg_lock here. If the metaslabs are reordered
		 * nothing will break -- we just may end up loading a
		 * less than optimal one.
		 */
		mutex_exit(&mg->mg_lock);
		VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
		    msp, TQ_SLEEP) != 0);
		mutex_enter(&mg->mg_lock);
		msp = msp_next;
	}
	mutex_exit(&mg->mg_lock);
}

/*
 * Determine if the space map's on-disk footprint is past our tolerance
 * for inefficiency.
 * We would like to use the following criteria to make
 * our decision:
 *
 * 1. The size of the space map object should not dramatically increase as a
 * result of writing out the free space range tree.
 *
 * 2. The minimal on-disk space map representation is zfs_condense_pct/100
 * times the size of the free space range tree representation
 * (e.g. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
 *
 * 3. The on-disk size of the space map should actually decrease.
 *
 * Checking the first condition is tricky since we don't want to walk
 * the entire AVL tree calculating the estimated on-disk size. Instead we
 * use the size-ordered range tree in the metaslab and calculate the
 * size required to write out the largest segment in our free tree. If the
 * size required to represent that segment on disk is larger than the space
 * map object then we avoid condensing this map.
 *
 * To determine the second criterion we use a best-case estimate and assume
 * each segment can be represented on-disk as a single 64-bit entry. We refer
 * to this best-case estimate as the space map's minimal form.
 *
 * Unfortunately, we cannot compute the on-disk size of the space map in this
 * context because we cannot accurately compute the effects of compression, etc.
 * Instead, we apply the heuristic described in the block comment for
 * zfs_metaslab_condense_block_threshold - we only condense if the space used
 * is greater than a threshold number of blocks.
1756 */ 1757static boolean_t 1758metaslab_should_condense(metaslab_t *msp) 1759{ 1760 space_map_t *sm = msp->ms_sm; 1761 range_seg_t *rs; 1762 uint64_t size, entries, segsz, object_size, optimal_size, record_size; 1763 dmu_object_info_t doi; 1764 uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift; 1765 1766 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1767 ASSERT(msp->ms_loaded); 1768 1769 /* 1770 * Use the ms_size_tree range tree, which is ordered by size, to 1771 * obtain the largest segment in the free tree. We always condense 1772 * metaslabs that are empty and metaslabs for which a condense 1773 * request has been made. 1774 */ 1775 rs = avl_last(&msp->ms_size_tree); 1776 if (rs == NULL || msp->ms_condense_wanted) 1777 return (B_TRUE); 1778 1779 /* 1780 * Calculate the number of 64-bit entries this segment would 1781 * require when written to disk. If this single segment would be 1782 * larger on-disk than the entire current on-disk structure, then 1783 * clearly condensing will increase the on-disk structure size. 1784 */ 1785 size = (rs->rs_end - rs->rs_start) >> sm->sm_shift; 1786 entries = size / (MIN(size, SM_RUN_MAX)); 1787 segsz = entries * sizeof (uint64_t); 1788 1789 optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root); 1790 object_size = space_map_length(msp->ms_sm); 1791 1792 dmu_object_info_from_db(sm->sm_dbuf, &doi); 1793 record_size = MAX(doi.doi_data_block_size, vdev_blocksize); 1794 1795 return (segsz <= object_size && 1796 object_size >= (optimal_size * zfs_condense_pct / 100) && 1797 object_size > zfs_metaslab_condense_block_threshold * record_size); 1798} 1799 1800/* 1801 * Condense the on-disk space map representation to its minimized form. 1802 * The minimized form consists of a small number of allocations followed by 1803 * the entries of the free range tree. 
 *
 * Called with ms_lock held, in sync pass 1, on a loaded metaslab. The lock
 * is dropped around space_map_truncate() and held again on return.
 * ms_condense_wanted is cleared.
 */
static void
metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
{
	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
	range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK];
	range_tree_t *condense_tree;
	space_map_t *sm = msp->ms_sm;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT3U(spa_sync_pass(spa), ==, 1);
	ASSERT(msp->ms_loaded);


	spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, "
	    "smp size %llu, segments %lu, forcing condense=%s", txg,
	    msp->ms_id, msp, space_map_length(msp->ms_sm),
	    avl_numnodes(&msp->ms_tree->rt_root),
	    msp->ms_condense_wanted ? "TRUE" : "FALSE");

	msp->ms_condense_wanted = B_FALSE;

	/*
	 * Create a range tree that is 100% allocated. We remove segments
	 * that have been freed in this txg, any deferred frees that exist,
	 * and any allocation in the future. Removing segments should be
	 * a relatively inexpensive operation since we expect these trees to
	 * have a small number of nodes.
	 */
	condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock);
	range_tree_add(condense_tree, msp->ms_start, msp->ms_size);

	/*
	 * Remove what's been freed in this txg from the condense_tree.
	 * Since we're in sync_pass 1, we know that all the frees from
	 * this txg are in the freetree.
	 */
	range_tree_walk(freetree, range_tree_remove, condense_tree);

	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
		range_tree_walk(msp->ms_defertree[t],
		    range_tree_remove, condense_tree);
	}

	/* Starts at 1: only the alloctrees of later txgs are removed. */
	for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
		range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
		    range_tree_remove, condense_tree);
	}

	/*
	 * We're about to drop the metaslab's lock thus allowing
	 * other consumers to change its content. Set the
	 * metaslab's ms_condensing flag to ensure that
	 * allocations on this metaslab do not occur while we're
	 * in the middle of committing it to disk. This is only critical
	 * for the ms_tree as all other range trees use per txg
	 * views of their content.
	 */
	msp->ms_condensing = B_TRUE;

	mutex_exit(&msp->ms_lock);
	space_map_truncate(sm, tx);
	mutex_enter(&msp->ms_lock);

	/*
	 * While we would ideally like to create a space_map representation
	 * that consists only of allocation records, doing so can be
	 * prohibitively expensive because the in-core free tree can be
	 * large, and therefore computationally expensive to subtract
	 * from the condense_tree. Instead we sync out two trees, a cheap
	 * allocation only tree followed by the in-core free tree. While not
	 * optimal, this is typically close to optimal, and much cheaper to
	 * compute.
	 */
	space_map_write(sm, condense_tree, SM_ALLOC, tx);
	range_tree_vacate(condense_tree, NULL, NULL);
	range_tree_destroy(condense_tree);

	space_map_write(sm, msp->ms_tree, SM_FREE, tx);
	msp->ms_condensing = B_FALSE;
}

/*
 * Write a metaslab to disk in the context of the specified transaction group.
 *
 * This txg's allocations and frees are appended to the space map, or the
 * space map is rewritten in condensed form when metaslab_should_condense()
 * says so. The space map is allocated and opened first if it does not exist
 * yet, in which case the new object number is also written into the vdev's
 * metaslab array.
 */
void
metaslab_sync(metaslab_t *msp, uint64_t txg)
{
	metaslab_group_t *mg = msp->ms_group;
	vdev_t *vd = mg->mg_vd;
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa_meta_objset(spa);
	range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
	range_tree_t **freetree = &msp->ms_freetree[txg & TXG_MASK];
	range_tree_t **freed_tree =
	    &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
	dmu_tx_t *tx;
	/* Sampled up front; compared at the end to detect a new object. */
	uint64_t object = space_map_object(msp->ms_sm);

	ASSERT(!vd->vdev_ishole);

	/*
	 * This metaslab has just been added so there's no work to do now.
	 */
	if (*freetree == NULL) {
		ASSERT3P(alloctree, ==, NULL);
		return;
	}

	ASSERT3P(alloctree, !=, NULL);
	ASSERT3P(*freetree, !=, NULL);
	ASSERT3P(*freed_tree, !=, NULL);

	/*
	 * Normally, we don't want to process a metaslab if there
	 * are no allocations or frees to perform. However, if the metaslab
	 * is being forced to condense we need to let it through.
	 */
	if (range_tree_space(alloctree) == 0 &&
	    range_tree_space(*freetree) == 0 &&
	    !msp->ms_condense_wanted)
		return;

	/*
	 * The only state that can actually be changing concurrently with
	 * metaslab_sync() is the metaslab's ms_tree. No other thread can
	 * be modifying this txg's alloctree, freetree, freed_tree, or
	 * space_map_phys_t. Therefore, we only hold ms_lock to satisfy
	 * space_map ASSERTs. We drop it whenever we call into the DMU,
	 * because the DMU can call down to us (e.g. via zio_free()) at
	 * any time.
	 */

	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);

	/* First sync of this metaslab: allocate and open its space map. */
	if (msp->ms_sm == NULL) {
		uint64_t new_object;

		new_object = space_map_alloc(mos, tx);
		VERIFY3U(new_object, !=, 0);

		VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
		    msp->ms_start, msp->ms_size, vd->vdev_ashift,
		    &msp->ms_lock));
		ASSERT(msp->ms_sm != NULL);
	}

	mutex_enter(&msp->ms_lock);

	/*
	 * Note: metaslab_condense() clears the space_map's histogram.
	 * Therefore we must verify and remove this histogram before
	 * condensing.
	 */
	metaslab_group_histogram_verify(mg);
	metaslab_class_histogram_verify(mg->mg_class);
	metaslab_group_histogram_remove(mg, msp);

	if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
	    metaslab_should_condense(msp)) {
		metaslab_condense(msp, txg, tx);
	} else {
		space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
		space_map_write(msp->ms_sm, *freetree, SM_FREE, tx);
	}

	if (msp->ms_loaded) {
		/*
		 * When the space map is loaded, we have an accurate
		 * histogram in the range tree. This gives us an opportunity
		 * to bring the space map's histogram up-to-date so we clear
		 * it first before updating it.
		 */
		space_map_histogram_clear(msp->ms_sm);
		space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
	} else {
		/*
		 * Since the space map is not loaded we simply update the
		 * existing histogram with what was freed in this txg. This
		 * means that the on-disk histogram may not have an accurate
		 * view of the free space but it's close enough to allow
		 * us to make allocation decisions.
		 */
		space_map_histogram_add(msp->ms_sm, *freetree, tx);
	}
	metaslab_group_histogram_add(mg, msp);
	metaslab_group_histogram_verify(mg);
	metaslab_class_histogram_verify(mg->mg_class);

	/*
	 * For sync pass 1, we avoid traversing this txg's free range tree
	 * and instead will just swap the pointers for freetree and
	 * freed_tree. We can safely do this since the freed_tree is
	 * guaranteed to be empty on the initial pass.
	 */
	if (spa_sync_pass(spa) == 1) {
		range_tree_swap(freetree, freed_tree);
	} else {
		range_tree_vacate(*freetree, range_tree_add, *freed_tree);
	}
	range_tree_vacate(alloctree, NULL, NULL);

	ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
	ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));

	mutex_exit(&msp->ms_lock);

	/* A space map object was allocated above: record it on disk. */
	if (object != space_map_object(msp->ms_sm)) {
		object = space_map_object(msp->ms_sm);
		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
		    msp->ms_id, sizeof (uint64_t), &object, tx);
	}
	dmu_tx_commit(tx);
}

/*
 * Called after a transaction group has completely synced to mark
 * all of the metaslab's free space as usable.
 */
void
metaslab_sync_done(metaslab_t *msp, uint64_t txg)
{
	metaslab_group_t *mg = msp->ms_group;
	vdev_t *vd = mg->mg_vd;
	range_tree_t **freed_tree;
	range_tree_t **defer_tree;
	int64_t alloc_delta, defer_delta;

	ASSERT(!vd->vdev_ishole);

	mutex_enter(&msp->ms_lock);

	/*
	 * If this metaslab is just becoming available, initialize its
	 * alloctrees, freetrees, and defertree and add its capacity to
	 * the vdev.
2040 */ 2041 if (msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK] == NULL) { 2042 for (int t = 0; t < TXG_SIZE; t++) { 2043 ASSERT(msp->ms_alloctree[t] == NULL); 2044 ASSERT(msp->ms_freetree[t] == NULL); 2045 2046 msp->ms_alloctree[t] = range_tree_create(NULL, msp, 2047 &msp->ms_lock); 2048 msp->ms_freetree[t] = range_tree_create(NULL, msp, 2049 &msp->ms_lock); 2050 } 2051 2052 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 2053 ASSERT(msp->ms_defertree[t] == NULL); 2054 2055 msp->ms_defertree[t] = range_tree_create(NULL, msp, 2056 &msp->ms_lock); 2057 } 2058 2059 vdev_space_update(vd, 0, 0, msp->ms_size); 2060 } 2061 2062 freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK]; 2063 defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE]; 2064 2065 alloc_delta = space_map_alloc_delta(msp->ms_sm); 2066 defer_delta = range_tree_space(*freed_tree) - 2067 range_tree_space(*defer_tree); 2068 2069 vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); 2070 2071 ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); 2072 ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK])); 2073 2074 /* 2075 * If there's a metaslab_load() in progress, wait for it to complete 2076 * so that we have a consistent view of the in-core space map. 2077 */ 2078 metaslab_load_wait(msp); 2079 2080 /* 2081 * Move the frees from the defer_tree back to the free 2082 * range tree (if it's loaded). Swap the freed_tree and the 2083 * defer_tree -- this is safe to do because we've just emptied out 2084 * the defer_tree. 2085 */ 2086 range_tree_vacate(*defer_tree, 2087 msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree); 2088 range_tree_swap(freed_tree, defer_tree); 2089 2090 space_map_update(msp->ms_sm); 2091 2092 msp->ms_deferspace += defer_delta; 2093 ASSERT3S(msp->ms_deferspace, >=, 0); 2094 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); 2095 if (msp->ms_deferspace != 0) { 2096 /* 2097 * Keep syncing this metaslab until all deferred frees 2098 * are back in circulation. 
2099 */ 2100 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 2101 } 2102 2103 if (msp->ms_loaded && msp->ms_access_txg < txg) { 2104 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { 2105 VERIFY0(range_tree_space( 2106 msp->ms_alloctree[(txg + t) & TXG_MASK])); 2107 } 2108 2109 if (!metaslab_debug_unload) 2110 metaslab_unload(msp); 2111 } 2112 2113 metaslab_group_sort(mg, msp, metaslab_weight(msp)); 2114 mutex_exit(&msp->ms_lock); 2115} 2116 2117void 2118metaslab_sync_reassess(metaslab_group_t *mg) 2119{ 2120 metaslab_group_alloc_update(mg); 2121 mg->mg_fragmentation = metaslab_group_fragmentation(mg); 2122 2123 /* 2124 * Preload the next potential metaslabs 2125 */ 2126 metaslab_group_preload(mg); 2127} 2128 2129static uint64_t 2130metaslab_distance(metaslab_t *msp, dva_t *dva) 2131{ 2132 uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift; 2133 uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift; 2134 uint64_t start = msp->ms_id; 2135 2136 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) 2137 return (1ULL << 63); 2138 2139 if (offset < start) 2140 return ((start - offset) << ms_shift); 2141 if (offset > start) 2142 return ((offset - start) << ms_shift); 2143 return (0); 2144} 2145 2146static uint64_t 2147metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, 2148 uint64_t txg, uint64_t min_distance, dva_t *dva, int d) 2149{ 2150 spa_t *spa = mg->mg_vd->vdev_spa; 2151 metaslab_t *msp = NULL; 2152 uint64_t offset = -1ULL; 2153 avl_tree_t *t = &mg->mg_metaslab_tree; 2154 uint64_t activation_weight; 2155 uint64_t target_distance; 2156 int i; 2157 2158 activation_weight = METASLAB_WEIGHT_PRIMARY; 2159 for (i = 0; i < d; i++) { 2160 if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 2161 activation_weight = METASLAB_WEIGHT_SECONDARY; 2162 break; 2163 } 2164 } 2165 2166 for (;;) { 2167 boolean_t was_active; 2168 2169 mutex_enter(&mg->mg_lock); 2170 for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { 2171 if (msp->ms_weight < asize) { 2172 
spa_dbgmsg(spa, "%s: failed to meet weight " 2173 "requirement: vdev %llu, txg %llu, mg %p, " 2174 "msp %p, psize %llu, asize %llu, " 2175 "weight %llu", spa_name(spa), 2176 mg->mg_vd->vdev_id, txg, 2177 mg, msp, psize, asize, msp->ms_weight); 2178 mutex_exit(&mg->mg_lock); 2179 return (-1ULL); 2180 } 2181 2182 /* 2183 * If the selected metaslab is condensing, skip it. 2184 */ 2185 if (msp->ms_condensing) 2186 continue; 2187 2188 was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; 2189 if (activation_weight == METASLAB_WEIGHT_PRIMARY) 2190 break; 2191 2192 target_distance = min_distance + 2193 (space_map_allocated(msp->ms_sm) != 0 ? 0 : 2194 min_distance >> 1); 2195 2196 for (i = 0; i < d; i++) 2197 if (metaslab_distance(msp, &dva[i]) < 2198 target_distance) 2199 break; 2200 if (i == d) 2201 break; 2202 } 2203 mutex_exit(&mg->mg_lock); 2204 if (msp == NULL) 2205 return (-1ULL); 2206 2207 mutex_enter(&msp->ms_lock); 2208 2209 /* 2210 * Ensure that the metaslab we have selected is still 2211 * capable of handling our request. It's possible that 2212 * another thread may have changed the weight while we 2213 * were blocked on the metaslab lock. 2214 */ 2215 if (msp->ms_weight < asize || (was_active && 2216 !(msp->ms_weight & METASLAB_ACTIVE_MASK) && 2217 activation_weight == METASLAB_WEIGHT_PRIMARY)) { 2218 mutex_exit(&msp->ms_lock); 2219 continue; 2220 } 2221 2222 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) && 2223 activation_weight == METASLAB_WEIGHT_PRIMARY) { 2224 metaslab_passivate(msp, 2225 msp->ms_weight & ~METASLAB_ACTIVE_MASK); 2226 mutex_exit(&msp->ms_lock); 2227 continue; 2228 } 2229 2230 if (metaslab_activate(msp, activation_weight) != 0) { 2231 mutex_exit(&msp->ms_lock); 2232 continue; 2233 } 2234 2235 /* 2236 * If this metaslab is currently condensing then pick again as 2237 * we can't manipulate this metaslab until it's committed 2238 * to disk. 
2239 */ 2240 if (msp->ms_condensing) { 2241 mutex_exit(&msp->ms_lock); 2242 continue; 2243 } 2244 2245 if ((offset = metaslab_block_alloc(msp, asize)) != -1ULL) 2246 break; 2247 2248 metaslab_passivate(msp, metaslab_block_maxsize(msp)); 2249 mutex_exit(&msp->ms_lock); 2250 } 2251 2252 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) 2253 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); 2254 2255 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize); 2256 msp->ms_access_txg = txg + metaslab_unload_delay; 2257 2258 mutex_exit(&msp->ms_lock); 2259 2260 return (offset); 2261} 2262 2263/* 2264 * Allocate a block for the specified i/o. 2265 */ 2266static int 2267metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, 2268 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags) 2269{ 2270 metaslab_group_t *mg, *rotor; 2271 vdev_t *vd; 2272 int dshift = 3; 2273 int all_zero; 2274 int zio_lock = B_FALSE; 2275 boolean_t allocatable; 2276 uint64_t offset = -1ULL; 2277 uint64_t asize; 2278 uint64_t distance; 2279 2280 ASSERT(!DVA_IS_VALID(&dva[d])); 2281 2282 /* 2283 * For testing, make some blocks above a certain size be gang blocks. 2284 */ 2285 if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) 2286 return (SET_ERROR(ENOSPC)); 2287 2288 /* 2289 * Start at the rotor and loop through all mgs until we find something. 2290 * Note that there's no locking on mc_rotor or mc_aliquot because 2291 * nothing actually breaks if we miss a few updates -- we just won't 2292 * allocate quite as evenly. It all balances out over time. 2293 * 2294 * If we are doing ditto or log blocks, try to spread them across 2295 * consecutive vdevs. If we're forced to reuse a vdev before we've 2296 * allocated all of our ditto blocks, then try and spread them out on 2297 * that vdev as much as possible. If it turns out to not be possible, 2298 * gradually lower our standards until anything becomes acceptable. 
2299 * Also, allocating on consecutive vdevs (as opposed to random vdevs) 2300 * gives us hope of containing our fault domains to something we're 2301 * able to reason about. Otherwise, any two top-level vdev failures 2302 * will guarantee the loss of data. With consecutive allocation, 2303 * only two adjacent top-level vdev failures will result in data loss. 2304 * 2305 * If we are doing gang blocks (hintdva is non-NULL), try to keep 2306 * ourselves on the same vdev as our gang block header. That 2307 * way, we can hope for locality in vdev_cache, plus it makes our 2308 * fault domains something tractable. 2309 */ 2310 if (hintdva) { 2311 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); 2312 2313 /* 2314 * It's possible the vdev we're using as the hint no 2315 * longer exists (i.e. removed). Consult the rotor when 2316 * all else fails. 2317 */ 2318 if (vd != NULL) { 2319 mg = vd->vdev_mg; 2320 2321 if (flags & METASLAB_HINTBP_AVOID && 2322 mg->mg_next != NULL) 2323 mg = mg->mg_next; 2324 } else { 2325 mg = mc->mc_rotor; 2326 } 2327 } else if (d != 0) { 2328 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); 2329 mg = vd->vdev_mg->mg_next; 2330 } else { 2331 mg = mc->mc_rotor; 2332 } 2333 2334 /* 2335 * If the hint put us into the wrong metaslab class, or into a 2336 * metaslab group that has been passivated, just follow the rotor. 2337 */ 2338 if (mg->mg_class != mc || mg->mg_activation_count <= 0) 2339 mg = mc->mc_rotor; 2340 2341 rotor = mg; 2342top: 2343 all_zero = B_TRUE; 2344 do { 2345 ASSERT(mg->mg_activation_count == 1); 2346 2347 vd = mg->mg_vd; 2348 2349 /* 2350 * Don't allocate from faulted devices. 2351 */ 2352 if (zio_lock) { 2353 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); 2354 allocatable = vdev_allocatable(vd); 2355 spa_config_exit(spa, SCL_ZIO, FTAG); 2356 } else { 2357 allocatable = vdev_allocatable(vd); 2358 } 2359 2360 /* 2361 * Determine if the selected metaslab group is eligible 2362 * for allocations. 
If we're ganging or have requested 2363 * an allocation for the smallest gang block size 2364 * then we don't want to avoid allocating to the this 2365 * metaslab group. If we're in this condition we should 2366 * try to allocate from any device possible so that we 2367 * don't inadvertently return ENOSPC and suspend the pool 2368 * even though space is still available. 2369 */ 2370 if (allocatable && CAN_FASTGANG(flags) && 2371 psize > SPA_GANGBLOCKSIZE) 2372 allocatable = metaslab_group_allocatable(mg); 2373 2374 if (!allocatable) 2375 goto next; 2376 2377 /* 2378 * Avoid writing single-copy data to a failing vdev 2379 * unless the user instructs us that it is okay. 2380 */ 2381 if ((vd->vdev_stat.vs_write_errors > 0 || 2382 vd->vdev_state < VDEV_STATE_HEALTHY) && 2383 d == 0 && dshift == 3 && vd->vdev_children == 0) { 2384 all_zero = B_FALSE; 2385 goto next; 2386 } 2387 2388 ASSERT(mg->mg_class == mc); 2389 2390 distance = vd->vdev_asize >> dshift; 2391 if (distance <= (1ULL << vd->vdev_ms_shift)) 2392 distance = 0; 2393 else 2394 all_zero = B_FALSE; 2395 2396 asize = vdev_psize_to_asize(vd, psize); 2397 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); 2398 2399 offset = metaslab_group_alloc(mg, psize, asize, txg, distance, 2400 dva, d); 2401 if (offset != -1ULL) { 2402 /* 2403 * If we've just selected this metaslab group, 2404 * figure out whether the corresponding vdev is 2405 * over- or under-used relative to the pool, 2406 * and set an allocation bias to even it out. 2407 */ 2408 if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { 2409 vdev_stat_t *vs = &vd->vdev_stat; 2410 int64_t vu, cu; 2411 2412 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); 2413 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); 2414 2415 /* 2416 * Calculate how much more or less we should 2417 * try to allocate from this device during 2418 * this iteration around the rotor. 
2419 * For example, if a device is 80% full 2420 * and the pool is 20% full then we should 2421 * reduce allocations by 60% on this device. 2422 * 2423 * mg_bias = (20 - 80) * 512K / 100 = -307K 2424 * 2425 * This reduces allocations by 307K for this 2426 * iteration. 2427 */ 2428 mg->mg_bias = ((cu - vu) * 2429 (int64_t)mg->mg_aliquot) / 100; 2430 } else if (!metaslab_bias_enabled) { 2431 mg->mg_bias = 0; 2432 } 2433 2434 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= 2435 mg->mg_aliquot + mg->mg_bias) { 2436 mc->mc_rotor = mg->mg_next; 2437 mc->mc_aliquot = 0; 2438 } 2439 2440 DVA_SET_VDEV(&dva[d], vd->vdev_id); 2441 DVA_SET_OFFSET(&dva[d], offset); 2442 DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); 2443 DVA_SET_ASIZE(&dva[d], asize); 2444 2445 return (0); 2446 } 2447next: 2448 mc->mc_rotor = mg->mg_next; 2449 mc->mc_aliquot = 0; 2450 } while ((mg = mg->mg_next) != rotor); 2451 2452 if (!all_zero) { 2453 dshift++; 2454 ASSERT(dshift < 64); 2455 goto top; 2456 } 2457 2458 if (!allocatable && !zio_lock) { 2459 dshift = 3; 2460 zio_lock = B_TRUE; 2461 goto top; 2462 } 2463 2464 bzero(&dva[d], sizeof (dva_t)); 2465 2466 return (SET_ERROR(ENOSPC)); 2467} 2468 2469/* 2470 * Free the block represented by DVA in the context of the specified 2471 * transaction group. 
2472 */ 2473static void 2474metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now) 2475{ 2476 uint64_t vdev = DVA_GET_VDEV(dva); 2477 uint64_t offset = DVA_GET_OFFSET(dva); 2478 uint64_t size = DVA_GET_ASIZE(dva); 2479 vdev_t *vd; 2480 metaslab_t *msp; 2481 2482 ASSERT(DVA_IS_VALID(dva)); 2483 2484 if (txg > spa_freeze_txg(spa)) 2485 return; 2486 2487 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 2488 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { 2489 cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", 2490 (u_longlong_t)vdev, (u_longlong_t)offset); 2491 ASSERT(0); 2492 return; 2493 } 2494 2495 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 2496 2497 if (DVA_GET_GANG(dva)) 2498 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 2499 2500 mutex_enter(&msp->ms_lock); 2501 2502 if (now) { 2503 range_tree_remove(msp->ms_alloctree[txg & TXG_MASK], 2504 offset, size); 2505 2506 VERIFY(!msp->ms_condensing); 2507 VERIFY3U(offset, >=, msp->ms_start); 2508 VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); 2509 VERIFY3U(range_tree_space(msp->ms_tree) + size, <=, 2510 msp->ms_size); 2511 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 2512 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 2513 range_tree_add(msp->ms_tree, offset, size); 2514 } else { 2515 if (range_tree_space(msp->ms_freetree[txg & TXG_MASK]) == 0) 2516 vdev_dirty(vd, VDD_METASLAB, msp, txg); 2517 range_tree_add(msp->ms_freetree[txg & TXG_MASK], 2518 offset, size); 2519 } 2520 2521 mutex_exit(&msp->ms_lock); 2522} 2523 2524/* 2525 * Intent log support: upon opening the pool after a crash, notify the SPA 2526 * of blocks that the intent log has allocated for immediate write, but 2527 * which are still considered free by the SPA because the last transaction 2528 * group didn't commit yet. 
2529 */ 2530static int 2531metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) 2532{ 2533 uint64_t vdev = DVA_GET_VDEV(dva); 2534 uint64_t offset = DVA_GET_OFFSET(dva); 2535 uint64_t size = DVA_GET_ASIZE(dva); 2536 vdev_t *vd; 2537 metaslab_t *msp; 2538 int error = 0; 2539 2540 ASSERT(DVA_IS_VALID(dva)); 2541 2542 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 2543 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) 2544 return (SET_ERROR(ENXIO)); 2545 2546 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 2547 2548 if (DVA_GET_GANG(dva)) 2549 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 2550 2551 mutex_enter(&msp->ms_lock); 2552 2553 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) 2554 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); 2555 2556 if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size)) 2557 error = SET_ERROR(ENOENT); 2558 2559 if (error || txg == 0) { /* txg == 0 indicates dry run */ 2560 mutex_exit(&msp->ms_lock); 2561 return (error); 2562 } 2563 2564 VERIFY(!msp->ms_condensing); 2565 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); 2566 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); 2567 VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size); 2568 range_tree_remove(msp->ms_tree, offset, size); 2569 2570 if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ 2571 if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0) 2572 vdev_dirty(vd, VDD_METASLAB, msp, txg); 2573 range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size); 2574 } 2575 2576 mutex_exit(&msp->ms_lock); 2577 2578 return (0); 2579} 2580 2581int 2582metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, 2583 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags) 2584{ 2585 dva_t *dva = bp->blk_dva; 2586 dva_t *hintdva = hintbp->blk_dva; 2587 int error = 0; 2588 2589 ASSERT(bp->blk_birth == 0); 2590 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); 2591 2592 spa_config_enter(spa, SCL_ALLOC, FTAG, 
RW_READER); 2593 2594 if (mc->mc_rotor == NULL) { /* no vdevs in this class */ 2595 spa_config_exit(spa, SCL_ALLOC, FTAG); 2596 return (SET_ERROR(ENOSPC)); 2597 } 2598 2599 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); 2600 ASSERT(BP_GET_NDVAS(bp) == 0); 2601 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); 2602 2603 for (int d = 0; d < ndvas; d++) { 2604 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, 2605 txg, flags); 2606 if (error != 0) { 2607 for (d--; d >= 0; d--) { 2608 metaslab_free_dva(spa, &dva[d], txg, B_TRUE); 2609 bzero(&dva[d], sizeof (dva_t)); 2610 } 2611 spa_config_exit(spa, SCL_ALLOC, FTAG); 2612 return (error); 2613 } 2614 } 2615 ASSERT(error == 0); 2616 ASSERT(BP_GET_NDVAS(bp) == ndvas); 2617 2618 spa_config_exit(spa, SCL_ALLOC, FTAG); 2619 2620 BP_SET_BIRTH(bp, txg, txg); 2621 2622 return (0); 2623} 2624 2625void 2626metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) 2627{ 2628 const dva_t *dva = bp->blk_dva; 2629 int ndvas = BP_GET_NDVAS(bp); 2630 2631 ASSERT(!BP_IS_HOLE(bp)); 2632 ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); 2633 2634 spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); 2635 2636 for (int d = 0; d < ndvas; d++) 2637 metaslab_free_dva(spa, &dva[d], txg, now); 2638 2639 spa_config_exit(spa, SCL_FREE, FTAG); 2640} 2641 2642int 2643metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) 2644{ 2645 const dva_t *dva = bp->blk_dva; 2646 int ndvas = BP_GET_NDVAS(bp); 2647 int error = 0; 2648 2649 ASSERT(!BP_IS_HOLE(bp)); 2650 2651 if (txg != 0) { 2652 /* 2653 * First do a dry run to make sure all DVAs are claimable, 2654 * so we don't have to unwind from partial failures below. 
2655 */ 2656 if ((error = metaslab_claim(spa, bp, 0)) != 0) 2657 return (error); 2658 } 2659 2660 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 2661 2662 for (int d = 0; d < ndvas; d++) 2663 if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) 2664 break; 2665 2666 spa_config_exit(spa, SCL_ALLOC, FTAG); 2667 2668 ASSERT(error == 0 || txg == 0); 2669 2670 return (error); 2671} 2672 2673void 2674metaslab_check_free(spa_t *spa, const blkptr_t *bp) 2675{ 2676 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) 2677 return; 2678 2679 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 2680 for (int i = 0; i < BP_GET_NDVAS(bp); i++) { 2681 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]); 2682 vdev_t *vd = vdev_lookup_top(spa, vdev); 2683 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); 2684 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); 2685 metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 2686 2687 if (msp->ms_loaded) 2688 range_tree_verify(msp->ms_tree, offset, size); 2689 2690 for (int j = 0; j < TXG_SIZE; j++) 2691 range_tree_verify(msp->ms_freetree[j], offset, size); 2692 for (int j = 0; j < TXG_DEFER_SIZE; j++) 2693 range_tree_verify(msp->ms_defertree[j], offset, size); 2694 } 2695 spa_config_exit(spa, SCL_VDEV, FTAG); 2696} 2697