metaslab.c revision 260768
1216154Snwhitehorn/* 2216154Snwhitehorn * CDDL HEADER START 3216154Snwhitehorn * 4216154Snwhitehorn * The contents of this file are subject to the terms of the 5216154Snwhitehorn * Common Development and Distribution License (the "License"). 6216154Snwhitehorn * You may not use this file except in compliance with the License. 7216154Snwhitehorn * 8216154Snwhitehorn * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9216154Snwhitehorn * or http://www.opensolaris.org/os/licensing. 10216154Snwhitehorn * See the License for the specific language governing permissions 11216154Snwhitehorn * and limitations under the License. 12216154Snwhitehorn * 13216154Snwhitehorn * When distributing Covered Code, include this CDDL HEADER in each 14216154Snwhitehorn * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15216154Snwhitehorn * If applicable, add the following below this CDDL HEADER, with the 16216154Snwhitehorn * fields enclosed by brackets "[]" replaced with your own identifying 17216154Snwhitehorn * information: Portions Copyright [yyyy] [name of copyright owner] 18216154Snwhitehorn * 19216154Snwhitehorn * CDDL HEADER END 20216154Snwhitehorn */ 21216154Snwhitehorn/* 22216154Snwhitehorn * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23216154Snwhitehorn * Copyright (c) 2013 by Delphix. All rights reserved. 24216154Snwhitehorn * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 25216154Snwhitehorn */ 26216154Snwhitehorn 27216154Snwhitehorn#include <sys/zfs_context.h> 28216154Snwhitehorn#include <sys/dmu.h> 29216154Snwhitehorn#include <sys/dmu_tx.h> 30216154Snwhitehorn#include <sys/space_map.h> 31216154Snwhitehorn#include <sys/metaslab_impl.h> 32216154Snwhitehorn#include <sys/vdev_impl.h> 33216154Snwhitehorn#include <sys/zio.h> 34216154Snwhitehorn 35216154SnwhitehornSYSCTL_DECL(_vfs_zfs); 36216154SnwhitehornSYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab"); 37216154Snwhitehorn 38216154Snwhitehorn/* 39216154Snwhitehorn * Allow allocations to switch to gang blocks quickly. We do this to 40216154Snwhitehorn * avoid having to load lots of space_maps in a given txg. There are, 41216154Snwhitehorn * however, some cases where we want to avoid "fast" ganging and instead 42216154Snwhitehorn * we want to do an exhaustive search of all metaslabs on this device. 43216154Snwhitehorn * Currently we don't allow any gang, zil, or dump device related allocations 44216154Snwhitehorn * to "fast" gang. 45216154Snwhitehorn */ 46216154Snwhitehorn#define CAN_FASTGANG(flags) \ 47216154Snwhitehorn (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \ 48216154Snwhitehorn METASLAB_GANG_AVOID))) 49216154Snwhitehorn 50216154Snwhitehornuint64_t metaslab_aliquot = 512ULL << 10; 51216154Snwhitehornuint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ 52216154SnwhitehornTUNABLE_QUAD("vfs.zfs.metaslab.gang_bang", &metaslab_gang_bang); 53216154SnwhitehornSYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, gang_bang, CTLFLAG_RWTUN, 54216154Snwhitehorn &metaslab_gang_bang, 0, 55 "Force gang block allocation for blocks larger than or equal to this value"); 56 57/* 58 * The in-core space map representation is more compact than its on-disk form. 59 * The zfs_condense_pct determines how much more compact the in-core 60 * space_map representation must be before we compact it on-disk. 61 * Values should be greater than or equal to 100. 62 */ 63int zfs_condense_pct = 200; 64 65/* 66 * This value defines the number of allowed allocation failures per vdev. 67 * If a device reaches this threshold in a given txg then we consider skipping 68 * allocations on that device. The value of zfs_mg_alloc_failures is computed 69 * in zio_init() unless it has been overridden in /etc/system. 70 */ 71int zfs_mg_alloc_failures = 0; 72TUNABLE_INT("vfs.zfs.mg_alloc_failures", &zfs_mg_alloc_failures); 73SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_alloc_failures, CTLFLAG_RWTUN, 74 &zfs_mg_alloc_failures, 0, 75 "Number of allowed allocation failures per vdev"); 76 77/* 78 * The zfs_mg_noalloc_threshold defines which metaslab groups should 79 * be eligible for allocation. The value is defined as a percentage of 80 * a free space. Metaslab groups that have more free space than 81 * zfs_mg_noalloc_threshold are always eligible for allocations. Once 82 * a metaslab group's free space is less than or equal to the 83 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that 84 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold. 85 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all 86 * groups are allowed to accept allocations. Gang blocks are always 87 * eligible to allocate on any metaslab group. The default value of 0 means 88 * no metaslab group will be excluded based on this criterion. 89 */ 90int zfs_mg_noalloc_threshold = 0; 91 92/* 93 * Metaslab debugging: when set, keeps all space maps in core to verify frees. 94 */ 95static int metaslab_debug = 0; 96TUNABLE_INT("vfs.zfs.metaslab.debug", &metaslab_debug); 97SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug, CTLFLAG_RWTUN, &metaslab_debug, 98 0, 99 "Metaslab debugging: when set, keeps all space maps in core to verify frees"); 100 101/* 102 * Minimum size which forces the dynamic allocator to change 103 * it's allocation strategy. Once the space map cannot satisfy 104 * an allocation of this size then it switches to using more 105 * aggressive strategy (i.e search by size rather than offset). 106 */ 107uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE; 108TUNABLE_QUAD("vfs.zfs.metaslab.df_alloc_threshold", 109 &metaslab_df_alloc_threshold); 110SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN, 111 &metaslab_df_alloc_threshold, 0, 112 "Minimum size which forces the dynamic allocator to change it's allocation strategy"); 113 114/* 115 * The minimum free space, in percent, which must be available 116 * in a space map to continue allocations in a first-fit fashion. 117 * Once the space_map's free space drops below this level we dynamically 118 * switch to using best-fit allocations. 119 */ 120int metaslab_df_free_pct = 4; 121TUNABLE_INT("vfs.zfs.metaslab.df_free_pct", &metaslab_df_free_pct); 122SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN, 123 &metaslab_df_free_pct, 0, 124 "The minimum free space, in percent, which must be available in a space map to continue allocations in a first-fit fashion"); 125 126/* 127 * A metaslab is considered "free" if it contains a contiguous 128 * segment which is greater than metaslab_min_alloc_size. 129 */ 130uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS; 131TUNABLE_QUAD("vfs.zfs.metaslab.min_alloc_size", 132 &metaslab_min_alloc_size); 133SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, min_alloc_size, CTLFLAG_RWTUN, 134 &metaslab_min_alloc_size, 0, 135 "A metaslab is considered \"free\" if it contains a contiguous segment which is greater than vfs.zfs.metaslab.min_alloc_size"); 136 137/* 138 * Max number of space_maps to prefetch. 139 */ 140int metaslab_prefetch_limit = SPA_DVAS_PER_BP; 141TUNABLE_INT("vfs.zfs.metaslab.prefetch_limit", &metaslab_prefetch_limit); 142SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, prefetch_limit, CTLFLAG_RWTUN, 143 &metaslab_prefetch_limit, 0, "Maximum number of space_maps to prefetch"); 144 145/* 146 * Percentage bonus multiplier for metaslabs that are in the bonus area. 147 */ 148int metaslab_smo_bonus_pct = 150; 149TUNABLE_INT("vfs.zfs.metaslab.smo_bonus_pct", &metaslab_smo_bonus_pct); 150SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, smo_bonus_pct, CTLFLAG_RWTUN, 151 &metaslab_smo_bonus_pct, 0, "Maximum number of space_maps to prefetch"); 152 153/* 154 * Should we be willing to write data to degraded vdevs? 155 */ 156boolean_t zfs_write_to_degraded = B_FALSE; 157SYSCTL_INT(_vfs_zfs, OID_AUTO, write_to_degraded, CTLFLAG_RWTUN, 158 &zfs_write_to_degraded, 0, "Allow writing data to degraded vdevs"); 159TUNABLE_INT("vfs.zfs.write_to_degraded", &zfs_write_to_degraded); 160 161/* 162 * ========================================================================== 163 * Metaslab classes 164 * ========================================================================== 165 */ 166metaslab_class_t * 167metaslab_class_create(spa_t *spa, space_map_ops_t *ops) 168{ 169 metaslab_class_t *mc; 170 171 mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); 172 173 mc->mc_spa = spa; 174 mc->mc_rotor = NULL; 175 mc->mc_ops = ops; 176 177 return (mc); 178} 179 180void 181metaslab_class_destroy(metaslab_class_t *mc) 182{ 183 ASSERT(mc->mc_rotor == NULL); 184 ASSERT(mc->mc_alloc == 0); 185 ASSERT(mc->mc_deferred == 0); 186 ASSERT(mc->mc_space == 0); 187 ASSERT(mc->mc_dspace == 0); 188 189 kmem_free(mc, sizeof (metaslab_class_t)); 190} 191 192int 193metaslab_class_validate(metaslab_class_t *mc) 194{ 195 metaslab_group_t *mg; 196 vdev_t *vd; 197 198 /* 199 * Must hold one of the spa_config locks. 200 */ 201 ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || 202 spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); 203 204 if ((mg = mc->mc_rotor) == NULL) 205 return (0); 206 207 do { 208 vd = mg->mg_vd; 209 ASSERT(vd->vdev_mg != NULL); 210 ASSERT3P(vd->vdev_top, ==, vd); 211 ASSERT3P(mg->mg_class, ==, mc); 212 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); 213 } while ((mg = mg->mg_next) != mc->mc_rotor); 214 215 return (0); 216} 217 218void 219metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, 220 int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) 221{ 222 atomic_add_64(&mc->mc_alloc, alloc_delta); 223 atomic_add_64(&mc->mc_deferred, defer_delta); 224 atomic_add_64(&mc->mc_space, space_delta); 225 atomic_add_64(&mc->mc_dspace, dspace_delta); 226} 227 228void 229metaslab_class_minblocksize_update(metaslab_class_t *mc) 230{ 231 metaslab_group_t *mg; 232 vdev_t *vd; 233 uint64_t minashift = UINT64_MAX; 234 235 if ((mg = mc->mc_rotor) == NULL) { 236 mc->mc_minblocksize = SPA_MINBLOCKSIZE; 237 return; 238 } 239 240 do { 241 vd = mg->mg_vd; 242 if (vd->vdev_ashift < minashift) 243 minashift = vd->vdev_ashift; 244 } while ((mg = mg->mg_next) != mc->mc_rotor); 245 246 mc->mc_minblocksize = 1ULL << minashift; 247} 248 249uint64_t 250metaslab_class_get_alloc(metaslab_class_t *mc) 251{ 252 return (mc->mc_alloc); 253} 254 255uint64_t 256metaslab_class_get_deferred(metaslab_class_t *mc) 257{ 258 return (mc->mc_deferred); 259} 260 261uint64_t 262metaslab_class_get_space(metaslab_class_t *mc) 263{ 264 return (mc->mc_space); 265} 266 267uint64_t 268metaslab_class_get_dspace(metaslab_class_t *mc) 269{ 270 return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); 271} 272 273uint64_t 274metaslab_class_get_minblocksize(metaslab_class_t *mc) 275{ 276 return (mc->mc_minblocksize); 277} 278 279/* 280 * ========================================================================== 281 * Metaslab groups 282 * ========================================================================== 283 */ 284static int 285metaslab_compare(const void *x1, const void *x2) 286{ 287 const metaslab_t *m1 = x1; 288 const metaslab_t *m2 = x2; 289 290 if (m1->ms_weight < m2->ms_weight) 291 return (1); 292 if (m1->ms_weight > m2->ms_weight) 293 return (-1); 294 295 /* 296 * If the weights are identical, use the offset to force uniqueness. 297 */ 298 if (m1->ms_map->sm_start < m2->ms_map->sm_start) 299 return (-1); 300 if (m1->ms_map->sm_start > m2->ms_map->sm_start) 301 return (1); 302 303 ASSERT3P(m1, ==, m2); 304 305 return (0); 306} 307 308/* 309 * Update the allocatable flag and the metaslab group's capacity. 310 * The allocatable flag is set to true if the capacity is below 311 * the zfs_mg_noalloc_threshold. If a metaslab group transitions 312 * from allocatable to non-allocatable or vice versa then the metaslab 313 * group's class is updated to reflect the transition. 314 */ 315static void 316metaslab_group_alloc_update(metaslab_group_t *mg) 317{ 318 vdev_t *vd = mg->mg_vd; 319 metaslab_class_t *mc = mg->mg_class; 320 vdev_stat_t *vs = &vd->vdev_stat; 321 boolean_t was_allocatable; 322 323 ASSERT(vd == vd->vdev_top); 324 325 mutex_enter(&mg->mg_lock); 326 was_allocatable = mg->mg_allocatable; 327 328 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / 329 (vs->vs_space + 1); 330 331 mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold); 332 333 /* 334 * The mc_alloc_groups maintains a count of the number of 335 * groups in this metaslab class that are still above the 336 * zfs_mg_noalloc_threshold. This is used by the allocating 337 * threads to determine if they should avoid allocations to 338 * a given group. The allocator will avoid allocations to a group 339 * if that group has reached or is below the zfs_mg_noalloc_threshold 340 * and there are still other groups that are above the threshold. 341 * When a group transitions from allocatable to non-allocatable or 342 * vice versa we update the metaslab class to reflect that change. 343 * When the mc_alloc_groups value drops to 0 that means that all 344 * groups have reached the zfs_mg_noalloc_threshold making all groups 345 * eligible for allocations. This effectively means that all devices 346 * are balanced again. 347 */ 348 if (was_allocatable && !mg->mg_allocatable) 349 mc->mc_alloc_groups--; 350 else if (!was_allocatable && mg->mg_allocatable) 351 mc->mc_alloc_groups++; 352 mutex_exit(&mg->mg_lock); 353} 354 355metaslab_group_t * 356metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) 357{ 358 metaslab_group_t *mg; 359 360 mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); 361 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); 362 avl_create(&mg->mg_metaslab_tree, metaslab_compare, 363 sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); 364 mg->mg_vd = vd; 365 mg->mg_class = mc; 366 mg->mg_activation_count = 0; 367 368 return (mg); 369} 370 371void 372metaslab_group_destroy(metaslab_group_t *mg) 373{ 374 ASSERT(mg->mg_prev == NULL); 375 ASSERT(mg->mg_next == NULL); 376 /* 377 * We may have gone below zero with the activation count 378 * either because we never activated in the first place or 379 * because we're done, and possibly removing the vdev. 380 */ 381 ASSERT(mg->mg_activation_count <= 0); 382 383 avl_destroy(&mg->mg_metaslab_tree); 384 mutex_destroy(&mg->mg_lock); 385 kmem_free(mg, sizeof (metaslab_group_t)); 386} 387 388void 389metaslab_group_activate(metaslab_group_t *mg) 390{ 391 metaslab_class_t *mc = mg->mg_class; 392 metaslab_group_t *mgprev, *mgnext; 393 394 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); 395 396 ASSERT(mc->mc_rotor != mg); 397 ASSERT(mg->mg_prev == NULL); 398 ASSERT(mg->mg_next == NULL); 399 ASSERT(mg->mg_activation_count <= 0); 400 401 if (++mg->mg_activation_count <= 0) 402 return; 403 404 mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); 405 metaslab_group_alloc_update(mg); 406 407 if ((mgprev = mc->mc_rotor) == NULL) { 408 mg->mg_prev = mg; 409 mg->mg_next = mg; 410 } else { 411 mgnext = mgprev->mg_next; 412 mg->mg_prev = mgprev; 413 mg->mg_next = mgnext; 414 mgprev->mg_next = mg; 415 mgnext->mg_prev = mg; 416 } 417 mc->mc_rotor = mg; 418 metaslab_class_minblocksize_update(mc); 419} 420 421void 422metaslab_group_passivate(metaslab_group_t *mg) 423{ 424 metaslab_class_t *mc = mg->mg_class; 425 metaslab_group_t *mgprev, *mgnext; 426 427 ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); 428 429 if (--mg->mg_activation_count != 0) { 430 ASSERT(mc->mc_rotor != mg); 431 ASSERT(mg->mg_prev == NULL); 432 ASSERT(mg->mg_next == NULL); 433 ASSERT(mg->mg_activation_count < 0); 434 return; 435 } 436 437 mgprev = mg->mg_prev; 438 mgnext = mg->mg_next; 439 440 if (mg == mgnext) { 441 mc->mc_rotor = NULL; 442 } else { 443 mc->mc_rotor = mgnext; 444 mgprev->mg_next = mgnext; 445 mgnext->mg_prev = mgprev; 446 } 447 448 mg->mg_prev = NULL; 449 mg->mg_next = NULL; 450 metaslab_class_minblocksize_update(mc); 451} 452 453static void 454metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) 455{ 456 mutex_enter(&mg->mg_lock); 457 ASSERT(msp->ms_group == NULL); 458 msp->ms_group = mg; 459 msp->ms_weight = 0; 460 avl_add(&mg->mg_metaslab_tree, msp); 461 mutex_exit(&mg->mg_lock); 462} 463 464static void 465metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) 466{ 467 mutex_enter(&mg->mg_lock); 468 ASSERT(msp->ms_group == mg); 469 avl_remove(&mg->mg_metaslab_tree, msp); 470 msp->ms_group = NULL; 471 mutex_exit(&mg->mg_lock); 472} 473 474static void 475metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) 476{ 477 /* 478 * Although in principle the weight can be any value, in 479 * practice we do not use values in the range [1, 510]. 480 */ 481 ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0); 482 ASSERT(MUTEX_HELD(&msp->ms_lock)); 483 484 mutex_enter(&mg->mg_lock); 485 ASSERT(msp->ms_group == mg); 486 avl_remove(&mg->mg_metaslab_tree, msp); 487 msp->ms_weight = weight; 488 avl_add(&mg->mg_metaslab_tree, msp); 489 mutex_exit(&mg->mg_lock); 490} 491 492/* 493 * Determine if a given metaslab group should skip allocations. A metaslab 494 * group should avoid allocations if its used capacity has crossed the 495 * zfs_mg_noalloc_threshold and there is at least one metaslab group 496 * that can still handle allocations. 497 */ 498static boolean_t 499metaslab_group_allocatable(metaslab_group_t *mg) 500{ 501 vdev_t *vd = mg->mg_vd; 502 spa_t *spa = vd->vdev_spa; 503 metaslab_class_t *mc = mg->mg_class; 504 505 /* 506 * A metaslab group is considered allocatable if its free capacity 507 * is greater than the set value of zfs_mg_noalloc_threshold, it's 508 * associated with a slog, or there are no other metaslab groups 509 * with free capacity greater than zfs_mg_noalloc_threshold. 510 */ 511 return (mg->mg_free_capacity > zfs_mg_noalloc_threshold || 512 mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0); 513} 514 515/* 516 * ========================================================================== 517 * Common allocator routines 518 * ========================================================================== 519 */ 520static int 521metaslab_segsize_compare(const void *x1, const void *x2) 522{ 523 const space_seg_t *s1 = x1; 524 const space_seg_t *s2 = x2; 525 uint64_t ss_size1 = s1->ss_end - s1->ss_start; 526 uint64_t ss_size2 = s2->ss_end - s2->ss_start; 527 528 if (ss_size1 < ss_size2) 529 return (-1); 530 if (ss_size1 > ss_size2) 531 return (1); 532 533 if (s1->ss_start < s2->ss_start) 534 return (-1); 535 if (s1->ss_start > s2->ss_start) 536 return (1); 537 538 return (0); 539} 540 541/* 542 * This is a helper function that can be used by the allocator to find 543 * a suitable block to allocate. This will search the specified AVL 544 * tree looking for a block that matches the specified criteria. 545 */ 546static uint64_t 547metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, 548 uint64_t align) 549{ 550 space_seg_t *ss, ssearch; 551 avl_index_t where; 552 553 ssearch.ss_start = *cursor; 554 ssearch.ss_end = *cursor + size; 555 556 ss = avl_find(t, &ssearch, &where); 557 if (ss == NULL) 558 ss = avl_nearest(t, where, AVL_AFTER); 559 560 while (ss != NULL) { 561 uint64_t offset = P2ROUNDUP(ss->ss_start, align); 562 563 if (offset + size <= ss->ss_end) { 564 *cursor = offset + size; 565 return (offset); 566 } 567 ss = AVL_NEXT(t, ss); 568 } 569 570 /* 571 * If we know we've searched the whole map (*cursor == 0), give up. 572 * Otherwise, reset the cursor to the beginning and try again. 573 */ 574 if (*cursor == 0) 575 return (-1ULL); 576 577 *cursor = 0; 578 return (metaslab_block_picker(t, cursor, size, align)); 579} 580 581static void 582metaslab_pp_load(space_map_t *sm) 583{ 584 space_seg_t *ss; 585 586 ASSERT(sm->sm_ppd == NULL); 587 sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP); 588 589 sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); 590 avl_create(sm->sm_pp_root, metaslab_segsize_compare, 591 sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node)); 592 593 for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) 594 avl_add(sm->sm_pp_root, ss); 595} 596 597static void 598metaslab_pp_unload(space_map_t *sm) 599{ 600 void *cookie = NULL; 601 602 kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t)); 603 sm->sm_ppd = NULL; 604 605 while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) { 606 /* tear down the tree */ 607 } 608 609 avl_destroy(sm->sm_pp_root); 610 kmem_free(sm->sm_pp_root, sizeof (avl_tree_t)); 611 sm->sm_pp_root = NULL; 612} 613 614/* ARGSUSED */ 615static void 616metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size) 617{ 618 /* No need to update cursor */ 619} 620 621/* ARGSUSED */ 622static void 623metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size) 624{ 625 /* No need to update cursor */ 626} 627 628/* 629 * Return the maximum contiguous segment within the metaslab. 630 */ 631uint64_t 632metaslab_pp_maxsize(space_map_t *sm) 633{ 634 avl_tree_t *t = sm->sm_pp_root; 635 space_seg_t *ss; 636 637 if (t == NULL || (ss = avl_last(t)) == NULL) 638 return (0ULL); 639 640 return (ss->ss_end - ss->ss_start); 641} 642 643/* 644 * ========================================================================== 645 * The first-fit block allocator 646 * ========================================================================== 647 */ 648static uint64_t 649metaslab_ff_alloc(space_map_t *sm, uint64_t size) 650{ 651 avl_tree_t *t = &sm->sm_root; 652 uint64_t align = size & -size; 653 uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; 654 655 return (metaslab_block_picker(t, cursor, size, align)); 656} 657 658/* ARGSUSED */ 659boolean_t 660metaslab_ff_fragmented(space_map_t *sm) 661{ 662 return (B_TRUE); 663} 664 665static space_map_ops_t metaslab_ff_ops = { 666 metaslab_pp_load, 667 metaslab_pp_unload, 668 metaslab_ff_alloc, 669 metaslab_pp_claim, 670 metaslab_pp_free, 671 metaslab_pp_maxsize, 672 metaslab_ff_fragmented 673}; 674 675/* 676 * ========================================================================== 677 * Dynamic block allocator - 678 * Uses the first fit allocation scheme until space get low and then 679 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold 680 * and metaslab_df_free_pct to determine when to switch the allocation scheme. 681 * ========================================================================== 682 */ 683static uint64_t 684metaslab_df_alloc(space_map_t *sm, uint64_t size) 685{ 686 avl_tree_t *t = &sm->sm_root; 687 uint64_t align = size & -size; 688 uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; 689 uint64_t max_size = metaslab_pp_maxsize(sm); 690 int free_pct = sm->sm_space * 100 / sm->sm_size; 691 692 ASSERT(MUTEX_HELD(sm->sm_lock)); 693 ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); 694 695 if (max_size < size) 696 return (-1ULL); 697 698 /* 699 * If we're running low on space switch to using the size 700 * sorted AVL tree (best-fit). 701 */ 702 if (max_size < metaslab_df_alloc_threshold || 703 free_pct < metaslab_df_free_pct) { 704 t = sm->sm_pp_root; 705 *cursor = 0; 706 } 707 708 return (metaslab_block_picker(t, cursor, size, 1ULL)); 709} 710 711static boolean_t 712metaslab_df_fragmented(space_map_t *sm) 713{ 714 uint64_t max_size = metaslab_pp_maxsize(sm); 715 int free_pct = sm->sm_space * 100 / sm->sm_size; 716 717 if (max_size >= metaslab_df_alloc_threshold && 718 free_pct >= metaslab_df_free_pct) 719 return (B_FALSE); 720 721 return (B_TRUE); 722} 723 724static space_map_ops_t metaslab_df_ops = { 725 metaslab_pp_load, 726 metaslab_pp_unload, 727 metaslab_df_alloc, 728 metaslab_pp_claim, 729 metaslab_pp_free, 730 metaslab_pp_maxsize, 731 metaslab_df_fragmented 732}; 733 734/* 735 * ========================================================================== 736 * Other experimental allocators 737 * ========================================================================== 738 */ 739static uint64_t 740metaslab_cdf_alloc(space_map_t *sm, uint64_t size) 741{ 742 avl_tree_t *t = &sm->sm_root; 743 uint64_t *cursor = (uint64_t *)sm->sm_ppd; 744 uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1; 745 uint64_t max_size = metaslab_pp_maxsize(sm); 746 uint64_t rsize = size; 747 uint64_t offset = 0; 748 749 ASSERT(MUTEX_HELD(sm->sm_lock)); 750 ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); 751 752 if (max_size < size) 753 return (-1ULL); 754 755 ASSERT3U(*extent_end, >=, *cursor); 756 757 /* 758 * If we're running low on space switch to using the size 759 * sorted AVL tree (best-fit). 760 */ 761 if ((*cursor + size) > *extent_end) { 762 763 t = sm->sm_pp_root; 764 *cursor = *extent_end = 0; 765 766 if (max_size > 2 * SPA_MAXBLOCKSIZE) 767 rsize = MIN(metaslab_min_alloc_size, max_size); 768 offset = metaslab_block_picker(t, extent_end, rsize, 1ULL); 769 if (offset != -1) 770 *cursor = offset + size; 771 } else { 772 offset = metaslab_block_picker(t, cursor, rsize, 1ULL); 773 } 774 ASSERT3U(*cursor, <=, *extent_end); 775 return (offset); 776} 777 778static boolean_t 779metaslab_cdf_fragmented(space_map_t *sm) 780{ 781 uint64_t max_size = metaslab_pp_maxsize(sm); 782 783 if (max_size > (metaslab_min_alloc_size * 10)) 784 return (B_FALSE); 785 return (B_TRUE); 786} 787 788static space_map_ops_t metaslab_cdf_ops = { 789 metaslab_pp_load, 790 metaslab_pp_unload, 791 metaslab_cdf_alloc, 792 metaslab_pp_claim, 793 metaslab_pp_free, 794 metaslab_pp_maxsize, 795 metaslab_cdf_fragmented 796}; 797 798uint64_t metaslab_ndf_clump_shift = 4; 799 800static uint64_t 801metaslab_ndf_alloc(space_map_t *sm, uint64_t size) 802{ 803 avl_tree_t *t = &sm->sm_root; 804 avl_index_t where; 805 space_seg_t *ss, ssearch; 806 uint64_t hbit = highbit(size); 807 uint64_t *cursor = (uint64_t *)sm->sm_ppd + hbit - 1; 808 uint64_t max_size = metaslab_pp_maxsize(sm); 809 810 ASSERT(MUTEX_HELD(sm->sm_lock)); 811 ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); 812 813 if (max_size < size) 814 return (-1ULL); 815 816 ssearch.ss_start = *cursor; 817 ssearch.ss_end = *cursor + size; 818 819 ss = avl_find(t, &ssearch, &where); 820 if (ss == NULL || (ss->ss_start + size > ss->ss_end)) { 821 t = sm->sm_pp_root; 822 823 ssearch.ss_start = 0; 824 ssearch.ss_end = MIN(max_size, 825 1ULL << (hbit + metaslab_ndf_clump_shift)); 826 ss = avl_find(t, &ssearch, &where); 827 if (ss == NULL) 828 ss = avl_nearest(t, where, AVL_AFTER); 829 ASSERT(ss != NULL); 830 } 831 832 if (ss != NULL) { 833 if (ss->ss_start + size <= ss->ss_end) { 834 *cursor = ss->ss_start + size; 835 return (ss->ss_start); 836 } 837 } 838 return (-1ULL); 839} 840 841static boolean_t 842metaslab_ndf_fragmented(space_map_t *sm) 843{ 844 uint64_t max_size = metaslab_pp_maxsize(sm); 845 846 if (max_size > (metaslab_min_alloc_size << metaslab_ndf_clump_shift)) 847 return (B_FALSE); 848 return (B_TRUE); 849} 850 851 852static space_map_ops_t metaslab_ndf_ops = { 853 metaslab_pp_load, 854 metaslab_pp_unload, 855 metaslab_ndf_alloc, 856 metaslab_pp_claim, 857 metaslab_pp_free, 858 metaslab_pp_maxsize, 859 metaslab_ndf_fragmented 860}; 861 862space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops; 863 864/* 865 * ========================================================================== 866 * Metaslabs 867 * ========================================================================== 868 */ 869metaslab_t * 870metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, 871 uint64_t start, uint64_t size, uint64_t txg) 872{ 873 vdev_t *vd = mg->mg_vd; 874 metaslab_t *msp; 875 876 msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); 877 mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL); 878 879 msp->ms_smo_syncing = *smo; 880 881 /* 882 * We create the main space map here, but we don't create the 883 * allocmaps and freemaps until metaslab_sync_done(). This serves 884 * two purposes: it allows metaslab_sync_done() to detect the 885 * addition of new space; and for debugging, it ensures that we'd 886 * data fault on any attempt to use this metaslab before it's ready. 887 */ 888 msp->ms_map = kmem_zalloc(sizeof (space_map_t), KM_SLEEP); 889 space_map_create(msp->ms_map, start, size, 890 vd->vdev_ashift, &msp->ms_lock); 891 892 metaslab_group_add(mg, msp); 893 894 if (metaslab_debug && smo->smo_object != 0) { 895 mutex_enter(&msp->ms_lock); 896 VERIFY(space_map_load(msp->ms_map, mg->mg_class->mc_ops, 897 SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0); 898 mutex_exit(&msp->ms_lock); 899 } 900 901 /* 902 * If we're opening an existing pool (txg == 0) or creating 903 * a new one (txg == TXG_INITIAL), all space is available now. 904 * If we're adding space to an existing pool, the new space 905 * does not become available until after this txg has synced. 906 */ 907 if (txg <= TXG_INITIAL) 908 metaslab_sync_done(msp, 0); 909 910 if (txg != 0) { 911 vdev_dirty(vd, 0, NULL, txg); 912 vdev_dirty(vd, VDD_METASLAB, msp, txg); 913 } 914 915 return (msp); 916} 917 918void 919metaslab_fini(metaslab_t *msp) 920{ 921 metaslab_group_t *mg = msp->ms_group; 922 923 vdev_space_update(mg->mg_vd, 924 -msp->ms_smo.smo_alloc, 0, -msp->ms_map->sm_size); 925 926 metaslab_group_remove(mg, msp); 927 928 mutex_enter(&msp->ms_lock); 929 930 space_map_unload(msp->ms_map); 931 space_map_destroy(msp->ms_map); 932 kmem_free(msp->ms_map, sizeof (*msp->ms_map)); 933 934 for (int t = 0; t < TXG_SIZE; t++) { 935 space_map_destroy(msp->ms_allocmap[t]); 936 space_map_destroy(msp->ms_freemap[t]); 937 kmem_free(msp->ms_allocmap[t], sizeof (*msp->ms_allocmap[t])); 938 kmem_free(msp->ms_freemap[t], sizeof (*msp->ms_freemap[t])); 939 } 940 941 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 942 space_map_destroy(msp->ms_defermap[t]); 943 kmem_free(msp->ms_defermap[t], sizeof (*msp->ms_defermap[t])); 944 } 945 946 ASSERT0(msp->ms_deferspace); 947 948 mutex_exit(&msp->ms_lock); 949 mutex_destroy(&msp->ms_lock); 950 951 kmem_free(msp, sizeof (metaslab_t)); 952} 953 954#define METASLAB_WEIGHT_PRIMARY (1ULL << 63) 955#define METASLAB_WEIGHT_SECONDARY (1ULL << 62) 956#define METASLAB_ACTIVE_MASK \ 957 (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) 958 959static uint64_t 960metaslab_weight(metaslab_t *msp) 961{ 962 metaslab_group_t *mg = msp->ms_group; 963 space_map_t *sm = msp->ms_map; 964 space_map_obj_t *smo = &msp->ms_smo; 965 vdev_t *vd = mg->mg_vd; 966 uint64_t weight, space; 967 968 ASSERT(MUTEX_HELD(&msp->ms_lock)); 969 970 /* 971 * This vdev is in the process of being removed so there is nothing 972 * for us to do here. 973 */ 974 if (vd->vdev_removing) { 975 ASSERT0(smo->smo_alloc); 976 ASSERT0(vd->vdev_ms_shift); 977 return (0); 978 } 979 980 /* 981 * The baseline weight is the metaslab's free space. 982 */ 983 space = sm->sm_size - smo->smo_alloc; 984 weight = space; 985 986 /* 987 * Modern disks have uniform bit density and constant angular velocity. 988 * Therefore, the outer recording zones are faster (higher bandwidth) 989 * than the inner zones by the ratio of outer to inner track diameter, 990 * which is typically around 2:1. We account for this by assigning 991 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x). 992 * In effect, this means that we'll select the metaslab with the most 993 * free bandwidth rather than simply the one with the most free space. 994 */ 995 weight = 2 * weight - 996 ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count; 997 ASSERT(weight >= space && weight <= 2 * space); 998 999 /* 1000 * For locality, assign higher weight to metaslabs which have 1001 * a lower offset than what we've already activated. 1002 */ 1003 if (sm->sm_start <= mg->mg_bonus_area) 1004 weight *= (metaslab_smo_bonus_pct / 100); 1005 ASSERT(weight >= space && 1006 weight <= 2 * (metaslab_smo_bonus_pct / 100) * space); 1007 1008 if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) { 1009 /* 1010 * If this metaslab is one we're actively using, adjust its 1011 * weight to make it preferable to any inactive metaslab so 1012 * we'll polish it off. 1013 */ 1014 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); 1015 } 1016 return (weight); 1017} 1018 1019static void 1020metaslab_prefetch(metaslab_group_t *mg) 1021{ 1022 spa_t *spa = mg->mg_vd->vdev_spa; 1023 metaslab_t *msp; 1024 avl_tree_t *t = &mg->mg_metaslab_tree; 1025 int m; 1026 1027 mutex_enter(&mg->mg_lock); 1028 1029 /* 1030 * Prefetch the next potential metaslabs 1031 */ 1032 for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) { 1033 space_map_t *sm = msp->ms_map; 1034 space_map_obj_t *smo = &msp->ms_smo; 1035 1036 /* If we have reached our prefetch limit then we're done */ 1037 if (m >= metaslab_prefetch_limit) 1038 break; 1039 1040 if (!sm->sm_loaded && smo->smo_object != 0) { 1041 mutex_exit(&mg->mg_lock); 1042 dmu_prefetch(spa_meta_objset(spa), smo->smo_object, 1043 0ULL, smo->smo_objsize); 1044 mutex_enter(&mg->mg_lock); 1045 } 1046 } 1047 mutex_exit(&mg->mg_lock); 1048} 1049 1050static int 1051metaslab_activate(metaslab_t *msp, uint64_t activation_weight) 1052{ 1053 metaslab_group_t *mg = msp->ms_group; 1054 space_map_t *sm = msp->ms_map; 1055 space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops; 1056 1057 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1058 1059 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { 1060 space_map_load_wait(sm); 1061 if (!sm->sm_loaded) { 1062 space_map_obj_t *smo = &msp->ms_smo; 1063 1064 int error = space_map_load(sm, sm_ops, SM_FREE, smo, 1065 spa_meta_objset(msp->ms_group->mg_vd->vdev_spa)); 1066 if (error) { 1067 metaslab_group_sort(msp->ms_group, msp, 0); 1068 return (error); 1069 } 1070 for (int t = 0; t < TXG_DEFER_SIZE; t++) 1071 space_map_walk(msp->ms_defermap[t], 1072 space_map_claim, sm); 1073 1074 } 1075 1076 /* 1077 * Track the bonus area as we activate new metaslabs. 1078 */ 1079 if (sm->sm_start > mg->mg_bonus_area) { 1080 mutex_enter(&mg->mg_lock); 1081 mg->mg_bonus_area = sm->sm_start; 1082 mutex_exit(&mg->mg_lock); 1083 } 1084 1085 metaslab_group_sort(msp->ms_group, msp, 1086 msp->ms_weight | activation_weight); 1087 } 1088 ASSERT(sm->sm_loaded); 1089 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); 1090 1091 return (0); 1092} 1093 1094static void 1095metaslab_passivate(metaslab_t *msp, uint64_t size) 1096{ 1097 /* 1098 * If size < SPA_MINBLOCKSIZE, then we will not allocate from 1099 * this metaslab again. In that case, it had better be empty, 1100 * or we would be leaving space on the table. 1101 */ 1102 ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map->sm_space == 0); 1103 metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size)); 1104 ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0); 1105} 1106 1107/* 1108 * Determine if the in-core space map representation can be condensed on-disk. 1109 * We would like to use the following criteria to make our decision: 1110 * 1111 * 1. The size of the space map object should not dramatically increase as a 1112 * result of writing out our in-core free map. 1113 * 1114 * 2. The minimal on-disk space map representation is zfs_condense_pct/100 1115 * times the size than the in-core representation (i.e. zfs_condense_pct = 110 1116 * and in-core = 1MB, minimal = 1.1.MB). 1117 * 1118 * Checking the first condition is tricky since we don't want to walk 1119 * the entire AVL tree calculating the estimated on-disk size. Instead we 1120 * use the size-ordered AVL tree in the space map and calculate the 1121 * size required for the largest segment in our in-core free map. If the 1122 * size required to represent that segment on disk is larger than the space 1123 * map object then we avoid condensing this map. 1124 * 1125 * To determine the second criterion we use a best-case estimate and assume 1126 * each segment can be represented on-disk as a single 64-bit entry. We refer 1127 * to this best-case estimate as the space map's minimal form. 1128 */ 1129static boolean_t 1130metaslab_should_condense(metaslab_t *msp) 1131{ 1132 space_map_t *sm = msp->ms_map; 1133 space_map_obj_t *smo = &msp->ms_smo_syncing; 1134 space_seg_t *ss; 1135 uint64_t size, entries, segsz; 1136 1137 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1138 ASSERT(sm->sm_loaded); 1139 1140 /* 1141 * Use the sm_pp_root AVL tree, which is ordered by size, to obtain 1142 * the largest segment in the in-core free map. If the tree is 1143 * empty then we should condense the map. 1144 */ 1145 ss = avl_last(sm->sm_pp_root); 1146 if (ss == NULL) 1147 return (B_TRUE); 1148 1149 /* 1150 * Calculate the number of 64-bit entries this segment would 1151 * require when written to disk. If this single segment would be 1152 * larger on-disk than the entire current on-disk structure, then 1153 * clearly condensing will increase the on-disk structure size. 1154 */ 1155 size = (ss->ss_end - ss->ss_start) >> sm->sm_shift; 1156 entries = size / (MIN(size, SM_RUN_MAX)); 1157 segsz = entries * sizeof (uint64_t); 1158 1159 return (segsz <= smo->smo_objsize && 1160 smo->smo_objsize >= (zfs_condense_pct * 1161 sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) / 100); 1162} 1163 1164/* 1165 * Condense the on-disk space map representation to its minimized form. 1166 * The minimized form consists of a small number of allocations followed by 1167 * the in-core free map. 1168 */ 1169static void 1170metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) 1171{ 1172 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 1173 space_map_t *freemap = msp->ms_freemap[txg & TXG_MASK]; 1174 space_map_t condense_map; 1175 space_map_t *sm = msp->ms_map; 1176 objset_t *mos = spa_meta_objset(spa); 1177 space_map_obj_t *smo = &msp->ms_smo_syncing; 1178 1179 ASSERT(MUTEX_HELD(&msp->ms_lock)); 1180 ASSERT3U(spa_sync_pass(spa), ==, 1); 1181 ASSERT(sm->sm_loaded); 1182 1183 spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, " 1184 "smo size %llu, segments %lu", txg, 1185 (msp->ms_map->sm_start / msp->ms_map->sm_size), msp, 1186 smo->smo_objsize, avl_numnodes(&sm->sm_root)); 1187 1188 /* 1189 * Create an map that is a 100% allocated map. We remove segments 1190 * that have been freed in this txg, any deferred frees that exist, 1191 * and any allocation in the future. Removing segments should be 1192 * a relatively inexpensive operation since we expect these maps to 1193 * a small number of nodes. 1194 */ 1195 space_map_create(&condense_map, sm->sm_start, sm->sm_size, 1196 sm->sm_shift, sm->sm_lock); 1197 space_map_add(&condense_map, condense_map.sm_start, 1198 condense_map.sm_size); 1199 1200 /* 1201 * Remove what's been freed in this txg from the condense_map. 1202 * Since we're in sync_pass 1, we know that all the frees from 1203 * this txg are in the freemap. 1204 */ 1205 space_map_walk(freemap, space_map_remove, &condense_map); 1206 1207 for (int t = 0; t < TXG_DEFER_SIZE; t++) 1208 space_map_walk(msp->ms_defermap[t], 1209 space_map_remove, &condense_map); 1210 1211 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) 1212 space_map_walk(msp->ms_allocmap[(txg + t) & TXG_MASK], 1213 space_map_remove, &condense_map); 1214 1215 /* 1216 * We're about to drop the metaslab's lock thus allowing 1217 * other consumers to change it's content. Set the 1218 * space_map's sm_condensing flag to ensure that 1219 * allocations on this metaslab do not occur while we're 1220 * in the middle of committing it to disk. This is only critical 1221 * for the ms_map as all other space_maps use per txg 1222 * views of their content. 1223 */ 1224 sm->sm_condensing = B_TRUE; 1225 1226 mutex_exit(&msp->ms_lock); 1227 space_map_truncate(smo, mos, tx); 1228 mutex_enter(&msp->ms_lock); 1229 1230 /* 1231 * While we would ideally like to create a space_map representation 1232 * that consists only of allocation records, doing so can be 1233 * prohibitively expensive because the in-core free map can be 1234 * large, and therefore computationally expensive to subtract 1235 * from the condense_map. Instead we sync out two maps, a cheap 1236 * allocation only map followed by the in-core free map. While not 1237 * optimal, this is typically close to optimal, and much cheaper to 1238 * compute. 1239 */ 1240 space_map_sync(&condense_map, SM_ALLOC, smo, mos, tx); 1241 space_map_vacate(&condense_map, NULL, NULL); 1242 space_map_destroy(&condense_map); 1243 1244 space_map_sync(sm, SM_FREE, smo, mos, tx); 1245 sm->sm_condensing = B_FALSE; 1246 1247 spa_dbgmsg(spa, "condensed: txg %llu, msp[%llu] %p, " 1248 "smo size %llu", txg, 1249 (msp->ms_map->sm_start / msp->ms_map->sm_size), msp, 1250 smo->smo_objsize); 1251} 1252 1253/* 1254 * Write a metaslab to disk in the context of the specified transaction group. 1255 */ 1256void 1257metaslab_sync(metaslab_t *msp, uint64_t txg) 1258{ 1259 vdev_t *vd = msp->ms_group->mg_vd; 1260 spa_t *spa = vd->vdev_spa; 1261 objset_t *mos = spa_meta_objset(spa); 1262 space_map_t *allocmap = msp->ms_allocmap[txg & TXG_MASK]; 1263 space_map_t **freemap = &msp->ms_freemap[txg & TXG_MASK]; 1264 space_map_t **freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; 1265 space_map_t *sm = msp->ms_map; 1266 space_map_obj_t *smo = &msp->ms_smo_syncing; 1267 dmu_buf_t *db; 1268 dmu_tx_t *tx; 1269 1270 ASSERT(!vd->vdev_ishole); 1271 1272 /* 1273 * This metaslab has just been added so there's no work to do now. 1274 */ 1275 if (*freemap == NULL) { 1276 ASSERT3P(allocmap, ==, NULL); 1277 return; 1278 } 1279 1280 ASSERT3P(allocmap, !=, NULL); 1281 ASSERT3P(*freemap, !=, NULL); 1282 ASSERT3P(*freed_map, !=, NULL); 1283 1284 if (allocmap->sm_space == 0 && (*freemap)->sm_space == 0) 1285 return; 1286 1287 /* 1288 * The only state that can actually be changing concurrently with 1289 * metaslab_sync() is the metaslab's ms_map. No other thread can 1290 * be modifying this txg's allocmap, freemap, freed_map, or smo. 1291 * Therefore, we only hold ms_lock to satify space_map ASSERTs. 1292 * We drop it whenever we call into the DMU, because the DMU 1293 * can call down to us (e.g. via zio_free()) at any time. 1294 */ 1295 1296 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 1297 1298 if (smo->smo_object == 0) { 1299 ASSERT(smo->smo_objsize == 0); 1300 ASSERT(smo->smo_alloc == 0); 1301 smo->smo_object = dmu_object_alloc(mos, 1302 DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT, 1303 DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx); 1304 ASSERT(smo->smo_object != 0); 1305 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * 1306 (sm->sm_start >> vd->vdev_ms_shift), 1307 sizeof (uint64_t), &smo->smo_object, tx); 1308 } 1309 1310 mutex_enter(&msp->ms_lock); 1311 1312 if (sm->sm_loaded && spa_sync_pass(spa) == 1 && 1313 metaslab_should_condense(msp)) { 1314 metaslab_condense(msp, txg, tx); 1315 } else { 1316 space_map_sync(allocmap, SM_ALLOC, smo, mos, tx); 1317 space_map_sync(*freemap, SM_FREE, smo, mos, tx); 1318 } 1319 1320 space_map_vacate(allocmap, NULL, NULL); 1321 1322 /* 1323 * For sync pass 1, we avoid walking the entire space map and 1324 * instead will just swap the pointers for freemap and 1325 * freed_map. We can safely do this since the freed_map is 1326 * guaranteed to be empty on the initial pass. 1327 */ 1328 if (spa_sync_pass(spa) == 1) { 1329 ASSERT0((*freed_map)->sm_space); 1330 ASSERT0(avl_numnodes(&(*freed_map)->sm_root)); 1331 space_map_swap(freemap, freed_map); 1332 } else { 1333 space_map_vacate(*freemap, space_map_add, *freed_map); 1334 } 1335 1336 ASSERT0(msp->ms_allocmap[txg & TXG_MASK]->sm_space); 1337 ASSERT0(msp->ms_freemap[txg & TXG_MASK]->sm_space); 1338 1339 mutex_exit(&msp->ms_lock); 1340 1341 VERIFY0(dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); 1342 dmu_buf_will_dirty(db, tx); 1343 ASSERT3U(db->db_size, >=, sizeof (*smo)); 1344 bcopy(smo, db->db_data, sizeof (*smo)); 1345 dmu_buf_rele(db, FTAG); 1346 1347 dmu_tx_commit(tx); 1348} 1349 1350/* 1351 * Called after a transaction group has completely synced to mark 1352 * all of the metaslab's free space as usable. 1353 */ 1354void 1355metaslab_sync_done(metaslab_t *msp, uint64_t txg) 1356{ 1357 space_map_obj_t *smo = &msp->ms_smo; 1358 space_map_obj_t *smosync = &msp->ms_smo_syncing; 1359 space_map_t *sm = msp->ms_map; 1360 space_map_t **freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; 1361 space_map_t **defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE]; 1362 metaslab_group_t *mg = msp->ms_group; 1363 vdev_t *vd = mg->mg_vd; 1364 int64_t alloc_delta, defer_delta; 1365 1366 ASSERT(!vd->vdev_ishole); 1367 1368 mutex_enter(&msp->ms_lock); 1369 1370 /* 1371 * If this metaslab is just becoming available, initialize its 1372 * allocmaps, freemaps, and defermap and add its capacity to the vdev. 1373 */ 1374 if (*freed_map == NULL) { 1375 ASSERT(*defer_map == NULL); 1376 for (int t = 0; t < TXG_SIZE; t++) { 1377 msp->ms_allocmap[t] = kmem_zalloc(sizeof (space_map_t), 1378 KM_SLEEP); 1379 space_map_create(msp->ms_allocmap[t], sm->sm_start, 1380 sm->sm_size, sm->sm_shift, sm->sm_lock); 1381 msp->ms_freemap[t] = kmem_zalloc(sizeof (space_map_t), 1382 KM_SLEEP); 1383 space_map_create(msp->ms_freemap[t], sm->sm_start, 1384 sm->sm_size, sm->sm_shift, sm->sm_lock); 1385 } 1386 1387 for (int t = 0; t < TXG_DEFER_SIZE; t++) { 1388 msp->ms_defermap[t] = kmem_zalloc(sizeof (space_map_t), 1389 KM_SLEEP); 1390 space_map_create(msp->ms_defermap[t], sm->sm_start, 1391 sm->sm_size, sm->sm_shift, sm->sm_lock); 1392 } 1393 1394 freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; 1395 defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE]; 1396 1397 vdev_space_update(vd, 0, 0, sm->sm_size); 1398 } 1399 1400 alloc_delta = smosync->smo_alloc - smo->smo_alloc; 1401 defer_delta = (*freed_map)->sm_space - (*defer_map)->sm_space; 1402 1403 vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); 1404 1405 ASSERT(msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0); 1406 ASSERT(msp->ms_freemap[txg & TXG_MASK]->sm_space == 0); 1407 1408 /* 1409 * If there's a space_map_load() in progress, wait for it to complete 1410 * so that we have a consistent view of the in-core space map. 1411 */ 1412 space_map_load_wait(sm); 1413 1414 /* 1415 * Move the frees from the defer_map to this map (if it's loaded). 1416 * Swap the freed_map and the defer_map -- this is safe to do 1417 * because we've just emptied out the defer_map. 1418 */ 1419 space_map_vacate(*defer_map, sm->sm_loaded ? space_map_free : NULL, sm); 1420 ASSERT0((*defer_map)->sm_space); 1421 ASSERT0(avl_numnodes(&(*defer_map)->sm_root)); 1422 space_map_swap(freed_map, defer_map); 1423 1424 *smo = *smosync; 1425 1426 msp->ms_deferspace += defer_delta; 1427 ASSERT3S(msp->ms_deferspace, >=, 0); 1428 ASSERT3S(msp->ms_deferspace, <=, sm->sm_size); 1429 if (msp->ms_deferspace != 0) { 1430 /* 1431 * Keep syncing this metaslab until all deferred frees 1432 * are back in circulation. 1433 */ 1434 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 1435 } 1436 1437 /* 1438 * If the map is loaded but no longer active, evict it as soon as all 1439 * future allocations have synced. (If we unloaded it now and then 1440 * loaded a moment later, the map wouldn't reflect those allocations.) 1441 */ 1442 if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { 1443 int evictable = 1; 1444 1445 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) 1446 if (msp->ms_allocmap[(txg + t) & TXG_MASK]->sm_space) 1447 evictable = 0; 1448 1449 if (evictable && !metaslab_debug) 1450 space_map_unload(sm); 1451 } 1452 1453 metaslab_group_sort(mg, msp, metaslab_weight(msp)); 1454 1455 mutex_exit(&msp->ms_lock); 1456} 1457 1458void 1459metaslab_sync_reassess(metaslab_group_t *mg) 1460{ 1461 vdev_t *vd = mg->mg_vd; 1462 int64_t failures = mg->mg_alloc_failures; 1463 1464 metaslab_group_alloc_update(mg); 1465 1466 /* 1467 * Re-evaluate all metaslabs which have lower offsets than the 1468 * bonus area. 1469 */ 1470 for (int m = 0; m < vd->vdev_ms_count; m++) { 1471 metaslab_t *msp = vd->vdev_ms[m]; 1472 1473 if (msp->ms_map->sm_start > mg->mg_bonus_area) 1474 break; 1475 1476 mutex_enter(&msp->ms_lock); 1477 metaslab_group_sort(mg, msp, metaslab_weight(msp)); 1478 mutex_exit(&msp->ms_lock); 1479 } 1480 1481 atomic_add_64(&mg->mg_alloc_failures, -failures); 1482 1483 /* 1484 * Prefetch the next potential metaslabs 1485 */ 1486 metaslab_prefetch(mg); 1487} 1488 1489static uint64_t 1490metaslab_distance(metaslab_t *msp, dva_t *dva) 1491{ 1492 uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift; 1493 uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift; 1494 uint64_t start = msp->ms_map->sm_start >> ms_shift; 1495 1496 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) 1497 return (1ULL << 63); 1498 1499 if (offset < start) 1500 return ((start - offset) << ms_shift); 1501 if (offset > start) 1502 return ((offset - start) << ms_shift); 1503 return (0); 1504} 1505 1506static uint64_t 1507metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, 1508 uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags) 1509{ 1510 spa_t *spa = mg->mg_vd->vdev_spa; 1511 metaslab_t *msp = NULL; 1512 uint64_t offset = -1ULL; 1513 avl_tree_t *t = &mg->mg_metaslab_tree; 1514 uint64_t activation_weight; 1515 uint64_t target_distance; 1516 int i; 1517 1518 activation_weight = METASLAB_WEIGHT_PRIMARY; 1519 for (i = 0; i < d; i++) { 1520 if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 1521 activation_weight = METASLAB_WEIGHT_SECONDARY; 1522 break; 1523 } 1524 } 1525 1526 for (;;) { 1527 boolean_t was_active; 1528 1529 mutex_enter(&mg->mg_lock); 1530 for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { 1531 if (msp->ms_weight < asize) { 1532 spa_dbgmsg(spa, "%s: failed to meet weight " 1533 "requirement: vdev %llu, txg %llu, mg %p, " 1534 "msp %p, psize %llu, asize %llu, " 1535 "failures %llu, weight %llu", 1536 spa_name(spa), mg->mg_vd->vdev_id, txg, 1537 mg, msp, psize, asize, 1538 mg->mg_alloc_failures, msp->ms_weight); 1539 mutex_exit(&mg->mg_lock); 1540 return (-1ULL); 1541 } 1542 1543 /* 1544 * If the selected metaslab is condensing, skip it. 1545 */ 1546 if (msp->ms_map->sm_condensing) 1547 continue; 1548 1549 was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; 1550 if (activation_weight == METASLAB_WEIGHT_PRIMARY) 1551 break; 1552 1553 target_distance = min_distance + 1554 (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1); 1555 1556 for (i = 0; i < d; i++) 1557 if (metaslab_distance(msp, &dva[i]) < 1558 target_distance) 1559 break; 1560 if (i == d) 1561 break; 1562 } 1563 mutex_exit(&mg->mg_lock); 1564 if (msp == NULL) 1565 return (-1ULL); 1566 1567 mutex_enter(&msp->ms_lock); 1568 1569 /* 1570 * If we've already reached the allowable number of failed 1571 * allocation attempts on this metaslab group then we 1572 * consider skipping it. We skip it only if we're allowed 1573 * to "fast" gang, the physical size is larger than 1574 * a gang block, and we're attempting to allocate from 1575 * the primary metaslab. 1576 */ 1577 if (mg->mg_alloc_failures > zfs_mg_alloc_failures && 1578 CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE && 1579 activation_weight == METASLAB_WEIGHT_PRIMARY) { 1580 spa_dbgmsg(spa, "%s: skipping metaslab group: " 1581 "vdev %llu, txg %llu, mg %p, psize %llu, " 1582 "asize %llu, failures %llu", spa_name(spa), 1583 mg->mg_vd->vdev_id, txg, mg, psize, asize, 1584 mg->mg_alloc_failures); 1585 mutex_exit(&msp->ms_lock); 1586 return (-1ULL); 1587 } 1588 1589 /* 1590 * Ensure that the metaslab we have selected is still 1591 * capable of handling our request. It's possible that 1592 * another thread may have changed the weight while we 1593 * were blocked on the metaslab lock. 1594 */ 1595 if (msp->ms_weight < asize || (was_active && 1596 !(msp->ms_weight & METASLAB_ACTIVE_MASK) && 1597 activation_weight == METASLAB_WEIGHT_PRIMARY)) { 1598 mutex_exit(&msp->ms_lock); 1599 continue; 1600 } 1601 1602 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) && 1603 activation_weight == METASLAB_WEIGHT_PRIMARY) { 1604 metaslab_passivate(msp, 1605 msp->ms_weight & ~METASLAB_ACTIVE_MASK); 1606 mutex_exit(&msp->ms_lock); 1607 continue; 1608 } 1609 1610 if (metaslab_activate(msp, activation_weight) != 0) { 1611 mutex_exit(&msp->ms_lock); 1612 continue; 1613 } 1614 1615 /* 1616 * If this metaslab is currently condensing then pick again as 1617 * we can't manipulate this metaslab until it's committed 1618 * to disk. 1619 */ 1620 if (msp->ms_map->sm_condensing) { 1621 mutex_exit(&msp->ms_lock); 1622 continue; 1623 } 1624 1625 if ((offset = space_map_alloc(msp->ms_map, asize)) != -1ULL) 1626 break; 1627 1628 atomic_inc_64(&mg->mg_alloc_failures); 1629 1630 metaslab_passivate(msp, space_map_maxsize(msp->ms_map)); 1631 1632 mutex_exit(&msp->ms_lock); 1633 } 1634 1635 if (msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0) 1636 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); 1637 1638 space_map_add(msp->ms_allocmap[txg & TXG_MASK], offset, asize); 1639 1640 mutex_exit(&msp->ms_lock); 1641 1642 return (offset); 1643} 1644 1645/* 1646 * Allocate a block for the specified i/o. 1647 */ 1648static int 1649metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, 1650 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags) 1651{ 1652 metaslab_group_t *mg, *rotor; 1653 vdev_t *vd; 1654 int dshift = 3; 1655 int all_zero; 1656 int zio_lock = B_FALSE; 1657 boolean_t allocatable; 1658 uint64_t offset = -1ULL; 1659 uint64_t asize; 1660 uint64_t distance; 1661 1662 ASSERT(!DVA_IS_VALID(&dva[d])); 1663 1664 /* 1665 * For testing, make some blocks above a certain size be gang blocks. 1666 */ 1667 if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) 1668 return (SET_ERROR(ENOSPC)); 1669 1670 /* 1671 * Start at the rotor and loop through all mgs until we find something. 1672 * Note that there's no locking on mc_rotor or mc_aliquot because 1673 * nothing actually breaks if we miss a few updates -- we just won't 1674 * allocate quite as evenly. It all balances out over time. 1675 * 1676 * If we are doing ditto or log blocks, try to spread them across 1677 * consecutive vdevs. If we're forced to reuse a vdev before we've 1678 * allocated all of our ditto blocks, then try and spread them out on 1679 * that vdev as much as possible. If it turns out to not be possible, 1680 * gradually lower our standards until anything becomes acceptable. 1681 * Also, allocating on consecutive vdevs (as opposed to random vdevs) 1682 * gives us hope of containing our fault domains to something we're 1683 * able to reason about. Otherwise, any two top-level vdev failures 1684 * will guarantee the loss of data. With consecutive allocation, 1685 * only two adjacent top-level vdev failures will result in data loss. 1686 * 1687 * If we are doing gang blocks (hintdva is non-NULL), try to keep 1688 * ourselves on the same vdev as our gang block header. That 1689 * way, we can hope for locality in vdev_cache, plus it makes our 1690 * fault domains something tractable. 1691 */ 1692 if (hintdva) { 1693 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); 1694 1695 /* 1696 * It's possible the vdev we're using as the hint no 1697 * longer exists (i.e. removed). Consult the rotor when 1698 * all else fails. 1699 */ 1700 if (vd != NULL) { 1701 mg = vd->vdev_mg; 1702 1703 if (flags & METASLAB_HINTBP_AVOID && 1704 mg->mg_next != NULL) 1705 mg = mg->mg_next; 1706 } else { 1707 mg = mc->mc_rotor; 1708 } 1709 } else if (d != 0) { 1710 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); 1711 mg = vd->vdev_mg->mg_next; 1712 } else { 1713 mg = mc->mc_rotor; 1714 } 1715 1716 /* 1717 * If the hint put us into the wrong metaslab class, or into a 1718 * metaslab group that has been passivated, just follow the rotor. 1719 */ 1720 if (mg->mg_class != mc || mg->mg_activation_count <= 0) 1721 mg = mc->mc_rotor; 1722 1723 rotor = mg; 1724top: 1725 all_zero = B_TRUE; 1726 do { 1727 ASSERT(mg->mg_activation_count == 1); 1728 1729 vd = mg->mg_vd; 1730 1731 /* 1732 * Don't allocate from faulted devices. 1733 */ 1734 if (zio_lock) { 1735 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); 1736 allocatable = vdev_allocatable(vd); 1737 spa_config_exit(spa, SCL_ZIO, FTAG); 1738 } else { 1739 allocatable = vdev_allocatable(vd); 1740 } 1741 1742 /* 1743 * Determine if the selected metaslab group is eligible 1744 * for allocations. If we're ganging or have requested 1745 * an allocation for the smallest gang block size 1746 * then we don't want to avoid allocating to the this 1747 * metaslab group. If we're in this condition we should 1748 * try to allocate from any device possible so that we 1749 * don't inadvertently return ENOSPC and suspend the pool 1750 * even though space is still available. 1751 */ 1752 if (allocatable && CAN_FASTGANG(flags) && 1753 psize > SPA_GANGBLOCKSIZE) 1754 allocatable = metaslab_group_allocatable(mg); 1755 1756 if (!allocatable) 1757 goto next; 1758 1759 /* 1760 * Avoid writing single-copy data to a failing vdev 1761 * unless the user instructs us that it is okay. 1762 */ 1763 if ((vd->vdev_stat.vs_write_errors > 0 || 1764 vd->vdev_state < VDEV_STATE_HEALTHY) && 1765 d == 0 && dshift == 3 && 1766 !(zfs_write_to_degraded && vd->vdev_state == 1767 VDEV_STATE_DEGRADED)) { 1768 all_zero = B_FALSE; 1769 goto next; 1770 } 1771 1772 ASSERT(mg->mg_class == mc); 1773 1774 distance = vd->vdev_asize >> dshift; 1775 if (distance <= (1ULL << vd->vdev_ms_shift)) 1776 distance = 0; 1777 else 1778 all_zero = B_FALSE; 1779 1780 asize = vdev_psize_to_asize(vd, psize); 1781 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); 1782 1783 offset = metaslab_group_alloc(mg, psize, asize, txg, distance, 1784 dva, d, flags); 1785 if (offset != -1ULL) { 1786 /* 1787 * If we've just selected this metaslab group, 1788 * figure out whether the corresponding vdev is 1789 * over- or under-used relative to the pool, 1790 * and set an allocation bias to even it out. 1791 */ 1792 if (mc->mc_aliquot == 0) { 1793 vdev_stat_t *vs = &vd->vdev_stat; 1794 int64_t vu, cu; 1795 1796 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); 1797 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); 1798 1799 /* 1800 * Calculate how much more or less we should 1801 * try to allocate from this device during 1802 * this iteration around the rotor. 1803 * For example, if a device is 80% full 1804 * and the pool is 20% full then we should 1805 * reduce allocations by 60% on this device. 1806 * 1807 * mg_bias = (20 - 80) * 512K / 100 = -307K 1808 * 1809 * This reduces allocations by 307K for this 1810 * iteration. 1811 */ 1812 mg->mg_bias = ((cu - vu) * 1813 (int64_t)mg->mg_aliquot) / 100; 1814 } 1815 1816 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= 1817 mg->mg_aliquot + mg->mg_bias) { 1818 mc->mc_rotor = mg->mg_next; 1819 mc->mc_aliquot = 0; 1820 } 1821 1822 DVA_SET_VDEV(&dva[d], vd->vdev_id); 1823 DVA_SET_OFFSET(&dva[d], offset); 1824 DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); 1825 DVA_SET_ASIZE(&dva[d], asize); 1826 1827 return (0); 1828 } 1829next: 1830 mc->mc_rotor = mg->mg_next; 1831 mc->mc_aliquot = 0; 1832 } while ((mg = mg->mg_next) != rotor); 1833 1834 if (!all_zero) { 1835 dshift++; 1836 ASSERT(dshift < 64); 1837 goto top; 1838 } 1839 1840 if (!allocatable && !zio_lock) { 1841 dshift = 3; 1842 zio_lock = B_TRUE; 1843 goto top; 1844 } 1845 1846 bzero(&dva[d], sizeof (dva_t)); 1847 1848 return (SET_ERROR(ENOSPC)); 1849} 1850 1851/* 1852 * Free the block represented by DVA in the context of the specified 1853 * transaction group. 1854 */ 1855static void 1856metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now) 1857{ 1858 uint64_t vdev = DVA_GET_VDEV(dva); 1859 uint64_t offset = DVA_GET_OFFSET(dva); 1860 uint64_t size = DVA_GET_ASIZE(dva); 1861 vdev_t *vd; 1862 metaslab_t *msp; 1863 1864 ASSERT(DVA_IS_VALID(dva)); 1865 1866 if (txg > spa_freeze_txg(spa)) 1867 return; 1868 1869 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 1870 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { 1871 cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", 1872 (u_longlong_t)vdev, (u_longlong_t)offset); 1873 ASSERT(0); 1874 return; 1875 } 1876 1877 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 1878 1879 if (DVA_GET_GANG(dva)) 1880 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 1881 1882 mutex_enter(&msp->ms_lock); 1883 1884 if (now) { 1885 space_map_remove(msp->ms_allocmap[txg & TXG_MASK], 1886 offset, size); 1887 space_map_free(msp->ms_map, offset, size); 1888 } else { 1889 if (msp->ms_freemap[txg & TXG_MASK]->sm_space == 0) 1890 vdev_dirty(vd, VDD_METASLAB, msp, txg); 1891 space_map_add(msp->ms_freemap[txg & TXG_MASK], offset, size); 1892 } 1893 1894 mutex_exit(&msp->ms_lock); 1895} 1896 1897/* 1898 * Intent log support: upon opening the pool after a crash, notify the SPA 1899 * of blocks that the intent log has allocated for immediate write, but 1900 * which are still considered free by the SPA because the last transaction 1901 * group didn't commit yet. 1902 */ 1903static int 1904metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) 1905{ 1906 uint64_t vdev = DVA_GET_VDEV(dva); 1907 uint64_t offset = DVA_GET_OFFSET(dva); 1908 uint64_t size = DVA_GET_ASIZE(dva); 1909 vdev_t *vd; 1910 metaslab_t *msp; 1911 int error = 0; 1912 1913 ASSERT(DVA_IS_VALID(dva)); 1914 1915 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 1916 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) 1917 return (SET_ERROR(ENXIO)); 1918 1919 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 1920 1921 if (DVA_GET_GANG(dva)) 1922 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 1923 1924 mutex_enter(&msp->ms_lock); 1925 1926 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map->sm_loaded) 1927 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); 1928 1929 if (error == 0 && !space_map_contains(msp->ms_map, offset, size)) 1930 error = SET_ERROR(ENOENT); 1931 1932 if (error || txg == 0) { /* txg == 0 indicates dry run */ 1933 mutex_exit(&msp->ms_lock); 1934 return (error); 1935 } 1936 1937 space_map_claim(msp->ms_map, offset, size); 1938 1939 if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ 1940 if (msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0) 1941 vdev_dirty(vd, VDD_METASLAB, msp, txg); 1942 space_map_add(msp->ms_allocmap[txg & TXG_MASK], offset, size); 1943 } 1944 1945 mutex_exit(&msp->ms_lock); 1946 1947 return (0); 1948} 1949 1950int 1951metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, 1952 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags) 1953{ 1954 dva_t *dva = bp->blk_dva; 1955 dva_t *hintdva = hintbp->blk_dva; 1956 int error = 0; 1957 1958 ASSERT(bp->blk_birth == 0); 1959 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); 1960 1961 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 1962 1963 if (mc->mc_rotor == NULL) { /* no vdevs in this class */ 1964 spa_config_exit(spa, SCL_ALLOC, FTAG); 1965 return (SET_ERROR(ENOSPC)); 1966 } 1967 1968 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); 1969 ASSERT(BP_GET_NDVAS(bp) == 0); 1970 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); 1971 1972 for (int d = 0; d < ndvas; d++) { 1973 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, 1974 txg, flags); 1975 if (error) { 1976 for (d--; d >= 0; d--) { 1977 metaslab_free_dva(spa, &dva[d], txg, B_TRUE); 1978 bzero(&dva[d], sizeof (dva_t)); 1979 } 1980 spa_config_exit(spa, SCL_ALLOC, FTAG); 1981 return (error); 1982 } 1983 } 1984 ASSERT(error == 0); 1985 ASSERT(BP_GET_NDVAS(bp) == ndvas); 1986 1987 spa_config_exit(spa, SCL_ALLOC, FTAG); 1988 1989 BP_SET_BIRTH(bp, txg, txg); 1990 1991 return (0); 1992} 1993 1994void 1995metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) 1996{ 1997 const dva_t *dva = bp->blk_dva; 1998 int ndvas = BP_GET_NDVAS(bp); 1999 2000 ASSERT(!BP_IS_HOLE(bp)); 2001 ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); 2002 2003 spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); 2004 2005 for (int d = 0; d < ndvas; d++) 2006 metaslab_free_dva(spa, &dva[d], txg, now); 2007 2008 spa_config_exit(spa, SCL_FREE, FTAG); 2009} 2010 2011int 2012metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) 2013{ 2014 const dva_t *dva = bp->blk_dva; 2015 int ndvas = BP_GET_NDVAS(bp); 2016 int error = 0; 2017 2018 ASSERT(!BP_IS_HOLE(bp)); 2019 2020 if (txg != 0) { 2021 /* 2022 * First do a dry run to make sure all DVAs are claimable, 2023 * so we don't have to unwind from partial failures below. 2024 */ 2025 if ((error = metaslab_claim(spa, bp, 0)) != 0) 2026 return (error); 2027 } 2028 2029 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 2030 2031 for (int d = 0; d < ndvas; d++) 2032 if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) 2033 break; 2034 2035 spa_config_exit(spa, SCL_ALLOC, FTAG); 2036 2037 ASSERT(error == 0 || txg == 0); 2038 2039 return (error); 2040} 2041 2042static void 2043checkmap(space_map_t *sm, uint64_t off, uint64_t size) 2044{ 2045 space_seg_t *ss; 2046 avl_index_t where; 2047 2048 mutex_enter(sm->sm_lock); 2049 ss = space_map_find(sm, off, size, &where); 2050 if (ss != NULL) 2051 panic("freeing free block; ss=%p", (void *)ss); 2052 mutex_exit(sm->sm_lock); 2053} 2054 2055void 2056metaslab_check_free(spa_t *spa, const blkptr_t *bp) 2057{ 2058 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) 2059 return; 2060 2061 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 2062 for (int i = 0; i < BP_GET_NDVAS(bp); i++) { 2063 uint64_t vdid = DVA_GET_VDEV(&bp->blk_dva[i]); 2064 vdev_t *vd = vdev_lookup_top(spa, vdid); 2065 uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[i]); 2066 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); 2067 metaslab_t *ms = vd->vdev_ms[off >> vd->vdev_ms_shift]; 2068 2069 if (ms->ms_map->sm_loaded) 2070 checkmap(ms->ms_map, off, size); 2071 2072 for (int j = 0; j < TXG_SIZE; j++) 2073 checkmap(ms->ms_freemap[j], off, size); 2074 for (int j = 0; j < TXG_DEFER_SIZE; j++) 2075 checkmap(ms->ms_defermap[j], off, size); 2076 } 2077 spa_config_exit(spa, SCL_VDEV, FTAG); 2078} 2079