/* ddt.c revision 290757 */
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22/* 23 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 25 */ 26 27#include <sys/zfs_context.h> 28#include <sys/spa.h> 29#include <sys/spa_impl.h> 30#include <sys/zio.h> 31#include <sys/ddt.h> 32#include <sys/zap.h> 33#include <sys/dmu_tx.h> 34#include <sys/arc.h> 35#include <sys/dsl_pool.h> 36#include <sys/zio_checksum.h> 37#include <sys/zio_compress.h> 38#include <sys/dsl_scan.h> 39 40/* 41 * Enable/disable prefetching of dedup-ed blocks which are going to be freed. 
42 */ 43int zfs_dedup_prefetch = 1; 44 45SYSCTL_DECL(_vfs_zfs); 46SYSCTL_NODE(_vfs_zfs, OID_AUTO, dedup, CTLFLAG_RW, 0, "ZFS DEDUP"); 47TUNABLE_INT("vfs.zfs.dedup.prefetch", &zfs_dedup_prefetch); 48SYSCTL_INT(_vfs_zfs_dedup, OID_AUTO, prefetch, CTLFLAG_RW, &zfs_dedup_prefetch, 49 0, "Enable/disable prefetching of dedup-ed blocks which are going to be freed"); 50 51static const ddt_ops_t *ddt_ops[DDT_TYPES] = { 52 &ddt_zap_ops, 53}; 54 55static const char *ddt_class_name[DDT_CLASSES] = { 56 "ditto", 57 "duplicate", 58 "unique", 59}; 60 61static void 62ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 63 dmu_tx_t *tx) 64{ 65 spa_t *spa = ddt->ddt_spa; 66 objset_t *os = ddt->ddt_os; 67 uint64_t *objectp = &ddt->ddt_object[type][class]; 68 boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags & 69 ZCHECKSUM_FLAG_DEDUP; 70 char name[DDT_NAMELEN]; 71 72 ddt_object_name(ddt, type, class, name); 73 74 ASSERT(*objectp == 0); 75 VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0); 76 ASSERT(*objectp != 0); 77 78 VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name, 79 sizeof (uint64_t), 1, objectp, tx) == 0); 80 81 VERIFY(zap_add(os, spa->spa_ddt_stat_object, name, 82 sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), 83 &ddt->ddt_histogram[type][class], tx) == 0); 84} 85 86static void 87ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 88 dmu_tx_t *tx) 89{ 90 spa_t *spa = ddt->ddt_spa; 91 objset_t *os = ddt->ddt_os; 92 uint64_t *objectp = &ddt->ddt_object[type][class]; 93 uint64_t count; 94 char name[DDT_NAMELEN]; 95 96 ddt_object_name(ddt, type, class, name); 97 98 ASSERT(*objectp != 0); 99 VERIFY(ddt_object_count(ddt, type, class, &count) == 0 && count == 0); 100 ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class])); 101 VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0); 102 VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0); 103 
VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0); 104 bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t)); 105 106 *objectp = 0; 107} 108 109static int 110ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class) 111{ 112 ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; 113 dmu_object_info_t doi; 114 uint64_t count; 115 char name[DDT_NAMELEN]; 116 int error; 117 118 ddt_object_name(ddt, type, class, name); 119 120 error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, 121 sizeof (uint64_t), 1, &ddt->ddt_object[type][class]); 122 123 if (error != 0) 124 return (error); 125 126 VERIFY0(zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, 127 sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), 128 &ddt->ddt_histogram[type][class])); 129 130 /* 131 * Seed the cached statistics. 132 */ 133 VERIFY(ddt_object_info(ddt, type, class, &doi) == 0); 134 135 error = ddt_object_count(ddt, type, class, &count); 136 if (error) 137 return error; 138 139 ddo->ddo_count = count; 140 ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; 141 ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; 142 143 return (0); 144} 145 146static void 147ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 148 dmu_tx_t *tx) 149{ 150 ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; 151 dmu_object_info_t doi; 152 uint64_t count; 153 char name[DDT_NAMELEN]; 154 155 ddt_object_name(ddt, type, class, name); 156 157 VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, 158 sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), 159 &ddt->ddt_histogram[type][class], tx) == 0); 160 161 /* 162 * Cache DDT statistics; this is the only time they'll change. 
163 */ 164 VERIFY(ddt_object_info(ddt, type, class, &doi) == 0); 165 VERIFY(ddt_object_count(ddt, type, class, &count) == 0); 166 167 ddo->ddo_count = count; 168 ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; 169 ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; 170} 171 172static int 173ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 174 ddt_entry_t *dde) 175{ 176 if (!ddt_object_exists(ddt, type, class)) 177 return (SET_ERROR(ENOENT)); 178 179 return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os, 180 ddt->ddt_object[type][class], dde)); 181} 182 183static void 184ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 185 ddt_entry_t *dde) 186{ 187 if (!ddt_object_exists(ddt, type, class)) 188 return; 189 190 ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os, 191 ddt->ddt_object[type][class], dde); 192} 193 194int 195ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 196 ddt_entry_t *dde, dmu_tx_t *tx) 197{ 198 ASSERT(ddt_object_exists(ddt, type, class)); 199 200 return (ddt_ops[type]->ddt_op_update(ddt->ddt_os, 201 ddt->ddt_object[type][class], dde, tx)); 202} 203 204static int 205ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 206 ddt_entry_t *dde, dmu_tx_t *tx) 207{ 208 ASSERT(ddt_object_exists(ddt, type, class)); 209 210 return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os, 211 ddt->ddt_object[type][class], dde, tx)); 212} 213 214int 215ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 216 uint64_t *walk, ddt_entry_t *dde) 217{ 218 ASSERT(ddt_object_exists(ddt, type, class)); 219 220 return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os, 221 ddt->ddt_object[type][class], dde, walk)); 222} 223 224int 225ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class, uint64_t *count) 226{ 227 ASSERT(ddt_object_exists(ddt, type, class)); 228 229 return (ddt_ops[type]->ddt_op_count(ddt->ddt_os, 230 ddt->ddt_object[type][class], count)); 231} 232 233int 
234ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 235 dmu_object_info_t *doi) 236{ 237 if (!ddt_object_exists(ddt, type, class)) 238 return (SET_ERROR(ENOENT)); 239 240 return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class], 241 doi)); 242} 243 244boolean_t 245ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class) 246{ 247 return (!!ddt->ddt_object[type][class]); 248} 249 250void 251ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 252 char *name) 253{ 254 (void) sprintf(name, DMU_POOL_DDT, 255 zio_checksum_table[ddt->ddt_checksum].ci_name, 256 ddt_ops[type]->ddt_op_name, ddt_class_name[class]); 257} 258 259void 260ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg) 261{ 262 ASSERT(txg != 0); 263 264 for (int d = 0; d < SPA_DVAS_PER_BP; d++) 265 bp->blk_dva[d] = ddp->ddp_dva[d]; 266 BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth); 267} 268 269void 270ddt_bp_create(enum zio_checksum checksum, 271 const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp) 272{ 273 BP_ZERO(bp); 274 275 if (ddp != NULL) 276 ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth); 277 278 bp->blk_cksum = ddk->ddk_cksum; 279 bp->blk_fill = 1; 280 281 BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk)); 282 BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk)); 283 BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk)); 284 BP_SET_CHECKSUM(bp, checksum); 285 BP_SET_TYPE(bp, DMU_OT_DEDUP); 286 BP_SET_LEVEL(bp, 0); 287 BP_SET_DEDUP(bp, 0); 288 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 289} 290 291void 292ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp) 293{ 294 ddk->ddk_cksum = bp->blk_cksum; 295 ddk->ddk_prop = 0; 296 297 DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp)); 298 DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp)); 299 DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp)); 300} 301 302void 303ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp) 304{ 305 ASSERT(ddp->ddp_phys_birth == 0); 306 307 for (int d = 0; d < SPA_DVAS_PER_BP; d++) 308 ddp->ddp_dva[d] = bp->blk_dva[d]; 309 ddp->ddp_phys_birth = 
BP_PHYSICAL_BIRTH(bp); 310} 311 312void 313ddt_phys_clear(ddt_phys_t *ddp) 314{ 315 bzero(ddp, sizeof (*ddp)); 316} 317 318void 319ddt_phys_addref(ddt_phys_t *ddp) 320{ 321 ddp->ddp_refcnt++; 322} 323 324void 325ddt_phys_decref(ddt_phys_t *ddp) 326{ 327 ASSERT((int64_t)ddp->ddp_refcnt > 0); 328 ddp->ddp_refcnt--; 329} 330 331void 332ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg) 333{ 334 blkptr_t blk; 335 336 ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); 337 ddt_phys_clear(ddp); 338 zio_free(ddt->ddt_spa, txg, &blk); 339} 340 341ddt_phys_t * 342ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp) 343{ 344 ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys; 345 346 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 347 if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) && 348 BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth) 349 return (ddp); 350 } 351 return (NULL); 352} 353 354uint64_t 355ddt_phys_total_refcnt(const ddt_entry_t *dde) 356{ 357 uint64_t refcnt = 0; 358 359 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) 360 refcnt += dde->dde_phys[p].ddp_refcnt; 361 362 return (refcnt); 363} 364 365static void 366ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds) 367{ 368 spa_t *spa = ddt->ddt_spa; 369 ddt_phys_t *ddp = dde->dde_phys; 370 ddt_key_t *ddk = &dde->dde_key; 371 uint64_t lsize = DDK_GET_LSIZE(ddk); 372 uint64_t psize = DDK_GET_PSIZE(ddk); 373 374 bzero(dds, sizeof (*dds)); 375 376 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 377 uint64_t dsize = 0; 378 uint64_t refcnt = ddp->ddp_refcnt; 379 380 if (ddp->ddp_phys_birth == 0) 381 continue; 382 383 for (int d = 0; d < SPA_DVAS_PER_BP; d++) 384 dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]); 385 386 dds->dds_blocks += 1; 387 dds->dds_lsize += lsize; 388 dds->dds_psize += psize; 389 dds->dds_dsize += dsize; 390 391 dds->dds_ref_blocks += refcnt; 392 dds->dds_ref_lsize += lsize * refcnt; 393 dds->dds_ref_psize += psize * refcnt; 394 dds->dds_ref_dsize += 
dsize * refcnt; 395 } 396} 397 398void 399ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg) 400{ 401 const uint64_t *s = (const uint64_t *)src; 402 uint64_t *d = (uint64_t *)dst; 403 uint64_t *d_end = (uint64_t *)(dst + 1); 404 405 ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */ 406 407 while (d < d_end) 408 *d++ += (*s++ ^ neg) - neg; 409} 410 411static void 412ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg) 413{ 414 ddt_stat_t dds; 415 ddt_histogram_t *ddh; 416 int bucket; 417 418 ddt_stat_generate(ddt, dde, &dds); 419 420 bucket = highbit64(dds.dds_ref_blocks) - 1; 421 ASSERT(bucket >= 0); 422 423 ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class]; 424 425 ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg); 426} 427 428void 429ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src) 430{ 431 for (int h = 0; h < 64; h++) 432 ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0); 433} 434 435void 436ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh) 437{ 438 bzero(dds, sizeof (*dds)); 439 440 for (int h = 0; h < 64; h++) 441 ddt_stat_add(dds, &ddh->ddh_stat[h], 0); 442} 443 444boolean_t 445ddt_histogram_empty(const ddt_histogram_t *ddh) 446{ 447 const uint64_t *s = (const uint64_t *)ddh; 448 const uint64_t *s_end = (const uint64_t *)(ddh + 1); 449 450 while (s < s_end) 451 if (*s++ != 0) 452 return (B_FALSE); 453 454 return (B_TRUE); 455} 456 457void 458ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total) 459{ 460 /* Sum the statistics we cached in ddt_object_sync(). 
*/ 461 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { 462 ddt_t *ddt = spa->spa_ddt[c]; 463 for (enum ddt_type type = 0; type < DDT_TYPES; type++) { 464 for (enum ddt_class class = 0; class < DDT_CLASSES; 465 class++) { 466 ddt_object_t *ddo = 467 &ddt->ddt_object_stats[type][class]; 468 ddo_total->ddo_count += ddo->ddo_count; 469 ddo_total->ddo_dspace += ddo->ddo_dspace; 470 ddo_total->ddo_mspace += ddo->ddo_mspace; 471 } 472 } 473 } 474 475 /* ... and compute the averages. */ 476 if (ddo_total->ddo_count != 0) { 477 ddo_total->ddo_dspace /= ddo_total->ddo_count; 478 ddo_total->ddo_mspace /= ddo_total->ddo_count; 479 } 480} 481 482void 483ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh) 484{ 485 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { 486 ddt_t *ddt = spa->spa_ddt[c]; 487 for (enum ddt_type type = 0; type < DDT_TYPES; type++) { 488 for (enum ddt_class class = 0; class < DDT_CLASSES; 489 class++) { 490 ddt_histogram_add(ddh, 491 &ddt->ddt_histogram_cache[type][class]); 492 } 493 } 494 } 495} 496 497void 498ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total) 499{ 500 ddt_histogram_t *ddh_total; 501 502 ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); 503 ddt_get_dedup_histogram(spa, ddh_total); 504 ddt_histogram_stat(dds_total, ddh_total); 505 kmem_free(ddh_total, sizeof (ddt_histogram_t)); 506} 507 508uint64_t 509ddt_get_dedup_dspace(spa_t *spa) 510{ 511 ddt_stat_t dds_total = { 0 }; 512 513 ddt_get_dedup_stats(spa, &dds_total); 514 return (dds_total.dds_ref_dsize - dds_total.dds_dsize); 515} 516 517uint64_t 518ddt_get_pool_dedup_ratio(spa_t *spa) 519{ 520 ddt_stat_t dds_total = { 0 }; 521 522 ddt_get_dedup_stats(spa, &dds_total); 523 if (dds_total.dds_dsize == 0) 524 return (100); 525 526 return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize); 527} 528 529int 530ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref) 531{ 532 spa_t *spa = ddt->ddt_spa; 533 uint64_t 
total_refcnt = 0; 534 uint64_t ditto = spa->spa_dedup_ditto; 535 int total_copies = 0; 536 int desired_copies = 0; 537 538 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 539 ddt_phys_t *ddp = &dde->dde_phys[p]; 540 zio_t *zio = dde->dde_lead_zio[p]; 541 uint64_t refcnt = ddp->ddp_refcnt; /* committed refs */ 542 if (zio != NULL) 543 refcnt += zio->io_parent_count; /* pending refs */ 544 if (ddp == ddp_willref) 545 refcnt++; /* caller's ref */ 546 if (refcnt != 0) { 547 total_refcnt += refcnt; 548 total_copies += p; 549 } 550 } 551 552 if (ditto == 0 || ditto > UINT32_MAX) 553 ditto = UINT32_MAX; 554 555 if (total_refcnt >= 1) 556 desired_copies++; 557 if (total_refcnt >= ditto) 558 desired_copies++; 559 if (total_refcnt >= ditto * ditto) 560 desired_copies++; 561 562 return (MAX(desired_copies, total_copies) - total_copies); 563} 564 565int 566ddt_ditto_copies_present(ddt_entry_t *dde) 567{ 568 ddt_phys_t *ddp = &dde->dde_phys[DDT_PHYS_DITTO]; 569 dva_t *dva = ddp->ddp_dva; 570 int copies = 0 - DVA_GET_GANG(dva); 571 572 for (int d = 0; d < SPA_DVAS_PER_BP; d++, dva++) 573 if (DVA_IS_VALID(dva)) 574 copies++; 575 576 ASSERT(copies >= 0 && copies < SPA_DVAS_PER_BP); 577 578 return (copies); 579} 580 581size_t 582ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len) 583{ 584 uchar_t *version = dst++; 585 int cpfunc = ZIO_COMPRESS_ZLE; 586 zio_compress_info_t *ci = &zio_compress_table[cpfunc]; 587 size_t c_len; 588 589 ASSERT(d_len >= s_len + 1); /* no compression plus version byte */ 590 591 c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level); 592 593 if (c_len == s_len) { 594 cpfunc = ZIO_COMPRESS_OFF; 595 bcopy(src, dst, s_len); 596 } 597 598 *version = cpfunc; 599 /* CONSTCOND */ 600 if (ZFS_HOST_BYTEORDER) 601 *version |= DDT_COMPRESS_BYTEORDER_MASK; 602 603 return (c_len + 1); 604} 605 606void 607ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len) 608{ 609 uchar_t version = *src++; 610 int cpfunc = version & 
DDT_COMPRESS_FUNCTION_MASK; 611 zio_compress_info_t *ci = &zio_compress_table[cpfunc]; 612 613 if (ci->ci_decompress != NULL) 614 (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level); 615 else 616 bcopy(src, dst, d_len); 617 618 if (((version & DDT_COMPRESS_BYTEORDER_MASK) != 0) != 619 (ZFS_HOST_BYTEORDER != 0)) 620 byteswap_uint64_array(dst, d_len); 621} 622 623ddt_t * 624ddt_select_by_checksum(spa_t *spa, enum zio_checksum c) 625{ 626 return (spa->spa_ddt[c]); 627} 628 629ddt_t * 630ddt_select(spa_t *spa, const blkptr_t *bp) 631{ 632 return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]); 633} 634 635void 636ddt_enter(ddt_t *ddt) 637{ 638 mutex_enter(&ddt->ddt_lock); 639} 640 641void 642ddt_exit(ddt_t *ddt) 643{ 644 mutex_exit(&ddt->ddt_lock); 645} 646 647static ddt_entry_t * 648ddt_alloc(const ddt_key_t *ddk) 649{ 650 ddt_entry_t *dde; 651 652 dde = kmem_zalloc(sizeof (ddt_entry_t), KM_SLEEP); 653 cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL); 654 655 dde->dde_key = *ddk; 656 657 return (dde); 658} 659 660static void 661ddt_free(ddt_entry_t *dde) 662{ 663 ASSERT(!dde->dde_loading); 664 665 for (int p = 0; p < DDT_PHYS_TYPES; p++) 666 ASSERT(dde->dde_lead_zio[p] == NULL); 667 668 if (dde->dde_repair_data != NULL) 669 zio_buf_free(dde->dde_repair_data, 670 DDK_GET_PSIZE(&dde->dde_key)); 671 672 cv_destroy(&dde->dde_cv); 673 kmem_free(dde, sizeof (*dde)); 674} 675 676void 677ddt_remove(ddt_t *ddt, ddt_entry_t *dde) 678{ 679 ASSERT(MUTEX_HELD(&ddt->ddt_lock)); 680 681 avl_remove(&ddt->ddt_tree, dde); 682 ddt_free(dde); 683} 684 685ddt_entry_t * 686ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) 687{ 688 ddt_entry_t *dde, dde_search; 689 enum ddt_type type; 690 enum ddt_class class; 691 avl_index_t where; 692 int error; 693 694 ASSERT(MUTEX_HELD(&ddt->ddt_lock)); 695 696 ddt_key_fill(&dde_search.dde_key, bp); 697 698 dde = avl_find(&ddt->ddt_tree, &dde_search, &where); 699 if (dde == NULL) { 700 if (!add) 701 return (NULL); 702 dde = 
ddt_alloc(&dde_search.dde_key); 703 avl_insert(&ddt->ddt_tree, dde, where); 704 } 705 706 while (dde->dde_loading) 707 cv_wait(&dde->dde_cv, &ddt->ddt_lock); 708 709 if (dde->dde_loaded) 710 return (dde); 711 712 dde->dde_loading = B_TRUE; 713 714 ddt_exit(ddt); 715 716 error = ENOENT; 717 718 for (type = 0; type < DDT_TYPES; type++) { 719 for (class = 0; class < DDT_CLASSES; class++) { 720 error = ddt_object_lookup(ddt, type, class, dde); 721 if (error != ENOENT) 722 break; 723 } 724 if (error != ENOENT) 725 break; 726 } 727 728 ASSERT(error == 0 || error == ENOENT); 729 730 ddt_enter(ddt); 731 732 ASSERT(dde->dde_loaded == B_FALSE); 733 ASSERT(dde->dde_loading == B_TRUE); 734 735 dde->dde_type = type; /* will be DDT_TYPES if no entry found */ 736 dde->dde_class = class; /* will be DDT_CLASSES if no entry found */ 737 dde->dde_loaded = B_TRUE; 738 dde->dde_loading = B_FALSE; 739 740 if (error == 0) 741 ddt_stat_update(ddt, dde, -1ULL); 742 743 cv_broadcast(&dde->dde_cv); 744 745 return (dde); 746} 747 748void 749ddt_prefetch(spa_t *spa, const blkptr_t *bp) 750{ 751 ddt_t *ddt; 752 ddt_entry_t dde; 753 754 if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp)) 755 return; 756 757 /* 758 * We only remove the DDT once all tables are empty and only 759 * prefetch dedup blocks when there are entries in the DDT. 760 * Thus no locking is required as the DDT can't disappear on us. 
761 */ 762 ddt = ddt_select(spa, bp); 763 ddt_key_fill(&dde.dde_key, bp); 764 765 for (enum ddt_type type = 0; type < DDT_TYPES; type++) { 766 for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { 767 ddt_object_prefetch(ddt, type, class, &dde); 768 } 769 } 770} 771 772int 773ddt_entry_compare(const void *x1, const void *x2) 774{ 775 const ddt_entry_t *dde1 = x1; 776 const ddt_entry_t *dde2 = x2; 777 const uint64_t *u1 = (const uint64_t *)&dde1->dde_key; 778 const uint64_t *u2 = (const uint64_t *)&dde2->dde_key; 779 780 for (int i = 0; i < DDT_KEY_WORDS; i++) { 781 if (u1[i] < u2[i]) 782 return (-1); 783 if (u1[i] > u2[i]) 784 return (1); 785 } 786 787 return (0); 788} 789 790static ddt_t * 791ddt_table_alloc(spa_t *spa, enum zio_checksum c) 792{ 793 ddt_t *ddt; 794 795 ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP); 796 797 mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL); 798 avl_create(&ddt->ddt_tree, ddt_entry_compare, 799 sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); 800 avl_create(&ddt->ddt_repair_tree, ddt_entry_compare, 801 sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); 802 ddt->ddt_checksum = c; 803 ddt->ddt_spa = spa; 804 ddt->ddt_os = spa->spa_meta_objset; 805 806 return (ddt); 807} 808 809static void 810ddt_table_free(ddt_t *ddt) 811{ 812 ASSERT(avl_numnodes(&ddt->ddt_tree) == 0); 813 ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0); 814 avl_destroy(&ddt->ddt_tree); 815 avl_destroy(&ddt->ddt_repair_tree); 816 mutex_destroy(&ddt->ddt_lock); 817 kmem_free(ddt, sizeof (*ddt)); 818} 819 820void 821ddt_create(spa_t *spa) 822{ 823 spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM; 824 825 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) 826 spa->spa_ddt[c] = ddt_table_alloc(spa, c); 827} 828 829int 830ddt_load(spa_t *spa) 831{ 832 int error; 833 834 ddt_create(spa); 835 836 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 837 DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, 838 &spa->spa_ddt_stat_object); 839 
840 if (error) 841 return (error == ENOENT ? 0 : error); 842 843 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { 844 ddt_t *ddt = spa->spa_ddt[c]; 845 for (enum ddt_type type = 0; type < DDT_TYPES; type++) { 846 for (enum ddt_class class = 0; class < DDT_CLASSES; 847 class++) { 848 error = ddt_object_load(ddt, type, class); 849 if (error != 0 && error != ENOENT) 850 return (error); 851 } 852 } 853 854 /* 855 * Seed the cached histograms. 856 */ 857 bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache, 858 sizeof (ddt->ddt_histogram)); 859 } 860 861 return (0); 862} 863 864void 865ddt_unload(spa_t *spa) 866{ 867 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { 868 if (spa->spa_ddt[c]) { 869 ddt_table_free(spa->spa_ddt[c]); 870 spa->spa_ddt[c] = NULL; 871 } 872 } 873} 874 875boolean_t 876ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp) 877{ 878 ddt_t *ddt; 879 ddt_entry_t dde; 880 881 if (!BP_GET_DEDUP(bp)) 882 return (B_FALSE); 883 884 if (max_class == DDT_CLASS_UNIQUE) 885 return (B_TRUE); 886 887 ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)]; 888 889 ddt_key_fill(&dde.dde_key, bp); 890 891 for (enum ddt_type type = 0; type < DDT_TYPES; type++) 892 for (enum ddt_class class = 0; class <= max_class; class++) 893 if (ddt_object_lookup(ddt, type, class, &dde) == 0) 894 return (B_TRUE); 895 896 return (B_FALSE); 897} 898 899ddt_entry_t * 900ddt_repair_start(ddt_t *ddt, const blkptr_t *bp) 901{ 902 ddt_key_t ddk; 903 ddt_entry_t *dde; 904 905 ddt_key_fill(&ddk, bp); 906 907 dde = ddt_alloc(&ddk); 908 909 for (enum ddt_type type = 0; type < DDT_TYPES; type++) { 910 for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { 911 /* 912 * We can only do repair if there are multiple copies 913 * of the block. For anything in the UNIQUE class, 914 * there's definitely only one copy, so don't even try. 
915 */ 916 if (class != DDT_CLASS_UNIQUE && 917 ddt_object_lookup(ddt, type, class, dde) == 0) 918 return (dde); 919 } 920 } 921 922 bzero(dde->dde_phys, sizeof (dde->dde_phys)); 923 924 return (dde); 925} 926 927void 928ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde) 929{ 930 avl_index_t where; 931 932 ddt_enter(ddt); 933 934 if (dde->dde_repair_data != NULL && spa_writeable(ddt->ddt_spa) && 935 avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL) 936 avl_insert(&ddt->ddt_repair_tree, dde, where); 937 else 938 ddt_free(dde); 939 940 ddt_exit(ddt); 941} 942 943static void 944ddt_repair_entry_done(zio_t *zio) 945{ 946 ddt_entry_t *rdde = zio->io_private; 947 948 ddt_free(rdde); 949} 950 951static void 952ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio) 953{ 954 ddt_phys_t *ddp = dde->dde_phys; 955 ddt_phys_t *rddp = rdde->dde_phys; 956 ddt_key_t *ddk = &dde->dde_key; 957 ddt_key_t *rddk = &rdde->dde_key; 958 zio_t *zio; 959 blkptr_t blk; 960 961 zio = zio_null(rio, rio->io_spa, NULL, 962 ddt_repair_entry_done, rdde, rio->io_flags); 963 964 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) { 965 if (ddp->ddp_phys_birth == 0 || 966 ddp->ddp_phys_birth != rddp->ddp_phys_birth || 967 bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva))) 968 continue; 969 ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); 970 zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk, 971 rdde->dde_repair_data, DDK_GET_PSIZE(rddk), NULL, NULL, 972 ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL)); 973 } 974 975 zio_nowait(zio); 976} 977 978static void 979ddt_repair_table(ddt_t *ddt, zio_t *rio) 980{ 981 spa_t *spa = ddt->ddt_spa; 982 ddt_entry_t *dde, *rdde_next, *rdde; 983 avl_tree_t *t = &ddt->ddt_repair_tree; 984 blkptr_t blk; 985 986 if (spa_sync_pass(spa) > 1) 987 return; 988 989 ddt_enter(ddt); 990 for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) { 991 rdde_next = AVL_NEXT(t, rdde); 992 avl_remove(&ddt->ddt_repair_tree, rdde); 993 
ddt_exit(ddt); 994 ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk); 995 dde = ddt_repair_start(ddt, &blk); 996 ddt_repair_entry(ddt, dde, rdde, rio); 997 ddt_repair_done(ddt, dde); 998 ddt_enter(ddt); 999 } 1000 ddt_exit(ddt); 1001} 1002 1003static void 1004ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) 1005{ 1006 dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool; 1007 ddt_phys_t *ddp = dde->dde_phys; 1008 ddt_key_t *ddk = &dde->dde_key; 1009 enum ddt_type otype = dde->dde_type; 1010 enum ddt_type ntype = DDT_TYPE_CURRENT; 1011 enum ddt_class oclass = dde->dde_class; 1012 enum ddt_class nclass; 1013 uint64_t total_refcnt = 0; 1014 1015 ASSERT(dde->dde_loaded); 1016 ASSERT(!dde->dde_loading); 1017 1018 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 1019 ASSERT(dde->dde_lead_zio[p] == NULL); 1020 ASSERT((int64_t)ddp->ddp_refcnt >= 0); 1021 if (ddp->ddp_phys_birth == 0) { 1022 ASSERT(ddp->ddp_refcnt == 0); 1023 continue; 1024 } 1025 if (p == DDT_PHYS_DITTO) { 1026 if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0) 1027 ddt_phys_free(ddt, ddk, ddp, txg); 1028 continue; 1029 } 1030 if (ddp->ddp_refcnt == 0) 1031 ddt_phys_free(ddt, ddk, ddp, txg); 1032 total_refcnt += ddp->ddp_refcnt; 1033 } 1034 1035 if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0) 1036 nclass = DDT_CLASS_DITTO; 1037 else if (total_refcnt > 1) 1038 nclass = DDT_CLASS_DUPLICATE; 1039 else 1040 nclass = DDT_CLASS_UNIQUE; 1041 1042 if (otype != DDT_TYPES && 1043 (otype != ntype || oclass != nclass || total_refcnt == 0)) { 1044 VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0); 1045 ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT); 1046 } 1047 1048 if (total_refcnt != 0) { 1049 dde->dde_type = ntype; 1050 dde->dde_class = nclass; 1051 ddt_stat_update(ddt, dde, 0); 1052 if (!ddt_object_exists(ddt, ntype, nclass)) 1053 ddt_object_create(ddt, ntype, nclass, tx); 1054 VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0); 1055 1056 /* 1057 * 
If the class changes, the order that we scan this bp 1058 * changes. If it decreases, we could miss it, so 1059 * scan it right now. (This covers both class changing 1060 * while we are doing ddt_walk(), and when we are 1061 * traversing.) 1062 */ 1063 if (nclass < oclass) { 1064 dsl_scan_ddt_entry(dp->dp_scan, 1065 ddt->ddt_checksum, dde, tx); 1066 } 1067 } 1068} 1069 1070static void 1071ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) 1072{ 1073 spa_t *spa = ddt->ddt_spa; 1074 ddt_entry_t *dde; 1075 void *cookie = NULL; 1076 1077 if (avl_numnodes(&ddt->ddt_tree) == 0) 1078 return; 1079 1080 ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP); 1081 1082 if (spa->spa_ddt_stat_object == 0) { 1083 spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os, 1084 DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT, 1085 DMU_POOL_DDT_STATS, tx); 1086 } 1087 1088 while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { 1089 ddt_sync_entry(ddt, dde, tx, txg); 1090 ddt_free(dde); 1091 } 1092 1093 for (enum ddt_type type = 0; type < DDT_TYPES; type++) { 1094 uint64_t add, count = 0; 1095 for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { 1096 if (ddt_object_exists(ddt, type, class)) { 1097 ddt_object_sync(ddt, type, class, tx); 1098 VERIFY(ddt_object_count(ddt, type, class, 1099 &add) == 0); 1100 count += add; 1101 } 1102 } 1103 for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { 1104 if (count == 0 && ddt_object_exists(ddt, type, class)) 1105 ddt_object_destroy(ddt, type, class, tx); 1106 } 1107 } 1108 1109 bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache, 1110 sizeof (ddt->ddt_histogram)); 1111} 1112 1113void 1114ddt_sync(spa_t *spa, uint64_t txg) 1115{ 1116 dmu_tx_t *tx; 1117 zio_t *rio = zio_root(spa, NULL, NULL, 1118 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 1119 1120 ASSERT(spa_syncing_txg(spa) == txg); 1121 1122 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 1123 1124 for (enum zio_checksum c = 0; c < 
ZIO_CHECKSUM_FUNCTIONS; c++) { 1125 ddt_t *ddt = spa->spa_ddt[c]; 1126 if (ddt == NULL) 1127 continue; 1128 ddt_sync_table(ddt, tx, txg); 1129 ddt_repair_table(ddt, rio); 1130 } 1131 1132 (void) zio_wait(rio); 1133 1134 dmu_tx_commit(tx); 1135} 1136 1137int 1138ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde) 1139{ 1140 do { 1141 do { 1142 do { 1143 ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum]; 1144 int error = ENOENT; 1145 if (ddt_object_exists(ddt, ddb->ddb_type, 1146 ddb->ddb_class)) { 1147 error = ddt_object_walk(ddt, 1148 ddb->ddb_type, ddb->ddb_class, 1149 &ddb->ddb_cursor, dde); 1150 } 1151 dde->dde_type = ddb->ddb_type; 1152 dde->dde_class = ddb->ddb_class; 1153 if (error == 0) 1154 return (0); 1155 if (error != ENOENT) 1156 return (error); 1157 ddb->ddb_cursor = 0; 1158 } while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS); 1159 ddb->ddb_checksum = 0; 1160 } while (++ddb->ddb_type < DDT_TYPES); 1161 ddb->ddb_type = 0; 1162 } while (++ddb->ddb_class < DDT_CLASSES); 1163 1164 return (SET_ERROR(ENOENT)); 1165} 1166