dsl_deadlist.c revision 297112
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2012 by Delphix. All rights reserved. 24 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 25 * Copyright (c) 2014 Integros [integros.com] 26 */ 27 28#include <sys/dsl_dataset.h> 29#include <sys/dmu.h> 30#include <sys/refcount.h> 31#include <sys/zap.h> 32#include <sys/zfs_context.h> 33#include <sys/dsl_pool.h> 34 35/* 36 * Deadlist concurrency: 37 * 38 * Deadlists can only be modified from the syncing thread. 39 * 40 * Except for dsl_deadlist_insert(), it can only be modified with the 41 * dp_config_rwlock held with RW_WRITER. 42 * 43 * The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can 44 * be called concurrently, from open context, with the dl_config_rwlock held 45 * with RW_READER. 46 * 47 * Therefore, we only need to provide locking between dsl_deadlist_insert() and 48 * the accessors, protecting: 49 * dl_phys->dl_used,comp,uncomp 50 * and protecting the dl_tree from being loaded. 51 * The locking is provided by dl_lock. Note that locking on the bpobj_t 52 * provides its own locking, and dl_oldfmt is immutable. 53 */ 54 55static int 56dsl_deadlist_compare(const void *arg1, const void *arg2) 57{ 58 const dsl_deadlist_entry_t *dle1 = arg1; 59 const dsl_deadlist_entry_t *dle2 = arg2; 60 61 if (dle1->dle_mintxg < dle2->dle_mintxg) 62 return (-1); 63 else if (dle1->dle_mintxg > dle2->dle_mintxg) 64 return (+1); 65 else 66 return (0); 67} 68 69static void 70dsl_deadlist_load_tree(dsl_deadlist_t *dl) 71{ 72 zap_cursor_t zc; 73 zap_attribute_t za; 74 75 ASSERT(!dl->dl_oldfmt); 76 if (dl->dl_havetree) 77 return; 78 79 avl_create(&dl->dl_tree, dsl_deadlist_compare, 80 sizeof (dsl_deadlist_entry_t), 81 offsetof(dsl_deadlist_entry_t, dle_node)); 82 for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object); 83 zap_cursor_retrieve(&zc, &za) == 0; 84 zap_cursor_advance(&zc)) { 85 dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP); 86 dle->dle_mintxg = strtonum(za.za_name, NULL); 87 VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, 88 za.za_first_integer)); 89 avl_add(&dl->dl_tree, dle); 90 } 91 zap_cursor_fini(&zc); 92 dl->dl_havetree = B_TRUE; 93} 94 95void 96dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object) 97{ 98 dmu_object_info_t doi; 99 100 mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL); 101 dl->dl_os = os; 102 dl->dl_object = object; 103 VERIFY3U(0, ==, dmu_bonus_hold(os, object, dl, &dl->dl_dbuf)); 104 dmu_object_info_from_db(dl->dl_dbuf, &doi); 105 if (doi.doi_type == DMU_OT_BPOBJ) { 106 dmu_buf_rele(dl->dl_dbuf, dl); 107 dl->dl_dbuf = NULL; 108 dl->dl_oldfmt = B_TRUE; 109 VERIFY3U(0, ==, bpobj_open(&dl->dl_bpobj, os, object)); 110 return; 111 } 112 113 dl->dl_oldfmt = B_FALSE; 114 dl->dl_phys = dl->dl_dbuf->db_data; 115 dl->dl_havetree = B_FALSE; 116} 117 118void 119dsl_deadlist_close(dsl_deadlist_t *dl) 120{ 121 void *cookie = NULL; 122 dsl_deadlist_entry_t *dle; 123 124 dl->dl_os = NULL; 125 126 if (dl->dl_oldfmt) { 127 dl->dl_oldfmt = B_FALSE; 128 bpobj_close(&dl->dl_bpobj); 129 return; 130 } 131 132 if (dl->dl_havetree) { 133 while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie)) 134 != NULL) { 135 bpobj_close(&dle->dle_bpobj); 136 kmem_free(dle, sizeof (*dle)); 137 } 138 avl_destroy(&dl->dl_tree); 139 } 140 dmu_buf_rele(dl->dl_dbuf, dl); 141 mutex_destroy(&dl->dl_lock); 142 dl->dl_dbuf = NULL; 143 dl->dl_phys = NULL; 144} 145 146uint64_t 147dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx) 148{ 149 if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) 150 return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx)); 151 return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR, 152 sizeof (dsl_deadlist_phys_t), tx)); 153} 154 155void 156dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx) 157{ 158 dmu_object_info_t doi; 159 zap_cursor_t zc; 160 zap_attribute_t za; 161 162 VERIFY3U(0, ==, dmu_object_info(os, dlobj, &doi)); 163 if (doi.doi_type == DMU_OT_BPOBJ) { 164 bpobj_free(os, dlobj, tx); 165 return; 166 } 167 168 for (zap_cursor_init(&zc, os, dlobj); 169 zap_cursor_retrieve(&zc, &za) == 0; 170 zap_cursor_advance(&zc)) { 171 uint64_t obj = za.za_first_integer; 172 if (obj == dmu_objset_pool(os)->dp_empty_bpobj) 173 bpobj_decr_empty(os, tx); 174 else 175 bpobj_free(os, obj, tx); 176 } 177 zap_cursor_fini(&zc); 178 VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx)); 179} 180 181static void 182dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, 183 const blkptr_t *bp, dmu_tx_t *tx) 184{ 185 if (dle->dle_bpobj.bpo_object == 186 dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { 187 uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); 188 bpobj_close(&dle->dle_bpobj); 189 bpobj_decr_empty(dl->dl_os, tx); 190 VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); 191 VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object, 192 dle->dle_mintxg, obj, tx)); 193 } 194 bpobj_enqueue(&dle->dle_bpobj, bp, tx); 195} 196 197static void 198dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, 199 uint64_t obj, dmu_tx_t *tx) 200{ 201 if (dle->dle_bpobj.bpo_object != 202 dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { 203 bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx); 204 } else { 205 bpobj_close(&dle->dle_bpobj); 206 bpobj_decr_empty(dl->dl_os, tx); 207 VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); 208 VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object, 209 dle->dle_mintxg, obj, tx)); 210 } 211} 212 213void 214dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx) 215{ 216 dsl_deadlist_entry_t dle_tofind; 217 dsl_deadlist_entry_t *dle; 218 avl_index_t where; 219 220 if (dl->dl_oldfmt) { 221 bpobj_enqueue(&dl->dl_bpobj, bp, tx); 222 return; 223 } 224 225 dsl_deadlist_load_tree(dl); 226 227 dmu_buf_will_dirty(dl->dl_dbuf, tx); 228 mutex_enter(&dl->dl_lock); 229 dl->dl_phys->dl_used += 230 bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp); 231 dl->dl_phys->dl_comp += BP_GET_PSIZE(bp); 232 dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp); 233 mutex_exit(&dl->dl_lock); 234 235 dle_tofind.dle_mintxg = bp->blk_birth; 236 dle = avl_find(&dl->dl_tree, &dle_tofind, &where); 237 if (dle == NULL) 238 dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); 239 else 240 dle = AVL_PREV(&dl->dl_tree, dle); 241 dle_enqueue(dl, dle, bp, tx); 242} 243 244/* 245 * Insert new key in deadlist, which must be > all current entries. 246 * mintxg is not inclusive. 247 */ 248void 249dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) 250{ 251 uint64_t obj; 252 dsl_deadlist_entry_t *dle; 253 254 if (dl->dl_oldfmt) 255 return; 256 257 dsl_deadlist_load_tree(dl); 258 259 dle = kmem_alloc(sizeof (*dle), KM_SLEEP); 260 dle->dle_mintxg = mintxg; 261 obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); 262 VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); 263 avl_add(&dl->dl_tree, dle); 264 265 VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, dl->dl_object, 266 mintxg, obj, tx)); 267} 268 269/* 270 * Remove this key, merging its entries into the previous key. 271 */ 272void 273dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) 274{ 275 dsl_deadlist_entry_t dle_tofind; 276 dsl_deadlist_entry_t *dle, *dle_prev; 277 278 if (dl->dl_oldfmt) 279 return; 280 281 dsl_deadlist_load_tree(dl); 282 283 dle_tofind.dle_mintxg = mintxg; 284 dle = avl_find(&dl->dl_tree, &dle_tofind, NULL); 285 dle_prev = AVL_PREV(&dl->dl_tree, dle); 286 287 dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx); 288 289 avl_remove(&dl->dl_tree, dle); 290 bpobj_close(&dle->dle_bpobj); 291 kmem_free(dle, sizeof (*dle)); 292 293 VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx)); 294} 295 296/* 297 * Walk ds's snapshots to regenerate generate ZAP & AVL. 298 */ 299static void 300dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj, 301 uint64_t mrs_obj, dmu_tx_t *tx) 302{ 303 dsl_deadlist_t dl; 304 dsl_pool_t *dp = dmu_objset_pool(os); 305 306 dsl_deadlist_open(&dl, os, dlobj); 307 if (dl.dl_oldfmt) { 308 dsl_deadlist_close(&dl); 309 return; 310 } 311 312 while (mrs_obj != 0) { 313 dsl_dataset_t *ds; 314 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds)); 315 dsl_deadlist_add_key(&dl, 316 dsl_dataset_phys(ds)->ds_prev_snap_txg, tx); 317 mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; 318 dsl_dataset_rele(ds, FTAG); 319 } 320 dsl_deadlist_close(&dl); 321} 322 323uint64_t 324dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg, 325 uint64_t mrs_obj, dmu_tx_t *tx) 326{ 327 dsl_deadlist_entry_t *dle; 328 uint64_t newobj; 329 330 newobj = dsl_deadlist_alloc(dl->dl_os, tx); 331 332 if (dl->dl_oldfmt) { 333 dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx); 334 return (newobj); 335 } 336 337 dsl_deadlist_load_tree(dl); 338 339 for (dle = avl_first(&dl->dl_tree); dle; 340 dle = AVL_NEXT(&dl->dl_tree, dle)) { 341 uint64_t obj; 342 343 if (dle->dle_mintxg >= maxtxg) 344 break; 345 346 obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); 347 VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj, 348 dle->dle_mintxg, obj, tx)); 349 } 350 return (newobj); 351} 352 353void 354dsl_deadlist_space(dsl_deadlist_t *dl, 355 uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) 356{ 357 if (dl->dl_oldfmt) { 358 VERIFY3U(0, ==, bpobj_space(&dl->dl_bpobj, 359 usedp, compp, uncompp)); 360 return; 361 } 362 363 mutex_enter(&dl->dl_lock); 364 *usedp = dl->dl_phys->dl_used; 365 *compp = dl->dl_phys->dl_comp; 366 *uncompp = dl->dl_phys->dl_uncomp; 367 mutex_exit(&dl->dl_lock); 368} 369 370/* 371 * return space used in the range (mintxg, maxtxg]. 372 * Includes maxtxg, does not include mintxg. 373 * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is 374 * larger than any bp in the deadlist (eg. UINT64_MAX)). 375 */ 376void 377dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg, 378 uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) 379{ 380 dsl_deadlist_entry_t *dle; 381 dsl_deadlist_entry_t dle_tofind; 382 avl_index_t where; 383 384 if (dl->dl_oldfmt) { 385 VERIFY3U(0, ==, bpobj_space_range(&dl->dl_bpobj, 386 mintxg, maxtxg, usedp, compp, uncompp)); 387 return; 388 } 389 390 *usedp = *compp = *uncompp = 0; 391 392 mutex_enter(&dl->dl_lock); 393 dsl_deadlist_load_tree(dl); 394 dle_tofind.dle_mintxg = mintxg; 395 dle = avl_find(&dl->dl_tree, &dle_tofind, &where); 396 /* 397 * If we don't find this mintxg, there shouldn't be anything 398 * after it either. 399 */ 400 ASSERT(dle != NULL || 401 avl_nearest(&dl->dl_tree, where, AVL_AFTER) == NULL); 402 403 for (; dle && dle->dle_mintxg < maxtxg; 404 dle = AVL_NEXT(&dl->dl_tree, dle)) { 405 uint64_t used, comp, uncomp; 406 407 VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj, 408 &used, &comp, &uncomp)); 409 410 *usedp += used; 411 *compp += comp; 412 *uncompp += uncomp; 413 } 414 mutex_exit(&dl->dl_lock); 415} 416 417static void 418dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth, 419 dmu_tx_t *tx) 420{ 421 dsl_deadlist_entry_t dle_tofind; 422 dsl_deadlist_entry_t *dle; 423 avl_index_t where; 424 uint64_t used, comp, uncomp; 425 bpobj_t bpo; 426 427 VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj)); 428 VERIFY3U(0, ==, bpobj_space(&bpo, &used, &comp, &uncomp)); 429 bpobj_close(&bpo); 430 431 dsl_deadlist_load_tree(dl); 432 433 dmu_buf_will_dirty(dl->dl_dbuf, tx); 434 mutex_enter(&dl->dl_lock); 435 dl->dl_phys->dl_used += used; 436 dl->dl_phys->dl_comp += comp; 437 dl->dl_phys->dl_uncomp += uncomp; 438 mutex_exit(&dl->dl_lock); 439 440 dle_tofind.dle_mintxg = birth; 441 dle = avl_find(&dl->dl_tree, &dle_tofind, &where); 442 if (dle == NULL) 443 dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); 444 dle_enqueue_subobj(dl, dle, obj, tx); 445} 446 447static int 448dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 449{ 450 dsl_deadlist_t *dl = arg; 451 dsl_deadlist_insert(dl, bp, tx); 452 return (0); 453} 454 455/* 456 * Merge the deadlist pointed to by 'obj' into dl. obj will be left as 457 * an empty deadlist. 458 */ 459void 460dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) 461{ 462 zap_cursor_t zc; 463 zap_attribute_t za; 464 dmu_buf_t *bonus; 465 dsl_deadlist_phys_t *dlp; 466 dmu_object_info_t doi; 467 468 VERIFY3U(0, ==, dmu_object_info(dl->dl_os, obj, &doi)); 469 if (doi.doi_type == DMU_OT_BPOBJ) { 470 bpobj_t bpo; 471 VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj)); 472 VERIFY3U(0, ==, bpobj_iterate(&bpo, 473 dsl_deadlist_insert_cb, dl, tx)); 474 bpobj_close(&bpo); 475 return; 476 } 477 478 for (zap_cursor_init(&zc, dl->dl_os, obj); 479 zap_cursor_retrieve(&zc, &za) == 0; 480 zap_cursor_advance(&zc)) { 481 uint64_t mintxg = strtonum(za.za_name, NULL); 482 dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx); 483 VERIFY3U(0, ==, zap_remove_int(dl->dl_os, obj, mintxg, tx)); 484 } 485 zap_cursor_fini(&zc); 486 487 VERIFY3U(0, ==, dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus)); 488 dlp = bonus->db_data; 489 dmu_buf_will_dirty(bonus, tx); 490 bzero(dlp, sizeof (*dlp)); 491 dmu_buf_rele(bonus, FTAG); 492} 493 494/* 495 * Remove entries on dl that are >= mintxg, and put them on the bpobj. 496 */ 497void 498dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, 499 dmu_tx_t *tx) 500{ 501 dsl_deadlist_entry_t dle_tofind; 502 dsl_deadlist_entry_t *dle; 503 avl_index_t where; 504 505 ASSERT(!dl->dl_oldfmt); 506 dmu_buf_will_dirty(dl->dl_dbuf, tx); 507 dsl_deadlist_load_tree(dl); 508 509 dle_tofind.dle_mintxg = mintxg; 510 dle = avl_find(&dl->dl_tree, &dle_tofind, &where); 511 if (dle == NULL) 512 dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER); 513 while (dle) { 514 uint64_t used, comp, uncomp; 515 dsl_deadlist_entry_t *dle_next; 516 517 bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx); 518 519 VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj, 520 &used, &comp, &uncomp)); 521 mutex_enter(&dl->dl_lock); 522 ASSERT3U(dl->dl_phys->dl_used, >=, used); 523 ASSERT3U(dl->dl_phys->dl_comp, >=, comp); 524 ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp); 525 dl->dl_phys->dl_used -= used; 526 dl->dl_phys->dl_comp -= comp; 527 dl->dl_phys->dl_uncomp -= uncomp; 528 mutex_exit(&dl->dl_lock); 529 530 VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, 531 dle->dle_mintxg, tx)); 532 533 dle_next = AVL_NEXT(&dl->dl_tree, dle); 534 avl_remove(&dl->dl_tree, dle); 535 bpobj_close(&dle->dle_bpobj); 536 kmem_free(dle, sizeof (*dle)); 537 dle = dle_next; 538 } 539} 540