dsl_dir.c revision 289100
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>. 24 * All rights reserved. 25 * Copyright (c) 2012, 2014 by Delphix. All rights reserved. 26 * Copyright (c) 2014 Joyent, Inc. All rights reserved. 27 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 28 * Copyright 2015 Nexenta Systems, Inc. All rights reserved. 
29 */ 30 31#include <sys/dmu.h> 32#include <sys/dmu_objset.h> 33#include <sys/dmu_tx.h> 34#include <sys/dsl_dataset.h> 35#include <sys/dsl_dir.h> 36#include <sys/dsl_prop.h> 37#include <sys/dsl_synctask.h> 38#include <sys/dsl_deleg.h> 39#include <sys/dmu_impl.h> 40#include <sys/spa.h> 41#include <sys/metaslab.h> 42#include <sys/zap.h> 43#include <sys/zio.h> 44#include <sys/arc.h> 45#include <sys/sunddi.h> 46#include <sys/zvol.h> 47#ifdef _KERNEL 48#include <sys/zfs_vfsops.h> 49#endif 50#include <sys/zfeature.h> 51#include <sys/policy.h> 52#include <sys/zfs_znode.h> 53#include "zfs_namecheck.h" 54#include "zfs_prop.h" 55 56/* 57 * Filesystem and Snapshot Limits 58 * ------------------------------ 59 * 60 * These limits are used to restrict the number of filesystems and/or snapshots 61 * that can be created at a given level in the tree or below. A typical 62 * use-case is with a delegated dataset where the administrator wants to ensure 63 * that a user within the zone is not creating too many additional filesystems 64 * or snapshots, even though they're not exceeding their space quota. 65 * 66 * The filesystem and snapshot counts are stored as extensible properties. This 67 * capability is controlled by a feature flag and must be enabled to be used. 68 * Once enabled, the feature is not active until the first limit is set. At 69 * that point, future operations to create/destroy filesystems or snapshots 70 * will validate and update the counts. 71 * 72 * Because the count properties will not exist before the feature is active, 73 * the counts are updated when a limit is first set on an uninitialized 74 * dsl_dir node in the tree (The filesystem/snapshot count on a node includes 75 * all of the nested filesystems/snapshots. Thus, a new leaf node has a 76 * filesystem count of 0 and a snapshot count of 0. Non-existent filesystem and 77 * snapshot count properties on a node indicate uninitialized counts on that 78 * node.) 
When first setting a limit on an uninitialized node, the code starts 79 * at the filesystem with the new limit and descends into all sub-filesystems 80 * to add the count properties. 81 * 82 * In practice this is lightweight since a limit is typically set when the 83 * filesystem is created and thus has no children. Once valid, changing the 84 * limit value won't require a re-traversal since the counts are already valid. 85 * When recursively fixing the counts, if a node with a limit is encountered 86 * during the descent, the counts are known to be valid and there is no need to 87 * descend into that filesystem's children. The counts on filesystems above the 88 * one with the new limit will still be uninitialized, unless a limit is 89 * eventually set on one of those filesystems. The counts are always recursively 90 * updated when a limit is set on a dataset, unless there is already a limit. 91 * When a new limit value is set on a filesystem with an existing limit, it is 92 * possible for the new limit to be less than the current count at that level 93 * since a user who can change the limit is also allowed to exceed the limit. 94 * 95 * Once the feature is active, then whenever a filesystem or snapshot is 96 * created, the code recurses up the tree, validating the new count against the 97 * limit at each initialized level. In practice, most levels will not have a 98 * limit set. If there is a limit at any initialized level up the tree, the 99 * check must pass or the creation will fail. Likewise, when a filesystem or 100 * snapshot is destroyed, the counts are recursively adjusted all the way up 101 * the initialized nodes in the tree. Renaming a filesystem into a different point 102 * in the tree will first validate, then update the counts on each branch up to 103 * the common ancestor. A receive will also validate the counts and then update 104 * them. 
105 * 106 * An exception to the above behavior is that the limit is not enforced if the 107 * user has permission to modify the limit. This is primarily so that 108 * recursive snapshots in the global zone always work. We want to prevent a 109 * denial-of-service in which a lower level delegated dataset could max out its 110 * limit and thus block recursive snapshots from being taken in the global zone. 111 * Because of this, it is possible for the snapshot count to be over the limit 112 * and snapshots taken in the global zone could cause a lower level dataset to 113 * hit or exceed its limit. The administrator taking the global zone recursive 114 * snapshot should be aware of this side-effect and behave accordingly. 115 * For consistency, the filesystem limit is also not enforced if the user can 116 * modify the limit. 117 * 118 * The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check() 119 * and updated by dsl_fs_ss_count_adjust(). A new limit value is setup in 120 * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by 121 * dsl_dir_init_fs_ss_count(). 122 * 123 * There is a special case when we receive a filesystem that already exists. In 124 * this case a temporary clone name of %X is created (see dmu_recv_begin). We 125 * never update the filesystem counts for temporary clones. 126 * 127 * Likewise, we do not update the snapshot counts for temporary snapshots, 128 * such as those created by zfs diff. 
129 */ 130 131extern inline dsl_dir_phys_t *dsl_dir_phys(dsl_dir_t *dd); 132 133static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); 134 135static void 136dsl_dir_evict(void *dbu) 137{ 138 dsl_dir_t *dd = dbu; 139 dsl_pool_t *dp = dd->dd_pool; 140 int t; 141 142 dd->dd_dbuf = NULL; 143 144 for (t = 0; t < TXG_SIZE; t++) { 145 ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t)); 146 ASSERT(dd->dd_tempreserved[t] == 0); 147 ASSERT(dd->dd_space_towrite[t] == 0); 148 } 149 150 if (dd->dd_parent) 151 dsl_dir_async_rele(dd->dd_parent, dd); 152 153 spa_async_close(dd->dd_pool->dp_spa, dd); 154 155 dsl_prop_fini(dd); 156 mutex_destroy(&dd->dd_lock); 157 kmem_free(dd, sizeof (dsl_dir_t)); 158} 159 160int 161dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, 162 const char *tail, void *tag, dsl_dir_t **ddp) 163{ 164 dmu_buf_t *dbuf; 165 dsl_dir_t *dd; 166 int err; 167 168 ASSERT(dsl_pool_config_held(dp)); 169 170 err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf); 171 if (err != 0) 172 return (err); 173 dd = dmu_buf_get_user(dbuf); 174#ifdef ZFS_DEBUG 175 { 176 dmu_object_info_t doi; 177 dmu_object_info_from_db(dbuf, &doi); 178 ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR); 179 ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t)); 180 } 181#endif 182 if (dd == NULL) { 183 dsl_dir_t *winner; 184 185 dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP); 186 dd->dd_object = ddobj; 187 dd->dd_dbuf = dbuf; 188 dd->dd_pool = dp; 189 mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL); 190 dsl_prop_init(dd); 191 192 dsl_dir_snap_cmtime_update(dd); 193 194 if (dsl_dir_phys(dd)->dd_parent_obj) { 195 err = dsl_dir_hold_obj(dp, 196 dsl_dir_phys(dd)->dd_parent_obj, NULL, dd, 197 &dd->dd_parent); 198 if (err != 0) 199 goto errout; 200 if (tail) { 201#ifdef ZFS_DEBUG 202 uint64_t foundobj; 203 204 err = zap_lookup(dp->dp_meta_objset, 205 dsl_dir_phys(dd->dd_parent)-> 206 dd_child_dir_zapobj, tail, 207 sizeof (foundobj), 1, &foundobj); 208 ASSERT(err || foundobj == ddobj); 
209#endif 210 (void) strcpy(dd->dd_myname, tail); 211 } else { 212 err = zap_value_search(dp->dp_meta_objset, 213 dsl_dir_phys(dd->dd_parent)-> 214 dd_child_dir_zapobj, 215 ddobj, 0, dd->dd_myname); 216 } 217 if (err != 0) 218 goto errout; 219 } else { 220 (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa)); 221 } 222 223 if (dsl_dir_is_clone(dd)) { 224 dmu_buf_t *origin_bonus; 225 dsl_dataset_phys_t *origin_phys; 226 227 /* 228 * We can't open the origin dataset, because 229 * that would require opening this dsl_dir. 230 * Just look at its phys directly instead. 231 */ 232 err = dmu_bonus_hold(dp->dp_meta_objset, 233 dsl_dir_phys(dd)->dd_origin_obj, FTAG, 234 &origin_bonus); 235 if (err != 0) 236 goto errout; 237 origin_phys = origin_bonus->db_data; 238 dd->dd_origin_txg = 239 origin_phys->ds_creation_txg; 240 dmu_buf_rele(origin_bonus, FTAG); 241 } 242 243 dmu_buf_init_user(&dd->dd_dbu, dsl_dir_evict, &dd->dd_dbuf); 244 winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu); 245 if (winner != NULL) { 246 if (dd->dd_parent) 247 dsl_dir_rele(dd->dd_parent, dd); 248 dsl_prop_fini(dd); 249 mutex_destroy(&dd->dd_lock); 250 kmem_free(dd, sizeof (dsl_dir_t)); 251 dd = winner; 252 } else { 253 spa_open_ref(dp->dp_spa, dd); 254 } 255 } 256 257 /* 258 * The dsl_dir_t has both open-to-close and instantiate-to-evict 259 * holds on the spa. We need the open-to-close holds because 260 * otherwise the spa_refcnt wouldn't change when we open a 261 * dir which the spa also has open, so we could incorrectly 262 * think it was OK to unload/export/destroy the pool. We need 263 * the instantiate-to-evict hold because the dsl_dir_t has a 264 * pointer to the dd_pool, which has a pointer to the spa_t. 
265 */ 266 spa_open_ref(dp->dp_spa, tag); 267 ASSERT3P(dd->dd_pool, ==, dp); 268 ASSERT3U(dd->dd_object, ==, ddobj); 269 ASSERT3P(dd->dd_dbuf, ==, dbuf); 270 *ddp = dd; 271 return (0); 272 273errout: 274 if (dd->dd_parent) 275 dsl_dir_rele(dd->dd_parent, dd); 276 dsl_prop_fini(dd); 277 mutex_destroy(&dd->dd_lock); 278 kmem_free(dd, sizeof (dsl_dir_t)); 279 dmu_buf_rele(dbuf, tag); 280 return (err); 281} 282 283void 284dsl_dir_rele(dsl_dir_t *dd, void *tag) 285{ 286 dprintf_dd(dd, "%s\n", ""); 287 spa_close(dd->dd_pool->dp_spa, tag); 288 dmu_buf_rele(dd->dd_dbuf, tag); 289} 290 291/* 292 * Remove a reference to the given dsl dir that is being asynchronously 293 * released. Async releases occur from a taskq performing eviction of 294 * dsl datasets and dirs. This process is identical to a normal release 295 * with the exception of using the async API for releasing the reference on 296 * the spa. 297 */ 298void 299dsl_dir_async_rele(dsl_dir_t *dd, void *tag) 300{ 301 dprintf_dd(dd, "%s\n", ""); 302 spa_async_close(dd->dd_pool->dp_spa, tag); 303 dmu_buf_rele(dd->dd_dbuf, tag); 304} 305 306/* buf must be long enough (MAXNAMELEN + strlen(MOS_DIR_NAME) + 1 should do) */ 307void 308dsl_dir_name(dsl_dir_t *dd, char *buf) 309{ 310 if (dd->dd_parent) { 311 dsl_dir_name(dd->dd_parent, buf); 312 (void) strcat(buf, "/"); 313 } else { 314 buf[0] = '\0'; 315 } 316 if (!MUTEX_HELD(&dd->dd_lock)) { 317 /* 318 * recursive mutex so that we can use 319 * dprintf_dd() with dd_lock held 320 */ 321 mutex_enter(&dd->dd_lock); 322 (void) strcat(buf, dd->dd_myname); 323 mutex_exit(&dd->dd_lock); 324 } else { 325 (void) strcat(buf, dd->dd_myname); 326 } 327} 328 329/* Calculate name length, avoiding all the strcat calls of dsl_dir_name */ 330int 331dsl_dir_namelen(dsl_dir_t *dd) 332{ 333 int result = 0; 334 335 if (dd->dd_parent) { 336 /* parent's name + 1 for the "/" */ 337 result = dsl_dir_namelen(dd->dd_parent) + 1; 338 } 339 340 if (!MUTEX_HELD(&dd->dd_lock)) { 341 /* see dsl_dir_name */ 
342 mutex_enter(&dd->dd_lock); 343 result += strlen(dd->dd_myname); 344 mutex_exit(&dd->dd_lock); 345 } else { 346 result += strlen(dd->dd_myname); 347 } 348 349 return (result); 350} 351 352static int 353getcomponent(const char *path, char *component, const char **nextp) 354{ 355 char *p; 356 357 if ((path == NULL) || (path[0] == '\0')) 358 return (SET_ERROR(ENOENT)); 359 /* This would be a good place to reserve some namespace... */ 360 p = strpbrk(path, "/@"); 361 if (p && (p[1] == '/' || p[1] == '@')) { 362 /* two separators in a row */ 363 return (SET_ERROR(EINVAL)); 364 } 365 if (p == NULL || p == path) { 366 /* 367 * if the first thing is an @ or /, it had better be an 368 * @ and it had better not have any more ats or slashes, 369 * and it had better have something after the @. 370 */ 371 if (p != NULL && 372 (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0')) 373 return (SET_ERROR(EINVAL)); 374 if (strlen(path) >= MAXNAMELEN) 375 return (SET_ERROR(ENAMETOOLONG)); 376 (void) strcpy(component, path); 377 p = NULL; 378 } else if (p[0] == '/') { 379 if (p - path >= MAXNAMELEN) 380 return (SET_ERROR(ENAMETOOLONG)); 381 (void) strncpy(component, path, p - path); 382 component[p - path] = '\0'; 383 p++; 384 } else if (p[0] == '@') { 385 /* 386 * if the next separator is an @, there better not be 387 * any more slashes. 388 */ 389 if (strchr(path, '/')) 390 return (SET_ERROR(EINVAL)); 391 if (p - path >= MAXNAMELEN) 392 return (SET_ERROR(ENAMETOOLONG)); 393 (void) strncpy(component, path, p - path); 394 component[p - path] = '\0'; 395 } else { 396 panic("invalid p=%p", (void *)p); 397 } 398 *nextp = p; 399 return (0); 400} 401 402/* 403 * Return the dsl_dir_t, and possibly the last component which couldn't 404 * be found in *tail. The name must be in the specified dsl_pool_t. This 405 * thread must hold the dp_config_rwlock for the pool. Returns NULL if the 406 * path is bogus, or if tail==NULL and we couldn't parse the whole name. 
407 * (*tail)[0] == '@' means that the last component is a snapshot. 408 */ 409int 410dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, 411 dsl_dir_t **ddp, const char **tailp) 412{ 413 char buf[MAXNAMELEN]; 414 const char *spaname, *next, *nextnext = NULL; 415 int err; 416 dsl_dir_t *dd; 417 uint64_t ddobj; 418 419 err = getcomponent(name, buf, &next); 420 if (err != 0) 421 return (err); 422 423 /* Make sure the name is in the specified pool. */ 424 spaname = spa_name(dp->dp_spa); 425 if (strcmp(buf, spaname) != 0) 426 return (SET_ERROR(EXDEV)); 427 428 ASSERT(dsl_pool_config_held(dp)); 429 430 err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd); 431 if (err != 0) { 432 return (err); 433 } 434 435 while (next != NULL) { 436 dsl_dir_t *child_dd; 437 err = getcomponent(next, buf, &nextnext); 438 if (err != 0) 439 break; 440 ASSERT(next[0] != '\0'); 441 if (next[0] == '@') 442 break; 443 dprintf("looking up %s in obj%lld\n", 444 buf, dsl_dir_phys(dd)->dd_child_dir_zapobj); 445 446 err = zap_lookup(dp->dp_meta_objset, 447 dsl_dir_phys(dd)->dd_child_dir_zapobj, 448 buf, sizeof (ddobj), 1, &ddobj); 449 if (err != 0) { 450 if (err == ENOENT) 451 err = 0; 452 break; 453 } 454 455 err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_dd); 456 if (err != 0) 457 break; 458 dsl_dir_rele(dd, tag); 459 dd = child_dd; 460 next = nextnext; 461 } 462 463 if (err != 0) { 464 dsl_dir_rele(dd, tag); 465 return (err); 466 } 467 468 /* 469 * It's an error if there's more than one component left, or 470 * tailp==NULL and there's any component left. 
471 */ 472 if (next != NULL && 473 (tailp == NULL || (nextnext && nextnext[0] != '\0'))) { 474 /* bad path name */ 475 dsl_dir_rele(dd, tag); 476 dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp); 477 err = SET_ERROR(ENOENT); 478 } 479 if (tailp != NULL) 480 *tailp = next; 481 *ddp = dd; 482 return (err); 483} 484 485/* 486 * If the counts are already initialized for this filesystem and its 487 * descendants then do nothing, otherwise initialize the counts. 488 * 489 * The counts on this filesystem, and those below, may be uninitialized due to 490 * either the use of a pre-existing pool which did not support the 491 * filesystem/snapshot limit feature, or one in which the feature had not yet 492 * been enabled. 493 * 494 * Recursively descend the filesystem tree and update the filesystem/snapshot 495 * counts on each filesystem below, then update the cumulative count on the 496 * current filesystem. If the filesystem already has a count set on it, 497 * then we know that its counts, and the counts on the filesystems below it, 498 * are already correct, so we don't have to update this filesystem. 499 */ 500static void 501dsl_dir_init_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx) 502{ 503 uint64_t my_fs_cnt = 0; 504 uint64_t my_ss_cnt = 0; 505 dsl_pool_t *dp = dd->dd_pool; 506 objset_t *os = dp->dp_meta_objset; 507 zap_cursor_t *zc; 508 zap_attribute_t *za; 509 dsl_dataset_t *ds; 510 511 ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)); 512 ASSERT(dsl_pool_config_held(dp)); 513 ASSERT(dmu_tx_is_syncing(tx)); 514 515 dsl_dir_zapify(dd, tx); 516 517 /* 518 * If the filesystem count has already been initialized then we 519 * don't need to recurse down any further. 
520 */ 521 if (zap_contains(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0) 522 return; 523 524 zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); 525 za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); 526 527 /* Iterate my child dirs */ 528 for (zap_cursor_init(zc, os, dsl_dir_phys(dd)->dd_child_dir_zapobj); 529 zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { 530 dsl_dir_t *chld_dd; 531 uint64_t count; 532 533 VERIFY0(dsl_dir_hold_obj(dp, za->za_first_integer, NULL, FTAG, 534 &chld_dd)); 535 536 /* 537 * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets and 538 * temporary datasets. 539 */ 540 if (chld_dd->dd_myname[0] == '$' || 541 chld_dd->dd_myname[0] == '%') { 542 dsl_dir_rele(chld_dd, FTAG); 543 continue; 544 } 545 546 my_fs_cnt++; /* count this child */ 547 548 dsl_dir_init_fs_ss_count(chld_dd, tx); 549 550 VERIFY0(zap_lookup(os, chld_dd->dd_object, 551 DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count)); 552 my_fs_cnt += count; 553 VERIFY0(zap_lookup(os, chld_dd->dd_object, 554 DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count)); 555 my_ss_cnt += count; 556 557 dsl_dir_rele(chld_dd, FTAG); 558 } 559 zap_cursor_fini(zc); 560 /* Count my snapshots (we counted children's snapshots above) */ 561 VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, 562 dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds)); 563 564 for (zap_cursor_init(zc, os, dsl_dataset_phys(ds)->ds_snapnames_zapobj); 565 zap_cursor_retrieve(zc, za) == 0; 566 zap_cursor_advance(zc)) { 567 /* Don't count temporary snapshots */ 568 if (za->za_name[0] != '%') 569 my_ss_cnt++; 570 } 571 zap_cursor_fini(zc); 572 573 dsl_dataset_rele(ds, FTAG); 574 575 kmem_free(zc, sizeof (zap_cursor_t)); 576 kmem_free(za, sizeof (zap_attribute_t)); 577 578 /* we're in a sync task, update counts */ 579 dmu_buf_will_dirty(dd->dd_dbuf, tx); 580 VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, 581 sizeof (my_fs_cnt), 1, &my_fs_cnt, tx)); 582 VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, 
583 sizeof (my_ss_cnt), 1, &my_ss_cnt, tx)); 584} 585 586static int 587dsl_dir_actv_fs_ss_limit_check(void *arg, dmu_tx_t *tx) 588{ 589 char *ddname = (char *)arg; 590 dsl_pool_t *dp = dmu_tx_pool(tx); 591 dsl_dataset_t *ds; 592 dsl_dir_t *dd; 593 int error; 594 595 error = dsl_dataset_hold(dp, ddname, FTAG, &ds); 596 if (error != 0) 597 return (error); 598 599 if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) { 600 dsl_dataset_rele(ds, FTAG); 601 return (SET_ERROR(ENOTSUP)); 602 } 603 604 dd = ds->ds_dir; 605 if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT) && 606 dsl_dir_is_zapified(dd) && 607 zap_contains(dp->dp_meta_objset, dd->dd_object, 608 DD_FIELD_FILESYSTEM_COUNT) == 0) { 609 dsl_dataset_rele(ds, FTAG); 610 return (SET_ERROR(EALREADY)); 611 } 612 613 dsl_dataset_rele(ds, FTAG); 614 return (0); 615} 616 617static void 618dsl_dir_actv_fs_ss_limit_sync(void *arg, dmu_tx_t *tx) 619{ 620 char *ddname = (char *)arg; 621 dsl_pool_t *dp = dmu_tx_pool(tx); 622 dsl_dataset_t *ds; 623 spa_t *spa; 624 625 VERIFY0(dsl_dataset_hold(dp, ddname, FTAG, &ds)); 626 627 spa = dsl_dataset_get_spa(ds); 628 629 if (!spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT)) { 630 /* 631 * Since the feature was not active and we're now setting a 632 * limit, increment the feature-active counter so that the 633 * feature becomes active for the first time. 634 * 635 * We are already in a sync task so we can update the MOS. 636 */ 637 spa_feature_incr(spa, SPA_FEATURE_FS_SS_LIMIT, tx); 638 } 639 640 /* 641 * Since we are now setting a non-UINT64_MAX limit on the filesystem, 642 * we need to ensure the counts are correct. Descend down the tree from 643 * this point and update all of the counts to be accurate. 644 */ 645 dsl_dir_init_fs_ss_count(ds->ds_dir, tx); 646 647 dsl_dataset_rele(ds, FTAG); 648} 649 650/* 651 * Make sure the feature is enabled and activate it if necessary. 652 * Since we're setting a limit, ensure the on-disk counts are valid. 
653 * This is only called by the ioctl path when setting a limit value. 654 * 655 * We do not need to validate the new limit, since users who can change the 656 * limit are also allowed to exceed the limit. 657 */ 658int 659dsl_dir_activate_fs_ss_limit(const char *ddname) 660{ 661 int error; 662 663 error = dsl_sync_task(ddname, dsl_dir_actv_fs_ss_limit_check, 664 dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0, 665 ZFS_SPACE_CHECK_RESERVED); 666 667 if (error == EALREADY) 668 error = 0; 669 670 return (error); 671} 672 673/* 674 * Used to determine if the filesystem_limit or snapshot_limit should be 675 * enforced. We allow the limit to be exceeded if the user has permission to 676 * write the property value. We pass in the creds that we got in the open 677 * context since we will always be the GZ root in syncing context. We also have 678 * to handle the case where we are allowed to change the limit on the current 679 * dataset, but there may be another limit in the tree above. 680 * 681 * We can never modify these two properties within a non-global zone. In 682 * addition, the other checks are modeled on zfs_secpolicy_write_perms. We 683 * can't use that function since we are already holding the dp_config_rwlock. 684 * In addition, we already have the dd and dealing with snapshots is simplified 685 * in this code. 
686 */ 687 688typedef enum { 689 ENFORCE_ALWAYS, 690 ENFORCE_NEVER, 691 ENFORCE_ABOVE 692} enforce_res_t; 693 694static enforce_res_t 695dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop, cred_t *cr) 696{ 697 enforce_res_t enforce = ENFORCE_ALWAYS; 698 uint64_t obj; 699 dsl_dataset_t *ds; 700 uint64_t zoned; 701 702 ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT || 703 prop == ZFS_PROP_SNAPSHOT_LIMIT); 704 705#ifdef _KERNEL 706#ifdef __FreeBSD__ 707 if (jailed(cr)) 708#else 709 if (crgetzoneid(cr) != GLOBAL_ZONEID) 710#endif 711 return (ENFORCE_ALWAYS); 712 713 if (secpolicy_zfs(cr) == 0) 714 return (ENFORCE_NEVER); 715#endif 716 717 if ((obj = dsl_dir_phys(dd)->dd_head_dataset_obj) == 0) 718 return (ENFORCE_ALWAYS); 719 720 ASSERT(dsl_pool_config_held(dd->dd_pool)); 721 722 if (dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds) != 0) 723 return (ENFORCE_ALWAYS); 724 725 if (dsl_prop_get_ds(ds, "zoned", 8, 1, &zoned, NULL) || zoned) { 726 /* Only root can access zoned fs's from the GZ */ 727 enforce = ENFORCE_ALWAYS; 728 } else { 729 if (dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr) == 0) 730 enforce = ENFORCE_ABOVE; 731 } 732 733 dsl_dataset_rele(ds, FTAG); 734 return (enforce); 735} 736 737/* 738 * Check if adding additional child filesystem(s) would exceed any filesystem 739 * limits or adding additional snapshot(s) would exceed any snapshot limits. 740 * The prop argument indicates which limit to check. 741 * 742 * Note that all filesystem limits up to the root (or the highest 743 * initialized) filesystem or the given ancestor must be satisfied. 
744 */ 745int 746dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop, 747 dsl_dir_t *ancestor, cred_t *cr) 748{ 749 objset_t *os = dd->dd_pool->dp_meta_objset; 750 uint64_t limit, count; 751 char *count_prop; 752 enforce_res_t enforce; 753 int err = 0; 754 755 ASSERT(dsl_pool_config_held(dd->dd_pool)); 756 ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT || 757 prop == ZFS_PROP_SNAPSHOT_LIMIT); 758 759 /* 760 * If we're allowed to change the limit, don't enforce the limit 761 * e.g. this can happen if a snapshot is taken by an administrative 762 * user in the global zone (i.e. a recursive snapshot by root). 763 * However, we must handle the case of delegated permissions where we 764 * are allowed to change the limit on the current dataset, but there 765 * is another limit in the tree above. 766 */ 767 enforce = dsl_enforce_ds_ss_limits(dd, prop, cr); 768 if (enforce == ENFORCE_NEVER) 769 return (0); 770 771 /* 772 * e.g. if renaming a dataset with no snapshots, count adjustment 773 * is 0. 774 */ 775 if (delta == 0) 776 return (0); 777 778 if (prop == ZFS_PROP_SNAPSHOT_LIMIT) { 779 /* 780 * We don't enforce the limit for temporary snapshots. This is 781 * indicated by a NULL cred_t argument. 782 */ 783 if (cr == NULL) 784 return (0); 785 786 count_prop = DD_FIELD_SNAPSHOT_COUNT; 787 } else { 788 count_prop = DD_FIELD_FILESYSTEM_COUNT; 789 } 790 791 /* 792 * If an ancestor has been provided, stop checking the limit once we 793 * hit that dir. We need this during rename so that we don't overcount 794 * the check once we recurse up to the common ancestor. 795 */ 796 if (ancestor == dd) 797 return (0); 798 799 /* 800 * If we hit an uninitialized node while recursing up the tree, we can 801 * stop since we know there is no limit here (or above). The counts are 802 * not valid on this node and we know we won't touch this node's counts. 
803 */ 804 if (!dsl_dir_is_zapified(dd) || zap_lookup(os, dd->dd_object, 805 count_prop, sizeof (count), 1, &count) == ENOENT) 806 return (0); 807 808 err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL, 809 B_FALSE); 810 if (err != 0) 811 return (err); 812 813 /* Is there a limit which we've hit? */ 814 if (enforce == ENFORCE_ALWAYS && (count + delta) > limit) 815 return (SET_ERROR(EDQUOT)); 816 817 if (dd->dd_parent != NULL) 818 err = dsl_fs_ss_limit_check(dd->dd_parent, delta, prop, 819 ancestor, cr); 820 821 return (err); 822} 823 824/* 825 * Adjust the filesystem or snapshot count for the specified dsl_dir_t and all 826 * parents. When a new filesystem/snapshot is created, increment the count on 827 * all parents, and when a filesystem/snapshot is destroyed, decrement the 828 * count. 829 */ 830void 831dsl_fs_ss_count_adjust(dsl_dir_t *dd, int64_t delta, const char *prop, 832 dmu_tx_t *tx) 833{ 834 int err; 835 objset_t *os = dd->dd_pool->dp_meta_objset; 836 uint64_t count; 837 838 ASSERT(dsl_pool_config_held(dd->dd_pool)); 839 ASSERT(dmu_tx_is_syncing(tx)); 840 ASSERT(strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0 || 841 strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0); 842 843 /* 844 * When we receive an incremental stream into a filesystem that already 845 * exists, a temporary clone is created. We don't count this temporary 846 * clone, whose name begins with a '%'. We also ignore hidden ($FREE, 847 * $MOS & $ORIGIN) objsets. 848 */ 849 if ((dd->dd_myname[0] == '%' || dd->dd_myname[0] == '$') && 850 strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0) 851 return; 852 853 /* 854 * e.g. if renaming a dataset with no snapshots, count adjustment is 0 855 */ 856 if (delta == 0) 857 return; 858 859 /* 860 * If we hit an uninitialized node while recursing up the tree, we can 861 * stop since we know the counts are not valid on this node and we 862 * know we shouldn't touch this node's counts. 
An uninitialized count 863 * on the node indicates that either the feature has not yet been 864 * activated or there are no limits on this part of the tree. 865 */ 866 if (!dsl_dir_is_zapified(dd) || (err = zap_lookup(os, dd->dd_object, 867 prop, sizeof (count), 1, &count)) == ENOENT) 868 return; 869 VERIFY0(err); 870 871 count += delta; 872 /* Use a signed verify to make sure we're not neg. */ 873 VERIFY3S(count, >=, 0); 874 875 VERIFY0(zap_update(os, dd->dd_object, prop, sizeof (count), 1, &count, 876 tx)); 877 878 /* Roll up this additional count into our ancestors */ 879 if (dd->dd_parent != NULL) 880 dsl_fs_ss_count_adjust(dd->dd_parent, delta, prop, tx); 881} 882 883uint64_t 884dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, 885 dmu_tx_t *tx) 886{ 887 objset_t *mos = dp->dp_meta_objset; 888 uint64_t ddobj; 889 dsl_dir_phys_t *ddphys; 890 dmu_buf_t *dbuf; 891 892 ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0, 893 DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx); 894 if (pds) { 895 VERIFY(0 == zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj, 896 name, sizeof (uint64_t), 1, &ddobj, tx)); 897 } else { 898 /* it's the root dir */ 899 VERIFY(0 == zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, 900 DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx)); 901 } 902 VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf)); 903 dmu_buf_will_dirty(dbuf, tx); 904 ddphys = dbuf->db_data; 905 906 ddphys->dd_creation_time = gethrestime_sec(); 907 if (pds) { 908 ddphys->dd_parent_obj = pds->dd_object; 909 910 /* update the filesystem counts */ 911 dsl_fs_ss_count_adjust(pds, 1, DD_FIELD_FILESYSTEM_COUNT, tx); 912 } 913 ddphys->dd_props_zapobj = zap_create(mos, 914 DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); 915 ddphys->dd_child_dir_zapobj = zap_create(mos, 916 DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); 917 if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN) 918 ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN; 919 dmu_buf_rele(dbuf, FTAG); 920 921 return 
(ddobj); 922} 923 924boolean_t 925dsl_dir_is_clone(dsl_dir_t *dd) 926{ 927 return (dsl_dir_phys(dd)->dd_origin_obj && 928 (dd->dd_pool->dp_origin_snap == NULL || 929 dsl_dir_phys(dd)->dd_origin_obj != 930 dd->dd_pool->dp_origin_snap->ds_object)); 931} 932 933void 934dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) 935{ 936 mutex_enter(&dd->dd_lock); 937 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, 938 dsl_dir_phys(dd)->dd_used_bytes); 939 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, 940 dsl_dir_phys(dd)->dd_quota); 941 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION, 942 dsl_dir_phys(dd)->dd_reserved); 943 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, 944 dsl_dir_phys(dd)->dd_compressed_bytes == 0 ? 100 : 945 (dsl_dir_phys(dd)->dd_uncompressed_bytes * 100 / 946 dsl_dir_phys(dd)->dd_compressed_bytes)); 947 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED, 948 dsl_dir_phys(dd)->dd_uncompressed_bytes); 949 if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) { 950 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP, 951 dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]); 952 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS, 953 dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_HEAD]); 954 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV, 955 dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_REFRSRV]); 956 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD, 957 dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD] + 958 dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD_RSRV]); 959 } 960 mutex_exit(&dd->dd_lock); 961 962 if (dsl_dir_is_zapified(dd)) { 963 uint64_t count; 964 objset_t *os = dd->dd_pool->dp_meta_objset; 965 966 if (zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, 967 sizeof (count), 1, &count) == 0) { 968 dsl_prop_nvlist_add_uint64(nv, 969 ZFS_PROP_FILESYSTEM_COUNT, count); 970 } 971 if (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, 972 sizeof (count), 1, &count) == 0) { 973 dsl_prop_nvlist_add_uint64(nv, 974 ZFS_PROP_SNAPSHOT_COUNT, 
count); 975 } 976 } 977 978 if (dsl_dir_is_clone(dd)) { 979 dsl_dataset_t *ds; 980 char buf[MAXNAMELEN]; 981 982 VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, 983 dsl_dir_phys(dd)->dd_origin_obj, FTAG, &ds)); 984 dsl_dataset_name(ds, buf); 985 dsl_dataset_rele(ds, FTAG); 986 dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf); 987 } 988} 989 990void 991dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx) 992{ 993 dsl_pool_t *dp = dd->dd_pool; 994 995 ASSERT(dsl_dir_phys(dd)); 996 997 if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) { 998 /* up the hold count until we can be written out */ 999 dmu_buf_add_ref(dd->dd_dbuf, dd); 1000 } 1001} 1002 1003static int64_t 1004parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta) 1005{ 1006 uint64_t old_accounted = MAX(used, dsl_dir_phys(dd)->dd_reserved); 1007 uint64_t new_accounted = 1008 MAX(used + delta, dsl_dir_phys(dd)->dd_reserved); 1009 return (new_accounted - old_accounted); 1010} 1011 1012void 1013dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx) 1014{ 1015 ASSERT(dmu_tx_is_syncing(tx)); 1016 1017 mutex_enter(&dd->dd_lock); 1018 ASSERT0(dd->dd_tempreserved[tx->tx_txg&TXG_MASK]); 1019 dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg, 1020 dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024); 1021 dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0; 1022 mutex_exit(&dd->dd_lock); 1023 1024 /* release the hold from dsl_dir_dirty */ 1025 dmu_buf_rele(dd->dd_dbuf, dd); 1026} 1027 1028static uint64_t 1029dsl_dir_space_towrite(dsl_dir_t *dd) 1030{ 1031 uint64_t space = 0; 1032 int i; 1033 1034 ASSERT(MUTEX_HELD(&dd->dd_lock)); 1035 1036 for (i = 0; i < TXG_SIZE; i++) { 1037 space += dd->dd_space_towrite[i&TXG_MASK]; 1038 ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0); 1039 } 1040 return (space); 1041} 1042 1043/* 1044 * How much space would dd have available if ancestor had delta applied 1045 * to it? If ondiskonly is set, we're only interested in what's 1046 * on-disk, not estimated pending changes. 
1047 */ 1048uint64_t 1049dsl_dir_space_available(dsl_dir_t *dd, 1050 dsl_dir_t *ancestor, int64_t delta, int ondiskonly) 1051{ 1052 uint64_t parentspace, myspace, quota, used; 1053 1054 /* 1055 * If there are no restrictions otherwise, assume we have 1056 * unlimited space available. 1057 */ 1058 quota = UINT64_MAX; 1059 parentspace = UINT64_MAX; 1060 1061 if (dd->dd_parent != NULL) { 1062 parentspace = dsl_dir_space_available(dd->dd_parent, 1063 ancestor, delta, ondiskonly); 1064 } 1065 1066 mutex_enter(&dd->dd_lock); 1067 if (dsl_dir_phys(dd)->dd_quota != 0) 1068 quota = dsl_dir_phys(dd)->dd_quota; 1069 used = dsl_dir_phys(dd)->dd_used_bytes; 1070 if (!ondiskonly) 1071 used += dsl_dir_space_towrite(dd); 1072 1073 if (dd->dd_parent == NULL) { 1074 uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, FALSE); 1075 quota = MIN(quota, poolsize); 1076 } 1077 1078 if (dsl_dir_phys(dd)->dd_reserved > used && parentspace != UINT64_MAX) { 1079 /* 1080 * We have some space reserved, in addition to what our 1081 * parent gave us. 
1082 */ 1083 parentspace += dsl_dir_phys(dd)->dd_reserved - used; 1084 } 1085 1086 if (dd == ancestor) { 1087 ASSERT(delta <= 0); 1088 ASSERT(used >= -delta); 1089 used += delta; 1090 if (parentspace != UINT64_MAX) 1091 parentspace -= delta; 1092 } 1093 1094 if (used > quota) { 1095 /* over quota */ 1096 myspace = 0; 1097 } else { 1098 /* 1099 * the lesser of the space provided by our parent and 1100 * the space left in our quota 1101 */ 1102 myspace = MIN(parentspace, quota - used); 1103 } 1104 1105 mutex_exit(&dd->dd_lock); 1106 1107 return (myspace); 1108} 1109 1110struct tempreserve { 1111 list_node_t tr_node; 1112 dsl_dir_t *tr_ds; 1113 uint64_t tr_size; 1114}; 1115 1116static int 1117dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, 1118 boolean_t ignorequota, boolean_t checkrefquota, list_t *tr_list, 1119 dmu_tx_t *tx, boolean_t first) 1120{ 1121 uint64_t txg = tx->tx_txg; 1122 uint64_t est_inflight, used_on_disk, quota, parent_rsrv; 1123 uint64_t deferred = 0; 1124 struct tempreserve *tr; 1125 int retval = EDQUOT; 1126 int txgidx = txg & TXG_MASK; 1127 int i; 1128 uint64_t ref_rsrv = 0; 1129 1130 ASSERT3U(txg, !=, 0); 1131 ASSERT3S(asize, >, 0); 1132 1133 mutex_enter(&dd->dd_lock); 1134 1135 /* 1136 * Check against the dsl_dir's quota. We don't add in the delta 1137 * when checking for over-quota because they get one free hit. 1138 */ 1139 est_inflight = dsl_dir_space_towrite(dd); 1140 for (i = 0; i < TXG_SIZE; i++) 1141 est_inflight += dd->dd_tempreserved[i]; 1142 used_on_disk = dsl_dir_phys(dd)->dd_used_bytes; 1143 1144 /* 1145 * On the first iteration, fetch the dataset's used-on-disk and 1146 * refreservation values. Also, if checkrefquota is set, test if 1147 * allocating this space would exceed the dataset's refquota. 
1148 */ 1149 if (first && tx->tx_objset) { 1150 int error; 1151 dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset; 1152 1153 error = dsl_dataset_check_quota(ds, checkrefquota, 1154 asize, est_inflight, &used_on_disk, &ref_rsrv); 1155 if (error) { 1156 mutex_exit(&dd->dd_lock); 1157 return (error); 1158 } 1159 } 1160 1161 /* 1162 * If this transaction will result in a net free of space, 1163 * we want to let it through. 1164 */ 1165 if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0) 1166 quota = UINT64_MAX; 1167 else 1168 quota = dsl_dir_phys(dd)->dd_quota; 1169 1170 /* 1171 * Adjust the quota against the actual pool size at the root 1172 * minus any outstanding deferred frees. 1173 * To ensure that it's possible to remove files from a full 1174 * pool without inducing transient overcommits, we throttle 1175 * netfree transactions against a quota that is slightly larger, 1176 * but still within the pool's allocation slop. In cases where 1177 * we're very close to full, this will allow a steady trickle of 1178 * removes to get through. 1179 */ 1180 if (dd->dd_parent == NULL) { 1181 spa_t *spa = dd->dd_pool->dp_spa; 1182 uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree); 1183 deferred = metaslab_class_get_deferred(spa_normal_class(spa)); 1184 if (poolsize - deferred < quota) { 1185 quota = poolsize - deferred; 1186 retval = ENOSPC; 1187 } 1188 } 1189 1190 /* 1191 * If they are requesting more space, and our current estimate 1192 * is over quota, they get to try again unless the actual 1193 * on-disk is over quota and there are no pending changes (which 1194 * may free up space for us). 
1195 */ 1196 if (used_on_disk + est_inflight >= quota) { 1197 if (est_inflight > 0 || used_on_disk < quota || 1198 (retval == ENOSPC && used_on_disk < quota + deferred)) 1199 retval = ERESTART; 1200 dprintf_dd(dd, "failing: used=%lluK inflight = %lluK " 1201 "quota=%lluK tr=%lluK err=%d\n", 1202 used_on_disk>>10, est_inflight>>10, 1203 quota>>10, asize>>10, retval); 1204 mutex_exit(&dd->dd_lock); 1205 return (SET_ERROR(retval)); 1206 } 1207 1208 /* We need to up our estimated delta before dropping dd_lock */ 1209 dd->dd_tempreserved[txgidx] += asize; 1210 1211 parent_rsrv = parent_delta(dd, used_on_disk + est_inflight, 1212 asize - ref_rsrv); 1213 mutex_exit(&dd->dd_lock); 1214 1215 tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); 1216 tr->tr_ds = dd; 1217 tr->tr_size = asize; 1218 list_insert_tail(tr_list, tr); 1219 1220 /* see if it's OK with our parent */ 1221 if (dd->dd_parent && parent_rsrv) { 1222 boolean_t ismos = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0); 1223 1224 return (dsl_dir_tempreserve_impl(dd->dd_parent, 1225 parent_rsrv, netfree, ismos, TRUE, tr_list, tx, FALSE)); 1226 } else { 1227 return (0); 1228 } 1229} 1230 1231/* 1232 * Reserve space in this dsl_dir, to be used in this tx's txg. 1233 * After the space has been dirtied (and dsl_dir_willuse_space() 1234 * has been called), the reservation should be canceled, using 1235 * dsl_dir_tempreserve_clear(). 
1236 */ 1237int 1238dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, 1239 uint64_t fsize, uint64_t usize, void **tr_cookiep, dmu_tx_t *tx) 1240{ 1241 int err; 1242 list_t *tr_list; 1243 1244 if (asize == 0) { 1245 *tr_cookiep = NULL; 1246 return (0); 1247 } 1248 1249 tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP); 1250 list_create(tr_list, sizeof (struct tempreserve), 1251 offsetof(struct tempreserve, tr_node)); 1252 ASSERT3S(asize, >, 0); 1253 ASSERT3S(fsize, >=, 0); 1254 1255 err = arc_tempreserve_space(lsize, tx->tx_txg); 1256 if (err == 0) { 1257 struct tempreserve *tr; 1258 1259 tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); 1260 tr->tr_size = lsize; 1261 list_insert_tail(tr_list, tr); 1262 } else { 1263 if (err == EAGAIN) { 1264 /* 1265 * If arc_memory_throttle() detected that pageout 1266 * is running and we are low on memory, we delay new 1267 * non-pageout transactions to give pageout an 1268 * advantage. 1269 * 1270 * It is unfortunate to be delaying while the caller's 1271 * locks are held. 1272 */ 1273 txg_delay(dd->dd_pool, tx->tx_txg, 1274 MSEC2NSEC(10), MSEC2NSEC(10)); 1275 err = SET_ERROR(ERESTART); 1276 } 1277 } 1278 1279 if (err == 0) { 1280 err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize, 1281 FALSE, asize > usize, tr_list, tx, TRUE); 1282 } 1283 1284 if (err != 0) 1285 dsl_dir_tempreserve_clear(tr_list, tx); 1286 else 1287 *tr_cookiep = tr_list; 1288 1289 return (err); 1290} 1291 1292/* 1293 * Clear a temporary reservation that we previously made with 1294 * dsl_dir_tempreserve_space(). 
1295 */ 1296void 1297dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) 1298{ 1299 int txgidx = tx->tx_txg & TXG_MASK; 1300 list_t *tr_list = tr_cookie; 1301 struct tempreserve *tr; 1302 1303 ASSERT3U(tx->tx_txg, !=, 0); 1304 1305 if (tr_cookie == NULL) 1306 return; 1307 1308 while ((tr = list_head(tr_list)) != NULL) { 1309 if (tr->tr_ds) { 1310 mutex_enter(&tr->tr_ds->dd_lock); 1311 ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=, 1312 tr->tr_size); 1313 tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size; 1314 mutex_exit(&tr->tr_ds->dd_lock); 1315 } else { 1316 arc_tempreserve_clear(tr->tr_size); 1317 } 1318 list_remove(tr_list, tr); 1319 kmem_free(tr, sizeof (struct tempreserve)); 1320 } 1321 1322 kmem_free(tr_list, sizeof (list_t)); 1323} 1324 1325/* 1326 * This should be called from open context when we think we're going to write 1327 * or free space, for example when dirtying data. Be conservative; it's okay 1328 * to write less space or free more, but we don't want to write more or free 1329 * less than the amount specified. 1330 */ 1331void 1332dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) 1333{ 1334 int64_t parent_space; 1335 uint64_t est_used; 1336 1337 mutex_enter(&dd->dd_lock); 1338 if (space > 0) 1339 dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space; 1340 1341 est_used = dsl_dir_space_towrite(dd) + dsl_dir_phys(dd)->dd_used_bytes; 1342 parent_space = parent_delta(dd, est_used, space); 1343 mutex_exit(&dd->dd_lock); 1344 1345 /* Make sure that we clean up dd_space_to* */ 1346 dsl_dir_dirty(dd, tx); 1347 1348 /* XXX this is potentially expensive and unnecessary... 
*/ 1349 if (parent_space && dd->dd_parent) 1350 dsl_dir_willuse_space(dd->dd_parent, parent_space, tx); 1351} 1352 1353/* call from syncing context when we actually write/free space for this dd */ 1354void 1355dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, 1356 int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx) 1357{ 1358 int64_t accounted_delta; 1359 1360 /* 1361 * dsl_dataset_set_refreservation_sync_impl() calls this with 1362 * dd_lock held, so that it can atomically update 1363 * ds->ds_reserved and the dsl_dir accounting, so that 1364 * dsl_dataset_check_quota() can see dataset and dir accounting 1365 * consistently. 1366 */ 1367 boolean_t needlock = !MUTEX_HELD(&dd->dd_lock); 1368 1369 ASSERT(dmu_tx_is_syncing(tx)); 1370 ASSERT(type < DD_USED_NUM); 1371 1372 dmu_buf_will_dirty(dd->dd_dbuf, tx); 1373 1374 if (needlock) 1375 mutex_enter(&dd->dd_lock); 1376 accounted_delta = 1377 parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, used); 1378 ASSERT(used >= 0 || dsl_dir_phys(dd)->dd_used_bytes >= -used); 1379 ASSERT(compressed >= 0 || 1380 dsl_dir_phys(dd)->dd_compressed_bytes >= -compressed); 1381 ASSERT(uncompressed >= 0 || 1382 dsl_dir_phys(dd)->dd_uncompressed_bytes >= -uncompressed); 1383 dsl_dir_phys(dd)->dd_used_bytes += used; 1384 dsl_dir_phys(dd)->dd_uncompressed_bytes += uncompressed; 1385 dsl_dir_phys(dd)->dd_compressed_bytes += compressed; 1386 1387 if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) { 1388 ASSERT(used > 0 || 1389 dsl_dir_phys(dd)->dd_used_breakdown[type] >= -used); 1390 dsl_dir_phys(dd)->dd_used_breakdown[type] += used; 1391#ifdef DEBUG 1392 dd_used_t t; 1393 uint64_t u = 0; 1394 for (t = 0; t < DD_USED_NUM; t++) 1395 u += dsl_dir_phys(dd)->dd_used_breakdown[t]; 1396 ASSERT3U(u, ==, dsl_dir_phys(dd)->dd_used_bytes); 1397#endif 1398 } 1399 if (needlock) 1400 mutex_exit(&dd->dd_lock); 1401 1402 if (dd->dd_parent != NULL) { 1403 dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, 1404 accounted_delta, 
compressed, uncompressed, tx); 1405 dsl_dir_transfer_space(dd->dd_parent, 1406 used - accounted_delta, 1407 DD_USED_CHILD_RSRV, DD_USED_CHILD, NULL); 1408 } 1409} 1410 1411void 1412dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, 1413 dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx) 1414{ 1415 ASSERT(tx == NULL || dmu_tx_is_syncing(tx)); 1416 ASSERT(oldtype < DD_USED_NUM); 1417 ASSERT(newtype < DD_USED_NUM); 1418 1419 if (delta == 0 || 1420 !(dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN)) 1421 return; 1422 1423 if (tx != NULL) 1424 dmu_buf_will_dirty(dd->dd_dbuf, tx); 1425 mutex_enter(&dd->dd_lock); 1426 ASSERT(delta > 0 ? 1427 dsl_dir_phys(dd)->dd_used_breakdown[oldtype] >= delta : 1428 dsl_dir_phys(dd)->dd_used_breakdown[newtype] >= -delta); 1429 ASSERT(dsl_dir_phys(dd)->dd_used_bytes >= ABS(delta)); 1430 dsl_dir_phys(dd)->dd_used_breakdown[oldtype] -= delta; 1431 dsl_dir_phys(dd)->dd_used_breakdown[newtype] += delta; 1432 mutex_exit(&dd->dd_lock); 1433} 1434 1435typedef struct dsl_dir_set_qr_arg { 1436 const char *ddsqra_name; 1437 zprop_source_t ddsqra_source; 1438 uint64_t ddsqra_value; 1439} dsl_dir_set_qr_arg_t; 1440 1441static int 1442dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx) 1443{ 1444 dsl_dir_set_qr_arg_t *ddsqra = arg; 1445 dsl_pool_t *dp = dmu_tx_pool(tx); 1446 dsl_dataset_t *ds; 1447 int error; 1448 uint64_t towrite, newval; 1449 1450 error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); 1451 if (error != 0) 1452 return (error); 1453 1454 error = dsl_prop_predict(ds->ds_dir, "quota", 1455 ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); 1456 if (error != 0) { 1457 dsl_dataset_rele(ds, FTAG); 1458 return (error); 1459 } 1460 1461 if (newval == 0) { 1462 dsl_dataset_rele(ds, FTAG); 1463 return (0); 1464 } 1465 1466 mutex_enter(&ds->ds_dir->dd_lock); 1467 /* 1468 * If we are doing the preliminary check in open context, and 1469 * there are pending changes, then don't fail it, since the 1470 * pending changes could 
under-estimate the amount of space to be 1471 * freed up. 1472 */ 1473 towrite = dsl_dir_space_towrite(ds->ds_dir); 1474 if ((dmu_tx_is_syncing(tx) || towrite == 0) && 1475 (newval < dsl_dir_phys(ds->ds_dir)->dd_reserved || 1476 newval < dsl_dir_phys(ds->ds_dir)->dd_used_bytes + towrite)) { 1477 error = SET_ERROR(ENOSPC); 1478 } 1479 mutex_exit(&ds->ds_dir->dd_lock); 1480 dsl_dataset_rele(ds, FTAG); 1481 return (error); 1482} 1483 1484static void 1485dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx) 1486{ 1487 dsl_dir_set_qr_arg_t *ddsqra = arg; 1488 dsl_pool_t *dp = dmu_tx_pool(tx); 1489 dsl_dataset_t *ds; 1490 uint64_t newval; 1491 1492 VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); 1493 1494 if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) { 1495 dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA), 1496 ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, 1497 &ddsqra->ddsqra_value, tx); 1498 1499 VERIFY0(dsl_prop_get_int_ds(ds, 1500 zfs_prop_to_name(ZFS_PROP_QUOTA), &newval)); 1501 } else { 1502 newval = ddsqra->ddsqra_value; 1503 spa_history_log_internal_ds(ds, "set", tx, "%s=%lld", 1504 zfs_prop_to_name(ZFS_PROP_QUOTA), (longlong_t)newval); 1505 } 1506 1507 dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); 1508 mutex_enter(&ds->ds_dir->dd_lock); 1509 dsl_dir_phys(ds->ds_dir)->dd_quota = newval; 1510 mutex_exit(&ds->ds_dir->dd_lock); 1511 dsl_dataset_rele(ds, FTAG); 1512} 1513 1514int 1515dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota) 1516{ 1517 dsl_dir_set_qr_arg_t ddsqra; 1518 1519 ddsqra.ddsqra_name = ddname; 1520 ddsqra.ddsqra_source = source; 1521 ddsqra.ddsqra_value = quota; 1522 1523 return (dsl_sync_task(ddname, dsl_dir_set_quota_check, 1524 dsl_dir_set_quota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE)); 1525} 1526 1527int 1528dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx) 1529{ 1530 dsl_dir_set_qr_arg_t *ddsqra = arg; 1531 dsl_pool_t *dp = dmu_tx_pool(tx); 1532 dsl_dataset_t *ds; 1533 
dsl_dir_t *dd; 1534 uint64_t newval, used, avail; 1535 int error; 1536 1537 error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); 1538 if (error != 0) 1539 return (error); 1540 dd = ds->ds_dir; 1541 1542 /* 1543 * If we are doing the preliminary check in open context, the 1544 * space estimates may be inaccurate. 1545 */ 1546 if (!dmu_tx_is_syncing(tx)) { 1547 dsl_dataset_rele(ds, FTAG); 1548 return (0); 1549 } 1550 1551 error = dsl_prop_predict(ds->ds_dir, 1552 zfs_prop_to_name(ZFS_PROP_RESERVATION), 1553 ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); 1554 if (error != 0) { 1555 dsl_dataset_rele(ds, FTAG); 1556 return (error); 1557 } 1558 1559 mutex_enter(&dd->dd_lock); 1560 used = dsl_dir_phys(dd)->dd_used_bytes; 1561 mutex_exit(&dd->dd_lock); 1562 1563 if (dd->dd_parent) { 1564 avail = dsl_dir_space_available(dd->dd_parent, 1565 NULL, 0, FALSE); 1566 } else { 1567 avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used; 1568 } 1569 1570 if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) { 1571 uint64_t delta = MAX(used, newval) - 1572 MAX(used, dsl_dir_phys(dd)->dd_reserved); 1573 1574 if (delta > avail || 1575 (dsl_dir_phys(dd)->dd_quota > 0 && 1576 newval > dsl_dir_phys(dd)->dd_quota)) 1577 error = SET_ERROR(ENOSPC); 1578 } 1579 1580 dsl_dataset_rele(ds, FTAG); 1581 return (error); 1582} 1583 1584void 1585dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx) 1586{ 1587 uint64_t used; 1588 int64_t delta; 1589 1590 dmu_buf_will_dirty(dd->dd_dbuf, tx); 1591 1592 mutex_enter(&dd->dd_lock); 1593 used = dsl_dir_phys(dd)->dd_used_bytes; 1594 delta = MAX(used, value) - MAX(used, dsl_dir_phys(dd)->dd_reserved); 1595 dsl_dir_phys(dd)->dd_reserved = value; 1596 1597 if (dd->dd_parent != NULL) { 1598 /* Roll up this additional usage into our ancestors */ 1599 dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, 1600 delta, 0, 0, tx); 1601 } 1602 mutex_exit(&dd->dd_lock); 1603} 1604 1605static void 
1606dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx) 1607{ 1608 dsl_dir_set_qr_arg_t *ddsqra = arg; 1609 dsl_pool_t *dp = dmu_tx_pool(tx); 1610 dsl_dataset_t *ds; 1611 uint64_t newval; 1612 1613 VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); 1614 1615 if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) { 1616 dsl_prop_set_sync_impl(ds, 1617 zfs_prop_to_name(ZFS_PROP_RESERVATION), 1618 ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, 1619 &ddsqra->ddsqra_value, tx); 1620 1621 VERIFY0(dsl_prop_get_int_ds(ds, 1622 zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval)); 1623 } else { 1624 newval = ddsqra->ddsqra_value; 1625 spa_history_log_internal_ds(ds, "set", tx, "%s=%lld", 1626 zfs_prop_to_name(ZFS_PROP_RESERVATION), 1627 (longlong_t)newval); 1628 } 1629 1630 dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx); 1631 dsl_dataset_rele(ds, FTAG); 1632} 1633 1634int 1635dsl_dir_set_reservation(const char *ddname, zprop_source_t source, 1636 uint64_t reservation) 1637{ 1638 dsl_dir_set_qr_arg_t ddsqra; 1639 1640 ddsqra.ddsqra_name = ddname; 1641 ddsqra.ddsqra_source = source; 1642 ddsqra.ddsqra_value = reservation; 1643 1644 return (dsl_sync_task(ddname, dsl_dir_set_reservation_check, 1645 dsl_dir_set_reservation_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE)); 1646} 1647 1648static dsl_dir_t * 1649closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2) 1650{ 1651 for (; ds1; ds1 = ds1->dd_parent) { 1652 dsl_dir_t *dd; 1653 for (dd = ds2; dd; dd = dd->dd_parent) { 1654 if (ds1 == dd) 1655 return (dd); 1656 } 1657 } 1658 return (NULL); 1659} 1660 1661/* 1662 * If delta is applied to dd, how much of that delta would be applied to 1663 * ancestor? Syncing context only. 
1664 */ 1665static int64_t 1666would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor) 1667{ 1668 if (dd == ancestor) 1669 return (delta); 1670 1671 mutex_enter(&dd->dd_lock); 1672 delta = parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, delta); 1673 mutex_exit(&dd->dd_lock); 1674 return (would_change(dd->dd_parent, delta, ancestor)); 1675} 1676 1677typedef struct dsl_dir_rename_arg { 1678 const char *ddra_oldname; 1679 const char *ddra_newname; 1680 cred_t *ddra_cred; 1681} dsl_dir_rename_arg_t; 1682 1683/* ARGSUSED */ 1684static int 1685dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) 1686{ 1687 int *deltap = arg; 1688 char namebuf[MAXNAMELEN]; 1689 1690 dsl_dataset_name(ds, namebuf); 1691 1692 if (strlen(namebuf) + *deltap >= MAXNAMELEN) 1693 return (SET_ERROR(ENAMETOOLONG)); 1694 return (0); 1695} 1696 1697static int 1698dsl_dir_rename_check(void *arg, dmu_tx_t *tx) 1699{ 1700 dsl_dir_rename_arg_t *ddra = arg; 1701 dsl_pool_t *dp = dmu_tx_pool(tx); 1702 dsl_dir_t *dd, *newparent; 1703 const char *mynewname; 1704 int error; 1705 int delta = strlen(ddra->ddra_newname) - strlen(ddra->ddra_oldname); 1706 1707 /* target dir should exist */ 1708 error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL); 1709 if (error != 0) 1710 return (error); 1711 1712 /* new parent should exist */ 1713 error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG, 1714 &newparent, &mynewname); 1715 if (error != 0) { 1716 dsl_dir_rele(dd, FTAG); 1717 return (error); 1718 } 1719 1720 /* can't rename to different pool */ 1721 if (dd->dd_pool != newparent->dd_pool) { 1722 dsl_dir_rele(newparent, FTAG); 1723 dsl_dir_rele(dd, FTAG); 1724 return (SET_ERROR(EXDEV)); 1725 } 1726 1727 /* new name should not already exist */ 1728 if (mynewname == NULL) { 1729 dsl_dir_rele(newparent, FTAG); 1730 dsl_dir_rele(dd, FTAG); 1731 return (SET_ERROR(EEXIST)); 1732 } 1733 1734 /* if the name length is growing, validate child name lengths */ 1735 if (delta > 0) { 1736 error = 
dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename, 1737 &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); 1738 if (error != 0) { 1739 dsl_dir_rele(newparent, FTAG); 1740 dsl_dir_rele(dd, FTAG); 1741 return (error); 1742 } 1743 } 1744 1745 if (dmu_tx_is_syncing(tx)) { 1746 if (spa_feature_is_active(dp->dp_spa, 1747 SPA_FEATURE_FS_SS_LIMIT)) { 1748 /* 1749 * Although this is the check function and we don't 1750 * normally make on-disk changes in check functions, 1751 * we need to do that here. 1752 * 1753 * Ensure this portion of the tree's counts have been 1754 * initialized in case the new parent has limits set. 1755 */ 1756 dsl_dir_init_fs_ss_count(dd, tx); 1757 } 1758 } 1759 1760 if (newparent != dd->dd_parent) { 1761 /* is there enough space? */ 1762 uint64_t myspace = 1763 MAX(dsl_dir_phys(dd)->dd_used_bytes, 1764 dsl_dir_phys(dd)->dd_reserved); 1765 objset_t *os = dd->dd_pool->dp_meta_objset; 1766 uint64_t fs_cnt = 0; 1767 uint64_t ss_cnt = 0; 1768 1769 if (dsl_dir_is_zapified(dd)) { 1770 int err; 1771 1772 err = zap_lookup(os, dd->dd_object, 1773 DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1, 1774 &fs_cnt); 1775 if (err != ENOENT && err != 0) { 1776 dsl_dir_rele(newparent, FTAG); 1777 dsl_dir_rele(dd, FTAG); 1778 return (err); 1779 } 1780 1781 /* 1782 * have to add 1 for the filesystem itself that we're 1783 * moving 1784 */ 1785 fs_cnt++; 1786 1787 err = zap_lookup(os, dd->dd_object, 1788 DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1, 1789 &ss_cnt); 1790 if (err != ENOENT && err != 0) { 1791 dsl_dir_rele(newparent, FTAG); 1792 dsl_dir_rele(dd, FTAG); 1793 return (err); 1794 } 1795 } 1796 1797 /* no rename into our descendant */ 1798 if (closest_common_ancestor(dd, newparent) == dd) { 1799 dsl_dir_rele(newparent, FTAG); 1800 dsl_dir_rele(dd, FTAG); 1801 return (SET_ERROR(EINVAL)); 1802 } 1803 1804 error = dsl_dir_transfer_possible(dd->dd_parent, 1805 newparent, fs_cnt, ss_cnt, myspace, ddra->ddra_cred); 1806 if (error != 0) { 1807 dsl_dir_rele(newparent, 
FTAG); 1808 dsl_dir_rele(dd, FTAG); 1809 return (error); 1810 } 1811 } 1812 1813 dsl_dir_rele(newparent, FTAG); 1814 dsl_dir_rele(dd, FTAG); 1815 return (0); 1816} 1817 1818static void 1819dsl_dir_rename_sync(void *arg, dmu_tx_t *tx) 1820{ 1821 dsl_dir_rename_arg_t *ddra = arg; 1822 dsl_pool_t *dp = dmu_tx_pool(tx); 1823 dsl_dir_t *dd, *newparent; 1824 const char *mynewname; 1825 int error; 1826 objset_t *mos = dp->dp_meta_objset; 1827 1828 VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL)); 1829 VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent, 1830 &mynewname)); 1831 1832 /* Log this before we change the name. */ 1833 spa_history_log_internal_dd(dd, "rename", tx, 1834 "-> %s", ddra->ddra_newname); 1835 1836 if (newparent != dd->dd_parent) { 1837 objset_t *os = dd->dd_pool->dp_meta_objset; 1838 uint64_t fs_cnt = 0; 1839 uint64_t ss_cnt = 0; 1840 1841 /* 1842 * We already made sure the dd counts were initialized in the 1843 * check function. 1844 */ 1845 if (spa_feature_is_active(dp->dp_spa, 1846 SPA_FEATURE_FS_SS_LIMIT)) { 1847 VERIFY0(zap_lookup(os, dd->dd_object, 1848 DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1, 1849 &fs_cnt)); 1850 /* add 1 for the filesystem itself that we're moving */ 1851 fs_cnt++; 1852 1853 VERIFY0(zap_lookup(os, dd->dd_object, 1854 DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1, 1855 &ss_cnt)); 1856 } 1857 1858 dsl_fs_ss_count_adjust(dd->dd_parent, -fs_cnt, 1859 DD_FIELD_FILESYSTEM_COUNT, tx); 1860 dsl_fs_ss_count_adjust(newparent, fs_cnt, 1861 DD_FIELD_FILESYSTEM_COUNT, tx); 1862 1863 dsl_fs_ss_count_adjust(dd->dd_parent, -ss_cnt, 1864 DD_FIELD_SNAPSHOT_COUNT, tx); 1865 dsl_fs_ss_count_adjust(newparent, ss_cnt, 1866 DD_FIELD_SNAPSHOT_COUNT, tx); 1867 1868 dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, 1869 -dsl_dir_phys(dd)->dd_used_bytes, 1870 -dsl_dir_phys(dd)->dd_compressed_bytes, 1871 -dsl_dir_phys(dd)->dd_uncompressed_bytes, tx); 1872 dsl_dir_diduse_space(newparent, DD_USED_CHILD, 1873 
dsl_dir_phys(dd)->dd_used_bytes, 1874 dsl_dir_phys(dd)->dd_compressed_bytes, 1875 dsl_dir_phys(dd)->dd_uncompressed_bytes, tx); 1876 1877 if (dsl_dir_phys(dd)->dd_reserved > 1878 dsl_dir_phys(dd)->dd_used_bytes) { 1879 uint64_t unused_rsrv = dsl_dir_phys(dd)->dd_reserved - 1880 dsl_dir_phys(dd)->dd_used_bytes; 1881 1882 dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, 1883 -unused_rsrv, 0, 0, tx); 1884 dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV, 1885 unused_rsrv, 0, 0, tx); 1886 } 1887 } 1888 1889 dmu_buf_will_dirty(dd->dd_dbuf, tx); 1890 1891 /* remove from old parent zapobj */ 1892 error = zap_remove(mos, 1893 dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj, 1894 dd->dd_myname, tx); 1895 ASSERT0(error); 1896 1897 (void) strcpy(dd->dd_myname, mynewname); 1898 dsl_dir_rele(dd->dd_parent, dd); 1899 dsl_dir_phys(dd)->dd_parent_obj = newparent->dd_object; 1900 VERIFY0(dsl_dir_hold_obj(dp, 1901 newparent->dd_object, NULL, dd, &dd->dd_parent)); 1902 1903 /* add to new parent zapobj */ 1904 VERIFY0(zap_add(mos, dsl_dir_phys(newparent)->dd_child_dir_zapobj, 1905 dd->dd_myname, 8, 1, &dd->dd_object, tx)); 1906 1907#ifdef __FreeBSD__ 1908#ifdef _KERNEL 1909 zfsvfs_update_fromname(ddra->ddra_oldname, ddra->ddra_newname); 1910 zvol_rename_minors(ddra->ddra_oldname, ddra->ddra_newname); 1911#endif 1912#endif 1913 1914 dsl_prop_notify_all(dd); 1915 1916 dsl_dir_rele(newparent, FTAG); 1917 dsl_dir_rele(dd, FTAG); 1918} 1919 1920int 1921dsl_dir_rename(const char *oldname, const char *newname) 1922{ 1923 dsl_dir_rename_arg_t ddra; 1924 1925 ddra.ddra_oldname = oldname; 1926 ddra.ddra_newname = newname; 1927 ddra.ddra_cred = CRED(); 1928 1929 return (dsl_sync_task(oldname, 1930 dsl_dir_rename_check, dsl_dir_rename_sync, &ddra, 1931 3, ZFS_SPACE_CHECK_RESERVED)); 1932} 1933 1934int 1935dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, 1936 uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *cr) 1937{ 1938 dsl_dir_t *ancestor; 1939 int64_t adelta; 1940 
uint64_t avail; 1941 int err; 1942 1943 ancestor = closest_common_ancestor(sdd, tdd); 1944 adelta = would_change(sdd, -space, ancestor); 1945 avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE); 1946 if (avail < space) 1947 return (SET_ERROR(ENOSPC)); 1948 1949 err = dsl_fs_ss_limit_check(tdd, fs_cnt, ZFS_PROP_FILESYSTEM_LIMIT, 1950 ancestor, cr); 1951 if (err != 0) 1952 return (err); 1953 err = dsl_fs_ss_limit_check(tdd, ss_cnt, ZFS_PROP_SNAPSHOT_LIMIT, 1954 ancestor, cr); 1955 if (err != 0) 1956 return (err); 1957 1958 return (0); 1959} 1960 1961timestruc_t 1962dsl_dir_snap_cmtime(dsl_dir_t *dd) 1963{ 1964 timestruc_t t; 1965 1966 mutex_enter(&dd->dd_lock); 1967 t = dd->dd_snap_cmtime; 1968 mutex_exit(&dd->dd_lock); 1969 1970 return (t); 1971} 1972 1973void 1974dsl_dir_snap_cmtime_update(dsl_dir_t *dd) 1975{ 1976 timestruc_t t; 1977 1978 gethrestime(&t); 1979 mutex_enter(&dd->dd_lock); 1980 dd->dd_snap_cmtime = t; 1981 mutex_exit(&dd->dd_lock); 1982} 1983 1984void 1985dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx) 1986{ 1987 objset_t *mos = dd->dd_pool->dp_meta_objset; 1988 dmu_object_zapify(mos, dd->dd_object, DMU_OT_DSL_DIR, tx); 1989} 1990 1991boolean_t 1992dsl_dir_is_zapified(dsl_dir_t *dd) 1993{ 1994 dmu_object_info_t doi; 1995 1996 dmu_object_info_from_db(dd->dd_dbuf, &doi); 1997 return (doi.doi_type == DMU_OTN_ZAP_METADATA); 1998} 1999