zfs_ctldir.c revision 330736
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 24 * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. 25 */ 26 27/* 28 * ZFS control directory (a.k.a. ".zfs") 29 * 30 * This directory provides a common location for all ZFS meta-objects. 31 * Currently, this is only the 'snapshot' directory, but this may expand in the 32 * future. The elements are built using the GFS primitives, as the hierarchy 33 * does not actually exist on disk. 34 * 35 * For 'snapshot', we don't want to have all snapshots always mounted, because 36 * this would take up a huge amount of space in /etc/mnttab. We have three 37 * types of objects: 38 * 39 * ctldir ------> snapshotdir -------> snapshot 40 * | 41 * | 42 * V 43 * mounted fs 44 * 45 * The 'snapshot' node contains just enough information to lookup '..' and act 46 * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we 47 * perform an automount of the underlying filesystem and return the 48 * corresponding vnode. 
49 * 50 * All mounts are handled automatically by the kernel, but unmounts are 51 * (currently) handled from user land. The main reason is that there is no 52 * reliable way to auto-unmount the filesystem when it's "no longer in use". 53 * When the user unmounts a filesystem, we call zfsctl_unmount(), which 54 * unmounts any snapshots within the snapshot directory. 55 * 56 * The '.zfs', '.zfs/snapshot', and all directories created under 57 * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') are all GFS nodes and 58 * share the same vfs_t as the head filesystem (what '.zfs' lives under). 59 * 60 * File systems mounted ontop of the GFS nodes '.zfs/snapshot/<snapname>' 61 * (ie: snapshots) are ZFS nodes and have their own unique vfs_t. 62 * However, vnodes within these mounted on file systems have their v_vfsp 63 * fields set to the head filesystem to make NFS happy (see 64 * zfsctl_snapdir_lookup()). We VFS_HOLD the head filesystem's vfs_t 65 * so that it cannot be freed until all snapshots have been unmounted. 66 */ 67 68#include <sys/zfs_context.h> 69#include <sys/zfs_ctldir.h> 70#include <sys/zfs_ioctl.h> 71#include <sys/zfs_vfsops.h> 72#include <sys/namei.h> 73#include <sys/stat.h> 74#include <sys/dmu.h> 75#include <sys/dsl_dataset.h> 76#include <sys/dsl_destroy.h> 77#include <sys/dsl_deleg.h> 78#include <sys/mount.h> 79#include <sys/zap.h> 80 81#include "zfs_namecheck.h" 82 83/* Common access mode for all virtual directories under the ctldir */ 84const u_short zfsctl_ctldir_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | 85 S_IROTH | S_IXOTH; 86 87/* 88 * "Synthetic" filesystem implementation. 89 */ 90 91/* 92 * Assert that A implies B. 
93 */ 94#define KASSERT_IMPLY(A, B, msg) KASSERT(!(A) || (B), (msg)); 95 96static MALLOC_DEFINE(M_SFSNODES, "sfs_nodes", "synthetic-fs nodes"); 97 98typedef struct sfs_node { 99 char sn_name[ZFS_MAX_DATASET_NAME_LEN]; 100 uint64_t sn_parent_id; 101 uint64_t sn_id; 102} sfs_node_t; 103 104/* 105 * Check the parent's ID as well as the node's to account for a chance 106 * that IDs originating from different domains (snapshot IDs, artifical 107 * IDs, znode IDs) may clash. 108 */ 109static int 110sfs_compare_ids(struct vnode *vp, void *arg) 111{ 112 sfs_node_t *n1 = vp->v_data; 113 sfs_node_t *n2 = arg; 114 bool equal; 115 116 equal = n1->sn_id == n2->sn_id && 117 n1->sn_parent_id == n2->sn_parent_id; 118 119 /* Zero means equality. */ 120 return (!equal); 121} 122 123static int 124sfs_vnode_get(const struct mount *mp, int flags, uint64_t parent_id, 125 uint64_t id, struct vnode **vpp) 126{ 127 sfs_node_t search; 128 int err; 129 130 search.sn_id = id; 131 search.sn_parent_id = parent_id; 132 err = vfs_hash_get(mp, (u_int)id, flags, curthread, vpp, 133 sfs_compare_ids, &search); 134 return (err); 135} 136 137static int 138sfs_vnode_insert(struct vnode *vp, int flags, uint64_t parent_id, 139 uint64_t id, struct vnode **vpp) 140{ 141 int err; 142 143 KASSERT(vp->v_data != NULL, ("sfs_vnode_insert with NULL v_data")); 144 err = vfs_hash_insert(vp, (u_int)id, flags, curthread, vpp, 145 sfs_compare_ids, vp->v_data); 146 return (err); 147} 148 149static void 150sfs_vnode_remove(struct vnode *vp) 151{ 152 vfs_hash_remove(vp); 153} 154 155typedef void sfs_vnode_setup_fn(vnode_t *vp, void *arg); 156 157static int 158sfs_vgetx(struct mount *mp, int flags, uint64_t parent_id, uint64_t id, 159 const char *tag, struct vop_vector *vops, 160 sfs_vnode_setup_fn setup, void *arg, 161 struct vnode **vpp) 162{ 163 struct vnode *vp; 164 int error; 165 166 error = sfs_vnode_get(mp, flags, parent_id, id, vpp); 167 if (error != 0 || *vpp != NULL) { 168 KASSERT_IMPLY(error == 0, 
(*vpp)->v_data != NULL, 169 "sfs vnode with no data"); 170 return (error); 171 } 172 173 /* Allocate a new vnode/inode. */ 174 error = getnewvnode(tag, mp, vops, &vp); 175 if (error != 0) { 176 *vpp = NULL; 177 return (error); 178 } 179 180 /* 181 * Exclusively lock the vnode vnode while it's being constructed. 182 */ 183 lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL); 184 error = insmntque(vp, mp); 185 if (error != 0) { 186 *vpp = NULL; 187 return (error); 188 } 189 190 setup(vp, arg); 191 192 error = sfs_vnode_insert(vp, flags, parent_id, id, vpp); 193 if (error != 0 || *vpp != NULL) { 194 KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL, 195 "sfs vnode with no data"); 196 return (error); 197 } 198 199 *vpp = vp; 200 return (0); 201} 202 203static void 204sfs_print_node(sfs_node_t *node) 205{ 206 printf("\tname = %s\n", node->sn_name); 207 printf("\tparent_id = %ju\n", (uintmax_t)node->sn_parent_id); 208 printf("\tid = %ju\n", (uintmax_t)node->sn_id); 209} 210 211static sfs_node_t * 212sfs_alloc_node(size_t size, const char *name, uint64_t parent_id, uint64_t id) 213{ 214 struct sfs_node *node; 215 216 KASSERT(strlen(name) < sizeof(node->sn_name), 217 ("sfs node name is too long")); 218 KASSERT(size >= sizeof(*node), ("sfs node size is too small")); 219 node = malloc(size, M_SFSNODES, M_WAITOK | M_ZERO); 220 strlcpy(node->sn_name, name, sizeof(node->sn_name)); 221 node->sn_parent_id = parent_id; 222 node->sn_id = id; 223 224 return (node); 225} 226 227static void 228sfs_destroy_node(sfs_node_t *node) 229{ 230 free(node, M_SFSNODES); 231} 232 233static void * 234sfs_reclaim_vnode(vnode_t *vp) 235{ 236 sfs_node_t *node; 237 void *data; 238 239 sfs_vnode_remove(vp); 240 data = vp->v_data; 241 vp->v_data = NULL; 242 return (data); 243} 244 245static int 246sfs_readdir_common(uint64_t parent_id, uint64_t id, struct vop_readdir_args *ap, 247 uio_t *uio, off_t *offp) 248{ 249 struct dirent entry; 250 int error; 251 252 /* Reset ncookies for subsequent use of vfs_read_dirent. 
*/ 253 if (ap->a_ncookies != NULL) 254 *ap->a_ncookies = 0; 255 256 if (uio->uio_resid < sizeof(entry)) 257 return (SET_ERROR(EINVAL)); 258 259 if (uio->uio_offset < 0) 260 return (SET_ERROR(EINVAL)); 261 if (uio->uio_offset == 0) { 262 entry.d_fileno = id; 263 entry.d_type = DT_DIR; 264 entry.d_name[0] = '.'; 265 entry.d_name[1] = '\0'; 266 entry.d_namlen = 1; 267 entry.d_reclen = sizeof(entry); 268 error = vfs_read_dirent(ap, &entry, uio->uio_offset); 269 if (error != 0) 270 return (SET_ERROR(error)); 271 } 272 273 if (uio->uio_offset < sizeof(entry)) 274 return (SET_ERROR(EINVAL)); 275 if (uio->uio_offset == sizeof(entry)) { 276 entry.d_fileno = parent_id; 277 entry.d_type = DT_DIR; 278 entry.d_name[0] = '.'; 279 entry.d_name[1] = '.'; 280 entry.d_name[2] = '\0'; 281 entry.d_namlen = 2; 282 entry.d_reclen = sizeof(entry); 283 error = vfs_read_dirent(ap, &entry, uio->uio_offset); 284 if (error != 0) 285 return (SET_ERROR(error)); 286 } 287 288 if (offp != NULL) 289 *offp = 2 * sizeof(entry); 290 return (0); 291} 292 293 294/* 295 * .zfs inode namespace 296 * 297 * We need to generate unique inode numbers for all files and directories 298 * within the .zfs pseudo-filesystem. 
 * We use the following scheme:
 *
 *	ENTRY			ZFSCTL_INODE
 *	.zfs			1
 *	.zfs/snapshot		2
 *	.zfs/snapshot/<snap>	objectid(snap)
 */
#define	ZFSCTL_INO_SNAP(id)	(id)

static struct vop_vector zfsctl_ops_root;
static struct vop_vector zfsctl_ops_snapdir;
static struct vop_vector zfsctl_ops_snapshot;
static struct vop_vector zfsctl_ops_shares_dir;

/* Module init/fini hooks; nothing to do for the ctldir currently. */
void
zfsctl_init(void)
{
}

void
zfsctl_fini(void)
{
}

/*
 * Return B_TRUE if the vnode belongs to the .zfs pseudo-filesystem,
 * i.e. it uses one of the ctldir vop vectors.
 */
boolean_t
zfsctl_is_node(vnode_t *vp)
{
	return (vn_matchops(vp, zfsctl_ops_root) ||
	    vn_matchops(vp, zfsctl_ops_snapdir) ||
	    vn_matchops(vp, zfsctl_ops_snapshot) ||
	    vn_matchops(vp, zfsctl_ops_shares_dir));

}

/*
 * The '.zfs' root node.  Embeds the generic sfs node, owns the
 * 'snapshot' child node, and caches the creation time of the dataset
 * root for use as ctime/mtime.
 */
typedef struct zfsctl_root {
	sfs_node_t	node;
	sfs_node_t	*snapdir;
	timestruc_t	cmtime;
} zfsctl_root_t;


/*
 * Create the '.zfs' directory.
 */
void
zfsctl_create(zfsvfs_t *zfsvfs)
{
	zfsctl_root_t *dot_zfs;
	sfs_node_t *snapdir;
	vnode_t *rvp;
	uint64_t crtime[2];

	ASSERT(zfsvfs->z_ctldir == NULL);

	snapdir = sfs_alloc_node(sizeof(*snapdir), "snapshot", ZFSCTL_INO_ROOT,
	    ZFSCTL_INO_SNAPDIR);
	dot_zfs = (zfsctl_root_t *)sfs_alloc_node(sizeof(*dot_zfs), ".zfs", 0,
	    ZFSCTL_INO_ROOT);
	dot_zfs->snapdir = snapdir;

	/*
	 * Seed .zfs's ctime/mtime from the creation time of the
	 * filesystem's root znode.
	 */
	VERIFY(VFS_ROOT(zfsvfs->z_vfs, LK_EXCLUSIVE, &rvp) == 0);
	VERIFY(0 == sa_lookup(VTOZ(rvp)->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
	    &crtime, sizeof(crtime)));
	ZFS_TIME_DECODE(&dot_zfs->cmtime, crtime);
	vput(rvp);

	zfsvfs->z_ctldir = dot_zfs;
}

/*
 * Destroy the '.zfs' directory.  Only called when the filesystem is unmounted.
 * The nodes must not have any associated vnodes by now as they should be
 * vflush-ed.
371 */ 372void 373zfsctl_destroy(zfsvfs_t *zfsvfs) 374{ 375 sfs_destroy_node(zfsvfs->z_ctldir->snapdir); 376 sfs_destroy_node((sfs_node_t *)zfsvfs->z_ctldir); 377 zfsvfs->z_ctldir = NULL; 378} 379 380static int 381zfsctl_fs_root_vnode(struct mount *mp, void *arg __unused, int flags, 382 struct vnode **vpp) 383{ 384 return (VFS_ROOT(mp, flags, vpp)); 385} 386 387static void 388zfsctl_common_vnode_setup(vnode_t *vp, void *arg) 389{ 390 ASSERT_VOP_ELOCKED(vp, __func__); 391 392 /* We support shared locking. */ 393 VN_LOCK_ASHARE(vp); 394 vp->v_type = VDIR; 395 vp->v_data = arg; 396} 397 398static int 399zfsctl_root_vnode(struct mount *mp, void *arg __unused, int flags, 400 struct vnode **vpp) 401{ 402 void *node; 403 int err; 404 405 node = ((zfsvfs_t*)mp->mnt_data)->z_ctldir; 406 err = sfs_vgetx(mp, flags, 0, ZFSCTL_INO_ROOT, "zfs", &zfsctl_ops_root, 407 zfsctl_common_vnode_setup, node, vpp); 408 return (err); 409} 410 411static int 412zfsctl_snapdir_vnode(struct mount *mp, void *arg __unused, int flags, 413 struct vnode **vpp) 414{ 415 void *node; 416 int err; 417 418 node = ((zfsvfs_t*)mp->mnt_data)->z_ctldir->snapdir; 419 err = sfs_vgetx(mp, flags, ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, "zfs", 420 &zfsctl_ops_snapdir, zfsctl_common_vnode_setup, node, vpp); 421 return (err); 422} 423 424/* 425 * Given a root znode, retrieve the associated .zfs directory. 426 * Add a hold to the vnode and return it. 427 */ 428int 429zfsctl_root(zfsvfs_t *zfsvfs, int flags, vnode_t **vpp) 430{ 431 vnode_t *vp; 432 int error; 433 434 error = zfsctl_root_vnode(zfsvfs->z_vfs, NULL, flags, vpp); 435 return (error); 436} 437 438/* 439 * Common open routine. Disallow any write access. 440 */ 441static int 442zfsctl_common_open(struct vop_open_args *ap) 443{ 444 int flags = ap->a_mode; 445 446 if (flags & FWRITE) 447 return (SET_ERROR(EACCES)); 448 449 return (0); 450} 451 452/* 453 * Common close routine. Nothing to do here. 
 */
/* ARGSUSED */
static int
zfsctl_common_close(struct vop_close_args *ap)
{
	return (0);
}

/*
 * Common access routine.  Disallow writes.
 */
static int
zfsctl_common_access(ap)
	struct vop_access_args /* {
		struct vnode *a_vp;
		accmode_t a_accmode;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{
	accmode_t accmode = ap->a_accmode;

	if (accmode & VWRITE)
		return (SET_ERROR(EACCES));
	return (0);
}

/*
 * Common getattr function.  Fill in basic information.
 */
static void
zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
{
	timestruc_t now;
	sfs_node_t *node;

	node = vp->v_data;

	vap->va_uid = 0;
	vap->va_gid = 0;
	vap->va_rdev = 0;
	/*
	 * We are a purely virtual object, so we have no
	 * blocksize or allocated blocks.
	 */
	vap->va_blksize = 0;
	vap->va_nblocks = 0;
	vap->va_seq = 0;
	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
	vap->va_mode = zfsctl_ctldir_mode;
	vap->va_type = VDIR;
	/*
	 * We live in the now (for atime).
	 */
	gethrestime(&now);
	vap->va_atime = now;
	/* FreeBSD: Reset chflags(2) flags. */
	vap->va_flags = 0;

	vap->va_nodeid = node->sn_id;

	/* At least '.' and '..'. */
	vap->va_nlink = 2;
}

/*
 * Produce a short-form file handle for a ctldir node: the node's sfs ID
 * encoded byte-by-byte, with a zero generation number.
 */
static int
zfsctl_common_fid(ap)
	struct vop_fid_args /* {
		struct vnode *a_vp;
		struct fid *a_fid;
	} */ *ap;
{
	vnode_t *vp = ap->a_vp;
	fid_t *fidp = (void *)ap->a_fid;
	sfs_node_t *node = vp->v_data;
	uint64_t object = node->sn_id;
	zfid_short_t *zfid;
	int i;

	zfid = (zfid_short_t *)fidp;
	zfid->zf_len = SHORT_FID_LEN;

	for (i = 0; i < sizeof(zfid->zf_object); i++)
		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));

	/* .zfs nodes always have a generation number of 0 */
	for (i = 0; i < sizeof(zfid->zf_gen); i++)
		zfid->zf_gen[i] = 0;

	return (0);
}

/*
 * Common reclaim: unhash the vnode and detach its sfs node.  The node
 * itself is not freed here -- the root and snapdir nodes are owned by
 * zfsctl_root_t and destroyed in zfsctl_destroy().
 */
static int
zfsctl_common_reclaim(ap)
	struct vop_reclaim_args /* {
		struct vnode *a_vp;
		struct thread *a_td;
	} */ *ap;
{
	vnode_t *vp = ap->a_vp;

	(void) sfs_reclaim_vnode(vp);
	return (0);
}

/* VOP_PRINT: dump the sfs node attached to the vnode. */
static int
zfsctl_common_print(ap)
	struct vop_print_args /* {
		struct vnode *a_vp;
	} */ *ap;
{
	sfs_print_node(ap->a_vp->v_data);
	return (0);
}

/*
 * Get root directory attributes.
 */
static int
zfsctl_root_getattr(ap)
	struct vop_getattr_args /* {
		struct vnode *a_vp;
		struct vattr *a_vap;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	struct vattr *vap = ap->a_vap;
	zfsctl_root_t *node = vp->v_data;

	zfsctl_common_getattr(vp, vap);
	/* Times come from the dataset creation time cached at mount. */
	vap->va_ctime = node->cmtime;
	vap->va_mtime = vap->va_ctime;
	vap->va_birthtime = vap->va_ctime;
	vap->va_nlink += 1; /* snapdir */
	vap->va_size = vap->va_nlink;
	return (0);
}

/*
 * When we lookup "." we still can be asked to lock it
 * differently, can't we?
596 */ 597int 598zfsctl_relock_dot(vnode_t *dvp, int ltype) 599{ 600 vref(dvp); 601 if (ltype != VOP_ISLOCKED(dvp)) { 602 if (ltype == LK_EXCLUSIVE) 603 vn_lock(dvp, LK_UPGRADE | LK_RETRY); 604 else /* if (ltype == LK_SHARED) */ 605 vn_lock(dvp, LK_DOWNGRADE | LK_RETRY); 606 607 /* Relock for the "." case may left us with reclaimed vnode. */ 608 if ((dvp->v_iflag & VI_DOOMED) != 0) { 609 vrele(dvp); 610 return (SET_ERROR(ENOENT)); 611 } 612 } 613 return (0); 614} 615 616/* 617 * Special case the handling of "..". 618 */ 619int 620zfsctl_root_lookup(ap) 621 struct vop_lookup_args /* { 622 struct vnode *a_dvp; 623 struct vnode **a_vpp; 624 struct componentname *a_cnp; 625 } */ *ap; 626{ 627 struct componentname *cnp = ap->a_cnp; 628 vnode_t *dvp = ap->a_dvp; 629 vnode_t **vpp = ap->a_vpp; 630 cred_t *cr = ap->a_cnp->cn_cred; 631 int flags = ap->a_cnp->cn_flags; 632 int lkflags = ap->a_cnp->cn_lkflags; 633 int nameiop = ap->a_cnp->cn_nameiop; 634 int err; 635 int ltype; 636 637 ASSERT(dvp->v_type == VDIR); 638 639 if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP) 640 return (SET_ERROR(ENOTSUP)); 641 642 if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') { 643 err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK); 644 if (err == 0) 645 *vpp = dvp; 646 } else if ((flags & ISDOTDOT) != 0) { 647 err = vn_vget_ino_gen(dvp, zfsctl_fs_root_vnode, NULL, 648 lkflags, vpp); 649 } else if (strncmp(cnp->cn_nameptr, "snapshot", cnp->cn_namelen) == 0) { 650 err = zfsctl_snapdir_vnode(dvp->v_mount, NULL, lkflags, vpp); 651 } else { 652 err = SET_ERROR(ENOENT); 653 } 654 if (err != 0) 655 *vpp = NULL; 656 return (err); 657} 658 659static int 660zfsctl_root_readdir(ap) 661 struct vop_readdir_args /* { 662 struct vnode *a_vp; 663 struct uio *a_uio; 664 struct ucred *a_cred; 665 int *a_eofflag; 666 int *ncookies; 667 u_long **a_cookies; 668 } */ *ap; 669{ 670 struct dirent entry; 671 vnode_t *vp = ap->a_vp; 672 zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; 673 zfsctl_root_t *node = 
vp->v_data; 674 uio_t *uio = ap->a_uio; 675 int *eofp = ap->a_eofflag; 676 off_t dots_offset; 677 int error; 678 679 ASSERT(vp->v_type == VDIR); 680 681 error = sfs_readdir_common(zfsvfs->z_root, ZFSCTL_INO_ROOT, ap, uio, 682 &dots_offset); 683 if (error != 0) { 684 if (error == ENAMETOOLONG) /* ran out of destination space */ 685 error = 0; 686 return (error); 687 } 688 if (uio->uio_offset != dots_offset) 689 return (SET_ERROR(EINVAL)); 690 691 CTASSERT(sizeof(node->snapdir->sn_name) <= sizeof(entry.d_name)); 692 entry.d_fileno = node->snapdir->sn_id; 693 entry.d_type = DT_DIR; 694 strcpy(entry.d_name, node->snapdir->sn_name); 695 entry.d_namlen = strlen(entry.d_name); 696 entry.d_reclen = sizeof(entry); 697 error = vfs_read_dirent(ap, &entry, uio->uio_offset); 698 if (error != 0) { 699 if (error == ENAMETOOLONG) 700 error = 0; 701 return (SET_ERROR(error)); 702 } 703 if (eofp != NULL) 704 *eofp = 1; 705 return (0); 706} 707 708static int 709zfsctl_root_vptocnp(struct vop_vptocnp_args *ap) 710{ 711 static const char dotzfs_name[4] = ".zfs"; 712 vnode_t *dvp; 713 int error; 714 715 if (*ap->a_buflen < sizeof (dotzfs_name)) 716 return (SET_ERROR(ENOMEM)); 717 718 error = vn_vget_ino_gen(ap->a_vp, zfsctl_fs_root_vnode, NULL, 719 LK_SHARED, &dvp); 720 if (error != 0) 721 return (SET_ERROR(error)); 722 723 VOP_UNLOCK(dvp, 0); 724 *ap->a_vpp = dvp; 725 *ap->a_buflen -= sizeof (dotzfs_name); 726 bcopy(dotzfs_name, ap->a_buf + *ap->a_buflen, sizeof (dotzfs_name)); 727 return (0); 728} 729 730static int 731zfsctl_common_pathconf(ap) 732 struct vop_pathconf_args /* { 733 struct vnode *a_vp; 734 int a_name; 735 int *a_retval; 736 } */ *ap; 737{ 738 /* 739 * We care about ACL variables so that user land utilities like ls 740 * can display them correctly. Since the ctldir's st_dev is set to be 741 * the same as the parent dataset, we must support all variables that 742 * it supports. 
743 */ 744 switch (ap->a_name) { 745 case _PC_LINK_MAX: 746 *ap->a_retval = INT_MAX; 747 return (0); 748 749 case _PC_FILESIZEBITS: 750 *ap->a_retval = 64; 751 return (0); 752 753 case _PC_MIN_HOLE_SIZE: 754 *ap->a_retval = (int)SPA_MINBLOCKSIZE; 755 return (0); 756 757 case _PC_ACL_EXTENDED: 758 *ap->a_retval = 0; 759 return (0); 760 761 case _PC_ACL_NFS4: 762 *ap->a_retval = 1; 763 return (0); 764 765 case _PC_ACL_PATH_MAX: 766 *ap->a_retval = ACL_MAX_ENTRIES; 767 return (0); 768 769 case _PC_NAME_MAX: 770 *ap->a_retval = NAME_MAX; 771 return (0); 772 773 default: 774 return (vop_stdpathconf(ap)); 775 } 776} 777 778/** 779 * Returns a trivial ACL 780 */ 781int 782zfsctl_common_getacl(ap) 783 struct vop_getacl_args /* { 784 struct vnode *vp; 785 acl_type_t a_type; 786 struct acl *a_aclp; 787 struct ucred *cred; 788 struct thread *td; 789 } */ *ap; 790{ 791 int i; 792 793 if (ap->a_type != ACL_TYPE_NFS4) 794 return (EINVAL); 795 796 acl_nfs4_sync_acl_from_mode(ap->a_aclp, zfsctl_ctldir_mode, 0); 797 /* 798 * acl_nfs4_sync_acl_from_mode assumes that the owner can always modify 799 * attributes. That is not the case for the ctldir, so we must clear 800 * those bits. We also must clear ACL_READ_NAMED_ATTRS, because xattrs 801 * aren't supported by the ctldir. 
802 */ 803 for (i = 0; i < ap->a_aclp->acl_cnt; i++) { 804 struct acl_entry *entry; 805 entry = &(ap->a_aclp->acl_entry[i]); 806 uint32_t old_perm = entry->ae_perm; 807 entry->ae_perm &= ~(ACL_WRITE_ACL | ACL_WRITE_OWNER | 808 ACL_WRITE_ATTRIBUTES | ACL_WRITE_NAMED_ATTRS | 809 ACL_READ_NAMED_ATTRS ); 810 } 811 812 return (0); 813} 814 815static struct vop_vector zfsctl_ops_root = { 816 .vop_default = &default_vnodeops, 817 .vop_open = zfsctl_common_open, 818 .vop_close = zfsctl_common_close, 819 .vop_ioctl = VOP_EINVAL, 820 .vop_getattr = zfsctl_root_getattr, 821 .vop_access = zfsctl_common_access, 822 .vop_readdir = zfsctl_root_readdir, 823 .vop_lookup = zfsctl_root_lookup, 824 .vop_inactive = VOP_NULL, 825 .vop_reclaim = zfsctl_common_reclaim, 826 .vop_fid = zfsctl_common_fid, 827 .vop_print = zfsctl_common_print, 828 .vop_vptocnp = zfsctl_root_vptocnp, 829 .vop_pathconf = zfsctl_common_pathconf, 830 .vop_getacl = zfsctl_common_getacl, 831}; 832 833static int 834zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname) 835{ 836 objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os; 837 838 dmu_objset_name(os, zname); 839 if (strlen(zname) + 1 + strlen(name) >= len) 840 return (SET_ERROR(ENAMETOOLONG)); 841 (void) strcat(zname, "@"); 842 (void) strcat(zname, name); 843 return (0); 844} 845 846static int 847zfsctl_snapshot_lookup(vnode_t *vp, const char *name, uint64_t *id) 848{ 849 objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os; 850 int err; 851 852 err = dsl_dataset_snap_lookup(dmu_objset_ds(os), name, id); 853 return (err); 854} 855 856/* 857 * Given a vnode get a root vnode of a filesystem mounted on top of 858 * the vnode, if any. The root vnode is referenced and locked. 859 * If no filesystem is mounted then the orinal vnode remains referenced 860 * and locked. If any error happens the orinal vnode is unlocked and 861 * released. 
 */
static int
zfsctl_mounted_here(vnode_t **vpp, int flags)
{
	struct mount *mp;
	int err;

	ASSERT_VOP_LOCKED(*vpp, __func__);
	ASSERT3S((*vpp)->v_type, ==, VDIR);

	if ((mp = (*vpp)->v_mountedhere) != NULL) {
		err = vfs_busy(mp, 0);
		KASSERT(err == 0, ("vfs_busy(mp, 0) failed with %d", err));
		KASSERT(vrefcnt(*vpp) > 1, ("unreferenced mountpoint"));
		/* Swap the covered vnode for the covering fs's root. */
		vput(*vpp);
		err = VFS_ROOT(mp, flags, vpp);
		vfs_unbusy(mp);
		return (err);
	}
	/* EJUSTRETURN tells the caller that nothing is mounted here. */
	return (EJUSTRETURN);
}

/* Arguments for zfsctl_snapshot_vnode_setup() passed via sfs_vgetx(). */
typedef struct {
	const char *snap_name;
	uint64_t    snap_id;
} snapshot_setup_arg_t;

/*
 * sfs_vgetx() setup callback for snapshot mountpoint vnodes: allocate a
 * dedicated sfs node (freed in zfsctl_snapshot_reclaim) and allow
 * recursive locking, which the mount machinery requires.
 */
static void
zfsctl_snapshot_vnode_setup(vnode_t *vp, void *arg)
{
	snapshot_setup_arg_t *ssa = arg;
	sfs_node_t *node;

	ASSERT_VOP_ELOCKED(vp, __func__);

	node = sfs_alloc_node(sizeof(sfs_node_t),
	    ssa->snap_name, ZFSCTL_INO_SNAPDIR, ssa->snap_id);
	zfsctl_common_vnode_setup(vp, node);

	/* We have to support recursive locking. */
	VN_LOCK_AREC(vp);
}

/*
 * Lookup entry point for the 'snapshot' directory.  Try to open the
 * snapshot if it exist, creating the pseudo filesystem vnode as necessary.
 * Perform a mount of the associated dataset on top of the vnode.
 * There are four possibilities:
 * - the snapshot node and vnode do not exist
 * - the snapshot vnode is covered by the mounted snapshot
 * - the snapshot vnode is not covered yet, the mount operation is in progress
 * - the snapshot vnode is not covered, because the snapshot has been unmounted
 * The last two states are transient and should be relatively short-lived.
915 */ 916int 917zfsctl_snapdir_lookup(ap) 918 struct vop_lookup_args /* { 919 struct vnode *a_dvp; 920 struct vnode **a_vpp; 921 struct componentname *a_cnp; 922 } */ *ap; 923{ 924 vnode_t *dvp = ap->a_dvp; 925 vnode_t **vpp = ap->a_vpp; 926 struct componentname *cnp = ap->a_cnp; 927 char name[NAME_MAX + 1]; 928 char fullname[ZFS_MAX_DATASET_NAME_LEN]; 929 char *mountpoint; 930 size_t mountpoint_len; 931 zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; 932 uint64_t snap_id; 933 int nameiop = cnp->cn_nameiop; 934 int lkflags = cnp->cn_lkflags; 935 int flags = cnp->cn_flags; 936 int err; 937 938 ASSERT(dvp->v_type == VDIR); 939 940 if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP) 941 return (SET_ERROR(ENOTSUP)); 942 943 if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') { 944 err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK); 945 if (err == 0) 946 *vpp = dvp; 947 return (err); 948 } 949 if (flags & ISDOTDOT) { 950 err = vn_vget_ino_gen(dvp, zfsctl_root_vnode, NULL, lkflags, 951 vpp); 952 return (err); 953 } 954 955 if (cnp->cn_namelen >= sizeof(name)) 956 return (SET_ERROR(ENAMETOOLONG)); 957 958 strlcpy(name, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1); 959 err = zfsctl_snapshot_lookup(dvp, name, &snap_id); 960 if (err != 0) 961 return (SET_ERROR(ENOENT)); 962 963 for (;;) { 964 snapshot_setup_arg_t ssa; 965 966 ssa.snap_name = name; 967 ssa.snap_id = snap_id; 968 err = sfs_vgetx(dvp->v_mount, LK_SHARED, ZFSCTL_INO_SNAPDIR, 969 snap_id, "zfs", &zfsctl_ops_snapshot, 970 zfsctl_snapshot_vnode_setup, &ssa, vpp); 971 if (err != 0) 972 return (err); 973 974 /* Check if a new vnode has just been created. */ 975 if (VOP_ISLOCKED(*vpp) == LK_EXCLUSIVE) 976 break; 977 978 /* 979 * Check if a snapshot is already mounted on top of the vnode. 
980 */ 981 err = zfsctl_mounted_here(vpp, lkflags); 982 if (err != EJUSTRETURN) 983 return (err); 984 985 /* 986 * If the vnode is not covered, then either the mount operation 987 * is in progress or the snapshot has already been unmounted 988 * but the vnode hasn't been inactivated and reclaimed yet. 989 * We can try to re-use the vnode in the latter case. 990 */ 991 VI_LOCK(*vpp); 992 if (((*vpp)->v_iflag & VI_MOUNT) == 0) { 993 /* Upgrade to exclusive lock in order to: 994 * - avoid race conditions 995 * - satisfy the contract of mount_snapshot() 996 */ 997 err = VOP_LOCK(*vpp, LK_TRYUPGRADE | LK_INTERLOCK); 998 if (err == 0) 999 break; 1000 } else { 1001 VI_UNLOCK(*vpp); 1002 } 1003 1004 /* 1005 * In this state we can loop on uncontested locks and starve 1006 * the thread doing the lengthy, non-trivial mount operation. 1007 * So, yield to prevent that from happening. 1008 */ 1009 vput(*vpp); 1010 kern_yield(PRI_USER); 1011 } 1012 1013 VERIFY0(zfsctl_snapshot_zname(dvp, name, sizeof(fullname), fullname)); 1014 1015 mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) + 1016 strlen("/" ZFS_CTLDIR_NAME "/snapshot/") + strlen(name) + 1; 1017 mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP); 1018 (void) snprintf(mountpoint, mountpoint_len, 1019 "%s/" ZFS_CTLDIR_NAME "/snapshot/%s", 1020 dvp->v_vfsp->mnt_stat.f_mntonname, name); 1021 1022 err = mount_snapshot(curthread, vpp, "zfs", mountpoint, fullname, 0); 1023 kmem_free(mountpoint, mountpoint_len); 1024 if (err == 0) { 1025 /* 1026 * Fix up the root vnode mounted on .zfs/snapshot/<snapname>. 1027 * 1028 * This is where we lie about our v_vfsp in order to 1029 * make .zfs/snapshot/<snapname> accessible over NFS 1030 * without requiring manual mounts of <snapname>. 1031 */ 1032 ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs); 1033 VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs; 1034 1035 /* Clear the root flag (set via VFS_ROOT) as well. 
*/ 1036 (*vpp)->v_vflag &= ~VV_ROOT; 1037 } 1038 1039 if (err != 0) 1040 *vpp = NULL; 1041 return (err); 1042} 1043 1044static int 1045zfsctl_snapdir_readdir(ap) 1046 struct vop_readdir_args /* { 1047 struct vnode *a_vp; 1048 struct uio *a_uio; 1049 struct ucred *a_cred; 1050 int *a_eofflag; 1051 int *ncookies; 1052 u_long **a_cookies; 1053 } */ *ap; 1054{ 1055 char snapname[ZFS_MAX_DATASET_NAME_LEN]; 1056 struct dirent entry; 1057 vnode_t *vp = ap->a_vp; 1058 zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; 1059 uio_t *uio = ap->a_uio; 1060 int *eofp = ap->a_eofflag; 1061 off_t dots_offset; 1062 int error; 1063 1064 ASSERT(vp->v_type == VDIR); 1065 1066 error = sfs_readdir_common(ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, ap, uio, 1067 &dots_offset); 1068 if (error != 0) { 1069 if (error == ENAMETOOLONG) /* ran out of destination space */ 1070 error = 0; 1071 return (error); 1072 } 1073 1074 for (;;) { 1075 uint64_t cookie; 1076 uint64_t id; 1077 1078 cookie = uio->uio_offset - dots_offset; 1079 1080 dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG); 1081 error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof(snapname), 1082 snapname, &id, &cookie, NULL); 1083 dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG); 1084 if (error != 0) { 1085 if (error == ENOENT) { 1086 if (eofp != NULL) 1087 *eofp = 1; 1088 error = 0; 1089 } 1090 return (error); 1091 } 1092 1093 entry.d_fileno = id; 1094 entry.d_type = DT_DIR; 1095 strcpy(entry.d_name, snapname); 1096 entry.d_namlen = strlen(entry.d_name); 1097 entry.d_reclen = sizeof(entry); 1098 error = vfs_read_dirent(ap, &entry, uio->uio_offset); 1099 if (error != 0) { 1100 if (error == ENAMETOOLONG) 1101 error = 0; 1102 return (SET_ERROR(error)); 1103 } 1104 uio->uio_offset = cookie + dots_offset; 1105 } 1106 /* NOTREACHED */ 1107} 1108 1109static int 1110zfsctl_snapdir_getattr(ap) 1111 struct vop_getattr_args /* { 1112 struct vnode *a_vp; 1113 struct vattr *a_vap; 1114 struct ucred *a_cred; 1115 } */ *ap; 1116{ 1117 vnode_t *vp 
= ap->a_vp; 1118 vattr_t *vap = ap->a_vap; 1119 zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; 1120 dsl_dataset_t *ds = dmu_objset_ds(zfsvfs->z_os); 1121 sfs_node_t *node = vp->v_data; 1122 uint64_t snap_count; 1123 int err; 1124 1125 zfsctl_common_getattr(vp, vap); 1126 vap->va_ctime = dmu_objset_snap_cmtime(zfsvfs->z_os); 1127 vap->va_mtime = vap->va_ctime; 1128 vap->va_birthtime = vap->va_ctime; 1129 if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) { 1130 err = zap_count(dmu_objset_pool(ds->ds_objset)->dp_meta_objset, 1131 dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count); 1132 if (err != 0) 1133 return (err); 1134 vap->va_nlink += snap_count; 1135 } 1136 vap->va_size = vap->va_nlink; 1137 1138 return (0); 1139} 1140 1141static struct vop_vector zfsctl_ops_snapdir = { 1142 .vop_default = &default_vnodeops, 1143 .vop_open = zfsctl_common_open, 1144 .vop_close = zfsctl_common_close, 1145 .vop_getattr = zfsctl_snapdir_getattr, 1146 .vop_access = zfsctl_common_access, 1147 .vop_readdir = zfsctl_snapdir_readdir, 1148 .vop_lookup = zfsctl_snapdir_lookup, 1149 .vop_reclaim = zfsctl_common_reclaim, 1150 .vop_fid = zfsctl_common_fid, 1151 .vop_print = zfsctl_common_print, 1152 .vop_pathconf = zfsctl_common_pathconf, 1153 .vop_getacl = zfsctl_common_getacl, 1154}; 1155 1156static int 1157zfsctl_snapshot_inactive(ap) 1158 struct vop_inactive_args /* { 1159 struct vnode *a_vp; 1160 struct thread *a_td; 1161 } */ *ap; 1162{ 1163 vnode_t *vp = ap->a_vp; 1164 1165 VERIFY(vrecycle(vp) == 1); 1166 return (0); 1167} 1168 1169static int 1170zfsctl_snapshot_reclaim(ap) 1171 struct vop_reclaim_args /* { 1172 struct vnode *a_vp; 1173 struct thread *a_td; 1174 } */ *ap; 1175{ 1176 vnode_t *vp = ap->a_vp; 1177 void *data = vp->v_data; 1178 1179 sfs_reclaim_vnode(vp); 1180 sfs_destroy_node(data); 1181 return (0); 1182} 1183 1184static int 1185zfsctl_snapshot_vptocnp(struct vop_vptocnp_args *ap) 1186{ 1187 struct mount *mp; 1188 vnode_t *dvp; 1189 vnode_t *vp; 1190 sfs_node_t *node; 
	size_t len;
	int locked;
	int error;

	vp = ap->a_vp;
	node = vp->v_data;
	len = strlen(node->sn_name);
	/* Fail up front if the caller's name buffer cannot hold the name. */
	if (*ap->a_buflen < len)
		return (SET_ERROR(ENOMEM));

	/*
	 * Prevent unmounting of the snapshot while the vnode lock
	 * is not held.  That is not strictly required, but allows
	 * us to assert that an uncovered snapshot vnode is never
	 * "leaked".
	 */
	mp = vp->v_mountedhere;
	if (mp == NULL)
		return (SET_ERROR(ENOENT));
	error = vfs_busy(mp, 0);
	KASSERT(error == 0, ("vfs_busy(mp, 0) failed with %d", error));

	/*
	 * We can vput the vnode as we can now depend on the reference owned
	 * by the busied mp.  But we also need to hold the vnode, because
	 * the reference may go after vfs_unbusy() which has to be called
	 * before we can lock the vnode again.
	 */
	locked = VOP_ISLOCKED(vp);
	vhold(vp);
	vput(vp);

	/* Look up .zfs/snapshot, our parent. */
	error = zfsctl_snapdir_vnode(vp->v_mount, NULL, LK_SHARED, &dvp);
	if (error == 0) {
		VOP_UNLOCK(dvp, 0);
		*ap->a_vpp = dvp;
		/* Prepend our component name into the caller's path buffer. */
		*ap->a_buflen -= len;
		bcopy(node->sn_name, ap->a_buf + *ap->a_buflen, len);
	}
	vfs_unbusy(mp);
	/* Re-acquire the lock in its original mode before dropping the hold. */
	vget(vp, locked | LK_RETRY, curthread);
	vdrop(vp);
	return (error);
}

/*
 * These VP's should never see the light of day.  They should always
 * be covered.
1240 */ 1241static struct vop_vector zfsctl_ops_snapshot = { 1242 .vop_default = NULL, /* ensure very restricted access */ 1243 .vop_inactive = zfsctl_snapshot_inactive, 1244 .vop_reclaim = zfsctl_snapshot_reclaim, 1245 .vop_vptocnp = zfsctl_snapshot_vptocnp, 1246 .vop_lock1 = vop_stdlock, 1247 .vop_unlock = vop_stdunlock, 1248 .vop_islocked = vop_stdislocked, 1249 .vop_advlockpurge = vop_stdadvlockpurge, /* called by vgone */ 1250 .vop_print = zfsctl_common_print, 1251}; 1252 1253int 1254zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp) 1255{ 1256 struct mount *mp; 1257 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1258 vnode_t *vp; 1259 int error; 1260 1261 ASSERT(zfsvfs->z_ctldir != NULL); 1262 *zfsvfsp = NULL; 1263 error = sfs_vnode_get(vfsp, LK_EXCLUSIVE, 1264 ZFSCTL_INO_SNAPDIR, objsetid, &vp); 1265 if (error == 0 && vp != NULL) { 1266 /* 1267 * XXX Probably need to at least reference, if not busy, the mp. 1268 */ 1269 if (vp->v_mountedhere != NULL) 1270 *zfsvfsp = vp->v_mountedhere->mnt_data; 1271 vput(vp); 1272 } 1273 if (*zfsvfsp == NULL) 1274 return (SET_ERROR(EINVAL)); 1275 return (0); 1276} 1277 1278/* 1279 * Unmount any snapshots for the given filesystem. This is called from 1280 * zfs_umount() - if we have a ctldir, then go through and unmount all the 1281 * snapshots. 
1282 */ 1283int 1284zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr) 1285{ 1286 char snapname[ZFS_MAX_DATASET_NAME_LEN]; 1287 zfsvfs_t *zfsvfs = vfsp->vfs_data; 1288 struct mount *mp; 1289 vnode_t *dvp; 1290 vnode_t *vp; 1291 sfs_node_t *node; 1292 sfs_node_t *snap; 1293 uint64_t cookie; 1294 int error; 1295 1296 ASSERT(zfsvfs->z_ctldir != NULL); 1297 1298 cookie = 0; 1299 for (;;) { 1300 uint64_t id; 1301 1302 dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG); 1303 error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof(snapname), 1304 snapname, &id, &cookie, NULL); 1305 dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG); 1306 if (error != 0) { 1307 if (error == ENOENT) 1308 error = 0; 1309 break; 1310 } 1311 1312 for (;;) { 1313 error = sfs_vnode_get(vfsp, LK_EXCLUSIVE, 1314 ZFSCTL_INO_SNAPDIR, id, &vp); 1315 if (error != 0 || vp == NULL) 1316 break; 1317 1318 mp = vp->v_mountedhere; 1319 1320 /* 1321 * v_mountedhere being NULL means that the 1322 * (uncovered) vnode is in a transient state 1323 * (mounting or unmounting), so loop until it 1324 * settles down. 1325 */ 1326 if (mp != NULL) 1327 break; 1328 vput(vp); 1329 } 1330 if (error != 0) 1331 break; 1332 if (vp == NULL) 1333 continue; /* no mountpoint, nothing to do */ 1334 1335 /* 1336 * The mount-point vnode is kept locked to avoid spurious EBUSY 1337 * from a concurrent umount. 1338 * The vnode lock must have recursive locking enabled. 1339 */ 1340 vfs_ref(mp); 1341 error = dounmount(mp, fflags, curthread); 1342 KASSERT_IMPLY(error == 0, vrefcnt(vp) == 1, 1343 ("extra references after unmount")); 1344 vput(vp); 1345 if (error != 0) 1346 break; 1347 } 1348 KASSERT_IMPLY((fflags & MS_FORCE) != 0, error == 0, 1349 ("force unmounting failed")); 1350 return (error); 1351} 1352 1353