zfs_ctldir.c revision 330736
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24 * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
25 */
26
27/*
28 * ZFS control directory (a.k.a. ".zfs")
29 *
30 * This directory provides a common location for all ZFS meta-objects.
31 * Currently, this is only the 'snapshot' directory, but this may expand in the
32 * future.  The elements are built using the GFS primitives, as the hierarchy
33 * does not actually exist on disk.
34 *
35 * For 'snapshot', we don't want to have all snapshots always mounted, because
36 * this would take up a huge amount of space in /etc/mnttab.  We have three
37 * types of objects:
38 *
39 * 	ctldir ------> snapshotdir -------> snapshot
40 *                                             |
41 *                                             |
42 *                                             V
43 *                                         mounted fs
44 *
45 * The 'snapshot' node contains just enough information to lookup '..' and act
46 * as a mountpoint for the snapshot.  Whenever we lookup a specific snapshot, we
47 * perform an automount of the underlying filesystem and return the
48 * corresponding vnode.
49 *
50 * All mounts are handled automatically by the kernel, but unmounts are
51 * (currently) handled from user land.  The main reason is that there is no
52 * reliable way to auto-unmount the filesystem when it's "no longer in use".
53 * When the user unmounts a filesystem, we call zfsctl_unmount(), which
54 * unmounts any snapshots within the snapshot directory.
55 *
56 * The '.zfs', '.zfs/snapshot', and all directories created under
57 * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') are all GFS nodes and
58 * share the same vfs_t as the head filesystem (what '.zfs' lives under).
59 *
60 * File systems mounted on top of the GFS nodes '.zfs/snapshot/<snapname>'
61 * (ie: snapshots) are ZFS nodes and have their own unique vfs_t.
62 * However, vnodes within these mounted on file systems have their v_vfsp
63 * fields set to the head filesystem to make NFS happy (see
64 * zfsctl_snapdir_lookup()). We VFS_HOLD the head filesystem's vfs_t
65 * so that it cannot be freed until all snapshots have been unmounted.
66 */
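/*
 * Illustrative userland sketch (not compiled as part of this file): the
 * behaviour described above can be observed simply by listing the snapshot
 * directory.  The pool mountpoint "/tank" is hypothetical.
 */
#if 0
#include <dirent.h>
#include <stdio.h>

int
main(void)
{
	DIR *dirp;
	struct dirent *dp;

	/* Listing '.zfs/snapshot' does not mount anything by itself. */
	dirp = opendir("/tank/.zfs/snapshot");
	if (dirp == NULL) {
		perror("opendir");
		return (1);
	}
	/* Each entry names a snapshot; descending into one automounts it. */
	while ((dp = readdir(dirp)) != NULL)
		printf("%s\n", dp->d_name);
	(void) closedir(dirp);
	return (0);
}
#endif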
67
68#include <sys/zfs_context.h>
69#include <sys/zfs_ctldir.h>
70#include <sys/zfs_ioctl.h>
71#include <sys/zfs_vfsops.h>
72#include <sys/namei.h>
73#include <sys/stat.h>
74#include <sys/dmu.h>
75#include <sys/dsl_dataset.h>
76#include <sys/dsl_destroy.h>
77#include <sys/dsl_deleg.h>
78#include <sys/mount.h>
79#include <sys/zap.h>
80
81#include "zfs_namecheck.h"
82
83/* Common access mode for all virtual directories under the ctldir */
84const u_short zfsctl_ctldir_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
85    S_IROTH | S_IXOTH;
86
87/*
88 * "Synthetic" filesystem implementation.
89 */
90
91/*
92 * Assert that A implies B.
93 */
94#define KASSERT_IMPLY(A, B, msg)	KASSERT(!(A) || (B), (msg));
95
96static MALLOC_DEFINE(M_SFSNODES, "sfs_nodes", "synthetic-fs nodes");
97
98typedef struct sfs_node {
99	char		sn_name[ZFS_MAX_DATASET_NAME_LEN];
100	uint64_t	sn_parent_id;
101	uint64_t	sn_id;
102} sfs_node_t;
103
104/*
105 * Check the parent's ID as well as the node's to account for the chance
106 * that IDs originating from different domains (snapshot IDs, artificial
107 * IDs, znode IDs) may clash.
108 */
109static int
110sfs_compare_ids(struct vnode *vp, void *arg)
111{
112	sfs_node_t *n1 = vp->v_data;
113	sfs_node_t *n2 = arg;
114	bool equal;
115
116	equal = n1->sn_id == n2->sn_id &&
117	    n1->sn_parent_id == n2->sn_parent_id;
118
119	/* Zero means equality. */
120	return (!equal);
121}
122
123static int
124sfs_vnode_get(const struct mount *mp, int flags, uint64_t parent_id,
125   uint64_t id, struct vnode **vpp)
126{
127	sfs_node_t search;
128	int err;
129
130	search.sn_id = id;
131	search.sn_parent_id = parent_id;
132	err = vfs_hash_get(mp, (u_int)id, flags, curthread, vpp,
133	    sfs_compare_ids, &search);
134	return (err);
135}
136
137static int
138sfs_vnode_insert(struct vnode *vp, int flags, uint64_t parent_id,
139   uint64_t id, struct vnode **vpp)
140{
141	int err;
142
143	KASSERT(vp->v_data != NULL, ("sfs_vnode_insert with NULL v_data"));
144	err = vfs_hash_insert(vp, (u_int)id, flags, curthread, vpp,
145	    sfs_compare_ids, vp->v_data);
146	return (err);
147}
148
149static void
150sfs_vnode_remove(struct vnode *vp)
151{
152	vfs_hash_remove(vp);
153}
154
155typedef void sfs_vnode_setup_fn(vnode_t *vp, void *arg);
156
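/*
 * Look up an sfs vnode in the vfs hash by (parent_id, id).  If it is not
 * found, allocate a new vnode, attach it to the mount, let the caller's
 * setup callback initialize v_data, and insert it into the hash.  A newly
 * created vnode is returned exclusively locked.
 */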
157static int
158sfs_vgetx(struct mount *mp, int flags, uint64_t parent_id, uint64_t id,
159    const char *tag, struct vop_vector *vops,
160    sfs_vnode_setup_fn setup, void *arg,
161    struct vnode **vpp)
162{
163	struct vnode *vp;
164	int error;
165
166	error = sfs_vnode_get(mp, flags, parent_id, id, vpp);
167	if (error != 0 || *vpp != NULL) {
168		KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL,
169		    "sfs vnode with no data");
170		return (error);
171	}
172
173	/* Allocate a new vnode/inode. */
174	error = getnewvnode(tag, mp, vops, &vp);
175	if (error != 0) {
176		*vpp = NULL;
177		return (error);
178	}
179
180	/*
181	 * Exclusively lock the vnode while it's being constructed.
182	 */
183	lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL);
184	error = insmntque(vp, mp);
185	if (error != 0) {
186		*vpp = NULL;
187		return (error);
188	}
189
190	setup(vp, arg);
191
192	error = sfs_vnode_insert(vp, flags, parent_id, id, vpp);
193	if (error != 0 || *vpp != NULL) {
194		KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL,
195		    "sfs vnode with no data");
196		return (error);
197	}
198
199	*vpp = vp;
200	return (0);
201}
202
203static void
204sfs_print_node(sfs_node_t *node)
205{
206	printf("\tname = %s\n", node->sn_name);
207	printf("\tparent_id = %ju\n", (uintmax_t)node->sn_parent_id);
208	printf("\tid = %ju\n", (uintmax_t)node->sn_id);
209}
210
211static sfs_node_t *
212sfs_alloc_node(size_t size, const char *name, uint64_t parent_id, uint64_t id)
213{
214	struct sfs_node *node;
215
216	KASSERT(strlen(name) < sizeof(node->sn_name),
217	    ("sfs node name is too long"));
218	KASSERT(size >= sizeof(*node), ("sfs node size is too small"));
219	node = malloc(size, M_SFSNODES, M_WAITOK | M_ZERO);
220	strlcpy(node->sn_name, name, sizeof(node->sn_name));
221	node->sn_parent_id = parent_id;
222	node->sn_id = id;
223
224	return (node);
225}
226
227static void
228sfs_destroy_node(sfs_node_t *node)
229{
230	free(node, M_SFSNODES);
231}
232
233static void *
234sfs_reclaim_vnode(vnode_t *vp)
235{
236	sfs_node_t *node;
237	void *data;
238
239	sfs_vnode_remove(vp);
240	data = vp->v_data;
241	vp->v_data = NULL;
242	return (data);
243}
244
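/*
 * Emit the "." and ".." entries common to all sfs directories and, on
 * success, report the offset just past them so callers can append their
 * own entries.
 */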
245static int
246sfs_readdir_common(uint64_t parent_id, uint64_t id, struct vop_readdir_args *ap,
247    uio_t *uio, off_t *offp)
248{
249	struct dirent entry;
250	int error;
251
252	/* Reset ncookies for subsequent use of vfs_read_dirent. */
253	if (ap->a_ncookies != NULL)
254		*ap->a_ncookies = 0;
255
256	if (uio->uio_resid < sizeof(entry))
257		return (SET_ERROR(EINVAL));
258
259	if (uio->uio_offset < 0)
260		return (SET_ERROR(EINVAL));
261	if (uio->uio_offset == 0) {
262		entry.d_fileno = id;
263		entry.d_type = DT_DIR;
264		entry.d_name[0] = '.';
265		entry.d_name[1] = '\0';
266		entry.d_namlen = 1;
267		entry.d_reclen = sizeof(entry);
268		error = vfs_read_dirent(ap, &entry, uio->uio_offset);
269		if (error != 0)
270			return (SET_ERROR(error));
271	}
272
273	if (uio->uio_offset < sizeof(entry))
274		return (SET_ERROR(EINVAL));
275	if (uio->uio_offset == sizeof(entry)) {
276		entry.d_fileno = parent_id;
277		entry.d_type = DT_DIR;
278		entry.d_name[0] = '.';
279		entry.d_name[1] = '.';
280		entry.d_name[2] = '\0';
281		entry.d_namlen = 2;
282		entry.d_reclen = sizeof(entry);
283		error = vfs_read_dirent(ap, &entry, uio->uio_offset);
284		if (error != 0)
285			return (SET_ERROR(error));
286	}
287
288	if (offp != NULL)
289		*offp = 2 * sizeof(entry);
290	return (0);
291}
292
293
294/*
295 * .zfs inode namespace
296 *
297 * We need to generate unique inode numbers for all files and directories
298 * within the .zfs pseudo-filesystem.  We use the following scheme:
299 *
300 * 	ENTRY			ZFSCTL_INODE
301 * 	.zfs			1
302 * 	.zfs/snapshot		2
303 * 	.zfs/snapshot/<snap>	objectid(snap)
304 */
305#define	ZFSCTL_INO_SNAP(id)	(id)
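/*
 * For illustration (hypothetical mountpoint "/tank"): with the scheme above,
 * stat(2) is expected to report st_ino == 1 for '/tank/.zfs', st_ino == 2
 * for '/tank/.zfs/snapshot', and the snapshot's object id for
 * '/tank/.zfs/snapshot/<snap>'.
 */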
306
307static struct vop_vector zfsctl_ops_root;
308static struct vop_vector zfsctl_ops_snapdir;
309static struct vop_vector zfsctl_ops_snapshot;
310static struct vop_vector zfsctl_ops_shares_dir;
311
312void
313zfsctl_init(void)
314{
315}
316
317void
318zfsctl_fini(void)
319{
320}
321
322boolean_t
323zfsctl_is_node(vnode_t *vp)
324{
325	return (vn_matchops(vp, zfsctl_ops_root) ||
326	    vn_matchops(vp, zfsctl_ops_snapdir) ||
327	    vn_matchops(vp, zfsctl_ops_snapshot) ||
328	    vn_matchops(vp, zfsctl_ops_shares_dir));
329
330}
331
332typedef struct zfsctl_root {
333	sfs_node_t	node;
334	sfs_node_t	*snapdir;
335	timestruc_t	cmtime;
336} zfsctl_root_t;
337
338
339/*
340 * Create the '.zfs' directory.
341 */
342void
343zfsctl_create(zfsvfs_t *zfsvfs)
344{
345	zfsctl_root_t *dot_zfs;
346	sfs_node_t *snapdir;
347	vnode_t *rvp;
348	uint64_t crtime[2];
349
350	ASSERT(zfsvfs->z_ctldir == NULL);
351
352	snapdir = sfs_alloc_node(sizeof(*snapdir), "snapshot", ZFSCTL_INO_ROOT,
353	    ZFSCTL_INO_SNAPDIR);
354	dot_zfs = (zfsctl_root_t *)sfs_alloc_node(sizeof(*dot_zfs), ".zfs", 0,
355	    ZFSCTL_INO_ROOT);
356	dot_zfs->snapdir = snapdir;
357
358	VERIFY(VFS_ROOT(zfsvfs->z_vfs, LK_EXCLUSIVE, &rvp) == 0);
359	VERIFY(0 == sa_lookup(VTOZ(rvp)->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
360	    &crtime, sizeof(crtime)));
361	ZFS_TIME_DECODE(&dot_zfs->cmtime, crtime);
362	vput(rvp);
363
364	zfsvfs->z_ctldir = dot_zfs;
365}
366
367/*
368 * Destroy the '.zfs' directory.  Only called when the filesystem is unmounted.
369 * The nodes must not have any associated vnodes by now as they should be
370 * vflush-ed.
371 */
372void
373zfsctl_destroy(zfsvfs_t *zfsvfs)
374{
375	sfs_destroy_node(zfsvfs->z_ctldir->snapdir);
376	sfs_destroy_node((sfs_node_t *)zfsvfs->z_ctldir);
377	zfsvfs->z_ctldir = NULL;
378}
379
380static int
381zfsctl_fs_root_vnode(struct mount *mp, void *arg __unused, int flags,
382    struct vnode **vpp)
383{
384	return (VFS_ROOT(mp, flags, vpp));
385}
386
387static void
388zfsctl_common_vnode_setup(vnode_t *vp, void *arg)
389{
390	ASSERT_VOP_ELOCKED(vp, __func__);
391
392	/* We support shared locking. */
393	VN_LOCK_ASHARE(vp);
394	vp->v_type = VDIR;
395	vp->v_data = arg;
396}
397
398static int
399zfsctl_root_vnode(struct mount *mp, void *arg __unused, int flags,
400    struct vnode **vpp)
401{
402	void *node;
403	int err;
404
405	node = ((zfsvfs_t*)mp->mnt_data)->z_ctldir;
406	err = sfs_vgetx(mp, flags, 0, ZFSCTL_INO_ROOT, "zfs", &zfsctl_ops_root,
407	    zfsctl_common_vnode_setup, node, vpp);
408	return (err);
409}
410
411static int
412zfsctl_snapdir_vnode(struct mount *mp, void *arg __unused, int flags,
413    struct vnode **vpp)
414{
415	void *node;
416	int err;
417
418	node = ((zfsvfs_t*)mp->mnt_data)->z_ctldir->snapdir;
419	err = sfs_vgetx(mp, flags, ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, "zfs",
420	   &zfsctl_ops_snapdir, zfsctl_common_vnode_setup, node, vpp);
421	return (err);
422}
423
424/*
425 * Given a root znode, retrieve the associated .zfs directory.
426 * Add a hold to the vnode and return it.
427 */
428int
429zfsctl_root(zfsvfs_t *zfsvfs, int flags, vnode_t **vpp)
430{
431	vnode_t *vp;
432	int error;
433
434	error = zfsctl_root_vnode(zfsvfs->z_vfs, NULL, flags, vpp);
435	return (error);
436}
437
438/*
439 * Common open routine.  Disallow any write access.
440 */
441static int
442zfsctl_common_open(struct vop_open_args *ap)
443{
444	int flags = ap->a_mode;
445
446	if (flags & FWRITE)
447		return (SET_ERROR(EACCES));
448
449	return (0);
450}
451
452/*
453 * Common close routine.  Nothing to do here.
454 */
455/* ARGSUSED */
456static int
457zfsctl_common_close(struct vop_close_args *ap)
458{
459	return (0);
460}
461
462/*
463 * Common access routine.  Disallow writes.
464 */
465static int
466zfsctl_common_access(ap)
467	struct vop_access_args /* {
468		struct vnode *a_vp;
469		accmode_t a_accmode;
470		struct ucred *a_cred;
471		struct thread *a_td;
472	} */ *ap;
473{
474	accmode_t accmode = ap->a_accmode;
475
476	if (accmode & VWRITE)
477		return (SET_ERROR(EACCES));
478	return (0);
479}
480
481/*
482 * Common getattr function.  Fill in basic information.
483 */
484static void
485zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
486{
487	timestruc_t	now;
488	sfs_node_t *node;
489
490	node = vp->v_data;
491
492	vap->va_uid = 0;
493	vap->va_gid = 0;
494	vap->va_rdev = 0;
495	/*
496	 * We are a purely virtual object, so we have no
497	 * blocksize or allocated blocks.
498	 */
499	vap->va_blksize = 0;
500	vap->va_nblocks = 0;
501	vap->va_seq = 0;
502	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
503	vap->va_mode = zfsctl_ctldir_mode;
504	vap->va_type = VDIR;
505	/*
506	 * We live in the now (for atime).
507	 */
508	gethrestime(&now);
509	vap->va_atime = now;
510	/* FreeBSD: Reset chflags(2) flags. */
511	vap->va_flags = 0;
512
513	vap->va_nodeid = node->sn_id;
514
515	/* At least '.' and '..'. */
516	vap->va_nlink = 2;
517}
518
519static int
520zfsctl_common_fid(ap)
521	struct vop_fid_args /* {
522		struct vnode *a_vp;
523		struct fid *a_fid;
524	} */ *ap;
525{
526	vnode_t		*vp = ap->a_vp;
527	fid_t		*fidp = (void *)ap->a_fid;
528	sfs_node_t	*node = vp->v_data;
529	uint64_t	object = node->sn_id;
530	zfid_short_t	*zfid;
531	int		i;
532
533	zfid = (zfid_short_t *)fidp;
534	zfid->zf_len = SHORT_FID_LEN;
535
536	for (i = 0; i < sizeof(zfid->zf_object); i++)
537		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
538
539	/* .zfs nodes always have a generation number of 0 */
540	for (i = 0; i < sizeof(zfid->zf_gen); i++)
541		zfid->zf_gen[i] = 0;
542
543	return (0);
544}
545
546static int
547zfsctl_common_reclaim(ap)
548	struct vop_reclaim_args /* {
549		struct vnode *a_vp;
550		struct thread *a_td;
551	} */ *ap;
552{
553	vnode_t *vp = ap->a_vp;
554
555	(void) sfs_reclaim_vnode(vp);
556	return (0);
557}
558
559static int
560zfsctl_common_print(ap)
561	struct vop_print_args /* {
562		struct vnode *a_vp;
563	} */ *ap;
564{
565	sfs_print_node(ap->a_vp->v_data);
566	return (0);
567}
568
569/*
570 * Get root directory attributes.
571 */
572static int
573zfsctl_root_getattr(ap)
574	struct vop_getattr_args /* {
575		struct vnode *a_vp;
576		struct vattr *a_vap;
577		struct ucred *a_cred;
578	} */ *ap;
579{
580	struct vnode *vp = ap->a_vp;
581	struct vattr *vap = ap->a_vap;
582	zfsctl_root_t *node = vp->v_data;
583
584	zfsctl_common_getattr(vp, vap);
585	vap->va_ctime = node->cmtime;
586	vap->va_mtime = vap->va_ctime;
587	vap->va_birthtime = vap->va_ctime;
588	vap->va_nlink += 1; /* snapdir */
589	vap->va_size = vap->va_nlink;
590	return (0);
591}
592
593/*
594 * When we lookup "." we still can be asked to lock it
595 * differently, can't we?
596 */
597int
598zfsctl_relock_dot(vnode_t *dvp, int ltype)
599{
600	vref(dvp);
601	if (ltype != VOP_ISLOCKED(dvp)) {
602		if (ltype == LK_EXCLUSIVE)
603			vn_lock(dvp, LK_UPGRADE | LK_RETRY);
604		else /* if (ltype == LK_SHARED) */
605			vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
606
607		/* Relocking "." may have left us with a reclaimed vnode. */
608		if ((dvp->v_iflag & VI_DOOMED) != 0) {
609			vrele(dvp);
610			return (SET_ERROR(ENOENT));
611		}
612	}
613	return (0);
614}
615
616/*
617 * Special case the handling of "..".
618 */
619int
620zfsctl_root_lookup(ap)
621	struct vop_lookup_args /* {
622		struct vnode *a_dvp;
623		struct vnode **a_vpp;
624		struct componentname *a_cnp;
625	} */ *ap;
626{
627	struct componentname *cnp = ap->a_cnp;
628	vnode_t *dvp = ap->a_dvp;
629	vnode_t **vpp = ap->a_vpp;
630	cred_t *cr = ap->a_cnp->cn_cred;
631	int flags = ap->a_cnp->cn_flags;
632	int lkflags = ap->a_cnp->cn_lkflags;
633	int nameiop = ap->a_cnp->cn_nameiop;
634	int err;
635	int ltype;
636
637	ASSERT(dvp->v_type == VDIR);
638
639	if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP)
640		return (SET_ERROR(ENOTSUP));
641
642	if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') {
643		err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK);
644		if (err == 0)
645			*vpp = dvp;
646	} else if ((flags & ISDOTDOT) != 0) {
647		err = vn_vget_ino_gen(dvp, zfsctl_fs_root_vnode, NULL,
648		    lkflags, vpp);
649	} else if (strncmp(cnp->cn_nameptr, "snapshot", cnp->cn_namelen) == 0) {
650		err = zfsctl_snapdir_vnode(dvp->v_mount, NULL, lkflags, vpp);
651	} else {
652		err = SET_ERROR(ENOENT);
653	}
654	if (err != 0)
655		*vpp = NULL;
656	return (err);
657}
658
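/* Directory listing for '.zfs': the dot entries plus the 'snapshot' node. */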
659static int
660zfsctl_root_readdir(ap)
661	struct vop_readdir_args /* {
662		struct vnode *a_vp;
663		struct uio *a_uio;
664		struct ucred *a_cred;
665		int *a_eofflag;
666		int *ncookies;
667		u_long **a_cookies;
668	} */ *ap;
669{
670	struct dirent entry;
671	vnode_t *vp = ap->a_vp;
672	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
673	zfsctl_root_t *node = vp->v_data;
674	uio_t *uio = ap->a_uio;
675	int *eofp = ap->a_eofflag;
676	off_t dots_offset;
677	int error;
678
679	ASSERT(vp->v_type == VDIR);
680
681	error = sfs_readdir_common(zfsvfs->z_root, ZFSCTL_INO_ROOT, ap, uio,
682	    &dots_offset);
683	if (error != 0) {
684		if (error == ENAMETOOLONG) /* ran out of destination space */
685			error = 0;
686		return (error);
687	}
688	if (uio->uio_offset != dots_offset)
689		return (SET_ERROR(EINVAL));
690
691	CTASSERT(sizeof(node->snapdir->sn_name) <= sizeof(entry.d_name));
692	entry.d_fileno = node->snapdir->sn_id;
693	entry.d_type = DT_DIR;
694	strcpy(entry.d_name, node->snapdir->sn_name);
695	entry.d_namlen = strlen(entry.d_name);
696	entry.d_reclen = sizeof(entry);
697	error = vfs_read_dirent(ap, &entry, uio->uio_offset);
698	if (error != 0) {
699		if (error == ENAMETOOLONG)
700			error = 0;
701		return (SET_ERROR(error));
702	}
703	if (eofp != NULL)
704		*eofp = 1;
705	return (0);
706}
707
708static int
709zfsctl_root_vptocnp(struct vop_vptocnp_args *ap)
710{
711	static const char dotzfs_name[4] = ".zfs";
712	vnode_t *dvp;
713	int error;
714
715	if (*ap->a_buflen < sizeof (dotzfs_name))
716		return (SET_ERROR(ENOMEM));
717
718	error = vn_vget_ino_gen(ap->a_vp, zfsctl_fs_root_vnode, NULL,
719	    LK_SHARED, &dvp);
720	if (error != 0)
721		return (SET_ERROR(error));
722
723	VOP_UNLOCK(dvp, 0);
724	*ap->a_vpp = dvp;
725	*ap->a_buflen -= sizeof (dotzfs_name);
726	bcopy(dotzfs_name, ap->a_buf + *ap->a_buflen, sizeof (dotzfs_name));
727	return (0);
728}
729
730static int
731zfsctl_common_pathconf(ap)
732	struct vop_pathconf_args /* {
733		struct vnode *a_vp;
734		int a_name;
735		int *a_retval;
736	} */ *ap;
737{
738	/*
739	 * We care about ACL variables so that user land utilities like ls
740	 * can display them correctly.  Since the ctldir's st_dev is set to be
741	 * the same as the parent dataset, we must support all variables that
742	 * it supports.
743	 */
744	switch (ap->a_name) {
745	case _PC_LINK_MAX:
746		*ap->a_retval = INT_MAX;
747		return (0);
748
749	case _PC_FILESIZEBITS:
750		*ap->a_retval = 64;
751		return (0);
752
753	case _PC_MIN_HOLE_SIZE:
754		*ap->a_retval = (int)SPA_MINBLOCKSIZE;
755		return (0);
756
757	case _PC_ACL_EXTENDED:
758		*ap->a_retval = 0;
759		return (0);
760
761	case _PC_ACL_NFS4:
762		*ap->a_retval = 1;
763		return (0);
764
765	case _PC_ACL_PATH_MAX:
766		*ap->a_retval = ACL_MAX_ENTRIES;
767		return (0);
768
769	case _PC_NAME_MAX:
770		*ap->a_retval = NAME_MAX;
771		return (0);
772
773	default:
774		return (vop_stdpathconf(ap));
775	}
776}
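/*
 * Illustrative userland sketch (not compiled as part of this file): as the
 * comment in zfsctl_common_pathconf() notes, utilities may query these
 * variables with pathconf(2).  The path "/tank/.zfs" is hypothetical.
 */
#if 0
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	long acl_nfs4;

	/* The ctldir reports 1 for _PC_ACL_NFS4, per the code above. */
	acl_nfs4 = pathconf("/tank/.zfs", _PC_ACL_NFS4);
	if (acl_nfs4 == -1)
		perror("pathconf");
	else
		printf("NFSv4 ACLs %ssupported\n", acl_nfs4 ? "" : "not ");
	return (0);
}
#endif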
777
778/**
779 * Returns a trivial ACL
780 */
781int
782zfsctl_common_getacl(ap)
783	struct vop_getacl_args /* {
784		struct vnode *vp;
785		acl_type_t a_type;
786		struct acl *a_aclp;
787		struct ucred *cred;
788		struct thread *td;
789	} */ *ap;
790{
791	int i;
792
793	if (ap->a_type != ACL_TYPE_NFS4)
794		return (EINVAL);
795
796	acl_nfs4_sync_acl_from_mode(ap->a_aclp, zfsctl_ctldir_mode, 0);
797	/*
798	 * acl_nfs4_sync_acl_from_mode assumes that the owner can always modify
799	 * attributes.  That is not the case for the ctldir, so we must clear
800	 * those bits.  We also must clear ACL_READ_NAMED_ATTRS, because xattrs
801	 * aren't supported by the ctldir.
802	 */
803	for (i = 0; i < ap->a_aclp->acl_cnt; i++) {
804		struct acl_entry *entry;
805		entry = &(ap->a_aclp->acl_entry[i]);
807		entry->ae_perm &= ~(ACL_WRITE_ACL | ACL_WRITE_OWNER |
808		    ACL_WRITE_ATTRIBUTES | ACL_WRITE_NAMED_ATTRS |
809		    ACL_READ_NAMED_ATTRS );
810	}
811
812	return (0);
813}
814
815static struct vop_vector zfsctl_ops_root = {
816	.vop_default =	&default_vnodeops,
817	.vop_open =	zfsctl_common_open,
818	.vop_close =	zfsctl_common_close,
819	.vop_ioctl =	VOP_EINVAL,
820	.vop_getattr =	zfsctl_root_getattr,
821	.vop_access =	zfsctl_common_access,
822	.vop_readdir =	zfsctl_root_readdir,
823	.vop_lookup =	zfsctl_root_lookup,
824	.vop_inactive =	VOP_NULL,
825	.vop_reclaim =	zfsctl_common_reclaim,
826	.vop_fid =	zfsctl_common_fid,
827	.vop_print =	zfsctl_common_print,
828	.vop_vptocnp =	zfsctl_root_vptocnp,
829	.vop_pathconf =	zfsctl_common_pathconf,
830	.vop_getacl =	zfsctl_common_getacl,
831};
832
833static int
834zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
835{
836	objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
837
838	dmu_objset_name(os, zname);
839	if (strlen(zname) + 1 + strlen(name) >= len)
840		return (SET_ERROR(ENAMETOOLONG));
841	(void) strcat(zname, "@");
842	(void) strcat(zname, name);
843	return (0);
844}
845
846static int
847zfsctl_snapshot_lookup(vnode_t *vp, const char *name, uint64_t *id)
848{
849	objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
850	int err;
851
852	err = dsl_dataset_snap_lookup(dmu_objset_ds(os), name, id);
853	return (err);
854}
855
856/*
857 * Given a vnode get a root vnode of a filesystem mounted on top of
858 * the vnode, if any.  The root vnode is referenced and locked.
859 * If no filesystem is mounted then the orinal vnode remains referenced
860 * and locked.  If any error happens the orinal vnode is unlocked and
861 * released.
862 */
863static int
864zfsctl_mounted_here(vnode_t **vpp, int flags)
865{
866	struct mount *mp;
867	int err;
868
869	ASSERT_VOP_LOCKED(*vpp, __func__);
870	ASSERT3S((*vpp)->v_type, ==, VDIR);
871
872	if ((mp = (*vpp)->v_mountedhere) != NULL) {
873		err = vfs_busy(mp, 0);
874		KASSERT(err == 0, ("vfs_busy(mp, 0) failed with %d", err));
875		KASSERT(vrefcnt(*vpp) > 1, ("unreferenced mountpoint"));
876		vput(*vpp);
877		err = VFS_ROOT(mp, flags, vpp);
878		vfs_unbusy(mp);
879		return (err);
880	}
881	return (EJUSTRETURN);
882}
883
884typedef struct {
885	const char *snap_name;
886	uint64_t    snap_id;
887} snapshot_setup_arg_t;
888
889static void
890zfsctl_snapshot_vnode_setup(vnode_t *vp, void *arg)
891{
892	snapshot_setup_arg_t *ssa = arg;
893	sfs_node_t *node;
894
895	ASSERT_VOP_ELOCKED(vp, __func__);
896
897	node = sfs_alloc_node(sizeof(sfs_node_t),
898	    ssa->snap_name, ZFSCTL_INO_SNAPDIR, ssa->snap_id);
899	zfsctl_common_vnode_setup(vp, node);
900
901	/* We have to support recursive locking. */
902	VN_LOCK_AREC(vp);
903}
904
905/*
906 * Lookup entry point for the 'snapshot' directory.  Try to open the
907 * snapshot if it exists, creating the pseudo filesystem vnode as necessary.
908 * Perform a mount of the associated dataset on top of the vnode.
909 * There are four possibilities:
910 * - the snapshot node and vnode do not exist
911 * - the snapshot vnode is covered by the mounted snapshot
912 * - the snapshot vnode is not covered yet, the mount operation is in progress
913 * - the snapshot vnode is not covered, because the snapshot has been unmounted
914 * The last two states are transient and should be relatively short-lived.
915 */
916int
917zfsctl_snapdir_lookup(ap)
918	struct vop_lookup_args /* {
919		struct vnode *a_dvp;
920		struct vnode **a_vpp;
921		struct componentname *a_cnp;
922	} */ *ap;
923{
924	vnode_t *dvp = ap->a_dvp;
925	vnode_t **vpp = ap->a_vpp;
926	struct componentname *cnp = ap->a_cnp;
927	char name[NAME_MAX + 1];
928	char fullname[ZFS_MAX_DATASET_NAME_LEN];
929	char *mountpoint;
930	size_t mountpoint_len;
931	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
932	uint64_t snap_id;
933	int nameiop = cnp->cn_nameiop;
934	int lkflags = cnp->cn_lkflags;
935	int flags = cnp->cn_flags;
936	int err;
937
938	ASSERT(dvp->v_type == VDIR);
939
940	if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP)
941		return (SET_ERROR(ENOTSUP));
942
943	if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') {
944		err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK);
945		if (err == 0)
946			*vpp = dvp;
947		return (err);
948	}
949	if (flags & ISDOTDOT) {
950		err = vn_vget_ino_gen(dvp, zfsctl_root_vnode, NULL, lkflags,
951		    vpp);
952		return (err);
953	}
954
955	if (cnp->cn_namelen >= sizeof(name))
956		return (SET_ERROR(ENAMETOOLONG));
957
958	strlcpy(name, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1);
959	err = zfsctl_snapshot_lookup(dvp, name, &snap_id);
960	if (err != 0)
961		return (SET_ERROR(ENOENT));
962
963	for (;;) {
964		snapshot_setup_arg_t ssa;
965
966		ssa.snap_name = name;
967		ssa.snap_id = snap_id;
968		err = sfs_vgetx(dvp->v_mount, LK_SHARED, ZFSCTL_INO_SNAPDIR,
969		   snap_id, "zfs", &zfsctl_ops_snapshot,
970		   zfsctl_snapshot_vnode_setup, &ssa, vpp);
971		if (err != 0)
972			return (err);
973
974		/* Check if a new vnode has just been created. */
975		if (VOP_ISLOCKED(*vpp) == LK_EXCLUSIVE)
976			break;
977
978		/*
979		 * Check if a snapshot is already mounted on top of the vnode.
980		 */
981		err = zfsctl_mounted_here(vpp, lkflags);
982		if (err != EJUSTRETURN)
983			return (err);
984
985		/*
986		 * If the vnode is not covered, then either the mount operation
987		 * is in progress or the snapshot has already been unmounted
988		 * but the vnode hasn't been inactivated and reclaimed yet.
989		 * We can try to re-use the vnode in the latter case.
990		 */
991		VI_LOCK(*vpp);
992		if (((*vpp)->v_iflag & VI_MOUNT) == 0) {
993			/* Upgrade to exclusive lock in order to:
994			 * - avoid race conditions
995			 * - satisfy the contract of mount_snapshot()
996			 */
997			err = VOP_LOCK(*vpp, LK_TRYUPGRADE | LK_INTERLOCK);
998			if (err == 0)
999				break;
1000		} else {
1001			VI_UNLOCK(*vpp);
1002		}
1003
1004		/*
1005		 * In this state we can loop on uncontested locks and starve
1006		 * the thread doing the lengthy, non-trivial mount operation.
1007		 * So, yield to prevent that from happening.
1008		 */
1009		vput(*vpp);
1010		kern_yield(PRI_USER);
1011	}
1012
1013	VERIFY0(zfsctl_snapshot_zname(dvp, name, sizeof(fullname), fullname));
1014
1015	mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) +
1016	    strlen("/" ZFS_CTLDIR_NAME "/snapshot/") + strlen(name) + 1;
1017	mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
1018	(void) snprintf(mountpoint, mountpoint_len,
1019	    "%s/" ZFS_CTLDIR_NAME "/snapshot/%s",
1020	    dvp->v_vfsp->mnt_stat.f_mntonname, name);
1021
1022	err = mount_snapshot(curthread, vpp, "zfs", mountpoint, fullname, 0);
1023	kmem_free(mountpoint, mountpoint_len);
1024	if (err == 0) {
1025		/*
1026		 * Fix up the root vnode mounted on .zfs/snapshot/<snapname>.
1027		 *
1028		 * This is where we lie about our v_vfsp in order to
1029		 * make .zfs/snapshot/<snapname> accessible over NFS
1030		 * without requiring manual mounts of <snapname>.
1031		 */
1032		ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs);
1033		VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs;
1034
1035		/* Clear the root flag (set via VFS_ROOT) as well. */
1036		(*vpp)->v_vflag &= ~VV_ROOT;
1037	}
1038
1039	if (err != 0)
1040		*vpp = NULL;
1041	return (err);
1042}
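/*
 * Illustrative userland sketch (not compiled as part of this file): a path
 * lookup under '.zfs/snapshot' triggers the mount performed above, which
 * can be observed with statfs(2).  The pool path and snapshot name are
 * hypothetical.
 */
#if 0
#include <sys/param.h>
#include <sys/mount.h>
#include <stdio.h>

int
main(void)
{
	struct statfs sfs;

	/* The path lookup done by statfs() automounts the snapshot. */
	if (statfs("/tank/.zfs/snapshot/monday", &sfs) == -1) {
		perror("statfs");
		return (1);
	}
	/* Expect a zfs filesystem mounted at the snapshot directory. */
	printf("%s mounted at %s\n", sfs.f_fstypename, sfs.f_mntonname);
	return (0);
}
#endif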
1043
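/*
 * Directory listing for '.zfs/snapshot': the dot entries followed by one
 * entry per snapshot, as returned by dmu_snapshot_list_next().
 */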
1044static int
1045zfsctl_snapdir_readdir(ap)
1046	struct vop_readdir_args /* {
1047		struct vnode *a_vp;
1048		struct uio *a_uio;
1049		struct ucred *a_cred;
1050		int *a_eofflag;
1051		int *ncookies;
1052		u_long **a_cookies;
1053	} */ *ap;
1054{
1055	char snapname[ZFS_MAX_DATASET_NAME_LEN];
1056	struct dirent entry;
1057	vnode_t *vp = ap->a_vp;
1058	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
1059	uio_t *uio = ap->a_uio;
1060	int *eofp = ap->a_eofflag;
1061	off_t dots_offset;
1062	int error;
1063
1064	ASSERT(vp->v_type == VDIR);
1065
1066	error = sfs_readdir_common(ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, ap, uio,
1067	    &dots_offset);
1068	if (error != 0) {
1069		if (error == ENAMETOOLONG) /* ran out of destination space */
1070			error = 0;
1071		return (error);
1072	}
1073
1074	for (;;) {
1075		uint64_t cookie;
1076		uint64_t id;
1077
1078		cookie = uio->uio_offset - dots_offset;
1079
1080		dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG);
1081		error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof(snapname),
1082		    snapname, &id, &cookie, NULL);
1083		dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG);
1084		if (error != 0) {
1085			if (error == ENOENT) {
1086				if (eofp != NULL)
1087					*eofp = 1;
1088				error = 0;
1089			}
1090			return (error);
1091		}
1092
1093		entry.d_fileno = id;
1094		entry.d_type = DT_DIR;
1095		strcpy(entry.d_name, snapname);
1096		entry.d_namlen = strlen(entry.d_name);
1097		entry.d_reclen = sizeof(entry);
1098		error = vfs_read_dirent(ap, &entry, uio->uio_offset);
1099		if (error != 0) {
1100			if (error == ENAMETOOLONG)
1101				error = 0;
1102			return (SET_ERROR(error));
1103		}
1104		uio->uio_offset = cookie + dots_offset;
1105	}
1106	/* NOTREACHED */
1107}
1108
1109static int
1110zfsctl_snapdir_getattr(ap)
1111	struct vop_getattr_args /* {
1112		struct vnode *a_vp;
1113		struct vattr *a_vap;
1114		struct ucred *a_cred;
1115	} */ *ap;
1116{
1117	vnode_t *vp = ap->a_vp;
1118	vattr_t *vap = ap->a_vap;
1119	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
1120	dsl_dataset_t *ds = dmu_objset_ds(zfsvfs->z_os);
1121	sfs_node_t *node = vp->v_data;
1122	uint64_t snap_count;
1123	int err;
1124
1125	zfsctl_common_getattr(vp, vap);
1126	vap->va_ctime = dmu_objset_snap_cmtime(zfsvfs->z_os);
1127	vap->va_mtime = vap->va_ctime;
1128	vap->va_birthtime = vap->va_ctime;
1129	if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) {
1130		err = zap_count(dmu_objset_pool(ds->ds_objset)->dp_meta_objset,
1131		    dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count);
1132		if (err != 0)
1133			return (err);
1134		vap->va_nlink += snap_count;
1135	}
1136	vap->va_size = vap->va_nlink;
1137
1138	return (0);
1139}
1140
1141static struct vop_vector zfsctl_ops_snapdir = {
1142	.vop_default =	&default_vnodeops,
1143	.vop_open =	zfsctl_common_open,
1144	.vop_close =	zfsctl_common_close,
1145	.vop_getattr =	zfsctl_snapdir_getattr,
1146	.vop_access =	zfsctl_common_access,
1147	.vop_readdir =	zfsctl_snapdir_readdir,
1148	.vop_lookup =	zfsctl_snapdir_lookup,
1149	.vop_reclaim =	zfsctl_common_reclaim,
1150	.vop_fid =	zfsctl_common_fid,
1151	.vop_print =	zfsctl_common_print,
1152	.vop_pathconf =	zfsctl_common_pathconf,
1153	.vop_getacl =	zfsctl_common_getacl,
1154};
1155
1156static int
1157zfsctl_snapshot_inactive(ap)
1158	struct vop_inactive_args /* {
1159		struct vnode *a_vp;
1160		struct thread *a_td;
1161	} */ *ap;
1162{
1163	vnode_t *vp = ap->a_vp;
1164
1165	VERIFY(vrecycle(vp) == 1);
1166	return (0);
1167}
1168
1169static int
1170zfsctl_snapshot_reclaim(ap)
1171	struct vop_reclaim_args /* {
1172		struct vnode *a_vp;
1173		struct thread *a_td;
1174	} */ *ap;
1175{
1176	vnode_t *vp = ap->a_vp;
1177	void *data = vp->v_data;
1178
1179	sfs_reclaim_vnode(vp);
1180	sfs_destroy_node(data);
1181	return (0);
1182}
1183
1184static int
1185zfsctl_snapshot_vptocnp(struct vop_vptocnp_args *ap)
1186{
1187	struct mount *mp;
1188	vnode_t *dvp;
1189	vnode_t *vp;
1190	sfs_node_t *node;
1191	size_t len;
1192	int locked;
1193	int error;
1194
1195	vp = ap->a_vp;
1196	node = vp->v_data;
1197	len = strlen(node->sn_name);
1198	if (*ap->a_buflen < len)
1199		return (SET_ERROR(ENOMEM));
1200
1201	/*
1202	 * Prevent unmounting of the snapshot while the vnode lock
1203	 * is not held.  That is not strictly required, but allows
1204	 * us to assert that an uncovered snapshot vnode is never
1205	 * "leaked".
1206	 */
1207	mp = vp->v_mountedhere;
1208	if (mp == NULL)
1209		return (SET_ERROR(ENOENT));
1210	error = vfs_busy(mp, 0);
1211	KASSERT(error == 0, ("vfs_busy(mp, 0) failed with %d", error));
1212
1213	/*
1214	 * We can vput the vnode as we can now depend on the reference owned
1215	 * by the busied mp.  But we also need to hold the vnode, because
1216	 * the reference may go away after vfs_unbusy(), which has to be called
1217	 * before we can lock the vnode again.
1218	 */
1219	locked = VOP_ISLOCKED(vp);
1220	vhold(vp);
1221	vput(vp);
1222
1223	/* Look up .zfs/snapshot, our parent. */
1224	error = zfsctl_snapdir_vnode(vp->v_mount, NULL, LK_SHARED, &dvp);
1225	if (error == 0) {
1226		VOP_UNLOCK(dvp, 0);
1227		*ap->a_vpp = dvp;
1228		*ap->a_buflen -= len;
1229		bcopy(node->sn_name, ap->a_buf + *ap->a_buflen, len);
1230	}
1231	vfs_unbusy(mp);
1232	vget(vp, locked | LK_RETRY, curthread);
1233	vdrop(vp);
1234	return (error);
1235}
1236
1237/*
1238 * These VP's should never see the light of day.  They should always
1239 * be covered.
1240 */
1241static struct vop_vector zfsctl_ops_snapshot = {
1242	.vop_default =		NULL, /* ensure very restricted access */
1243	.vop_inactive =		zfsctl_snapshot_inactive,
1244	.vop_reclaim =		zfsctl_snapshot_reclaim,
1245	.vop_vptocnp =		zfsctl_snapshot_vptocnp,
1246	.vop_lock1 =		vop_stdlock,
1247	.vop_unlock =		vop_stdunlock,
1248	.vop_islocked =		vop_stdislocked,
1249	.vop_advlockpurge =	vop_stdadvlockpurge, /* called by vgone */
1250	.vop_print =		zfsctl_common_print,
1251};
1252
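/*
 * Given a snapshot objset id, find the snapshot vnode hashed under
 * '.zfs/snapshot' and return the zfsvfs_t of the filesystem mounted on it,
 * or EINVAL if no such snapshot is currently mounted.
 */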
1253int
1254zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp)
1255{
1256	struct mount *mp;
1257	zfsvfs_t *zfsvfs = vfsp->vfs_data;
1258	vnode_t *vp;
1259	int error;
1260
1261	ASSERT(zfsvfs->z_ctldir != NULL);
1262	*zfsvfsp = NULL;
1263	error = sfs_vnode_get(vfsp, LK_EXCLUSIVE,
1264	    ZFSCTL_INO_SNAPDIR, objsetid, &vp);
1265	if (error == 0 && vp != NULL) {
1266		/*
1267		 * XXX Probably need to at least reference, if not busy, the mp.
1268		 */
1269		if (vp->v_mountedhere != NULL)
1270			*zfsvfsp = vp->v_mountedhere->mnt_data;
1271		vput(vp);
1272	}
1273	if (*zfsvfsp == NULL)
1274		return (SET_ERROR(EINVAL));
1275	return (0);
1276}
1277
1278/*
1279 * Unmount any snapshots for the given filesystem.  This is called from
1280 * zfs_umount() - if we have a ctldir, then go through and unmount all the
1281 * snapshots.
1282 */
1283int
1284zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
1285{
1286	char snapname[ZFS_MAX_DATASET_NAME_LEN];
1287	zfsvfs_t *zfsvfs = vfsp->vfs_data;
1288	struct mount *mp;
1289	vnode_t *dvp;
1290	vnode_t *vp;
1291	sfs_node_t *node;
1292	sfs_node_t *snap;
1293	uint64_t cookie;
1294	int error;
1295
1296	ASSERT(zfsvfs->z_ctldir != NULL);
1297
1298	cookie = 0;
1299	for (;;) {
1300		uint64_t id;
1301
1302		dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG);
1303		error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof(snapname),
1304		    snapname, &id, &cookie, NULL);
1305		dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG);
1306		if (error != 0) {
1307			if (error == ENOENT)
1308				error = 0;
1309			break;
1310		}
1311
1312		for (;;) {
1313			error = sfs_vnode_get(vfsp, LK_EXCLUSIVE,
1314			    ZFSCTL_INO_SNAPDIR, id, &vp);
1315			if (error != 0 || vp == NULL)
1316				break;
1317
1318			mp = vp->v_mountedhere;
1319
1320			/*
1321			 * v_mountedhere being NULL means that the
1322			 * (uncovered) vnode is in a transient state
1323			 * (mounting or unmounting), so loop until it
1324			 * settles down.
1325			 */
1326			if (mp != NULL)
1327				break;
1328			vput(vp);
1329		}
1330		if (error != 0)
1331			break;
1332		if (vp == NULL)
1333			continue;	/* no mountpoint, nothing to do */
1334
1335		/*
1336		 * The mount-point vnode is kept locked to avoid spurious EBUSY
1337		 * from a concurrent umount.
1338		 * The vnode lock must have recursive locking enabled.
1339		 */
1340		vfs_ref(mp);
1341		error = dounmount(mp, fflags, curthread);
1342		KASSERT_IMPLY(error == 0, vrefcnt(vp) == 1,
1343		    ("extra references after unmount"));
1344		vput(vp);
1345		if (error != 0)
1346			break;
1347	}
1348	KASSERT_IMPLY((fflags & MS_FORCE) != 0, error == 0,
1349	    ("force unmounting failed"));
1350	return (error);
1351}
1352
1353