1168404Spjd/*
2168404Spjd * CDDL HEADER START
3168404Spjd *
4168404Spjd * The contents of this file are subject to the terms of the
5168404Spjd * Common Development and Distribution License (the "License").
6168404Spjd * You may not use this file except in compliance with the License.
7168404Spjd *
8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9168404Spjd * or http://www.opensolaris.org/os/licensing.
10168404Spjd * See the License for the specific language governing permissions
11168404Spjd * and limitations under the License.
12168404Spjd *
13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each
14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15168404Spjd * If applicable, add the following below this CDDL HEADER, with the
16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18168404Spjd *
19168404Spjd * CDDL HEADER END
20168404Spjd */
21168404Spjd/*
22219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23290765Smav * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
24168404Spjd */
25168404Spjd
26168404Spjd#include <sys/types.h>
27168404Spjd#include <sys/param.h>
28168404Spjd#include <sys/time.h>
29168404Spjd#include <sys/systm.h>
30168404Spjd#include <sys/sysmacros.h>
31168404Spjd#include <sys/resource.h>
32168404Spjd#include <sys/vfs.h>
33168404Spjd#include <sys/vnode.h>
34168404Spjd#include <sys/file.h>
35168404Spjd#include <sys/kmem.h>
36168404Spjd#include <sys/uio.h>
37168404Spjd#include <sys/cmn_err.h>
38168404Spjd#include <sys/errno.h>
39168404Spjd#include <sys/stat.h>
40168404Spjd#include <sys/unistd.h>
41185029Spjd#include <sys/sunddi.h>
42168404Spjd#include <sys/random.h>
43169023Spjd#include <sys/policy.h>
44168404Spjd#include <sys/kcondvar.h>
45168404Spjd#include <sys/callb.h>
46168404Spjd#include <sys/smp.h>
47168404Spjd#include <sys/zfs_dir.h>
48168404Spjd#include <sys/zfs_acl.h>
49168404Spjd#include <sys/fs/zfs.h>
50168404Spjd#include <sys/zap.h>
51168404Spjd#include <sys/dmu.h>
52168404Spjd#include <sys/atomic.h>
53168404Spjd#include <sys/zfs_ctldir.h>
54185029Spjd#include <sys/zfs_fuid.h>
55219089Spjd#include <sys/sa.h>
56219089Spjd#include <sys/zfs_sa.h>
57168404Spjd#include <sys/dnlc.h>
58185029Spjd#include <sys/extdirent.h>
59168404Spjd
60168404Spjd/*
61304671Savg * zfs_match_find() is used by zfs_dirent_lookup() to peform zap lookups
62185029Spjd * of names after deciding which is the appropriate lookup interface.
63185029Spjd */
64185029Spjdstatic int
65304671Savgzfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name,
66304671Savg    boolean_t exact, uint64_t *zoid)
67185029Spjd{
68185029Spjd	int error;
69185029Spjd
70185029Spjd	if (zfsvfs->z_norm) {
71304671Savg		matchtype_t mt = exact? MT_EXACT : MT_FIRST;
72185029Spjd
73185029Spjd		/*
74185029Spjd		 * In the non-mixed case we only expect there would ever
75185029Spjd		 * be one match, but we need to use the normalizing lookup.
76185029Spjd		 */
77185029Spjd		error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1,
78304671Savg		    zoid, mt, NULL, 0, NULL);
79185029Spjd	} else {
80185029Spjd		error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid);
81185029Spjd	}
82185029Spjd	*zoid = ZFS_DIRENT_OBJ(*zoid);
83185029Spjd
84185029Spjd	return (error);
85185029Spjd}
86185029Spjd
87185029Spjd/*
88304671Savg * Look up a directory entry under a locked vnode.
89304671Savg * dvp being locked gives us a guarantee that there are no concurrent
90304671Savg * modification of the directory and, thus, if a node can be found in
91304671Savg * the directory, then it must not be unlinked.
92168404Spjd *
93168404Spjd * Input arguments:
94168404Spjd *	dzp	- znode for directory
95168404Spjd *	name	- name of entry to lock
96168404Spjd *	flag	- ZNEW: if the entry already exists, fail with EEXIST.
97168404Spjd *		  ZEXISTS: if the entry does not exist, fail with ENOENT.
98168404Spjd *		  ZXATTR: we want dzp's xattr directory
99168404Spjd *
100168404Spjd * Output arguments:
101168404Spjd *	zpp	- pointer to the znode for the entry (NULL if there isn't one)
102168404Spjd *
103168404Spjd * Return value: 0 on success or errno on failure.
104168404Spjd *
105168404Spjd * NOTE: Always checks for, and rejects, '.' and '..'.
106168404Spjd */
107168404Spjdint
108304671Savgzfs_dirent_lookup(znode_t *dzp, const char *name, znode_t **zpp, int flag)
109168404Spjd{
110168404Spjd	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
111185029Spjd	boolean_t	exact;
112168404Spjd	uint64_t	zoid;
113185029Spjd	vnode_t		*vp = NULL;
114185029Spjd	int		error = 0;
115168404Spjd
116304671Savg	ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
117304671Savg
118168404Spjd	*zpp = NULL;
119168404Spjd
120168404Spjd	/*
121168404Spjd	 * Verify that we are not trying to lock '.', '..', or '.zfs'
122168404Spjd	 */
123168404Spjd	if (name[0] == '.' &&
124168404Spjd	    (name[1] == '\0' || (name[1] == '.' && name[2] == '\0')) ||
125168404Spjd	    zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0)
126249195Smm		return (SET_ERROR(EEXIST));
127168404Spjd
128168404Spjd	/*
129185029Spjd	 * Case sensitivity and normalization preferences are set when
130185029Spjd	 * the file system is created.  These are stored in the
131185029Spjd	 * zfsvfs->z_case and zfsvfs->z_norm fields.  These choices
132304671Savg	 * affect how we perform zap lookups.
133185029Spjd	 *
134185029Spjd	 * Decide if exact matches should be requested when performing
135185029Spjd	 * a zap lookup on file systems supporting case-insensitive
136185029Spjd	 * access.
137185029Spjd	 *
138304671Savg	 * NB: we do not need to worry about this flag for ZFS_CASE_SENSITIVE
139304671Savg	 * because in that case MT_EXACT and MT_FIRST should produce exactly
140304671Savg	 * the same result.
141185029Spjd	 */
142304671Savg	exact = zfsvfs->z_case == ZFS_CASE_MIXED;
143185029Spjd
144304671Savg	if (dzp->z_unlinked && !(flag & ZXATTR))
145304671Savg		return (ENOENT);
146168404Spjd	if (flag & ZXATTR) {
147219089Spjd		error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid,
148219089Spjd		    sizeof (zoid));
149219089Spjd		if (error == 0)
150219089Spjd			error = (zoid == 0 ? ENOENT : 0);
151168404Spjd	} else {
152304671Savg		error = zfs_match_find(zfsvfs, dzp, name, exact, &zoid);
153168404Spjd	}
154168404Spjd	if (error) {
155168404Spjd		if (error != ENOENT || (flag & ZEXISTS)) {
156168404Spjd			return (error);
157168404Spjd		}
158168404Spjd	} else {
159168404Spjd		if (flag & ZNEW) {
160249195Smm			return (SET_ERROR(EEXIST));
161168404Spjd		}
162168404Spjd		error = zfs_zget(zfsvfs, zoid, zpp);
163304671Savg		if (error)
164168404Spjd			return (error);
165304671Savg		ASSERT(!(*zpp)->z_unlinked);
166168404Spjd	}
167168404Spjd
168168404Spjd	return (0);
169168404Spjd}
170168404Spjd
171304671Savgstatic int
172304671Savgzfs_dd_lookup(znode_t *dzp, znode_t **zpp)
173168404Spjd{
174304671Savg	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
175304671Savg	znode_t *zp;
176304671Savg	uint64_t parent;
177304671Savg	int error;
178168404Spjd
179304671Savg	ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
180304671Savg	ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock));
181208131Smm
182304671Savg	if (dzp->z_unlinked)
183304671Savg		return (ENOENT);
184208131Smm
185304671Savg	if ((error = sa_lookup(dzp->z_sa_hdl,
186304671Savg	    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
187304671Savg		return (error);
188168404Spjd
189304671Savg	error = zfs_zget(zfsvfs, parent, &zp);
190304671Savg	if (error == 0)
191304671Savg		*zpp = zp;
192304671Savg	return (error);
193168404Spjd}
194168404Spjd
195168404Spjdint
196304671Savgzfs_dirlook(znode_t *dzp, const char *name, znode_t **zpp)
197168404Spjd{
198304671Savg	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
199168404Spjd	znode_t *zp;
200168404Spjd	int error = 0;
201168404Spjd
202304671Savg	ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
203304671Savg	ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock));
204304671Savg
205304671Savg	if (dzp->z_unlinked)
206304671Savg		return (SET_ERROR(ENOENT));
207304671Savg
208168404Spjd	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
209304671Savg		*zpp = dzp;
210168404Spjd	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
211304671Savg		error = zfs_dd_lookup(dzp, zpp);
212168404Spjd	} else {
213304671Savg		error = zfs_dirent_lookup(dzp, name, &zp, ZEXISTS);
214168404Spjd		if (error == 0) {
215168404Spjd			dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */
216304671Savg			*zpp = zp;
217168404Spjd		}
218168404Spjd	}
219168404Spjd	return (error);
220168404Spjd}
221168404Spjd
222168404Spjd/*
223168404Spjd * unlinked Set (formerly known as the "delete queue") Error Handling
224168404Spjd *
225168404Spjd * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we
226168404Spjd * don't specify the name of the entry that we will be manipulating.  We
227168404Spjd * also fib and say that we won't be adding any new entries to the
228168404Spjd * unlinked set, even though we might (this is to lower the minimum file
229168404Spjd * size that can be deleted in a full filesystem).  So on the small
230168404Spjd * chance that the nlink list is using a fat zap (ie. has more than
231168404Spjd * 2000 entries), we *may* not pre-read a block that's needed.
232168404Spjd * Therefore it is remotely possible for some of the assertions
233168404Spjd * regarding the unlinked set below to fail due to i/o error.  On a
234168404Spjd * nondebug system, this will result in the space being leaked.
235168404Spjd */
236168404Spjdvoid
237168404Spjdzfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
238168404Spjd{
239168404Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
240168404Spjd
241168404Spjd	ASSERT(zp->z_unlinked);
242219089Spjd	ASSERT(zp->z_links == 0);
243168404Spjd
244185029Spjd	VERIFY3U(0, ==,
245185029Spjd	    zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
246168404Spjd}
247168404Spjd
248168404Spjd/*
249168404Spjd * Clean up any znodes that had no links when we either crashed or
250168404Spjd * (force) umounted the file system.
251168404Spjd */
252168404Spjdvoid
253168404Spjdzfs_unlinked_drain(zfsvfs_t *zfsvfs)
254168404Spjd{
255168404Spjd	zap_cursor_t	zc;
256168404Spjd	zap_attribute_t zap;
257168404Spjd	dmu_object_info_t doi;
258168404Spjd	znode_t		*zp;
259168404Spjd	int		error;
260168404Spjd
261168404Spjd	/*
262168404Spjd	 * Interate over the contents of the unlinked set.
263168404Spjd	 */
264168404Spjd	for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj);
265168404Spjd	    zap_cursor_retrieve(&zc, &zap) == 0;
266168404Spjd	    zap_cursor_advance(&zc)) {
267168404Spjd
268168404Spjd		/*
269168404Spjd		 * See what kind of object we have in list
270168404Spjd		 */
271168404Spjd
272168404Spjd		error = dmu_object_info(zfsvfs->z_os,
273168404Spjd		    zap.za_first_integer, &doi);
274168404Spjd		if (error != 0)
275168404Spjd			continue;
276168404Spjd
277168404Spjd		ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
278168404Spjd		    (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
279168404Spjd		/*
280168404Spjd		 * We need to re-mark these list entries for deletion,
281168404Spjd		 * so we pull them back into core and set zp->z_unlinked.
282168404Spjd		 */
283168404Spjd		error = zfs_zget(zfsvfs, zap.za_first_integer, &zp);
284168404Spjd
285168404Spjd		/*
286168404Spjd		 * We may pick up znodes that are already marked for deletion.
287168404Spjd		 * This could happen during the purge of an extended attribute
288168404Spjd		 * directory.  All we need to do is skip over them, since they
289168404Spjd		 * are already in the system marked z_unlinked.
290168404Spjd		 */
291168404Spjd		if (error != 0)
292168404Spjd			continue;
293168404Spjd
294304671Savg		vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY);
295168404Spjd		zp->z_unlinked = B_TRUE;
296304671Savg		vput(ZTOV(zp));
297168404Spjd	}
298168404Spjd	zap_cursor_fini(&zc);
299168404Spjd}
300168404Spjd
301168404Spjd/*
302168404Spjd * Delete the entire contents of a directory.  Return a count
303185029Spjd * of the number of entries that could not be deleted. If we encounter
304185029Spjd * an error, return a count of at least one so that the directory stays
305185029Spjd * in the unlinked set.
306168404Spjd *
307168404Spjd * NOTE: this function assumes that the directory is inactive,
308168404Spjd *	so there is no need to lock its entries before deletion.
309168404Spjd *	Also, it assumes the directory contents is *only* regular
310168404Spjd *	files.
311168404Spjd */
312168404Spjdstatic int
313168404Spjdzfs_purgedir(znode_t *dzp)
314168404Spjd{
315168404Spjd	zap_cursor_t	zc;
316168404Spjd	zap_attribute_t	zap;
317168404Spjd	znode_t		*xzp;
318168404Spjd	dmu_tx_t	*tx;
319168404Spjd	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
320168404Spjd	int skipped = 0;
321168404Spjd	int error;
322168404Spjd
323168404Spjd	for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
324168404Spjd	    (error = zap_cursor_retrieve(&zc, &zap)) == 0;
325168404Spjd	    zap_cursor_advance(&zc)) {
326168404Spjd		error = zfs_zget(zfsvfs,
327168404Spjd		    ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp);
328185029Spjd		if (error) {
329185029Spjd			skipped += 1;
330185029Spjd			continue;
331185029Spjd		}
332168404Spjd
333304671Savg		vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY);
334168404Spjd		ASSERT((ZTOV(xzp)->v_type == VREG) ||
335168404Spjd		    (ZTOV(xzp)->v_type == VLNK));
336168404Spjd
337168404Spjd		tx = dmu_tx_create(zfsvfs->z_os);
338219089Spjd		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
339168404Spjd		dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name);
340219089Spjd		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
341168404Spjd		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
342219089Spjd		/* Is this really needed ? */
343219089Spjd		zfs_sa_upgrade_txholds(tx, xzp);
344269002Sdelphij		dmu_tx_mark_netfree(tx);
345168404Spjd		error = dmu_tx_assign(tx, TXG_WAIT);
346168404Spjd		if (error) {
347168404Spjd			dmu_tx_abort(tx);
348304671Savg			vput(ZTOV(xzp));
349168404Spjd			skipped += 1;
350168404Spjd			continue;
351168404Spjd		}
352168404Spjd
353304671Savg		error = zfs_link_destroy(dzp, zap.za_name, xzp, tx, 0, NULL);
354185029Spjd		if (error)
355185029Spjd			skipped += 1;
356168404Spjd		dmu_tx_commit(tx);
357168404Spjd
358304671Savg		vput(ZTOV(xzp));
359168404Spjd	}
360168404Spjd	zap_cursor_fini(&zc);
361185029Spjd	if (error != ENOENT)
362185029Spjd		skipped += 1;
363168404Spjd	return (skipped);
364168404Spjd}
365168404Spjd
366168404Spjdvoid
367168404Spjdzfs_rmnode(znode_t *zp)
368168404Spjd{
369168404Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
370168404Spjd	objset_t	*os = zfsvfs->z_os;
371168404Spjd	znode_t		*xzp = NULL;
372168404Spjd	dmu_tx_t	*tx;
373168404Spjd	uint64_t	acl_obj;
374219089Spjd	uint64_t	xattr_obj;
375168404Spjd	int		error;
376168404Spjd
377219089Spjd	ASSERT(zp->z_links == 0);
378304671Savg	ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
379168404Spjd
380168404Spjd	/*
381168404Spjd	 * If this is an attribute directory, purge its contents.
382168404Spjd	 */
383168404Spjd	if (ZTOV(zp) != NULL && ZTOV(zp)->v_type == VDIR &&
384219089Spjd	    (zp->z_pflags & ZFS_XATTR)) {
385168404Spjd		if (zfs_purgedir(zp) != 0) {
386168404Spjd			/*
387168404Spjd			 * Not enough space to delete some xattrs.
388185029Spjd			 * Leave it in the unlinked set.
389168404Spjd			 */
390185029Spjd			zfs_znode_dmu_fini(zp);
391185029Spjd			zfs_znode_free(zp);
392168404Spjd			return;
393168404Spjd		}
394168404Spjd	}
395168404Spjd
396168404Spjd	/*
397185029Spjd	 * Free up all the data in the file.
398185029Spjd	 */
399185029Spjd	error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END);
400185029Spjd	if (error) {
401185029Spjd		/*
402185029Spjd		 * Not enough space.  Leave the file in the unlinked set.
403185029Spjd		 */
404185029Spjd		zfs_znode_dmu_fini(zp);
405185029Spjd		zfs_znode_free(zp);
406185029Spjd		return;
407185029Spjd	}
408185029Spjd
409185029Spjd	/*
410168404Spjd	 * If the file has extended attributes, we're going to unlink
411168404Spjd	 * the xattr dir.
412168404Spjd	 */
413219089Spjd	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
414219089Spjd	    &xattr_obj, sizeof (xattr_obj));
415219089Spjd	if (error == 0 && xattr_obj) {
416219089Spjd		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
417304671Savg		ASSERT3S(error, ==, 0);
418304671Savg		vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY);
419168404Spjd	}
420168404Spjd
421219089Spjd	acl_obj = zfs_external_acl(zp);
422168404Spjd
423168404Spjd	/*
424185029Spjd	 * Set up the final transaction.
425168404Spjd	 */
426168404Spjd	tx = dmu_tx_create(os);
427168404Spjd	dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
428168404Spjd	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
429168404Spjd	if (xzp) {
430168404Spjd		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL);
431219089Spjd		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
432168404Spjd	}
433168404Spjd	if (acl_obj)
434168404Spjd		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
435219089Spjd
436219089Spjd	zfs_sa_upgrade_txholds(tx, zp);
437168404Spjd	error = dmu_tx_assign(tx, TXG_WAIT);
438168404Spjd	if (error) {
439168404Spjd		/*
440168404Spjd		 * Not enough space to delete the file.  Leave it in the
441168404Spjd		 * unlinked set, leaking it until the fs is remounted (at
442168404Spjd		 * which point we'll call zfs_unlinked_drain() to process it).
443168404Spjd		 */
444168404Spjd		dmu_tx_abort(tx);
445185029Spjd		zfs_znode_dmu_fini(zp);
446185029Spjd		zfs_znode_free(zp);
447185029Spjd		goto out;
448168404Spjd	}
449168404Spjd
450168404Spjd	if (xzp) {
451219089Spjd		ASSERT(error == 0);
452168404Spjd		xzp->z_unlinked = B_TRUE;	/* mark xzp for deletion */
453219089Spjd		xzp->z_links = 0;	/* no more links to it */
454219089Spjd		VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
455219089Spjd		    &xzp->z_links, sizeof (xzp->z_links), tx));
456168404Spjd		zfs_unlinked_add(xzp, tx);
457168404Spjd	}
458168404Spjd
459168404Spjd	/* Remove this znode from the unlinked set */
460185029Spjd	VERIFY3U(0, ==,
461185029Spjd	    zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
462168404Spjd
463168404Spjd	zfs_znode_delete(zp, tx);
464168404Spjd
465168404Spjd	dmu_tx_commit(tx);
466185029Spjdout:
467168404Spjd	if (xzp)
468304671Savg		vput(ZTOV(xzp));
469168404Spjd}
470168404Spjd
471185029Spjdstatic uint64_t
472219089Spjdzfs_dirent(znode_t *zp, uint64_t mode)
473185029Spjd{
474185029Spjd	uint64_t de = zp->z_id;
475219089Spjd
476185029Spjd	if (zp->z_zfsvfs->z_version >= ZPL_VERSION_DIRENT_TYPE)
477219089Spjd		de |= IFTODT(mode) << 60;
478185029Spjd	return (de);
479185029Spjd}
480185029Spjd
481168404Spjd/*
482304671Savg * Link zp into dzp.  Can only fail if zp has been unlinked.
483168404Spjd */
484168404Spjdint
485304671Savgzfs_link_create(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
486304671Savg    int flag)
487168404Spjd{
488219089Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
489168404Spjd	vnode_t *vp = ZTOV(zp);
490168404Spjd	uint64_t value;
491168404Spjd	int zp_is_dir = (vp->v_type == VDIR);
492219089Spjd	sa_bulk_attr_t bulk[5];
493219089Spjd	uint64_t mtime[2], ctime[2];
494219089Spjd	int count = 0;
495168404Spjd	int error;
496168404Spjd
497304671Savg	ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
498304671Savg	ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
499304671Savg#if 0
500304671Savg	if (zp_is_dir) {
501304671Savg		error = 0;
502304671Savg		if (dzp->z_links >= LINK_MAX)
503304671Savg			error = SET_ERROR(EMLINK);
504304671Savg		return (error);
505304671Savg	}
506304671Savg#endif
507168404Spjd	if (!(flag & ZRENAMING)) {
508168404Spjd		if (zp->z_unlinked) {	/* no new links to unlinked zp */
509168404Spjd			ASSERT(!(flag & (ZNEW | ZEXISTS)));
510249195Smm			return (SET_ERROR(ENOENT));
511168404Spjd		}
512304671Savg#if 0
513304671Savg		if (zp->z_links >= LINK_MAX) {
514304671Savg			return (SET_ERROR(EMLINK));
515304671Savg		}
516304671Savg#endif
517219089Spjd		zp->z_links++;
518219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
519219089Spjd		    &zp->z_links, sizeof (zp->z_links));
520219089Spjd
521304671Savg	} else {
522304671Savg		ASSERT(zp->z_unlinked == 0);
523168404Spjd	}
524219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
525219089Spjd	    &dzp->z_id, sizeof (dzp->z_id));
526219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
527219089Spjd	    &zp->z_pflags, sizeof (zp->z_pflags));
528168404Spjd
529219089Spjd	if (!(flag & ZNEW)) {
530219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
531219089Spjd		    ctime, sizeof (ctime));
532219089Spjd		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
533219089Spjd		    ctime, B_TRUE);
534219089Spjd	}
535219089Spjd	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
536304671Savg	ASSERT0(error);
537219089Spjd
538219089Spjd	dzp->z_size++;
539219089Spjd	dzp->z_links += zp_is_dir;
540219089Spjd	count = 0;
541219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
542219089Spjd	    &dzp->z_size, sizeof (dzp->z_size));
543219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
544219089Spjd	    &dzp->z_links, sizeof (dzp->z_links));
545219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
546219089Spjd	    mtime, sizeof (mtime));
547219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
548219089Spjd	    ctime, sizeof (ctime));
549219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
550219089Spjd	    &dzp->z_pflags, sizeof (dzp->z_pflags));
551219089Spjd	zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
552219089Spjd	error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
553304671Savg	ASSERT0(error);
554168404Spjd
555219089Spjd	value = zfs_dirent(zp, zp->z_mode);
556304671Savg	error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, name,
557168404Spjd	    8, 1, &value, tx);
558304671Savg	VERIFY0(error);
559168404Spjd
560168404Spjd	return (0);
561168404Spjd}
562168404Spjd
563219089Spjdstatic int
564304671Savgzfs_dropname(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
565219089Spjd    int flag)
566219089Spjd{
567219089Spjd	int error;
568219089Spjd
569219089Spjd	if (zp->z_zfsvfs->z_norm) {
570304671Savg		if (zp->z_zfsvfs->z_case == ZFS_CASE_MIXED)
571219089Spjd			error = zap_remove_norm(zp->z_zfsvfs->z_os,
572304671Savg			    dzp->z_id, name, MT_EXACT, tx);
573219089Spjd		else
574219089Spjd			error = zap_remove_norm(zp->z_zfsvfs->z_os,
575304671Savg			    dzp->z_id, name, MT_FIRST, tx);
576219089Spjd	} else {
577219089Spjd		error = zap_remove(zp->z_zfsvfs->z_os,
578304671Savg		    dzp->z_id, name, tx);
579219089Spjd	}
580219089Spjd
581219089Spjd	return (error);
582219089Spjd}
583219089Spjd
584168404Spjd/*
585304671Savg * Unlink zp from dzp, and mark zp for deletion if this was the last link.
586168404Spjd * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST).
587168404Spjd * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list.
588168404Spjd * If it's non-NULL, we use it to indicate whether the znode needs deletion,
589168404Spjd * and it's the caller's job to do it.
590168404Spjd */
591168404Spjdint
592304671Savgzfs_link_destroy(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
593304671Savg    int flag, boolean_t *unlinkedp)
594168404Spjd{
595219089Spjd	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
596168404Spjd	vnode_t *vp = ZTOV(zp);
597168404Spjd	int zp_is_dir = (vp->v_type == VDIR);
598168404Spjd	boolean_t unlinked = B_FALSE;
599219089Spjd	sa_bulk_attr_t bulk[5];
600219089Spjd	uint64_t mtime[2], ctime[2];
601219089Spjd	int count = 0;
602168404Spjd	int error;
603168404Spjd
604304671Savg	ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
605304671Savg	ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
606168404Spjd
607168404Spjd	if (!(flag & ZRENAMING)) {
608168404Spjd
609219089Spjd		if (zp_is_dir && !zfs_dirempty(zp)) {
610249195Smm#ifdef illumos
611249195Smm			return (SET_ERROR(EEXIST));
612249195Smm#else
613249195Smm			return (SET_ERROR(ENOTEMPTY));
614249195Smm#endif
615168404Spjd		}
616219089Spjd
617219089Spjd		/*
618219089Spjd		 * If we get here, we are going to try to remove the object.
619219089Spjd		 * First try removing the name from the directory; if that
620219089Spjd		 * fails, return the error.
621219089Spjd		 */
622304671Savg		error = zfs_dropname(dzp, name, zp, tx, flag);
623219089Spjd		if (error != 0) {
624219089Spjd			return (error);
625219089Spjd		}
626219089Spjd
627219089Spjd		if (zp->z_links <= zp_is_dir) {
628168404Spjd			zfs_panic_recover("zfs: link count on vnode %p is %u, "
629168404Spjd			    "should be at least %u", zp->z_vnode,
630219089Spjd			    (int)zp->z_links,
631168404Spjd			    zp_is_dir + 1);
632219089Spjd			zp->z_links = zp_is_dir + 1;
633168404Spjd		}
634219089Spjd		if (--zp->z_links == zp_is_dir) {
635168404Spjd			zp->z_unlinked = B_TRUE;
636219089Spjd			zp->z_links = 0;
637168404Spjd			unlinked = B_TRUE;
638168404Spjd		} else {
639219089Spjd			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
640219089Spjd			    NULL, &ctime, sizeof (ctime));
641219089Spjd			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
642219089Spjd			    NULL, &zp->z_pflags, sizeof (zp->z_pflags));
643219089Spjd			zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
644219089Spjd			    B_TRUE);
645168404Spjd		}
646219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
647219089Spjd		    NULL, &zp->z_links, sizeof (zp->z_links));
648219089Spjd		error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
649219089Spjd		count = 0;
650304671Savg		ASSERT0(error);
651219089Spjd	} else {
652304671Savg		ASSERT(zp->z_unlinked == 0);
653304671Savg		error = zfs_dropname(dzp, name, zp, tx, flag);
654219089Spjd		if (error != 0)
655219089Spjd			return (error);
656168404Spjd	}
657168404Spjd
658219089Spjd	dzp->z_size--;		/* one dirent removed */
659219089Spjd	dzp->z_links -= zp_is_dir;	/* ".." link from zp */
660219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
661219089Spjd	    NULL, &dzp->z_links, sizeof (dzp->z_links));
662219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
663219089Spjd	    NULL, &dzp->z_size, sizeof (dzp->z_size));
664219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
665219089Spjd	    NULL, ctime, sizeof (ctime));
666219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
667219089Spjd	    NULL, mtime, sizeof (mtime));
668219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
669219089Spjd	    NULL, &dzp->z_pflags, sizeof (dzp->z_pflags));
670219089Spjd	zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
671219089Spjd	error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
672304671Savg	ASSERT0(error);
673168404Spjd
674168404Spjd	if (unlinkedp != NULL)
675168404Spjd		*unlinkedp = unlinked;
676168404Spjd	else if (unlinked)
677168404Spjd		zfs_unlinked_add(zp, tx);
678168404Spjd
679168404Spjd	return (0);
680168404Spjd}
681168404Spjd
682168404Spjd/*
683304671Savg * Indicate whether the directory is empty.
684168404Spjd */
685168404Spjdboolean_t
686168404Spjdzfs_dirempty(znode_t *dzp)
687168404Spjd{
688304671Savg	return (dzp->z_size == 2);
689168404Spjd}
690168404Spjd
691168404Spjdint
692168404Spjdzfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr)
693168404Spjd{
694168404Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
695168404Spjd	znode_t *xzp;
696168404Spjd	dmu_tx_t *tx;
697168404Spjd	int error;
698209962Smm	zfs_acl_ids_t acl_ids;
699209962Smm	boolean_t fuid_dirtied;
700219089Spjd	uint64_t parent;
701168404Spjd
702168404Spjd	*xvpp = NULL;
703168404Spjd
704195785Strasz	/*
705195785Strasz	 * In FreeBSD, access checking for creating an EA is being done
706195785Strasz	 * in zfs_setextattr(),
707195785Strasz	 */
708252431Srmh#ifndef __FreeBSD_kernel__
709185029Spjd	if (error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr))
710168404Spjd		return (error);
711195785Strasz#endif
712168404Spjd
713209962Smm	if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL,
714209962Smm	    &acl_ids)) != 0)
715209962Smm		return (error);
716209962Smm	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
717209962Smm		zfs_acl_ids_free(&acl_ids);
718249195Smm		return (SET_ERROR(EDQUOT));
719209962Smm	}
720209962Smm
721262112Savg	getnewvnode_reserve(1);
722262112Savg
723168404Spjd	tx = dmu_tx_create(zfsvfs->z_os);
724219089Spjd	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
725219089Spjd	    ZFS_SA_BASE_ATTR_SIZE);
726219089Spjd	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
727168404Spjd	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
728209962Smm	fuid_dirtied = zfsvfs->z_fuid_dirty;
729209962Smm	if (fuid_dirtied)
730209962Smm		zfs_fuid_txhold(zfsvfs, tx);
731260776Savg	error = dmu_tx_assign(tx, TXG_WAIT);
732168404Spjd	if (error) {
733209962Smm		zfs_acl_ids_free(&acl_ids);
734168404Spjd		dmu_tx_abort(tx);
735168404Spjd		return (error);
736168404Spjd	}
737219089Spjd	zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids);
738209962Smm
739209962Smm	if (fuid_dirtied)
740209962Smm		zfs_fuid_sync(zfsvfs, tx);
741209962Smm
742219089Spjd#ifdef DEBUG
743219089Spjd	error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
744219089Spjd	    &parent, sizeof (parent));
745219089Spjd	ASSERT(error == 0 && parent == zp->z_id);
746219089Spjd#endif
747168404Spjd
748219089Spjd	VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id,
749219089Spjd	    sizeof (xzp->z_id), tx));
750219089Spjd
751185029Spjd	(void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp,
752209962Smm	    xzp, "", NULL, acl_ids.z_fuidp, vap);
753209962Smm
754209962Smm	zfs_acl_ids_free(&acl_ids);
755168404Spjd	dmu_tx_commit(tx);
756168404Spjd
757262112Savg	getnewvnode_drop_reserve();
758262112Savg
759168404Spjd	*xvpp = ZTOV(xzp);
760168404Spjd
761168404Spjd	return (0);
762168404Spjd}
763168404Spjd
764168404Spjd/*
765168404Spjd * Return a znode for the extended attribute directory for zp.
766168404Spjd * ** If the directory does not already exist, it is created **
767168404Spjd *
768168404Spjd *	IN:	zp	- znode to obtain attribute directory from
769168404Spjd *		cr	- credentials of caller
770168404Spjd *		flags	- flags from the VOP_LOOKUP call
771168404Spjd *
772168404Spjd *	OUT:	xzpp	- pointer to extended attribute znode
773168404Spjd *
774168404Spjd *	RETURN:	0 on success
775168404Spjd *		error number on failure
776168404Spjd */
777168404Spjdint
778168404Spjdzfs_get_xattrdir(znode_t *zp, vnode_t **xvpp, cred_t *cr, int flags)
779168404Spjd{
780168404Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
781168404Spjd	znode_t		*xzp;
782168404Spjd	vattr_t		va;
783168404Spjd	int		error;
784168404Spjdtop:
785304671Savg	error = zfs_dirent_lookup(zp, "", &xzp, ZXATTR);
786168404Spjd	if (error)
787168404Spjd		return (error);
788168404Spjd
789168404Spjd	if (xzp != NULL) {
790168404Spjd		*xvpp = ZTOV(xzp);
791168404Spjd		return (0);
792168404Spjd	}
793168404Spjd
794168404Spjd
795168404Spjd	if (!(flags & CREATE_XATTR_DIR)) {
796249195Smm#ifdef illumos
797249195Smm		return (SET_ERROR(ENOENT));
798195785Strasz#else
799249195Smm		return (SET_ERROR(ENOATTR));
800195785Strasz#endif
801168404Spjd	}
802168404Spjd
803168404Spjd	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
804249195Smm		return (SET_ERROR(EROFS));
805168404Spjd	}
806168404Spjd
807168404Spjd	/*
808168404Spjd	 * The ability to 'create' files in an attribute
809168404Spjd	 * directory comes from the write_xattr permission on the base file.
810168404Spjd	 *
811168404Spjd	 * The ability to 'search' an attribute directory requires
812168404Spjd	 * read_xattr permission on the base file.
813168404Spjd	 *
814168404Spjd	 * Once in a directory the ability to read/write attributes
815168404Spjd	 * is controlled by the permissions on the attribute file.
816168404Spjd	 */
817168404Spjd	va.va_mask = AT_TYPE | AT_MODE | AT_UID | AT_GID;
818168404Spjd	va.va_type = VDIR;
819168404Spjd	va.va_mode = S_IFDIR | S_ISVTX | 0777;
820185029Spjd	zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid);
821168404Spjd
822168404Spjd	error = zfs_make_xattrdir(zp, &va, xvpp, cr);
823168404Spjd
824209962Smm	if (error == ERESTART) {
825168404Spjd		/* NB: we already did dmu_tx_wait() if necessary */
826168404Spjd		goto top;
827168404Spjd	}
828189967Sjhb	if (error == 0)
829189967Sjhb		VOP_UNLOCK(*xvpp, 0);
830168404Spjd
831168404Spjd	return (error);
832168404Spjd}
833168404Spjd
834168404Spjd/*
835168404Spjd * Decide whether it is okay to remove within a sticky directory.
836168404Spjd *
837168404Spjd * In sticky directories, write access is not sufficient;
838168404Spjd * you can remove entries from a directory only if:
839168404Spjd *
840168404Spjd *	you own the directory,
841168404Spjd *	you own the entry,
842168404Spjd *	the entry is a plain file and you have write access,
843168404Spjd *	or you are privileged (checked in secpolicy...).
844168404Spjd *
845168404Spjd * The function returns 0 if remove access is granted.
846168404Spjd */
847168404Spjdint
848168404Spjdzfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
849168404Spjd{
850168404Spjd	uid_t  		uid;
851185029Spjd	uid_t		downer;
852185029Spjd	uid_t		fowner;
853185029Spjd	zfsvfs_t	*zfsvfs = zdp->z_zfsvfs;
854168404Spjd
855209962Smm	if (zdp->z_zfsvfs->z_replay)
856168404Spjd		return (0);
857168404Spjd
858219089Spjd	if ((zdp->z_mode & S_ISVTX) == 0)
859185029Spjd		return (0);
860185029Spjd
861219089Spjd	downer = zfs_fuid_map_id(zfsvfs, zdp->z_uid, cr, ZFS_OWNER);
862219089Spjd	fowner = zfs_fuid_map_id(zfsvfs, zp->z_uid, cr, ZFS_OWNER);
863185029Spjd
864185029Spjd	if ((uid = crgetuid(cr)) == downer || uid == fowner ||
865168404Spjd	    (ZTOV(zp)->v_type == VREG &&
866185029Spjd	    zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0))
867168404Spjd		return (0);
868168404Spjd	else
869185029Spjd		return (secpolicy_vnode_remove(ZTOV(zp), cr));
870168404Spjd}
871