zfs_znode.c revision 210470
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26/* Portions Copyright 2007 Jeremy Teo */
27
28#ifdef _KERNEL
29#include <sys/types.h>
30#include <sys/param.h>
31#include <sys/time.h>
32#include <sys/systm.h>
33#include <sys/sysmacros.h>
34#include <sys/resource.h>
35#include <sys/mntent.h>
36#include <sys/u8_textprep.h>
37#include <sys/dsl_dataset.h>
38#include <sys/vfs.h>
39#include <sys/vnode.h>
40#include <sys/file.h>
41#include <sys/kmem.h>
42#include <sys/errno.h>
43#include <sys/unistd.h>
44#include <sys/atomic.h>
45#include <sys/zfs_dir.h>
46#include <sys/zfs_acl.h>
47#include <sys/zfs_ioctl.h>
48#include <sys/zfs_rlock.h>
49#include <sys/zfs_fuid.h>
50#include <sys/fs/zfs.h>
51#include <sys/kidmap.h>
52#endif /* _KERNEL */
53
54#include <sys/dmu.h>
55#include <sys/refcount.h>
56#include <sys/stat.h>
57#include <sys/zap.h>
58#include <sys/zfs_znode.h>
59#include <sys/refcount.h>
60
61#include "zfs_prop.h"
62
63/* Used by fstat(1). */
64SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, 0, sizeof(znode_t),
65    "sizeof(znode_t)");
66
67/*
68 * Define ZNODE_STATS to turn on statistic gathering. By default, it is only
69 * turned on when DEBUG is also defined.
70 */
71#ifdef	DEBUG
72#define	ZNODE_STATS
73#endif	/* DEBUG */
74
75#ifdef	ZNODE_STATS
76#define	ZNODE_STAT_ADD(stat)			((stat)++)
77#else
78#define	ZNODE_STAT_ADD(stat)			/* nothing */
79#endif	/* ZNODE_STATS */
80
81#define	POINTER_IS_VALID(p)	(!((uintptr_t)(p) & 0x3))
82#define	POINTER_INVALIDATE(pp)	(*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1))
83
/*
 * Functions needed for userland (i.e. libzpool) are not put under
 * #ifdef _KERNEL; the rest of the functions have dependencies
 * (such as VFS logic) that will not compile easily in userland.
 */
89#ifdef _KERNEL
90/*
91 * Needed to close a small window in zfs_znode_move() that allows the zfsvfs to
92 * be freed before it can be safely accessed.
93 */
94krwlock_t zfsvfs_lock;
95
96static kmem_cache_t *znode_cache = NULL;
97
98/*ARGSUSED*/
99static void
100znode_evict_error(dmu_buf_t *dbuf, void *user_ptr)
101{
102#if 1	/* XXXPJD: From OpenSolaris. */
103	/*
104	 * We should never drop all dbuf refs without first clearing
105	 * the eviction callback.
106	 */
107	panic("evicting znode %p\n", user_ptr);
108#else	/* XXXPJD */
109	znode_t *zp = user_ptr;
110	vnode_t *vp;
111
112	mutex_enter(&zp->z_lock);
113	zp->z_dbuf = NULL;
114	vp = ZTOV(zp);
115	if (vp == NULL) {
116		mutex_exit(&zp->z_lock);
117		zfs_znode_free(zp);
118	} else if (vp->v_count == 0) {
119		zp->z_vnode = NULL;
120		vhold(vp);
121		mutex_exit(&zp->z_lock);
122		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
123		vrecycle(vp, curthread);
124		VOP_UNLOCK(vp, 0);
125		vdrop(vp);
126		zfs_znode_free(zp);
127	} else {
128		mutex_exit(&zp->z_lock);
129	}
130#endif
131}
132
133extern struct vop_vector zfs_vnodeops;
134extern struct vop_vector zfs_fifoops;
135extern struct vop_vector zfs_shareops;
136
137/*
138 * XXX: We cannot use this function as a cache constructor, because
139 *      there is one global cache for all file systems and we need
140 *      to pass vfsp here, which is not possible, because argument
141 *      'cdrarg' is defined at kmem_cache_create() time.
142 */
143static int
144zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
145{
146	znode_t *zp = buf;
147	vnode_t *vp;
148	vfs_t *vfsp = arg;
149	int error;
150
151	POINTER_INVALIDATE(&zp->z_zfsvfs);
152	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
153
154	if (vfsp != NULL) {
155		error = getnewvnode("zfs", vfsp, &zfs_vnodeops, &vp);
156		if (error != 0 && (kmflags & KM_NOSLEEP))
157			return (-1);
158		ASSERT(error == 0);
159		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
160		zp->z_vnode = vp;
161		vp->v_data = (caddr_t)zp;
162		VN_LOCK_AREC(vp);
163	} else {
164		zp->z_vnode = NULL;
165	}
166
167	list_link_init(&zp->z_link_node);
168
169	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
170	rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
171	rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
172	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
173
174	mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
175	avl_create(&zp->z_range_avl, zfs_range_compare,
176	    sizeof (rl_t), offsetof(rl_t, r_node));
177
178	zp->z_dbuf = NULL;
179	zp->z_dirlocks = NULL;
180	return (0);
181}
182
183/*ARGSUSED*/
184static void
185zfs_znode_cache_destructor(void *buf, void *arg)
186{
187	znode_t *zp = buf;
188
189	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
190	ASSERT(ZTOV(zp) == NULL);
191	vn_free(ZTOV(zp));
192	ASSERT(!list_link_active(&zp->z_link_node));
193	mutex_destroy(&zp->z_lock);
194	rw_destroy(&zp->z_parent_lock);
195	rw_destroy(&zp->z_name_lock);
196	mutex_destroy(&zp->z_acl_lock);
197	avl_destroy(&zp->z_range_avl);
198	mutex_destroy(&zp->z_range_lock);
199
200	ASSERT(zp->z_dbuf == NULL);
201	ASSERT(zp->z_dirlocks == NULL);
202}
203
204#ifdef	ZNODE_STATS
205static struct {
206	uint64_t zms_zfsvfs_invalid;
207	uint64_t zms_zfsvfs_recheck1;
208	uint64_t zms_zfsvfs_unmounted;
209	uint64_t zms_zfsvfs_recheck2;
210	uint64_t zms_obj_held;
211	uint64_t zms_vnode_locked;
212	uint64_t zms_not_only_dnlc;
213} znode_move_stats;
214#endif	/* ZNODE_STATS */
215
216#if defined(sun)
217static void
218zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
219{
220	vnode_t *vp;
221
222	/* Copy fields. */
223	nzp->z_zfsvfs = ozp->z_zfsvfs;
224
225	/* Swap vnodes. */
226	vp = nzp->z_vnode;
227	nzp->z_vnode = ozp->z_vnode;
228	ozp->z_vnode = vp; /* let destructor free the overwritten vnode */
229	ZTOV(ozp)->v_data = ozp;
230	ZTOV(nzp)->v_data = nzp;
231
232	nzp->z_id = ozp->z_id;
233	ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */
234	ASSERT(avl_numnodes(&ozp->z_range_avl) == 0);
235	nzp->z_unlinked = ozp->z_unlinked;
236	nzp->z_atime_dirty = ozp->z_atime_dirty;
237	nzp->z_zn_prefetch = ozp->z_zn_prefetch;
238	nzp->z_blksz = ozp->z_blksz;
239	nzp->z_seq = ozp->z_seq;
240	nzp->z_mapcnt = ozp->z_mapcnt;
241	nzp->z_last_itx = ozp->z_last_itx;
242	nzp->z_gen = ozp->z_gen;
243	nzp->z_sync_cnt = ozp->z_sync_cnt;
244	nzp->z_phys = ozp->z_phys;
245	nzp->z_dbuf = ozp->z_dbuf;
246
247	/* Update back pointers. */
248	(void) dmu_buf_update_user(nzp->z_dbuf, ozp, nzp, &nzp->z_phys,
249	    znode_evict_error);
250
251	/*
252	 * Invalidate the original znode by clearing fields that provide a
253	 * pointer back to the znode. Set the low bit of the vfs pointer to
254	 * ensure that zfs_znode_move() recognizes the znode as invalid in any
255	 * subsequent callback.
256	 */
257	ozp->z_dbuf = NULL;
258	POINTER_INVALIDATE(&ozp->z_zfsvfs);
259}
260
261/*ARGSUSED*/
262static kmem_cbrc_t
263zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg)
264{
265	znode_t *ozp = buf, *nzp = newbuf;
266	zfsvfs_t *zfsvfs;
267	vnode_t *vp;
268
269	/*
270	 * The znode is on the file system's list of known znodes if the vfs
271	 * pointer is valid. We set the low bit of the vfs pointer when freeing
272	 * the znode to invalidate it, and the memory patterns written by kmem
273	 * (baddcafe and deadbeef) set at least one of the two low bits. A newly
274	 * created znode sets the vfs pointer last of all to indicate that the
275	 * znode is known and in a valid state to be moved by this function.
276	 */
277	zfsvfs = ozp->z_zfsvfs;
278	if (!POINTER_IS_VALID(zfsvfs)) {
279		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid);
280		return (KMEM_CBRC_DONT_KNOW);
281	}
282
283	/*
284	 * Close a small window in which it's possible that the filesystem could
285	 * be unmounted and freed, and zfsvfs, though valid in the previous
286	 * statement, could point to unrelated memory by the time we try to
287	 * prevent the filesystem from being unmounted.
288	 */
289	rw_enter(&zfsvfs_lock, RW_WRITER);
290	if (zfsvfs != ozp->z_zfsvfs) {
291		rw_exit(&zfsvfs_lock);
292		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck1);
293		return (KMEM_CBRC_DONT_KNOW);
294	}
295
296	/*
297	 * If the znode is still valid, then so is the file system. We know that
298	 * no valid file system can be freed while we hold zfsvfs_lock, so we
299	 * can safely ensure that the filesystem is not and will not be
300	 * unmounted. The next statement is equivalent to ZFS_ENTER().
301	 */
302	rrw_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
303	if (zfsvfs->z_unmounted) {
304		ZFS_EXIT(zfsvfs);
305		rw_exit(&zfsvfs_lock);
306		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted);
307		return (KMEM_CBRC_DONT_KNOW);
308	}
309	rw_exit(&zfsvfs_lock);
310
311	mutex_enter(&zfsvfs->z_znodes_lock);
312	/*
313	 * Recheck the vfs pointer in case the znode was removed just before
314	 * acquiring the lock.
315	 */
316	if (zfsvfs != ozp->z_zfsvfs) {
317		mutex_exit(&zfsvfs->z_znodes_lock);
318		ZFS_EXIT(zfsvfs);
319		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck2);
320		return (KMEM_CBRC_DONT_KNOW);
321	}
322
323	/*
324	 * At this point we know that as long as we hold z_znodes_lock, the
325	 * znode cannot be freed and fields within the znode can be safely
326	 * accessed. Now, prevent a race with zfs_zget().
327	 */
328	if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) {
329		mutex_exit(&zfsvfs->z_znodes_lock);
330		ZFS_EXIT(zfsvfs);
331		ZNODE_STAT_ADD(znode_move_stats.zms_obj_held);
332		return (KMEM_CBRC_LATER);
333	}
334
335	vp = ZTOV(ozp);
336	if (mutex_tryenter(&vp->v_lock) == 0) {
337		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
338		mutex_exit(&zfsvfs->z_znodes_lock);
339		ZFS_EXIT(zfsvfs);
340		ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked);
341		return (KMEM_CBRC_LATER);
342	}
343
344	/* Only move znodes that are referenced _only_ by the DNLC. */
345	if (vp->v_count != 1 || !vn_in_dnlc(vp)) {
346		mutex_exit(&vp->v_lock);
347		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
348		mutex_exit(&zfsvfs->z_znodes_lock);
349		ZFS_EXIT(zfsvfs);
350		ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc);
351		return (KMEM_CBRC_LATER);
352	}
353
354	/*
355	 * The znode is known and in a valid state to move. We're holding the
356	 * locks needed to execute the critical section.
357	 */
358	zfs_znode_move_impl(ozp, nzp);
359	mutex_exit(&vp->v_lock);
360	ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
361
362	list_link_replace(&ozp->z_link_node, &nzp->z_link_node);
363	mutex_exit(&zfsvfs->z_znodes_lock);
364	ZFS_EXIT(zfsvfs);
365
366	return (KMEM_CBRC_YES);
367}
368#endif /* sun */
369
370void
371zfs_znode_init(void)
372{
373	/*
374	 * Initialize zcache
375	 */
376	rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL);
377	ASSERT(znode_cache == NULL);
378	znode_cache = kmem_cache_create("zfs_znode_cache",
379	    sizeof (znode_t), 0, /* zfs_znode_cache_constructor */ NULL,
380	    zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
381#if defined(sun)
382	kmem_cache_set_move(znode_cache, zfs_znode_move);
383#endif
384}
385
386void
387zfs_znode_fini(void)
388{
389	/*
390	 * Cleanup zcache
391	 */
392	if (znode_cache)
393		kmem_cache_destroy(znode_cache);
394	znode_cache = NULL;
395	rw_destroy(&zfsvfs_lock);
396}
397
398int
399zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
400{
401	zfs_acl_ids_t acl_ids;
402	vattr_t vattr;
403	znode_t *sharezp;
404	vnode_t *vp, vnode;
405	znode_t *zp;
406	int error;
407
408	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
409	vattr.va_type = VDIR;
410	vattr.va_mode = S_IFDIR|0555;
411	vattr.va_uid = crgetuid(kcred);
412	vattr.va_gid = crgetgid(kcred);
413
414	sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP);
415	zfs_znode_cache_constructor(sharezp, zfsvfs->z_parent->z_vfs, 0);
416	sharezp->z_unlinked = 0;
417	sharezp->z_atime_dirty = 0;
418	sharezp->z_zfsvfs = zfsvfs;
419
420	sharezp->z_vnode = &vnode;
421	vnode.v_data = sharezp;
422
423	vp = ZTOV(sharezp);
424	vp->v_type = VDIR;
425
426	VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
427	    kcred, NULL, &acl_ids));
428	zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE,
429	    &zp, 0, &acl_ids);
430	ASSERT3P(zp, ==, sharezp);
431	POINTER_INVALIDATE(&sharezp->z_zfsvfs);
432	error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
433	    ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx);
434	zfsvfs->z_shares_dir = sharezp->z_id;
435
436	zfs_acl_ids_free(&acl_ids);
437	ZTOV(sharezp)->v_data = NULL;
438	ZTOV(sharezp)->v_count = 0;
439	ZTOV(sharezp)->v_holdcnt = 0;
440	zp->z_vnode = NULL;
441	sharezp->z_vnode = NULL;
442	dmu_buf_rele(sharezp->z_dbuf, NULL);
443	sharezp->z_dbuf = NULL;
444	kmem_cache_free(znode_cache, sharezp);
445
446	return (error);
447}
448
449/*
450 * define a couple of values we need available
451 * for both 64 and 32 bit environments.
452 */
453#ifndef NBITSMINOR64
454#define	NBITSMINOR64	32
455#endif
456#ifndef MAXMAJ64
457#define	MAXMAJ64	0xffffffffUL
458#endif
459#ifndef	MAXMIN64
460#define	MAXMIN64	0xffffffffUL
461#endif
462
463/*
464 * Create special expldev for ZFS private use.
465 * Can't use standard expldev since it doesn't do
466 * what we want.  The standard expldev() takes a
467 * dev32_t in LP64 and expands it to a long dev_t.
468 * We need an interface that takes a dev32_t in ILP32
469 * and expands it to a long dev_t.
470 */
471static uint64_t
472zfs_expldev(dev_t dev)
473{
474	return (((uint64_t)major(dev) << NBITSMINOR64) | minor(dev));
475}
476/*
477 * Special cmpldev for ZFS private use.
478 * Can't use standard cmpldev since it takes
479 * a long dev_t and compresses it to dev32_t in
480 * LP64.  We need to do a compaction of a long dev_t
481 * to a dev32_t in ILP32.
482 */
483dev_t
484zfs_cmpldev(uint64_t dev)
485{
486	return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64)));
487}
488
489static void
490zfs_znode_dmu_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db)
491{
492	znode_t		*nzp;
493
494	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
495	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));
496
497	mutex_enter(&zp->z_lock);
498
499	ASSERT(zp->z_dbuf == NULL);
500	zp->z_dbuf = db;
501	nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_evict_error);
502
503	/*
504	 * there should be no
505	 * concurrent zgets on this object.
506	 */
507	if (nzp != NULL)
508		panic("existing znode %p for dbuf %p", (void *)nzp, (void *)db);
509
510	/*
511	 * Slap on VROOT if we are the root znode
512	 */
513	if (zp->z_id == zfsvfs->z_root)
514		ZTOV(zp)->v_flag |= VROOT;
515
516	mutex_exit(&zp->z_lock);
517	vn_exists(ZTOV(zp));
518}
519
520void
521zfs_znode_dmu_fini(znode_t *zp)
522{
523	dmu_buf_t *db = zp->z_dbuf;
524	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) ||
525	    zp->z_unlinked ||
526	    RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock));
527	ASSERT(zp->z_dbuf != NULL);
528	zp->z_dbuf = NULL;
529	VERIFY(zp == dmu_buf_update_user(db, zp, NULL, NULL, NULL));
530	dmu_buf_rele(db, NULL);
531}
532
533/*
534 * Construct a new znode/vnode and intialize.
535 *
536 * This does not do a call to dmu_set_user() that is
537 * up to the caller to do, in case you don't want to
538 * return the znode
539 */
540static znode_t *
541zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz)
542{
543	znode_t	*zp;
544	vnode_t *vp;
545
546	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
547	zfs_znode_cache_constructor(zp, zfsvfs->z_parent->z_vfs, 0);
548
549	ASSERT(zp->z_dirlocks == NULL);
550	ASSERT(zp->z_dbuf == NULL);
551	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
552
553	/*
554	 * Defer setting z_zfsvfs until the znode is ready to be a candidate for
555	 * the zfs_znode_move() callback.
556	 */
557	zp->z_phys = NULL;
558	zp->z_unlinked = 0;
559	zp->z_atime_dirty = 0;
560	zp->z_mapcnt = 0;
561	zp->z_last_itx = 0;
562	zp->z_id = db->db_object;
563	zp->z_blksz = blksz;
564	zp->z_seq = 0x7A4653;
565	zp->z_sync_cnt = 0;
566
567	vp = ZTOV(zp);
568#ifdef TODO
569	vn_reinit(vp);
570#endif
571
572	zfs_znode_dmu_init(zfsvfs, zp, db);
573
574	zp->z_gen = zp->z_phys->zp_gen;
575
576#if 0
577	if (vp == NULL)
578		return (zp);
579#endif
580
581	vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
582	switch (vp->v_type) {
583	case VDIR:
584		zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
585		break;
586	case VFIFO:
587		vp->v_op = &zfs_fifoops;
588		break;
589        case VREG:
590		if (zp->z_phys->zp_parent == zfsvfs->z_shares_dir) {
591			vp->v_op = &zfs_shareops;
592		}
593		break;
594	}
595	if (vp->v_type != VFIFO)
596		VN_LOCK_ASHARE(vp);
597
598	mutex_enter(&zfsvfs->z_znodes_lock);
599	list_insert_tail(&zfsvfs->z_all_znodes, zp);
600	membar_producer();
601	/*
602	 * Everything else must be valid before assigning z_zfsvfs makes the
603	 * znode eligible for zfs_znode_move().
604	 */
605	zp->z_zfsvfs = zfsvfs;
606	mutex_exit(&zfsvfs->z_znodes_lock);
607
608	VFS_HOLD(zfsvfs->z_vfs);
609	return (zp);
610}
611
612/*
613 * Create a new DMU object to hold a zfs znode.
614 *
615 *	IN:	dzp	- parent directory for new znode
616 *		vap	- file attributes for new znode
617 *		tx	- dmu transaction id for zap operations
618 *		cr	- credentials of caller
619 *		flag	- flags:
620 *			  IS_ROOT_NODE	- new object will be root
621 *			  IS_XATTR	- new object is an attribute
622 *		bonuslen - length of bonus buffer
623 *		setaclp  - File/Dir initial ACL
624 *		fuidp	 - Tracks fuid allocation.
625 *
626 *	OUT:	zpp	- allocated znode
627 *
628 */
629void
630zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
631    uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_ids_t *acl_ids)
632{
633	dmu_buf_t	*db;
634	znode_phys_t	*pzp;
635	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
636	timestruc_t	now;
637	uint64_t	gen, obj;
638	int		err;
639
640	ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
641
642	if (zfsvfs->z_replay) {
643		obj = vap->va_nodeid;
644		now = vap->va_ctime;		/* see zfs_replay_create() */
645		gen = vap->va_nblocks;		/* ditto */
646	} else {
647		obj = 0;
648		gethrestime(&now);
649		gen = dmu_tx_get_txg(tx);
650	}
651
652	/*
653	 * Create a new DMU object.
654	 */
655	/*
656	 * There's currently no mechanism for pre-reading the blocks that will
657	 * be to needed allocate a new object, so we accept the small chance
658	 * that there will be an i/o error and we will fail one of the
659	 * assertions below.
660	 */
661	if (vap->va_type == VDIR) {
662		if (zfsvfs->z_replay) {
663			err = zap_create_claim_norm(zfsvfs->z_os, obj,
664			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
665			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
666			ASSERT3U(err, ==, 0);
667		} else {
668			obj = zap_create_norm(zfsvfs->z_os,
669			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
670			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
671		}
672	} else {
673		if (zfsvfs->z_replay) {
674			err = dmu_object_claim(zfsvfs->z_os, obj,
675			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
676			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
677			ASSERT3U(err, ==, 0);
678		} else {
679			obj = dmu_object_alloc(zfsvfs->z_os,
680			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
681			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
682		}
683	}
684
685	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
686	VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, obj, NULL, &db));
687	dmu_buf_will_dirty(db, tx);
688
689	/*
690	 * Initialize the znode physical data to zero.
691	 */
692	ASSERT(db->db_size >= sizeof (znode_phys_t));
693	bzero(db->db_data, db->db_size);
694	pzp = db->db_data;
695
696	/*
697	 * If this is the root, fix up the half-initialized parent pointer
698	 * to reference the just-allocated physical data area.
699	 */
700	if (flag & IS_ROOT_NODE) {
701		dzp->z_dbuf = db;
702		dzp->z_phys = pzp;
703		dzp->z_id = obj;
704	}
705
706	/*
707	 * If parent is an xattr, so am I.
708	 */
709	if (dzp->z_phys->zp_flags & ZFS_XATTR)
710		flag |= IS_XATTR;
711
712	if (vap->va_type == VBLK || vap->va_type == VCHR) {
713		pzp->zp_rdev = zfs_expldev(vap->va_rdev);
714	}
715
716	if (zfsvfs->z_use_fuids)
717		pzp->zp_flags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
718
719	if (vap->va_type == VDIR) {
720		pzp->zp_size = 2;		/* contents ("." and "..") */
721		pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
722	}
723
724	pzp->zp_parent = dzp->z_id;
725	if (flag & IS_XATTR)
726		pzp->zp_flags |= ZFS_XATTR;
727
728	pzp->zp_gen = gen;
729
730	ZFS_TIME_ENCODE(&now, pzp->zp_crtime);
731	ZFS_TIME_ENCODE(&now, pzp->zp_ctime);
732
733	if (vap->va_mask & AT_ATIME) {
734		ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
735	} else {
736		ZFS_TIME_ENCODE(&now, pzp->zp_atime);
737	}
738
739	if (vap->va_mask & AT_MTIME) {
740		ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
741	} else {
742		ZFS_TIME_ENCODE(&now, pzp->zp_mtime);
743	}
744
745	pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode);
746	if (!(flag & IS_ROOT_NODE)) {
747		*zpp = zfs_znode_alloc(zfsvfs, db, 0);
748	} else {
749		/*
750		 * If we are creating the root node, the "parent" we
751		 * passed in is the znode for the root.
752		 */
753		*zpp = dzp;
754	}
755	pzp->zp_uid = acl_ids->z_fuid;
756	pzp->zp_gid = acl_ids->z_fgid;
757	pzp->zp_mode = acl_ids->z_mode;
758	VERIFY(0 == zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
759	if (vap->va_mask & AT_XVATTR)
760		zfs_xvattr_set(*zpp, (xvattr_t *)vap);
761	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
762	if (!(flag & IS_ROOT_NODE)) {
763		vnode_t *vp;
764
765		vp = ZTOV(*zpp);
766		vp->v_vflag |= VV_FORCEINSMQ;
767		err = insmntque(vp, zfsvfs->z_vfs);
768		vp->v_vflag &= ~VV_FORCEINSMQ;
769		KASSERT(err == 0, ("insmntque() failed: error %d", err));
770	}
771}
772
773void
774zfs_xvattr_set(znode_t *zp, xvattr_t *xvap)
775{
776	xoptattr_t *xoap;
777
778	xoap = xva_getxoptattr(xvap);
779	ASSERT(xoap);
780
781	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
782		ZFS_TIME_ENCODE(&xoap->xoa_createtime, zp->z_phys->zp_crtime);
783		XVA_SET_RTN(xvap, XAT_CREATETIME);
784	}
785	if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
786		ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly);
787		XVA_SET_RTN(xvap, XAT_READONLY);
788	}
789	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
790		ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden);
791		XVA_SET_RTN(xvap, XAT_HIDDEN);
792	}
793	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
794		ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system);
795		XVA_SET_RTN(xvap, XAT_SYSTEM);
796	}
797	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
798		ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive);
799		XVA_SET_RTN(xvap, XAT_ARCHIVE);
800	}
801	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
802		ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable);
803		XVA_SET_RTN(xvap, XAT_IMMUTABLE);
804	}
805	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
806		ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink);
807		XVA_SET_RTN(xvap, XAT_NOUNLINK);
808	}
809	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
810		ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly);
811		XVA_SET_RTN(xvap, XAT_APPENDONLY);
812	}
813	if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
814		ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump);
815		XVA_SET_RTN(xvap, XAT_NODUMP);
816	}
817	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
818		ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque);
819		XVA_SET_RTN(xvap, XAT_OPAQUE);
820	}
821	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
822		ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
823		    xoap->xoa_av_quarantined);
824		XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
825	}
826	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
827		ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified);
828		XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
829	}
830	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
831		(void) memcpy(zp->z_phys + 1, xoap->xoa_av_scanstamp,
832		    sizeof (xoap->xoa_av_scanstamp));
833		zp->z_phys->zp_flags |= ZFS_BONUS_SCANSTAMP;
834		XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
835	}
836}
837
838int
839zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
840{
841	dmu_object_info_t doi;
842	dmu_buf_t	*db;
843	znode_t		*zp;
844	vnode_t		*vp;
845	int err, first = 1;
846
847	*zpp = NULL;
848again:
849	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
850
851	err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
852	if (err) {
853		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
854		return (err);
855	}
856
857	dmu_object_info_from_db(db, &doi);
858	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
859	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
860		dmu_buf_rele(db, NULL);
861		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
862		return (EINVAL);
863	}
864
865	zp = dmu_buf_get_user(db);
866	if (zp != NULL) {
867		mutex_enter(&zp->z_lock);
868
869		/*
870		 * Since we do immediate eviction of the z_dbuf, we
871		 * should never find a dbuf with a znode that doesn't
872		 * know about the dbuf.
873		 */
874		ASSERT3P(zp->z_dbuf, ==, db);
875		ASSERT3U(zp->z_id, ==, obj_num);
876		if (zp->z_unlinked) {
877			err = ENOENT;
878		} else {
879			int dying = 0;
880
881			vp = ZTOV(zp);
882			if (vp == NULL)
883				dying = 1;
884			else {
885				VN_HOLD(vp);
886				if ((vp->v_iflag & VI_DOOMED) != 0) {
887					dying = 1;
888					/*
889					 * Don't VN_RELE() vnode here, because
890					 * it can call vn_lock() which creates
891					 * LOR between vnode lock and znode
892					 * lock. We will VN_RELE() the vnode
893					 * after droping znode lock.
894					 */
895				}
896			}
897			if (dying) {
898				if (first) {
899					ZFS_LOG(1, "dying znode detected (zp=%p)", zp);
900					first = 0;
901				}
902				/*
903				 * znode is dying so we can't reuse it, we must
904				 * wait until destruction is completed.
905				 */
906				dmu_buf_rele(db, NULL);
907				mutex_exit(&zp->z_lock);
908				ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
909				if (vp != NULL)
910					VN_RELE(vp);
911				tsleep(zp, 0, "zcollide", 1);
912				goto again;
913			}
914			*zpp = zp;
915			err = 0;
916		}
917		dmu_buf_rele(db, NULL);
918		mutex_exit(&zp->z_lock);
919		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
920		return (err);
921	}
922
923	/*
924	 * Not found create new znode/vnode
925	 * but only if file exists.
926	 *
927	 * There is a small window where zfs_vget() could
928	 * find this object while a file create is still in
929	 * progress.  Since a gen number can never be zero
930	 * we will check that to determine if its an allocated
931	 * file.
932	 */
933
934	if (((znode_phys_t *)db->db_data)->zp_gen != 0) {
935		zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size);
936		*zpp = zp;
937		vp = ZTOV(zp);
938		vp->v_vflag |= VV_FORCEINSMQ;
939		err = insmntque(vp, zfsvfs->z_vfs);
940		vp->v_vflag &= ~VV_FORCEINSMQ;
941		KASSERT(err == 0, ("insmntque() failed: error %d", err));
942		VOP_UNLOCK(vp, 0);
943		err = 0;
944	} else {
945		dmu_buf_rele(db, NULL);
946		err = ENOENT;
947	}
948	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
949	return (err);
950}
951
952int
953zfs_rezget(znode_t *zp)
954{
955	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
956	dmu_object_info_t doi;
957	dmu_buf_t *db;
958	uint64_t obj_num = zp->z_id;
959	int err;
960
961	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
962
963	err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
964	if (err) {
965		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
966		return (err);
967	}
968
969	dmu_object_info_from_db(db, &doi);
970	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
971	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
972		dmu_buf_rele(db, NULL);
973		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
974		return (EINVAL);
975	}
976
977	if (((znode_phys_t *)db->db_data)->zp_gen != zp->z_gen) {
978		dmu_buf_rele(db, NULL);
979		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
980		return (EIO);
981	}
982
983	zfs_znode_dmu_init(zfsvfs, zp, db);
984	zp->z_unlinked = (zp->z_phys->zp_links == 0);
985	zp->z_blksz = doi.doi_data_block_size;
986
987	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
988
989	return (0);
990}
991
992void
993zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
994{
995	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
996	objset_t *os = zfsvfs->z_os;
997	uint64_t obj = zp->z_id;
998	uint64_t acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj;
999
1000	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
1001	if (acl_obj)
1002		VERIFY(0 == dmu_object_free(os, acl_obj, tx));
1003	VERIFY(0 == dmu_object_free(os, obj, tx));
1004	zfs_znode_dmu_fini(zp);
1005	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
1006	zfs_znode_free(zp);
1007}
1008
1009void
1010zfs_zinactive(znode_t *zp)
1011{
1012	vnode_t	*vp = ZTOV(zp);
1013	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1014	uint64_t z_id = zp->z_id;
1015	int vfslocked;
1016
1017	ASSERT(zp->z_dbuf && zp->z_phys);
1018
1019	/*
1020	 * Don't allow a zfs_zget() while were trying to release this znode
1021	 */
1022	ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
1023
1024	mutex_enter(&zp->z_lock);
1025	VI_LOCK(vp);
1026	if (vp->v_count > 0) {
1027		/*
1028		 * If the hold count is greater than zero, somebody has
1029		 * obtained a new reference on this znode while we were
1030		 * processing it here, so we are done.
1031		 */
1032		VI_UNLOCK(vp);
1033		mutex_exit(&zp->z_lock);
1034		ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
1035		return;
1036	}
1037	VI_UNLOCK(vp);
1038
1039	/*
1040	 * If this was the last reference to a file with no links,
1041	 * remove the file from the file system.
1042	 */
1043	if (zp->z_unlinked) {
1044		mutex_exit(&zp->z_lock);
1045		ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
1046		ASSERT(vp->v_count == 0);
1047		vrecycle(vp, curthread);
1048		vfslocked = VFS_LOCK_GIANT(zfsvfs->z_vfs);
1049		zfs_rmnode(zp);
1050		VFS_UNLOCK_GIANT(vfslocked);
1051		return;
1052	}
1053	mutex_exit(&zp->z_lock);
1054	ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
1055}
1056
1057void
1058zfs_znode_free(znode_t *zp)
1059{
1060	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1061
1062	ASSERT(ZTOV(zp) == NULL);
1063	mutex_enter(&zfsvfs->z_znodes_lock);
1064	POINTER_INVALIDATE(&zp->z_zfsvfs);
1065	list_remove(&zfsvfs->z_all_znodes, zp);
1066	mutex_exit(&zfsvfs->z_znodes_lock);
1067
1068	kmem_cache_free(znode_cache, zp);
1069
1070	VFS_RELE(zfsvfs->z_vfs);
1071}
1072
1073void
1074zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx)
1075{
1076	timestruc_t	now;
1077
1078	ASSERT(MUTEX_HELD(&zp->z_lock));
1079
1080	gethrestime(&now);
1081
1082	if (tx) {
1083		dmu_buf_will_dirty(zp->z_dbuf, tx);
1084		zp->z_atime_dirty = 0;
1085		zp->z_seq++;
1086	} else {
1087		zp->z_atime_dirty = 1;
1088	}
1089
1090	if (flag & AT_ATIME)
1091		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime);
1092
1093	if (flag & AT_MTIME) {
1094		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime);
1095		if (zp->z_zfsvfs->z_use_fuids)
1096			zp->z_phys->zp_flags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED);
1097	}
1098
1099	if (flag & AT_CTIME) {
1100		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime);
1101		if (zp->z_zfsvfs->z_use_fuids)
1102			zp->z_phys->zp_flags |= ZFS_ARCHIVE;
1103	}
1104}
1105
1106/*
1107 * Update the requested znode timestamps with the current time.
1108 * If we are in a transaction, then go ahead and mark the znode
1109 * dirty in the transaction so the timestamps will go to disk.
1110 * Otherwise, we will get pushed next time the znode is updated
1111 * in a transaction, or when this znode eventually goes inactive.
1112 *
1113 * Why is this OK?
1114 *  1 - Only the ACCESS time is ever updated outside of a transaction.
1115 *  2 - Multiple consecutive updates will be collapsed into a single
1116 *	znode update by the transaction grouping semantics of the DMU.
1117 */
1118void
1119zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx)
1120{
1121	mutex_enter(&zp->z_lock);
1122	zfs_time_stamper_locked(zp, flag, tx);
1123	mutex_exit(&zp->z_lock);
1124}
1125
1126/*
1127 * Grow the block size for a file.
1128 *
1129 *	IN:	zp	- znode of file to free data in.
1130 *		size	- requested block size
1131 *		tx	- open transaction.
1132 *
1133 * NOTE: this function assumes that the znode is write locked.
1134 */
1135void
1136zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
1137{
1138	int		error;
1139	u_longlong_t	dummy;
1140
1141	if (size <= zp->z_blksz)
1142		return;
1143	/*
1144	 * If the file size is already greater than the current blocksize,
1145	 * we will not grow.  If there is more than one block in a file,
1146	 * the blocksize cannot change.
1147	 */
1148	if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz)
1149		return;
1150
1151	error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
1152	    size, 0, tx);
1153	if (error == ENOTSUP)
1154		return;
1155	ASSERT3U(error, ==, 0);
1156
1157	/* What blocksize did we actually get? */
1158	dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy);
1159}
1160
1161/*
1162 * Increase the file length
1163 *
1164 *	IN:	zp	- znode of file to free data in.
1165 *		end	- new end-of-file
1166 *
1167 * 	RETURN:	0 if success
1168 *		error code if failure
1169 */
1170static int
1171zfs_extend(znode_t *zp, uint64_t end)
1172{
1173	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1174	dmu_tx_t *tx;
1175	rl_t *rl;
1176	uint64_t newblksz;
1177	int error;
1178
1179	/*
1180	 * We will change zp_size, lock the whole file.
1181	 */
1182	rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
1183
1184	/*
1185	 * Nothing to do if file already at desired length.
1186	 */
1187	if (end <= zp->z_phys->zp_size) {
1188		zfs_range_unlock(rl);
1189		return (0);
1190	}
1191top:
1192	tx = dmu_tx_create(zfsvfs->z_os);
1193	dmu_tx_hold_bonus(tx, zp->z_id);
1194	if (end > zp->z_blksz &&
1195	    (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
1196		/*
1197		 * We are growing the file past the current block size.
1198		 */
1199		if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
1200			ASSERT(!ISP2(zp->z_blksz));
1201			newblksz = MIN(end, SPA_MAXBLOCKSIZE);
1202		} else {
1203			newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
1204		}
1205		dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
1206	} else {
1207		newblksz = 0;
1208	}
1209
1210	error = dmu_tx_assign(tx, TXG_NOWAIT);
1211	if (error) {
1212		if (error == ERESTART) {
1213			dmu_tx_wait(tx);
1214			dmu_tx_abort(tx);
1215			goto top;
1216		}
1217		dmu_tx_abort(tx);
1218		zfs_range_unlock(rl);
1219		return (error);
1220	}
1221	dmu_buf_will_dirty(zp->z_dbuf, tx);
1222
1223	if (newblksz)
1224		zfs_grow_blocksize(zp, newblksz, tx);
1225
1226	zp->z_phys->zp_size = end;
1227
1228	zfs_range_unlock(rl);
1229
1230	dmu_tx_commit(tx);
1231
1232	vnode_pager_setsize(ZTOV(zp), end);
1233
1234	return (0);
1235}
1236
1237/*
1238 * Free space in a file.
1239 *
1240 *	IN:	zp	- znode of file to free data in.
1241 *		off	- start of section to free.
1242 *		len	- length of section to free.
1243 *
1244 * 	RETURN:	0 if success
1245 *		error code if failure
1246 */
1247static int
1248zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
1249{
1250	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1251	rl_t *rl;
1252	int error;
1253
1254	/*
1255	 * Lock the range being freed.
1256	 */
1257	rl = zfs_range_lock(zp, off, len, RL_WRITER);
1258
1259	/*
1260	 * Nothing to do if file already at desired length.
1261	 */
1262	if (off >= zp->z_phys->zp_size) {
1263		zfs_range_unlock(rl);
1264		return (0);
1265	}
1266
1267	if (off + len > zp->z_phys->zp_size)
1268		len = zp->z_phys->zp_size - off;
1269
1270	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
1271
1272	if (error == 0) {
1273		/*
1274		 * In FreeBSD we cannot free block in the middle of a file,
1275		 * but only at the end of a file.
1276		 */
1277		vnode_pager_setsize(ZTOV(zp), off);
1278	}
1279
1280	zfs_range_unlock(rl);
1281
1282	return (error);
1283}
1284
1285/*
1286 * Truncate a file
1287 *
1288 *	IN:	zp	- znode of file to free data in.
1289 *		end	- new end-of-file.
1290 *
1291 * 	RETURN:	0 if success
1292 *		error code if failure
1293 */
1294static int
1295zfs_trunc(znode_t *zp, uint64_t end)
1296{
1297	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1298	vnode_t *vp = ZTOV(zp);
1299	dmu_tx_t *tx;
1300	rl_t *rl;
1301	int error;
1302
1303	/*
1304	 * We will change zp_size, lock the whole file.
1305	 */
1306	rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
1307
1308	/*
1309	 * Nothing to do if file already at desired length.
1310	 */
1311	if (end >= zp->z_phys->zp_size) {
1312		zfs_range_unlock(rl);
1313		return (0);
1314	}
1315
1316	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,  -1);
1317	if (error) {
1318		zfs_range_unlock(rl);
1319		return (error);
1320	}
1321top:
1322	tx = dmu_tx_create(zfsvfs->z_os);
1323	dmu_tx_hold_bonus(tx, zp->z_id);
1324	error = dmu_tx_assign(tx, TXG_NOWAIT);
1325	if (error) {
1326		if (error == ERESTART) {
1327			dmu_tx_wait(tx);
1328			dmu_tx_abort(tx);
1329			goto top;
1330		}
1331		dmu_tx_abort(tx);
1332		zfs_range_unlock(rl);
1333		return (error);
1334	}
1335	dmu_buf_will_dirty(zp->z_dbuf, tx);
1336
1337	zp->z_phys->zp_size = end;
1338
1339	dmu_tx_commit(tx);
1340
1341	/*
1342	 * Clear any mapped pages in the truncated region.  This has to
1343	 * happen outside of the transaction to avoid the possibility of
1344	 * a deadlock with someone trying to push a page that we are
1345	 * about to invalidate.
1346	 */
1347	vnode_pager_setsize(vp, end);
1348
1349	zfs_range_unlock(rl);
1350
1351	return (0);
1352}
1353
1354/*
1355 * Free space in a file
1356 *
1357 *	IN:	zp	- znode of file to free data in.
1358 *		off	- start of range
1359 *		len	- end of range (0 => EOF)
1360 *		flag	- current file open mode flags.
1361 *		log	- TRUE if this action should be logged
1362 *
1363 * 	RETURN:	0 if success
1364 *		error code if failure
1365 */
1366int
1367zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
1368{
1369	vnode_t *vp = ZTOV(zp);
1370	dmu_tx_t *tx;
1371	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1372	zilog_t *zilog = zfsvfs->z_log;
1373	int error;
1374
1375	if (off > zp->z_phys->zp_size) {
1376		error =  zfs_extend(zp, off+len);
1377		if (error == 0 && log)
1378			goto log;
1379		else
1380			return (error);
1381	}
1382
1383	if (len == 0) {
1384		error = zfs_trunc(zp, off);
1385	} else {
1386		if ((error = zfs_free_range(zp, off, len)) == 0 &&
1387		    off + len > zp->z_phys->zp_size)
1388			error = zfs_extend(zp, off+len);
1389	}
1390	if (error || !log)
1391		return (error);
1392log:
1393	tx = dmu_tx_create(zfsvfs->z_os);
1394	dmu_tx_hold_bonus(tx, zp->z_id);
1395	error = dmu_tx_assign(tx, TXG_NOWAIT);
1396	if (error) {
1397		if (error == ERESTART) {
1398			dmu_tx_wait(tx);
1399			dmu_tx_abort(tx);
1400			goto log;
1401		}
1402		dmu_tx_abort(tx);
1403		return (error);
1404	}
1405
1406	zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
1407	zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
1408
1409	dmu_tx_commit(tx);
1410	return (0);
1411}
1412
/*
 * Populate a new, empty objset with the objects a ZPL filesystem needs:
 * the master node (holding the ZPL properties and version), the
 * unlinked set ("delete queue"), the root directory and the shares
 * directory.
 *
 *	IN:	os	 - objset to populate.
 *		cr	 - credentials; supply the owner and group of the
 *			   root directory.
 *		zplprops - nvlist of uint64 ZPL properties to store in
 *			   the master node.
 *		tx	 - open transaction in which everything is created.
 *
 * No mounted filesystem exists yet, so a throw-away zfsvfs and vnode
 * are built on the stack and a temporary root znode is taken from
 * znode_cache, just enough for zfs_mknode() to work.
 */
void
zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
{
	zfsvfs_t	zfsvfs;
	uint64_t	moid, obj, version;
	uint64_t	sense = ZFS_CASE_SENSITIVE;
	uint64_t	norm = 0;
	nvpair_t	*elem;
	int		error;
	int		i;
	znode_t		*rootzp = NULL;
	vnode_t		vnode;
	vattr_t		vattr;
	znode_t		*zp;
	zfs_acl_ids_t	acl_ids;

	/*
	 * First attempt to create master node.
	 */
	/*
	 * In an empty objset, there are no blocks to read and thus
	 * there can be no i/o errors (which we assert below).
	 */
	moid = MASTER_NODE_OBJ;
	error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
	    DMU_OT_NONE, 0, tx);
	ASSERT(error == 0);

	/*
	 * Set starting attributes.
	 */
	/*
	 * The starting ZPL version is derived from the pool's SPA version;
	 * a ZFS_PROP_VERSION entry in zplprops can only lower it.
	 */
	if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_USERSPACE)
		version = ZPL_VERSION;
	else if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID)
		version = ZPL_VERSION_USERSPACE - 1;
	else
		version = ZPL_VERSION_FUID - 1;
	elem = NULL;
	while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
		/* For the moment we expect all zpl props to be uint64_ts */
		uint64_t val;
		char *name;

		ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
		VERIFY(nvpair_value_uint64(elem, &val) == 0);
		name = nvpair_name(elem);
		/*
		 * Every property except the version is written to the
		 * master node as it is encountered.
		 */
		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
			if (val < version)
				version = val;
		} else {
			error = zap_update(os, moid, name, 8, 1, &val, tx);
		}
		/*
		 * When the version branch was taken, this re-checks the
		 * value left in `error' by the previous operation.
		 */
		ASSERT(error == 0);
		/* Remember the settings needed to build z_norm below. */
		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
			norm = val;
		else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
			sense = val;
	}
	ASSERT(version != 0);
	/*
	 * NOTE(review): the result of this zap_update() is never checked;
	 * `error' is overwritten by the zap_add() below.
	 */
	error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);

	/*
	 * Create a delete queue.
	 */
	obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);

	error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
	ASSERT(error == 0);

	/*
	 * Create root znode.  Create minimal znode/vnode/zfsvfs
	 * to allow zfs_mknode to work.
	 */
	VATTR_NULL(&vattr);
	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
	vattr.va_type = VDIR;
	vattr.va_mode = S_IFDIR|0755;
	vattr.va_uid = crgetuid(cr);
	vattr.va_gid = crgetgid(cr);

	rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
	zfs_znode_cache_constructor(rootzp, NULL, 0);
	rootzp->z_unlinked = 0;
	rootzp->z_atime_dirty = 0;

	/* Only the vnode fields set here are initialized. */
	vnode.v_type = VDIR;
	vnode.v_data = rootzp;
	rootzp->z_vnode = &vnode;

	bzero(&zfsvfs, sizeof (zfsvfs_t));

	zfsvfs.z_os = os;
	zfsvfs.z_parent = &zfsvfs;
	zfsvfs.z_version = version;
	zfsvfs.z_use_fuids = USE_FUIDS(version, os);
	zfsvfs.z_norm = norm;
	/*
	 * Fold case on file systems that are always or sometimes case
	 * insensitive.
	 */
	if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
		zfsvfs.z_norm |= U8_TEXTPREP_TOUPPER;

	mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zfsvfs.z_all_znodes, sizeof (znode_t),
	    offsetof(znode_t, z_link_node));

	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
		mutex_init(&zfsvfs.z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Point the temporary root znode at the stack zfsvfs for the
	 * duration of the zfs_mknode() call, then invalidate the pointer
	 * again before the znode goes back to the cache.
	 */
	ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
	rootzp->z_zfsvfs = &zfsvfs;
	VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
	    cr, NULL, &acl_ids));
	zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, &acl_ids);
	ASSERT3P(zp, ==, rootzp);
	/* Record the root directory's object number in the master node. */
	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
	ASSERT(error == 0);
	zfs_acl_ids_free(&acl_ids);
	POINTER_INVALIDATE(&rootzp->z_zfsvfs);

	/*
	 * Tear down the temporary root znode: drop the hold on its dbuf
	 * and return it directly to znode_cache.
	 */
	dmu_buf_rele(rootzp->z_dbuf, NULL);
	rootzp->z_dbuf = NULL;
	rootzp->z_vnode = NULL;
	kmem_cache_free(znode_cache, rootzp);

	/*
	 * Create shares directory
	 */

	error = zfs_create_share_dir(&zfsvfs, tx);

	ASSERT(error == 0);

	/*
	 * NOTE(review): only the z_hold_mtx array is destroyed here;
	 * z_znodes_lock and z_all_znodes, initialized above, are not torn
	 * down in this function.
	 */
	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
		mutex_destroy(&zfsvfs.z_hold_mtx[i]);
}
1550
1551#endif /* _KERNEL */
1552/*
1553 * Given an object number, return its parent object number and whether
1554 * or not the object is an extended attribute directory.
1555 */
1556static int
1557zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir)
1558{
1559	dmu_buf_t *db;
1560	dmu_object_info_t doi;
1561	znode_phys_t *zp;
1562	int error;
1563
1564	if ((error = dmu_bonus_hold(osp, obj, FTAG, &db)) != 0)
1565		return (error);
1566
1567	dmu_object_info_from_db(db, &doi);
1568	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
1569	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
1570		dmu_buf_rele(db, FTAG);
1571		return (EINVAL);
1572	}
1573
1574	zp = db->db_data;
1575	*pobjp = zp->zp_parent;
1576	*is_xattrdir = ((zp->zp_flags & ZFS_XATTR) != 0) &&
1577	    S_ISDIR(zp->zp_mode);
1578	dmu_buf_rele(db, FTAG);
1579
1580	return (0);
1581}
1582
1583int
1584zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
1585{
1586	char *path = buf + len - 1;
1587	int error;
1588
1589	*path = '\0';
1590
1591	for (;;) {
1592		uint64_t pobj;
1593		char component[MAXNAMELEN + 2];
1594		size_t complen;
1595		int is_xattrdir;
1596
1597		if ((error = zfs_obj_to_pobj(osp, obj, &pobj,
1598		    &is_xattrdir)) != 0)
1599			break;
1600
1601		if (pobj == obj) {
1602			if (path[0] != '/')
1603				*--path = '/';
1604			break;
1605		}
1606
1607		component[0] = '/';
1608		if (is_xattrdir) {
1609			(void) sprintf(component + 1, "<xattrdir>");
1610		} else {
1611			error = zap_value_search(osp, pobj, obj,
1612			    ZFS_DIRENT_OBJ(-1ULL), component + 1);
1613			if (error != 0)
1614				break;
1615		}
1616
1617		complen = strlen(component);
1618		path -= complen;
1619		ASSERT(path >= buf);
1620		bcopy(component, path, complen);
1621		obj = pobj;
1622	}
1623
1624	if (error == 0)
1625		(void) memmove(buf, path, buf + len - path);
1626	return (error);
1627}
1628