zfs_znode.c revision 169325
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26/* Portions Copyright 2007 Jeremy Teo */
27
28#pragma ident	"%Z%%M%	%I%	%E% SMI"
29
30#ifdef _KERNEL
31#include <sys/types.h>
32#include <sys/param.h>
33#include <sys/time.h>
34#include <sys/systm.h>
35#include <sys/sysmacros.h>
36#include <sys/resource.h>
37#include <sys/mntent.h>
38#include <sys/vfs.h>
39#include <sys/vnode.h>
40#include <sys/file.h>
41#include <sys/kmem.h>
42#include <sys/cmn_err.h>
43#include <sys/errno.h>
44#include <sys/unistd.h>
45#include <sys/atomic.h>
46#include <sys/zfs_dir.h>
47#include <sys/zfs_acl.h>
48#include <sys/zfs_ioctl.h>
49#include <sys/zfs_rlock.h>
50#include <sys/fs/zfs.h>
51#endif /* _KERNEL */
52
53#include <sys/dmu.h>
54#include <sys/refcount.h>
55#include <sys/stat.h>
56#include <sys/zap.h>
57#include <sys/zfs_znode.h>
58#include <sys/refcount.h>
59
60/*
61 * Functions needed for userland (i.e., libzpool) are not put under
62 * #ifdef _KERNEL; the rest of the functions have dependencies
63 * (such as VFS logic) that will not compile easily in userland.
64 */
65#ifdef _KERNEL
66struct kmem_cache *znode_cache = NULL;
67
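/*
 * Callback registered on the znode's bonus dbuf (see zfs_znode_dmu_init()).
 * Invoked by the DMU when the buffer's user data is evicted: detach and
 * recycle the vnode if it is unreferenced and free the znode, or, if the
 * vnode is still in use, just clear z_dbuf so a forced unmount can free
 * the znode later.
 */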
68/*ARGSUSED*/
69static void
70znode_pageout_func(dmu_buf_t *dbuf, void *user_ptr)
71{
72	znode_t *zp = user_ptr;
73	vnode_t *vp;
74
75	mutex_enter(&zp->z_lock);
76	vp = ZTOV(zp);
77	if (vp == NULL) {
78		mutex_exit(&zp->z_lock);
79		zfs_znode_free(zp);
80	} else if (vp->v_count == 0) {
81		ZTOV(zp) = NULL;
82		vhold(vp);
83		mutex_exit(&zp->z_lock);
84		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
85		vrecycle(vp, curthread);
86		VOP_UNLOCK(vp, 0, curthread);
87		vdrop(vp);
88		zfs_znode_free(zp);
89	} else {
90		/* signal force unmount that this znode can be freed */
91		zp->z_dbuf = NULL;
92		mutex_exit(&zp->z_lock);
93	}
94}
95
96extern struct vop_vector zfs_vnodeops;
97extern struct vop_vector zfs_fifoops;
98
99/*
100 * XXX: We cannot use this function as a cache constructor, because
101 *      there is one global cache for all file systems and we would need
102 *      to pass vfsp here, which is not possible because the 'cdrarg'
103 *      argument is fixed at kmem_cache_create() time.
104 */
105static int
106zfs_znode_cache_constructor(void *buf, void *cdrarg, int kmflags)
107{
108	znode_t *zp = buf;
109	vnode_t *vp;
110	vfs_t *vfsp = cdrarg;
111	int error;
112
113	if (cdrarg != NULL) {
114		error = getnewvnode("zfs", vfsp, &zfs_vnodeops, &vp);
115		ASSERT(error == 0);
116		zp->z_vnode = vp;
117		vp->v_data = (caddr_t)zp;
118		vhold(vp);
119		vp->v_vnlock->lk_flags |= LK_CANRECURSE;
120		vp->v_vnlock->lk_flags &= ~LK_NOSHARE;
121	} else {
122		zp->z_vnode = NULL;
123	}
124	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
125	rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL);
126	rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
127	rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
128	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
129
130	mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
131	avl_create(&zp->z_range_avl, zfs_range_compare,
132	    sizeof (rl_t), offsetof(rl_t, r_node));
133
134	zp->z_dbuf_held = 0;
135	zp->z_dirlocks = 0;
136	zp->z_lockf = NULL;
137	return (0);
138}
139
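/*
 * Undo zfs_znode_cache_constructor(): destroy the locks and the
 * range-lock AVL tree before the znode memory is returned to the cache.
 */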
140/*ARGSUSED*/
141static void
142zfs_znode_cache_destructor(void *buf, void *cdarg)
143{
144	znode_t *zp = buf;
145
146	ASSERT(zp->z_dirlocks == 0);
147	mutex_destroy(&zp->z_lock);
148	rw_destroy(&zp->z_map_lock);
149	rw_destroy(&zp->z_parent_lock);
150	rw_destroy(&zp->z_name_lock);
151	mutex_destroy(&zp->z_acl_lock);
152	mutex_destroy(&zp->z_range_lock);
153	avl_destroy(&zp->z_range_avl);
154
155	ASSERT(zp->z_dbuf_held == 0);
156}
157
158void
159zfs_znode_init(void)
160{
161	/*
162	 * Initialize zcache
163	 */
164	ASSERT(znode_cache == NULL);
165	znode_cache = kmem_cache_create("zfs_znode_cache",
166	    sizeof (znode_t), 0, /* zfs_znode_cache_constructor */ NULL,
167	    zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
168}
169
170void
171zfs_znode_fini(void)
172{
173	/*
174	 * Cleanup zcache
175	 */
176	if (znode_cache)
177		kmem_cache_destroy(znode_cache);
178	znode_cache = NULL;
179}
180
181/*
182 * zfs_init_fs - Initialize the zfsvfs struct and the file system
183 *	incore "master" object.  Verify version compatibility.
184 */
185int
186zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr)
187{
188	objset_t	*os = zfsvfs->z_os;
189	uint64_t	version = ZPL_VERSION;
190	int		i, error;
191	dmu_object_info_t doi;
192	uint64_t fsid_guid;
193
194	*zpp = NULL;
195
196	/*
197	 * XXX - hack to auto-create the pool root filesystem at
198	 * the first attempted mount.
199	 */
200	if (dmu_object_info(os, MASTER_NODE_OBJ, &doi) == ENOENT) {
201		dmu_tx_t *tx = dmu_tx_create(os);
202
203		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* master */
204		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* del queue */
205		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); /* root node */
206		error = dmu_tx_assign(tx, TXG_WAIT);
207		ASSERT3U(error, ==, 0);
208		zfs_create_fs(os, cr, tx);
209		dmu_tx_commit(tx);
210	}
211
212	error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_OBJ, 8, 1,
213	    &version);
214	if (error) {
215		return (error);
216	} else if (version != ZPL_VERSION) {
217		(void) printf("Mismatched versions:  File system "
218		    "is version %lld on-disk format, which is "
219		    "incompatible with this software version %lld!",
220		    (u_longlong_t)version, ZPL_VERSION);
221		return (ENOTSUP);
222	}
223
224	/*
225	 * The fsid is 64 bits, composed of an 8-bit fs type, which
226	 * separates our fsid from any other filesystem types, and a
227	 * 56-bit objset unique ID.  The objset unique ID is unique to
228	 * all objsets open on this system, provided by unique_create().
229	 * The 8-bit fs type must be put in the low bits of fsid[1]
230	 * because that's where other Solaris filesystems put it.
231	 */
232	fsid_guid = dmu_objset_fsid_guid(os);
233	ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
234	zfsvfs->z_vfs->vfs_fsid.val[0] = fsid_guid;
235	zfsvfs->z_vfs->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
236	    zfsvfs->z_vfs->mnt_vfc->vfc_typenum & 0xFF;
237
238	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
239	    &zfsvfs->z_root);
240	if (error)
241		return (error);
242	ASSERT(zfsvfs->z_root != 0);
243
244	/*
245	 * Create the per mount vop tables.
246	 */
247
248	/*
249	 * Initialize zget mutexes.
250	 */
251	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
252		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
253
254	error = zfs_zget(zfsvfs, zfsvfs->z_root, zpp);
255	if (error)
256		return (error);
257	ASSERT3U((*zpp)->z_id, ==, zfsvfs->z_root);
258
259	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
260	    &zfsvfs->z_unlinkedobj);
261	if (error)
262		return (error);
263
264	return (0);
265}
266
267/*
268 * Define a couple of values we need available
269 * for both 64- and 32-bit environments.
270 */
271#ifndef NBITSMINOR64
272#define	NBITSMINOR64	32
273#endif
274#ifndef MAXMAJ64
275#define	MAXMAJ64	0xffffffffUL
276#endif
277#ifndef	MAXMIN64
278#define	MAXMIN64	0xffffffffUL
279#endif
280#ifndef major
281#define	major(x)	((int)(((u_int)(x) >> 8)&0xff))	/* major number */
282#endif
283#ifndef minor
284#define	minor(x)	((int)((x)&0xffff00ff))		/* minor number */
285#endif
286
287/*
288 * Create special expldev for ZFS private use.
289 * Can't use standard expldev since it doesn't do
290 * what we want.  The standard expldev() takes a
291 * dev32_t in LP64 and expands it to a long dev_t.
292 * We need an interface that takes a dev32_t in ILP32
293 * and expands it to a long dev_t.
294 */
295static uint64_t
296zfs_expldev(dev_t dev)
297{
298	return (((uint64_t)major(dev) << NBITSMINOR64) | minor(dev));
299}
300/*
301 * Special cmpldev for ZFS private use.
302 * Can't use standard cmpldev since it takes
303 * a long dev_t and compresses it to dev32_t in
304 * LP64.  We need to do a compaction of a long dev_t
305 * to a dev32_t in ILP32.
306 */
307dev_t
308zfs_cmpldev(uint64_t dev)
309{
310	return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64)));
311}
312
313/*
314 * Construct a new znode/vnode and initialize it.
315 *
316 * This does not do a call to dmu_set_user(); that is
317 * up to the caller to do, in case you don't want to
318 * return the znode.
319 */
320static znode_t *
321zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, uint64_t obj_num, int blksz)
322{
323	znode_t	*zp;
324	vnode_t *vp;
325	int error;
326
327	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
328	zfs_znode_cache_constructor(zp, zfsvfs->z_vfs, 0);
329
330	ASSERT(zp->z_dirlocks == NULL);
331
332	zp->z_phys = db->db_data;
333	zp->z_zfsvfs = zfsvfs;
334	zp->z_unlinked = 0;
335	zp->z_atime_dirty = 0;
336	zp->z_dbuf_held = 0;
337	zp->z_mapcnt = 0;
338	zp->z_last_itx = 0;
339	zp->z_dbuf = db;
340	zp->z_id = obj_num;
341	zp->z_blksz = blksz;
342	zp->z_seq = 0x7A4653;
343	zp->z_sync_cnt = 0;
344
345	mutex_enter(&zfsvfs->z_znodes_lock);
346	list_insert_tail(&zfsvfs->z_all_znodes, zp);
347	mutex_exit(&zfsvfs->z_znodes_lock);
348
349	vp = ZTOV(zp);
350	if (vp == NULL)
351		return (zp);
352
353	error = insmntque(vp, zfsvfs->z_vfs);
354	KASSERT(error == 0, ("insmntque() failed: error %d", error));
355
356	vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
357	switch (vp->v_type) {
358	case VDIR:
359		zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
360		break;
361	case VFIFO:
362		vp->v_op = &zfs_fifoops;
363		break;
364	}
365
366	return (zp);
367}
368
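/*
 * Attach the znode to its bonus dbuf as user data and register
 * znode_pageout_func() as the eviction callback.  Also set VROOT on
 * the root vnode and take a hold on the vfs for the lifetime of the
 * dbuf reference.
 */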
369static void
370zfs_znode_dmu_init(znode_t *zp)
371{
372	znode_t		*nzp;
373	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
374	dmu_buf_t	*db = zp->z_dbuf;
375
376	mutex_enter(&zp->z_lock);
377
378	nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_pageout_func);
379
380	/*
381	 * There should be no concurrent
382	 * zgets on this object.
383	 */
384	ASSERT3P(nzp, ==, NULL);
385
386	/*
387	 * Slap on VROOT if we are the root znode
388	 */
389	if (zp->z_id == zfsvfs->z_root) {
390		ZTOV(zp)->v_flag |= VROOT;
391	}
392
393	ASSERT(zp->z_dbuf_held == 0);
394	zp->z_dbuf_held = 1;
395	VFS_HOLD(zfsvfs->z_vfs);
396	mutex_exit(&zp->z_lock);
397}
398
399/*
400 * Create a new DMU object to hold a zfs znode.
401 *
402 *	IN:	dzp	- parent directory for new znode
403 *		vap	- file attributes for new znode
404 *		tx	- dmu transaction id for zap operations
405 *		cr	- credentials of caller
406 *		flag	- flags:
407 *			  IS_ROOT_NODE	- new object will be root
408 *			  IS_XATTR	- new object is an attribute
409 *			  IS_REPLAY	- intent log replay
410 *
411 *	OUT:	oid	- ID of created object
412 *
413 */
414void
415zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr,
416	uint_t flag, znode_t **zpp, int bonuslen)
417{
418	dmu_buf_t	*dbp;
419	znode_phys_t	*pzp;
420	znode_t		*zp;
421	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
422	timestruc_t	now;
423	uint64_t	gen;
424	int		err;
425
426	ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
427
428	if (zfsvfs->z_assign >= TXG_INITIAL) {		/* ZIL replay */
429		*oid = vap->va_nodeid;
430		flag |= IS_REPLAY;
431		now = vap->va_ctime;		/* see zfs_replay_create() */
432		gen = vap->va_nblocks;		/* ditto */
433	} else {
434		*oid = 0;
435		gethrestime(&now);
436		gen = dmu_tx_get_txg(tx);
437	}
438
439	/*
440	 * Create a new DMU object.
441	 */
442	/*
443	 * There's currently no mechanism for pre-reading the blocks that will
444	 * be needed to allocate a new object, so we accept the small chance
445	 * that there will be an i/o error and we will fail one of the
446	 * assertions below.
447	 */
448	if (vap->va_type == VDIR) {
449		if (flag & IS_REPLAY) {
450			err = zap_create_claim(zfsvfs->z_os, *oid,
451			    DMU_OT_DIRECTORY_CONTENTS,
452			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
453			ASSERT3U(err, ==, 0);
454		} else {
455			*oid = zap_create(zfsvfs->z_os,
456			    DMU_OT_DIRECTORY_CONTENTS,
457			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
458		}
459	} else {
460		if (flag & IS_REPLAY) {
461			err = dmu_object_claim(zfsvfs->z_os, *oid,
462			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
463			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
464			ASSERT3U(err, ==, 0);
465		} else {
466			*oid = dmu_object_alloc(zfsvfs->z_os,
467			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
468			    DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
469		}
470	}
471	VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, *oid, NULL, &dbp));
472	dmu_buf_will_dirty(dbp, tx);
473
474	/*
475	 * Initialize the znode physical data to zero.
476	 */
477	ASSERT(dbp->db_size >= sizeof (znode_phys_t));
478	bzero(dbp->db_data, dbp->db_size);
479	pzp = dbp->db_data;
480
481	/*
482	 * If this is the root, fix up the half-initialized parent pointer
483	 * to reference the just-allocated physical data area.
484	 */
485	if (flag & IS_ROOT_NODE) {
486		dzp->z_phys = pzp;
487		dzp->z_id = *oid;
488	}
489
490	/*
491	 * If parent is an xattr, so am I.
492	 */
493	if (dzp->z_phys->zp_flags & ZFS_XATTR)
494		flag |= IS_XATTR;
495
496	if (vap->va_type == VBLK || vap->va_type == VCHR) {
497		pzp->zp_rdev = zfs_expldev(vap->va_rdev);
498	}
499
500	if (vap->va_type == VDIR) {
501		pzp->zp_size = 2;		/* contents ("." and "..") */
502		pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
503	}
504
505	pzp->zp_parent = dzp->z_id;
506	if (flag & IS_XATTR)
507		pzp->zp_flags |= ZFS_XATTR;
508
509	pzp->zp_gen = gen;
510
511	ZFS_TIME_ENCODE(&now, pzp->zp_crtime);
512	ZFS_TIME_ENCODE(&now, pzp->zp_ctime);
513
514	if (vap->va_mask & AT_ATIME) {
515		ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
516	} else {
517		ZFS_TIME_ENCODE(&now, pzp->zp_atime);
518	}
519
520	if (vap->va_mask & AT_MTIME) {
521		ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
522	} else {
523		ZFS_TIME_ENCODE(&now, pzp->zp_mtime);
524	}
525
526	pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode);
527	zp = zfs_znode_alloc(zfsvfs, dbp, *oid, 0);
528
529	zfs_perm_init(zp, dzp, flag, vap, tx, cr);
530
531	if (zpp) {
532		kmutex_t *hash_mtx = ZFS_OBJ_MUTEX(zp);
533
534		mutex_enter(hash_mtx);
535		zfs_znode_dmu_init(zp);
536		mutex_exit(hash_mtx);
537
538		*zpp = zp;
539	} else {
540		if (ZTOV(zp) != NULL)
541			ZTOV(zp)->v_count = 0;
542		dmu_buf_rele(dbp, NULL);
543		zfs_znode_free(zp);
544	}
545}
546
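/*
 * Get a znode for the given object number.  If the object already has an
 * in-core znode attached to its bonus buffer, reuse it (allocating a new
 * vnode if necessary); otherwise construct a new znode/vnode from the
 * on-disk data.  Returns ENOENT for unlinked znodes and EINVAL if the
 * object is not a znode.
 */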
547int
548zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
549{
550	dmu_object_info_t doi;
551	dmu_buf_t	*db;
552	znode_t		*zp;
553	vnode_t		*vp;
554	int err;
555
556	*zpp = NULL;
557
558	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
559
560	err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
561	if (err) {
562		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
563		return (err);
564	}
565
566	dmu_object_info_from_db(db, &doi);
567	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
568	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
569		dmu_buf_rele(db, NULL);
570		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
571		return (EINVAL);
572	}
573
574	ASSERT(db->db_object == obj_num);
575	ASSERT(db->db_offset == -1);
576	ASSERT(db->db_data != NULL);
577
578	zp = dmu_buf_get_user(db);
579
580	if (zp != NULL) {
581		mutex_enter(&zp->z_lock);
582
583		ASSERT3U(zp->z_id, ==, obj_num);
584		if (zp->z_unlinked) {
585			dmu_buf_rele(db, NULL);
586			mutex_exit(&zp->z_lock);
587			ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
588			return (ENOENT);
589		} else if (zp->z_dbuf_held) {
590			dmu_buf_rele(db, NULL);
591		} else {
592			zp->z_dbuf_held = 1;
593			VFS_HOLD(zfsvfs->z_vfs);
594		}
595
596		if (ZTOV(zp) != NULL)
597			VN_HOLD(ZTOV(zp));
598		else {
599			err = getnewvnode("zfs", zfsvfs->z_vfs, &zfs_vnodeops,
600			    &zp->z_vnode);
601			ASSERT(err == 0);
602			vp = ZTOV(zp);
603			vp->v_data = (caddr_t)zp;
604			vhold(vp);
605			vp->v_vnlock->lk_flags |= LK_CANRECURSE;
606			vp->v_vnlock->lk_flags &= ~LK_NOSHARE;
607			vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode);
608			if (vp->v_type == VDIR)
609				zp->z_zn_prefetch = B_TRUE;	/* z_prefetch default is enabled */
610			err = insmntque(vp, zfsvfs->z_vfs);
611			KASSERT(err == 0, ("insmntque() failed: error %d", err));
612		}
613		mutex_exit(&zp->z_lock);
614		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
615		*zpp = zp;
616		return (0);
617	}
618
619	/*
620	 * Not found; create a new znode/vnode.
621	 */
622	zp = zfs_znode_alloc(zfsvfs, db, obj_num, doi.doi_data_block_size);
623	ASSERT3U(zp->z_id, ==, obj_num);
624	zfs_znode_dmu_init(zp);
625	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
626	*zpp = zp;
627	return (0);
628}
629
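/*
 * Free the on-disk object backing the znode (and any external ACL
 * object) as part of transaction tx, then release the bonus dbuf.
 */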
630void
631zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
632{
633	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
634	int error;
635
636	ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id);
637	if (zp->z_phys->zp_acl.z_acl_extern_obj) {
638		error = dmu_object_free(zfsvfs->z_os,
639		    zp->z_phys->zp_acl.z_acl_extern_obj, tx);
640		ASSERT3U(error, ==, 0);
641	}
642	error = dmu_object_free(zfsvfs->z_os, zp->z_id, tx);
643	ASSERT3U(error, ==, 0);
644	zp->z_dbuf_held = 0;
645	ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
646	dmu_buf_rele(zp->z_dbuf, NULL);
647}
648
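/*
 * Called when the last reference to the vnode is released.  If the
 * znode is unlinked, reclaim the vnode and remove the file from the
 * file system; otherwise leave the znode cached on its dbuf.
 */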
649void
650zfs_zinactive(znode_t *zp)
651{
652	vnode_t	*vp = ZTOV(zp);
653	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
654	uint64_t z_id = zp->z_id;
655
656	ASSERT(zp->z_dbuf_held && zp->z_phys);
657
658	/*
659	 * Don't allow a zfs_zget() while we're trying to release this znode.
660	 */
661	ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
662
663	mutex_enter(&zp->z_lock);
664	VI_LOCK(vp);
665	if (vp->v_count > 0) {
666		/*
667		 * If the hold count is greater than zero, somebody has
668		 * obtained a new reference on this znode while we were
669		 * processing it here, so we are done.
670		 */
671		VI_UNLOCK(vp);
672		mutex_exit(&zp->z_lock);
673		ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
674		return;
675	}
676	VI_UNLOCK(vp);
677
678	/*
679	 * If this was the last reference to a file with no links,
680	 * remove the file from the file system.
681	 */
682	if (zp->z_unlinked) {
683		ZTOV(zp) = NULL;
684		mutex_exit(&zp->z_lock);
685		ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
686		ASSERT(vp->v_count == 0);
687		vrecycle(vp, curthread);
688		zfs_rmnode(zp);
689		VFS_RELE(zfsvfs->z_vfs);
690		return;
691	}
692	ASSERT(zp->z_phys);
693	ASSERT(zp->z_dbuf_held);
694	mutex_exit(&zp->z_lock);
695	ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
696}
697
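/*
 * Remove the znode from the per-filesystem list of all znodes and
 * return it to the znode cache.
 */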
698void
699zfs_znode_free(znode_t *zp)
700{
701	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
702
703	mutex_enter(&zfsvfs->z_znodes_lock);
704	list_remove(&zfsvfs->z_all_znodes, zp);
705	mutex_exit(&zfsvfs->z_znodes_lock);
706
707	kmem_cache_free(znode_cache, zp);
708}
709
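/*
 * As zfs_time_stamper() below, but the caller already holds z_lock.
 */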
710void
711zfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx)
712{
713	timestruc_t	now;
714
715	ASSERT(MUTEX_HELD(&zp->z_lock));
716
717	gethrestime(&now);
718
719	if (tx) {
720		dmu_buf_will_dirty(zp->z_dbuf, tx);
721		zp->z_atime_dirty = 0;
722		zp->z_seq++;
723	} else {
724		zp->z_atime_dirty = 1;
725	}
726
727	if (flag & AT_ATIME)
728		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime);
729
730	if (flag & AT_MTIME)
731		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime);
732
733	if (flag & AT_CTIME)
734		ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime);
735}
736
737/*
738 * Update the requested znode timestamps with the current time.
739 * If we are in a transaction, then go ahead and mark the znode
740 * dirty in the transaction so the timestamps will go to disk.
741 * Otherwise, we will get pushed next time the znode is updated
742 * in a transaction, or when this znode eventually goes inactive.
743 *
744 * Why is this OK?
745 *  1 - Only the ACCESS time is ever updated outside of a transaction.
746 *  2 - Multiple consecutive updates will be collapsed into a single
747 *	znode update by the transaction grouping semantics of the DMU.
748 */
749void
750zfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx)
751{
752	mutex_enter(&zp->z_lock);
753	zfs_time_stamper_locked(zp, flag, tx);
754	mutex_exit(&zp->z_lock);
755}
756
757/*
758 * Grow the block size for a file.
759 *
760 *	IN:	zp	- znode of file whose block size is to be grown.
761 *		size	- requested block size
762 *		tx	- open transaction.
763 *
764 * NOTE: this function assumes that the znode is write locked.
765 */
766void
767zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
768{
769	int		error;
770	u_longlong_t	dummy;
771
772	if (size <= zp->z_blksz)
773		return;
774	/*
775	 * If the file size is already greater than the current blocksize,
776	 * we will not grow.  If there is more than one block in a file,
777	 * the blocksize cannot change.
778	 */
779	if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz)
780		return;
781
782	error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
783	    size, 0, tx);
784	if (error == ENOTSUP)
785		return;
786	ASSERT3U(error, ==, 0);
787
788	/* What blocksize did we actually get? */
789	dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy);
790}
791
792/*
793 * Free space in a file.
794 *
795 *	IN:	zp	- znode of file to free data in.
796 *		off	- start of section to free.
797 *		len	- length of section to free (0 => to EOF).
798 *		flag	- current file open mode flags.
799 *
800 * 	RETURN:	0 if success
801 *		error code if failure
802 */
803int
804zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
805{
806	vnode_t *vp = ZTOV(zp);
807	dmu_tx_t *tx;
808	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
809	zilog_t *zilog = zfsvfs->z_log;
810	rl_t *rl;
811	uint64_t end = off + len;
812	uint64_t size, new_blksz;
813	int error;
814
815	if (ZTOV(zp)->v_type == VFIFO)
816		return (0);
817
818	/*
819	 * If we will change zp_size then lock the whole file,
820	 * otherwise just lock the range being freed.
821	 */
822	if (len == 0 || off + len > zp->z_phys->zp_size) {
823		rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
824	} else {
825		rl = zfs_range_lock(zp, off, len, RL_WRITER);
826		/* recheck, in case zp_size changed */
827		if (off + len > zp->z_phys->zp_size) {
828			/* lost race: file size changed, lock whole file */
829			zfs_range_unlock(rl);
830			rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
831		}
832	}
833
834	/*
835	 * Nothing to do if file already at desired length.
836	 */
837	size = zp->z_phys->zp_size;
838	if (len == 0 && size == off && off != 0) {
839		zfs_range_unlock(rl);
840		return (0);
841	}
842
843	tx = dmu_tx_create(zfsvfs->z_os);
844	dmu_tx_hold_bonus(tx, zp->z_id);
845	new_blksz = 0;
846	if (end > size &&
847	    (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
848		/*
849		 * We are growing the file past the current block size.
850		 */
851		if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
852			ASSERT(!ISP2(zp->z_blksz));
853			new_blksz = MIN(end, SPA_MAXBLOCKSIZE);
854		} else {
855			new_blksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
856		}
857		dmu_tx_hold_write(tx, zp->z_id, 0, MIN(end, new_blksz));
858	} else if (off < size) {
859		/*
860		 * If len == 0, we are truncating the file.
861		 */
862		dmu_tx_hold_free(tx, zp->z_id, off, len ? len : DMU_OBJECT_END);
863	}
864
865	error = dmu_tx_assign(tx, zfsvfs->z_assign);
866	if (error) {
867		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT)
868			dmu_tx_wait(tx);
869		dmu_tx_abort(tx);
870		zfs_range_unlock(rl);
871		return (error);
872	}
873
874	if (new_blksz)
875		zfs_grow_blocksize(zp, new_blksz, tx);
876
877	if (end > size || len == 0)
878		zp->z_phys->zp_size = end;
879
880	if (off < size) {
881		objset_t *os = zfsvfs->z_os;
882		uint64_t rlen = len;
883
884		if (len == 0)
885			rlen = -1;
886		else if (end > size)
887			rlen = size - off;
888		VERIFY(0 == dmu_free_range(os, zp->z_id, off, rlen, tx));
889	}
890
891	if (log) {
892		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
893		zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
894	}
895
896	zfs_range_unlock(rl);
897
898	dmu_tx_commit(tx);
899
900	/*
901	 * Clear any mapped pages in the truncated region.  This has to
902	 * happen outside of the transaction to avoid the possibility of
903	 * a deadlock with someone trying to push a page that we are
904	 * about to invalidate.
905	 */
906	rw_enter(&zp->z_map_lock, RW_WRITER);
907	if (end > size)
908		vnode_pager_setsize(vp, end);
909	else if (len == 0) {
910#if 0
911		error = vtruncbuf(vp, curthread->td_ucred, curthread, end, PAGE_SIZE);
912#else
913		error = vinvalbuf(vp, V_SAVE, curthread, 0, 0);
914		vnode_pager_setsize(vp, end);
915#endif
916	}
917	rw_exit(&zp->z_map_lock);
918
919	return (0);
920}
921
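/*
 * Create the ZPL objects in a newly created objset: the master node,
 * the version attribute, the unlinked (delete) set, and the root
 * directory.  Called, for example, from zfs_init_fs() when the pool
 * root file system is first mounted.
 */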
922void
923zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx)
924{
925	zfsvfs_t	zfsvfs;
926	uint64_t	moid, doid, roid = 0;
927	uint64_t	version = ZPL_VERSION;
928	int		error;
929	znode_t		*rootzp = NULL;
930	vattr_t		vattr;
931
932	/*
933	 * First, attempt to create the master node.
934	 */
935	/*
936	 * In an empty objset, there are no blocks to read and thus
937	 * there can be no i/o errors (which we assert below).
938	 */
939	moid = MASTER_NODE_OBJ;
940	error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
941	    DMU_OT_NONE, 0, tx);
942	ASSERT(error == 0);
943
944	/*
945	 * Set starting attributes.
946	 */
947
948	error = zap_update(os, moid, ZPL_VERSION_OBJ, 8, 1, &version, tx);
949	ASSERT(error == 0);
950
951	/*
952	 * Create a delete queue.
953	 */
954	doid = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
955
956	error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &doid, tx);
957	ASSERT(error == 0);
958
959	/*
960	 * Create root znode.  Create minimal znode/vnode/zfsvfs
961	 * to allow zfs_mknode to work.
962	 */
963	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
964	vattr.va_type = VDIR;
965	vattr.va_mode = S_IFDIR|0755;
966	vattr.va_uid = UID_ROOT;
967	vattr.va_gid = GID_WHEEL;
968
969	rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
970	zfs_znode_cache_constructor(rootzp, NULL, 0);
971	rootzp->z_zfsvfs = &zfsvfs;
972	rootzp->z_unlinked = 0;
973	rootzp->z_atime_dirty = 0;
974	rootzp->z_dbuf_held = 0;
975
976	bzero(&zfsvfs, sizeof (zfsvfs_t));
977
978	zfsvfs.z_os = os;
979	zfsvfs.z_assign = TXG_NOWAIT;
980	zfsvfs.z_parent = &zfsvfs;
981
982	mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
983	list_create(&zfsvfs.z_all_znodes, sizeof (znode_t),
984	    offsetof(znode_t, z_link_node));
985
986	zfs_mknode(rootzp, &vattr, &roid, tx, cr, IS_ROOT_NODE, NULL, 0);
987	ASSERT3U(rootzp->z_id, ==, roid);
988	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &roid, tx);
989	ASSERT(error == 0);
990
991	mutex_destroy(&zfsvfs.z_znodes_lock);
992	kmem_cache_free(znode_cache, rootzp);
993}
994#endif /* _KERNEL */
995
996/*
997 * Given an object number, return its parent object number and whether
998 * or not the object is an extended attribute directory.
999 */
1000static int
1001zfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir)
1002{
1003	dmu_buf_t *db;
1004	dmu_object_info_t doi;
1005	znode_phys_t *zp;
1006	int error;
1007
1008	if ((error = dmu_bonus_hold(osp, obj, FTAG, &db)) != 0)
1009		return (error);
1010
1011	dmu_object_info_from_db(db, &doi);
1012	if (doi.doi_bonus_type != DMU_OT_ZNODE ||
1013	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
1014		dmu_buf_rele(db, FTAG);
1015		return (EINVAL);
1016	}
1017
1018	zp = db->db_data;
1019	*pobjp = zp->zp_parent;
1020	*is_xattrdir = ((zp->zp_flags & ZFS_XATTR) != 0) &&
1021	    S_ISDIR(zp->zp_mode);
1022	dmu_buf_rele(db, FTAG);
1023
1024	return (0);
1025}
1026
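/*
 * Reconstruct the path of an object by walking zp_parent pointers back
 * to the root, filling in "buf" (of size "len") with the result.
 * Extended attribute directories appear as the literal component
 * "<xattrdir>".
 */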
1027int
1028zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
1029{
1030	char *path = buf + len - 1;
1031	int error;
1032
1033	*path = '\0';
1034
1035	for (;;) {
1036		uint64_t pobj;
1037		char component[MAXNAMELEN + 2];
1038		size_t complen;
1039		int is_xattrdir;
1040
1041		if ((error = zfs_obj_to_pobj(osp, obj, &pobj,
1042		    &is_xattrdir)) != 0)
1043			break;
1044
1045		if (pobj == obj) {
1046			if (path[0] != '/')
1047				*--path = '/';
1048			break;
1049		}
1050
1051		component[0] = '/';
1052		if (is_xattrdir) {
1053			(void) sprintf(component + 1, "<xattrdir>");
1054		} else {
1055			error = zap_value_search(osp, pobj, obj, component + 1);
1056			if (error != 0)
1057				break;
1058		}
1059
1060		complen = strlen(component);
1061		path -= complen;
1062		ASSERT(path >= buf);
1063		bcopy(component, path, complen);
1064		obj = pobj;
1065	}
1066
1067	if (error == 0)
1068		(void) memmove(buf, path, buf + len - path);
1069	return (error);
1070}
1071