zfs_znode.c revision 185029
1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd/* 22185029Spjd * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23168404Spjd * Use is subject to license terms. 24168404Spjd */ 25168404Spjd 26169195Spjd/* Portions Copyright 2007 Jeremy Teo */ 27169195Spjd 28168404Spjd#ifdef _KERNEL 29168404Spjd#include <sys/types.h> 30168404Spjd#include <sys/param.h> 31168404Spjd#include <sys/time.h> 32168404Spjd#include <sys/systm.h> 33168404Spjd#include <sys/sysmacros.h> 34168404Spjd#include <sys/resource.h> 35168404Spjd#include <sys/mntent.h> 36185029Spjd#include <sys/u8_textprep.h> 37185029Spjd#include <sys/dsl_dataset.h> 38168404Spjd#include <sys/vfs.h> 39168404Spjd#include <sys/vnode.h> 40168404Spjd#include <sys/file.h> 41168404Spjd#include <sys/kmem.h> 42168404Spjd#include <sys/errno.h> 43168404Spjd#include <sys/unistd.h> 44168404Spjd#include <sys/atomic.h> 45168404Spjd#include <sys/zfs_dir.h> 46168404Spjd#include <sys/zfs_acl.h> 47168404Spjd#include <sys/zfs_ioctl.h> 48168404Spjd#include <sys/zfs_rlock.h> 49185029Spjd#include <sys/zfs_fuid.h> 50168404Spjd#include <sys/fs/zfs.h> 51185029Spjd#include <sys/kidmap.h> 52168404Spjd#endif /* _KERNEL */ 53168404Spjd 54168404Spjd#include <sys/dmu.h> 55168404Spjd#include <sys/refcount.h> 56168404Spjd#include <sys/stat.h> 57168404Spjd#include <sys/zap.h> 58168404Spjd#include <sys/zfs_znode.h> 59168404Spjd#include <sys/refcount.h> 60168404Spjd 61185029Spjd#include "zfs_prop.h" 62185029Spjd 63173268Slulf/* Used by fstat(1). */ 64173268SlulfSYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, 0, sizeof(znode_t), 65173268Slulf "sizeof(znode_t)"); 66173268Slulf 67168404Spjd/* 68185029Spjd * Define ZNODE_STATS to turn on statistic gathering. By default, it is only 69185029Spjd * turned on when DEBUG is also defined. 70185029Spjd */ 71185029Spjd#ifdef DEBUG 72185029Spjd#define ZNODE_STATS 73185029Spjd#endif /* DEBUG */ 74185029Spjd 75185029Spjd#ifdef ZNODE_STATS 76185029Spjd#define ZNODE_STAT_ADD(stat) ((stat)++) 77185029Spjd#else 78185029Spjd#define ZNODE_STAT_ADD(stat) /* nothing */ 79185029Spjd#endif /* ZNODE_STATS */ 80185029Spjd 81185029Spjd#define POINTER_IS_VALID(p) (!((uintptr_t)(p) & 0x3)) 82185029Spjd#define POINTER_INVALIDATE(pp) (*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1)) 83185029Spjd 84185029Spjd/* 85168404Spjd * Functions needed for userland (ie: libzpool) are not put under 86168404Spjd * #ifdef_KERNEL; the rest of the functions have dependencies 87168404Spjd * (such as VFS logic) that will not compile easily in userland. 88168404Spjd */ 89168404Spjd#ifdef _KERNEL 90185029Spjdstatic kmem_cache_t *znode_cache = NULL; 91168404Spjd 92168404Spjd/*ARGSUSED*/ 93168404Spjdstatic void 94185029Spjdznode_evict_error(dmu_buf_t *dbuf, void *user_ptr) 95168404Spjd{ 96185029Spjd#if 1 /* XXXPJD: From OpenSolaris. */ 97185029Spjd /* 98185029Spjd * We should never drop all dbuf refs without first clearing 99185029Spjd * the eviction callback. 100185029Spjd */ 101185029Spjd panic("evicting znode %p\n", user_ptr); 102185029Spjd#else /* XXXPJD */ 103168404Spjd znode_t *zp = user_ptr; 104168488Spjd vnode_t *vp; 105168404Spjd 106168404Spjd mutex_enter(&zp->z_lock); 107185029Spjd zp->z_dbuf = NULL; 108168488Spjd vp = ZTOV(zp); 109168404Spjd if (vp == NULL) { 110168404Spjd mutex_exit(&zp->z_lock); 111168404Spjd zfs_znode_free(zp); 112168404Spjd } else if (vp->v_count == 0) { 113168404Spjd ZTOV(zp) = NULL; 114168488Spjd vhold(vp); 115168404Spjd mutex_exit(&zp->z_lock); 116185029Spjd vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread); 117168404Spjd vrecycle(vp, curthread); 118175294Sattilio VOP_UNLOCK(vp, 0); 119168404Spjd vdrop(vp); 120168404Spjd zfs_znode_free(zp); 121168404Spjd } else { 122168404Spjd mutex_exit(&zp->z_lock); 123168404Spjd } 124185029Spjd#endif 125168404Spjd} 126168404Spjd 127168404Spjdextern struct vop_vector zfs_vnodeops; 128168404Spjdextern struct vop_vector zfs_fifoops; 129168404Spjd 130168404Spjd/* 131168404Spjd * XXX: We cannot use this function as a cache constructor, because 132168404Spjd * there is one global cache for all file systems and we need 133168404Spjd * to pass vfsp here, which is not possible, because argument 134168404Spjd * 'cdrarg' is defined at kmem_cache_create() time. 135168404Spjd */ 136168404Spjdstatic int 137185029Spjdzfs_znode_cache_constructor(void *buf, void *arg, int kmflags) 138168404Spjd{ 139168404Spjd znode_t *zp = buf; 140169196Spjd vnode_t *vp; 141185029Spjd vfs_t *vfsp = arg; 142168404Spjd int error; 143168404Spjd 144185029Spjd POINTER_INVALIDATE(&zp->z_zfsvfs); 145185029Spjd ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); 146185029Spjd ASSERT(vfsp != NULL); 147185029Spjd 148185029Spjd error = getnewvnode("zfs", vfsp, &zfs_vnodeops, &vp); 149185029Spjd if (error != 0 && (kmflags & KM_NOSLEEP)) 150185029Spjd return (-1); 151185029Spjd ASSERT(error == 0); 152185029Spjd vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 153185029Spjd zp->z_vnode = vp; 154185029Spjd vp->v_data = (caddr_t)zp; 155185029Spjd VN_LOCK_AREC(vp); 156185029Spjd VN_LOCK_ASHARE(vp); 157185029Spjd 158185029Spjd list_link_init(&zp->z_link_node); 159185029Spjd 160168404Spjd mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); 161168404Spjd rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL); 162168404Spjd rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL); 163168404Spjd rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL); 164168404Spjd mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); 165168404Spjd 166168404Spjd mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL); 167168404Spjd avl_create(&zp->z_range_avl, zfs_range_compare, 168168404Spjd sizeof (rl_t), offsetof(rl_t, r_node)); 169168404Spjd 170185029Spjd zp->z_dbuf = NULL; 171185029Spjd zp->z_dirlocks = NULL; 172168404Spjd return (0); 173168404Spjd} 174168404Spjd 175168404Spjd/*ARGSUSED*/ 176168404Spjdstatic void 177185029Spjdzfs_znode_cache_destructor(void *buf, void *arg) 178168404Spjd{ 179168404Spjd znode_t *zp = buf; 180168404Spjd 181185029Spjd ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); 182185029Spjd ASSERT(ZTOV(zp) == NULL); 183185029Spjd vn_free(ZTOV(zp)); 184185029Spjd ASSERT(!list_link_active(&zp->z_link_node)); 185168404Spjd mutex_destroy(&zp->z_lock); 186168404Spjd rw_destroy(&zp->z_map_lock); 187168404Spjd rw_destroy(&zp->z_parent_lock); 188168404Spjd rw_destroy(&zp->z_name_lock); 189168404Spjd mutex_destroy(&zp->z_acl_lock); 190185029Spjd avl_destroy(&zp->z_range_avl); 191168404Spjd mutex_destroy(&zp->z_range_lock); 192168404Spjd 193185029Spjd ASSERT(zp->z_dbuf == NULL); 194185029Spjd ASSERT(zp->z_dirlocks == NULL); 195168404Spjd} 196168404Spjd 197185029Spjd#ifdef ZNODE_STATS 198185029Spjdstatic struct { 199185029Spjd uint64_t zms_zfsvfs_invalid; 200185029Spjd uint64_t zms_zfsvfs_unmounted; 201185029Spjd uint64_t zms_zfsvfs_recheck_invalid; 202185029Spjd uint64_t zms_obj_held; 203185029Spjd uint64_t zms_vnode_locked; 204185029Spjd uint64_t zms_not_only_dnlc; 205185029Spjd} znode_move_stats; 206185029Spjd#endif /* ZNODE_STATS */ 207185029Spjd 208185029Spjd#if defined(sun) 209185029Spjdstatic void 210185029Spjdzfs_znode_move_impl(znode_t *ozp, znode_t *nzp) 211185029Spjd{ 212185029Spjd vnode_t *vp; 213185029Spjd 214185029Spjd /* Copy fields. */ 215185029Spjd nzp->z_zfsvfs = ozp->z_zfsvfs; 216185029Spjd 217185029Spjd /* Swap vnodes. */ 218185029Spjd vp = nzp->z_vnode; 219185029Spjd nzp->z_vnode = ozp->z_vnode; 220185029Spjd ozp->z_vnode = vp; /* let destructor free the overwritten vnode */ 221185029Spjd ZTOV(ozp)->v_data = ozp; 222185029Spjd ZTOV(nzp)->v_data = nzp; 223185029Spjd 224185029Spjd nzp->z_id = ozp->z_id; 225185029Spjd ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */ 226185029Spjd ASSERT(avl_numnodes(&ozp->z_range_avl) == 0); 227185029Spjd nzp->z_unlinked = ozp->z_unlinked; 228185029Spjd nzp->z_atime_dirty = ozp->z_atime_dirty; 229185029Spjd nzp->z_zn_prefetch = ozp->z_zn_prefetch; 230185029Spjd nzp->z_blksz = ozp->z_blksz; 231185029Spjd nzp->z_seq = ozp->z_seq; 232185029Spjd nzp->z_mapcnt = ozp->z_mapcnt; 233185029Spjd nzp->z_last_itx = ozp->z_last_itx; 234185029Spjd nzp->z_gen = ozp->z_gen; 235185029Spjd nzp->z_sync_cnt = ozp->z_sync_cnt; 236185029Spjd nzp->z_phys = ozp->z_phys; 237185029Spjd nzp->z_dbuf = ozp->z_dbuf; 238185029Spjd 239185029Spjd /* Update back pointers. */ 240185029Spjd (void) dmu_buf_update_user(nzp->z_dbuf, ozp, nzp, &nzp->z_phys, 241185029Spjd znode_evict_error); 242185029Spjd 243185029Spjd /* 244185029Spjd * Invalidate the original znode by clearing fields that provide a 245185029Spjd * pointer back to the znode. Set the low bit of the vfs pointer to 246185029Spjd * ensure that zfs_znode_move() recognizes the znode as invalid in any 247185029Spjd * subsequent callback. 248185029Spjd */ 249185029Spjd ozp->z_dbuf = NULL; 250185029Spjd POINTER_INVALIDATE(&ozp->z_zfsvfs); 251185029Spjd} 252185029Spjd 253185029Spjd/* 254185029Spjd * Wrapper function for ZFS_ENTER that returns 0 if successful and otherwise 255185029Spjd * returns a non-zero error code. 256185029Spjd */ 257185029Spjdstatic int 258185029Spjdzfs_enter(zfsvfs_t *zfsvfs) 259185029Spjd{ 260185029Spjd ZFS_ENTER(zfsvfs); 261185029Spjd return (0); 262185029Spjd} 263185029Spjd 264185029Spjd/*ARGSUSED*/ 265185029Spjdstatic kmem_cbrc_t 266185029Spjdzfs_znode_move(void *buf, void *newbuf, size_t size, void *arg) 267185029Spjd{ 268185029Spjd znode_t *ozp = buf, *nzp = newbuf; 269185029Spjd zfsvfs_t *zfsvfs; 270185029Spjd vnode_t *vp; 271185029Spjd 272185029Spjd /* 273185029Spjd * The znode is on the file system's list of known znodes if the vfs 274185029Spjd * pointer is valid. We set the low bit of the vfs pointer when freeing 275185029Spjd * the znode to invalidate it, and the memory patterns written by kmem 276185029Spjd * (baddcafe and deadbeef) set at least one of the two low bits. A newly 277185029Spjd * created znode sets the vfs pointer last of all to indicate that the 278185029Spjd * znode is known and in a valid state to be moved by this function. 279185029Spjd */ 280185029Spjd zfsvfs = ozp->z_zfsvfs; 281185029Spjd if (!POINTER_IS_VALID(zfsvfs)) { 282185029Spjd ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid); 283185029Spjd return (KMEM_CBRC_DONT_KNOW); 284185029Spjd } 285185029Spjd 286185029Spjd /* 287185029Spjd * Ensure that the filesystem is not unmounted during the move. 288185029Spjd */ 289185029Spjd if (zfs_enter(zfsvfs) != 0) { /* ZFS_ENTER */ 290185029Spjd ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted); 291185029Spjd return (KMEM_CBRC_DONT_KNOW); 292185029Spjd } 293185029Spjd 294185029Spjd mutex_enter(&zfsvfs->z_znodes_lock); 295185029Spjd /* 296185029Spjd * Recheck the vfs pointer in case the znode was removed just before 297185029Spjd * acquiring the lock. 298185029Spjd */ 299185029Spjd if (zfsvfs != ozp->z_zfsvfs) { 300185029Spjd mutex_exit(&zfsvfs->z_znodes_lock); 301185029Spjd ZFS_EXIT(zfsvfs); 302185029Spjd ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck_invalid); 303185029Spjd return (KMEM_CBRC_DONT_KNOW); 304185029Spjd } 305185029Spjd 306185029Spjd /* 307185029Spjd * At this point we know that as long as we hold z_znodes_lock, the 308185029Spjd * znode cannot be freed and fields within the znode can be safely 309185029Spjd * accessed. Now, prevent a race with zfs_zget(). 310185029Spjd */ 311185029Spjd if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) { 312185029Spjd mutex_exit(&zfsvfs->z_znodes_lock); 313185029Spjd ZFS_EXIT(zfsvfs); 314185029Spjd ZNODE_STAT_ADD(znode_move_stats.zms_obj_held); 315185029Spjd return (KMEM_CBRC_LATER); 316185029Spjd } 317185029Spjd 318185029Spjd vp = ZTOV(ozp); 319185029Spjd if (mutex_tryenter(&vp->v_lock) == 0) { 320185029Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); 321185029Spjd mutex_exit(&zfsvfs->z_znodes_lock); 322185029Spjd ZFS_EXIT(zfsvfs); 323185029Spjd ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked); 324185029Spjd return (KMEM_CBRC_LATER); 325185029Spjd } 326185029Spjd 327185029Spjd /* Only move znodes that are referenced _only_ by the DNLC. */ 328185029Spjd if (vp->v_count != 1 || !vn_in_dnlc(vp)) { 329185029Spjd mutex_exit(&vp->v_lock); 330185029Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); 331185029Spjd mutex_exit(&zfsvfs->z_znodes_lock); 332185029Spjd ZFS_EXIT(zfsvfs); 333185029Spjd ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc); 334185029Spjd return (KMEM_CBRC_LATER); 335185029Spjd } 336185029Spjd 337185029Spjd /* 338185029Spjd * The znode is known and in a valid state to move. We're holding the 339185029Spjd * locks needed to execute the critical section. 340185029Spjd */ 341185029Spjd zfs_znode_move_impl(ozp, nzp); 342185029Spjd mutex_exit(&vp->v_lock); 343185029Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id); 344185029Spjd 345185029Spjd list_link_replace(&ozp->z_link_node, &nzp->z_link_node); 346185029Spjd mutex_exit(&zfsvfs->z_znodes_lock); 347185029Spjd ZFS_EXIT(zfsvfs); 348185029Spjd 349185029Spjd return (KMEM_CBRC_YES); 350185029Spjd} 351185029Spjd#endif /* sun */ 352185029Spjd 353168404Spjdvoid 354168404Spjdzfs_znode_init(void) 355168404Spjd{ 356168404Spjd /* 357168404Spjd * Initialize zcache 358168404Spjd */ 359168404Spjd ASSERT(znode_cache == NULL); 360168404Spjd znode_cache = kmem_cache_create("zfs_znode_cache", 361168404Spjd sizeof (znode_t), 0, /* zfs_znode_cache_constructor */ NULL, 362168404Spjd zfs_znode_cache_destructor, NULL, NULL, NULL, 0); 363185029Spjd#if defined(sun) 364185029Spjd kmem_cache_set_move(znode_cache, zfs_znode_move); 365185029Spjd#endif 366168404Spjd} 367168404Spjd 368168404Spjdvoid 369168404Spjdzfs_znode_fini(void) 370168404Spjd{ 371168404Spjd /* 372168404Spjd * Cleanup zcache 373168404Spjd */ 374168404Spjd if (znode_cache) 375168404Spjd kmem_cache_destroy(znode_cache); 376168404Spjd znode_cache = NULL; 377168404Spjd} 378168404Spjd 379168404Spjd/* 380168404Spjd * zfs_init_fs - Initialize the zfsvfs struct and the file system 381168404Spjd * incore "master" object. Verify version compatibility. 382168404Spjd */ 383168404Spjdint 384185029Spjdzfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp) 385168404Spjd{ 386168404Spjd objset_t *os = zfsvfs->z_os; 387168404Spjd int i, error; 388168404Spjd uint64_t fsid_guid; 389185029Spjd uint64_t zval; 390168404Spjd 391168404Spjd *zpp = NULL; 392168404Spjd 393185029Spjd error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); 394168404Spjd if (error) { 395168404Spjd return (error); 396185029Spjd } else if (zfsvfs->z_version > ZPL_VERSION) { 397168404Spjd (void) printf("Mismatched versions: File system " 398185029Spjd "is version %llu on-disk format, which is " 399168404Spjd "incompatible with this software version %lld!", 400185029Spjd (u_longlong_t)zfsvfs->z_version, ZPL_VERSION); 401168404Spjd return (ENOTSUP); 402168404Spjd } 403168404Spjd 404185029Spjd if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0) 405185029Spjd return (error); 406185029Spjd zfsvfs->z_norm = (int)zval; 407185029Spjd if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0) 408185029Spjd return (error); 409185029Spjd zfsvfs->z_utf8 = (zval != 0); 410185029Spjd if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0) 411185029Spjd return (error); 412185029Spjd zfsvfs->z_case = (uint_t)zval; 413168404Spjd /* 414185029Spjd * Fold case on file systems that are always or sometimes case 415185029Spjd * insensitive. 416185029Spjd */ 417185029Spjd if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || 418185029Spjd zfsvfs->z_case == ZFS_CASE_MIXED) 419185029Spjd zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; 420185029Spjd 421185029Spjd /* 422168404Spjd * The fsid is 64 bits, composed of an 8-bit fs type, which 423168404Spjd * separates our fsid from any other filesystem types, and a 424168404Spjd * 56-bit objset unique ID. The objset unique ID is unique to 425168404Spjd * all objsets open on this system, provided by unique_create(). 426168404Spjd * The 8-bit fs type must be put in the low bits of fsid[1] 427168404Spjd * because that's where other Solaris filesystems put it. 428168404Spjd */ 429168404Spjd fsid_guid = dmu_objset_fsid_guid(os); 430168404Spjd ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); 431168404Spjd zfsvfs->z_vfs->vfs_fsid.val[0] = fsid_guid; 432168404Spjd zfsvfs->z_vfs->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | 433168404Spjd zfsvfs->z_vfs->mnt_vfc->vfc_typenum & 0xFF; 434168404Spjd 435168404Spjd error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, 436168404Spjd &zfsvfs->z_root); 437168404Spjd if (error) 438168404Spjd return (error); 439168404Spjd ASSERT(zfsvfs->z_root != 0); 440168404Spjd 441185029Spjd error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, 442185029Spjd &zfsvfs->z_unlinkedobj); 443185029Spjd if (error) 444185029Spjd return (error); 445168404Spjd 446168404Spjd /* 447168404Spjd * Initialize zget mutex's 448168404Spjd */ 449168404Spjd for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 450168404Spjd mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); 451168404Spjd 452168404Spjd error = zfs_zget(zfsvfs, zfsvfs->z_root, zpp); 453185029Spjd if (error) { 454185029Spjd /* 455185029Spjd * On error, we destroy the mutexes here since it's not 456185029Spjd * possible for the caller to determine if the mutexes were 457185029Spjd * initialized properly. 458185029Spjd */ 459185029Spjd for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) 460185029Spjd mutex_destroy(&zfsvfs->z_hold_mtx[i]); 461168404Spjd return (error); 462185029Spjd } 463168404Spjd ASSERT3U((*zpp)->z_id, ==, zfsvfs->z_root); 464185029Spjd error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, 465185029Spjd &zfsvfs->z_fuid_obj); 466185029Spjd if (error == ENOENT) 467185029Spjd error = 0; 468168404Spjd 469168404Spjd return (0); 470168404Spjd} 471168404Spjd 472168404Spjd/* 473168404Spjd * define a couple of values we need available 474168404Spjd * for both 64 and 32 bit environments. 475168404Spjd */ 476168404Spjd#ifndef NBITSMINOR64 477168404Spjd#define NBITSMINOR64 32 478168404Spjd#endif 479168404Spjd#ifndef MAXMAJ64 480168404Spjd#define MAXMAJ64 0xffffffffUL 481168404Spjd#endif 482168404Spjd#ifndef MAXMIN64 483168404Spjd#define MAXMIN64 0xffffffffUL 484168404Spjd#endif 485168404Spjd 486168404Spjd/* 487168404Spjd * Create special expldev for ZFS private use. 488168404Spjd * Can't use standard expldev since it doesn't do 489168404Spjd * what we want. The standard expldev() takes a 490168404Spjd * dev32_t in LP64 and expands it to a long dev_t. 491168404Spjd * We need an interface that takes a dev32_t in ILP32 492168404Spjd * and expands it to a long dev_t. 493168404Spjd */ 494168404Spjdstatic uint64_t 495168404Spjdzfs_expldev(dev_t dev) 496168404Spjd{ 497179757Sed return (((uint64_t)umajor(dev) << NBITSMINOR64) | uminor(dev)); 498168404Spjd} 499168404Spjd/* 500168404Spjd * Special cmpldev for ZFS private use. 501168404Spjd * Can't use standard cmpldev since it takes 502168404Spjd * a long dev_t and compresses it to dev32_t in 503168404Spjd * LP64. We need to do a compaction of a long dev_t 504168404Spjd * to a dev32_t in ILP32. 505168404Spjd */ 506168404Spjddev_t 507168404Spjdzfs_cmpldev(uint64_t dev) 508168404Spjd{ 509168958Spjd return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64))); 510168404Spjd} 511168404Spjd 512185029Spjdstatic void 513185029Spjdzfs_znode_dmu_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db) 514185029Spjd{ 515185029Spjd znode_t *nzp; 516185029Spjd 517185029Spjd ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs)); 518185029Spjd ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id))); 519185029Spjd 520185029Spjd mutex_enter(&zp->z_lock); 521185029Spjd 522185029Spjd ASSERT(zp->z_dbuf == NULL); 523185029Spjd zp->z_dbuf = db; 524185029Spjd nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_evict_error); 525185029Spjd 526185029Spjd /* 527185029Spjd * there should be no 528185029Spjd * concurrent zgets on this object. 529185029Spjd */ 530185029Spjd if (nzp != NULL) 531185029Spjd panic("existing znode %p for dbuf %p", (void *)nzp, (void *)db); 532185029Spjd 533185029Spjd /* 534185029Spjd * Slap on VROOT if we are the root znode 535185029Spjd */ 536185029Spjd if (zp->z_id == zfsvfs->z_root) 537185029Spjd ZTOV(zp)->v_flag |= VROOT; 538185029Spjd 539185029Spjd mutex_exit(&zp->z_lock); 540185029Spjd vn_exists(ZTOV(zp)); 541185029Spjd} 542185029Spjd 543185029Spjdvoid 544185029Spjdzfs_znode_dmu_fini(znode_t *zp) 545185029Spjd{ 546185029Spjd dmu_buf_t *db = zp->z_dbuf; 547185029Spjd ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) || 548185029Spjd zp->z_unlinked || 549185029Spjd RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock)); 550185029Spjd ASSERT(zp->z_dbuf != NULL); 551185029Spjd zp->z_dbuf = NULL; 552185029Spjd VERIFY(zp == dmu_buf_update_user(db, zp, NULL, NULL, NULL)); 553185029Spjd dmu_buf_rele(db, NULL); 554185029Spjd} 555185029Spjd 556168404Spjd/* 557168404Spjd * Construct a new znode/vnode and intialize. 558168404Spjd * 559168404Spjd * This does not do a call to dmu_set_user() that is 560168404Spjd * up to the caller to do, in case you don't want to 561168404Spjd * return the znode 562168404Spjd */ 563168404Spjdstatic znode_t * 564185029Spjdzfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz) 565168404Spjd{ 566168404Spjd znode_t *zp; 567168404Spjd vnode_t *vp; 568168404Spjd 569168404Spjd zp = kmem_cache_alloc(znode_cache, KM_SLEEP); 570185029Spjd zfs_znode_cache_constructor(zp, zfsvfs->z_parent->z_vfs, 0); 571168404Spjd 572168404Spjd ASSERT(zp->z_dirlocks == NULL); 573185029Spjd ASSERT(zp->z_dbuf == NULL); 574185029Spjd ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); 575168404Spjd 576185029Spjd /* 577185029Spjd * Defer setting z_zfsvfs until the znode is ready to be a candidate for 578185029Spjd * the zfs_znode_move() callback. 579185029Spjd */ 580185029Spjd zp->z_phys = NULL; 581168404Spjd zp->z_unlinked = 0; 582168404Spjd zp->z_atime_dirty = 0; 583168404Spjd zp->z_mapcnt = 0; 584168404Spjd zp->z_last_itx = 0; 585185029Spjd zp->z_id = db->db_object; 586168404Spjd zp->z_blksz = blksz; 587168404Spjd zp->z_seq = 0x7A4653; 588168404Spjd zp->z_sync_cnt = 0; 589168404Spjd 590185029Spjd vp = ZTOV(zp); 591185029Spjd#ifdef TODO 592185029Spjd vn_reinit(vp); 593185029Spjd#endif 594168404Spjd 595185029Spjd zfs_znode_dmu_init(zfsvfs, zp, db); 596185029Spjd 597185029Spjd zp->z_gen = zp->z_phys->zp_gen; 598185029Spjd 599185029Spjd#if 0 600168404Spjd if (vp == NULL) 601168404Spjd return (zp); 602185029Spjd#endif 603168404Spjd 604168404Spjd vp->v_type = IFTOVT((mode_t)zp->z_phys->zp_mode); 605168404Spjd switch (vp->v_type) { 606168404Spjd case VDIR: 607168404Spjd zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */ 608168404Spjd break; 609168404Spjd case VFIFO: 610168404Spjd vp->v_op = &zfs_fifoops; 611168404Spjd break; 612168404Spjd } 613168404Spjd 614185029Spjd mutex_enter(&zfsvfs->z_znodes_lock); 615185029Spjd list_insert_tail(&zfsvfs->z_all_znodes, zp); 616185029Spjd membar_producer(); 617168404Spjd /* 618185029Spjd * Everything else must be valid before assigning z_zfsvfs makes the 619185029Spjd * znode eligible for zfs_znode_move(). 620168404Spjd */ 621185029Spjd zp->z_zfsvfs = zfsvfs; 622185029Spjd mutex_exit(&zfsvfs->z_znodes_lock); 623168404Spjd 624168404Spjd VFS_HOLD(zfsvfs->z_vfs); 625185029Spjd return (zp); 626168404Spjd} 627168404Spjd 628168404Spjd/* 629168404Spjd * Create a new DMU object to hold a zfs znode. 630168404Spjd * 631168404Spjd * IN: dzp - parent directory for new znode 632168404Spjd * vap - file attributes for new znode 633168404Spjd * tx - dmu transaction id for zap operations 634168404Spjd * cr - credentials of caller 635168404Spjd * flag - flags: 636168404Spjd * IS_ROOT_NODE - new object will be root 637168404Spjd * IS_XATTR - new object is an attribute 638168404Spjd * IS_REPLAY - intent log replay 639185029Spjd * bonuslen - length of bonus buffer 640185029Spjd * setaclp - File/Dir initial ACL 641185029Spjd * fuidp - Tracks fuid allocation. 642168404Spjd * 643185029Spjd * OUT: zpp - allocated znode 644168404Spjd * 645168404Spjd */ 646168404Spjdvoid 647185029Spjdzfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, 648185029Spjd uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_t *setaclp, 649185029Spjd zfs_fuid_info_t **fuidp) 650168404Spjd{ 651185029Spjd dmu_buf_t *db; 652168404Spjd znode_phys_t *pzp; 653168404Spjd zfsvfs_t *zfsvfs = dzp->z_zfsvfs; 654168404Spjd timestruc_t now; 655185029Spjd uint64_t gen, obj; 656168404Spjd int err; 657168404Spjd 658168404Spjd ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); 659168404Spjd 660168404Spjd if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */ 661185029Spjd obj = vap->va_nodeid; 662168404Spjd flag |= IS_REPLAY; 663168404Spjd now = vap->va_ctime; /* see zfs_replay_create() */ 664168404Spjd gen = vap->va_nblocks; /* ditto */ 665168404Spjd } else { 666185029Spjd obj = 0; 667168404Spjd gethrestime(&now); 668168404Spjd gen = dmu_tx_get_txg(tx); 669168404Spjd } 670168404Spjd 671168404Spjd /* 672168404Spjd * Create a new DMU object. 673168404Spjd */ 674168404Spjd /* 675168404Spjd * There's currently no mechanism for pre-reading the blocks that will 676168404Spjd * be to needed allocate a new object, so we accept the small chance 677168404Spjd * that there will be an i/o error and we will fail one of the 678168404Spjd * assertions below. 679168404Spjd */ 680168404Spjd if (vap->va_type == VDIR) { 681168404Spjd if (flag & IS_REPLAY) { 682185029Spjd err = zap_create_claim_norm(zfsvfs->z_os, obj, 683185029Spjd zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, 684168404Spjd DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); 685168404Spjd ASSERT3U(err, ==, 0); 686168404Spjd } else { 687185029Spjd obj = zap_create_norm(zfsvfs->z_os, 688185029Spjd zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, 689168404Spjd DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); 690168404Spjd } 691168404Spjd } else { 692168404Spjd if (flag & IS_REPLAY) { 693185029Spjd err = dmu_object_claim(zfsvfs->z_os, obj, 694168404Spjd DMU_OT_PLAIN_FILE_CONTENTS, 0, 695168404Spjd DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); 696168404Spjd ASSERT3U(err, ==, 0); 697168404Spjd } else { 698185029Spjd obj = dmu_object_alloc(zfsvfs->z_os, 699168404Spjd DMU_OT_PLAIN_FILE_CONTENTS, 0, 700168404Spjd DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); 701168404Spjd } 702168404Spjd } 703185029Spjd VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, obj, NULL, &db)); 704185029Spjd dmu_buf_will_dirty(db, tx); 705168404Spjd 706168404Spjd /* 707168404Spjd * Initialize the znode physical data to zero. 708168404Spjd */ 709185029Spjd ASSERT(db->db_size >= sizeof (znode_phys_t)); 710185029Spjd bzero(db->db_data, db->db_size); 711185029Spjd pzp = db->db_data; 712168404Spjd 713168404Spjd /* 714168404Spjd * If this is the root, fix up the half-initialized parent pointer 715168404Spjd * to reference the just-allocated physical data area. 716168404Spjd */ 717168404Spjd if (flag & IS_ROOT_NODE) { 718185029Spjd dzp->z_dbuf = db; 719168404Spjd dzp->z_phys = pzp; 720185029Spjd dzp->z_id = obj; 721168404Spjd } 722168404Spjd 723168404Spjd /* 724168404Spjd * If parent is an xattr, so am I. 725168404Spjd */ 726168404Spjd if (dzp->z_phys->zp_flags & ZFS_XATTR) 727168404Spjd flag |= IS_XATTR; 728168404Spjd 729168404Spjd if (vap->va_type == VBLK || vap->va_type == VCHR) { 730168404Spjd pzp->zp_rdev = zfs_expldev(vap->va_rdev); 731168404Spjd } 732168404Spjd 733185029Spjd if (zfsvfs->z_use_fuids) 734185029Spjd pzp->zp_flags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; 735185029Spjd 736168404Spjd if (vap->va_type == VDIR) { 737168404Spjd pzp->zp_size = 2; /* contents ("." and "..") */ 738168404Spjd pzp->zp_links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1; 739168404Spjd } 740168404Spjd 741168404Spjd pzp->zp_parent = dzp->z_id; 742168404Spjd if (flag & IS_XATTR) 743168404Spjd pzp->zp_flags |= ZFS_XATTR; 744168404Spjd 745168404Spjd pzp->zp_gen = gen; 746168404Spjd 747168404Spjd ZFS_TIME_ENCODE(&now, pzp->zp_crtime); 748168404Spjd ZFS_TIME_ENCODE(&now, pzp->zp_ctime); 749168404Spjd 750168404Spjd if (vap->va_mask & AT_ATIME) { 751168404Spjd ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime); 752168404Spjd } else { 753168404Spjd ZFS_TIME_ENCODE(&now, pzp->zp_atime); 754168404Spjd } 755168404Spjd 756168404Spjd if (vap->va_mask & AT_MTIME) { 757168404Spjd ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime); 758168404Spjd } else { 759168404Spjd ZFS_TIME_ENCODE(&now, pzp->zp_mtime); 760168404Spjd } 761168404Spjd 762168404Spjd pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode); 763185029Spjd if (!(flag & IS_ROOT_NODE)) { 764185029Spjd ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); 765185029Spjd *zpp = zfs_znode_alloc(zfsvfs, db, 0); 766185029Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); 767185029Spjd } else { 768185029Spjd /* 769185029Spjd * If we are creating the root node, the "parent" we 770185029Spjd * passed in is the znode for the root. 771185029Spjd */ 772185029Spjd *zpp = dzp; 773185029Spjd } 774185029Spjd zfs_perm_init(*zpp, dzp, flag, vap, tx, cr, setaclp, fuidp); 775185029Spjd if (!(flag & IS_ROOT_NODE)) { 776185029Spjd vnode_t *vp; 777168404Spjd 778185029Spjd vp = ZTOV(*zpp); 779185029Spjd vp->v_vflag |= VV_FORCEINSMQ; 780185029Spjd err = insmntque(vp, zfsvfs->z_vfs); 781185029Spjd vp->v_vflag &= ~VV_FORCEINSMQ; 782185029Spjd KASSERT(err == 0, ("insmntque() failed: error %d", err)); 783185029Spjd } 784185029Spjd} 785168404Spjd 786185029Spjdvoid 787185029Spjdzfs_xvattr_set(znode_t *zp, xvattr_t *xvap) 788185029Spjd{ 789185029Spjd xoptattr_t *xoap; 790168404Spjd 791185029Spjd xoap = xva_getxoptattr(xvap); 792185029Spjd ASSERT(xoap); 793168404Spjd 794185029Spjd if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { 795185029Spjd ZFS_TIME_ENCODE(&xoap->xoa_createtime, zp->z_phys->zp_crtime); 796185029Spjd XVA_SET_RTN(xvap, XAT_CREATETIME); 797168404Spjd } 798185029Spjd if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { 799185029Spjd ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly); 800185029Spjd XVA_SET_RTN(xvap, XAT_READONLY); 801185029Spjd } 802185029Spjd if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { 803185029Spjd ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden); 804185029Spjd XVA_SET_RTN(xvap, XAT_HIDDEN); 805185029Spjd } 806185029Spjd if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { 807185029Spjd ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system); 808185029Spjd XVA_SET_RTN(xvap, XAT_SYSTEM); 809185029Spjd } 810185029Spjd if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { 811185029Spjd ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive); 812185029Spjd XVA_SET_RTN(xvap, XAT_ARCHIVE); 813185029Spjd } 814185029Spjd if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { 815185029Spjd ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable); 816185029Spjd XVA_SET_RTN(xvap, XAT_IMMUTABLE); 817185029Spjd } 818185029Spjd if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { 819185029Spjd ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink); 820185029Spjd XVA_SET_RTN(xvap, XAT_NOUNLINK); 821185029Spjd } 822185029Spjd if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { 823185029Spjd ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly); 824185029Spjd XVA_SET_RTN(xvap, XAT_APPENDONLY); 825185029Spjd } 826185029Spjd if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { 827185029Spjd ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump); 828185029Spjd XVA_SET_RTN(xvap, XAT_NODUMP); 829185029Spjd } 830185029Spjd if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { 831185029Spjd ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque); 832185029Spjd XVA_SET_RTN(xvap, XAT_OPAQUE); 833185029Spjd } 834185029Spjd if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { 835185029Spjd ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED, 836185029Spjd xoap->xoa_av_quarantined); 837185029Spjd XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); 838185029Spjd } 839185029Spjd if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { 840185029Spjd ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified); 841185029Spjd XVA_SET_RTN(xvap, XAT_AV_MODIFIED); 842185029Spjd } 843185029Spjd if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { 844185029Spjd (void) memcpy(zp->z_phys + 1, xoap->xoa_av_scanstamp, 845185029Spjd sizeof (xoap->xoa_av_scanstamp)); 846185029Spjd zp->z_phys->zp_flags |= ZFS_BONUS_SCANSTAMP; 847185029Spjd XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); 848185029Spjd } 849168404Spjd} 850168404Spjd 851168404Spjdint 852168404Spjdzfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) 853168404Spjd{ 854168404Spjd dmu_object_info_t doi; 855168404Spjd dmu_buf_t *db; 856168404Spjd znode_t *zp; 857168404Spjd vnode_t *vp; 858185029Spjd int err, first = 1; 859168404Spjd 860168404Spjd *zpp = NULL; 861185029Spjdagain: 862168404Spjd ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); 863168404Spjd 864168404Spjd err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db); 865168404Spjd if (err) { 866168404Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 867168404Spjd return (err); 868168404Spjd } 869168404Spjd 870168404Spjd dmu_object_info_from_db(db, &doi); 871168404Spjd if (doi.doi_bonus_type != DMU_OT_ZNODE || 872168404Spjd doi.doi_bonus_size < sizeof (znode_phys_t)) { 873168404Spjd dmu_buf_rele(db, NULL); 874168404Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 875168404Spjd return (EINVAL); 876168404Spjd } 877168404Spjd 878168404Spjd zp = dmu_buf_get_user(db); 879168404Spjd if (zp != NULL) { 880168404Spjd mutex_enter(&zp->z_lock); 881168404Spjd 882185029Spjd /* 883185029Spjd * Since we do immediate eviction of the z_dbuf, we 884185029Spjd * should never find a dbuf with a znode that doesn't 885185029Spjd * know about the dbuf. 886185029Spjd */ 887185029Spjd ASSERT3P(zp->z_dbuf, ==, db); 888168404Spjd ASSERT3U(zp->z_id, ==, obj_num); 889168404Spjd if (zp->z_unlinked) { 890185029Spjd err = ENOENT; 891168404Spjd } else { 892185029Spjd if (ZTOV(zp) != NULL) 893185029Spjd VN_HOLD(ZTOV(zp)); 894185029Spjd else { 895185029Spjd if (first) { 896185029Spjd ZFS_LOG(1, "dying znode detected (zp=%p)", zp); 897185029Spjd first = 0; 898185029Spjd } 899185029Spjd /* 900185029Spjd * znode is dying so we can't reuse it, we must 901185029Spjd * wait until destruction is completed. 902185029Spjd */ 903185029Spjd dmu_buf_rele(db, NULL); 904185029Spjd mutex_exit(&zp->z_lock); 905185029Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 906185029Spjd tsleep(zp, 0, "zcollide", 1); 907185029Spjd goto again; 908185029Spjd } 909185029Spjd *zpp = zp; 910185029Spjd err = 0; 911168404Spjd } 912185029Spjd dmu_buf_rele(db, NULL); 913168404Spjd mutex_exit(&zp->z_lock); 914168404Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 915185029Spjd return (err); 916168404Spjd } 917168404Spjd 918168404Spjd /* 919168404Spjd * Not found create new znode/vnode 920168404Spjd */ 921185029Spjd zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size); 922185029Spjd 923185029Spjd vp = ZTOV(zp); 924185029Spjd vp->v_vflag |= VV_FORCEINSMQ; 925185029Spjd err = insmntque(vp, zfsvfs->z_vfs); 926185029Spjd vp->v_vflag &= ~VV_FORCEINSMQ; 927185029Spjd KASSERT(err == 0, ("insmntque() failed: error %d", err)); 928185029Spjd VOP_UNLOCK(vp, 0); 929185029Spjd 930168404Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 931168404Spjd *zpp = zp; 932168404Spjd return (0); 933168404Spjd} 934168404Spjd 935185029Spjdint 936185029Spjdzfs_rezget(znode_t *zp) 937185029Spjd{ 938185029Spjd zfsvfs_t *zfsvfs = zp->z_zfsvfs; 939185029Spjd dmu_object_info_t doi; 940185029Spjd dmu_buf_t *db; 941185029Spjd uint64_t obj_num = zp->z_id; 942185029Spjd int err; 943185029Spjd 944185029Spjd ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); 945185029Spjd 946185029Spjd err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db); 947185029Spjd if (err) { 948185029Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 949185029Spjd return (err); 950185029Spjd } 951185029Spjd 952185029Spjd dmu_object_info_from_db(db, &doi); 953185029Spjd if (doi.doi_bonus_type != DMU_OT_ZNODE || 954185029Spjd doi.doi_bonus_size < sizeof (znode_phys_t)) { 955185029Spjd dmu_buf_rele(db, NULL); 956185029Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 957185029Spjd return (EINVAL); 958185029Spjd } 959185029Spjd 960185029Spjd if (((znode_phys_t *)db->db_data)->zp_gen != zp->z_gen) { 961185029Spjd dmu_buf_rele(db, NULL); 962185029Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 963185029Spjd return (EIO); 964185029Spjd } 965185029Spjd 966185029Spjd zfs_znode_dmu_init(zfsvfs, zp, db); 967185029Spjd zp->z_unlinked = (zp->z_phys->zp_links == 0); 968185029Spjd zp->z_blksz = doi.doi_data_block_size; 969185029Spjd 970185029Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); 971185029Spjd 972185029Spjd return (0); 973185029Spjd} 974185029Spjd 975168404Spjdvoid 976168404Spjdzfs_znode_delete(znode_t *zp, dmu_tx_t *tx) 977168404Spjd{ 978168404Spjd zfsvfs_t *zfsvfs = zp->z_zfsvfs; 979185029Spjd objset_t *os = zfsvfs->z_os; 980185029Spjd uint64_t obj = zp->z_id; 981185029Spjd uint64_t acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj; 982168404Spjd 983185029Spjd ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); 984185029Spjd if (acl_obj) 985185029Spjd VERIFY(0 == dmu_object_free(os, acl_obj, tx)); 986185029Spjd VERIFY(0 == dmu_object_free(os, obj, tx)); 987185029Spjd zfs_znode_dmu_fini(zp); 988185029Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); 989185029Spjd zfs_znode_free(zp); 990168404Spjd} 991168404Spjd 992168404Spjdvoid 993168404Spjdzfs_zinactive(znode_t *zp) 994168404Spjd{ 995168404Spjd vnode_t *vp = ZTOV(zp); 996168404Spjd zfsvfs_t *zfsvfs = zp->z_zfsvfs; 997168404Spjd uint64_t z_id = zp->z_id; 998168404Spjd 999185029Spjd ASSERT(zp->z_dbuf && zp->z_phys); 1000168404Spjd 1001168404Spjd /* 1002168404Spjd * Don't allow a zfs_zget() while were trying to release this znode 1003168404Spjd */ 1004168404Spjd ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id); 1005168404Spjd 1006168404Spjd mutex_enter(&zp->z_lock); 1007168404Spjd VI_LOCK(vp); 1008168404Spjd if (vp->v_count > 0) { 1009168404Spjd /* 1010168404Spjd * If the hold count is greater than zero, somebody has 1011168404Spjd * obtained a new reference on this znode while we were 1012168404Spjd * processing it here, so we are done. 1013168404Spjd */ 1014168404Spjd VI_UNLOCK(vp); 1015168404Spjd mutex_exit(&zp->z_lock); 1016168404Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); 1017168404Spjd return; 1018168404Spjd } 1019168404Spjd VI_UNLOCK(vp); 1020168404Spjd 1021168404Spjd /* 1022168404Spjd * If this was the last reference to a file with no links, 1023168404Spjd * remove the file from the file system. 1024168404Spjd */ 1025168404Spjd if (zp->z_unlinked) { 1026168404Spjd mutex_exit(&zp->z_lock); 1027168404Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); 1028168404Spjd ASSERT(vp->v_count == 0); 1029168404Spjd vrecycle(vp, curthread); 1030168404Spjd zfs_rmnode(zp); 1031168404Spjd return; 1032168404Spjd } 1033168404Spjd mutex_exit(&zp->z_lock); 1034168404Spjd ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); 1035168404Spjd} 1036168404Spjd 1037168404Spjdvoid 1038168404Spjdzfs_znode_free(znode_t *zp) 1039168404Spjd{ 1040168404Spjd zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1041168404Spjd 1042185029Spjd ASSERT(ZTOV(zp) == NULL); 1043168404Spjd mutex_enter(&zfsvfs->z_znodes_lock); 1044185029Spjd POINTER_INVALIDATE(&zp->z_zfsvfs); 1045168404Spjd list_remove(&zfsvfs->z_all_znodes, zp); 1046168404Spjd mutex_exit(&zfsvfs->z_znodes_lock); 1047168404Spjd 1048168404Spjd kmem_cache_free(znode_cache, zp); 1049185029Spjd 1050185029Spjd VFS_RELE(zfsvfs->z_vfs); 1051168404Spjd} 1052168404Spjd 1053168404Spjdvoid 1054168404Spjdzfs_time_stamper_locked(znode_t *zp, uint_t flag, dmu_tx_t *tx) 1055168404Spjd{ 1056168404Spjd timestruc_t now; 1057168404Spjd 1058168404Spjd ASSERT(MUTEX_HELD(&zp->z_lock)); 1059168404Spjd 1060168404Spjd gethrestime(&now); 1061168404Spjd 1062168404Spjd if (tx) { 1063168404Spjd dmu_buf_will_dirty(zp->z_dbuf, tx); 1064168404Spjd zp->z_atime_dirty = 0; 1065168404Spjd zp->z_seq++; 1066168404Spjd } else { 1067168404Spjd zp->z_atime_dirty = 1; 1068168404Spjd } 1069168404Spjd 1070168404Spjd if (flag & AT_ATIME) 1071168404Spjd ZFS_TIME_ENCODE(&now, zp->z_phys->zp_atime); 1072168404Spjd 1073185029Spjd if (flag & AT_MTIME) { 1074168404Spjd ZFS_TIME_ENCODE(&now, zp->z_phys->zp_mtime); 1075185029Spjd if (zp->z_zfsvfs->z_use_fuids) 1076185029Spjd zp->z_phys->zp_flags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED); 1077185029Spjd } 1078168404Spjd 1079185029Spjd if (flag & AT_CTIME) { 1080168404Spjd ZFS_TIME_ENCODE(&now, zp->z_phys->zp_ctime); 1081185029Spjd if (zp->z_zfsvfs->z_use_fuids) 1082185029Spjd zp->z_phys->zp_flags |= ZFS_ARCHIVE; 1083185029Spjd } 1084168404Spjd} 1085168404Spjd 1086168404Spjd/* 1087168404Spjd * Update the requested znode timestamps with the current time. 1088168404Spjd * If we are in a transaction, then go ahead and mark the znode 1089168404Spjd * dirty in the transaction so the timestamps will go to disk. 1090168404Spjd * Otherwise, we will get pushed next time the znode is updated 1091168404Spjd * in a transaction, or when this znode eventually goes inactive. 1092168404Spjd * 1093168404Spjd * Why is this OK? 1094168404Spjd * 1 - Only the ACCESS time is ever updated outside of a transaction. 1095168404Spjd * 2 - Multiple consecutive updates will be collapsed into a single 1096168404Spjd * znode update by the transaction grouping semantics of the DMU. 1097168404Spjd */ 1098168404Spjdvoid 1099168404Spjdzfs_time_stamper(znode_t *zp, uint_t flag, dmu_tx_t *tx) 1100168404Spjd{ 1101168404Spjd mutex_enter(&zp->z_lock); 1102168404Spjd zfs_time_stamper_locked(zp, flag, tx); 1103168404Spjd mutex_exit(&zp->z_lock); 1104168404Spjd} 1105168404Spjd 1106168404Spjd/* 1107168404Spjd * Grow the block size for a file. 1108168404Spjd * 1109168404Spjd * IN: zp - znode of file to free data in. 1110168404Spjd * size - requested block size 1111168404Spjd * tx - open transaction. 1112168404Spjd * 1113168404Spjd * NOTE: this function assumes that the znode is write locked. 1114168404Spjd */ 1115168404Spjdvoid 1116168404Spjdzfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) 1117168404Spjd{ 1118168404Spjd int error; 1119168404Spjd u_longlong_t dummy; 1120168404Spjd 1121168404Spjd if (size <= zp->z_blksz) 1122168404Spjd return; 1123168404Spjd /* 1124168404Spjd * If the file size is already greater than the current blocksize, 1125168404Spjd * we will not grow. If there is more than one block in a file, 1126168404Spjd * the blocksize cannot change. 1127168404Spjd */ 1128168404Spjd if (zp->z_blksz && zp->z_phys->zp_size > zp->z_blksz) 1129168404Spjd return; 1130168404Spjd 1131168404Spjd error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id, 1132168404Spjd size, 0, tx); 1133168404Spjd if (error == ENOTSUP) 1134168404Spjd return; 1135168404Spjd ASSERT3U(error, ==, 0); 1136168404Spjd 1137168404Spjd /* What blocksize did we actually get? */ 1138168404Spjd dmu_object_size_from_db(zp->z_dbuf, &zp->z_blksz, &dummy); 1139168404Spjd} 1140168404Spjd 1141168404Spjd/* 1142185029Spjd * Increase the file length 1143168404Spjd * 1144168404Spjd * IN: zp - znode of file to free data in. 1145185029Spjd * end - new end-of-file 1146168404Spjd * 1147168404Spjd * RETURN: 0 if success 1148168404Spjd * error code if failure 1149168404Spjd */ 1150185029Spjdstatic int 1151185029Spjdzfs_extend(znode_t *zp, uint64_t end) 1152168404Spjd{ 1153185029Spjd zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1154168404Spjd dmu_tx_t *tx; 1155168404Spjd rl_t *rl; 1156185029Spjd uint64_t newblksz; 1157168404Spjd int error; 1158168404Spjd 1159168404Spjd /* 1160185029Spjd * We will change zp_size, lock the whole file. 1161168404Spjd */ 1162185029Spjd rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); 1163168404Spjd 1164168404Spjd /* 1165168404Spjd * Nothing to do if file already at desired length. 1166168404Spjd */ 1167185029Spjd if (end <= zp->z_phys->zp_size) { 1168168404Spjd zfs_range_unlock(rl); 1169168404Spjd return (0); 1170168404Spjd } 1171185029Spjdtop: 1172168404Spjd tx = dmu_tx_create(zfsvfs->z_os); 1173168404Spjd dmu_tx_hold_bonus(tx, zp->z_id); 1174185029Spjd if (end > zp->z_blksz && 1175168404Spjd (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { 1176168404Spjd /* 1177168404Spjd * We are growing the file past the current block size. 1178168404Spjd */ 1179168404Spjd if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) { 1180168404Spjd ASSERT(!ISP2(zp->z_blksz)); 1181185029Spjd newblksz = MIN(end, SPA_MAXBLOCKSIZE); 1182168404Spjd } else { 1183185029Spjd newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz); 1184168404Spjd } 1185185029Spjd dmu_tx_hold_write(tx, zp->z_id, 0, newblksz); 1186185029Spjd } else { 1187185029Spjd newblksz = 0; 1188168404Spjd } 1189168404Spjd 1190168404Spjd error = dmu_tx_assign(tx, zfsvfs->z_assign); 1191168404Spjd if (error) { 1192185029Spjd if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 1193168404Spjd dmu_tx_wait(tx); 1194185029Spjd dmu_tx_abort(tx); 1195185029Spjd goto top; 1196185029Spjd } 1197168404Spjd dmu_tx_abort(tx); 1198168404Spjd zfs_range_unlock(rl); 1199168404Spjd return (error); 1200168404Spjd } 1201185029Spjd dmu_buf_will_dirty(zp->z_dbuf, tx); 1202168404Spjd 1203185029Spjd if (newblksz) 1204185029Spjd zfs_grow_blocksize(zp, newblksz, tx); 1205168404Spjd 1206185029Spjd zp->z_phys->zp_size = end; 1207168404Spjd 1208185029Spjd zfs_range_unlock(rl); 1209168404Spjd 1210185029Spjd dmu_tx_commit(tx); 1211185029Spjd 1212185029Spjd rw_enter(&zp->z_map_lock, RW_WRITER); 1213185029Spjd error = vinvalbuf(ZTOV(zp), V_SAVE, 0, 0); 1214185029Spjd ASSERT(error == 0); 1215185029Spjd vnode_pager_setsize(ZTOV(zp), end); 1216185029Spjd rw_exit(&zp->z_map_lock); 1217185029Spjd 1218185029Spjd return (0); 1219185029Spjd} 1220185029Spjd 1221185029Spjd/* 1222185029Spjd * Free space in a file. 1223185029Spjd * 1224185029Spjd * IN: zp - znode of file to free data in. 1225185029Spjd * off - start of section to free. 1226185029Spjd * len - length of section to free. 1227185029Spjd * 1228185029Spjd * RETURN: 0 if success 1229185029Spjd * error code if failure 1230185029Spjd */ 1231185029Spjdstatic int 1232185029Spjdzfs_free_range(znode_t *zp, uint64_t off, uint64_t len) 1233185029Spjd{ 1234185029Spjd zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1235185029Spjd rl_t *rl; 1236185029Spjd int error; 1237185029Spjd 1238185029Spjd /* 1239185029Spjd * Lock the range being freed. 1240185029Spjd */ 1241185029Spjd rl = zfs_range_lock(zp, off, len, RL_WRITER); 1242185029Spjd 1243185029Spjd /* 1244185029Spjd * Nothing to do if file already at desired length. 1245185029Spjd */ 1246185029Spjd if (off >= zp->z_phys->zp_size) { 1247185029Spjd zfs_range_unlock(rl); 1248185029Spjd return (0); 1249168404Spjd } 1250168404Spjd 1251185029Spjd if (off + len > zp->z_phys->zp_size) 1252185029Spjd len = zp->z_phys->zp_size - off; 1253185029Spjd 1254185029Spjd error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); 1255185029Spjd 1256185029Spjd if (error == 0) { 1257185029Spjd /* 1258185029Spjd * In FreeBSD we cannot free block in the middle of a file, 1259185029Spjd * but only at the end of a file. 1260185029Spjd */ 1261185029Spjd rw_enter(&zp->z_map_lock, RW_WRITER); 1262185029Spjd error = vinvalbuf(ZTOV(zp), V_SAVE, 0, 0); 1263185029Spjd ASSERT(error == 0); 1264185029Spjd vnode_pager_setsize(ZTOV(zp), off); 1265185029Spjd rw_exit(&zp->z_map_lock); 1266168404Spjd } 1267168404Spjd 1268168404Spjd zfs_range_unlock(rl); 1269168404Spjd 1270185029Spjd return (error); 1271185029Spjd} 1272185029Spjd 1273185029Spjd/* 1274185029Spjd * Truncate a file 1275185029Spjd * 1276185029Spjd * IN: zp - znode of file to free data in. 1277185029Spjd * end - new end-of-file. 1278185029Spjd * 1279185029Spjd * RETURN: 0 if success 1280185029Spjd * error code if failure 1281185029Spjd */ 1282185029Spjdstatic int 1283185029Spjdzfs_trunc(znode_t *zp, uint64_t end) 1284185029Spjd{ 1285185029Spjd zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1286185029Spjd vnode_t *vp = ZTOV(zp); 1287185029Spjd dmu_tx_t *tx; 1288185029Spjd rl_t *rl; 1289185029Spjd int error; 1290185029Spjd 1291185029Spjd /* 1292185029Spjd * We will change zp_size, lock the whole file. 1293185029Spjd */ 1294185029Spjd rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER); 1295185029Spjd 1296185029Spjd /* 1297185029Spjd * Nothing to do if file already at desired length. 1298185029Spjd */ 1299185029Spjd if (end >= zp->z_phys->zp_size) { 1300185029Spjd zfs_range_unlock(rl); 1301185029Spjd return (0); 1302185029Spjd } 1303185029Spjd 1304185029Spjd error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, -1); 1305185029Spjd if (error) { 1306185029Spjd zfs_range_unlock(rl); 1307185029Spjd return (error); 1308185029Spjd } 1309185029Spjdtop: 1310185029Spjd tx = dmu_tx_create(zfsvfs->z_os); 1311185029Spjd dmu_tx_hold_bonus(tx, zp->z_id); 1312185029Spjd error = dmu_tx_assign(tx, zfsvfs->z_assign); 1313185029Spjd if (error) { 1314185029Spjd if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 1315185029Spjd dmu_tx_wait(tx); 1316185029Spjd dmu_tx_abort(tx); 1317185029Spjd goto top; 1318185029Spjd } 1319185029Spjd dmu_tx_abort(tx); 1320185029Spjd zfs_range_unlock(rl); 1321185029Spjd return (error); 1322185029Spjd } 1323185029Spjd dmu_buf_will_dirty(zp->z_dbuf, tx); 1324185029Spjd 1325185029Spjd zp->z_phys->zp_size = end; 1326185029Spjd 1327168404Spjd dmu_tx_commit(tx); 1328168404Spjd 1329185029Spjd zfs_range_unlock(rl); 1330185029Spjd 1331168404Spjd /* 1332168404Spjd * Clear any mapped pages in the truncated region. This has to 1333168404Spjd * happen outside of the transaction to avoid the possibility of 1334168404Spjd * a deadlock with someone trying to push a page that we are 1335168404Spjd * about to invalidate. 1336168404Spjd */ 1337168404Spjd rw_enter(&zp->z_map_lock, RW_WRITER); 1338168404Spjd#if 0 1339185029Spjd error = vtruncbuf(vp, curthread->td_ucred, curthread, end, PAGE_SIZE); 1340168404Spjd#else 1341185029Spjd error = vinvalbuf(vp, V_SAVE, 0, 0); 1342185029Spjd ASSERT(error == 0); 1343185029Spjd vnode_pager_setsize(vp, end); 1344168404Spjd#endif 1345168404Spjd rw_exit(&zp->z_map_lock); 1346168404Spjd 1347168404Spjd return (0); 1348168404Spjd} 1349168404Spjd 1350185029Spjd/* 1351185029Spjd * Free space in a file 1352185029Spjd * 1353185029Spjd * IN: zp - znode of file to free data in. 1354185029Spjd * off - start of range 1355185029Spjd * len - end of range (0 => EOF) 1356185029Spjd * flag - current file open mode flags. 1357185029Spjd * log - TRUE if this action should be logged 1358185029Spjd * 1359185029Spjd * RETURN: 0 if success 1360185029Spjd * error code if failure 1361185029Spjd */ 1362185029Spjdint 1363185029Spjdzfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) 1364185029Spjd{ 1365185029Spjd vnode_t *vp = ZTOV(zp); 1366185029Spjd dmu_tx_t *tx; 1367185029Spjd zfsvfs_t *zfsvfs = zp->z_zfsvfs; 1368185029Spjd zilog_t *zilog = zfsvfs->z_log; 1369185029Spjd int error; 1370185029Spjd 1371185029Spjd if (off > zp->z_phys->zp_size) { 1372185029Spjd error = zfs_extend(zp, off+len); 1373185029Spjd if (error == 0 && log) 1374185029Spjd goto log; 1375185029Spjd else 1376185029Spjd return (error); 1377185029Spjd } 1378185029Spjd 1379185029Spjd if (len == 0) { 1380185029Spjd error = zfs_trunc(zp, off); 1381185029Spjd } else { 1382185029Spjd if ((error = zfs_free_range(zp, off, len)) == 0 && 1383185029Spjd off + len > zp->z_phys->zp_size) 1384185029Spjd error = zfs_extend(zp, off+len); 1385185029Spjd } 1386185029Spjd if (error || !log) 1387185029Spjd return (error); 1388185029Spjdlog: 1389185029Spjd tx = dmu_tx_create(zfsvfs->z_os); 1390185029Spjd dmu_tx_hold_bonus(tx, zp->z_id); 1391185029Spjd error = dmu_tx_assign(tx, zfsvfs->z_assign); 1392185029Spjd if (error) { 1393185029Spjd if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { 1394185029Spjd dmu_tx_wait(tx); 1395185029Spjd dmu_tx_abort(tx); 1396185029Spjd goto log; 1397185029Spjd } 1398185029Spjd dmu_tx_abort(tx); 1399185029Spjd return (error); 1400185029Spjd } 1401185029Spjd 1402185029Spjd zfs_time_stamper(zp, CONTENT_MODIFIED, tx); 1403185029Spjd zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); 1404185029Spjd 1405185029Spjd dmu_tx_commit(tx); 1406185029Spjd return (0); 1407185029Spjd} 1408185029Spjd 1409168404Spjdvoid 1410185029Spjdzfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) 1411168404Spjd{ 1412168404Spjd zfsvfs_t zfsvfs; 1413185029Spjd uint64_t moid, doid, version; 1414185029Spjd uint64_t sense = ZFS_CASE_SENSITIVE; 1415185029Spjd uint64_t norm = 0; 1416185029Spjd nvpair_t *elem; 1417168404Spjd int error; 1418168404Spjd znode_t *rootzp = NULL; 1419185029Spjd vnode_t *vp; 1420168404Spjd vattr_t vattr; 1421185029Spjd znode_t *zp; 1422168404Spjd 1423168404Spjd /* 1424168404Spjd * First attempt to create master node. 1425168404Spjd */ 1426168404Spjd /* 1427168404Spjd * In an empty objset, there are no blocks to read and thus 1428168404Spjd * there can be no i/o errors (which we assert below). 1429168404Spjd */ 1430168404Spjd moid = MASTER_NODE_OBJ; 1431168404Spjd error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, 1432168404Spjd DMU_OT_NONE, 0, tx); 1433168404Spjd ASSERT(error == 0); 1434168404Spjd 1435168404Spjd /* 1436168404Spjd * Set starting attributes. 1437168404Spjd */ 1438185029Spjd if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID) 1439185029Spjd version = ZPL_VERSION; 1440185029Spjd else 1441185029Spjd version = ZPL_VERSION_FUID - 1; 1442185029Spjd error = zap_update(os, moid, ZPL_VERSION_STR, 1443185029Spjd 8, 1, &version, tx); 1444185029Spjd elem = NULL; 1445185029Spjd while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { 1446185029Spjd /* For the moment we expect all zpl props to be uint64_ts */ 1447185029Spjd uint64_t val; 1448185029Spjd char *name; 1449168404Spjd 1450185029Spjd ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64); 1451185029Spjd VERIFY(nvpair_value_uint64(elem, &val) == 0); 1452185029Spjd name = nvpair_name(elem); 1453185029Spjd if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { 1454185029Spjd version = val; 1455185029Spjd error = zap_update(os, moid, ZPL_VERSION_STR, 1456185029Spjd 8, 1, &version, tx); 1457185029Spjd } else { 1458185029Spjd error = zap_update(os, moid, name, 8, 1, &val, tx); 1459185029Spjd } 1460185029Spjd ASSERT(error == 0); 1461185029Spjd if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) 1462185029Spjd norm = val; 1463185029Spjd else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) 1464185029Spjd sense = val; 1465185029Spjd } 1466185029Spjd ASSERT(version != 0); 1467168404Spjd 1468168404Spjd /* 1469168404Spjd * Create a delete queue. 1470168404Spjd */ 1471168404Spjd doid = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); 1472168404Spjd 1473168404Spjd error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &doid, tx); 1474168404Spjd ASSERT(error == 0); 1475168404Spjd 1476168404Spjd /* 1477168404Spjd * Create root znode. Create minimal znode/vnode/zfsvfs 1478168404Spjd * to allow zfs_mknode to work. 1479168404Spjd */ 1480185029Spjd VATTR_NULL(&vattr); 1481168404Spjd vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; 1482168404Spjd vattr.va_type = VDIR; 1483168404Spjd vattr.va_mode = S_IFDIR|0755; 1484185029Spjd vattr.va_uid = crgetuid(cr); 1485185029Spjd vattr.va_gid = crgetgid(cr); 1486168404Spjd 1487168404Spjd rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); 1488185029Spjd zfs_znode_cache_constructor(rootzp, &zfsvfs, 0); 1489168404Spjd rootzp->z_unlinked = 0; 1490168404Spjd rootzp->z_atime_dirty = 0; 1491168404Spjd 1492185029Spjd vp = ZTOV(rootzp); 1493185029Spjd vp->v_type = VDIR; 1494185029Spjd 1495168404Spjd bzero(&zfsvfs, sizeof (zfsvfs_t)); 1496168404Spjd 1497168404Spjd zfsvfs.z_os = os; 1498168404Spjd zfsvfs.z_assign = TXG_NOWAIT; 1499168404Spjd zfsvfs.z_parent = &zfsvfs; 1500185029Spjd zfsvfs.z_version = version; 1501185029Spjd zfsvfs.z_use_fuids = USE_FUIDS(version, os); 1502185029Spjd zfsvfs.z_norm = norm; 1503185029Spjd /* 1504185029Spjd * Fold case on file systems that are always or sometimes case 1505185029Spjd * insensitive. 1506185029Spjd */ 1507185029Spjd if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) 1508185029Spjd zfsvfs.z_norm |= U8_TEXTPREP_TOUPPER; 1509168404Spjd 1510168404Spjd mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); 1511168404Spjd list_create(&zfsvfs.z_all_znodes, sizeof (znode_t), 1512168404Spjd offsetof(znode_t, z_link_node)); 1513168404Spjd 1514185029Spjd ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs)); 1515185029Spjd rootzp->z_zfsvfs = &zfsvfs; 1516185029Spjd zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, NULL, NULL); 1517185029Spjd ASSERT3P(zp, ==, rootzp); 1518185029Spjd error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); 1519168404Spjd ASSERT(error == 0); 1520185029Spjd POINTER_INVALIDATE(&rootzp->z_zfsvfs); 1521168404Spjd 1522185029Spjd VI_LOCK(vp); 1523185029Spjd ZTOV(rootzp)->v_data = NULL; 1524185029Spjd ZTOV(rootzp)->v_count = 0; 1525185029Spjd ZTOV(rootzp)->v_holdcnt = 0; 1526185029Spjd ZTOV(rootzp) = NULL; 1527185029Spjd VOP_UNLOCK(vp, 0); 1528185029Spjd vdestroy(vp); 1529185029Spjd dmu_buf_rele(rootzp->z_dbuf, NULL); 1530185029Spjd rootzp->z_dbuf = NULL; 1531169325Spjd mutex_destroy(&zfsvfs.z_znodes_lock); 1532168404Spjd kmem_cache_free(znode_cache, rootzp); 1533168404Spjd} 1534185029Spjd 1535168404Spjd#endif /* _KERNEL */ 1536168404Spjd/* 1537168404Spjd * Given an object number, return its parent object number and whether 1538168404Spjd * or not the object is an extended attribute directory. 1539168404Spjd */ 1540168404Spjdstatic int 1541168404Spjdzfs_obj_to_pobj(objset_t *osp, uint64_t obj, uint64_t *pobjp, int *is_xattrdir) 1542168404Spjd{ 1543168404Spjd dmu_buf_t *db; 1544168404Spjd dmu_object_info_t doi; 1545168404Spjd znode_phys_t *zp; 1546168404Spjd int error; 1547168404Spjd 1548168404Spjd if ((error = dmu_bonus_hold(osp, obj, FTAG, &db)) != 0) 1549168404Spjd return (error); 1550168404Spjd 1551168404Spjd dmu_object_info_from_db(db, &doi); 1552168404Spjd if (doi.doi_bonus_type != DMU_OT_ZNODE || 1553168404Spjd doi.doi_bonus_size < sizeof (znode_phys_t)) { 1554168404Spjd dmu_buf_rele(db, FTAG); 1555168404Spjd return (EINVAL); 1556168404Spjd } 1557168404Spjd 1558168404Spjd zp = db->db_data; 1559168404Spjd *pobjp = zp->zp_parent; 1560168404Spjd *is_xattrdir = ((zp->zp_flags & ZFS_XATTR) != 0) && 1561168404Spjd S_ISDIR(zp->zp_mode); 1562168404Spjd dmu_buf_rele(db, FTAG); 1563168404Spjd 1564168404Spjd return (0); 1565168404Spjd} 1566168404Spjd 1567168404Spjdint 1568168404Spjdzfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) 1569168404Spjd{ 1570168404Spjd char *path = buf + len - 1; 1571168404Spjd int error; 1572168404Spjd 1573168404Spjd *path = '\0'; 1574168404Spjd 1575168404Spjd for (;;) { 1576168404Spjd uint64_t pobj; 1577168404Spjd char component[MAXNAMELEN + 2]; 1578168404Spjd size_t complen; 1579168404Spjd int is_xattrdir; 1580168404Spjd 1581168404Spjd if ((error = zfs_obj_to_pobj(osp, obj, &pobj, 1582168404Spjd &is_xattrdir)) != 0) 1583168404Spjd break; 1584168404Spjd 1585168404Spjd if (pobj == obj) { 1586168404Spjd if (path[0] != '/') 1587168404Spjd *--path = '/'; 1588168404Spjd break; 1589168404Spjd } 1590168404Spjd 1591168404Spjd component[0] = '/'; 1592168404Spjd if (is_xattrdir) { 1593168404Spjd (void) sprintf(component + 1, "<xattrdir>"); 1594168404Spjd } else { 1595185029Spjd error = zap_value_search(osp, pobj, obj, 1596185029Spjd ZFS_DIRENT_OBJ(-1ULL), component + 1); 1597168404Spjd if (error != 0) 1598168404Spjd break; 1599168404Spjd } 1600168404Spjd 1601168404Spjd complen = strlen(component); 1602168404Spjd path -= complen; 1603168404Spjd ASSERT(path >= buf); 1604168404Spjd bcopy(component, path, complen); 1605168404Spjd obj = pobj; 1606168404Spjd } 1607168404Spjd 1608168404Spjd if (error == 0) 1609168404Spjd (void) memmove(buf, path, buf + len - path); 1610168404Spjd return (error); 1611168404Spjd} 1612